Пример #1
0
def test_score_function():
    """Check that ``score`` reports perfect accuracy on the fitted data.

    A ``SoftmaxRegression`` is fitted on the ``X`` and ``y`` defined outside
    this function and then scored on those same arrays; the test passes only
    when the returned accuracy equals exactly 1.0.
    """
    hyperparams = dict(epochs=200, eta=0.005, minibatches=1, random_seed=1)
    model = SoftmaxRegression(**hyperparams)
    model.fit(X, y)
    accuracy = model.score(X, y)
    # The accuracy doubles as the failure message so a miss shows the value.
    assert accuracy == 1.0, accuracy
Пример #2
0
def test_score_function():
    """Fit a softmax classifier and require an accuracy of exactly 1.0.

    Both fitting and scoring use the ``X`` and ``y`` that are defined outside
    this function, so the score is measured on the data the model was fitted
    on.
    """
    classifier = SoftmaxRegression(
        epochs=200, eta=0.005, minibatches=1, random_seed=1)
    classifier.fit(X, y)

    observed = classifier.score(X, y)
    # Report the observed accuracy when the assertion fails.
    assert observed == 1.0, observed
Пример #3
0
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE(security): unpickling can execute arbitrary code. Only point this
    # at cache files that this script wrote itself.
    with open(path, "rb") as f:
        return pickle.load(f)


def _dump_pickle(obj, path):
    """Pickle ``obj`` into the file at ``path``, overwriting any old file."""
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def main():
    """Train a softmax-regression classifier on tweets and report accuracy.

    Steps:

    1. Configure the ``CountVectorizer`` and the ``SoftmaxRegression`` model.
    2. Read the user list via ``get_users(FILE_USERS)`` and ``get_info()``.
    3. Obtain the train/dev/test splits. If ``FOLDER/update_SM<U>.txt``
       exists (``U`` being the number of users) the six pickled splits named
       by the tag stored in that file are loaded. Otherwise the tweets are
       loaded with ``load_data()``, shuffled, split, vectorized, optionally
       standardized, pickled under ``FOLDER`` and the tag file is written.
    4. Fit the model on the training split, print the accuracy on the
       training and development splits and call ``compute_accuracies``.

    The function takes no arguments and returns ``None``; its results are
    printed and written to files under ``FOLDER``.
    """
    # SETUP
    train = 0.9  # fraction of the data used for training
    dev = 0.05  # fraction of the data used for development
    test = 0.05  # fraction of the data used for testing

    n_features = 1500  # upper bound; lowered below if fewer columns exist

    # CountVectorizer from sklearn.feature_extraction.text
    vectorizer = CountVectorizer(
        min_df=20,  # you may want to adjust this
        max_features=n_features,
        lowercase=False)

    DO_STANDARDIZE_DATA = 1  # 1 yes, 0 no

    regularization_lambda = 0.1
    ETA = 0.00005
    EPOCHS = 50
    model_sm = SoftmaxRegression(
        eta=ETA,
        epochs=EPOCHS,
        l2=regularization_lambda,
        minibatches=1,
        random_seed=1,
        print_progress=3)

    print("-----------------------------")
    print("METHOD - SOFTMAX REGRESSION")
    print("-----------------------------")

    print("Hello,\nwe will use Softmax Regression to classify twitter users\n")
    setpath()

    # Get the users.
    screen_names = get_users(FILE_USERS)
    info_data = get_info()
    U = len(screen_names)  # number of users
    for i, screen_name in enumerate(screen_names):
        print("For", screen_name, " one has ", info_data[i, 1], "tweets")

    # Cache file stems, in the order the splits are unpacked/saved below.
    split_names = ("X_train", "Y_train", "X_dev", "Y_dev", "X_test", "Y_test")
    update_file = FOLDER + "/update_SM" + str(U) + ".txt"

    if os.path.isfile(update_file):
        # The tag file holds the suffix of the most recently saved splits.
        with open(update_file, "r") as h:
            update = h.read()

        print("We load the dataset.")
        X_train, Y_train, X_dev, Y_dev, X_test, Y_test = [
            _load_pickle(FOLDER + "/" + name + "_politic" + update + ".npy")
            for name in split_names
        ]
    else:
        all_tweets = load_data()
        # One shuffle already yields a uniformly random order; the second
        # consecutive shuffle the code used to perform added nothing.
        random.shuffle(all_tweets)

        # Field 2 of each record is used as the text, field 0 as the label.
        tweets = [record[2] for record in all_tweets]
        Y = np.array([record[0] for record in all_tweets])  # label vector
        print("We load the data and we create the data set!")

        print("-----------------------------")
        # Split off dev+test first, then divide that remainder between them.
        X_train_1, x_appoggio, Y_train, y_appoggio = train_test_split(
            tweets, Y, test_size=(dev + test))
        X_dev_1, X_test_1, Y_dev, Y_test = train_test_split(
            x_appoggio, y_appoggio, test_size=(test / (dev + test)))
        print("We will train with the", train * 100, " % of the data;")
        print(dev * 100,
              "% of the data is reserve for the method development;")
        print(test * 100, "% of the data is for the test.")

        # The vocabulary is learned from the training texts only.
        vectorizer.fit(X_train_1)
        X_train = vectorizer.transform(X_train_1)
        X_dev = vectorizer.transform(X_dev_1)
        X_test = vectorizer.transform(X_test_1)

        if DO_STANDARDIZE_DATA == 0:
            print("We don't standardize data")
        else:
            print(
                "We will provide to the model with standardize data, mean zero and variance 1"
            )
            X_train, X_dev, X_test = standardize_data(X_train, X_dev, X_test)

        # Drop the raw texts and intermediates before pickling.
        del all_tweets
        del X_train_1, X_dev_1, X_test_1, x_appoggio, y_appoggio

        # Save the prepared splits, each exactly once, under a dated tag.
        today_string = date.today().strftime("%y_%b_%d")
        update = today_string + "_SM" + str(U)
        splits = (X_train, Y_train, X_dev, Y_dev, X_test, Y_test)
        for name, split in zip(split_names, splits):
            _dump_pickle(
                split, FOLDER + "/" + name + "_politic" + update + ".npy")

        # Record the tag so the next run can find the files saved above.
        with open(update_file, "w") as h:
            h.write(update)

    # Length of the input vector; read from the shape directly so no dense
    # copy of the matrix is built just to get a dimension.
    D = X_test.shape[1]

    print("\n")
    if n_features > D:
        n_features = D
    print("The # of features is", n_features)
    print("The regularization parameter is", regularization_lambda)
    print("The learning step is", ETA)
    print("The # of cycle is", EPOCHS)
    print("\n")

    # WE START TRAINING THE MODEL
    # Densify the training matrix once and reuse it for fit and score.
    X_train_dense = X_train.toarray()
    model_sm.fit(X_train_dense, Y_train)

    acc = model_sm.score(X_train_dense, Y_train)
    del X_train_dense  # release the dense copy before densifying the dev set
    acc_dev = model_sm.score(X_dev.toarray(), Y_dev)
    print("\n")
    print("Accuracy on the training set", acc)
    print("Accuracy on the development set", acc_dev)

    # Print some statistics about the model.
    df_score, df_fp, df_pre = compute_accuracies(model_sm, 1, screen_names,
                                                 X_train, X_dev, Y_train,
                                                 Y_dev)
Пример #4
0
# Split ``data`` column-wise: the last column is the label, every other
# column is a feature that is converted to float.
y = data[:, data.shape[1] - 1]  # labels (original note: 150 samples)
X = data[:, 0:data.shape[1] - 1].astype(float)  # features (original note: 150 x 4)

# Row-wise split: the first 105 rows are used for training, all remaining
# rows for testing.
X_train = X[0:105, :]
X_test = X[105:X.shape[0], :]
y_train = y[0:105]
y_test = y[105:y.shape[0]]
del data, X, y

# Map the string labels to 0, 1, 2
classes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
y_train = np.asarray([classes[item] for item in y_train])
y_test = np.asarray([classes[item] for item in y_test])

# Softmax
# The learning rate used to be written as ``1 / (10 ^ 4)``. In Python ``^`` is
# bitwise XOR, not exponentiation, so that evaluated to 1 / 14 (~0.0714)
# rather than the intended 10**-4.
softmax = SoftmaxRegression(eta=1e-4,
                            epochs=500,
                            minibatches=1,
                            random_seed=0,
                            print_progress=3)
softmax.fit(X_train, y_train, init_params=True)

# Optional: plot the training cost per iteration.
# plt.plot(range(len(softmax.cost_)), softmax.cost_)
# plt.xlabel('Iterations')
# plt.ylabel('Cost')
# plt.show()

# Fraction of correctly classified held-out samples.
accuracy = softmax.score(X_test, y_test)
print(accuracy)