def test_score_function():
    """Check that a fitted softmax model scores 1.0 on its own training data.

    Uses the module-level ``X`` and ``y`` both for fitting and for scoring.
    """
    classifier = SoftmaxRegression(
        eta=0.005,
        epochs=200,
        minibatches=1,
        random_seed=1,
    )
    classifier.fit(X, y)
    training_accuracy = classifier.score(X, y)
    # The accuracy itself is the assertion message so a failure shows the value.
    assert training_accuracy == 1.0, training_accuracy
# Order in which the six dataset splits are stored on disk and unpacked.
_SPLIT_NAMES = ("X_train", "Y_train", "X_dev", "Y_dev", "X_test", "Y_test")


def _split_path(folder, split_name, tag):
    """Return the cache file path of one dataset split for the given tag."""
    return folder + "/" + split_name + "_politic" + tag + ".npy"


def _load_splits(folder, tag):
    """Unpickle the six cached splits and return them in _SPLIT_NAMES order."""
    splits = []
    for split_name in _SPLIT_NAMES:
        # SECURITY NOTE: pickle.load executes arbitrary code from the file.
        # Only point FOLDER at cache files this script wrote itself.
        with open(_split_path(folder, split_name, tag), 'rb') as f:
            splits.append(pickle.load(f))
    return splits


def _save_splits(folder, tag, splits):
    """Pickle the six splits (given in _SPLIT_NAMES order) under the tag."""
    for split_name, split in zip(_SPLIT_NAMES, splits):
        with open(_split_path(folder, split_name, tag), "wb") as f:
            pickle.dump(split, f)


def main():
    """Train a softmax-regression classifier on tweets and report accuracy.

    The function:

    1. reads the user list and prints the tweet count reported for each user;
    2. obtains the train/dev/test splits, either by unpickling a previously
       saved dataset (when ``FOLDER/update_SM<U>.txt`` exists, its content is
       the tag of the dataset files) or by loading the tweets, splitting
       them, vectorizing them with ``CountVectorizer``, optionally
       standardizing them, and saving the result for the next run;
    3. fits ``SoftmaxRegression`` on the training split and prints the
       accuracy on the training and development splits;
    4. calls ``compute_accuracies`` for further statistics.

    Returns nothing; all results are printed or written to ``FOLDER``.
    """
    # --- Setup -----------------------------------------------------------
    train = 0.9  # fraction of the data for training (only used in messages)
    dev = 0.05  # fraction of the data for development
    test = 0.05  # fraction of the data for test
    n_features = 1500  # upper bound; lowered below if fewer columns exist
    # CountVectorizer from sklearn.feature_extraction.text
    vectorizer = CountVectorizer(
        min_df=20,  # you may want to adjust this
        max_features=n_features,
        lowercase=False)
    DO_STANDARDIZE_DATA = 1  # 1 yes, 0 no
    regularization_lambda = 0.1
    ETA = 0.00005
    EPOCHS = 50
    model_sm = SoftmaxRegression(
        eta=ETA,
        epochs=EPOCHS,
        l2=regularization_lambda,
        #n_classes=U,
        minibatches=1,
        random_seed=1,
        print_progress=3)

    print("-----------------------------")
    print("METHOD - SOFTMAX REGRESSION")
    print("-----------------------------")
    print("Hello,\nwe will use Softmax Regression to classify twitter users\n")
    setpath()

    # --- Users -----------------------------------------------------------
    screen_names = get_users(FILE_USERS)
    info_data = get_info()
    U = len(screen_names)  # number of users
    for i, screen_name in enumerate(screen_names):
        print("For", screen_name, " one has ", info_data[i, 1], "tweets")

    # --- Dataset: reuse the cached one, or build and cache a new one ------
    update_file = FOLDER + "/update_SM" + str(U) + ".txt"
    if os.path.isfile(update_file):
        # The update file stores the tag that names the cached split files.
        with open(update_file, "r") as h:
            update = h.read()
        print("We load the dataset.")
        X_train, Y_train, X_dev, Y_dev, X_test, Y_test = _load_splits(
            FOLDER, update)
    else:
        all_tweets = load_data()
        # Shuffled twice, as in the original code; the second pass is
        # redundant but is kept so the random-number stream is unchanged.
        random.shuffle(all_tweets)
        random.shuffle(all_tweets)
        # Each record carries the label at index 0 and the text at index 2.
        tweets = [record[2] for record in all_tweets]
        labels = [record[0] for record in all_tweets]
        print("We load the data and we create the data set!")
        Y = np.array(labels)  # this is the output label vector
        print("-----------------------------")

        # First split off dev+test, then divide that remainder between them.
        X_train_1, x_appoggio, Y_train, y_appoggio = train_test_split(
            tweets, Y, test_size=(dev + test))
        X_dev_1, X_test_1, Y_dev, Y_test = train_test_split(
            x_appoggio, y_appoggio, test_size=(test / (dev + test)))
        print("We will train with the", train * 100, " % of the data;")
        print(dev * 100, "% of the data is reserve for the method development;")
        print(test * 100, "% of the data is for the test.")

        # The vocabulary is learned from the training texts only.
        vectorizer.fit(X_train_1)
        X_train = vectorizer.transform(X_train_1)
        X_dev = vectorizer.transform(X_dev_1)
        X_test = vectorizer.transform(X_test_1)
        if DO_STANDARDIZE_DATA == 0:
            print("We don't standardize data")
        else:
            print(
                "We will provide to the model with standardize data, mean zero and variance 1"
            )
            X_train, X_dev, X_test = standardize_data(X_train, X_dev, X_test)

        # Release the raw texts before training.
        del all_tweets
        del X_train_1, X_dev_1, X_test_1, x_appoggio, y_appoggio

        # Save the prepared data; the tag combines today's date and U.
        today_string = date.today().strftime("%y_%b_%d")
        tag = today_string + "_SM" + str(U)
        _save_splits(FOLDER, tag,
                     (X_train, Y_train, X_dev, Y_dev, X_test, Y_test))
        # Written last, so the tag is only recorded once all splits exist.
        with open(update_file, "w") as h:
            h.write(tag)

    # --- Report the configuration ----------------------------------------
    D = X_test.toarray().shape[1]  # this is the length of the input vector
    print("\n")
    n_features = min(n_features, D)
    print("The # of features is", n_features)
    print("The regularization parameter is", regularization_lambda)
    print("The learning step is", ETA)
    print("The # of cycle is", EPOCHS)
    print("\n")

    # --- Train and evaluate ----------------------------------------------
    model_sm.fit(X_train.toarray(), Y_train)
    acc = model_sm.score(X_train.toarray(), Y_train)
    acc_dev = model_sm.score(X_dev.toarray(), Y_dev)
    print("\n")
    print("Accuracy on the training set", acc)
    print("Accuracy on the development set", acc_dev)

    # Further statistics about the model; the returned values are not used
    # any further here.
    df_score, df_fp, df_pre = compute_accuracies(model_sm, 1, screen_names,
                                                 X_train, X_dev, Y_train,
                                                 Y_dev)
# Split the loaded table into labels and features: the last column holds the
# class label, every other column is a numeric feature.
# (The original comments describe `data` as the 150-row Iris set with 4
# feature columns -- TODO confirm against the loading code.)
y = data[:, -1]
X = data[:, :-1].astype(float)

# The first 105 rows are used for training, the remaining rows for testing
# (105 / 45 rows if the data set really has 150 rows).
X_train = X[:105, :]
X_test = X[105:, :]
y_train = y[:105]
y_test = y[105:]
del data, X, y

# Map the string labels to the integer classes 0, 1, 2.
classes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
y_train = np.asarray([classes[item] for item in y_train])
y_test = np.asarray([classes[item] for item in y_test])

# Softmax
# The learning rate was written as `1 / (10 ^ 4)`, but `^` is bitwise XOR in
# Python (10 ^ 4 == 14), which gave eta ~= 0.0714 instead of the intended
# 10**-4.
softmax = SoftmaxRegression(eta=1e-4,
                            epochs=500,
                            minibatches=1,
                            random_seed=0,
                            print_progress=3)
softmax.fit(X_train, y_train, init_params=True)

# Optional: plot the training cost per iteration.
# plt.plot(range(len(softmax.cost_)), softmax.cost_)
# plt.xlabel('Iterations')
# plt.ylabel('Cost')
# plt.show()

accuracy = softmax.score(X_test, y_test)
print(accuracy)