def evaluate(self, test_data_file):
    """Run the trained model over a test set and report how it did.

    ``test_data_file`` is indexed at two positions: element 0 is passed to
    ``csv_to_np`` to obtain ids, inputs and labels; element 1 is forwarded
    unchanged to ``evaluation.evaluate`` / ``evaluation.confusion``.

    Prints the loss and accuracy returned by ``self.model.evaluate`` and
    returns ``(predictions, Y_test)``: the raw output of
    ``self.model.predict`` and the one-hot encoded labels.
    """
    ID_test, raw_inputs, raw_labels = csv_to_np(test_data_file[0])
    X_test = average_word_embeddings(raw_inputs, self.word_to_vec_mapping)
    Y_test = integer_to_one_hot(raw_labels)

    predictions = self.model.predict(X_test)

    # One entry per test row: stringified id -> label derived from the
    # highest-scoring output position of that row.
    pred_dict = {
        str(ID_test[row]): label_to_sentiment(np.argmax(predictions[row]))
        for row in range(len(X_test))
    }

    loss, accuracy = self.model.evaluate(X_test, Y_test)
    print()
    print("Loss = ", loss)
    print("Test accuracy = " + str(accuracy*100) + "%")

    # The class name is used as the classifier label in the reports.
    model_name = type(self).__name__
    reference = test_data_file[1]
    evaluation.evaluate(pred_dict, reference, model_name)
    evaluation.confusion(pred_dict, reference, model_name)
    return predictions, Y_test
# Features fed to the classifier: tF2, tF3 and tF4 joined column-wise.
# Earlier experiments (other tF* combinations, single feature blocks, and a
# model loaded from 'F3_and_SVM.pkl') were left commented out and are
# omitted here.
Xt = np.concatenate((tF2, tF3, tF4), axis=1)
ans_num = model.predict(Xt)

# Translate the numeric class ids returned by the model into label strings.
array_to_labels = {0: "positive", 2: "negative", 1: "neutral"}
labels = [array_to_labels[class_id] for class_id in ans_num]

# Pair each key of testdic with its predicted label, in iteration order.
predictions = dict(zip(testdic.keys(), labels))

evaluation.evaluate(predictions, testset, classifier)
evaluation.confusion(predictions, testset, classifier)
# Train a classifier on the prepared training features, then load, clean and
# score the test reviews, and finally show a confusion report and a histogram
# of the predicted ratings.
#
# flush=True on the progress messages: they end without a newline, so on a
# buffered stdout they would otherwise only show up together with 'Done.'.
print('Training the model... ', end='', flush=True)
# MultinomialNB() was the alternative estimator tried here.
classifier = LogisticRegression()
model = classifier.fit(features, train['rating'])
print('Done.')

print('Loading test data... ', end='', flush=True)
df2 = pd.read_csv('data_test.csv')
print('Done.')

print('Processing test data... ', end='', flush=True)
test = pd.DataFrame()
test['rating'] = df2['overall']
test['review'] = df2['reviewText']
# Drop rows missing either the review text or the rating.  The explicit
# .copy() makes `test` an independent frame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning (the previous version
# assigned into the result of a boolean-mask filter).
test = test.dropna(subset=['review', 'rating']).copy()
test['review'] = test['review'].apply(clean_review)
# Reuse the already-fitted vectorizer: transform only, never re-fit on test.
test_features = vectorizer.transform(test['review'].tolist())
print('Done.')

print('Testing model... ', end='', flush=True)
predict = model.predict(test_features)
test['predict'] = predict
# Optional export of the per-row predictions:
# test.to_csv('reviews_home_binary_test_predict_LogisticRegression.csv', index=False)
print('Done.')

confusion(predict, test['rating'])
plt.hist(predict, bins=range(0, 3))
plt.show()
wv_mean = test.tweet.apply(tweet_vectors, args=(300, 'mean')).apply(pd.Series) test = pd.concat([test, wv_sum, wv_mean], axis=1) X_test = scaler.transform( test.drop(['id', 'tweet', 'target', 'label'], axis=1)) elif classifier == 'combined_model': for func in process_funcs: data.tweet = data.tweet.apply(func) X_test = tfidf_vect.transform(test.tweet) test['count_pos'], test['count_neg'], \ test['count_neu']= zip(*test.tweet.apply(count_opinon_lexicons2, args=(wp, wn, stopWords))) wv_sum_test = test.tweet.apply(tweet_vectors, args=(300, 'sum')).apply(pd.Series) wv_mean_test = test.tweet.apply(tweet_vectors, args=(300, 'mean')).apply(pd.Series) counts_test = scaler.transform( test[['count_pos', 'count_neg', 'count_neu']]) X_test = hstack( [X_test, wv_sum_test.values, wv_mean_test.values, counts_test]) # creating predictions dictionary test_pred = sentiment_clf.predict(X=X_test) target_names = list(target_dict.keys()) predictions = dict( zip(test['id'].values, map(lambda x: target_names[x], test_pred))) evaluation.evaluate(predictions, test_path, classifier) evaluation.confusion(predictions, test_path, classifier)
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt

# Load the data: positional columns 1..28 are the features, 'Class' is the
# target.
df = pd.read_csv('creditcard.csv')
X = df.iloc[:, 1:29]
y = df['Class']

# A single stratified split (30% held out) with a fixed seed.  Only one
# split is requested, so the first pair of index arrays is taken directly.
spl = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
idx_train, idx_test = next(spl.split(X, y))
X_train, X_test = X.iloc[idx_train], X.iloc[idx_test]
y_train, y_test = y.iloc[idx_train], y.iloc[idx_test]

# Fit on the training part.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Hard predictions and per-class probabilities for the held-out part.
y_predict = classifier.predict(X_test)
y_score = classifier.predict_proba(X_test)

import evaluation as ev
ev.confusion(y_predict, y_test)

# NOTE(review): column 0 of predict_proba is the first entry of
# classifier.classes_ -- confirm this is the score KS_chart expects.
xy = ev.KS_chart(y_score[:, 0], y_test)

# Plot columns 1 and 2 of the chart data against column 0.
for curve_column in (1, 2):
    plt.plot(xy[:, 0], xy[:, curve_column])
plt.show()
) #get the user_ids in test file for produce the dictionary of Predictions sens_test = pre_text(test) #preprocess test data if classifier == 'Lexicon_SVM': test_feature = get_lexicon_feature( sens_test) #produce features for test elif classifier == 'Ngram_tfidf_MultinomialNB': ngram_test = get_ngram_sentence(sens_test) test_feature = get_ngram_tfidf( ngram_test) #produce features for test # elif classifier == 'word2vec': # test_feature = getAvgFeatureVecs(sens_test, model, num_features) else: test_feature = get_lexicon_feature( sens_test) #produce features for test test_pred = clf.predict( test_feature ) #use trained model and test feature to get predicted sentiments print( np.mean(test_pred == test["sentiment"])) #calculate the accuaracy of prediction predictions = dict(zip(test_id, test_pred)) #get the dictionary of Predictions evaluation.evaluate(predictions, testset, classifier) # calculate f1 evaluation.confusion(predictions, testset, classifier) # calculate confusion metrix # print(test_pred) end = time.time() #record end time print("running time is", end - start, "second") #print running time
# Train the sentiment classifier.  Each train_set entry is indexed as
# (label, text): element 1 is vectorised, element 0 is the target.
train_vect = vect.fit_transform([t[1] for t in train_set])
train_labels = [t[0] for t in train_set]

# Each vectoriser name is paired with the estimator used for it; any other
# name falls back to MultinomialNB.
if classifier == "HashingVectorizer":
    classif = Perceptron(max_iter=300)
elif classifier == "TfidfVectorizer":
    classif = BernoulliNB()
else:
    classif = MultinomialNB()

# Fit once, before the loop.  The training data does not change between
# test sets, so re-fitting inside the loop (as was done previously) only
# repeated the same work for every test set.
classif.fit(train_vect, train_labels)

for testset in testsets.testsets:
    # Prepare and vectorise the tweets of this test set with the fitted
    # vectoriser (transform only).
    test = prep(testset)
    test_features = vect.transform([t[1] for t in test])

    # Predict a sentiment for every test tweet.
    predictions = classif.predict(test_features)

    # Map each tweet id to its predicted sentiment.
    id_list = ids(testset)
    diction = dict(zip(id_list, list(predictions)))

    evaluation.evaluate(diction, testset, classifier)
    evaluation.confusion(diction, testset, classifier)