# The functions below rely on this module's top-level imports; judging from
# the names used here, those are along the lines of (paths are assumptions):
#   import utils, classifier, plotting, preprocessing
#   import feat_utils
#   from classifiers import SVM, NB, ME

def test_aggregated_sentiments():
    """Classify the whole dataset with the optimal subjectivity and polarity
    classifiers, then plot the temporally aggregated results."""
    sub_clf = classifier.get_optimal_subjectivity_classifier()
    pol_clf = classifier.get_optimal_polarity_classifier()
    tweets = utils.get_pickles(2)
    sentimentvalues = utils.get_sentimentvalues(2)
    # splitvalue=1.0 places every tweet in the training partition, so the
    # classifiers run over the full dataset.
    sub_train_tweets, sub_train_targets, _, _, sub_train_sentiments, _ = \
        utils.make_subjectivity_train_and_test_and_targets(
            tweets, sentimentvalues, splitvalue=1.0
        )
    pol_train_tweets, pol_train_targets, _, _, pol_train_sentiments, _ = \
        utils.make_polarity_train_and_test_and_targets(
            tweets, sentimentvalues, splitvalue=1.0
        )
    sub_predictions = sub_clf.classify(sub_train_tweets, sub_train_sentiments)
    pol_predictions = pol_clf.classify(pol_train_tweets, pol_train_sentiments)
    print pol_train_targets, pol_predictions

    # Aggregate subjectivity per day and plot targets, predictions and
    # per-day tweet counts.
    days, targets, predicts, total_frequencies = utils.temporally_aggregate_subjectivity(
        sub_train_tweets, sub_predictions, targets=sub_train_targets
    )
    data = {
        "Targets": [days, targets],
        "Predictions": [days, predicts],
        "Frequencies": [days, total_frequencies],
    }
    plotting.plot_subjectivity_aggregates(data, "aggregated_subjectivity")

    # Aggregate polarity per day. The summed polarity values are normalized
    # to per-day averages, and the polar-tweet counts to frequencies relative
    # to all tweets that day.
    days, targets, predicts, frequencies = utils.temporally_aggregate_polarity(
        pol_train_tweets, pol_predictions, targets=pol_train_targets
    )
    for i in range(len(days)):
        targets[i] = targets[i] * 1.0 / frequencies[i]
        predicts[i] = predicts[i] * 1.0 / frequencies[i]
        frequencies[i] = frequencies[i] * 1.0 / total_frequencies[i]
    data = {
        "Targets": [days, targets],
        "Predictions": [days, predicts],
        "Frequencies": [days, frequencies],
    }
    plotting.plot_polarity_aggregates(data, "aggregated_polarity")
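# Worked example of the normalization loop above, with made-up numbers: if a
# day has a summed polarity of 12 spread over 8 polar tweets, out of 20
# tweets in total that day, the plotted values become
#   average polarity:    12 / 8  = 1.5
#   relative frequency:   8 / 20 = 0.4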
def perform_grid_search_on_featureset_SA_and_PA():
    """Grid search the text-feature parameters of the SA (subjectivity) and
    PA (polarity) feature sets for the SVM, Naive Bayes and MaxEnt models."""
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    # Subjectivity: search each classifier on the SA feature set.
    train_tweets, train_targets, _, _, _, _ = \
        utils.make_subjectivity_train_and_test_and_targets(tweets, sentimentvalues)
    for Classifier in (SVM, NB, ME):
        clf = Classifier(train_tweets, train_targets, None)
        clf.set_feature_set("SA", None)
        clf.grid_search_on_text_features(file_postfix="subjectivity")

    # Polarity: the same search on the PA feature set.
    train_tweets, train_targets, _, _, _, _ = \
        utils.make_polarity_train_and_test_and_targets(tweets, sentimentvalues)
    for Classifier in (SVM, NB, ME):
        clf = Classifier(train_tweets, train_targets, None)
        clf.set_feature_set("PA", None)
        clf.grid_search_on_text_features(file_postfix="polarity")
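# For reference, a minimal sketch of the kind of text-feature grid search a
# call like grid_search_on_text_features() presumably runs internally. The
# pipeline, parameter grid and scoring below are illustrative assumptions
# built on a modern scikit-learn, not the actual implementation.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

def example_text_feature_grid_search(texts, targets):
    # Bag-of-words -> tf-idf -> linear classifier, searched jointly.
    pipeline = Pipeline([
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LinearSVC()),
    ])
    param_grid = {
        "vect__ngram_range": [(1, 1), (1, 2)],  # unigrams vs. uni+bigrams
        "vect__max_df": [0.5, 0.75, 1.0],       # drop overly common terms
        "tfidf__use_idf": [True, False],
        "tfidf__sublinear_tf": [True, False],
    }
    search = GridSearchCV(pipeline, param_grid, scoring="f1_macro", cv=5)
    search.fit(texts, targets)
    return search.best_params_, search.best_score_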
def train_polarity_and_test_on_feature_set(tweets, feature_set, sentimentvalues, reduce_dataset=1):
    """Performs 10-fold training and testing of the polarity classifiers with
    a given feature set key; plots and returns the averaged metrics."""
    nb_results = []
    svm_results = []
    me_results = []
    for kfoldcounter in range(0, 10):
        print "--------------------------KFOLD NR ", kfoldcounter, "----------------------------------"
        # Each fold moves the train/test split point through the dataset.
        train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = \
            utils.make_polarity_train_and_test_and_targets(
                tweets, sentimentvalues, splitvalue=kfoldcounter * 0.1, reduce_dataset=reduce_dataset
            )
        # Debugging aid, kept for reference:
        # for tweet, target in zip(tweets, targets):
        #     try:
        #         print unicode(tweet.text), " ", target
        #     except UnicodeEncodeError:
        #         print tweet.text.encode('utf8'), " ", target
        #     except UnicodeDecodeError:
        #         print tweet.text, " ", target

        # Train and test NB, SVM and MaxEnt on this fold, collecting one
        # (accuracy, precision, recall, f1) tuple per classifier per fold.
        for name, Classifier, results in [("NB", NB, nb_results),
                                          ("SVM", SVM, svm_results),
                                          ("MaxEnt", ME, me_results)]:
            vect_options = {"ngram_range": (1, 1), "max_df": 0.5}
            tfidf_options = {"sublinear_tf": True, "use_idf": True, "smooth_idf": True}
            print "Training", name, "polarity with feature set", feature_set, \
                "on dataset of length", len(train_tweets)
            clf = Classifier(train_tweets, train_targets, vect_options, tfidf_options)
            clf.set_feature_set(feature_set, train_sentimentvalues)
            clf.train_on_feature_set()
            print "Testing..."
            results.append(clf.test_and_return_results(test_tweets, test_targets, test_sentimentvalues))

    print "Averages"

    def average(results):
        # Column-wise mean over the (accuracy, precision, recall, f1) tuples.
        return [sum(column) / len(column) for column in zip(*results)]

    data = {
        "Naive Bayes": average(nb_results),
        "SVM": average(svm_results),
        "Maximum Entropy": average(me_results),
    }
    plotting.plot_performance_histogram(data, "polarity_" + feature_set)
    return data
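# A possible command-line entry point for the experiments above. The dataset
# number and the "PA" feature-set key are assumptions reusing values already
# found in this module, not a documented interface.
if __name__ == "__main__":
    tweets = utils.get_pickles(3)
    sentimentvalues = feat_utils.get_sentiment_values(3)
    train_polarity_and_test_on_feature_set(tweets, "PA", sentimentvalues)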