def test_aggregated_sentiments():
    sub_clf = classifier.get_optimal_subjectivity_classifier()
    pol_clf = classifier.get_optimal_polarity_classifier()
    tweets = utils.get_pickles(2)
    sentimentvalues = utils.get_sentimentvalues(2)
    # splitvalue=1.0 keeps the whole dataset as the training split.
    sub_train_tweets, sub_train_targets, _, _, sub_train_sentiments, _ = utils.make_subjectivity_train_and_test_and_targets(
        tweets, sentimentvalues, splitvalue=1.0
    )
    pol_train_tweets, pol_train_targets, _, _, pol_train_sentiments, _ = utils.make_polarity_train_and_test_and_targets(
        tweets, sentimentvalues, splitvalue=1.0
    )
    sub_predictions = sub_clf.classify(sub_train_tweets, sub_train_sentiments)
    pol_predictions = pol_clf.classify(pol_train_tweets, pol_train_sentiments)
    print pol_train_targets, pol_predictions
    days, targets, predicts, total_frequencies = utils.temporally_aggregate_subjectivity(
        sub_train_tweets, sub_predictions, targets=sub_train_targets
    )
    data = {
        "Targets": [days, targets],
        "Predictions": [days, predicts],
        "Frequencies": [days, total_frequencies],
    }
    plotting.plot_subjectivity_aggregates(data, "aggregated_subjectivity")
    days, targets, predicts, frequencies = utils.temporally_aggregate_polarity(
        pol_train_tweets, pol_predictions, targets=pol_train_targets
    )
    # Normalize per-day polarity sums to averages, and per-day polarity
    # frequencies to fractions of that day's total tweet volume.
    for i in range(len(days)):
        targets[i] = targets[i] * 1.0 / frequencies[i]
        predicts[i] = predicts[i] * 1.0 / frequencies[i]
        frequencies[i] = frequencies[i] * 1.0 / total_frequencies[i]
    data = {
        "Targets": [days, targets],
        "Predictions": [days, predicts],
        "Frequencies": [days, frequencies],
    }
    plotting.plot_polarity_aggregates(data, "aggregated_polarity")
def perform_grid_search_on_featureset_SA_and_PA():
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)
    # Grid search on text features for each classifier, first on the
    # subjectivity task (feature set SA), then on polarity (PA).
    train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = utils.make_subjectivity_train_and_test_and_targets(
        tweets, sentimentvalues
    )
    for model in (SVM, NB, ME):
        clf = model(train_tweets, train_targets, None)
        clf.set_feature_set("SA", None)
        clf.grid_search_on_text_features(file_postfix="subjectivity")
    train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = utils.make_polarity_train_and_test_and_targets(
        tweets, sentimentvalues
    )
    for model in (SVM, NB, ME):
        clf = model(train_tweets, train_targets, None)
        clf.set_feature_set("PA", None)
        clf.grid_search_on_text_features(file_postfix="polarity")
def google_lexicon_lookup():
    """
    Fetches the tweets and performs lexicon translation and lookup.
    """
    # One sentiment-value pickle per dataset: 0=random, 1=rosenborg, 2=erna.
    datasets = [
        (0, "models/google_sentimentvalues_random_dataset"),
        (1, "models/google_sentimentvalues_rosenborg_dataset"),
        (2, "models/google_sentimentvalues_erna_dataset"),
    ]
    for datasetnr, path in datasets:
        tweets = utils.get_pickles(datasetnr)
        words_with_values = lexicon.perform_google_sentiment_lexicon_lookup(tweets)
        print "Storing..."
        utils.store_sentimentvalues(words_with_values, path)
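# Hedged round-trip sketch: the values stored above appear to be read back
# via utils.get_sentimentvalues, as in test_aggregated_sentiments (the
# dataset index below is illustrative):
#
#   google_lexicon_lookup()
#   sentimentvalues = utils.get_sentimentvalues(2)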
def train_and_store_results(self):
    """
    Trains the wrapped model on the dataset with the configured feature set
    and stores the results of the runs.
    """
    dataset = 0  # index of the random dataset (cf. google_lexicon_lookup)
    tweets = utils.get_pickles(dataset)
    self.model.set_feature_set('A')
    self.model.train_on_feature_set()
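    # A minimal sketch of the fuller sweep the docstring hints at, assuming
    # set_feature_set accepts the other feature-set codes used in this module
    # (illustrative only):
    #
    #   for feature_set in ('A', 'SA', 'SB', 'SC', 'PA', 'PB', 'PC'):
    #       self.model.set_feature_set(feature_set)
    #       self.model.train_on_feature_set()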
def pos_analyze():
    """
    Unpickles preprocessed tweets and performs POS analysis on them,
    then stores the stats in a diagram.
    """
    tweets = utils.get_pickles(3)
    subjectivity_data, polarity_data = pos_tag_analyze(tweets)
    plotting.plot_pos_analysis(subjectivity_data, "sub_analysis")
    plotting.plot_pos_analysis(polarity_data, "pol_analysis")
    return True
def train_and_test_subjectivity_and_polarity():
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)
    # train_subjectivity_and_test_on_feature_set(tweets, 'SA', sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SB", sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SC", sentimentvalues)
def re_analyze():
    """
    Unpickles preprocessed tweets, performs re-analysis of them, and stores the stats.
    """
    labels = ["random", '"rosenborg"', '"erna solberg"']
    data = {}
    worddata = {}
    for i in xrange(3):
        tweets = utils.get_pickles(i)
        analyzer = Analyzer(utils.annotated_datasets[i], tweets)
        avg_list, words_list = analyzer.analyze()
        print avg_list
        worddata[labels[i]] = words_list
        data[labels[i]] = avg_list
    plotting.average_wordclasses(worddata, "averages")
    plotting.detailed_average_wordclasses(data, "averages2")
def get_optimal_polarity_classifier():
    """
    Trains and returns the optimal polarity classifier.
    """
    tweets = utils.get_pickles(3)
    tweets, targets = utils.make_polarity_targets(tweets)
    vect_options = {
        'ngram_range': (1, 1),
        'max_df': 0.5,
    }
    tfidf_options = {
        'sublinear_tf': False,
        'use_idf': True,
        'smooth_idf': True,
    }
    clf = SVM(tweets, targets, vect_options, tfidf_options)
    clf.set_feature_set('PC2', features.get_google_sentiment_values(3))
    clf.train_on_feature_set()
    return clf
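# Hedged usage sketch, mirroring how the classifier is exercised in
# test_aggregated_sentiments (dataset index and variable names illustrative):
#
#   pol_clf = get_optimal_polarity_classifier()
#   tweets = utils.get_pickles(2)
#   sentimentvalues = utils.get_sentimentvalues(2)
#   predictions = pol_clf.classify(tweets, sentimentvalues)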
def train_and_test_dataset_increase():
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)
    # One accuracy and one F1 series per classifier/feature-set combination.
    subjectivity_sets = ["SA", "SB", "SC"]
    polarity_sets = ["PA", "PB", "PC"]
    classifiers = [("NB", "Naive Bayes"), ("SVM", "SVM"), ("MaxEnt", "Maximum Entropy")]
    keys = ["%s(%s)" % (abbr, fs) for abbr, _ in classifiers for fs in subjectivity_sets + polarity_sets]
    accuracy_data = dict((key, []) for key in keys)
    f1_data = dict((key, []) for key in keys)
    # Train on 5%, 10%, ..., 100% of the dataset, recording accuracy
    # (result index 0) and F1 (result index 3) for each combination.
    for i in range(5, 101, 5):
        print "=============================DATAPOINT NR. ", i, "========================================"
        for feature_set in subjectivity_sets + polarity_sets:
            if feature_set in subjectivity_sets:
                data = train_subjectivity_and_test_on_feature_set(tweets, feature_set, sentimentvalues, reduce_dataset=i * 0.01)
            else:
                data = train_polarity_and_test_on_feature_set(tweets, feature_set, sentimentvalues, reduce_dataset=i * 0.01)
            print "DATA -- ", data
            for abbr, name in classifiers:
                accuracy_data["%s(%s)" % (abbr, feature_set)].append(data[name][0])
                f1_data["%s(%s)" % (abbr, feature_set)].append(data[name][3])
        # Checkpoint the accumulated series after each increment.
        out = open("incremental_acc" + str(i), "wb")
        pickle.dump(accuracy_data, out)
        out.close()
        out = open("incremental_f1" + str(i), "wb")
        pickle.dump(f1_data, out)
        out.close()
    plotting.plot_temporal_sentiment(accuracy_data, filename="incremental_accuracy")
    plotting.plot_temporal_sentiment(f1_data, filename="incremental_f1")
    pickle.dump(data, open("topically_aggregated_polarity", "wb"))


def preprocess_temporal_dataset():
    tweetlines = utils.get_dataset(utils.complete_datasets[3])
    tweets = []
    for line in tweetlines:
        if len(line) > 1:
            tweets.append(tweet.to_tweet(line))
    tweets = preprocessing.preprocess_tweets(tweets)
    sentiments = lexicon.perform_google_sentiment_lexicon_lookup(tweets)
    pickle.dump(sentiments, open("temporal_sentiments", "wb"))
    pickle.dump(tweets, open("temporal_tweets2", "wb"))


if __name__ == "__main__":
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)
    train_subjectivity_and_test_on_feature_set(tweets, "SA", sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SB", sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SC", sentimentvalues)
    train_polarity_and_test_on_feature_set(tweets, "PA", sentimentvalues)
    train_polarity_and_test_on_feature_set(tweets, "PB", sentimentvalues)
    train_polarity_and_test_on_feature_set(tweets, "PC", sentimentvalues)