def perform_grid_search_on_featureset_SA_and_PA():
    # Load and preprocess dataset nr. 3 along with its sentiment values.
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    # Grid search on text features for the subjectivity task (feature set SA).
    # Only the training split is used here; the test split is left untouched.
    train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = utils.make_subjectivity_train_and_test_and_targets(
        tweets, sentimentvalues
    )
    clf = SVM(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")
    clf = NB(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")
    clf = ME(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")

    # Grid search on text features for the polarity task (feature set PA).
    train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = utils.make_polarity_train_and_test_and_targets(
        tweets, sentimentvalues
    )
    clf = SVM(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
    clf = NB(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
    clf = ME(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
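

# The three grid-search blocks above differ only in classifier type. A minimal
# consolidation sketch (illustrative, not part of the original pipeline): it
# assumes only the SVM/NB/ME constructors and the set_feature_set /
# grid_search_on_text_features calls already used above.
def grid_search_all_classifiers(train_tweets, train_targets, feature_set, file_postfix):
    for classifier_class in (SVM, NB, ME):
        clf = classifier_class(train_tweets, train_targets, None)
        clf.set_feature_set(feature_set, None)
        clf.grid_search_on_text_features(file_postfix=file_postfix)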
def train_and_test_subjectivity_and_polarity():
    # Load and preprocess dataset nr. 3 along with its sentiment values.
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    # Train and test the subjectivity classifiers on feature sets SB and SC.
    # train_subjectivity_and_test_on_feature_set(tweets, 'SA', sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SB", sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SC", sentimentvalues)
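

# Sketch of how a single run's results can be inspected. The metrics layout is
# an assumption inferred from train_and_test_dataset_increase below, which
# reads index 0 of each classifier's metrics as accuracy and index 3 as F1;
# the helper name is illustrative and not part of the original code.
def print_headline_metrics(results):
    for name in ("Naive Bayes", "SVM", "Maximum Entropy"):
        print name, "accuracy:", results[name][0], "F1:", results[name][3]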
def train_and_test_dataset_increase():
    # Load and preprocess dataset nr. 3 along with its sentiment values.
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    # Classifier names as returned by the train-and-test helpers, paired with
    # the short labels used as keys in the result dicts and plots.
    classifiers = [("Naive Bayes", "NB"), ("SVM", "SVM"), ("Maximum Entropy", "MaxEnt")]
    subjectivity_sets = ["SA", "SB", "SC"]
    polarity_sets = ["PA", "PB", "PC"]

    # One accuracy series and one F1 series per classifier/feature-set
    # combination, e.g. "NB(SA)", "SVM(PB)", "MaxEnt(PC)".
    accuracy_data = {}
    f1_data = {}
    for feature_set in subjectivity_sets + polarity_sets:
        for _, label in classifiers:
            accuracy_data[label + "(" + feature_set + ")"] = []
            f1_data[label + "(" + feature_set + ")"] = []

    # Retrain on 5%, 10%, ..., 100% of the dataset and record accuracy
    # (index 0) and F1 (index 3) for every classifier and feature set.
    for i in range(5, 101, 5):
        print "=============================DATAPOINT NR. ", i, "========================================"
        for feature_set in subjectivity_sets:
            data = train_subjectivity_and_test_on_feature_set(tweets, feature_set, sentimentvalues, reduce_dataset=i * 0.01)
            print "DATA -- ", data
            for name, label in classifiers:
                accuracy_data[label + "(" + feature_set + ")"].append(data[name][0])
                f1_data[label + "(" + feature_set + ")"].append(data[name][3])
        for feature_set in polarity_sets:
            data = train_polarity_and_test_on_feature_set(tweets, feature_set, sentimentvalues, reduce_dataset=i * 0.01)
            print "DATA -- ", data
            for name, label in classifiers:
                accuracy_data[label + "(" + feature_set + ")"].append(data[name][0])
                f1_data[label + "(" + feature_set + ")"].append(data[name][3])

        # Checkpoint the collected series after each datapoint.
        out = open("incremental_acc" + str(i), "wb")
        pickle.dump(accuracy_data, out)
        out.close()
        out = open("incremental_f1" + str(i), "wb")
        pickle.dump(f1_data, out)
        out.close()

    plotting.plot_temporal_sentiment(accuracy_data, filename="incremental_accuracy")
    plotting.plot_temporal_sentiment(f1_data, filename="incremental_f1")
    pickle.dump(data, open("topically_aggregated_polarity", "wb"))


def preprocess_temporal_dataset():
    # Read the raw temporal dataset, convert each non-empty line to a tweet
    # object, preprocess, and look up sentiment scores with the Google
    # sentiment lexicon. Both tweets and sentiments are pickled for later use.
    tweetlines = utils.get_dataset(utils.complete_datasets[3])
    tweets = []
    for line in tweetlines:
        if len(line) > 1:
            tweets.append(tweet.to_tweet(line))
    tweets = preprocessing.preprocess_tweets(tweets)
    sentiments = lexicon.perform_google_sentiment_lexicon_lookup(tweets)
    pickle.dump(sentiments, open("temporal_sentiments", "wb"))
    pickle.dump(tweets, open("temporal_tweets2", "wb"))


if __name__ == "__main__":
    # Load and preprocess dataset nr. 3, then run the train/test routines for
    # all subjectivity (SA-SC) and polarity (PA-PC) feature sets.
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    train_subjectivity_and_test_on_feature_set(tweets, "SA", sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SB", sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SC", sentimentvalues)
    train_polarity_and_test_on_feature_set(tweets, "PA", sentimentvalues)
    train_polarity_and_test_on_feature_set(tweets, "PB", sentimentvalues)
    train_polarity_and_test_on_feature_set(tweets, "PC", sentimentvalues)
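

# Sketch for reusing the checkpoints written by train_and_test_dataset_increase:
# reload a pickled datapoint (e.g. the 100% run) and redraw the plots without
# retraining. The function name and the default datapoint are illustrative,
# not part of the original code.
def replot_incremental_results(datapoint=100):
    accuracy_data = pickle.load(open("incremental_acc" + str(datapoint), "rb"))
    f1_data = pickle.load(open("incremental_f1" + str(datapoint), "rb"))
    plotting.plot_temporal_sentiment(accuracy_data, filename="incremental_accuracy")
    plotting.plot_temporal_sentiment(f1_data, filename="incremental_f1")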