Example No. 1
def test_aggregated_sentiments():
    sub_clf = classifier.get_optimal_subjectivity_classifier()
    pol_clf = classifier.get_optimal_polarity_classifier()
    tweets = utils.get_pickles(2)
    sentimentvalues = utils.get_sentimentvalues(2)
    sub_train_tweets, sub_train_targets, _, _, sub_train_sentiments, _ = utils.make_subjectivity_train_and_test_and_targets(
        tweets, sentimentvalues, splitvalue=1.0
    )
    pol_train_tweets, pol_train_targets, _, _, pol_train_sentiments, _ = utils.make_polarity_train_and_test_and_targets(
        tweets, sentimentvalues, splitvalue=1.0
    )

    sub_predictions = sub_clf.classify(sub_train_tweets, sub_train_sentiments)
    pol_predictions = pol_clf.classify(pol_train_tweets, pol_train_sentiments)
    print pol_train_targets, pol_predictions
    days, targets, predicts, total_frequencies = utils.temporally_aggregate_subjectivity(
        sub_train_tweets, sub_predictions, targets=sub_train_targets
    )
    data = {"Targets": [days, targets], "Predictions": [days, predicts], "Frequencies": [days, total_frequencies]}
    plotting.plot_subjectivity_aggregates(data, "aggregated_subjectivity")
    days, targets, predicts, frequencies = utils.temporally_aggregate_polarity(
        pol_train_tweets, pol_predictions, targets=pol_train_targets
    )
    for i in range(len(days)):
        targets[i] = targets[i] * 1.0 / frequencies[i]
        predicts[i] = predicts[i] * 1.0 / frequencies[i]
        frequencies[i] = frequencies[i] * 1.0 / total_frequencies[i]
    data = {"Targets": [days, targets], "Predictions": [days, predicts], "Frequencies": [days, frequencies]}
    plotting.plot_polarity_aggregates(data, "aggregated_polarity")
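The final loop converts the aggregated sums into per-day averages before plotting: reading targets[i] and predicts[i] as the summed polarity for day i, frequencies[i] as the number of tweets in that day's polarity aggregate, and total_frequencies[i] as that day's total from the subjectivity aggregation. A minimal numeric sketch of that normalization, with hypothetical values for one day:

# Hypothetical values for a single day i (illustrative only, not from the datasets):
day_polarity_sum = 2.0   # targets[i]: summed polarity for the day
day_polar_count = 4      # frequencies[i]: tweets in the day's polarity aggregate
day_total_count = 10     # total_frequencies[i]: the day's total from the subjectivity aggregation

average_polarity = day_polarity_sum * 1.0 / day_polar_count     # 0.5, plotted as "Targets"
relative_frequency = day_polar_count * 1.0 / day_total_count    # 0.4, plotted as "Frequencies"
print average_polarity, relative_frequency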
Example No. 2
def perform_grid_search_on_featureset_SA_and_PA():
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = utils.make_subjectivity_train_and_test_and_targets(
        tweets, sentimentvalues
    )

    clf = SVM(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")
    clf = NB(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")
    clf = ME(train_tweets, train_targets, None)
    clf.set_feature_set("SA", None)
    clf.grid_search_on_text_features(file_postfix="subjectivity")

    train_tweets, train_targets, test_tweets, test_targets, train_sentimentvalues, test_sentimentvalues = utils.make_polarity_train_and_test_and_targets(
        tweets, sentimentvalues
    )

    clf = SVM(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
    clf = NB(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
    clf = ME(train_tweets, train_targets, None)
    clf.set_feature_set("PA", None)
    clf.grid_search_on_text_features(file_postfix="polarity")
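The six classifier setups above repeat the same three lines per model. A compact, behavior-equivalent sketch of that pattern (the helper name grid_search_all is not part of the original code):

def grid_search_all(train_tweets, train_targets, feature_set, postfix):
    # Run the same text-feature grid search for each of the three classifier classes.
    for model in (SVM, NB, ME):
        clf = model(train_tweets, train_targets, None)
        clf.set_feature_set(feature_set, None)
        clf.grid_search_on_text_features(file_postfix=postfix)

With it, the two blocks above reduce to grid_search_all(train_tweets, train_targets, "SA", "subjectivity") and grid_search_all(train_tweets, train_targets, "PA", "polarity").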
def google_lexicon_lookup():
    """
    Fetches the tweets and performs lexicon translation and lookup.
    """
    tweets = utils.get_pickles(0)
    words_with_values = lexicon.perform_google_sentiment_lexicon_lookup(tweets)
    print "Storing..."
    utils.store_sentimentvalues(words_with_values, "models/google_sentimentvalues_random_dataset")
    tweets = utils.get_pickles(1)
    words_with_values = lexicon.perform_google_sentiment_lexicon_lookup(tweets)
    print "Storing..."
    utils.store_sentimentvalues(words_with_values, "models/google_sentimentvalues_rosenborg_dataset")
    tweets = utils.get_pickles(2)
    words_with_values = lexicon.perform_google_sentiment_lexicon_lookup(tweets)
    print "Storing..."
    utils.store_sentimentvalues(words_with_values, "models/google_sentimentvalues_erna_dataset")
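utils.store_sentimentvalues writes the lookup results under models/; assuming it simply pickles its first argument to the given path (an assumption, since the helper's body is not shown here), a stored file could be reloaded directly:

import pickle

# Assumption: store_sentimentvalues pickles its first argument to the given path.
infile = open("models/google_sentimentvalues_erna_dataset", "rb")
words_with_values = pickle.load(infile)
infile.close()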
Example No. 4
def train_and_store_results(self):
    """
    Trains the given model on the dataset using the three different models, and different feature sets. Stores the results of the runs.
    """
    dataset = "random_dataset"
    tweets = utils.get_pickles(dataset)
    self.model.set_feature_set('A')
    self.model.train_on_feature_set()
def pos_analyze():
    """
    Unpickles preprocessed tweets, performs POS analysis of them, and then stores the stats in a diagram.
    """
    tweets = utils.get_pickles(3)
    subjectivity_data, polarity_data = pos_tag_analyze(tweets)
    plotting.plot_pos_analysis(subjectivity_data, "sub_analysis")
    plotting.plot_pos_analysis(polarity_data, "pos_analysis")
    return True
Example No. 6
def train_and_test_subjectivity_and_polarity():
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    #    train_subjectivity_and_test_on_feature_set(tweets, 'SA', sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SB", sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SC", sentimentvalues)
def re_analyze():
    """
    Unpickles preprocessed tweets, performs re-analysis of these, and then stores the stats.
    """
    labels = ["random", '"rosenborg"', '"erna solberg"']
    data = {}
    worddata = {}
    for i in xrange(3):
        tweets = utils.get_pickles(i)
        analyzer = Analyzer(utils.annotated_datasets[i], tweets)
        
        avg_list, words_list = analyzer.analyze()
        print avg_list
        worddata[labels[i]] = words_list
        data[labels[i]] = avg_list
    plotting.average_wordclasses(worddata, "averages")

    plotting.detailed_average_wordclasses(data, "averages2")
Example No. 8
def get_optimal_polarity_classifier():
    """
    Trains and returns the optimal polarity classifier.
    """
    tweets = utils.get_pickles(3)
    tweets, targets = utils.make_polarity_targets(tweets)
    vect_options = {
        'ngram_range': (1, 1),
        'max_df': 0.5
    }
    tfidf_options = {
        'sublinear_tf': False,
        'use_idf': True,
        'smooth_idf': True
    }
    clf = SVM(tweets, targets, vect_options, tfidf_options)
    clf.set_feature_set('PC2', features.get_google_sentiment_values(3))
    clf.train_on_feature_set()
    return clf
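Example No. 1 shows how the returned classifier is used; a minimal usage sketch along those lines (the dataset number 2 and splitvalue=1.0 are taken from that example, not from this one):

pol_clf = get_optimal_polarity_classifier()
tweets = utils.get_pickles(2)
sentimentvalues = utils.get_sentimentvalues(2)
pol_tweets, pol_targets, _, _, pol_sentiments, _ = utils.make_polarity_train_and_test_and_targets(
    tweets, sentimentvalues, splitvalue=1.0
)
# Classify the whole dataset and compare against the annotated targets.
predictions = pol_clf.classify(pol_tweets, pol_sentiments)
print pol_targets, predictions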
Example No. 9
def train_and_test_dataset_increase():
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)
    accuracy_data = {
        "NB(SA)": [],
        "NB(SB)": [],
        "NB(SC)": [],
        "SVM(SA)": [],
        "SVM(SB)": [],
        "SVM(SC)": [],
        "MaxEnt(SA)": [],
        "MaxEnt(SB)": [],
        "MaxEnt(SC)": [],
        "NB(PA)": [],
        "NB(PB)": [],
        "NB(PC)": [],
        "SVM(PA)": [],
        "SVM(PB)": [],
        "SVM(PC)": [],
        "MaxEnt(PA)": [],
        "MaxEnt(PB)": [],
        "MaxEnt(PC)": [],
    }
    f1_data = {
        "NB(SA)": [],
        "NB(SB)": [],
        "NB(SC)": [],
        "SVM(SA)": [],
        "SVM(SB)": [],
        "SVM(SC)": [],
        "MaxEnt(SA)": [],
        "MaxEnt(SB)": [],
        "MaxEnt(SC)": [],
        "NB(PA)": [],
        "NB(PB)": [],
        "NB(PC)": [],
        "SVM(PA)": [],
        "SVM(PB)": [],
        "SVM(PC)": [],
        "MaxEnt(PA)": [],
        "MaxEnt(PB)": [],
        "MaxEnt(PC)": [],
    }
    for i in range(5, 101, 5):
        print "=============================DATAPOINT NR. ", i, "========================================"
        data = train_subjectivity_and_test_on_feature_set(tweets, "SA", sentimentvalues, reduce_dataset=i * 0.01)
        print "DATA -- ", data
        accuracy_data["NB(SA)"].append(data["Naive Bayes"][0])
        f1_data["NB(SA)"].append(data["Naive Bayes"][3])
        accuracy_data["SVM(SA)"].append(data["SVM"][0])
        f1_data["SVM(SA)"].append(data["SVM"][3])
        accuracy_data["MaxEnt(SA)"].append(data["Maximum Entropy"][0])
        f1_data["MaxEnt(SA)"].append(data["Maximum Entropy"][3])

        data = train_subjectivity_and_test_on_feature_set(tweets, "SB", sentimentvalues, reduce_dataset=i * 0.01)
        print "DATA -- ", data
        accuracy_data["NB(SB)"].append(data["Naive Bayes"][0])
        f1_data["NB(SB)"].append(data["Naive Bayes"][3])
        accuracy_data["SVM(SB)"].append(data["SVM"][0])
        f1_data["SVM(SB)"].append(data["SVM"][3])
        accuracy_data["MaxEnt(SB)"].append(data["Maximum Entropy"][0])
        f1_data["MaxEnt(SB)"].append(data["Maximum Entropy"][3])

        data = train_subjectivity_and_test_on_feature_set(tweets, "SC", sentimentvalues, reduce_dataset=i * 0.01)
        print "DATA -- ", data
        accuracy_data["NB(SC)"].append(data["Naive Bayes"][0])
        f1_data["NB(SC)"].append(data["Naive Bayes"][3])
        accuracy_data["SVM(SC)"].append(data["SVM"][0])
        f1_data["SVM(SC)"].append(data["SVM"][3])
        accuracy_data["MaxEnt(SC)"].append(data["Maximum Entropy"][0])
        f1_data["MaxEnt(SC)"].append(data["Maximum Entropy"][3])

        data = train_polarity_and_test_on_feature_set(tweets, "PA", sentimentvalues, reduce_dataset=i * 0.01)
        print "DATA -- ", data
        accuracy_data["NB(PA)"].append(data["Naive Bayes"][0])
        f1_data["NB(PA)"].append(data["Naive Bayes"][3])
        accuracy_data["SVM(PA)"].append(data["SVM"][0])
        f1_data["SVM(PA)"].append(data["SVM"][3])
        accuracy_data["MaxEnt(PA)"].append(data["Maximum Entropy"][0])
        f1_data["MaxEnt(PA)"].append(data["Maximum Entropy"][3])

        data = train_polarity_and_test_on_feature_set(tweets, "PB", sentimentvalues, reduce_dataset=i * 0.01)
        print "DATA -- ", data
        accuracy_data["NB(PB)"].append(data["Naive Bayes"][0])
        f1_data["NB(PB)"].append(data["Naive Bayes"][3])
        accuracy_data["SVM(PB)"].append(data["SVM"][0])
        f1_data["SVM(PB)"].append(data["SVM"][3])
        accuracy_data["MaxEnt(PB)"].append(data["Maximum Entropy"][0])
        f1_data["MaxEnt(PB)"].append(data["Maximum Entropy"][3])

        data = train_polarity_and_test_on_feature_set(tweets, "PC", sentimentvalues, reduce_dataset=i * 0.01)
        print "DATA -- ", data
        accuracy_data["NB(PC)"].append(data["Naive Bayes"][0])
        f1_data["NB(PC)"].append(data["Naive Bayes"][3])
        accuracy_data["SVM(PC)"].append(data["SVM"][0])
        f1_data["SVM(PC)"].append(data["SVM"][3])
        accuracy_data["MaxEnt(PC)"].append(data["Maximum Entropy"][0])
        f1_data["MaxEnt(PC)"].append(data["Maximum Entropy"][3])
        out = open("incremental_acc" + str(i), "wb")
        pickle.dump(accuracy_data, out)
        out = open("incremental_f1" + str(i), "wb")
        pickle.dump(f1_data, out)
    plotting.plot_temporal_sentiment(accuracy_data, filename="incremental_accuracy")
    plotting.plot_temporal_sentiment(f1_data, filename="incremental_f1")
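The loop pickles intermediate accuracy and F1 dictionaries at every step; a minimal sketch of reloading one such checkpoint for re-plotting (the suffix 100 is simply the last value of i; any checkpoint written by the loop would work):

# Reload a pickled checkpoint written by the loop above and re-plot it.
acc_file = open("incremental_acc100", "rb")
accuracy_data = pickle.load(acc_file)
acc_file.close()
f1_file = open("incremental_f1100", "rb")
f1_data = pickle.load(f1_file)
f1_file.close()
plotting.plot_temporal_sentiment(accuracy_data, filename="incremental_accuracy")
plotting.plot_temporal_sentiment(f1_data, filename="incremental_f1")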
Example No. 10
    pickle.dump(data, open("topically_aggregated_polarity", "wb"))


def preprocess_temporal_dataset():
    tweetlines = utils.get_dataset(utils.complete_datasets[3])
    tweets = []
    for line in tweetlines:
        if len(line) > 1:
            tweets.append(tweet.to_tweet(line))
    tweets = preprocessing.preprocess_tweets(tweets)
    sentiments = lexicon.perform_google_sentiment_lexicon_lookup(tweets)
    pickle.dump(sentiments, open("temporal_sentiments", "wb"))
    pickle.dump(tweets, open("temporal_tweets2", "wb"))


if __name__ == "__main__":
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    train_subjectivity_and_test_on_feature_set(tweets, "SA", sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SB", sentimentvalues)
    train_subjectivity_and_test_on_feature_set(tweets, "SC", sentimentvalues)

    train_polarity_and_test_on_feature_set(tweets, "PA", sentimentvalues)
    train_polarity_and_test_on_feature_set(tweets, "PB", sentimentvalues)
    train_polarity_and_test_on_feature_set(tweets, "PC", sentimentvalues)