def get_optimal_polarity_classifier():
    """
    Build, train and return the polarity classifier.

    Loads dataset 3 through ``utils.get_pickles``, derives the polarity
    targets with ``utils.make_polarity_targets`` and constructs an ``SVM``
    with fixed vectorizer and tf-idf options. The classifier is then given
    the 'PC2' feature set together with the Google sentiment values for
    dataset 3 and trained on it.

    Returns:
        The trained ``SVM`` instance.
    """
    labelled_tweets, polarity_targets = utils.make_polarity_targets(
        utils.get_pickles(3)
    )

    # Option dictionaries handed to the SVM constructor unchanged.
    vectorizer_settings = dict(ngram_range=(1, 1), max_df=0.5)
    tfidf_settings = dict(sublinear_tf=False, use_idf=True, smooth_idf=True)

    classifier = SVM(
        labelled_tweets,
        polarity_targets,
        vectorizer_settings,
        tfidf_settings,
    )

    # Fetched only after the classifier exists, as in the original flow.
    google_values = features.get_google_sentiment_values(3)
    classifier.set_feature_set('PC2', google_values)
    classifier.train_on_feature_set()
    return classifier
# Example #2
# 0
def _grid_search_all_classifiers(train_tweets, train_targets, feature_set,
                                 file_postfix):
    """Run the text-feature grid search once for each classifier type.

    For SVM, NB and ME (in that order) a classifier is constructed from the
    training data, switched to ``feature_set`` and asked to run
    ``grid_search_on_text_features`` with the given ``file_postfix``.

    Args:
        train_tweets: Training tweets passed to each classifier constructor.
        train_targets: Training targets passed to each classifier constructor.
        feature_set: Feature-set name passed to ``set_feature_set``.
        file_postfix: Value forwarded as ``file_postfix`` to the grid search.
    """
    for classifier_cls in (SVM, NB, ME):
        clf = classifier_cls(train_tweets, train_targets, None)
        # NOTE(review): no sentiment values are supplied here (second
        # argument is None), matching the original calls -- confirm this is
        # intended for the "SA"/"PA" feature sets.
        clf.set_feature_set(feature_set, None)
        clf.grid_search_on_text_features(file_postfix=file_postfix)


def perform_grid_search_on_featureset_SA_and_PA(datasetnr=3):
    """Grid-search text features for the "SA" and "PA" feature sets.

    Loads the tweets and sentiment values of the given dataset, applies the
    preprocessing steps (link-class removal, lower-casing, special-character
    removal), then runs the grid search for SVM, NB and ME classifiers:
    first on the subjectivity split with feature set "SA" and file postfix
    "subjectivity", then on the polarity split with feature set "PA" and
    file postfix "polarity".

    Args:
        datasetnr: Dataset number passed to ``utils.get_pickles`` and
            ``feat_utils.get_sentiment_values``. Defaults to 3, the value
            that was previously hard-coded.

    Returns:
        None. Only the grid-search calls are made; nothing is returned.
    """
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    # Each split yields six values; only the training tweets and targets are
    # needed for the grid search, so the remaining four are discarded.
    train_tweets, train_targets, _, _, _, _ = (
        utils.make_subjectivity_train_and_test_and_targets(
            tweets, sentimentvalues
        )
    )
    _grid_search_all_classifiers(
        train_tweets, train_targets, "SA", "subjectivity"
    )

    train_tweets, train_targets, _, _, _, _ = (
        utils.make_polarity_train_and_test_and_targets(
            tweets, sentimentvalues
        )
    )
    _grid_search_all_classifiers(
        train_tweets, train_targets, "PA", "polarity"
    )