Exemplo n.º 1
0
# Candidate down-sampling rates. The nested loops below try every
# (favor rate, none rate) pair drawn from this list: 9 x 9 = 81 combinations.
rates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Score accumulators; nothing is appended to them in the visible part of the
# loop -- presumably they are filled further down (TODO confirm).
training_scores = []
validation_scores = []

for downsample_rate_favor in rates:
    # Per-outer-iteration scratch lists, reset for every FAVOR rate. They are
    # not written to in the visible part of the loop body.
    tmp = []
    tmp2 = []
    for downsample_rate_none in rates:
        # Visual separator between grid points in the console log.
        # NOTE: every print in this block uses the parenthesised
        # single-argument form, which produces the same output as a Python 2
        # print statement and is also valid as the Python 3 print function.
        print(120 * '*')

        # ***** LOAD DATA   *****
        if use_downsample:
            print("using down sampling")
            print('Downsample favor: ' + str(downsample_rate_favor))
            print('Downsample none: ' + str(downsample_rate_none))

            # Data is re-read on every grid point, so each combination starts
            # from whatever ptd returns rather than from a previous
            # iteration's concatenated frame.
            train_data = ptd.getTrainingData()
            validate_data = ptd.getValidationData()
            # NOTE(review): test_data is not loaded on this path, unlike the
            # else branch below -- confirm nothing downstream needs it.

            # NONE and FAVOR rows go through ptd.getDownsample2_0 with their
            # own rate; presumably it returns the down-sampled rows of the
            # given stance -- verify against ptd.
            sub_none = ptd.getDownsample2_0(
                train_data, "NONE", strength, downsample_rate_none)
            sub_favor = ptd.getDownsample2_0(
                train_data, "FAVOR", strength, downsample_rate_favor)
            # AGAINST rows are taken in full; they are never down-sampled.
            against = train_data[train_data.Stance == "AGAINST"]

            # Training set for this grid point: both down-sampled subsets plus
            # every AGAINST row.
            train_data = pd.concat([sub_favor, sub_none, against])

        else:
            print("using nothing")
            train_data = ptd.getTrainingData()
            validate_data = ptd.getValidationData()
            test_data = ptd.getTestData()

        if use_upsample:
            print("using up sampling")
Exemplo n.º 2
0
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_predict, StratifiedKFold
from sklearn.metrics import fbeta_score
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

import pandas as pd
import sklearn

# 'strength' is not read in the visible part of this script -- presumably it
# is consumed further down (TODO confirm).
strength = 'soft'

# ***** LOAD DATA   *****
# The three loaders are called in the same order as before: training,
# validation, test.
train_data, validate_data, test_data = (
    ptd.getTrainingData(),
    ptd.getValidationData(),
    ptd.getTestData(),
)

# 10-fold stratified splitter built from the Stance column of the training
# data. This is the legacy sklearn.cross_validation API, where the labels are
# passed to the constructor. Shuffling uses a fixed seed (random_state=1).
cv = StratifiedKFold(train_data.Stance, n_folds=10, shuffle=True, random_state=1)

# Select classifiers to use
classifiers = [
    LinearSVC(C=2.3988329190194899, multi_class='crammer_singer'),
    #SVC(C=5.2, kernel='linear')
    #MultinomialNB(alpha=0.63, fit_prior=True)
    #LogisticRegression(C=22.759, penalty='l2', solver='lbfgs')
    #SGDClassifier(alpha=0.0001, loss='squared_hinge')
    #BernoulliNB(alpha=0.1, fit_prior=True)
Exemplo n.º 3
0
import json

# --- Parameters ---------------------------------------------------------

# Presumably an on/off flag (0 = off) -- TODO confirm. It is not read in the
# visible part of this script.
store_to_file = 0


# --- Load data ----------------------------------------------------------

print("Loading data...")
# The training, validation and test splits are concatenated into a single
# frame, so no labelled data is held out at this point.
train_data = pd.concat(
    [
        ptd.getTrainingData(),
        ptd.getValidationData(),
        ptd.getTestData(),
    ]
)
unlabelled_data = ptd.getUnlabelledData()


# --- Train classifier ---------------------------------------------------

print("Training classifier")
# Constructed only; no fit call appears in the visible part of the script.
best_classifier = LinearSVC(C=1.178)

pipeline = Pipeline([('vect', CountVectorizer(decode_error='ignore',
                                              analyzer='word',
                                              ngram_range=(1, 2),
                                              stop_words= None,