import pandas as pd
import cPickle as pickle

# ***** SETTINGS *****
use_upsample = 0
use_downsample = 0
downsample_rate_favor = 0.3
downsample_rate_none = 0.3
strength = 'soft'  # downsampling strength passed through to ptd.getDownsample2_0

# ***** LOAD DATA *****
if use_downsample:
    # Downsample the over-represented NONE and FAVOR classes; AGAINST is kept intact.
    data = ptd.getTrainingData()
    sub_none = ptd.getDownsample2_0(data, "NONE", strength, downsample_rate_none)
    sub_favor = ptd.getDownsample2_0(data, "FAVOR", strength, downsample_rate_favor)
    against = data[data.Stance == "AGAINST"]
    data = pd.concat([sub_favor, sub_none, against])
else:
    # NOTE(review): this branch defines train/validate/test splits but never `data`.
    train_data = ptd.getTrainingDataWithMeta()
    validate_data = ptd.getValidationDataWithMeta()
    test_data = ptd.getTestDataWithMeta()

# BUGFIX: the original upsampled whenever use_upsample was set, but `data` only
# exists when use_downsample is also set; use_upsample=1 with use_downsample=0
# raised a NameError.  Guard on both flags to keep all working configs identical.
if use_upsample and use_downsample:
    # Duplicate the AGAINST rows to rebalance the classes.
    data = pd.concat([data, data[data.Stance == "AGAINST"]])
import pandas as pd
import cPickle as pickle

# ***** SETTINGS *****
use_upsample = 0
use_downsample = 0
downsample_rate_favor = 0.3
downsample_rate_none = 0.3
strength = 'soft'  # downsampling strength passed through to ptd.getDownsample2_0

# ***** LOAD DATA *****
if use_downsample:
    # Downsample the over-represented NONE and FAVOR classes; AGAINST is kept intact.
    data = ptd.getTrainingData()
    sub_none = ptd.getDownsample2_0(data, "NONE", strength, downsample_rate_none)
    sub_favor = ptd.getDownsample2_0(data, "FAVOR", strength, downsample_rate_favor)
    against = data[data.Stance == "AGAINST"]
    data = pd.concat([sub_favor, sub_none, against])
else:
    # NOTE(review): this branch defines train/validate/test splits but never `data`.
    train_data = ptd.getTrainingDataWithMeta()
    validate_data = ptd.getValidationDataWithMeta()
    test_data = ptd.getTestDataWithMeta()

# BUGFIX: the original upsampled whenever use_upsample was set, but `data` only
# exists when use_downsample is also set; use_upsample=1 with use_downsample=0
# raised a NameError.  Guard on both flags to keep all working configs identical.
if use_upsample and use_downsample:
    # Duplicate the AGAINST rows to rebalance the classes.
    data = pd.concat([data, data[data.Stance == "AGAINST"]])

#print "None: ", len(data[data.Stance == "NONE"])
import pandas as pd

# ***** SETTINGS *****
use_upsample = 1
use_downsample = 1
#perform_test_on_unused_data = 1
downsample_rate_favor = 0.3
#downsample_rate_none = 10
strength = 'soft'  # strength setting handed to ptd.getDownsample2_0

# Both branches start from the same training dump, so load it once up front.
data = pd.read_csv('../TextFiles/data/tcp_train.csv', sep='\t')

if use_downsample:
    # Thin out FAVOR, keep every AGAINST row.
    sub_favor = ptd.getDownsample2_0(data, "FAVOR", strength, downsample_rate_favor)
    against = data[data.Stance == "AGAINST"]
    data = pd.concat([sub_favor, against])
else:
    # No downsampling: keep FAVOR and AGAINST, drop NONE.
    favor = data[data.Stance == "FAVOR"]
    against = data[data.Stance == "AGAINST"]
    data = pd.concat([favor, against])

if use_upsample:
    # Duplicate the AGAINST rows to rebalance the classes.
    against_copy = data[data.Stance == "AGAINST"]
    data = pd.concat([data, against_copy])

# 10-fold stratified CV over the stance labels, seeded for reproducibility.
cv = StratifiedKFold(data.Stance, n_folds=10, shuffle=True, random_state=1)

# Select classifiers to use
from sklearn.cross_validation import cross_val_predict, StratifiedKFold from sklearn.metrics import fbeta_score from sklearn.ensemble import VotingClassifier validate = 1 testing = 0 data = pd.read_csv(open('../TextFiles/data/tcp_train.csv'), sep='\t', index_col=0) val = pd.read_csv(open('../TextFiles/data/tcp_validate.csv'), sep='\t', index_col=0) test = pd.read_csv(open('../TextFiles/data/tcp_test.csv'), sep='\t', index_col=0) print("using down sampling") print 'Downsample favor: ' + str(0.2) print 'Downsample none: ' + str(0.4) #test_data = ptd.getTestData() sub_favor = ptd.getDownsample2_0(data, "FAVOR", "soft", 0.2) sub_none = ptd.getDownsample2_0(data, "NONE", "soft", 0.4) against = data[data.Stance == "AGAINST"] data = pd.concat([sub_favor, sub_none, against]) #glove_fnames1 = glob('../DataProcessing/GloveVectorizer/vectors/glove.6B.300d_tcp_abstracts.pkl') glove_fnames = glob('../DataProcessing/GloveVectorizer/vectors/glove.840B.300d_tcp_abstracts.pkl') #glove_fnames = glove_fnames1 + glove_fnames2 print glove_fnames glove_ids = [fname.split('/')[-1].split('_')[0] for fname in glove_fnames] # ***** FINDING BEST VECTOR SPACE ***** for fname, glove_id in zip(glove_fnames, glove_ids): print 80 * '='
# ***** SETTINGS ***** use_upsample = 1 use_downsample = 1 # perform_test_on_unused_data = 1 downsample_rate_favor = 0.3 # downsample_rate_none = 10 strength = "soft" if use_downsample: data = pd.read_csv("../TextFiles/data/tcp_train.csv", sep="\t") sub_favor = ptd.getDownsample2_0(data, "FAVOR", strength, downsample_rate_favor) against = data[data.Stance == "AGAINST"] data = pd.concat([sub_favor, against]) else: data = pd.read_csv("../TextFiles/data/tcp_train.csv", sep="\t") data = pd.concat([data[data.Stance == "FAVOR"], data[data.Stance == "AGAINST"]]) if use_upsample: data = pd.concat([data, data[data.Stance == "AGAINST"]]) cv = StratifiedKFold(data.Stance, n_folds=10, shuffle=True, random_state=1) # Select classifiers to use classifiers = [
data = pd.read_csv(open('../TextFiles/data/tcp_train.csv'), sep='\t', index_col=0) val = pd.read_csv(open('../TextFiles/data/tcp_validate.csv'), sep='\t', index_col=0) test = pd.read_csv(open('../TextFiles/data/tcp_test.csv'), sep='\t', index_col=0) print("using down sampling") print 'Downsample favor: ' + str(0.2) print 'Downsample none: ' + str(0.4) #test_data = ptd.getTestData() sub_favor = ptd.getDownsample2_0(data, "FAVOR", "soft", 0.2) sub_none = ptd.getDownsample2_0(data, "NONE", "soft", 0.4) against = data[data.Stance == "AGAINST"] data = pd.concat([sub_favor, sub_none, against]) #glove_fnames1 = glob('../DataProcessing/GloveVectorizer/vectors/glove.6B.300d_tcp_abstracts.pkl') glove_fnames = glob( '../DataProcessing/GloveVectorizer/vectors/glove.840B.300d_tcp_abstracts.pkl' ) #glove_fnames = glove_fnames1 + glove_fnames2 print glove_fnames glove_ids = [fname.split('/')[-1].split('_')[0] for fname in glove_fnames] # ***** FINDING BEST VECTOR SPACE *****