# Label posts: 1 = regular, 0 = depression-related.
dep_posts = new_arr
y = np.concatenate((np.ones(len(reg_posts)), np.zeros(len(dep_posts))))
x = np.concatenate((reg_posts, dep_posts))

print('b. initializing')
rs = ShuffleSplit(n_splits=10, test_size=.10, random_state=0)
rs.get_n_splits(x)

split = 0
for train_index, test_index in rs.split(x):
    print('split', split)
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a doc2vec model on this split and embed both partitions.
    new_doc = D2V('w2v_' + str(split), 300)
    train_arrays, test_arrays, train_labels, test_labels = new_doc.build_d2v_vecs(
        x_train, x_test, y_train, y_test)

    print('Logreg')
    logreg.run_logreg(train_arrays, test_arrays, train_labels, test_labels)
    print('SVM')
    svm.train_svm(train_arrays, test_arrays, train_labels, test_labels)
    print('Simple neural network')
    NNet.simpleNN(train_arrays, test_arrays, train_labels, test_labels, 0.01, 100, 100)
    split += 1
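# For orientation: a minimal, hypothetical sketch of the D2V wrapper used
# above, assuming it wraps gensim's Doc2Vec. Only the constructor arguments
# and the build_d2v_vecs signature come from the call site; the body is an
# assumption, not the repo's actual implementation.
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

class D2V:
    def __init__(self, name, size):
        self.name = name   # model identifier, e.g. 'w2v_0'
        self.size = size   # embedding dimensionality, e.g. 300

    def build_d2v_vecs(self, x_train, x_test, y_train, y_test):
        # Tag each training post, train Doc2Vec, then infer a vector
        # for every post in both partitions.
        tagged = [TaggedDocument(words=post.split(), tags=[i])
                  for i, post in enumerate(x_train)]
        model = Doc2Vec(tagged, vector_size=self.size, min_count=2, epochs=20)
        train_arrays = np.array([model.infer_vector(p.split()) for p in x_train])
        test_arrays = np.array([model.infer_vector(p.split()) for p in x_test])
        return train_arrays, test_arrays, y_train, y_test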
split = 0
for train_index, test_index in rs.split(x):
    print('split', split)
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build De Choudhury-style LIWC features for this split.
    feat_model = DeCh("reg")
    feat_model.load_liwc('data/mixed_liwc2007.csv', 'data/anxiety_filtered2007.csv')
    print('calculating train')
    train_vecs = feat_model.build_feat(x_train, train_index)
    print('calculating test')
    test_vecs = feat_model.build_feat(x_test, test_index)

    # Cache the feature matrices so later runs can skip recomputation.
    np.save('feat/test_de' + str(split), test_vecs)
    np.save('feat/train_de' + str(split), train_vecs)

    print('Simple NN')
    NNet.simpleNN(train_vecs, test_vecs, y_train, y_test, 0.01, 100, 100)
    print('Logreg')
    logreg.run_logreg(train_vecs, test_vecs, y_train, y_test)
    print('SVM')
    svm.train_svm(train_vecs, test_vecs, y_train, y_test)
    split += 1
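# Because the matrices are cached with np.save (which appends the '.npy'
# extension), a later run can reload them instead of recomputing, e.g.:
#
#     train_vecs = np.load('feat/train_de' + str(split) + '.npy')
#     test_vecs = np.load('feat/test_de' + str(split) + '.npy')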
def build_train_test(x, y, train_index, test_index):
    # Body reconstructed from the call site below: index the feature
    # matrix and labels by the ShuffleSplit train/test indices.
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    return x_train, x_test, y_train, y_test


if __name__ == "__main__":
    labels, anx_liwc = read_liwc_csv('data/anxious_liwc.csv')
    labels, mixed_liwc = read_liwc_csv('data/mixed_liwc.csv')
    print(len(anx_liwc))

    # Label vectors: 1 = mixed (control), 0 = anxious.
    y = np.concatenate((np.ones(len(mixed_liwc)), np.zeros(len(anx_liwc))))
    x = np.concatenate((mixed_liwc, anx_liwc))

    rs = ShuffleSplit(n_splits=10, test_size=.10, random_state=0)
    rs.get_n_splits(x)

    split = 0
    for train_index, test_index in rs.split(x):
        print(split)
        x_train, x_test, y_train, y_test = build_train_test(
            x, y, train_index, test_index)

        # Export the labeled training split in ARFF format (e.g. for Weka).
        train_w_labels = np.concatenate(
            (x_train, y_train.reshape(len(x_train), 1)), axis=1)
        arff.dump('result.arff', train_w_labels, relation='liwc', names=labels)

        print('log reg')
        run_logreg(x_train, x_test, y_train, y_test)
        print('svm')
        train_svm(x_train, x_test, y_train, y_test)
        split += 1
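# read_liwc_csv is the project's own helper; a minimal, hypothetical sketch of
# what it might do (the header-row layout is an assumption): return the LIWC
# category labels plus one numeric feature vector per post.
import csv

def read_liwc_csv(path):
    with open(path) as f:
        rows = list(csv.reader(f))
    labels = rows[0]                      # assumed: first row = LIWC labels
    vecs = np.array([[float(v) for v in row] for row in rows[1:]])
    return labels, vecs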
# Load the cached per-split feature matrices (see the np.save calls above).
test_vecs_d = np.load('feat/test_d2v' + str(split) + '.npy')
train_vecs_d = np.load('feat/train_d2v' + str(split) + '.npy')
test_vecs_u = np.load('feat/test_unibigram' + str(split) + '.npy')
train_vecs_u = np.load('feat/train_unibigram' + str(split) + '.npy')
test_vecs_l = np.load('feat/test_lda' + str(split) + '.npy')
train_vecs_l = np.load('feat/train_lda' + str(split) + '.npy')

# Combine doc2vec and LDA features. (The original referenced undefined
# *_vecs_w arrays here; the doc2vec arrays loaded above appear to be meant.)
test_vecs = np.concatenate((test_vecs_d, test_vecs_l), axis=1)
train_vecs = np.concatenate((train_vecs_d, train_vecs_l), axis=1)

print('Logreg')
acc, per, rec = logreg.run_logreg(train_vecs, test_vecs, y_train, y_test)
results[split][0] = acc
results[split][1] = per
results[split][2] = rec

print('SVM')
# Note: the SVM is evaluated on the LDA vectors alone here.
acc, per, rec = svm.train_svm(train_vecs_l, test_vecs_l, y_train, y_test)
results[split][3] = acc
results[split][4] = per
results[split][5] = rec

print('Simple NN')
acc, per, rec = NNet.simpleNN(train_vecs, test_vecs, y_train, y_test, 0.01, 10, 100)
results[split][6] = acc
results[split][7] = per
results[split][8] = rec
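# After the loop over splits finishes, the per-split metrics can be averaged.
# A short sketch, assuming results was allocated as np.zeros((10, 9)) before
# the loop (indices 0-2 logreg, 3-5 SVM, 6-8 simple NN):
mean_metrics = results.mean(axis=0)
print('logreg   acc/prec/rec:', mean_metrics[0:3])
print('svm      acc/prec/rec:', mean_metrics[3:6])
print('simpleNN acc/prec/rec:', mean_metrics[6:9])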
def execute(topic1, topic2, test, dump_files):
    # category specifies the unique id of the dumped model. (Assigned
    # unconditionally: the original set it inside the if-branch, leaving it
    # undefined whenever dump_files != "True".)
    category = topic1 + "-" + topic2
    if dump_files == "True":
        print_bold("\n" + "Downloading the datasets ..." + "\n")
        create_cleaned_files(topic1, topic2, test)
        print_bold("Dumping TFIDF features ..." + "\n")
        dump_tfidf(category)

    print("=========================================================")
    print_bold("Start running the Bayes model to establish a baseline")
    print("=========================================================")
    print_bold("\n" + "Run Bayes model ..." + "\n")
    pred_train_bayes, pred_test_bayes = run_bayes(category)

    print("=========================================================")
    print_bold("Improving on the baseline")
    print("=========================================================")
    print_bold("Run CNN model ..." + "\n")
    pred_train_cnn, pred_test_cnn = run_cnn()
    print("--------------------------------------------------------------------------")
    print_bold("Run FastText model ..." + "\n")
    pred_train_fasttext, pred_test_fasttext = run_fasttext()
    print("--------------------------------------------------------------------------")
    print_bold("Run SVM model ..." + "\n")
    pred_train_svm, pred_test_svm, y_train = run_svm(category)
    print("--------------------------------------------------------------------------")
    print_bold("Run Logistic Regression model ..." + "\n")
    pred_train_logreg, pred_test_logreg, y_test = run_logreg(category)
    print("--------------------------------------------------------------------------")

    print_bold("Starting ensemble method")
    # Stack the base models' predictions as features for the ensemble;
    # train+val is used for training (more training data gives stronger results).
    train = np.column_stack((pred_train_svm, pred_train_logreg,
                             pred_train_cnn, pred_train_fasttext))
    test = np.column_stack((pred_test_svm, pred_test_logreg,
                            pred_test_cnn, pred_test_fasttext))
    model = xgb().fit(train, y_train)
    print("--------------------------------------------------------------------------")
    print_bold("Final results on the test set:")
    print(classification_report(y_test, model.predict(test)))
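# The xgb() call above implies a classifier class imported under that alias.
# One plausible import (an assumption, not confirmed by the snippet):
#
#     from xgboost import XGBClassifier as xgb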