def main(): """ 1. Divide total dataset into several data bins by randomly extracting data entries with given ratio. 2. Run cross-validation for given numbers of iterations in either SMOTE or non-SMOTE mode. 3. Report and present statistical evaluations for each data bin. """ stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns = list(), list(), list() # ns for non-SMOTE stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws = list(), list(), list() # ws for with SMOTE data_pos, data_neg = load_data("../data/") data_pos, data_neg = data_filter(data_pos), data_filter(data_neg) print "Loading Doc2Vec model ..." model_doc2vec = Doc2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True) # load Doc2Vec model print "Doc2Vec model loading done!" models = {"SVC": sklearn.svm.SVC(), \ "Logit": sklearn.linear_model.LogisticRegression(), \ "DT": sklearn.tree.DecisionTreeClassifier(), \ "NBayes": sklearn.naive_bayes.GaussianNB(), \ "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()} model_chosen = "NBayes" print "Classifier Type:", model_chosen for binIndex in range(NUM_OF_BINS): print "Experiment on DataSet#", str(binIndex) random.shuffle(data_pos) random.shuffle(data_neg) size_pos_bin, size_neg_bin = int(len(data_pos)*SAMPLE_SIZE_RATIO), int(len(data_neg)*SAMPLE_SIZE_RATIO) data_pos_bin, data_neg_bin = data_pos[:size_pos_bin], data_neg[:size_neg_bin] # dataset bin sFscores_iter_ns, sRecalls_iter_ns, sPrecisions_iter_ns = list(), list(), list() sFscores_iter_ws, sRecalls_iter_ws, sPrecisions_iter_ws = list(), list(), list() for iteration in range(NUM_OF_ITERATION): random.seed(iteration) random.shuffle(data_pos_bin) random.shuffle(data_neg_bin) data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos_bin, data_neg_bin, model_doc2vec) # convert to doc vectors print "non-SMOTE experiment" accuracys, precisions, recalls, Fscores = cross_validationS( \ data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD, 
smote_flag=False) # cross validation sFscores_iter_ns.extend(Fscores) sRecalls_iter_ns.extend(recalls) sPrecisions_iter_ns.extend(precisions) print "with SMOTE experiemnt" accuracys, precisions, recalls, Fscores = cross_validationS( \ data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD, smote_flag=True) # cross validation sFscores_iter_ws.extend(Fscores) sRecalls_iter_ws.extend(recalls) sPrecisions_iter_ws.extend(precisions) stats_Fscores_ns.append(sFscores_iter_ns) stats_precisions_ns.append(sPrecisions_iter_ns) stats_recalls_ns.append(sRecalls_iter_ns) stats_Fscores_ws.append(sFscores_iter_ws) stats_precisions_ws.append(sPrecisions_iter_ws) stats_recalls_ws.append(sRecalls_iter_ws) print "All Experiments Done!" save_stats(stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns, stats_Fscores_ws, stats_recalls_ws,\ stats_precisions_ws, model_name=model_chosen) print "Statistics ready!"
def main(): stats_Fscore, stats_recall, stats_precision = list(), list(), list() data_pos, data_neg = load_data("../data/") data_pos, data_neg = data_filter(data_pos), data_filter(data_neg) model = Doc2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True) print "Model loading done!" for test_mode in range(2): if test_mode == 0: print "non-SMOTE" else: print "SMOTE" sFscores, sRecalls, sPrecisions = list(), list(), list() for iteration in range(NUM_OF_ITERATION): # start iteration random.seed(iteration) random.shuffle(data_pos) random.shuffle(data_neg) data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos, data_neg, model) # convert to Word Vectors print len(data_pos_vec), len(data_neg_vec) models = {"SVC": sklearn.svm.SVC(), \ "Logit": sklearn.linear_model.LogisticRegression(), \ "DT": sklearn.tree.DecisionTreeClassifier(), \ "NBayes": sklearn.naive_bayes.GaussianNB(), \ "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()} model_chosen = "SVC" accuracys, precisions, recalls, Fscores = cross_validationS(\ data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD, smote_flag=test_mode) # cross validation sFscores.extend(Fscores) sRecalls.extend(recalls) sPrecisions.extend(precisions) stats_Fscore.append(sFscores) stats_recall.append(sRecalls) stats_precision.append(sPrecisions) plt.figure() colors = ["red", "blue"] modes = ["no-SMOTE", "SMOTE"] for i in range(len(stats_Fscore)): # plot statistical summary plt.plot(stats_Fscore[i], marker='o', color=colors[i], label=modes[i]+"_Fscore") #plt.plot(stats_precision[i], marker='+', color=colors[i], label=modes[i]+"_precision") #plt.plot(stats_recall[i], marker='*', color=colors[i], label=modes[i]+"_recall") plt.ylim([0, 1.0]) plt.legend(loc=4, borderaxespad=0.5) plt.ylabel("Scores") plt.xlabel("Data Sequence") plt.savefig("../results/"+model_chosen+"-ValidationStats.png") savefile_name = "../results/" + model_chosen + "-ValidationStats.txt" fp = 
open(savefile_name, 'w') print "******** Evaluation **********\n" fp.write("******** Evaluation **********\n") for test_mode in range(2): # print statistical evaluations stats_precision[test_mode].sort() stats_recall[test_mode].sort() stats_Fscore[test_mode].sort() p_median = stats_precision[test_mode][len(stats_precision)/2] r_median = stats_recall[test_mode][len(stats_recall)/2] f_median = stats_Fscore[test_mode][len(stats_Fscore)/2] iqr_p = stats_precision[test_mode][int(len(stats_precision)*0.75)] - stats_precision[test_mode][int(len(stats_precision)*0.25)] iqr_r = stats_recall[test_mode][int(len(stats_recall)*0.75)] - stats_recall[test_mode][int(len(stats_recall)*0.25)] iqr_f = stats_Fscore[test_mode][int(len(stats_Fscore)*0.75)] - stats_Fscore[test_mode][int(len(stats_Fscore)*0.25)] print modes[test_mode] fp.write(modes[test_mode]+'\n') print "\t p_median \t r_median \t f_median" fp.write("\t p_median \t r_median \t f_median \n") print "\t%.5f \t%.5f \t%.5f" % (p_median, r_median, f_median) fp.write("\t%.5f \t%.5f \t%.5f \n" % (p_median, r_median, f_median)) print "\t iqr_p \t iqr_r \t iqr_f" fp.write("\t iqr_p \t iqr_r \t iqr_f \n") print "\t%.5f \t%.5f \t%.5f" % (iqr_p, iqr_r, iqr_f) fp.write("\t%.5f \t%.5f \t%.5f \n" % (iqr_p, iqr_r, iqr_f)) print '\n'
import feature_extractor
from gensim.models.doc2vec import Doc2Vec
import parser
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pdb
import pickle

# Module-level side effect: load the pre-trained Google News word vectors
# once at import time (large download; expensive).
model = Doc2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary = True)
print "MODEL LOADED"

# Build the stopword set from one word per line of stopwords.txt.
# NOTE(review): the file handle `f` is never closed — confirm intentional.
f = open('stopwords.txt')
stoplist = set(line.split('\n')[0] for line in f)


def filter_essay(essay):
    """Tokenize `essay` on whitespace, dropping stopwords and any token
    absent from the embedding model's vocabulary."""
    stop_removed = filter(lambda x: x not in stoplist, essay.split())
    all_filtered = filter(lambda x: x in model.vocab, stop_removed)
    return all_filtered


def filter_essays(essays):
    """Apply filter_essay to each essay in the iterable `essays`."""
    return [filter_essay(essay) for essay in essays]


def calc_similarity(i1, i2):
    """Cosine-style set similarity between two token lists, as defined by
    the embedding model's n_similarity."""
    return model.n_similarity(i1, i2)


def classify(k, instance, training_data, training_scores):
    """Score `instance` against `training_data` by embedding similarity.

    NOTE(review): this definition appears truncated in the visible source —
    `k` and `training_scores` are unused and nothing is returned; presumably
    a k-nearest-neighbour vote follows. Confirm against the full file.
    """
    similarity = np.array([calc_similarity(instance, x) for x in training_data])
def main(): """ 1. Divide total dataset into several data bins by randomly extracting data entries with given ratio. 2. Run cross-validation for given numbers of iterations in either SMOTE or non-SMOTE mode. 3. Report and present statistical evaluations for each data bin. """ stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns = list(), list( ), list() # ns for non-SMOTE stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws = list(), list( ), list() # ws for with SMOTE data_pos, data_neg = load_data("../data/") data_pos, data_neg = data_filter(data_pos), data_filter(data_neg) print "Loading Doc2Vec model ..." model_doc2vec = Doc2Vec.load_word2vec_format( 'GoogleNews-vectors-negative300.bin.gz', binary=True) # load Doc2Vec model print "Doc2Vec model loading done!" models = {"SVC": sklearn.svm.SVC(), \ "Logit": sklearn.linear_model.LogisticRegression(), \ "DT": sklearn.tree.DecisionTreeClassifier(), \ "NBayes": sklearn.naive_bayes.GaussianNB(), \ "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()} model_chosen = "NBayes" print "Classifier Type:", model_chosen for binIndex in range(NUM_OF_BINS): print "Experiment on DataSet#", str(binIndex) random.shuffle(data_pos) random.shuffle(data_neg) size_pos_bin, size_neg_bin = int(len(data_pos) * SAMPLE_SIZE_RATIO), int( len(data_neg) * SAMPLE_SIZE_RATIO) data_pos_bin, data_neg_bin = data_pos[: size_pos_bin], data_neg[: size_neg_bin] # dataset bin sFscores_iter_ns, sRecalls_iter_ns, sPrecisions_iter_ns = list(), list( ), list() sFscores_iter_ws, sRecalls_iter_ws, sPrecisions_iter_ws = list(), list( ), list() for iteration in range(NUM_OF_ITERATION): random.seed(iteration) random.shuffle(data_pos_bin) random.shuffle(data_neg_bin) data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec( data_pos_bin, data_neg_bin, model_doc2vec) # convert to doc vectors print "non-SMOTE experiment" accuracys, precisions, recalls, Fscores = cross_validationS( \ data_pos_vec, data_neg_vec, models[model_chosen], 
num_cross=NUM_OF_CROSSFOLD, smote_flag=False) # cross validation sFscores_iter_ns.extend(Fscores) sRecalls_iter_ns.extend(recalls) sPrecisions_iter_ns.extend(precisions) print "with SMOTE experiemnt" accuracys, precisions, recalls, Fscores = cross_validationS( \ data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD, smote_flag=True) # cross validation sFscores_iter_ws.extend(Fscores) sRecalls_iter_ws.extend(recalls) sPrecisions_iter_ws.extend(precisions) stats_Fscores_ns.append(sFscores_iter_ns) stats_precisions_ns.append(sPrecisions_iter_ns) stats_recalls_ns.append(sRecalls_iter_ns) stats_Fscores_ws.append(sFscores_iter_ws) stats_precisions_ws.append(sPrecisions_iter_ws) stats_recalls_ws.append(sRecalls_iter_ws) print "All Experiments Done!" save_stats(stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns, stats_Fscores_ws, stats_recalls_ws,\ stats_precisions_ws, model_name=model_chosen) print "Statistics ready!"