# NOTE: load_data, data_filter, feature_extraction_Doc2Vec, cross_validationS,
# save_stats and the NUM_OF_BINS / NUM_OF_ITERATION / NUM_OF_CROSSFOLD /
# SAMPLE_SIZE_RATIO constants are defined elsewhere in the source module;
# only the library imports the snippet needs are added here.
import random

import matplotlib.pyplot as plt
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.svm
import sklearn.tree
from gensim.models.doc2vec import Doc2Vec


def main():
    """
    1. Divide total dataset into several data bins by randomly extracting data entries with given ratio.
    2. Run cross-validation for given numbers of iterations in either SMOTE or non-SMOTE mode.
    3. Report and present statistical evaluations for each data bin.
    """
    stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns = list(), list(), list() # ns for non-SMOTE
    stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws = list(), list(), list() # ws for with SMOTE
    data_pos, data_neg = load_data("../data/")
    data_pos, data_neg = data_filter(data_pos), data_filter(data_neg)
    print "Loading Doc2Vec model ..."
    # pre-trained GoogleNews word2vec vectors (old gensim API; newer releases
    # expose this as KeyedVectors.load_word2vec_format)
    model_doc2vec = Doc2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
    print "Doc2Vec model loading done!"
    models = {"SVC": sklearn.svm.SVC(), \
              "Logit": sklearn.linear_model.LogisticRegression(), \
              "DT": sklearn.tree.DecisionTreeClassifier(), \
              "NBayes": sklearn.naive_bayes.GaussianNB(), \
              "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()}
    model_chosen = "NBayes"
    print "Classifier Type:", model_chosen
    for binIndex in range(NUM_OF_BINS):
        print "Experiment on DataSet#", str(binIndex)
        random.shuffle(data_pos)
        random.shuffle(data_neg)
        size_pos_bin, size_neg_bin = int(len(data_pos)*SAMPLE_SIZE_RATIO), int(len(data_neg)*SAMPLE_SIZE_RATIO)
        data_pos_bin, data_neg_bin = data_pos[:size_pos_bin], data_neg[:size_neg_bin] # dataset bin
        sFscores_iter_ns, sRecalls_iter_ns, sPrecisions_iter_ns = list(), list(), list()
        sFscores_iter_ws, sRecalls_iter_ws, sPrecisions_iter_ws = list(), list(), list()
        for iteration in range(NUM_OF_ITERATION):
            random.seed(iteration)
            random.shuffle(data_pos_bin)
            random.shuffle(data_neg_bin)
            data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos_bin, data_neg_bin, model_doc2vec) # convert to doc vectors
            print "non-SMOTE experiment"
            accuracies, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD,
                smote_flag=False)  # cross validation
            sFscores_iter_ns.extend(Fscores)
            sRecalls_iter_ns.extend(recalls)
            sPrecisions_iter_ns.extend(precisions)
            print "with SMOTE experiemnt"
            accuracies, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD,
                smote_flag=True)  # cross validation
            sFscores_iter_ws.extend(Fscores)
            sRecalls_iter_ws.extend(recalls)
            sPrecisions_iter_ws.extend(precisions)
        stats_Fscores_ns.append(sFscores_iter_ns)
        stats_precisions_ns.append(sPrecisions_iter_ns)
        stats_recalls_ns.append(sRecalls_iter_ns)
        stats_Fscores_ws.append(sFscores_iter_ws)
        stats_precisions_ws.append(sPrecisions_iter_ws)
        stats_recalls_ws.append(sRecalls_iter_ws)
    print "All Experiments Done!"
    save_stats(stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns,
               stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws,
               model_name=model_chosen)
    print "Statistics ready!"
def main():
    """
    Run cross-validation for a given number of iterations in both non-SMOTE
    and SMOTE modes, plot the per-fold F-scores, and write median/IQR
    statistics for each mode to a results file.
    """
    stats_Fscore, stats_recall, stats_precision = list(), list(), list()
    data_pos, data_neg = load_data("../data/")
    data_pos, data_neg = data_filter(data_pos), data_filter(data_neg)
    model = Doc2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
    print "Model loading done!"
    for test_mode in range(2):
        if test_mode == 0:
            print "non-SMOTE"
        else:
            print "SMOTE"
        sFscores, sRecalls, sPrecisions = list(), list(), list()
        for iteration in range(NUM_OF_ITERATION): # start iteration
            random.seed(iteration)
            random.shuffle(data_pos)
            random.shuffle(data_neg)
            data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos, data_neg, model) # convert to doc vectors
            print len(data_pos_vec), len(data_neg_vec)
            models = {"SVC": sklearn.svm.SVC(), \
                      "Logit": sklearn.linear_model.LogisticRegression(), \
                      "DT": sklearn.tree.DecisionTreeClassifier(), \
                      "NBayes": sklearn.naive_bayes.GaussianNB(), \
                      "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()}
            model_chosen = "SVC"
            accuracies, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen],
                num_cross=NUM_OF_CROSSFOLD, smote_flag=bool(test_mode))  # cross validation
            sFscores.extend(Fscores)
            sRecalls.extend(recalls)
            sPrecisions.extend(precisions)
        stats_Fscore.append(sFscores)
        stats_recall.append(sRecalls)
        stats_precision.append(sPrecisions)
    plt.figure()
    colors = ["red", "blue"]
    modes = ["no-SMOTE", "SMOTE"]
    for i in range(len(stats_Fscore)): # plot statistical summary
        plt.plot(stats_Fscore[i], marker='o', color=colors[i], label=modes[i]+"_Fscore")
        #plt.plot(stats_precision[i], marker='+', color=colors[i], label=modes[i]+"_precision")
        #plt.plot(stats_recall[i], marker='*', color=colors[i], label=modes[i]+"_recall")
    plt.ylim([0, 1.0])
    plt.legend(loc=4, borderaxespad=0.5)
    plt.ylabel("Scores")
    plt.xlabel("Data Sequence")
    plt.savefig("../results/"+model_chosen+"-ValidationStats.png")
    savefile_name = "../results/" + model_chosen + "-ValidationStats.txt"
    fp = open(savefile_name, 'w')
    print "******** Evaluation **********\n"
    fp.write("******** Evaluation **********\n")
    for test_mode in range(2): # print statistical evaluations
        stats_precision[test_mode].sort()
        stats_recall[test_mode].sort()
        stats_Fscore[test_mode].sort()
        scores_p = stats_precision[test_mode]  # sorted per-mode score lists
        scores_r = stats_recall[test_mode]
        scores_f = stats_Fscore[test_mode]
        p_median = scores_p[len(scores_p) // 2]
        r_median = scores_r[len(scores_r) // 2]
        f_median = scores_f[len(scores_f) // 2]
        iqr_p = scores_p[int(len(scores_p) * 0.75)] - scores_p[int(len(scores_p) * 0.25)]
        iqr_r = scores_r[int(len(scores_r) * 0.75)] - scores_r[int(len(scores_r) * 0.25)]
        iqr_f = scores_f[int(len(scores_f) * 0.75)] - scores_f[int(len(scores_f) * 0.25)]
        print modes[test_mode]
        fp.write(modes[test_mode]+'\n')
        print "\t p_median \t r_median \t f_median"
        fp.write("\t p_median \t r_median \t f_median \n")
        print "\t%.5f \t%.5f \t%.5f" % (p_median, r_median, f_median)
        fp.write("\t%.5f \t%.5f \t%.5f \n" % (p_median, r_median, f_median))
        print "\t iqr_p \t iqr_r \t iqr_f"
        fp.write("\t iqr_p \t iqr_r \t iqr_f \n")
        print "\t%.5f \t%.5f \t%.5f" % (iqr_p, iqr_r, iqr_f)
        fp.write("\t%.5f \t%.5f \t%.5f \n" % (iqr_p, iqr_r, iqr_f))
        print '\n'
    fp.close()
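
# The medians and IQRs above could equivalently be computed with numpy, e.g.:
#   import numpy as np
#   p_median = np.median(scores_p)
#   iqr_p = np.percentile(scores_p, 75) - np.percentile(scores_p, 25)
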
import feature_extractor
from gensim.models.doc2vec import Doc2Vec
import parser
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pdb
import pickle


# pre-trained GoogleNews word2vec vectors (old gensim API; newer releases
# expose this as KeyedVectors.load_word2vec_format)
model = Doc2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)
print "MODEL LOADED"

with open('stopwords.txt') as f:
    stoplist = set(line.rstrip('\n') for line in f)

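# Words outside the model vocabulary must be dropped before calling
# n_similarity, which raises a KeyError on out-of-vocabulary tokens.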
def filter_essay(essay):
    stop_removed = filter(lambda x: x not in stoplist, essay.split())
    all_filtered = filter(lambda x: x in model.vocab, stop_removed)
    return all_filtered

def filter_essays(essays):
    return [filter_essay(essay) for essay in essays]



def calc_similarity(i1, i2):
    # cosine similarity between the mean vectors of the two word lists
    return model.n_similarity(i1, i2)

def classify(k, instance, training_data, training_scores):
    similarity = np.array([calc_similarity(instance, x) for x in training_data])
    # assumed completion (the original snippet is truncated here): k-NN
    # regression, averaging the scores of the k most similar training essays
    top_k = np.argsort(similarity)[-k:]
    return np.mean([training_scores[i] for i in top_k])