Example #1
def evaluate_model(best_fold, related_best_fold, X_holdout, y_holdout,
                   y_holdout_bi):
    predicted = [LABELS_RELATED[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS_RELATED[int(a)] for a in y_holdout_bi]

    related_X_holdout = []
    related_actual = []
    unrelated_actual = []
    un_count = 0
    for i in range(len(predicted)):
        if predicted[i] == "related":
            related_X_holdout.append(X_holdout[i])
            related_actual.append(LABELS[int(y_holdout[i])])
        else:
            unrelated_actual.append(LABELS[int(y_holdout[i])])
            un_count += 1

    predicted = [
        LABELS[int(a)] for a in related_best_fold.predict(related_X_holdout)
    ] + ["unrelated"] * un_count
    actual = related_actual + unrelated_actual

    report_score(actual, predicted)
    print("")
    print("")
Example #2
File: fnc.py Project: wadmlkas/FNC-1
 def score(self):
     predicted = [
         LABELS[self.maxindex(a)] for a in self.model.predict(self.X_test)
     ]
     actual = [LABELS[self.maxindex(a)] for a in self.Y_test]
     np.savetxt("stance4.csv", predicted, delimiter=",", fmt='%s')
     report_score(actual, predicted)
Example #3
File: mlf.py Project: xuefei1/MLFtest
def run_base_classifier(train_m, train_truth, test_m, test_truth):
    print('Base results:')
    pred_strs = ['unrelated' for _ in test_truth]
    test_strs = [fnc_vect_to_label(d) for d in test_truth]
    s = report_score(test_strs, pred_strs)
    pred_strs = ['agree' for _ in test_truth]
    test_strs = [fnc_vect_to_label(d) for d in test_truth]
    s = max(s, report_score(test_strs, pred_strs))
    return s
Example #4
def get_scores(predicted, actual):

    get_labels = np.vectorize(lambda t: LABELS[int(t)])
    predicted = get_labels(predicted)
    actual = get_labels(actual)

    print("Scores on the val set")
    report_score(actual, predicted)
    print("")
Example #5
File: fnc.py Project: wadmlkas/FNC-1
 def score(self):
     predicted = [
         LABELS[self.maxindex(a)] for a in self.model.predict([
             self.X_test_head_tf, self.X_test_bodies_tf,
             self.X_test_head_idf, self.X_test_bodies_idf
         ])
     ]
     actual = [LABELS[self.maxindex(a)] for a in self.Y_test]
     np.savetxt("stance2.csv", predicted, delimiter=",", fmt='%s')
     report_score(actual, predicted)
Example #6
def do_reg():
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
                                               str(fold))

    best_score = 0
    best_fold = None

    # Classifier for each fold
    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]

        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        #clf = GradientBoostingClassifier(n_estimators=50, random_state=14128, verbose=False)
        # Try random forest
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    report_score(actual, predicted)
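
Several of the examples here (#6, #19, #20, #26) normalize fold_score by max_fold_score, both returned by score_submission. The sketch below is only an illustration of the weighted FNC-1 scoring rule that score_submission is assumed to implement; the helper name weighted_fnc_score is hypothetical and does not come from any of the listed projects.

RELATED = {"agree", "disagree", "discuss"}

def weighted_fnc_score(actual, predicted):
    # Assumed FNC-1 rule: +0.25 for a correct related/unrelated call,
    # +0.25 when both labels are related stances, and a further +0.50
    # when the related stance is exactly right (1.0 total for a perfect
    # related prediction, 0.25 for a correct "unrelated").
    score = 0.0
    for a, p in zip(actual, predicted):
        if a == p:
            score += 0.25
            if a != "unrelated":
                score += 0.50
        if a in RELATED and p in RELATED:
            score += 0.25
    return score

# The per-fold score reported above would then correspond to
# weighted_fnc_score(actual, predicted) / weighted_fnc_score(actual, actual).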
Example #7
File: mlf.py Project: xuefei1/MLFtest
def run_NN_classifier(train_m,
                      train_truth,
                      test_m,
                      test_truth,
                      validate=False):
    print('running NN classifier, validate=' + str(validate))
    hidden_layer_sizes = [5, 10, 50, 100, 1000]
    activations = ['identity', 'logistic', 'tanh', 'relu']
    learning_rates = [0.001, 0.01, 0.0001]
    highest_fnc = 0
    best_h_size = 100
    best_activation = 'relu'
    best_lr = 0.001
    if validate:
        split_train_m, split_train_truth, validate_m, validate_truth = get_validation_mat(
            0.7, train_m, train_truth)
        # split_train_truth = label_binarize(split_train_truth, classes=[0, 1, 2, 3])
        for h in hidden_layer_sizes:
            for a in activations:
                for lr in learning_rates:
                    model = MLPClassifier(max_iter=1000,
                                          hidden_layer_sizes=h,
                                          activation=a,
                                          learning_rate_init=lr)
                    clf = OneVsOneClassifier(model)
                    pred_labels = clf.fit(
                        split_train_m,
                        split_train_truth).predict(validate_m).tolist()
                    # pred_strs = fnc_one_hot_label_decode(pred_labels)
                    pred_strs = [fnc_vect_to_label(d) for d in pred_labels]
                    test_strs = [fnc_vect_to_label(d) for d in validate_truth]
                    fnc_s = report_score(test_strs, pred_strs, verbose=False)
                    if fnc_s > highest_fnc:
                        highest_fnc = fnc_s
                        best_activation = a
                        best_h_size = h
                        best_lr = lr
    # train_truth = label_binarize(train_truth, classes=[0, 1, 2, 3])
    model = MLPClassifier(max_iter=1000,
                          hidden_layer_sizes=best_h_size,
                          activation=best_activation,
                          learning_rate_init=best_lr)
    clf = OneVsOneClassifier(model)
    pred_labels = clf.fit(train_m, train_truth).predict(test_m).tolist()
    pred_strs = [fnc_vect_to_label(d) for d in pred_labels]
    test_strs = [fnc_vect_to_label(d) for d in test_truth]
    print('NN results:')
    f1_w = f1_score(test_strs, pred_strs, average='weighted')
    f1_class = f1_score(test_strs, pred_strs, average=None)
    acc = accuracy_score(test_strs, pred_strs)
    print('NN f1 comp: ' + str(f1_w) + ', ' + str(f1_class))
    print('NN acc: ' + str(acc))
    return report_score(test_strs, pred_strs), f1_w, acc
Example #8
def print_scores(test, comp, id):
    pred_test = [s["Predict"] for s in test.get_labelled_stances() if s["Stance ID"] in id]
    actl_test = [s["Stance"]  for s in test.get_labelled_stances() if s["Stance ID"] in id]
    pred_comp = [s["Predict"] for s in comp.get_labelled_stances()]
    actl_comp = [s["Stance"]  for s in comp.get_labelled_stances()]

    print("Scores on the dev set")
    report_score(actl_test,pred_test)
    print("")
    print("")

    print("Scores on the test set")
    report_score(actl_comp,pred_comp)
Example #9
    def predict(self):
        print("test model")
        self.mlp.model.load_weights('Mingjie_Chen_weights.h5')
        result = self.mlp.model.predict(self.test_vec)
        labels = ['unrelated', 'disagree', 'discuss', 'agree']
        result_labels = [labels[np.argmax(n)] for n in result]
        actual = [stance['Stance'] for stance in test_data]
        # actual = [labels[np.argmax(n)] for n in test_y]
        n = 0
        for a, p in zip(result_labels, actual):
            if a == p:
                n += 1
        print("test accuracy is {}".format(n / len(result_labels)))

        report_score(actual, result_labels)
Example #10
    def mlp_predict(self,data):
        # load the MLP classifier
        mlp_clf = pickle.load(open("mlp_classifier", "rb"))
        #
        total_dev_data = len(data)
        folder = 'test_feature_vectors'
        file_path_list = os.listdir(folder)
        correct = 0
        print("Testing Set!")
        actual_list = []
        predicted_list = []
        for i, file_path in enumerate(file_path_list):
            file_name = os.path.basename(file_path)
            file_path = os.path.join(folder, file_path)
            stance = re.findall(r'stance#(.*)#', file_name)[0]
            feature_vector = np.load(file_path)
            feature_vector = feature_vector.reshape(1, -1)
            if (feature_vector == np.array(None)).all():
                print("feature_vector is None!")
                sys.exit(0)
            # print("file_path: {}".format(file_path))
            # print ("feature_vector: {}".format(feature_vector))
            clf_stance = mlp_clf.predict(feature_vector)
            if stance == clf_stance:
                correct += 1
            actual_list.append(stance)
            predicted_list.append(clf_stance)
            # print ("add correct, dev data id :{}".format(i))

        # save accuracy
        Accuracy = report_score(actual_list, predicted_list)
        self.mlp_result.append(Accuracy)
Example #11
File: mlf.py Project: xuefei1/MLFtest
def test_mlf(node,
             data,
             ols_limit,
             sensitivity,
             func,
             num_classes,
             verbose=True):
    start = timeit.default_timer()
    truth_labels = [t[class_label_idx] for t in data]
    truth_labels_str = [t[len(t) - 1] for t in data]
    predict_labels = []
    predict_labels_str = []
    for test_data in data:
        membership_vect = [0 for _ in range(num_classes)]
        test_val = test_data[sample_val_idx]
        label = mlf_classify(membership_vect, node, test_val, ols_limit,
                             sensitivity, func)
        predict_labels.append(label)
        predict_labels_str.append(fnc_vect_to_label(label))
    assert len(truth_labels) == len(predict_labels)
    classify_time = timeit.default_timer() - start
    if verbose:
        print('classify time: ' + str(classify_time))
    fnc_score = report_score(truth_labels_str,
                             predict_labels_str,
                             verbose=verbose)
    f1_weighted = f1_score(truth_labels, predict_labels, average='weighted')
    f1_class = f1_score(truth_labels, predict_labels, average=None)
    acc = accuracy_score(truth_labels, predict_labels)
    print('MLF f1 comp: ' + str(f1_weighted) + ', ' + str(f1_class))
    print('MLF acc: ' + str(acc))
    return fnc_score, predict_labels, f1_weighted, acc
Example #12
def evaluate_answer(model, model_inp, true):
    inv_category_dict = {
        0: 'unrelated',
        1: 'agree',
        2: 'disagree',
        3: 'discuss'
    }
    predicted = model.predict(model_inp)
    predicted = np.argmax(predicted, axis=1)
    t = np.argmax(true, axis=1)
    ground = list()
    pred = list()
    for i in predicted:
        pred.append(inv_category_dict[i])
    for i in t:
        ground.append(inv_category_dict[i])
    score.report_score(ground, pred)
Example #13
File: mlf.py Project: xuefei1/MLFtest
def run_LR_classifier(train_m,
                      train_truth,
                      test_m,
                      test_truth,
                      validate=False):
    print('running LR classifier, validate=' + str(validate))
    cs = [0.01, 0.1, 1.0, 10]
    regs = ['l2']
    highest_fnc = 0
    best_c = 0.1
    best_reg = 'l2'
    if validate:
        split_train_m, split_train_truth, validate_m, validate_truth = get_validation_mat(
            0.7, train_m, train_truth)
        for c in cs:
            for reg in regs:
                model = LogisticRegression(multi_class='multinomial',
                                           solver='lbfgs',
                                           C=c,
                                           penalty=reg)
                model.fit(split_train_m, split_train_truth)
                pred_labels = model.predict(validate_m)
                pred_strs = [fnc_vect_to_label(d) for d in pred_labels]
                test_strs = [fnc_vect_to_label(d) for d in validate_truth]
                fnc_s = report_score(test_strs, pred_strs, verbose=False)
                if fnc_s > highest_fnc:
                    highest_fnc = fnc_s
                    best_reg = reg
                    best_c = c
    model = LogisticRegression(multi_class='multinomial',
                               solver='lbfgs',
                               C=best_c,
                               penalty=best_reg)
    model.fit(train_m, train_truth)
    pred_labels = model.predict(test_m)
    pred_strs = [fnc_vect_to_label(d) for d in pred_labels]
    test_strs = [fnc_vect_to_label(d) for d in test_truth]
    print('LR results:')
    f1_w = f1_score(test_strs, pred_strs, average='weighted')
    f1_class = f1_score(test_strs, pred_strs, average=None)
    acc = accuracy_score(test_strs, pred_strs)
    print('LR f1 comp: ' + str(f1_w) + ', ' + str(f1_class))
    print('LR acc: ' + str(acc))
    return report_score(test_strs, pred_strs), f1_w, acc
Example #14
File: mlf.py Project: xuefei1/MLFtest
def run_NB_classifier(train_m,
                      train_truth,
                      test_m,
                      test_truth,
                      validate=False):
    print('running NB classifier, validate=' + str(validate))
    priors = [None, [0.5, 0.5]]
    best_prior = None
    highest_fnc = 0
    if validate:
        split_train_m, split_train_truth, validate_m, validate_truth = get_validation_mat(
            0.7, train_m, train_truth)
        # split_train_truth = label_binarize(split_train_truth, classes=[0, 1, 2, 3])
        for pr in priors:
            model = GaussianNB(priors=pr)
            clf = OneVsOneClassifier(model)
            pred_labels = clf.fit(
                split_train_m, split_train_truth).predict(validate_m).tolist()
            # pred_strs = fnc_one_hot_label_decode(pred_labels)
            pred_strs = [fnc_vect_to_label(d) for d in pred_labels]
            test_strs = [fnc_vect_to_label(d) for d in validate_truth]
            fnc_s = report_score(test_strs, pred_strs, verbose=False)
            if fnc_s > highest_fnc:
                highest_fnc = fnc_s
                best_prior = pr
    # train_truth = label_binarize(train_truth, classes=[0, 1, 2, 3])
    model = GaussianNB(priors=best_prior)
    clf = OneVsOneClassifier(model)
    pred_labels = clf.fit(train_m, train_truth).predict(test_m).tolist()
    pred_strs = [fnc_vect_to_label(d) for d in pred_labels]
    test_strs = [fnc_vect_to_label(d) for d in test_truth]
    print('NB results:')
    f1_w = f1_score(test_strs, pred_strs, average='weighted')
    f1_class = f1_score(test_strs, pred_strs, average=None)
    acc = accuracy_score(test_strs, pred_strs)
    print('NB f1 comp: ' + str(f1_w) + ', ' + str(f1_class))
    print('NB acc: ' + str(acc))
    return report_score(test_strs, pred_strs), f1_w, acc
Example #15
    def check_score_cm(self, session, body_path, headline_path, ans_path, dataset, num_samples=100, print_to_screen=False):
        
        logging.info("Calculating Score for %s examples in %s set..." % (str(num_samples) if num_samples != 0 else "all", dataset))
        
        tic = time.time()
        pred_list = []
        actual_list = []
        rem_count = num_samples

        # Note here we select discard_long=False because we want to sample from the entire dataset
        # That means we're truncating, rather than discarding, examples with too-long context or questions
        for batch in custom_get_batch_generator(self.word2id, body_path, headline_path, ans_path, self.FLAGS.batch_size, context_len=self.FLAGS.context_len, question_len=self.FLAGS.question_len, discard_long=False):

            pred_class = self.get_class(session, batch)
            pred_class = pred_class.tolist()
            
            predicted = [LABELS[int(a)] for a in pred_class]
            batch_size = batch.batch_size
            
            actual_reshaped = (batch.ans_span).reshape(batch_size,) #Reshape it to (batch_size,)
            actual = [LABELS[int(a)] for a in actual_reshaped]
            
            #score, cm = score_submission(actual, predicted)
            
            if num_samples == 0: # All entries in all batches // for dev set....
                pred_list = pred_list+predicted
                actual_list = actual_list+actual
            else:
                if rem_count>=batch_size:
                    pred_list = pred_list+predicted
                    actual_list = actual_list+actual
                    rem_count = rem_count-batch_size
                else:
                    pred_list = pred_list+predicted[:rem_count]
                    actual_list = actual_list+actual[:rem_count]
                    rem_count = 0
            
                if rem_count == 0:
                    break
        
        #HERE
        score = report_score(actual_list, pred_list)
        
        toc = time.time()
        logging.info("Calculating Score/CM for %i examples in %s set took %.2f seconds" % (num_samples, dataset, toc-tic))

        return score
Example #16
def train_model(lrmodel, X, Y, devX, devY, devscores, feat_train, feat_dev, train_tfidf, test_tfidf):
    """
    Train model, using pearsonr on dev for early stopping
    """
    done = False
    best = -1.0
    #r = np.arange(1,5)
    
    while not done:
        # Every 100 epochs, check Pearson on development set
        lrmodel.fit([X,feat_train,train_tfidf], Y, verbose=2, shuffle=False, nb_epoch = 3, validation_data=([devX,feat_dev,test_tfidf], devY))
        #yhat = np.dot(lrmodel.predict(devX, verbose=2), r)
        yhat = lrmodel.predict([devX,feat_dev,test_tfidf], verbose=2)
        yhat = [i.argmax()for i in yhat]
        
        string_predicted,test_stances = [],[]
    
        for i,j in zip(yhat,devscores):
            if i == 3:
                string_predicted.append('unrelated')
            if i == 0:
                string_predicted.append('agree')
            if i == 2:
                string_predicted.append('discuss')
            if i == 1:
                string_predicted.append('disagree')
            if j == 3:
                test_stances.append('unrelated')
            if j == 0:
                test_stances.append('agree')
            if j == 2:
                test_stances.append('discuss')
            if j == 1:
                test_stances.append('disagree')
        print 'using new limit value....'
        #score = accuracy_score(devscores, yhat)
        score = report_score(test_stances,string_predicted,val=True)
        #return lrmodel
    
        if score > best:
            print score
            best = score
            bestlrmodel = prepare_model(ninputs=X.shape[1],n_feats=feat_train.shape[1],n_tfidf=train_tfidf.shape[1])
            bestlrmodel.set_weights(lrmodel.get_weights())
        else:
            done = True
            print '***** best model obtained with score',best,'******'

    yhat = bestlrmodel.predict([devX, feat_dev, test_tfidf], verbose=2)
    yhat = [i.argmax()for i in yhat]
    string_predicted,test_stances = [],[]
    
    for i,j in zip(yhat,devscores):
        if i == 3:
            string_predicted.append('unrelated')
        if i == 0:
            string_predicted.append('agree')
        if i == 2:
            string_predicted.append('discuss')
        if i == 1:
            string_predicted.append('disagree')
        if j == 3:
            test_stances.append('unrelated')
        if j == 0:
            test_stances.append('agree')
        if j == 2:
            test_stances.append('discuss')
        if j == 1:
            test_stances.append('disagree')
            
    report_score(test_stances,string_predicted)
    return bestlrmodel
Example #17
def evaluate(encoder=None, seed=1234, evaltest=False, loc='./data/'):
    """
    Run experiment
    """
    print 'Preparing data for fnc...'
    
    #train, dev, test, scores = load_data(loc)
    #train[0], train[1], scores[0] = shuffle(train[0], train[1], scores[0], random_state=seed)

    '''
    trh, trb, tsh, tsb =\
                load_dataset("/fnc_data/train_stances.csv", "/fnc_data/train_bodies.csv",\
                             "/fnc_data/competition_test_stances.csv", "/fnc_data/test_bodies.csv")
   '''
    train_h = np.load('/fncdata2/encode_train_head.npy')
    train_b = np.load('/fncdata2/encode_train_body.npy')
    test_h = np.load('/fncdata2/encode_test_head.npy')
    test_b = np.load('/fncdata2/encode_test_body.npy')
    score_train = np.load('/fncdata2/score_train.npy')
    score_test = np.load('/fncdata2/score_test.npy')
    #train_b = big_mat
    #train_h, dev_h, train_b, dev_b, score_train, dev_score = split(np.array(train_h), train_b, score_train, 0.2)
 
    print 'loading training skipthoughts...'
    #trainA = encoder.encode(train_h, verbose=False, use_eos=True)
    #trainB = encoder.encode(train_b, verbose=False, use_eos=True)
    trainA = train_h
    trainB = train_b
    
    print 'Computing development skipthoughts...'
    #devA = encoder.encode(dev_h, verbose=False, use_eos=True)
    #devB = encoder.encode(dev_b, verbose=False, use_eos=True)
#    devA = dev_h
#    devB = dev_b
    devA = test_h
    devB = test_b
    dev_score = score_test

    print 'Computing feature combinations...'
    trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
    devF = np.c_[np.abs(devA - devB), devA * devB]

    print 'Encoding labels...'
    #trainY = encode_labels(train_labels)
    #devY = encode_labels(holdout_labels)
    trainY = to_categorical(score_train, 4)
    devY = to_categorical(dev_score, 4)
    
    train_Fx, test_Fx = load_features()
    #fmodel = generate_feature_model(train_Fx, score_train, test_Fx, dev_score, ninputs=len(train_Fx[0]))

    train_tfidf, test_tfidf = generate_tfidf()
    
    print 'Compiling model...'
    lrmodel = prepare_model(ninputs=trainF.shape[1],n_feats=train_Fx.shape[1],n_tfidf=train_tfidf.shape[1])

    print 'Training...'
    bestlrmodel = train_model(lrmodel, trainF, trainY, devF, devY, dev_score, train_Fx, test_Fx, train_tfidf, test_tfidf)
    
    if evaltest:
        print 'Loading test skipthoughts...'
        testA = test_h
        testB = test_b

        print 'Computing feature combinations...'
        testF = np.c_[np.abs(testA - testB), testA * testB]
        
        yhat = bestlrmodel.predict(testF, verbose=2)
        yhat = [i.argmax()for i in yhat]
        
        string_predicted,test_stances = [],[]
        
        for i,j in zip(yhat,score_test):
            if i == 3:
                string_predicted.append('unrelated')
            if i == 0:
                string_predicted.append('agree')
            if i == 2:
                string_predicted.append('discuss')
            if i == 1:
                string_predicted.append('disagree')
            if j == 3:
                test_stances.append('unrelated')
            if j == 0:
                test_stances.append('agree')
            if j == 2:
                test_stances.append('discuss')
            if j == 1:
                test_stances.append('disagree')
                
        report_score(test_stances,string_predicted)
        score = accuracy_score(score_test, yhat)
        print 'accuracy is ..',score
Example #18
#    if ypp[i][1] == 1:
#        a ='disagree'
#    if ypp[i][2] == 1:
#        a ='discuss'
#    if ypp[i][3] == 1:
#        a ='unrelated'
#    Pr.append(a)       
#        
#Ac = []   
#for i in range(len(ypp)):
#    if y[i][0] == 1:
#        a = 'agree'
#    if y[i][1] == 1:
#        a ='disagree'
#    if y[i][2] == 1:
#        a ='discuss'
#    if y[i][3] == 1:
#        a ='unrelated'
#    Ac.append(a)
#
#ac = np.array(Ac)
#pr = np.array(Pr)

aha = score.report_score(Ac,Pr)   
Example #19
        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score/max_fold_score

        print("Score for fold "+ str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf



    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    report_score(actual,predicted)
Example #20
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    print("Scores on the dev set")
    report_score(actual, predicted)
    print("")
    print("")

    #Run on competition dataset
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_competition)]
    actual = [LABELS[int(a)] for a in y_competition]

    print("Scores on the test set")
    report_score(actual, predicted)
Example #21
# if it is more than the threshold, use the MLP to make a further prediction
        elif simil >= thre:
            whole = [test_head + " " + test_body]
            vec = count_vect.transform(whole).toarray()
            appro.append(clf.predict(vec))


# return the predicted list
    return appro

if __name__ == '__main__':
    stoplist = stop_list("stop_list.txt")

    similar_matrix, headline, articles = similarity(training_data, stoplist)

    thre = train_threshold(training_data, similar_matrix, headline, articles)
    print("similarity threshold:", thre)

    clf, count_vect = train_related(training_data, stoplist)

    appro = test(test_data, similar_matrix, clf, count_vect)
    # get the actual value of test data
    actual = [stance['Stance'] for stance in test_data]
    count = 0
    for i in range(len(actual)):
        if actual[i] == appro[i]:
            count += 1
    print("accuracy:", count / len(actual))

    report_score(actual, appro)
Example #22
File: mlf.py Project: xuefei1/MLFtest
def run_rand_classifier(train_m, train_truth, test_m, test_truth):
    pred_strs = [fnc_vect_to_label(randint(0, 3)) for _ in test_truth]
    test_strs = [fnc_vect_to_label(d) for d in test_truth]
    print('Rand results:')
    return report_score(test_strs, pred_strs)
Example #23
        if GPU_sw:
            y_pred.extend(y_pred_.data.cpu().numpy())
            y_target.extend(target.data.cpu().numpy())
        else:
            y_pred.extend(y_pred_.data.numpy())
            y_target.extend(target.data.numpy())

    predicted = np.argmax(np.asarray(y_pred), axis=1)
    actual = np.asarray(y_target)

    predicted1 = [LABELS[int(a)] for a in predicted]
    actual1 = [LABELS[int(a)] for a in actual]

    print("Scores on the dev set")
    report_score(actual1, predicted1)
    print("")
    print("")

    X_competition = np.array(X_competition)
    y_competition = np.array(y_competition)

    testVal = X_competition.shape[0] % 100
    testVal = X_competition.shape[0] - testVal

    test = data_utils.TensorDataset(
        torch.from_numpy(X_competition[:testVal]).float(),
        torch.from_numpy(y_competition[:testVal]))
    test_loader = data_utils.DataLoader(test,
                                        batch_size=batch_size,
                                        shuffle=True)
Example #24
 def evaluate(self, model, X_val, y_val):
     # This should probably actually be in an evaluate method
     pred_related, pred_stance = model.predict(X_val)
     report_score([
         LABELS[np.where(x == 1)[0][0]] for x in y_val['stance_prediction']
     ], [LABELS[np.argmax(x)] for x in pred_stance])
Example #25
File: mlf.py Project: xuefei1/MLFtest
def run_DT_classifier(train_m,
                      train_truth,
                      test_m,
                      test_truth,
                      validate=False):
    print('running DT classifier, validate=' + str(validate))
    criterions = ['gini', 'entropy']
    splitters = ['best', 'random']
    max_features = [None, 0.5, 0.9, 'sqrt', 'log2']
    max_depths = [None, 5, 10, 50, 100]
    highest_fnc = 0
    best_depth = None
    best_criterion = 'gini'
    best_features = None
    best_splitter = 'best'
    if validate:
        split_train_m, split_train_truth, validate_m, validate_truth = get_validation_mat(
            0.7, train_m, train_truth)
        # split_train_truth = label_binarize(split_train_truth, classes=[0, 1, 2, 3])
        for c in criterions:
            for s in splitters:
                for f in max_features:
                    for d in max_depths:
                        model = DecisionTreeClassifier(criterion=c,
                                                       splitter=s,
                                                       max_features=f,
                                                       max_depth=d)
                        clf = OneVsOneClassifier(model)
                        pred_labels = clf.fit(
                            split_train_m,
                            split_train_truth).predict(validate_m).tolist()
                        # pred_strs = fnc_one_hot_label_decode(pred_labels)
                        pred_strs = [fnc_vect_to_label(d) for d in pred_labels]
                        test_strs = [
                            fnc_vect_to_label(d) for d in validate_truth
                        ]
                        fnc_s = report_score(test_strs,
                                             pred_strs,
                                             verbose=False)
                        if fnc_s > highest_fnc:
                            highest_fnc = fnc_s
                            best_criterion = c
                            best_splitter = s
                            best_features = f
                            best_depth = d
    # train_truth = label_binarize(train_truth, classes=[0, 1, 2, 3])
    model = DecisionTreeClassifier(criterion=best_criterion,
                                   splitter=best_splitter,
                                   max_features=best_features,
                                   max_depth=best_depth)
    clf = OneVsOneClassifier(model)
    pred_labels = clf.fit(train_m, train_truth).predict(test_m).tolist()
    pred_strs = [fnc_vect_to_label(d) for d in pred_labels]
    test_strs = [fnc_vect_to_label(d) for d in test_truth]
    print('DT results:')
    f1_w = f1_score(test_strs, pred_strs, average='weighted')
    f1_class = f1_score(test_strs, pred_strs, average=None)
    acc = accuracy_score(test_strs, pred_strs)
    print('DT f1 comp: ' + str(f1_w) + ', ' + str(f1_class))
    print('DT acc: ' + str(acc))
    return report_score(test_strs, pred_strs), f1_w, acc
Example #26
        actual = [LABELS[int(a)] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    print("Scores on the dev set")
    report_score(actual, predicted, type='dev')
    print("")
    print("")

    #Run on competition dataset
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_competition)]
    actual = [LABELS[int(a)] for a in y_competition]

    print("Scores on the test set")
    report_score(actual, predicted, type='test')

    print("Break down scores")
    detailed_score(actual, predicted)
Example #27
                                                 verbose=False)
                    clf.fit(X_train, y_train)
                    classifier = one_stage_classifier(clf)
                    final, actual, score = classifier.run(X_test, y_test_true)
                    print("Score for fold " + str(fold) + " was - " +
                          str(score))
                    if score > best_score:
                        best_score = score
                        best_fold1 = clf

                filename = model_dir + "_" + config.mode + "_" + cval[cval_ind]
                pickle.dump(best_fold1, open(filename, "wb"))

                classifier = one_stage_classifier(best_fold1)
                final, actual, score = classifier.run(X_holdout, y_holdout)
                report_score(actual, final)
                fscore.append(score)
                del best_fold1, best_score

            else:
                for fold in fold_stances:
                    ids = list(range(len(folds)))
                    del ids[fold]
                    X_stg1, X_stg2, y_stg1, y_stg2 = features.consolidated_features_cval(
                        cval_ind, fold, ids)
                    X_train_stg1 = X_stg1["train"]
                    X_test_stg1 = X_stg1["test"]
                    X_holdout_stg1 = X_stg1["holdout"]
                    y_test_true = y_stg1["true"]
                    y_train_stg1 = y_stg1["train"]
                    y_test_stg1 = y_stg1["test"]
Example #28
                    ]
                    init_pred_ind = [
                        i for i, e in enumerate(init_pred) if e == 0
                    ]

                    test = np.zeros((len(init_pred_ind), n_steps, n_input))
                    cc = 0
                    for test_ind in init_pred_ind:
                        test[cc, 0, :] = X_h_holdout["features"][test_ind]
                        test[cc, 0, :] = X_b_holdout["features"][test_ind]
                        cc += 1
                    predic = sess.run(pred,
                                      feed_dict={
                                          x: test,
                                          labels: to_one_hot(y_test)
                                      })
                    predic_lab = [LABELS[int(a)] for a in from_one_hot(predic)]
                    #actual = [LABELS[int(a)] for a in np.argmax(to_one_hot(y_test),1)]
                    for i, e in enumerate(init_pred_ind):
                        base_pred[e] = predic_lab[i]
                    print('confusion matrix')
                    report_score(base_act, base_pred)

                    fold_score, _ = score_submission(base_act, base_pred)
                    max_fold_score, _ = score_submission(base_act, base_act)
                    score = fold_score / max_fold_score
                    tune_score[l_tune, lr_tune, fold] = score

    print(np.amax(tune_score, axis=2))
    pickle.dump(tune_score, open("finetuning.p", "wb"))
Example #29
                                  train_co_occurance)
    dev_array = combine_featues(dev_vectors_array, dev_overlap_array,
                                tfidf_matrix_dev, dev_similarity_array,
                                dev_co_occurance)
    test_array = combine_featues(test_vectors_array, test_overlap_array,
                                 tfidf_matrix_test, test_similarity_array,
                                 test_co_occurance)

    ###############

    #B) Classifier:

    predicted = classifier(train_array, labels_train, test_array)

    ###############

    #C) Scoring:

    report_score(labels_test, predicted)

    ###############

    #D) Learning Curves:

    #plot_curve(train_array, labels_train)

    ###############

    #E) Execution time:
    print("--- %s seconds ---" % (time.time() - start_time))
Example #30
                best_score = score
                best_fold = clf
        pickle.dump(best_fold, open(params.gb_weights_file, 'wb'))

    best_fold = pickle.load(open(params.gb_weights_file, 'rb'))
    # Run on Holdout set and report the final score on the holdout set
    if params.run_2_class:
        predicted = [
            LABELS_RELATED[int(a)] for a in best_fold.predict(X_holdout)
        ]
        actual = [LABELS_RELATED[int(a)] for a in y_holdout]
    else:
        predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
        actual = [LABELS[int(a)] for a in y_holdout]
        print("Scores on the dev set")
        report_score(actual, predicted)
        print("")
        print("")

    #Run on competition dataset
    predicted_combined = []
    if params.run_2_class:
        predicted = [
            LABELS_RELATED[int(a)] for a in best_fold.predict(X_competition)
        ]
    else:
        predicted = [LABELS[int(a)] for a in best_fold.predict(X_competition)]

    predicted_combined = [
        a if a == "unrelated" else aD
        for a, aD in zip(predicted, dl_model_pred)
    ]
Example #31
def MLP_Classifier(row_body_train,
                   row_stance_train,
                   row_body_test,
                   row_stance_test,
                   head_dir_train,
                   body_dir_train,
                   label_dir_train,
                   cos_dir_train,
                   head_dir_test,
                   body_dir_test,
                   label_dir_test,
                   cos_dir_test,
                   learning_rate=0.001,
                   batch_size=188,
                   training_epoch=70,
                   init_bias=0.001,
                   mode='train',
                   save_model_path='../tf_model/tfidf_5000_epoch',
                   holdout=False):
    """

    :param row_body_train: path to the raw body train file
    :param row_stance_train: path to the raw stance train file
    :param row_body_test: path to the raw body test file
    :param row_stance_test: path to the raw stance test file
    :param head_dir_train: path to the head train pkl file
    :param body_dir_train: path to the body train pkl file
    :param label_dir_train: path to the y train label pkl file
    :param cos_dir_train: path to the cos_tfidf_train pkl file
    :param head_dir_test: path to the head test pkl file
    :param body_dir_test: path to the body test pkl file
    :param label_dir_test: path to the y test label pkl file
    :param cos_dir_test: path to the cos_tfidf_test pkl file
    :param learning_rate: learning rate
    :param batch_size: batch size
    :param training_epoch: number of training epochs
    :param init_bias: initial value for the biases
    :param mode: selects between the 'train' and 'test' modes
    :param save_model_path: path where the trained model is saved
    :return:
    """
    lr = learning_rate
    batch_size = batch_size
    training_epoch = training_epoch
    hidden = (362, 942, 1071, 870, 318, 912, 247)

    n_classes = 4
    if mode == 'train':
        X_train = make_tfidf_combined_feature_cos_5000_holdout(
            row_body_train, row_stance_train, row_body_test, row_stance_test,
            head_dir_train, body_dir_train, label_dir_train, cos_dir_train)
        y_train = load_tfidf_y(label_dir_train)
        n_input = X_train.shape[1]
    else:
        X_test = make_tfidf_combined_feature_cos_5000_holdout(
            row_body_test, row_stance_test, row_body_test, row_stance_test,
            head_dir_test, body_dir_test, label_dir_test, cos_dir_test)
        y_test = load_tfidf_y(label_dir_test)
        n_input = X_test.shape[1]

    LABELS = ['agree', 'disagree', 'discuss', 'unrelated']

    predictions_list = []
    actual_list = []

    graph = tf.Graph()
    with graph.as_default():

        X = tf.placeholder("float32", [None, n_input])
        Y = tf.placeholder("float32", [None, n_classes])
        learning_rate_tensor = tf.placeholder(tf.float32)
        momentum = tf.placeholder(tf.float32)

        layer1 = tf.nn.relu(
            tf.add(tf.matmul(X, weight_variable('w1', [n_input, hidden[0]])),
                   bias_variable('b1', [hidden[0]], init_bias)))
        layer2 = tf.nn.relu(
            tf.add(
                tf.matmul(layer1, weight_variable('w2',
                                                  [hidden[0], hidden[1]])),
                bias_variable('b2', [hidden[1]], init_bias)))

        layer3 = tf.nn.relu(
            tf.add(
                tf.matmul(layer2, weight_variable('w3',
                                                  [hidden[1], hidden[2]])),
                bias_variable('b3', [hidden[2]], init_bias)))

        layer4 = tf.nn.relu(
            tf.add(
                tf.matmul(layer3, weight_variable('w4',
                                                  [hidden[2], hidden[3]])),
                bias_variable('b4', [hidden[3]], init_bias)))

        layer5 = tf.nn.relu(
            tf.add(
                tf.matmul(layer4, weight_variable('w5',
                                                  [hidden[3], hidden[4]])),
                bias_variable('b5', [hidden[4]], init_bias)))

        layer6 = tf.nn.relu(
            tf.add(
                tf.matmul(layer5, weight_variable('w6',
                                                  [hidden[4], hidden[5]])),
                bias_variable('b6', [hidden[5]], init_bias)))

        layer7 = tf.nn.relu(
            tf.add(
                tf.matmul(layer6, weight_variable('w7',
                                                  [hidden[5], hidden[6]])),
                bias_variable('b7', [hidden[6]], init_bias)))
        logits = tf.add(
            tf.matmul(layer7, weight_variable('out_w',
                                              [hidden[6], n_classes])),
            bias_variable('out_b', [n_classes], init_bias))
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                       labels=Y))
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate_tensor).minimize(cost)

        predictions = tf.argmax(logits, 1)
        correct_prediction = tf.equal(predictions, tf.argmax(Y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        saver = tf.train.Saver(max_to_keep=10)

    model_path = save_model_path + str(training_epoch)

    tf.reset_default_graph()
    with tf.Session(graph=graph) as sess:
        sess.run(tf.global_variables_initializer())

        if mode == 'train':
            print('Learning Started!')
            calc_learning_rate = lr
            for epoch in range(training_epoch):
                print('epoch : ', epoch)
                print(len(X_train))
                momentum_start = 0.5
                momentum_end = 0.99
                i = 0
                calc_momentum = momentum_start + (float(
                    (momentum_end - momentum_start) / training_epoch) * epoch)

                if epoch > 0 and (epoch == 20 or epoch == 35 or epoch == 45):
                    calc_learning_rate = float(calc_learning_rate / 10.0)

                batch_acc = 0.0
                batch_cost = 0.0
                batch_count = 1
                # print(ep)
                while i < len(X_train):

                    start = i
                    end = i + batch_size
                    batch_x = np.array(X_train[start:end])
                    batch_y = np.array(y_train[start:end])

                    c, acc, _ = sess.run(
                        [cost, accuracy, optimizer],
                        feed_dict={
                            X: batch_x,
                            Y: batch_y,
                            learning_rate_tensor: calc_learning_rate,
                            momentum: calc_momentum
                        })
                    # print('cost : ', c, ' accuracy : ', acc)
                    i += batch_size
                    batch_acc += acc
                    batch_cost += c
                    if i % (batch_size * 90) == 0:
                        print(i, '/', len(X_train))
                        print(' batch acc : ', batch_acc / batch_count)
                        print(' batch cost : ', batch_cost / batch_count)

                    batch_count += 1

                if holdout:
                    i = 0
                    while i < len(X_test):
                        start = i
                        end = i + batch_size
                        batch_x = np.array(X_test[start:end])
                        batch_y = np.array(y_test[start:end])

                        c, acc, _ = sess.run(
                            [cost, accuracy, optimizer],
                            feed_dict={
                                X: batch_x,
                                Y: batch_y,
                                learning_rate_tensor: calc_learning_rate,
                                momentum: calc_momentum
                            })
                        # print('cost : ', c, ' accuracy : ', acc)
                        i += batch_size
                        batch_acc += acc
                        batch_cost += c
                        if i % (batch_size * 90) == 0:
                            print(i, '/', len(X_test))
                            print(' batch acc : ', batch_acc / batch_count)
                            print(' batch cost : ', batch_cost / batch_count)
                        batch_count += 1

            saver.save(sess, model_path)
            print('Training Finished!')

        if mode == 'test':
            saver.restore(sess, model_path)
            print('model load finish!')

            pred, acc = sess.run([predictions, accuracy],
                                 feed_dict={
                                     X: X_test,
                                     Y: y_test
                                 })
            print('pred :', pred, ', acc :', acc)
            report_score([LABELS[e] for e in np.argmax(y_test, 1)],
                         [LABELS[e] for e in pred])