Example #1
from os import listdir
from os.path import dirname, join


def run_baselines():
    ROOT_DIR = dirname(dirname(__file__))

    gold_data_folder = join(ROOT_DIR, 'data/training/')

    all_debates = [
        join(gold_data_folder, debate_name)
        for debate_name in listdir(gold_data_folder)
    ]
    all_debates.sort()
    train_debates = all_debates[:-1]
    test_debate = all_debates[-1]

    random_baseline_fpath = join(ROOT_DIR,
                                 'baselines/data/task1_random_baseline.tsv')
    run_random_baseline(test_debate, random_baseline_fpath)
    if check_format(random_baseline_fpath):
        thresholds, precisions, avg_precision, reciprocal_rank, num_relevant = evaluate(
            test_debate, random_baseline_fpath)
        print("Random Baseline AVGP:", avg_precision)

    ngram_baseline_fpath = join(ROOT_DIR,
                                'baselines/data/task1_ngram_baseline.tsv')
    run_ngram_baseline(train_debates, test_debate, ngram_baseline_fpath)
    if check_format(ngram_baseline_fpath):
        thresholds, precisions, avg_precision, reciprocal_rank, num_relevant = evaluate(
            test_debate, ngram_baseline_fpath)
        print("Ngram Baseline AVGP:", avg_precision)
Example #2
import os
from os import listdir
from os.path import join

import numpy as np


def run_baselines():
    # ROOT_DIR, run_random_baseline, check_format and evaluate are defined at
    # module level in the original script.
    gold_data_folder = join(ROOT_DIR, 'data/training/')
    all_debates = [join(gold_data_folder, debate_name) for debate_name in listdir(gold_data_folder)]
    all_debates.sort()

    # 80/20 split of the debates into training and development sets.
    n_train = int(0.8 * len(all_debates))
    train_debates = all_debates[:n_train]
    dev_debates = all_debates[n_train:]

    run_random_baseline(dev_debates)
    avg_precisions = []
    for test_debate in dev_debates:
        random_baseline_fpath = join(ROOT_DIR, 'baselines/data/task5_random_baseline_%s' % (os.path.basename(test_debate)))
        if check_format(random_baseline_fpath):
            thresholds, precisions, avg_precision, reciprocal_rank, num_relevant = evaluate(test_debate, random_baseline_fpath)
            avg_precisions.append(avg_precision)
    print("Random Baseline AVGP:", np.mean(avg_precisions))

    run_ngram_baseline(train_debates, dev_debates)
    avg_precisions = []
    for test_debate in dev_debates:
        ngram_baseline_fpath = join(ROOT_DIR, 'baselines/data/task5_ngram_baseline_%s' % (os.path.basename(test_debate)))
        if check_format(ngram_baseline_fpath):
            thresholds, precisions, avg_precision, reciprocal_rank, num_relevant = evaluate(test_debate, ngram_baseline_fpath)
            avg_precisions.append(avg_precision)
    print("Ngram Baseline AVGP:", np.mean(avg_precisions))
Example #3
import os
from os.path import join


def run_baselines():
    train_fpath = join(ROOT_DIR, 'data/training.tsv')
    test_fpath = join(ROOT_DIR, 'data/dev.tsv')

    run_random_baseline(test_fpath)
    random_baseline_fpath = join(
        ROOT_DIR, 'baselines/data/task1_random_baseline_%s' %
        (os.path.basename(test_fpath)))
    if check_format(random_baseline_fpath):
        thresholds, precisions, avg_precision, reciprocal_rank, num_relevant = evaluate(
            test_fpath, random_baseline_fpath)
        # Print inside the check: avg_precision is only defined when the
        # submission file is valid.
        print("Random Baseline AVGP:", avg_precision)

    run_ngram_baseline(train_fpath, test_fpath)
    ngram_baseline_fpath = join(
        ROOT_DIR, 'baselines/data/task1_ngram_baseline_%s' %
        (os.path.basename(test_fpath)))
    if check_format(ngram_baseline_fpath):
        thresholds, precisions, avg_precision, reciprocal_rank, num_relevant = evaluate(
            test_fpath, ngram_baseline_fpath)
        print("Ngram Baseline AVGP:", avg_precision)
Example #4
import numpy as np
from sklearn import decomposition, metrics, svm


def get_best_svm_model(feature_vector_train, label, feature_vector_valid):
    # Relies on module-level names from the original script: val_y, valDF,
    # my_loc, args and evaluate().
    param_grid = [{'kernel': 'linear', 'C': np.logspace(-3, 3, 20), 'gamma': [1]},
                  {'kernel': 'rbf', 'C': np.logspace(-3, 3, 20),
                   'gamma': np.logspace(-3, 3, 20)}]

    # Variance fractions to keep when trying PCA; 1.0 means no PCA.
    pca_list = [1.0, 0.99, 0.98, 0.97, 0.96, 0.95]
    best_acc = 0.0
    best_model = None
    best_prec = 0.0
    best_pca_nk = 0
    # Keep untransformed copies so every PCA setting starts from the raw features.
    temp_xtrain = feature_vector_train
    temp_xval = feature_vector_valid
    for pca_nk in pca_list:
        if pca_nk != 1.0:
            pca = decomposition.PCA(n_components=pca_nk).fit(temp_xtrain)
            feature_vector_train = pca.transform(temp_xtrain)
            feature_vector_valid = pca.transform(temp_xval)
        for params in param_grid:
            for C in params['C']:
                for gamma in params['gamma']:
                    # Model with different parameters
                    model = svm.SVC(C=C, gamma=gamma, kernel=params['kernel'], random_state=42, class_weight='balanced')

                    # fit the training dataset on the classifier
                    model.fit(feature_vector_train, label)

                    # predict the labels on validation dataset
                    predictions = model.predict(feature_vector_valid)

                    # accuracy_score expects (y_true, y_pred)
                    acc = metrics.accuracy_score(val_y, predictions)

                    predicted_distance = model.decision_function(feature_vector_valid)
                    results_fpath = my_loc + '/results/temp5_%d.tsv' % (args.out_file)
                    with open(results_fpath, "w") as results_file:
                        for i, line in valDF.iterrows():
                            dist = predicted_distance[i]
                            results_file.write("{}\t{}\t{}\t{}\n".format('covid-19', line['tweet_id'],
                                                                         dist, "w2v_pos"))

                    _, _, avg_precision, _, _ = evaluate('data/dev.tsv', results_fpath)

                    # Keep the model that improves (rounded) average precision
                    # without hurting (rounded) accuracy.
                    if round(avg_precision, 4) >= round(best_prec, 4) and round(acc, 2) >= round(best_acc, 2):
                        best_prec = avg_precision
                        best_acc = acc
                        best_model = model
                        best_pca_nk = pca_nk

    return best_acc, best_pca_nk, best_model
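A hedged sketch of a call site for the function above. The names ft_train, train_y and ft_val are illustrative, and val_y, valDF, my_loc and args must already exist as module-level globals for the function to run. Examples #5 and #7 below continue from exactly this point, using the returned accuracy, best_pca_nk and classifier.

# Hypothetical call site; the globals the function reads must be set up first.
accuracy, best_pca_nk, classifier = get_best_svm_model(ft_train, train_y, ft_val)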
Example #5
    # Fragment from a larger script: best_pca_nk, ft_train, ft_val, classifier,
    # accuracy, wordmod, valDF, my_loc, args and all_res all come from the
    # enclosing scope (see Example #4 for how they are produced).
    if best_pca_nk != 1.0:
        # Re-fit PCA with the best setting and project the validation features.
        pca = decomposition.PCA(n_components=best_pca_nk).fit(ft_train)
        ft_val = pca.transform(ft_val)

    print("SVM, %s+PoS Accuracy: %.3f"%(wordmod, round(accuracy,3)))
    print("PCA No. Components: %.2f, Dim: %d"%(best_pca_nk, ft_val.shape[1]))
    print("C: %.3f, Gamma: %.3f, kernel: %s"%(classifier.C, classifier.gamma, classifier.kernel))

    predicted_distance = classifier.decision_function(ft_val)
    results_fpath = my_loc+'/results/task1_%s_pos_svm_dev_%d.tsv'%(wordmod, args.out_file)
    with open(results_fpath, "w") as results_file:
        for i, line in valDF.iterrows():
            dist = predicted_distance[i]
            results_file.write("{}\t{}\t{}\t{}\n".format('covid-19', line['tweet_id'],
                dist, wordmod+'_pos'))

    thresholds, precisions, avg_precision, reciprocal_rank, num_relevant = evaluate('data/dev.tsv', results_fpath)
    print("%s+PoS SVM AVGP: %.4f\n"%(wordmod, round(avg_precision,4)))

    all_res.append([round(accuracy,3), round(avg_precision,4), best_pca_nk, ft_train.shape[1], ft_val.shape[1]])


with open(my_loc + '/file_results/w2v_pos_%d.txt' % (args.out_file), 'a+') as f:
    for res in all_res:
        f.write("%.3f,%.4f,%.2f,%d,%d\n" % (res[0], res[1], res[2], res[3], res[4]))

    f.write('\n\n')


Example #6
import numpy as np
from sklearn import decomposition


def get_best_svm_model(feature_vector_train, label, feature_vector_valid,
                       fname, emb_type):
    # Relies on module-level names: val_y, valDF, my_loc, args, evaluate()
    # and an SVC class (see the note after this example).
    # param_grid = [{'kernel':'linear', 'C': np.logspace(-2, 2, 10), 'gamma': [1]},
    #               {'kernel':'rbf', 'C': np.logspace(-2, 2, 10),
    #               'gamma': np.logspace(-2, 2, 10)}]
    param_grid = [{
        'kernel': 'rbf',
        'C': np.logspace(-3, 3, 30),
        'gamma': np.logspace(-3, 3, 30)
    }]

    pca_list = [1.0, 0.99, 0.98, 0.97, 0.96, 0.95]
    best_acc = 0.0
    best_model = None
    best_prec = 0.0
    best_pca_nk = 0
    temp_xtrain = feature_vector_train
    temp_xval = feature_vector_valid
    for pca_nk in pca_list:
        print(pca_nk)  # trace the PCA setting currently being searched
        if pca_nk != 1.0:
            pca = decomposition.PCA(n_components=pca_nk).fit(temp_xtrain)
            feature_vector_train = pca.transform(temp_xtrain)
            feature_vector_valid = pca.transform(temp_xval)

        for params in param_grid:
            for C in params['C']:
                for gamma in params['gamma']:
                    # Model with different parameters
                    model = SVC(C=C,
                                gamma=gamma,
                                kernel=params['kernel'],
                                random_state=42,
                                class_weight='balanced',
                                gpu_id=args.gpu_id)

                    # fit the training dataset on the classifier
                    model.fit(feature_vector_train, label)

                    # predict the acc on validation dataset
                    acc = model.score(feature_vector_valid, val_y)

                    predicted_distance = model.decision_function(
                        feature_vector_valid)
                    results_fpath = my_loc + '/results/bert_word_posdep_%s_%s_svm_norm%d.tsv' % (
                        fname, emb_type, args.normalize)
                    with open(results_fpath, "w") as results_file:
                        for i, line in valDF.iterrows():
                            dist = predicted_distance[i][0]
                            results_file.write("{}\t{}\t{}\t{}\n".format(
                                'covid-19', line['tweet_id'], dist,
                                "bert_wd_posdep"))

                    _, _, avg_precision, _, _ = evaluate(
                        'data/dev.tsv', results_fpath)

                    if (round(avg_precision, 4) >= round(best_prec, 4)
                            and round(acc, 2) >= round(best_acc, 2)):
                        best_prec = avg_precision
                        best_acc = acc
                        best_model = model
                        best_pca_nk = pca_nk

    return best_acc, best_pca_nk, best_model
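Note that SVC here accepts a gpu_id argument, and Example #7 calls save_to_file on the resulting classifier; that matches ThunderSVM's scikit-learn-style interface rather than sklearn.svm.SVC. The import is not shown in the snippet, so the line below is an assumption about where SVC comes from.

# Assumed import (not shown in the snippet): ThunderSVM's GPU-backed SVC.
from thundersvm import SVC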
Example #7
    # Fragment from a larger script: fname, emb_type, accuracy, best_pca_nk,
    # classifier, ft_train, ft_val, valDF, my_loc, args and all_res come from
    # the enclosing scope (see Example #6 for how they are produced).
    print("SVM, %s, %s Accuracy: %.3f" % (fname, emb_type, round(accuracy, 3)))
    print("PCA No. Components: %.2f, Dim: %d" % (best_pca_nk, ft_val.shape[1]))
    print("C: %.3f, Gamma: %.3f, kernel: %s" %
          (classifier.C, classifier.gamma, classifier.kernel))

    predicted_distance = classifier.decision_function(ft_val)
    results_fpath = my_loc + '/results/bert_word_posdep_%s_%s_svm_norm%d.tsv' % (
        fname, emb_type, args.normalize)
    with open(results_fpath, "w") as results_file:
        for i, line in valDF.iterrows():
            dist = predicted_distance[i][0]
            results_file.write("{}\t{}\t{}\t{}\n".format(
                'covid-19', line['tweet_id'], dist, 'bert_wd_posdep'))

    _, _, avg_precision, _, _ = evaluate('data/dev.tsv', results_fpath)
    print("%s, %s SVM AVGP: %.4f\n" %
          (fname, emb_type, round(avg_precision, 4)))

    pickle.dump({'best_pca': best_pca_nk},
                open(
                    my_loc + '/models/' + fname + '_' + emb_type +
                    '_posdep_norm%s.pkl' % (args.normalize), 'wb'))
    classifier.save_to_file(my_loc + '/models/' + fname + '_' + emb_type +
                            '_posdep_norm%s.dt' % (args.normalize))

    all_res.append([
        emb_type,
        round(accuracy, 3),
        round(avg_precision, 4), best_pca_nk, ft_train.shape[1],
        ft_val.shape[1]
    ])