Exemplo n.º 1
0
def variance_plot_cv(claim_ids, target, rootdist_matrix, tf_matrix, questionmark, fold_range=range(2, 30), regularization='l2'):
    """Plot cross-validation scores and their variances per fold count.

    The root-distance, question-mark, PPDB-alignment and term-frequency
    features are stacked column-wise into one sparse matrix, which is first
    handed to ``plot_2D_data``. ``logistic_regression_var`` is then run once
    for every fold count in ``fold_range`` using folds produced by
    ``cv_fold_generator(claim_ids, n_folds)``.

    Two figures are shown: columns 0-3 of the collected results (labelled
    Accuracy, F1-Score, Recall, Precision) and columns 4-7 (labelled as the
    variances of the same metrics).

    Args:
        claim_ids: Identifiers passed to ``cv_fold_generator``.
        target: Labels passed to the plotting and classification helpers.
        rootdist_matrix: Root-distance features; converted to CSR.
        tf_matrix: Term-frequency features, stacked as given.
        questionmark: Question-mark features, stacked as given.
        fold_range: Fold counts to evaluate; also used as the x-axis values.
        regularization: Forwarded to ``logistic_regression_var``.
    """
    combined_all = sparse.hstack((
        sparse.csr_matrix(rootdist_matrix),
        questionmark,
        sparse.csr_matrix(get_ppdb_alignment_feature()),
        tf_matrix,
    ))
    plot_2D_data(combined_all, target)

    per_fold_scores = []
    for n_folds in fold_range:
        print(n_folds)  # progress indicator: the fold count being evaluated
        fold_split = cv_fold_generator(claim_ids, n_folds)
        per_fold_scores.append(
            logistic_regression_var(combined_all, target, fold_split, regularization, 10000)
        )

    results_arr = np.array(per_fold_scores)
    metric_names = ('Accuracy', 'F1-Score', 'Recall', 'Precision')

    # First figure: the scores (columns 0-3); second: their variances (4-7).
    for label_suffix, first_column in (('', 0), (' var.', 4)):
        for column, metric in enumerate(metric_names, start=first_column):
            plt.plot(fold_range, results_arr[:, column], label=metric + label_suffix)
        plt.legend()
        plt.show()
Exemplo n.º 2
0
def grid_search_bow_custom_fold(data_h, target, ids, questionmark_features, folds=10, do_custom_folds=True):
    """Grid-search bag-of-words settings and print the ranked results.

    Every combination of n-gram range and ``max_features`` (80..94) is used
    to build a ``BoW`` representation of ``data_h``, the question-mark feature
    is appended via ``add_question_mark_feature``, and the result is scored
    with ``logistic_regression`` (L2 regularization). The collected
    ``[score, ngram_range, max_features]`` rows are printed sorted by score,
    best first. Nothing is returned.

    Args:
        data_h: Input passed to ``BoW.fit``.
        target: Labels passed to the classifier and plotting helper.
        ids: Identifiers passed to ``cv_fold_generator``.
        questionmark_features: Feature appended to every BoW matrix.
        folds: Fold count; given to ``cv_fold_generator`` and, when
            ``do_custom_folds`` is false, directly to ``logistic_regression``.
        do_custom_folds: Use the generated custom folds when true.
    """
    ngram_range = [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)]
    max_features = range(80, 95)
    custom_folds = cv_fold_generator(ids, folds)
    cv = custom_folds if do_custom_folds else folds
    regularization = 'l2'

    # Flatten the two-level grid so progress is simply position / size.
    grid = [(ngrams, n_feats) for ngrams in ngram_range for n_feats in max_features]

    res = []
    for done, (ngrams, n_feats) in enumerate(grid):
        print(done / len(grid))
        bow = BoW(ngram_range=ngrams, max_features=n_feats, stop_words=None)
        x = bow.fit(data_h)

        # Visualise a single configuration of the grid only.
        if (ngrams, n_feats) == ((1, 2), 90):
            plot_2D_data(x, target)

        combined = add_question_mark_feature(x, questionmark_features)
        score = logistic_regression(combined, target, cv, regularization)
        res.append([score, ngrams, n_feats])

    print(sorted(res, key=lambda row: row[0], reverse=True))
Exemplo n.º 3
0
def combined_crossval(claim_ids, target, rootdist_matrix, tf_matrix, questionmark, folds=7, do_custom_folds=True):
    """Cross-validate several classifiers on the combined feature matrix.

    Stacks root-distance, question-mark, PPDB-alignment and term-frequency
    features column-wise, plots them with ``plot_2D_data``, then prints the
    results of four logistic-regression variants (L1/L2 x ovr/multinomial),
    runs ``svm_crossval_grid`` and prints the ``naive_bayes`` result (which
    receives the densified matrix).

    Args:
        claim_ids: Identifiers passed to ``cv_fold_generator``.
        target: Labels passed to every helper.
        rootdist_matrix: Root-distance features; converted to CSR.
        tf_matrix: Term-frequency features, stacked as given.
        questionmark: Question-mark features, stacked as given.
        folds: Fold count; used for fold generation and, when
            ``do_custom_folds`` is false, passed directly to the classifiers.
        do_custom_folds: Use the generated custom folds when true.
    """
    custom_folds = cv_fold_generator(claim_ids, folds)

    combined_all = sparse.hstack((
        sparse.csr_matrix(rootdist_matrix),
        questionmark,
        sparse.csr_matrix(get_ppdb_alignment_feature()),
        tf_matrix,
    ))
    plot_2D_data(combined_all, target)

    cv = custom_folds if do_custom_folds else folds

    print("Classifier: ", '[accuracy,', 'f1_macro,', 'recall_macro,', 'precision_macro]')

    # (printed heading, regularization, multi-class mode) for each run.
    logreg_runs = (
        ("Logistic regression ovr L1: ", 'l1', 'ovr'),
        ("Logistic regression ovr L2: ", 'l2', 'ovr'),
        ("Logistic regression multiclass L1: ", 'l1', 'multinomial'),
        ("Logistic regression multiclass L2: ", 'l2', 'multinomial'),
    )
    for heading, penalty, multi_class in logreg_runs:
        print(heading, logistic_regression(combined_all, target, cv, penalty, 1000000, multi_class))

    print("SVM Cross-validation")
    svm_crossval_grid(combined_all, target, cv)
    print("Naive Bayes: ", naive_bayes(combined_all.toarray(), target, cv))
Exemplo n.º 4
0
def questionmark_only(claim_ids, target, questionmark, folds=5, do_custom_folds=True, regularization='l2'):
    """Print logistic-regression results using only the question-mark feature.

    Args:
        claim_ids: Identifiers passed to ``cv_fold_generator``.
        target: Labels passed to ``logistic_regression``.
        questionmark: The feature matrix to classify on.
        folds: Fold count; used for fold generation and, when
            ``do_custom_folds`` is false, passed directly to the classifier.
        do_custom_folds: Use the generated custom folds when true.
        regularization: Forwarded to ``logistic_regression``.
    """
    custom_folds = cv_fold_generator(claim_ids, folds)
    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
    cv = custom_folds if do_custom_folds else folds
    print(logistic_regression(questionmark, target, cv, regularization, 1000000))
Exemplo n.º 5
0
def bow_rootdist(claim_ids, target, rootdist_matrix, tf_matrix, folds=5, do_custom_folds=True, regularization='l2'):
    """Print logistic-regression results for root-distance + term-frequency features.

    The root-distance matrix is converted to CSR, stacked column-wise with
    ``tf_matrix``, plotted with ``plot_2D_data`` and then scored with
    ``logistic_regression``.

    Args:
        claim_ids: Identifiers passed to ``cv_fold_generator``.
        target: Labels passed to the plotting and classification helpers.
        rootdist_matrix: Root-distance features; converted to CSR.
        tf_matrix: Term-frequency features, stacked as given.
        folds: Fold count; used for fold generation and, when
            ``do_custom_folds`` is false, passed directly to the classifier.
        do_custom_folds: Use the generated custom folds when true.
        regularization: Forwarded to ``logistic_regression``.
    """
    custom_folds = cv_fold_generator(claim_ids, folds)
    combined_all = sparse.hstack((sparse.csr_matrix(rootdist_matrix), tf_matrix))
    plot_2D_data(combined_all, target)

    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
    cv = custom_folds if do_custom_folds else folds
    print(logistic_regression(combined_all, target, cv, regularization, 1000000))
Exemplo n.º 6
0
def crossval_grid_search(target,
                         ids,
                         min_rootdist=1,
                         max_rootdist=200,
                         step=1,
                         ppdb=None,
                         questionmark_features=None,
                         bow=None,
                         folds=10):
    """Grid-search the default root-distance score with cross-validation.

    For every value in ``range(min_rootdist, max_rootdist + 1, step)`` the
    root-distance matrix is rebuilt with ``get_rootdist_matrix``, stacked
    column-wise with whichever optional feature blocks were supplied, and
    scored with ``logistic_regression`` (L2 regularization) on folds from
    ``cv_fold_generator(ids, folds)``. The best value per metric is printed
    and all four metric curves are plotted against the grid values.

    Args:
        target: Labels passed to ``logistic_regression``.
        ids: Identifiers passed to ``cv_fold_generator``.
        min_rootdist: First grid value (inclusive).
        max_rootdist: Last grid value (inclusive).
        step: Grid step.
        ppdb: Optional feature block; left out of the stack when ``None``.
        questionmark_features: Optional feature block; left out when ``None``.
        bow: Optional feature block; left out when ``None``.
        folds: Fold count passed to ``cv_fold_generator``.

    Returns:
        A list of ``[scores, grid_value]`` pairs, one per grid value, where
        ``scores`` is whatever ``logistic_regression`` returned (its first
        four entries are read as accuracy, F1, recall and precision).
    """
    default_score = range(min_rootdist, max_rootdist + 1, step)
    custom_folds = cv_fold_generator(ids, folds)
    regularization = 'l2'

    # The optional blocks default to None; only stack the ones actually
    # provided so the default call does not hand None to sparse.hstack.
    # Stacking order matches the original: questionmark, bow, ppdb.
    optional_blocks = [
        block for block in (questionmark_features, bow, ppdb)
        if block is not None
    ]

    res = []
    for count, i in enumerate(default_score):
        data = sparse.csc_matrix(get_rootdist_matrix(i))
        print("At ", round((count * 100.0) / (len(default_score)), 2), "%")
        combined = sparse.hstack([data] + optional_blocks)
        res.append([
            logistic_regression(combined, target, custom_folds,
                                regularization), i
        ])

    # (name used in the printed summary, legend label), in the order the
    # metrics appear in each logistic_regression result.
    metrics = (
        ("acc", 'Accuracy'),
        ("f1", 'F1-Score'),
        ("recall", 'Recall'),
        ("precision", 'Precision'),
    )
    # One (n_grid_values, 2) table per metric: column 0 is the score,
    # column 1 the grid value that produced it.
    tables = [
        np.asarray([[entry[0][column], entry[1]] for entry in res])
        for column in range(len(metrics))
    ]

    for (short_name, _), table in zip(metrics, tables):
        best_row = np.argmax(table[:, 0])
        print("Max " + short_name + " without question at default_dist: ",
              table[best_row, 1], " ", np.max(table[:, 0]))

    for (_, legend_label), table in zip(metrics, tables):
        plt.plot(table[:, 1], table[:, 0], label=legend_label)
    plt.legend()
    plt.xlabel("Default rootdist score")
    # Four different metrics share this axis, so label it generically.
    plt.ylabel("Score")
    plt.show()

    return res
Exemplo n.º 7
0
def crossval_rootdist(data,
                      target,
                      ids,
                      questionmark_features=None,
                      folds=10,
                      do_custom_folds=True):
    """Print logistic-regression results for the root-distance features.

    ``data`` is converted to a CSR matrix and, when ``questionmark_features``
    is given, extended with it via ``add_question_mark_feature`` before being
    scored with ``logistic_regression``.

    Args:
        data: Root-distance features; converted to CSR.
        target: Labels passed to ``logistic_regression``.
        ids: Identifiers passed to ``cv_fold_generator``.
        questionmark_features: Optional extra feature; skipped when ``None``.
        folds: Fold count; used for fold generation and, when
            ``do_custom_folds`` is false, passed directly to the classifier.
        do_custom_folds: Use the generated custom folds when true.
    """
    custom_folds = cv_fold_generator(ids, folds)

    combined = sparse.csr_matrix(data)
    if questionmark_features is not None:
        combined = add_question_mark_feature(combined, questionmark_features)

    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
    cv = custom_folds if do_custom_folds else folds
    print(logistic_regression(combined, target, cv))