Exemplo n.º 1
0
def _fit_and_score_or_empty(make_model, x_train, y_train, x_test, y_test,
                            failure_message):
    """Build, fit and score an estimator; report and return ``[]`` on failure.

    Args:
        make_model: zero-argument callable returning an unfitted estimator
            that exposes ``fit(X, y)`` and ``score(X, y)``.
        x_train, y_train: data passed to ``fit``.
        x_test, y_test: data passed to ``score``.
        failure_message: text printed when any step raises.

    Returns:
        Whatever ``score`` returns, or an empty list if construction,
        fitting or scoring raised an ``Exception``.
    """
    try:
        # The estimator is built inside the ``try`` so that constructor
        # errors are handled the same way as fit/score errors.
        model = make_model()
        model.fit(x_train, y_train)
        return model.score(x_test, y_test)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still propagate.
        # Single-argument ``print(...)`` behaves identically on Python 2 and 3.
        print(failure_message)
        return []


def run_methods(x_c, y, x_e, z_c, z_y, z_e):
    """Fit every baseline and generative model and score each one on ``z``.

    The labelled features ``x_c`` and ``x_e`` are concatenated column-wise
    into ``x``; ``z_c`` and ``z_e`` are concatenated the same way into ``z``.
    Transductive methods are fitted on ``x`` and ``z`` stacked together, with
    the ``z`` rows given the label ``-1``.

    Args:
        x_c, x_e: 2-D feature blocks of the labelled points (same number of
            rows). Presumably the "cause" and "effect" features -- confirm
            against the caller.
        y: labels of the labelled points. Must be a 2-D column, because it is
            stacked with an ``(n_z, 1)`` array of ``-1`` below.
        z_c, z_e: 2-D feature blocks of the unlabelled points, laid out like
            ``x_c`` / ``x_e``.
        z_y: true labels of the unlabelled points; used only for scoring.

    Returns:
        An 11-tuple of accuracies, in this order: linear logistic regression,
        linear TSVM, RBF TSVM, RBF label propagation, RBF label spreading,
        k-NN label propagation, k-NN label spreading, semi-generative model
        (labelled data only), soft-label EM, hard-label EM, conditional label
        propagation. Each of the four label propagation / spreading entries
        is ``[]`` instead of a number when that method raised.
    """
    x = np.concatenate((x_c, x_e), axis=1)
    z = np.concatenate((z_c, z_e), axis=1)

    # Baseline: Linear Logistic Regression (labelled data only)
    lin_lr = LogisticRegression(random_state=0,
                                solver='liblinear').fit(x, y.ravel())
    acc_lin_lr = lin_lr.score(z, z_y)

    # TRANSDUCTIVE APPROACHES
    # merge labelled and unlabelled data (with label -1) for transductive methods
    x_merged = np.concatenate((x, z))
    y_merged = np.concatenate((y, -1 * np.ones(
        (z.shape[0], 1)))).ravel().astype(int)

    # Baseline: Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    lin_tsvm = SKTSVM(kernel='linear')
    lin_tsvm.fit(x_merged, y_merged)
    acc_lin_tsvm = lin_tsvm.score(z, z_y)

    # Baseline: Non-Linear TSVM:  https://github.com/tmadl/semisup-learn/tree/master/methods
    rbf_tsvm = SKTSVM(kernel='RBF')
    rbf_tsvm.fit(x_merged, y_merged)
    acc_rbf_tsvm = rbf_tsvm.score(z, z_y)

    # The graph-based baselines are best-effort: a failure is reported and
    # recorded as [] rather than aborting the whole comparison.

    # Baseline: Label Propagation RBF weights
    acc_rbf_label_prop = _fit_and_score_or_empty(
        lambda: LabelPropagation(kernel='rbf'),
        x_merged, y_merged, z, z_y,
        'rbf label prop did not work')

    # Baseline: Label Spreading with RBF weights
    # (the trailing space in the message is kept from the original output)
    acc_rbf_label_spread = _fit_and_score_or_empty(
        lambda: LabelSpreading(kernel='rbf'),
        x_merged, y_merged, z, z_y,
        'rbf label spread did not work ')

    # THE K-NN VERSIONS ARE UNSTABLE UNLESS USING LARGE K
    # Baseline: Label Propagation with k-NN weights
    acc_knn_label_prop = _fit_and_score_or_empty(
        lambda: LabelPropagation(kernel='knn', n_neighbors=11),
        x_merged, y_merged, z, z_y,
        'knn label prop did not work')

    # Baseline: Label Spreading with k-NN weights
    acc_knn_label_spread = _fit_and_score_or_empty(
        lambda: LabelSpreading(kernel='knn', n_neighbors=11),
        x_merged, y_merged, z, z_y,
        'knn label spread did not work')

    # Generative Models
    # NOTE(review): the three accuracies below compare a thresholded
    # probability array with z_y using `==`. If predict_class_probs returns a
    # 1-D array while z_y is a column vector, this broadcasts to a 2-D
    # matrix -- confirm the shapes agree.

    # Semi-generative model on labelled data only
    # (presumably converged=True makes soft_label_EM skip its EM iterations
    # -- confirm in soft_label_EM)
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e, converged=True)
    soft_label_semigen = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_semigen = soft_label_semigen > 0.5
    acc_semigen_labelled = np.mean(hard_label_semigen == z_y)

    # EM with soft labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_soft_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_soft_EM = soft_label_soft_EM > 0.5
    acc_soft_EM = np.mean(hard_label_soft_EM == z_y)

    # EM with hard labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = hard_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_hard_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_hard_EM = soft_label_hard_EM > 0.5
    acc_hard_EM = np.mean(hard_label_hard_EM == z_y)

    # Conditional label prop
    acc_cond_prop = conditional_prop(x_c, y, x_e, z_c, z_y, z_e)

    return (acc_lin_lr, acc_lin_tsvm, acc_rbf_tsvm, acc_rbf_label_prop,
            acc_rbf_label_spread, acc_knn_label_prop, acc_knn_label_spread,
            acc_semigen_labelled, acc_soft_EM, acc_hard_EM, acc_cond_prop)
Exemplo n.º 2
0
# Transform the test documents with TfidfVect and call .toarray() on the
# result to obtain the test feature matrix.
test_set = TfidfVect.transform(test_data).toarray()

# Label Propagation (disabled alternative to the TSVM run below):
# label_prop_model = helpers.get_function('LP')
# label_prop_model.fit(train_set, train_labels)
# test_predict = label_prop_model.predict(test_set)
# print(label_prop_model.score(test_set, test_labels))

# Report data set sizes; training entries labelled -1 are counted as
# unlabeled.
print("Total size of training set: ", len(train_labels))
i = sum(1 for label in train_labels if label == -1)
print("Size of unlabeled data: ", i)
print("Size of the testing set ", len(test_data))

# TSVM: fit on the training set, then predict and score the test set.
tsvm.fit(train_set, train_labels)
test_predict = tsvm.predict(test_set)
accuracy = tsvm.score(test_set, test_labels)
print("Accuracy: ", accuracy)

# Confusion matrix with the label order fixed to [1, 0], followed by the
# metrics that functions.fmeasure derives from it.
print("Confusion matrix:")
matrix = confusion_matrix(test_labels, test_predict, labels=[1, 0])
print(matrix)
precision, recall, f_measure = functions.fmeasure(matrix)
for caption, value in (("Precision: ", precision),
                       ("Recall: ", recall),
                       ("f_measure: ", f_measure)):
    print(caption, value)