def _fit_and_score_or_empty(build_model, x_merged, y_merged, z, z_y, failure_msg):
    """Build, fit and score one transductive baseline, tolerating failures.

    Args:
        build_model: zero-argument callable returning an unfitted estimator.
            Construction happens inside the ``try`` so that a failing
            constructor is handled the same way as a failing fit/score.
        x_merged: features passed to ``fit``.
        y_merged: labels passed to ``fit``.
        z: features passed to ``score``.
        z_y: labels passed to ``score``.
        failure_msg: text printed when any step raises.

    Returns:
        Whatever ``score(z, z_y)`` returns, or an empty list when the
        estimator could not be built, fitted or scored.
    """
    try:
        model = build_model()
        model.fit(x_merged, y_merged)
        return model.score(z, z_y)
    except Exception:
        # Best-effort on purpose: one failing baseline must not abort the
        # whole comparison. Catching ``Exception`` instead of using a bare
        # ``except`` lets KeyboardInterrupt / SystemExit propagate.
        print(failure_msg)
        return []


def _generative_accuracy(params, z_c, z_e, z_y):
    """Return the accuracy of a fitted generative model on (z_c, z_e, z_y).

    ``params`` is the sequence
    ``(a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1)`` as returned by
    ``soft_label_EM`` / ``hard_label_EM``; it is forwarded positionally to
    ``predict_class_probs``. Predictions are thresholded at 0.5.
    """
    soft_labels = predict_class_probs(z_c, z_e, *params)
    hard_labels = soft_labels > 0.5
    # NOTE(review): assumes ``hard_labels`` and ``z_y`` have the same shape;
    # if one is (n,) and the other (n, 1) the comparison would broadcast to
    # (n, n) and the mean would not be an accuracy -- confirm against
    # predict_class_probs.
    return np.mean(hard_labels == z_y)


def run_methods(x_c, y, x_e, z_c, z_y, z_e):
    """Run every baseline and generative method and return their accuracies.

    The two feature blocks of each set are concatenated column-wise
    (``x = [x_c, x_e]``, ``z = [z_c, z_e]``). Supervised and transductive
    baselines are trained on ``(x, y)`` (the transductive ones additionally
    see ``z`` with label ``-1``), and every method is scored on ``(z, z_y)``.

    Args:
        x_c, x_e: labelled feature blocks with the same number of rows.
        y: labels for the labelled rows; must be 2-D with a single column
            because it is stacked on top of a ``(len(z), 1)`` array.
        z_c, z_e: feature blocks of the set that is treated as unlabelled
            during training and used for scoring.
        z_y: ground-truth labels for ``z``, used only for scoring.

    Returns:
        An 11-tuple of accuracies in this order: linear logistic regression,
        linear TSVM, RBF TSVM, RBF label propagation, RBF label spreading,
        k-NN label propagation, k-NN label spreading, semi-generative model
        (labelled data only), soft-label EM, hard-label EM, conditional label
        propagation. Any of the four label propagation/spreading entries is
        an empty list if that method raised.
    """
    x = np.concatenate((x_c, x_e), axis=1)
    z = np.concatenate((z_c, z_e), axis=1)

    # Baseline: Linear Logistic Regression
    lin_lr = LogisticRegression(random_state=0, solver='liblinear').fit(x, y.ravel())
    acc_lin_lr = lin_lr.score(z, z_y)

    # TRANSDUCTIVE APPROACHES
    # merge labelled and unlabelled data (with label -1) for transductive methods
    x_merged = np.concatenate((x, z))
    unlabelled_marker = -1 * np.ones((z.shape[0], 1))
    y_merged = np.concatenate((y, unlabelled_marker)).ravel().astype(int)

    # Baseline: Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    lin_tsvm = SKTSVM(kernel='linear')
    lin_tsvm.fit(x_merged, y_merged)
    acc_lin_tsvm = lin_tsvm.score(z, z_y)

    # Baseline: Non-Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    rbf_tsvm = SKTSVM(kernel='RBF')
    rbf_tsvm.fit(x_merged, y_merged)
    acc_rbf_tsvm = rbf_tsvm.score(z, z_y)

    # Baseline: Label Propagation RBF weights
    acc_rbf_label_prop = _fit_and_score_or_empty(
        lambda: LabelPropagation(kernel='rbf'),
        x_merged, y_merged, z, z_y,
        'rbf label prop did not work')

    # Baseline: Label Spreading with RBF weights
    acc_rbf_label_spread = _fit_and_score_or_empty(
        lambda: LabelSpreading(kernel='rbf'),
        x_merged, y_merged, z, z_y,
        'rbf label spread did not work ')

    # THE K-NN VERSIONS ARE UNSTABLE UNLESS USING LARGE K
    # Baseline: Label Propagation with k-NN weights
    acc_knn_label_prop = _fit_and_score_or_empty(
        lambda: LabelPropagation(kernel='knn', n_neighbors=11),
        x_merged, y_merged, z, z_y,
        'knn label prop did not work')

    # Baseline: Label Spreading with k-NN weights
    acc_knn_label_spread = _fit_and_score_or_empty(
        lambda: LabelSpreading(kernel='knn', n_neighbors=11),
        x_merged, y_merged, z, z_y,
        'knn label spread did not work')

    # Generative Models
    # Semi-generative model on labelled data only
    acc_semigen_labelled = _generative_accuracy(
        soft_label_EM(x_c, y, x_e, z_c, z_e, converged=True), z_c, z_e, z_y)

    # EM with soft labels
    acc_soft_EM = _generative_accuracy(
        soft_label_EM(x_c, y, x_e, z_c, z_e), z_c, z_e, z_y)

    # EM with hard labels
    acc_hard_EM = _generative_accuracy(
        hard_label_EM(x_c, y, x_e, z_c, z_e), z_c, z_e, z_y)

    # Conditional label prop
    acc_cond_prop = conditional_prop(x_c, y, x_e, z_c, z_y, z_e)

    return (acc_lin_lr, acc_lin_tsvm, acc_rbf_tsvm, acc_rbf_label_prop,
            acc_rbf_label_spread, acc_knn_label_prop, acc_knn_label_spread,
            acc_semigen_labelled, acc_soft_EM, acc_hard_EM, acc_cond_prop)
# Turn the test documents into a dense feature array via TfidfVect.
test_set = TfidfVect.transform(test_data).toarray()

# Label Propagation (disabled; kept for reference as an alternative to TSVM)
# label_prop_model = helpers.get_function('LP')
# label_prop_model.fit(train_set, train_labels)
# test_predict = label_prop_model.predict(test_set)
# print(label_prop_model.score(test_set, test_labels))

# Data-set summary: entries labelled -1 are counted as unlabeled.
training_size = len(train_labels)
print("Total size of training set: ", training_size)
unlabeled_count = sum(1 for label in train_labels if label == -1)
print("Size of unlabeled data: ", unlabeled_count)
testing_size = len(test_data)
print("Size of the testing set ", testing_size)

# TSVM: fit on the training set, then predict and score on the test set.
tsvm.fit(train_set, train_labels)
test_predict = tsvm.predict(test_set)
accuracy = tsvm.score(test_set, test_labels)
print("Accuracy: ", accuracy)

# Confusion matrix with label order [1, 0], followed by the metrics that
# functions.fmeasure derives from it.
print("Confusion matrix:")
matrix = confusion_matrix(test_labels, test_predict, labels=[1, 0])
print(matrix)
precision, recall, f_measure = functions.fmeasure(matrix)
print("Precision: ", precision)
print("Recall: ", recall)
print("f_measure: ", f_measure)