Example 1
from sklearn.semi_supervised import LabelPropagation
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer
from scikitTSVM import SKTSVM


def get_function(name):
    # Return a configured estimator or vectorizer by short name.
    # rbf_kernel_safe is assumed to be defined elsewhere in this module.
    if name == "LP":
        return LabelPropagation(kernel=rbf_kernel_safe)
    elif name == "TSVM":
        return SKTSVM(probability=False)
    elif name == "hash":
        return HashingVectorizer()
    elif name == "count":
        return CountVectorizer()
    elif name == "tfidf":
        return TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)
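For reference, a minimal usage sketch of the "tfidf" branch; it assumes get_function is imported from the module above, and the documents are purely illustrative. Because the vectorizer is built with tokenizer=lambda doc: doc and lowercase=False, it expects pre-tokenised input:

# Illustrative only: feed pre-tokenised documents to the "tfidf" vectorizer.
tfidf = get_function("tfidf")
docs = [["an", "analogy", "sentence"], ["a", "literal", "sentence"]]
features = tfidf.fit_transform(docs)
print(features.shape)  # (2, number of distinct tokens)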
Example 2
def run_methods(x_c, y, x_e, z_c, z_y, z_e):
    x = np.concatenate((x_c, x_e), axis=1)
    z = np.concatenate((z_c, z_e), axis=1)

    # Baseline: Linear Logistic Regression
    lin_lr = LogisticRegression(random_state=0,
                                solver='liblinear').fit(x, y.ravel())
    acc_lin_lr = lin_lr.score(z, z_y)
    # hard_label_lin_lr = lin_lr.predict(z)
    # soft_label_lin_lr = lin_lr.predict_proba(z)[:, 1]

    # TRANSDUCTIVE APPROACHES
    # merge labelled and unlabelled data (with label -1) for transductive methods
    x_merged = np.concatenate((x, z))
    y_merged = np.concatenate((y, -1 * np.ones(
        (z.shape[0], 1)))).ravel().astype(int)

    # Baseline: Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    lin_tsvm = SKTSVM(kernel='linear')
    lin_tsvm.fit(x_merged, y_merged)
    acc_lin_tsvm = lin_tsvm.score(z, z_y)
    # hard_label_lin_tsvm = lin_tsvm.predict(z)
    # soft_label_lin_tsvm = lin_tsvm.predict_proba(z)[:, 1]

    # Baseline: Non-Linear TSVM:  https://github.com/tmadl/semisup-learn/tree/master/methods
    rbf_tsvm = SKTSVM(kernel='RBF')
    rbf_tsvm.fit(x_merged, y_merged)
    acc_rbf_tsvm = rbf_tsvm.score(z, z_y)
    # hard_label_rbf_tsvm = rbf_tsvm.predict(z)
    # soft_label_rbf_tsvm = rbf_tsvm.predict_proba(z)[:, 1]

    # Baseline: Label Propagation RBF weights
    try:
        rbf_label_prop = LabelPropagation(kernel='rbf')
        rbf_label_prop.fit(x_merged, y_merged)
        acc_rbf_label_prop = rbf_label_prop.score(z, z_y)
        # hard_label_rbf_label_prop= rbf_label_prop.predict(z)
        # soft_label_rbf_label_prop = rbf_label_prop.predict_proba(z)[:, 1]
    except Exception:
        acc_rbf_label_prop = []
        print('rbf label prop did not work')

    # Baseline: Label Spreading with RBF weights
    try:
        rbf_label_spread = LabelSpreading(kernel='rbf')
        rbf_label_spread.fit(x_merged, y_merged)
        acc_rbf_label_spread = rbf_label_spread.score(z, z_y)
        # hard_label_rbf_label_spread = rbf_label_spread.predict(z)
        # soft_label_rbf_label_spread = rbf_label_spread.predict_proba(z)[:, 1]
    except Exception:
        acc_rbf_label_spread = []
        print('rbf label spread did not work')

    # THE K-NN VERSIONS ARE UNSTABLE UNLESS USING LARGE K
    # Baseline: Label Propagation with k-NN weights
    try:
        knn_label_prop = LabelPropagation(kernel='knn', n_neighbors=11)
        knn_label_prop.fit(x_merged, y_merged)
        acc_knn_label_prop = knn_label_prop.score(z, z_y)
        # hard_label_knn_label_prop = knn_label_prop.predict(z)
        # soft_label_knn_label_prop = knn_label_prop.predict_proba(z)[:, 1]
    except Exception:
        acc_knn_label_prop = []
        print('knn label prop did not work')

    # Baseline: Label Spreading with k-NN weights
    try:
        knn_label_spread = LabelSpreading(kernel='knn', n_neighbors=11)
        knn_label_spread.fit(x_merged, y_merged)
        acc_knn_label_spread = knn_label_spread.score(z, z_y)
        # hard_label_knn_label_spread = knn_label_spread.predict(z)
        # soft_label_knn_label_spread = knn_label_spread.predict_proba(z)[:, 1]
    except Exception:
        acc_knn_label_spread = []
        print('knn label spread did not work')

    # Generative Models
    # Semi-generative model on labelled data only
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e, converged=True)
    soft_label_semigen = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_semigen = soft_label_semigen > 0.5
    acc_semigen_labelled = np.mean(hard_label_semigen == z_y)

    # EM with soft labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_soft_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_soft_EM = soft_label_soft_EM > 0.5
    acc_soft_EM = np.mean(hard_label_soft_EM == z_y)

    # EM with hard labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = hard_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_hard_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_hard_EM = soft_label_hard_EM > 0.5
    acc_hard_EM = np.mean(hard_label_hard_EM == z_y)

    # Conditional label prop
    acc_cond_prop = conditional_prop(x_c, y, x_e, z_c, z_y, z_e)

    return (acc_lin_lr, acc_lin_tsvm, acc_rbf_tsvm, acc_rbf_label_prop,
            acc_rbf_label_spread, acc_knn_label_prop, acc_knn_label_spread,
            acc_semigen_labelled, acc_soft_EM, acc_hard_EM, acc_cond_prop)
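As a usage sketch only: run_methods takes labelled causal features x_c, labels y (as a column vector, matching the y.ravel() and concatenation above), labelled effect features x_e, and the corresponding unlabelled/test arrays z_c, z_y, z_e. The synthetic data below is illustrative, and the helpers used inside run_methods (SKTSVM, soft_label_EM, hard_label_EM, predict_class_probs, conditional_prop) are assumed to be available in the surrounding module:

import numpy as np

rng = np.random.RandomState(0)
n_lab, n_unlab, d_c, d_e = 50, 200, 3, 3
w = rng.randn(d_c, d_e)                        # shared cause-to-effect weights

x_c = rng.randn(n_lab, d_c)                    # labelled causal features
y = (x_c[:, :1] > 0).astype(int)               # binary labels, shape (n_lab, 1)
x_e = x_c.dot(w) + 0.1 * rng.randn(n_lab, d_e)   # labelled effect features

z_c = rng.randn(n_unlab, d_c)                  # unlabelled / test data
z_y = (z_c[:, :1] > 0).astype(int)             # same column-vector convention as y
z_e = z_c.dot(w) + 0.1 * rng.randn(n_unlab, d_e)

accuracies = run_methods(x_c, y, x_e, z_c, z_y, z_e)
print(accuracies)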
Example 3
    def process(self):
        X, ytrue, sc_X = self.data_processing()
        self.basemodel = svm.SVC(kernel='rbf',
                                 decision_function_shape='ovr',
                                 probability=True)

        print("SVM model cross Validation")
        # create SVM model
        self.model2 = svm.SVC(kernel='sigmoid',
                              decision_function_shape='ovr',
                              probability=True,
                              gamma=.1,
                              coef0=.5)
        self.cross_valid(self.model2, X, ytrue)

        # TSVM
        print("TSVM Semi Supervised Classifier cross Validation")
        self.TSVMmodel = SKTSVM(kernel='rbf')
        #self.validate_algo(X, ytrue, self.TSVMmodel)

        #S3VMmodel
        print("CPLE SVM Semi Supervised Classifier cross Validation")
        self.S3VMmodel = CPLELearningModel(
            self.basemodel, predict_from_probabilities=True)  # RBF SVM
        #self.validate_algo(X, ytrue, self.S3VMmodel)
        #self.cross_valid2(self.S3VMmodel, X, ytrue, show_plot=TRUE, label_perc = .5)

        # create semi supervised model with svm as base model
        self.ssmodel = SelfLearningModel(self.basemodel)
        print("Fast Semi Supervised Classifier cross Validation")
        #self.validate_algo(X, ytrue, self.ssmodel)

        # split train, test data
        X, X_test, ytrue, y_test = model_selection.train_test_split(
            X, ytrue, test_size=.2, random_state=7)

        # split into labelled and unlabelled samples
        ys = self.unlabel_data(ytrue, 42, .8)

        # model with simple SVM
        self.model2.fit(X, ytrue)
        print("Simple SVM Model")
        y_pred_train_svm = self.model2.predict(X)
        y_pred_train_prob_svm = self.model2.predict_proba(X)[:, 1]
        print("SVM Algo Train Data Validation")
        self.validation(ytrue, y_pred_train_svm, y_pred_train_prob_svm)
        # test data with svm
        y_pred_test_svm = self.model2.predict(X_test)
        y_pred_test_prob_svm = self.model2.predict_proba(X_test)[:, 1]
        print("SVM Algo Test Data Validation")
        self.validation(y_test, y_pred_test_svm, y_pred_test_prob_svm)

        # fit TSVM semi supervised model
        self.TSVMmodel.fit(X, ys)
        print("TSVM Semi Supervised Fast Algo ready")
        y_pred_train = self.TSVMmodel.predict(X)
        y_pred_train_prob = self.TSVMmodel.predict_proba(X)[:, 1]
        print("TSVM Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)

        y_pred_test = self.TSVMmodel.predict(X_test)
        y_pred_prob = self.TSVMmodel.predict_proba(X_test)[:, 1]
        print("TSVMmodel Semi Supervised Fast Algo Test Data Validation")
        self.validation(y_test, y_pred_test, y_pred_prob)

        # fit CPLE semi supervised model
        self.S3VMmodel.fit(X, ys)
        print("CPLE Semi Supervised Fast Algo ready")
        y_pred_train = self.S3VMmodel.predict(X)
        y_pred_train_prob = self.S3VMmodel.predict_proba(X)[:, 1]
        print("CPLE Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)

        y_pred_test = self.S3VMmodel.predict(X_test)
        y_pred_prob = self.S3VMmodel.predict_proba(X_test)[:, 1]
        print("CPLE Semi Supervised Fast Algo Test Data Validation")
        self.validation(y_test, y_pred_test, y_pred_prob)

        # fit Fast semi supervised model
        self.ssmodel.fit(X, ys)
        print("Semi Supervised Fast Algo ready")
        y_pred_train = self.ssmodel.predict(X)
        y_pred_train_prob = self.ssmodel.predict_proba(X)[:, 1]
        print("Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)

        y_pred_test = self.ssmodel.predict(X_test)
        y_pred_prob = self.ssmodel.predict_proba(X_test)[:, 1]
        print("Semi Supervised Fast Algo Test Data Validation")
        return self.validation(y_test, y_pred_test, y_pred_prob)
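The method above relies on unlabel_data, which is not shown. A plausible minimal implementation, written here as an assumption (as a standalone function for clarity), would hide a given fraction of the labels by replacing them with -1, the marker that SelfLearningModel, CPLELearningModel and SKTSVM treat as "unlabelled":

import numpy as np

def unlabel_data(ytrue, random_state, unlabeled_fraction):
    # Hypothetical helper: copy the labels and mark a random fraction as unlabelled (-1).
    rng = np.random.RandomState(random_state)
    ys = np.asarray(ytrue, dtype=int).copy()
    n_hide = int(unlabeled_fraction * len(ys))
    hidden = rng.choice(len(ys), size=n_hide, replace=False)
    ys[hidden] = -1
    return ys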
Example 4
from sklearn import datasets
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import confusion_matrix
import numpy as np
import helpers
import functions
from sklearn.feature_extraction.text import TfidfVectorizer
from scikitTSVM import SKTSVM
import warnings
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

tsvm = SKTSVM(probability=False, C=0.01, gamma=1.0, kernel='linear', lamU=1.0)
percent_test = 0.15
positive_set = 'data/bc_samples.txt'
negative_set = 'data/bc_grounds.txt'
unlabeled_set = 'data/unlabeled-data.csv'
analogy_list = functions.get_list_re(positive_set)
non_analogy_list = functions.get_list_re(negative_set)
unlabeled_list = functions.get_list_re(unlabeled_set)
samples = [(text, 1) for text in analogy_list] + [(text, 0)
                                                  for text in non_analogy_list]
train_data, train_labels, test_data, test_labels = functions.preprocess(
    samples, percent_test)
# add at most the first 20001 unlabelled samples (indices 0-20000), marked with label -1
for sample in unlabeled_list[:20001]:
    train_data.append(sample)
    train_labels.append(-1)
train_labels = np.array(train_labels)
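A possible continuation of this script, sketched here as an assumption rather than the original code: vectorise the texts with the TfidfVectorizer already imported, fit the TSVM on the mixed labelled/unlabelled training data, and evaluate on the held-out test split with the imported confusion_matrix:

# Sketch only (not part of the original script).
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data).toarray()
X_test = vectorizer.transform(test_data).toarray()

tsvm.fit(X_train, train_labels)        # samples labelled -1 are treated as unlabelled
predictions = tsvm.predict(X_test)
print(confusion_matrix(test_labels, predictions))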