示例#1
0
def get_function(name):
    """Build the estimator or vectorizer registered under ``name``.

    Recognised names are ``"LP"``, ``"TSVM"``, ``"hash"``, ``"count"`` and
    ``"tfidf"``. A fresh object is constructed on every call.

    Returns:
        The newly constructed object, or ``None`` when ``name`` matches none
        of the recognised keys.
    """
    # Each factory is wrapped in a lambda so that nothing is constructed
    # (or even looked up) unless its key is the one requested.
    factories = (
        ("LP", lambda: LabelPropagation(kernel=rbf_kernel_safe)),
        ("TSVM", lambda: SKTSVM(probability=False)),
        ("hash", lambda: HashingVectorizer()),
        ("count", lambda: CountVectorizer()),
        # The identity tokenizer means documents are passed through as-is,
        # and lowercasing is switched off.
        ("tfidf", lambda: TfidfVectorizer(tokenizer=lambda doc: doc,
                                          lowercase=False)),
    )
    for key, build in factories:
        if name == key:
            return build()
    return None
示例#2
0
class DiabetesPrediction:
    """Semi-supervised diabetes classification experiment.

    Loads a CSV dataset, imputes and standardises it, then trains and
    evaluates a plain SVM next to three semi-supervised models (TSVM, CPLE
    and self-learning) on targets where part of the labels are hidden.
    Label ``-1`` marks an unlabeled sample throughout.
    """

    def __init__(self, data="diabetes"):
        # Dataset name; only stored, no visible method reads it.
        self.data = data

    def data_processing(self, fileName='pima-indians-diabetes.csv'):
        """Load, impute, select and standardise the dataset.

        Args:
            fileName: Path of a header-less CSV file. Columns 0-7 are read as
                features and column 8 as the target.

        Returns:
            Tuple ``(X, ytrue, sc_X)``: the standardised feature matrix
            restricted to columns 0, 1, 2, 5, 6 and 7, the target column, and
            the fitted ``StandardScaler``.
        """
        dataset = read_csv(fileName, header=None)

        # A zero in columns 1-5 is treated as a missing measurement: mark it
        # as NaN so the imputer fills it in. ``np.nan`` is used because the
        # ``NaN`` alias no longer exists in NumPy 2.
        zero_as_missing = [1, 2, 3, 4, 5]
        dataset[zero_as_missing] = dataset[zero_as_missing].replace(0, np.nan)

        imputer = MICE(n_imputations=100,
                       impute_type='pmm',
                       n_nearest_columns=5,
                       verbose=False)
        transformed_values = imputer.complete(dataset.values)

        X = transformed_values[:, 0:8]
        ytrue = transformed_values[:, 8]
        # feature selection
        X = X[:, [0, 1, 2, 5, 6, 7]]
        sc_X = StandardScaler()
        X = sc_X.fit_transform(X)
        return X, ytrue, sc_X

    def unlabel_data(self, ytrue, seed=42, label_perc=.2):
        """Hide a random subset of labels.

        Args:
            ytrue: True labels (any array-like, including a plain list).
            seed: Seed of the private random generator, so the same mask is
                produced for the same seed and length.
            label_perc: Probability, per sample, of keeping its label.

        Returns:
            Integer array of the same length where kept samples carry their
            true label and all others are ``-1``.
        """
        # Accept lists as well as arrays; boolean-mask indexing needs an array.
        ytrue = np.asarray(ytrue)
        rng = np.random.RandomState(seed)
        keep_label = rng.rand(len(ytrue)) < label_perc
        ys = np.full(len(ytrue), -1, dtype=int)  # -1 denotes unlabeled point
        ys[keep_label] = ytrue[keep_label]
        return ys

    def validation(self, y_test, y_pred_test, y_pred_prob):
        """Print and plot binary-classification metrics for one prediction set.

        Args:
            y_test: True labels.
            y_pred_test: Predicted hard labels.
            y_pred_prob: Predicted score of the positive class (label 1),
                used for the histogram and the ROC statistics.

        Returns:
            Tuple ``(accuracy, sensitivity, specificity, roc_auc)``.
        """
        acc = sklearn.metrics.accuracy_score(y_test,
                                             y_pred_test,
                                             sample_weight=None)
        print("Accuracy:", acc)
        print("F1 SCORE: ", f1_score(y_test, y_pred_test))
        print("classification report: ")
        print(classification_report(y_test, y_pred_test))

        # The indexing below assumes a 2x2 (binary) confusion matrix.
        cm = confusion_matrix(y_test, y_pred_test)
        TP = cm[1, 1]
        TN = cm[0, 0]
        FP = cm[0, 1]
        FN = cm[1, 0]
        classification_error = (FP + FN) / float(TP + TN + FP + FN)
        print("classification_error: ", classification_error)
        # Sensitivity (recall): when the actual value is positive, how often
        # is the prediction correct?
        sensitivity = TP / float(FN + TP)
        print("sensitivity: ", sensitivity)
        # Specificity: when the actual value is negative, how often is the
        # prediction correct? float() keeps this consistent with the other
        # ratios regardless of the interpreter's integer-division rules.
        specificity = TN / float(TN + FP)
        print("specificity: ", specificity)
        # Precision: how "precise" is the classifier when predicting
        # positive instances?
        precision = TP / float(TP + FP)
        print("precision: ", precision)
        roc_auc = sklearn.metrics.roc_auc_score(y_test, y_pred_prob)
        print("ROC Curve AUC Area: ", roc_auc)
        print("Confusion matrix:")
        print(cm)
        label = ["0", "1"]
        sns.heatmap(cm, annot=True, xticklabels=label, yticklabels=label)
        plt.show()

        # plot histogram of predicted probability of diabetes
        plt.rcParams['font.size'] = 12
        plt.hist(y_pred_prob, bins=8)
        plt.xlim(0, 1)
        plt.title('Histogram of predicted probabilities')
        plt.xlabel('Predicted probability of diabetes')
        plt.ylabel('Frequency')
        plt.show()

        # plot ROC curve
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, y_pred_prob)
        print("fpr below")
        print(fpr)
        print("tpr below")
        print(tpr)
        plt.plot(fpr, tpr)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.rcParams['font.size'] = 12
        plt.title('ROC curve for diabetes classifier')
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.grid(True)
        plt.show()
        return acc, sensitivity, specificity, roc_auc

    def cross_valid(self, model, X, Y):
        """Print mean and standard deviation of 10-fold cross-validated score.

        Args:
            model: Estimator to evaluate; it is scored with its default
                scorer by ``cross_val_score``.
            X: Feature matrix.
            Y: Fully labeled targets.
        """
        num_folds = 10
        seed = 42
        np.random.seed(seed)
        # Ten contiguous, unshuffled folds. The previous call went through
        # the removed ``cross_validation`` module and passed a random_state
        # that has no effect without shuffling; ``model_selection`` (already
        # used by this class) produces the same folds.
        kfold = model_selection.KFold(n_splits=num_folds)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold)

        results *= 100.0
        info = "Model 10 fold Accuracy mean: %.2f%% (+/- %.3f%%)" % (
            results.mean(), results.std())
        print(info)

    def cross_valid2(self,
                     model,
                     X,
                     y,
                     label_perc=.8,
                     test_train_split=.2,
                     show_plot=False):
        """Evaluate a semi-supervised model over ten random train/test splits.

        For each of ten trials the data is split with a trial-specific seed,
        part of the training labels is hidden, the model is refit and its
        accuracy on the held-out part is recorded.

        Args:
            model: Estimator exposing ``fit`` and ``predict``; it is refit in
                place on every trial.
            X: Feature matrix.
            y: True labels.
            label_perc: Probability of keeping each training label.
            test_train_split: Fraction of the data held out for testing.
            show_plot: When true, plot the per-trial accuracies.

        Returns:
            A one-element list holding the mean accuracy in percent.
        """
        results = []
        for i in range(0, 10):
            # split train, test data
            X_train, X_test, ytrue, y_test = model_selection.train_test_split(
                X, y, test_size=test_train_split, random_state=5 + i)

            # split label and unlabel sample
            ys = self.unlabel_data(ytrue, 5 + i, label_perc)

            model.fit(X_train, ys)
            y_pred_test = model.predict(X_test)
            accuracy = sklearn.metrics.accuracy_score(y_test,
                                                      y_pred_test,
                                                      sample_weight=None)
            results.append(accuracy * 100.0)
        print(results)
        print(
            "Model 10 fold Accuracy mean: %.2f%% (+/- %.3f%%)" %
            (np.mean(results), np.std(results)), "label %", label_perc)
        if show_plot:
            _, ax = plt.subplots()
            plt.axis([1, 10, 0, 100])
            plt.title("10 fold CV Accuracy variance")
            # Axis limits are set through plt.axis above; pointplot has no
            # x_min/x_max/y_min/y_max parameters.
            sns.pointplot(x=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          y=results,
                          ax=ax)
            ax.set_xlabel('Index Number for trial')
            ax.set_ylabel('Accuracy')
            plt.show()
        return [np.mean(results)]

    def _plot_accuracy_sweep(self, x_values, accuracies, title, xlabel):
        """Plot mean accuracy (percent) against the swept parameter values."""
        _, ax = plt.subplots()
        plt.axis([0, 1, 0, 100])
        plt.title(title)
        sns.pointplot(x=x_values, y=accuracies, ax=ax)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Accuracy')
        plt.show()

    def validate_algo(self, X, ytrue, model):
        """Report how accuracy varies with label fraction and test fraction.

        Runs ``cross_valid2`` once with its defaults (plotting the trials),
        then sweeps the labeled fraction from 0.1 to 0.9, then sweeps the
        test fraction from 0.1 to 0.9 with half of the labels kept. Each
        sweep is printed and plotted.
        """
        self.cross_valid2(model, X, ytrue, show_plot=True)

        label_percs = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
        result = np.concatenate(
            [self.cross_valid2(model, X, ytrue, perc) for perc in label_percs])
        print(result)
        print(
            "Model 10 fold Accuracy with varrying label mean: %.2f%% (+/- %.3f%%)"
            % (np.mean(result), np.std(result)))
        self._plot_accuracy_sweep(label_percs, result,
                                  "10 fold CV Accuracy with label sample %",
                                  'Labeled Sample Percentage')

        test_train_splits = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
        result = np.concatenate([
            self.cross_valid2(model, X, ytrue, .5, split)
            for split in test_train_splits
        ])
        print(result)
        print(
            "Model 10 fold Accuracy with varrying test data mean: %.2f%% (+/- %.3f%%)"
            % (np.mean(result), np.std(result)))
        self._plot_accuracy_sweep(test_train_splits, result,
                                  "10 fold CV Accuracy with test sample %",
                                  'Test Sample Percentage')

    def process(self):
        """Train and evaluate the SVM baseline and the semi-supervised models.

        Builds the estimators, cross-validates the sigmoid SVM on the full
        data, holds out 20% of the samples as a test set, hides part of the
        training labels and reports train/test metrics for each model.

        Returns:
            The ``validation`` tuple for the self-learning model on the test
            split.
        """
        X, ytrue, _ = self.data_processing()
        self.basemodel = svm.SVC(kernel='rbf',
                                 decision_function_shape='ovr',
                                 probability=True)

        print("SVM model cross Validation")
        # create SVM model
        self.model2 = svm.SVC(kernel='sigmoid',
                              decision_function_shape='ovr',
                              probability=True,
                              gamma=.1,
                              coef0=.5)
        self.cross_valid(self.model2, X, ytrue)

        # TSVM
        print("T SVM Semi Supervised Classifier cross Validation")
        self.TSVMmodel = SKTSVM(kernel='rbf')

        # S3VMmodel
        print("CPLE SVM Semi Supervised Classifier cross Validation")
        self.S3VMmodel = CPLELearningModel(
            self.basemodel, predict_from_probabilities=True)  # RBF SVM

        # create semi supervised model with svm as base model
        self.ssmodel = SelfLearningModel(self.basemodel)
        print("Fast Semi Supervised Classifier cross Validation")

        # split train, test data
        X, X_test, ytrue, y_test = model_selection.train_test_split(
            X, ytrue, test_size=.2, random_state=7)

        # split label and unlabel sample
        ys = self.unlabel_data(ytrue, 42, .8)

        # model with simple SVM (trained on the full set of true labels)
        self.model2.fit(X, ytrue)
        print("Simple SVM Model")
        y_pred_train_svm = self.model2.predict(X)
        y_pred_train_prob_svm = self.model2.predict_proba(X)[:, 1]
        print("SVM Algo Train Data Validation")
        self.validation(ytrue, y_pred_train_svm, y_pred_train_prob_svm)
        # test data with svm
        y_pred_test_svm = self.model2.predict(X_test)
        y_pred_test_prob_svm = self.model2.predict_proba(X_test)[:, 1]
        print("SVM Algo Test Data Validation")
        # Pass the test-set probabilities computed just above; the earlier
        # code referenced a name that was never assigned.
        self.validation(y_test, y_pred_test_svm, y_pred_test_prob_svm)

        # fit TSVM semi supervised model
        self.TSVMmodel.fit(X, ys)
        print("TSVM Semi Supervised Fast Algo ready")
        y_pred_train = self.TSVMmodel.predict(X)
        y_pred_train_prob = self.TSVMmodel.predict_proba(X)[:, 1]
        print("TSVM Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)

        y_pred_test = self.TSVMmodel.predict(X_test)
        y_pred_prob = self.TSVMmodel.predict_proba(X_test)[:, 1]
        print("TSVMmodel Semi Supervised Fast Algo Test Data Validation")
        self.validation(y_test, y_pred_test, y_pred_prob)

        # fit CPLE semi supervised model
        self.S3VMmodel.fit(X, ys)
        print("CPLE Semi Supervised Fast Algo ready")
        y_pred_train = self.S3VMmodel.predict(X)
        y_pred_train_prob = self.S3VMmodel.predict_proba(X)[:, 1]
        print("CPLE Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)

        y_pred_test = self.S3VMmodel.predict(X_test)
        y_pred_prob = self.S3VMmodel.predict_proba(X_test)[:, 1]
        print("CPLE Semi Supervised Fast Algo Test Data Validation")
        self.validation(y_test, y_pred_test, y_pred_prob)

        # fit Fast semi supervised model
        self.ssmodel.fit(X, ys)
        print("Semi Supervised Fast Algo ready")
        y_pred_train = self.ssmodel.predict(X)
        y_pred_train_prob = self.ssmodel.predict_proba(X)[:, 1]
        print("Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)

        y_pred_test = self.ssmodel.predict(X_test)
        y_pred_prob = self.ssmodel.predict_proba(X_test)[:, 1]
        print("Semi Supervised Fast Algo Test Data Validation")
        return self.validation(y_test, y_pred_test, y_pred_prob)

    def predict(self, x):
        """Predict labels for ``x`` with the self-learning model.

        Requires ``process`` to have run, since that is where ``ssmodel`` is
        created and fitted.
        """
        return self.ssmodel.predict(x)

    def plot_boundary(self, pl, model, title):
        """Plot the data in 2-D PCA space with ``model``'s decision contour.

        Note that ``model`` is refit in place on the two PCA components, so
        afterwards it no longer accepts the original feature matrix.

        Args:
            pl: Plotting interface exposing ``scatter``, ``legend``,
                ``contour``, ``axis``, ``title`` and ``show`` (for example
                ``matplotlib.pyplot``).
            model: Semi-supervised estimator exposing ``fit`` and ``predict``.
            title: Title of the figure.

        Returns:
            ``pl``, to allow further drawing by the caller.
        """
        X1, ytrue, _ = self.data_processing()
        # create PCA transform
        pca_2d = PCA(n_components=2).fit(X1).transform(X1)

        # One scatter call per class: far fewer artists than one call per
        # point, and both legend handles exist even if a class is absent.
        negative = np.asarray(ytrue) == 0
        c1 = pl.scatter(pca_2d[negative, 0], pca_2d[negative, 1],
                        c='r', marker='+')
        c2 = pl.scatter(pca_2d[~negative, 0], pca_2d[~negative, 1],
                        c='g', marker='o')
        # Label 0 is the negative class; the rest of this class treats
        # label 1 as "diabetes" (see the probability plots in validation).
        pl.legend([c1, c2], ['No Diabetes', 'Diabetes'])

        x_min, x_max = pca_2d[:, 0].min() - 1, pca_2d[:, 0].max() + 1
        y_min, y_max = pca_2d[:, 1].min() - 1, pca_2d[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, .01),
                             np.arange(y_min, y_max, .01))

        # split label and unlabeled data for PCA self learning model
        ys = self.unlabel_data(ytrue, 42, .8)

        model.fit(pca_2d, ys)
        print("PCA model built")
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        SMALL_SIZE = 14
        MEDIUM_SIZE = 16
        plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
        plt.rc('axes', titlesize=SMALL_SIZE)  # fontsize of the axes title
        plt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
        plt.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
        pl.contour(xx, yy, Z)
        pl.axis('off')
        pl.title(title)
        pl.show()
        return pl

    def Run_Algo(self):
        """Run the whole experiment on a fresh instance and plot boundaries.

        A new ``DiabetesPrediction`` is created and trained; the instance
        this method is called on is not modified.
        """
        # main code
        D = DiabetesPrediction()
        D.process()

        # testing
        X1, ytrue, _ = D.data_processing()
        print("testing first 10 samples:")
        print("Actual Y values:", ytrue[:10])
        print("Semi Supervised predicted Y values", D.predict(X1[:10, :]))
        print("Semi supervised predicted Y prob")
        print(D.ssmodel.predict_proba(X1[:10, :]))

        # plot model decision boundary, using the models trained on D (this
        # instance never ran process()) and supplying the required title.
        D.plot_boundary(plt, D.ssmodel,
                        "Self-learning SVM decision boundary (PCA)")
        D.plot_boundary(plt, D.TSVMmodel, "TSVM decision boundary (PCA)")
def run_methods(x_c, y, x_e, z_c, z_y, z_e):
    """Score a suite of supervised and semi-supervised classifiers.

    The two feature groups are concatenated column-wise into a labeled set
    ``x`` (from ``x_c``, ``x_e``) and an evaluation set ``z`` (from ``z_c``,
    ``z_e``). Transductive methods are fitted on both sets stacked together,
    with the ``z`` rows labeled ``-1``.

    Args:
        x_c, x_e: Feature blocks of the labeled samples (same row count).
        y: Labels of the labeled samples, one row per sample.
        z_c, z_e: Feature blocks of the evaluation samples.
        z_y: True labels of the evaluation samples, used only for scoring.

    Returns:
        Tuple of eleven accuracies on ``z``, in this order: logistic
        regression, linear TSVM, RBF TSVM, RBF label propagation, RBF label
        spreading, k-NN label propagation, k-NN label spreading,
        semi-generative model, soft-label EM, hard-label EM, conditional
        label propagation. A graph-based method that raises contributes an
        empty list instead of a number.
    """
    x = np.concatenate((x_c, x_e), axis=1)
    z = np.concatenate((z_c, z_e), axis=1)

    # Baseline: Linear Logistic Regression
    lin_lr = LogisticRegression(random_state=0,
                                solver='liblinear').fit(x, y.ravel())
    acc_lin_lr = lin_lr.score(z, z_y)

    # TRANSDUCTIVE APPROACHES
    # merge labelled and unlabelled data (with label -1) for transductive methods
    x_merged = np.concatenate((x, z))
    y_merged = np.concatenate((y, -1 * np.ones(
        (z.shape[0], 1)))).ravel().astype(int)

    def _score_or_empty(make_estimator, failure_message):
        """Fit on the merged data and score on ``z``; ``[]`` on failure."""
        try:
            estimator = make_estimator()
            estimator.fit(x_merged, y_merged)
            return estimator.score(z, z_y)
        except Exception:
            # Best effort: these graph methods can fail numerically. Unlike
            # a bare ``except``, Ctrl-C and interpreter exit still propagate.
            print(failure_message)
            return []

    # Baseline: Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    lin_tsvm = SKTSVM(kernel='linear')
    lin_tsvm.fit(x_merged, y_merged)
    acc_lin_tsvm = lin_tsvm.score(z, z_y)

    # Baseline: Non-Linear TSVM:  https://github.com/tmadl/semisup-learn/tree/master/methods
    rbf_tsvm = SKTSVM(kernel='RBF')
    rbf_tsvm.fit(x_merged, y_merged)
    acc_rbf_tsvm = rbf_tsvm.score(z, z_y)

    # Baseline: Label Propagation RBF weights
    acc_rbf_label_prop = _score_or_empty(
        lambda: LabelPropagation(kernel='rbf'),
        'rbf label prop did not work')

    # Baseline: Label Spreading with RBF weights
    acc_rbf_label_spread = _score_or_empty(
        lambda: LabelSpreading(kernel='rbf'),
        'rbf label spread did not work ')

    # THE K-NN VERSIONS ARE UNSTABLE UNLESS USING LARGE K
    # Baseline: Label Propagation with k-NN weights
    acc_knn_label_prop = _score_or_empty(
        lambda: LabelPropagation(kernel='knn', n_neighbors=11),
        'knn label prop did not work')

    # Baseline: Label Spreading with k-NN weights
    acc_knn_label_spread = _score_or_empty(
        lambda: LabelSpreading(kernel='knn', n_neighbors=11),
        'knn label spread did not work')

    # Generative Models
    # Semi-generative model on labelled data only
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e, converged=True)
    soft_label_semigen = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_semigen = soft_label_semigen > 0.5
    acc_semigen_labelled = np.mean(hard_label_semigen == z_y)

    # EM with soft labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_soft_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_soft_EM = soft_label_soft_EM > 0.5
    acc_soft_EM = np.mean(hard_label_soft_EM == z_y)

    # EM with hard labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = hard_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_hard_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_hard_EM = soft_label_hard_EM > 0.5
    acc_hard_EM = np.mean(hard_label_hard_EM == z_y)

    # Conditional label prop
    acc_cond_prop = conditional_prop(x_c, y, x_e, z_c, z_y, z_e)

    return (acc_lin_lr, acc_lin_tsvm, acc_rbf_tsvm, acc_rbf_label_prop,
            acc_rbf_label_spread, acc_knn_label_prop, acc_knn_label_spread,
            acc_semigen_labelled, acc_soft_EM, acc_hard_EM, acc_cond_prop)
示例#4
0
    def process(self):
        """Train and evaluate the SVM baseline and the semi-supervised models.

        Builds the estimators, cross-validates the sigmoid SVM on the full
        data, holds out 20% of the samples as a test set, hides part of the
        training labels and reports train/test metrics for each model.

        Returns:
            Whatever ``self.validation`` returns for the self-learning model
            on the test split.
        """
        X, ytrue, _ = self.data_processing()
        self.basemodel = svm.SVC(kernel='rbf',
                                 decision_function_shape='ovr',
                                 probability=True)

        print("SVM model cross Validation")
        # create SVM model
        self.model2 = svm.SVC(kernel='sigmoid',
                              decision_function_shape='ovr',
                              probability=True,
                              gamma=.1,
                              coef0=.5)
        self.cross_valid(self.model2, X, ytrue)

        # TSVM
        print("T SVM Semi Supervised Classifier cross Validation")
        self.TSVMmodel = SKTSVM(kernel='rbf')

        # S3VMmodel
        print("CPLE SVM Semi Supervised Classifier cross Validation")
        self.S3VMmodel = CPLELearningModel(
            self.basemodel, predict_from_probabilities=True)  # RBF SVM

        # create semi supervised model with svm as base model
        self.ssmodel = SelfLearningModel(self.basemodel)
        print("Fast Semi Supervised Classifier cross Validation")

        # split train, test data
        X, X_test, ytrue, y_test = model_selection.train_test_split(
            X, ytrue, test_size=.2, random_state=7)

        # split label and unlabel sample
        ys = self.unlabel_data(ytrue, 42, .8)

        # model with simple SVM (trained on the full set of true labels)
        self.model2.fit(X, ytrue)
        print("Simple SVM Model")
        y_pred_train_svm = self.model2.predict(X)
        y_pred_train_prob_svm = self.model2.predict_proba(X)[:, 1]
        print("SVM Algo Train Data Validation")
        self.validation(ytrue, y_pred_train_svm, y_pred_train_prob_svm)
        # test data with svm
        y_pred_test_svm = self.model2.predict(X_test)
        y_pred_test_prob_svm = self.model2.predict_proba(X_test)[:, 1]
        print("SVM Algo Test Data Validation")
        # Pass the test-set probabilities computed just above; the earlier
        # code referenced a name that was never assigned.
        self.validation(y_test, y_pred_test_svm, y_pred_test_prob_svm)

        # fit TSVM semi supervised model
        self.TSVMmodel.fit(X, ys)
        print("TSVM Semi Supervised Fast Algo ready")
        y_pred_train = self.TSVMmodel.predict(X)
        y_pred_train_prob = self.TSVMmodel.predict_proba(X)[:, 1]
        print("TSVM Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)

        y_pred_test = self.TSVMmodel.predict(X_test)
        y_pred_prob = self.TSVMmodel.predict_proba(X_test)[:, 1]
        print("TSVMmodel Semi Supervised Fast Algo Test Data Validation")
        self.validation(y_test, y_pred_test, y_pred_prob)

        # fit CPLE semi supervised model
        self.S3VMmodel.fit(X, ys)
        print("CPLE Semi Supervised Fast Algo ready")
        y_pred_train = self.S3VMmodel.predict(X)
        y_pred_train_prob = self.S3VMmodel.predict_proba(X)[:, 1]
        print("CPLE Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)

        y_pred_test = self.S3VMmodel.predict(X_test)
        y_pred_prob = self.S3VMmodel.predict_proba(X_test)[:, 1]
        print("CPLE Semi Supervised Fast Algo Test Data Validation")
        self.validation(y_test, y_pred_test, y_pred_prob)

        # fit Fast semi supervised model
        self.ssmodel.fit(X, ys)
        print("Semi Supervised Fast Algo ready")
        y_pred_train = self.ssmodel.predict(X)
        y_pred_train_prob = self.ssmodel.predict_proba(X)[:, 1]
        print("Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)

        y_pred_test = self.ssmodel.predict(X_test)
        y_pred_prob = self.ssmodel.predict_proba(X_test)[:, 1]
        print("Semi Supervised Fast Algo Test Data Validation")
        return self.validation(y_test, y_pred_test, y_pred_prob)
from sklearn import datasets
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import confusion_matrix
import numpy as np
import helpers
import functions
from sklearn.feature_extraction.text import TfidfVectorizer
from scikitTSVM import SKTSVM
import warnings
# Silence (pending) deprecation chatter from the libraries imported above.
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Linear transductive SVM that is configured here for use further down.
tsvm = SKTSVM(kernel='linear', C=0.01, gamma=1.0, lamU=1.0, probability=False)

# Fraction handed to functions.preprocess together with the labeled samples.
percent_test = 0.15

# Input files: positive examples, negative examples, unlabeled pool.
positive_set = 'data/bc_samples.txt'
negative_set = 'data/bc_grounds.txt'
unlabeled_set = 'data/unlabeled-data.csv'

analogy_list = functions.get_list_re(positive_set)
non_analogy_list = functions.get_list_re(negative_set)
unlabeled_list = functions.get_list_re(unlabeled_set)

# Positive texts are tagged 1 and negative texts 0, positives first.
samples = [(text, 1) for text in analogy_list]
samples += [(text, 0) for text in non_analogy_list]

train_data, train_labels, test_data, test_labels = functions.preprocess(
    samples, percent_test)

# Append unlabeled texts (label -1) to the training set. ``j`` ends up as
# the number of unlabeled texts seen, whether or not they were appended.
# NOTE(review): the cap admits items 0..20000, i.e. 20001 texts - confirm
# whether exactly 20000 was intended.
j = 0
for j, sample in enumerate(unlabeled_list, start=1):
    if j <= 20001:
        train_data.append(sample)
        train_labels.append(-1)
train_labels = np.array(train_labels)