Exemplo n.º 1
0
class DiabetesPrediction:
    def __init__(self, data: str = "diabetes") -> None:
        """Remember the dataset identifier.

        Args:
            data: Name of the dataset. Among the methods visible in this class
                it is only mentioned in a commented-out ``fetch_mldata`` call,
                so it does not currently influence processing.
        """
        self.data = data

    def data_processing(self, fileName='pima-indians-diabetes.csv'):
        """Load the CSV, impute missing measurements and standardise features.

        Zeros in columns 1-5 are treated as missing: they are replaced by NaN
        and then filled in by MICE imputation (not by the column mean).
        Columns 0-7 are read as features and column 8 as the target; only
        features 0, 1, 2, 5, 6 and 7 are kept before scaling.

        Args:
            fileName: Path of a header-less CSV file.

        Returns:
            Tuple ``(X, ytrue, sc_X)``: the standardised feature matrix, the
            target column and the fitted ``StandardScaler``.
        """
        dataset = read_csv(fileName, header=None)

        # Mark zero readings in these columns as missing so they get imputed.
        # ``numpy.nan`` is used because the ``numpy.NaN`` alias no longer
        # exists in NumPy 2.x.
        missing_cols = [1, 2, 3, 4, 5]
        dataset[missing_cols] = dataset[missing_cols].replace(0, numpy.nan)

        imputer = MICE(n_imputations=100,
                       impute_type='pmm',
                       n_nearest_columns=5,
                       verbose=False)
        transformed_values = imputer.complete(dataset.values)

        X = transformed_values[:, 0:8]
        ytrue = transformed_values[:, 8]

        # Feature selection: keep a fixed subset of the eight input columns.
        X = X[:, [0, 1, 2, 5, 6, 7]]

        sc_X = StandardScaler()
        X = sc_X.fit_transform(X)
        return X, ytrue, sc_X

    def unlabel_data(self, ytrue, seed=42, label_perc=.2):
        """Hide a random subset of labels to simulate semi-supervised data.

        Every sample independently keeps its label with probability
        ``label_perc``; all other samples are marked ``-1`` (unlabeled).

        Args:
            ytrue: True labels as any 1-D sequence or array (lists included).
            seed: Seed of a private ``RandomState`` so the mask is reproducible
                and the global NumPy RNG is left untouched.
            label_perc: Probability, between 0 and 1, of keeping each label.

        Returns:
            Integer array with the same length as ``ytrue`` holding the kept
            labels and ``-1`` everywhere else.
        """
        # Coerce first: a plain list cannot be indexed with a boolean mask.
        labels = np.asarray(ytrue)
        rng = np.random.RandomState(seed)
        keep_label = rng.rand(len(labels)) < label_perc

        ys = np.full(len(labels), -1)  # -1 denotes an unlabeled point
        ys[keep_label] = labels[keep_label]
        return ys

    def validation(self, y_test, y_pred_test, y_pred_prob):
        """Print evaluation metrics for a binary classifier and plot them.

        Prints accuracy, F1, the classification report, rates derived from the
        confusion matrix and the ROC AUC, then shows three figures: the
        confusion-matrix heat map, a histogram of the predicted probabilities
        and the ROC curve.

        Args:
            y_test: True class labels.
            y_pred_test: Predicted class labels.
            y_pred_prob: Predicted scores for the positive class; callers in
                this class pass ``predict_proba(...)[:, 1]``.

        Returns:
            Tuple ``(acc, sensitivity, specificity, roc_auc)``.
        """
        acc = sklearn.metrics.accuracy_score(y_test,
                                             y_pred_test,
                                             sample_weight=None)
        print("Accuracy:", acc)
        print("F1 SCORE: ", f1_score(y_test, y_pred_test))
        print("classification report: ")
        print(classification_report(y_test, y_pred_test))

        # Indexing below reads the matrix as [actual class, predicted class].
        cm = confusion_matrix(y_test, y_pred_test)
        TP = cm[1, 1]
        TN = cm[0, 0]
        FP = cm[0, 1]
        FN = cm[1, 0]

        classification_error = (FP + FN) / float(TP + TN + FP + FN)
        print("classification_error: ", classification_error)

        # Sensitivity (recall): when the actual value is positive, how often
        # is the prediction correct?
        sensitivity = TP / float(FN + TP)
        print("sensitivity: ", sensitivity)

        # Specificity: when the actual value is negative, how often is the
        # prediction correct?  Uses float() like the other rates so every
        # ratio is computed the same way.
        specificity = TN / float(TN + FP)
        print("specificity: ", specificity)

        # Precision: how often is a positive prediction correct?
        precision = TP / float(TP + FP)
        print("precision: ", precision)

        roc_auc = sklearn.metrics.roc_auc_score(y_test, y_pred_prob)
        print("ROC Curve AUC Area: ", roc_auc)
        print("Confusion matrix:")
        print(cm)

        self._plot_confusion_matrix(cm)
        self._plot_probability_histogram(y_pred_prob)
        self._plot_roc_curve(y_test, y_pred_prob)
        return acc, sensitivity, specificity, roc_auc

    def _plot_confusion_matrix(self, cm):
        """Show the confusion matrix as an annotated heat map."""
        label = ["0", "1"]
        sns.heatmap(cm, annot=True, xticklabels=label, yticklabels=label)
        plt.show()

    def _plot_probability_histogram(self, y_pred_prob):
        """Show an 8-bin histogram of the predicted probabilities on [0, 1]."""
        plt.rcParams['font.size'] = 12
        plt.hist(y_pred_prob, bins=8)
        plt.xlim(0, 1)
        plt.title('Histogram of predicted probabilities')
        plt.xlabel('Predicted probability of diabetes')
        plt.ylabel('Frequency')
        plt.show()

    def _plot_roc_curve(self, y_test, y_pred_prob):
        """Print the ROC curve's FPR/TPR arrays and show the curve."""
        fpr, tpr, _ = sklearn.metrics.roc_curve(y_test, y_pred_prob)
        print("fpr below")
        print(fpr)
        print("tpr below")
        print(tpr)
        plt.plot(fpr, tpr)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.rcParams['font.size'] = 12
        plt.title('ROC curve for diabetes classifier')
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.grid(True)
        plt.show()

    def cross_valid(self, model, X, Y):
        """Print the mean and spread of 10-fold cross-validated scores.

        The folds are consecutive, unshuffled slices of the data. Scores come
        from ``cross_val_score`` and are reported in percent.

        Args:
            model: Estimator handed to ``cross_val_score``.
            X: Feature matrix.
            Y: Labels aligned with ``X``.

        Returns:
            None. The summary line is printed.
        """
        num_folds = 10
        seed = 42
        # Seed NumPy's global RNG so anything drawing from it is repeatable.
        np.random.seed(seed)

        # ``model_selection.KFold`` replaces the removed
        # ``cross_validation.KFold(n=..., n_folds=...)``.  The old splitter did
        # not shuffle, so no shuffle/random_state is passed here.  The folds
        # are materialised as explicit (train, test) index pairs, a form of
        # ``cv`` that ``cross_val_score`` accepts directly.
        kfold = model_selection.KFold(n_splits=num_folds)
        folds = list(kfold.split(X))
        results = cross_val_score(model, X, Y, cv=folds)

        results *= 100.0
        info = "Model 10 fold Accuracy mean: %.2f%% (+/- %.3f%%)" % (
            results.mean(), results.std())
        print(info)

    def cross_valid2(self,
                     model,
                     X,
                     y,
                     label_perc=.8,
                     test_train_split=.2,
                     show_plot=False):
        """Measure test accuracy over 10 random splits with hidden labels.

        For each trial the data is split into train and test parts, a share
        of the training labels is hidden with ``unlabel_data``, the model is
        fitted on the partially labeled training set and scored on the test
        set.

        Args:
            model: Semi-supervised estimator exposing ``fit`` and ``predict``.
            X: Feature matrix.
            y: True labels aligned with ``X``.
            label_perc: Probability of keeping each training label.
            test_train_split: Fraction of the data held out for testing.
            show_plot: When true, show a point plot of accuracy per trial.

        Returns:
            One-element list holding the mean accuracy in percent.
        """
        n_trials = 10
        results = []
        for trial in range(n_trials):
            # The same seed drives both the split and the label mask.
            trial_seed = 5 + trial

            # split train, test data
            X_train, X_test, ytrue, y_test = model_selection.train_test_split(
                X, y, test_size=test_train_split, random_state=trial_seed)

            # split label and unlabel sample
            ys = self.unlabel_data(ytrue, trial_seed, label_perc)

            model.fit(X_train, ys)
            # Only hard predictions are scored, so predict_proba is not called.
            y_pred_test = model.predict(X_test)
            accuracy = sklearn.metrics.accuracy_score(y_test,
                                                      y_pred_test,
                                                      sample_weight=None)
            results.append(accuracy * 100.0)

        print(results)
        mean_accuracy = np.mean(results)
        print(
            "Model 10 fold Accuracy mean: %.2f%% (+/- %.3f%%)" %
            (mean_accuracy, np.std(results)), "label %", label_perc)

        if show_plot:
            _, ax = plt.subplots()
            plt.axis([1, 10, 0, 100])
            plt.title("10 fold CV Accuracy variance")
            # x_min/x_max/y_min/y_max are not pointplot parameters, so they
            # are no longer passed.
            sns.pointplot(x=list(range(1, n_trials + 1)), y=results, ax=ax)
            ax.set_xlabel('Index Number for trial')
            ax.set_ylabel('Accuracy')
            plt.show()
        return [mean_accuracy]

    def validate_algo(self, X, ytrue, model):
        """Report how accuracy varies with labeled share and test-set size.

        Runs ``cross_valid2`` once with its defaults and a per-trial plot, then
        sweeps the labeled share from 0.1 to 0.9, then sweeps the test share
        from 0.1 to 0.9 with half of the training labels kept. Each sweep
        prints its mean accuracies and shows a point plot.

        Args:
            X: Feature matrix.
            ytrue: True labels aligned with ``X``.
            model: Semi-supervised estimator passed on to ``cross_valid2``.

        Returns:
            None. Results are printed and plotted.
        """
        self.cross_valid2(model, X, ytrue, show_plot=True)

        label_percs = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
        # cross_valid2 returns a one-element list; concatenate into one array.
        result = np.concatenate(
            [self.cross_valid2(model, X, ytrue, perc) for perc in label_percs])
        print(result)
        print(
            "Model 10 fold Accuracy with varrying label mean: %.2f%% (+/- %.3f%%)"
            % (np.mean(result), np.std(result)))
        self._plot_accuracy_sweep(label_percs, result,
                                  "10 fold CV Accuracy with label sample %",
                                  'Labeled Sample Percentage')

        test_train_splits = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
        result = np.concatenate([
            self.cross_valid2(model, X, ytrue, .5, share)
            for share in test_train_splits
        ])
        print(result)
        print(
            "Model 10 fold Accuracy with varrying test data mean: %.2f%% (+/- %.3f%%)"
            % (np.mean(result), np.std(result)))
        self._plot_accuracy_sweep(test_train_splits, result,
                                  "10 fold CV Accuracy with test sample %",
                                  'Test Sample Percentage')

    def _plot_accuracy_sweep(self, x_values, accuracies, title, xlabel):
        """Show a point plot of accuracy (percent) against a swept setting."""
        _, ax = plt.subplots()
        plt.axis([0, 1, 0, 100])
        plt.title(title)
        # x_min/x_max/y_min/y_max are not pointplot parameters, so they are
        # no longer passed.
        sns.pointplot(x=x_values, y=accuracies, ax=ax)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Accuracy')
        plt.show()

    def process(self):
        """Build, fit and evaluate the supervised and semi-supervised models.

        Creates ``self.basemodel``, ``self.model2`` (sigmoid SVM),
        ``self.TSVMmodel``, ``self.S3VMmodel`` (CPLE) and ``self.ssmodel``
        (self-learning). Cross-validates the sigmoid SVM, then holds out 20%
        of the data, hides part of the training labels and reports every
        model on both the training and the test split.

        Returns:
            The ``validation`` tuple ``(acc, sensitivity, specificity,
            roc_auc)`` of the self-learning model on the test split.
        """
        X, ytrue, _ = self.data_processing()
        self.basemodel = svm.SVC(kernel='rbf',
                                 decision_function_shape='ovr',
                                 probability=True)

        print("SVM model cross Validation")
        # create SVM model
        self.model2 = svm.SVC(kernel='sigmoid',
                              decision_function_shape='ovr',
                              probability=True,
                              gamma=.1,
                              coef0=.5)
        self.cross_valid(self.model2, X, ytrue)

        # TSVM
        print("T SVM Semi Supervised Classifier cross Validation")
        self.TSVMmodel = SKTSVM(kernel='rbf')
        # Optional sweep: self.validate_algo(X, ytrue, self.TSVMmodel)

        # S3VM (CPLE) on top of the RBF SVM
        print("CPLE SVM Semi Supervised Classifier cross Validation")
        self.S3VMmodel = CPLELearningModel(
            self.basemodel, predict_from_probabilities=True)
        # Optional sweep: self.validate_algo(X, ytrue, self.S3VMmodel)
        # Optional: self.cross_valid2(self.S3VMmodel, X, ytrue, show_plot=True, label_perc=.5)

        # create semi supervised model with svm as base model
        self.ssmodel = SelfLearningModel(self.basemodel)
        print("Fast Semi Supervised Classifier cross Validation")
        # Optional sweep: self.validate_algo(X, ytrue, self.ssmodel)

        # split train, test data
        X, X_test, ytrue, y_test = model_selection.train_test_split(
            X, ytrue, test_size=.2, random_state=7)

        # split label and unlabel sample
        ys = self.unlabel_data(ytrue, 42, .8)

        # Plain SVM: trained on all training labels as the supervised baseline.
        self.model2.fit(X, ytrue)
        print("Simple SVM Model")
        # The test probabilities used to be stored under one name and read
        # under another, which raised NameError; the helper keeps them paired.
        self._report_train_and_test(self.model2,
                                    "SVM Algo Train Data Validation",
                                    "SVM Algo Test Data Validation",
                                    X, ytrue, X_test, y_test)

        # fit TSVM semi supervised model
        self.TSVMmodel.fit(X, ys)
        print("TSVM Semi Supervised Fast Algo ready")
        self._report_train_and_test(
            self.TSVMmodel,
            "TSVM Semi Supervised Fast Algo Train Data Validation",
            "TSVMmodel Semi Supervised Fast Algo Test Data Validation",
            X, ytrue, X_test, y_test)

        # fit CPLE semi supervised model
        self.S3VMmodel.fit(X, ys)
        print("CPLE Semi Supervised Fast Algo ready")
        self._report_train_and_test(
            self.S3VMmodel,
            "CPLE Semi Supervised Fast Algo Train Data Validation",
            "CPLE Semi Supervised Fast Algo Test Data Validation",
            X, ytrue, X_test, y_test)

        # fit Fast semi supervised model
        self.ssmodel.fit(X, ys)
        print("Semi Supervised Fast Algo ready")
        return self._report_train_and_test(
            self.ssmodel,
            "Semi Supervised Fast Algo Train Data Validation",
            "Semi Supervised Fast Algo Test Data Validation",
            X, ytrue, X_test, y_test)

    def _report_train_and_test(self, model, train_banner, test_banner, X,
                               ytrue, X_test, y_test):
        """Validate a fitted model on the training split, then the test split.

        Returns the ``validation`` tuple obtained on the test split.
        """
        y_pred_train = model.predict(X)
        y_pred_train_prob = model.predict_proba(X)[:, 1]
        print(train_banner)
        self.validation(ytrue, y_pred_train, y_pred_train_prob)

        y_pred_test = model.predict(X_test)
        y_pred_prob = model.predict_proba(X_test)[:, 1]
        print(test_banner)
        return self.validation(y_test, y_pred_test, y_pred_prob)

    def predict(self, x):
        return self.ssmodel.predict(x)

    def plot_boundary(self, pl, model, title=''):
        """Plot the data in 2-D PCA space with the model's decision contour.

        The data is reloaded through ``data_processing``, projected onto its
        first two principal components, and ``model`` is refitted on that
        projection with part of the labels hidden.

        Args:
            pl: Plotting interface providing ``scatter``, ``legend``,
                ``contour``, ``axis``, ``title`` and ``show`` (callers in this
                class pass ``matplotlib.pyplot``).
            model: Estimator to refit on the 2-D projection.
            title: Figure title; defaults to an empty title so the existing
                two-argument calls keep working.

        Returns:
            The ``pl`` object that was passed in.
        """
        X1, ytrue, _ = self.data_processing()

        # create PCA transform
        pca = PCA(n_components=2).fit(X1)
        pca_2d = pca.transform(X1)

        # One scatter call per class instead of one per sample.  This also
        # avoids an unbound legend handle when a class has no samples.
        # Class 1 is the diabetes class (the classifier's positive class), so
        # class 0 is labelled 'No Diabetes'.
        is_negative = np.asarray(ytrue) == 0
        pl.scatter(pca_2d[is_negative, 0], pca_2d[is_negative, 1],
                   c='r', marker='+', label='No Diabetes')
        pl.scatter(pca_2d[~is_negative, 0], pca_2d[~is_negative, 1],
                   c='g', marker='o', label='Diabetes')
        pl.legend()

        # Evaluation grid covering the projected data with a margin of 1.
        x_min, x_max = pca_2d[:, 0].min() - 1, pca_2d[:, 0].max() + 1
        y_min, y_max = pca_2d[:, 1].min() - 1, pca_2d[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, .01),
                             np.arange(y_min, y_max, .01))

        # split label and unlabeled data for PCA self learning model
        ys = self.unlabel_data(ytrue, 42, .8)

        model.fit(pca_2d, ys)
        print("PCA model built")
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        SMALL_SIZE = 14
        MEDIUM_SIZE = 16
        plt.rc('font', size=SMALL_SIZE)  # controls default text sizes
        plt.rc('axes', titlesize=SMALL_SIZE)  # fontsize of the axes title
        plt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
        plt.rc('xtick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('ytick', labelsize=SMALL_SIZE)  # fontsize of the tick labels
        plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize

        pl.contour(xx, yy, Z)
        pl.axis('off')
        pl.title(title)
        pl.show()
        return pl

    def Run_Algo(self):
        """Run the full pipeline on a fresh instance and show sample output.

        Creates a new ``DiabetesPrediction``, runs ``process`` on it, prints
        the self-learning model's predictions and probabilities for the first
        ten samples, and plots the decision boundaries of the self-learning
        and TSVM models.

        Returns:
            None.
        """
        # main code
        D = DiabetesPrediction()
        D.process()

        # testing
        X1, ytrue, _ = D.data_processing()
        print("testing first 10 samples:")
        print("Actual Y values:", ytrue[:10])
        print("Semi Supervised predicted Y values", D.predict(X1[:10, :]))
        print("Semi supervised predicted Y prob")
        print(D.ssmodel.predict_proba(X1[:10, :]))

        # plot model decision boundary
        # The models were fitted on ``D``, not on ``self``, so they must be
        # read from ``D``.  A title is passed because ``plot_boundary`` uses
        # one for the figure.
        D.plot_boundary(plt, D.ssmodel,
                        "Self-learning SVM decision boundary")
        D.plot_boundary(plt, D.TSVMmodel, "TSVM decision boundary")
Exemplo n.º 2
0
# Script fragment: evaluates a TSVM text classifier on a held-out set.
# TfidfVect, test_data, train_set, train_labels, test_labels, tsvm,
# confusion_matrix and functions are defined outside the visible part.

# Transform the test documents with the vectoriser and call .toarray() on the
# result (presumably sparse -> dense; confirm against the vectoriser used).
test_set = TfidfVect.transform(test_data).toarray()

# Label Propagation
# (disabled: the block below is wrapped in a bare string literal, so it is
# evaluated as a string and never executed)
"""
label_prop_model = helpers.get_function('LP')
label_prop_model.fit(train_set, train_labels)
test_predict = label_prop_model.predict(test_set)
print(label_prop_model.score(test_set, test_labels))
"""

# Dataset summary: training labels equal to -1 are counted as unlabeled.
print("Total size of training set: ", len(train_labels))
i = 0
for l in train_labels:
    if l == -1:
        i += 1
print("Size of unlabeled data: ", i)
print("Size of the testing set ", len(test_data))

# TSVM
# (the '#"""' markers are commented-out string delimiters, so this section
# runs; uncommenting them would disable it like the block above)
#"""
tsvm.fit(train_set, train_labels)
test_predict = tsvm.predict(test_set)
print("Accuracy: ", tsvm.score(test_set, test_labels))
#"""
# Confusion matrix requested with label 1 listed before label 0.
# NOTE(review): presumably functions.fmeasure expects that ordering - confirm.
print("Confusion matrix:")
matrix = confusion_matrix(test_labels, test_predict, labels=[1, 0])
print(matrix)
# fmeasure returns three values, unpacked as precision, recall and F-measure.
precision, recall, f_measure = functions.fmeasure(matrix)
print("Precision: ", precision)
print("Recall: ", recall)
print("f_measure: ", f_measure)