def get_function(name):
    """Build the estimator or vectorizer registered under *name*.

    Recognised names are "LP", "TSVM", "hash", "count" and "tfidf".
    Any other value yields ``None``.
    """
    # Each entry pairs a name with a zero-argument factory so that only the
    # requested object is ever constructed.
    factories = (
        ("LP", lambda: LabelPropagation(kernel=rbf_kernel_safe)),
        ("TSVM", lambda: SKTSVM(probability=False)),
        ("hash", lambda: HashingVectorizer()),
        ("count", lambda: CountVectorizer()),
        # The identity tokenizer hands each document to the vectorizer as-is.
        ("tfidf", lambda: TfidfVectorizer(tokenizer=lambda doc: doc,
                                          lowercase=False)),
    )
    for key, build in factories:
        if name == key:
            return build()
    return None
class DiabetesPrediction:
    """Train and evaluate supervised and semi-supervised diabetes classifiers.

    The class loads a CSV data set, imputes and scales it, hides part of the
    training labels (marking them ``-1``) and compares a plain SVM with the
    TSVM, CPLE and self-learning semi-supervised models.
    """

    def __init__(self, data="diabetes"):
        """Store the data-set name.

        ``data`` is only kept on the instance; no active code in this class
        reads it.
        """
        self.data = data

    def data_processing(self, fileName='pima-indians-diabetes.csv'):
        """Load, impute, select and standardise the data set.

        Args:
            fileName: path of a header-less CSV file whose column 8 is the
                label.

        Returns:
            ``(X, ytrue, sc_X)``: the scaled feature matrix (columns
            0, 1, 2, 5, 6, 7 of the imputed data), the label column and the
            fitted ``StandardScaler``.
        """
        dataset = read_csv(fileName, header=None)
        # Zeros in columns 1-5 are treated as missing values and handed to
        # the imputer as NaN.  ``numpy.nan`` is used because the ``NaN``
        # alias was removed in NumPy 2.0.
        dataset[[1, 2, 3, 4, 5]] = dataset[[1, 2, 3, 4, 5]].replace(
            0, numpy.nan)
        values = dataset.values
        # Fixed: the original passed the undefined name ``FALSE``.
        imputer = MICE(n_imputations=100, impute_type='pmm',
                       n_nearest_columns=5, verbose=False)
        transformed_values = imputer.complete(values)
        X = transformed_values[:, 0:8]
        ytrue = transformed_values[:, 8]
        # Feature selection: columns 3 and 4 are dropped.
        X = X[:, [0, 1, 2, 5, 6, 7]]
        sc_X = StandardScaler()
        X = sc_X.fit_transform(X)
        return X, ytrue, sc_X

    def unlabel_data(self, ytrue, seed=42, label_perc=.2):
        """Return a copy of the labels with a random subset hidden.

        Args:
            ytrue: array of true labels.
            seed: seed of the random generator that picks the labelled points.
            label_perc: probability (0..1) that a point keeps its label.

        Returns:
            An integer array of ``len(ytrue)`` where hidden points are ``-1``.
        """
        rng = np.random.RandomState(seed)
        random_labeled_points = rng.rand(len(ytrue)) < label_perc
        ys = np.array([-1] * len(ytrue))  # -1 denotes an unlabeled point
        ys[random_labeled_points] = ytrue[random_labeled_points]
        return ys

    def validation(self, y_test, y_pred_test, y_pred_prob):
        """Print metrics and show diagnostic plots for one set of predictions.

        Args:
            y_test: true labels.
            y_pred_test: predicted labels.
            y_pred_prob: predicted score of class 1, used for the ROC curve
                and the probability histogram.

        Returns:
            ``(accuracy, sensitivity, specificity, roc_auc)``.
        """
        acc = sklearn.metrics.accuracy_score(y_test, y_pred_test,
                                             sample_weight=None)
        print("Accuracy:", acc)
        print("F1 SCORE: ", f1_score(y_test, y_pred_test))
        print("classification report: ")
        print(classification_report(y_test, y_pred_test))

        cm = confusion_matrix(y_test, y_pred_test)
        TP = cm[1, 1]
        TN = cm[0, 0]
        FP = cm[0, 1]
        FN = cm[1, 0]

        classification_error = (FP + FN) / float(TP + TN + FP + FN)
        print("classification_error: ", classification_error)

        # Recall: how often an actual positive is predicted positive.
        sensitivity = TP / float(FN + TP)
        print("sensitivity: ", sensitivity)

        # How often an actual negative is predicted negative.  ``float`` is
        # applied like in the sibling metrics so the ratio is never truncated
        # by integer division.
        specificity = TN / float(TN + FP)
        print("specificity: ", specificity)

        # How often a positive prediction is correct.
        precision = TP / float(TP + FP)
        print("precision: ", precision)

        roc_auc = sklearn.metrics.roc_auc_score(y_test, y_pred_prob)
        print("ROC Curve AUC Area: ", roc_auc)

        print("Confusion matrix:")
        print(cm)
        label = ["0", "1"]
        sns.heatmap(cm, annot=True, xticklabels=label, yticklabels=label)
        plt.show()

        # Histogram of the predicted class-1 probabilities (8 bins, 0..1).
        plt.rcParams['font.size'] = 12
        plt.hist(y_pred_prob, bins=8)
        plt.xlim(0, 1)
        plt.title('Histogram of predicted probabilities')
        plt.xlabel('Predicted probability of diabetes')
        plt.ylabel('Frequency')
        plt.show()

        # ROC curve.
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, y_pred_prob)
        print("fpr below")
        print(fpr)
        print("tpr below")
        print(tpr)
        plt.plot(fpr, tpr)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.rcParams['font.size'] = 12
        plt.title('ROC curve for diabetes classifier')
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.grid(True)
        plt.show()
        return acc, sensitivity, specificity, roc_auc

    def cross_valid(self, model, X, Y):
        """Print the mean and spread of a 10-fold cross-validation score."""
        num_folds = 10
        num_instances = len(X)
        seed = 42
        np.random.seed(seed)
        # NOTE(review): ``cross_validation.KFold(n=..., n_folds=...)`` is the
        # legacy scikit-learn API; it is kept because the origin of
        # ``cross_val_score`` is not visible here and the two APIs are not
        # interchangeable -- confirm before migrating.
        kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds,
                                       random_state=seed)
        results = cross_val_score(model, X, Y, cv=kfold)
        results *= 100.0
        info = "Model 10 fold Accuracy mean: %.2f%% (+/- %.3f%%)" % (
            results.mean(), results.std())
        print(info)

    def cross_valid2(self, model, X, y, label_perc=.8, test_train_split=.2,
                     show_plot=False):
        """Score a semi-supervised model over ten random train/test splits.

        For each of ten seeds the data is split, part of the training labels
        is hidden with ``unlabel_data`` and the model is fitted and scored on
        the held-out part.

        Args:
            model: estimator exposing ``fit`` and ``predict``.
            X, y: features and true labels.
            label_perc: fraction of training labels that stays visible.
            test_train_split: fraction of the data held out for testing.
            show_plot: when true, plot the ten accuracies.

        Returns:
            A one-element list holding the mean accuracy in percent.
        """
        results = []
        result_mean = []
        for i in range(0, 10):
            X_train, X_test, ytrue, y_test = model_selection.train_test_split(
                X, y, test_size=test_train_split, random_state=5 + i)
            ys = self.unlabel_data(ytrue, 5 + i, label_perc)
            model.fit(X_train, ys)
            y_pred_test = model.predict(X_test)
            # The original also computed ``predict_proba`` here but never
            # used the result; the dead call was removed.
            accuracy = sklearn.metrics.accuracy_score(y_test, y_pred_test,
                                                      sample_weight=None)
            results.append(accuracy * 100.0)

        print(results)
        print(
            "Model 10 fold Accuracy mean: %.2f%% (+/- %.3f%%)" %
            (np.mean(results), np.std(results)), "label %", label_perc)
        result_mean.append(np.mean(results))

        if show_plot:
            fig, ax = plt.subplots()
            plt.axis([1, 10, 0, 100])
            plt.title("10 fold CV Accuracy variance")
            # NOTE(review): x_min/x_max/y_min/y_max are not documented
            # ``pointplot`` parameters -- confirm against the installed
            # seaborn version.
            sns.pointplot(x=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], y=results, ax=ax,
                          x_min=0, x_max=10, y_min=0, y_max=100)
            ax.set_xlabel('Index Number for trial')
            ax.set_ylabel('Accuracy')
            plt.show()
        return result_mean

    def _sweep_and_plot(self, settings, run_one, summary_format, title,
                        x_label):
        """Run ``run_one`` for every setting, then print and plot the means."""
        result = []
        for setting in settings:
            result = numpy.append(result, run_one(setting), axis=0)
        print(result)
        print(summary_format % (np.mean(result), np.std(result)))
        fig, ax = plt.subplots()
        plt.axis([0, 1, 0, 100])
        plt.title(title)
        # NOTE(review): see cross_valid2 about the *_min/*_max keywords.
        sns.pointplot(x=settings, y=result, ax=ax,
                      x_min=0, x_max=1, y_min=0, y_max=100)
        ax.set_xlabel(x_label)
        ax.set_ylabel('Accuracy')
        plt.show()

    def validate_algo(self, X, ytrue, model):
        """Study how accuracy varies with labelled fraction and test size."""
        # Fixed: the original passed the undefined name ``TRUE``.
        self.cross_valid2(model, X, ytrue, show_plot=True)

        label_percs = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
        self._sweep_and_plot(
            label_percs,
            lambda perc: self.cross_valid2(model, X, ytrue, perc),
            "Model 10 fold Accuracy with varrying label mean: %.2f%% (+/- %.3f%%)",
            "10 fold CV Accuracy with label sample %",
            'Labeled Sample Percentage')

        test_train_splits = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
        self._sweep_and_plot(
            test_train_splits,
            lambda split: self.cross_valid2(model, X, ytrue, .5, split),
            "Model 10 fold Accuracy with varrying test data mean: %.2f%% (+/- %.3f%%)",
            "10 fold CV Accuracy with test sample %",
            'Test Sample Percentage')

    def process(self):
        """Build, fit and validate every model on one train/test split.

        Sets ``basemodel``, ``model2``, ``TSVMmodel``, ``S3VMmodel`` and
        ``ssmodel`` on the instance.

        Returns:
            The ``validation`` tuple of the self-learning model on test data.
        """
        X, ytrue, _ = self.data_processing()
        self.basemodel = svm.SVC(kernel='rbf', decision_function_shape='ovr',
                                 probability=True)

        print("SVM model cross Validation")
        self.model2 = svm.SVC(kernel='sigmoid', decision_function_shape='ovr',
                              probability=True, gamma=.1, coef0=.5)
        self.cross_valid(self.model2, X, ytrue)

        # The validate_algo sweeps of the semi-supervised models are
        # disabled; only the models themselves are created here.
        print("T SVM Semi Supervised Classifier cross Validation")
        self.TSVMmodel = SKTSVM(kernel='rbf')

        print("CPLE SVM Semi Supervised Classifier cross Validation")
        self.S3VMmodel = CPLELearningModel(
            self.basemodel, predict_from_probabilities=True)

        self.ssmodel = SelfLearningModel(self.basemodel)
        print("Fast Semi Supervised Classifier cross Validation")

        # Hold out 20% for testing, then hide part of the training labels.
        X, X_test, ytrue, y_test = model_selection.train_test_split(
            X, ytrue, test_size=.2, random_state=7)
        ys = self.unlabel_data(ytrue, 42, .8)

        # Fully supervised SVM baseline.
        self.model2.fit(X, ytrue)
        print("Simple SVM Model")
        y_pred_train_svm = self.model2.predict(X)
        y_pred_train_prob_svm = self.model2.predict_proba(X)[:, 1]
        print("SVM Algo Train Data Validation")
        self.validation(ytrue, y_pred_train_svm, y_pred_train_prob_svm)
        y_pred_test_svm = self.model2.predict(X_test)
        y_pred_prob_svm = self.model2.predict_proba(X_test)[:, 1]
        print("SVM Algo Test Data Validation")
        # Fixed: the probabilities were stored as ``y2_pred_prob_svm`` but
        # read as ``y_pred_prob_svm``, which raised NameError.
        self.validation(y_test, y_pred_test_svm, y_pred_prob_svm)

        # TSVM semi-supervised model.
        self.TSVMmodel.fit(X, ys)
        print("TSVM Semi Supervised Fast Algo ready")
        y_pred_train = self.TSVMmodel.predict(X)
        y_pred_train_prob = self.TSVMmodel.predict_proba(X)[:, 1]
        print("TSVM Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)
        y_pred_test = self.TSVMmodel.predict(X_test)
        y_pred_prob = self.TSVMmodel.predict_proba(X_test)[:, 1]
        print("TSVMmodel Semi Supervised Fast Algo Test Data Validation")
        self.validation(y_test, y_pred_test, y_pred_prob)

        # CPLE semi-supervised model.
        self.S3VMmodel.fit(X, ys)
        print("CPLE Semi Supervised Fast Algo ready")
        y_pred_train = self.S3VMmodel.predict(X)
        y_pred_train_prob = self.S3VMmodel.predict_proba(X)[:, 1]
        print("CPLE Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)
        y_pred_test = self.S3VMmodel.predict(X_test)
        y_pred_prob = self.S3VMmodel.predict_proba(X_test)[:, 1]
        print("CPLE Semi Supervised Fast Algo Test Data Validation")
        self.validation(y_test, y_pred_test, y_pred_prob)

        # Self-learning semi-supervised model.
        self.ssmodel.fit(X, ys)
        print("Semi Supervised Fast Algo ready")
        y_pred_train = self.ssmodel.predict(X)
        y_pred_train_prob = self.ssmodel.predict_proba(X)[:, 1]
        print("Semi Supervised Fast Algo Train Data Validation")
        self.validation(ytrue, y_pred_train, y_pred_train_prob)
        y_pred_test = self.ssmodel.predict(X_test)
        y_pred_prob = self.ssmodel.predict_proba(X_test)[:, 1]
        print("Semi Supervised Fast Algo Test Data Validation")
        return self.validation(y_test, y_pred_test, y_pred_prob)

    def predict(self, x):
        """Predict labels for ``x`` with the self-learning model.

        ``process`` must have been called first so ``self.ssmodel`` exists.
        """
        return self.ssmodel.predict(x)

    def plot_boundary(self, pl, model, title):
        """Fit ``model`` on a 2-D PCA projection and draw its decision contour.

        Args:
            pl: plotting module/object exposing ``scatter``, ``legend``,
                ``contour``, ``axis``, ``title`` and ``show``.
            model: estimator with ``fit`` and ``predict``; it is re-fitted
                on the projected data, replacing any earlier fit.
            title: plot title.

        Returns:
            ``pl``.
        """
        X1, ytrue, sc_X = self.data_processing()
        pca = PCA(n_components=2).fit(X1)
        pca_2d = pca.transform(X1)

        # One scatter call per class instead of one per point; this also
        # keeps both handles defined when a class has no samples.
        negative = ytrue == 0
        c1 = pl.scatter(pca_2d[negative, 0], pca_2d[negative, 1],
                        c='r', marker='+')
        c2 = pl.scatter(pca_2d[~negative, 0], pca_2d[~negative, 1],
                        c='g', marker='o')
        # Fixed: the legend labels were swapped.  ``validation`` treats class
        # 1 as diabetes, so the class-0 handle is the 'No Diabetes' one.
        pl.legend([c1, c2], ['No Diabetes', 'Diabetes'])

        x_min, x_max = pca_2d[:, 0].min() - 1, pca_2d[:, 0].max() + 1
        y_min, y_max = pca_2d[:, 1].min() - 1, pca_2d[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, .01),
                             np.arange(y_min, y_max, .01))

        # Hide part of the labels and fit the model on the projection.
        ys = self.unlabel_data(ytrue, 42, .8)
        model.fit(pca_2d, ys)
        print("PCA model built")
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        SMALL_SIZE = 14
        MEDIUM_SIZE = 16
        plt.rc('font', size=SMALL_SIZE)  # default text size
        plt.rc('axes', titlesize=SMALL_SIZE)  # axes title
        plt.rc('axes', labelsize=MEDIUM_SIZE)  # x and y labels
        plt.rc('xtick', labelsize=SMALL_SIZE)  # tick labels
        plt.rc('ytick', labelsize=SMALL_SIZE)
        plt.rc('legend', fontsize=SMALL_SIZE)
        pl.contour(xx, yy, Z)
        pl.axis('off')
        pl.title(title)
        pl.show()
        return pl

    def Run_Algo(self):
        """Train a fresh instance, print sample predictions and plot boundaries."""
        D = DiabetesPrediction()
        D.process()

        X1, ytrue, sc_X = D.data_processing()
        print("testing first 10 samples:")
        print("Actual Y values:", ytrue[:10])
        print("Semi Supervised predicted Y values", D.predict(X1[:10, :]))
        print("Semi supervised predicted Y prob")
        print(D.ssmodel.predict_proba(X1[:10, :]))

        # Fixed: the models live on ``D`` (the instance that ran ``process``),
        # not on ``self``, and ``plot_boundary`` requires a title.
        D.plot_boundary(plt, D.ssmodel,
                        'Self-learning SVM decision boundary')
        D.plot_boundary(plt, D.TSVMmodel, 'TSVM decision boundary')
def _score_or_empty(make_estimator, x_merged, y_merged, z, z_y,
                    failure_message):
    """Fit a transductive estimator and score it; return [] on failure."""
    try:
        estimator = make_estimator()
        estimator.fit(x_merged, y_merged)
        return estimator.score(z, z_y)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit still stop the run.
        print(failure_message)
        return []


def run_methods(x_c, y, x_e, z_c, z_y, z_e):
    """Score every baseline and generative method on one data split.

    Args:
        x_c, x_e: two feature blocks of the labelled data; they are
            concatenated column-wise for the discriminative baselines.
        y: labels of the labelled data.
        z_c, z_e: the matching feature blocks of the unlabelled data.
        z_y: true labels of the unlabelled data, used only for scoring.

    Returns:
        A tuple of eleven accuracies, in this order: linear logistic
        regression, linear TSVM, RBF TSVM, RBF label propagation, RBF label
        spreading, k-NN label propagation, k-NN label spreading,
        semi-generative (labelled data only), soft-label EM, hard-label EM
        and conditional label propagation.  A label propagation/spreading
        entry is ``[]`` when that method raised.
    """
    x = np.concatenate((x_c, x_e), axis=1)
    z = np.concatenate((z_c, z_e), axis=1)

    # Baseline: linear logistic regression on the labelled data only.
    lin_lr = LogisticRegression(random_state=0,
                                solver='liblinear').fit(x, y.ravel())
    acc_lin_lr = lin_lr.score(z, z_y)

    # Transductive approaches: merge labelled and unlabelled data, marking
    # the unlabelled rows with -1.
    x_merged = np.concatenate((x, z))
    y_merged = np.concatenate((y, -1 * np.ones(
        (z.shape[0], 1)))).ravel().astype(int)

    # Baseline: linear TSVM
    # (https://github.com/tmadl/semisup-learn/tree/master/methods).
    lin_tsvm = SKTSVM(kernel='linear')
    lin_tsvm.fit(x_merged, y_merged)
    acc_lin_tsvm = lin_tsvm.score(z, z_y)

    # Baseline: non-linear TSVM.
    rbf_tsvm = SKTSVM(kernel='RBF')
    rbf_tsvm.fit(x_merged, y_merged)
    acc_rbf_tsvm = rbf_tsvm.score(z, z_y)

    # Baselines: label propagation / spreading with RBF weights.
    acc_rbf_label_prop = _score_or_empty(
        lambda: LabelPropagation(kernel='rbf'),
        x_merged, y_merged, z, z_y, 'rbf label prop did not work')
    acc_rbf_label_spread = _score_or_empty(
        lambda: LabelSpreading(kernel='rbf'),
        x_merged, y_merged, z, z_y, 'rbf label spread did not work ')

    # The k-NN versions are unstable unless a large k is used.
    acc_knn_label_prop = _score_or_empty(
        lambda: LabelPropagation(kernel='knn', n_neighbors=11),
        x_merged, y_merged, z, z_y, 'knn label prop did not work')
    acc_knn_label_spread = _score_or_empty(
        lambda: LabelSpreading(kernel='knn', n_neighbors=11),
        x_merged, y_merged, z, z_y, 'knn label spread did not work')

    # Generative models.
    # Semi-generative model on labelled data only.
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e, converged=True)
    soft_label_semigen = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_semigen = soft_label_semigen > 0.5
    acc_semigen_labelled = np.mean(hard_label_semigen == z_y)

    # EM with soft labels.
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_soft_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_soft_EM = soft_label_soft_EM > 0.5
    acc_soft_EM = np.mean(hard_label_soft_EM == z_y)

    # EM with hard labels.
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = hard_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_hard_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_hard_EM = soft_label_hard_EM > 0.5
    acc_hard_EM = np.mean(hard_label_hard_EM == z_y)

    # Conditional label propagation.
    acc_cond_prop = conditional_prop(x_c, y, x_e, z_c, z_y, z_e)

    return (acc_lin_lr, acc_lin_tsvm, acc_rbf_tsvm, acc_rbf_label_prop,
            acc_rbf_label_spread, acc_knn_label_prop, acc_knn_label_spread,
            acc_semigen_labelled, acc_soft_EM, acc_hard_EM, acc_cond_prop)
def process(self):
    """Build, fit and validate every model on one train/test split.

    Sets ``basemodel``, ``model2``, ``TSVMmodel``, ``S3VMmodel`` and
    ``ssmodel`` on ``self``.

    Returns:
        The result of ``self.validation`` for the self-learning model on the
        test data.
    """
    X, ytrue, _ = self.data_processing()
    self.basemodel = svm.SVC(kernel='rbf', decision_function_shape='ovr',
                             probability=True)

    print("SVM model cross Validation")
    self.model2 = svm.SVC(kernel='sigmoid', decision_function_shape='ovr',
                          probability=True, gamma=.1, coef0=.5)
    self.cross_valid(self.model2, X, ytrue)

    # The validate_algo sweeps of the semi-supervised models are disabled;
    # only the models themselves are created here.
    print("T SVM Semi Supervised Classifier cross Validation")
    self.TSVMmodel = SKTSVM(kernel='rbf')

    print("CPLE SVM Semi Supervised Classifier cross Validation")
    self.S3VMmodel = CPLELearningModel(
        self.basemodel, predict_from_probabilities=True)

    self.ssmodel = SelfLearningModel(self.basemodel)
    print("Fast Semi Supervised Classifier cross Validation")

    # Hold out 20% for testing, then hide part of the training labels.
    X, X_test, ytrue, y_test = model_selection.train_test_split(
        X, ytrue, test_size=.2, random_state=7)
    ys = self.unlabel_data(ytrue, 42, .8)

    # Fully supervised SVM baseline.
    self.model2.fit(X, ytrue)
    print("Simple SVM Model")
    y_pred_train_svm = self.model2.predict(X)
    y_pred_train_prob_svm = self.model2.predict_proba(X)[:, 1]
    print("SVM Algo Train Data Validation")
    self.validation(ytrue, y_pred_train_svm, y_pred_train_prob_svm)
    y_pred_test_svm = self.model2.predict(X_test)
    y_pred_prob_svm = self.model2.predict_proba(X_test)[:, 1]
    print("SVM Algo Test Data Validation")
    # Fixed: the probabilities were stored as ``y2_pred_prob_svm`` but read
    # as ``y_pred_prob_svm``, which raised NameError.
    self.validation(y_test, y_pred_test_svm, y_pred_prob_svm)

    # TSVM semi-supervised model.
    self.TSVMmodel.fit(X, ys)
    print("TSVM Semi Supervised Fast Algo ready")
    y_pred_train = self.TSVMmodel.predict(X)
    y_pred_train_prob = self.TSVMmodel.predict_proba(X)[:, 1]
    print("TSVM Semi Supervised Fast Algo Train Data Validation")
    self.validation(ytrue, y_pred_train, y_pred_train_prob)
    y_pred_test = self.TSVMmodel.predict(X_test)
    y_pred_prob = self.TSVMmodel.predict_proba(X_test)[:, 1]
    print("TSVMmodel Semi Supervised Fast Algo Test Data Validation")
    self.validation(y_test, y_pred_test, y_pred_prob)

    # CPLE semi-supervised model.
    self.S3VMmodel.fit(X, ys)
    print("CPLE Semi Supervised Fast Algo ready")
    y_pred_train = self.S3VMmodel.predict(X)
    y_pred_train_prob = self.S3VMmodel.predict_proba(X)[:, 1]
    print("CPLE Semi Supervised Fast Algo Train Data Validation")
    self.validation(ytrue, y_pred_train, y_pred_train_prob)
    y_pred_test = self.S3VMmodel.predict(X_test)
    y_pred_prob = self.S3VMmodel.predict_proba(X_test)[:, 1]
    print("CPLE Semi Supervised Fast Algo Test Data Validation")
    self.validation(y_test, y_pred_test, y_pred_prob)

    # Self-learning semi-supervised model.
    self.ssmodel.fit(X, ys)
    print("Semi Supervised Fast Algo ready")
    y_pred_train = self.ssmodel.predict(X)
    y_pred_train_prob = self.ssmodel.predict_proba(X)[:, 1]
    print("Semi Supervised Fast Algo Train Data Validation")
    self.validation(ytrue, y_pred_train, y_pred_train_prob)
    y_pred_test = self.ssmodel.predict(X_test)
    y_pred_prob = self.ssmodel.predict_proba(X_test)[:, 1]
    print("Semi Supervised Fast Algo Test Data Validation")
    return self.validation(y_test, y_pred_test, y_pred_prob)
from sklearn import datasets
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import confusion_matrix
import numpy as np
import helpers
import functions
from sklearn.feature_extraction.text import TfidfVectorizer
from scikitTSVM import SKTSVM
import warnings

# Silence deprecation chatter from the libraries imported above.
for _ignored_category in (PendingDeprecationWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)
del _ignored_category

# Linear transductive SVM without probability output.
tsvm = SKTSVM(kernel='linear', C=0.01, gamma=1.0, lamU=1.0,
              probability=False)

percent_test = 0.15

# Input files: positive samples, negative samples and the unlabelled pool.
positive_set = 'data/bc_samples.txt'
negative_set = 'data/bc_grounds.txt'
unlabeled_set = 'data/unlabeled-data.csv'

analogy_list = functions.get_list_re(positive_set)
non_analogy_list = functions.get_list_re(negative_set)
unlabeled_list = functions.get_list_re(unlabeled_set)

# Positive texts get label 1, negative texts label 0.
samples = [(text, 1) for text in analogy_list]
samples += [(text, 0) for text in non_analogy_list]

train_data, train_labels, test_data, test_labels = functions.preprocess(
    samples, percent_test)

# Append unlabelled samples (label -1) to the training set while the
# counter j is at most 20000; later samples are skipped.
j = 0
for sample in unlabeled_list:
    if j > 20000:
        continue
    train_data.append(sample)
    train_labels.append(-1)
    j += 1

train_labels = np.array(train_labels)