def SelfTraingWrapper(X_train, y_train, X_test):
    """Semi-supervised self-training: fit on labeled + unlabeled data, predict the unlabeled part.

    The labeled training samples and the (unlabeled) test samples are stacked
    into one design matrix; test samples are marked with label -1 so the
    self-learning wrapper treats them as unlabeled.
    """
    from frameworks.SelfLearning import SelfLearningModel

    # Base learner; only pseudo-labels with probability >= 0.9 are adopted.
    forest = RandomForestClassifier(warm_start=True, n_estimators=1000)
    self_trainer = SelfLearningModel(forest, prob_threshold=0.9)

    # -1 marks the unlabeled (test) portion for the semi-supervised fit.
    stacked_labels = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
    self_trainer.fit(np.concatenate((X_train, X_test)), stacked_labels)
    return self_trainer.predict(X_test)
# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
# Draw an equal number of labeled points from each class.
# labeled_N // 2 (integer division): random.sample() rejects a float k on
# Python 3, and the numpy index arrays are wrapped in list() because
# random.sample() requires a sequence.
random_labeled_points = (
    random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2)
    + random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log', penalty='l1')  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))

# semi-supervised score, RBF SVM model
# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
# Draw an equal number of labeled points from each class.
# labeled_N // 2 (integer division): random.sample() rejects a float k on
# Python 3; the numpy index arrays are wrapped in list() for the same reason.
random_labeled_points = (
    random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2)
    + random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
# Python 2 `print ...` statements converted to print() calls (Python 3 syntax).
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))

# semi-supervised score, RBF SVM model
ssmodel = CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True), predict_from_probabilities=True)  # RBF SVM
# NOTE(review): this fragment begins inside a function whose `def` is not
# visible here (the `return` below belongs to it) and ends mid-expression;
# indentation of the enclosing scope is reconstructed from context.
# Keep the candidate configuration with the lowest Q statistic found so far.
S_with_lowest_Q_statistic = S_lowest_new.copy()
sim_schemas_lowest = sim_schemas_lowest_new.copy()
Xms_lowest = Xms_lowest_new.copy()
Xus_lowest = Xus_lowest_new.copy()
return sim_schemas_lowest, Xms_lowest, Xus_lowest

# Keep only the similarity schemas with a high Q statistic (top 10).
sim_schemas, Xms, Xus = select_sim_schemas_with_high_Q_statistic(sim_schemas, Xms, Xus, 10)

#### Jurek_step 5 the self learning training process
models = []
labels_per_schema = []
probs_per_schema = []
# One self-learning logistic-regression model per similarity schema:
# Xms[i] holds the match seed pairs (label 1), Xus[i] the non-match seeds
# (label 0); every other pair is marked -1 (unlabeled) for self-learning.
for i, schema in enumerate(sim_schemas):
    model = SelfLearningModel(LogisticRegression(tol=1e-3, solver='liblinear'))
    models.append(model)
    x = pairs[schema].values
    y_df = pd.DataFrame(list(-1 for i in range(x.shape[0])), columns=['y'], index=pairs.index)
    y_df.loc[Xms[i]] = 1
    y_df.loc[Xus[i]] = 0
    y = y_df['y'].values
    model.fit(x, y)
    labels = model.predict(x)
    labels_per_schema.append(labels)
    # probability of the positive (match) class for each pair
    probs_per_schema.append(model.predict_proba(x)[:, 1])
# Majority vote across the per-schema label vectors (statement continues
# beyond this chunk).
labels_ensemble = np.array([round(sum(list(labels_per_schema[i][j] for i in range(len(labels_per_schema)))) / len(labels_per_schema)) \
# set the labels of the labeled samples
ys[random_labeled_points] = ytrue[random_labeled_points]  # 2
# print(X[random_labeled_points])

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
# SGDClassifier
basemodel = SGDClassifier(loss='log', penalty='l1')  # scikit logistic regression
# model fit
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# self learning framework
# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)  # default use the sample weighting
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
# Python 2 `print ...` statements converted to print() calls so the script
# is consistent with the print() calls above and valid Python 3.
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
# WQDA: Weighted Quadratic Discriminant Analysis
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))
# Keep only the labeled samples (unlabeled points carry the -1 marker).
kernel = "rbf"
labeled_mask = ys != -1
Xsupervised = X[labeled_mask, :]
ysupervised = ys[labeled_mask]

lbl = "Base model SVM(kernel=rbf):"
print(lbl)
basemodel = sklearn.svm.SVC(kernel=kernel, probability=True)
basemodel.fit(Xsupervised, ysupervised)
evaluate(basemodel, X, ys, ytrue, lbl)

# basemodel = SGDClassifier(loss='hinge', penalty='l1', tol=1e-3, max_iter=1000)  # scikit logistic regression
# basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
# print ("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# Re-derive the labeled subset for the purely supervised baseline.
kernel = "rbf"
labeled_mask = ys != -1
Xsupervised = X[labeled_mask, :]
ysupervised = ys[labeled_mask]

lbl = "Purely supervised SVM:"
print(lbl)
model = sklearn.svm.SVC(kernel=kernel, probability=True)
model.fit(Xsupervised, ysupervised)
evaluate(model, X, ys, ytrue, lbl)

lbl = "S3VM (Gieseke et al. 2012):"
# Label one extra point drawn from the second half of the samples.
ys[np.random.randint(N, 2 * N)] = 1
Xsupervised = Xs[ys != -1, :]
ysupervised = ys[ys != -1]

# compare models
# Python 2 `print lbl` statements converted to print() calls (Python 3 syntax).
lbl = "Purely supervised QDA:"
print(lbl)
model = WQDA()
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl = "SelfLearning QDA:"
print(lbl)
model = SelfLearningModel(WQDA())
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) QDA:"
print(lbl)
model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)

lbl = "CPLE(optimistic) QDA:"
print(lbl)
# Class-level flag: switches every CPLELearningModel instance to optimistic mode.
CPLELearningModel.pessimistic = False
model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 4, block=True)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
basemodel = SGDClassifier(loss='hinge', penalty='l1', tol=1e-3, max_iter=1000)  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
acc = basemodel.score(X, ytrue)
# only non-zero accuracies are recorded (truthiness check, as in the original)
if acc:
    sgd_active.append(acc)

# self-learning around an RBF SVM base model
kernel = "rbf"
svm_model = sklearn.svm.SVC(kernel=kernel, probability=True)
ssmodel = SelfLearningModel(svm_model)
ssmodel.fit(X, ys)
acc = ssmodel.score(X, ytrue)
if acc:
    self_learning_active.append(acc)

# purely supervised SVM trained on the labeled subset only
Xsupervised = X[ys != -1, :]
ysupervised = ys[ys != -1]
lbl = "Purely supervised SVM:"
model = sklearn.svm.SVC(kernel=kernel, probability=True)
model.fit(Xsupervised, ysupervised)
acc = evaluate(model, X, ys, ytrue, lbl)
print("SVM Accuracy:{}".format(acc))
if acc:
    svm_active.append(acc)
basemodel.fit(X_model[random_labeled_points, :], ys[random_labeled_points])
# print("supervised log.reg. score", basemodel.score(X_test, y_test))
# if j == 2:  # Plot the base model
#     evaluate_and_plot(basemodel, X_model, ys, ytrue, "Logistic Regression", subplot=1, block=True)

# Calculate accuracy -- score once and reuse it (the original called
# basemodel.score(X_test, y_test) twice for the same value).
acc = basemodel.score(X_test, y_test)
sum_super += acc
super_acc[i] = acc
# 95% normal-approximation confidence half-width for the accuracy estimate
sum_super_err += 1.96 * np.sqrt(super_acc[i] * (1 - super_acc[i]) / X_test.shape[0])

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X_model, ys)
# print("self-learning log.reg. score", ssmodel.score(X_test, y_test))
# if j == 2:  # Plot the ssmodel
#     evaluate_and_plot(ssmodel, X_model, ys, ytrue, "Self-Learning", subplot=2, block=True)

# Calculate accuracy (same single-evaluation pattern as above)
acc = ssmodel.score(X_test, y_test)
sum_semi += acc
semi_acc[i] = acc
sum_semi_err += 1.96 * np.sqrt(semi_acc[i] * (1 - semi_acc[i]) / X_test.shape[0])

# if j == 2:  # Save the figure
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
basemodel = SGDClassifier(loss='hinge', penalty='l1', tol=1e-3, max_iter=1000)  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
sgd_active.append(basemodel.score(X, ytrue))

# self-learning around an RBF SVM base model
kernel = "rbf"
svm_model = sklearn.svm.SVC(kernel=kernel, probability=True)
ssmodel = SelfLearningModel(svm_model)
ssmodel.fit(X, ys)
self_learning_active.append(ssmodel.score(X, ytrue))

# purely supervised SVM trained on the labeled subset only
labeled_mask = ys != -1
Xsupervised = X[labeled_mask, :]
ysupervised = ys[labeled_mask]
lbl = "Purely supervised SVM:"
model = sklearn.svm.SVC(kernel=kernel, probability=True)
model.fit(Xsupervised, ysupervised)
acc = evaluate(model, X, ys, ytrue, lbl)
svm_active.append(acc)

# transductive SVM baseline
lbl = "S3VM (Gieseke et al. 2012):"
model = scikitTSVM.SKTSVM(kernel=kernel)
model.fit(X, ys)
# label a few points labeled_N = 30 ys = np.array([-1]*len(ytrue)) # -1 denotes unlabeled point random_labeled_points = random.sample(np.where(ytrue == 0)[0], labeled_N/2)+\ random.sample(np.where(ytrue == 1)[0], labeled_N/2) ys[random_labeled_points] = ytrue[random_labeled_points] # supervised score basemodel = WQDA() # weighted Quadratic Discriminant Analysis #basemodel = SGDClassifier(loss='log', penalty='l1') # scikit logistic regression basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points]) #print "supervised log.reg. score", basemodel.score(X, ytrue) # fast (but naive, unsafe) self learning framework ssmodel = SelfLearningModel(basemodel) ssmodel.fit(X, ys) print("this is the fitted thing", ssmodel.fit(X,ys)) y_score = ssmodel.predict(heart.data) #print "heart.target", heart.target #print "this is the prediction", y_score print("self-learning log.reg. score", ssmodel.score(X, ytrue)) fpr = dict() tpr = dict() roc_auc = dict() for i in range(2): fpr[i], tpr[i], _ = roc_curve(label_binarize(heart.target, classes = [0,1]), label_binarize(y_score, classes = [0,1])) roc_auc[i] = auc(fpr[i], tpr[i]) for i in range(2):