def tsvm(train_examples, train_labels, test_examples, test_labels, verbose):
    """Fit a CPLE-wrapped RBF SVM and return its score on the test split.

    The base classifier is an RBF-kernel SVC (C=10, gamma=0.01) with
    probability estimates enabled, wrapped in CPLELearningModel with
    predict_from_probabilities=True.

    NOTE(review): `verbose` is accepted but never used in this body --
    confirm whether callers rely on it before removing.
    """
    base_svc = SVC(kernel="rbf", C=10, gamma=0.01, probability=True)
    classifier = CPLELearningModel(base_svc, predict_from_probabilities=True)
    classifier.fit(train_examples, train_labels)
    return classifier.score(test_examples, test_labels)
def CPLELearningWrapper(X_train, y_train, X_test):
    """Semi-supervised prediction with CPLE over an SGD logistic-regression base.

    The labelled rows (X_train, y_train) are concatenated with the unlabelled
    test rows, which are tagged with label -1 (the CPLE convention for
    "unlabeled"), the combined set is fitted, and labels for X_test are
    returned.

    Parameters
    ----------
    X_train : array-like -- labelled feature rows
    y_train : array-like -- labels for X_train
    X_test : array-like -- unlabelled rows to classify

    Returns
    -------
    array of predicted labels for X_test
    """
    from frameworks.CPLELearning import CPLELearningModel
    # FIX: `sklearn.linear_model.stochastic_gradient` was a private module,
    # deprecated in scikit-learn 0.22 and removed in 0.24; the public import
    # path is sklearn.linear_model.
    from sklearn.linear_model import SGDClassifier

    # NOTE(review): loss='log' was renamed 'log_loss' in scikit-learn 1.1 and
    # removed in 1.3 -- keep 'log' only if the project pins an older sklearn.
    clf = SGDClassifier(loss='log', penalty='l1')
    ssmodel = CPLELearningModel(clf)
    # -1 marks the unlabelled (test) portion for the semi-supervised fit.
    newlabels = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
    ssmodel.fit(np.concatenate((X_train, X_test)), newlabels)
    return ssmodel.predict(X_test)
def run3(file_path):
    """Extract audio features from each folder in *file_path*, fit a CPLE SVM
    on them, and pickle the trained model to "tsvm.pkl".

    Parameters: file_path -- iterable of folder paths passed to `fe`.
    Side effects: writes "tsvm.pkl" in the working directory; prints progress.
    """
    # NOTE(review): index starts at -1, so every example from the FIRST folder
    # is labelled -1 (the CPLE convention for "unlabeled"); later folders get
    # 0, 1, ... -- confirm this is intentional and not an off-by-one.
    index = -1
    features = []
    label = []
    for path in file_path:
        # presumably pyAudioAnalysis-style feature extraction; only the first
        # return value (the feature matrix) is used -- TODO confirm
        a, b, c = fe(path, 1, 1, 0.05, 0.05, compute_beat=False)
        for example in a:
            features.append(example.tolist())
            label.append(index)
        index += 1
        print(index, " FOLDER FEATURE EXTRACTED")
    features = np.asarray(features)
    label = np.asarray(label)
    model = CPLELearningModel(SVC(kernel="rbf", C=10, gamma=0.01, probability=True), predict_from_probabilities=True)
    model.fit(features, label)
    pkl_filename = "tsvm.pkl"
    with open(pkl_filename, 'wb') as file:
        pickle.dump(model, file)
    print("MODEL SAVED")
ys[random_labeled_points] = ytrue[random_labeled_points] # supervised score #basemodel = WQDA() # weighted Quadratic Discriminant Analysis basemodel = SGDClassifier(loss='log', penalty='l1') # scikit logistic regression basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points]) print("supervised log.reg. score", basemodel.score(X, ytrue)) # fast (but naive, unsafe) self learning framework ssmodel = SelfLearningModel(basemodel) ssmodel.fit(X, ys) print("self-learning log.reg. score", ssmodel.score(X, ytrue)) # semi-supervised score (base model has to be able to take weighted samples) ssmodel = CPLELearningModel(basemodel) ssmodel.fit(X, ys) print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue)) # semi-supervised score, WQDA model ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True ) # weighted Quadratic Discriminant Analysis ssmodel.fit(X, ys) print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue)) # semi-supervised score, RBF SVM model ssmodel = CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True), predict_from_probabilities=True) # RBF SVM ssmodel.fit(X, ys) print("CPLE semi-supervised RBF SVM score", ssmodel.score(X, ytrue))
# Fragment: body of a loop over i (0..3), one subplot per model variant; the
# enclosing `for` header is outside this view. Python 2 print statements.
plt.subplot(2,2,i+1)
# NOTE(review): plt.hold() was removed in matplotlib 3.0 -- this fragment
# targets an old matplotlib; verify before upgrading.
plt.hold(True)

t1=time.time()
# train model: i selects which classifier to compare
if i == 0:
    lbl = "Purely supervised SVM:"
    model = sklearn.svm.SVC(kernel=kernel, probability=True)
    # supervised baseline is fitted on the labelled subset only
    model.fit(Xsupervised, ysupervised)
else:
    if i==1:
        lbl = "S3VM (Gieseke et al. 2012):"
        model = scikitTSVM.SKTSVM(kernel=kernel)
    elif i == 2:
        lbl = "CPLE(pessimistic) SVM:"
        model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True))
    elif i == 3:
        lbl = "CPLE(optimistic) SVM:"
        # class-level switch: affects every CPLELearningModel created afterwards
        CPLELearningModel.pessimistic = False
        model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True))
    # semi-supervised variants are fitted on all data (ys contains -1 for
    # unlabeled points)
    model.fit(Xs, ys.astype(int))

print ""
print lbl
print "Model training time: ", round(time.time()-t1, 3)

# predict, and evaluate accuracy against the full ground truth
pred = model.predict(Xs)
acc = np.mean(pred==ytrue)
print "accuracy:", round(acc, 3)
# Script fragment (Python 2 prints): compares four QDA-based models.
# Xs, ys (-1 = unlabeled), ytrue and evaluate_and_plot come from earlier in
# the file (not visible here).
Xsupervised = Xs[ys != -1, :]
ysupervised = ys[ys != -1]

# compare models -- supervised baseline uses only the labelled subset
lbl = "Purely supervised QDA:"
print lbl
model = WQDA()
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl = "SelfLearning QDA:"
print lbl
model = SelfLearningModel(WQDA())
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) QDA:"
print lbl
model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)

lbl = "CPLE(optimistic) QDA:"
print lbl
# class-level switch flips all subsequently built CPLE models to optimistic
CPLELearningModel.pessimistic = False
model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
model.fit(Xs, ys)
# block=True presumably keeps the final plot window open -- TODO confirm
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 4, block=True)
# Script fragment (Python 3): middle of a model-comparison script. `model`,
# `lbl`, Xs/ys/ytrue and evaluate_and_plot are defined before this view, and
# the final optimistic model's fit/evaluate appears after it.
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl = "S3VM (Gieseke et al. 2012):"
print(lbl)
model = scikitTSVM.SKTSVM(kernel=kernel)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) SVM:"
print(lbl)
# gamma='auto' pins the pre-0.22 scikit-learn default explicitly
model = CPLELearningModel(
    sklearn.svm.SVC(
        kernel=kernel,
        probability=True,
        gamma='auto'),
    predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)

lbl = "CPLE(optimistic) SVM:"
print(lbl)
# class-level switch: all CPLE models built after this are optimistic
CPLELearningModel.pessimistic = False
model = CPLELearningModel(
    sklearn.svm.SVC(
        kernel=kernel,
        probability=True,
        gamma='auto'),
    predict_from_probabilities=True)
# Script fragment (Python 2 prints): supervised vs. CPLE logistic regression
# on the "heart" dataset.
# NOTE(review): this import path was removed in scikit-learn 0.24 (public
# path is sklearn.linear_model) -- works only with an old pinned sklearn.
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from methods.scikitWQDA import WQDA

# load data
# NOTE(review): fetch_mldata was removed in scikit-learn 0.22 (mldata.org is
# gone); fetch_openml is the modern replacement.
heart = fetch_mldata("heart")
X = heart.data
ytrue = np.copy(heart.target)
ytrue[ytrue == -1] = 0  # remap {-1,1} targets to {0,1}

# label a few points; -1 denotes an unlabeled point
labeled_N = 2
ys = np.array([-1] * len(ytrue))
# draw labeled_N/2 indices from each class
# NOTE(review): labeled_N / 2 is integer division only under Python 2; this
# would raise under Python 3 (random.sample needs an int k).
random_labeled_points = random.sample(np.where(ytrue == 0)[0], labeled_N / 2) + random.sample(
    np.where(ytrue == 1)[0], labeled_N / 2
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score: base model trained on the labelled points only
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print "supervised score", basemodel.score(X, ytrue)

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print "semi-supervised score", ssmodel.score(X, ytrue)

# example output from a past run:
# supervised score 0.418518518519
# semi-supervised score 0.555555555556
ysupervised = ys[ys != -1] # compare models lbl = "Purely supervised SVM:" print lbl model = sklearn.svm.SVC(kernel=kernel, probability=True) model.fit(Xsupervised, ysupervised) evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1) lbl = "S3VM (Gieseke et al. 2012):" print lbl model = scikitTSVM.SKTSVM(kernel=kernel) model.fit(Xs, ys) evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2) lbl = "CPLE(pessimistic) SVM:" print lbl model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True) model.fit(Xs, ys) evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3) lbl = "CPLE(optimistic) SVM:" print lbl CPLELearningModel.pessimistic = False model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True) model.fit(Xs, ys) evaluate_and_plot(model, Xs, ys, ytrue, lbl, 4, block=True)
# Script fragment (Python 3): compares four SVM-based models. sidx, Xs, ys
# (-1 = unlabeled), ytrue, kernel and evaluate_and_plot are defined earlier
# in the file (not visible here).
ys[sidx] = ytrue[sidx]
Xsupervised = Xs[ys!=-1, :]
ysupervised = ys[ys!=-1]

# compare models -- supervised baseline uses only the labelled subset
lbl = "Purely supervised SVM:"
print(lbl)
model = sklearn.svm.SVC(kernel=kernel, probability=True)
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl = "S3VM (Gieseke et al. 2012):"
print(lbl)
model = scikitTSVM.SKTSVM(kernel=kernel)
# astype(int): semi-supervised fit expects integer labels (-1 = unlabeled)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) SVM:"
print(lbl)
model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)

lbl = "CPLE(optimistic) SVM:"
print(lbl)
# class-level switch flips all subsequently built CPLE models to optimistic
CPLELearningModel.pessimistic = False
model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 4, block=True)
def main(centroid_name, cluster_name, feature_root, centroid_KPI_label_count):
    """PU-label a centroid KPI, then train and evaluate one CPLE model per
    sibling KPI file in *feature_root*, writing a per-KPI result CSV.

    Parameters
    ----------
    centroid_name : filename of the centroid KPI's feature CSV
    cluster_name : cluster identifier used in the output filename
    feature_root : directory containing one feature CSV per KPI
    centroid_KPI_label_count : how many positives to label on the centroid

    Side effects: saves models under model_root, writes probability CSVs via
    save_proba, and writes a summary CSV under result_root.

    NOTE(review): relies on module-level names target_test_ratio,
    target_train_ratio, model_root, result_root and config -- confirm they
    are defined at import time.
    """
    matrix = np.array([[0, 0], [0, 0]])           # running confusion matrix over all KPIs
    summary_cluster = []                           # NOTE(review): appears unused in this view
    summary_curve = []                             # one OrderedDict of metrics per KPI
    df_source = pd.read_csv(os.path.join(feature_root, centroid_name))
    # feature columns are marked with an "F#" prefix
    feature_name = [i for i in df_source.columns if i.startswith("F#")]
    cluster_matrix = np.array([[0, 0], [0, 0]])
    real_label = df_source['label'].values.copy()
    # keep only centroid_KPI_label_count randomly chosen positive labels
    df_source, centroid_KPI_real_label_count = label_positive(
        df_source, centroid_KPI_label_count, 'random')
    centroid_train = df_source.copy()
    '''Start PU learning'''
    PU_model = PULearningModel(centroid_train[feature_name].values,
                               centroid_train['label'].values,
                               len(centroid_train))
    print Counter(centroid_train['label'].values)
    PU_model.pre_training(0.2)
    print Counter(real_label)
    RF_model = RandomForestClassifier(n_estimators=100)
    # expand the labelled set with reliable samples found by the RF
    PU_labels, positive_label_count = PU_model.add_reliable_samples_using_RandomForest(
        0.015, 200, 0.7, real_label, RF_model)
    train_data = centroid_train[feature_name].values
    centroid_train['label'] = PU_labels
    print 'Finish PU learning for centroid:', Counter(
        centroid_train['label'].values)
    '''Finish PU learning'''
    for name_suffix in os.listdir(feature_root):
        if name_suffix == centroid_name:
            continue  # skip the centroid itself
        print('*' * 30)
        print(name_suffix)
        print('*' * 30)
        df_target = pd.read_csv(os.path.join(feature_root, name_suffix))
        # tail of the target series is held out for testing
        target_test_length = int(target_test_ratio * len(df_target))
        test = df_target[-target_test_length:].copy()
        target_train_length = int(target_train_ratio * len(df_target))
        target_train = df_target[:target_train_length].copy()
        target_train_with_label = target_train.copy()  # NOTE(review): unused in this view
        # target rows enter training as unlabeled (-1)
        target_train['label'] = -1
        train = pd.concat([centroid_train, target_train]).copy()
        print Counter(train['label'].values)
        model = CPLELearningModel(basemodel=RandomForestClassifier(
            config.RF_n_trees, n_jobs=15),
            max_iter=50,
            predict_from_probabilities=True,
            real_label=None)
        train_data = train[feature_name].values
        train_label = train['label'].values
        print 'start training CPLE model:', Counter(train_label)
        model.fit(train_data, train_label)
        print("finish train")
        # exit()
        name = name_suffix + '_PU'
        # persist and immediately reload as a round-trip sanity check
        joblib.dump(model, model_root + '/' + name + ".sav")
        model1 = joblib.load(model_root + '/' + name + '.sav')
        print("model is :", model1)
        proba = model.predict_proba(test[feature_name])
        proba = proba[:, 1]  # probability of the positive class
        eva = delay_eva(test["label"].values, proba)
        print(proba)
        # pick the threshold maximising F-score on the test set
        _, best_threshold = eva.best_fscore_threshold()
        threshold = best_threshold
        print "threshold is", threshold
        predict_ans = eva.predict_for_threshold(threshold)
        save_proba(model, test, name + "_test" + ".csv", predict_ans)
        fscore = eva.fscore_for_threshold(threshold)
        # convert delay from sample intervals to minutes
        average_detection_delay = eva.average_detection_delay(
            threshold) * config.interval / 60
        print("PUAD fscore of test is %f", fscore)
        print("PUAD average_detection_delay is %f", average_detection_delay)
        temp_matrix = eva.confusion_matrix_for_threshold(threshold)
        matrix = matrix + temp_matrix
        cluster_matrix = cluster_matrix + temp_matrix
        _, pre, rec = cal_fscore(temp_matrix)
        TP = temp_matrix[1][1]
        FP = temp_matrix[0][1]
        FN = temp_matrix[1][0]
        print 'TP:', TP
        print 'FP:', FP
        print 'FN:', FN
        temp = OrderedDict([("name", name),
                            ("medios", 0),
                            ("label", test["label"].values.sum()),
                            ("PU_fscore", fscore),
                            ("delay", average_detection_delay),
                            ("pre", pre),
                            ("rec", rec),
                            ("TP", TP),
                            ("FP", FP),
                            ("FN", FN),
                            ("centroid_KPI_label_count", centroid_KPI_label_count),
                            ("threshold", threshold)])
        summary_curve.append(temp)
    # the per-KPI loop above ends here
    df_curveresults = pd.DataFrame(summary_curve)
    df_curveresults.to_csv(os.path.join(
        result_root, 'PUAD_%s_%d_%f_result.csv' % (cluster_name,
                                                   centroid_KPI_label_count,
                                                   0.015)), index=False)
# Script fragment (Python 2 prints): label a few points then compare four
# SVM-based models. Xs, ys (-1 = unlabeled), ytrue, kernel,
# supevised_data_points (sic -- name defined outside this view) and
# evaluate_and_plot come from earlier in the file.
# NOTE(review): the /2 here is integer division only under Python 2.
sidx = random.sample(np.where(ytrue == 0)[0], supevised_data_points/2)+random.sample(np.where(ytrue == 1)[0], supevised_data_points/2)
ys[sidx] = ytrue[sidx]
Xsupervised = Xs[ys!=-1, :]
ysupervised = ys[ys!=-1]

# compare models -- supervised baseline uses only the labelled subset
lbl = "Purely supervised SVM:"
print lbl
model = sklearn.svm.SVC(kernel=kernel, probability=True)
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl = "S3VM (Gieseke et al. 2012):"
print lbl
model = scikitTSVM.SKTSVM(kernel=kernel)
# astype(int): semi-supervised fit expects integer labels (-1 = unlabeled)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) SVM:"
print lbl
model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)

lbl = "CPLE(optimistic) SVM:"
print lbl
# class-level switch flips all subsequently built CPLE models to optimistic
CPLELearningModel.pessimistic = False
model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 4, block=True)
ys[random_labeled_points] = ytrue[random_labeled_points] # supervised score # basemodel = WQDA() # weighted Quadratic Discriminant Analysis # scikit logistic regression basemodel = SGDClassifier(loss='log', penalty='l1') basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points]) print("supervised log.reg. score", basemodel.score(X, ytrue)) # fast (but naive, unsafe) self learning framework ssmodel = SelfLearningModel(basemodel) ssmodel.fit(X, ys) print("self-learning log.reg. score", ssmodel.score(X, ytrue)) # semi-supervised score (base model has to be able to take weighted samples) ssmodel = CPLELearningModel(basemodel) ssmodel.fit(X, ys) print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue)) # semi-supervised score, WQDA model # weighted Quadratic Discriminant Analysis ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True) ssmodel.fit(X, ys) print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue)) # semi-supervised score, RBF SVM model ssmodel = CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True), predict_from_probabilities=True) # RBF SVM ssmodel.fit(X, ys) print("CPLE semi-supervised RBF SVM score", ssmodel.score(X, ytrue))
# Script fragment (Python 2 prints), apparently inside a loop indexed by i
# (error_rate_svm[i] / logLik_svm[i]). X_labelled, X_unlabelled, y_labelled,
# y_minusone (presumably -1 placeholders for the unlabelled rows -- TODO
# confirm), X_extra/y_extra (test set) and sd come from earlier in the file.
train_data = np.concatenate((X_labelled, X_unlabelled),axis =0)
len_train = len(train_data)
print 'No. of training data:', len_train

# final training labels: real labels followed by the unlabelled markers
train_labels = np.concatenate((y_labelled, y_minusone), axis = 0)
len_labels = len(train_labels)
print 'No. of training labels:', len_labels

## print the number of test data
print 'No. of test data:', len(y_extra)

################################################################################
################################################################################
lbl = "CPLE(pessimistic) SVM:"
print lbl
model = CPLELearningModel(svm.SVC(kernel="rbf", probability=True), predict_from_probabilities=True, max_iter = 5000 )
model.fit(train_data, train_labels)
y_predict = model.predict(X_extra)
accuracy = accuracy_score(y_extra, y_predict)
print accuracy
error_rate_svm[i] = 1 - accuracy
# Gaussian log-likelihood of the true labels around the predictions;
# sd is defined outside this view
logLik_svm[i] = -np.sum( stats.norm.logpdf(y_extra, loc=y_predict, scale=sd) )
print 'CPLE Error Rate:', error_rate_svm[i], logLik_svm[i]

###############################################################################
################################################################################
# create the semi-supervised KNN classifier
lbl = "Label Propagation(KNN):"
print lbl
knn_model = label_propagation.LabelSpreading(kernel='knn', alpha=0.0001, max_iter=3000)
knn_model.fit(train_data, train_labels)