def test_LabelPropagation_rbf(*data):
    """Plot LabelPropagation rbf-kernel score as a function of gamma, one curve per alpha.

    data: (X, y, unlabeled_indices) — samples, labels, and the indices of the
    samples whose labels are hidden during training.
    """
    samples, targets, unlabeled_idx = data
    train_targets = np.copy(targets)   # keep the true labels for scoring below
    train_targets[unlabeled_idx] = -1  # -1 marks unlabeled samples
    fig = plt.figure()
    axes = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    # one distinct color per alpha curve
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))
    # train one classifier per (alpha, gamma) pair and plot one curve per alpha
    for alpha, color in zip(alphas, colors):
        curve = []
        for gamma in gammas:
            clf = LabelPropagation(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(samples, train_targets)
            curve.append(clf.score(samples[unlabeled_idx], targets[unlabeled_idx]))
        axes.plot(gammas, curve, label=r"$\alpha=%s$" % alpha, color=color)
    # figure cosmetics
    axes.set_xlabel(r"$\gamma$")
    axes.set_ylabel("score")
    axes.set_xscale("log")
    axes.legend(loc='best')
    axes.set_title("LabelPropagation rbf kernel")
    plt.show()
def load_all_data():
    """Load the feature matrix, propagate missing stage labels, and split the data.

    Returns (x_tr, y_tr, x_te, y_te, x_va, y_va): train, test and validation splits.
    """
    # Read and partition the matrix
    frame = pd.read_feather('../feature_stage_data_all.ftr')
    features = frame[frame.columns[3:]]
    stages = frame['stage']
    observations = frame.observation  # kept for parity with the original flow
    features = normalize(features.values)
    stages = stages.values
    # rows 4977..7976 are held out as the validation set
    x_va = features[4977:4977 + 3000]
    y_va = stages[4977:4977 + 3000]
    features = np.concatenate((features[:4977], features[4977 + 3000:]))
    stages = np.concatenate((stages[:4977], stages[4977 + 3000:]))
    labeled_mask = np.invert(np.isnan(stages))
    unlabeled_mask = np.isnan(stages)
    x_obs = features[labeled_mask]
    y_obs = stages[labeled_mask]
    # apply Label Spreading: fill the missing stage labels with a knn model
    x_nuls = features[unlabeled_mask]
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    features = np.concatenate([x_obs, x_nuls], axis=0)
    stages = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)
    x_tr, x_te, y_tr, y_te = train_test_split(features, stages, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
def test_LabelPropagation_rbf(*data):
    """Plot LabelPropagation rbf-kernel accuracy on the unlabeled subset as gamma varies.

    data: (X, y, unlabeled_indices).
    """
    X, y, unlabeled_indices = data
    y_train = np.copy(y)             # copy: the true labels are needed for scoring
    y_train[unlabeled_indices] = -1  # -1 marks unlabeled samples
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)  # unused here; kept as in original
    gammas = np.logspace(-2, 2, num=50)
    scores = []
    for gamma in gammas:
        model = LabelPropagation(max_iter=100, gamma=gamma, kernel='rbf')
        model.fit(X, y_train)
        scores.append(model.score(X[unlabeled_indices], y[unlabeled_indices]))
    ax.plot(gammas, scores)
    # figure cosmetics
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
def semiLabelPropagation(feature_extractor, generator, val_generator, kernel, neighbors, gamma): semi = LabelPropagation(kernel=kernel, n_neighbors=neighbors, gamma=gamma, alpha=None, tol=0.001, max_iter=1000000) features = feature_extractor.predict_generator(generator, steps=generator.samples / generator.batch_size, verbose=1) classes = generator.classes for i in range(0, generator.samples): if (generator.filenames[i][0] == 'N'): classes[i] = -1 semi.fit(features, classes) val_features = feature_extractor.predict_generator( val_generator, steps=val_generator.samples / val_generator.batch_size, verbose=1) predicted_classes = semi.predict(val_features) return predicted_classes
def testing_predictions(self, test_data, model, num_pcs, gamma=False, max_iter=1000000, mean=False):
    """Return P(class==1) for each row of test_data.

    When mean is falsy, the supplied fitted `model` is used directly.
    Otherwise a fresh rbf LabelPropagation model is fitted once per seed in
    self.seeds and the per-seed probabilities are averaged.

    Fix: the original used the string "" as an "empty so far" sentinel and
    compared an ndarray against it, which relies on fragile ndarray==str
    semantics; a None sentinel with `is None` is used instead.
    """
    pca_data = self.principal_components(test_data, self.pca, num_pcs)
    if not mean:
        return np.array([p[1] for p in model.predict_proba(pca_data)])
    train_pca_data = self.principal_components(self.X, self.pca, num_pcs)
    predicted_probs = None  # None until the first seed's probabilities arrive
    for seed in self.seeds:
        np.random.seed(seed)
        model = LabelPropagation(kernel='rbf', gamma=gamma, max_iter=max_iter)
        model.fit(train_pca_data, self.Y)
        predicted_prob = np.array(
            [p[1] for p in model.predict_proba(pca_data)])
        if predicted_probs is None:
            predicted_probs = predicted_prob
        else:
            predicted_probs = np.vstack((predicted_probs, predicted_prob))
    # mean over the per-seed runs
    mean_probs = np.mean(predicted_probs, axis=0)
    return mean_probs
def test_LabelPropagation_knn(*data):
    """Plot LabelPropagation knn-kernel accuracy on the unlabeled subset as n_neighbors varies.

    data: (X, y, unlabeled_indices).
    """
    X, y, unlabeled_indices = data
    y_train = np.copy(y)             # keep y intact for scoring
    y_train[unlabeled_indices] = -1  # -1 marks unlabeled samples
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)  # unused here; kept as in original
    neighbor_counts = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    scores = []
    for k in neighbor_counts:
        model = LabelPropagation(max_iter=100, n_neighbors=k, kernel='knn')
        model.fit(X, y_train)
        scores.append(model.score(X[unlabeled_indices], y[unlabeled_indices]))
    ax.plot(neighbor_counts, scores)
    # figure cosmetics
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
def test_LabelPropagation(*data):
    """Fit a knn LabelPropagation model and report accuracy on a held-out prediction set.

    data: (X, y, unlabeled_indices, XPredict, yTrue).
    """
    X, y, unlabeled_indices, XPredict, yTrue = data
    # copy is required — y itself must stay untouched
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # -1 marks unlabeled samples
    print(y_train)
    model = LabelPropagation(max_iter=5, kernel='knn', n_neighbors=3, tol=1e-5)
    model.fit(X, y_train)
    # accuracy on the separate prediction set
    predicted_labels = model.predict(XPredict)
    print(XPredict)
    print("Accuracy:%f" % metrics.accuracy_score(yTrue, predicted_labels))
def lb_prop_classify(network, labels):
    """10-fold evaluation of LabelPropagation over `network` rows.

    Prints the mean score, standard deviation and an averaged confusion
    matrix (as percentiles), and returns the per-fold scores.

    Fix: `confusion_matrix`'s `labels` argument is keyword-only since
    scikit-learn 1.0; it is now passed by keyword.
    """
    kf = StratifiedKFold(n_splits=10)
    scores = []
    cms = []
    # NOTE(review): StratifiedKFold.split yields (train_idx, test_idx); the
    # original unpacks them as (test_index, train_index), so the model is
    # trained on the small fold and tested on the rest. Preserved as-is —
    # confirm whether this inversion is intentional.
    for test_index, train_index in kf.split(network, labels):
        first_train_index, last_train_index = min(train_index), max(train_index)
        train_dataset = network[first_train_index:last_train_index]
        train_labels = labels[first_train_index:last_train_index]
        test_dataset = np.delete(network, np.s_[first_train_index:last_train_index], 0)
        test_labels = np.delete(labels, np.s_[first_train_index:last_train_index], 0)
        label_spreading_model = LabelPropagation()
        label_spreading_model.fit(train_dataset, train_labels)
        scores.append(label_spreading_model.score(test_dataset, test_labels))
        prediction = label_spreading_model.predict(test_dataset)
        cms.append(confusion_matrix(test_labels, prediction,
                                    labels=label_spreading_model.classes_))
    print('label propagation media {}'.format(np.average(scores)))
    print('label propagation desvio padrao {}'.format(np.std(scores)))
    print('label propagation matriz de confusao')
    print(get_percentile_cm(get_average_cm(cms)))
    print('\n')
    return scores
def sklearn_lp(X, y, output=None, kernel='knn', gamma=None, n_neighbors=10,
               alpha=1, max_iter=1000, tol=0.00001):
    # Fit a LabelPropagation model on a 10% training split and print a
    # classification report for the remaining 90%.
    # NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
    # modern code imports train_test_split from sklearn.model_selection.
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9,
                                                        random_state=3)
    label_prop_model = LabelPropagation(kernel=kernel, gamma=gamma,
                                        n_neighbors=n_neighbors, alpha=alpha,
                                        max_iter=max_iter, tol=tol)
    label_prop_model.fit(X_train, y_train)
    y_predict = label_prop_model.predict(X_test)
    # Python 2 print statements — this section of the file predates Python 3.
    print 'y_train: ', y_train
    print 'y_predict: ', y_predict
    print '+--------------------------------------------------------+'
    print '| Report +'
    print '+--------------------------------------------------------+'
    print classification_report(y_test, y_predict)
    print 'accuracy: ' + str(accuracy_score(y_test, y_predict))
    print '\n\n'
def hard_clamping(kernel, k, xTrain, yTrain, MI=10000, g=0.6):
    # Hard-clamping label propagation: fit on the training data, normalize the
    # predicted class probabilities, threshold them into benign/malware labels,
    # then compute run statistics and append them to a CSV file.
    prop = LabelPropagation(kernel=kernel, n_neighbors=k, gamma=g, max_iter=MI,
                            n_jobs=-1)
    prop.fit(xTrain, yTrain)
    predY = prop.predict_proba(xTrain)
    norm_Y = normalize(yTrain, predY)
    labels = []
    # NOTE(review): rows where i[0] == i[1] append nothing, so `labels` can end
    # up shorter than yTrain — confirm ties cannot occur here.
    for i in norm_Y:
        if i[0] > i[1]:
            labels.append(benign)   # benign/malware presumably module constants — confirm
        elif i[0] < i[1]:
            labels.append(malware)
    # stats() compares propagated labels against expectations; yExpect, day_one
    # and rate come from the enclosing module scope (not visible here).
    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, labels, yExpect, day_one)
    results = [
        'HC', kernel, k, g, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
        missed_day1
    ]
    file_name = 'HC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
def load_all_data():
    """Load features, hold out observations 8-9 for validation, propagate
    missing stage labels, oversample with SMOTE, and return the splits.

    Returns (x_tr, y_tr, x_te, y_te, x_va, y_va).

    Fix: removed the dead no-op expression statement `o.unique()`, whose
    result was discarded.
    """
    # Read and partition the matrix
    data = pd.read_feather('./feature_stage_data_all.ftr')
    x = data[data.columns[3:]].values
    x = normalize(x)
    y = data['stage'].values
    o = data.observation
    # observations 8 and 9 form the validation set
    va_mask = np.array([i in (8, 9) for i in o.values])
    x_va = x[va_mask]
    y_va = y[va_mask]
    x = x[~va_mask]
    y = y[~va_mask]
    labeled = np.invert(np.isnan(y))
    unlabeled = np.isnan(y)
    x_obs = x[labeled]
    y_obs = y[labeled]
    # apply Label Spreading: fill the missing stage labels via knn propagation
    x_nuls = x[unlabeled]
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x_all = np.concatenate([x_obs, x_nuls], axis=0)
    y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)
    # Over sample the stages to balance the classes
    zen = SMOTE(random_state=8675309)
    x, y = zen.fit_resample(x_all, y_all)
    x, y = shuffle(x, y, random_state=42)
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
def doLabelPropagation(self, X, y, **kwargs):
    """Fit a LabelPropagation model on (X, y) and return class probabilities for X.

    kwargs are forwarded to the LabelPropagation constructor.
    """
    model = LabelPropagation(**kwargs)
    if self.verbose > 2:  # debug diagnostics only at high verbosity
        print("X, y shapes: ", X.shape, y.shape)
        print(" y hist: ", np.histogram(y))
    model.fit(X, y)
    if self.verbose > 2:
        print("lp_predict:", np.histogram(model.predict(X)))
    return model.predict_proba(X)
def _label_propagation(df):
    """Propagate labels across the rows of df and return the predicted labels."""
    feature_matrix = _generate_features(df)
    targets = _generate_labels(df)
    # pandas represents the missing (-1) labels as NaN; restore the -1 marker
    targets = targets.fillna(-1)
    model = LabelPropagation()
    model.fit(feature_matrix.toarray(), targets)
    return model.predict(feature_matrix.toarray())
def main():
    # Train two semi-supervised classifiers over word2vec document vectors —
    # one for "integrity", one for "interpretability" — and persist both with
    # joblib.
    stop = fetch_default_stop_words()
    comm = fetch_data(ds_name="sheet_4_labeled", cut_all=False, stop_words=stop)
    # Mark records without annotations with the label -1
    for key in comm.keys():
        if comm[key].integrity is None:
            comm[key].integrity = -1
            comm[key].interpretability = -1
    labeled = {key: c for key, c in comm.items() if c.integrity != -1}
    unlabeled = {key: c for key, c in comm.items() if c.integrity == -1}
    # Load the word-vector model built from the reply texts
    wv_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=False)
    # -------------------------------------------------------
    # Model 1: integrity
    xy = [(doc_vec(c.seg_reply, model=wv_model), c.integrity) for c in unlabeled.values()]
    x = [t[0] for t in xy]
    y = [t[1] for t in xy]
    xy_labeled = [(doc_vec(c.seg_reply, model=wv_model), c.integrity) for c in labeled.values()]
    x_labeled = [t[0] for t in xy_labeled]
    y_labeled = [t[1] for t in xy_labeled]
    x_train, y_train = (x_labeled, y_labeled)
    # Merge the labeled and unlabeled data into one training set
    x_train += x
    y_train += y
    # Train the label-propagation model
    clf = LabelPropagation(gamma=30)  # model 1
    clf.fit(x_train, y_train)
    joblib.dump(clf, integrity_clf_path)
    # --------------------------------------------------------------
    # Model 2: interpretability
    xy = [(doc_vec(c.seg_reply, model=wv_model), c.interpretability) for c in unlabeled.values()]
    x = [t[0] for t in xy]
    y = [t[1] for t in xy]
    xy_labeled = [(doc_vec(c.seg_reply, model=wv_model), c.interpretability) for c in labeled.values()]
    x_labeled = [t[0] for t in xy_labeled]
    y_labeled = [t[1] for t in xy_labeled]
    x_train, y_train = (x_labeled, y_labeled)
    # Merge the labeled and unlabeled data into one training set
    x_train += x
    y_train += y
    # Train the label-spreading model
    clf = LabelSpreading()  # model 2
    clf.fit(x_train, y_train)
    joblib.dump(clf, interpretability_clf_path)
def test_LabelPropagation(*data):
    """Fit an rbf LabelPropagation model and print accuracy on the unlabeled samples.

    data: (X, y, unlabeled_indices).
    """
    X, y, unlabeled_indices = data
    y_train = np.copy(y)             # the untouched y supplies the ground truth
    y_train[unlabeled_indices] = -1  # unlabeled marker
    model = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    model.fit(X, y_train)
    true_labels = y[unlabeled_indices]
    print('Accuracy : %.2f' % model.score(X[unlabeled_indices], true_labels))
def test_LabelPropagation(*data):
    """Fit an rbf LabelPropagation model and print accuracy on the hidden-label samples.

    data: (x, y, unlabeled_indices).
    """
    features, targets, unlabeled_indices = data
    masked_targets = np.copy(targets)       # copy so the true labels survive
    masked_targets[unlabeled_indices] = -1  # -1 marks unlabeled samples
    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(features, masked_targets)
    # score against the true labels of the hidden samples
    true_labels = targets[unlabeled_indices]
    print("Accuracy: %f" % clf.score(features[unlabeled_indices], true_labels))
def ss_test(images, labels, unlabeled_images, test_images):
    """Semi-supervised run: stack labeled and unlabeled images (-1 labels), fit, submit."""
    all_images = np.vstack((images, unlabeled_images))
    placeholder = -np.ones((unlabeled_images.shape[0],))  # -1 = unlabeled
    all_labels = np.concatenate((labels, placeholder), axis=0)
    clf = LabelPropagation()
    clf.fit(all_images, all_labels)
    create_submission(clf.predict(test_images))
def ss_test(images, labels, unlabeled_images, test_images):
    """Duplicate of ss_test above: fit LabelPropagation with -1 marking unlabeled rows."""
    stacked = np.vstack((images, unlabeled_images))
    missing = -np.ones((unlabeled_images.shape[0],))
    combined_labels = np.concatenate((labels, missing), axis=0)
    propagator = LabelPropagation()
    propagator.fit(stacked, combined_labels)
    predictions = propagator.predict(test_images)
    create_submission(predictions)
def LP(source_train, target_test, label1, label3):
    """Fit LabelPropagation on the source domain and report metrics on the target domain.

    Returns (accuracy, recall, f1, precision), all weighted averages.
    """
    model = LabelPropagation()
    model.fit(source_train, label1)
    source_predict = model.predict(target_test)
    # weighted averaging: the classes may be imbalanced
    accuracy = metrics.accuracy_score(label3, source_predict)
    recall = metrics.recall_score(label3, source_predict, average='weighted')
    f1 = metrics.f1_score(label3, source_predict, average='weighted')
    precision = metrics.precision_score(label3, source_predict, average='weighted')
    print("LP:", accuracy, recall, f1, precision)
    return accuracy, recall, f1, precision
def create_label_prop(dataset):
    """Fit LabelPropagation over the labeled vectors plus unlabeled query/test vectors."""
    vectors, labels = make_vectors(dataset)
    # query and test rows enter the fit unlabeled (-1)
    unlabeled = -1 * np.ones(dataset.Q.shape[0] + dataset.test_X.shape[0])
    labels = np.concatenate((labels, unlabeled))
    vectors = np.concatenate((vectors, dataset.Q, dataset.test_X))
    label_prop = LabelPropagation()
    label_prop.fit(vectors, labels)
    print("\tLabel Propogation accuracy:")  # original message preserved verbatim
    return label_prop
def model_and_fit(type, train_vector, classes):
    """Build and fit the requested semi-supervised model.

    type: a SemiSupervisedAlgorithms member selecting the model.
          (parameter name kept for backward compatibility although it shadows
          the builtin `type`)
    train_vector, classes: training data; -1 entries in `classes` are treated
    as unlabeled by both estimators.
    Raises ValueError for an unknown algorithm.

    Fixes: removed the unused local import `from scipy.sparse import csgraph`
    and folded the duplicated fit/return into one path.
    """
    if type == SemiSupervisedAlgorithms.LABEL_PROPAGATION:
        model = LabelPropagation()
    elif type == SemiSupervisedAlgorithms.LABEL_SPREADING:
        model = LabelSpreading(kernel='rbf')
    else:
        raise ValueError('Wrong semi supervised model type!')
    model.fit(train_vector, classes)
    return model
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    # No-transfer-learning baseline: evaluate two semi-supervised models and
    # three plain classifiers on the same split and return every accuracy in a
    # single-row DataFrame keyed by window/source/target position.
    #######################
    ### SEMI-SUPERVISED ###
    ########################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX);
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX);
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = check_accuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = check_accuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = check_accuracy(testY, predND)
    # collect every metric into one result row
    return pd.DataFrame(
        [{
            'window': window,
            'source_position': source_pos,
            'target_position': target_pos,
            'acc_SS_propagation': acc_ss_propagation,
            'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
            'acc_SS_spreading': acc_ss_spreading,
            'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
            'acc_LR': accLR,
            'acc_LR_INFO': str(acc_LR_INFO),
            'acc_DT': accDT,
            'acc_DT_INFO': str(acc_DT_INFO),
            'acc_NB': accNB,
            'acc_NB_INFO': str(acc_NB_INFO)
        }]
    )
def do_evaluation(X, y, kernel='knn', output=None, gamma=None, n_neighbors=10,
                  alpha=1, max_iter=1000, tol=0.00001):
    # Repeated hold-out evaluation: for each of 10 rounds, hide 90% of the
    # labels, fit LabelPropagation transductively on the full X, and print a
    # classification report for the hidden portion.
    # from sklearn.cross_validation import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score
    import random
    size = len(X)
    # NOTE(review): random_seeds is printed per round but never fed to
    # np.random.seed, so the shuffles below are unseeded — confirm intent.
    random_seeds = np.random.randint(1, 1000, size=10)
    for i in range(len(random_seeds)):
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=random_seeds[i])
        labels = np.copy(y)
        tmp = np.arange(size)
        np.random.shuffle(tmp)
        train_test_split_rate = int(size*.9)
        random_unlabeled_points = tmp[:train_test_split_rate]
        labeled_points = tmp[train_test_split_rate:]
        random_unlabeled_points.sort()
        X_test = [X[_] for _ in range(size) if _ in random_unlabeled_points]
        y_test = [y[_] for _ in range(size) if _ in random_unlabeled_points]
        y_train = [y[_] for _ in range(size) if _ in labeled_points]
        labels[random_unlabeled_points] = -1  # hide these labels from the model
        label_prop_model = LabelPropagation(kernel=kernel, gamma=gamma,
                                            n_neighbors=n_neighbors,
                                            alpha=alpha, max_iter=max_iter,
                                            tol=tol)
        label_prop_model.fit(X, labels)
        y_predict = label_prop_model.predict(X_test)
        # Python 2 print statements — legacy section of the file.
        print '+--------------------------------------------------------+'
        print '| Report |'
        print '+--------------------------------------------------------+'
        print 'test round:', (i+1), ' with random seed: ', random_seeds[i]
        print 'training label: ', y_train
        print 'training post id: ', [_+1 for _ in labeled_points]
        print 'predict label: ', y_predict
        print classification_report(y_test, y_predict)
        print 'accuracy: ' + str(accuracy_score(y_test, y_predict))
        print '\n\n'
def process(self, n_components):
    """Fit LabelPropagation on PCA-reduced data, record accuracy, and plot a confusion matrix."""
    X_train, y_train, X_test, y_test = self.preprocess(n_components)
    model = LabelPropagation(n_jobs=-1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    # one confusion-matrix figure per component count
    plot_confusion_matrix(y_test, predictions, self.labels, normalize=False,
                          figname=('lp_comps_%d.png' % n_components))
    self.m_acc.append(accuracy)
    print(model.get_params())
def khren3(G):
    # For every adjacent pair (u, v) collect the common-neighbour set, split
    # the pairs by whether they share a ground-truth label, then seed a
    # LabelPropagation run on the adjacency matrix using the endpoints of a
    # differing-label pair with the largest common neighbourhood.
    result_s = {}  # same-label pairs      -> common-neighbour set
    result_d = {}  # different-label pairs -> common-neighbour set
    passed_set = []
    list_neighbrs = {}
    for v in G.nodes:
        list_neighbrs.update({v: set(nx.neighbors(G, v))})
    for u in G.nodes:
        passed_set.append(u)
        for v in nx.neighbors(G, u):
            if not v in passed_set:  # handle each unordered pair once
                cmn_nmbr = list_neighbrs[u] & list_neighbrs[v]
                # dist = nx.shortest_path_length(G,u,v)
                # if dist == 2:
                # cmn_nmbr = G.distance(u,v)
                if G.nodes[u]["ground_label"] == G.nodes[v]['ground_label']:
                    result_s.update({(u, v): cmn_nmbr})
                else:
                    result_d.update({(u, v): cmn_nmbr})
    # max_s = max(len(result_s.values()))
    min_s = len(min(result_s.values(), key=len))
    min_d = len(min(result_d.values(), key=len))
    max_d = len(max(result_d.values(), key=len))
    # pick one different-label pair with the largest common neighbourhood
    for (pair, vertex_list) in result_d.items():
        if len(vertex_list) == max_d:
            max_pair = pair
            break
    print(min_s, min_d)
    adj_matrix = nx.adjacency_matrix(G).toarray()
    labels = [-1 for node in G.nodes]  # -1 = unlabeled
    true_labels = [G.nodes[node]['ground_label'] for node in G.nodes]
    # labels[[0]] = 0
    # seed the two classes with the chosen pair's endpoints
    # NOTE(review): indexing `labels` by node id assumes nodes are 0..n-1
    # integers in iteration order — confirm for this graph.
    labels[max_pair[0]] = 0
    labels[max_pair[1]] = 1
    # labels[0:10] = [0 for i in range(10)]
    # labels[900:910] = [1 for i in range(10)]
    lp = LabelPropagation(kernel='rbf', gamma=0.7, max_iter=1000)
    lp.fit(adj_matrix, labels)
    print(lp.score(adj_matrix, true_labels))
    return (result_s, result_d)
def execute(self, function_context: FunctionContext, input_list: List) -> List:
    # Join face embeddings with their labels, then repeatedly hide a random
    # ~30% of the labels and refit LabelPropagation until the hidden labels are
    # recovered with >= 95% accuracy; finally persist and register the model.
    x_train = input_list[0]
    y_label = input_list[1]
    input_dim = 512  # embedding width produced upstream
    x_train_columns = list()
    x_train_columns.append('face_id')
    for i in range(1, input_dim + 1):
        x_train_columns.append('col' + str(i))
    trainDf = pd.DataFrame(x_train, columns=x_train_columns)
    labelDf = pd.DataFrame(y_label, columns=('face_id', 'label'))
    # inner join: keep only embeddings that have a label
    trainDf = pd.merge(trainDf, labelDf, on=['face_id'], how='inner',
                       suffixes=('_x', '_y'))
    y_label = trainDf['label'].values.astype(int)
    trainDf = trainDf.drop('face_id', 1)
    x_train = trainDf.drop('label', 1).values
    label_prop_model = None
    score = 0.0
    while score < 0.95:
        print('before train ACC:', score)
        random_unlabeled_points = np.random.rand(len(y_label))
        random_unlabeled_points = random_unlabeled_points < 0.3  # boolean mask selecting ~30% of the points
        Y = y_label[random_unlabeled_points]   # ground truth for the hidden subset
        y_label[random_unlabeled_points] = -1  # mark the subset as unlabeled
        label_prop_model = LabelPropagation()
        label_prop_model.fit(x_train, y_label)
        Y_pred = label_prop_model.predict(x_train)
        Y_pred = Y_pred[random_unlabeled_points]
        score = accuracy_score(Y, Y_pred)
        y_label[random_unlabeled_points] = Y  # restore the hidden labels for the next round
    model_path = os.path.dirname(os.path.abspath(__file__)) + '/model'
    print('Save trained model to {}'.format(model_path))
    # NOTE(review): the model is only dumped when the path does NOT already
    # exist — an existing file is never overwritten; confirm this is intended.
    if not os.path.exists(model_path):
        joblib.dump(label_prop_model, model_path)
    model_meta: ModelMeta = function_context.node_spec.output_model
    # Register model version to notify that cluster serving is ready to start
    # loading the registered model version.
    register_model_version(model=model_meta, model_path=model_path)
    return []
def label_propagation(self, X_train, y, X_test):
    """Fit LabelPropagation on the combined train+test matrix and predict labels for X_test.

    Returns (predicted labels, fitted model).
    """
    model = LabelPropagation()
    print("X_train Shape :", X_train.shape, type(X_train))
    print("X_test shape : ", X_test.shape, type(X_test))
    print("y shape : ", y.shape)
    # sparse matrices must be densified before concatenation
    combined = np.concatenate((X_train.todense(), X_test.todense()), axis=0)
    print("X shape now ", combined.shape)
    print("Y shape now ", y.shape)
    model.fit(combined, y)
    final_labels = model.predict(X_test)
    label_prob = model.predict_proba(X_test)
    print(compare_labels_probabilities().compare(label_prob, final_labels))
    return final_labels, model
def label_prop():
    # Propagate the 'Leak Found' labels from the labeled rows onto the
    # unlabeled rows and plot the distribution of predictions.
    # df9/df10/df12 are module-level DataFrames — presumably df10 holds the
    # labeled features and df12 the rows to classify; confirm against callers.
    labels = df9.loc[df9['Leak Found'].notnull(), ['Leak Found']]
    model = LabelPropagation(kernel=rbf_kernel_safe)  # custom kernel defined elsewhere in this module
    model.fit(df10, labels.values.ravel())
    pred = np.array(model.predict(df12))
    df13 = pd.DataFrame(pred, columns=['Prediction'])
    df14 = pd.concat([df12, df13], axis=1)
    print(df14[['ID', 'Prediction']])
    # print(df14.loc[df14['Prediction'] == 'Y'])
    plt.style.use('seaborn')
    df14['Prediction'].value_counts().plot(kind='bar')
    plt.xticks([0, 1, 2], ['NO', 'YES', 'N-PRV'])
    plt.ylabel('Number of occurrences after prediction by RBF algorithm');
    plt.show()
def propagate_labels(X_u, y_u, X_l, num_unlabeled):
    """Propagate labels from the labeled set X_l onto the unlabeled set X_u.

    Returns (X_train_lda, y_train_lda): the stacked samples and the
    transduced labels for all of them.

    Fix: the original referenced an undefined name `y_l` (NameError at call
    time unless a module global happened to exist). The labels of the labeled
    set arrive through the otherwise-unused parameter `y_u`, which is now
    used. NOTE(review): the parameter name suggests it belongs to X_u —
    confirm against callers.
    """
    # unlabeled samples are represented by -1 in labelprop
    y_u_placeholder = np.zeros(num_unlabeled) - 1
    X_train_prop = np.concatenate((X_l, X_u), axis=0)
    y_train_prop = np.concatenate((y_u, y_u_placeholder), axis=0)
    prop = LabelPropagation(gamma=15)
    prop.fit(X_train_prop, y_train_prop)
    # transduction_ holds the labels inferred for every training row
    y_train_lda = prop.transduction_
    X_train_lda = np.concatenate((X_l, X_u), axis=0)
    return X_train_lda, y_train_lda
def test_LabelPropagation(*data):
    """Fit rbf LabelPropagation and report transductive accuracy on the unlabeled subset.

    data: (X, y, unlabeled_indices) — samples, labels, indices of samples
    whose labels are hidden during fitting.
    """
    X, y, unlabeled_indices = data
    masked = np.copy(y)              # y itself supplies the ground truth below
    masked[unlabeled_indices] = -1   # unlabeled marker
    model = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    model.fit(X, masked)
    # labels assigned to the hidden samples during fitting
    inferred = model.transduction_[unlabeled_indices]
    actual = y[unlabeled_indices]
    print("Accuracy:%f" % metrics.accuracy_score(actual, inferred))
def test_LabelPropagation_rbf(*data):
    """Plot rbf-kernel LabelPropagation score vs gamma, one curve per alpha.

    data: (X, y, unlabeled_indices).
    """
    X, y, unlabeled_indices = data
    y_train = np.copy(y)             # y is still needed for scoring
    y_train[unlabeled_indices] = -1  # unlabeled marker
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    palette = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )  # one distinct color per alpha curve
    # fit one model per (alpha, gamma) pair; plot one curve per alpha
    for alpha, color in zip(alphas, palette):
        gamma_scores = []
        for g in gammas:
            model = LabelPropagation(max_iter=100, gamma=g, alpha=alpha, kernel='rbf')
            model.fit(X, y_train)
            gamma_scores.append(model.score(X[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, gamma_scores, label=r"$\alpha=%s$" % alpha, color=color)
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
class _LabelPropagationImpl:
    """Thin adapter that forwards fit/predict/predict_proba to a wrapped Op estimator."""

    def __init__(self, **hyperparams):
        # keep the raw hyperparameters and build the wrapped estimator from them
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; y is forwarded only when provided. Returns self."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate probability prediction to the wrapped model."""
        return self._wrapped_model.predict_proba(X)
def test_LabelPropagation_knn(*data):
    """Plot knn-kernel LabelPropagation score vs n_neighbors, one curve per alpha.

    data: (X, y, unlabeled_indices).
    """
    X, y, unlabeled_indices = data
    y_train = np.copy(y)             # y is still needed for scoring
    y_train[unlabeled_indices] = -1  # unlabeled marker
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    neighbor_counts = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    palette = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )  # one distinct color per alpha curve
    # fit one model per (alpha, K) pair; plot one curve per alpha
    for alpha, color in zip(alphas, palette):
        k_scores = []
        for k in neighbor_counts:
            model = LabelPropagation(max_iter=100, n_neighbors=k, alpha=alpha, kernel='knn')
            model.fit(X, y_train)
            k_scores.append(model.score(X[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(neighbor_counts, k_scores, label=r"$\alpha=%s$" % alpha, color=color)
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
def do_label_propagation(input_data, input_label, output=None, kernel='knn', gamma=None, n_neighbors=10, alpha=1, max_iter=30, tol=0.001): n_neighbors += 1 # input label input_label_fh = open(input_label, 'rb') label_lines = input_label_fh.readlines() label_lines = [int(_.strip()) for _ in label_lines] y = np.array(label_lines) input_label_fh.close() size = len(y) # input data input_data_fh = open(input_data, 'rb') data_lines = input_data_fh.readlines()[:size] data_lines = [_.strip() for _ in data_lines] X = np.array(np.mat(';'.join(data_lines))) input_data_fh.close() label_prop_model = LabelPropagation(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, max_iter=max_iter, tol=tol) label_prop_model.fit(X, y) prediction = label_prop_model.predict(X) if output: output_fh = open(output, 'wb') for p in prediction: output_fh.write(str(p)+'\n') output_fh.close() return label_prop_model
def tryLabelPropagation(goFast):
    # Load svmlight train/validation/test splits (the small "1500" variants
    # when goFast is set), fit LabelPropagation on the first 3000 training
    # rows, and print the validation accuracy.
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file
    if goFast:
        training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
    else:
        training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)
    from sklearn.semi_supervised import LabelPropagation
    from sklearn.metrics import accuracy_score
    # NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20
    # (use sklearn.model_selection); ParameterGrid is imported but unused.
    from sklearn.grid_search import ParameterGrid
    propOperator = LabelPropagation(gamma=150)
    propOperator.fit(training_data[:3000], training_labels[:3000])
    score = accuracy_score(validation_labels, propOperator.predict(validation_data))
    print str(score)  # Python 2 print statement
from sklearn.semi_supervised import LabelPropagation
from sklearn import metrics
import numpy as np

# K nearest neighbors model ensures we dont run over our memory
# rbf Kernel needs complete graph, so requires feature selection
lp_model = LabelPropagation(kernel='knn')  # Label Propagation model

Xtr = np.genfromtxt("data/Kaggle.X1.train.txt", delimiter=',')  # Get X training data
# Fix: the keyword was misspelled "delimter", which makes np.genfromtxt raise
# TypeError (unexpected keyword argument).
Ytr_labels = np.genfromtxt("data/Kaggle.Y.labels.train.txt", delimiter=',')  # Get classification data

# Unlabeled points - random size for now.
unlabeled_points = np.where(np.random.random_integers(0, 1, size=len(Ytr_labels)))

labels = np.copy(Ytr_labels)  # Save training labels for testing
labels[unlabeled_points] = -1  # Set unlabeled value, classes: 0, 1
lp_model.fit(Xtr, labels)  # Train

#############################################
# Models use n_neighbors and max_iteration to control kernel
#############################################

#############################################
# Test Functions
#############################################
# Mean squared Error
yhat = lp_model.predict(Xtr)
mse = metrics.mean_squared_error(Ytr_labels, yhat)

###############################################
# Cross Validation
###############################################
# Count unlabeled (-1) versus labeled entries in the training labels.
noLabels = [1 for i in Y_train if i==-1]
labels = [1 for i in Y_train if not i==-1]
print('unlabeled: ',np.sum(noLabels))
print('labeled: ',np.sum(labels))

# In[8]:
from sklearn.semi_supervised import LabelPropagation as LP
from sklearn.semi_supervised import LabelSpreading as LS

# In[17]:
# NOTE(review): the fit uses `Ytrain` although the counts above use `Y_train`
# — confirm these refer to the same array.
lspr = LP(gamma = 70)
lspr.fit(X_norm,Ytrain)

# In[15]:
print('nofClasses: ',lspr.classes_)

# In[16]:
# Count predictions with a positive label.
pred = lspr.predict(X_norm)
notN = [1 for i in pred if i>0.0]
print(sum(notN))

# In[12]:
else: clf = LabelSpreading(kernel=param["kernel"], n_neighbors=param["n_neighbors"]) extra_param = param["n_neighbors"] now = datetime.datetime.now() date_time = '{0:02d}_{1:02d}_{2:02d}_{3:02d}_{4:02d}'.format((now.year%2000), now.month, now.day, now.hour, now.minute) #classification Type #clf = OneVsOneClassifier(clf) #clf = OneVsRestClassifier(clf) logging.info("start with training ") clf.fit(X_train, y_train) #y_pred = clf.predict(X_valid) #print("min:{0} max:{0}".format(y_pred.min(),y_pred.max())) #score = accuracy_score(y_valid, y_pred, True) print("found classes are {0}".format(clf.classes_)) y_test = clf.predict(X_test) y_test = y_test.astype(np.uint32) lib_IO.write_Y("Data/pr4/{0}_{1}_{2}_{3}".format(name,param["kernel"],extra_param,date_time),y_test,Ids=ids) #Gridsearch #grid_search = GridSearchCV(clf, param, scoring='accuracy',cv=10, n_jobs=-1, verbose=1) #grid_search.fit(X_train, y_train) #clf_tmp = grid_search.best_estimator_
class IndicatorIdentifier(object):
    '''
    Identify an indicator for a document (semi-supervised, Python 2 code).
    '''
    def __init__(self):
        # Semi-supervised classifier; LabelSpreading was tried as an alternative.
        self.model = LabelPropagation()  #(kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()

    def readingDatabase(self):
        # Load the labeled indicator spreadsheet plus several unlabeled news
        # corpora; the first 80% of labeled rows become training data.
        da = DocumentsAccess()
        labeledFile = "Database/Indicator/Indicators.xlsx"
        sheet = "Sheet1"
        df = da.readingDatabaseTetum(labeledFile, sheet, head= 0)
        cut = int(0.8*df.shape[0])
        # re-duplicate the data => Result: one document has one label only
        columns = df.columns.tolist()
        columns.remove("Content")
        print columns
        X_train = []
        Y_train = []
        X_test = []
        Y_test = []
        for index, row in df.iterrows():
            labels = list(set([row[col] for col in columns if not pd.isnull(row[col])]))
            content = row["Content"]
            if index < cut:
                # training part: one (content, label) pair per label
                for label in labels:
                    X_train.append(content)
                    Y_train.append(label)
            else:
                # test part keeps the full label list per document
                X_test.append(content)
                Y_test.append(labels)
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist()
        print len(unlabeledData)
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist()
        print len(unlabeledData2)
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist()
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print len(unlabeledData4)
        '''
        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        print len(unlabeledData)
        #print unlabeledData[0]
        return (X_train, Y_train, X_test, Y_test, unlabeledData)

    def preprocessData(self, X):
        # Shared text normalisation: case folding, no specials, min token size 2.
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)

    def train(self, X, Y):
        '''
        Goal: Batch training (Use only one time at the beginning.
        After that, use updateNewInformation function to update new information from new data)
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()
        self.model.fit(X, Y)

    def test(self, X):
        '''
        Goal: predict a new document
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)
        return predictedY

    def updateNewInformation(self, x1, y1):
        '''
        Goal: Update the information from the new data (Online Learning)
        Run re-train model at weekend
        '''
        #self.model.partial_fit(x1,y1)
        pass

    def featureExtractionTrain(self, X):
        # Fit the vector-space model on the training corpus and keep it for
        # later test-time transforms.
        self.vsm = VectorSpaceModel.createInstance("TFIDF")#("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable

    def featureExtractionPredict(self, X):
        # Transform with the vector-space model fitted in featureExtractionTrain.
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        # Weighted accuracy/precision/recall/f1, rounded to 4 decimals.
        accuracy = metrics.accuracy_score(trueLabels,predictedLabels)
        precision = metrics.precision_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        accuracy = round(accuracy,4)
        precision = round(precision,4)
        recall = round(recall,4)
        f1 = round(f1,4)
        result = [("Accuracy",accuracy),("Precision",precision),("Recall",recall),("f1",f1)]
        return result

    def run(self):
        # End-to-end pipeline: read data, encode labels, train, test, evaluate.
        (X_train, Y_train, X_test, Y_test, unlabeledData) = self.readingDatabase()
        print "Training size: " + str(len(X_train))
        print "Test size: " + str(len(X_test))
        '''
        X_train = X_train[:100]
        Y_train = Y_train[:100]
        X_test = X_test[:100]
        Y_test = Y_test[:100]
        '''
        print "Finish reading database."
        #print FreqDist(indicators).most_common()
        # Map each indicator label to a small integer id.
        k = 0
        dictLabel = FreqDist(Y_train)
        for key in dictLabel:
            dictLabel[key] = k
            k+=1
        Y_train = [dictLabel[ind] for ind in Y_train]
        Y_test = [[dictLabel[ind] for ind in labels] for labels in Y_test]
        '''
        random.seed(123456)
        # Training
        z = zip(labeledData, indicators)
        random.shuffle(z)
        labeledData, indicators = zip(*z)
        X_train = list(labeledData[:cut])
        Y_train = list(indicators[:cut])
        X_test = list(labeledData[cut:])
        Y_test = list(indicators[cut:])
        '''
        # Unlabeled documents get label -1 for the semi-supervised fit.
        X_train += unlabeledData
        Y_train += (-1*np.ones((len(unlabeledData)), dtype = int)).tolist()
        #pprint(X_train)
        #print Y_train
        #print X_train[cut-2:cut+2]
        #print Y_train[cut-2:cut+2]
        print "Training..."
        self.train(X_train, Y_train)
        # Testing
        print "Testing..."
        Y_predicted = self.test(X_test)
        print Y_predicted
        # The Y_predicted only need to be one of the true labels in order to be calculated as correctness
        for i in range(len(Y_predicted)):
            lab = Y_predicted[i]
            if lab in Y_test[i]:
                Y_test[i] = lab
            else:
                Y_test[i] = -1
        (accuracy,_, _, _) = self.evaluation(Y_test,Y_predicted)
        print accuracy
class SentimentAnalysis(object):
    '''Identify a sentiment (positive = 1 / negative = 0) for each document.

    Same pipeline as IndicatorIdentifier: read labeled and unlabeled Excel
    corpora, preprocess, TF-IDF features, then semi-supervised
    LabelPropagation with unlabeled samples marked -1.

    NOTE(review): Python 2 code (print statements, list-returning zip).
    '''
    def __init__(self):
        # Semi-supervised model with default kernel; alternative
        # configurations were tried and left commented out.
        self.model = LabelPropagation()#(kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()

    def readingDatabase(self):
        '''Read the positive and negative sentiment spreadsheets plus three
        unlabeled corpora (first column of each sheet).

        Returns (posData, negData, unlabeledData) as plain lists of document
        texts.
        '''
        da = DocumentsAccess()
        filePos = "Database/Sentiment/Sentiment/PoliceRelations/positive.xlsx"
        sheet = "Sheet1"
        posData = da.readingDatabaseTetum(filePos, sheet)
        posData = posData[0].tolist()
        print len(posData)
        print posData[0]
        fileNeg = "Database/Sentiment/Sentiment/PoliceRelations/negative.xlsx"
        sheet = "Sheet1"
        negData = da.readingDatabaseTetum(fileNeg, sheet)
        negData = negData[0].tolist()
        print len(negData)
        print negData[0]
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist()
        print len(unlabeledData)
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist()
        print len(unlabeledData2)
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist()
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print len(unlabeledData4)
        '''
        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        print len(unlabeledData)
        print unlabeledData[0]
        return (posData, negData, unlabeledData)

    def preprocessData(self, X):
        '''Tokenize/normalize the raw documents (case folding, no special
        characters, minimum token size 2).'''
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)

    def train(self, X, Y):
        ''' Goal: Batch training (Use only one time at the beginning.
        After that, use updateNewInformation function to update new information from new data) '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        # LabelPropagation needs a dense matrix.
        X = X.toarray()
        self.model.fit(X, Y)

    def test(self, X):
        ''' Goal: predict a new document '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)
        return predictedY

    def updateNewInformation(self, x1, y1):
        ''' Goal: Update the information from the new data (Online Learning)
        Run re-train model at weekend '''
        # NOTE(review): intentionally a no-op — online updates are deferred
        # to a periodic batch re-train (partial_fit left commented out).
        #self.model.partial_fit(x1,y1)
        pass

    def featureExtractionTrain(self, X):
        '''Fit a TF-IDF vector-space model on the training corpus and return
        the resulting document-term table.  Stores the model on self.vsm so
        featureExtractionPredict can reuse the learned vocabulary.'''
        self.vsm = VectorSpaceModel.createInstance("TFIDF")#("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable

    def featureExtractionPredict(self, X):
        '''Project new documents into the vocabulary learned by
        featureExtractionTrain (must be called after it).'''
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        '''Return [("Accuracy", a), ("Precision", p), ("Recall", r), ("f1", f)]
        using weighted-average precision/recall/F1, each rounded to 4 d.p.'''
        accuracy = metrics.accuracy_score(trueLabels,predictedLabels)
        precision = metrics.precision_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        accuracy = round(accuracy,4)
        precision = round(precision,4)
        recall = round(recall,4)
        f1 = round(f1,4)
        result = [("Accuracy",accuracy),("Precision",precision),("Recall",recall),("f1",f1)]
        return result

    def run(self):
        '''End-to-end experiment: read data, take the first 10 positive and
        10 negative documents for training (label 1 / 0, unlabeled -1),
        train, predict on the rest and print the evaluation tuples.'''
        # Reading data
        (posData, negData, unlabeledData) = self.readingDatabase()
        print "Finish reading database."
        # Divide training and test data
        # NOTE(review): only 10 documents per class are used for training.
        cut = 10
        posDataTrain = posData[:cut]
        negDataTrain = negData[:cut]
        posDataTest = posData[cut:]
        negDataTest = negData[cut:]
        # Fixed seed => reproducible shuffles below.
        random.seed(123456)
        # Training
        # Labels: positive = 1, negative = 0, unlabeled = -1.
        X_train = posDataTrain + negDataTrain + unlabeledData
        Y_train = np.ones((len(posDataTrain)), dtype = int).tolist() + np.zeros((len(negDataTrain)), dtype = int).tolist() + (-1*np.ones((len(unlabeledData)), dtype = int)).tolist()
        # NOTE(review): shuffling the zip result in place relies on
        # Python 2's zip returning a list — breaks under Python 3.
        z = zip(X_train, Y_train)
        random.shuffle(z)
        X_train, Y_train = zip(*z)
        self.train(X_train, Y_train)
        # Testing
        X_test = posDataTest + negDataTest
        Y_test = np.ones((len(posDataTest)), dtype = int).tolist() + np.zeros((len(negDataTest)), dtype = int).tolist()
        z = zip(X_test, Y_test)
        random.shuffle(z)
        X_test, Y_test = zip(*z)
        Y_predicted = self.test(X_test)
        print Y_predicted
        # NOTE(review): evaluation() returns a list of (name, value) tuples,
        # so each unpacked variable here is a (name, value) pair, not the
        # bare metric — confirm the printed output is intended.
        (accuracy,precision,recall,f1) = self.evaluation(Y_test,Y_predicted)
        print (accuracy,precision,recall,f1)