def load_all_data():
    # Read and partition the matrix
    data = pd.read_feather('../feature_stage_data_all.ftr')
    x = data[data.columns[3:]]
    y = data['stage']
    o = data.observation
    x = x.values
    x = normalize(x)
    y = y.values
    x_va = x[4977:4977 + 3000]
    y_va = y[4977:4977 + 3000]
    x = np.concatenate((x[:4977], x[4977 + 3000:]))
    y = np.concatenate((y[:4977], y[4977 + 3000:]))
    nnl = lambda a: np.invert(np.isnan(a))
    nul = lambda a: np.isnan(a)
    x_obs = x[nnl(y)]
    y_obs = y[nnl(y)]
    # Apply label propagation to label the rows whose stage is NaN
    x_nuls = x[nul(y)]
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x = np.concatenate([x_obs, x_nuls], axis=0)
    y = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
def testing_predictions(self, test_data, model, num_pcs, gamma=False, max_iter=1000000, mean=False):
    pca_data = self.principal_components(test_data, self.pca, num_pcs)
    if not mean:
        return np.array([p[1] for p in model.predict_proba(pca_data)])
    train_pca_data = self.principal_components(self.X, self.pca, num_pcs)
    predicted_probs = None
    for seed in self.seeds:
        np.random.seed(seed)
        model = LabelPropagation(kernel='rbf', gamma=gamma, max_iter=max_iter)
        model.fit(train_pca_data, self.Y)
        predicted_prob = np.array([p[1] for p in model.predict_proba(pca_data)])
        if predicted_probs is None:
            predicted_probs = predicted_prob
        else:
            predicted_probs = np.vstack((predicted_probs, predicted_prob))
    # get mean of each run:
    mean_probs = np.mean(predicted_probs, axis=0)
    return mean_probs
def test_LabelPropagation_rbf(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # one color per curve
    # Train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelPropagation(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # Configure the figure
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
def test_LabelPropagation_rbf(*data):
    '''
    Test how LabelPropagation with the rbf kernel performs as gamma varies.
    '''
    X, y, unlabeled_indices = data
    # y must be copied because it is used again below
    y_train = np.copy(y)
    # unlabeled samples are marked with -1
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    gammas = np.logspace(-2, 2, num=50)
    scores = []
    for gamma in gammas:
        clf = LabelPropagation(max_iter=100, gamma=gamma, kernel='rbf')
        clf.fit(X, y_train)
        scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
    ax.plot(gammas, scores, label="score")
    ### Configure the figure
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
def semiLabelPropagation(feature_extractor, generator, val_generator, kernel, neighbors, gamma):
    semi = LabelPropagation(kernel=kernel, n_neighbors=neighbors, gamma=gamma,
                            alpha=None, tol=0.001, max_iter=1000000)
    features = feature_extractor.predict_generator(generator,
                                                   steps=generator.samples / generator.batch_size,
                                                   verbose=1)
    classes = generator.classes
    # Files whose names start with 'N' are unlabeled; mark them with -1
    for i in range(generator.samples):
        if generator.filenames[i][0] == 'N':
            classes[i] = -1
    semi.fit(features, classes)
    val_features = feature_extractor.predict_generator(val_generator,
                                                       steps=val_generator.samples / val_generator.batch_size,
                                                       verbose=1)
    predicted_classes = semi.predict(val_features)
    return predicted_classes
def test_LabelPropagation(*data):
    '''
    Test basic usage of LabelPropagation.
    '''
    X, y, unlabeled_indices, XPredict, yTrue = data
    #print("get ytrue")
    #print(yTrue)
    # y must be copied because it is used again below
    y_train = np.copy(y)
    # unlabeled samples are marked with -1
    y_train[unlabeled_indices] = -1
    print(y_train)
    #clf = LabelPropagation(max_iter=1000, kernel='rbf', gamma=0.1)
    clf = LabelPropagation(max_iter=5, kernel='knn', n_neighbors=3, tol=1e-5)
    #clf = LabelSpreading(gamma=0.25, max_iter=20)
    clf.fit(X, y_train)
    ### Report prediction accuracy
    # predicted labels
    predicted_labels = clf.predict(XPredict)
    print(XPredict)
    #predicted_labels = clf.transduction_[unlabeled_indices]
    # true labels
    #true_labels = y[unlabeled_indices]
    print("Accuracy:%f" % metrics.accuracy_score(yTrue, predicted_labels))
def test_LabelPropagation_knn(*data):
    '''
    Test how LabelPropagation with the knn kernel performs as n_neighbors varies.
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy, because y is used again below
    y_train[unlabeled_indices] = -1  # unlabeled samples are marked with -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    scores = []
    for K in Ks:
        clf = LabelPropagation(max_iter=100, n_neighbors=K, kernel='knn')
        clf.fit(X, y_train)
        scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
    ax.plot(Ks, scores, label="score")
    ### Configure the figure
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
def sklearn_lp(X, y, output=None, kernel='knn', gamma=None, n_neighbors=10,
               alpha=1, max_iter=1000, tol=0.00001):
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=3)
    label_prop_model = LabelPropagation(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors,
                                        alpha=alpha, max_iter=max_iter, tol=tol)
    label_prop_model.fit(X_train, y_train)
    y_predict = label_prop_model.predict(X_test)
    print('y_train: ', y_train)
    print('y_predict: ', y_predict)
    print('+--------------------------------------------------------+')
    print('| Report                                                  |')
    print('+--------------------------------------------------------+')
    print(classification_report(y_test, y_predict))
    print('accuracy: ' + str(accuracy_score(y_test, y_predict)))
    print('\n\n')
def hard_clamping(kernel, k, xTrain, yTrain, MI=10000, g=0.6):
    prop = LabelPropagation(kernel=kernel, n_neighbors=k, gamma=g, max_iter=MI, n_jobs=-1)
    prop.fit(xTrain, yTrain)
    predY = prop.predict_proba(xTrain)
    norm_Y = normalize(yTrain, predY)
    labels = []
    for i in norm_Y:
        if i[0] > i[1]:
            labels.append(benign)
        elif i[0] < i[1]:
            labels.append(malware)
    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(yTrain, labels, yExpect, day_one)
    results = ['HC', kernel, k, g, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1]
    file_name = 'HC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
def load_all_data():
    # Read and partition the matrix
    data = pd.read_feather('./feature_stage_data_all.ftr')
    x = data[data.columns[3:]]
    y = data['stage']
    o = data.observation
    x = x.values
    x = normalize(x)
    y = y.values
    # Observations 8 and 9 are held out for validation
    x_va = x[[i in [8, 9] for i in o.values]]
    y_va = y[[i in [8, 9] for i in o.values]]
    x = x[[i not in [8, 9] for i in o.values]]
    y = y[[i not in [8, 9] for i in o.values]]
    nnl = lambda a: np.invert(np.isnan(a))
    nul = lambda a: np.isnan(a)
    x_obs = x[nnl(y)]
    y_obs = y[nnl(y)]
    # Apply label propagation to label the rows whose stage is NaN
    x_nuls = x[nul(y)]
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x_all = np.concatenate([x_obs, x_nuls], axis=0)
    y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)
    # Oversample the stages
    zen = SMOTE(random_state=8675309)
    x, y = zen.fit_resample(x_all, y_all)
    x, y = shuffle(x, y, random_state=42)
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
def _label_propagation(df):
    X = _generate_features(df)
    labels = _generate_labels(df)
    # for some reason pandas returns NaN for -1 values
    labels = labels.fillna(-1)
    label_prop_model = LabelPropagation()
    label_prop_model.fit(X.toarray(), labels)
    return label_prop_model.predict(X.toarray())
def test_LabelPropagation(*data):
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(X, y_train)
    true_labels = y[unlabeled_indices]
    print('Accuracy : %.2f' % clf.score(X[unlabeled_indices], true_labels))
def test_LabelPropagation(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy, because y is used again below
    y_train[unlabeled_indices] = -1  # unlabeled samples are marked with -1
    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(x, y_train)
    # Report prediction accuracy
    true_labels = y[unlabeled_indices]  # the true labels
    print("Accuracy: %f" % clf.score(x[unlabeled_indices], true_labels))
def ss_test(images, labels, unlabeled_images, test_images):
    all_images = np.vstack((images, unlabeled_images))
    neg_ones = -np.ones((unlabeled_images.shape[0],))
    all_labels = np.concatenate((labels, neg_ones), axis=0)
    model = LabelPropagation()
    model.fit(all_images, all_labels)
    test_labels = model.predict(test_images)
    create_submission(test_labels)
def LP(source_train, target_test, label1, label3):
    label_prop_model = LabelPropagation()
    label_prop_model.fit(source_train, label1)
    source_predict = label_prop_model.predict(target_test)
    # evaluation metrics
    accuracy = metrics.accuracy_score(label3, source_predict)
    recall = metrics.recall_score(label3, source_predict, average='weighted')
    f1 = metrics.f1_score(label3, source_predict, average='weighted')
    precision = metrics.precision_score(label3, source_predict, average='weighted')
    print("LP:", accuracy, recall, f1, precision)
    return accuracy, recall, f1, precision
def create_label_prop(dataset):
    vectors, labels = make_vectors(dataset)
    Q_labels = -1 * np.ones(dataset.Q.shape[0] + dataset.test_X.shape[0])
    labels = np.concatenate((labels, Q_labels))
    vectors = np.concatenate((vectors, dataset.Q, dataset.test_X))
    label_prop = LabelPropagation()
    label_prop.fit(vectors, labels)
    print("\tLabel Propagation accuracy:")
    return label_prop
def do_evaluation(X, y, kernel='knn', output=None, gamma=None, n_neighbors=10,
                  alpha=1, max_iter=1000, tol=0.00001):
    # from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score
    import random

    size = len(X)
    random_seeds = np.random.randint(1, 1000, size=10)
    for i in range(len(random_seeds)):
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=random_seeds[i])
        labels = np.copy(y)
        tmp = np.arange(size)
        np.random.shuffle(tmp)
        train_test_split_rate = int(size * .9)
        random_unlabeled_points = tmp[:train_test_split_rate]
        labeled_points = tmp[train_test_split_rate:]
        random_unlabeled_points.sort()
        X_test = [X[_] for _ in range(size) if _ in random_unlabeled_points]
        y_test = [y[_] for _ in range(size) if _ in random_unlabeled_points]
        y_train = [y[_] for _ in range(size) if _ in labeled_points]
        labels[random_unlabeled_points] = -1
        label_prop_model = LabelPropagation(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors,
                                            alpha=alpha, max_iter=max_iter, tol=tol)
        label_prop_model.fit(X, labels)
        y_predict = label_prop_model.predict(X_test)
        print('+--------------------------------------------------------+')
        print('| Report                                                  |')
        print('+--------------------------------------------------------+')
        print('test round:', (i + 1), ' with random seed: ', random_seeds[i])
        print('training label: ', y_train)
        print('training post id: ', [_ + 1 for _ in labeled_points])
        print('predict label: ', y_predict)
        print(classification_report(y_test, y_predict))
        print('accuracy: ' + str(accuracy_score(y_test, y_predict)))
        print('\n\n')
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    ########################
    ### SEMI-SUPERVISED ####
    ########################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = check_accuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = check_accuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = check_accuracy(testY, predND)
    return pd.DataFrame([{
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,
        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
        'acc_LR': accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO)
    }])
def khren3(G):
    result_s = {}
    result_d = {}
    passed_set = []
    list_neighbrs = {}
    for v in G.nodes:
        list_neighbrs.update({v: set(nx.neighbors(G, v))})
    for u in G.nodes:
        passed_set.append(u)
        for v in nx.neighbors(G, u):
            if not v in passed_set:
                cmn_nmbr = list_neighbrs[u] & list_neighbrs[v]
                # dist = nx.shortest_path_length(G, u, v)
                # if dist == 2:
                #     cmn_nmbr = G.distance(u, v)
                if G.nodes[u]["ground_label"] == G.nodes[v]['ground_label']:
                    result_s.update({(u, v): cmn_nmbr})
                else:
                    result_d.update({(u, v): cmn_nmbr})
    # max_s = max(len(result_s.values()))
    min_s = len(min(result_s.values(), key=len))
    min_d = len(min(result_d.values(), key=len))
    max_d = len(max(result_d.values(), key=len))
    for (pair, vertex_list) in result_d.items():
        if len(vertex_list) == max_d:
            max_pair = pair
            break
    print(min_s, min_d)
    adj_matrix = nx.adjacency_matrix(G).toarray()
    labels = [-1 for node in G.nodes]
    true_labels = [G.nodes[node]['ground_label'] for node in G.nodes]
    # labels[[0]] = 0
    labels[max_pair[0]] = 0
    labels[max_pair[1]] = 1
    # labels[0:10] = [0 for i in range(10)]
    # labels[900:910] = [1 for i in range(10)]
    lp = LabelPropagation(kernel='rbf', gamma=0.7, max_iter=1000)
    lp.fit(adj_matrix, labels)
    print(lp.score(adj_matrix, true_labels))
    return (result_s, result_d)
def execute(self, function_context: FunctionContext, input_list: List) -> List:
    x_train = input_list[0]
    y_label = input_list[1]
    input_dim = 512
    x_train_columns = list()
    x_train_columns.append('face_id')
    for i in range(1, input_dim + 1):
        x_train_columns.append('col' + str(i))
    trainDf = pd.DataFrame(x_train, columns=x_train_columns)
    labelDf = pd.DataFrame(y_label, columns=('face_id', 'label'))
    trainDf = pd.merge(trainDf, labelDf, on=['face_id'], how='inner', suffixes=('_x', '_y'))
    y_label = trainDf['label'].values.astype(int)
    trainDf = trainDf.drop('face_id', 1)
    x_train = trainDf.drop('label', 1).values
    label_prop_model = None
    score = 0.0
    while score < 0.95:
        print('before train ACC:', score)
        random_unlabeled_points = np.random.rand(len(y_label))
        random_unlabeled_points = random_unlabeled_points < 0.3  # uniform draws in [0, 1); True for roughly 30% of the points
        Y = y_label[random_unlabeled_points]  # the labels before they are masked
        y_label[random_unlabeled_points] = -1  # mask the selected points as unlabeled
        label_prop_model = LabelPropagation()
        label_prop_model.fit(x_train, y_label)
        Y_pred = label_prop_model.predict(x_train)
        Y_pred = Y_pred[random_unlabeled_points]
        score = accuracy_score(Y, Y_pred)
        y_label[random_unlabeled_points] = Y
    model_path = os.path.dirname(os.path.abspath(__file__)) + '/model'
    print('Save trained model to {}'.format(model_path))
    if not os.path.exists(model_path):
        joblib.dump(label_prop_model, model_path)
    model_meta: ModelMeta = function_context.node_spec.output_model
    # Register the model version to notify that cluster serving is ready to start loading it.
    register_model_version(model=model_meta, model_path=model_path)
    return []
def label_prop():
    labels = df9.loc[df9['Leak Found'].notnull(), ['Leak Found']]
    model = LabelPropagation(kernel=rbf_kernel_safe)
    model.fit(df10, labels.values.ravel())
    pred = np.array(model.predict(df12))
    df13 = pd.DataFrame(pred, columns=['Prediction'])
    df14 = pd.concat([df12, df13], axis=1)
    print(df14[['ID', 'Prediction']])
    # print(df14.loc[df14['Prediction'] == 'Y'])
    plt.style.use('seaborn')
    df14['Prediction'].value_counts().plot(kind='bar')
    plt.xticks([0, 1, 2], ['NO', 'YES', 'N-PRV'])
    plt.ylabel('Number of occurrences after prediction by RBF algorithm')
    plt.show()
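# Note: rbf_kernel_safe is not defined in the snippet above. A plausible sketch
# (an assumption, not the original implementation) is a callable RBF kernel that
# adds a tiny constant so no row of the affinity matrix is entirely zero, which
# LabelPropagation's graph normalization cannot handle. LabelPropagation accepts
# any callable that maps two sample matrices to an affinity matrix.
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def rbf_kernel_safe(X, Y=None, gamma=None):
    # Returns an (n_samples_X, n_samples_Y) affinity matrix.
    if Y is None:
        Y = X
    if gamma is None:
        gamma = 1.0 / X.shape[1]
    K = euclidean_distances(X, Y, squared=True)
    K *= -gamma
    np.exp(K, K)   # in-place exponentiation
    K += 1e-12     # keep every row strictly positive
    return K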
def propagate_labels(X_l, y_l, X_u, num_unlabeled):
    # unlabeled samples are represented by -1 in label propagation
    y_u_placeholder = np.zeros(num_unlabeled) - 1
    X_train_prop = np.concatenate((X_l, X_u), axis=0)
    y_train_prop = np.concatenate((y_l, y_u_placeholder), axis=0)
    prop = LabelPropagation(gamma=15)
    prop.fit(X_train_prop, y_train_prop)
    y_train_lda = prop.transduction_
    X_train_lda = np.concatenate((X_l, X_u), axis=0)
    return X_train_lda, y_train_lda
def test_LabelPropagation(*data):
    '''
    Test basic usage of LabelPropagation.

    :param data: a tuple of (samples, sample labels, indices of the unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy, because y is used again below
    y_train[unlabeled_indices] = -1  # unlabeled samples are marked with -1
    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(X, y_train)
    ### Report prediction accuracy
    predicted_labels = clf.transduction_[unlabeled_indices]  # predicted labels
    true_labels = y[unlabeled_indices]  # true labels
    print("Accuracy:%f" % metrics.accuracy_score(true_labels, predicted_labels))
def fit(self, x_input, y_input):
    x = x_input.copy()
    y = y_input.copy()
    # standardize input
    if self.standardize_flag:
        self.scaler.fit(x)
        x = self.scaler.transform(x)
    # apply PCA
    if config.PCA_VAR_THR < 1:
        if self.PCA.n_components is None:
            self.PCA.n_components = x.shape[1]
        self.PCA.fit(x)
        n_components = np.where(self.PCA.explained_variance_ratio_.cumsum() > config.PCA_VAR_THR)[0][0]
        self.PCA = decomposition.PCA(n_components=n_components, whiten=True)
        self.PCA.fit(x)
        x = self.PCA.transform(x)
    cv = GridSearchCV(LabelPropagation(), self.model_params, scoring=self.metric_key,
                      cv=self.num_splits, n_jobs=config.N_JOBS)
    cv.fit(x, y)
    self.estimator = cv.best_estimator_
    self.best_val_score = cv.cv_results_["mean_test_score"].max()
def __init__(self, method="spreading", kernel="knn", alpha=0.2, gamma=20, n_neighbors=7, **kwargs):
    super(LabSP, self).__init__(**kwargs)
    if method.lower() == "propagation":
        self.regressors = [
            LabelPropagation(kernel=kernel, alpha=alpha, gamma=gamma, n_neighbors=n_neighbors)
            for _ in range(len(self.regions))
        ]
    elif method.lower() == "spreading":
        self.regressors = [
            LabelSpreading(kernel=kernel, alpha=alpha, gamma=gamma, n_neighbors=n_neighbors)
            for _ in range(len(self.regions))
        ]
    else:
        raise InitializationError("Method %s not valid" % method)
def start(self):
    """
    01. Initialise the data paths and transformation functions.
    """
    self.data_dir = '../data/raw_data'
    self.trans_primitives = ['weekday', 'hour', 'time_since_previous']
    self.agg_primitives = [
        'mean', 'max', 'min', 'std', 'count', 'percent_true', 'last',
        'time_since_last', 'mode'
    ]
    self.ignore_cols = [
        'num_contacts', 'num_referrals', 'num_successful_referrals'
    ]
    self.feature_windows = [10, 30, 60, 90]  # [10,20,30]
    self.max_feature_depth = 2
    # list of estimators to use
    self.estimators = [
        ('cbc', CatBoostClassifier()),
        ('lgbmc', LGBMClassifier()),
        ('gbc', GradientBoostingClassifier(validation_fraction=0.15, n_iter_no_change=50)),
        ('et', ExtraTreeClassifier()),
        ('abc', AdaBoostClassifier()),
        ('rfc', RandomForestClassifier()),
        ('bc', BaggingClassifier()),
        ('etc', ExtraTreesClassifier()),
        ('gnb', GaussianNB()),
        ('mlpc', MLPClassifier()),
        ('gpc', GaussianProcessClassifier()),
        ('dtc', DecisionTreeClassifier()),
        ('qda', QuadraticDiscriminantAnalysis()),
        ('lr', LogisticRegression()),
        ('knn3', KNeighborsClassifier(3)),
        ('knn6', KNeighborsClassifier(6)),
        ('knn12', KNeighborsClassifier(12)),
        ('nc', NearestCentroid()),
        ('rnc', RadiusNeighborsClassifier()),
        ('lp', LabelPropagation()),
        ('pac', PassiveAggressiveClassifier()),
        ('rc', RidgeClassifier()),
        ('sgdc', SGDClassifier()),
        ('svg', SVC()),
        ('ngbc', NGBClassifier(Dist=Bernoulli))
    ]
    self.next(self.load_raw_data)
def test_LabelPropagation_rbf(*data):
    '''
    Test how LabelPropagation with the rbf kernel performs as alpha and gamma vary.

    :param data: a tuple of (samples, sample labels, indices of the unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy, because y is used again below
    y_train[unlabeled_indices] = -1  # unlabeled samples are marked with -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )  # one color per curve
    ## Train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelPropagation(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(X, y_train)
            scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)
    ### Configure the figure
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
def test_LabelPropagation_knn(*data):
    '''
    Test how LabelPropagation with the knn kernel performs as alpha and n_neighbors vary.

    :param data: a tuple of (samples, sample labels, indices of the unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy, because y is used again below
    y_train[unlabeled_indices] = -1  # unlabeled samples are marked with -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    colors = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )  # one color per curve
    ## Train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelPropagation(max_iter=100, n_neighbors=K, alpha=alpha, kernel='knn')
            clf.fit(X, y_train)
            scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)
    ### Configure the figure
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
def label_propagation_classification(kernel='rbf', gamma=20, n_neighbors=7, max_iter=30,
                                     tol=1e-3, n_jobs=None):
    '''
    Take in the parameters for label propagation. Kernel can be a string or a dict.
    If it is a dict, it must have exactly two keys: a "name" for the kernel and a
    "method" that is a callable function.

    :param kernel: string (rbf, graph, knn) or dict
    :param gamma: used if the kernel is rbf
    :param n_neighbors: used if the kernel is knn
    :param max_iter: max number of iterations, default is 30
    :param tol: tolerance for convergence, default is 1e-3
    :param n_jobs: number of cores
    :return: th_model
    '''
    if isinstance(kernel, dict):
        if 'name' not in kernel.keys() or 'method' not in kernel.keys() or len(kernel.keys()) > 2:
            raise ValueError("input dictionary must have two keys: name and method")
        kernel_method = kernel['method']
        kernel = kernel['name']
    elif kernel == 'graph':
        kernel_method = _graph_kernel
    else:
        kernel_method = kernel
    # Creating an sklearn label propagation classification model:
    lp = LabelPropagation(kernel=kernel_method, gamma=gamma, n_neighbors=n_neighbors,
                          max_iter=max_iter, tol=tol, n_jobs=n_jobs)
    # Creating an instance of the SklearnClassification TestHarnessModel subclass
    if kernel == 'rbf':
        th_model = SklearnClassification(
            model=lp, model_author='Mohammed',
            model_description="Label Propagation: kernel={0}, gamma={1}, max_iter={2}, "
                              "tol={3}, n_jobs={4}".format(kernel, gamma, max_iter, tol, n_jobs))
    elif kernel == 'knn':
        th_model = SklearnClassification(
            model=lp, model_author='Mohammed',
            model_description="Label Propagation: kernel={0}, n_neighbors={1}, max_iter={2}, "
                              "tol={3}, n_jobs={4}".format(kernel, n_neighbors, max_iter, tol, n_jobs))
    else:
        th_model = SklearnClassification(
            model=lp, model_author='Mohammed',
            model_description="Label Propagation: kernel={0}, max_iter={1}, "
                              "tol={2}, n_jobs={3}".format(kernel, max_iter, tol, n_jobs))
    return th_model
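# Usage sketch for the dict form of `kernel` described above (hypothetical names,
# not part of the original module): any callable that returns an
# (n_samples_X, n_samples_Y) affinity matrix can be wrapped this way.
from sklearn.metrics.pairwise import rbf_kernel

def my_rbf(X, Y):
    # a custom affinity: plain RBF with a fixed gamma
    return rbf_kernel(X, Y, gamma=0.5)

th_model = label_propagation_classification(kernel={'name': 'my_rbf', 'method': my_rbf})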
def label_spr(self):
    RESULT_ACC_SS = 0
    for i in range(self.manyfit):
        # Initialisation of variables:
        self.init_variables()
        # PCA preprocessing:
        if (self.PCA_MODE):
            self.pca_preprocess(self.pca)
        # Semi-supervised algo
        if (self.ss_mod == 'LabSpr' and self.ss_kern == 'knn'):
            self.label_prop_model = LabelSpreading(kernel='knn', gamma=self.gamma,
                                                   n_neighbors=self.neighbors, alpha=self.alpha)
        elif (self.ss_mod == 'LabProp' and self.ss_kern == 'rbf'):
            self.label_prop_model = LabelPropagation(kernel='rbf', gamma=self.gamma,
                                                     n_neighbors=self.neighbors, alpha=self.alpha,
                                                     max_iter=10)
        else:
            self.label_prop_model = LabelPropagation(kernel=self.ss_kern, gamma=self.gamma,
                                                     n_neighbors=self.neighbors)
        print('Starting to fit. Run for shelter!')
        self.label_prop_model.fit(self.X_tot, self.y_tot)
        temp_acc = self.label_prop_model.score(self.X_valid_lab, self.y_valid)
        print('{} / {} :accuracy = {}'.format(i, self.manyfit, temp_acc))
        RESULT_ACC_SS += temp_acc
        self.y_tot = self.label_prop_model.transduction_
        self.y_submit = self.label_prop_model.predict(self.X_submit)
        if (self.datastate == "save"):
            self.save_to_csv(self.X_tot, self.y_tot, self.X_valid_lab, self.y_valid)
    RESULT_ACC_SS /= self.manyfit
    self.json_dict['ss_accuracy'] = RESULT_ACC_SS
    print('accuracy obtained on the test set of the ss algo:', RESULT_ACC_SS)
def ssl_label_prop(unlabel, clfs, true, x, y, test):
    y = [int(row) for row in y]
    df_noise_x, df_noise_y, noisy_labels = shuffle.run(unlabel, [-1] * len(unlabel), x, y)
    ground = []
    point = []
    for row in test:
        ground.append(row[0])
        point.append(row[1:])
    # sklearn algo
    label_prop_model = LabelPropagation(kernel='knn', n_neighbors=2, max_iter=400, tol=0.01)
    label_prop_model.fit(df_noise_x, df_noise_y)
    return label_prop_model.score(point, ground)
def test_LabelPropagation(*data):
    '''
    Test basic usage of LabelPropagation.
    '''
    X, y, unlabeled_indices = data
    # y must be copied because it is used again below
    y_train = np.copy(y)
    # unlabeled samples are marked with -1
    y_train[unlabeled_indices] = -1
    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(X, y_train)
    ### Report prediction accuracy
    # predicted labels
    predicted_labels = clf.transduction_[unlabeled_indices]
    # true labels
    true_labels = y[unlabeled_indices]
    print("Accuracy:%f" % metrics.accuracy_score(true_labels, predicted_labels))
def do_label_propagation(input_data, input_label, output=None, kernel='knn', gamma=None,
                         n_neighbors=10, alpha=1, max_iter=30, tol=0.001):
    n_neighbors += 1
    # input labels
    with open(input_label) as input_label_fh:
        label_lines = input_label_fh.readlines()
    label_lines = [int(_.strip()) for _ in label_lines]
    y = np.array(label_lines)
    size = len(y)
    # input data
    with open(input_data) as input_data_fh:
        data_lines = input_data_fh.readlines()[:size]
    data_lines = [_.strip() for _ in data_lines]
    X = np.array(np.mat(';'.join(data_lines)))
    label_prop_model = LabelPropagation(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors,
                                        alpha=alpha, max_iter=max_iter, tol=tol)
    label_prop_model.fit(X, y)
    prediction = label_prop_model.predict(X)
    if output:
        with open(output, 'w') as output_fh:
            for p in prediction:
                output_fh.write(str(p) + '\n')
    return label_prop_model
def tryLabelPropagation(goFast):
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file
    if goFast:
        training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
    else:
        training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

    from sklearn.semi_supervised import LabelPropagation
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import ParameterGrid

    propOperator = LabelPropagation(gamma=150)
    propOperator.fit(training_data[:3000], training_labels[:3000])
    score = accuracy_score(validation_labels, propOperator.predict(validation_data))
    print(str(score))
def __init__(self):
    self.model = LabelPropagation()  # (kernel='knn', alpha=1.0)
class IndicatorIdentifier(object):
    '''
    Identify an indicator for a document.
    '''

    def __init__(self):
        self.model = LabelPropagation()  # (kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()

    def readingDatabase(self):
        da = DocumentsAccess()
        labeledFile = "Database/Indicator/Indicators.xlsx"
        sheet = "Sheet1"
        df = da.readingDatabaseTetum(labeledFile, sheet, head=0)
        cut = int(0.8 * df.shape[0])
        # re-duplicate the data => Result: one document has one label only
        columns = df.columns.tolist()
        columns.remove("Content")
        print(columns)
        X_train = []
        Y_train = []
        X_test = []
        Y_test = []
        for index, row in df.iterrows():
            labels = list(set([row[col] for col in columns if not pd.isnull(row[col])]))
            content = row["Content"]
            if index < cut:
                # training part
                for label in labels:
                    X_train.append(content)
                    Y_train.append(label)
            else:
                X_test.append(content)
                Y_test.append(labels)
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist()
        print(len(unlabeledData))
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist()
        print(len(unlabeledData2))
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist()
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print(len(unlabeledData4))
        '''
        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        print(len(unlabeledData))
        #print(unlabeledData[0])
        return (X_train, Y_train, X_test, Y_test, unlabeledData)

    def preprocessData(self, X):
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)

    def train(self, X, Y):
        '''
        Goal: Batch training (use only once at the beginning.
        After that, use the updateNewInformation function to update new information from new data)
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()
        self.model.fit(X, Y)

    def test(self, X):
        '''
        Goal: predict a new document
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)
        return predictedY

    def updateNewInformation(self, x1, y1):
        '''
        Goal: Update the information from the new data (online learning)
        Run re-training of the model at the weekend
        '''
        #self.model.partial_fit(x1, y1)
        pass

    def featureExtractionTrain(self, X):
        self.vsm = VectorSpaceModel.createInstance("TFIDF")  # ("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable

    def featureExtractionPredict(self, X):
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        accuracy = metrics.accuracy_score(trueLabels, predictedLabels)
        precision = metrics.precision_score(trueLabels, predictedLabels, pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels, predictedLabels, pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels, predictedLabels, pos_label=None, average='weighted')
        accuracy = round(accuracy, 4)
        precision = round(precision, 4)
        recall = round(recall, 4)
        f1 = round(f1, 4)
        result = [("Accuracy", accuracy), ("Precision", precision), ("Recall", recall), ("f1", f1)]
        return result

    def run(self):
        # Reading data
        (X_train, Y_train, X_test, Y_test, unlabeledData) = self.readingDatabase()
        print("Training size: " + str(len(X_train)))
        print("Test size: " + str(len(X_test)))
        '''
        X_train = X_train[:100]
        Y_train = Y_train[:100]
        X_test = X_test[:100]
        Y_test = Y_test[:100]
        '''
        print("Finish reading database.")
        #print(FreqDist(indicators).most_common())
        k = 0
        dictLabel = FreqDist(Y_train)
        for key in dictLabel:
            dictLabel[key] = k
            k += 1
        Y_train = [dictLabel[ind] for ind in Y_train]
        Y_test = [[dictLabel[ind] for ind in labels] for labels in Y_test]
        '''
        random.seed(123456)
        # Training
        z = list(zip(labeledData, indicators))
        random.shuffle(z)
        labeledData, indicators = zip(*z)
        X_train = list(labeledData[:cut])
        Y_train = list(indicators[:cut])
        X_test = list(labeledData[cut:])
        Y_test = list(indicators[cut:])
        '''
        X_train += unlabeledData
        Y_train += (-1 * np.ones((len(unlabeledData)), dtype=int)).tolist()
        #pprint(X_train)
        #print(Y_train)
        #print(X_train[cut-2:cut+2])
        #print(Y_train[cut-2:cut+2])
        print("Training...")
        self.train(X_train, Y_train)
        # Testing
        print("Testing...")
        Y_predicted = self.test(X_test)
        print(Y_predicted)
        # Y_predicted only needs to be one of the true labels in order to count as correct
        for i in range(len(Y_predicted)):
            lab = Y_predicted[i]
            if lab in Y_test[i]:
                Y_test[i] = lab
            else:
                Y_test[i] = -1
        (accuracy, _, _, _) = self.evaluation(Y_test, Y_predicted)
        print(accuracy)
max_iter: 30
    Complexity control for knn
n_neighbors: 7
    Parameter for knn, how many neighbors to consider
alpha: float
    Clamping factor
tol: 0.001
    Convergence tolerance: threshold to consider the system at steady state
"""
from sklearn.semi_supervised import LabelPropagation
from sklearn import metrics
import numpy as np

# K nearest neighbors model ensures we don't run over our memory
# rbf kernel needs a complete graph, so it requires feature selection
lp_model = LabelPropagation(kernel='knn')  # Label Propagation model

Xtr = np.genfromtxt("data/Kaggle.X1.train.txt", delimiter=',')  # Get X training data
Ytr_labels = np.genfromtxt("data/Kaggle.Y.labels.train.txt", delimiter=',')  # Get classification data

# Unlabeled points - random size for now.
unlabeled_points = np.where(np.random.randint(0, 2, size=len(Ytr_labels)))
labels = np.copy(Ytr_labels)  # Save training labels for testing
labels[unlabeled_points] = -1  # Set unlabeled value, classes: 0, 1

lp_model.fit(Xtr, labels)  # Train

#############################################
# Models use n_neighbors and max_iter to control the kernel
#############################################
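# A possible continuation (an assumption, not part of the original script): compare
# the transduced labels on the points that were hidden against the labels saved in
# Ytr_labels above to estimate transductive accuracy. Relies on lp_model,
# unlabeled_points, Ytr_labels and metrics defined in the snippet above.
predicted = lp_model.transduction_[unlabeled_points]
print("Transductive accuracy: %f" % metrics.accuracy_score(Ytr_labels[unlabeled_points], predicted))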
# count number of missing labels
noLabels = [1 for i in Y_train if i == -1]
labels = [1 for i in Y_train if not i == -1]
print('unlabeled: ', np.sum(noLabels))
print('labeled: ', np.sum(labels))

# In[8]:

from sklearn.semi_supervised import LabelPropagation as LP
from sklearn.semi_supervised import LabelSpreading as LS

# In[17]:

lspr = LP(gamma=70)
lspr.fit(X_norm, Ytrain)

# In[15]:

print('nofClasses: ', lspr.classes_)

# In[16]:

pred = lspr.predict(X_norm)
notN = [1 for i in pred if i > 0.0]
print(sum(notN))
class SentimentAnalysis(object):
    '''
    Identify the sentiment of each document.
    '''

    def __init__(self):
        self.model = LabelPropagation()  # (kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()

    def readingDatabase(self):
        da = DocumentsAccess()
        filePos = "Database/Sentiment/Sentiment/PoliceRelations/positive.xlsx"
        sheet = "Sheet1"
        posData = da.readingDatabaseTetum(filePos, sheet)
        posData = posData[0].tolist()
        print(len(posData))
        print(posData[0])
        fileNeg = "Database/Sentiment/Sentiment/PoliceRelations/negative.xlsx"
        sheet = "Sheet1"
        negData = da.readingDatabaseTetum(fileNeg, sheet)
        negData = negData[0].tolist()
        print(len(negData))
        print(negData[0])
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist()
        print(len(unlabeledData))
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist()
        print(len(unlabeledData2))
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist()
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print(len(unlabeledData4))
        '''
        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        print(len(unlabeledData))
        print(unlabeledData[0])
        return (posData, negData, unlabeledData)

    def preprocessData(self, X):
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)

    def train(self, X, Y):
        '''
        Goal: Batch training (use only once at the beginning.
        After that, use the updateNewInformation function to update new information from new data)
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()
        self.model.fit(X, Y)

    def test(self, X):
        '''
        Goal: predict a new document
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)
        return predictedY

    def updateNewInformation(self, x1, y1):
        '''
        Goal: Update the information from the new data (online learning)
        Run re-training of the model at the weekend
        '''
        #self.model.partial_fit(x1, y1)
        pass

    def featureExtractionTrain(self, X):
        self.vsm = VectorSpaceModel.createInstance("TFIDF")  # ("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable

    def featureExtractionPredict(self, X):
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        accuracy = metrics.accuracy_score(trueLabels, predictedLabels)
        precision = metrics.precision_score(trueLabels, predictedLabels, pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels, predictedLabels, pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels, predictedLabels, pos_label=None, average='weighted')
        accuracy = round(accuracy, 4)
        precision = round(precision, 4)
        recall = round(recall, 4)
        f1 = round(f1, 4)
        result = [("Accuracy", accuracy), ("Precision", precision), ("Recall", recall), ("f1", f1)]
        return result

    def run(self):
        # Reading data
        (posData, negData, unlabeledData) = self.readingDatabase()
        print("Finish reading database.")
        # Divide training and test data
        cut = 10
        posDataTrain = posData[:cut]
        negDataTrain = negData[:cut]
        posDataTest = posData[cut:]
        negDataTest = negData[cut:]
        random.seed(123456)
        # Training
        X_train = posDataTrain + negDataTrain + unlabeledData
        Y_train = (np.ones((len(posDataTrain)), dtype=int).tolist()
                   + np.zeros((len(negDataTrain)), dtype=int).tolist()
                   + (-1 * np.ones((len(unlabeledData)), dtype=int)).tolist())
        z = list(zip(X_train, Y_train))
        random.shuffle(z)
        X_train, Y_train = zip(*z)
        self.train(X_train, Y_train)
        # Testing
        X_test = posDataTest + negDataTest
        Y_test = np.ones((len(posDataTest)), dtype=int).tolist() + np.zeros((len(negDataTest)), dtype=int).tolist()
        z = list(zip(X_test, Y_test))
        random.shuffle(z)
        X_test, Y_test = zip(*z)
        Y_predicted = self.test(X_test)
        print(Y_predicted)
        (accuracy, precision, recall, f1) = self.evaluation(Y_test, Y_predicted)
        print((accuracy, precision, recall, f1))
def __init__(self, P):
    LabelPropagation.__init__(self, kernel='knn')
    self.P = P
]

names = [
    # "propagation",
    "spreading",
]

for grid in params:
    param_grid = list(ParameterGrid(grid))
    for param in param_grid:
        for name in names:
            if param["kernel"] == 'rbf':
                if name == "propagation":
                    clf = LabelPropagation(kernel=param["kernel"], gamma=param["gamma"])
                else:
                    clf = LabelSpreading(kernel=param["kernel"], gamma=param["gamma"])
                extra_param = param["gamma"]
            else:
                if name == "propagation":
                    clf = LabelPropagation(kernel=param["kernel"], n_neighbors=param["n_neighbors"])
                else:
                    clf = LabelSpreading(kernel=param["kernel"], n_neighbors=param["n_neighbors"])
                extra_param = param["n_neighbors"]
            now = datetime.datetime.now()
            date_time = '{0:02d}_{1:02d}_{2:02d}_{3:02d}_{4:02d}'.format((now.year % 2000),