def semiLabelPropagation(feature_extractor, generator, val_generator, kernel, neighbors, gamma):
    """Fit LabelPropagation on CNN features of a Keras generator and predict
    labels for the validation generator.

    Samples whose filename starts with 'N' are treated as unlabeled (-1).

    Parameters: feature_extractor is a Keras model; generator/val_generator
    are Keras directory iterators; kernel/neighbors/gamma configure
    LabelPropagation. Returns the predicted class array for val_generator.
    """
    semi = LabelPropagation(kernel=kernel, n_neighbors=neighbors, gamma=gamma,
                            alpha=None, tol=0.001, max_iter=1000000)
    # BUG FIX: samples/batch_size can be fractional; predict_generator needs an
    # integer step count, and flooring would drop the final partial batch.
    steps = -(-generator.samples // generator.batch_size)  # ceil division
    features = feature_extractor.predict_generator(generator, steps=steps, verbose=1)
    # BUG FIX: copy before masking so the generator's own label array is not
    # mutated in place (the original clobbered generator.classes).
    classes = generator.classes.copy()
    for i in range(generator.samples):
        # Filenames beginning with 'N' mark unlabeled samples for propagation.
        if generator.filenames[i][0] == 'N':
            classes[i] = -1
    semi.fit(features, classes)
    val_steps = -(-val_generator.samples // val_generator.batch_size)
    val_features = feature_extractor.predict_generator(val_generator, steps=val_steps, verbose=1)
    predicted_classes = semi.predict(val_features)
    return predicted_classes
def sklearn_lp(X, y, output=None, kernel='knn', gamma=None, n_neighbors=10, alpha=1, max_iter=1000, tol=0.00001): from sklearn.cross_validation import train_test_split from sklearn.metrics import classification_report from sklearn.metrics import accuracy_score X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=3) label_prop_model = LabelPropagation(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, max_iter=max_iter, tol=tol) label_prop_model.fit(X_train, y_train) y_predict = label_prop_model.predict(X_test) print 'y_train: ', y_train print 'y_predict: ', y_predict print '+--------------------------------------------------------+' print '| Report +' print '+--------------------------------------------------------+' print classification_report(y_test, y_predict) print 'accuracy: ' + str(accuracy_score(y_test, y_predict)) print '\n\n'
def test_LabelPropagation(*data):
    """Exercise LabelPropagation on a partially labeled dataset.

    *data* unpacks to (X, y, unlabeled_indices, XPredict, yTrue); samples at
    `unlabeled_indices` are masked with -1 before fitting, and accuracy is
    reported on XPredict against yTrue.
    """
    X, y, unlabeled_indices, XPredict, yTrue = data
    # Mask on a copy so the caller's label array stays intact.
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    print(y_train)
    clf = LabelPropagation(max_iter=5, kernel='knn', n_neighbors=3, tol=1e-5)
    clf.fit(X, y_train)
    predicted_labels = clf.predict(XPredict)
    print(XPredict)
    print("Accuracy:%f" % metrics.accuracy_score(yTrue, predicted_labels))
def lb_prop_classify(network, labels):
    """Cross-validate a LabelPropagation classifier over `network`/`labels`.

    Prints mean accuracy, standard deviation, and an averaged percentile
    confusion matrix (Portuguese labels in the output), and returns the
    per-fold scores.
    """
    kf = StratifiedKFold(n_splits=10)
    scores = []
    cms = []
    # NOTE(review): kf.split yields (train, test); here the first element is
    # bound to `test_index` and the second to `train_index`, so the model is
    # fitted on the small fold and scored on the rest — confirm this inversion
    # is intentional (it matches a scarce-labels setting).
    for test_index, train_index in kf.split(network, labels):
        # Contiguous [min, max) range of the fold is used rather than the exact
        # index set — assumes fold indices are contiguous; TODO confirm.
        first_train_index, last_train_index = min(train_index), max(train_index)
        train_dataset = network[first_train_index:last_train_index]
        train_labels = labels[first_train_index:last_train_index]
        # Everything outside the training slice becomes the test set.
        test_dataset = np.delete(network, np.s_[first_train_index:last_train_index], 0)
        test_labels = np.delete(labels, np.s_[first_train_index:last_train_index], 0)
        label_spreading_model = LabelPropagation()
        label_spreading_model.fit(train_dataset, train_labels)
        scores.append(label_spreading_model.score(test_dataset, test_labels))
        prediction = label_spreading_model.predict(test_dataset)
        # NOTE(review): third positional argument of confusion_matrix is
        # `labels` only in older sklearn; newer versions require labels= keyword.
        cms.append(confusion_matrix(test_labels, prediction, label_spreading_model.classes_))
    print('label propagation media {}'.format(np.average(scores)))
    print('label propagation desvio padrao {}'.format(np.std(scores)))
    print('label propagation matriz de confusao')
    print(get_percentile_cm(get_average_cm(cms)))
    print('\n')
    return scores
def load_all_data():
    """Load the staged feature matrix, impute missing stage labels with
    LabelPropagation, and return train/test/validation splits.

    Returns (x_tr, y_tr, x_te, y_te, x_va, y_va).
    """
    # Read and partition the matrix from the feather file.
    data = pd.read_feather('../feature_stage_data_all.ftr')
    x = data[data.columns[3:]]   # feature columns start at index 3
    y = data['stage']
    o = data.observation         # read but unused here
    x = x.values
    x = normalize(x)
    y = y.values
    # Hold out a fixed contiguous slice as validation — magic offsets assume a
    # specific row ordering of the feather file; TODO confirm.
    x_va = x[4977:4977+3000]
    y_va = y[4977:4977+3000]
    x = np.concatenate((x[:4977], x[4977+3000:]))
    y = np.concatenate((y[:4977], y[4977+3000:]))
    # Masks for rows with / without a stage label (NaN = unlabeled).
    nnl = lambda a: np.invert(np.isnan(a))
    nul = lambda a: np.isnan(a)
    x_obs = x[nnl(y)]
    y_obs = y[nnl(y)]
    # Apply label propagation to impute the missing labels.
    x_nuls = x[nul(y)]
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x = np.concatenate([x_obs, x_nuls], axis=0)
    y = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size = 0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
def load_all_data():
    """Load features, impute missing stage labels via LabelPropagation,
    oversample classes with SMOTE, and return train/test/validation splits.

    Observations 8 and 9 are reserved as the validation set.
    Returns (x_tr, y_tr, x_te, y_te, x_va, y_va).
    """
    # Read and partition the matrix.
    data = pd.read_feather('./feature_stage_data_all.ftr')
    x = data[data.columns[3:]]   # feature columns start at index 3
    y = data['stage']
    o = data.observation
    x = x.values
    x = normalize(x)
    y = y.values
    # Validation = observations 8 and 9; everything else stays for training.
    x_va = x[[i in [8, 9] for i in o.values]]
    y_va = y[[i in [8, 9] for i in o.values]]
    x = x[[i not in [8, 9] for i in o.values]]
    y = y[[i not in [8, 9] for i in o.values]]
    o.unique()  # no-op; result discarded
    # Masks for labeled (non-NaN) vs unlabeled rows.
    nnl = lambda a: np.invert(np.isnan(a))
    nul = lambda a: np.isnan(a)
    x_obs = x[nnl(y)]
    y_obs = y[nnl(y)]
    # Apply label propagation to fill in the unlabeled rows.
    x_nuls = x[nul(y)]
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x_all = np.concatenate([x_obs, x_nuls], axis=0)
    y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)
    # Oversample the stages to balance class counts.
    zen = SMOTE(random_state=8675309)
    x, y = zen.fit_resample(x_all, y_all)
    x, y = shuffle(x, y, random_state=42)
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
def _label_propagation(df):
    """Propagate labels across every row of *df* and return the predictions.

    Rows whose generated label is missing are marked -1 (pandas surfaces the
    sentinel as NaN) so LabelPropagation treats them as unlabeled.
    """
    features = _generate_features(df).toarray()
    # pandas returns NaN where the label was -1, so restore the sentinel.
    targets = _generate_labels(df).fillna(-1)
    model = LabelPropagation()
    model.fit(features, targets)
    return model.predict(features)
def doLabelPropagation(self, X, y, **kwargs):
    """Fit a LabelPropagation model on (X, y) and return class probabilities
    for X. Extra keyword arguments are forwarded to the estimator."""
    model = LabelPropagation(**kwargs)
    if self.verbose > 2:
        print("X, y shapes: ", X.shape, y.shape)
        print(" y hist: ", np.histogram(y))
    model.fit(X, y)
    if self.verbose > 2:
        print("lp_predict:", np.histogram(model.predict(X)))
    return model.predict_proba(X)
def ss_test(images, labels, unlabeled_images, test_images):
    """Semi-supervised pipeline: stack labeled and unlabeled images, mark the
    unlabeled block with -1, propagate labels, and submit test predictions."""
    unlabeled_marks = -np.ones((unlabeled_images.shape[0],))
    combined_images = np.vstack((images, unlabeled_images))
    combined_labels = np.concatenate((labels, unlabeled_marks), axis=0)
    model = LabelPropagation()
    model.fit(combined_images, combined_labels)
    create_submission(model.predict(test_images))
def ss_test(images, labels, unlabeled_images, test_images):
    """Train LabelPropagation on labeled + unlabeled images (unlabeled rows
    flagged with -1) and create a submission from the test predictions."""
    stacked = np.vstack((images, unlabeled_images))
    # -1 marks each appended unlabeled row for the propagation algorithm.
    padded_labels = np.concatenate((labels, -np.ones((unlabeled_images.shape[0],))), axis=0)
    model = LabelPropagation()
    model.fit(stacked, padded_labels)
    predictions = model.predict(test_images)
    create_submission(predictions)
def LP(source_train, target_test, label1, label3):
    """Fit LabelPropagation on the source domain and evaluate on the target.

    Returns (accuracy, recall, f1, precision), each weighted-averaged to
    account for class imbalance, and prints them.
    """
    model = LabelPropagation()
    model.fit(source_train, label1)
    source_predict = model.predict(target_test)
    # Evaluation metrics against the target-domain ground truth.
    accuracy = metrics.accuracy_score(label3, source_predict)
    recall = metrics.recall_score(label3, source_predict, average='weighted')
    f1 = metrics.f1_score(label3, source_predict, average='weighted')
    precision = metrics.precision_score(label3, source_predict, average='weighted')
    print("LP:", accuracy, recall, f1, precision)
    return accuracy, recall, f1, precision
def semi_shuffle_estimator(n_splits=10, test_size=0.6, seed=0, gamma=4, n_neighbors=6, max_iter=1000):
    """Evaluate LabelPropagation with stratified shuffle splits, treating the
    "test" fold as unlabeled data during fitting.

    Relies on module-level globals: X, Y (data), testsize_list,
    train_socres_list, test_scores_list (result accumulators).
    Prints and records the mean train/test accuracies.
    """
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
    i = 0
    testsize_list.append(test_size)
    train_scores = []
    test_scores =[]
    for label_index, unlabel_index in sss.split(X, Y):
        i += 1
        X_train = X.iloc[label_index]
        Y_train = Y.iloc[label_index]
        X_test = X.iloc[unlabel_index]
        Y_test = Y.iloc[unlabel_index]
        # The held-out fold participates in fitting with its labels hidden (-1).
        Y_unlabel = copy.deepcopy(Y_test)
        Y_unlabel['Class'] = -1
        X_new = pd.concat([X_train, X_test])
        Y_new = pd.concat([Y_train, Y_unlabel])
        # NOTE(review): permutes the ORIGINAL X.index and .take()s positionally
        # from the re-concatenated frames — assumes X has a default RangeIndex
        # of the same length; confirm against the caller.
        shuffle_index = np.random.permutation(X.index)
        X_new_shuffle = X_new.take(shuffle_index)
        Y_new_shuffle = Y_new.take(shuffle_index)
        lp = LabelPropagation(gamma=gamma, n_neighbors=n_neighbors, max_iter=max_iter)
        lp.fit(X_new_shuffle, Y_new_shuffle.values.ravel())
        Y_predict_train = lp.predict(X_train)
        Y_predict_test = lp.predict(X_test)
        train_scores.append(accuracy_score(Y_train, Y_predict_train))
        test_scores.append(accuracy_score(Y_test, Y_predict_test))
        # print("-------Cross_validation epoch {}--------".format(i))
        # print("The accuracy in train set:", accuracy_score(Y_train, Y_predict_train))
        # print("The accuracy in test set:", accuracy_score(Y_test, Y_predict_test))
    mean_train_score = np.array(train_scores).mean()
    mean_test_score = np.array(test_scores).mean()
    print("For test size {}, the mean accuracy in train set is {}".format(test_size, mean_train_score))
    print("For test size {}, the mean accuracy in test set is {}".format(test_size, mean_test_score))
    train_socres_list.append(mean_train_score)
    test_scores_list.append(mean_test_score)
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    """Run the no-transfer-learning baselines on one window/position pair.

    Fits two semi-supervised graph models and three supervised classifiers on
    (trainX, trainY), evaluates each on (testX, testY) via check_accuracy, and
    returns a one-row DataFrame with every score plus its info payload.
    """
    # --- semi-supervised baselines ---
    propagation = LabelPropagation(kernel='knn')
    propagation.fit(trainX, trainY)
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(testY, propagation.predict(testX))

    spreading = LabelSpreading(kernel='knn')
    spreading.fit(trainX, trainY)
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(testY, spreading.predict(testX))

    # --- supervised baselines (no transfer learning) ---
    logistic = LogisticRegression()
    logistic.fit(trainX, trainY)
    accLR, acc_LR_INFO = check_accuracy(testY, logistic.predict(testX))

    decision_tree = tree.DecisionTreeClassifier()
    decision_tree.fit(trainX, trainY)
    accDT, acc_DT_INFO = check_accuracy(testY, decision_tree.predict(testX))

    bernoulli = BernoulliNB()
    bernoulli.fit(trainX, trainY)
    accNB, acc_NB_INFO = check_accuracy(testY, bernoulli.predict(testX))

    return pd.DataFrame([{
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,
        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
        'acc_LR': accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO)
    }])
def do_evaluation(X, y, kernel='knn', output=None, gamma=None, n_neighbors=10, alpha=1, max_iter=1000, tol=0.00001): # from sklearn.cross_validation import train_test_split from sklearn.metrics import classification_report from sklearn.metrics import accuracy_score import random size = len(X) random_seeds = np.random.randint(1, 1000, size=10) for i in range(len(random_seeds)): # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=random_seeds[i]) labels = np.copy(y) tmp = np.arange(size) np.random.shuffle(tmp) train_test_split_rate = int(size*.9) random_unlabeled_points = tmp[:train_test_split_rate] labeled_points = tmp[train_test_split_rate:] random_unlabeled_points.sort() X_test = [X[_] for _ in range(size) if _ in random_unlabeled_points] y_test = [y[_] for _ in range(size) if _ in random_unlabeled_points] y_train = [y[_] for _ in range(size) if _ in labeled_points] labels[random_unlabeled_points] = -1 label_prop_model = LabelPropagation(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, max_iter=max_iter, tol=tol) label_prop_model.fit(X, labels) y_predict = label_prop_model.predict(X_test) print '+--------------------------------------------------------+' print '| Report |' print '+--------------------------------------------------------+' print 'test round:', (i+1), ' with random seed: ', random_seeds[i] print 'training label: ', y_train print 'training post id: ', [_+1 for _ in labeled_points] print 'predict label: ', y_predict print classification_report(y_test, y_predict) print 'accuracy: ' + str(accuracy_score(y_test, y_predict)) print '\n\n'
def process(self, n_components):
    """Fit LabelPropagation on data reduced to *n_components* dimensions,
    plot the confusion matrix, and record the mean test accuracy."""
    X_train, y_train, X_test, y_test = self.preprocess(n_components)
    model = LabelPropagation(n_jobs=-1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mean_acc = model.score(X_test, y_test)
    # Confusion matrix image is named after the component count.
    plot_confusion_matrix(y_test, predictions, self.labels, normalize=False,
                          figname=('lp_comps_%d.png' % n_components))
    self.m_acc.append(mean_acc)
    print(model.get_params())
def execute(self, function_context: FunctionContext, input_list: List) -> List:
    """Train a LabelPropagation face-labeling model until self-consistency
    accuracy reaches 0.95, save it, and register the model version.

    input_list[0]: feature rows (face_id + 512 embedding columns).
    input_list[1]: (face_id, label) pairs.
    Returns an empty list (results flow through model registration).
    """
    x_train = input_list[0]
    y_label = input_list[1]
    input_dim = 512
    # Build column names: face_id followed by col1..col512.
    x_train_columns = list()
    x_train_columns.append('face_id')
    for i in range(1, input_dim + 1):
        x_train_columns.append('col' + str(i))
    trainDf = pd.DataFrame(x_train, columns=x_train_columns)
    labelDf = pd.DataFrame(y_label, columns=('face_id', 'label'))
    # Inner join keeps only faces that have a label row.
    trainDf = pd.merge(trainDf, labelDf, on=['face_id'], how='inner', suffixes=('_x', '_y'))
    y_label = trainDf['label'].values.astype(int)
    trainDf = trainDf.drop('face_id', 1)
    x_train = trainDf.drop('label', 1).values
    label_prop_model = None
    score = 0.0
    # Retry loop: mask a random ~30% of labels, propagate, and score the model
    # on the masked subset; repeat until accuracy >= 0.95.
    while score < 0.95:
        print('before train ACC:', score)
        random_unlabeled_points = np.random.rand(len(y_label))
        random_unlabeled_points = random_unlabeled_points < 0.3  # True => hide this label
        Y = y_label[random_unlabeled_points]  # ground truth before masking
        y_label[random_unlabeled_points] = -1  # -1 marks unlabeled samples
        label_prop_model = LabelPropagation()
        label_prop_model.fit(x_train, y_label)
        Y_pred = label_prop_model.predict(x_train)
        Y_pred = Y_pred[random_unlabeled_points]
        score = accuracy_score(Y, Y_pred)
        # Restore the hidden labels before the next iteration.
        y_label[random_unlabeled_points] = Y
    model_path = os.path.dirname(os.path.abspath(__file__)) + '/model'
    print('Save trained model to {}'.format(model_path))
    # NOTE(review): the model is dumped only when the path does NOT already
    # exist, so a stale file is never overwritten — confirm this is intended.
    if not os.path.exists(model_path):
        joblib.dump(label_prop_model, model_path)
    model_meta: ModelMeta = function_context.node_spec.output_model
    # Register model version to notify that cluster serving is ready to start
    # loading the registered model version.
    register_model_version(model=model_meta, model_path=model_path)
    return []
def label_propagation(self, X_train, y, X_test):
    """Fit LabelPropagation on the densified union of train and test rows
    (test rows are the unlabeled part of *y*) and return the test-set
    labels, along with the fitted model."""
    model = LabelPropagation()
    print("X_train Shape :", X_train.shape, type(X_train))
    print("X_test shape : ", X_test.shape, type(X_test))
    print("y shape : ", y.shape)
    # Sparse matrices are densified so both splits can share one graph.
    X = np.concatenate((X_train.todense(), X_test.todense()), axis=0)
    print("X shape now ", X.shape)
    print("Y shape now ", y.shape)
    model.fit(X, y)
    final_labels = model.predict(X_test)
    label_prob = model.predict_proba(X_test)
    print(compare_labels_probabilities().compare(label_prob, final_labels))
    return final_labels, model
def label_prop():
    """Propagate 'Leak Found' labels from df9 across df10/df12 (module-level
    frames) using a safe RBF kernel, print the predictions, and plot the
    predicted class counts."""
    known = df9.loc[df9['Leak Found'].notnull(), ['Leak Found']]
    clf = LabelPropagation(kernel=rbf_kernel_safe)
    clf.fit(df10, known.values.ravel())
    predictions = np.array(clf.predict(df12))
    pred_frame = pd.DataFrame(predictions, columns=['Prediction'])
    combined = pd.concat([df12, pred_frame], axis=1)
    print(combined[['ID', 'Prediction']])
    # print(combined.loc[combined['Prediction'] == 'Y'])
    plt.style.use('seaborn')
    combined['Prediction'].value_counts().plot(kind='bar')
    plt.xticks([0, 1, 2], ['NO', 'YES', 'N-PRV'])
    plt.ylabel('Number of occurrences after prediction by RBF algorithm')
    plt.show()
class _LabelPropagationImpl:
    """Thin delegate around the wrapped `Op` estimator, exposing the standard
    fit / predict / predict_proba surface."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped estimator, forwarding y only when given; returns
        self so calls can be chained."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
def iris_semi():
    """Visual comparison of LabelPropagation at increasing label coverage
    (30%/50%/80%) against a fully supervised SVC on PCA-reduced iris data.

    Draws a 2x2 grid of decision regions with the true points overlaid.
    """
    X, y = load_iris(return_X_y=True)
    print('data shape: {}'.format(X.shape))
    # Reduce to 2D for visualization.
    pca = PCA(n_components=2)
    X = pca.fit_transform(X)
    # Set up the canvas and the two colormaps (regions vs points).
    from matplotlib.colors import ListedColormap
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
    fig = plt.figure()
    for i, threshold in enumerate([0.3, 0.5, 0.8, 1]):
        new_y = y.copy()
        if threshold < 1:
            rng = np.random.RandomState(0)
            # Uniform draws <= threshold select the samples to UNLABEL.
            random_unlabeled = rng.rand(
                len(y)) <= threshold
            # Unlabeled samples get the -1 sentinel.
            new_y[random_unlabeled] = -1
            model_name = 'LabelPropagation'
            model = LabelPropagation(kernel='rbf', gamma=20)
        else:
            # threshold == 1: fully labeled baseline, plain SVC.
            model_name = 'SVC'
            model = SVC()
        model.fit(X, new_y)
        # Build a mesh over the feature plane for the decision-region plot.
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.05),
                             np.arange(y_min, y_max, 0.05))
        new_x = np.c_[xx.ravel(), yy.ravel()]
        z = model.predict(new_x)
        # Plot predicted regions for the mesh.
        ax = fig.add_subplot(2, 2, i + 1)
        ax.pcolormesh(xx, yy, z.reshape(xx.shape), cmap=cmap_light, alpha=0.5)
        # Overlay the true data distribution.
        ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        ax.set_title('{}, {}% data'.format(model_name, threshold * 100))
    plt.show()
def do_label_propagation(input_data, input_label, output=None, kernel='knn', gamma=None, n_neighbors=10, alpha=1, max_iter=30, tol=0.001): n_neighbors += 1 # input label input_label_fh = open(input_label, 'rb') label_lines = input_label_fh.readlines() label_lines = [int(_.strip()) for _ in label_lines] y = np.array(label_lines) input_label_fh.close() size = len(y) # input data input_data_fh = open(input_data, 'rb') data_lines = input_data_fh.readlines()[:size] data_lines = [_.strip() for _ in data_lines] X = np.array(np.mat(';'.join(data_lines))) input_data_fh.close() label_prop_model = LabelPropagation(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, max_iter=max_iter, tol=tol) label_prop_model.fit(X, y) prediction = label_prop_model.predict(X) if output: output_fh = open(output, 'wb') for p in prediction: output_fh.write(str(p)+'\n') output_fh.close() return label_prop_model
def load_all_data():
    """Load staged features, undersample classes to balance, and return
    train/test/validation splits.

    Returns (x_tr, y_tr, x_te, y_te, x_va, y_va).

    NOTE(review): a LabelPropagation model is fitted and x_all/y_all are
    assembled, but the returned split is built from x_btr/y_btr which do NOT
    use them — the imputation appears to be dead code here; confirm intent.
    """
    # Read and partition the matrix.
    data = pd.read_feather('../feature_stage_data_all.ftr')
    x = data[data.columns[3:]]   # feature columns start at index 3
    y = data['stage']
    x = x.values
    x = normalize(x)
    y = y.values
    # Fixed contiguous slice held out as validation — assumes a specific row
    # ordering of the feather file; TODO confirm.
    x_va = x[4977:4977 + 3000]
    y_va = y[4977:4977 + 3000]
    x = np.concatenate((x[:4977], x[4977 + 3000:]))
    y = np.concatenate((y[:4977], y[4977 + 3000:]))
    # Masks for labeled (non-NaN) vs unlabeled rows.
    nnl = lambda a: np.invert(np.isnan(a))
    nul = lambda a: np.isnan(a)
    x_obs = x[nnl(y)]
    y_obs = y[nnl(y)]
    x_nuls = x[nul(y)]
    # Undersample the stages: cap every class at the size of the rarest one.
    x_obs, y_obs = shuffle(x_obs, y_obs, random_state=42)
    smpnum = min([sum(y_obs == i) for i in range(1, 6)])
    y_obs_us = y[y == 1][:smpnum]
    x_obs_us = x[y == 1][:smpnum]
    for i in range(2, 6):
        x_obs_us = np.concatenate([x_obs_us, x[y == i][:smpnum]])
        y_obs_us = np.concatenate([y_obs_us, y[y == i][:smpnum]])
    # Apply label propagation (see NOTE above: result currently unused).
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs_us, y_obs_us)
    x_all = np.concatenate([x_obs, x_nuls], axis=0)
    y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)
    # Undersample the stages again on the full (labeled) data.
    x, y = shuffle(x, y, random_state=42)
    smpnum = min([sum(y == i) for i in range(1, 6)])
    y_btr = y[y == 1][:smpnum]
    x_btr = x[y == 1][:smpnum]
    for i in range(2, 6):
        x_btr = np.concatenate([x_btr, x[y == i][:smpnum]])
        y_btr = np.concatenate([y_btr, y[y == i][:smpnum]])
    x_tr, x_te, y_tr, y_te = train_test_split(x_btr, y_btr, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
class LabelPropagationClassifier(Classifier): def __init__(self, matrixdatabase): self._matrix_database = matrixdatabase self._has_fit = False self._lbl = LabelPropagation() def learn(self, ingredients, cuisine): return def classify(self, ingredients): if not self._has_fit: matrix, classes = self._matrix_database.make_train_matrix() matrix = matrix.toarray() self._lbl = self._lbl.fit(matrix, classes) print 'Fitting complete...' self._has_fit = True output = self._lbl.predict(self._matrix_database.make_row_from_recipe(ingredients).toarray()) return output[0]
def hard_clamping(kernel, k, xTrain, yTrain, MI=10000, g=0.6):
    """Fit LabelPropagation (hard clamping) on the training set, compute
    transition/confusion statistics, and append a result row to HC.csv.

    NOTE(review): `yExpect` and `day_one` are not parameters — they must be
    module-level globals supplied by the surrounding script; confirm.
    """
    prop = LabelPropagation(kernel=kernel, n_neighbors=k, gamma=g, max_iter=MI, n_jobs=-1)
    prop.fit(xTrain, yTrain)
    # Transduced labels for the training points themselves.
    evaledY = prop.predict(xTrain)
    #def stats(trainY,evaledY,expectedY,day_one): return
    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, evaledY, yExpect, day_one)
    # 'HC' tags the row as the hard-clamping variant.
    results = [
        'HC', kernel, k, g, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
        missed_day1
    ]
    file_name = 'HC.csv'
    write_csv(file_name, results)
def semi_modelling(X_train, X_l, X_u, y_l, y_train, label_column):
    """Run three semi-supervised learners and stack their predictions.

    X_train/y_train: full (labeled + masked) data for LabelPropagation.
    X_l/y_l: labeled subset; X_u: unlabeled subset to predict.
    label_column: target column name passed to the pseudo-labeler.
    Returns an (n_unlabeled, 3) array with one prediction column per model.
    """
    lp_model = LabelPropagation(gamma=1, kernel='rbf', max_iter=100000, n_jobs=-1, n_neighbors=7, tol=0.001)
    # Tri-training ensemble of three tree-based classifiers.
    tt = TriTraining([
        ExtraTreesClassifier(max_depth=None, max_features='sqrt', min_samples_leaf=1,
                             min_samples_split=10, n_estimators=100, random_state=200),
        RandomForestClassifier(max_depth=50, max_features='sqrt', min_samples_leaf=1,
                               min_samples_split=2, n_estimators=100),
        XGBClassifier(max_depth=50, n_estimators=50, random_state=200)
    ])
    # Pseudo-labeling with an ExtraTrees base learner; 30% of unlabeled rows
    # are pseudo-labeled per round.
    pseudo = PseudoLabeler(ExtraTreesClassifier(max_depth=None, max_features='log2',
                                                min_samples_leaf=1, min_samples_split=10,
                                                n_estimators=100, random_state=200),
                           X_u, X_u.columns, label_column, sample_rate=0.3)
    lp_model.fit(X_train.values, y_train)
    tt.fit(X_l.values, y_l.values, X_u.values)
    pseudo.seed = 42
    pseudo.fit(X_l, y_l)
    lp_predict = lp_model.predict(X_u.values)
    tt_predict = tt.predict(X_u.values)
    pse_predict = pseudo.predict(X_u)
    # One column per model, one row per unlabeled sample.
    prediction_combine = np.vstack((lp_predict, tt_predict, pse_predict)).T
    return prediction_combine
def tryLabelPropagation(goFast): from sklearn.datasets import dump_svmlight_file, load_svmlight_file if goFast: training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True) validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True) testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True) else: training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True) validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True) testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True) from sklearn.semi_supervised import LabelPropagation from sklearn.metrics import accuracy_score from sklearn.grid_search import ParameterGrid propOperator = LabelPropagation(gamma=150) propOperator.fit(training_data[:3000],training_labels[:3000]) score = accuracy_score(validation_labels, propOperator.predict(validation_data)) print str(score)
def evaluate_model(self, X, Y, gamma, seed, max_iter=100000):
    """Train LabelPropagation on a stratified 80/20 split and evaluate on the
    labeled portion of the held-out 20%.

    Y may contain -1 (unlabeled) alongside classes 0 and 1; metrics are
    computed only on test samples whose true label is 0 or 1.
    Returns (accuracy, precision, auc, confusion_matrix).
    """
    # Set random seed for reproducibility.
    np.random.seed(seed)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y,
                                                        test_size=0.20,
                                                        random_state=seed)
    lp_model = LabelPropagation(kernel='rbf', gamma=gamma, max_iter=max_iter)
    lp_model.fit(X_train, Y_train)
    # Test model on held-out data.
    predicted_labels = lp_model.predict(X_test)
    predicted_prob = lp_model.predict_proba(X_test)
    # Keep only test points with a known true label (0 or 1); p[1] is the
    # positive-class probability.
    labeled_prob = [
        p[1] for i, p in enumerate(predicted_prob) if Y_test[i] in [0, 1]
    ]
    labels = [
        p for i, p in enumerate(predicted_labels) if Y_test[i] in [0, 1]
    ]
    true_labels = [l for l in Y_test if l in [0, 1]]
    # Evaluation metrics on the labeled test subset.
    accuracy = metrics.accuracy_score(true_labels, labels)
    precision = metrics.precision_score(true_labels, labels)
    auc = metrics.roc_auc_score(true_labels, labeled_prob)
    conf = metrics.confusion_matrix(true_labels, labels)
    return accuracy, precision, auc, conf
def simple_test():
    """Benchmark five learners on the 'dna' dataset with 1% labeled data:
    random forest (supervised baseline), label propagation, TSVM, fixed-theta
    self-learning (FSLA), and multi-class self-learning (MSLA).

    Prints accuracy/F1 and wall-clock time for each, then plots the results.
    """
    # Read and split data: 1% labeled, 99% treated as unlabeled.
    read_data = ReadDataset()
    x, y = read_data.read("dna")
    x_l, x_u, y_l, y_u = train_test_split(x, y, test_size=0.99, random_state=40)
    print("shape of labeled part:")
    print(x_l.shape, y_l.shape)
    print("shape of unlabeled part:")
    print(x_u.shape, y_u.shape)
    print("class distribution of labeled examples:")
    print([np.sum(y_l == i) for i in range(len(np.unique(y)))])
    print("class distribution of unlabeled examples:")
    print([np.sum(y_u == i) for i in range(len(np.unique(y)))])
    print()
    # Partially labeled view: combined data with -1 for unlabeled rows.
    x_train, y_train, y_u_shuffled = partially_labeled_view(x_l, y_l, x_u, y_u)
    # Purely supervised baseline on the labeled 1% only.
    print("random forest:")
    t0 = time.time()
    model = RandomForestClassifier(n_estimators=200, oob_score=True, n_jobs=-1, random_state=40)
    model.fit(x_l, y_l)
    y_pred = model.predict(x_u)
    acc = [accuracy_score(y_u, y_pred)]
    f1 = [f1_score(y_u, y_pred, average="weighted")]
    print("accuracy:", acc[0])
    print("f1-score:", f1[0])
    t1 = time.time()
    print("random forest is done")
    print("time:", t1 - t0, "seconds")
    print()
    # Label propagation on the combined view; score transduction on the
    # unlabeled rows (y_train == -1).
    print("label propagation:")
    t0 = time.time()
    label_prop_model = LabelPropagation(gamma=0.01, n_jobs=-1, tol=1e-3)
    label_prop_model.fit(x_train, y_train)
    y_pred = label_prop_model.predict(x_train[y_train == -1, :])
    acc.append(accuracy_score(y_u_shuffled, y_pred))
    f1.append(f1_score(y_u_shuffled, y_pred, average="weighted"))
    print("accuracy:", acc[1])
    print("f1-score:", f1[1])
    t1 = time.time()
    print("label propagation is done!")
    print("time:", t1 - t0, "seconds")
    print()
    # Transductive SVM (one-vs-all).
    print("tsvm:")
    t0 = time.time()
    y_u_shuffled, y_pred = tsvm.ova_tsvm(x_l, y_l, x_u, y_u, db_name="dna", timeout=None)
    acc.append(accuracy_score(y_u_shuffled, y_pred))
    f1.append(f1_score(y_u_shuffled, y_pred, average="weighted"))
    print("accuracy:", acc[2])
    print("f1-score:", f1[2])
    t1 = time.time()
    print("tsvm is done!")
    print("time:", t1 - t0, "seconds")
    # Multi-class self-learning algorithm with fixed confidence threshold.
    theta = 0.7
    max_iter = 10
    print("fsla with theta={}:".format(theta))
    t0 = time.time()
    model = sl.fsla(x_l, y_l, x_u, theta, max_iter, random_state=40)
    y_pred = model.predict(x_u)
    acc.append(accuracy_score(y_u, y_pred))
    f1.append(f1_score(y_u, y_pred, average="weighted"))
    print("accuracy:", acc[3])
    print("f1-score:", f1[3])
    t1 = time.time()
    print("fsla is done!")
    print("time:", t1 - t0, "seconds")
    print()
    # Multi-class self-learning algorithm with automatically chosen theta.
    print("msla:")
    t0 = time.time()
    model, thetas = sl.msla(x_l, y_l, x_u, random_state=40)
    y_pred = model.predict(x_u)
    print("optimal theta at each step:")
    print(thetas)
    acc.append(accuracy_score(y_u, y_pred))
    f1.append(f1_score(y_u, y_pred, average="weighted"))
    print("accuracy:", acc[4])
    print("f1-score:", f1[4])
    t1 = time.time()
    print("msla is done!")
    print("time:", t1 - t0, "seconds")
    print()
    # Plot a graph comparing all five methods.
    plot_graph(acc, f1)
#########clf = LogisticRegression(multi_class='auto', solver='lbfgs').fit(X_train, y_train) ######### clf = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train) X = X[idx,:] y = y[idx] z = z[idx] X_train = X X_test = X[n_train:] y_train = np.concatenate([y[:n_train], -1*np.ones([n-n_train])]) y_test = y[n_train:] z_test = z[n_train:] g = np.mean(pairwise_distances(X)) clf = LabelPropagation(gamma = g).fit(X_train, y_train) y_pred = clf.predict(X_test) res = 100 * np.sum(y_pred == y_test) / y_test.shape[0] idx_1 = (z_test == 1) res_1 = 100 * np.sum(y_pred[idx_1] == y_test[idx_1]) / np.sum(idx_1) idx_0 = (z_test == 0) res_0 = 100 * np.sum(y_pred[idx_0] == y_test[idx_0]) / np.sum(idx_0) res_diff = np.abs(res_1 - res_0) res_var = np.var([res_1, res_0]) res_total.append(res) res_1_total.append(res_1)
class SentimentAnalysis(object):
    """Identify a sentiment (positive/negative) for each document using
    semi-supervised LabelPropagation over TF-IDF features.

    NOTE(review): this class uses Python-2 syntax (print statements,
    list-returning zip); it will not run unmodified under Python 3.
    """

    def __init__(self):
        # alpha/kernel alternatives left from experimentation.
        self.model = LabelPropagation()#(kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()

    def readingDatabase(self):
        """Load positive, negative, and unlabeled documents from the Excel
        databases and return them as (posData, negData, unlabeledData)."""
        da = DocumentsAccess()
        filePos = "Database/Sentiment/Sentiment/PoliceRelations/positive.xlsx"
        sheet = "Sheet1"
        posData = da.readingDatabaseTetum(filePos, sheet)
        posData = posData[0].tolist()
        print len(posData)
        print posData[0]
        fileNeg = "Database/Sentiment/Sentiment/PoliceRelations/negative.xlsx"
        sheet = "Sheet1"
        negData = da.readingDatabaseTetum(fileNeg, sheet)
        negData = negData[0].tolist()
        print len(negData)
        print negData[0]
        # Unlabeled documents come from three separate sources, concatenated.
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist()
        print len(unlabeledData)
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist()
        print len(unlabeledData2)
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist()
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print len(unlabeledData4)
        '''
        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        print len(unlabeledData)
        print unlabeledData[0]
        return (posData, negData, unlabeledData)

    def preprocessData(self, X):
        """Normalize the raw documents (case folding, minimum token size)."""
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)

    def train(self, X, Y):
        """Batch training. Use only once at the beginning; afterwards use
        updateNewInformation to fold in new data."""
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()
        self.model.fit(X, Y)

    def test(self, X):
        """Predict sentiment labels for new documents."""
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)
        return predictedY

    def updateNewInformation(self, x1, y1):
        """Placeholder for online updates from new data; the underlying model
        has no partial_fit, so retraining is done offline instead."""
        #self.model.partial_fit(x1,y1)
        pass

    def featureExtractionTrain(self, X):
        """Fit the TF-IDF vector space model and return the training matrix."""
        self.vsm = VectorSpaceModel.createInstance("TFIDF")#("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable

    def featureExtractionPredict(self, X):
        """Project documents into the already-fitted vector space."""
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        """Return [(name, value)] pairs for accuracy/precision/recall/F1,
        each rounded to 4 decimals (weighted averages)."""
        accuracy = metrics.accuracy_score(trueLabels,predictedLabels)
        precision = metrics.precision_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        accuracy = round(accuracy,4)
        precision = round(precision,4)
        recall = round(recall,4)
        f1 = round(f1,4)
        result = [("Accuracy",accuracy),("Precision",precision),("Recall",recall),("f1",f1)]
        return result

    def run(self):
        """End-to-end pipeline: read data, train on 10 labeled documents per
        class plus all unlabeled data, then evaluate on the remainder."""
        # Reading data
        (posData, negData, unlabeledData) = self.readingDatabase()
        print "Finish reading database."
        # Divide training and test data: first `cut` documents of each class
        # are used for training, the rest for testing.
        cut = 10
        posDataTrain = posData[:cut]
        negDataTrain = negData[:cut]
        posDataTest = posData[cut:]
        negDataTest = negData[cut:]
        random.seed(123456)
        # Training: 1 = positive, 0 = negative, -1 = unlabeled.
        X_train = posDataTrain + negDataTrain + unlabeledData
        Y_train = np.ones((len(posDataTrain)), dtype = int).tolist() + np.zeros((len(negDataTrain)), dtype = int).tolist() + (-1*np.ones((len(unlabeledData)), dtype = int)).tolist()
        # Shuffle the paired (document, label) lists together.
        z = zip(X_train, Y_train)
        random.shuffle(z)
        X_train, Y_train = zip(*z)
        self.train(X_train, Y_train)
        # Testing
        X_test = posDataTest + negDataTest
        Y_test = np.ones((len(posDataTest)), dtype = int).tolist() + np.zeros((len(negDataTest)), dtype = int).tolist()
        z = zip(X_test, Y_test)
        random.shuffle(z)
        X_test, Y_test = zip(*z)
        Y_predicted = self.test(X_test)
        print Y_predicted
        # NOTE(review): evaluation() returns a list of four (name, value)
        # tuples, so each unpacked variable below is a tuple, not a float.
        (accuracy,precision,recall,f1) = self.evaluation(Y_test,Y_predicted)
        print (accuracy,precision,recall,f1)
# In[17]: lspr = LP(gamma = 70) lspr.fit(X_norm,Ytrain) # In[15]: print('nofClasses: ',lspr.classes_) # In[16]: pred = lspr.predict(X_norm) notN = [1 for i in pred if i>0.0] print(sum(notN)) # In[12]: Y_pred = lspr.predict_proba(X_test) # In[13]: print(Y_pred.shape) # In[ ]:
date_time = '{0:02d}_{1:02d}_{2:02d}_{3:02d}_{4:02d}'.format((now.year%2000), now.month, now.day, now.hour, now.minute) #classification Type #clf = OneVsOneClassifier(clf) #clf = OneVsRestClassifier(clf) logging.info("start with training ") clf.fit(X_train, y_train) #y_pred = clf.predict(X_valid) #print("min:{0} max:{0}".format(y_pred.min(),y_pred.max())) #score = accuracy_score(y_valid, y_pred, True) print("found classes are {0}".format(clf.classes_)) y_test = clf.predict(X_test) y_test = y_test.astype(np.uint32) lib_IO.write_Y("Data/pr4/{0}_{1}_{2}_{3}".format(name,param["kernel"],extra_param,date_time),y_test,Ids=ids) #Gridsearch #grid_search = GridSearchCV(clf, param, scoring='accuracy',cv=10, n_jobs=-1, verbose=1) #grid_search.fit(X_train, y_train) #clf_tmp = grid_search.best_estimator_ #score = grid_search.best_score_ #best_param = grid_search.best_params_ #lib_IO.log_best_param_score(date_time,name,score,param) clf = None #time.sleep(30)
import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Semi-supervised demo on the iris dataset: hide roughly 70% of the labels
# (marked -1), propagate labels from the remaining labeled points, then score
# the imputed labels against the ground truth that was hidden.
iris = datasets.load_iris()
labels = np.copy(iris.target)

# Boolean mask selecting the samples whose labels will be hidden.
mask = np.random.rand(len(iris.target)) < 0.7
Y = labels[mask]       # ground truth kept aside for the hidden samples
labels[mask] = -1      # -1 marks "unlabeled" for LabelPropagation
print("Unlabeled Number:", list(labels).count(-1))

# Fit on the full feature matrix; only the -1 entries are treated as unknown.
model = LabelPropagation()
model.fit(iris.data, labels)

# Evaluate only on the samples whose labels were hidden.
Y_pred = model.predict(iris.data)[mask]
print("ACC:", accuracy_score(Y, Y_pred))
print("REC:", recall_score(Y, Y_pred, average="micro"))
print("F-Score", f1_score(Y, Y_pred, average="micro"))
from sklearn.semi_supervised import LabelPropagation
from sklearn import metrics
import numpy as np

# Semi-supervised label propagation on the Kaggle training set.
# The KNN kernel keeps the affinity graph sparse so it fits in memory;
# the rbf kernel needs a complete graph and would require feature selection.
lp_model = LabelPropagation(kernel = 'knn')  # Label Propagation model

# Load the feature matrix and class labels from CSV.
Xtr = np.genfromtxt("data/Kaggle.X1.train.txt", delimiter = ',')
# BUG FIX: the keyword argument was misspelled "delimter", which makes
# np.genfromtxt raise TypeError (unexpected keyword argument) at runtime.
Ytr_labels = np.genfromtxt("data/Kaggle.Y.labels.train.txt", delimiter = ',')

# Randomly hide about half of the labels. randint(0, 2) draws the same
# {0, 1} coin flips as the deprecated (and since-removed)
# np.random.random_integers(0, 1); the nonzero flips select points to unlabel.
unlabeled_points = np.where(np.random.randint(0, 2, size=len(Ytr_labels)))

labels = np.copy(Ytr_labels)   # keep the true labels for later evaluation
labels[unlabeled_points] = -1  # -1 marks "unlabeled"; classes are 0, 1
lp_model.fit(Xtr, labels)      # train

#############################################
# Models use n_neighbors and max_iteration to control kernel
#############################################

#############################################
# Test Functions
#############################################
# Mean squared error of the transductive predictions against the full labels.
yhat = lp_model.predict(Xtr)
mse = metrics.mean_squared_error(Ytr_labels, yhat)

###############################################
# Cross Validation
###############################################
def build_models(trainX, trainY, testX, testY, source_pos, target_pos, window):
    """Train and compare semi-supervised, plain, and transfer-learning models.

    Fits LabelPropagation/LabelSpreading, three baseline classifiers
    (LR / DT / BernoulliNB), the same three under four transfer-learning
    schemes (KMM, NN weighting, TCA, Subspace Alignment), and an ensemble,
    then returns one pandas DataFrame row with every accuracy plus its
    detail info (as produced by checkAccuracy, which returns an
    (accuracy, info) pair).

    NOTE(review): checkAccuracy, EnsembleClassifier and the libtlda
    classifiers are defined outside this chunk; source_pos/target_pos/window
    are passed through into the result row unchanged.
    """
    #######################
    ### SEMI-SUPERVISED ###
    ########################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = checkAccuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = checkAccuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # Plain (no transfer learning) baselines, fit on source, tested on target.
    # LogisticRegression
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = checkAccuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = checkAccuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = checkAccuracy(testY, predND)
    # print("WITHOUT TL ACC_LR:", accLR, " ACC_DT:", accDT, " ACC_NB:", accNB)
    ########################
    #### WITH TL ########
    ########################
    # Transfer-learning classifiers: fit(trainX, trainY, testX) uses the
    # unlabeled target features to adapt before predicting.
    ####################################################
    ### Kernel Mean Matching (Huang et al., 2006) ###
    # Decision Tree
    print("\n Kernel Mean Matching (Huang et al., 2006) ")
    classifier = ImportanceWeightedClassifier(iwe='kmm', loss="dtree")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_KMM, acc_DT_KMM_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_KMM)
    # Logistic Regression
    classifier = ImportanceWeightedClassifier(iwe='kmm', loss="logistic")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_KMM, acc_LR_KMM_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_KMM)
    # Naive Bayes Bernoulli
    classifier = ImportanceWeightedClassifier(iwe='kmm', loss="berno")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_KMM, acc_NB_KMM_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_KMM)
    ####################################################
    ### Nearest-neighbour-based weighting (Loog, 2015) ###
    # Decision Tree
    print("\n Nearest-neighbour-based weighting (Loog, 2015) ")
    classifier = ImportanceWeightedClassifier(iwe='nn', loss="dtree")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_NN, acc_DT_NN_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_NN)
    # Logistic Regression
    print("\n Nearest-neighbour-based weighting (Loog, 2015) ")
    classifier = ImportanceWeightedClassifier(iwe='nn', loss="logistic")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_NN, acc_LR_NN_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_NN)
    # Naive Bayes Bernoulli
    print("\n Nearest-neighbour-based weighting (Loog, 2015) ")
    classifier = ImportanceWeightedClassifier(iwe='nn', loss="berno")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_NN, acc_NB_NN_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_NN)
    ####################################################
    ### Transfer Component Analysis (Pan et al, 2009) ###
    # Decision Tree
    print("\n Transfer Component Analysis (Pan et al, 2009)")
    classifier = TransferComponentClassifier(loss="dtree", num_components=6)
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_TCA, acc_DT_TCA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_TCA)
    # Logistic Regression
    classifier = TransferComponentClassifier(loss="logistic", num_components=6)
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_TCA, acc_LR_TCA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_TCA)
    # Naive Bayes Bernoulli
    classifier = TransferComponentClassifier(loss="berno", num_components=6)
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_TCA, acc_NB_TCA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_TCA)
    ####################################################
    ### Subspace Alignment (Fernando et al., 2013) ###
    # Decision Tree
    print("\n Subspace Alignment (Fernando et al., 2013) ")
    classifier = SubspaceAlignedClassifier(loss="dtree")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_DT_SA, acc_DT_SA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_DT_SA)
    # Logistic Regression
    print("\n Subspace Alignment (Fernando et al., 2013) ")
    classifier = SubspaceAlignedClassifier(loss="logistic")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_LR_SA, acc_LR_SA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_LR_SA)
    # Naive Bayes Bernoulli
    print("\n Subspace Alignment (Fernando et al., 2013) ")
    classifier = SubspaceAlignedClassifier(loss="berno")
    classifier.fit(trainX, trainY, testX)
    pred_naive = classifier.predict(testX)
    acc_NB_SA, acc_NB_SA_INFO = checkAccuracy(testY, pred_naive)
    print("ACC:", acc_NB_SA)
    #################################
    ############# ENSEMBLE ##########
    #################################
    # Instantiate one classifier per (adaptation scheme, loss) combination;
    # only the three "dtree" variants are actually combined below.
    classifier_SA_DT = SubspaceAlignedClassifier(loss="dtree")
    classifier_SA_LR = SubspaceAlignedClassifier(loss="logistic")
    classifier_SA_NB = SubspaceAlignedClassifier(loss="berno")
    classifier_TCA_DT = TransferComponentClassifier(loss="dtree")
    classifier_TCA_LR = TransferComponentClassifier(loss="logistic")
    classifier_TCA_NB = TransferComponentClassifier(loss="berno")
    classifier_NN_DT = ImportanceWeightedClassifier(iwe='nn', loss="dtree")
    classifier_NN_LR = ImportanceWeightedClassifier(iwe='nn', loss="logistic")
    classifier_NN_NB = ImportanceWeightedClassifier(iwe='nn', loss="berno")
    classifier_KMM_DT = ImportanceWeightedClassifier(iwe='kmm', loss="dtree")
    classifier_KMM_LR = ImportanceWeightedClassifier(iwe='kmm', loss="logistic")
    classifier_KMM_NB = ImportanceWeightedClassifier(iwe='kmm', loss="berno")
    #
    eclf = EnsembleClassifier(clfs=[classifier_TCA_DT, classifier_NN_DT, classifier_KMM_DT])
    eclf.fit(trainX, trainY, testX)
    pred = eclf.predict_v2(testX)
    acc_ENSEMBLE, acc_ENSEMBLE_INFO = checkAccuracy(testY, pred)
    ########################
    #### RETURN ########
    ########################
    # Single-row DataFrame collecting every score and its INFO string.
    return pd.DataFrame([{
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,
        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
        'acc_ENSEMBLE': acc_ENSEMBLE,
        'acc_LR': accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO),
        'acc_LR_KMM': acc_LR_KMM,
        'acc_LR_KMM_INFO': str(acc_LR_KMM_INFO),
        'acc_LR_NN': acc_LR_NN,
        'acc_LR_NN_INFO': str(acc_LR_NN_INFO),
        'acc_LR_TCA': acc_LR_TCA,
        'acc_LR_TCA_INFO': str(acc_LR_TCA_INFO),
        'acc_LR_SA': acc_LR_SA,
        'acc_LR_SA_INFO': str(acc_LR_SA_INFO),
        'acc_DT_KMM': acc_DT_KMM,
        'acc_DT_KMM_INFO': str(acc_DT_KMM_INFO),
        'acc_DT_NN': acc_DT_NN,
        'acc_DT_NN_INFO': str(acc_DT_NN_INFO),
        'acc_DT_TCA': acc_DT_TCA,
        'acc_DT_TCA_INFO': str(acc_DT_TCA_INFO),
        'acc_DT_SA': acc_DT_SA,
        'acc_DT_SA_INFO': str(acc_DT_SA_INFO),
        'acc_NB_KMM': acc_NB_KMM,
        'acc_NB_KMM_INFO': str(acc_NB_KMM_INFO),
        'acc_NB_NN': acc_NB_NN,
        'acc_NB_NN_INFO': str(acc_NB_NN_INFO),
        'acc_NB_TCA': acc_NB_TCA,
        'acc_NB_TCA_INFO': str(acc_NB_TCA_INFO),
        'acc_NB_SA': acc_NB_SA,
        'acc_NB_SA_INFO': str(acc_NB_SA_INFO)
    }])
class IndicatorIdentifier(object):
    '''
    Identify an indicator (label) for a document using semi-supervised
    label propagation over TF-IDF features.

    Python 2 code (print statements). Depends on project helpers defined
    elsewhere: DocumentsAccess, tp (text preprocessing), VectorSpaceModel,
    FreqDist, metrics, pd, np.
    '''
    def __init__(self):
        # Semi-supervised classifier; unlabeled samples carry label -1.
        self.model = LabelPropagation() #(kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()

    def readingDatabase(self):
        '''
        Read the labeled indicator spreadsheet plus three unlabeled corpora.

        Returns (X_train, Y_train, X_test, Y_test, unlabeledData); the first
        80% of labeled rows become training data (one (content, label) pair
        per label), the rest test data (content, list-of-labels).
        '''
        da = DocumentsAccess()
        labeledFile = "Database/Indicator/Indicators.xlsx"
        sheet = "Sheet1"
        df = da.readingDatabaseTetum(labeledFile, sheet, head= 0)
        cut = int(0.8*df.shape[0])  # 80/20 train/test split point
        # re-duplicate the data => Result: one document has one label only
        columns = df.columns.tolist()
        columns.remove("Content")  # every non-Content column is a label slot
        print columns
        X_train = []
        Y_train = []
        X_test = []
        Y_test = []
        for index, row in df.iterrows():
            # unique non-null labels attached to this document
            labels = list(set([row[col] for col in columns if not pd.isnull(row[col])]))
            content = row["Content"]
            if index < cut: # training part
                # duplicate the content once per label so each training
                # sample carries exactly one label
                for label in labels:
                    X_train.append(content)
                    Y_train.append(label)
            else:
                # test samples keep their full label list (multi-label)
                X_test.append(content)
                Y_test.append(labels)
        # Unlabeled corpora used for semi-supervised propagation.
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist()
        print len(unlabeledData)
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist()
        print len(unlabeledData2)
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist()
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print len(unlabeledData4)
        '''
        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        print len(unlabeledData)
        #print unlabeledData[0]
        return (X_train, Y_train, X_test, Y_test, unlabeledData)

    def preprocessData(self, X):
        # Delegate tokenisation/normalisation to the project helper.
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)

    def train(self, X, Y):
        '''
        Goal: Batch training (Use only one time at the beginning.
        After that, use updateNewInformation function to update new information from new data)
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()  # LabelPropagation needs a dense matrix
        self.model.fit(X, Y)

    def test(self, X):
        '''
        Goal: predict a new document
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)
        return predictedY

    def updateNewInformation(self, x1, y1):
        '''
        Goal: Update the information from the new data (Online Learning)
        Run re-train model at weekend
        '''
        # NOTE(review): intentionally a no-op -- LabelPropagation has no
        # partial_fit; the plan is periodic batch retraining instead.
        #self.model.partial_fit(x1,y1)
        pass

    def featureExtractionTrain(self, X):
        # Build the TF-IDF vector space from the training corpus; the fitted
        # model is kept on self.vsm for featureExtractionPredict.
        self.vsm = VectorSpaceModel.createInstance("TFIDF")#("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable

    def featureExtractionPredict(self, X):
        # Project new documents into the vector space fitted during training.
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        '''
        Return [("Accuracy", a), ("Precision", p), ("Recall", r), ("f1", f)],
        each rounded to 4 decimals, using weighted averaging.
        '''
        accuracy = metrics.accuracy_score(trueLabels,predictedLabels)
        precision = metrics.precision_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        accuracy = round(accuracy,4)
        precision = round(precision,4)
        recall = round(recall,4)
        f1 = round(f1,4)
        result = [("Accuracy",accuracy),("Precision",precision),("Recall",recall),("f1",f1)]
        return result

    def run(self):
        '''End-to-end pipeline: read data, encode labels, train, evaluate.'''
        # Reading data
        (X_train, Y_train, X_test, Y_test, unlabeledData) = self.readingDatabase()
        print "Training size: " + str(len(X_train))
        print "Test size: " + str(len(X_test))
        '''
        X_train = X_train[:100]
        Y_train = Y_train[:100]
        X_test = X_test[:100]
        Y_test = Y_test[:100]
        '''
        print "Finish reading database."
        #print FreqDist(indicators).most_common()
        # Map each distinct label string to a small integer id.
        k = 0
        dictLabel = FreqDist(Y_train)
        for key in dictLabel:
            dictLabel[key] = k
            k+=1
        Y_train = [dictLabel[ind] for ind in Y_train]
        Y_test = [[dictLabel[ind] for ind in labels] for labels in Y_test]
        '''
        random.seed(123456)
        # Training
        z = zip(labeledData, indicators)
        random.shuffle(z)
        labeledData, indicators = zip(*z)
        X_train = list(labeledData[:cut])
        Y_train = list(indicators[:cut])
        X_test = list(labeledData[cut:])
        Y_test = list(indicators[cut:])
        '''
        # Append the unlabeled corpus with the semi-supervised marker -1.
        X_train += unlabeledData
        Y_train += (-1*np.ones((len(unlabeledData)), dtype = int)).tolist()
        #pprint(X_train)
        #print Y_train
        #print X_train[cut-2:cut+2]
        #print Y_train[cut-2:cut+2]
        print "Training..."
        self.train(X_train, Y_train)
        # Testing
        print "Testing..."
        Y_predicted = self.test(X_test)
        print Y_predicted
        # The Y_predicted only need to be one of the true labels in order to be calculated as correctness
        for i in range(len(Y_predicted)):
            lab = Y_predicted[i]
            if lab in Y_test[i]:
                Y_test[i] = lab
            else:
                Y_test[i] = -1
        (accuracy,_, _, _) = self.evaluation(Y_test,Y_predicted)
        print accuracy
def rses_model(neighbor_delta, labeled_data, unlabeled_data, labels, name, alpha):
    """Rough-set-based semi-supervised feature selection via ensemble selector.

    First propagates labels onto unlabeled_data with LabelPropagation, then
    greedily grows a feature subset A, keeping the attribute that most
    reduces the neighborhood decision error rate (NDER) per class.

    Returns (A, 0) where A is the selected feature-index set.
    NOTE(review): `alpha` is accepted but never used in this body;
    Neighborhood_Classifiers is a project class defined elsewhere.
    """
    def compute_NDER(dataset, radius, A, test_x, test_y):
        """
        Neighborhood decision error rate of feature subset A on the sample
        indices in test_x.

        :param A: feature set
        :param test_x: index of samples to evaluate
        :param test_y: full label vector, indexed by test_x
        :return: error rate in [0, 1] (1 when A is empty)

        NOTE(review): closes over `x`, `y` and `l` from the enclosing scope;
        they are assigned below, before the first call, so this is safe but
        order-sensitive. The denominator is the TOTAL sample count `l`, not
        len(test_x) -- presumably intentional, verify against the paper.
        """
        if not len(A):
            return 1
        clf = Neighborhood_Classifiers(dataset, x, y, A, radius)
        pre = []
        for s in test_x:
            pre.append(clf.predict(s))
        cnt = np.sum(np.not_equal(pre, test_y[test_x]).astype(int), axis=0)
        NDER = cnt / l
        return NDER

    # Generate labels for the unlabeled pool with LPA (label propagation).
    lp_model = LabelPropagation()
    lp_model.fit(labeled_data, np.reshape(labels, len(labels)))
    y_inductive = lp_model.predict(unlabeled_data)
    # Merge labeled and propagated data into one pool.
    x = np.concatenate((labeled_data, unlabeled_data), axis=0)
    y_labeled = np.reshape(labels, (len(labels)))
    y = np.concatenate((y_labeled, y_inductive), axis=0)
    # Algorithm of rough set based semi-supervised feature selection via ensemble selector.
    # n is number of decision classes
    n = np.max(labels) + 1
    l, c = x.shape
    AT = set([x for x in range(c)])  # all attribute indices (comprehension var does not clobber outer x in Py3)
    A = set()                        # selected attribute subset
    # Baseline NDER using every attribute.
    X = [i for i in range(len(x))]
    NDER = compute_NDER(name, neighbor_delta, AT, X, y)
    # Group sample indices by decision class.
    Xi = {i: np.where(y == i)[0].tolist() for i in range(n)}
    while True:
        C = set()
        for i in range(n):
            print("第{}次".format(i))
            max_phi = -1
            b = -1
            # Pick the candidate attribute whose addition most reduces the
            # per-class error (phi = error-before - error-after).
            for a in AT.difference(A):
                l_nder1 = compute_NDER(name, neighbor_delta, A, Xi[i], y)
                A.add(a)
                l_nder2 = compute_NDER(name, neighbor_delta, A, Xi[i], y)
                phi = l_nder1 - l_nder2
                print("属性{}的phi={}".format(a, phi))
                if phi > max_phi:
                    max_phi = phi
                    b = a
                A.remove(a)
            C.add(b)
            print("选择{}".format(b))
        counter = Counter(C)
        # most_common(1) returns the single most frequent candidate; ties are
        # broken by first-encountered order. NOTE(review): C is a set, so
        # every count is 1 and this effectively picks an arbitrary member.
        b, t = counter.most_common(1)[0]
        print("{}: 最大".format(b))
        if b == -1:
            # No class found any improving attribute -- stop.
            break
        A.add(b)
        # Stop once the subset matches (or beats) the all-attribute baseline.
        nder_A = compute_NDER(name, neighbor_delta, A, X, y)
        if nder_A <= NDER:
            break
    print("red={}".format(A))
    return A, 0
def main(argv):
    """Run LabelPropagation and LabelSpreading on facial/deception data.

    Command line: -i <train csv> -t <test csv> -o <output file>.
    The test CSV supplies the labeled samples ('deceptive' -> 1, else 0);
    the first 10000 rows of the train CSV are used unlabeled (-1). Writes
    both models' predictions to the output file and shows comparison plots.

    NOTE(review): usage(), metricNPlot(), sns, plt, pd, np, train_test_split,
    LabelPropagation and LabelSpreading are imported/defined outside this chunk.
    """
    trainFile = None
    testFile = None
    outFile = None
    try:
        opts, args = getopt.getopt(argv, "hi:t:o:")
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt == '-i':
            trainFile = arg
        elif opt == '-t':
            testFile = arg
        elif opt == '-o':
            outFile = arg
        else:
            usage()
            print('Invalid argument %s' % opt)
            sys.exit(2)
    # All three paths are mandatory.
    if (None == trainFile) or (None == testFile) or (None == outFile):
        print("Missing arguments")
        usage()
        sys.exit(2)
    facialData = pd.read_csv(trainFile)
    testData = pd.read_csv(testFile)
    testData.drop(columns=['id'], inplace=True)
    testData.reset_index(inplace=True, drop=True)
    # Binarise the class column: 'deceptive' -> 1, anything else -> 0.
    labels = testData['class']
    classLabels = []
    for i in range(len(labels)):
        classLabels.append(1 if (labels[i] == 'deceptive') else 0)
    testData.drop(columns=['class'], inplace=True)
    # Stratified 80/20 split of the labeled data.
    X_train, X_test, y_train, y_test = train_test_split(testData, classLabels, test_size=0.2, stratify=classLabels, random_state=42)
    # Temporarily reattach the class just to plot its distribution.
    X_train.insert(1, "class", y_train)
    sns.countplot(x="class", data=X_train)
    X_train = X_train.drop(columns=['class'])
    # Label Propagation: first 10000 unlabeled rows (-1) + labeled training rows.
    modelLabelProp = LabelPropagation()
    labels = [-1] * len(facialData[:10000])
    labels.extend(y_train)
    inputData = pd.concat([facialData[:10000], X_train], sort=False, ignore_index=True, copy=False)
    modelLabelProp.fit(inputData, labels)
    yPred = modelLabelProp.predict(X_test)
    print("LABEL PROPAGATION:")
    metricNPlot(modelLabelProp, X_test, y_test, yPred)
    with open(outFile, 'w') as f:
        f.write("Label Propagation prediction\n")
        for item in yPred:
            f.write("%s\n" % item)
    # Label Spreading: same inputs, KNN kernel; appends to the same file.
    modelLabelSpread = LabelSpreading(kernel='knn', n_neighbors=15)
    labels = [-1] * len(facialData[:10000])
    labels.extend(y_train)
    inputData = pd.concat([facialData[:10000], X_train], sort=False, ignore_index=True, copy=False)
    modelLabelSpread.fit(inputData, labels)
    yPred = modelLabelSpread.predict(X_test)
    print("LABEL SPREADING:")
    metricNPlot(modelLabelSpread, X_test, y_test, yPred)
    with open(outFile, 'a') as f:
        f.write("Label Spreading prediction\n")
        for item in yPred:
            f.write("%s\n" % item)
    # Hard-coded summary bar chart comparing the two models.
    # NOTE(review): these heights are fixed constants, not computed scores.
    height = [0.8, 0.68]
    bars = ('Label Propagation', 'Label Spreading')
    y_pos = np.arange(len(bars))
    plt.title("Performance Comparison")
    plt.bar(y_pos, height, color=['cyan', 'red'])
    plt.xticks(y_pos, bars)
    plt.show()
# Number of samples whose labels will be hidden (set to -1).
# NOTE(review): `nb_samples`, make_classification, LabelPropagation, sns and
# plt come from earlier in the file; this chunk is also TRUNCATED mid-call
# at the final scatter() -- the remainder lies outside this view.
nb_unlabeled = 750

if __name__ == '__main__':
    # Create the dataset
    X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0, random_state=100)
    # Hide the labels of the last nb_unlabeled samples.
    Y[nb_samples - nb_unlabeled:nb_samples] = -1
    # Create a LabelPropagation instance and fit it
    lp = LabelPropagation(kernel='rbf', gamma=10.0)
    lp.fit(X, Y)
    Y_final = lp.predict(X)
    # Show the final result
    sns.set()
    fig, ax = plt.subplots(1, 2, figsize=(18, 8))
    ax[0].scatter(X[Y == 0, 0], X[Y == 0, 1], color='#88d7f0', marker='s', s=100, label="Class 0")
    ax[0].scatter(X[Y == 1, 0], X[Y == 1, 1], color='#55ffec', marker='o',
# Titanic-style preprocessing fragment: impute missing 'Age' values by
# propagating labels from rows whose age is known.
# NOTE(review): `tempind`, `tempage` and `temp` are defined outside this
# chunk; `tempind` looks like a boolean column-selection Series indexed by
# column name -- verify against the preceding code.
tempind['PassengerId','Ticket','Cabin'] = False
tempage = tempage.loc[:,tempind]
# Split rows with a known Age (training) from rows with Age missing.
tempage_labeled = tempage[tempage['Age'].notnull()]
newlabels = tempage_labeled['Age'].astype(int)
plabels = tempage_labeled.loc[:,tempage_labeled.columns != 'Age'].astype(int)
tempage_unlabeled = tempage[tempage['Age'].isnull()]
unlabeled = tempage_unlabeled.loc[:,tempage_unlabeled.columns != 'Age'].astype(int)
# Treat integer ages as class labels and propagate them to the unlabeled rows.
label_prop_modelage = LabelPropagation()
label_prop_modelage.fit(plabels,newlabels)
# Write the predicted ages back into the original frame at the missing rows.
badind = temp[temp['Age'].isnull()]['Age'].index
newages = pd.Series(label_prop_modelage.predict(unlabeled),index = badind)
temp['Age'].fillna(newages, inplace = True)
# Remaining cleanup: default embarkation port, drop the sparse Cabin column.
temp['Embarked'].fillna('S', inplace = True)
temp.drop('Cabin', axis = 'columns',inplace = True)
print(temp.info())

h = .02 # step size in the mesh

# Display names for the classifier comparison that follows this chunk.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]