def semiLabelSpreding(feature_extractor, generator, val_generator, kernel,
                      neighbors, gamma, alpha):
    """Fit LabelSpreading on extracted features and label the validation data.

    Features for both generators are produced by
    ``feature_extractor.predict_generator``. Every training sample whose
    filename starts with ``'N'`` is marked as unlabeled (``-1``) before fitting.

    Args:
        feature_extractor: object exposing ``predict_generator(gen, steps, verbose)``.
        generator: training generator exposing ``samples``, ``batch_size``,
            ``classes`` and ``filenames``.
        val_generator: validation generator exposing ``samples`` and ``batch_size``.
        kernel, neighbors, gamma, alpha: forwarded to ``LabelSpreading``.

    Returns:
        The classes predicted by the fitted model for the validation features.
    """
    import math

    semi = LabelSpreading(kernel=kernel, n_neighbors=neighbors, gamma=gamma,
                          alpha=alpha, tol=0.001, max_iter=1000000)

    # Use a whole number of batches; a fractional step count is not a valid
    # "number of batches" and the trailing partial batch must still be read.
    train_steps = math.ceil(generator.samples / generator.batch_size)
    features = feature_extractor.predict_generator(
        generator, steps=train_steps, verbose=1)

    # Work on a copy: writing -1 into generator.classes itself would destroy
    # the generator's own labels for every later use of that generator.
    classes = generator.classes.copy()
    for i in range(generator.samples):
        if generator.filenames[i][0] == 'N':
            classes[i] = -1
    semi.fit(features, classes)

    val_steps = math.ceil(val_generator.samples / val_generator.batch_size)
    val_features = feature_extractor.predict_generator(
        val_generator, steps=val_steps, verbose=1)
    predicted_classes = semi.predict(val_features)
    return predicted_classes
def label_spreading(X_train, y_train, Xunlabelled, X_test, y_test):
    """Spread labels over labelled plus unlabelled rows and print the result.

    At most the first 10000 rows of ``Xunlabelled`` are used; they are stacked
    under ``X_train`` and given the label ``-1``. ``X_test`` and ``y_test`` are
    accepted but not used. Nothing is returned; the predictions for the
    stacked matrix are printed.
    """
    labelled_X = X_train[:, :]
    labelled_y = y_train[:]
    unlabelled_X = Xunlabelled[:10000, :]

    # -1 marks a row as unlabelled for LabelSpreading.
    unknown = -np.ones((unlabelled_X.shape[0],))
    X_both = np.vstack((labelled_X, unlabelled_X))
    y_both = np.append(labelled_y, unknown)

    spreader = LabelSpreading(max_iter=100)
    spreader.fit(np.copy(X_both), np.copy(y_both))
    y_pred = spreader.predict(np.copy(X_both))
    print(y_pred)
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    """Average kNN LabelSpreading scores over 10 runs on bag-of-words vectors.

    Each run reloads the dataset, re-splits it with ``split_train_true(nbr)``,
    vectorises with a ``CountVectorizer`` restricted to the runtime vocabulary,
    fits a kNN ``LabelSpreading`` model and scores it on the test split.

    Args:
        nbr: forwarded to ``dataset.split_train_true``.
        str_list: list that receives the two summary strings (mutated in place).
        neighbors: ``n_neighbors`` for ``LabelSpreading``.

    Relies on the module-level names ``categories``, ``Dataset``,
    ``Vocabulary`` and ``print_v2_test_docs_vocabulary_labeled``.
    """
    n_runs = 10
    total_f1 = 0
    total_accuracy = 0
    for _ in range(n_runs):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)

        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        # toarray() gives a plain ndarray; todense() gives np.matrix, which
        # recent scikit-learn versions refuse as estimator input.
        train_dense = vectorizer.fit_transform(dataset.train['data']).toarray()
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            train_dense, dataset.train['target'])

        # Densify the test matrix once and reuse it for predict and score.
        test_dense = vectorizer.transform(dataset.test['data']).toarray()
        pred = clf.predict(test_dense)
        total_f1 += metrics.f1_score(dataset.test['target'], pred,
                                     average='macro')
        total_accuracy += clf.score(test_dense, dataset.test['target'])

    avg_accuracy = total_accuracy / n_runs
    avg_f1 = total_f1 / n_runs
    str_list.extend(["KNN BOW runtime voc Avg f1: " + str(avg_f1),
                     "KNN BOW runtime vod Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
def doLabelSpreading(self, X, y, **kwargs):
    """Fit a LabelSpreading model on (X, y) and return its probabilities for X.

    ``kwargs`` are passed straight to the ``LabelSpreading`` constructor.
    When ``self.verbose`` is above 2, shapes and histograms of the labels
    and of the predictions are printed.
    """
    model = LabelSpreading(**kwargs)

    if self.verbose > 2:
        print("X, y shapes: ", X.shape, y.shape)
        print(" y hist: ", np.histogram(y))

    model.fit(X, y)

    if self.verbose > 2:
        predicted = model.predict(X)
        print("ls_predict:", np.histogram(predicted))

    return model.predict_proba(X)
def label_spreading(self, X_train, y, X_test):
    """Fit LabelSpreading on train and test rows stacked, then label the test rows.

    Args:
        X_train, X_test: matrices exposing ``todense()``.
        y: labels passed to ``fit`` for the stacked matrix.
            # NOTE(review): y presumably already covers the test rows (e.g.
            # with -1 markers) so its length matches the stacked X — confirm.

    Returns:
        ``(final_labels, clf)``: predictions for ``X_test`` and the fitted model.
    """
    clf = LabelSpreading()
    # np.asarray turns the np.matrix produced by todense() into a plain
    # ndarray; recent scikit-learn versions reject np.matrix input.
    X = np.concatenate(
        (np.asarray(X_train.todense()), np.asarray(X_test.todense())), axis=0)
    print("X shape now ", X.shape)
    print("Y shape now ", y.shape)
    clf.fit(X, y)
    final_labels = clf.predict(X_test)
    label_prob = clf.predict_proba(X_test)
    print(compare_labels_probabilities().compare(label_prob, final_labels))
    return final_labels, clf
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    """Score semi-supervised and plain classifiers without transfer learning.

    Each model is fitted on ``(trainX, trainY)``, used to predict ``testX`` and
    scored with ``check_accuracy(testY, predictions)``.

    Returns:
        A one-row ``pandas.DataFrame`` holding the window, the source and
        target positions, and the accuracy plus info of every model.
    """

    def _fit_and_score(model):
        # Train on the source data, evaluate on the target data.
        model.fit(trainX, trainY)
        return check_accuracy(testY, model.predict(testX))

    # --- semi-supervised models ---
    acc_ss_propagation, acc_ss_propagation_INFO = _fit_and_score(
        LabelPropagation(kernel='knn'))
    acc_ss_spreading, acc_ss_spreading_INFO = _fit_and_score(
        LabelSpreading(kernel='knn'))

    # --- supervised baselines (no transfer learning) ---
    accLR, acc_LR_INFO = _fit_and_score(LogisticRegression())
    accDT, acc_DT_INFO = _fit_and_score(tree.DecisionTreeClassifier())
    accNB, acc_NB_INFO = _fit_and_score(BernoulliNB())

    summary = {
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,
        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
        'acc_LR': accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO),
    }
    return pd.DataFrame([summary])
def _semi_supervised_learning(self, data_matrix, target): n_classes = len(set(target)) # if there are too few classes (e.g. less than -1 and at least 2 other classes) # then just bail out and return the original target # otherwise one cannot meaningfully spread the information of only one class if n_classes > 2: semi_supervised_estimator = LabelSpreading(kernel='knn', n_neighbors=self.n_neighbors) semi_supervised_estimator.fit(data_matrix, target) predicted_target = semi_supervised_estimator.predict(data_matrix) predicted_target = self._clamp(target, predicted_target) return predicted_target else: return target
def _semi_supervised_learning(self, data_matrix, target): n_classes = len(set(target)) # if there are too few classes (e.g. less than -1 and at least 2 other classes) # then just bail out and return the original target # otherwise one cannot meaningfully spread the information of only one class if n_classes > 2: semi_supervised_estimator = LabelSpreading( kernel='knn', n_neighbors=self.n_neighbors) semi_supervised_estimator.fit(data_matrix, target) predicted_target = semi_supervised_estimator.predict(data_matrix) predicted_target = self._clamp(target, predicted_target) return predicted_target else: return target
def semi_supervised_learning(data_matrix, target):
    """Fill in -1 entries of ``target`` using kNN label spreading.

    If ``target`` contains no -1 it is returned unchanged (as an array).
    Otherwise a ``LabelSpreading(kernel='knn', n_neighbors=5)`` model is fitted
    and its predictions are used, except that every originally known label
    is kept even when the model disagrees with it.

    Returns:
        ``numpy.ndarray`` with one label per entry of ``target``.
    """
    if -1 not in list(target):
        return np.array(target)

    # Imported lazily so the dependency is only needed when spreading occurs.
    from sklearn.semi_supervised import LabelSpreading

    spreader = LabelSpreading(kernel='knn', n_neighbors=5)
    spreader.fit(data_matrix, target)
    guesses = spreader.predict(data_matrix)

    extended_target = [
        known if (known != -1 and guess != known) else guess
        for guess, known in zip(guesses, target)
    ]
    return np.array(extended_target)
def augment_instances(self, X_train, y_train):
    """Extend the training set with unlabeled rows labelled by LabelSpreading.

    Returns ``(X_train, y_train)`` unchanged when ``self.args.num_unlabeled``
    is 0. Otherwise the unlabeled pool from ``self.dataset`` is processed in
    chunks of 1000 rows; for every label column a fresh ``LabelSpreading``
    model is fitted on the training representation plus the chunk, and its
    predictions are written into the pool's label array. The processed rows
    are finally appended to the training data.
    """
    if self.args.num_unlabeled == 0:
        return X_train, y_train

    pool_X = self.dataset.X_train_unlabeled.values
    pool_y = self.dataset.y_train_unlabeled.values
    text_col = self.args.text_col

    # Representation of the labelled data: text features plus extra features.
    train_text = X_train[:, text_col]
    self.fit_text(train_text, y_train)
    train_rep = self.augment_features(self.transform_text(train_text), X_train)

    chunk_size = 1000
    pool_size = pool_X.shape[0]
    n_label_cols = y_train.shape[1]

    row_iter = tqdm(range(0, self.args.num_unlabeled, chunk_size),
                    desc='spreading labels in rows',
                    total=int(self.args.num_unlabeled / chunk_size))
    for row in row_iter:
        end_row = np.minimum(row + chunk_size, pool_size)
        col_iter = tqdm(range(n_label_cols),
                        desc='spreading labels in cols', leave=False)
        for col in col_iter:
            chunk_rep = self.transform_text(pool_X[row:end_row, text_col])
            chunk_rep = self.augment_features(chunk_rep,
                                              pool_X[row:end_row, :])
            X_spread = np.append(train_rep, chunk_rep, axis=0)
            y_spread = np.append(y_train[:, col], pool_y[row:end_row, col],
                                 axis=0)
            spreader = LabelSpreading()
            spreader.fit(X_spread, y_spread)
            pool_y[row:end_row, col] = spreader.predict(chunk_rep)

    # `row` still holds the start of the last chunk handled by the loop.
    last_row = row + chunk_size
    X_train = np.append(X_train, pool_X[:last_row], axis=0)
    y_train = np.append(y_train, pool_y[:last_row], axis=0)
    return X_train, y_train
class LabelSpreadingImpl:
    """Thin wrapper that builds ``Op(**hyperparams)`` and delegates to it."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, passing ``y`` only when given; return self."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        model = self._wrapped_model
        return model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X``."""
        model = self._wrapped_model
        return model.predict_proba(X)
class LP:
    """kNN LabelSpreading classifier with optional LMNN metric learning.

    Callers use the label encoding ``1`` = positive, ``-1`` = negative and
    ``0`` = unlabeled. Internally ``0`` is remapped to ``-1`` (the unlabeled
    marker of LabelSpreading) and ``-1`` to ``2``; ``predict`` maps back.
    """

    def __init__(self, lmnn=False, max_iter=1000, lm_num=200):
        """
        Args:
            lmnn: when true, learn an LMNN transform on a labeled sample first.
            max_iter: ``max_iter`` for ``LabelSpreading``.
            lm_num: number of labeled rows sampled to fit LMNN.
        """
        self.clf = LabelSpreading(kernel='knn', n_neighbors=25,
                                  max_iter=max_iter, alpha=0.2, n_jobs=-1)
        self.lmnn = lmnn
        self.lm_num = lm_num
        if lmnn:
            self.ml = LMNN(use_pca=False, max_iter=2000)

    def fit(self, X, y):
        """Fit on ``X`` with labels ``y`` in {1, -1, 0 (unlabeled)}.

        ``y`` is not modified: the remapping is done on a private copy, so
        the same label array can safely be reused for another ``fit``.
        """
        # Private copy: remapping the caller's array in place would make a
        # second fit() read the former "unlabeled" rows as class 2.
        y = np.array(y)

        if self.lmnn:
            labelled = list(np.nonzero(y)[0])
            # Never ask for more landmarks than labeled rows exist.
            index = random.sample(labelled, min(self.lm_num, len(labelled)))
            X_ = X[index]
            y_ = y[index]
            print('ml fitting')
            self.ml.fit(X_, y_)
            print('transform')
            X = self.ml.transform(X)

        print('lp fitting')
        # All index sets are taken before any rewrite so they cannot overlap.
        zero_index = np.nonzero(y == 0)
        negetive_index = np.nonzero(y == -1)
        positive_index = np.nonzero(y == 1)
        y[zero_index] = -1
        y[negetive_index] = 2
        print(zero_index[0].shape, negetive_index[0].shape,
              positive_index[0].shape)
        self.clf.fit(X, y)

    def predict(self, X):
        """Predict labels for ``X`` in the caller's encoding."""
        print('lp predict')
        if self.lmnn:
            X = self.ml.transform(X)
        y_pred = self.clf.predict(X)
        # Undo the internal remapping: -1 -> 0 and 2 -> -1.
        negative_index = np.nonzero(y_pred == -1)
        two_index = np.nonzero(y_pred == 2)
        y_pred[negative_index] = 0
        y_pred[two_index] = -1
        return y_pred
def soft_clamping(kernel, xTrain, yTrain, MI=10000, k=3, g=0.6, a=0.1):
    """Run LabelSpreading on the training data and append a row to 'SC.csv'.

    The model is fitted on ``(xTrain, yTrain)`` and used to re-predict
    ``xTrain``; the eight values returned by ``stats`` are written together
    with the hyper-parameters through ``write_csv``.

    ``yExpect`` and ``day_one`` are not parameters of this function.
    # NOTE(review): presumably module-level globals — confirm.
    """
    model = LabelSpreading(kernel=kernel, n_neighbors=k, gamma=g, alpha=a,
                           max_iter=MI, n_jobs=-1)
    model.fit(xTrain, yTrain)
    evaledY = model.predict(xTrain)

    # stats(trainY, evaledY, expectedY, day_one) yields eight values.
    outcome = stats(yTrain, evaledY, yExpect, day_one)
    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = outcome

    row = ['SC', kernel, k, g, a,
           lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1]
    write_csv('SC.csv', row)
def getLabelPropa(yale):
    """Spread labels over the rows of ``yale`` with an RBF LabelSpreading model.

    A ``'labels'`` column is added to ``yale`` (the caller's frame is
    modified): it starts as a copy of ``'Rank'`` and is set to 1 for a fixed
    list of towns. The numeric (float64/int64) columns other than
    ``'labels'`` are max-normalised per column and used as features.

    Args:
        yale: DataFrame with at least the columns ``'Rank'`` and ``'Town'``.

    Returns:
        The labels predicted by the fitted model for every row.
    """
    yale['labels'] = yale['Rank']
    seed_towns = ['Greenwich', 'Westport', 'Fairfield', 'Trumbull',
                  'Ridgefield']
    # One .loc call on the frame itself: the chained form
    # yale['labels'].loc[...] = 1 may write to a temporary and is a no-op
    # under pandas copy-on-write.
    yale.loc[yale['Town'].isin(seed_towns), 'labels'] = 1
    label = yale['labels']

    # NOTE(review): 'Rank' stays among the numeric feature columns even
    # though the labels were derived from it — confirm this is intended.
    yale = yale.select_dtypes(include=['float64', 'int64'])
    label_prop_model = LabelSpreading(alpha=0.1, kernel='rbf', n_neighbors=3,
                                      max_iter=300, gamma=2)
    yale = yale.drop(['labels'], axis=1)
    yale = preprocessing.normalize(yale, axis=0, norm='max')
    label_prop_model.fit(yale, label)
    label = label_prop_model.predict(yale)
    return label
def run_lp_tfidf(nbr, str_list, neighbors):
    """Average kNN LabelSpreading scores over 10 runs on TF-IDF vectors.

    Each run builds a fresh ``Dataset``, re-splits it with
    ``split_train_true(nbr)``, vectorises with ``TfidfVectorizer``, fits a kNN
    ``LabelSpreading`` model and scores it on the test split.

    Args:
        nbr: forwarded to ``dataset.split_train_true``.
        str_list: list that receives the two summary strings (mutated in place).
        neighbors: ``n_neighbors`` for ``LabelSpreading``.

    Relies on the module-level names ``categories`` and ``Dataset``.
    """
    n_runs = 10
    total_f1 = 0
    total_accuracy = 0
    for _ in range(n_runs):
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)

        vectorizer = TfidfVectorizer()
        # toarray() gives a plain ndarray; todense() gives np.matrix, which
        # recent scikit-learn versions refuse as estimator input.
        train_dense = vectorizer.fit_transform(dataset.train['data']).toarray()
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            train_dense, dataset.train['target'])

        # Densify the test matrix once and reuse it for predict and score.
        test_dense = vectorizer.transform(dataset.test['data']).toarray()
        pred = clf.predict(test_dense)
        total_f1 += metrics.f1_score(dataset.test['target'], pred,
                                     average='macro')
        total_accuracy += clf.score(test_dense, dataset.test['target'])

    avg_accuracy = total_accuracy / n_runs
    avg_f1 = total_f1 / n_runs
    str_list.extend(["KNN TF-IDF Avg f1: " + str(avg_f1),
                     "KNN TF-IDF Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
class propgate_lables_predictor():
    """Train ``predictor`` on labelled data plus confidently pseudo-labelled rows.

    Labels are propagated onto ``X_unlabled`` with an RBF ``LabelSpreading``
    model; only rows whose label-distribution entropy is below the mean
    entropy of the unlabeled rows are kept for training the final predictor.
    """

    def __init__(self, X_unlabled, predictor):
        self.X_unlabled = X_unlabled
        self.prop_model = LabelSpreading(kernel='rbf', gamma=0.1,
                                         max_iter=1000, tol=0.001, n_jobs=-1,
                                         alpha=0.2)
        self.predictor = predictor

    def fit(self, X, y):
        """Propagate labels, filter by entropy, then fit ``self.predictor``."""
        pool = self.X_unlabled
        n_labelled = len(y)

        # Labelled rows first, unlabeled pool (label -1) underneath.
        stacked_X = pd.concat([pd.DataFrame(X), pd.DataFrame(pool.values)])
        stacked_y = pd.concat([y, pd.DataFrame([-1] * len(pool))])

        scaler = StandardScaler()
        self.prop_model.fit(scaler.fit_transform(stacked_X),
                            np.array(stacked_y).ravel())
        propagated = self.prop_model.predict(scaler.transform(stacked_X))

        X_final = stacked_X.reset_index(drop=True)
        entropies = pd.Series(
            scipy.stats.entropy(self.prop_model.label_distributions_.T))
        entropies.index = X_final.index

        # Known labels are kept as given; the tail comes from propagation.
        y_final = pd.concat([pd.Series(y),
                             pd.Series(propagated[n_labelled:])])
        y_final.index = X_final.index

        cutoff = entropies.iloc[n_labelled:].mean()
        keep = entropies.notna() & (entropies < cutoff)
        X_final = X_final.loc[keep, :]
        y_final = y_final[keep]

        print(len(X), 'final amount of instances:', len(X_final))
        self.predictor.fit(X_final, np.array(y_final).ravel())

    def predict(self, X):
        """Delegate to the wrapped predictor."""
        wrapped = self.predictor
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped predictor."""
        wrapped = self.predictor
        return wrapped.predict_proba(X)
def JSFS(X: np.ndarray, y: np.ndarray, test_X: np.ndarray, test_y: np.ndarray, name: str):
    """Train the JSFS model on (X, y) and print its metrics on (test_X, test_y).

    The first ``l = int(n * labelRatio)`` training rows are treated as labeled
    and the remaining ``u`` rows as unlabeled; pseudo-labels for the unlabeled
    rows come from a kNN ``LabelSpreading`` model. ``Omega`` (feature weights)
    and ``Lambda_vector`` are then updated iteratively for at most 50 rounds.

    Args:
        X: training samples, indexed as ``X[i, :]``.
        y: training labels. Modified in place: it is resized to ``(n, 1)``,
            -1 labels are rewritten to 0, and rows from ``l`` on are
            overwritten with pseudo-labels.
        test_X: test samples.
        test_y: test labels, compared against 1 and -1.
        name: key into the module-level ``CONFIG`` dict; the entries
            ``labelRatio``, ``Gamma``, ``Mu``, ``xMaxScaler``, ``yScaler``
            and ``threshold`` are read.

    Returns:
        None. Progress and the final metrics are printed.

    NOTE(review): SOURCE reached review with its line structure collapsed.
    The nesting below was reconstructed from syntax; the extent of the two
    gradient-gated ``if`` blocks in the main loop should be confirmed against
    the original file.
    """
    # reference: Jiang, Bingbing, et al.
    # "Joint semi-supervised feature selection and classification through Bayesian approach."
    # Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 33. 2019.
    print('========== JSFS ==========')

    # --- Input & Initialize---
    # np.set_printoptions(threshold=np.inf)
    n = len(X)
    d = len(X[0])
    y.resize((n, 1))
    testSize = len(test_X)

    # labeled sample ratio
    # labelRatio = 0.5
    labelRatio = CONFIG[name]['labelRatio']
    l = int(n * labelRatio)
    u = n - l

    # Gamma and Mu are hyper-parameters
    # Gamma = 0.001
    Gamma = CONFIG[name]['Gamma']
    # Mu = 0.9
    Mu = CONFIG[name]['Mu']
    # Beta = 0.005
    Beta = 5

    # Every weight starts at 0.5; A and C start as 0.001 * identity.
    Omega = np.zeros((d, 1))
    Omega[:] = 0.5
    Lambda_vector = np.zeros((u, 1))
    Lambda_vector[:] = 0.5
    A = np.zeros((d, d))
    for i in range(d):
        A[i, i] = 0.001
    C = np.zeros((u, u))
    for i in range(u):
        C[i, i] = 0.001

    # --- Construct the affinity matrix S and graph Laplacian L via KNN ---
    print('Construct the affinity matrix S and graph Laplacian L via KNN')
    trainData_X = X
    trainData_Y = y.ravel()  # y and trainData_Y address the same memory
    # replace the original -1 label with 0, because in this method -1 means no label
    for i in range(n):
        trainData_Y[i] = 0 if trainData_Y[i] == -1 else trainData_Y[i]
    # Everything after the first l rows is treated as unlabeled.
    trainData_Y[l:] = -1
    KNN = KNeighborsClassifier(n_neighbors=5)
    KNN.fit(trainData_X[:l], trainData_Y[:l])
    S = np.zeros((n, n))
    D = np.zeros((n, n))
    L = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            # 10: both labeled with the same class; 1: one side unlabeled and
            # its KNN prediction equals the other side's label; 0 otherwise.
            if trainData_Y[i] == trainData_Y[j] and trainData_Y[i] != -1:
                S[i][j] = 10
            elif (trainData_Y[i] == -1 and trainData_Y[j] > -1) and (KNN.predict(
                    trainData_X[i:i + 1]) == trainData_Y[j]):
                S[i][j] = 1
            elif (trainData_Y[j] == -1 and trainData_Y[i] > -1) and (KNN.predict(
                    trainData_X[j:j + 1]) == trainData_Y[i]):
                S[i][j] = 1
            else:
                S[i][j] = 0
            # Keep S symmetric.
            S[j][i] = S[i][j]
        D[i, i] = sum(S[i, :])
        # Text progress bar, 50 characters wide.
        percent = 100 * (float((2 * n - i) * (i + 1)) / ((n + 1) * n))
        show_str = ('[%%-%ds]' % 50) % (int(50 * percent / 100) * "#")
        print('\r%s %d%%' % (show_str, percent), end='')
    L = D - S

    # --- Obtain the pseudo label vector y_u via label propagation ---
    print('\nObtain the pseudo laber vector y_u via label progation')
    LGC_rbf = LabelSpreading(kernel='knn', gamma=20, n_neighbors=7, max_iter=150)
    LGC_rbf.fit(trainData_X, trainData_Y)
    # Also writes into y, because trainData_Y shares y's memory.
    trainData_Y[l:] = LGC_rbf.predict(trainData_X[l:])
    # change 0 back to the -1 (disabled: kept below as a bare string)
    """ for i in range(n): trainData_Y[i] = -1 if trainData_Y[i] == 0 else trainData_Y[i] """

    # --- Data preprocessing - Normalized for X, y ---
    # min_max_scaler = preprocessing.MinMaxScaler((0, 0.0001))
    min_max_scaler = preprocessing.MinMaxScaler(
        (0, CONFIG[name]['xMaxScaler']))
    X = min_max_scaler.fit_transform(X)
    test_X = min_max_scaler.transform(test_X)

    # --- Convergence ---
    B = Gamma * np.dot(np.dot(X.T, L), X)
    Lambda = np.matlib.identity(n)
    Sigma = np.zeros((n, 1))
    E = np.zeros((n, n))
    P = np.zeros((u, u))
    k_lambda = np.zeros((u, 1))
    Eu = np.zeros((u, u))
    O = np.zeros((u, u))
    Omega_old = np.ones((d, 1))
    Lambda_vector_old = np.zeros((u, 1))
    g_omega = np.zeros((d, 1))
    H_omega = np.zeros((d, d))
    Sig_omega = np.zeros((d, d))
    g_lambda = np.zeros((u, 1))
    H_lambda = np.zeros((u, u))
    Sig_lambda = np.zeros((u, u))
    G = np.zeros((d, d))
    cnt = 0
    # Iterate until Omega stops changing (inf-norm <= 0.001) or 50 rounds.
    while np.linalg.norm(Omega - Omega_old, ord=np.inf) > 0.001:
        print('--------', cnt + 1, '--------')
        for i in range(n):
            if (i < l):
                # Labeled row: plain sigmoid of X[i] . Omega.
                Sigma[i, 0] = 1 / (1 + np.exp(-1 * np.dot(X[i, :], Omega)))
                E[i, i] = Sigma[i, 0] * (1 - Sigma[i, 0])
            else:
                # Unlabeled row: sigmoid scaled by that row's Lambda entry.
                Sigma[i, 0] = 1 / \
                    (1 + np.exp(-1 * Lambda_vector[i-l, 0] * np.dot(X[i, :], Omega)))
                # NOTE(review): `*=` on an entry that starts at 0 keeps it at
                # 0 on every round; the labeled branch uses `=` — confirm.
                E[i, i] *= Mu * Lambda_vector[i-l, 0] * \
                    Lambda_vector[i-l, 0] * Sigma[i, 0] * (1 - Sigma[i, 0])
                Lambda[i, i] = Mu * Lambda_vector[i - l, 0]
                P[i - l, i - l] = np.dot(X[i, :], Omega)
                k_lambda[i-l, 0] = Beta * \
                    (1 - (1 / (1 + np.exp(-(Beta * Lambda_vector[i-l, 0])))))
                Eu[i - l, i - l] = Sigma[i, 0] * (1 - Sigma[i, 0])
                O[i - l, i - l] = Beta * Beta * (
                    1 / (1 + np.exp(-(Beta * Lambda_vector[i - l, 0])))) * (
                        1 - (1 / (1 + np.exp(-(Beta * Lambda_vector[i - l, 0])))))
        # Omega step, taken only while the previous gradient's scaled norm is
        # below 0.001 (g_omega starts at zero, so the first round runs it).
        if (np.linalg.norm(g_omega[:, 0], ord=2) / d) < 0.001:
            g_omega = np.dot(np.dot(X.T, Lambda), (y - Sigma)) - \
                np.dot((A + B), Omega)
            H_omega = -1 * (np.dot(np.dot(X.T, E), X) + A + B)
            Sig_omega = -1 * np.linalg.inv(H_omega)
            Omega_old = Omega.copy()
            Omega = Omega - np.dot(np.linalg.inv(H_omega), g_omega)
            print('gw:', np.mean(g_omega[:, 0]), ' gw_judge:',
                  (np.linalg.norm(g_omega[:, 0], ord=2) / d), 'w_max',
                  np.max(Omega, axis=0), 'w_min', np.min(Omega, axis=0))
            # Zero out weights with magnitude below 0.001.
            for i in range(d):
                if (Omega[i, 0] != 0) and (abs(Omega[i, 0]) < 0.001):
                    Omega[i, 0] = 0
        # Lambda step, gated on the previous g_lambda in the same way.
        if (np.linalg.norm(g_lambda[:, 0], ord=2) / u) < 0.001:
            g_lambda = Mu * np.dot(P, (y[l:] - Sigma[l:])) - \
                np.dot(C, Lambda_vector) + k_lambda
            H_lambda = -1 * ((Mu * np.dot(np.dot(P.T, Eu), P)) + C + O)
            Sig_lambda = -1 * np.linalg.inv(H_lambda)
            Lambda_vector_old = Lambda_vector.copy()
            Lambda_vector = Lambda_vector - \
                np.dot(np.linalg.inv(H_lambda), g_lambda)
            print('gl:', np.mean(g_lambda[:, 0]), ' gl_judge:',
                  (np.linalg.norm(g_lambda[:, 0], ord=2) / u), 'l_max',
                  np.max(Lambda_vector, axis=0), 'l_min',
                  np.min(Lambda_vector, axis=0))
            for i in range(u):
                if (Lambda_vector[i, 0] != 0) and (abs(Lambda_vector[i, 0]) < 0.001):
                    Lambda_vector[i, 0] = 0
        # Refresh the diagonals of A and C from the current estimates.
        G = np.dot(
            np.dot(
                np.dot(np.linalg.inv(A), B),
                np.linalg.inv(
                    np.matlib.identity(d) + np.dot(np.linalg.inv(A), B))),
            np.linalg.inv(A))
        for i in range(d):
            A[i, i] = 1 / (Omega[i, 0] * Omega[i, 0] + G[i, i] +
                           Sig_omega[i, i])
        for i in range(u):
            C[i, i] = 1 / (Lambda_vector[i, 0] * Lambda_vector[i, 0] +
                           Sig_lambda[i, i])
        print('max_lambda_new-old',
              np.linalg.norm(Lambda_vector - Lambda_vector_old, ord=np.inf))
        print('max_omega_new-old',
              np.linalg.norm(Omega - Omega_old, ord=np.inf))
        cnt += 1
        # Hard cap on the number of rounds.
        if cnt == 50:
            break

    # --- Test ---
    predict_y = np.zeros(testSize)
    predict_vector_y = np.dot(test_X, Omega).flatten()
    predict_vector_y *= CONFIG[name]['yScaler']
    threshold = CONFIG[name]['threshold']
    # NOTE(review): the [0, i] indexing needs a 2-D row; presumably Omega is
    # an np.matrix by now because Lambda comes from np.matlib — confirm.
    for i in range(testSize):
        if predict_vector_y[0, i] < threshold:
            predict_y[i] = -1
        else:
            predict_y[i] = 1
    print('predict_y:', predict_vector_y[0, :10])
    # Confusion counts with 1 as the positive and -1 as the negative class.
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    for idx in range(len(test_y)):
        if test_y[idx] == 1 and predict_y[idx] == 1:
            tp += 1
        elif test_y[idx] == 1 and predict_y[idx] == -1:
            fn += 1
        elif test_y[idx] == -1 and predict_y[idx] == 1:
            fp += 1
        elif test_y[idx] == -1 and predict_y[idx] == -1:
            tn += 1
    # NOTE(review): these divisions raise ZeroDivisionError when a
    # denominator is 0, e.g. when nothing is predicted positive.
    p = tp / (fp + tp)
    pf = fp / (fp + tn)
    # Local name `pd` (tp / (tp + fn)); inside this function it hides any
    # module-level `pd`.
    pd = tp / (tp + fn)
    F_measure = 2 * pd * p / (pd + p)
    """ print('precision:', 100 * p, '%') print('recall:', 100 * recall_score(test_y, predict_y), '%') print('pf:', 100 * pf, '%') print('F-measure:', 100 * F_measure, '%') print('accuracy:', 100 * accuracy_score(test_y, predict_y), '%') print('AUC:', 100 * roc_auc_score(test_y, predict_y), '%') """
    print('precision:', p)
    print('recall:', recall_score(test_y, predict_y))
    print('pf:', pf)
    print('F-measure:', F_measure)
    print('accuracy:', accuracy_score(test_y, predict_y))
    print('AUC:', roc_auc_score(test_y, predict_y))
# -*- coding: utf-8 -*-
"""
LabelSpreading demo on the iris data set.

http://scikit-learn.org/stable/modules/generated/sklearn.semi_supervised.LabelSpreading.html
Created on Fri Sep 14 16:13:06 2018

@author: Akitaka
"""
import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading

label_prop_model = LabelSpreading()
iris = datasets.load_iris()

# Hide roughly 30% of the labels (-1 marks a sample as unlabeled).
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
labels = np.copy(iris.target)
labels[random_unlabeled_points] = -1

label_prop_model.fit(iris.data, labels)

# Masked labels, true labels, transduced labels, then fresh predictions.
print(labels)
print(iris.target)
print(label_prop_model.transduction_)
print(label_prop_model.predict(iris.data))
4. 1. 6. -1. 4. 4. 1. 1. 6. 6. -1. 4. 4. 4. 3. 2. 6. -1. 1. 6. 4. 4. 4. 5. 6. -1. -1. 5. 2. 6. 1. 6. 3. 2. 6. 3. 3. 1. 2. 5. 2. -1. -1. 1. 6. 6. -1. 6. 6. 6. 4. 6. -1. 2. 3. 2. 5. 4. 4. 6. 4. -1. 4. 2. 6. 1. 1. 2. -1. 5. 2. 4. 3. -1. 6. 2. 5. 2. 2. 5. 5. 4. 2. 1. -1. 1.] (500, 100) (500,) """ from sklearn.semi_supervised import LabelSpreading label_propagation_model = LabelSpreading() label_propagation_model.fit(X, y) # make predictions for first twenty samples (some will be known, some unknown) for i in range(20): print 'y: ', y[i], '\t', 'y_hat: ', label_propagation_model.predict(X[i].reshape(1,-1)) """ y: 6.0 y_hat: [6.] y: 6.0 y_hat: [6.] y: 2.0 y_hat: [2.] y: 1.0 y_hat: [1.] y: -1.0 y_hat: [6.] * y: 2.0 y_hat: [2.] y: 6.0 y_hat: [6.] y: 4.0 y_hat: [4.] y: 3.0 y_hat: [3.] y: 5.0 y_hat: [5.] y: 6.0 y_hat: [6.] y: 4.0 y_hat: [4.] y: 3.0 y_hat: [3.] y: 3.0 y_hat: [3.]
label_prop_model = LabelPropagation(kernel=p_ss_kern, gamma=p_gamma, n_neighbors=p_neighbors, alpha=p_alpha, max_iter=70) else: label_prop_model = dic_ss_mod[p_ss_mod](kernel=p_ss_kern, gamma=p_gamma, n_neighbors=p_neighbors) print('Start to fit. Run for shelter!') label_prop_model.fit(X_tot, y_tot) temp_acc = label_prop_model.score(X_valid_lab, y_valid) print('{} / {} :accuracy = {}'.format(i, p_manyfit, temp_acc)) RESULT_ACC_SS += temp_acc y_tot = label_prop_model.transduction_ y_submit = label_prop_model.predict(X_submit) save_to_csv(X_tot, y_tot, X_valid_lab, y_valid) RESULT_ACC_SS /= p_manyfit json_dict['ss_accuracy'] = RESULT_ACC_SS print('accuracy obtained on the test set of the ss algo:', RESULT_ACC_SS) else: init_variables() #PCA preprocessing if (PCA_MODE): pca_preprocess() X_tot, y_tot, X_valid, y_valid = load_xy() ##############################NEURAL NETWORK PART ################################## if (USING_NN): model = build_model()
def build_models(trainX, trainY, testX, testY, source_pos, target_pos, window):
    """Score semi-supervised, plain and transfer-learning classifiers.

    Plain models are fitted on ``(trainX, trainY)``; the transfer-learning
    models are fitted with ``fit(trainX, trainY, testX)``. Every model
    predicts ``testX`` and is scored with ``checkAccuracy(testY, predictions)``.

    Returns:
        A one-row ``pandas.DataFrame`` with the window, the source and target
        positions, and the accuracy (plus info) of every evaluated model.
    """

    def _plain(model):
        # Fit on the source data only, score on the target data.
        model.fit(trainX, trainY)
        return checkAccuracy(testY, model.predict(testX))

    def _adapted(model):
        # Domain-adaptation fit: the target samples are passed to fit() too.
        model.fit(trainX, trainY, testX)
        acc, info = checkAccuracy(testY, model.predict(testX))
        print("ACC:", acc)
        return acc, info

    # ---------------- semi-supervised ----------------
    acc_ss_propagation, acc_ss_propagation_INFO = _plain(
        LabelPropagation(kernel='knn'))
    acc_ss_spreading, acc_ss_spreading_INFO = _plain(
        LabelSpreading(kernel='knn'))

    # ---------------- without transfer learning ----------------
    accLR, acc_LR_INFO = _plain(LogisticRegression())
    accDT, acc_DT_INFO = _plain(tree.DecisionTreeClassifier())
    accNB, acc_NB_INFO = _plain(BernoulliNB())

    # ---------------- with transfer learning ----------------
    # Kernel Mean Matching (Huang et al., 2006): banner printed once.
    print("\n Kernel Mean Matching (Huang et al., 2006) ")
    acc_DT_KMM, acc_DT_KMM_INFO = _adapted(
        ImportanceWeightedClassifier(iwe='kmm', loss="dtree"))
    acc_LR_KMM, acc_LR_KMM_INFO = _adapted(
        ImportanceWeightedClassifier(iwe='kmm', loss="logistic"))
    acc_NB_KMM, acc_NB_KMM_INFO = _adapted(
        ImportanceWeightedClassifier(iwe='kmm', loss="berno"))

    # Nearest-neighbour-based weighting (Loog, 2015): banner before each model.
    nn_banner = "\n Nearest-neighbour-based weighting (Loog, 2015) "
    print(nn_banner)
    acc_DT_NN, acc_DT_NN_INFO = _adapted(
        ImportanceWeightedClassifier(iwe='nn', loss="dtree"))
    print(nn_banner)
    acc_LR_NN, acc_LR_NN_INFO = _adapted(
        ImportanceWeightedClassifier(iwe='nn', loss="logistic"))
    print(nn_banner)
    acc_NB_NN, acc_NB_NN_INFO = _adapted(
        ImportanceWeightedClassifier(iwe='nn', loss="berno"))

    # Transfer Component Analysis (Pan et al, 2009): banner printed once.
    print("\n Transfer Component Analysis (Pan et al, 2009)")
    acc_DT_TCA, acc_DT_TCA_INFO = _adapted(
        TransferComponentClassifier(loss="dtree", num_components=6))
    acc_LR_TCA, acc_LR_TCA_INFO = _adapted(
        TransferComponentClassifier(loss="logistic", num_components=6))
    acc_NB_TCA, acc_NB_TCA_INFO = _adapted(
        TransferComponentClassifier(loss="berno", num_components=6))

    # Subspace Alignment (Fernando et al., 2013): banner before each model.
    sa_banner = "\n Subspace Alignment (Fernando et al., 2013) "
    print(sa_banner)
    acc_DT_SA, acc_DT_SA_INFO = _adapted(
        SubspaceAlignedClassifier(loss="dtree"))
    print(sa_banner)
    acc_LR_SA, acc_LR_SA_INFO = _adapted(
        SubspaceAlignedClassifier(loss="logistic"))
    print(sa_banner)
    acc_NB_SA, acc_NB_SA_INFO = _adapted(
        SubspaceAlignedClassifier(loss="berno"))

    # ---------------- ensemble ----------------
    # All twelve members are constructed in the original order, although only
    # the three decision-tree ones are handed to the ensemble.
    losses = ("dtree", "logistic", "berno")
    classifier_SA_DT, classifier_SA_LR, classifier_SA_NB = [
        SubspaceAlignedClassifier(loss=kind) for kind in losses]
    classifier_TCA_DT, classifier_TCA_LR, classifier_TCA_NB = [
        TransferComponentClassifier(loss=kind) for kind in losses]
    classifier_NN_DT, classifier_NN_LR, classifier_NN_NB = [
        ImportanceWeightedClassifier(iwe='nn', loss=kind) for kind in losses]
    classifier_KMM_DT, classifier_KMM_LR, classifier_KMM_NB = [
        ImportanceWeightedClassifier(iwe='kmm', loss=kind) for kind in losses]

    eclf = EnsembleClassifier(
        clfs=[classifier_TCA_DT, classifier_NN_DT, classifier_KMM_DT])
    eclf.fit(trainX, trainY, testX)
    acc_ENSEMBLE, acc_ENSEMBLE_INFO = checkAccuracy(
        testY, eclf.predict_v2(testX))

    # ---------------- result row ----------------
    summary = {
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,
        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
        'acc_ENSEMBLE': acc_ENSEMBLE,
        'acc_LR': accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO),
        'acc_LR_KMM': acc_LR_KMM,
        'acc_LR_KMM_INFO': str(acc_LR_KMM_INFO),
        'acc_LR_NN': acc_LR_NN,
        'acc_LR_NN_INFO': str(acc_LR_NN_INFO),
        'acc_LR_TCA': acc_LR_TCA,
        'acc_LR_TCA_INFO': str(acc_LR_TCA_INFO),
        'acc_LR_SA': acc_LR_SA,
        'acc_LR_SA_INFO': str(acc_LR_SA_INFO),
        'acc_DT_KMM': acc_DT_KMM,
        'acc_DT_KMM_INFO': str(acc_DT_KMM_INFO),
        'acc_DT_NN': acc_DT_NN,
        'acc_DT_NN_INFO': str(acc_DT_NN_INFO),
        'acc_DT_TCA': acc_DT_TCA,
        'acc_DT_TCA_INFO': str(acc_DT_TCA_INFO),
        'acc_DT_SA': acc_DT_SA,
        'acc_DT_SA_INFO': str(acc_DT_SA_INFO),
        'acc_NB_KMM': acc_NB_KMM,
        'acc_NB_KMM_INFO': str(acc_NB_KMM_INFO),
        'acc_NB_NN': acc_NB_NN,
        'acc_NB_NN_INFO': str(acc_NB_NN_INFO),
        'acc_NB_TCA': acc_NB_TCA,
        'acc_NB_TCA_INFO': str(acc_NB_TCA_INFO),
        'acc_NB_SA': acc_NB_SA,
        'acc_NB_SA_INFO': str(acc_NB_SA_INFO),
    }
    return pd.DataFrame([summary])
# Hold out part of the data for testing.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=20)

# Build a partially labeled training set: about 80% of the training labels
# are hidden by setting them to -1, the marker for "unlabeled".
rng = np.random.RandomState(0)
random_unlabeled = rng.rand(len(y_train)) < 0.8
y_train[random_unlabeled] = -1

# Gamma tuning (kept for reference):
# for i in [0.005, 0.01, 0.1, 0.5, 1]:
#     model = LabelPropagation(kernel='rbf', gamma=i)
#     model.fit(x_train, y_train)
#     print(i, accuracy_score(y_test, model.predict(x_test)))
model = LabelSpreading(kernel='rbf', gamma=0.01)
# model = LabelPropagation(kernel='rbf', gamma=0.01)
model.fit(x_train, y_train)

print('===========y===============')
print(y_test)

print('===========y_pred===============')
y_pred = model.predict(x_test)
print(y_pred)

print('=======confusion_matrix=======')
print(confusion_matrix(y_test, y_pred))
print('accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print(model.label_distributions_)
def labelspread(train_data, semi_data, train_label, semi_label, train_name,
                semi_name, lib, libname):
    """Re-label the ``semi`` samples with LabelSpreading and report agreement.

    Only samples whose label is contained in ``lib`` are used.  The kept
    training samples keep their labels, the kept ``semi`` samples are fed to
    the model as unlabeled (-1), and the model's predictions for them are
    compared with the labels given in ``semi_label``.

    Args:
        train_data, train_label, train_name: indexable, parallel sequences
            describing the labeled samples.
        semi_data, semi_label, semi_name: indexable, parallel sequences
            describing the samples to be re-labeled.
        lib: container of labels to keep (tested with ``in``).
        libname: tag printed in front of the statistics.

    Returns:
        Tuple ``(data, name, ourlabel, csvlabel)``: the stacked feature
        matrix (train first, then semi), the matching names, the labels with
        the semi part predicted by the model, and the labels with the semi
        part taken from ``semi_label``.
    """
    print("===========================")

    # Positions of the samples whose label belongs to `lib`.
    kept_train = [i for i in range(len(train_data)) if train_label[i] in lib]
    kept_semi = [i for i in range(len(semi_data)) if semi_label[i] in lib]

    # Features are divided by -80 before being handed to the model.
    train_d = np.array([train_data[i] for i in kept_train]) / -80.
    train_l = np.array([train_label[i] for i in kept_train])
    semi_d = np.array([semi_data[i] for i in kept_semi]) / -80.
    semi_l = np.array([semi_label[i] for i in kept_semi])
    name = np.array([train_name[i] for i in kept_train]
                    + [semi_name[i] for i in kept_semi])

    n_train = train_d.shape[0]
    n_total = n_train + semi_d.shape[0]
    print(libname, ' all num: ', n_total)
    print(libname, ' train num: ', n_train)
    print(libname, ' ratio: ', n_train / n_total)

    # Hide the semi labels from the model by marking them as -1.
    hidden = np.full(semi_l.shape[0], -1)
    label = np.concatenate((train_l, hidden), axis=0).astype('int')
    data = np.concatenate((train_d, semi_d), axis=0).astype('float')
    print('label size: ', label.shape)
    print('data size: ', data.shape)

    print('label propagation...')
    # model = LabelPropagation(kernel='knn',n_neighbors=5,max_iter=10000,tol=0.001,n_jobs=-1)
    model = LabelSpreading(kernel='rbf',
                           gamma=20,
                           alpha=0.2,
                           n_neighbors=5,
                           max_iter=100000,
                           tol=0.001,
                           n_jobs=20)
    model.fit(data, label)
    oursemi_l = model.predict(semi_d)

    ourlabel = np.concatenate((train_l, oursemi_l), axis=0)
    csvlabel = np.concatenate((train_l, semi_l), axis=0)
    print('our... ', ourlabel)
    print('csv... ', csvlabel)

    # Count positions where the model-based and the given labels agree; the
    # training part is identical in both arrays and therefore always counts.
    similarity = sum(1 for ours, given in zip(ourlabel, csvlabel)
                     if ours == given)
    print('new train num: ', similarity)
    print('ratio: ', similarity / len(ourlabel))
    return data, name, ourlabel, csvlabel
# Class indices of the test samples, recovered from their one-hot encoding.
test_labels = np.argmax(test_labels_one_hot, axis=1)

# Transductive setup: stack train and test samples (for structure
# exploitation) and give every test sample the "unlabeled" label -1.
x_all = np.concatenate((train_data, test_data))
test_labels_none = np.full(test_labels.shape[0], -1.0)
y_all = np.concatenate((train_labels, test_labels_none))

consist_model = LabelSpreading(gamma=4, max_iter=60)
consist_model.fit(x_all, y_all)
consist_pred = consist_model.predict(test_data)
clf.evaluate_sub('consistency model', test_labels, consist_pred)

# Supervised baselines, each trained on the labeled training data only and
# evaluated on the test set.
lgr_model = clf.classifier('LGR', train_data, train_labels)
clf.evaluate('LGR', lgr_model, test_data, test_labels)

knn_model = clf.classifier('KNN', train_data, train_labels)
clf.evaluate('KNN', knn_model, test_data, test_labels)

bnb_model = clf.classifier('BNB', train_data, train_labels)
clf.evaluate('BNB', bnb_model, test_data, test_labels)

svm_model = clf.classifier('SVM', train_data, train_labels)
clf.evaluate('SVM', svm_model, test_data, test_labels)

dtc_model = clf.classifier('DTC', train_data, train_labels)
clf.evaluate('DTC', dtc_model, test_data, test_labels)
def fit_with_clustering(self, X_l, y_l, X_u, y_u=None):
    """
    Initialize the classifier from labeled and unlabeled data, then refine
    it with EM on the unlabeled data.

    When ``y_u`` is not given, the classes of the unlabeled documents are
    first guessed with LabelSpreading (similarity with the labeled data).
    Label spreading works on dense matrices only, so that step is quite
    time consuming.

    Parameters
    ----------
    X_l : sparse matrix, labeled documents (rows) by words (columns).
    y_l : array-like, classes of the labeled documents.
    X_u : sparse matrix, unlabeled documents by words.
    y_u : array-like or None, optional initial classes for ``X_u``.

    Returns
    -------
    self, with ``clf``, ``log_lkh``, ``feature_log_prob_`` and ``coef_``
    updated. ``self.clf`` must expose ``fit``, ``predict``,
    ``predict_proba``, ``feature_log_prob_``, ``class_log_prior_`` and
    ``coef_``.
    """
    n_ul_docs = X_u.shape[0]  # number of unlabeled samples
    # Labeled + unlabeled documents; invariant for the whole procedure.
    X = vstack([X_l, X_u])

    def expected_log_lkh(model):
        """Expected log likelihood of the unlabeled docs under ``model``."""
        lp_w_c = model.feature_log_prob_  # log CP of word given class [n_classes, n_words]
        b_w_d = (X_u > 0).transpose().toarray()  # word presence [n_words, n_docs]
        gemm = get_blas_funcs("gemm", [lp_w_c, b_w_d])
        lp_d_c = gemm(alpha=1.0, a=lp_w_c, b=b_w_d)  # log CP of doc given class [n_classes, n_docs]
        lp_c = np.matrix(model.class_log_prior_).T  # log prob of classes [n_classes, 1]
        lp_c = np.repeat(lp_c, n_ul_docs, axis=1)  # repeat for each doc [n_classes, n_docs]
        lp_dc = lp_d_c + lp_c  # joint log prob of doc and class [n_classes, n_docs]
        p_c_d = model.predict_proba(X_u)  # weight of each class in each doc [n_docs, n_classes]
        gemm = get_blas_funcs("gemm", [p_c_d, lp_dc])
        return gemm(alpha=1.0, a=p_c_d, b=lp_dc).trace()

    # Initialization: assign classes to the unlabeled data if none are given.
    # `is None` is required here: `y_u == None` is evaluated element-wise
    # for arrays and cannot be used as a condition.
    if y_u is None:
        label_prop_model = LabelSpreading(kernel='rbf', max_iter=5, n_jobs=-1)
        y_u = np.array([-1.0] * n_ul_docs)
        y = np.concatenate((y_l, y_u), axis=0)
        label_prop_model.fit(X.toarray(), y)
        y_u = label_prop_model.predict(X_u.toarray())

    y = np.concatenate((y_l, y_u), axis=0)
    clf = deepcopy(self.clf)  # build new copy of classifier
    # Initial parameters come from the labeled data plus the initially
    # assigned classes of the unlabeled data.
    clf.fit(X, y)

    prev_log_lkh = self.log_lkh  # record log likelihood of previous EM iteration
    expectation = expected_log_lkh(clf)
    self.clf = deepcopy(clf)
    self.log_lkh = expectation
    if self.print_log_lkh:
        print("Initial expected log likelihood = %0.3f\n" % expectation)

    # Loop until log likelihood does not improve
    iter_count = 0  # count EM iteration
    while (self.log_lkh - prev_log_lkh >= self.tol
           and iter_count < self.max_iter):
        iter_count += 1
        if self.print_log_lkh:
            print("EM iteration #%d" % iter_count)  # debug

        # E-step: Estimate class membership of unlabeled documents
        y_u = clf.predict(X_u)

        # M-step: Re-estimate classifier parameters
        y = np.concatenate((y_l, y_u), axis=0)
        clf.fit(X, y)

        # check convergence: update log likelihood
        expectation = expected_log_lkh(clf)
        if self.print_log_lkh:
            print("\tExpected log likelihood = %0.3f" % expectation)

        if expectation - self.log_lkh >= self.tol:
            # Improvement: keep this classifier as the current best.
            prev_log_lkh = self.log_lkh
            self.log_lkh = expectation
            self.clf = deepcopy(clf)
        else:
            break

    self.feature_log_prob_ = self.clf.feature_log_prob_
    self.coef_ = self.clf.coef_
    return self
# Fit the scaler on the labeled training data.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler = scaler.fit(x_train_labeled)

# Scale the training data with the fitted scaler.
x_train_labeled_scaled = scaler.transform(x_train_labeled)
# x_test_scaled = scaler.transform(x_test)

# Stratified ten-fold cross validation.  `random_state` is intentionally not
# passed: it has no effect while shuffle=False, and current scikit-learn
# raises ValueError when it is combined with shuffle=False.
cv = StratifiedKFold(n_splits=10, shuffle=False)

# Train and evaluate one model per fold.
splits = cv.split(x_train_labeled_scaled, y_train_labeled)
for split_no, (train_index, val_index) in enumerate(splits, start=1):
    # create training and validation splits
    x_train, x_val = x_train_labeled_scaled[train_index], x_train_labeled_scaled[val_index]
    y_train, y_val = y_train_labeled[train_index], y_train_labeled[val_index]

    # my_kernel = polynomial_kernel(x_train, y_train, degree=5, gamma=None, coef0=1)

    # create model and fit data
    model = LabelSpreading(kernel=polynomial_kernel,
                           gamma=20,
                           alpha=0.2,
                           max_iter=1,
                           tol=0.001,
                           n_jobs=1)
    model = model.fit(x_train, y_train)

    # evaluate model
    y_pred = model.predict(x_val)
    acc = accuracy_score(y_val, y_pred)
    # Report the fold number rather than dumping the whole index array.
    print("Model Result: Split {} - Acc: {}".format(split_no, acc))
test_svm(x_all, y_all)

# Build a more selective dataset: reload the data and keep only the feature
# columns chosen by `best`; the selection is also saved to disk.
x_obs, y_obs, x_nuls = load_data()
keep = list(best.k_feature_idx_)
np.save('sfs_features', keep)
# keep = np.load('sfs_features.npy')
x_obs, x_nuls = x_obs[:, keep], x_nuls[:, keep]

# Apply LabelSpreading: fit on the observed samples and use the predictions
# as labels for the remaining ones.
label_spread = LabelSpreading(kernel='knn', alpha=0.8)
label_spread.fit(x_obs, y_obs)
x_all = np.concatenate([x_obs, x_nuls], axis=0)
y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)

x, y = shuffle(x_all, y_all, random_state=42)

# Balance classes 1..5 by keeping the same number of samples (the size of the
# smallest class) from each of them, in class order.
class_ids = range(1, 6)
smpnum = min(np.count_nonzero(y == i) for i in class_ids)
x_btr = np.concatenate([x[y == i][:smpnum] for i in class_ids])
y_btr = np.concatenate([y[y == i][:smpnum] for i in class_ids])

# Train and score an RBF SVM on an 80/20 split of the balanced data.
x_tr, x_te, y_tr, y_te = train_test_split(x_btr, y_btr, test_size=0.20)
mod = svm.SVC(kernel='rbf')
mod.fit(x_tr, y_tr)
mod.score(x_te, y_te)
nb_samples = 5000
nb_unlabeled = 1000

if __name__ == '__main__':
    # Create the dataset and mark its last `nb_unlabeled` samples as
    # unlabeled (-1).
    X, Y = make_classification(n_samples=nb_samples,
                               n_features=2,
                               n_informative=2,
                               n_redundant=0,
                               random_state=100)
    first_unlabeled = nb_samples - nb_unlabeled
    Y[first_unlabeled:nb_samples] = -1

    # Create and fit a LabelSpreading instance
    ls = LabelSpreading(kernel='rbf', gamma=10.0, alpha=0.2)
    ls.fit(X, Y)
    Y_final = ls.predict(X)

    # Show the final result: original labels on the left axis, labels
    # predicted by the model on the right axis.
    fig, ax = plt.subplots(1, 2, figsize=(18, 8))

    # Marker style shared by both panels, one entry per class.
    class_styles = (
        (0, {'color': '#88d7f0', 'marker': 's', 's': 100}),
        (1, {'color': '#55ffec', 'marker': 'o', 's': 100}),
    )

    for cls, style in class_styles:
        ax[0].scatter(X[Y == cls, 0], X[Y == cls, 1], **style)
    ax[0].scatter(X[Y == -1, 0], X[Y == -1, 1], color='r', marker='x', s=20)

    ax[0].set_xlabel(r'$x_0$')
    ax[0].set_ylabel(r'$x_1$')
    ax[0].set_title('Dataset')
    ax[0].grid()

    for cls, style in class_styles:
        ax[1].scatter(X[Y_final == cls, 0], X[Y_final == cls, 1], **style)
# Histogram of the known labels, one bin per integer class 0..10.
hist, bins = np.histogram(
    lables,
    bins=[-0.1, 0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1])
print(hist)
print(bins)
print(train_labeled.shape)
print(train_labeled[:, 0])

# Standardize both feature sets (each one independently).
train_unlabeled = sklearn.preprocessing.scale(train_unlabeled)
features = sklearn.preprocessing.scale(features)

lp = LabelSpreading(kernel='knn',
                    gamma=20,
                    n_neighbors=7,
                    alpha=0.2,
                    max_iter=50,
                    tol=0.01,
                    n_jobs=-1)

# Known labels followed by one -1 ("unlabeled") per unlabeled sample.  The
# padding is built in a single step and sized from the data instead of
# appending a hard-coded 21000 elements one at a time.
y = np.concatenate((lables, np.full(train_unlabeled.shape[0], -1)), axis=0)
all_data = np.concatenate((features, train_unlabeled), axis=0)

lp.fit(all_data, y)
Yresult = lp.predict(all_data)
# NOTE(review): this scores the model against its own predictions, so the
# printed value carries no information about accuracy - confirm the intent.
print(lp.score(all_data, Yresult))
np.savetxt('semiLabelsOfUnlabeled2.csv', Yresult, delimiter=",")
print cutdown_labels ''' [ 0 0 0 0 -1 -1 -1 0 0 0 -1 0 0 -1 -1 -1 0 0 0 -1 0 -1 -1 0 0 0 -1 0 0 -1 0 -1 -1 0 0 0 0 -1 0 0 -1 0 -1 0 -1 0 0 0 0 -1 1 1 1 1 1 1 -1 -1 -1 1 1 -1 1 1 -1 1 -1 1 -1 1 1 -1 -1 1 1 1 1 -1 1 -1 1 1 1 -1 1 1 1 1 1 1 -1 1 1 1 1 1 1 1 -1 -1 -1 2 2 2 2 -1 2 2 -1 -1 -1 -1 2 2 2 2 2 -1 2 2 2 2 2 -1 -1 2 2 2 -1 2 2 -1 -1 2 2 2 2 2 2 2 2 -1 2 2 -1 -1 2 2 -1 -1] ''' # fit LabelSpreading model label_propagation_model.fit(iris['data'], cutdown_labels) # quick test print 'y: ', full_labels[-1] print 'y_hat: ', label_propagation_model.predict(iris['data'][-1]) ''' y: 2 y_hat: [2] ''' # overall accuracy correct = 0.0 for i in range(len(iris['data'])): if label_propagation_model.predict(iris['data'][i])[0] == full_labels[i]: correct += 1 print 'Overall accuracy: ', correct/ len(iris['data']) ''' Overall accuracy: 0.98
# Mark the training samples listed in `idxs` as unlabeled (-1).
y = np.asarray(Y_train)
for i in idxs:
    y[i] = -1
Y_train = y

# Train model and print statistics (use 'knn' as kernel)
from sklearn.semi_supervised import LabelSpreading

model = LabelSpreading(kernel='knn', n_neighbors=10, max_iter=1000)
model = model.fit(X_train, Y_train)

accuracy_pct = round(100 * model.score(X_test, Y_test), 2)
print("Percentage of correct predictions = {}".format(accuracy_pct))

# Element-wise hit/miss flags for the test predictions.
predicted = model.predict(X_test)
pred = predicted == Y_test
n_correct = np.count_nonzero(pred)
n_incorrect = np.count_nonzero(np.logical_not(pred))
print("Correct: {}".format(n_correct), "/",
      "Incorrect: {}".format(n_incorrect))

# One row per test sample: predicted label, actual label, class probabilities
# rounded to two decimals.
n_test = Y_test.size
Z1 = predicted.reshape(n_test, 1)
Z2 = np.asarray(Y_test).reshape(n_test, 1)
Z3 = np.around(model.predict_proba(X_test), decimals=2)
data = np.concatenate((Z1, Z2, Z3), axis=1)
column_names = ["Predicted Label", "Actual Label",
                "Prob. Label = 0.0", "Prob. Label = 1.0"]
outcome = pd.DataFrame(data, columns=column_names)

# Keep only the rows where the prediction is wrong.
indicesToKeep = outcome["Predicted Label"] != outcome["Actual Label"]
print("False predictions with associated class probabilities:\n{}".format(
    outcome[indicesToKeep]))
server.sendmail("*****@*****.**", "*****@*****.**", msg)
server.quit()

# In[15]:

targets

# # Measuring effectiveness.

# In[26]:

from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score

t_pred = label_prop_model.predict(x_test)

# Same four metrics twice: macro-averaged first, then micro-averaged.
# Accuracy takes no averaging mode and is simply printed in both sections.
print("Metrics based on 50 hold-out points")
for heading, averaging in (("Macro", "macro"), ("\n\nMicro", "micro")):
    print(heading)
    print("accuracy: %f" % accuracy_score(t_test, t_pred))
    print("precision: %f" % precision_score(t_test, t_pred, average=averaging))
    print("recall: %f" % recall_score(t_test, t_pred, average=averaging))
    print("f1: %f" % f1_score(t_test, t_pred, average=averaging))

from sklearn import metrics
def main(argv):
    """Compare LabelPropagation and LabelSpreading on the given CSV files.

    Command-line options (parsed from ``argv``):
        -h          print usage and exit
        -i FILE     CSV whose first 10000 rows are used as unlabeled data
        -t FILE     CSV with ``id`` and ``class`` columns; split 80/20 into
                    labeled training data and test data
        -o FILE     output file receiving the predictions of both models

    Exits with status 2 on a parsing error or a missing option.  The final
    bar chart uses fixed bar heights, not values computed in this run.
    """
    paths = {'-i': None, '-t': None, '-o': None}

    try:
        opts, args = getopt.getopt(argv, "hi:t:o:")
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt in paths:
            paths[opt] = arg
        else:
            usage()
            print('Invalid argument %s' % opt)
            sys.exit(2)

    trainFile, testFile, outFile = paths['-i'], paths['-t'], paths['-o']
    if trainFile is None or testFile is None or outFile is None:
        print("Missing arguments")
        usage()
        sys.exit(2)

    facialData = pd.read_csv(trainFile)
    testData = pd.read_csv(testFile)
    testData.drop(columns=['id'], inplace=True)
    testData.reset_index(inplace=True, drop=True)

    # Binary targets: 1 for 'deceptive', 0 for everything else.
    labels = testData['class']
    classLabels = [1 if label == 'deceptive' else 0 for label in labels]
    testData.drop(columns=['class'], inplace=True)

    X_train, X_test, y_train, y_test = train_test_split(testData,
                                                        classLabels,
                                                        test_size=0.2,
                                                        stratify=classLabels,
                                                        random_state=42)

    # Temporarily attach the targets to plot the class balance.
    X_train.insert(1, "class", y_train)
    sns.countplot(x="class", data=X_train)
    X_train = X_train.drop(columns=['class'])

    def semi_supervised_inputs():
        """Return (features, targets): unlabeled rows first, then X_train."""
        unlabeled = facialData[:10000]
        targets = [-1] * len(unlabeled)
        targets.extend(y_train)
        frame = pd.concat([unlabeled, X_train],
                          sort=False,
                          ignore_index=True,
                          copy=False)
        return frame, targets

    # (model class, constructor arguments, console banner, file header,
    # file mode).  The first run truncates the output file, the second one
    # appends to it.
    runs = (
        (LabelPropagation, {}, "LABEL PROPAGATION:",
         "Label Propagation prediction\n", 'w'),
        (LabelSpreading, {'kernel': 'knn', 'n_neighbors': 15},
         "LABEL SPREADING:", "Label Spreading prediction\n", 'a'),
    )
    for factory, params, banner, header, mode in runs:
        model = factory(**params)
        inputData, labels = semi_supervised_inputs()
        model.fit(inputData, labels)
        yPred = model.predict(X_test)
        print(banner)
        metricNPlot(model, X_test, y_test, yPred)
        with open(outFile, mode) as f:
            f.write(header)
            for item in yPred:
                f.write("%s\n" % item)

    # Bar chart comparing the two models (heights are fixed constants).
    height = [0.8, 0.68]
    bars = ('Label Propagation', 'Label Spreading')
    y_pos = np.arange(len(bars))
    plt.title("Performance Comparison")
    plt.bar(y_pos, height, color=['cyan', 'red'])
    plt.xticks(y_pos, bars)
    plt.show()
def __call__(self, *args, **kwargs):
    """
    Augment the labels.

    Writes ``test_labels.txt`` for the test split, then, for every network
    in ``self.net_names``, every labeled fraction and every algorithm,
    writes a relabeled file
    ``<label_dir>/<alg>/<net_name>/labels_<tr_perc>.txt``.

    Keyword arguments:
        tr_percs: fractions used to split observations into labeled and
            unlabeled (default ``[0.02, 0.05, 0.1]``).
        algs: methods used to label the unlabeled observations; any of
            'gtg', 'svm', 'label_propagation', 'label_spreading',
            'harmonic', 'labels_only'
            (default ``['gtg', 'svm', 'labels_only']``).
        max_iter: number of iterations for 'gtg' (default 25).

    Raises:
        ValueError: for an unknown algorithm name, or when an algorithm
            leaves some observation without a label.
    """
    tr_percs = kwargs.pop('tr_percs', [0.02, 0.05, 0.1])
    algs = kwargs.pop('algs', ['gtg', 'svm', 'labels_only'])
    max_iter = kwargs.pop('max_iter', 25)

    os.makedirs(self.label_dir, exist_ok=True)

    # Dump "<path>,<label>" for every test sample.
    with open(osp.join(self.label_dir, 'test_labels.txt'), 'w') as dst:
        loader = prepare_loader(
            osp.join(self.splitting_dir, 'test.txt'),
            img_root=self.dset['src'],
            stats=self.dset['stats'],
            batch_size=1,
            shuffle=False,
        )
        for _, label, path in loader:
            dst.write(path[0] + ',' + str(label.item()) + '\n')

    for net_name in self.net_names:
        # NOTE: pickle.load executes arbitrary code from the file; only
        # feature files produced by this project should be placed here.
        with open(osp.join(self.feat_dir, 'train', net_name + '.pickle'),
                  'rb') as pkl:
            net_name, labels, features, fnames = pickle.load(pkl)
        labels = labels.ravel()

        # uncomment to debug code
        # labels = labels[:5000]
        # features = features[:5000]
        # fnames = fnames[:5000]

        # Similarity matrix of this network's features; built lazily and
        # shared by the algorithms that need it ('gtg', 'harmonic').
        W = None

        for tr_perc in tr_percs:
            labeled, unlabeled = equiclass_mapping(labels, tr_perc)
            for alg in algs:
                print(net_name + ' - ' + str(self.dset['nr_classes']) +
                      ' classes')

                # generate alg label file name
                alg_path = osp.join(self.label_dir, alg, net_name,
                                    'labels_{}.txt'.format(tr_perc))

                # Start from the known labels only: -1 (hard labels) or an
                # all-zero row (soft labels) marks "not labeled yet".
                if self.hard_labels:
                    alg_labels = np.full(labels.shape[0], -1)
                    alg_labels[labeled] = labels[labeled]
                else:
                    alg_labels = np.zeros(
                        (len(labels), self.dset['nr_classes']))
                    alg_labels[labeled,
                               labels[labeled].ravel().astype(int)] = 1.0

                # File names written next to the labels; only 'labels_only'
                # restricts them to the labeled subset.
                out_fnames = fnames

                if alg == 'gtg':
                    # predict labels with gtg
                    if W is None:
                        W = gtg.sim_mat(features, verbose=True)
                    ps = init_rand_probability(labels, labeled, unlabeled)
                    res = gtg.gtg(W, ps, max_iter=max_iter, labels=labels,
                                  U=unlabeled, L=labeled)
                    if self.hard_labels:
                        alg_labels[unlabeled] = res[unlabeled].argmax(axis=1)
                    else:
                        alg_labels[unlabeled] = res[unlabeled]

                elif alg == 'svm':
                    # predict labels with a linear SVM
                    lin_svm = svm.LinearSVC()
                    if self.hard_labels:
                        lin_svm.fit(features[labeled, :], labels[labeled])
                        svm_labels = lin_svm.predict(
                            features[unlabeled]).astype(int)
                    else:
                        # The number of folds cannot exceed the size of the
                        # smallest labeled class.
                        cv = min(
                            np.unique(labels[labeled],
                                      return_counts=True)[1].min(), 3)
                        clf = CalibratedClassifierCV(lin_svm, cv=cv)
                        clf.fit(features[labeled, :], labels[labeled])
                        svm_labels = clf.predict_proba(features[unlabeled])
                    alg_labels[unlabeled] = svm_labels

                elif alg in ('label_propagation', 'label_spreading'):
                    # predict labels with a graph-based semi-supervised model
                    if alg == 'label_propagation':
                        model = LabelPropagation(kernel='rbf', gamma=0.05,
                                                 max_iter=4000)
                    else:
                        model = LabelSpreading(kernel='rbf', gamma=0.05)
                    # Mask the unlabeled entries on a copy: `labels` is
                    # shared with the other algorithms and the following
                    # tr_perc splits and must keep the true classes.
                    masked_labels = labels.copy()
                    masked_labels[unlabeled] = -1
                    model.fit(features, masked_labels)
                    if self.hard_labels:
                        alg_labels[unlabeled] = model.predict(
                            features[unlabeled]).astype(int)
                    else:
                        alg_labels[unlabeled] = model.predict_proba(
                            features[unlabeled])

                elif alg == 'harmonic':
                    if W is None:
                        W = gtg.sim_mat(features, verbose=True)
                    soft_labels, hard_labels = harmonic_function(
                        W, labels, labeled, unlabeled)
                    if self.hard_labels:
                        label_harmonic = hard_labels
                    else:
                        label_harmonic = soft_labels
                    alg_labels[unlabeled] = label_harmonic

                elif alg == 'labels_only':
                    # generate labeled only file; the remaining algorithms
                    # in `algs` are still processed afterwards.
                    alg_labels = alg_labels[labeled]
                    out_fnames = [fnames[i] for i in labeled]

                else:
                    raise ValueError('algorithm \'' + alg +
                                     '\' not recognized.')

                os.makedirs(osp.dirname(alg_path), exist_ok=True)

                # Every observation written out must carry a label.
                if self.hard_labels:
                    n_missing = (alg_labels == -1).sum()
                else:
                    n_missing = (alg_labels.sum(axis=1) == 0.).sum()
                if n_missing > 0:
                    raise ValueError(
                        'There is some unlabeled observation, check \'' +
                        alg + '\' algorithm,')

                create_relabeled_file(out_fnames, alg_path, alg_labels,
                                      sep=',')