def do_label_propagation_after_kmeans(args):
    """
    Applies label propagation to k-means clusters

    The scaffold -> cluster assignments are read from the k-means results
    table and the cluster values are encoded as integers. Every scaffold of
    the scaffolds table without an assignment is marked as unlabeled (-1) and
    the known labels are spread over the matrix returned by
    design_matrices.get_spectrums_coverage_matrix. The result (scaffold,
    cluster, probability) is stored in the k-means label propagation results
    table, which is dropped and recreated when it already exists.

    @param args Object whose fn_database attribute is handed to
                MetagenomeDatabase to open the database.
    """
    log.info("Applying label propagataion to the k-mer spectrums")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        sql_command = """SELECT scaffold, cluster FROM {0} """.format(
            db.KmeansResultsTable)
        assigned_scaffolds = db.retrieve_data(sql_command)

        # calculate labels
        encoder = sklearn.preprocessing.LabelEncoder()
        known_labels = encoder.fit_transform(
            [r["cluster"] for r in assigned_scaffolds])
        log.debug("Labels %s", encoder.classes_)
        log.debug("Number of labels: %s", len(known_labels))

        # known_labels is aligned row by row with assigned_scaffolds, so the
        # encoded value can be reused instead of calling transform() per row.
        scaffold2label_dict = dict(
            zip((r["scaffold"] for r in assigned_scaffolds), known_labels))

        sql_command = """SELECT scaffold, coverage, spectrum FROM {0} ORDER BY scaffold""".format(db.ScaffoldsTable)
        data = db.retrieve_data(sql_command)
        mat = design_matrices.get_spectrums_coverage_matrix(data)
        scaffolds = [r["scaffold"] for r in data]
        # -1 is the marker LabelSpreading uses for an unknown label
        all_labels = [scaffold2label_dict.get(s, -1) for s in scaffolds]

        clamping_factor = 0.5
        label_spread = label_propagation.LabelSpreading(
            kernel='knn', n_neighbors=7, alpha=clamping_factor)
        label_spread.fit(mat, all_labels)
        output_labels = label_spread.predict(mat)
        probabilities = label_spread.predict_proba(mat)

        if db.table_exists(db.KmeansLPResultsTable):
            db.drop_table(db.KmeansLPResultsTable)
        db.create_table(db.KmeansLPResultsTable, db.KmeansLPResultsFields,
                        db.KmeansLPResultsTypes)

        # Decode all predictions in one call: inverse_transform expects a
        # 1-D array, a bare scalar is rejected by current scikit-learn.
        decoded_labels = encoder.inverse_transform(output_labels)
        data = []
        for s, cluster, probs in zip(scaffolds, decoded_labels, probabilities):
            p = probs.max()
            if np.isnan(p):
                # no usable probability for this scaffold
                data.append((s, defs.not_assigned, 0))
            else:
                data.append((s, cluster, p))
        db.store_data(db.KmeansLPResultsTable, data)
    finally:
        # release the database even when one of the steps above fails
        db.close()
def _train(self):
    """Fit the drop -> scale -> select -> LabelSpreading pipeline.

    The pipeline is fitted on self._train_features / self._train_outputs.
    Afterwards the estimator's transduction_ is kept in self._transduction
    and the fitted pipeline's predict method in self._model.
    """
    dropped_columns = (0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)

    column_dropper = transformers.ColumnDropper(columns=dropped_columns)
    # centre the features only; the variance is left untouched
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=False)
    selector = feature_selection.SelectPercentile(
        percentile=71,
        score_func=feature_selection.f_classif,
    )
    spreader = semi_supervised.LabelSpreading(
        kernel='knn',
        alpha=0.17,
        n_neighbors=7,
        n_jobs=-1,
    )

    model = pipeline.Pipeline([
        ('drop', column_dropper),
        ('scale', scaler),
        ('select', selector),
        ('estim', spreader),
    ])
    model.fit(self._train_features, self._train_outputs)

    self._transduction = model.named_steps['estim'].transduction_
    self._model = model.predict
def do_label_propagation(mat, input_labels):
    """
    do label propagation on a matrix of features

    @param mat Input matrix. Each row is a datapoint and each column a feature
    @param input_labels A list with the labels of the datapoints. If the
           label of a datapoint is not known, it must be -1
    @return the labels proposed for each point (a list; defs.not_assigned
            where the best probability is NaN) and an array with the
            probability of the proposed label for each datapoint (0 where it
            is NaN)
    """
    log.info("Doing label propagation with kmer-spectrums and coverage values")
    # The encoder is only used to report the label set in the debug log. The
    # model below is fitted on the raw input_labels, so its predictions are
    # already expressed in the caller's label space.
    encoder = sklearn.preprocessing.LabelEncoder()
    known_labels = encoder.fit_transform(input_labels)
    log.debug("Different labels to propagate: %s", set(input_labels))
    log.debug("After fit transform: %s", set(known_labels))
    log.debug("data matrix %s", mat.shape)

    # NOTE(review): recent scikit-learn releases document alpha as a value
    # strictly between 0 and 1 - confirm that 1 is accepted by the version
    # this project pins.
    clamping_factor = 1
    label_spread = label_propagation.LabelSpreading(
        kernel='knn', n_neighbors=7, alpha=clamping_factor)
    label_spread.fit(mat, input_labels)
    labs = label_spread.predict(mat)
    probs = label_spread.predict_proba(mat)

    best = probs.max(axis=1)
    unassigned = np.isnan(best)
    # Return the predictions as they are. Passing them through
    # encoder.inverse_transform would be wrong: the encoder was fitted on
    # labels that include -1, so its codes are shifted with respect to the
    # values the model was trained on.
    output_labels = [
        defs.not_assigned if missing else lab
        for lab, missing in zip(labs, unassigned)
    ]
    probabilities = np.where(unassigned, 0.0, best).astype(float)
    return output_labels, probabilities
def do_label_propagation(self, input_labels):
    """
    Spread input_labels over all scaffolds of the scaffolds table.

    The feature matrix comes from self.get_spectrums_coverage_matrix, so the
    coverage is part of the feature vector. The label propagation results
    table is recreated and filled with (scaffold, cluster, probability) rows;
    self.scaffold2cluster_dict is updated with the same assignments. A
    scaffold whose best probability is NaN gets cluster -1.
    """
    log.info("Doing label propagation with kmer-spectrums and coverage values")
    query = """SELECT scaffold, spectrum, coverage FROM {0} ORDER BY scaffold""".format(self.db.ScaffoldsTable)
    rows = self.db.retrieve_data(query)
    scaffolds = [row["scaffold"] for row in rows]
    mat = self.get_spectrums_coverage_matrix(rows)

    # the encoded labels are only reported in the debug log
    label_encoder = sklearn.preprocessing.LabelEncoder()
    encoded = label_encoder.fit_transform(input_labels)
    log.debug("Different labels to propagate: %s", set(input_labels))
    log.debug("After fit transform: %s", set(encoded))
    log.debug("data matrix %s", mat.shape)

    clamping_factor = 1
    spreader = label_propagation.LabelSpreading(
        kernel='knn', n_neighbors=7, alpha=clamping_factor)
    spreader.fit(mat, input_labels)
    predicted = spreader.predict(mat)
    log.debug("Different output labels : %s", set(predicted))
    probabilities = spreader.predict_proba(mat)

    results_table = self.db.LabelPropagationResultsTable
    if self.db.get_table_exists(results_table):
        self.db.drop_table(results_table)
    self.db.create_label_propagation_results_table()

    # store the assignments in the database
    records = []
    for scaffold, label, row_probs in zip(scaffolds, predicted, probabilities):
        top = row_probs.max()
        cluster = -1 if np.isnan(top) else label
        # store as genus the cluster index
        self.scaffold2cluster_dict[scaffold] = cluster
        records.append((scaffold, cluster, top))
    self.db.store_data(self.db.LabelPropagationResultsTable, records)
    raw_input("label propagation. Data stored. Press ENTER")
def predict(self, mask, ref, pst):
    """Score outliers from the features of ref/pst and refine them with label spreading.

    An outlier score is derived from the clipped features, thresholded with a
    cutoff taken from the pixels where mask is 2 and 5, and turned into
    known / unknown / focus regions by morphological erosion and dilation.
    LabelSpreading is then fitted on the pixels inside the focus region and
    its transduction_ is written back into the score array.

    Returns the score array with a new trailing axis; pixels flagged as
    no-data are 0.
    """
    from sklearn import semi_supervised
    from skimage import morphology
    # a pixel counts as no-data when any value along axis 2 of ref or pst is NaN
    nodata = np.logical_or( np.isnan(ref).any(axis=2), np.isnan(pst).any(axis=2))
    X = self.generate_features(ref, pst)
    X[nodata] = 0
    # product over the last axis of (1 + feature), negative features clipped to 0
    outliers = np.prod(1 + np.clip(X, 0, None), axis=-1)
    outliers[~np.isfinite(outliers)] = 0
    # presumably mask classes 2 and 5 are cloud and water, going by the
    # variable names - verify against the mask definition
    cloud = np.nanmean(outliers[mask == 2])
    water = np.nanmean(outliers[mask == 5])
    cutoff = 0.5 * (cloud + water)
    # shift by at least 1, rescale when the maximum is below 1, clip to [0, 1]
    # NOTE(review): the divisor is not guarded against a maximum <= 0
    outliers -= max(1, cutoff)
    outliers /= min(1, np.nanmax(outliers))
    outliers = np.clip(outliers, 0, 1, out=outliers)
    # eroded core -> known outliers; the two dilations give the unlabeled
    # ring and the wider region the classifier is fitted on
    known = morphology.binary_erosion(outliers, morphology.diamond(6))
    unknown = morphology.binary_dilation(outliers, morphology.diamond(20))
    focus = morphology.binary_dilation(outliers, morphology.diamond(30))
    # from here on outliers holds labels: 0, -1 (unlabeled) and 1 (known)
    outliers[:, :] = 0
    outliers[unknown] = -1
    outliers[known] = 1
    y = outliers[focus].reshape((-1, ))
    X = X[focus].reshape((-1, X.shape[-1]))
    lblspread = semi_supervised.LabelSpreading(kernel="knn", alpha=0.8, max_iter=100, n_neighbors=20, n_jobs=1)
    lblspread.fit(X, y)
    self.log(f"Iters: {lblspread.n_iter_}")
    # replace the labels inside the focus region by the spread result
    outliers[focus] = lblspread.transduction_
    outliers = outliers.reshape(mask.shape)
    outliers[nodata] = 0
    return outliers[:, :, np.newaxis]
def main(spca_file_1, labels_file, output_file, n_neighbours, gamma, n_iter, **kwargs):
    """Spread the labels of labels_file over the samples of spca_file_1.

    Both inputs are CSV files whose first column is used as the index. The
    result is written to output_file as comma separated text with, side by
    side, the transduced label, the output of predict_proba, the fitted
    label_distributions_ and the output of predict.

    Args:
        spca_file_1: path of the CSV file holding the feature matrix.
        labels_file: path of the CSV file holding the labels; it is
            flattened to one dimension.
        output_file: path the result is written to.
        n_neighbours: passed to LabelSpreading as n_neighbors.
        gamma: passed to LabelSpreading as gamma.
        n_iter: passed to LabelSpreading as max_iter.
        **kwargs: accepted and ignored.
    """
    features = pd.read_csv(spca_file_1, index_col=0).to_numpy()
    y_gt = pd.read_csv(labels_file, index_col=0).to_numpy().ravel()
    print('Data loaded...')

    print('Spreading labels')
    # NOTE(review): scikit-learn documents gamma as the rbf kernel parameter;
    # with kernel='knn' it presumably has no effect - confirm.
    prop = sm.LabelSpreading(kernel='knn', n_neighbors=n_neighbours,
                             gamma=gamma, alpha=0.7, max_iter=n_iter,
                             n_jobs=-1)
    prop.fit(features, y_gt)

    # column vectors so that everything can be stacked horizontally
    output = prop.transduction_.reshape(-1, 1)
    output_prob = prop.predict_proba(features)
    output_dist = prop.label_distributions_
    output_pred = prop.predict(features).reshape(-1, 1)
    np.savetxt(output_file,
               np.hstack((output, output_prob, output_dist, output_pred)),
               delimiter=',')
def get_skl_estimator(self, **default_parameters):
    """Return a LabelSpreading estimator built from the given keyword parameters."""
    estimator_cls = semi_supervised.LabelSpreading
    return estimator_cls(**default_parameters)
###### 混洗样本 ######## rng = np.random.RandomState(0) indices = np.arange(len(digits.data)) # 样本下标集合 rng.shuffle(indices) # 混洗样本下标集合 X = digits.data[indices] y = digits.target[indices] ###### 生成未标记样本的下标集合 #### n_labeled_points = int(len(y)/10) # 只有 10% 的样本有标记 unlabeled_indices = np.arange(len(y))[n_labeled_points:] # 后面 90% 的样本未标记 X,y,unlabeled_indices # 测试 LabelSpreading 的用法 y_train=np.copy(y) # 必须拷贝,后面要用到 y y_train[unlabeled_indices]=-1 # 未标记样本的标记设定为 -1 clf=semi_supervised.LabelSpreading(max_iter=100,kernel='rbf',gamma=0.1) clf.fit(X,y_train) ### 获取预测准确率 predicted_labels = clf.transduction_[unlabeled_indices] # 预测标记 true_labels = y[unlabeled_indices] # 真实标记 myML.Semi.showModelTest(clf,X[unlabeled_indices],true_labels) # 测试 LabelSpreading 的 rbf 核时,预测性能随 alpha 和 gamma 的变化 y_train=np.copy(y) # 必须拷贝,后面要用到 y y_train[unlabeled_indices]=-1 # 未标记样本的标记设定为 -1 gammas=np.logspace(-2,2,num=10) myML.plotML.PlotParam_Score(X,X[unlabeled_indices],y_train,y[unlabeled_indices], "semi_supervised.LabelSpreading()",drawParam=1,logX=True, gamma=gammas,max_iter=[100],kernel=['rbf']) # 测试 LabelSpreading 的 knn 核时,预测性能随 alpha 和 n_neighbors 的变化
def predict(self, mask, pre, pst):
    """Label strong positive and negative changes of the first feature.

    Seeds are the pixels whose first feature lies above the 99th percentile
    (label 1) or below the negated percentile (label 2). Each seed set is
    eroded, surrounded by an unlabeled ring and a wider fitting region, and
    refined with LabelSpreading. Pixels where mask is 0 are never labeled.
    The second pass overrides the first where both claim a pixel.

    Returns an int8 array with a new trailing axis holding 0 (no change),
    1 or 2. A pass whose seed set is empty after erosion is skipped, since
    there is nothing to spread and LabelSpreading cannot be fitted on zero
    samples.
    """
    from sklearn import semi_supervised
    from skimage import morphology

    F = self.generate_features(pre, pst)
    self.log(f"Features shape: {F.shape}")
    good = np.isfinite(F).all(axis=2)
    F[~good] = 0

    change = F[:, :, 0]
    cutoff = np.nanpercentile(change, 99, axis=(0, 1))
    lbls = np.zeros(change.shape, dtype=np.int8)

    def spread(seed):
        """Grow the seed mask with label spreading; None when it is empty."""
        seed = morphology.binary_erosion(seed, morphology.diamond(3))
        if not seed.any():
            return None
        unknown = morphology.binary_dilation(seed, morphology.diamond(20))
        focus = morphology.binary_dilation(seed, morphology.diamond(50))
        # 1 = seed, -1 = unlabeled ring around it, 0 = everything else
        outliers = np.zeros(seed.shape, dtype=np.int8)
        outliers[unknown] = -1
        outliers[seed] = 1
        X = F[focus].reshape((-1, F.shape[-1]))
        y = outliers[focus].reshape((-1, ))
        lblspread = semi_supervised.LabelSpreading(
            kernel="knn", alpha=0.1, max_iter=100, n_neighbors=30, n_jobs=-1)
        lblspread.fit(X, y)
        outliers[focus] = lblspread.transduction_
        outliers = outliers.reshape(mask.shape)
        outliers[mask == 0] = 0
        return morphology.binary_closing(outliers, morphology.diamond(5))

    # F is not modified by spread(), so both seed masks can be built up front
    passes = ((change > cutoff, 1), (change < -1 * cutoff, 2))
    for seed, value in passes:
        region = spread(seed)
        if region is not None:
            lbls[region == 1] = value
    return lbls[:, :, np.newaxis]
def predict(self, mask, pre, pst):
    """Detect change blobs and refine them with label spreading.

    A change layer is built from the first two features, blobs are detected
    on it and every blob is turned into three discs: a small one labeled 1,
    a medium one left unlabeled (-1) and a large one that delimits the
    pixels LabelSpreading is fitted on. Returns an int8 array with a new
    trailing axis; it is all zeros when no blob is found.
    """
    from skimage import feature, draw, morphology
    from sklearn import semi_supervised

    X = self.generate_features(pre, pst)
    self.log(f'Features shape: {X.shape}')

    self.log('Masking data')
    finite = np.isfinite(X).all(axis=2)
    X[np.logical_not(finite)] = 0
    first = X[:, :, 0]
    second = X[:, :, 1]
    # combined change, kept only where the first feature is positive
    bX = (first > 0) * ((1 + first) * (1 + second) - 1).astype(np.float64)
    bX[mask] = 0
    self.log('Change layer generated')

    # TODO: Parameterise
    with np.errstate(all='ignore'):
        blobs = feature.blob_doh(bX, min_sigma=5, max_sigma=30, overlap=0.9, threshold=0.008)
    self.log('Blob detection')

    def stamp(target, scale, value):
        """Write value into a disc of scale * radius around every blob."""
        for row, col, radius in blobs:
            rr, cc = draw.disk((row, col), radius * scale, shape=target.shape)
            target[rr, cc] = value

    focus = np.zeros_like(bX, dtype=bool)
    stamp(focus, 5, True)

    outliers = np.zeros_like(bX, dtype=np.int8)
    n_blobs = blobs.shape[0]
    self.log(f"blobs: {n_blobs}")
    if n_blobs == 0:
        return outliers[:, :, np.newaxis]

    # TODO: Parameterise radius
    # all unlabeled discs are drawn before the labeled cores so that a core
    # is never overwritten by a neighbouring blob's ring
    stamp(outliers, 3, -1)
    stamp(outliers, 0.5, 1)
    outliers[mask] = 0
    self.log('Potential outliers masked')

    n_unlabeled = np.count_nonzero(outliers == -1)
    n_focus = np.count_nonzero(focus)
    self.log(f"Spreading: {n_unlabeled} / {n_focus} ({n_unlabeled / n_focus:.4f})")

    labels = outliers[focus].reshape((-1, ))
    samples = X[focus].reshape((-1, X.shape[-1]))
    # TODO: Parametrise
    lblspread = semi_supervised.LabelSpreading(kernel="knn", alpha=0.8, max_iter=100, n_neighbors=20, n_jobs=1)
    lblspread.fit(samples, labels)
    self.log(f"Iters: {lblspread.n_iter_}")

    outliers[focus] = lblspread.transduction_
    outliers = outliers.reshape(bX.shape)
    self.log('Done.')
    return outliers[:, :, np.newaxis]
def _apply_label_propagation_model(self, contexts, labels):
    """Fit LabelSpreading with self.affinity_func as kernel and return its transduction_."""
    spreader = semi_supervised.LabelSpreading(kernel=self.affinity_func)
    # fit() returns the fitted estimator itself
    return spreader.fit(contexts, labels).transduction_