Пример #1
0
def do_label_propagation_after_kmeans(args):
    """ Applies label propagation to k-means clusters

    Scaffolds that already have a k-means cluster act as labelled points;
    every other scaffold found in the scaffolds table is treated as
    unlabelled (-1). A LabelSpreading model is fitted on the matrix returned
    by design_matrices.get_spectrums_coverage_matrix and the resulting
    assignments replace the contents of the k-means label propagation
    results table.

    @param args Object whose attribute fn_database identifies the database
    to open
    """
    log.info("Applying label propagataion to the k-mer spectrums")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        sql_command = """SELECT scaffold, cluster FROM {0} """.format(
            db.KmeansResultsTable)
        assigned_scaffolds = db.retrieve_data(sql_command)
        # calculate labels
        encoder = sklearn.preprocessing.LabelEncoder()
        known_labels = encoder.fit_transform(
            [r["cluster"] for r in assigned_scaffolds])
        log.debug("Labels %s", encoder.classes_)
        log.debug("Number of labels: %s", len(known_labels))
        # known_labels is aligned row by row with assigned_scaffolds, so the
        # encoded value can be read directly instead of re-encoding each row.
        scaffold2label_dict = {
            r["scaffold"]: label
            for r, label in zip(assigned_scaffolds, known_labels)
        }
        sql_command = """SELECT scaffold, coverage, spectrum
                     FROM {0} ORDER BY scaffold""".format(db.ScaffoldsTable)
        data = db.retrieve_data(sql_command)
        mat = design_matrices.get_spectrums_coverage_matrix(data)
        scaffolds = [r["scaffold"] for r in data]
        # -1 marks scaffolds without a k-means cluster (unknown label)
        all_labels = [scaffold2label_dict.get(s, -1) for s in scaffolds]

        clamping_factor = 0.5
        label_spread = label_propagation.LabelSpreading(kernel='knn',
                                                        n_neighbors=7,
                                                        alpha=clamping_factor)
        label_spread.fit(mat, all_labels)
        output_labels = label_spread.predict(mat)
        probabilities = label_spread.predict_proba(mat)
        # Decode every prediction in one call: inverse_transform expects a
        # 1-D array-like, a bare scalar is rejected by current scikit-learn.
        decoded_labels = encoder.inverse_transform(output_labels)

        if db.table_exists(db.KmeansLPResultsTable):
            db.drop_table(db.KmeansLPResultsTable)
        db.create_table(db.KmeansLPResultsTable, db.KmeansLPResultsFields,
                        db.KmeansLPResultsTypes)
        data = []
        for s, cluster, probs in zip(scaffolds, decoded_labels,
                                     probabilities):
            p = probs.max()
            if np.isnan(p):
                # no usable probability for this scaffold: leave unassigned
                data.append((s, defs.not_assigned, 0))
            else:
                data.append((s, cluster, p))
        db.store_data(db.KmeansLPResultsTable, data)
    finally:
        # release the database even when one of the steps above fails
        db.close()
    def _train(self):
        """Fit the semi-supervised pipeline on the stored training data.

        Reads self._train_features and self._train_outputs, fits a four-step
        pipeline (column dropper, mean-centring scaler, percentile feature
        selection with f_classif, k-NN LabelSpreading) and keeps two results
        on the instance: the estimator's transduction_ as self._transduction
        and the pipeline's bound predict method as self._model.
        """
        features = self._train_features
        targets = self._train_outputs

        # columns handed to the project's ColumnDropper transformer
        dropped_columns = (0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)

        dropper = transformers.ColumnDropper(columns=dropped_columns)
        # centre only: the standard deviation is left untouched
        scaler = preprocessing.StandardScaler(with_mean=True, with_std=False)
        selector = feature_selection.SelectPercentile(
            percentile=71,
            score_func=feature_selection.f_classif,
        )
        spreader = semi_supervised.LabelSpreading(
            kernel='knn',
            alpha=0.17,
            n_neighbors=7,
            n_jobs=-1,
        )

        model = pipeline.Pipeline([
            ('drop', dropper),
            ('scale', scaler),
            ('select', selector),
            ('estim', spreader),
        ])
        model.fit(features, targets)

        self._transduction = model.named_steps['estim'].transduction_
        self._model = model.predict
Пример #3
0
def do_label_propagation(mat, input_labels):
    """ do label propagation on a matrix of features

    @param mat Input matrix. Each row is a datapoint and each column a feature
    @param input_labels A list with the labels of the datapoints. If the label
    of a datapoint is not known, it must be -1
    @return the labels proposed for each point (a list, with
    defs.not_assigned where no probability could be computed) and an array
    with the probability of the proposed label for each datapoint (0 for
    the unassigned ones)

    """
    log.info("Doing label propagation with kmer-spectrums and coverage values")
    # The encoded labels are only logged for debugging; the model below is
    # fitted on the raw input labels.
    encoder = sklearn.preprocessing.LabelEncoder()
    known_labels = encoder.fit_transform(input_labels)
    log.debug("Different labels to propagate: %s",set(input_labels))
    log.debug("After fit transform: %s",set(known_labels))
    log.debug("data matrix %s",mat.shape)
    # NOTE(review): recent scikit-learn releases document alpha as lying
    # strictly inside (0, 1) - confirm the installed version accepts 1.
    clamping_factor = 1
    label_spread = label_propagation.LabelSpreading(kernel='knn', n_neighbors=7, alpha=clamping_factor)
    label_spread.fit(mat, input_labels)
    labs = label_spread.predict(mat)
    probs = label_spread.predict_proba(mat)

    # Highest class probability per datapoint; max() propagates NaN, so a row
    # containing NaN is flagged as unassigned.
    best = probs.max(axis=1)
    unassigned = np.isnan(best)
    probabilities = np.where(unassigned, 0.0, best)
    # The model was fitted on the raw labels, so its predictions are already
    # expressed in the caller's label space. Decoding them through the encoder
    # (which also counts -1 as a class) would shift or invalidate them.
    output_labels = [
        defs.not_assigned if missing else lab
        for lab, missing in zip(labs, unassigned)
    ]
    return output_labels, probabilities
Пример #4
0
    def do_label_propagation(self, input_labels):
        """ Same as label propagation but the coverage is part of the vector of features

        Reads every scaffold (ordered by name) from the scaffolds table, fits
        a LabelSpreading model on the matrix built by
        get_spectrums_coverage_matrix and stores one
        (scaffold, cluster, probability) row per scaffold in the label
        propagation results table, replacing any previous contents.
        self.scaffold2cluster_dict is updated with the same assignments.

        @param input_labels Labels of the scaffolds, one per row returned by
        the query. Unlabelled scaffolds are expected to carry -1, the marker
        LabelSpreading uses for unknown labels.
        """
        log.info("Doing label propagation with kmer-spectrums and coverage values")

        sql_command = """SELECT scaffold, spectrum, coverage
                         FROM {0}
                         ORDER BY scaffold""".format(self.db.ScaffoldsTable)
        data = self.db.retrieve_data(sql_command)
        scaffolds = [r["scaffold"] for r in data]
        mat = self.get_spectrums_coverage_matrix(data)

        # The encoded labels are only logged for debugging; the model is
        # fitted on the raw input labels.
        encoder = sklearn.preprocessing.LabelEncoder()
        known_labels = encoder.fit_transform(input_labels)

        log.debug("Different labels to propagate: %s",set(input_labels))
        log.debug("After fit transform: %s",set(known_labels))

        log.debug("data matrix %s",mat.shape)
        # NOTE(review): recent scikit-learn releases document alpha as lying
        # strictly inside (0, 1) - confirm the installed version accepts 1.
        clamping_factor = 1
        label_spread = label_propagation.LabelSpreading(kernel='knn', n_neighbors=7, alpha=clamping_factor)
        label_spread.fit(mat, input_labels)
        output_cluster_labels = label_spread.predict(mat)
        log.debug("Different output labels : %s",set(output_cluster_labels))
        probabilities = label_spread.predict_proba(mat)

        if self.db.get_table_exists(self.db.LabelPropagationResultsTable):
            self.db.drop_table(self.db.LabelPropagationResultsTable)
        self.db.create_label_propagation_results_table()

        # store the assignments in the database
        data = []
        for sc, lab, probs in zip(scaffolds, output_cluster_labels, probabilities):
            p = probs.max()
            if np.isnan(p):
                # no usable probability: unassigned, stored with probability
                # 0 rather than NaN
                g = -1
                p = 0
            else:
                g = lab
            self.scaffold2cluster_dict[sc] = g
            # store as genus the cluster index
            data.append((sc, g, p))
        self.db.store_data(self.db.LabelPropagationResultsTable, data)
        # Report completion through the logger rather than blocking on
        # keyboard input, which stalls unattended runs.
        log.info("label propagation. Data stored.")
Пример #5
0
    def predict(self, mask, ref, pst):
        """Spread outlier labels from strong seeds to their neighbourhood.

        A per-pixel score is derived from the features returned by
        self.generate_features(ref, pst). Its eroded support becomes the
        known class 1 and its dilated support the unknown region (-1).
        LabelSpreading then resolves the unknown pixels inside a larger
        dilated focus area. Pixels where ref or pst hold NaN along axis 2
        are reset to 0.

        Returns the label image with a trailing axis of length one.
        """
        from skimage import morphology
        from sklearn import semi_supervised

        ref_missing = np.isnan(ref).any(axis=2)
        pst_missing = np.isnan(pst).any(axis=2)
        missing = np.logical_or(ref_missing, pst_missing)

        features = self.generate_features(ref, pst)
        features[missing] = 0

        # product over the feature axis of (1 + positive part of each feature)
        score = np.prod(1 + np.clip(features, 0, None), axis=-1)
        score[~np.isfinite(score)] = 0

        # reference level: midpoint of the mean scores where mask is 2 and 5
        mean_mask2 = np.nanmean(score[mask == 2])
        mean_mask5 = np.nanmean(score[mask == 5])
        cutoff = 0.5 * (mean_mask2 + mean_mask5)

        # shift and rescale in place, then clamp to [0, 1]
        score -= max(1, cutoff)
        score /= min(1, np.nanmax(score))
        np.clip(score, 0, 1, out=score)

        certain = morphology.binary_erosion(score, morphology.diamond(6))
        undecided = morphology.binary_dilation(score, morphology.diamond(20))
        window = morphology.binary_dilation(score, morphology.diamond(30))

        # the score array is reused as the label image: 0, -1 (unknown), 1
        score[:, :] = 0
        score[undecided] = -1
        score[certain] = 1

        targets = score[window].reshape(-1)
        samples = features[window].reshape(-1, features.shape[-1])

        spreader = semi_supervised.LabelSpreading(
            kernel="knn",
            alpha=0.8,
            max_iter=100,
            n_neighbors=20,
            n_jobs=1,
        )
        spreader.fit(samples, targets)

        self.log(f"Iters: {spreader.n_iter_}")

        score[window] = spreader.transduction_
        score = score.reshape(mask.shape)

        score[missing] = 0

        return score[:, :, np.newaxis]
Пример #6
0
def main(spca_file_1, labels_file, output_file, n_neighbours, gamma, n_iter,
         **kwargs):
    """Spread labels over the samples of a CSV file and save the result.

    Both input files are read with their first column as index. The features
    come from spca_file_1 and the labels, flattened to one dimension, from
    labels_file. A k-NN LabelSpreading model is fitted and a comma separated
    table is written to output_file with, side by side: the transduced
    label, the predicted class probabilities, the fitted label
    distributions and the predicted label.

    n_neighbours, gamma and n_iter are forwarded to LabelSpreading as
    n_neighbors, gamma and max_iter. Extra keyword arguments are accepted
    and ignored.
    """
    df_1 = pd.read_csv(spca_file_1, index_col=0).to_numpy()
    y_gt = pd.read_csv(labels_file, index_col=0).to_numpy().ravel()
    print('Data loaded...')
    print('Spreading labels')
    prop = sm.LabelSpreading(kernel='knn',
                             n_neighbors=n_neighbours,
                             gamma=gamma,
                             alpha=0.7,
                             max_iter=n_iter,
                             n_jobs=-1)
    prop.fit(df_1, y_gt)
    # 1-D results become single columns so everything can be stacked
    # horizontally
    columns = (
        prop.transduction_.reshape(-1, 1),
        prop.predict_proba(df_1),
        prop.label_distributions_,
        prop.predict(df_1).reshape(-1, 1),
    )
    np.savetxt(output_file, np.hstack(columns), delimiter=',')
Пример #7
0
 def get_skl_estimator(self, **default_parameters):
     """Build a LabelSpreading estimator from the given keyword arguments."""
     estimator_class = semi_supervised.LabelSpreading
     return estimator_class(**default_parameters)
Пример #8
0
######   Shuffle the samples ########
rng = np.random.RandomState(0)
indices = np.arange(len(digits.data)) # set of sample indices
rng.shuffle(indices) # shuffle the sample indices
X = digits.data[indices]
y = digits.target[indices]
###### Build the index set of the unlabeled samples ####
n_labeled_points = int(len(y)/10) # only 10% of the samples are labeled
unlabeled_indices = np.arange(len(y))[n_labeled_points:] # the remaining 90% of the samples are unlabeled

X,y,unlabeled_indices

# Try out LabelSpreading
y_train=np.copy(y) # must copy: y is needed again later
y_train[unlabeled_indices]=-1 # mark the unlabeled samples with -1
clf=semi_supervised.LabelSpreading(max_iter=100,kernel='rbf',gamma=0.1)
clf.fit(X,y_train)
### Get the prediction accuracy
predicted_labels = clf.transduction_[unlabeled_indices] # predicted labels
true_labels = y[unlabeled_indices] # true labels
myML.Semi.showModelTest(clf,X[unlabeled_indices],true_labels)

# With the rbf kernel of LabelSpreading, test how prediction performance varies with alpha and gamma
y_train=np.copy(y) # must copy: y is needed again later
y_train[unlabeled_indices]=-1 # mark the unlabeled samples with -1
gammas=np.logspace(-2,2,num=10)
myML.plotML.PlotParam_Score(X,X[unlabeled_indices],y_train,y[unlabeled_indices],
                            "semi_supervised.LabelSpreading()",drawParam=1,logX=True,
                            gamma=gammas,max_iter=[100],kernel=['rbf'])

# With the knn kernel of LabelSpreading, test how prediction performance varies with alpha and n_neighbors
Пример #9
0
    def predict(self, mask, pre, pst):
        """Label strong positive and negative changes of the first feature.

        The first feature returned by self.generate_features(pre, pst) is
        thresholded at its 99th percentile (positive side) and at minus that
        value (negative side). Each thresholded mask is eroded into seed
        pixels, surrounded by a region of unknown labels, and resolved with
        LabelSpreading inside a larger focus area. Pixels where mask == 0 are
        never labelled.

        Returns an int8 array with a trailing axis of length one holding 0
        (nothing), 1 (positive change) or 2 (negative change). Where both
        passes claim a pixel, the negative label wins because it is written
        last.
        """
        from sklearn import semi_supervised
        from skimage import morphology

        F = self.generate_features(pre, pst)

        self.log(f"Features shape: {F.shape}")

        # zero every pixel that has a non-finite feature
        good = np.isfinite(F).all(axis=2)
        F[~good] = 0

        cutoff = np.nanpercentile(F[:, :, 0], 99, axis=(0, 1))

        def spread_from(candidates):
            """Return the boolean mask grown from the candidate pixels."""
            seed = morphology.binary_erosion(candidates, morphology.diamond(3))
            if not seed.any():
                # No seed survives the erosion: the focus area would be empty
                # and LabelSpreading cannot be fitted on zero samples.
                return np.zeros((F.shape[0], F.shape[1]), dtype=bool)

            unknown = morphology.binary_dilation(seed, morphology.diamond(20))
            focus = morphology.binary_dilation(seed, morphology.diamond(50))

            # 1 on the seeds, -1 (to be decided) around them, 0 elsewhere
            outliers = np.zeros((F.shape[0], F.shape[1]), dtype=np.int8)
            outliers[unknown] = -1
            outliers[seed] = 1

            X = F[focus].reshape((-1, F.shape[-1]))
            y = outliers[focus].reshape((-1, ))

            lblspread = semi_supervised.LabelSpreading(kernel="knn",
                                                       alpha=0.1,
                                                       max_iter=100,
                                                       n_neighbors=30,
                                                       n_jobs=-1)
            lblspread.fit(X, y)

            outliers[focus] = lblspread.transduction_
            outliers = outliers.reshape(mask.shape)

            outliers[mask == 0] = 0

            return morphology.binary_closing(outliers, morphology.diamond(5))

        lbls = np.zeros((F.shape[0], F.shape[1]), dtype=np.int8)

        # the positive pass is written first, the negative pass second
        lbls[spread_from(F[:, :, 0] > cutoff) == 1] = 1
        lbls[spread_from(F[:, :, 0] < -1 * cutoff) == 1] = 2

        return lbls[:, :, np.newaxis]
Пример #10
0
    def predict(self, mask, pre, pst):
        """Detect blobs in a change layer and spread outlier labels around them.

        A 2-D change layer is built from the first two features returned by
        self.generate_features(pre, pst). Blobs found in that layer seed the
        labelling: the core of every blob is marked 1, a wider disk around it
        -1 (unknown), and LabelSpreading decides the unknown pixels using all
        pixels inside an even wider focus disk.

        Returns an int8 array with a trailing axis of length one. It is all
        zeros when no blob is detected.

        NOTE(review): mask is used directly as an index, so it is presumably
        a boolean array that is True where pixels must be ignored - confirm
        against the caller.
        """
        from skimage import feature, draw, morphology
        from sklearn import semi_supervised

        X = self.generate_features(pre, pst)

        self.log(f'Features shape: {X.shape}')

        self.log('Masking data')

        # zero every pixel that has a non-finite feature
        good = np.isfinite(X).all(axis=2)
        X[~good] = 0

        # change layer: (1 + f0) * (1 + f1) - 1, kept only where f0 > 0
        bX = (X[:, :, 0] > 0) * ((1 + X[:, :, 0]) *
                                 (1 + X[:, :, 1]) - 1).astype(np.float64)
        #bX = ((1 + X[:, :, 0]) * (1 + X[:, :, 1]) - 1).astype(np.float64)

        bX[mask] = 0

        self.log('Change layer generated')

        # TODO: Parameterise
        # floating point warnings raised during blob detection are silenced
        with np.errstate(all='ignore'):
            blobs = feature.blob_doh(bX,
                                     min_sigma=5,
                                     max_sigma=30,
                                     overlap=0.9,
                                     threshold=0.008)

        self.log('Blob detection')

        # focus: pixels handed to LabelSpreading, disks of 5 * r per blob
        focus = np.zeros_like(bX, dtype=bool)
        for blob in blobs:
            y, x, r = blob
            rr, cc = draw.disk((y, x), r * 5, shape=focus.shape)
            focus[rr, cc] = True

        outliers = np.zeros_like(bX, dtype=np.int8)

        self.log(f"blobs: {blobs.shape[0]}")

        # nothing detected: return the all-zero label image
        if blobs.shape[0] == 0:
            return outliers[:, :, np.newaxis]

        # TODO: Parameterise radius

        # First pass paints every unknown disk (3 * r) ...
        for blob in blobs:
            y, x, r = blob
            rr, cc = draw.disk((y, x), r * 3, shape=outliers.shape)
            outliers[rr, cc] = -1

        # ... second pass paints the cores (r / 2), so a core is never
        # overwritten by the unknown disk of a neighbouring blob.
        for blob in blobs:
            y, x, r = blob
            rr, cc = draw.disk((y, x), r / 2, shape=outliers.shape)
            outliers[rr, cc] = 1

        outliers[mask] = 0

        self.log('Potential outliers masked')

        nunknown = np.count_nonzero(outliers == -1)
        ntotal = np.count_nonzero(focus)

        self.log(f"Spreading: {nunknown} / {ntotal} ({nunknown / ntotal:.4f})")

        # from here on y and X hold the labels and features of the focus
        # pixels (one row per pixel)
        y = outliers[focus].reshape((-1, ))
        X = X[focus].reshape((-1, X.shape[-1]))

        # TODO: Parametrise
        lblspread = semi_supervised.LabelSpreading(kernel="knn",
                                                   alpha=0.8,
                                                   max_iter=100,
                                                   n_neighbors=20,
                                                   n_jobs=1)
        lblspread.fit(X, y)

        self.log(f"Iters: {lblspread.n_iter_}")

        # write the transduced labels back into the focus pixels
        outliers[focus] = lblspread.transduction_
        outliers = outliers.reshape(bX.shape)

        self.log('Done.')

        return outliers[:, :, np.newaxis]
 def _apply_label_propagation_model(self, contexts, labels):
     """Fit LabelSpreading on the contexts and return its transduction_.

     The kernel is taken from self.affinity_func.
     """
     spreader = semi_supervised.LabelSpreading(kernel=self.affinity_func)
     return spreader.fit(contexts, labels).transduction_