Пример #1
0
def do_label_propagation_after_kmeans(args):
    """ Applies label propagation to k-means clusters

    The k-means cluster stored for each scaffold is used as its known label;
    scaffolds without a stored cluster are marked as unlabeled (-1). A
    LabelSpreading model (knn kernel, 7 neighbors, alpha 0.5) is fitted on
    the spectrums-coverage matrix of all scaffolds, and the table
    db.KmeansLPResultsTable is recreated with one
    (scaffold, cluster, probability) row per scaffold.

    args: object whose fn_database attribute is passed to
          MetagenomeDatabase.MetagenomeDatabase to open the database.
    """
    log.info("Applying label propagataion to the k-mer spectrums")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    # try/finally so the database is closed even when a step below fails
    try:
        sql_command = """SELECT scaffold, cluster FROM {0} """.format(
            db.KmeansResultsTable)
        assigned_scaffolds = db.retrieve_data(sql_command)
        # calculate labels
        encoder = sklearn.preprocessing.LabelEncoder()
        known_labels = encoder.fit_transform(
            [r["cluster"] for r in assigned_scaffolds])
        log.debug("Labels %s", encoder.classes_)
        log.debug("Number of labels: %s", len(known_labels))
        # known_labels is aligned with assigned_scaffolds, so the encoded
        # label of each row is reused instead of re-encoding it row by row.
        scaffold2label_dict = dict()
        for r, label in zip(assigned_scaffolds, known_labels):
            scaffold2label_dict[r["scaffold"]] = label
        sql_command = """SELECT scaffold, coverage, spectrum
                     FROM {0} ORDER BY scaffold""".format(db.ScaffoldsTable)
        data = db.retrieve_data(sql_command)
        mat = design_matrices.get_spectrums_coverage_matrix(data)
        scaffolds = [r["scaffold"] for r in data]
        # -1 is the "unknown label" marker for scaffolds without a cluster
        all_labels = [scaffold2label_dict.get(s, -1) for s in scaffolds]

        clamping_factor = 0.5
        label_spread = label_propagation.LabelSpreading(kernel='knn',
                                                        n_neighbors=7,
                                                        alpha=clamping_factor)
        label_spread.fit(mat, all_labels)
        output_labels = label_spread.predict(mat)
        probabilities = label_spread.predict_proba(mat)
        # Decode every predicted label in a single call: inverse_transform
        # works on arrays of labels, and a scalar argument is rejected by
        # recent scikit-learn releases.
        clusters = encoder.inverse_transform(output_labels)

        if db.table_exists(db.KmeansLPResultsTable):
            db.drop_table(db.KmeansLPResultsTable)
        db.create_table(db.KmeansLPResultsTable, db.KmeansLPResultsFields,
                        db.KmeansLPResultsTypes)
        data = []
        for s, cluster, probs in zip(scaffolds, clusters, probabilities):
            p = probs.max()
            if np.isnan(p):
                # no usable probability: store the scaffold as not assigned
                data.append((s, defs.not_assigned, 0))
            else:
                data.append((s, cluster, p))
        db.store_data(db.KmeansLPResultsTable, data)
    finally:
        db.close()
Пример #2
0
 def use_spectrums_coverage_matrix(self):
     """ Select the spectrums-coverage matrix as the design matrix.

         Reads coverage, scaffold and spectrum from the scaffolds table
         (ordered by scaffold), stores the matrix built from those rows in
         self.mat and the scaffold names of the rows in self.scaffolds.
     """
     query_template = """SELECT coverage, scaffold, spectrum
                  FROM {0} ORDER BY scaffold"""
     query = query_template.format(self.db.ScaffoldsTable)
     rows = self.db.retrieve_data(query)
     self.mat = design_matrices.get_spectrums_coverage_matrix(rows)
     names = []
     for row in rows:
         names.append(row["scaffold"])
     self.scaffolds = names
Пример #3
0
def do_label_propagation_after_kmeans(args):
    """ Applies label propagation to k-means clusters

    The k-means cluster stored for each scaffold is used as its known label;
    scaffolds without a stored cluster are marked as unlabeled (-1). A
    LabelSpreading model (knn kernel, 7 neighbors, alpha 0.5) is fitted on
    the spectrums-coverage matrix of all scaffolds, and the table
    db.KmeansLPResultsTable is recreated with one
    (scaffold, cluster, probability) row per scaffold.

    args: object whose fn_database attribute is passed to
          MetagenomeDatabase.MetagenomeDatabase to open the database.
    """
    log.info("Applying label propagataion to the k-mer spectrums")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    # try/finally so the database is closed even when a step below fails
    try:
        sql_command = """SELECT scaffold, cluster FROM {0} """.format(db.KmeansResultsTable)
        assigned_scaffolds = db.retrieve_data(sql_command)
        # calculate labels
        encoder = sklearn.preprocessing.LabelEncoder()
        known_labels = encoder.fit_transform([r["cluster"] for r in assigned_scaffolds])
        log.debug("Labels %s", encoder.classes_)
        log.debug("Number of labels: %s", len(known_labels))
        # known_labels is aligned with assigned_scaffolds, so the encoded
        # label of each row is reused instead of re-encoding it row by row.
        scaffold2label_dict = dict()
        for r, label in zip(assigned_scaffolds, known_labels):
            scaffold2label_dict[r["scaffold"]] = label
        sql_command = """SELECT scaffold, coverage, spectrum
                     FROM {0} ORDER BY scaffold""".format(db.ScaffoldsTable)
        data = db.retrieve_data(sql_command)
        mat = design_matrices.get_spectrums_coverage_matrix(data)
        scaffolds = [r["scaffold"] for r in data]
        # -1 is the "unknown label" marker for scaffolds without a cluster
        all_labels = [scaffold2label_dict.get(s, -1) for s in scaffolds]

        clamping_factor = 0.5
        label_spread = label_propagation.LabelSpreading(kernel='knn', n_neighbors=7, alpha=clamping_factor)
        label_spread.fit(mat, all_labels)
        output_labels = label_spread.predict(mat)
        probabilities = label_spread.predict_proba(mat)
        # Decode every predicted label in a single call: inverse_transform
        # works on arrays of labels, and a scalar argument is rejected by
        # recent scikit-learn releases.
        clusters = encoder.inverse_transform(output_labels)

        if db.table_exists(db.KmeansLPResultsTable):
            db.drop_table(db.KmeansLPResultsTable)
        db.create_table(db.KmeansLPResultsTable, db.KmeansLPResultsFields, db.KmeansLPResultsTypes)
        data = []
        for s, cluster, probs in zip(scaffolds, clusters, probabilities):
            p = probs.max()
            if np.isnan(p):
                # no usable probability: store the scaffold as not assigned
                data.append((s, defs.not_assigned, 0))
            else:
                data.append((s, cluster, p))
        db.store_data(db.KmeansLPResultsTable, data)
    finally:
        db.close()
Пример #4
0
    def use_spectrums_coverage_matrix(self):
        """ Select the spectrums-coverage matrix as the design matrix.

            Reads coverage, scaffold and spectrum from the scaffolds table
            (ordered by scaffold), stores the matrix built from those rows in
            self.mat and the scaffold names of the rows in self.scaffolds.
        """
        query_template = """SELECT coverage, scaffold, spectrum
                     FROM {0} ORDER BY scaffold"""
        query = query_template.format(self.db.ScaffoldsTable)
        rows = self.db.retrieve_data(query)
        self.mat = design_matrices.get_spectrums_coverage_matrix(rows)
        names = []
        for row in rows:
            names.append(row["scaffold"])
        self.scaffolds = names