def plot_label_propagation(args):
    """
    Plot the scaffolds colored by their label-propagation genus.

    Coverage, GC content and length are read for every scaffold that has a
    row in the label-propagation results table. A scaffold keeps its genus
    only when the stored probability is strictly greater than args.lbl_prob;
    otherwise it is plotted as defs.not_assigned.

    @param args Parsed options. Reads args.fn_database (database to open),
                args.lbl_prob (probability cutoff) and args.fn_plot (handed
                to Plots.fig3).
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query = """SELECT {0}.coverage, {0}.GC, {0}.length, {1}.genus, {1}.probability
                 FROM {0}
                 INNER JOIN {1}
                 WHERE {0}.scaffold = {1}.scaffold
              """.format(db.ScaffoldsTable, db.LabelPropagationResultsTable)
    rows = db.retrieve_data(query)
    db.close()

    coverages, gcs, lengths, genera = [], [], [], []
    for row in rows:
        confident = row["probability"] > args.lbl_prob
        genera.append(row["genus"] if confident else defs.not_assigned)
        coverages.append(row["coverage"])
        gcs.append(row["GC"])
        lengths.append(row["length"])
    # fig3 is the test plot (coverage vs coverage/gcs)
    Plots.fig3(coverages, gcs, lengths, genera, args.fn_plot)
def plot_kmeans_clusters(args):
    """
    Plot the scaffolds colored by the k-means cluster they belong to.

    Joins the scaffolds table with the k-means results table (ordered by
    scaffold) and hands coverage, GC content, length and cluster of every
    scaffold to Plots.fig2.

    @param args Parsed options. Reads args.fn_database and args.fn_plot.
    """
    log.info("Plotting the K-means clusters")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query = """SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length, {1}.cluster
                 FROM {0}
                 INNER JOIN {1}
                 WHERE {0}.scaffold = {1}.scaffold
                 ORDER BY {0}.scaffold
              """.format(db.ScaffoldsTable, db.KmeansResultsTable)
    rows = db.retrieve_data(query)
    db.close()

    # One column per plotted quantity, all in the row order of the query
    coverages = [row["coverage"] for row in rows]
    gcs = [row["GC"] for row in rows]
    lengths = [row["length"] for row in rows]
    clusters = [row["cluster"] for row in rows]
    Plots.fig2(coverages, gcs, lengths, clusters, args.fn_plot)
def go(args):
    """
    Write one FASTA file per marker COG with the sequences of its genes.

    The marker COGs are read from args.fn_marker_cogs: the first line is
    skipped as a comment and the first field of every following line is
    taken as a COG id. For each COG the genes table is joined with the
    sequences table and the result is written to "<cog>.faa" in the current
    directory, one record per gene with the header ">gene_id,cog_id".

    @param args Parsed options. Reads args.fn_marker_cogs and
                args.fn_database.
    @raise ValueError if the file contains no marker COG (this includes an
                      empty file).
    """
    # Read file marker cogs. The context manager closes the file even when
    # the "no COGs" error below is raised.
    # NOTE(review): blast_marker_cogs reads its marker file with a tab
    # delimiter; confirm that a single space is intended here.
    with open(args.fn_marker_cogs, "rU") as fhandle:
        reader = csv.reader(fhandle, delimiter=" ")
        # ignore comment; the default avoids StopIteration on an empty file
        next(reader, None)
        # blank lines are parsed as empty rows and carry no COG id
        markercogs = [row[0] for row in reader if row]
    if not markercogs:
        raise ValueError("No marker COGs provided")

    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        for cog in markercogs:
            log.info("Getting the sequences of all the genes belonging to COG %s",
                     cog)
            sql_command = """SELECT {0}.gene_id, {0}.cog_id, {1}.sequence
                             FROM {0}
                             INNER JOIN {1}
                             WHERE {0}.cog_id="{2}" AND {0}.gene_id={1}.gene_id
                          """.format(db.GenesTable, db.SequenceTable, cog)
            data = db.retrieve_data(sql_command)
            with open("{0}.faa".format(cog), "w") as fasta:
                for row in data:
                    fasta.write(">{0},{1}\n".format(row["gene_id"],
                                                    row["cog_id"]))
                    fasta.write("{0}\n".format(row["sequence"]))
    finally:
        # release the database also when a query or a write fails
        db.close()
def do_label_propagation_after_kmeans(args):
    """
    Applies label propagation to k-means clusters

    The cluster of every scaffold in the k-means results table is encoded as
    an integer label; scaffolds that have no k-means result get the label -1
    (unknown). A LabelSpreading model (knn kernel, 7 neighbors, alpha 0.5) is
    fitted on the matrix returned by
    design_matrices.get_spectrums_coverage_matrix and then asked for a label
    and the class probabilities of every scaffold.

    The table db.KmeansLPResultsTable is dropped if it exists, recreated and
    filled with one (scaffold, cluster, probability) row per scaffold. When
    the largest class probability of a scaffold is NaN the row stored is
    (scaffold, defs.not_assigned, 0).

    @param args Parsed options. Only args.fn_database is read.
    """
    log.info("Applying label propagataion to the k-mer spectrums")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        sql_command = """SELECT scaffold, cluster FROM {0} """.format(
            db.KmeansResultsTable)
        assigned_scaffolds = db.retrieve_data(sql_command)
        # calculate labels
        encoder = sklearn.preprocessing.LabelEncoder()
        known_labels = encoder.fit_transform(
            [r["cluster"] for r in assigned_scaffolds])
        log.debug("Labels %s", encoder.classes_)
        log.debug("Number of labels: %s", len(known_labels))

        # known_labels is aligned with assigned_scaffolds, so the encoded
        # label of each scaffold is already available; there is no need to
        # call encoder.transform() again for every row.
        scaffold2label_dict = dict()
        for r, label in zip(assigned_scaffolds, known_labels):
            scaffold2label_dict[r["scaffold"]] = label

        sql_command = """SELECT scaffold, coverage, spectrum
                         FROM {0}
                         ORDER BY scaffold""".format(db.ScaffoldsTable)
        data = db.retrieve_data(sql_command)
        # NOTE(review): the rows of mat are assumed to follow the order of
        # data, because labels and scaffolds below are built in that order
        # - confirm against design_matrices.
        mat = design_matrices.get_spectrums_coverage_matrix(data)
        all_labels = []
        scaffolds = []
        for r in data:
            s = r["scaffold"]
            # -1 marks an unknown label
            all_labels.append(scaffold2label_dict.get(s, -1))
            scaffolds.append(s)

        clamping_factor = 0.5
        label_spread = label_propagation.LabelSpreading(kernel='knn',
                                                        n_neighbors=7,
                                                        alpha=clamping_factor)
        label_spread.fit(mat, all_labels)
        output_labels = label_spread.predict(mat)
        probabilities = label_spread.predict_proba(mat)

        if db.table_exists(db.KmeansLPResultsTable):
            db.drop_table(db.KmeansLPResultsTable)
        db.create_table(db.KmeansLPResultsTable, db.KmeansLPResultsFields,
                        db.KmeansLPResultsTypes)
        results = []
        for s, lab, probs in zip(scaffolds, output_labels, probabilities):
            p = probs.max()
            if np.isnan(p):
                results.append((s, defs.not_assigned, 0))
            else:
                # inverse_transform is documented for 1-d array-likes; wrap
                # the single label and unwrap the single decoded cluster
                cluster = encoder.inverse_transform([lab])[0]
                results.append((s, cluster, p))
        db.store_data(db.KmeansLPResultsTable, results)
    finally:
        # release the database also when fitting or storing fails
        db.close()
def plot_genus_assignments(args):
    """
    Draws a plot of the read coverage for the scaffolds vs their GC content

    Each of the genera is assigned a color. This new version assumes that
    the ScaffoldKmerComparisonTable of final assignments has merged the
    results from ScaffoldsAssignmentsTable (the scaffolds assigned with
    BLAST).

    The number of values collected for each plotted quantity is printed to
    standard output before the figure is produced with Plots.fig2.

    @param args Parsed options. Reads args.fn_database and args.fn_plot.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        sql_command = """SELECT {1}.scaffold, {1}.genus, {0}.length, {0}.GC, {0}.coverage
                         FROM {1}
                         INNER JOIN {0}
                         WHERE {1}.scaffold = {0}.scaffold
                      """.format(db.ScaffoldsTable,
                                 db.ScaffoldKmerComparisonTable)
        data = db.retrieve_data(sql_command)
    finally:
        # the connection used to be left open; close it like the other
        # plotting functions do
        db.close()

    coverages = []
    gcs = []
    lengths = []
    genera = []
    for r in data:
        coverages.append(r["coverage"])
        gcs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(r["genus"])
    # A single pre-formatted string prints the same text whether print is a
    # statement or a function
    print("coverages {0} gcs {1} lengths {2} genera {3}".format(
        len(coverages), len(gcs), len(lengths), len(genera)))
    Plots.fig2(coverages, gcs, lengths, genera, args.fn_plot)
def test_database(self):
    """
    Test the creation of the database for the metagenome

    Builds a temporary database inside self.datadir, loads the genes and
    protein sequences test files into it, checks the record counts and one
    known gene, and finally removes the temporary file.
    """
    log.debug("Test creating a database with the metagenome data")
    fn_database = os.path.join(self.datadir, "tmp_database.db")
    db = MetagenomeDatabase.MetagenomeDatabase(fn_database, overwrite=True)

    # Genes table: load the test file and count all the records
    db.create_genes_table(
        os.path.join(self.datadir, "gene_info_test_file.xls"))
    all_genes = db.retrieve_data("SELECT * FROM {0}".format(db.GenesTable))
    self.assertEqual(len(all_genes), 171)

    # The locus tag must select exactly one gene, with the expected id
    query = """ SELECT * FROM {0} WHERE locus_tag="sg4i_00000050" """.format(
        db.GenesTable)
    tagged_genes = db.retrieve_data(query)
    self.assertEqual(len(tagged_genes), 1)
    gene_t = GeneParser.GeneRecordTuple._make(tagged_genes[0])
    self.assertEqual(gene_t.gene_id, "2061973757", "Gene id test failed")

    # Sequences table: load the test file and count all the records
    db.create_protein_sequences_table(
        os.path.join(self.datadir, "proteins.faa"))
    query = """ SELECT * FROM {0}""".format(db.SequenceTable)
    all_sequences = db.retrieve_data(query)
    self.assertEqual(len(all_sequences), 5)

    # The stored sequence of the gene checked above must have the length
    # announced in its gene record
    query = """ SELECT * FROM {0} WHERE gene_id="2061973757" """.format(
        db.SequenceTable)
    gene_sequences = db.retrieve_data(query)
    self.assertEqual(len(gene_sequences), 1)
    self.assertEqual(gene_t.protein_length,
                     len(gene_sequences[0]["sequence"]))

    db.close()
    os.remove(fn_database)
def plot_kmeans_assignments(args):
    """
    Plot of the genus assignments for each of the scaffolds after
    performing k-means clustering

    For every k-means cluster the bit scores of the scaffolds assigned in
    the cluster are summed per genus, and the genus with the largest sum is
    given to all the scaffolds of the cluster. A cluster without any
    assigned scaffold gets defs.not_assigned. The scaffolds are then drawn
    with Plots.fig2, colored by that genus.

    @param args Parsed options. Reads args.fn_database and args.fn_plot.
    @raise ValueError if the number of scaffolds in the scaffolds table
                      differs from the number of scaffolds that have a
                      k-means cluster.
    """
    log.info("Plotting the K-means assignments")
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        sql_command = """ SELECT DISTINCT cluster FROM {0} """.format(
            db.KmeansResultsTable)
        clusters = [r["cluster"] for r in db.retrieve_data(sql_command)]

        pairs_scaffold_genus = []
        for cluster in clusters:
            # Select the scaffolds assigned in the cluster, sum the bit
            # scores of each of the genera, and sort by the sum. The sort
            # key must be the aggregate: ordering by the bare column
            # {0}.bits of a grouped query compares the value of one
            # arbitrary row per genus instead of the total.
            sql_command = """ SELECT {0}.scaffold, {0}.genus, SUM({0}.bits)
                              FROM {0}
                              INNER JOIN {1}
                              WHERE cluster = {2} AND
                                    {0}.scaffold = {1}.scaffold
                              GROUP BY {0}.genus
                              ORDER BY SUM({0}.bits) DESC
                          """.format(db.ScaffoldsAssignmentsTable,
                                     db.KmeansResultsTable, cluster)
            ranked_genera = db.retrieve_data(sql_command)
            # the genus with the largest number of bits is the first entry
            if len(ranked_genera) == 0:
                genus = defs.not_assigned
            else:
                genus = ranked_genera[0]["genus"]

            # Assign the genus to all the scaffolds in the cluster
            sql_command = """ SELECT {0}.scaffold
                              FROM {0}
                              WHERE cluster = {1}
                          """.format(db.KmeansResultsTable, cluster)
            members = db.retrieve_data(sql_command)
            pairs_scaffold_genus.extend(
                [(r["scaffold"], genus) for r in members])

        # Sorted by scaffold so that the pairs line up with the rows of the
        # query below, which is ordered by scaffold too
        pairs_scaffold_genus.sort()
        sql_command = """SELECT {0}.scaffold, {0}.coverage, {0}.GC, {0}.length
                         FROM {0}
                         ORDER BY scaffold
                      """.format(db.ScaffoldsTable)
        data = db.retrieve_data(sql_command)
    finally:
        # release the database also when one of the queries fails
        db.close()

    if len(data) != len(pairs_scaffold_genus):
        raise ValueError("The number of scaffolds in the database is not the " \
                         "same as the number of scaffolds assigned with k-means")
    coverages = []
    cgs = []
    lengths = []
    genera = []
    for r, pair in zip(data, pairs_scaffold_genus):
        coverages.append(r["coverage"])
        cgs.append(r["GC"])
        lengths.append(r["length"])
        genera.append(pair[1])
    Plots.fig2(coverages, cgs, lengths, genera, args.fn_plot)
def create_database(args):
    """
    Load the input files given in the options into the database.

    Each input is optional: an option with a false value (for example None
    or an empty string) is skipped. The inputs are processed in this order:
    genes, protein sequences, scaffolds and scaffold coverage.

    @param args Parsed options. Reads args.fn_database, args.fn_genes,
                args.fn_protein_sequences, args.fn_scaffolds and
                args.fn_scaffold_coverage.
    """
    # The database is opened without overwrite=True
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    # (option holding the input file, database method that loads it), listed
    # in the order in which the inputs are processed
    loaders = (
        ("fn_genes", "create_genes_table"),
        ("fn_protein_sequences", "create_protein_sequences_table"),
        ("fn_scaffolds", "fill_scaffolds_table"),
        ("fn_scaffold_coverage", "add_scaffold_coverage"),
    )
    for option, method_name in loaders:
        fn_input = getattr(args, option)
        if fn_input:
            getattr(db, method_name)(fn_input)
    db.close()
def blast_marker_cogs(args):
    """
    Run BLAST for all the genes that belong to one of the marker COGs.

    The marker COGs are the first field of every line of the tab-delimited
    file args.fn_marker_cogs. For each of them the file "<cog>.phr" must
    exist in args.cogsdbdir. The sequence of every gene whose COG is a
    marker COG is blasted against the database of its COG, in batches of 100
    sequences, and the results are stored in the BLAST results table, which
    is dropped first if it already exists.

    A gene that does not have exactly one sequence in the database is
    reported in the log and skipped.

    @param args Parsed options. Reads args.fn_database, args.fn_marker_cogs
                and args.cogsdbdir.
    @raise ValueError if the genes or sequences tables are missing, or if no
                      marker COG is provided.
    @raise IOError if the database file of a marker COG does not exist.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        names = db.get_tables_names()
        if db.GenesTable not in names:
            raise ValueError("The database does not have a table of genes")
        if db.SequenceTable not in names:
            raise ValueError("The database does not have a table sequences")

        # Read file marker cogs; the context manager closes the file, which
        # used to stay open for the rest of the run
        with open(args.fn_marker_cogs, "r") as fhandle:
            reader = csv.reader(fhandle, delimiter="\t")
            # blank lines are parsed as empty rows and carry no COG id
            markercogs = frozenset(row[0] for row in reader if row)
        if len(markercogs) == 0:
            raise ValueError("No marker COGs provided")

        # Fail before any BLAST run if a COG database is missing
        for cog in markercogs:
            fn = os.path.join(args.cogsdbdir, cog + ".phr")
            if not os.path.exists(fn):
                raise IOError(
                    "The database file {0} for the COG {1} does not exist".format(fn,cog))

        # Get genes
        sql_command = """SELECT gene_id,cog_id FROM {0}""".format(
            db.GenesTable)
        data = db.retrieve_data(sql_command)
        if db.BlastResultsTable in names:
            db.drop_table(db.BlastResultsTable)
        db.create_blast_results_table()

        log.info("Running BLAST for %s marker COGS",len(markercogs))
        n_batch_sequences = 100  # sequences to blast per batch
        sequence_tuples = []
        for gene_id, cog_id in data:
            if cog_id not in markercogs:
                continue
            sql_command = """SELECT sequence FROM {0} WHERE gene_id="{1}" """.format(
                db.SequenceTable, gene_id)
            records = db.retrieve_data(sql_command)
            if len(records) != 1:
                # Report but do not raise, continue processing other genes
                log.error("Problem with gene_id %s. There are no sequences in the database or "
                          "there are more than one", gene_id)
                continue
            blast_database = os.path.join(args.cogsdbdir, cog_id)
            sequence_tuples.append((records[0][0], gene_id, blast_database))
            if len(sequence_tuples) == n_batch_sequences:
                db.store_blast_results(blast(sequence_tuples))
                sequence_tuples = []
        # Final run with the sequences left over from the last full batch
        if sequence_tuples:
            db.store_blast_results(blast(sequence_tuples))
    finally:
        # release the database also when a check or a BLAST run fails
        db.close()
def assignments2csv(args):
    """
    Write the genus assignments of the scaffolds to a CSV file.

    Every row of the output has the scaffold, its coverage, length and GC
    content, and the genus assigned to it, separated by commas. There is no
    header row.

    @param args Parsed options. Reads args.fn_database and args.fn_csv (the
                output file, overwritten if it exists).
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        # NOTE(review): the table names are hard-coded here while the other
        # functions use db.ScaffoldsTable / db.ScaffoldKmerComparisonTable;
        # confirm that they are the same names before unifying them.
        sql_command = """SELECT {0}.scaffold, {0}.coverage, {0}.length, {0}.GC, {1}.genus
                         FROM {0}
                         INNER JOIN {1}
                         WHERE {0}.scaffold={1}.scaffold
                      """.format("Scaffolds", "ScaffoldKmerComparison")
        cursor = db.execute(sql_command)
        record = cursor.fetchone()
        # The context manager closes the file even if a write fails
        with open(args.fn_csv, "w") as f:
            writer = csv.writer(f, delimiter=",")
            # rows are fetched one at a time to avoid loading the whole
            # result in memory
            while record:
                writer.writerow(list(record))
                record = cursor.fetchone()
    finally:
        # release the database also when the query or the export fails
        db.close()
def do_kmer_comparison(args):
    """
    Compares the Kmer spectrums.

    The scaffolds found in the kmer comparison table are used to build one
    reference sequence per genus. Every other scaffold of the scaffolds
    table is compared with those references, in batches of 1000 sequences,
    and everything returned by the comparer is appended to the kmer
    comparison table.

    @param args Parsed options. Reads args.fn_database, args.kmer (kmer
                size), args.threshold (kmer distance threshold) and
                args.dist12 (first to second distance ratio).
    """
    log.info("Performing kmer comparison. Parameters: ")
    log.info("kmer size: %s dist12: %s threshold: %s", args.kmer,
             args.dist12, args.threshold)
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)

    comparer = Kmer.KmerComparer(Kmer.KmerCounter(args.kmer))
    comparer.set_kmer_distance_threshold(args.threshold)
    comparer.set_first_to_second_distance_ratio(args.dist12)

    # add the combined sequences of the scaffolds belonging to the same
    # genera
    references, already_assigned = db.get_genera_sequences_from(
        db.ScaffoldKmerComparisonTable)
    for genus in references:
        comparer.add_reference_sequence(references[genus], genus)

    cursor = db.execute(
        "SELECT scaffold, sequence FROM {0}".format(db.ScaffoldsTable))
    batch_size = 1000
    all_assignments = []
    # NOTE(review): unlike kmer_comparison_one_iteration, the matches are
    # stored without filtering the ones whose second field is False -
    # confirm that this is intended.
    while True:
        record = cursor.fetchone()
        if not record:
            break
        name = record["scaffold"]
        if name not in already_assigned:
            comparer.add_sequence(record["sequence"], name)
        # a full batch is compared as soon as it is complete
        if comparer.get_number_of_sequences() == batch_size:
            all_assignments.extend(comparer.run())
    # sequences left over from the last full batch
    if comparer.get_number_of_sequences() > 0:
        all_assignments.extend(comparer.run())

    db.store_data(db.ScaffoldKmerComparisonTable, all_assignments)
    db.close()
def plot_dpgmm(args):
    """
    Plot the scaffolds colored by their DPGMM cluster.

    Coverage, GC content and length are read for every scaffold that has a
    row in the DPGMM results table. A scaffold keeps its cluster only when
    the stored probability is strictly greater than args.dpgmm; otherwise it
    is plotted as defs.not_assigned.

    @param args Parsed options. Reads args.fn_database, args.dpgmm
                (probability cutoff) and args.fn_plot (handed to
                Plots.fig2).
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    query = """SELECT {0}.coverage, {0}.GC, {0}.length, {1}.cluster, {1}.probability
                 FROM {0}
                 INNER JOIN {1}
                 WHERE {0}.scaffold = {1}.scaffold
              """.format(db.ScaffoldsTable, db.DPGMMResultsTable)
    rows = db.retrieve_data(query)
    db.close()

    coverages, gcs, lengths, groups = [], [], [], []
    for row in rows:
        if row["probability"] > args.dpgmm:
            group = row["cluster"]
        else:
            group = defs.not_assigned
        groups.append(group)
        coverages.append(row["coverage"])
        gcs.append(row["GC"])
        lengths.append(row["length"])
    Plots.fig2(coverages, gcs, lengths, groups, args.fn_plot)
def kmer_comparison_one_iteration(args):
    """
    This function is the one-iteration version of the iterative function

    The kmer comparison table is recreated from scratch. The scaffolds found
    in the scaffolds assignments table are used to build one reference
    sequence per genus, and every other scaffold of the scaffolds table is
    compared with those references in batches of 1000 sequences. Only the
    matches whose second field is not False are stored.

    @param args Parsed options. Reads args.fn_database, args.kmer (kmer
                size), args.threshold (kmer distance threshold) and
                args.dist12 (first to second distance ratio).
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    if db.ScaffoldKmerComparisonTable in db.get_tables_names():
        db.drop_table(db.ScaffoldKmerComparisonTable)
    db.create_scaffold_kmer_comparison_table()

    comparer = Kmer.KmerComparer(Kmer.KmerCounter(args.kmer))
    comparer.set_kmer_distance_threshold(args.threshold)
    comparer.set_first_to_second_distance_ratio(args.dist12)

    # add the combined sequences of the scaffolds belonging to the same
    # genera
    references, already_assigned = db.get_genera_sequences_from(
        db.ScaffoldsAssignmentsTable)
    for genus in references:
        comparer.add_reference_sequence(references[genus], genus)

    def reliable(matches):
        # kcomparer will return False if a reliable match has not been
        # found. The test is an equality test (!=), not an identity test.
        return [m for m in matches if m[1] != False]

    cursor = db.execute(
        "SELECT scaffold, sequence FROM {0}".format(db.ScaffoldsTable))
    batch_size = 1000
    all_matches = []
    while True:
        record = cursor.fetchone()
        if not record:
            break
        name = record["scaffold"]
        if name not in already_assigned:
            comparer.add_sequence(record["sequence"], name)
        # a full batch is compared as soon as it is complete
        if comparer.get_number_of_sequences() == batch_size:
            all_matches.extend(reliable(comparer.run()))
    # sequences left over from the last full batch
    if comparer.get_number_of_sequences() > 0:
        all_matches.extend(reliable(comparer.run()))

    db.store_data(db.ScaffoldKmerComparisonTable, all_matches)
    db.close()
def iterative_kmer_comparison(args):
    """
    Compares not assigned scaffolds with the scaffolds assigned using BLAST
    using an iterative method.

    The function do_kmer_comparison updates the sequences for each genus
    based on the scaffolds that have been assigned already. This way the
    most confident assignments are done first.

    The kmer comparison table is recreated and seeded with the scaffolds
    assigned with BLAST. do_kmer_comparison is then called repeatedly until
    an iteration leaves the number of rows of the table unchanged.

    @param args Parsed options. Reads args.fn_database here; the whole
                object is passed on to do_kmer_comparison.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        names = db.get_tables_names()
        if db.ScaffoldKmerComparisonTable in names:
            db.drop_table(db.ScaffoldKmerComparisonTable)
        db.create_scaffold_kmer_comparison_table()
        db.pass_blast_assigned_scaffolds_to_kmer_table()
        n_elements = db.count(db.ScaffoldKmerComparisonTable)
        i = 0
        while True:
            log.info("Iterative comparison. Iteration %s",i)
            i += 1
            # do_kmer_comparison opens its own connection to the database
            do_kmer_comparison(args)
            count = db.count(db.ScaffoldKmerComparisonTable)
            # stop when the last iteration did not assign anything new
            if count == n_elements:
                break
            n_elements = count
    finally:
        # the connection used to be left open when the function returned
        db.close()
# Script: calls add_scaffold_spectrums(4) on the database file
# "2061766001_4mers.db", with logging sent to standard output.
import MetaBinner.paranoid_log as paranoid_log
import MetaBinner.MetagenomeDatabase as MetagenomeDatabase
import MetaBinner.Kmer as Kmer
import MetaBinner.Plots as Plots
import MetaBinner.definitions as defs
import sys
import logging
# Send the log records to stdout and let every level from DEBUG up through
logging.basicConfig(stream=sys.stdout)
logging.root.setLevel(logging.DEBUG)
# The database file name is hard-coded
db = MetagenomeDatabase.MetagenomeDatabase("2061766001_4mers.db")
# NOTE(review): presumably 4 is the kmer size of the spectrums (the file
# name mentions 4mers) - confirm against MetagenomeDatabase.
db.add_scaffold_spectrums(4)
db.close()
def __init__(self, fn_database):
    """
    Open the metagenome database that this object works with.

    @param fn_database Name of the database file, passed to
                       MetagenomeDatabase.MetagenomeDatabase.
    """
    # Use the constructor argument. The name `args` is not defined in this
    # method, so reading args.fn_database depended on a global being there.
    self.db = MetagenomeDatabase.MetagenomeDatabase(fn_database)
def assign_genus_to_scaffolds(args):
    """
    Assign genus to scaffolds in the database

    The function:
    1) Reads the genes in the database that belong to a given COG
    2) Reads the BLAST results for each of the genes.
    3) Recovers the best hit (genus and bit score) for the gene and
       identifies the scaffold where the gene is located
    4) Assigns the genus found in the hit to the scaffold.

    Various scaffolds can have different assignments. To select one
    assignment,
    1) sum the bit scores for the each of the genus assigned to a scaffold.
    2) Chose the genus with the largest total bit score

    Finally, store the assignments in the database. The assignments are
    passed through BiologyBasedRules.filter_genus_assignments before they
    are stored, and the scaffolds assignments table is dropped first if it
    already exists.

    @param args Parsed options. Reads args.fn_database and
                args.fn_marker_cogs (the first field of every line is a
                marker COG).
    @raise ValueError if the genes or BLAST results tables are missing, or
                      if no marker COG is provided.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    try:
        names = db.get_tables_names()
        if db.GenesTable not in names:
            raise ValueError("The database does not have a table of genes")
        if db.BlastResultsTable not in names:
            raise ValueError(
                "The database does not have a table of BLAST results")

        # Read file marker cogs; the context manager closes the file, which
        # used to stay open for the rest of the run
        with open(args.fn_marker_cogs, "rU") as fhandle:
            reader = csv.reader(fhandle, delimiter=" ")
            # blank lines are parsed as empty rows and carry no COG id
            marker_cogs = frozenset(row[0] for row in reader if row)
        if len(marker_cogs) == 0:
            raise ValueError("No marker COGs provided")

        if db.ScaffoldsAssignmentsTable in names:
            db.drop_table(db.ScaffoldsAssignmentsTable)
        db.create_scaffold_assignments_table()

        blast_result = BLASTUtilities.BLASTResult()
        scaffolds_dict = {}
        for cog_id in marker_cogs:
            # read the genes and scaffolds for the cog
            sql_command = """SELECT {0}.gene_id,{0}.scaffold, {0}.dna_length,{1}.titles,{1}.bits
                             FROM {0}
                             INNER JOIN {1}
                             WHERE {0}.cog_id="{2}" AND {0}.gene_id={1}.gene_id
                          """.format(db.GenesTable, db.BlastResultsTable,
                                     cog_id)
            cursor = db.execute(sql_command)
            r = cursor.fetchone()
            while r:
                sc = r["scaffold"]
                organism, bit_score = blast_result.get_best_hit(r["titles"],
                                                                r["bits"])
                # the genus is the first word of the organism name
                genus = organism.split(" ")[0]
                add_to_scaffold_dictionary(scaffolds_dict, sc, genus,
                                           float(bit_score))
                r = cursor.fetchone()

        # Assign the genus with the largest bit score
        data = []
        for scaffold in scaffolds_dict:
            # items() exists on every Python version, unlike iteritems()
            genus, bit_score = max(scaffolds_dict[scaffold].items(),
                                   key=operator.itemgetter(1))
            data.append((scaffold, genus, bit_score))
        data = BiologyBasedRules.filter_genus_assignments(
            data, n_appearances=2, bit_score_threshold=30)
        db.store_data(db.ScaffoldsAssignmentsTable, data)
    finally:
        # release the database also when a check or a query fails
        db.close()