def test_description_parsing(self):
    """ Test the parsing of a blast description

    Builds the set of "genus species" names listed in the check file and
    compares its size with the size of the set of organisms parsed from the
    descriptions of the FASTA records in nr.COG1528.
    """
    # File with all the microorganisms in nr.COG1528
    fn_check_file = os.path.join(self.datadir, "nr.COG1528.check_file")
    organisms = set()
    # Use a context manager so the check file is closed deterministically
    with open(fn_check_file) as fhandle:
        for words in csv.reader(fhandle, delimiter=" "):
            # Rows with fewer than two fields cannot form "genus species"
            if len(words) >= 2:
                genus = words[0].lower()
                species = words[1].lower()
                organisms.add(genus + " " + species)
    log.debug("organisms in the check file: %s", organisms)

    # Parse all fasta descriptions
    fn_database = os.path.join(self.datadir, "nr.COG1528")
    parser = SeqIO.parse(fn_database, "fasta")
    organisms_parsed = set()
    p = BLASTUtilities.BLASTResult()
    for seq_record in parser:
        # set.update consumes the iterable eagerly; map() used only for its
        # side effects is lazy on Python 3 and would add nothing there.
        organisms_parsed.update(p.parse_organisms(seq_record.description))
    log.debug("organisms_parsed: %s", organisms_parsed)
    self.assertEqual(len(organisms), len(organisms_parsed),
                     "The number of organisms parsed is not correct")
def assign_genus_to_scaffolds(args):
    """ Assign genus to scaffolds in the database

    The function:
    1) Reads the genes in the database that belong to a given COG
    2) Reads the BLAST results for each of the genes.
    3) Recovers the best hit (genus and bit score) for the gene and
       identifies the scaffold where the gene is located
    4) Assigns the genus found in the hit to the scaffold.

    Various scaffolds can have different assignments. To select one
    assignment:
    1) Sum the bit scores for each of the genera assigned to a scaffold
       (delegated to add_to_scaffold_dictionary -- presumably it accumulates
       the score per genus; verify against its definition).
    2) Choose the genus with the largest total bit score.

    Finally, store the assignments in the database.

    @param args Object with the attributes:
        fn_database - path of the database to open.
        fn_marker_cogs - path of a space-delimited file; the first field of
            each row is taken as a marker COG identifier.
    @raise ValueError if the database lacks the genes table or the BLAST
        results table, or if the marker COGs file provides no COGs.
    """
    db = MetagenomeDatabase.MetagenomeDatabase(args.fn_database)
    # try/finally guarantees the database is closed even when validation or
    # processing raises.
    try:
        names = db.get_tables_names()
        if db.GenesTable not in names:
            raise ValueError("The database does not have a table of genes")
        if db.BlastResultsTable not in names:
            raise ValueError(
                "The database does not have a table of BLAST results")

        # Read file marker cogs. The context manager closes the handle, and
        # empty rows (blank lines) are skipped instead of raising IndexError.
        with open(args.fn_marker_cogs, "rU") as fhandle:
            reader = csv.reader(fhandle, delimiter=" ")
            marker_cogs = frozenset(row[0] for row in reader if row)
        if not marker_cogs:
            raise ValueError("No marker COGs provided")

        # Start from a fresh assignments table
        if db.ScaffoldsAssignmentsTable in names:
            db.drop_table(db.ScaffoldsAssignmentsTable)
        db.create_scaffold_assignments_table()

        blast_result = BLASTUtilities.BLASTResult()
        scaffolds_dict = {}
        for cog_id in marker_cogs:
            # read the genes and scaffolds for the cog
            # NOTE(review): cog_id comes from a file and is interpolated into
            # the SQL text; switch to a bound parameter if db.execute
            # supports them.
            sql_command = """SELECT {0}.gene_id,{0}.scaffold, {0}.dna_length,{1}.titles,{1}.bits
                             FROM {0}
                             INNER JOIN {1}
                             WHERE {0}.cog_id="{2}" AND {0}.gene_id={1}.gene_id
                          """.format(db.GenesTable, db.BlastResultsTable,
                                     cog_id)
            cursor = db.execute(sql_command)
            r = cursor.fetchone()
            while r:
                sc = r["scaffold"]
                organism, bit_score = blast_result.get_best_hit(
                    r["titles"], r["bits"])
                # The genus is the first word of the organism name
                genus = organism.split(" ")[0]
                add_to_scaffold_dictionary(scaffolds_dict, sc, genus,
                                           float(bit_score))
                r = cursor.fetchone()

        # Assign the genus with the largest bit score
        data = []
        for scaffold, genus_scores in scaffolds_dict.items():
            # items() works on both Python 2 and Python 3
            genus, bit_score = max(genus_scores.items(),
                                   key=operator.itemgetter(1))
            data.append((scaffold, genus, bit_score))
        data = BiologyBasedRules.filter_genus_assignments(
            data, n_appearances=2, bit_score_threshold=30)
        db.store_data(db.ScaffoldsAssignmentsTable, data)
    finally:
        db.close()