def taxonomy_ref(in_filename, ref_taxonomy_filename, out_filename):
    """Assign taxonomy to FASTA records by looking their ids up in a reference.

    Args:
        in_filename: FASTA file; every record id is looked up in the
            reference taxonomy.
        ref_taxonomy_filename: Tab-delimited file. Column 0 is used as the
            key and column 1 is passed through ``utils.parse_taxonomy``.
        out_filename: Tab-delimited file to write, one row per FASTA record:
            ``record id``, ``taxonomy``.

    Raises:
        KeyError: If a FASTA record id is not present in the reference
            taxonomy file.
    """
    # load the taxonomy file
    ref_taxonomy = dict()
    with open(ref_taxonomy_filename, 'r') as ref_taxonomy_handler:
        for row in csv.reader(ref_taxonomy_handler, delimiter='\t'):
            if not row:
                # csv.reader yields an empty list for a blank line; there is
                # nothing to index, so skip it instead of raising IndexError.
                continue
            ref_taxonomy[row[0]] = utils.parse_taxonomy(row[1])

    # write the taxonomy file; the context manager closes (and flushes) the
    # output even when a lookup below raises.
    with open(out_filename, 'w') as out_handle:
        out_writer = csv.writer(out_handle, delimiter='\t', lineterminator='\n')
        for record in SeqIO.parse(in_filename, "fasta"):
            out_writer.writerow([record.id, ref_taxonomy[record.id]])
def _run_blast_tool(cmd, logger):
    """Run an external command, raising Exception with its stderr on failure."""
    logger.info(' '.join(cmd))
    # stdout is discarded; stderr is captured so it can be reported on failure.
    with open(os.devnull, "w") as devnull:
        proc = subprocess.Popen(cmd, stdout=devnull, stderr=subprocess.PIPE)
        _, out_stderr = proc.communicate()
    if proc.returncode:
        logger.error(out_stderr)
        raise Exception(out_stderr)


def taxonomy_blast(in_filename, ref_filename, ref_taxonomy_filename, out_filename, task='blastn', num_threads=1, evalue=10e-30, perc_identity=90):
    """Assign taxonomy.

    1) makeblastdb -in REF_FILENAME -parse_seqids -dbtype nucl -out DB_FILENAME
    2) blastn -task blastn -db DB_FILENAME -query IN_FILENAME
       -out BLAST_OUT_FILENAME[.xml] -evalue EVALUE
       -perc_identity PERC_IDENTITY -num_threads NUM_THREADS -outfmt 5
    3) load reference taxonomy file (REF_TAXONOMY_FILENAME)
    4) write the taxonomy file (OUT_FILENAME)

    Temporary files (the BLAST database and the XML BLAST output) are created
    next to OUT_FILENAME, using its path without extension as a prefix, and
    are removed when the function exits, whether it succeeds or fails.

    Args:
        in_filename: Query sequences passed to ``blastn -query``.
        ref_filename: Reference sequences passed to ``makeblastdb -in``.
        ref_taxonomy_filename: Tab-delimited file. Column 0 is used as the
            key and column 1 is passed through ``utils.parse_taxonomy``.
        out_filename: Tab-delimited file to write, one row per BLAST query:
            ``query``, ``taxonomy``. Queries without any alignment get the
            taxonomy ``"Unknown"``.
        task: Value for ``blastn -task``.
        num_threads: Value for ``blastn -num_threads``.
        evalue: Value for ``blastn -evalue``. Note that the default literal
            ``10e-30`` equals ``1e-29``.
        perc_identity: Value for ``blastn -perc_identity``.

    Raises:
        Exception: If ``makeblastdb`` or ``blastn`` exits with a non-zero
            status; the message is the captured stderr.
        KeyError: If the id of a best hit is not present in the reference
            taxonomy file.
    """
    logger = logging.getLogger('otu.taxonomy_blast')
    basepath = os.path.splitext(out_filename)[0]
    db_prefix = basepath + "_BLAST_DB_TMP"
    blast_out_filename = basepath + "_BLAST_OUT_TMP.xml"

    try:
        # makeblastdb
        _run_blast_tool(
            ["makeblastdb", "-in", ref_filename, "-parse_seqids",
             "-dbtype", "nucl", "-out", db_prefix],
            logger)

        # blastn
        _run_blast_tool(
            ["blastn", "-task", task, "-db", db_prefix,
             "-query", in_filename, "-out", blast_out_filename,
             "-evalue", str(evalue), "-perc_identity", str(perc_identity),
             "-num_threads", str(num_threads), "-outfmt", "5"],
            logger)

        logger.info("write the taxonomy file %s" % os.path.basename(out_filename))

        # load reference taxonomy file
        ref_taxonomy = dict()
        with open(ref_taxonomy_filename, 'r') as ref_taxonomy_handler:
            for row in csv.reader(ref_taxonomy_handler, delimiter='\t'):
                if not row:
                    # csv.reader yields an empty list for a blank line.
                    continue
                ref_taxonomy[row[0]] = utils.parse_taxonomy(row[1])

        # write the output file
        with open(blast_out_filename, 'r') as blast_out_handle:
            with open(out_filename, 'w') as out_handle:
                out_writer = csv.writer(out_handle, delimiter='\t', lineterminator='\n')
                for record in NCBIXML.parse(blast_out_handle):
                    if record.alignments:
                        # The first alignment is taken as the best hit.
                        best_hit_id = record.alignments[0].hit_id
                        ta = ref_taxonomy[best_hit_id]
                    else:
                        ta = "Unknown"
                    out_writer.writerow([record.query, ta])
    finally:
        # delete tmp files, also when a step above failed, so that aborted
        # runs do not leave BLAST databases or XML output behind.
        for filename in glob.glob(db_prefix + '*'):
            os.remove(filename)
        # The XML file does not exist if the failure happened before blastn
        # produced it.
        if os.path.exists(blast_out_filename):
            os.remove(blast_out_filename)