def mapping(hit_f=None,
            map_f="/home/nancy/assembly_app/blobtools/blobtools-master/taxon_n"):
    """Annotate similarity-search hits with TaxIDs taken from a mapping file.

    Each non-blank line of the hit file is split on whitespace; the query ID,
    subject ID and score columns are extracted, the subject ID is looked up
    in the mapping parsed by ``BtIO.parseDict`` and one line of the form
    ``qseqid<TAB>tax_id<TAB>score<TAB>sseqid`` is collected.  Subjects that
    are missing from the mapping get the TaxID ``"N/A"``.  If at least one
    line was collected, the result is written to the path returned by
    ``BtIO.getOutFile("taxified", hit_f, "out")``.

    Args:
        hit_f: Path of the similarity search result.  When ``None`` (the
            default) the module-level ``megablast_output`` is used, which is
            what this function always did before the parameter existed.
        map_f: Path of the mapping file in which one column lists a subject
            sequence ID and another its NCBI TaxID.  Defaults to the
            location that used to be hard-coded.

    Returns:
        None.  Results are written to the output file.
    """
    if hit_f is None:
        # Backward-compatible default: the megablast result set elsewhere in
        # this module.
        hit_f = megablast_output

    # Column layout (0-based).  These used to be string literals that were
    # immediately converted with int(); the conversion could never fail, so
    # they are plain integers now.
    map_col_sseqid = 0   # mapping file: sequence ID of the subject
    map_col_taxid = 2    # mapping file: TaxID of the subject
    hit_col_qseqid = 0   # hit file: query ID
    hit_col_sseqid = 1   # hit file: subject ID
    hit_col_score = 11   # hit file: (bit)score

    out_f, taxid_d = None, {}
    if map_f:
        # Single-argument print(...) behaves identically as a Python 2 print
        # statement and as the Python 3 print function.
        print(BtLog.status_d['1'] % ("Mapping file", map_f))
        taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid)
        out_f = BtIO.getOutFile("taxified", hit_f, "out")
    else:
        BtLog.error('41')

    output = []
    warned = set()  # subject IDs already reported as missing
    print(BtLog.status_d['1'] % ("similarity search result", hit_f))
    with open(hit_f) as fh:
        for line in fh:
            col = line.rstrip("\n").split()
            if not col:
                # A blank line has no columns; indexing it would raise
                # IndexError.
                continue
            qseqid = col[hit_col_qseqid]
            sseqid = col[hit_col_sseqid]
            score = col[hit_col_score]
            if sseqid not in taxid_d and sseqid not in warned:
                # The warning text used to be formatted and then thrown
                # away; emit it, once per subject ID to avoid flooding.
                warned.add(sseqid)
                print(BtLog.warn_d['12'] % (sseqid, map_f))
            tax_id = taxid_d.get(sseqid, "N/A")
            output.append("%s\t%s\t%s\t%s" % (qseqid, tax_id, score, sseqid))

    if output:
        with open(out_f, "w") as fh:
            print(BtLog.status_d['24'] % out_f)
            fh.write("\n".join(output) + "\n")
def main():
    """Command-line entry point: annotate similarity-search hits with TaxIDs.

    Options are read with ``docopt(__doc__)``.  Every non-blank line of the
    hit file is split on whitespace and reduced to
    ``qseqid<TAB>tax_id<TAB>score<TAB>sseqid``, where ``tax_id`` is either

    * the integer given with ``--custom_taxid`` (applied to every hit), or
    * the value found for the subject ID in the mapping parsed from
      ``--taxid_mapping_file`` (``"N/A"`` when the subject is not listed).

    If at least one line was produced, the result is written to the path
    returned by ``BtIO.getOutFile``.

    Returns:
        None.
    """
    args = docopt(__doc__)
    out_f, taxid_d = None, {}
    hit_f = args['--hit_file']
    hit_col_qseqid = args['--hit_column_qseqid']
    hit_col_sseqid = args['--hit_column_sseqid']
    hit_col_score = args['--hit_column_score']
    map_f = args['--taxid_mapping_file']
    map_col_sseqid = args['--map_col_sseqid']
    map_col_taxid = args['--map_col_taxid']
    custom_taxid = args['--custom_taxid']
    prefix = args['--out']

    try:
        hit_col_qseqid = int(hit_col_qseqid)
        hit_col_sseqid = int(hit_col_sseqid)
        hit_col_score = int(hit_col_score)
    except ValueError:
        # The previous `'41' % (...)` raised TypeError ("not all arguments
        # converted") before BtLog.error was even called, because '41' has no
        # conversion specifier.
        # NOTE(review): assumes BtLog.error accepts the format arguments
        # after the error key -- confirm against BtLog.
        BtLog.error(
            '41',
            "--hit_column_qseqid, --hit_column_sseqid and --hit_column_score")

    # Decide once, on the raw option value, whether a custom TaxID was
    # requested.  Testing the converted integer instead would treat
    # "--custom_taxid 0" as "not given" further down.
    use_custom_taxid = bool(custom_taxid)

    if use_custom_taxid:
        try:
            custom_taxid = int(custom_taxid)
        except (TypeError, ValueError):
            # int() raises ValueError for a non-numeric string; TypeError is
            # kept for non-string values.
            BtLog.error('26')
        out_f = BtIO.getOutFile(hit_f, prefix, "taxID_%s.out" % custom_taxid)
        taxid_d = defaultdict(lambda: custom_taxid)
    elif map_f:
        if map_col_sseqid and map_col_taxid:
            try:
                map_col_sseqid = int(map_col_sseqid)
                map_col_taxid = int(map_col_taxid)
            except ValueError:
                BtLog.error('44')
            # Single-argument print(...) behaves identically as a Python 2
            # print statement and as the Python 3 print function.
            print(BtLog.status_d['1'] % ("Mapping file", map_f))
            taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid)
            out_f = BtIO.getOutFile(hit_f, prefix, "taxified.out")
        else:
            BtLog.error('44')
    else:
        BtLog.error('41')

    output = []
    warned = set()  # subject IDs already reported as missing
    print(BtLog.status_d['1'] % ("similarity search result", hit_f))
    with open(hit_f) as fh:
        for line in fh:
            col = line.rstrip("\n").split()
            if not col:
                # A blank line has no columns; indexing it would raise
                # IndexError.
                continue
            qseqid = col[hit_col_qseqid]
            sseqid = col[hit_col_sseqid]
            score = col[hit_col_score]
            if use_custom_taxid:
                tax_id = taxid_d[sseqid]
            else:
                if sseqid not in taxid_d and sseqid not in warned:
                    # The warning text used to be formatted and then thrown
                    # away; emit it, once per subject ID to avoid flooding.
                    warned.add(sseqid)
                    print(BtLog.warn_d['12'] % (sseqid, map_f))
                tax_id = taxid_d.get(sseqid, "N/A")
            output.append("%s\t%s\t%s\t%s" % (qseqid, tax_id, score, sseqid))

    if output:
        with open(out_f, "w") as fh:
            print(BtLog.status_d['24'] % out_f)
            fh.write("\n".join(output) + "\n")