示例#1
0
    def mapping():
        """Annotate similarity-search hits with TaxIDs and write them to a file.

        Reads the hit file named by ``megablast_output`` (a name taken from the
        enclosing scope), looks up every subject ID in the dictionary that
        ``BtIO.parseDict`` builds from the mapping file, and writes one
        ``qseqid<TAB>taxid<TAB>score<TAB>sseqid`` line per hit to the path
        returned by ``BtIO.getOutFile``.  Subjects that are absent from the
        mapping get the TaxID ``"N/A"`` and a warning is printed for them.

        Takes no arguments and returns nothing.  Configuration problems are
        reported through ``BtLog.error``.
        """
        out_f, taxid_d = None, {}
        hit_f = megablast_output  # hit file: BLAST similarity search result (TSV format)
        # mapping file (TSV format), in which one column lists a sequence ID
        # (of a subject) and another the NCBI TaxID
        map_f = "/home/nancy/assembly_app/blobtools/blobtools-master/taxon_n"
        map_col_sseqid = "0"  # column of mapping file containing sequence IDs (of the subject)
        map_col_taxid = "2"  # column of mapping file containing the TaxID of the subject
        hit_col_qseqid = "0"  # column of the hit file containing query ID
        hit_col_sseqid = "1"  # column of the hit file containing subject ID
        hit_col_score = "11"  # column of the hit file containing (bit)score

        try:
            hit_col_qseqid = int(hit_col_qseqid)
            hit_col_sseqid = int(hit_col_sseqid)
            hit_col_score = int(hit_col_score)
        except ValueError:
            # The option names are passed as a separate argument: applying
            # ``%`` to the bare code '41' raised TypeError before the error
            # could be reported.
            # NOTE(review): assumes BtLog.error(code, *args) fills in the
            # message template itself -- confirm against BtLog.
            BtLog.error(
                '41',
                "--hit_column_qseqid, --hit_column_sseqid and --hit_column_score")

        if map_f:
            if map_col_sseqid and map_col_taxid:
                try:
                    map_col_sseqid = int(map_col_sseqid)
                    map_col_taxid = int(map_col_taxid)
                except ValueError:
                    BtLog.error('44')
                # print(<single expression>) behaves the same under the
                # Python 2 print statement and the Python 3 print function.
                print(BtLog.status_d['1'] % ("Mapping file", map_f))
                taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid)
                out_f = BtIO.getOutFile("taxified", hit_f, "out")
            else:
                BtLog.error('44')
        else:
            BtLog.error('41')

        output = []
        print(BtLog.status_d['1'] % ("similarity search result", hit_f))
        with open(hit_f) as fh:
            for line in fh:
                col = line.rstrip("\n").split()
                if not col:
                    # Blank lines (e.g. a trailing newline) carry no hit.
                    continue
                qseqid = col[hit_col_qseqid]
                sseqid = col[hit_col_sseqid]
                score = col[hit_col_score]
                if sseqid not in taxid_d:
                    # Actually emit the warning; it used to be formatted and
                    # then thrown away.
                    print(BtLog.warn_d['12'] % (sseqid, map_f))
                tax_id = taxid_d.get(sseqid, "N/A")
                output.append("%s\t%s\t%s\t%s" %
                              (qseqid, tax_id, score, sseqid))
        if output:
            with open(out_f, "w") as fh:
                print(BtLog.status_d['24'] % out_f)
                fh.write("\n".join(output) + "\n")
示例#2
0
def main():
    """Command-line entry point: annotate similarity-search hits with TaxIDs.

    Options are parsed with ``docopt`` from the module docstring.  Every hit
    in ``--hit_file`` is written to the output file as
    ``qseqid<TAB>taxid<TAB>score<TAB>sseqid``.  The TaxID is either the single
    value given with ``--custom_taxid`` or, failing that, the value looked up
    for the subject ID in ``--taxid_mapping_file``.  Subjects missing from the
    mapping get ``"N/A"`` and a warning is printed for them.

    Returns nothing.  Invalid options are reported through ``BtLog.error``.
    """
    args = docopt(__doc__)
    out_f, taxid_d = None, {}
    hit_f = args['--hit_file']
    hit_col_qseqid = args['--hit_column_qseqid']
    hit_col_sseqid = args['--hit_column_sseqid']
    hit_col_score = args['--hit_column_score']
    map_f = args['--taxid_mapping_file']
    map_col_sseqid = args['--map_col_sseqid']
    map_col_taxid = args['--map_col_taxid']
    custom_taxid = args['--custom_taxid']
    prefix = args['--out']

    try:
        hit_col_qseqid = int(hit_col_qseqid)
        hit_col_sseqid = int(hit_col_sseqid)
        hit_col_score = int(hit_col_score)
    except (TypeError, ValueError):
        # int() raises ValueError for non-numeric text and TypeError for None.
        # The option names are passed as a separate argument: applying ``%``
        # to the bare code '41' raised TypeError before the error could be
        # reported.
        # NOTE(review): assumes BtLog.error(code, *args) fills in the message
        # template itself -- confirm against BtLog.
        BtLog.error(
            '41',
            "--hit_column_qseqid, --hit_column_sseqid and --hit_column_score")

    # Decide on the raw option value, before integer conversion, so that an
    # explicit "--custom_taxid 0" is still honoured.
    use_custom_taxid = bool(custom_taxid)
    if use_custom_taxid:
        try:
            custom_taxid = int(custom_taxid)
        except (TypeError, ValueError):
            # Non-numeric text raises ValueError, which was previously uncaught.
            BtLog.error('26')
        out_f = BtIO.getOutFile(hit_f, prefix, "taxID_%s.out" % custom_taxid)
    elif map_f:
        if map_col_sseqid and map_col_taxid:
            try:
                map_col_sseqid = int(map_col_sseqid)
                map_col_taxid = int(map_col_taxid)
            except (TypeError, ValueError):
                BtLog.error('44')
            # print(<single expression>) behaves the same under the Python 2
            # print statement and the Python 3 print function.
            print(BtLog.status_d['1'] % ("Mapping file", map_f))
            taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid)
            out_f = BtIO.getOutFile(hit_f, prefix, "taxified.out")
        else:
            BtLog.error('44')
    else:
        BtLog.error('41')

    output = []
    print(BtLog.status_d['1'] % ("similarity search result", hit_f))
    with open(hit_f) as fh:
        for line in fh:
            col = line.rstrip("\n").split()
            if not col:
                # Blank lines (e.g. a trailing newline) carry no hit.
                continue
            qseqid = col[hit_col_qseqid]
            sseqid = col[hit_col_sseqid]
            score = col[hit_col_score]
            if use_custom_taxid:
                # Every subject receives the same user-supplied TaxID.
                tax_id = custom_taxid
            else:
                if sseqid not in taxid_d:
                    # Actually emit the warning; it used to be formatted and
                    # then thrown away.
                    print(BtLog.warn_d['12'] % (sseqid, map_f))
                tax_id = taxid_d.get(sseqid, "N/A")
            output.append("%s\t%s\t%s\t%s" % (qseqid, tax_id, score, sseqid))
    if output:
        with open(out_f, "w") as fh:
            print(BtLog.status_d['24'] % out_f)
            fh.write("\n".join(output) + "\n")