Пример #1
0
def dump_hmm_matches(fasta_file, hits_file, dbpath, port, scantype, idmap, args):
    hits_header = ("#query_name", "hit", "evalue", "sum_score", "query_length",
                   "hmmfrom", "hmmto", "seqfrom", "seqto", "query_coverage")

    # Cache previous results if resuming is enabled
    VISITED = set()
    if args.resume and pexists(hits_file):
        print colorify("Resuming previous run. Reading computed output from %s" % hits_file, 'yellow')
        VISITED = set([line.split('\t')[0].strip()
                       for line in open(hits_file) if not line.startswith('#')])
        print len(VISITED), 'queries skipped'
        OUT = open(hits_file, 'a')
    else:
        OUT = open(hits_file, 'w')

    print colorify("Sequence mapping starts now!", 'green')
    if not args.no_file_comments:
        print >>OUT, get_call_info()
        print >>OUT, '# ' + '\t'.join(hits_header)
    total_time = 0
    last_time = time.time()
    start_time = time.time()
    qn = 0

    for qn, (name, elapsed, hits, querylen, seq) in enumerate(search.iter_hits(
                                                        fasta_file,
                                                        args.translate,
                                                        args.qtype,
                                                        args.dbtype,
                                                        scantype,
                                                        dbpath,
                                                        port,
                                                        evalue_thr=args.evalue,
                                                        score_thr=args.score,
                                                        qcov_thr=args.qcov,
                                                        fixed_Z=args.Z,
                                                        max_hits=args.maxhits,
                                                        skip=VISITED,
                                                        maxseqlen=args.maxseqlen,
                                                        cpus=args.cpu)):

        if elapsed == -1:
            # error occurred
            print >>OUT, '\t'.join(
                [name] + ['ERROR'] * (len(hits_header) - 1))
        elif not hits:
            print >>OUT, '\t'.join([name] + ['-'] * (len(hits_header) - 1))
        else:
            for hitindex, (hid, heval, hscore, hmmfrom, hmmto, sqfrom, sqto, domscore) in enumerate(hits):
                hitname = hid
                if idmap:
                    hitname = idmap[hid][0]

                print >>OUT, '\t'.join(map(str, [name, hitname, heval, hscore,
                                                 int(querylen), int(hmmfrom),
                                                 int(hmmto), int(sqfrom),
                                                 int(sqto),
                                                 float(sqto - sqfrom) / querylen]))
        OUT.flush()

        # monitoring
        total_time += time.time() - last_time
        last_time = time.time()
        if qn and (qn % 25 == 0):
            print >>sys.stderr, qn + \
                1, total_time, "%0.2f q/s" % ((float(qn + 1) / total_time))
            sys.stderr.flush()

    # Writes final stats
    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >>OUT, '# %d queries scanned' % (qn + 1)
        print >>OUT, '# Total time (seconds):', elapsed_time
        print >>OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time))
    OUT.close()
    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
Пример #2
0
def dump_hmm_matches(fasta_file, hits_file, dbpath, port, scantype, idmap,
                     args):
    hits_header = ("#query_name", "hit", "evalue", "sum_score", "query_length",
                   "hmmfrom", "hmmto", "seqfrom", "seqto", "query_coverage")

    # Cache previous results if resuming is enabled
    VISITED = set()
    if args.resume and pexists(hits_file):
        print colorify(
            "Resuming previous run. Reading computed output from %s" %
            hits_file, 'yellow')
        VISITED = set([
            line.split('\t')[0].strip() for line in open(hits_file)
            if not line.startswith('#')
        ])
        print len(VISITED), 'queries skipped'
        OUT = open(hits_file, 'a')
    else:
        OUT = open(hits_file, 'w')

    print colorify("Sequence mapping starts now!", 'green')
    if not args.no_file_comments:
        print >> OUT, get_call_info()
        print >> OUT, '# ' + '\t'.join(hits_header)
    total_time = 0
    last_time = time.time()
    start_time = time.time()
    qn = 0  # in case nothing to loop bellow
    for qn, (name, elapsed, hits, querylen, seq) in enumerate(
            search.iter_hits(fasta_file,
                             args.translate,
                             args.qtype,
                             args.dbtype,
                             scantype,
                             dbpath,
                             port,
                             evalue_thr=args.evalue,
                             score_thr=args.score,
                             qcov_thr=args.qcov,
                             fixed_Z=args.Z,
                             max_hits=args.maxhits,
                             skip=VISITED,
                             maxseqlen=args.maxseqlen,
                             cpus=args.cpu,
                             base_tempdir=args.temp_dir)):

        if elapsed == -1:
            # error occurred
            print >> OUT, '\t'.join([name] + ['ERROR'] *
                                    (len(hits_header) - 1))
        elif not hits:
            print >> OUT, '\t'.join([name] + ['-'] * (len(hits_header) - 1))
        else:
            for hitindex, (hid, heval, hscore, hmmfrom, hmmto, sqfrom, sqto,
                           domscore) in enumerate(hits):
                hitname = hid
                if idmap:
                    hitname = idmap[hid][0]

                print >> OUT, '\t'.join(
                    map(str, [
                        name, hitname, heval, hscore,
                        int(querylen),
                        int(hmmfrom),
                        int(hmmto),
                        int(sqfrom),
                        int(sqto),
                        float(sqto - sqfrom) / querylen
                    ]))
        OUT.flush()

        # monitoring
        total_time += time.time() - last_time
        last_time = time.time()
        if qn and (qn % 25 == 0):
            print >>sys.stderr, qn + \
                1, total_time, "%0.2f q/s" % ((float(qn + 1) / total_time))
            sys.stderr.flush()

    # Writes final stats
    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >> OUT, '# %d queries scanned' % (qn + 1)
        print >> OUT, '# Total time (seconds):', elapsed_time
        print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time))
    OUT.close()
    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')