Exemplo n.º 1
0
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    """Annotate seed-ortholog hits in parallel and write a tab-separated report.

    Every item yielded by ``iter_hit_lines(seed_orthologs_file, args)`` is
    handed to ``annotate_hit_line`` through a ``multiprocessing.Pool`` with
    ``args.cpu`` workers. Each non-empty result becomes one row of
    ``annot_file`` with the columns, in order: the hit columns (query name,
    seed ortholog, its e-value and score, ``LEVEL_NAMES[swallowest_level]``),
    one column per entry of ``ANNOTATIONS_HEADER`` (sorted values joined with
    commas, empty when absent), and finally the OG columns (annotation level,
    matching OGs, ``bestOG`` as ``name|evalue|score``, OG category and OG
    description with newlines removed/replaced).

    Args:
        seed_orthologs_file: Forwarded unchanged to ``iter_hit_lines``.
        annot_file: Path of the report to write (opened with mode ``"w"``).
            When ``args.report_orthologs`` is true, a second file named
            ``annot_file + ".orthologs"`` receives one
            ``query<TAB>ortholog,ortholog,...`` line per annotated query.
        hmm_hits_file: Path handed to ``get_seq_hmm_matches`` when it exists;
            queries found there get their best OG from it, all others get
            ``NA|NA|NA`` and a description from
            ``annota.get_best_og_description``.
        args: Options object; this function reads ``cpu``,
            ``report_orthologs`` and ``no_file_comments`` and also forwards
            the whole object to ``iter_hit_lines``.

    The output files are closed and the worker pool is terminated even when
    annotation fails part-way through; the exception still propagates.
    """
    hit_header = [
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "best_tax_level",
    ]

    hit_og_header = [
        "taxonomic scope", "eggNOG OGs", "best eggNOG OG",
        "COG Functional cat.", "eggNOG free text desc."
    ]

    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    # Look up the annotations of every distinct best-OG name in one call.
    seq2annotOG = annota.get_ogs_annotations(
        set(v[0] for v in seq2bestOG.values()))

    print(colorify("Functional annotation of refined hits starts now", 'green'))

    qn = 0
    orthologs_fh = None
    out_fh = open(annot_file, "w")
    try:
        if args.report_orthologs:
            orthologs_fh = open(annot_file + ".orthologs", "w")

        if not args.no_file_comments:
            out_fh.write('# emapper version: %s emapper DB: %s\n' %
                         (get_version(), get_db_version()))
            # The extra ' ' reproduces the separator that the print statement
            # used to insert after the literal's own trailing space.
            out_fh.write('# command: ./emapper.py ' + ' ' +
                         ' '.join(sys.argv[1:]) + '\n')
            out_fh.write('# time: ' + time.ctime() + '\n')
            out_fh.write('\t'.join(hit_header + ANNOTATIONS_HEADER +
                                   hit_og_header) + '\n')

        pool = multiprocessing.Pool(args.cpu)
        try:
            for result in pool.imap(annotate_hit_line,
                                    iter_hit_lines(seed_orthologs_file, args)):
                qn += 1
                # Progress report on stderr every 500 processed queries.
                if qn % 500 == 0:
                    total_time = time.time() - start_time
                    sys.stderr.write(
                        "%s %s %0.2f q/s (func. annotation)\n" %
                        (qn, total_time, float(qn) / total_time))
                    sys.stderr.flush()

                # Empty results are counted as scanned but produce no row.
                if not result:
                    continue

                (query_name, best_hit_name, best_hit_evalue, best_hit_score,
                 annotations, annot_level_max, swallowest_level, match_nogs,
                 orthologs) = result

                if query_name in seq2bestOG:
                    # Only the first three fields are reported; the full
                    # unpack keeps the 9-field shape check of the record.
                    (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom,
                     seqto, q_coverage) = seq2bestOG[query_name]
                    bestOG = '%s|%s|%s' % (hitname, evalue, score)
                    og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
                else:
                    bestOG = 'NA|NA|NA'
                    og_cat, og_desc = annota.get_best_og_description(
                        match_nogs)

                if orthologs_fh is not None:
                    orthologs_fh.write('\t'.join(
                        map(str, (query_name, ','.join(orthologs)))) + '\n')

                # Assemble the row: hit columns, annotation columns, OG columns.
                annot_columns = [
                    query_name, best_hit_name,
                    str(best_hit_evalue),
                    str(best_hit_score), LEVEL_NAMES[swallowest_level]
                ]

                for h in ANNOTATIONS_HEADER:
                    if h in annotations:
                        annot_columns.append(','.join(sorted(annotations[h])))
                    else:
                        annot_columns.append('')

                annot_columns.extend([
                    annot_level_max, ','.join(match_nogs), bestOG,
                    og_cat.replace('\n', ''),
                    og_desc.replace('\n', ' ')
                ])

                out_fh.write('\t'.join(annot_columns) + '\n')
        finally:
            # Stop the workers even if annotation or writing failed.
            pool.terminate()

        elapsed_time = time.time() - start_time
        if not args.no_file_comments:
            out_fh.write('# %d queries scanned\n' % (qn))
            out_fh.write('# Total time (seconds): %s\n' % (elapsed_time,))
            out_fh.write('# Rate: %0.2f q/s\n' % (float(qn) / elapsed_time))
    finally:
        out_fh.close()
        if orthologs_fh is not None:
            orthologs_fh.close()

    print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                   (qn, elapsed_time,
                    "%0.2f q/s" % (float(qn) / elapsed_time)), 'lblue'))
Exemplo n.º 2
0
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    """Annotate seed-ortholog hits in parallel and write a tab-separated report.

    Every item yielded by ``iter_hit_lines(seed_orthologs_file, args)`` is
    handed to ``annotate_hit_line`` through a ``multiprocessing.Pool`` with
    ``args.cpu`` workers. Each non-empty result becomes one row of
    ``annot_file`` whose 13 columns follow ``annot_header`` below: query name,
    seed ortholog with e-value and score, predicted gene name, sorted GO /
    KEGG / BiGG terms joined with commas, annotation level, matching OGs,
    ``bestOG`` as ``name|evalue|score``, OG category and OG description (with
    newlines removed/replaced).

    Args:
        seed_orthologs_file: Forwarded unchanged to ``iter_hit_lines``.
        annot_file: Path of the report to write (opened with mode ``"w"``).
            When ``args.report_orthologs`` is true, a second file named
            ``annot_file + ".orthologs"`` receives one
            ``query<TAB>ortholog,ortholog,...`` line per annotated query.
        hmm_hits_file: Path handed to ``get_seq_hmm_matches`` when it exists;
            queries found there get their best OG from it, all others get
            ``NA|NA|NA`` and a description from
            ``annota.get_best_og_description``.
        args: Options object; this function reads ``cpu``,
            ``report_orthologs`` and ``no_file_comments`` and also forwards
            the whole object to ``iter_hit_lines``.

    The output files are closed and the worker pool is terminated even when
    annotation fails part-way through; the exception still propagates.
    """
    annot_header = (
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "predicted_gene_name",
        "GO_terms",
        "KEGG_KOs",
        "BiGG_reactions",
        "Annotation_tax_scope",
        "OGs",
        "bestOG|evalue|score",
        "COG cat",
        "eggNOG annot",
    )
    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    # Look up the annotations of every distinct best-OG name in one call.
    seq2annotOG = annota.get_ogs_annotations(
        set(v[0] for v in seq2bestOG.values()))

    print(colorify("Functional annotation of refined hits starts now", 'green'))

    qn = 0
    orthologs_fh = None
    out_fh = open(annot_file, "w")
    try:
        if args.report_orthologs:
            orthologs_fh = open(annot_file + ".orthologs", "w")

        if not args.no_file_comments:
            out_fh.write('# emapper version: %s emapper DB: %s\n' %
                         (get_version(), get_db_version()))
            # The extra ' ' reproduces the separator that the print statement
            # used to insert after the literal's own trailing space.
            out_fh.write('# command: ./emapper.py ' + ' ' +
                         ' '.join(sys.argv[1:]) + '\n')
            out_fh.write('# time: ' + time.ctime() + '\n')
            out_fh.write('\t'.join(annot_header) + '\n')

        pool = multiprocessing.Pool(args.cpu)
        try:
            for result in pool.imap(annotate_hit_line,
                                    iter_hit_lines(seed_orthologs_file, args)):
                qn += 1
                # Progress report on stderr every 500 processed queries. The
                # label names this stage (functional annotation), not the
                # earlier refinement step.
                if qn % 500 == 0:
                    total_time = time.time() - start_time
                    sys.stderr.write(
                        "%s %s %0.2f q/s (func. annotation)\n" %
                        (qn, total_time, float(qn) / total_time))
                    sys.stderr.flush()

                # Empty results are counted as scanned but produce no row.
                if result:
                    (query_name, best_hit_name, best_hit_evalue,
                     best_hit_score, best_name, gos, kegg, bigg,
                     annot_level_max, match_nogs, orthologs) = result

                    if query_name in seq2bestOG:
                        # Only the first three fields are reported; the full
                        # unpack keeps the 9-field shape check of the record.
                        (hitname, evalue, score, qlength, hmmfrom, hmmto,
                         seqfrom, seqto, q_coverage) = seq2bestOG[query_name]
                        bestOG = '%s|%s|%s' % (hitname, evalue, score)
                        og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
                    else:
                        bestOG = 'NA|NA|NA'
                        og_cat, og_desc = annota.get_best_og_description(
                            match_nogs)

                    if orthologs_fh is not None:
                        orthologs_fh.write('\t'.join(
                            map(str, (query_name,
                                      ','.join(orthologs)))) + '\n')

                    out_fh.write('\t'.join(
                        map(str, (
                            query_name,
                            best_hit_name,
                            best_hit_evalue,
                            best_hit_score,
                            best_name,
                            ','.join(sorted(gos)),
                            ','.join(sorted(kegg)),
                            ','.join(sorted(bigg)),
                            annot_level_max,
                            ','.join(match_nogs),
                            bestOG,
                            og_cat.replace('\n', ''),
                            og_desc.replace('\n', ' '),
                        ))) + '\n')

                # Flushed after every query (as before), so rows written so
                # far are pushed out of the buffer while the run continues.
                out_fh.flush()
        finally:
            # Stop the workers even if annotation or writing failed.
            pool.terminate()

        elapsed_time = time.time() - start_time
        if not args.no_file_comments:
            out_fh.write('# %d queries scanned\n' % (qn))
            out_fh.write('# Total time (seconds): %s\n' % (elapsed_time,))
            out_fh.write('# Rate: %0.2f q/s\n' % (float(qn) / elapsed_time))
    finally:
        out_fh.close()
        if orthologs_fh is not None:
            orthologs_fh.close()

    print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                   (qn, elapsed_time,
                    "%0.2f q/s" % (float(qn) / elapsed_time)), 'lblue'))