def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args): HIT_HEADER = [ "#query_name", "seed_eggNOG_ortholog", "seed_ortholog_evalue", "seed_ortholog_score", "best_tax_level", ] HIT_OG_HEADER = [ "taxonomic scope", "eggNOG OGs", "best eggNOG OG", "COG Functional cat.", "eggNOG free text desc." ] start_time = time.time() seq2bestOG = {} if pexists(hmm_hits_file): seq2bestOG = get_seq_hmm_matches(hmm_hits_file) seq2annotOG = annota.get_ogs_annotations( set([v[0] for v in seq2bestOG.itervalues()])) print colorify("Functional annotation of refined hits starts now", 'green') OUT = open(annot_file, "w") if args.report_orthologs: ORTHOLOGS = open(annot_file + ".orthologs", "w") if not args.no_file_comments: print >> OUT, '# emapper version:', get_version( ), 'emapper DB:', get_db_version() print >> OUT, '# command: ./emapper.py ', ' '.join(sys.argv[1:]) print >> OUT, '# time: ' + time.ctime() print >> OUT, '\t'.join(HIT_HEADER + ANNOTATIONS_HEADER + HIT_OG_HEADER) qn = 0 pool = multiprocessing.Pool(args.cpu) for result in pool.imap(annotate_hit_line, iter_hit_lines(seed_orthologs_file, args)): qn += 1 if qn and (qn % 500 == 0): total_time = time.time() - start_time print >> sys.stderr, qn, total_time, "%0.2f q/s (func. annotation)" % ( (float(qn) / total_time)) sys.stderr.flush() if result: (query_name, best_hit_name, best_hit_evalue, best_hit_score, annotations, annot_level_max, swallowest_level, match_nogs, orthologs) = result if query_name in seq2bestOG: (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom, seqto, q_coverage) = seq2bestOG[query_name] bestOG = '%s|%s|%s' % (hitname, evalue, score) og_cat, og_desc = seq2annotOG.get(hitname, ['', '']) else: bestOG = 'NA|NA|NA' og_cat, og_desc = annota.get_best_og_description(match_nogs) if args.report_orthologs: print >> ORTHOLOGS, '\t'.join( map(str, (query_name, ','.join(orthologs)))) # prepare annotations for printing annot_columns = [ query_name, best_hit_name, str(best_hit_evalue), str(best_hit_score), LEVEL_NAMES[swallowest_level] ] for h in ANNOTATIONS_HEADER: if h in annotations: annot_columns.append(','.join(sorted(annotations[h]))) else: annot_columns.append('') annot_columns.extend([ annot_level_max, ','.join(match_nogs), bestOG, og_cat.replace('\n', ''), og_desc.replace('\n', ' ') ]) print >> OUT, '\t'.join(annot_columns) #OUT.flush() pool.terminate() elapsed_time = time.time() - start_time if not args.no_file_comments: print >> OUT, '# %d queries scanned' % (qn) print >> OUT, '# Total time (seconds):', elapsed_time print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn) / elapsed_time)) OUT.close() if args.report_orthologs: ORTHOLOGS.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn, elapsed_time, "%0.2f q/s" % ((float(qn) / elapsed_time))), 'lblue')
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args): annot_header = ( "#query_name", "seed_eggNOG_ortholog", "seed_ortholog_evalue", "seed_ortholog_score", "predicted_gene_name", "GO_terms", "KEGG_KOs", "BiGG_reactions", "Annotation_tax_scope", "OGs", "bestOG|evalue|score", "COG cat", "eggNOG annot", ) start_time = time.time() seq2bestOG = {} if pexists(hmm_hits_file): seq2bestOG = get_seq_hmm_matches(hmm_hits_file) seq2annotOG = annota.get_ogs_annotations( set([v[0] for v in seq2bestOG.itervalues()])) print colorify("Functional annotation of refined hits starts now", 'green') OUT = open(annot_file, "w") if args.report_orthologs: ORTHOLOGS = open(annot_file + ".orthologs", "w") if not args.no_file_comments: print >> OUT, '# emapper version:', get_version( ), 'emapper DB:', get_db_version() print >> OUT, '# command: ./emapper.py ', ' '.join(sys.argv[1:]) print >> OUT, '# time: ' + time.ctime() print >> OUT, '\t'.join(annot_header) qn = 0 pool = multiprocessing.Pool(args.cpu) for result in pool.imap(annotate_hit_line, iter_hit_lines(seed_orthologs_file, args)): qn += 1 if qn and (qn % 500 == 0): total_time = time.time() - start_time print >> sys.stderr, qn, total_time, "%0.2f q/s (refinement)" % ( (float(qn) / total_time)) sys.stderr.flush() if result: (query_name, best_hit_name, best_hit_evalue, best_hit_score, best_name, gos, kegg, bigg, annot_level_max, match_nogs, orthologs) = result if query_name in seq2bestOG: (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom, seqto, q_coverage) = seq2bestOG[query_name] bestOG = '%s|%s|%s' % (hitname, evalue, score) og_cat, og_desc = seq2annotOG.get(hitname, ['', '']) else: bestOG = 'NA|NA|NA' og_cat, og_desc = annota.get_best_og_description(match_nogs) if args.report_orthologs: print >> ORTHOLOGS, '\t'.join( map(str, (query_name, ','.join(orthologs)))) print >> OUT, '\t'.join( map(str, ( query_name, best_hit_name, best_hit_evalue, best_hit_score, best_name, ','.join(sorted(gos)), ','.join(sorted(kegg)), ','.join(sorted(bigg)), annot_level_max, ','.join(match_nogs), bestOG, og_cat.replace('\n', ''), og_desc.replace('\n', ' '), ))) OUT.flush() pool.terminate() elapsed_time = time.time() - start_time if not args.no_file_comments: print >> OUT, '# %d queries scanned' % (qn) print >> OUT, '# Total time (seconds):', elapsed_time print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn) / elapsed_time)) OUT.close() if args.report_orthologs: ORTHOLOGS.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn, elapsed_time, "%0.2f q/s" % ((float(qn) / elapsed_time))), 'lblue')