def find_orthologs_per_hit(arguments):
    annota.connect()
    line, args = arguments

    if not line.strip() or line.startswith('#'):
        return None
    r = map(str.strip, line.split('\t'))

    query_name = r[0]
    best_hit_name = r[1]
    if best_hit_name == '-' or best_hit_name == 'ERROR':
        return None

    best_hit_evalue = float(r[2])
    best_hit_score = float(r[3])
    
    # dp we need this?
    #if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
    #    return None

    all_orthologies = annota.get_member_orthologs(best_hit_name)
    orthologs = sorted(all_orthologies[args.orthology_type])
    taxid = query_name.split(".")[0]
    # target species and taxid to be added
    return (query_name, [], taxid, orthologs)
示例#2
0
def annotate_hit_line(arguments):
    annota.connect()
    line, args = arguments

    if not line.strip() or line.startswith('#'):
        return None
    r = map(str.strip, line.split('\t'))

    query_name = r[0]
    best_hit_name = r[1]
    if best_hit_name == '-' or best_hit_name == 'ERROR':
        return None

    best_hit_evalue = float(r[2])
    best_hit_score = float(r[3])
    if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
        return None

    match_nogs = annota.get_member_ogs(best_hit_name)
    if not match_nogs:
        return None

    match_levels = set([nog.split("@")[1] for nog in match_nogs])
    if args.tax_scope == "auto":
        for level in TAXONOMIC_RESOLUTION:
            if level in match_levels:
                annot_levels = set(LEVEL_CONTENT.get(level, [level]))
                annot_levels.add(level)
                annot_level_max = "%s[%d]" %(level, len(annot_levels))
                break
    else:
        annot_levels = set(LEVEL_CONTENT.get(args.tax_scope, [args.tax_scope]))
        annot_levels.add(args.tax_scope)
        annot_level_max = "%s[%d]" %(args.tax_scope, len(annot_levels))

    all_orthologies = annota.get_member_orthologs(best_hit_name, target_levels=annot_levels)
    orthologs = sorted(all_orthologies[args.target_orthologs])
    if args.excluded_taxa:
        orthologs = [o for o in orthologs if not o.startswith("%s." %args.excluded_taxa)]

    if orthologs:
        pname, gos, kegg, bigg = annota.summarize_annotations(orthologs,
                                                         target_go_ev=args.go_evidence,
                                                         excluded_go_ev=args.go_excluded)

        best_name = ''
        if pname:
            name_candidate, freq = pname.most_common(1)[0]
            if freq >= 2:
                best_name = name_candidate
    else:
        pname = []
        best_name = ''
        gos = set()
        kegg = set()
        bigg = set()

    return (query_name, best_hit_name, best_hit_evalue, best_hit_score,
            best_name, gos, kegg, bigg, annot_level_max, match_nogs, orthologs)
示例#3
0
def _annotate_hit_line(arguments):
    annota.connect()
    line, args = arguments

    if not line.strip() or line.startswith('#'):
        return None
    r = map(str.strip, line.split('\t'))

    query_name = r[0]
    best_hit_name = r[1]
    if best_hit_name == '-' or best_hit_name == 'ERROR':
        return None

    best_hit_evalue = float(r[2])
    best_hit_score = float(r[3])
    if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
        return None

    match_nogs = annota.get_member_ogs(best_hit_name)
    if not match_nogs:
        return None

    match_levels = set()
    for nog in match_nogs:
        match_levels.update(LEVEL_PARENTS[nog.split("@")[1]])

    swallowest_level = sorted(match_levels & set(LEVEL_DEPTH.keys()),
                              key=lambda x: LEVEL_DEPTH[x],
                              reverse=True)[0]

    annot_levels = set()
    if args.tax_scope == "auto":
        for level in TAXONOMIC_RESOLUTION:
            if level in match_levels:
                annot_levels.add(level)
                annot_level_max = LEVEL_NAMES.get(level, level)
                break
    else:
        annot_levels.add(args.tax_scope)
        annot_level_max = LEVEL_NAMES.get(args.tax_scope, args.tax_scope)

    if args.target_taxa != 'all':
        target_taxa = orthology.normalize_target_taxa(args.target_taxa)
    else:
        target_taxa = None

    try:
        all_orthologies = annota.get_member_orthologs(
            best_hit_name, target_taxa=target_taxa, target_levels=annot_levels)
    except Exception:
        orthologs = None
        status = 'Error'
    else:
        orthologs = sorted(all_orthologies[args.target_orthologs])
        if args.excluded_taxa:
            orthologs = [
                o for o in orthologs
                if not o.startswith("%s." % args.excluded_taxa)
            ]
        status = 'OK'

    if orthologs:
        annotations = annota.summarize_annotations(
            orthologs,
            target_go_ev=args.go_evidence,
            excluded_go_ev=args.go_excluded)
    else:
        annotations = {}

    return (query_name, best_hit_name, best_hit_evalue, best_hit_score,
            annotations, annot_level_max, swallowest_level, match_nogs,
            orthologs)
示例#4
0
def annotate_hits_file_sequential(seed_orthologs_file, annot_file,
                                  hmm_hits_file, args):
    annot_header = (
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "predicted_gene_name",
        "GO_terms",
        "KEGG_pathways",
        "Annotation_tax_scope",
        "OGs",
        "bestOG|evalue|score",
        "COG cat",
        "eggNOG annot",
    )
    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    seq2annotOG = annota.get_ogs_annotations(
        set([v[0] for v in seq2bestOG.itervalues()]))

    print colorify("Functional annotation of refined hits starts now", 'green')
    OUT = open(annot_file, "w")
    if not args.no_file_comments:
        print >> OUT, '# ' + time.ctime()
        print >> OUT, '# ' + ' '.join(sys.argv)
        print >> OUT, '\t'.join(annot_header)

    qn = 0
    for line in open(seed_orthologs_file):
        if not line.strip() or line.startswith('#'):
            continue
        if qn and (qn % 500 == 0):
            total_time = time.time() - start_time
            print >> sys.stderr, qn + 1, total_time, "%0.2f q/s (refinement)" % (
                (float(qn + 1) / total_time))
            sys.stderr.flush()
        qn += 1
        r = map(str.strip, line.split('\t'))

        query_name = r[0]
        best_hit_name = r[1]
        if best_hit_name == '-' or best_hit_name == 'ERROR':
            continue

        best_hit_evalue = float(r[2])
        best_hit_score = float(r[3])
        if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
            continue

        match_nogs = annota.get_member_ogs(best_hit_name)
        if not match_nogs:
            continue

        match_levels = set([nog.split("@")[1] for nog in match_nogs])
        if args.tax_scope == "auto":
            for level in TAXONOMIC_RESOLUTION:
                if level in match_levels:
                    annot_levels = set(LEVEL_CONTENT.get(level, [level]))
                    annot_levels.add(level)
                    annot_level_max = "%s[%d]" % (level, len(annot_levels))
                    break
        else:
            annot_levels = set(
                LEVEL_CONTENT.get(args.tax_scope, [args.tax_scope]))
            annot_levels.add(args.tax_scope)
            annot_level_max = "%s[%d]" % (args.tax_scope, len(annot_levels))

        all_orthologies = annota.get_member_orthologs(
            best_hit_name, target_levels=annot_levels)
        orthologs = sorted(all_orthologies[args.target_orthologs])
        if args.excluded_taxa:
            orthologs = [
                o for o in orthologs
                if not o.startswith("%s." % args.excluded_taxa)
            ]

        if orthologs:
            pname, gos, keggs = annota.get_member_annotations(
                orthologs,
                target_go_ev=args.go_evidence,
                excluded_go_ev=args.go_excluded)
            best_name = ''
            if pname:
                name_candidate, freq = pname.most_common(1)[0]
                if freq >= 2:
                    best_name = name_candidate
        else:
            pname = []
            best_name = ''
            gos = set()
            keggs = set()

        if query_name in seq2bestOG:
            (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom, seqto,
             q_coverage) = seq2bestOG[query_name]
            bestOG = '%s|%s|%s' % (hitname, evalue, score)
            og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
        else:
            bestOG = 'NA|NA|NA'
            og_cat, og_desc = '', ''

        print >> OUT, '\t'.join(
            map(str, (
                query_name,
                best_hit_name,
                best_hit_evalue,
                best_hit_score,
                best_name,
                ','.join(sorted(gos)),
                ','.join(sorted(map(lambda x: "map%05d" % x, map(int,
                                                                 keggs)))),
                annot_level_max,
                ','.join(match_nogs),
                bestOG,
                og_cat.replace('\n', ''),
                og_desc.replace('\n', ' '),
            )))
        OUT.flush()
    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >> OUT, '# %d queries scanned' % (qn + 1)
        print >> OUT, '# Total time (seconds):', elapsed_time
        print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time))
    OUT.close()
    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
示例#5
0
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    annot_header = ("#query_name",
                    "seed_eggNOG_ortholog",
                    "seed_ortholog_evalue",
                    "seed_ortholog_score",
                    "predicted_gene_name",
                    "GO_terms",
                    "KEGG_pathways",
                    "Annotation_tax_scope",
                    "OGs",
                    "bestOG|evalue|score",
                    "COG cat",
                    "eggNOG annot",
                    )
    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    seq2annotOG = annota.get_ogs_annotations(set([v[0] for v in seq2bestOG.itervalues()]))

    print colorify("Functional annotation of refined hits starts now", 'green')
    OUT = open(annot_file, "w")
    if not args.no_file_comments:
        print >>OUT, '# ' + time.ctime()
        print >>OUT, '# ' + ' '.join(sys.argv)
        print >>OUT, '\t'.join(annot_header)

    qn = 0
    for line in open(seed_orthologs_file):
        if not line.strip() or line.startswith('#'):
            continue
        if qn and (qn % 500 == 0):
            total_time = time.time() - start_time
            print >>sys.stderr, qn+1, total_time, "%0.2f q/s (refinement)" % (
                (float(qn + 1) / total_time))
            sys.stderr.flush()
        qn += 1
        r = map(str.strip, line.split('\t'))

        query_name = r[0]
        best_hit_name = r[1]
        if best_hit_name == '-' or best_hit_name == 'ERROR':
            continue

        best_hit_evalue = float(r[2])
        best_hit_score = float(r[3])
        if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
            continue

        match_nogs = annota.get_member_ogs(best_hit_name)
        if not match_nogs:
            continue

        match_levels = set([nog.split("@")[1] for nog in match_nogs])
        if args.tax_scope == "auto":
            for level in TAXONOMIC_RESOLUTION:
                if level in match_levels:
                    annot_levels = set(LEVEL_CONTENT.get(level, [level]))
                    annot_levels.add(level)
                    annot_level_max = "%s[%d]" %(level, len(annot_levels))
                    break
        else:
            annot_levels = set(LEVEL_CONTENT.get(args.tax_scope, [args.tax_scope]))
            annot_levels.add(args.tax_scope)
            annot_level_max = "%s[%d]" %(args.tax_scope, len(annot_levels))

        all_orthologies = annota.get_member_orthologs(best_hit_name, target_levels=annot_levels)
        orthologs = sorted(all_orthologies[args.target_orthologs])
        if args.excluded_taxa:
            orthologs = [o for o in orthologs if not o.startswith("%s." %args.excluded_taxa)]

        if orthologs:
            pname, gos, keggs = annota.get_member_annotations(orthologs,
                                                              excluded_gos=set(["IEA", "ND"]))
            best_name = ''
            if pname:
                name_candidate, freq = pname.most_common(1)[0]
                if freq >= 2:
                    best_name = name_candidate
        else:
            pname = []
            best_name = ''
            gos = set()
            keggs = set()

        if query_name in seq2bestOG:
            (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom,
             seqto, q_coverage) = seq2bestOG[query_name]
            bestOG = '%s|%s|%s' %(hitname, evalue, score)
            og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
        else:
            bestOG = 'NA|NA|NA'
            og_cat, og_desc = '', ''


        print >>OUT, '\t'.join(map(str, (query_name,
                                         best_hit_name,
                                         best_hit_evalue,
                                         best_hit_score,
                                         best_name,
                                         ','.join(sorted(gos)),
                                         ','.join(sorted(map(lambda x: "map%05d"%x, map(int, keggs)))),
                                         annot_level_max,
                                         ','.join(match_nogs),
                                         bestOG,
                                         og_cat.replace('\n', ''),
                                         og_desc.replace('\n', ' '),
                                         )))
        OUT.flush()
    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >>OUT, '# %d queries scanned' % (qn + 1)
        print >>OUT, '# Total time (seconds):', elapsed_time
        print >>OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time))
    OUT.close()
    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')