Пример #1
0
def main():
    """Command-line entry point: run the UniRef90 DIAMOND search on a FASTA.

    Reads the parsed arguments from ``get_args()``, fills in the default
    output path, translates the input first when ``seqtype`` is ``"cds"``,
    and then calls ``uniref_search`` against the UniRef90 database.

    Only the UniRef90 search is executed here.  The later stages (parsing
    the hits, the UniRef50 search, the transitive mapping and the final
    re-annotation) are disabled; they are kept below as comments.
    """
    args = get_args()

    # Propagate the force-search flag to the module-level switch that
    # uniref_search() consults before reusing an existing hits file.
    global g_force_search
    if args.force_search:
        g_force_search = True

    # Default output path sits next to the input FASTA.
    if args.out is None:
        args.out = args.fasta + ".annotated"

    # CDS input is translated into the temp directory and then searched
    # as protein.
    search_input = args.fasta
    if args.seqtype == "cds":
        fasta_name = os.path.basename(args.fasta)
        search_input = os.path.join(args.temp, fasta_name) + ".translated"
        say("Translating input fasta to:\n ", search_input)
        translate_fasta(args.fasta, search_input)
        args.seqtype = "prot"

    # UniRef90 search; the returned hits path is only needed by the
    # disabled stages below.
    search_settings = {
        "diamond": args.diamond,
        "database": args.uniref90db,
        "query": search_input,
        "seqtype": args.seqtype,
        "temp": args.temp,
        "diamond_options": args.diamond_options,
    }
    uniref90hits = uniref_search(**search_settings)

    # --- disabled downstream stages (not executed) -----------------------
    # uniref90hits = search_input + ".uniref90.hits"
    # uniref90map = parse_results(uniref90hits)
    # # perform uniref50 search
    # uniref50hits = uniref_search(
    #     diamond=args.diamond,
    #     database=args.uniref50db,
    #     query=search_input,
    #     seqtype=args.seqtype,
    #     temp=args.temp,
    #     diamond_options=args.diamond_options,
    # )
    # uniref50hits = search_input + ".uniref50.hits"
    # uniref50map = parse_results(uniref50hits)
    # # override mappings?
    # overrides = {}
    # if args.transitive_map is not None:
    #     overrides = trans_mapping(uniref90map, args.transitive_map)
    # # reannotate the fasta
    # reannotate(
    #     query=args.fasta,
    #     out=args.out,
    #     uniref90map=uniref90map,
    #     uniref50map=uniref50map,
    #     overrides=overrides,
    # )
    # ----------------------------------------------------------------------

    say("Finished successfully.")
Пример #2
0
def parse_results(results):
    """Build a query-id -> UniRef-id mapping from a tab-separated hits file.

    The minimum percent identity is the numeric remainder of
    ``get_mode(results)`` once the text ``"uniref"`` is removed.  For each
    query id, the first row that meets both the identity cutoff and the
    ``c_min_coverage`` cutoff is kept; later rows for that query are
    ignored.

    Args:
        results: Path to the hits file.

    Returns:
        dict mapping ``qseqid`` to the part of ``sseqid`` before the
        first ``"|"``.
    """
    say("Parsing results file:\n ", results)
    check_path(results)

    identity_cutoff = float(get_mode(results).replace("uniref", ""))
    accepted = {}

    with open(results) as handle:
        for fields in csv.reader(handle, csv.excel_tab):
            hit = Hit(fields, config=c_output_format)
            # A query that already has an accepted hit is never replaced.
            if hit.qseqid in accepted:
                continue
            identity_ok = float(hit.pident) >= identity_cutoff
            if identity_ok and float(hit.mcov) >= float(c_min_coverage):
                accepted[hit.qseqid] = hit.sseqid.split("|")[0]

    return accepted
Пример #3
0
def trans_mapping(uniref90map, p_trans_map):
    """Infer UniRef50 codes for sequences from their UniRef90 assignments.

    The mapping file is read as tab-separated rows of exactly two columns,
    ``uniref90`` and ``uniref50``.  Every sequence header whose UniRef90
    code (with any trailing ``-<digits>`` suffix removed) matches the first
    column is assigned the UniRef50 code in the second column.  If the same
    UniRef90 code occurs on several rows, the last row wins.

    Args:
        uniref90map: dict mapping sequence header -> UniRef90 code.
        p_trans_map: Path to the UniRef90 -> UniRef50 mapping file.

    Returns:
        dict mapping sequence header -> UniRef50 code for every header
        whose UniRef90 code appears in the mapping file.

    Raises:
        ValueError: if a non-empty row does not have exactly two columns.
    """
    say("Loading transitive mapping file:\n ", p_trans_map)
    check_path(p_trans_map)

    # Reverse index: UniRef90 code -> all headers assigned to it.
    uniref90map_r = {}
    for header, uniref90 in uniref90map.items():
        # modify by yancong: drop a trailing "-<digits>" suffix so the code
        # can match the first column of the mapping file.
        uniref90 = re.sub(r"-[0-9]+$", "", uniref90)
        uniref90map_r.setdefault(uniref90, set()).add(header)

    overrides = {}
    with open(p_trans_map) as fh:
        for row in csv.reader(fh, csv.excel_tab):
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on the two-value unpack below.
            if not row:
                continue
            uniref90, uniref50 = row
            for header in uniref90map_r.get(uniref90, ()):
                overrides[header] = uniref50
    return overrides
Пример #4
0
def uniref_search(diamond=None,
                  database=None,
                  query=None,
                  seqtype=None,
                  temp=None,
                  diamond_options=None):
    """Run a DIAMOND search of ``query`` against ``database``.

    The hits file is written to
    ``<temp>/<basename of query>.<mode>.hits`` where ``mode`` comes from
    ``get_mode(database)``.  If that file already exists and the
    module-level ``g_force_search`` flag is not set, the search is skipped
    and the existing file is reused.

    Args:
        diamond: Name or path of the DIAMOND executable.
        database: Path to the DIAMOND database.
        query: Path to the query FASTA.
        seqtype: ``"nuc"`` (runs ``blastx``) or ``"prot"`` (runs ``blastp``).
        temp: Directory for DIAMOND temp files and for the hits file.
        diamond_options: Optional string of extra options appended verbatim
            to the command line.

    Returns:
        Path of the hits file.

    Raises:
        KeyError: if ``seqtype`` is neither ``"nuc"`` nor ``"prot"``.
    """
    if which(diamond) is None:
        die("<diamond> is not executable as: {}".format(diamond))
    for path in [database, query, temp]:
        check_path(path)
    binary = {"nuc": "blastx", "prot": "blastp"}[seqtype]
    mode = get_mode(database)
    results = os.path.join(temp, os.path.split(query)[1])
    results = ".".join([results, mode, "hits"])
    command = [
        diamond,
        binary,
        "--db",
        database,
        "--query",
        query,
        "--outfmt",
        c_output_format,
        "--tmpdir",
        temp,
        "--out",
        results,
        # "--id", get_mode( results ).replace( "uniref", "" ),
        c_diamond_filters,
    ]
    # The command is a single shell string, so the pieces are split on
    # whitespace by the shell.
    command = " ".join([str(k) for k in command])
    if diamond_options is not None:
        command += " " + diamond_options
    if not os.path.exists(results) or g_force_search:
        say("Executing:\n ", command)
        status = os.system(command)
        # A failed run must not pass silently: the hits file may be missing
        # or incomplete, and would otherwise be reused on the next run.
        if status != 0:
            die("<diamond> search failed (status {}) while executing:\n  {}"
                .format(status, command))
    else:
        say("Using existing results file:\n ", results)
    return results
Пример #5
0
def reannotate(query=None,
               out=None,
               uniref90map=None,
               uniref50map=None,
               overrides=None):
    """Write a copy of a FASTA with UniRef codes appended to each header.

    Each header line is rewritten as
    ``<original header>|<UniRef90 code>|<UniRef50 code>``.  Sequence lines
    are copied with surrounding whitespace stripped, and blank lines are
    dropped.  The lookup key is the header text after ``>`` up to the first
    whitespace.  For UniRef50, an entry in ``overrides`` takes precedence
    over ``uniref50map``.  A summary of the counts is reported via ``say``.

    Args:
        query: Path of the input FASTA.
        out: Path of the annotated FASTA to write.
        uniref90map: dict header -> UniRef90 code (``None`` means empty).
        uniref50map: dict header -> UniRef50 code (``None`` means empty).
        overrides: dict header -> UniRef50 code inferred from UniRef90
            (``None`` means empty).

    Returns:
        None
    """
    # Treat omitted mappings as empty so the membership tests below work.
    uniref90map = {} if uniref90map is None else uniref90map
    uniref50map = {} if uniref50map is None else uniref50map
    overrides = {} if overrides is None else overrides

    say("Writing new output file:\n ", out)
    ntot = nmap90 = ninf50 = nmap50 = 0
    # Both handles are closed even if an error occurs part-way through.
    with open(out, "w") as oh, open(query) as fh:
        for line in fh:
            line = line.strip()
            if line == "":
                continue
            if line[0] != ">":
                print(line, file=oh)
                continue
            # diamond breaks the header on whitespace
            header = line[1:].split()[0]
            ntot += 1
            uniref90code = "UniRef90_unknown"
            if header in uniref90map:
                uniref90code = uniref90map[header]
                nmap90 += 1
            uniref50code = "UniRef50_unknown"
            if header in overrides:
                uniref50code = overrides[header]
                ninf50 += 1
            elif header in uniref50map:
                uniref50code = uniref50map[header]
                nmap50 += 1
            print("|".join([line, uniref90code, uniref50code]), file=oh)

    def percent(count):
        # An input without any header lines would otherwise divide by zero.
        return 100 * count / float(ntot) if ntot else 0.0

    # report
    say("Summary of annotations:")
    say("  Genes in input FASTA: {:,}".format(ntot))
    say("  UniRef90 codes assigned: {:,} ({:.1f}%)".format(
        nmap90, percent(nmap90)))
    say("  UniRef50 codes assigned: {:,} ({:.1f}%)".format(
        nmap50 + ninf50, percent(nmap50 + ninf50)))
    say("  UniRef50 codes inferred from UniRef90 codes: {:,} ({:.1f}%)".format(
        ninf50, percent(ninf50)))
    # done
    return None