def _parse_clades(raw):
    """Split a comma-separated clade string into trimmed, non-empty names."""
    return [name.strip() for name in raw.split(',') if name.strip()]


def _match_clade(lineage, clades):
    """Return the first requested clade found in ``lineage``, else ``None``.

    ``lineage`` may be ``None`` (no lineage available for the tax ID), in
    which case nothing matches.
    """
    if lineage is None:
        return None
    return next((clade for clade in clades if clade in lineage), None)


def main():
    """Filter a Uniref FASTA file, keeping records from the requested clades.

    Command-line arguments:
        -i / --input_fasta   FASTA file to read.
        -o / --output_fasta  FASTA file to create with the retained records.
        -c / --clades        Comma-separated clade names to keep.
        -db / --taxadb       Path to the taxadb sqlite3 file.

    A record is kept when its header carries a ``TaxID=<digits>`` tag and
    the lineage looked up for that ID contains one of the requested clades.
    Headers without a tag, or whose ID has no lineage, are dropped together
    with their sequence lines. A per-clade count of exported records is
    printed at the end.

    Raises:
        Exception: if the taxadb file does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Filter a Uniref FASTA file by taxonomy')

    parser.add_argument('-i',
                        '--input_fasta',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o',
                        '--output_fasta',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-c',
        '--clades',
        type=str,
        required=True,
        help='Comma-separated string of clades to be included.')
    parser.add_argument('-db',
                        '--taxadb',
                        type=str,
                        required=True,
                        help='Path to the taxadb sqlite3 file')
    args = parser.parse_args()

    if not os.path.exists(args.taxadb):
        raise Exception("ERROR:  Couldn't find taxadb specified")

    taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)
    # Trim whitespace so "-c 'Bacteria, Archaea'" matches both clades.
    clades = _parse_clades(args.clades)

    record_count = 0
    print_every = 1000

    clade_counter = {clade: 0 for clade in clades}

    # Maps each tax ID to the clade it matched, or None if the record is to
    # be dropped. Storing the clade (not just a flag) lets cache hits be
    # counted too, and caching None avoids repeated lookups of unknown IDs.
    clade_cache = {}

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    taxid_pattern = re.compile(r'TaxID=(\d+)')

    # Carries the decision made at the last header over its sequence lines.
    keep_entry = False

    # Context managers close both files even if the lookup raises mid-run.
    with open(args.input_fasta) as fin, \
            open(args.output_fasta, 'wt') as fout:
        for line in fin:
            if line.startswith('>'):
                record_count += 1
                if record_count % print_every == 0:
                    print("{0} records processed ...".format(record_count),
                          flush=True)

                matched = None
                m = taxid_pattern.search(line)
                if m:
                    tax_id = m.group(1)
                    if tax_id not in clade_cache:
                        lineage = taxid.lineage_name(tax_id, reverse=True)
                        clade_cache[tax_id] = _match_clade(lineage, clades)
                    matched = clade_cache[tax_id]

                keep_entry = matched is not None
                if keep_entry:
                    # Count every exported record, not only the first one
                    # seen for a given tax ID.
                    clade_counter[matched] += 1

            if keep_entry:
                fout.write(line)

    print("Number of entries exported by clade:")

    for clade, count in clade_counter.items():
        print("\t{0}: {1}".format(clade, count))
Exemplo n.º 2
0
#!/usr/bin/env python3

import sys
from os.path import expanduser
from taxadb.taxid import TaxID


# Lineage lookups go through the taxadb SQLite file under the user's home
# directory.
handler = TaxID(dbtype="sqlite", dbname=expanduser("~") + "/.taxadb/taxadb.sqlite")

# Read one taxonomy ID per stdin line and print "<taxid>\t<lineage>", with
# the lineage names joined by "; ".
for taxid in sys.stdin:
    taxid = taxid.strip()
    if not taxid:
        # Blank line: there is no ID to look up.
        continue

    lineage = handler.lineage_name(taxid, reverse=True)

    # The lookup can yield None when no lineage is available for the ID;
    # emit an empty lineage column instead of failing on len(None).
    lineage_s = "; ".join(lineage) if lineage else ""

    print("{}\t{}".format(taxid, lineage_s))
Exemplo n.º 3
0
def main():
    """Filter a Uniref FASTA file by taxonomy.

    Reads the FASTA given by -i/--input_fasta and writes to -o/--output_fasta
    only those records whose header contains a ``TaxID=<digits>`` tag and
    whose lineage (looked up in the taxadb sqlite3 file given by -db/--taxadb)
    includes one of the comma-separated clades passed with -c/--clades.
    Records without a tag, or whose ID has no lineage, are skipped. When done,
    prints how many records were exported for each clade.

    Raises:
        Exception: if the taxadb file does not exist.
    """
    parser = argparse.ArgumentParser( description='Filter a Uniref FASTA file by taxonomy')

    parser.add_argument('-i', '--input_fasta', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_fasta', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-c', '--clades', type=str, required=True, help='Comma-separated string of clades to be included.' )
    parser.add_argument('-db', '--taxadb', type=str, required=True, help='Path to the taxadb sqlite3 file' )
    args = parser.parse_args()

    if not os.path.exists(args.taxadb):
        raise Exception("ERROR:  Couldn't find taxadb specified")

    taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)

    # Strip surrounding whitespace and ignore empty items so that a value
    # such as "Bacteria, Archaea" still matches both clade names.
    clades = [c.strip() for c in args.clades.split(',') if c.strip()]

    record_count = 0
    print_every = 1000

    clade_counter = {clade: 0 for clade in clades}

    # Remembers, for each tax ID, the clade it matched (None = drop the
    # record). Keeping the clade name allows cache hits to be counted, and
    # storing None spares a second database lookup for IDs with no match.
    id_cache = dict()

    # Raw string avoids the invalid '\d' escape of a plain literal; compiling
    # once hoists the pattern lookup out of the per-line loop.
    taxid_re = re.compile(r'TaxID=(\d+)')

    # Decision taken at the most recent header; applies to the sequence
    # lines that follow it.
    keep_entry = False

    # 'with' guarantees both handles are closed, including on error.
    with open(args.input_fasta) as fin, open(args.output_fasta, 'wt') as fout:
        for line in fin:
            if line.startswith('>'):
                record_count += 1
                if record_count % print_every == 0:
                    print("{0} records processed ...".format(record_count), flush=True)

                keep_entry = False
                m = taxid_re.search(line)
                if m:
                    tax_id = m.group(1)

                    if tax_id in id_cache:
                        matched_clade = id_cache[tax_id]
                    else:
                        lineage = taxid.lineage_name(tax_id, reverse=True)
                        matched_clade = None

                        if lineage is not None:
                            for clade in clades:
                                if clade in lineage:
                                    matched_clade = clade
                                    break

                        id_cache[tax_id] = matched_clade

                    if matched_clade is not None:
                        keep_entry = True
                        # Tally every exported record, including those
                        # resolved from the cache.
                        clade_counter[matched_clade] += 1

            if keep_entry:
                fout.write(line)

    print("Number of entries exported by clade:")

    for clade in clade_counter:
        print("\t{0}: {1}".format(clade, clade_counter[clade]))