Exemplo n.º 1
0
import os
import sys
import argparse
import taxon
import re

# --- fixed locations used by this script ---
ffdir = 'flatfiles'  # directory holding the genbank flatfiles read below
blastpf = 'phage_genes.nr.blastp'  # presumably the blastp output; not used in the visible part
taxondir = "/home2/db/taxonomy/current/"  # directory handed to the taxon readers

# --- lookup tables, filled in while the flat files are read ---
genome = dict()  # protein ID -> genome ID
numprots = dict()  # genome ID -> number of proteins counted for that genome

# Load the taxonomy tables up front, announcing start and finish on stderr.
sys.stderr.write("Reading taxonomy\n")
taxa = taxon.read_nodes(directory=taxondir)
names, blastname = taxon.read_names(directory=taxondir)
sys.stderr.write("Read taxonomy\n")

# First pass over every file in ffdir: count the proteins per genome and
# build the protein ID -> genome ID lookup.
# Each line is split on tabs; column 0 is used as the genome ID and column 5
# as the protein ID.
# A protein ID that has already been seen is reported on stderr. Despite the
# wording of the message this is not fatal: the script carries on and the
# later genome replaces the earlier one in the lookup.
for f in os.listdir(ffdir):
    with open(os.path.join(ffdir, f), 'r') as fin:
        for l in fin:
            p = l.strip().split("\t")
            if p[5] in genome:
                # genome is keyed by protein ID, so the genome recorded
                # earlier has to be fetched with p[5]. Looking it up with the
                # genome ID p[0] raises KeyError or reports the wrong genome.
                sys.stderr.write(
                    "FATAL AND BUGGER: {} was found in {} and {}\n".format(
                        p[5], genome[p[5]], p[0]))
            genome[p[5]] = p[0]
            numprots[p[0]] = numprots.get(p[0], 0) + 1
import sys
import argparse
import taxon

if __name__ == '__main__':
    # Command line: metadata file in, output file, taxonomy directory, verbosity.
    parser = argparse.ArgumentParser(
        description="Append taxonomy to the patric metadata file. This adds it at column 67")
    parser.add_argument('-f', required=True, help='patric metadata file')
    parser.add_argument('-o', required=True, help='output file')
    parser.add_argument(
        '-t',
        default='/home2/db/taxonomy/current/',
        help='taxonomy directory (default=/home2/db/taxonomy/current/)')
    parser.add_argument('-v', action="store_true", help='verbose output')
    args = parser.parse_args()

    # Load the taxonomy tables from the chosen directory.
    sys.stderr.write("Reading taxonomy\n")
    taxa = taxon.read_nodes(directory=args.t)
    names, blastname = taxon.read_names(directory=args.t)
    divs = taxon.read_divisions(directory=args.t)
    sys.stderr.write("Read taxonomy\n")

    # Rank names appended, in this order, to the header line.
    want = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

    with open(args.o, 'w', encoding='utf-8') as out, \
            open(args.f, 'r', encoding='utf-8') as src:
        for line in src:
            fields = line.strip().split("\t")
            # Pad short rows with empty strings so that every row carries
            # at least 69 fields.
            shortfall = 69 - len(fields)
            if shortfall > 0:
                fields.extend([""] * shortfall)

            # A line beginning with "genome_id" is treated as the header: it
            # is copied through (stripped, unpadded) with the rank names
            # appended as extra tab-separated columns.
            if line.startswith("genome_id"):
                out.write("{}\t{}\n".format(line.strip(), "\t".join(want)))
                continue
Exemplo n.º 3
0
import sys
import argparse
import taxon

__author__ = 'Rob Edwards'

if __name__ == "__main__":
    # The only option is the tab-separated input file.
    parser = argparse.ArgumentParser(
        description='Parse a tsv file and add taxonomy')
    parser.add_argument('-f', required=True, help='tab seperated values')
    args = parser.parse_args()

    # Ranks to collect; the position in this list is the position of the
    # rank's name in the per-line result.
    want = ['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

    sys.stderr.write("Reading databases\n")
    taxa = taxon.read_nodes()
    names, blastname = taxon.read_names()
    sys.stderr.write("Done\n")

    with open(args.f, 'r') as tsv:
        for line in tsv:
            stripped = line.strip()

            # Comment/header lines are echoed with the rank names appended.
            if line.startswith("#"):
                print("{}\t".format(stripped) + "\t".join(want))
                continue

            columns = stripped.split("\t")
            lineage = [""] * len(want)  # one slot per wanted rank
            taxid = columns[2]  # the third column is used as the taxonomy ID

            if taxid in taxa:
                # Climb parent links, noting the name of every node whose rank
                # is wanted. The climb ends as soon as the current node's
                # parent is '1' (or the node itself is '1'), so that last node
                # is not recorded.
                node = taxa[taxid]
                while node.parent != '1' and taxid != '1':
                    if node.rank in want:
                        lineage[want.index(node.rank)] = names[taxid].name
                    taxid = node.parent
                    node = taxa[taxid]
Exemplo n.º 4
0
__author__ = 'Rob Edwards'

if __name__ == "__main__":
    # Single required option: the tab-separated file to annotate.
    parser = argparse.ArgumentParser(description='Parse a tsv file and add taxonomy')
    parser.add_argument('-f', required=True, help='tab seperated values')
    args = parser.parse_args()

    # Wanted ranks, in output order.
    want = ['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    # rank name -> its position in want
    column_of = {rank: idx for idx, rank in enumerate(want)}

    sys.stderr.write("Reading databases\n")
    taxa = taxon.read_nodes()
    names, blastname = taxon.read_names()
    sys.stderr.write("Done\n")

    with open(args.f, 'r') as handle:
        for row in handle:
            if row.startswith("#"):
                # Lines starting with '#' are printed back out followed by
                # the wanted rank names.
                prefix = "{}\t".format(row.strip())
                print(prefix + "\t".join(want))
                continue

            cells = row.strip().split("\t")
            found = [""] * len(want)  # name found for each wanted rank, "" if none
            current = cells[2]  # taxonomy ID is read from the third column

            if current in taxa:
                # Follow parent links upwards. Stop before handling a node
                # whose parent is '1', or the node '1' itself.
                while True:
                    node = taxa[current]
                    if node.parent == '1' or current == '1':
                        break
                    slot = column_of.get(node.rank)
                    if slot is not None:
                        found[slot] = names[current].name
                    current = node.parent
Exemplo n.º 5
0
import taxon
import re

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Read a blast file and create a tuple of [query / kingdom / phylum / genus / species]")
    parser.add_argument('-f', help='blast file(s). Note this must have taxids as column 14. You may specify more than one file', nargs='+')
    parser.add_argument('-t', help='taxonomy directory (default=/home2/db/taxonomy/current/)', default='/home2/db/taxonomy/current/')
    parser.add_argument('-v', help='verbose output', action="store_true")
    args = parser.parse_args()

    want = ['superkingdom', 'phylum', 'genus', 'species']

    if args.v:
        sys.stderr.write("Reading taxonomy\n")
    taxa=taxon.read_nodes(directory=args.t)
    names,blastname = taxon.read_names(directory=args.t)
    if args.v:
        sys.stderr.write("Read taxonomy\n")

    for blastf in args.f:
        if args.v:
            sys.stderr.write("Reading {}\n".format(blastf))
        with open(blastf, 'r') as fin:
            for l in fin:
                p=l.strip().split("\t")

                for tid in p[14].split(";"):
                    level = {}
                    results = [p[0], tid]

                    while tid != '0' and tid != '1' and tid in taxa and taxa[tid].parent != '1':