Exemplo n.º 1
0
 def test_getRankedDescendants(self):
     """NcbiTaxonNode getRankedDescendants should return correct list"""
     # Node fixture, one record per line with '\t|\t'-separated fields.
     # The assertions below rely on the first three fields being
     # taxon id, parent id and rank (e.g. 555 has parent 3 and rank
     # 'superclass'). Taxon 3 lists itself as its parent.
     nested_species = '''3\t|\t3\t|\tsuperkingdom\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|
     11\t|\t3\t|\tkingdom\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|
     22\t|\t11\t|\tclass\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|
     44\t|\t22\t|\torder\t|\t\t|\t0\t|\t0\t|\t11\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|
     66\t|\t22\t|\torder\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t|\t
     77\t|\t66\t|\tfamily\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|
     99\t|\t66\t|\tfamily\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|
     88\t|\t44\t|\tfamily\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|
     101\t|\t77\t|\tgenus\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|
     202\t|\t77\t|\tgenus\t|\t\t|\t0\t|\t0\t|\t11\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|
     606\t|\t99\t|\tgenus\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t|\t
     707\t|\t88\t|\tgenus\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|
     909\t|\t88\t|\tgenus\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|
     123\t|\t909\t|\tgroup\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|
     1111\t|\t123\t|\tspecies\t|\tAT\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|
     2222\t|\t707\t|\tspecies\t|\tTT\t|\t0\t|\t0\t|\t11\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|
     6666\t|\t606\t|\tspecies\t|\tGG\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t|\t
     7777\t|\t606\t|\tspecies\t|\tAC\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|
     9999\t|\t202\t|\tspecies\t|\tBA\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|
     1010\t|\t101\t|\tspecies\t|\tAC\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|
     5555\t|\t555\t|\tspecies\t|\tAC\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|
     555\t|\t3\t|\tsuperclass\t|\t\t|\t8\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|'''.split(
         '\n')
     # Name fixture: '<taxon id>|<name>||scientific name|'. Each taxon gets
     # a single-letter name so nodes can also be looked up by name below.
     nested_names = [
         '3|a||scientific name|',
         '11|b||scientific name|',
         '555|c||scientific name|',
         '22|d||scientific name|',
         '44|e||scientific name|',
         '66|f||scientific name|',
         '88|g||scientific name|',
         '77|h||scientific name|',
         '99|i||scientific name|',
         '707|j||scientific name|',
         '909|k||scientific name|',
         '101|l||scientific name|',
         '202|m||scientific name|',
         '606|n||scientific name|',
         '2222|o||scientific name|',
         '123|p||scientific name|',
         '1111|q||scientific name|',
         '1010|r||scientific name|',
         '9999|s||scientific name|',
         '7777|t||scientific name|',
         '6666|u||scientific name|',
         '5555|z||scientific name|',
     ]
     tx = NcbiTaxonomyFromFiles(nested_species, nested_names)
     # 555 is the only 'superclass' record in the fixture; its parent is 3.
     dec = tx[3].getRankedDescendants('superclass')
     self.assertEqual(len(dec), 1)
     assert dec[0] is tx[555]
     # 'f' is the name given to taxon 66. The species reachable from it are
     # 1010 (via 77 -> 101), 9999 (via 77 -> 202) and 7777/6666 (via
     # 99 -> 606); order is not checked.
     sp = tx['f'].getRankedDescendants('species')
     self.assertSameItems(sp, [tx[1010], tx[9999], tx[7777], tx[6666]])
     # The superclass node hangs off 3, not 11, so nothing is found under 11.
     empty = tx[11].getRankedDescendants('superclass')
     self.assertEqual(empty, [])
     # 123 ('group') sits several levels down: 3 -> 11 -> 22 -> 44 -> 88
     # -> 909 -> 123.
     gr = tx[3].getRankedDescendants('group')
     self.assertEqual(gr, [tx[123]])
     # Lookup by id and lookup by name must return the very same node object.
     assert tx[3] is tx['a']
Exemplo n.º 2
0
 def test_init_bad(self):
     """NcbiTaxonomyFromFiles should produce deadbeats by default"""
     # Without strict=True, construction from bad_nodes is expected to
     # succeed and record the problem nodes in .Deadbeats.
     taxonomy = NcbiTaxonomyFromFiles(bad_nodes, good_names)
     deadbeats = taxonomy.Deadbeats
     self.assertEqual(len(deadbeats), 2)
     for orphan_key in (777, 666):
         assert orphan_key in deadbeats
     # The entry stored under 777 must compare equal to taxon 9.
     assert deadbeats[777] == taxonomy[9]
def main():
    """Run the whole conversion and append a completion line to the log.

    Works on the module-level ``args`` object, which it both reads and
    updates:

    * input and output paths are filled in from ``input_files()`` and
      ``output_files()``;
    * ``args.output_ranks`` is turned from a string into a list, treating
      both ``_`` and ``,`` as separators.

    It then loads the NCBI taxonomy from the nodes/names dump files, fetches
    the merged and deleted taxids, maps the accessions from the input file
    to taxonomy nodes, builds the per-taxid taxonomy for the requested
    ranks, writes the output files and finally appends a timestamped
    "finished" line to the log file.
    """
    (args.infile_nodesdmp_path, args.infile_namesdmp_path,
     args.infile_mergeddmp_path, args.infile_delnodesdmp_path) = input_files()

    args.outfile_path, args.logfile_path = output_files()

    # Accept either '_' or ',' between rank names.
    args.output_ranks = args.output_ranks.replace('_', ',').split(',')

    accessions_in_input_file = get_accessions_from_input_file()

    # Close both dump files as soon as the taxonomy has been built instead
    # of leaving the handles open for the rest of the run.
    with open(args.infile_nodesdmp_path) as nodes_file:
        with open(args.infile_namesdmp_path) as names_file:
            ncbi_full_taxonomy = NcbiTaxonomyFromFiles(nodes_file, names_file)

    merged_taxids = get_merged_nodes()

    deleted_taxids = get_deleted_nodes()

    included_nodes, taxid, missing_accessions = obtain_nodes_for_each_accession(
        accessions_in_input_file, args.infile_acc2taxid_path,
        ncbi_full_taxonomy, merged_taxids, deleted_taxids)

    taxid_taxonomy, missing_taxonomy = generate_taxonid_taxonomy(
        included_nodes, ncbi_full_taxonomy, args.output_ranks)

    generate_output_files(args.outfile_path, taxid_taxonomy, taxid,
                          missing_accessions, missing_taxonomy)

    # 'a' mode: the log is appended to, never truncated. The context manager
    # guarantees the handle is closed even if the write fails.
    with open(args.logfile_path, 'a') as log_file:
        log_file.write('Finished running entrez_qiime.py at ' +
                       strftime("%H:%M:%S on %d-%m-%Y", localtime()) + '\n')
Exemplo n.º 4
0
def get_tax_local(taxid_dict, ranks):
    """
    Fetch complete lineages from the local NCBI dump files (BETTER for huge
    queries).

    taxid_dict: maps each GI to the taxon id used to look it up in
        ``tree.ById``.
    ranks: sequence of rank names; it fixes the length and order of every
        returned lineage.

    Return a dict with:
    k: GI
    v: complete lineage, a list with one taxon name per entry of ``ranks``
       (None where no ancestor of that rank was found)

    GIs whose taxon id is not present in the taxonomy are reported on stdout
    and left out of the returned dict.
    """
    from cogent.parse.ncbi_taxonomy import NcbiTaxonomyFromFiles

    # Close the dump files once the taxonomy has been built.
    with open(DB_PATH + 'nodes.dmp') as nodes_file:
        with open(DB_PATH + 'names.dmp') as names_file:
            tree = NcbiTaxonomyFromFiles(nodes_file, names_file)

    # rank name -> position in the output lineage; built once, not per GI.
    ranks_lookup = dict((rank, idx) for idx, rank in enumerate(ranks))
    n_ranks = len(ranks)

    def get_lineage(node):
        # Walk from the node up towards the root. A node without a parent
        # is never recorded, so the root itself does not contribute a name.
        lineage = [None] * n_ranks
        curr = node
        while curr.Parent is not None:
            if curr.Rank in ranks_lookup:
                lineage[ranks_lookup[curr.Rank]] = curr.Name
            curr = curr.Parent
        return lineage

    tax_dict = {}
    for gi in taxid_dict:
        # Get lineage for each (gi i.e taxid)
        # NOTE(review): the lookup uses the taxid exactly as stored in
        # taxid_dict; if ById is keyed by int and the values here are
        # strings, every lookup will miss -- confirm against the caller.
        try:
            node = tree.ById[taxid_dict[gi]]
        except KeyError:
            # Single-argument call form prints the same text on Python 2
            # and 3; str() keeps non-string GIs from raising here.
            print("Cannot Fetch taxonomy for GI: " + str(gi))
        else:
            tax_dict[gi] = get_lineage(node)

    print("DONE: Complete taxonomy retrived for each GI")
    return tax_dict
Exemplo n.º 5
0
def load_ncbi_tree(ncbi_nodes_file, ncbi_names_file):
    """Load the NCBI taxonomy tree from its nodes and names dump files.

    ncbi_nodes_file: path of the nodes dump file.
    ncbi_names_file: path of the names dump file.

    Returns the object built by NcbiTaxonomyFromFiles. If either file cannot
    be opened or read (IOError), the process exits via sys.exit with an
    error message naming both paths.
    """
    try:
        # Context managers close both files as soon as the tree is built,
        # including when construction fails part-way.
        with open(ncbi_nodes_file, "rt") as nodes_handle:
            with open(ncbi_names_file, "rt") as names_handle:
                ncbi_tree = NcbiTaxonomyFromFiles(nodes_handle, names_handle)
    except IOError:
        sys.exit("Error cannot open {0} or {1}".format(ncbi_nodes_file,
                                                       ncbi_names_file))
    return ncbi_tree
Exemplo n.º 6
0
 def test_init_strict(self):
     """NcbiTaxonomyFromFiles should fail if strict and deadbeats exist"""
     # With the good fixture, strict construction must simply not raise;
     # the resulting taxonomy itself is not inspected.
     NcbiTaxonomyFromFiles(good_nodes, good_names, strict=True)
     # With the bad fixture, the same strict call must raise.
     self.assertRaises(
         MissingParentError,
         NcbiTaxonomyFromFiles, bad_nodes, good_names, strict=True)
Exemplo n.º 7
0
 def setUp(self):
     """Build a taxonomy from good_nodes/good_names and keep it on self.tx."""
     taxonomy = NcbiTaxonomyFromFiles(good_nodes, good_names)
     self.tx = taxonomy
Exemplo n.º 8
0
#!/usr/bin/python2
import sys
import re

## RUN FROM INFO115

from cogent.parse.ncbi_taxonomy import NcbiTaxonomyFromFiles
# Build the taxonomy once at start-up. The dump files are only needed while
# the tree is being built, so close them right afterwards.
with open('/2/scratch/brianne/taxonomy/nodes.dmp') as nodes_file:
    with open('/2/scratch/brianne/taxonomy/names.dmp') as names_file:
        tree = NcbiTaxonomyFromFiles(nodes_file, names_file)
root = tree.Root

# Here is an example of how to get the NCBI taxon IDs for all Eukaryota phyla
# euks = root.getNodeMatchingName('Eukaryota')
# for node in euks.getRankedDescendants('phylum'):
#     print node.Name, node.TaxonId
#
# Chlorophyta 3041
# Streptophyta 35493
# Chytridiomycota 4761
# Microsporidia 6029
# Glomeromycota 214504
# Neocallimastigomycota 451455
# Blastocladiomycota 451459
# Ascomycota 4890
#
# etc

# to get the full lineage for defined ranks based on an input Taxon id
# Rank names, in the order their values should appear in a lineage.
ranks = [
    'superkingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]
                                NCBI_DB + '\" | tee -a blast_taxid.txt',
                                shell=True)
# New code ends

    # Build a GI -> taxid map from blast_taxid.txt. Each line is split on
    # whitespace; the first field is the GI and the second the taxid, both
    # kept as strings. A later duplicate GI overwrites the earlier entry.
    map_dict = {}
    with open('blast_taxid.txt') as gi_taxid:
        for each in gi_taxid:
            each = each.split()
            gi = each[0]
            taxid = each[1]
            map_dict[gi] = taxid

    print 'reading in files for ' + SAMPLE_ID

    # Load the taxonomy from nodes.dmp / names.dmp in the current working
    # directory (an earlier absolute-path variant is kept commented out).
    # NOTE(review): the two handles opened here are not closed in the
    # visible code.
    #	tree = NcbiTaxonomyFromFiles(open('/home/roli/scripts/Python/GI_annotations/nodes.dmp'), open('/home/roli/scripts/Python/GI_annotations/names.dmp'))
    tree = NcbiTaxonomyFromFiles(open('nodes.dmp'), open('names.dmp'))
    root = tree.Root

    print 'generating lineages for ' + SAMPLE_ID

    # Output file for entries without a lineage (per its file name); it
    # starts with the header2 line.
    error = open(SAMPLE_ID + '.no.lineage.found.out', 'w')
    error.write(header2 + "\n")

    # One output file per requested level, each starting with the header
    # line.
    # NOTE(review): re.search("1|superkingdom", ...) matches any TAX_LEVEL
    # that merely contains the character "1" (e.g. "12"); same for "2"
    # below -- confirm the accepted TAX_LEVEL values make this safe.
    if re.search("1|superkingdom", TAX_LEVEL):
        output_kingdom = open(SAMPLE_ID + '.superkingdom.blast_lineage.out',
                              'w')
        output_kingdom.write(header + "\n")

    if re.search("2|phylum", TAX_LEVEL):
        output_phylum = open(SAMPLE_ID + '.phylum.blast_lineage.out', 'w')
        output_phylum.write(header + "\n")
	ranks= [a]

## Define function for pulling lineage info from NCBI nodes and names files
def get_lineage(node, my_ranks):
    """Return the names found at the requested ranks above ``node``.

    Starting at ``node`` and following ``.Parent`` links, every visited node
    whose ``.Rank`` appears in ``my_ranks`` has its ``.Name`` stored at that
    rank's position. The walk stops at the node whose ``.Parent`` is None,
    and that last node is not recorded.

    Returns a list as long as ``my_ranks``; positions for which no node was
    found stay None.
    """
    position_of = dict((rank, pos) for pos, rank in enumerate(my_ranks))
    names = [None] * len(my_ranks)

    current = node
    while current.Parent is not None:
        slot = position_of.get(current.Rank)
        if slot is not None:
            names[slot] = current.Name
        current = current.Parent
    return names

## IMPORT DIR NAMES into DICTIONARY

# The dump files are only needed while the taxonomy is being built, so
# close them right afterwards instead of leaking the handles.
with open('/home/roli/db/nodes.dmp') as nodes_file:
    with open('/home/roli/db/names.dmp') as names_file:
        tree = NcbiTaxonomyFromFiles(nodes_file, names_file)
root = tree.Root

output = open("LINEAGES.tsv", "w")

with open(FILE_LIST) as f:
        for line in f:
		line = line.strip("\r\n")
		line = line.split("\t")

		NAME = line[0]
		TaxID = int(line[1])

		node = tree.ById[TaxID]
		tax = get_lineage(node, ranks)
		tax = str(tax[0]).lower()