def test_get_topology(self):
    """Check the leaf names of the topologies built for two taxid sets."""
    taxa_db = NCBITaxa(dbfile=DATABASE_PATH)
    first_tree = taxa_db.get_topology([9606, 7507, 9604])
    second_tree = taxa_db.get_topology([9606, 7507, 678])

    # (tree, expected sorted leaf names); 9604 is requested for the first
    # tree but is not expected to show up among its leaves.
    expectations = (
        (first_tree, ["7507", "9606"]),
        (second_tree, ["678", "7507", "9606"]),
    )
    for tree, expected_leaves in expectations:
        self.assertEqual(sorted(tree.get_leaf_names()), expected_leaves)
def get_desired_ranks(taxid, desired_ranks):
    """Return, for each requested rank, the lineage taxid holding that rank.

    :param taxid: taxid whose NCBI lineage is inspected.
    :param desired_ranks: iterable of rank names to look up.
    :returns: list aligned with ``desired_ranks``; the string ``'0'`` marks
        a rank that does not occur in the lineage.
    """
    taxa_db = NCBITaxa()
    lineage_names = taxa_db.get_taxid_translator(taxa_db.get_lineage(taxid))
    rank_by_taxid = taxa_db.get_rank(lineage_names)

    # Invert taxid -> rank; when a rank occurs more than once the last
    # entry seen wins.
    taxid_by_rank = {}
    for lineage_taxid, rank_name in rank_by_taxid.items():
        taxid_by_rank[rank_name] = lineage_taxid

    return [taxid_by_rank.get(wanted, '0') for wanted in desired_ranks]
def annotate_ncbi_taxa(self, taxid_attr='species', tax2name=None,
                       tax2track=None, tax2rank=None, dbfile=None):
    """Add NCBI taxonomy annotation to all descendant nodes.

    Leaf nodes are expected to carry a feature (selected by ``taxid_attr``,
    ``'species'`` by default) encoding a valid taxid number.

    All descendant nodes (including internal nodes) are annotated with the
    following new features:

    `Node.spname`: scientific species name as encoded in the NCBI taxonomy
    database

    `Node.named_lineage`: the NCBI lineage track using scientific names

    `Node.taxid`: NCBI taxid number

    `Node.lineage`: same as named_lineage but using taxid codes.

    Note that for internal nodes, NCBI information will refer to the first
    common lineage of the grouped species.

    :param name taxid_attr: the name of the feature that should be used to
      access the taxid number associated to each node.

    :param None tax2name: A dictionary where keys are taxid numbers and
      values are their translation into NCBI scientific name. Optional; it
      avoids database queries when annotating many trees containing the
      same set of taxids.

    :param None tax2track: A dictionary where keys are taxid numbers and
      values are their translation into NCBI lineage tracks (taxids).
      Optional, same purpose as ``tax2name``.

    :param None tax2rank: A dictionary where keys are taxid numbers and
      values are their translation into NCBI rank name. Optional, same
      purpose as ``tax2name``.

    :param None dbfile: If provided, this file is used as the local copy of
      the NCBI taxonomy database.

    :returns: tax2name (a dictionary translating taxid numbers into
      scientific name), tax2lineage (a dictionary translating taxid numbers
      into their corresponding NCBI lineage track) and tax2rank (a
      dictionary translating taxid numbers into rank names).
    """
    # All the work is delegated to NCBITaxa.annotate_tree on this node.
    return NCBITaxa(dbfile=dbfile).annotate_tree(
        self,
        taxid_attr=taxid_attr,
        tax2name=tax2name,
        tax2track=tax2track,
        tax2rank=tax2rank,
    )
def get_family(taxid):
    """Return the lineage taxid ranked 'family' for ``taxid``.

    :param taxid: taxid whose NCBI lineage is searched.
    :returns: the matching lineage taxid, or the string ``"None"`` when the
        lineage contains no 'family' rank.
    """
    taxa_db = NCBITaxa()
    rank_by_taxid = taxa_db.get_rank(taxa_db.get_lineage(taxid))
    for lineage_taxid, rank_name in rank_by_taxid.items():
        if rank_name == 'family':
            return lineage_taxid
    return "None"
def get_rank(self, rank_level):
    """Return the taxid in this object's lineage that has rank ``rank_level``.

    The lineage is looked up from ``self.tax_id``.

    :param rank_level: NCBI rank name to search for.
    :returns: the matching lineage taxid, or the string ``"N/A"`` when no
        node of the lineage has that rank.
    """
    taxa_db = NCBITaxa()
    rank_by_taxid = taxa_db.get_rank(taxa_db.get_lineage(self.tax_id))
    for lineage_taxid, rank_name in rank_by_taxid.items():
        if rank_name == rank_level:
            return lineage_taxid
    return "N/A"
def get_rank(taxid, rank_level):
    """Return the taxid in the lineage of ``taxid`` that has rank ``rank_level``.

    :param taxid: taxid whose NCBI lineage is searched.
    :param rank_level: NCBI rank name to search for.
    :returns: the first matching lineage taxid (in dictionary iteration
        order), or the string ``"None"`` when nothing matches.
    """
    taxa_db = NCBITaxa()
    rank_by_taxid = taxa_db.get_rank(taxa_db.get_lineage(taxid))
    matching = (
        lineage_taxid
        for lineage_taxid in rank_by_taxid
        if rank_by_taxid[lineage_taxid] == rank_level
    )
    return next(matching, "None")
def test_ncbiquery(self):
    """Check taxid -> name and name -> taxid translation for two species."""
    taxa_db = NCBITaxa(dbfile=DATABASE_PATH)

    # Taxids are passed as strings, results are keyed by int.
    names_by_id = taxa_db.get_taxid_translator(["9606", "7507"])
    for taxid, expected_name in ((7507, "Mantis religiosa"),
                                 (9606, "H**o sapiens")):
        self.assertEqual(names_by_id[taxid], expected_name)

    # The second query name is given in lower case.
    ids_by_name = taxa_db.get_name_translator(
        ["Mantis religiosa", "h**o sapiens"])
    for name, expected_taxid in (("Mantis religiosa", 7507),
                                 ("h**o sapiens", 9606)):
        self.assertEqual(ids_by_name[name], expected_taxid)
def taxo_msa(outfile='taxo_msa.svg', taxids=[], annotation='', msa=[], title='', width=2000):
    """
    Visualize MSA together with a taxonomy tree.

    outfile - path the tree image is rendered to
    taxids - list of taxids in the same order as seqs in msa
    annotation - string drawn on the extra 'annotation' row; when empty a
        row of spaces as long as the first msa sequence is drawn instead
    msa - indexable collection of records exposing .seq and .id
    title - text placed in the tree title
    width - render width passed to Tree.render
    """
    taxa_db = NCBITaxa()
    taxids = [int(tid) for tid in taxids]
    tree = taxa_db.get_topology(taxids, intermediate_nodes=False)

    # Extra pseudo-leaf that carries the annotation row.
    annotation_leaf = tree.add_child(name='annotation')
    annotation_leaf.add_feature('sci_name', 'annotation')
    tree.sort_descendants(attr='sci_name')
    style = TreeStyle()

    def layout(node):
        # Label only the major ranks on top of their branch.
        if getattr(node, "rank", None) and node.rank in ['order', 'class', 'phylum', 'kingdom']:
            node.add_face(AttrFace("sci_name", fsize=7, fgcolor="indianred"),
                          column=0, position="branch-top")

        if not node.is_leaf():
            return

        node.add_face(AttrFace("sci_name", fsize=9, fgcolor="steelblue"),
                      column=0, position="branch-right")

        if node.name == 'annotation':
            record = None
            row = annotation if annotation else ' ' * len(msa[0].seq)
        else:
            # Leaf names are taxids; map them back to the matching msa row.
            record = msa[taxids.index(int(node.name))]
            row = str(record.seq)

        row_face = SeqMotifFace(row, [[0, len(row), "seq", 10, 10, None, None, None]],
                                scale_factor=1)
        add_face_to_node(row_face, node, 0, position="aligned")

        if record is None:
            label = ' ' + 'SEQ_ID'
        else:
            label = ' ' + record.id
        add_face_to_node(TextFace(label), node, column=1, position="aligned")

    style.layout_fn = layout
    style.show_leaf_name = False
    style.title.add_face(TextFace(title, fsize=20), column=0)
    tree.render(outfile, w=width, dpi=300, tree_style=style)
def getNcbiTaxonomy():
    """Write the named NCBI lineage of every organism to OUTPUT_FILE.

    Names come from ORGANISM_NAMES_LIST. One tab-separated line is written
    per taxid matched by each name, listing the scientific names along that
    taxid's lineage. A name missing from the translation raises KeyError.
    """
    taxa_db = NCBITaxa()
    taxids_by_name = taxa_db.get_name_translator(ORGANISM_NAMES_LIST)

    with open(OUTPUT_FILE, "w") as lineage_file:
        for organism in ORGANISM_NAMES_LIST:
            for matched_taxid in taxids_by_name[organism]:
                lineage = taxa_db.get_lineage(str(matched_taxid))
                name_of = taxa_db.get_taxid_translator(lineage)
                row = [name_of[node_taxid] for node_taxid in lineage]
                lineage_file.write("\t".join(row) + "\n")
def annotate_ncbi_taxa(self, taxid_attr="species", tax2name=None,
                       tax2track=None, tax2rank=None, dbfile=None):
    """Add NCBI taxonomy annotation to all descendant nodes.

    Leaf nodes are expected to carry a feature (selected by ``taxid_attr``,
    ``"species"`` by default) encoding a valid taxid number.

    All descendant nodes (including internal nodes) are annotated with the
    following new features:

    `Node.spname`: scientific species name as encoded in the NCBI taxonomy
    database

    `Node.named_lineage`: the NCBI lineage track using scientific names

    `Node.taxid`: NCBI taxid number

    `Node.lineage`: same as named_lineage but using taxid codes.

    Note that for internal nodes, NCBI information will refer to the first
    common lineage of the grouped species.

    :param name taxid_attr: the name of the feature that should be used to
      access the taxid number associated to each node.

    :param None tax2name: A dictionary where keys are taxid numbers and
      values are their translation into NCBI scientific name. Optional; it
      avoids database queries when annotating many trees containing the
      same set of taxids.

    :param None tax2track: A dictionary where keys are taxid numbers and
      values are their translation into NCBI lineage tracks (taxids).
      Optional, same purpose as ``tax2name``.

    :param None tax2rank: A dictionary where keys are taxid numbers and
      values are their translation into NCBI rank name. Optional, same
      purpose as ``tax2name``.

    :param None dbfile: If provided, this file is used as the local copy of
      the NCBI taxonomy database.

    :returns: tax2name (a dictionary translating taxid numbers into
      scientific name), tax2lineage (a dictionary translating taxid numbers
      into their corresponding NCBI lineage track) and tax2rank (a
      dictionary translating taxid numbers into rank names).
    """
    # Optional pre-computed lookups, forwarded untouched.
    lookup_caches = dict(tax2name=tax2name, tax2track=tax2track, tax2rank=tax2rank)
    taxa_db = NCBITaxa(dbfile=dbfile)
    return taxa_db.annotate_tree(self, taxid_attr=taxid_attr, **lookup_caches)
def ncbi_compare(self, autodetect_duplications=True, cached_content=None):
    """Run NCBITaxa.get_broken_branches on this tree or its speciation trees.

    :param autodetect_duplications: forwarded to ``get_speciation_trees``.
    :param cached_content: optional node -> content cache; computed with
        ``get_cached_content()`` when missing or empty.
    """
    if not cached_content:
        cached_content = self.get_cached_content()

    own_content = cached_content[self]
    cached_species = set(node.species for node in own_content)

    if len(cached_species) == len(own_content):
        # Every node under this tree has a distinct species.
        target_trees = [self]
    else:
        # Some species repeat: split into speciation trees first.
        print(cached_species)
        _ntrees, _ndups, target_trees = self.get_speciation_trees(
            autodetect_duplications=autodetect_duplications,
            map_features=["taxid"])

    taxa_db = NCBITaxa()
    for tree in target_trees:
        taxa_db.get_broken_branches(tree, cached_content)
def ncbi_compare(self, autodetect_duplications=True, cached_content=None):
    """Check this tree (or its speciation trees) against the NCBI taxonomy.

    The check itself is ``NCBITaxa.get_broken_branches``, called once per
    target tree.

    :param autodetect_duplications: forwarded to ``get_speciation_trees``.
    :param cached_content: optional node -> content cache; computed with
        ``get_cached_content()`` when missing or empty.
    """
    if not cached_content:
        cached_content = self.get_cached_content()

    members = cached_content[self]
    cached_species = set(member.species for member in members)
    has_repeated_species = len(cached_species) != len(members)

    target_trees = [self]
    if has_repeated_species:
        print(cached_species)
        # Only the third element of the result (the trees) is used.
        _tree_count, _dup_count, target_trees = self.get_speciation_trees(
            autodetect_duplications=autodetect_duplications,
            map_features=["taxid"],
        )

    checker = NCBITaxa()
    for candidate in target_trees:
        checker.get_broken_branches(candidate, cached_content)
def test_ncbiquery(self):
    """Check name/taxid translation and descendant lookups."""
    taxa_db = NCBITaxa(dbfile=DATABASE_PATH)

    names_by_id = taxa_db.get_taxid_translator(['9606', '7507'])
    self.assertEqual(names_by_id[7507], 'Mantis religiosa')
    self.assertEqual(names_by_id[9606], 'H**o sapiens')

    # Name translation yields a list of taxids per name.
    ids_by_name = taxa_db.get_name_translator(['Mantis religiosa', 'h**o sapiens'])
    self.assertEqual(ids_by_name['Mantis religiosa'], [7507])
    self.assertEqual(ids_by_name['h**o sapiens'], [9606])

    # 'Bacteria' is expected to map to two taxids.
    ids_by_name = taxa_db.get_name_translator(['Bacteria'])
    self.assertEqual(set(ids_by_name['Bacteria']), set([2, 629395]))

    # Descendants of taxid 9605 under different options.
    descendant_cases = (
        (dict(intermediate_nodes=True), set([1425170, 741158, 63221, 9606])),
        (dict(intermediate_nodes=False), set([1425170, 741158, 63221])),
        (dict(intermediate_nodes=False, rank_limit="species"), set([9606, 1425170])),
    )
    for options, expected in descendant_cases:
        found = taxa_db.get_descendant_taxa("9605", **options)
        self.assertEqual(set(found), expected)
def bio_tree(names):
    """Return an ASCII NCBI taxonomy tree for the given organism names.

    Each name is searched in the Entrez 'taxonomy' database; every taxid
    returned by the searches is collected and the minimal NCBI topology
    connecting them is rendered with scientific names.

    :param names: iterable of search terms.
    :returns: the string produced by ``get_ascii(attributes=['sci_name'])``.
    """
    from Bio import Entrez
    from ete2 import NCBITaxa

    Entrez.email = '*****@*****.**'
    ncbi = NCBITaxa()
    ids = []
    for name in names:
        handle = Entrez.esearch(db='taxonomy', term=name)
        try:
            # Parse the XML reply with Biopython rather than scraping
            # '<Id>' lines by hand.
            record = Entrez.read(handle)
        finally:
            # The handle wraps a network connection; release it even when
            # parsing fails.
            handle.close()
        ids.extend(int(found_id) for found_id in record['IdList'])
    scientific_tree = ncbi.get_topology(ids)
    return scientific_tree.get_ascii(attributes=['sci_name'])
def get_desired_ranks(taxid, desired_ranks):
    """
    Look up the ancestors of a taxon at the requested rank levels.

    Args:
        taxid: taxon ID whose NCBI lineage is inspected.
        desired_ranks: iterable of rank names to resolve.

    Returns:
        List aligned with ``desired_ranks`` holding the lineage taxid found
        at each rank; the string '0' is used for ranks absent from the
        lineage.
    """
    taxa_db = NCBITaxa()
    lineage = taxa_db.get_lineage(taxid)
    lineage_names = taxa_db.get_taxid_translator(lineage)
    rank_of = taxa_db.get_rank(lineage_names)

    # rank -> taxid; for a repeated rank the last entry iterated is kept.
    taxid_at = {rank_name: node_taxid for node_taxid, rank_name in rank_of.items()}

    resolved = []
    for wanted_rank in desired_ranks:
        resolved.append(taxid_at.get(wanted_rank, '0'))
    return resolved
def parseVSearchOutputAgainstNCBI(vsearch_out, database, output_file, min_coverage, min_similarity):
    """Resolves vsearch matches in a vsearch output file to NCBI taxonomy.

    Takes in a vsearch output file from usearch__global, keeps the rows that
    pass the thresholds, looks up the taxid of each hit in the ``gi_taxid``
    table of ``database`` and writes the row, extended with a
    ``phylum:class:order:family:genus:species`` string, to ``output_file``.

    :param vsearch_out: An output file from vsearch's usearch__global program.
    :param database: Path to the sqlite database holding the ``gi_taxid``
                        table (columns ``gi`` and ``taxid``).
    :param output_file: Where to write the resulting tab-separated file.
    :param min_coverage: The coverage threshold, compared with column 5 of
                        each row.
    :param min_similarity: The similarity threshold, compared with column 3
                        of each row.
    :raises ValueError: if a hit identifier (column 2) is not an integer.
    """
    min_simm = float(min_similarity)
    min_coverage = float(min_coverage)
    ncbi = NCBITaxa()
    # Bound parameter instead of string interpolation: the identifier comes
    # straight from the input file and must not be spliced into the SQL.
    query = "select taxid from gi_taxid where gi=?"
    default_ranks = ("species", "genus", 'family', 'order', 'class', 'phylum')

    def getTaxFromId(taxId, taxonomy=default_ranks):
        """Return the names at ``taxonomy`` ranks, highest rank first, ':'-joined."""
        lineage = ncbi.get_lineage(int(taxId))
        # One query for the whole lineage instead of two per lineage node.
        lineage_ranks = ncbi.get_rank(lineage)
        lineage_names = ncbi.get_taxid_translator(lineage)
        myTaxonomy = dict((rank, "") for rank in taxonomy)
        for lin in lineage:
            rank = lineage_ranks.get(lin)
            if rank in taxonomy:
                myTaxonomy[rank] = lineage_names.get(lin, "")
        return ":".join(myTaxonomy[x] for x in taxonomy[::-1])

    conn = sqlite3.connect(database)
    try:
        c = conn.cursor()
        with open(output_file, 'w') as out, open(vsearch_out, 'r') as hits:
            for line in hits:
                data = line.split()
                # NOTE(review): a row is kept when EITHER threshold is
                # exceeded; the parameter docs read as if both should hold.
                # Behavior left unchanged - confirm the intended operator.
                if float(data[4]) > min_coverage or float(data[2]) > min_simm:
                    hit = c.execute(query, (int(data[1]),)).fetchone()
                    if hit:
                        taxonomy = getTaxFromId(hit[0])
                        data.append(taxonomy)
                        printVerbose("\t".join(data))
                        out.write("\t".join(data))
                        out.write("\n")
                    else:
                        printErrorMissingID(out, data[1])
    finally:
        conn.close()
def __init__(self, data_path, workbench=None, genomes=None, taxDb=None):
    """Set up the data directory layout and the taxonomy database handle.

    :param data_path: root data directory; a ``metadata`` sub-directory is
        created inside it when missing.
    :param workbench: stored as-is on the instance.
    :param genomes: initial list of genomes; a fresh empty list is created
        per instance when omitted, so instances never share one default list.
    :param taxDb: taxonomy database object to reuse; a new ``NCBITaxa()`` is
        created when this is missing or falsy.
    """
    self.data_path = data_path
    self.workbench = workbench
    self.metadata_path = pjoin(self.data_path, "metadata")
    if not os.path.exists(self.metadata_path):
        os.makedirs(self.metadata_path)
    self.metadata_file = pjoin(self.metadata_path, "metadata.csv")
    self.taxDb = taxDb if taxDb else NCBITaxa()
    # A ``genomes=[]`` default would be one list shared by every instance.
    self.genomes = [] if genomes is None else genomes
def main():
    """Write one tab-separated line per CAZy sequence: header, family, length, taxonomy.

    Inputs and the output path come from the module-level ``parser``. The
    length falls back to 1000 when the family has no entry, and the
    taxonomy to 'unclassified' when a lookup raises KeyError.
    """
    args = parser.parse_args()
    cazy_fp, p2taxid_fp = args.cazy_fp, args.p2taxid_fp
    output_fp, len_fp = args.output_fp, args.len_fp

    taxa_db = NCBITaxa()
    # protein accession -> taxid, kept in memory as a dict
    taxid_by_protein = read_NCBI_prot_to_taxid_gz(p2taxid_fp)
    length_by_family = read_hmm_len_fp(len_fp)
    cazy_records = fasta_iter(cazy_fp)

    with open(output_fp, 'w') as report:
        for header, seq in cazy_records:
            # Header layout: accession|family|...
            header_fields = header.split('|')
            acc = header_fields[0]
            fam = header_fields[1]

            try:
                gene_length = length_by_family[fam]
            except KeyError:
                gene_length = 1000

            try:
                taxon_path = get_taxon_path(taxid_by_protein[acc], taxa_db)
                taxonomy = '.'.join(taxon_path)
            except KeyError:
                taxonomy = 'unclassified'

            row = (header, fam, gene_length, taxonomy)
            report.write('{0}\t{1}\t{2}\t{3}\n'.format(*row))
def blast2summary_dict(db, blastpath):  # (Path, Path) -> list[dict]
    """Reading in a blast output file, lookup all seqids to get taxids with a single blastdbcmd.
    Then, lookup the taxonomy using ETE2 via the taxid, and add that info to the blast info.

    :param db: blast database handed to ``get_taxid``.
    :param blastpath: path of the tab-separated blast output (11 columns,
        no header row).
    :returns: the result of ``dictmap`` over a taxid -> row mapping, each row
        merged with its taxonomy info. Rows are keyed by taxid, so only one
        row per taxid survives.
    """
    fieldnames = [
        'qseqid', 'sseqid', 'pid', 'alnlen', 'gapopen', 'qstart', 'qend',
        'sstart', 'send', 'evalue', 'bitscore'
    ]
    # Read everything up front so the file is closed before any lookups.
    with open(blastpath) as blast_file:
        rows = list(csv.DictReader(blast_file, delimiter='\t', fieldnames=fieldnames))

    # Materialized because seqids is consumed twice below.
    seqids = list(map(get('sseqid'), rows))
    taxids = get_taxid(db, seqids)
    # The GI is the second '|'-separated field of the subject id.
    gis = (s.split('|')[1] for s in seqids)
    matches = dict(
        (taxids[gi], row) for gi, row in zip(gis, rows) if gi in taxids)
    ncbi = NCBITaxa()  # downloads database and creates SQLite database if needed
    return dictmap(lambda tid, row: merge(row, taxonomy(ncbi, tid)), matches)
def check_taxa_db_age(dbLocation):
    """Refresh the NCBI taxonomy database when it is missing or stale.

    :param dbLocation: path of the local taxonomy database file.
    :returns: a log message describing what was done.

    The database is updated when the file's ctime is more than 30 days old
    or when the file cannot be stat'ed. Errors raised by the update itself
    propagate to the caller instead of being swallowed and retried.
    """
    ncbi = NCBITaxa()
    try:
        filetime = datetime.fromtimestamp(path.getctime(dbLocation))
    except OSError:
        # The file doesn't exist (or is unreadable): run the update, which
        # creates it.
        ncbi.update_taxonomy_database()
        return "<> NCBITaxa Database didn't exist, downloaded it <>"

    one_month_ago = datetime.now() - timedelta(days=30)
    if filetime < one_month_ago:
        # File older than 1 month, update it.
        ncbi.update_taxonomy_database()
        return '<> NCBITaxa Database older than 1 month, updating it <>'
    return '<> NCBITaxa Database up to date <>'
def blast2summary_dict(db, blastpath, ete2_db):  # (Path, Path, Path) -> iterator[dict]
    """Reading in a blast output file, lookup all seqids to get taxids with a single blastdbcmd.
    Then, lookup the taxonomy using ETE2 via the taxid, and add that info to the blast info.

    :param db: blast database handed to ``get_taxid``.
    :param blastpath: path of the tab-separated blast output; columns are
        named by the module-level ``blast_columns``.
    :param ete2_db: passed to ``NCBITaxa`` as its database argument.
    :returns: a lazy iterator of dicts, one per blast row whose GI has a
        known taxid, restricted to the keys listed in ``csv_fields``.
    :raises ValueError: if a subject id has no '|'-separated GI field (raised
        while the rows are being matched).
    """
    # Read everything up front so the file is closed before any lookups.
    with open(blastpath) as blast_file:
        rows = list(csv.DictReader(blast_file, delimiter='\t', fieldnames=blast_columns))

    # Materialized because seqids is consumed twice below.
    seqids = list(map(get('sseqid'), rows))
    taxids = get_taxid(db, seqids)

    def get_gi(s):
        fields = s.split('|')
        if len(fields) > 1:
            return fields[1]
        raise ValueError("Seq ID %s is missing GI fields and '|'" % s)

    gis = imap(get_gi, seqids)
    ncbi = NCBITaxa(ete2_db)  # downloads database and creates SQLite database if needed
    # One entry per blast row (not per taxid), so rows sharing a taxid are
    # all kept.
    matches = [
        assoc(row, 'taxid', taxids[gi]) for gi, row in zip(gis, rows)
        if gi in taxids
    ]
    items = [merge(row1, taxonomy(ncbi, row1['taxid'])) for row1 in matches]
    res = imap(partial(keyfilter, csv_fields.__contains__), items)
    return res
#ete2 module ete2_error = ''' ERROR: This program requires the Python module ete2. Please install it and try running the program again. ''' try: from ete2 import NCBITaxa except: print ete2_error sys.exit(0) from rpy2.robjects.packages import importr print 'Loading NCBI taxonomic data...' print ncbi= NCBITaxa() #------------------------------------------------ # Checking/installing/loading ontoCAT (BioconductoR) and PSI-MI #------------------------------------------------ # ontoCAT installed here as variable for later use try: ontoCAT = importr('ontoCAT') except RRuntimeError: print ''' This program requires ontoCAT (BioconductoR package) to run. Currently installing ontoCat to your machine...
def run(args):
    """Query the NCBI taxonomy for the search terms and dump a tree or a table.

    :param args: parsed command-line namespace. Attributes read here:
        ``taxonomy``, ``info``, ``search``, ``fuzzy``, ``full_lineage``,
        ``rank_limit`` and ``collapse_subspecies``.

    ``args.search`` entries that parse as integers are used as taxids; the
    rest are translated as names (optionally with fuzzy matching). With
    ``args.taxonomy`` (the default when neither mode is selected) the
    topology of all taxids is dumped; with ``args.info`` a tab-separated
    table is printed. Exits with status -1 when no search term is given.
    """
    from ete2 import NCBITaxa

    if not args.taxonomy and not args.info:
        args.taxonomy = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    # These two were assigned by key below without ever being created in
    # this function, so the first fuzzy hit raised NameError.
    # NOTE(review): if module-level dicts of these names exist, confirm
    # nothing else relies on them being filled here.
    name2realname = {}
    name2score = {}

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    all_taxids.update([(v, None) for v in name2tax.values()])

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            tax, realname, sim = ncbi.get_fuzzy_name_translation(
                name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if args.taxonomy:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        t = ncbi.get_topology(all_taxids.keys(),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Node names are taxids at this point; keep the raw taxid as a
            # feature before the name is rewritten to "<sci name> - <taxid>".
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=[
            "taxid", "name", "rank", "bgcolor", "sci_name",
            "collapse_subspecies", "named_lineage"
        ])
    elif args.info:
        print('# ' + '\t'.join(
            ["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([
                str(taxid), name,
                ranks.get(taxid, ''), named_lineage, lineage_string
            ]))
import sys import os import urllib2 import gzip import biom import shutil from numpy import random as np_rand from ete2 import NCBITaxa from scripts.loggingwrapper import LoggingWrapper as logger try: from configparser import ConfigParser except ImportError: from ConfigParser import ConfigParser ncbi = NCBITaxa() RANKS = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] MAX_RANK = 'family' _log = None """ Reads a BIOM file and creates map of OTU: lineage, abundance BIOM file format needs to have a taxonomy field in metadata which contains the taxonomy in the format: RANK__SCINAME; LOWERRANK_LOWERSCINAME """ def read_taxonomic_profile(biom_profile, config, no_samples = None): table = biom.load_table(biom_profile) ids = table.ids(axis="observation") samples = table.ids() if no_samples is None: no_samples = len(samples)
#This function is DEPRECATED def check_tax_id_clade(clade_top_tax_id,check_tax_id): """Checks if given tax_id is inside a clade formed by taxa described by its top taxid""" p=pd.read_table(PATH_to_NCBI_nodes_dmp,sep='|',usecols=[0,1],header=None) G=nx.DiGraph() #Load all taxonomy as graph G.add_edges_from(zip(p.ix[:,1],p.ix[:,0])) clade=nx.dfs_tree(G,clade_top_tax_id) if(not clade.nodes()): clade.add_node(clade_top_tax_id) return(clade.has_node(check_tax_id)) ncbi = NCBITaxa() def subsample_taxids(taxids,rank='species'): """ For a given set of taxids leaves only one representative per selected rank Eg. for a set of subspecies - leave only species. """ rank_dict={'superkingdom':0,'kingdom':1,'phylum':2,'class':3,'superorder':4,'order':5,'suborder':6,'infraorder':7,'parvorder':8,'superfamily':9,'family':10,'subfamily':11,'genus':12,'subgenus':13,'species':14,'subspecies':15} tree = ncbi.get_topology(taxids,intermediate_nodes=True) #We have now a phylogenetic tree with all annotations for our taxids. subsampled_taxids=set() #We are iterating through the taxids and for every we are determining only one representative from this group. #These representatives will be the same for the taxids in one group - and hence subsampling will happen. for t in taxids:
import csv from os import listdir from os.path import isfile, join from collections import Counter from collections import defaultdict from ete2 import NCBITaxa d = defaultdict(list) taxids = [] mypath = raw_input("Enter path to csv files (i.e. /home/user/csv/files/): ") allfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] onlyfiles = [s for s in allfiles if '.csv' in s] filecount = len(onlyfiles) level = raw_input("1: phylum ----> 2: class ----> 3: order ----> 4: family ----> 5: genus ----> 6: species\nEnter rank number: ") ncbi = NCBITaxa() DEFAULT_TAXADB = os.path.join(os.environ.get('HOME', '/'), '.etetoolkit', 'taxa.sqlite') DB_VERSION = 2 rank_dict = {"1": "phylum", "2": "class", "3": "order", "4": "family", "5": "genus", "6": "species"} def is_taxadb_up_to_date(dbfile=DEFAULT_TAXADB): """ Check if a valid and up-to-date taxa.sqlite database exists If dbfile is not specified, DEFAULT_TAXADB is assumed :param dbfile: :return: """ db = sqlite3.connect(dbfile) try: r = db.execute('SELECT version FROM stats;')
from ete2 import NCBITaxa
from ete2 import Tree, TreeStyle, AttrFace

ncbi = NCBITaxa()

# One organism name per line. The file is closed as soon as it has been
# read, and the builtin input() is no longer shadowed.
with open("db/example_input", "r") as names_file:
    names = [line.rstrip("\n") for line in names_file]

taxid = ncbi.get_name_translator(names)
tree = ncbi.get_topology(taxid.values())
#print tree.get_ascii(attributes=["sci_name", "rank", "taxid"])


# custom layout: adds "rank" on top of branches, and sci_name as tip names
def my_layout(node):
    if getattr(node, "rank", None):
        rank_face = AttrFace("rank", fsize=7, fgcolor="indianred")
        node.add_face(rank_face, column=0, position="branch-top")
    if node.is_leaf():
        sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
        node.add_face(sciname_face, column=0, position="branch-right")


ts = TreeStyle()
ts.layout_fn = my_layout
ts.show_leaf_name = False
tree.render("tree.pdf", tree_style=ts)
import numpy as np import pandas as pd import cPickle as pickle from ete2 import NCBITaxa from pprint import pprint import os.path from Bio.Seq import Seq from Bio.Alphabet import IUPAC sys.path.append('/Volumes/MDBD/Dropbox/work/MYSOFT/ALIGNMENT_TOOLS/') # Entrez.email = "*****@*****.**" from hist_ss import get_core_lendiff rank_dict={'superkingdom':0,'kingdom':1,'phylum':2,'class':3,'superorder':4,'order':5,'suborder':6,'infraorder':7,'parvorder':8,'superfamily':9,'family':10,'subfamily':11,'genus':12,'subgenus':13,'species':14,'subspecies':15} ncbi = NCBITaxa() def check_hist_length(seq,hist_type,hist_var=None,dev_percent=10): """ This simple check compares the length of sequence provided to a range of curated sequences in histone DB +- dev_percent %. """ if(os.path.isfile('int_data/cur_length.csv')): cur_df=pd.read_csv('int_data/cur_length.csv') else: hist_df=pd.read_csv('inp_data/seqs.csv') #Histone types info fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences #construct df with length cur_df=hist_df[(hist_df['curated']==True)] cur_df['length']=cur_df['gi'].map(lambda x: len(fasta_dict[str(x)].seq)) # print cur_df.groupby(['hist_type','hist_var']).agg([np.max,np.min])
def taxo_seq_architecture(seqreclist=[], outfile='taxo_arch.svg', taxids=[], annotation='', title='', width=2000):
    """
    Visualize sequence architecture together with a taxonomy tree.

    seqreclist - list of sequence records; each record exposes .seq, .id and
        a list of .features (Biopython SeqFeature format).
        Features of type "domain" are drawn as labelled boxes.
        Features of type "motif" are drawn as sequence stretches; where a
        new motif overlaps motifs already placed, only its uncovered
        stretches are added.
    outfile - path the tree image is rendered to.
    taxids - list of taxids in the same order as seqreclist; if not
        provided, they are obtained with get_taxid_from_gbrec from each record.
    annotation - only evaluated for a leaf named 'annotation'; nothing is
        drawn for it.
    title - text placed in the tree title.
    width - render width passed to Tree.render.
    """
    palette = [
        'red', 'green', 'yellow', 'lightblue', 'cyan', 'magenta', 'orange',
        'pink', 'lightgreen'
    ]

    def get_color(label):
        # Palette slot is picked from the label's hash, modulo 9.
        return palette[hash(label) % 9]

    if len(taxids) == 0:
        taxids = [get_taxid_from_gbrec(rec) for rec in seqreclist]
    taxa_db = NCBITaxa()
    taxids = [int(tid) for tid in taxids]
    tree = taxa_db.get_topology(taxids, intermediate_nodes=False)
    tree.sort_descendants(attr='sci_name')
    style = TreeStyle()

    def layout(node):
        # Label only the major ranks on top of their branch.
        if getattr(node, "rank", None) and node.rank in ['order', 'class', 'phylum', 'kingdom']:
            node.add_face(AttrFace("sci_name", fsize=7, fgcolor="indianred"),
                          column=0, position="branch-top")

        if not node.is_leaf():
            return

        node.add_face(AttrFace("sci_name", fsize=9, fgcolor="steelblue"),
                      column=0, position="branch-right")

        if node.name == 'annotation':
            # Annotation is currently disabled: the row text is computed
            # but no face is added for it.
            if annotation:
                s = annotation
            else:
                s = ' ' * max(map(lambda x: len(x.seq), seqreclist))
            return

        # Leaf names are taxids; map them back to the matching record.
        record = seqreclist[taxids.index(int(node.name))]
        seq = str(record.seq)
        motifs = []
        for feat in record.features:
            if feat.type == 'domain':
                motifs.append([
                    feat.location.start, feat.location.end, "[]", None, 10,
                    "blue", get_color(feat.qualifiers['name']),
                    "arial|8|black|%s" % feat.qualifiers['name']
                ])
            elif feat.type == 'motif':
                start = feat.location.start
                end = feat.location.end
                # Previously placed "seq" motifs that intersect this one;
                # earlier motifs stay on top.
                colliding = [
                    placed for placed in motifs
                    if placed[2] == 'seq' and placed[0] < end and placed[1] > start
                ]
                if not colliding:
                    motifs.append([
                        start, end, "seq", 10, 10, "black",
                        get_color(feat.qualifiers['name']), None
                    ])
                    continue

                # Scan the motif position by position and emit every
                # stretch that no colliding motif covers.
                opened = False
                closed = False
                for pos in range(start, end + 1):
                    if not opened:
                        covered = any(pos >= placed[0] and pos < placed[1]
                                      for placed in colliding)
                        if not covered:
                            seg_start = pos
                            opened = True
                    if opened and not closed:
                        # Close the stretch where a colliding motif begins
                        # or at the end of this motif.
                        hits_start = any(pos == placed[0] for placed in colliding)
                        if hits_start or pos == end:
                            seg_end = pos
                            closed = True
                    if opened and closed:
                        motifs.append([
                            seg_start, seg_end, "seq", 10, 10, "black",
                            get_color(feat.qualifiers['name']), None
                        ])
                        opened = False
                        closed = False

        seq_face = SeqMotifFace(seq, motifs, scale_factor=1, seq_format="[]")
        seq_face.overlaping_motif_opacity = 1.0
        add_face_to_node(seq_face, node, 0, position="aligned")
        add_face_to_node(TextFace(' ' + record.id + ' '), node, column=1,
                         position="aligned")

    style.layout_fn = layout
    style.show_leaf_name = False
    style.title.add_face(TextFace(title, fsize=20), column=0)
    tree.render(outfile, w=width, dpi=300, tree_style=style)
def taxo_msa(outfile='taxo_msa.svg', taxids=[], annotation='', msa=[], title='', width=2000):
    """
    Visualize MSA together with a taxonomy tree.

    outfile - path the tree image is rendered to
    taxids - list of taxids in the same order as seqs in msa
    annotation - text for the extra 'annotation' row; when empty, spaces
        matching the length of the first msa sequence are used
    msa - indexable collection of records exposing .seq and .id
    title - text placed in the tree title
    width - render width passed to Tree.render
    """
    db = NCBITaxa()
    taxids = [int(one_id) for one_id in taxids]
    topology = db.get_topology(taxids, intermediate_nodes=False)

    # Pseudo-leaf hosting the annotation row.
    extra = topology.add_child(name='annotation')
    extra.add_feature('sci_name', 'annotation')
    topology.sort_descendants(attr='sci_name')
    tree_style = TreeStyle()

    def full_length_motif(text):
        # A single "seq" motif spanning the whole row.
        return [[0, len(text), "seq", 10, 10, None, None, None]]

    def layout(node):
        rank = getattr(node, "rank", None)
        if rank:
            if node.rank in ['order', 'class', 'phylum', 'kingdom']:
                face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(face, column=0, position="branch-top")

        if not node.is_leaf():
            return

        tip_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
        node.add_face(tip_face, column=0, position="branch-right")

        if not node.name == 'annotation':
            # Leaf names are taxids; map them back to the matching msa row.
            entry = msa[taxids.index(int(node.name))]
            text = str(entry.seq)
            add_face_to_node(
                SeqMotifFace(text, full_length_motif(text), scale_factor=1),
                node, 0, position="aligned")
            add_face_to_node(TextFace(' ' + entry.id), node, column=1,
                             position="aligned")
            return

        if annotation:
            text = annotation
        else:
            text = ' ' * len(msa[0].seq)
        add_face_to_node(
            SeqMotifFace(text, full_length_motif(text), scale_factor=1),
            node, 0, position="aligned")
        add_face_to_node(TextFace(' ' + 'SEQ_ID'), node, column=1,
                         position="aligned")

    tree_style.layout_fn = layout
    tree_style.show_leaf_name = False
    tree_style.title.add_face(TextFace(title, fsize=20), column=0)
    topology.render(outfile, w=width, dpi=300, tree_style=tree_style)
def run(args):
    """Resolve the search terms to NCBI taxids and print the requested report.

    Attributes read from ``args``:

    - ``search``: iterable of query strings. Terms that parse as ``int`` are
      taken as taxids, everything else as a name to translate. The process
      exits with status -1 when it is empty.
    - ``fuzzy``: when truthy, untranslated names are retried through
      ``NCBITaxa.get_fuzzy_name_translation(name, args.fuzzy)``.
    - ``tree`` / ``descendants`` / ``info``: output mode, checked in that
      order. ``tree`` is switched on when none of the three is set.
    - ``collapse_subspecies``, ``rank_limit``, ``full_lineage``: forwarded to
      the ``NCBITaxa`` tree/descendant queries.

    Tree mode hands the annotated tree to ``dump``; the other two modes print
    tab-separated tables to standard output.
    """
    # add lineage profiles/stats
    from ete2 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    # Used as a set of taxids; the values are always None.
    all_taxids = {}
    all_names = set()
    # Filled for fuzzy matches only. They were previously assigned without
    # ever being created, which raised NameError on the first fuzzy hit.
    name2realname = {}
    name2score = {}

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)

    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    for hit in name2tax.values():
        # NOTE(review): depending on the installed ete release a name maps to
        # a single taxid or to a list of taxids -- both are accepted here.
        hits = hit if isinstance(hit, (list, tuple, set)) else [hit]
        for taxid in hits:
            all_taxids[taxid] = None

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" %sim

    # Only complain about names that are still untranslated after the
    # optional fuzzy pass.
    still_missing = not_found_names.difference(name2tax)
    if still_missing:
        log.warn("[%s] could not be translated into taxids!" %','.join(still_missing))

    if args.tree:
        if len(all_taxids) == 1:
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid,
                                         collapse_subspecies=args.collapse_subspecies,
                                         rank_limit=args.rank_limit,
                                         return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Keep the raw taxid before the node name is made human readable.
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])

    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid,
                                                   collapse_subspecies=args.collapse_subspecies,
                                                   rank_limit=args.rank_limit)
            sci_name = translator.get(taxid)
            if sci_name is None:
                # Unknown taxid: fall back to the number itself, as text so
                # that the join below does not fail on an int.
                sci_name = str(taxid)
            print('\t'.join([str(taxid),
                             sci_name,
                             ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))

    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''),
                             named_lineage, lineage_string]))
def taxo_seq_architecture(seqreclist=[],outfile='taxo_arch.svg',taxids=[],annotation='',title='',width=2000):
    """
    Visualize sequence architecture together with a taxonomy tree.

    seqreclist - list of sequence records. Each record should expose ``seq``,
        ``id`` and a list of ``features`` in Biopython SeqFeature format.
        Features of type "domain" are drawn as "[]" boxes labelled with the
        feature's 'name' qualifier.
        Features of type "motif" are drawn as "seq" motifs. Where such a
        feature overlaps "seq" motifs added earlier for the same record, only
        the stretches found by the scan in ``free_segments`` are drawn, so
        the preceding motif always stays on top.
    outfile - path handed to ``t.render``.
    taxids - list of taxids in the same order as the records in seqreclist.
        If not provided, ``get_taxid_from_gbrec`` is called on every record
        (assumes the records are in GenBank format).
    annotation - currently unused: the annotation leaf is disabled.
    title - text added to the tree title.
    width - passed to ``t.render`` as ``w``.
    """
    colorlist = ['red', 'green', 'yellow', 'lightblue', 'cyan', 'magenta',
                 'orange', 'pink', 'lightgreen']

    def get_color(key):
        # NOTE: relies on hash(); for str keys this can differ between
        # interpreter runs when hash randomization is enabled.
        return colorlist[hash(key) % len(colorlist)]

    def free_segments(start, end, blockers):
        """Scan start..end (inclusive) and return the [first, last] runs to draw.

        A run opens at the first position that lies inside no blocker
        (``m[0] <= x < m[1]``) and closes at the next position that equals a
        blocker start, or at ``end``.
        """
        segments = []
        seg_start = None
        for x in range(start, end + 1):
            # check if we can start
            if seg_start is None and not any(m[0] <= x < m[1] for m in blockers):
                seg_start = x
            # check if it is time to end
            if seg_start is not None and (x == end or any(x == m[0] for m in blockers)):
                segments.append([seg_start, x])
                seg_start = None
        return segments

    if len(taxids) == 0:
        taxids = [get_taxid_from_gbrec(rec) for rec in seqreclist]

    ncbi = NCBITaxa()
    # Build a real list: layout() relies on list.index(), which the lazy
    # object returned by map() on Python 3 does not provide.
    taxids = [int(taxid) for taxid in taxids]
    t = ncbi.get_topology(taxids, intermediate_nodes=False)
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()

    def layout(node):
        # Label the major ranks on top of their branch.
        if getattr(node, "rank", None) and node.rank in ['order', 'class', 'phylum', 'kingdom']:
            rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
            node.add_face(rank_face, column=0, position="branch-top")

        if not node.is_leaf():
            return

        sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
        node.add_face(sciname_face, column=0, position="branch-right")

        # We currently disable annotation: no 'annotation' leaf is added to
        # the tree, and one would get no sequence faces.
        if node.name == 'annotation':
            return

        # Leaf names are taxids; look the matching record up once.
        record = seqreclist[taxids.index(int(node.name))]
        seq = str(record.seq)
        motifs = []
        for f in record.features:
            if f.type == 'domain':
                motifs.append([f.location.start, f.location.end, "[]", None, 10, "blue",
                               get_color(f.qualifiers['name']),
                               "arial|8|black|%s" % f.qualifiers['name']])
            if f.type == 'motif':
                s = f.location.start
                e = f.location.end
                # "seq" motifs already placed that intersect this feature.
                overlappedm = [m for m in motifs
                               if m[2] == 'seq' and m[0] < e and m[1] > s]
                if overlappedm:
                    # Multiple overlaps are resolved by scanning the feature
                    # position by position.
                    for seg_start, seg_end in free_segments(s, e, overlappedm):
                        motifs.append([seg_start, seg_end, "seq", 10, 10, "black",
                                       get_color(f.qualifiers['name']), None])
                else:
                    motifs.append([s, e, "seq", 10, 10, "black",
                                   get_color(f.qualifiers['name']), None])

        seqFace = SeqMotifFace(seq, motifs, scale_factor=1, seq_format="[]")
        seqFace.overlaping_motif_opacity = 1.0
        add_face_to_node(seqFace, node, 0, position="aligned")
        add_face_to_node(TextFace(' ' + record.id + ' '), node, column=1, position="aligned")

    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render(outfile, w=width, dpi=300, tree_style=ts)
def main():
    """Annotate a HUMAnN2 gene table with NCBI lineages and write the results.

    Reads the table at ``args.input_fp`` through
    ``read_humann2_genetable_generator`` and, for every record that carries a
    taxon, resolves a lineage with ``get_best_ncbi_id`` / ``get_taxon_path``
    (cached per taxon string). Taxa without an NCBI id keep the rank headers
    themselves as a placeholder lineage.

    Outputs, each written only when its path argument is set:

    - ``args.tdt_out_fp``: header line, then one line per record with a
      taxon, with the lineage appended as extra tab-separated columns.
    - ``args.h2gt_out_fp``: header line, then every record; records with a
      taxon get ``|<lineage joined by '.'>`` (or ``|unknown``) appended to
      the gene name.

    ``args.rank_headers`` and ``args.ranks`` are comma-separated strings.
    """
    args = parser.parse_args()
    tdt_out_fp = args.tdt_out_fp
    h2gt_out_fp = args.h2gt_out_fp
    rank_headers = args.rank_headers.split(',')
    ranks = args.ranks.split(',')

    ncbi = NCBITaxa()

    # taxon string -> lineage, so every taxon is resolved only once
    tax_dict = {}

    in_f = tdt_out_f = h2gt_out_f = None
    try:
        in_f = open(args.input_fp)
        h2gt = read_humann2_genetable_generator(in_f)

        if tdt_out_fp:
            tdt_out_f = open(tdt_out_fp, 'w')
        if h2gt_out_fp:
            h2gt_out_f = open(h2gt_out_fp, 'w')

        # Headers are taken from the first record, so they are written lazily.
        first_h = True
        first_t = True

        for gene, header, line, tax in h2gt:
            lineage = list(rank_headers)

            if tax:
                if tax not in tax_dict:
                    family, genus, species = clean_humann2_taxon(tax)
                    best_id = get_best_ncbi_id(family, genus, species, ncbi)
                    if best_id is not None:
                        lineage = get_taxon_path(best_id, ncbi, ranks=ranks,
                                                 rank_headers=rank_headers)
                    tax_dict[tax] = lineage
                lineage = tax_dict[tax]

            if tdt_out_fp:
                if first_t:
                    first_t = False
                    tdt_out_f.write('Gene Family\t{0}\t{1}\n'.format(
                        '\t'.join(header), '\t'.join(ranks)))
                # Not ``elif``: the first record must get its data line too,
                # exactly as in the HUMAnN2 output below.
                if tax:
                    tdt_out_f.write('{0}\t{1}\t{2}\n'.format(
                        gene, '\t'.join(line), '\t'.join(lineage)))

            if h2gt_out_fp:
                if first_h:
                    h2gt_out_f.write('# Gene Family\t{0}\n'.format(
                        '\t'.join(header)))
                    first_h = False
                if tax:
                    # A lineage equal to the bare rank headers means no NCBI
                    # id was found for this taxon.
                    if lineage == rank_headers:
                        label = 'unknown'
                    else:
                        label = '.'.join(lineage)
                    h2gt_out_f.write('{0}|{1}\t{2}\n'.format(
                        gene, label, '\t'.join(line)))
                else:
                    h2gt_out_f.write('{0}\t{1}\n'.format(gene, '\t'.join(line)))
    finally:
        # Close whatever was opened, also when processing fails half-way.
        for handle in (in_f, tdt_out_f, h2gt_out_f):
            if handle is not None:
                handle.close()
def get_name(taxid):
    """Return the name NCBITaxa's taxid translator gives for ``taxid``.

    A new ``NCBITaxa`` object is created on every call. The result of
    ``get_taxid_translator([taxid])`` is indexed with ``taxid``, so the call
    fails if the translator returned no entry for it.
    """
    translator = NCBITaxa().get_taxid_translator([taxid])
    return translator[taxid]
from pandas import DataFrame
from Bio import SeqIO
from pandas import Index
from ete2 import NCBITaxa

# Input/output locations and the prefix used for every generated genome name.
data_path = "/home/moritz/people/MoreData/genomes/img_od1s"
img_fasta = "/home/moritz/people/MoreData/raw_imgs/od1s.fasta"
img_xls = "/home/moritz/people/MoreData/raw_imgs/od1s.xls"
name = "parcu_from_img_"

taxDb = NCBITaxa()

# Tab-separated table: first row is the header, first column the index.
contigs = DataFrame.from_csv(img_xls, sep="\t", header=0, index_col=0)

# Every genome gets the same taxid: the first one the name translator
# returns for 'Candidatus Parcubacteria'.
manual_taxo = taxDb.get_name_translator(['Candidatus Parcubacteria'
                                         ]).values()[0][0]

# One metadata record per distinct 'Genome ID'; the long name is the
# 'Genome' value of the first table row with that id.
metadata = {}
for g in set(contigs['Genome ID']):
    genome_rows = contigs.loc[contigs['Genome ID'] == g]
    metadata[name + str(g)] = {
        'IMG_ID': g,
        'name': name + str(g),
        'species_taxid': manual_taxo,
        'long_name': genome_rows['Genome'].iloc[0]
    }

# Group the FASTA records by genome: the i-th record is assigned to the
# genome found in the i-th row of the table.
seq_dict = dict((genome_name, []) for genome_name in metadata)
with open(img_fasta, "r") as fasta_handle:
    for idx, record in enumerate(SeqIO.parse(fasta_handle, "fasta")):
        genome_name = name + str(contigs.iloc[idx]['Genome ID'])
        seq_dict[genome_name].append(record)
# Excerpt of a module-level script. ``hsv``, ``sat``, ``hue``, ``colors_``
# and ``np`` are defined before the part visible here.

# Element 2 of every ``hsv`` entry (presumably the HSV "value" channel --
# confirm against the code that builds ``hsv``).
val = [color[2] for color in hsv]
# np.lexsort treats the LAST key as the primary one: order by hue, then
# sat, then val.
ind = np.lexsort((val, sat, hue))
sorted_colors = [colors_[i] for i in ind]
# Entries of sorted_colors unpack as (name, color); only the colour is kept.
colors_final = []
for i, (name, color) in enumerate(sorted_colors):
    colors_final.append(color)
import random
# In-place shuffle: the sorted order built above is discarded again.
random.shuffle(colors_final)
colors_mapping = {}

#set up NCBI database
from ete2 import NCBITaxa
ncbi = NCBITaxa('/dfs/scratch0/manans/.etetoolkit/taxa.sqlite')

# read .nemb file
# Each line is split on single spaces: the first token is stored as the
# species id, every remaining token is parsed as a float.
EMBEDDING_FILE = 'emb/n2v-avg.nemb'
histograms = []   # one list of floats per line, parallel to species_ids
species_ids = []  # first token of every line, kept as a string
with open(EMBEDDING_FILE, 'r') as tf:
    for line in tf:
        ls = line.split(' ')
        species_ids.append(ls[0])
        v = []
        for n in ls[1:]:
            v.append(float(n))
        histograms.append(v)