def annotate_ncbi_taxa(self, taxid_attr='species', tax2name=None, tax2track=None, tax2rank=None, dbfile=None):
    """Add NCBI taxonomy annotation to all descendant nodes.

    Leaf nodes are expected to contain a feature (name, by default) encoding a
    valid taxid number. All descendant nodes (including internal nodes) are
    annotated with the following new features:

    `Node.spname`: scientific species name as encoded in the NCBI taxonomy database

    `Node.named_lineage`: the NCBI lineage track using scientific names

    `Node.taxid`: NCBI taxid number

    `Node.lineage`: same as named_lineage but using taxid codes.

    Note that for internal nodes, NCBI information will refer to the first
    common lineage of the grouped species.

    :param name taxid_attr: the name of the feature that should be used to
        access the taxid number associated to each node.

    :param None tax2name: A dictionary where keys are taxid numbers and values
        are their translation into NCBI scientific name. Its use is optional and
        allows to avoid database queries when annotating many trees containing
        the same set of taxids.

    :param None tax2track: A dictionary where keys are taxid numbers and values
        are their translation into NCBI lineage tracks (taxids). Its use is
        optional and allows to avoid database queries when annotating many trees
        containing the same set of taxids.

    :param None tax2rank: A dictionary where keys are taxid numbers and values
        are their translation into NCBI rank name. Its use is optional and
        allows to avoid database queries when annotating many trees containing
        the same set of taxids.

    :param None dbfile: If provided, the given file will be used as a local copy
        of the NCBI taxonomy database.

    :returns: tax2name (a dictionary translating taxid numbers into scientific
        name), tax2lineage (a dictionary translating taxid numbers into their
        corresponding NCBI lineage track) and tax2rank (a dictionary translating
        taxid numbers into rank names).
    """
    ncbi = NCBITaxa(dbfile=dbfile)
    return ncbi.annotate_tree(self, taxid_attr=taxid_attr, tax2name=tax2name,
                              tax2track=tax2track, tax2rank=tax2rank)
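# A minimal usage sketch for the method above (the toy tree and the taxid_attr="name" choice are
# illustrative assumptions, not part of the original code): leaves are named with NCBI taxid
# numbers, and the features listed in the docstring become available on every node.
from ete3 import PhyloTree

t = PhyloTree("((9606, 9598), 10090);")   # human, chimp, mouse taxids as leaf names
tax2name, tax2track, tax2rank = t.annotate_ncbi_taxa(taxid_attr="name")
for node in t.traverse():
    print(node.taxid, node.named_lineage)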
def ncbi_compare(self, autodetect_duplications=True, cached_content=None):
    if not cached_content:
        cached_content = self.get_cached_content()
    cached_species = set([n.species for n in cached_content[self]])

    if len(cached_species) != len(cached_content[self]):
        print(cached_species)
        ntrees, ndups, target_trees = self.get_speciation_trees(
            autodetect_duplications=autodetect_duplications, map_features=["taxid"])
    else:
        target_trees = [self]

    ncbi = NCBITaxa()
    for t in target_trees:
        ncbi.get_broken_branches(t, cached_content)
def AddmyID(modelIDList, ID, filepath):
    ncbi = NCBITaxa()
    if ID.isdigit():
        modelIDList.append(int(ID))
    else:
        # get_name_translator expects a list of names and returns {name: [taxids]}
        name2taxID = ncbi.get_name_translator([ID])
        modelIDList.append(int(name2taxID[ID][0]))
    tree = model2Tree(modelIDList)
    #print tree.get_ascii(attributes=["sci_name", "rank"])
    outfile = "outTree.tmp"
    out = open(outfile, "w")
    out.write(tree.get_ascii(attributes=["sci_name", "rank"]))
    out.close()
    return modelIDList
def main():
    """Make queries against NCBI Taxa databases"""
    # Get commandline args
    args = get_args()

    # Instantiate the ete NCBI taxa object
    ncbi = NCBITaxa(dbfile=args.database)  ## dbfile location
    if args.verbose > 1:
        sys.stderr.write('Taxa database is stored at {}\n'.format(ncbi.dbfile))

    # Update the database if required.
    if args.update is True:
        if args.verbose > 1:
            msg = 'Updating the taxonomy database. This may take several minutes...\n'
            sys.stderr.write(msg)
        ncbi.update_taxonomy_database()

    # If names were provided in taxid list, convert to taxids
    args.taxid = args.taxid.replace('"', '').replace("'", '').split(',')
    args.taxid = name2taxid(args.taxid, ncbi)

    # Output
    if args.outfile is None:
        outFH = sys.stdout
    else:
        outFH = open(args.outfile, 'w')
    ## header
    if args.taxon_info:
        outFH.write('\t'.join(['name', 'taxid', 'rank', 'lineage']) + '\n')
    elif not args.just_taxids:
        outFH.write('\t'.join(['parent_taxid', 'descendent_taxid', 'descendent_name']) + '\n')
    ## body
    for taxid in args.taxid:
        if args.taxon_info:
            taxon_info(taxid, ncbi, outFH)
        else:
            desc_taxa(taxid, ncbi, outFH, args.just_taxids)

    outFH.close()
def get_tags_leaves(tree, taxid_dict):
    ncbi_taxa = NCBITaxa()
    bacteria_taxid = 2
    dpapi_taxid = 91374
    leaf_tags = {}
    for leaf in tree.iter_leaves():
        seqid = leaf.name
        if "DIPPA" in seqid:
            leaf_tags[seqid] = "dpapi"
        elif seqid in taxid_dict.keys():
            # print(seqid)
            # print(taxid_dict[seqid])
            taxid = int(taxid_dict[seqid])
            if taxid == dpapi_taxid:
                leaf_tags[seqid] = "dpapi"
            elif bacteria_taxid in ncbi_taxa.get_lineage(taxid):
                leaf_tags[seqid] = "bacteria"
            else:
                leaf_tags[seqid] = "other"
        else:
            print(seqid, "is not in taxid dict!")
            leaf_tags[seqid] = "other"
    return leaf_tags
def model_organisms(inputfile):
    ncbi = NCBITaxa()
    infile = open(inputfile, "r")
    modelList = []
    for line in infile:
        modelList.append(line[:-1])
    infile.close()
    if modelList[0].isdigit():
        print "List of model IDs Loaded"
        Type = 'Id'
    else:
        print "List of model names Loaded"
        Type = 'Sp'
    modelIDList = []
    if Type == 'Sp':
        name2taxID = ncbi.get_name_translator(modelList)
        for model in modelList:
            modelIDList.append(name2taxID[model][0])
    else:
        modelIDList = modelList
    return modelIDList
import os  # needed for DEFAULT_TAXADB below
import sqlite3
import csv
import re
import random
from os import listdir
from os.path import isfile, join
from collections import Counter
from collections import defaultdict
from collections import OrderedDict
from ete3 import NCBITaxa
from operator import itemgetter
from datetime import datetime, date, time

d = defaultdict(list)
ncbi = NCBITaxa()
DEFAULT_TAXADB = os.path.join(os.environ.get('HOME', '/'), '.etetoolkit', 'taxa.sqlite')
DB_VERSION = 2


def get_input():
    """Get all user input and return all files and settings.

    Returns:
        Filepaths and all QC and classification files.
        Searchranks that will be added to the OTU table.
        Minimum qscore used for filtering the reads.
    """
    while True:
        mypath = input("Enter classification files path: ")
    return
    #comapre2taxonomies(geneList[0].taxonomy, geneList[1].taxonomy)

    NCBI = False
    if NCBI:
        from ete3 import NCBITaxa
        ncbi = NCBITaxa()
        #ncbi.update_taxonomy_database()
        taxIDlist = []
        for gene in geneList:
            name2taxID = ncbi.get_name_translator([gene.organism])
            gene.taxID = name2taxID[gene.organism][0]
            for i in ncbi.get_lineage(gene.taxID):
                gene.addlineageid(i)
            taxIDlist.append(gene.taxID)
        #taxid2name = ncbi.get_taxid_translator([9606, 9443])
        #print taxid2name

    tree = False
    if tree:
        tree = ncbi.get_topology(taxIDlist)
def main(): parser = argparse.ArgumentParser( 'Design FISH probes for a complex microbial community') parser.add_argument( 'input_folder', type=str, help='Input folder containing images of biological samples') parser.add_argument( '-p', '--probe_design_filename', dest='probe_design_filename', type=str, default='', help='Input folder containing images of biological samples') parser.add_argument( '-r', '--ref_clf', dest='ref_clf', type=str, default='', help='Input folder containing images of biological samples') parser.add_argument('-d', '--dimension', dest='dimension', type=int, default='', help='Reference folder') parser.add_argument('-s', '--subfolder', dest='subfolder', type=str, default='F', help='Sub folder') parser.add_argument('-e', '--epithelial', dest='ep', type=str, default='F', help='Sub folder') args = parser.parse_args() if args.probe_design_filename == '': filenames = glob.glob('{}/*.czi'.format(args.input_folder)) samples = list( set([ re.sub('_[0-9][0-9][0-9].czi', '', file) for file in filenames ])) i = 1 for s in samples: measure_biofilm_images_no_reference(s, args.dimension) print("Finished str(i) of str(len(samples))") i = i + 1 else: probes = pd.read_csv(args.probe_design_filename, dtype={'code': str}) ncbi = NCBITaxa() taxon_lookup = probes.loc[:, ['target_taxon', 'code']].drop_duplicates() taxon_lookup['H'] = np.arange(0, 1, 1 / taxon_lookup.shape[0]) taxon_lookup['S'] = 1 taxon_lookup['V'] = 1 taxon_sciname = pd.DataFrame.from_dict(ncbi.get_taxid_translator( taxon_lookup.target_taxon.values), orient='index').reset_index() taxon_sciname.columns = ['target_taxon', 'sci_name'] taxon_lookup = taxon_lookup.merge(taxon_sciname, on='target_taxon') taxon_lookup.to_csv('{}/taxon_color_lookup.csv'.format( args.input_folder)) if args.ep == 'T': taxon_lookup.loc[taxon_lookup.shape[0]] = [ '0', '0000000', 0, 0, 0.5, 'Epithelial' ] umap_transform = joblib.load(args.ref_clf) clf_umap = joblib.load( re.sub('transform_biofilm_7b.pkl', 'transformed_biofilm_7b_svc.pkl', args.ref_clf)) clf = joblib.load( re.sub('transform_biofilm_7b.pkl', 'transformed_biofilm_7b_check_svc.pkl', args.ref_clf)) if args.subfolder == 'T': sf = glob.glob('{}/*'.format(args.input_folder)) for subf in sf: filenames = glob.glob('{}/*.czi'.format(subf)) samples = list( set([ re.sub('_[0-9][0-9][0-9].czi', '', file) for file in filenames ])) for s in samples: measure_biofilm_images(s, args.dimension, umap_transform, clf_umap, clf, taxon_lookup) else: filenames = glob.glob('{}/*.czi'.format(args.input_folder)) samples = list( set([ re.sub('_[0-9][0-9][0-9].czi', '', file) for file in filenames ])) for s in samples: measure_biofilm_images(s, args.dimension, umap_transform, clf_umap, clf, taxon_lookup) return
parser.add_argument('-v', '--version', action='version', version='%(prog)s v3.2')

# Getting arguments
args = parser.parse_args()
kaiju_file = args.kaiju_file
R1 = args.R1_file
R2 = args.R2_file
taxonomy_level = args.taxonomy_level

# Getting taxonomy database and taxonomy level
ncbi = NCBITaxa()
descendants = ncbi.get_descendant_taxa(taxonomy_level)

# Create filtered files names
# Input:  SRR8771429.trimmed.5905288_00_R1.fastq
# Output: SRR8771429.trimmed.5905288_00_filtered.R1.fastq
# Output: SRR8771429.trimmed.5905288_00_unclassified.R1.fastq
filtered_R1 = R1[:-8] + "filtered.R1.fastq"
filtered_R2 = R2[:-8] + "filtered.R2.fastq"
unfiltered_R1 = R1[:-8] + "unclassified.R1.fastq"
unfiltered_R2 = R2[:-8] + "unclassified.R2.fastq"

# Create index for large fastq files - This process dramatically decreases the runtime
# and RAM usage when compared to dictionaries.
__email__ = "*****@*****.**"

import sys
import argparse
from ete3 import NCBITaxa

#Display help and usage
parser = argparse.ArgumentParser(description="Incorrect number of command line arguments")
parser.add_argument('Sorted-LCA.csv')
parser.add_argument('Output.gv')
if len(sys.argv[1:]) == 0:
    parser.print_help()
    parser.exit()
args = parser.parse_args()

ncbi = NCBITaxa()

#The number of species you want to create the tree with
NumberOfSpecies = 10

#Read CSV results into list, remove all but the top 10 most abundant taxonomies
ResultsList = list(line.strip().split(",") for line in open(sys.argv[1]))
ResultsList = ResultsList[0:int(NumberOfSpecies) + 1]  #Take first n items in list (+1 is to negate the header line)

#Open output file for writing
Output = open(sys.argv[2], "w")

#Write header line in dot format
Output.write('digraph G {\n\tsize="8,5!";\n')

#Define lists, dicts and variables
import optparse

from ete3 import NCBITaxa

ncbi = NCBITaxa()

parser = optparse.OptionParser()
parser.add_option('-s', '--species', dest="input_species_filename",
                  help='Species list in text format one species in each line')
parser.add_option('-f', '--format', type='choice',
                  choices=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '100'],
                  dest="format", default='8', help='output format for tree')
parser.add_option('-t', '--treebest', type='choice', choices=['yes', 'no'],
                  dest="treebest", default='no', help='To be used in TreeBest')
parser.add_option('-d', '--database', type='choice', choices=['yes', 'no'],
                  dest="database", default='no', help='Update database')
options, args = parser.parse_args()

if options.database == "yes":
    try:
        ncbi.update_taxonomy_database()
    except:
        pass

if options.input_species_filename is None:
    raise Exception('-s option must be specified, Species list in text format one species in each line')

with open(options.input_species_filename) as f:
def taxId2Species(taxid):
    return NCBITaxa().get_taxid_translator([taxid])
#!/usr/bin/env python
from ete3 import NCBITaxa
import sys
import os

args = sys.argv
if len(args) < 2:
    print("Usage:", args[0], "[IDs]")
    sys.exit(1)

ncbi = NCBITaxa()
for id in open(args[1]):
    print(ncbi.get_lineage(id))
def model2Tree(modelIDList):
    ncbi = NCBITaxa()
    tree = ncbi.get_topology(modelIDList)
    print tree.get_ascii(attributes=["sci_name", "rank"])
    return tree
#This script uses python module ete3 to query NCBI taxonomy hierarchy for corresponding NCBI taxID
"""
The input file, ncbi_gi_taxid_file, resembles a tab-delimited two-column table, where the
first column is the NCBI gene id and the second column is the NCBI taxID.

The output file, output.txt, is a four-column table, where the first column is the NCBI gene id,
the second column is the NCBI taxID, the third column is the taxonomy rank (e.g., phylum), and
the fourth column is the classification at that taxonomy rank (e.g., Proteobacteria).

The output does not contain a header. It is also in long-table format as known in R.
The output can be converted to the traditional wide-table format in R using "reshape2".
"""
import sys
from ete3 import NCBITaxa

if len(sys.argv) == 1:
    sys.exit("USAGE: python %s <path/to/ncbi_gi_taxid_file> > <output.txt>" % sys.argv[0])

ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()
fp = open('taxa-ids-not-found.txt', 'w')
hier = ["superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species"]
missing = []
for x in open(sys.argv[1]):
    dat = x.rstrip().split('\t')[-1]
    try:
        lineage = ncbi.get_lineage(dat)
        names = ncbi.get_taxid_translator(lineage)
        ranks = ncbi.get_rank(lineage)
        new_ranks = {}
        for keys in ranks:
def parse_args(parser): args = parser.parse_args() if args.version: print get_version() sys.exit(0) if not args.no_annot and not pexists(EGGNOGDB_FILE): print colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB): print colorify('DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.cpu == 0: args.cpu = multiprocessing.cpu_count() # No --servermode available for diamond if args.mode == 'diamond' and args.servermode: parser.error('--mode [diamond] and --servermode are mutually exclusive') # Output file required unless running in servermode if not args.servermode and not args.output: parser.error('An output project name is required (-o)') # Servermode implies using mem-based databases if args.servermode: args.usemem = True # Direct annotation implies no searches if args.annotate_hits_table: args.no_search = True args.no_annot = False # Check inputs for running sequence searches if not args.no_search and not args.servermode: if not args.input: parser.error('An input fasta file is required (-i)') # HMM if args.mode == 'hmmer': if not args.db and not args.guessdb: parser.error('HMMER mode requires specifying a target database (i.e. -d, --guessdb ))') if args.db and args.guessdb: parser.error('-d and --guessdb options are mutually exclusive') if args.guessdb: from ete3 import NCBITaxa ncbi = NCBITaxa() lineage = ncbi.get_lineage(args.guessdb) for tid in reversed(lineage): if tid in TAXID2LEVEL: print tid, TAXID2LEVEL[tid] args.db = TAXID2LEVEL[tid] break # DIAMOND elif args.mode == 'diamond': #if args.db or args.guessdb: # parser.error('diamond mode does not require -d or --guessdb options') pass return args
#!/usr/bin/python3
from ete3 import NCBITaxa

ncbi = NCBITaxa()

diamond_path = "/home/anna/bioinformatics/diplonema/dpapi_genome_diamond.tsv"
out_path = "/home/anna/bioinformatics/diplonema/dpapi_genome_diamond_annotation.tsv"

taxids = []
with open(diamond_path) as input_f:
    for line in input_f:
        newtaxid = line.split("\t")[1]
        taxids.append(newtaxid)

taxids_nr = list(set(taxids))
tax_names = ncbi.get_taxid_translator(taxids_nr)

input_f = open(diamond_path, "r")
output_f = open(out_path, 'w')
for line in input_f:
    line_split = line.rstrip().split("\t")
    id = line_split[0]
    taxid = line_split[1]
    evalue = line_split[2]
    if taxid == "0":
        name = "None"
        is_bacteria = 0
    else:
        name = tax_names[int(taxid)]
        is_bacteria = 1 if 2 in ncbi.get_lineage(taxid) else 0
def test_raise_taxdict_level():
    ncbi = NCBITaxa()
    testdict = {246200: 181.8, 190047: 259.8}
    higher_level = vica.minhash._raise_taxdict_level(testdict, ncbi)
    eq_({1224: 181.8, 1117: 259.8}, higher_level)
from ete3 import NCBITaxa

#The first time this will download the taxonomic NCBI database and save a parsed version
#of it in `~/.etetoolkit/taxa.sqlite`. May take some minutes
ncbi = NCBITaxa()
print("ncbi.dbfile", ncbi.dbfile)

with open(snakemake.input[0], 'r', encoding='utf8') as fh:
    genus_list = fh.read().strip().split('\n')

genus_to_taxid = ncbi.get_name_translator(genus_list)
tax_id_vals = genus_to_taxid.values()
tree = ncbi.get_topology(
    [genus_id for subls in tax_id_vals for genus_id in subls],
    intermediate_nodes=True)

# `get_ascii()` has a bug: it prints the taxa above genus without any separation between them,
# so a way to avoid that is using extra attributes; `dist` seems to be less invasive.
# Also, numbers from 'dist' are replaced
with open(snakemake.output[0], mode='w', encoding='utf8') as fh:
    print(tree.get_ascii(attributes=["dist", "sci_name"]).replace('1.0,', '-'), file=fh)
help=""" save path of Colorbar for the heatmap with matplotlib """) args = parser.parse_args() infile = args.infile mode = args.mode newick = args.newick if newick: t = PhyloTree(args.newick) species2taxid = dict([ line.split()[0], line.strip().split()[1] ] for line in open(infile)) taxids = set(species2taxid.values()) else: ncbi = NCBITaxa() taxids = set([ line.strip() for line in open(infile) ]) if args.taxoncolors: taxon2color = dict([int(line.split()[0]), line.split()[1]] for line in open(args.taxoncolors)) tNCBI = ncbi.get_topology(taxids, intermediate_nodes=True) tNCBI = tNCBI.search_nodes(name="2759")[0] ncbi.annotate_tree(tNCBI, taxid_attr="name") tax2node = dict([node.taxid, node] for node in tNCBI.traverse()) if args.no_intermediate_nodes: for node in tNCBI.get_descendants(): if len(node.children) == 1: node.delete()
def parse_args(parser): args = parser.parse_args() if args.version: print get_version() sys.exit(0) if args.data_dir: set_data_path(args.data_dir) if not args.no_annot and not pexists(get_eggnogdb_file()): print colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.mode == 'diamond': dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db() if not pexists(dmnd_db): print colorify('DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red') raise emapperException() if args.cpu == 0: args.cpu = multiprocessing.cpu_count() # No --servermode available for diamond if args.mode == 'diamond' and args.servermode: parser.error('--mode [diamond] and --servermode are mutually exclusive') # Output file required unless running in servermode if not args.servermode and not args.output: parser.error('An output project name is required (-o)') # Servermode implies using mem-based databases if args.servermode: args.usemem = True # Direct annotation implies no searches if args.annotate_hits_table: args.no_search = True args.no_annot = False # Sets GO evidence bases if args.go_evidence == 'experimental': args.go_evidence = set(["EXP","IDA","IPI","IMP","IGI","IEP"]) args.go_excluded = set(["ND", "IEA"]) elif args.go_evidence == 'non-electronic': args.go_evidence = None args.go_excluded = set(["ND", "IEA"]) else: raise ValueError('Invalid --go_evidence value') # Check inputs for running sequence searches if not args.no_search and not args.servermode: if not args.input: parser.error('An input fasta file is required (-i)') # HMM if args.mode == 'hmmer': if not args.db and not args.guessdb: parser.error('HMMER mode requires specifying a target database (i.e. -d, --guessdb ))') if args.db and args.guessdb: parser.error('-d and --guessdb options are mutually exclusive') if args.guessdb: from ete3 import NCBITaxa ncbi = NCBITaxa() lineage = ncbi.get_lineage(args.guessdb) for tid in reversed(lineage): if tid in TAXID2LEVEL: print tid, TAXID2LEVEL[tid] args.db = TAXID2LEVEL[tid] break # DIAMOND elif args.mode == 'diamond': #if args.db or args.guessdb: # parser.error('diamond mode does not require -d or --guessdb options') pass return args
import sys import cPickle from pandas import DataFrame from collections import Counter, defaultdict import os from glob import glob from sys import exit from ete3 import SeqGroup from ete3 import NCBITaxa ncbi = NCBITaxa() path = sys.argv[1] + "*clustalo" infiles = glob(path) print "%s infiles" % len(infiles) valid_cols = 0 spvariants = defaultdict(Counter) refaas = [] for infile in infiles: #for infile in glob("formatted_MG_seqs.faa.final_tree.fa"): print infile if os.stat(infile).st_size == 0: continue alg = SeqGroup(infile) alg_matrix = [] labels = [] for name, seq, _ in alg: # Replace trailing gaps with # and * for stop
from argparse import ArgumentParser from Bio import Entrez from ete3 import NCBITaxa import sys ncbi = NCBITaxa() Entrez.email = "*****@*****.**" parser = ArgumentParser() parser.add_argument('-file', help="txt file to be parsed") parser.add_argument('-dbType', help="type of database") parser.add_argument('-filenum', help="output name") args = parser.parse_args() filename = args.file dbType = args.dbType filenum = args.filenum # filename = 'C:/Users/Andrew.Hwang/Desktop/fastaq2phylo/output/blastout.txt' # dbType = 'nt' # filenum = "0" memory = {} writeLines = [] with open(filename, 'r') as f: for line in f: line_arr = line.split("\t") ID=line_arr[1] pos=int(round(100 * int(line_arr[2]) / int(line_arr[4]))) pos2=int(round(100 * int(line_arr[3]) / int(line_arr[4]))) if dbType == 'nt': lineage = ncbi.get_lineage(ID) names = ncbi.get_taxid_translator(lineage)
##Command line: "python3.6 Taxpull.py > taxids"
##Taxpull.py
from ete3 import NCBITaxa

ncbi = NCBITaxa()
descendants = ncbi.get_descendant_taxa('Mus')
print(descendants)
from intermine.webservice import Service
from ete3 import NCBITaxa

ncbi = NCBITaxa()
#ncbi.update_taxonomy_database()

service = Service("https://phytozome.jgi.doe.gov/phytomine/service")
query = service.new_query("Organism")
query.add_view("annotationVersion", "assemblyVersion", "commonName", "genus",
               "name", "proteomeId", "shortName", "species", "taxonId", "version")

k = [
    "proteomeId", "commonName", "name", "shortName", "annotationVersion",
    "assemblyVersion", "genus", "species", "taxonId", "version"
]
t = [
    "superkingdom", "kingdom", "phylum", "class", "subclass", "order",
    "family", "genus", "species"
]
print("\t".join(k + t + ["full_lineage"]))


def filterRanks(L):
    subset = {ncbi.get_rank([x])[x]: x for x in L}
    #return([if x in subset: ncbi.get_taxid_translator([x])[x] else: "NA" for x in t])
    return ([
        list(ncbi.get_taxid_translator([subset[x]]).values())[0]
        if x in subset else 'NA' for x in t
    ])
line2 = line2.strip() genome_name1 = line2.split("\t")[1] bacteria_name = line2.split("\t")[0] # genome_name1 = genome_name1.split(".")[0] dic_genome_bacteria[genome_name1]=bacteria_name with open(sys.argv[4],'r') as tax_id_fi: for line3 in tax_id_fi: line3 = line3.strip() tax_id = line3.split("\t")[2] species_name = line3.split("\t")[3] bacteria_name2 = line3.split("\t")[0] dic_taxid_bacteria[bacteria_name2]=tax_id dic_taxid_name[tax_id]=species_name ncbi = NCBITaxa() with open(sys.argv[1],'r') as blast_file: for line in blast_file: line = line.strip() blastn_gene_name =line.split("\t")[0] identity = line.split("\t")[2] genome_name = line.split("\t")[1] dic_blastn_identity[blastn_gene_name]=identity dic_blastn[blastn_gene_name]=genome_name if blastn_gene_name in dic_cu: lineage = ncbi.get_lineage(dic_taxid_bacteria[dic_genome_bacteria[dic_blastn[blastn_gene_name]]]) names = ncbi.get_taxid_translator(lineage) tax_seq = [names[taxid] for taxid in lineage] if len(tax_seq)>9: tax_seq2 = [tax_seq[4],tax_seq[8],tax_seq[9]]
331111, 262724, 300852, 83333, 83333, 331111, 83333, 562, 83333, 1351, 83333, 562, 83334, 93061, 93062, 559292, 285006, 1280, 10665, 83333, 663, 83333, 83333, 7460, 246196, 759272, 5693, 559292, 4932, 1773, 300852, 574, 562, 331111, 1247190, 5811, 5722, 300852, 287, 262724, 562, 274, 246196, 1772, 300852, 9739, 83333, 284590, 559292, 284590, 4932, 9823, 9606, 9606, 9823, 562, 7460, 9986, 194966, 679895, 5702, 562, 9606, 262724, 274, 1351, 300852, 562, 300852, 562, 262724, 9986, 9606, 9615, 6039, 209285, 311400, 287, 272844, 273057, 83333, 224308, 69014, 1293037, 2287, 562, 1223565, 1144670, 1217649, 1977881, 480119, 1217710, 1310637, 421052, 470, 1310678, 52133, 1144663, 1960940, 1144670, 1217649, 1977881, 480119, 1217710, 470, 1310637, 421052, 1144663, 1960940, 1310678, 52133, 1217649, 1977881, 480119, 1310637, 421052, 470, 1310678, 1144670, 1217710, 1144663, 1960940, 52133, 1144670, 480119, 1217710, 1217649, 470, 1310637, 421052, 1144663, 1977881, 474186, 3702, 575584 ] ncbi = NCBITaxa() # ncbi.update_taxonomy_database() unique_taxa = list(set(unique_taxa)) with open('unique_taxa.txt', 'w') as infile: for i in unique_taxa: infile.write(str(i) + "\n") infile.close() taxid2name = ncbi.get_taxid_translator(unique_taxa) b = ncbi.get_name_translator(['Bacteria'])['Bacteria'][0] a = ncbi.get_name_translator(['Archaea'])['Archaea'][0] e = ncbi.get_name_translator(['Eukaryota'])['Eukaryota'][0] v = ncbi.get_name_translator(['Viruses'])['Viruses'][0]
parser.add_argument('-o', '-outfile', help='output filepath', type=str) args = parser.parse_args() num_cores = multiprocessing.cpu_count() ### set up MySQL connections ### # connect to NCBITaxonomy DB to get gi to taxid mappings mysql_cn = pymysql.connect(host='localhost', port=3306, user='******', passwd='balamuthia', db='NCBI_Taxonomy') # initialize the NCBI database ncbi = NCBITaxa() ### MAIN FUNCTIONALITY ### # read in .m8 file (GSNAPL output) to pandas dataframe, assign to column names print(date() + " Begin reading in .m8 dataframe") df = pd.read_csv(args.i, sep='\t', header=None, names=[ 'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore' ]) print(date() + ' Finished reading in .m8 dataframe')
def inferLineage(self, places): """ infer the lineage from looking at the location of placement looking at the leaves and their tax id and looking at the lineages of all these """ if self.cfg["touch"]: return ncbi = NCBITaxa() # fetch file and load taxinformation seqinfo = self.config.pkgfile("concat.refpkg", "seq_info") taxids = {} si = base.readCSV(seqinfo) # make dictionary for r in si: taxids[r["seqname"]] = r["tax_id"] # for each placement: logging.debug("Infering lineages now") for p in places: # get the GCA names children = p["sisters"] # fetch lineages for all lngs = [] for c in children: try: lngs.append(ncbi.get_lineage(taxids[c])) except ValueError as e: logging.warning(e) # find common elements: common = set(lngs[0]) for l in lngs[1:]: common = common & set(l) # common lineage lng = [] for v in lngs[0]: if v not in common: break # add common elements lng.append(v) nodetaxid = lng[-1] # now we can make it pretty if not self.cfg["fullineage"]: # limit to desired ranks2 desired_ranks = [ "superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species", ] lineage2ranks = ncbi.get_rank(lng) ranks2lineage = dict( (rank, taxid) for (taxid, rank) in lineage2ranks.items()) ranks = { "{}_id".format(rank): ranks2lineage.get(rank, "NA") for rank in desired_ranks } lng = [i for i in lng if i in ranks.values()] # get translator and make string named = ncbi.translate_to_names(lng) # save to placed object p["lineage"] = "_".join(named) # replace names with spaces into points p["lineage"] = p["lineage"].replace(" ", ".") p["taxidlineage"] = "_".join([str(x) for x in lng]) p["taxid"] = nodetaxid return ()
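# The heart of the lineage inference above is intersecting the NCBI lineages of the placement's
# sister taxa and keeping their shared prefix. A standalone sketch of just that step (the example
# taxids are illustrative, and NCBITaxa needs its local taxa.sqlite database to be present):
from ete3 import NCBITaxa

ncbi = NCBITaxa()

def common_lineage(taxids):
    """Return the shared prefix of the NCBI lineages of the given taxids."""
    lineages = [ncbi.get_lineage(t) for t in taxids]
    common = set(lineages[0])
    for lng in lineages[1:]:
        common &= set(lng)
    prefix = []
    for taxid in lineages[0]:
        if taxid not in common:
            break
        prefix.append(taxid)
    return prefix

# Human (9606) and mouse (10090) share their lineage down to Euarchontoglires
print(ncbi.translate_to_names(common_lineage([9606, 10090])))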
#!/usr/bin/python
Usage = """
Print taxid's lineage and ranks, by default prints to the stdout
Usage: taxid_ranks.py taxid > output.txt

Arun Seetharam
[email protected]
taxid_ranks.py -version 1.0 04/13/2017
"""
from ete3 import NCBITaxa
import sys

ncbi = NCBITaxa()
if len(sys.argv) < 2:
    print Usage
else:
    cmdargs = str(sys.argv)
    lineage = ncbi.get_lineage((sys.argv[1]))
    names = ncbi.get_taxid_translator(lineage)
    for taxid in lineage:
        print [ncbi.get_rank([taxid])], [names[taxid]]
    # print [names[taxid] for taxid in lineage]
    # print [ncbi.get_rank([taxid]) for taxid in lineage]
    # print [ncbi.get_rank([name]) for name in names]
#!/Users/zoliq/anaconda3/bin/python3 from distutils.log import error import os,re,time,argparse from Bio import SeqIO,Entrez from ete3 import NCBITaxa #http://etetoolkit.org/docs/2.3/tutorial/tutorial_ncbitaxonomy.html ncbi = NCBITaxa() Entrez.email = '*****@*****.**' #Entrez.api_key = os.environ["API_KEY"] #update at times: #ncbi.update_taxonomy_database() def assembly_methods(blastline): # Infer the assembler and protein predictor from blast input or faa file seqname = blastline.split()[0].replace(">", "") assembler, predictor = "NA", "NA" if "::" in seqname: #format >TRINITY_DN7724_c0_g1::TRINITY_DN7724_c0_g1_i1::g.1::m.1 type:3prime_partial len:132 gc:universal TRINITY_DN7724_c0_g1_i1:113-505(+) predictor = "transdecoder-old" #old! elif re.search(r"\.p\d+", seqname): #format >c0_g2_i1.p1 type:3prime_partial len:319 gc:universal c0_g2_i1:175-1128(+) predictor = "transdecoder-new" if seqname.startswith("TRINITY"): assembler = "trinity-new" if seqname.split("_")[-2].startswith("i"): #format >TRINITY_DN61_c0_g2_i2_1 # 3 # 644 # -1 # ID=1_1;partial=10;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.372 #prodigal appends a _number to seqid predictor = "prodigal"
else: #set_env() try: os.mkdir(OUTDIR) except OSError: print("Be careful : The directory taxonDB_data exists") TAXON_DT = {} DOC = search_in_database(PARAM.r, PARAM.dbName, PARAM.taxid) if DOC and DOC["_id"] != PARAM.taxid: DOC = None LIST_GCF = list(set([PARAM.gcf] + DOC["GCF"])) if DOC else [PARAM.gcf] ncbi = NCBITaxa() name = ncbi.get_taxid_translator([int(PARAM.taxid)]) if not name: raise Exception("No correspondance for " + PARAM.taxid + " in ete3 NCBITaxa") name = name[int(PARAM.taxid)] LIST_NAME = list(set([name] + DOC["names"])) if DOC else [name] tmp_taxon_dt = init_taxondt(LIST_GCF, PARAM.user, PARAM.taxid, PARAM.fasta, PARAM.gcf, LIST_NAME, name) if (tmp_taxon_dt != 1): TAXON_DT[PARAM.taxid] = tmp_taxon_dt
#!/usr/bin/env python #This script will take tab-separated uniprotid and taxids and finds first_recenet_common ancestor #and number of steps from FCA to Eukaryota # It produces 4 output files (summary report,ontology.tab, trees.tab) #and a general newick file for taxa from all proteins #This script uses ete3 for dealing with trees and graphs import sys from ete3 import Tree from ete3 import NCBITaxa from ete3 import PhyloTree ncbi = NCBITaxa() tax_dict = dict() tree_dict = dict() ontology = dict() #creating output files outputFile = open('trees.tab', 'w') outputFile2 = open('summary_report.tab', 'w') outputFile2.write('uni_id' + '\t' + 'FCA_id' + '\t' + 'FCA_name' + '\t' + 'steps_from_Eukaryota' + '\n') outputFile3 = open('ontology.tab', 'w') #Here I open the file that Matt script creates and loops in each line and get the taxids with open('SP_by_taxa.tab', 'r') as fo: for line in fo: line = line.rstrip() (uniprotid, taxids) = line.split('\t') one_taxid = taxids.split( ',') # divide the list of taxids to diff taxids 'strings'
def tax_id(lyst): from Bio import Entrez def get_tax_id(species): """to get data from ncbi taxomomy, we need to have the taxid. we can get that by passing the species name to esearch, which will return the tax id""" species = species.replace(' ', "+").strip() search = Entrez.esearch(term=species, db="taxonomy", retmode="xml") record = Entrez.read(search) if species != 'Not assigned' or 'root' and record['IdList'] != []: return record['IdList'][0] def get_tax_data(taxid): """once we have the taxid, we can fetch the record""" search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml") return Entrez.read(search) Entrez.email = "*****@*****.**" if not Entrez.email: print("you must add your email address") sys.exit(2) species_list = [ 'Terrabacteria group', 'Helicobacter pylori 26695', 'Thermotoga maritima MSB8', 'Deinococcus radiodurans R1', 'Treponema pallidum subsp. pallidum str. Nichols', 'Aquifex aeolicus VF5', 'Archaeoglobus fulgidus DSM 4304' ] species_list = lyst taxid_list = [] # Initiate the lists to store the data to be parsed in data_list = [] lineage_list = [] print('parsing taxonomic data...' ) # message declaring the parser has begun for species in species_list: print('\t' + species) # progress messages taxid = get_tax_id(species) # Apply your functions data = get_tax_data(taxid) if 'LineageEx' in data[0]: lineage = { d['Rank']: d['ScientificName'] for d in data[0]['LineageEx'] if d['Rank'] in ['phylum'] } else: print('ERROR:', species, 'not found in dictionary') taxid_list.append( taxid) # Append the data to lists already initiated data_list.append(data) lineage_list.append(lineage) print('complete!') print() print('TaxId\'s:') print(taxid_list) print() from ete3 import NCBITaxa ncbi = NCBITaxa() def get_desired_ranks(taxid, desired_ranks): lineage = ncbi.get_lineage(taxid) names = ncbi.get_taxid_translator(lineage) lineage2ranks = ncbi.get_rank(names) ranks2lineage = dict( (rank, taxid) for (taxid, rank) in lineage2ranks.items()) return { '{}_id'.format(rank): ranks2lineage.get(rank, '<not present>') for rank in desired_ranks } if __name__ == '__main__': taxids = taxid_list desired_ranks = [ 'superkingdom', 'kingdom', 'class', 'family' ] #, 'genus'] #['kingdom', 'phylum', 'class', 'order', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species', 'subspecies'] results = list() for taxid in taxids: results.append(list()) results[-1].append(str(taxid)) ranks = get_desired_ranks(taxid, desired_ranks) for key, rank in ranks.items(): if rank != '<not present>': results[-1].append( list(ncbi.get_taxid_translator([rank ]).values())[0]) else: results[-1].append(rank) #generate the header header = ['reads', 'Original search', 'Original_query_taxid'] header.extend(desired_ranks) out = [] # print('\t'.join(header)) out.append(header) #print the results for result, reads, term in zip(results, values, lyst): cnt = 0 for i in result: if 'bacter' in i or 'Bacter' in i: result[2] = 'Bacteria' if i == '<not present>': result[cnt] = '' cnt += 1 temp = [reads, term] temp.extend(result) out.append(temp) out = pd.DataFrame(out) return out
"Humphaplotropis culaishanensis" : "Humphaplotropis culaishanensis (nomen nudum)", "Paraglypturus tonganus" : "Paraglypturus tonganus (nomen nudum)", "Hoploplana elisabelloi" : "Hoploplana elisabelloi (nomen nudum)", "Palpitomonas bilix Eukaryota." : "Palpitomonas bilix", "Eukaryota sp. BB2 Eukaryota." : "Eukaryota sp. BB2", "Ancoracysta twista Eukaryota." : "Ancoracysta twista" } # --- load a parser and iterator for our GenBank file gb_handle = gzip.open(sys.argv[1], "r") # -- a parser that will give you back SeqFeature objects feature_parser = GenBank.FeatureParser() iterator = GenBank.Iterator(gb_handle, feature_parser) # load taxonomy for taxids ncbi = NCBITaxa() # output using prefix out_1 = open("%s.cdna.fasta" % sys.argv[2], "w") out_2 = open("%s.codons.tab" % sys.argv[2], "w") strands = [] stop_codons = [] prot_ids = [] excluded = [] missing_id = [] # begin iterating through the file and getting GenBank records while 1: # get a SeqFeature object for the next GenBank record. When we run # out of records in the file, cur_entry will be None
parser = argparse.ArgumentParser() parser.add_argument("--taxons", "-t", type=str, help="File containing the list of species.") parser.add_argument("--output", "-o", type=str, help="Name of the output file in newick") args = parser.parse_args() if not args.output: args.output = "species_tree.nw" # Setting up a local copy of the NCBI taxonomy database and upgrade it ncbi = NCBITaxa() #ncbi.update_taxonomy_database() # Load the species names try: with open(args.taxons, 'r') as taxFile: listTaxa = taxFile.readlines() listTaxa = [x.strip() for x in listTaxa] listTaxa = [x.split(" ") for x in listTaxa] listTaxa = list(set(itertools.chain(*listTaxa))) listTaxa = [x.replace("_", " ") for x in listTaxa] except FileNotFoundError: print("File does not exist") sys.exit(1) # Retrieve TaxId from species names
def test_pick_higher_level():
    ncbi = NCBITaxa()
    higher_level = vica.minhash._pick_higher_level(9606, ncbi)
    eq_(7711, higher_level)
from ete3 import NCBITaxa #http://etetoolkit.org/docs/2.3/tutorial/tutorial_ncbitaxonomy.html ncbi = NCBITaxa() from Bio import SeqIO goodones = {"Panarthropoda"} species = set() goodscafs = {} distribution = {"Panarthropoda": 0} with open("besthits_NR.taxified.out") as infile, open("highertaxa.txt", "w") as outfile: table = infile.read().split("\n") for line in table: if len(line.split("\t")) != 1: line = line.split("\t") if line[1] != "N/A": taxid = line[1] lineage = ncbi.get_lineage(taxid)[2:] names = ncbi.get_taxid_translator(lineage) rank = [names[taxid] for taxid in lineage] # if "Eukaryota" in rank: # rank.remove("Eukaryota") if taxid not in species: species.add(taxid) #orgn = ncbi.get_taxid_translator([taxid])[int(taxid)] #print("{}\t{}".format(orgn, "_".join(rank))) if "Panarthropoda" in rank: orgn = ncbi.get_taxid_translator([taxid])[int(taxid)] outfile.write("{}\t{}\t{}\n".format(line[0], orgn, "_".join(rank))) goodscafs[line[0]] = orgn distribution["Panarthropoda"] += 1 elif "Metazoa" in rank:
def generate_consensus(input_blast_filename, input_fasta_filename, similarity, outdir, target_rank, ud): # read in blast result blast_result = pd.read_table(input_blast_filename, header=None) blast_result.columns = [ 'molecule_id', 'reference_id', 'pid', 'qcovhsp', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'staxids' ] # initiate an instance of ncbi taxonomy database ncbi = NCBITaxa() # retrieve lineage information for each full length 16S molecule desired_ranks = [ 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species' ] ranks = pd.DataFrame(columns=['staxids'] + desired_ranks) blast_result_staxids = blast_result['staxids'].unique() ranks['staxids'] = blast_result_staxids for i in range(0, blast_result_staxids.shape[0]): taxid = blast_result_staxids[i] if not str(taxid).isdigit(): taxid = taxid.split(';')[0] ranks.ix[i, 1:len(desired_ranks) + 1] = get_lineage_at_desired_ranks( taxid, desired_ranks) # merge lineage information with PacBio 16S blast results blast_lineage = blast_result.merge(ranks, on='staxids', how='left') seq_dict = SeqIO.to_dict(SeqIO.parse(input_fasta_filename, 'fasta')) blast_lineage_filename = outdir + '/blast_lineage.tab' blast_lineage.to_csv(blast_lineage_filename, sep='\t', index=False) if target_rank == 'strain': blast_lineage.groupby(['species']).apply(write_taxon_fasta, taxon=target_rank, seq_dict=seq_dict, similarity=similarity, outdir=outdir, ud=ud) else: blast_lineage.groupby([target_rank]).apply(write_taxon_fasta, taxon=target_rank, seq_dict=seq_dict, similarity=similarity, outdir=outdir, ud=ud) if target_rank == 'strain': blast_lineage_strain = retrieve_cluster(blast_lineage_filename, outdir, 'F') taxon_abundance = blast_lineage_strain['strain'].value_counts( ).reset_index() taxon_abundance.columns = ['taxid', 'counts'] taxon_abundance.to_csv(outdir + '/taxon_abundance.csv', index=False) else: blast_lineage_strain = retrieve_cluster(blast_lineage_filename, outdir, 'F') taxon_abundance = blast_lineage[target_rank].value_counts( ).reset_index() taxon_abundance.columns = ['taxid', 'counts'] taxon_abundance.to_csv(outdir + '/taxon_abundance.csv', index=False) return (blast_lineage)
def contig_tax(annot_df, min_prot, prop_annot, tax_thres): '''This function takes the annotation table generated by viral_contig_maps.py and generates a table that provides the taxonomic lineage of each viral contig, based on the corresponding ViPhOG annotations''' ncbi = NCBITaxa() tax_rank_order = ["genus", "subfamily", "family", "order"] contig_list = list(annot_df["Contig"].value_counts().index) df_rows = [] def get_tax_rank(label): try: tax_id = ncbi.get_name_translator([label])[label] tax_rank = ncbi.get_rank(tax_id)[tax_id[0]] except: tax_rank = "" return tax_rank for contig in contig_list: assigned_taxa = [] assigned_taxa.append(contig) contig_df = annot_df[annot_df["Contig"] == contig] filtered_df = contig_df[contig_df["Label"].notnull()] filtered_df = filtered_df.reset_index(drop = True) total_annot_prot = len(filtered_df) if total_annot_prot < max(min_prot, prop_annot * len(contig_df)): assigned_taxa.extend([""]*4) else: filtered_df["Rank"] = filtered_df["Label"].apply(get_tax_rank) for item in tax_rank_order: tax_hits = {} if item == "genus": for row, column in filtered_df.iterrows(): if column["Rank"] == item: if column["Label"] not in tax_hits.keys(): tax_hits[column["Label"]] = 1 else: tax_hits[column["Label"]] += 1 if len(tax_hits) < 1: assigned_taxa.append("") else: annot_ratio = max(tax_hits.items(), key = operator.itemgetter(1))[1]/total_annot_prot if annot_ratio < tax_thres: assigned_taxa.append(str(annot_ratio)) else: max_tax = [] for key,value in tax_hits.items(): if value == max(tax_hits.items(), key = operator.itemgetter(1))[1]: max_tax.append(key) if len(max_tax) > 1: assigned_taxa.append("-".join(max_tax)) else: assigned_taxa.append(max_tax[0]) else: for row, column in filtered_df.iterrows(): if column["Rank"] == item: if column["Label"] not in tax_hits.keys(): tax_hits[column["Label"]] = 1 else: tax_hits[column["Label"]] += 1 else: try: name2taxid = ncbi.get_name_translator([column["Label"]]) label_lineage = ncbi.get_lineage(name2taxid[column["Label"]][0]) lineage_names = ncbi.get_taxid_translator(label_lineage) lineage_ranks = ncbi.get_rank(label_lineage) if item in lineage_ranks.values(): for x,y in lineage_ranks.items(): if y == item: if lineage_names[x] not in tax_hits.keys(): tax_hits[lineage_names[x]] = 1 else: tax_hits[lineage_names[x]] += 1 break except: continue if len(tax_hits) < 1: assigned_taxa.append("") else: annot_ratio = max(tax_hits.items(), key = operator.itemgetter(1))[1]/total_annot_prot if annot_ratio < tax_thres: assigned_taxa.append(str(annot_ratio)) else: max_tax = [] for key,value in tax_hits.items(): if value == max(tax_hits.items(), key = operator.itemgetter(1))[1]: max_tax.append(key) if len(max_tax) > 1: assigned_taxa.append("-".join(max_tax)) else: assigned_taxa.append(max_tax[0]) df_rows.append(assigned_taxa) final_df = pd.DataFrame(df_rows, columns = ["contig_ID", "genus", "subfamily", "family", "order"]) return final_df
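# A hedged usage sketch for contig_tax() above. The column names ("Contig", "Label") follow the
# code; the contig id, label values and thresholds are made-up examples, and the labels must
# resolve against the local NCBI taxonomy database for the rank lookups to work.
import pandas as pd

annot_df = pd.DataFrame({
    "Contig": ["contig_1"] * 4,
    "Label": ["Tequatrovirus", "Tequatrovirus", None, "Straboviridae"],
})

# require at least 2 annotated proteins, half of the proteins annotated, and 60% agreement
lineage_table = contig_tax(annot_df, min_prot=2, prop_annot=0.5, tax_thres=0.6)
print(lineage_table)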
from ete3 import NCBITaxa

ncbi = NCBITaxa()

infile = "accesionsetc/40117.prot.accession2taxid"
pre = "accesionsetc/"
target_rank = 1783270
target_name = 'FCB'
outfile = file(target_name + '_Acc.txt', 'w')

with open(infile, "rb") as data:
    next(data)
    for line in data:
        entry = line.strip().split("\t")
        Taxon = entry[2]
        Version = entry[1]
        try:
            rank_list = ncbi.get_lineage(Taxon)
            if target_rank in rank_list and Version.lower() != "na":
                outfile.write(Version + "\t" + str(Taxon) + "\n")
        except:
            pass
outfile.close()
def tdb_from_hits(hits, minPerc=50, testing=False): ''' Determines the lowest taxonomic level with at least minPerc certainty For every hit: reconstruct the lineage (kingdom, phylum, class, ect.) add a count to every rank in the lineage For every rank: see if the number of hits matching one taxa at that rank is above the minPerc the denominator for this equation is the number of hits that have a phyla rank * Note: this is complicated because some lower ranks don't have higher ranks For example, species [Eubacterium] rectale (taxID 39491) has no genus Also, species [artifical construct] (taxID 32630) has no anything but species ''' from ete3 import NCBITaxa ncbi = NCBITaxa() Levels = [ 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species' ] # generate nested dictionary for levels countDic = {} for level in Levels: countDic[level] = {} # fill in nested dictionary for t in hits['taxID'].tolist(): if t == 0: continue # This try / except thing is trying to catch sporatic errors of: # sqlite3.OperationalError: disk I/O error try: lin = ncbi.get_lineage(t) lin2name = ncbi.get_taxid_translator(lin) name2rank = ncbi.get_rank(lin) except: time.sleep(1) lin = ncbi.get_lineage(t) lin2name = ncbi.get_taxid_translator(lin) name2rank = ncbi.get_rank(lin) for i in lin: rank = name2rank[i] name = lin2name[i] if rank in countDic: countDic[rank][i] = countDic[rank].get(i, 0) + 1 # make the table total = sum(countDic['phylum'].values()) table = { 'tax_ID': [], 'tax_confidence': [], 'tax_level': [], 'taxonomy': [] } count = None for level in Levels: dic = countDic[level] for name in sorted(dic, key=dic.get, reverse=True): count = dic[name] break if count == None: table['tax_ID'].append(None) table['tax_confidence'].append(0) table['tax_level'].append(level) table['taxonomy'].append('unk') else: lin = ncbi.get_lineage(name) lin2name = ncbi.get_taxid_translator(lin) name2rank = ncbi.get_rank(lin) rank2name = {v: k for k, v in name2rank.items()} tax = (lin2name[rank2name[level]]) table['tax_ID'].append(name) table['tax_confidence'].append(((count / total) * 100)) table['tax_level'].append(level) table['taxonomy'].append(tax) count = None tdb = pd.DataFrame(table) # find and mark the best hit best = tdb['tax_ID'][tdb['tax_confidence'] >= minPerc].tolist()[-1] tdb['best_hit'] = [True if i == best else False for i in tdb['tax_ID']] # get the full taxonomy for the best hit tdb['full_tax'] = [lineage_from_taxId(t) if b else False for t, b in zip(\ tdb['tax_ID'], tdb['best_hit'])] return tdb
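# A hedged usage sketch for tdb_from_hits() above (assuming the rest of the module, e.g.
# lineage_from_taxId(), is importable alongside it and the local NCBI taxonomy database exists).
# The taxids are arbitrary examples: three E. coli hits (562, 562, 83333) and one Salmonella
# enterica hit (28901), so Escherichia coli wins down to species rank with 75% confidence.
import pandas as pd

hits = pd.DataFrame({'taxID': [562, 562, 83333, 28901]})
tdb = tdb_from_hits(hits, minPerc=50)
print(tdb[['tax_level', 'taxonomy', 'tax_confidence', 'best_hit']])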
#!/usr/bin/env python import click from ete3 import NCBITaxa import pandas as pd from micone import Lineage NCBI = NCBITaxa() TAX_LEVELS = Lineage._fields TAX_MAP = { "Kingdom": "superkingdom", "Phylum": "phylum", "Class": "class", "Order": "order", "Family": "family", "Genus": "genus", "Species": "species", } def get_lineage(species_name): taxid = NCBI.get_name_translator([species_name])[species_name][0] lineage_taxids = NCBI.get_lineage(taxid) lineage_names = NCBI.get_taxid_translator(lineage_taxids) lineage_ranks = {v: k for k, v in NCBI.get_rank(lineage_taxids).items()} lineage_dict = dict() for tax_level in TAX_LEVELS: try: rank_taxid = lineage_ranks[TAX_MAP[tax_level]] rank_name = lineage_names[rank_taxid]
#!/usr/bin/env python3
import sys

from ete3 import NCBITaxa

ncbi = NCBITaxa()

for taxid in sys.stdin:
    taxid = taxid.strip()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    lineage_s = "; ".join([names[taxid] for taxid in lineage[1:]])
    print("{}\t{}".format(taxid, lineage_s))
def main(argv): #read in taxonomy info for each BUSCO species_taxids = [] #species_taxids[marker_id] = taxid for line in open(sys.argv[1]): tax = line.split('\t')[1].strip('\n') if tax not in species_taxids: species_taxids.append(tax) #initialize NCBI taxdb ncbi = NCBITaxa(sys.argv[2]) #create 2 dicts for ease of lookup #taxid_seqs: {taxid: [seq1, seq2]}. Save every seen taxid and which seqs #seq_taxids = {seq: taxid, seq:taxid} Save every seq taxid_seqs = {} seq_taxids = {} for line in open(sys.argv[1]): line = line.strip('\n') taxid = line.split('\t')[1] if taxid not in taxid_seqs: taxid_seqs[taxid] = [] seq = line.split('\t')[0] taxid_seqs[taxid].append(seq) seq_taxids[seq] = taxid #iterate over idxstats file and save counts #seq_counts[seq] = [readcount, correct_bases, total_bases, seqlen, coverage] seq_counts = {} seen_taxids = [] counter = 0 countfile = open(sys.argv[4]) countfile.readline() for line in countfile: counter += 1 line = line.strip('\n') seq = line.split('\t')[0] count = int(line.split('\t')[1]) correct_bases = int(line.split('\t')[2]) incorrect_bases = int(line.split('\t')[3]) total_bases = int(line.split('\t')[4]) subjlen = int(line.split('\t')[5]) coverage = float(line.split('\t')[6]) seq_counts[seq] = [ count, correct_bases, total_bases, subjlen, coverage ] taxid = seq_taxids[seq] if taxid not in seen_taxids: seen_taxids.append(int(taxid)) if counter == 0: message = "Empty read count file. Likely no aligned reads in sample." print(message) #still have to write stuff f = open(sys.argv[5], 'w') f.write(message + '\n') f.close() f = open(sys.argv[6], 'w') f.write(message + '\n') f.close() sys.exit() #done parsing idxstats file #create NCBI taxon tree of observed taxa + extend to cellular_org tree = ncbi.get_topology(seen_taxids) tree_root = tree.get_tree_root().name lineage = ncbi.get_lineage(tree_root) full_taxids = seen_taxids + lineage full_tree = ncbi.get_topology(full_taxids, intermediate_nodes=True) full_seq_taxids = { line.split('\t')[0]: [ line.split('\t')[1].split(','), line.split('\t')[-1].strip('\n').split(',') ] for line in open(sys.argv[3]) } #full_seq_taxids: {taxid: [[specific buscos], [specific + inherited buscos]]} #determine seq counts #taxid_counts: {taxid: [[marker, readcount, correct_bases, total_bases, seqlen, coverage]]} taxid_counts = {} for seq in seq_counts: taxid = seq_taxids[seq] if taxid not in taxid_counts: taxid_counts[taxid] = [] taxid_counts[taxid].append([ seq, int(seq_counts[seq][0]), int(seq_counts[seq][1]), seq_counts[seq][2], seq_counts[seq][3], seq_counts[seq][4] ]) #write just observed taxid seqs taxon_coverage = {} #taxon_coverage[taxon] = [observed_markers, readcounts, total_bases, percentage_markers, marker_coverage, percent_id ] #dest = open(sys.argv[6], 'w') #dest.write("Name\tNCBI_Rank\tTaxID\tObserved_markers\tRead_counts\tPercent_observed_markers\tMarker_coverage\tPercent_identity\n") for tax in taxid_counts: mc = len(taxid_counts[tax]) counts = 0 bases = 0 correct = 0 total_bases = 0 subj_len = 0 for i in range(0, len(taxid_counts[tax])): counts += taxid_counts[tax][i][1] bases += taxid_counts[tax][i][3] correct += taxid_counts[tax][i][2] total_bases += taxid_counts[tax][i][3] subj_len += taxid_counts[tax][i][4] percent_identity = round((correct / total_bases) * 100, 2) overall_coverage = round((total_bases / subj_len) * 100, 2) total_markers = len(taxid_seqs[tax]) marker_percentage = round(mc / total_markers * 100, 2) name = [ ncbi.get_taxid_translator([tax])[e] for e in ncbi.get_taxid_translator([tax]) ][0] #rank = 
[ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0] #dest.write(name + '\t' # + rank + '\t' # + tax + '\t' # + str(mc) + '\t' # + str(counts) + '\t' # + str(marker_percentage) + '%\t' # + str(overall_coverage) + '%\t' # + str(percent_identity) + '%\n') taxon_coverage[tax] = [ mc, counts, total_bases, marker_percentage, overall_coverage, percent_identity ] #dest.close() dest = open(sys.argv[6], 'w') dest.write( "Name\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\n" ) marker_sorted = sorted(taxon_coverage.keys(), reverse=True, key=lambda x: taxon_coverage[x][3]) for tax in marker_sorted: rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0] name = [ ncbi.get_taxid_translator([tax])[e] for e in ncbi.get_taxid_translator([tax]) ][0] mc = taxon_coverage[tax][0] counts = taxon_coverage[tax][1] marker_percentage = taxon_coverage[tax][3] overall_coverage = taxon_coverage[tax][4] percent_identity = taxon_coverage[tax][5] dest.write(name + '\t' + str(mc) + '\t' + str(counts) + '\t' + str(marker_percentage) + '%\t' + str(overall_coverage) + '%\t' + str(percent_identity) + '%\n') orphan_children = [] #find counts of seqs for internal nodes for node in full_tree.traverse(): if node.is_leaf() == False: if node.name not in taxid_counts: taxid_counts[node.name] = [] for desc in node.iter_descendants(): if desc.name in taxid_counts: for seq in taxid_counts[desc.name]: if seq not in taxid_counts[node.name]: taxid_counts[node.name].append(seq) else: if node.name not in taxid_counts: orphan_children.append(node.name) #print the tree level_counts = [] currspaces = 0 currparent = '' seen_parents = {} dest = open(sys.argv[5], 'w') dest.write( "Markers_Obs\tTotal_Markers\tPercent_Makers_Obs\tPercent_ID\tMarker_read_count\tRank\tName\n" ) for node in full_tree.traverse("preorder"): if node.name not in orphan_children: rank = [ ncbi.get_rank([node.name])[e] for e in ncbi.get_rank([node.name]) ][0] name = [ ncbi.get_taxid_translator([node.name])[e] for e in ncbi.get_taxid_translator([node.name]) ][0] if node.is_root(): currspaces = 0 else: if currparent == '': currparent = node.up.name currspaces += 4 else: if currparent != node.up.name: currparent = node.up.name if currparent in seen_parents: currspaces = seen_parents[currparent] else: currspaces += 4 seen_parents[currparent] = currspaces if node.name in taxon_coverage: pid = str(taxon_coverage[node.name][5]) + '%' else: pid = "NA" #total_buscos buscos = len(taxid_counts[node.name]) seqs = sum([b[1] for b in taxid_counts[node.name]]) total_buscos = len(full_seq_taxids[node.name][1]) percent = round((buscos / total_buscos) * 100, 2) dest.write( str(buscos) + '\t' + str(total_buscos) + "\t" + str(percent) + '%\t' + str(pid) + '\t' + str(seqs) + '\t' + rank + '\t' + ' ' * currspaces + name + '\n') dest.close()
def parse_args(parser): args = parser.parse_args() if args.version: print get_version() sys.exit(0) if not args.no_annot and not pexists(EGGNOGDB_FILE): print colorify( 'Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB): print colorify( 'DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.cpu == 0: args.cpu = multiprocessing.cpu_count() # No --servermode available for diamond if args.mode == 'diamond' and args.servermode: parser.error( '--mode [diamond] and --servermode are mutually exclusive') # Output file required unless running in servermode if not args.servermode and not args.output: parser.error('An output project name is required (-o)') # Servermode implies using mem-based databases if args.servermode: args.usemem = True # Direct annotation implies no searches if args.annotate_hits_table: args.no_search = True args.no_annot = False # Check inputs for running sequence searches if not args.no_search and not args.servermode: if not args.input: parser.error('An input fasta file is required (-i)') # HMM if args.mode == 'hmmer': if not args.db and not args.guessdb: parser.error( 'HMMER mode requires specifying a target database (i.e. -d, --guessdb ))' ) if args.db and args.guessdb: parser.error('-d and --guessdb options are mutually exclusive') if args.guessdb: from ete3 import NCBITaxa ncbi = NCBITaxa() lineage = ncbi.get_lineage(args.guessdb) for tid in reversed(lineage): if tid in TAXID2LEVEL: print tid, TAXID2LEVEL[tid] args.db = TAXID2LEVEL[tid] break # DIAMOND elif args.mode == 'diamond': #if args.db or args.guessdb: # parser.error('diamond mode does not require -d or --guessdb options') pass return args
#!/usr/bin/env python
# Originally from Magpy. Altered to fit this pipeline

import sys
from ete3 import NCBITaxa

# get NCBI taxonomy object
ncbi = NCBITaxa()

if len(sys.argv) == 1:
    print("Please provide a filename")
    sys.exit()

# open the file
checkm_file = snakemake.input[0]
outfile = snakemake.output[0]

# skip three lines
row1 = checkm_file.readline()

# print titles for the output
titles = ["name", "nprots", "nhits", "nfull", "genus", "ngenus", "species", "nspecies",
def run(args): # add lineage profiles/stats import re from ete3 import PhyloTree, NCBITaxa # dump tree by default if not args.tree and not args.info and not args.descendants: args.tree = True ncbi = NCBITaxa() all_taxids = {} all_names = set() queries = [] if not args.search: log.error('Search terms should be provided (i.e. --search) ') sys.exit(-1) for n in args.search: queries.append(n) try: all_taxids[int(n)] = None except ValueError: all_names.add(n.strip()) # translate names name2tax = ncbi.get_name_translator(all_names) all_taxids.update([(v, None) for v in list(name2tax.values())]) not_found_names = all_names - set(name2tax.keys()) if args.fuzzy and not_found_names: log.warn("%s unknown names", len(not_found_names)) for name in not_found_names: # enable extension loading tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy) if tax: all_taxids[tax] = None name2tax[name] = tax name2realname[name] = realname name2score[name] = "Fuzzy:%0.2f" %sim if not_found_names: log.warn("[%s] could not be translated into taxids!" %','.join(not_found_names)) if args.tree: if len(all_taxids) == 1: target_taxid = next(all_taxids.keys()) log.info("Dumping NCBI descendants tree for %s" %(target_taxid)) t = ncbi.get_descendant_taxa(target_taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit, return_tree=True) else: log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids))) t = ncbi.get_topology(list(all_taxids.keys()), intermediate_nodes=args.full_lineage, rank_limit=args.rank_limit, collapse_subspecies=args.collapse_subspecies) id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()]) for n in t.traverse(): n.add_features(taxid=n.name) n.add_features(sci_name=str(id2name.get(int(n.name), "?"))) n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name) lineage = ncbi.get_lineage(n.taxid) n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage))) dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name", "collapse_subspecies", "named_lineage"]) elif args.descendants: log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids))) print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"])) translator = ncbi.get_taxid_translator(all_taxids) ranks = ncbi.get_rank(all_taxids) for taxid in all_taxids: descendants = ncbi.get_descendant_taxa(taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit) print('\t'.join([str(taxid), translator.get(taxid, taxid), ranks.get(taxid, ''), '|'.join(map(str, descendants)), '|'.join(map(str, ncbi.translate_to_names(descendants)))])) elif args.info: print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"])) translator = ncbi.get_taxid_translator(all_taxids) ranks = ncbi.get_rank(all_taxids) for taxid, name in six.iteritems(translator): lineage = ncbi.get_lineage(taxid) named_lineage = ','.join(ncbi.translate_to_names(lineage)) lineage_string = ','.join(map(str, lineage)) print('\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string]))