def correct_erroneous_repres_of_taxon_instances():
    """Rewrite the representative bookkeeping of flagged protein entries.

    Every record returned by
    ``ProteinDatabase.get_erroneous_repres_of_taxon_instances()`` is processed
    in turn. ``instance[1]`` is looked up in the NCBI taxonomy, the value at
    index 5 of the protein entry ``instance[2]`` is looked up as well, and the
    ``representative_of_taxon`` / ``represented_by`` /
    ``taxon_name_representative`` / ``representative_taxon_rank`` columns are
    rewritten through the ``ProteinDatabase`` update methods.

    NOTE(review): the source of this function had lost its indentation. The
    nesting below (in particular which ``if`` each ``else`` belongs to, and
    where the trailing updates sit) is a reconstruction - confirm it against
    the upstream file before relying on it.
    """
    protDB = db_handling.ProteinDatabase()
    ncbi = NCBITaxa()
    erroneous_instances = protDB.get_erroneous_repres_of_taxon_instances()
    for instance in erroneous_instances:
        # instance[1] is used as an NCBI taxid (it is passed to get_lineage).
        lineage = ncbi.get_lineage(instance[1])
        lineage_ranks = ncbi.get_rank(lineage)
        # NOTE(review): the entry is indexed here, before the existence check
        # a few lines below, so a missing entry would fail on this line
        # instead of being skipped - confirm whether that guard is reachable.
        representative_id = protDB.get_protein_entry(instance[2])[5]
        lineage_translation = ncbi.get_taxid_translator(lineage)
        if protDB.get_protein_entry(instance[2]):
            if representative_id != None:
                # Re-reads the same field that was already fetched above.
                representative_id = protDB.get_protein_entry(
                    instance[2])[5]
                print('representative_id', representative_id)
                # representative_id is also treated as a taxid from here on.
                lineage_representative = ncbi.get_lineage(
                    representative_id)
                print('lineage_representative', lineage_representative)
                lineage_representative_ranks = ncbi.get_rank(
                    lineage_representative)
                print('lineage_representative_ranks',
                      lineage_representative_ranks)
                if lineage_ranks[
                        instance[1]] == lineage_representative_ranks[
                            representative_id]:
                    # Same rank on both sides: the side with more children
                    # (as counted by the database) stays the representative.
                    count_instance = protDB.get_count_of_children_of_repres(
                        instance[1])
                    count_representative = protDB.get_count_of_children_of_repres(
                        representative_id)
                    if count_representative >= count_instance:
                        protDB.update_protein_entry(
                            {'representative_of_taxon': None}, instance[1])
                    else:
                        protDB.update_protein_entry_by_repres_by(
                            {'represented_by': instance[1]},
                            representative_id)
                        protDB.update_protein_entry(
                            {'represented_by': instance[1]},
                            representative_id)
                        protDB.update_protein_entry(
                            {'representative_of_taxon': None},
                            representative_id)
                else:
                    # Ranks differ: move the representative flag from
                    # instance[0] to instance[2] and store the name and rank
                    # of instance[1] on representative_id.
                    protDB.update_protein_entry({'representative_of_taxon': None},
                                                instance[0])
                    protDB.update_protein_entry(
                        {'representative_of_taxon': instance[1]}, instance[2])
                    protDB.update_protein_entry(
                        {
                            'taxon_name_representative':
                            lineage_translation[instance[1]]
                        }, representative_id)
                    protDB.update_protein_entry(
                        {'representative_taxon_rank': lineage_ranks[instance[1]]},
                        representative_id)
        # Visual separator between the debug output of consecutive instances.
        print(
            '---------------------------------------------------------------------------------'
        )
def contig_tax(annot_df, ncbi_db, min_prot, prop_annot, tax_thres):
    """Yield one taxonomic-lineage row per contig of the annotation table.

    Each yielded row is a list starting with the contig id, followed by one
    value per rank in ``["genus", "subfamily", "family", "order"]``.

    * If fewer than ``prop_annot * <number of proteins>`` proteins of the
      contig have a ``Best_hit`` other than ``"No hit"``, all four rank
      columns are empty strings.
    * Otherwise the ranks are tried from genus upwards. A rank with fewer
      than ``min_prot`` hits yields ``""``. A rank whose most frequent taxon
      covers less than ``tax_thres`` of the hits yields that proportion. The
      first rank that passes both checks supplies its own name and the names
      of all higher ranks (``""`` where the lineage lacks a rank), and the
      search stops.
    """
    ncbi = NCBITaxa(dbfile=ncbi_db)
    tax_rank_order = ["genus", "subfamily", "family", "order"]

    for contig in set(annot_df["Contig"]):
        row = [contig]
        contig_rows = annot_df[annot_df["Contig"] == contig]
        n_proteins = len(contig_rows)
        n_annotated = sum(contig_rows["Best_hit"] != "No hit")

        if n_annotated < prop_annot * n_proteins:
            row.extend([""] * 4)
            yield row
            continue

        labels = contig_rows[pd.notnull(contig_rows["Label"])]["Label"].values
        hit_taxids = [
            ncbi.get_name_translator([label])[label][0] for label in labels
        ]
        # For every hit: rank -> taxid, restricted to the ranks of interest.
        hit_lineages = []
        for hit_taxid in hit_taxids:
            rank_of = ncbi.get_rank(ncbi.get_lineage(hit_taxid))
            hit_lineages.append({
                node_rank: node
                for node, node_rank in rank_of.items()
                if node_rank in tax_rank_order
            })

        for position, rank in enumerate(tax_rank_order):
            taxa_at_rank = [lineage.get(rank) for lineage in hit_lineages]
            total_hits = sum(pd.notnull(taxa_at_rank))
            if total_hits < min_prot:
                row.append("")
                continue

            tally = Counter(t for t in taxa_at_rank if pd.notnull(t))
            # Ties resolve to the taxon that was seen first.
            top_taxid, top_count = tally.most_common(1)[0]
            share = top_count / total_hits
            if share < tax_thres:
                row.append(share)
                continue

            taxid_at_rank = {
                node_rank: node
                for node, node_rank in ncbi.get_rank(
                    ncbi.get_lineage(top_taxid)).items()
            }
            resolved = []
            for higher_rank in tax_rank_order[position:]:
                node = taxid_at_rank.get(higher_rank)
                if pd.notnull(node):
                    resolved.append(ncbi.get_taxid_translator([node])[node])
                else:
                    resolved.append("")
            row.extend(resolved)
            break

        yield row
def checkTaxId(taxId):
    """Report on stdout whether *taxId* is a species in the NCBI taxonomy.

    Prints the scientific name when the taxid has rank ``species``, a warning
    when it has another rank, and a "not found" warning when the lookup fails
    for any reason (unknown id, value that cannot be converted with ``int``,
    database error). Nothing is returned.

    The rank lookup happens only inside the ``try`` block, so every lookup
    failure ends in the "not found" warning instead of propagating.
    """
    ncbi = NCBITaxa()
    try:
        rank = ncbi.get_rank([taxId])[int(taxId)]
        if rank != 'species':
            print('\033[92mWARNING: rank of %s is not SPECIES (%s)\033[0m' % (taxId, rank))
        else:
            print('\033[92mNCBI taxon info: %s %s\033[0m' % (taxId, ncbi.get_taxid_translator([taxId])[int(taxId)]))
    except Exception:
        # Deliberately broad: any lookup failure is reported as "not found".
        # Unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit pass.
        print('\033[92mWARNING: %s not found in NCBI taxonomy database!\033[0m' % taxId)
def make_krona_table(f, db):
    """Build a Krona-style table from a tab-separated, header-less report.

    ``f`` is read with the columns ``clade_percent, clade_reads, reads, rank,
    taxid, name`` (presumably a Kraken-style report - confirm). Only rows
    with ``reads > 0`` are used. ``db`` is passed to ``NCBITaxa`` when truthy,
    otherwise the default ete3 database is used.

    Returns a DataFrame with the columns ``abundance, superkingdom, phylum,
    class, order, family, genus, species, leaf``, one row per retained input
    row.

    NOTE(review): the indentation of this function was reconstructed from
    flattened source. In particular, whether the final ``leaf`` assignment
    sits inside or after the ``rank != "unclassified"`` branch should be
    confirmed.
    """
    if not db:
        ncbi_taxa = NCBITaxa()
    else:
        ncbi_taxa = NCBITaxa(db)
    krona_table = pd.DataFrame(columns = ["abundance","superkingdom","phylum","class","order","family","genus", "species","leaf"])
    # One-letter rank codes used in the report mapped to NCBI rank names.
    one_letter_ranks = {"D": "superkingdom", "P": "phylum", "C": "class", "O": "order", "F": "family", "G": "genus", "S": "species"}
    df = pd.read_csv(f, header=None, names = ["clade_percent", "clade_reads", "reads", "rank", "taxid", "name"], sep="\t")
    # Keep only taxa that have reads assigned directly to them.
    df = df.loc[df.reads > 0]
    for j, i in enumerate(df.index):
        r = df.loc[i]
        taxid = r["taxid"]
        reads = r["reads"]
        name = r["name"]
        one_letter_rank = r["rank"]
        if one_letter_rank == "-":
            # No rank code in the report: keep the row only when NCBI calls it
            # "no rank" and its direct parent is a species.
            rank = ncbi_taxa.get_rank([taxid])[taxid]
            try:
                parent_taxid = ncbi_taxa.get_lineage(taxid)[-2]
            except IndexError:
                # Lineage has a single element, so there is no parent.
                parent_taxid = taxid
            parent_rank = ncbi_taxa.get_rank([parent_taxid])[parent_taxid]
            if rank == "no rank" and parent_rank == "species":
                rank = "leaf"
            else:
                continue
        elif one_letter_rank == "U":
            rank = "unclassified"
        else:
            try:
                rank = one_letter_ranks[one_letter_rank]
            #TODO: Shouldn't be too many reads mapped directly to ranks not in the krona table, but check eventually
            except KeyError:
                # Rank codes outside the table above are dropped.
                continue
        res = {"abundance": reads, "superkingdom": "", "phylum": "", "class": "", "order": "", "family": "", "genus": "", "species": "", "leaf": ""}
        if rank != "unclassified":
            rank_dict = ncbi_taxa.get_rank(ncbi_taxa.get_lineage(taxid))
            name_dict = ncbi_taxa.get_taxid_translator(ncbi_taxa.get_lineage(taxid))
            # Fill every column whose name matches a rank in the lineage.
            for dict_taxid, dict_rank in rank_dict.items():
                if dict_rank in res.keys():
                    rank_name = name_dict[dict_taxid]
                    res[dict_rank] = rank_name
        # Rows that are not at one of the seven named ranks ("leaf" and
        # "unclassified") carry the report name in the leaf column.
        if not rank in ["superkingdom", "phylum", "class", "order", "family", "genus", "species"]:
            res["leaf"] = name
        # Reorder to the output column order and append as a one-row frame.
        _df = pd.DataFrame(res, index=[j])[krona_table.columns]
        krona_table = pd.concat([krona_table, _df])
    return krona_table
def get_metadata(records: List[SeqRecord]):
    """Collect per-record taxonomy into a DataFrame and save it as CSV.

    The ``organism`` annotation of every record is translated to a taxid (the
    first one returned for that name). For each lineage node whose rank is
    one of superkingdom, order, family, subfamily, genus or species, the
    node's name is stored under that rank. The record id is stored under
    ``"aid"``.

    The table is written to ``metadata.csv`` in the working directory and
    also returned.
    """
    ncbi = NCBITaxa()
    sought_ranks = [
        "superkingdom", "order", "family", "subfamily", "genus", "species"
    ]
    organisms = [record.annotations["organism"] for record in records]
    taxids_by_name = ncbi.get_name_translator(organisms)

    def describe(record):
        # Build one table row for a single record.
        first_taxid = taxids_by_name[record.annotations["organism"]][0]
        lineage = ncbi.get_lineage(first_taxid)
        rank_of = ncbi.get_rank(lineage)
        name_of = ncbi.get_taxid_translator(lineage)
        row = {}
        for node in lineage:
            if rank_of[node] in sought_ranks:
                row[rank_of[node]] = name_of[node]
        row["aid"] = record.id
        return row

    frame = pd.DataFrame([describe(record) for record in records])
    frame.to_csv("metadata.csv")
    return frame
def sort_collection_by_taxon_rank(collection, key, rank='species', rank_id=None):
    """Lift taxids to a common rank and return the items sorted by taxid.

    Parameters
    ----------
    collection : sequence of indexable items
        Each item holds an NCBI taxid at position ``key``.
    key : int
        Index of the taxid field inside every item.
    rank : str
        Target rank. An item whose taxid has a different rank, and whose
        lineage contains a node of this rank, gets its taxid replaced by that
        node.
    rank_id : int or None
        When given, the replacement only happens if the node of the target
        rank equals ``rank_id``.

    Returns
    -------
    tuple
        The (possibly rewritten) items sorted by the field at ``key``.
        Rewritten items are tuples.

    The input ``collection`` is left untouched. The previous implementation
    aliased it and overwrote its elements in place, which silently changed
    the caller's data and failed for tuples.
    """
    ncbi = NCBITaxa()
    remapped = list(collection)
    for position, item in enumerate(collection):
        lineage_ranks = ncbi.get_rank(ncbi.get_lineage(item[key]))
        # Nothing to do when the taxid is unknown to the rank table or is
        # already at the requested rank.
        if item[key] not in lineage_ranks or lineage_ranks[item[key]] == rank:
            continue
        for taxon, taxon_rank in lineage_ranks.items():
            if taxon_rank != rank:
                continue
            if rank_id is None or taxon == rank_id:
                remapped[position] = tuple(
                    taxon if index == key else field
                    for index, field in enumerate(item))
    return tuple(sorted(remapped, key=operator.itemgetter(key)))
def from_taxid(cls, taxid: int) -> "Lineage":
    """
    Build a `Lineage` from an NCBI taxonomy id.

    Parameters
    ----------
    taxid : int
        A valid NCBI taxonomy id

    Returns
    -------
    "Lineage"
        Instance built from the names found for the leading fields of
        ``cls._fields``. Filling stops at the first field that has no
        matching (capitalised) rank in the lineage. NCBI's "superkingdom" is
        exposed under the field name "Kingdom".
    """
    ncbi = NCBITaxa()
    lineage_taxids = ncbi.get_lineage(taxid)
    name_of = ncbi.get_taxid_translator(lineage_taxids)

    # Capitalised rank -> taxid, so rank names can be matched to field names.
    taxid_at_rank = {}
    for node, node_rank in ncbi.get_rank(lineage_taxids).items():
        taxid_at_rank[node_rank.capitalize()] = node
    if "Superkingdom" in taxid_at_rank:
        taxid_at_rank["Kingdom"] = taxid_at_rank.pop("Superkingdom")

    taxa: Dict[str, str] = {}
    for field in cls._fields:
        if field not in taxid_at_rank:
            break
        taxa[field] = name_of[taxid_at_rank[field]]
    return cls(**taxa)
def get_taxonomy(species_name, name_format="Genus species", ranks=None, update_db=False):
    """Look up the NCBI lineage of a species name.

    Parameters
    ----------
    species_name : object
        Species name; converted with ``str``.
    name_format : str
        ``"Genus species"`` (name used as is) or ``"Genus_species"``
        (underscores are replaced by spaces before the lookup).
    ranks : sequence of str or None
        Ranks to report. ``None`` returns the full taxid -> name mapping of
        the lineage.
    update_db : bool
        When true, the local taxonomy database is refreshed first.

    Returns
    -------
    dict or list
        * name not found: ``['unknown']``, or one ``'unknown'`` per requested
          rank when ``ranks`` is given;
        * ``ranks is None``: dict mapping lineage taxid to scientific name;
        * otherwise: the names found for the requested ranks, in the order of
          ``ranks``. A rank that is missing from the lineage contributes no
          element, so the list can be shorter than ``ranks``.
    """
    species_name = str(species_name)
    ncbi = NCBITaxa()
    if update_db:
        ncbi.update_taxonomy_database()
    if name_format == "Genus_species":
        species_name = species_name.replace("_", " ")

    species_id = ncbi.get_name_translator([species_name])
    if not species_id:
        # ``is None`` (not ``== None``) so array-like ``ranks`` work too.
        if ranks is None:
            return ['unknown']
        return ['unknown'] * len(ranks)

    lineage_ids = ncbi.get_lineage(species_id[species_name][0])
    names = ncbi.get_taxid_translator(lineage_ids)
    if ranks is None:
        return names

    lineage_rk = ncbi.get_rank(lineage_ids)
    # ``names`` already holds every lineage name, so no per-rank query needed.
    return [
        names[rk_id]
        for rk in ranks
        for rk_id, rk_rk in lineage_rk.items()
        if rk_rk == rk
    ]
class TaxaRetriever(object):  # tested
    """Collect the species-rank descendants of an NCBI taxon.

    Attributes set by the constructor:

    * ``ncbi``    - the ``NCBITaxa`` handle used for the queries
    * ``species`` - list of descendant taxids of ``category`` (subspecies
      collapsed)
    * ``ranks``   - taxid -> rank mapping for ``species``
    * ``taxas``   - list of the taxids in ``species`` whose rank is 'species'
    """

    def __init__(self, category):
        self.ncbi = NCBITaxa()
        self.species = list(
            self.ncbi.get_descendant_taxa(category, collapse_subspecies=True))
        self.ranks = self.ncbi.get_rank(self.species)
        # A real list instead of ``filter(...)``: on Python 3 ``filter`` is a
        # one-shot iterator, so the attribute was empty after its first use.
        self.taxas = [
            taxid for taxid in self.species if self.ranks[taxid] == 'species'
        ]
def main():
    """Make queries against NCBI Taxa databases"""
    args = get_args()
    ncbi = NCBITaxa()

    # Two verbosity thresholds are used throughout the function.
    chatty = args.verbose > 0
    very_chatty = args.verbose > 1

    if very_chatty:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Refresh the local database only when explicitly requested.
    if args.update is True:
        if very_chatty:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        ncbi.update_taxonomy_database()

    # A name on the command line takes the place of the TaxID.
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if chatty:
        if args.name:
            display_name = args.name
        else:
            # No name given: translate the TaxID back into its name.
            display_name = ncbi.get_taxid_translator([args.taxid
                                                      ])[args.taxid]
        tax_dict = {
            'Name': display_name,
            'TaxID': args.taxid,
            'Rank': ncbi.get_rank([args.taxid]),
            'Lineage': ncbi.get_taxid_translator(
                ncbi.get_lineage(args.taxid)),
        }
        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script: list all taxa below the selected one.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # zip is lazy on Python 3; on Python 2 it builds the full list in memory,
    # so running under Python 3 is recommended for large groups.
    if chatty:
        for taxon_name, taxon_id in zip(descendent_taxa_names,
                                        descendent_taxa):
            print("%s\t%s" % (taxon_name, taxon_id))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            ofh.writelines(str(taxon_id) + '\n' for taxon_id in descendent_taxa)
def from_name2ids(phylum_name, dataset='genbank', return_d2ids=False):
    """
    Retrieve assembly ids and metadata rows for the given taxon names.

    :param phylum_name: one or more taxon names separated by ';'
        (e.g. "Nitrospirae;"); empty fragments are ignored
    :param dataset: prefix of the assembly summary file name
        (``{dataset}_{domain}_assembly_summary.txt`` in ``metadata_files_dir``)
    :param return_d2ids: when true, stop after the taxonomy lookup and return
        the mapping domain name -> descendant taxids
    :return: ``domain2dids`` if ``return_d2ids`` is true, otherwise the tuple
        ``(domain2aids, collect_info)``: lower-cased domain -> first column of
        every matching row, and the list of matching raw rows. A row matches
        when it starts with "GC" and its sixth column is one of the collected
        descendant taxids.
    """
    phylum_names = [name for name in phylum_name.split(';') if name]
    ncbi = NCBITaxa()
    p2tid = ncbi.get_name_translator(phylum_names)
    for _ in phylum_names:
        if not p2tid.get(_):
            print(f" '{_}'' not found. please check the name")
    # First taxid of every name that could be translated.
    tids = [p2tid[name][0] for name in phylum_names if p2tid.get(name)]

    domain2dids = defaultdict(list)
    descend_ids = []
    for tid in tids:
        lineages = ncbi.get_lineage(tid)
        rank2tid = {rank: node for node, rank in ncbi.get_rank(lineages).items()}
        names = ncbi.get_taxid_translator(lineages)
        domain = names[rank2tid['superkingdom']]
        _descend_ids = ncbi.get_descendant_taxa(tid, intermediate_nodes=True)
        descend_ids += _descend_ids
        domain2dids[domain].extend(_descend_ids)
    print(f"in total, {len(descend_ids)} taxids were found. ")
    if return_d2ids:
        return domain2dids

    domain2aids = defaultdict(list)
    collect_info = []
    descend_ids = set(descend_ids)
    for domain in domain2dids:
        d = domain.lower()
        metadata = join(metadata_files_dir,
                        f"{dataset}_{d}_assembly_summary.txt")
        tqdm.write(
            f'read {metadata} which last modified at : {time.ctime(os.path.getmtime(metadata))}'
        )
        # Context manager so the summary file is closed even on errors.
        with open(metadata) as summary:
            for row in tqdm(summary):
                if row.startswith("GC"):
                    rows = row.split('\t')
                    if int(rows[5]) in descend_ids:
                        collect_info.append(row)
                        domain2aids[d].append(rows[0])
    return domain2aids, collect_info
def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file):
    """Write a rank table and an ASCII taxonomy tree for a list of taxon IDs.

    Args:
        mpwt_taxon_file (str): tab-separated input; first column is a label,
            second column an NCBI taxon ID. Rows whose second column contains
            "taxon" are skipped.
        taxon_output_file (str): path of the tab-separated output table (one
            row per input taxon: label, taxid, numbered phylum tag, then the
            names at phylum/class/order/family/genus/species, with
            "no_information" where a rank is missing)
        tree_output_file (str): path of the ASCII tree built from all taxon
            IDs that were read
    """
    ncbi = NCBITaxa()
    rank_columns = ["phylum", "class", "order", "family", "genus", "species"]
    taxon_ids = []
    phylum_count = {}

    with open(taxon_output_file, "w") as phylum_file:
        csvwriter = csv.writer(phylum_file, delimiter="\t")
        csvwriter.writerow(["species", "taxid", "phylum_number"] + rank_columns)

        with open(mpwt_taxon_file, "r") as taxon_file:
            for line in csv.reader(taxon_file, delimiter="\t"):
                if "taxon" in line[1]:
                    continue
                taxon_ids.append(line[1])

                lineage = ncbi.get_lineage(line[1])
                rank_of = ncbi.get_rank(lineage)
                name_of = ncbi.get_taxid_translator(lineage)
                name_at_rank = {
                    rank: name_of[node] for node, rank in rank_of.items()
                }
                ranks = [
                    name_at_rank.get(rank, "no_information")
                    for rank in rank_columns
                ]

                # The tag is the first four letters of the phylum name.
                if ranks[0] == "no_information":
                    phylum = "no_information"
                else:
                    phylum = ranks[0][:4]

                if phylum not in phylum_count:
                    phylum_count[phylum] = 1
                elif phylum == "no_information":
                    # Repeated unknown phyla lose their number suffix.
                    phylum_count[phylum] = ""
                else:
                    phylum_count[phylum] += 1

                csvwriter.writerow(
                    [line[0], line[1], phylum + str(phylum_count[phylum])]
                    + ranks)

    tree = ncbi.get_topology(taxon_ids)
    with open(tree_output_file, "w") as tree_file:
        tree_file.write(tree.get_ascii(attributes=["sci_name", "rank"]))
def taxid2lineage(taxid):
    """Return ``{rank: scientific name}`` for the main ranks of *taxid*.

    Only superkingdom, phylum, class, order, family, genus and species are
    reported, in that order. Ranks that do not occur in the lineage are left
    out of the result.
    """
    ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    # Query the ranks once instead of once per requested rank.
    taxid_at_rank = {rank: node for node, rank in ncbi.get_rank(lineage).items()}
    return {
        rank: names[taxid_at_rank[rank]]
        for rank in ranks
        if rank in taxid_at_rank
    }
def get_lineage_at_desired_ranks(taxid, desired_ranks):
    'Return the lineage taxid at each desired rank (0 where the rank is absent)'
    ncbi = NCBITaxa()
    rank_of_node = ncbi.get_rank(ncbi.get_lineage(taxid))

    # Invert taxid -> rank into rank -> taxid.
    node_at_rank = {}
    for node, node_rank in rank_of_node.items():
        node_at_rank[node_rank] = node

    resolved = []
    for wanted in desired_ranks:
        node = node_at_rank.get(wanted)
        resolved.append(0 if node is None else node)
    return resolved
def check_ancestor(name: str, tax_id: int, rank: str = None) -> bool:
    """Tell whether the taxon called *name* occurs in the lineage of *tax_id*.

    When *rank* is given, only lineage nodes of exactly that rank are
    considered. Raises ``ValueError`` if *name* has no taxonomy id.
    """
    ncbi = NCBITaxa()
    ancestor_ids = ncbi.get_name_translator([name]).get(name, [])
    if not ancestor_ids:
        raise ValueError("No taxonomy id for {}".format(name))

    def has_wanted_rank(node):
        # Without a rank filter every node qualifies.
        return rank is None or ncbi.get_rank([node]).get(node, '') == rank

    return any(
        has_wanted_rank(node) and node in ancestor_ids
        for node in ncbi.get_lineage(tax_id))
def get_rank_dict(taxa_name=None):
    """Return the names of the main ranks in the lineage of *taxa_name*.

    The name is looked up in the NCBI taxonomy. If it is unknown and consists
    of several words, the first word alone (possibly a genus name) is tried.
    Progress and failures are reported on stderr.

    Returns a dict with the keys kingdom, phylum, class, order, family, genus
    and species (``'NA'`` where the lineage has no such rank), or ``None``
    when no taxid could be found.

    Unlike the previous version, ``taxa_name`` is never rebound to the list
    of its words, so an unknown single-word name now returns ``None`` instead
    of failing on the later dictionary lookup.
    """
    ncbi = NCBITaxa()
    name_dict = ncbi.get_name_translator([taxa_name])
    if not name_dict:
        print("can not find taxid for", taxa_name, file=sys.stderr)
        words = taxa_name.split()
        if len(words) > 1:
            # try only the first word (which may be a genus name)
            taxa_name = words[0]
            print("try to search %s instead..." % taxa_name, file=sys.stderr)
            name_dict = ncbi.get_name_translator([taxa_name])
        if not name_dict:
            print("can not find taxid for %s, maybe it's a misspelling.\n" % taxa_name, file=sys.stderr)
            return None

    lineage_taxid_list = ncbi.get_lineage(name_dict[taxa_name][0])
    rank_dict = dict.fromkeys(
        ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'],
        'NA')
    # Two batched queries instead of two queries per lineage node.
    rank_of = ncbi.get_rank(lineage_taxid_list)
    name_of = ncbi.get_taxid_translator(lineage_taxid_list)
    for node in lineage_taxid_list:
        rank = rank_of[node]
        if rank in rank_dict:
            rank_dict[rank] = name_of[node]
    return rank_dict
def get_tax_lineage(taxonid, source):
    """Return taxonomy lineage information.

    Results are cached in the module-level ``LINEAGES`` mapping, so every
    taxon is looked up at most once.

    Parameters
    -------------
    taxonid : string
        Taxonomic id of the species
    source : string
        ``"taxdump"`` uses the local ete3 taxonomy database; any other value
        queries the NCBI taxonomy database through Biopython's Entrez.

    Returns
    -------------
    lineage: dict
        Mapping rank -> scientific name for the lineage of ``taxonid``

    NOTE(review): the Entrez branch retries without limit or delay until a
    non-empty answer arrives; each failure appends one line to ``LOG``.
    Consider bounding the retries.
    """
    if taxonid in LINEAGES:
        return LINEAGES[taxonid]

    if source == "taxdump":
        ncbi_taxdump = NCBITaxa()
        lineage_ids = ncbi_taxdump.get_lineage(taxonid)
        ranks = ncbi_taxdump.get_rank(lineage_ids)
        names = ncbi_taxdump.get_taxid_translator(lineage_ids)
        LINEAGES[taxonid] = {ranks[i]: names[i] for i in lineage_ids}
        return LINEAGES[taxonid]

    data = ""
    while not data:
        try:
            Entrez.email = "*****@*****.**"
            handle = Entrez.efetch(id=taxonid, db="taxonomy", retmode="xml")
            try:
                data = Entrez.read(handle)
            finally:
                # Close the handle even when parsing the answer fails.
                handle.close()
        except Exception:
            # Best effort: record the failure and try again.
            with open(LOG, "a") as log:
                print("Error when searching information about {}".format(taxonid), file=log)

    lineage = {d["Rank"]: d["ScientificName"] for d in data[0]["LineageEx"]}
    # LineageEx lists the ancestors only; add the taxon itself.
    lineage[data[0]["Rank"]] = data[0]["ScientificName"]
    LINEAGES[taxonid] = lineage
    return LINEAGES[taxonid]
def get_taxo(self, function_get_taxid):
    """Attach a ``Taxo`` (taxid, name, rank) to this object as ``self.taxo``.

    ``function_get_taxid`` is called with ``self`` and must return the taxid.
    Name and rank stay ``None`` when the NCBI lookup returns nothing.
    """
    ncbi = NCBITaxa()
    taxid = function_get_taxid(self)

    def lookup(query):
        # Run one of the NCBITaxa lookups; None when nothing is found.
        found = query([taxid])
        return found[int(taxid)] if found else None

    self.taxo = Taxo(taxid,
                     lookup(ncbi.get_taxid_translator),
                     lookup(ncbi.get_rank))
def get_rank(self):
    """ Get the rank of the taxon

    Returns:
        :obj:`str`: rank of the taxon; :obj:`None` when the taxon is not
        itself an NCBI taxon (distance to the nearest NCBI taxon is not 0) or
        when NCBI lists it as 'no rank'
    """
    if self.distance_from_nearest_ncbi_taxon != 0:
        return None

    nearest_id = self.id_of_nearest_ncbi_taxon
    rank = NCBITaxa().get_rank([nearest_id])[nearest_id]
    return None if rank == 'no rank' else rank
def assign_rank_representation(self, rank='species'):
    """Assign a representative taxon of the given rank to protein entries.

    Walks over the records returned by
    ``ProteinDatabase.get_entries_no_representative()``. For each record the
    NCBI lineage of ``entry[1]`` is searched for a node of rank ``rank``; when
    one is found the entry ``entry[0]`` is updated with
    ``representative_of_taxon``, ``representative_taxon_rank`` and
    ``taxon_name_representative``.

    NOTE(review): the indentation of this method was reconstructed from
    flattened source. Confirm in particular where ``insert = False`` and the
    final update sit relative to the surrounding ``if`` statements.
    """
    protDB = db_handling.ProteinDatabase()
    ncbi = NCBITaxa()
    entries_no_representative = protDB.get_entries_no_representative()
    for entry in entries_no_representative:
        taxon_id = entry[1]
        # Record warnings raised during the lineage lookup so they can be
        # inspected below instead of being printed.
        with warnings.catch_warnings(record=True) as w:
            warn_msg = None
            warnings.simplefilter("always")
            lineage = ncbi.get_lineage(taxon_id)
            # Keep the message of the last recorded warning, if any.
            for a in w:
                warn_msg = a.message
            if warn_msg:
                # The last word of the warning is taken as the new taxid
                # (presumably ete3's "taxid X was translated into Y" notice
                # for merged ids - confirm).
                warn_data = str(warn_msg).split()
                taxon_id = int(warn_data[-1])
                protDB.update_protein_entry(
                    {'representative_of_taxon': taxon_id}, entry[0])
        lineage_ranks = ncbi.get_rank(lineage)
        lineage_translation = ncbi.get_taxid_translator(lineage)
        insert = True
        # '' marks "no node of the requested rank in this lineage".
        ellected_rank_id = ''
        for rank_id, lineage_rank in lineage_ranks.items():
            if rank == lineage_rank:
                ellected_rank_id = rank_id
        print(entry[0])
        if lineage_ranks[taxon_id] != rank:
            # The taxon itself is at another rank: switch to the lineage node
            # of the requested rank unless bigger_than_rank_taxon() objects.
            if not self.bigger_than_rank_taxon(
                    lineage_ranks[taxon_id],
                    rank) and ellected_rank_id != '':
                protDB.update_protein_entry(
                    {
                        'representative_of_taxon': ellected_rank_id,
                        'representative_taxon_rank': rank,
                        'taxon_name_representative':
                        lineage_translation[ellected_rank_id]
                    }, entry[0])
                insert = False
        # Entries without a value at index 2 that were not handled above.
        if entry[2] == None and insert and ellected_rank_id != '':
            protDB.update_protein_entry(
                {
                    'representative_of_taxon': ellected_rank_id,
                    'representative_taxon_rank': rank,
                    'taxon_name_representative':
                    lineage_translation[ellected_rank_id]
                }, entry[0])
def taxid_to_lineage_string(taxid):
    """Format the lineage of *taxid* as ``k_Name;d_Name;p_Name;...``.

    Each element is the first letter of the level, an underscore and the
    scientific name. Levels are emitted in the order kingdom, domain, phylum,
    class, order, family, genus, species, and only those that occur in the
    lineage. NCBI's 'superkingdom' is reported as 'domain'. Returns an empty
    string when no level matches.
    """
    tax_order = ['kingdom', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    # One batched rank query instead of one per (level, taxid) pair.
    ranks = ncbi.get_rank(lineage)

    parts = []
    for level in tax_order:
        for tid in names:
            rank = ranks[tid]
            if rank == 'superkingdom':
                rank = 'domain'
            if rank == level:
                parts.append(level[0] + '_' + names[tid])
    return ';'.join(parts)
def taxid_to_lineage(taxid):
    """
    Map every rank in the module-level ``desired_ranks`` to the taxid found at
    that rank in the lineage of the given taxid.

    :param taxid: NCBI taxonomy id
    :return: dict rank -> taxid, with None for ranks missing from the lineage
    """
    ncbi = NCBITaxa()
    taxid_at_rank = {}
    for node, node_rank in ncbi.get_rank(ncbi.get_lineage(taxid)).items():
        taxid_at_rank[node_rank] = node
    return {wanted: taxid_at_rank.get(wanted) for wanted in desired_ranks}
def get_ncbi_taxa_rank(taxa_name):
    """Return ``(rank, taxid)`` for a taxon name, or ``("N/A", "N/A")``.

    When the name maps to several taxids, the last one returned by the
    translator is used.
    """
    ncbi = NCBITaxa()
    name2taxid = ncbi.get_name_translator([taxa_name])
    if not name2taxid:
        return ("N/A", "N/A")

    ncbi_taxid = name2taxid[taxa_name][-1]
    rank = ncbi.get_rank([ncbi_taxid])[ncbi_taxid]
    return (rank, ncbi_taxid)
def get_taxonomic_group_mapping(group_ids: List[str],
                                selected_rank: str) -> Tuple[Dict, Dict]:
    """
    Group NCBI taxon ids by their ancestor at a chosen rank, e.g. to split
    records into training and validation sets.

    :param group_ids: List of identifiers that should be NCBI taxon ids
    :param selected_rank: standard rank at which the ids are grouped; any
        other value is replaced by the result of ``auto_select_rank``
    :return: two dicts keyed by the input taxon ids: the name of the ancestor
        at the selected rank ("unknown" if the lineage has none) and an
        integer group number per distinct ancestor
    """
    ncbi = NCBITaxa()
    standard_ranks = [
        "superkingdom", "phylum", "class", "order", "family", "genus",
        "species"
    ]
    if selected_rank.lower() not in standard_ranks:
        selected_rank = auto_select_rank(group_ids)

    def ancestor_at_selected_rank(taxon):
        # 0 is the fall-back for lineages without a node at this rank.
        ancestor = 0
        rank_by_id = ncbi.get_rank(ncbi.get_lineage(int(taxon)))
        for ancestor_id, rank in rank_by_id.items():
            if rank == selected_rank:
                ancestor = ancestor_id
        return ancestor

    taxon_ancestor_mapping = {
        taxon: ancestor_at_selected_rank(taxon) for taxon in set(group_ids)
    }

    ancestor_ids = set(taxon_ancestor_mapping.values())
    ancestor_names = ncbi.get_taxid_translator(ancestor_ids)
    ancestor_names[0] = "unknown"
    group_number = {
        ancestor_id: number for number, ancestor_id in enumerate(ancestor_ids)
    }

    group_name_mapping = {}
    group_id_mapping = {}
    for taxon in group_ids:
        ancestor = taxon_ancestor_mapping[taxon]
        group_name_mapping[taxon] = ancestor_names[ancestor]
        group_id_mapping[taxon] = group_number[ancestor]
    return group_name_mapping, group_id_mapping
def get_ncbi_taxonomy(taxid):
    """Return the ranked lineage of *taxid* as ``;Name1;Name2;...``.

    Lineage nodes whose rank is "no rank" are skipped. Every name is preceded
    by a semicolon, so a non-empty result starts with ";".
    """
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    ranks = ncbi.get_rank(lineage)
    ranked_nodes = (node for node in lineage if ranks[node] != "no rank")
    return "".join(";" + names[node] for node in ranked_nodes)
class NCBIController:
    """Small convenience wrapper around one ``NCBITaxa`` handle."""

    def __init__(self):
        self.ncbi = NCBITaxa()

    def translate(self, taxid):
        """
        :ret scientific name
        """
        names = self.ncbi.get_taxid_translator([taxid])
        return names[taxid]

    def get_lineage(self, taxid, rank_lst=None):
        """Return ``{rank: taxid, rank + "_s": name}`` for the lineage nodes
        whose rank is in ``rank_lst`` (default: the seven main ranks).

        An empty dict is returned when a lookup raises KeyError or ValueError.
        """
        if rank_lst is None:
            wanted = [
                "superkingdom", "phylum", "class", "order", "family", "genus",
                "species"
            ]
        else:
            wanted = rank_lst

        found = {}
        try:
            rank_of = self.ncbi.get_rank(self.ncbi.get_lineage(taxid))
            for node, rank in rank_of.items():
                if rank not in wanted:
                    continue
                found[rank] = node
                found[rank + "_s"] = self.translate(node)
        except (KeyError, ValueError):
            # unknown taxid
            return dict()
        return found

    def get_descendant(self, taxid, rank):
        """Return the descendants of ``taxid`` (down to genus) that have
        exactly the rank ``rank``."""
        children = self.ncbi.get_descendant_taxa(taxid, rank_limit="genus")
        rank_of = self.ncbi.get_rank(children)
        return [node for node, node_rank in rank_of.items() if node_rank == rank]
def get_off_target_last_common_taxon_rank(df, target_rank, target_taxon):
    """Return the rank of the last common taxon of two taxa.

    The two taxa are ``df.loc[target_rank]`` and ``target_taxon``. The result
    is 'no rank' when either of them is 0, when ``df.loc[target_rank]`` is
    null, or when the common taxon itself has no rank.
    """
    ncbi = NCBITaxa()
    off_target_taxon = df.loc[target_rank]

    if not ((target_taxon != 0) & (off_target_taxon != 0)):
        return 'no rank'
    if pd.isnull(off_target_taxon):
        return 'no rank'

    # The root of the minimal topology spanning both taxa.
    last_common_taxon = ncbi.get_topology([off_target_taxon, target_taxon])
    if last_common_taxon.rank == 'no rank':
        return 'no rank'

    lineage = ncbi.get_lineage(last_common_taxon.taxid)
    tip = lineage[-1]
    return ncbi.get_rank([tip])[tip]
def compute_taxid_paths(unique_tax_id_hash, ):
    """Turn weighted taxids into lineage paths and per-node display values.

    ``unique_tax_id_hash`` maps a taxid (anything ``int()`` accepts) to a
    weight. Ids that raise ``ValueError`` during processing are skipped.

    Returns a tuple ``(pathways, tax_name_ctr, path_output)``:

    * ``pathways`` - one list of ``"<taxid>:<name>"`` labels per input taxid
    * ``tax_name_ctr`` - label -> ``[summed weight, node size, status]``; the
      size is ``sqrt(weight) * 1.26`` with status "passed" for weights up to
      1000, otherwise size 40 with status "failed"
    * ``path_output`` - text dump of the rank mapping and lineage of every
      processed taxid
    """
    ncbi = NCBITaxa(NCBITaxaDbFile)
    max_scalable_hits = 1000
    max_value = 40

    path_output = ""
    pathways = []
    tax_name_ctr = {}

    for tax_id in unique_tax_id_hash:
        # Safe mode: the tax id may be a string that cannot be parsed.
        try:
            weight = unique_tax_id_hash[tax_id]
            lineage = ncbi.get_lineage(int(tax_id))

            # Text output for CopraRNA.
            path_output += str(ncbi.get_rank(lineage)) + "\n" + str(lineage) + "\n\n"

            names = ncbi.get_taxid_translator(lineage)
            node_labels = []
            for node in lineage:
                label = str(node) + ":" + str(names[node])
                entry = tax_name_ctr.get(label)
                if entry is None:
                    # [summed weight, node size, status]
                    tax_name_ctr[label] = [weight, 0, 0]
                else:
                    entry[0] += weight
                node_labels.append(label)

            # Re-normalise the size of every node seen so far.
            for entry in tax_name_ctr.values():
                if entry[0] <= max_scalable_hits:
                    entry[1] = math.sqrt(float(entry[0])) * 1.26
                    entry[2] = "passed"
                else:
                    entry[1] = max_value
                    entry[2] = "failed"

            pathways.append(node_labels)
        except ValueError:
            pass

    return pathways, tax_name_ctr, path_output
def _add_normscore(scores):
    """Fill the 'normscore' layer of *scores* in place with the min-max
    normalised 'score' values, separately for every curated sequence."""
    for seq in scores.curateseq.values:
        raw = scores.loc[:, seq, 'score']
        lowest = raw.min()
        scores.loc[:, seq, 'normscore'] = (raw - lowest) / (raw.max() - lowest)


def build_seqxds(vpwxracc_fpathstr, dbpathstr, vpwxrafull_fpathstr=None):
    '''Build an xarray Dataset from stored score arrays plus taxonomy ids.

    vpwxracc_fpathstr   -- path of the DataArray stored as 'vpwxra_cc'
    dbpathstr           -- path of the sqlite database whose PROTEINGBS table
                           provides a taxid for every accession in the
                           'dbseq' coordinate
    vpwxrafull_fpathstr -- optional path of a second DataArray, stored as
                           'vpwxra_full'

    Both score arrays get a min-max normalised 'normscore' layer. The
    returned Dataset additionally holds 'taxra' (dbseq x ranks), containing
    the NCBI taxid found at superkingdom/phylum/class/order/family/genus/
    species for each accession, NaN where unknown.
    '''
    vpwxra_cc = xr.open_dataarray(vpwxracc_fpathstr)
    _add_normscore(vpwxra_cc)
    mergeds = xr.Dataset(data_vars={'vpwxra_cc': vpwxra_cc})

    if vpwxrafull_fpathstr is not None:
        vpwxra_full = xr.open_dataarray(vpwxrafull_fpathstr)
        _add_normscore(vpwxra_full)
        mergeds['vpwxra_full'] = vpwxra_full

    taxra = xr.DataArray(
        np.full((len(vpwxra_cc.dbseq), 7), np.nan),
        coords=[vpwxra_cc.dbseq,
                ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']],
        dims=['dbseq', 'ranks'])

    ncbitaxa = NCBITaxa()
    conn = seqdbutils.gracefuldbopen(dbpathstr)
    try:
        conn.row_factory = sqlite3.Row
        c = conn.cursor()
        for accentry in vpwxra_cc.dbseq:
            acc = accentry.data.item(0)
            c.execute('''SELECT * FROM PROTEINGBS WHERE acc=(?)''', (acc, ))
            row = c.fetchone()
            taxid = row['taxid']
            if taxid is None:
                continue
            rankdict = ncbitaxa.get_rank(ncbitaxa.get_lineage(taxid))
            # rank -> taxid for this accession's lineage
            taxid_at_rank = {rank: node for node, rank in rankdict.items()}
            for rank, node in taxid_at_rank.items():
                if rank in taxra.ranks:
                    taxra.loc[acc, rank] = node
    finally:
        # Release the database even when a lookup above fails.
        conn.close()

    mergeds['taxra'] = taxra
    return mergeds
def determine_unassigned_rank(taxid):
    """
    Given a taxid, will use ete3 to look at all its descendants. Based on what it finds, will infer what taxonomic
    level the taxid should be at: one level above the highest-ranking descendant. Useful for things that have
    'no rank' according to NCBI.

    If the highest-ranking descendant is already at the top of tax_order, that top level is returned (the previous
    implementation wrapped around to 'species' in that case).

    :param taxid: NCBI taxid, should be an integer
    :return: string that says what taxonomy level we're at, one of the options from tax_order
    :raises IndexError: if no descendant has a rank listed in tax_order
    """
    tax_order = ['kingdom', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    ncbi = NCBITaxa()
    descendants = ncbi.get_descendant_taxa(taxid, intermediate_nodes=True)
    # One batched rank query instead of one query per descendant.
    descendant_ranks = ncbi.get_rank(descendants)
    rank_positions = [
        tax_order.index(descendant_ranks[descendant])
        for descendant in descendants
        if descendant_ranks[descendant] in tax_order
    ]
    if not rank_positions:
        raise IndexError('taxid %s has no descendant with a rank in %s' % (taxid, tax_order))
    # Clamp at 0 so the index cannot wrap around to the end of the list.
    return tax_order[max(min(rank_positions) - 1, 0)]
# Print, for every taxid in the last tab-separated column of the input file,
# one output line per main rank found in its lineage:
#     <input line> <TAB> <rank> <TAB> <scientific name>
# Input lines whose taxid cannot be resolved are written to
# 'taxa-ids-not-found.txt'.
if len(sys.argv) == 1:
    sys.exit("USAGE: python %s <path/to/ncbi_gi_taxid_file> > <output.txt>" % sys.argv[0])

ncbi = NCBITaxa()
# Call ncbi.update_taxonomy_database() here to refresh the local taxonomy.

hier = ["superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species"]
missing = []

# Both files are closed on exit, including after an error.
with open('taxa-ids-not-found.txt', 'w') as fp, open(sys.argv[1]) as taxid_file:
    for x in taxid_file:
        dat = x.rstrip().split('\t')[-1]
        try:
            lineage = ncbi.get_lineage(dat)
            names = ncbi.get_taxid_translator(lineage)
            ranks = ncbi.get_rank(lineage)
            # Keep only the lineage nodes that sit at one of the main ranks.
            new_ranks = {}
            for keys in ranks:
                if ranks[keys] in hier:
                    new_ranks[keys] = ranks[keys]
            d = {}
            for taxid in lineage:
                if taxid in new_ranks:
                    d[new_ranks[taxid]] = names[taxid]
            for key in sorted(d):
                # Single parenthesised argument: prints the same text on
                # Python 2 and Python 3.
                print(x.rstrip() + "\t" + str(key) + "\t" + d[key])
        except ValueError:
            # ete3 raises ValueError for taxids it does not know.
            missing.append(x.rstrip())
    fp.write('\n'.join(missing))
#!/usr/bin/python
Usage = """
Print taxid's lineage and ranks
by default prints to the stdout
Usage:
taxid_ranks.py taxid > ouput.txt
Arun Seetharam
[email protected]
taxid_ranks.py -version 1.0 04/13/2017
"""
from ete3 import NCBITaxa
import sys

if len(sys.argv) < 2:
    # No taxid given: show the help text without opening the database.
    print(Usage)
else:
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage((sys.argv[1]))
    names = ncbi.get_taxid_translator(lineage)
    for taxid in lineage:
        # One line per lineage node: "[{taxid: rank}] [name]". Formatting into
        # a single string keeps the output identical on Python 2 and 3.
        print("%s %s" % ([ncbi.get_rank([taxid])], [names[taxid]]))
def run(args):
    """Entry point of the NCBI taxonomy query tool.

    ``args.search`` holds taxids and/or taxon names. Names are translated to
    taxids (optionally with fuzzy matching when ``args.fuzzy`` is set). One of
    three reports is then produced:

    * ``args.tree`` (default when no mode is selected): dump the descendants
      tree of a single taxid, or the minimal topology joining several taxids,
      with taxid / scientific name / named lineage attached to every node;
    * ``args.descendants``: one tab-separated line per taxid listing its
      descendants;
    * ``args.info``: one tab-separated line per taxid with name, rank and
      lineage.

    The process exits with status -1 when no search term is given.
    """
    from ete3 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    # dict used as an ordered set of taxids
    all_taxids = {}
    all_names = set()

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names; every name maps to a *list* of taxids
    name2tax = ncbi.get_name_translator(all_names)
    for taxids in name2tax.values():
        for taxid in taxids:
            all_taxids[taxid] = None

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        resolved = set()
        for name in not_found_names:
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = [tax]
                resolved.add(name)
                log.info("Fuzzy match for %s: %s (Fuzzy:%0.2f)", name, realname, sim)
        # Only names that are still untranslated are reported below.
        not_found_names = not_found_names - resolved

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            # dict views are not iterators on Python 3
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid,
                                         collapse_subspecies=args.collapse_subspecies,
                                         rank_limit=args.rank_limit,
                                         return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid,
                                                   collapse_subspecies=args.collapse_subspecies,
                                                   rank_limit=args.rank_limit)
            print('\t'.join([str(taxid),
                             # str(): the fallback for unknown ids is the int taxid
                             str(translator.get(taxid, taxid)),
                             ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))
    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''),
                             named_lineage, lineage_string]))