def blast_dico2taxonomy_info(blast_dico):
    """Attach a scientific classification to every hit of every query.

    For each query accession, the taxon ids of all its hits are collected and
    resolved with a single call to
    ``sequence_id2scientific_classification.taxon_id2scientific_classification``.

    Args:
        blast_dico: mapping ``query accession -> {hit accession -> records}``
            where ``records[0]['subject_taxid']`` is a string of taxon ids
            separated by ``";"``.

    Returns:
        dict ``query accession -> {hit accession -> [classification, ...]}``
        with one entry per taxon id of the hit, in the order they appear in
        ``subject_taxid``. A taxon id absent from the lookup result is
        represented by ``None``.
    """
    import sequence_id2scientific_classification

    accession2hits_classification = {}
    for n, (accession, hits) in enumerate(blast_dico.items()):
        # %-formatting keeps the progress output identical on Python 2 and 3.
        print("%s %s" % (n, accession))

        # Split each hit's taxid string once and remember the result so the
        # same ids are used both for the lookup and for building the output.
        hit2taxon_ids = {}
        all_taxon_ids = []
        for one_hit_accession, hit_records in hits.items():
            taxon_ids = hit_records[0]['subject_taxid'].split(";")
            hit2taxon_ids[one_hit_accession] = taxon_ids
            all_taxon_ids.extend(taxon_ids)
        print("ids: %s" % (all_taxon_ids,))

        taxon_id2classification = sequence_id2scientific_classification.taxon_id2scientific_classification(
            all_taxon_ids)

        hits_classification = {}
        for one_hit_accession, taxon_ids in hit2taxon_ids.items():
            classifications = []
            for taxon in taxon_ids:
                try:
                    classifications.append(taxon_id2classification[taxon])
                except KeyError:
                    # Keep a placeholder so positions still line up with the
                    # taxon ids of the hit.
                    classifications.append(None)
            hits_classification[one_hit_accession] = classifications
        accession2hits_classification[accession] = hits_classification
    return accession2hits_classification
def accession2full_taxonomic_path(accession_list, database="nucleotide"):
    """Return the scientific classification of the taxa behind some accessions.

    Args:
        accession_list: accessions forwarded to ``accession2taxon_id``.
        database: database name forwarded to ``accession2taxon_id``.

    Returns:
        Whatever
        ``sequence_id2scientific_classification.taxon_id2scientific_classification``
        returns for the collected taxon ids.
    """
    import sequence_id2scientific_classification

    # ``accession2taxon_id`` returns a mapping; only its values are used
    # (presumably accession -> taxon id; confirm against that function).
    # ``list(...)`` guarantees the lookup receives a real list on Python 3,
    # where ``dict.values()`` is only a view, exactly as it did on Python 2.
    taxon_id_list = list(accession2taxon_id(accession_list, database).values())
    return sequence_id2scientific_classification.taxon_id2scientific_classification(
        taxon_id_list)
def _read_entrez_handle(handle):
    """Parse an Entrez result handle with ``Entrez.read`` and always close it."""
    from Bio import Entrez

    try:
        return Entrez.read(handle)
    finally:
        handle.close()


def accession2taxon_rank(accession, rank='phylum'):
    """Look up one taxonomic rank for a nuccore accession through NCBI Entrez.

    The accession is searched in ``nuccore``, the first matching record is
    linked to the ``taxonomy`` database, and the linked taxon id is resolved
    with
    ``sequence_id2scientific_classification.taxon_id2scientific_classification``.

    Args:
        accession: search term passed to ``Entrez.esearch``.
        rank: key to read from the taxon's classification (default
            ``'phylum'``).

    Returns:
        The value stored under ``rank`` for the taxon, or ``False`` when the
        taxon or the rank is missing from the classification data (the
        available data is printed in that case).

    Raises:
        IndexError: propagated unchanged when the search returns no id or the
            record has no taxonomy link.
    """
    from Bio import Entrez
    import sequence_id2scientific_classification

    Entrez.email = "*****@*****.**"

    search_record = _read_entrez_handle(
        Entrez.esearch(db="nuccore", term=accession))
    ncbi_id = search_record['IdList'][0]

    link_record = _read_entrez_handle(
        Entrez.elink(dbfrom="nuccore", db="taxonomy", id=ncbi_id))
    taxon_id = link_record[0]['LinkSetDb'][0]['Link'][0]['Id']

    taxo_data = sequence_id2scientific_classification.taxon_id2scientific_classification(
        [taxon_id])
    try:
        return taxo_data[taxon_id][rank]
    except (KeyError, TypeError):
        # Use .get() for the report: indexing again would raise a second
        # KeyError when the taxon itself (not just the rank) is missing.
        print(accession, ncbi_id, taxo_data.get(taxon_id))
        return False
def _update_taxon_ranks(conn, cursor, taxon_id, classification):
    """Write every rank of *classification* into the row of *taxon_id*."""
    import MySQLdb

    for rank, value in classification.items():
        # Column names cannot be bound as query parameters, so the identifier
        # is quoted by hand: spaces become underscores (as before), backticks
        # are doubled, and '%' is doubled because the driver %-formats the
        # query text with the bound parameters.
        column = rank.replace(' ', '_').replace('`', '``').replace('%', '%%')
        sql = 'UPDATE blastnr_taxonomy SET `' + column + '`=%s where taxon_id=%s'
        params = (value, taxon_id)
        try:
            cursor.execute(sql, params)
            conn.commit()
        except MySQLdb.Error as err:
            # Best effort: report the failing rank and carry on with the rest.
            print('could not insert rank', sql, params, err)


def insert_taxons_into_sqldb(taxon_id_list,
                             chunk_size=300,
                             mysql_host='localhost',
                             mysql_user='******',
                             mysql_pwd='wnkonwn',
                             mysql_db='blastnr'):
    """Insert taxon ids and their classification into ``blastnr_taxonomy``.

    Classifications are first fetched in chunks of ``chunk_size`` ids with
    ``sequence_id2scientific_classification.taxon_id2scientific_classification``.
    Each taxon id is then inserted as a new row and one column per rank is
    filled in. Ids equal to ``'N/A'`` are skipped, and ids already present in
    the table (``IntegrityError`` on insert) are reported and left untouched.
    A taxon missing from the bulk result is fetched once more on its own.

    Args:
        taxon_id_list: taxon ids to insert.
        chunk_size: number of ids per classification request.
        mysql_host: MySQL host.
        mysql_user: MySQL user name.
        mysql_pwd: MySQL password.
        mysql_db: MySQL database name.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    import time
    import MySQLdb
    import sequence_id2scientific_classification

    fetch_classification = sequence_id2scientific_classification.taxon_id2scientific_classification

    conn = MySQLdb.connect(
        host=mysql_host,
        user=mysql_user,
        passwd=mysql_pwd,
        db=mysql_db)
    try:
        cursor = conn.cursor()

        taxid2classification = {}
        # NOTE(review): len() below assumes _chunks returns a sized sequence,
        # not a generator; confirm against its definition.
        id_lists = _chunks(taxon_id_list, chunk_size)
        for i, one_list in enumerate(id_lists):
            print(i, "/", len(id_lists))
            # Pause one minute every 100 chunks (presumably to avoid
            # hammering the remote service; confirm).
            if i % 100 == 0 and i != 0:
                time.sleep(60)
            taxid2classification.update(fetch_classification(one_list))

        print('Number of taxon id retrieved:', len(taxid2classification))
        print('Updating blastnr_taxonomy table with %s new taxons' %
              str(len(taxon_id_list)))

        for taxon_id in taxon_id_list:
            if taxon_id == 'N/A':
                continue

            try:
                # Bound parameter instead of string interpolation.
                cursor.execute(
                    'INSERT INTO blastnr_taxonomy(taxon_id) values (%s)',
                    (taxon_id,))
                conn.commit()
            except MySQLdb.IntegrityError:
                print('Taxon %s already in database' % str(taxon_id))
                continue

            if taxon_id in taxid2classification:
                _update_taxon_ranks(conn, cursor, taxon_id,
                                    taxid2classification[taxon_id])
                continue

            print(
                'Could not add the following taxon: %s, trying again to get the data...'
                % taxon_id)
            temp_dico = fetch_classification([taxon_id])
            try:
                classification = temp_dico[taxon_id]
            except KeyError:
                print('Could not add %s' % taxon_id)
                print("temp_dico", temp_dico)
                continue
            _update_taxon_ranks(conn, cursor, taxon_id, classification)
            print('sucess!')
    finally:
        # The connection was previously never released.
        conn.close()
import argparse import sequence_id2scientific_classification parser = argparse.ArgumentParser() parser.add_argument("-t", '--taxon_id', type=str, help="taxon ncbi id", nargs='+') parser.add_argument("-r", '--rank', action="store_true", help="get taxon rank ") args = parser.parse_args() classif = sequence_id2scientific_classification.taxon_id2scientific_classification( args.taxon_id) #print classif if not args.rank: for accession in classif: data = classif[accession] try: superkingdom = data["superkingdom"] except: superkingdom = "-" try: phylum = data["phylum"] except: phylum = "-" try: class_ = data["class"]