def download_fasta_and_genbank(identifier, tag, genbank=True, fasta=True):
    """Download the GenBank and/or FASTA record of *identifier* to disk.

    The GenBank record is fetched from NCBI (EUtils, ``nuccore`` database,
    ``gbwithparts`` format) and saved as ``{tag}.gbk``. The FASTA record is
    fetched from ENA and saved as ``{tag}.fa``.

    :param identifier: valid identifier to retrieve from NCBI (genbank) and
        ENA (fasta)
    :param tag: name of the filename (without extension) for the genbank
        and fasta files.
    :param genbank: if True, download the GenBank record from NCBI.
    :param fasta: if True, download the FASTA record from ENA.
    :raises ValueError: if a service answers with an integer status code
        instead of the record itself.
    """

    def _save(data, filename, source):
        # The services return an integer status code on failure.
        if isinstance(data, int):
            if data == 400:
                raise ValueError(
                    "{} not found on {}".format(identifier, source))
            raise ValueError(
                "Could not retrieve {} from {} (status code {})".format(
                    identifier, source, data))
        # Depending on the bioservices version the payload is either bytes
        # or str (the API changed in v1.7.8); normalise to text explicitly
        # instead of relying on a failed write to detect bytes.
        if isinstance(data, bytes):
            data = data.decode()
        with open(filename, "w") as fout:
            fout.write(data)

    if genbank:
        from bioservices import EUtils
        eu = EUtils()
        data = eu.EFetch(db="nuccore", id=identifier, rettype="gbwithparts",
                         retmode="text")
        _save(data, "%s.gbk" % tag, "NCBI")

    if fasta:
        from bioservices import ENA
        ena = ENA()
        data = ena.get_data(identifier, 'fasta')
        _save(data, "%s.fa" % tag, "ENA")
def findName(accession):
    """Return the scientific name of the taxon linked to *accession*.

    The accession is searched in the NCBI ``gene`` database first and, if
    nothing is found there, in the ``protein`` database. The first hit is
    linked to the ``taxonomy`` database, the taxonomy record is downloaded
    and the text of its ``Taxon/ScientificName`` element is returned.

    :param accession: term searched in the ``gene`` then ``protein``
        databases.
    :return: the scientific name, or None (after printing an error message)
        when the accession is found in neither database.
    """
    s = EUtils()

    theID = None
    geneOrProtein = None
    # Check the gene database first and fall back on the protein database.
    for database in ("gene", "protein"):
        res = s.ESearch(database, accession)
        if res["idlist"]:
            theID = res["idlist"][0]
            geneOrProtein = database
            break

    # Couldn't find in either database: there is nothing to link, so stop
    # here instead of querying ELink with an empty identifier.
    if not theID:
        print("ERROR: couldn't find link for %s" % accession)
        return None

    # Get link to the corresponding ID in the Taxonomy database
    link = s.ELink(db="taxonomy", dbfrom=geneOrProtein, id=theID,
                   retmode="json")
    taxID = json.loads(link)["linksets"][0]["linksetdbs"][0]["links"][0]

    # Download taxonomy record and extract the scientific name
    tax = s.EFetch(db="taxonomy", id=taxID)
    tree = ElementTree.fromstring(tax)
    taxonTag = tree.find("Taxon")
    sciName = taxonTag.find("ScientificName")
    return sciName.text
def get_entrez_summary(gene_id):
    """Search *gene_id* in the NCBI ``gene`` database and fetch the hits.

    :param gene_id: term passed to ESearch on the ``gene`` database.
    :return: the EFetch answer for the identifiers found, decoded as UTF-8.
    """
    service = EUtils()
    # First resolve the query into NCBI identifiers ...
    found = service.ESearch('gene', gene_id)
    identifiers = found['idlist']
    # ... then download the matching records in one request.
    payload = service.EFetch('gene', identifiers)
    return payload.decode("utf-8")
class EUtilsTools(object):
    """Simple wrapper around EUtils to fetch basic information about an
    accession number

    ::

        >>> from sequana.databases import EUtilsTools
        >>> et = EUtilsTools()
        >>> et.accession_to_info("K01711.1")
        {'K01711.1': {'accession': '331784',
          'comment': 'Measles virus (strain Edmonston), complete genome',
          'gi': '331784',
          'identifier': 'gi|331784|gb|K01711.1|MEANPCG[331784]',
          'taxid': '11234'}}

    """
    def __init__(self):
        from bioservices import EUtils
        self.eutils = EUtils()

    @staticmethod
    def _item_text(items, name):
        """Return the ``#text`` of the first item whose ``@Name`` is *name*."""
        return [x for x in items if x['@Name'] == name][0]['#text']

    def accession_to_info(self, ids):
        """Return a dictionary of records keyed by accession.

        :param ids: a single accession, a comma-separated string of
            accessions, or a list/tuple of accessions.
        :return: dictionary mapping each requested accession to an
            ``AttrDict`` with the keys ``taxid``, ``accession``,
            ``identifier``, ``gi`` and ``comment``. Records are paired with
            the requested accessions by position.
        """
        if isinstance(ids, str):
            accessions = [x.strip() for x in ids.split(',')]
        else:
            # A list (or tuple) of accessions: query them in one request
            # using the same comma-separated form as the string input.
            accessions = [str(x).strip() for x in ids]
            ids = ",".join(accessions)

        res = self.eutils.EFetch(db="nuccore", id=ids,
                                 rettype="docsum", retmode="dict")
        res = res['eSummaryResult']['DocSum']

        # if one id provided, it will be a dict, otherwise a list of dicts
        if not isinstance(res, list):
            res = [res]

        # now we can loop over all identifiers
        records = {}
        for requested, entry in zip(accessions, res):
            items = entry['Item']
            # strip content after || (no-op when "||" is absent)
            identifier = self._item_text(items, "Extra").split("||")[0]
            record = {
                "taxid": self._item_text(items, "TaxId"),
                'accession': entry['Id'],
                "identifier": identifier,
                'gi': self._item_text(items, "Gi"),
                'comment': self._item_text(items, "Title"),
            }
            records[requested] = AttrDict(**record)
        return records
class KrakenBuilderBase():
    """Download helpers (NCBI FTP and EUtils) used to build a database.

    :param dbname: name of the database; accession FASTA files are written
        to ``{dbname}/library/``.
    """
    def __init__(self, dbname):
        from bioservices import EUtils
        self.dbname = dbname
        self.eutils = EUtils()
        self.enadb = ENADownload()
        # Names accepted as *category* by the download methods; they are
        # used verbatim as directory names on the NCBI FTP site.
        self.category = [
            'archaea', 'bacteria', 'fungi', 'invertebrate',
            # was misspelled 'mitonchondrion', which can never match the
            # directory name on the FTP site
            'mitochondrion',
            'other', 'plant', 'plasmid', 'plastid', 'protozoa',
            'vertebrate_mammalian', 'vertebrate_other', 'viral'
        ]

    def _check_category(self, category):
        """Fail with an AssertionError if *category* is not supported."""
        assert category in self.category, "Please use one of {}".format(
            self.category)

    @staticmethod
    def _retrieve(ftp, remote, local):
        """Download *remote* from the open *ftp* session into file *local*."""
        # The context manager guarantees the local file is flushed and
        # closed even if the transfer fails.
        with open(local, "wb") as fout:
            ftp.retrbinary('RETR ' + remote, fout.write)

    def _download_release_fna(self, category, password, report):
        """Download every ``genomic.fna`` file of a refseq release category.

        *report* is called with the name of each file once downloaded.
        """
        import ftplib

        # FTP objects are context managers: the session is closed on exit.
        with ftplib.FTP("ftp.ncbi.nlm.nih.gov") as ftp:
            ftp.login("anonymous", password)
            ftp.cwd("refseq/release/{}".format(category))
            for filename in ftp.nlst():
                if "genomic.fna" in filename:
                    self._retrieve(ftp, filename, filename)
                    report(filename)

    def download_ncbi_refseq(self, category):
        """Download all files of type *fna* from ncbi FTP.

        The files are written in the current directory and their names are
        printed.

        ::

            kb = KrakenBuilder()
            kb.download_ncbi_refseq("viral")

        :param category: one of the names listed in :attr:`category`.
        """
        self._check_category(category)
        self._download_release_fna(category, "*****@*****.**", print)

    def download_genomes_from_ncbi(self, email, category):
        """This downloads all genomes on ncbi for a given category looking
        at their ftp. This could be highly redundant.

        :param email: used as password of the anonymous FTP login.
        :param category: one of the names listed in :attr:`category`.
        """
        self._check_category(category)
        self._download_release_fna(category, email, logger.info)

    def _download_assembly_report(self, category):
        """Save ``assembly_summary.txt`` of *category* in the current
        directory as ``assembly_summary_{category}.txt``."""
        import ftplib

        self._check_category(category)
        filename = "assembly_summary.txt"
        with ftplib.FTP("ftp.ncbi.nlm.nih.gov") as ftp:
            ftp.login("anonymous", "anonymous")
            ftp.cwd("genomes/refseq/{}".format(category))
            self._retrieve(
                ftp, filename,
                filename.replace(".txt", "_{}.txt".format(category)))
        logger.info(filename)

    def download_accession_from_ncbi(self, accession):
        """Fetch FASTA files from NCBI into ``{dbname}/library/``.

        :param accession: a list of accessions, a single accession, or the
            name of an existing file containing whitespace-separated
            accessions (e.g. one per line).
        :raises TypeError: if *accession* is neither a list nor a string.

        Accessions that cannot be fetched (EFetch returns an integer) are
        reported and skipped.
        """
        import os

        if isinstance(accession, list):
            # Previously this branch left ``accessions`` undefined.
            accessions = accession
        elif isinstance(accession, str):
            if os.path.exists(accession):
                with open(accession, "r") as fin:
                    accessions = fin.read().split()
            else:
                accessions = [accession]
        else:
            raise TypeError(
                "accession must be a list of accessions, an accession or "
                "a filename; got {}".format(type(accession).__name__))

        if not accessions:
            # Nothing to fetch.
            return

        from easydev import Progress
        N = len(accessions)
        pb = Progress(N)
        logger.info("Fetching {} accession fasta files from NCBI".format(N))

        for i, current in enumerate(accessions):
            data = self.eutils.EFetch("nucleotide", rettype="fasta",
                                      id=current, retmode="text")
            if isinstance(data, int):
                # An integer is a status code, not a sequence.
                msg = "Could not fetch this accession: {}. continue".format(
                    current)
                logger.info(msg)
                print(msg)
            else:
                if isinstance(data, str):
                    # The file is opened in binary mode; accept text too.
                    data = data.encode()
                outname = "{}/library/{}.fa".format(self.dbname, current)
                with open(outname, "wb") as fout:
                    fout.write(data)
            pb.animate(i + 1)