def get_random_ncbi_ids_set(count, db, keyword_search):
    """Fetch a random sample of NCBI ids from an Entrez database.

    Args:
        count: number of random ids wanted.
        db: Entrez database name (e.g. "pubmed").
        keyword_search: Entrez query term used to select candidate ids.

    Returns:
        Whatever get_random_ids() yields: `count` ids sampled from the
        esearch result list.

    Raises:
        ConfigurationError: if config.ENTREZ_EMAIL is not set (NCBI
        requires a contact email for E-utilities requests).
    """
    logging.info('Fetching {} NCBI IDs for the {} database'.format(count, db))
    if not config.ENTREZ_EMAIL:
        # Fixed typo in the user-facing message: "environnment" -> "environment".
        msg = (
            "ENTREZ_EMAIL environment variable undefined. "
            "You must define one with a valid email to use this program."
        )
        logging.error(msg)
        raise ConfigurationError(msg)
    Entrez.email = config.ENTREZ_EMAIL
    # Random offset into the result set so repeated calls sample different
    # windows. NOTE(review): if the search matches fewer than ~10000 records
    # the offset can fall past the end and yield an empty IdList — confirm
    # callers tolerate that.
    retstart = random.randint(1, 10000)
    # Over-fetch 10x so get_random_ids() has a pool to sample from.
    retmax = count * 10
    handle = Entrez.esearch(
        db=db, retmax=retmax, retstart=retstart, term=keyword_search
    )
    pub_search = Entrez.read(handle)
    handle.close()
    return get_random_ids(pub_search['IdList'], count)
def search_info(self, term, options=''):
    """Run an Entrez nucleotide search and return its summary metadata.

    Args:
        term: Entrez query string.
        options: extra query text appended verbatim to `term`.

    Returns:
        dict with keys "Count" (total matches), "IdList" (first batch of
        ids), "ReturnMax" (size of IdList), and the history-server tokens
        "WebEnv" / "QueryKey" for follow-up efetch calls.
    """
    Entrez.email = self.email
    # usehistory="y" asks the server to cache the result set so later
    # efetch calls can page through it via WebEnv/QueryKey.
    search_handle = Entrez.esearch(db="nucleotide", term=term + options,
                                   usehistory="y")
    search_results = Entrez.read(search_handle)
    search_handle.close()
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    gi_list = search_results["IdList"]
    retmax = int(search_results["RetMax"])
    count = int(search_results["Count"])
    # Sanity check: the server must return exactly RetMax ids.
    assert retmax == len(gi_list)
    # Renamed from `dict`, which shadowed the builtin.
    result = {"Count": count, "IdList": gi_list, "ReturnMax": retmax,
              "WebEnv": webenv, "QueryKey": query_key}
    return result
def fetch_ncbi(ID, db, fmt='fasta', output_path='.'):
    """Fetch one NCBI entity and write it to <output_path>/<fmt>/<db>/<ID>.<fmt>.

    Args:
        ID: NCBI ID
        db: Entrez database name.
        fmt: file format (also used as the efetch rettype).
        output_path: path to output downloaded files
    """
    logging.info('Fetching NCBI ID {} with format {}...'.format(ID, fmt))
    filename = '{id}.{fmt}'.format(id=ID, fmt=fmt)
    db_fmt_path = os.path.join(output_path, fmt, db)
    output_file = os.path.join(db_fmt_path, filename)
    os.makedirs(db_fmt_path, exist_ok=True)
    handle = Entrez.efetch(db=db, id=ID, rettype=fmt, retmode="text")
    try:
        response = handle.read()
    finally:
        # Original code leaked the efetch handle; always close it.
        handle.close()
    with open(output_file, 'w') as f:
        f.write(response)
def search(self, organism='', options='', batch_size=50, progress=True):
    """Search the Entrez nucleotide db and return parsed TinySeq records.

    Results are downloaded in batches to a local cache file
    '<organism>.tseqxml'; if that file already exists the download is
    skipped and the cached copy is parsed instead. Returns None when the
    search matches nothing, otherwise a list of BeautifulSoup '<tseq>'
    tags.
    """
    Entrez.email = self.email
    # usehistory="y": server caches the result set; WebEnv/QueryKey below
    # let efetch page through it batch by batch.
    search_handle = Entrez.esearch(db="nucleotide", term=organism+options, usehistory="y")
    search_results = Entrez.read(search_handle)
    search_handle.close()
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    gi_list = search_results["IdList"]
    retmax = int(search_results["RetMax"])
    count = int(search_results["Count"])
    if count == 0:
        # No hits: caller receives None.
        return
    assert retmax == len(gi_list)
    ### Download data referred to in the previous search in batches
    tinyseqxml = organism + '.tseqxml'
    if not os.path.isfile(tinyseqxml):
        # progressbar widgets; unit='r' = records per second.
        widgets = [tinyseqxml + "\t", Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', ETA(), ' ', FileTransferSpeed(unit='r')]
        out_handle = open(tinyseqxml, "w")
        # term_width is re-derived from self.columns so the bar fits the
        # current terminal, minus a 20-char margin.
        pbar = ProgressBar(widgets=widgets, maxval=count, term_width=int(self.columns) - 20)
        for start in range(0, count, batch_size):
            #end = min(count, start + batch_size)
            #print("Going to download record %i to %i" % (start+1, end))
            # Fetch one batch from the history server as TinySeq XML.
            fetch_handle = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="xml", retstart=start, retmax=batch_size, webenv=webenv, query_key=query_key)
            data = fetch_handle.read()
            fetch_handle.close()
            out_handle.write(data)
            status = (start + 1) / float(count)
            # seconds_elapsed == 0 means the bar has not started yet.
            if progress and pbar.seconds_elapsed == 0:
                pbar.start()
            if status > 0.001 and progress:
                # Track terminal resizes between updates.
                pbar.term_width = int(self.columns) - 20
                pbar.update(status * count)
        if progress:
            pbar.finish()
        out_handle.close()
    ## At this point, file may be a stream of multiple XML documents.
    ### Do not parse as one single XML document.
    ### Beautiful Soup PUNCH! (Think Donkey Kong)
    start = CURRENT_MILLI_TIME()
    soup = BeautifulSoup(open(tinyseqxml, "r"), "lxml")
    end = CURRENT_MILLI_TIME()
    print "Finished Parsing XML in " + str(end - start) + " ms"
    return soup.find_all('tseq')
def server_info(self):
    """Return the Entrez einfo record describing the nucleotide database."""
    Entrez.email = self.email
    info_handle = Entrez.einfo(db="nucleotide")
    info_record = Entrez.read(info_handle)
    info_handle.close()
    return info_record
def fetch_count_from_api(term):
    """Return the number of PubMed records matching `term`.

    Args:
        term: Entrez query string.

    Returns:
        int: the "Count" field of the esearch response.

    Original defects fixed: the `except Exception, e: raise e` clause was
    a no-op that only destroyed the original traceback; the handle was
    never closed; and the function never returned the count it fetched.
    Exceptions from Entrez now simply propagate unchanged.
    """
    handle = Entrez.esearch(db="pubmed", term=term, rettype="count")
    try:
        search_result = Entrez.read(handle)
    finally:
        handle.close()
    return int(search_result["Count"])
# -*- coding: utf-8 -*- """ Created on Fri Mar 18 11:18:38 2016 @author: tanfan.zjh """ from Bio_Eutils import Entrez, Medline Entrez.email = "*****@*****.**" # Search for PMIDs from author text based search handle = Entrez.esearch(db="pubmed", retmax=100000, term="Arabidopsis Thaliana") pub_search = Entrez.read(handle) handle.close() print pub_search ''' # Fetch matching entries handle = Entrez.efetch(db='pubmed', id=pub_search['IdList'], retmax=20, rettype="medline", retmode="text") pub_items = Medline.parse(handle) # Work with it for pub_item in pub_items: print "*" * 10 print "%s - %s." % ( pub_item.get("TI","?"), ", ".join(pub_item.get("AB","?")) ) handle.close() '''