def test_esearchs(self):
    """Get the first batch of 16S GIs.

    This test isn't reliable as NCBI does not always return the same
    GIs as more records are added. Just make sure we get back 10
    GI-like (integer-parseable) ids.
    """
    # only pulling 10 for testing simplicity
    obs = esearch('16S', retmax=10, binsize=3)
    try:
        # Py2 map is eager, so a non-integer id raises immediately.
        # Narrowed from a bare `except:` which would also swallow
        # KeyboardInterrupt/SystemExit.
        map(int, obs)
    except (TypeError, ValueError):
        self.fail("esearch returned non GI-like ids: %r" % (obs,))
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) # if we already have these records, then we do not need to reobtain them if opts.existing_gb: existing_gis = set([l.strip() for l in open(opts.existing_gb)]) else: existing_gis = set([]) if opts.verbose: print "Number of existing GIs: %d" % len(existing_gis) if opts.possible_new_gb_out is None: option_parser.error("Need to specify --possible-new-gb-output") if opts.cached_ids: possible_gis = set([l.strip() for l in open(opts.cached_ids)]) else: #ncbi_record_queries = ['16S','18S','small subunit','rrna[fkey]','ribosomal'] ncbi_record_queries = ['16S AND tm7'] # grab all the ids possible_gis = set([]) for query in ncbi_record_queries: if opts.verbose: cur_size = len(possible_gis) possible_gis.update(esearch(query, retmax=10000000)) if opts.verbose: print "Query %s added %d to set" % (query, len(possible_gis) - cur_size) # drop out any existing ids possible_gis = possible_gis - existing_gis if opts.verbose: print "Total number of GIs to query: %d" % len(possible_gis) chunk_count = 0 total_bytes = 0 if opts.use_gz: poss_output = open_gz(opts.possible_new_gb_out,'w') else: poss_output = open(opts.possible_new_gb_out,'w') collected = set([]) retries = 0 while possible_gis and retries < 100: try: for chunk in bulk_efetch(possible_gis): chunk_count += 1 total_bytes += len(chunk) # Occasionally, and silently, NCBI corrupts records. 
if '<html>' in chunk: if verbose: print "Erroneous record in chunk, disregarding full chunk" continue # pullout the GIs records = [] for l in chunk.splitlines(): if l.startswith('VERSION'): records.append(l.split(':')[1]) if opts.verbose: print "%s - retry: %d, Chunk %d, covering %d records, writing %d bytes, %d written in total" % \ (time.strftime("%m-%d-%y %H:%M:%S"), retries, chunk_count, len(records), len(chunk), total_bytes) poss_output.write(chunk) collected.update(set(records)) except Exception, e: retries += 1 print "Caught exception: ", e possible_gis = possible_gis - collected collected = set([]) possible_gis_at_retry = open('possible_retries_at_retry_%d.txt.gz' % retries, 'w') possible_gis_at_retry.write('\n'.join(possible_gis)) possible_gis_at_retry.close()
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) # if we already have these records, then we do not need to reobtain them if opts.existing_gb: existing_gis = set([l.strip() for l in open(opts.existing_gb)]) else: existing_gis = set([]) if opts.verbose: print "Number of existing GIs: %d" % len(existing_gis) if opts.possible_new_gb_out is None: option_parser.error("Need to specify --possible-new-gb-output") if opts.cached_ids: possible_gis = set([l.strip() for l in open(opts.cached_ids)]) else: # ncbi_record_queries = ['16S','18S','small subunit','rrna[fkey]','ribosomal'] ncbi_record_queries = ["16S AND tm7"] # grab all the ids possible_gis = set([]) for query in ncbi_record_queries: if opts.verbose: cur_size = len(possible_gis) possible_gis.update(esearch(query, retmax=10000000)) if opts.verbose: print "Query %s added %d to set" % (query, len(possible_gis) - cur_size) # drop out any existing ids possible_gis = possible_gis - existing_gis if opts.verbose: print "Total number of GIs to query: %d" % len(possible_gis) chunk_count = 0 total_bytes = 0 if opts.use_gz: poss_output = open_gz(opts.possible_new_gb_out, "w") else: poss_output = open(opts.possible_new_gb_out, "w") collected = set([]) retries = 0 while possible_gis and retries < 100: try: for chunk in bulk_efetch(possible_gis): chunk_count += 1 total_bytes += len(chunk) # Occasionally, and silently, NCBI corrupts records. 
if "<html>" in chunk: if verbose: print "Erroneous record in chunk, disregarding full chunk" continue # pullout the GIs records = [] for l in chunk.splitlines(): if l.startswith("VERSION"): records.append(l.split(":")[1]) if opts.verbose: print "%s - retry: %d, Chunk %d, covering %d records, writing %d bytes, %d written in total" % ( time.strftime("%m-%d-%y %H:%M:%S"), retries, chunk_count, len(records), len(chunk), total_bytes, ) poss_output.write(chunk) collected.update(set(records)) except Exception, e: retries += 1 print "Caught exception: ", e possible_gis = possible_gis - collected collected = set([]) possible_gis_at_retry = open("possible_retries_at_retry_%d.txt.gz" % retries, "w") possible_gis_at_retry.write("\n".join(possible_gis)) possible_gis_at_retry.close()