def download_wgs_for_record(record, config):
    """Download all WGS records referenced by a record.

    The ID range is read from the record's ``wgs_scafld`` annotation, falling
    back to ``wgs``. A record carrying neither annotation is returned
    unchanged as a single-element list.

    The IDs are requested in batches of ``STEP_SIZE``. Every response is
    appended to one in-memory buffer, which is parsed once all batches have
    been written. If the server answers a batch with ``TooManyRequests``, we
    wait for the requested time and retry that batch once.

    Args:
        record: Record whose ``annotations`` mapping may hold ``wgs_scafld``
            or ``wgs`` entries.
        config: Download configuration; provides ``emit`` and ``format`` and
            is handed to the URL, parameter and stream helpers.

    Returns:
        ``[record]`` if there is nothing to expand, otherwise the list of
        records parsed from the downloaded data.

    Raises:
        TooManyRequests: If the single retry of a batch is throttled again.
    """
    annotations = record.annotations
    if 'wgs_scafld' in annotations:
        # Biopython splits on '-' for us, but doesn't actually calculate the
        # range. Also this is somehow a list of lists.
        wgs_range = WgsRange.from_string(
            '-'.join(annotations['wgs_scafld'][0]))
    elif 'wgs' in annotations:
        # Biopython splits on '-' for us, but doesn't actually calculate the
        # range. Unlike WGS_SCAFLD, this is just a list.
        wgs_range = WgsRange.from_string('-'.join(annotations['wgs']))
    else:
        return [record]

    handle = StringIO()
    id_list = wgs_range.get_ids()

    for start in range(0, len(id_list), STEP_SIZE):
        dl_id = ",".join(id_list[start:start + STEP_SIZE])

        url = get_url_by_format(config)
        params = build_params(dl_id, config)
        try:
            r = get_stream(url, params)
        except TooManyRequests as err:
            # Wait as long as the server asked us to, then retry once.
            config.emit(
                "Server requested us to slow down, waiting {} seconds\n".format(
                    err.retry_after))
            time.sleep(int(err.retry_after))
            r = get_stream(url, params)
        config.emit("Downloading {}\n".format(r.url))
        write_stream(r, handle, dl_id, config)

    # Rewind, so Biopython can parse this
    handle.seek(0)
    return list(SeqIO.parse(handle, config.format))
def fix_supercontigs(record, config):
    """Re-download a record that has a CONTIG entry instead of a sequence.

    The record is requested again by its ID, letting the NCBI assemble the
    proper record for us by asking for the right format. If the server
    answers with ``TooManyRequests``, we wait for the requested time and try
    once more.

    Args:
        record: Record to fix; only its ``id`` is read.
        config: Download configuration; provides ``emit`` and ``format`` and
            is handed to the URL, parameter and stream helpers.

    Returns:
        The list of records parsed from the downloaded data.
    """
    accession = record.id
    endpoint = get_url_by_format(config)
    query = build_params(accession, config)

    try:
        response = get_stream(endpoint, query)
        config.emit("Downloading {}\n".format(response.url))
    except TooManyRequests as err:
        # The server told us how long to back off; honour it, then retry.
        slow_down_msg = "Server requested us to slow down, waiting {} seconds\n"
        config.emit(slow_down_msg.format(err.retry_after))
        time.sleep(int(err.retry_after))
        response = get_stream(endpoint, query)
        config.emit("Downloading {}\n".format(response.url))

    buffer = StringIO()
    write_stream(response, buffer, accession, config)

    # Back to the start of the buffer so Biopython reads from the beginning.
    buffer.seek(0)
    parsed = SeqIO.parse(buffer, config.format)
    return list(parsed)
def generate_url(dl_id, config):
    """Build the Entrez download URL for use by an external download tool.

    Args:
        dl_id: ID (string) to build the download URL for.
        config: Download configuration handed to the URL and parameter
            helpers.

    Returns:
        The base URL and the URL-encoded query parameters, joined by ``?``.
    """
    # types: string, Config -> string
    base_url = get_url_by_format(config)
    query = build_params(dl_id, config)

    # Some other tool will do the download, so drop the tool field.
    del query['tool']

    query_string = urlencode(query, doseq=True)
    return base_url + "?" + query_string
def download_to_file(dl_id, config, filename=None, append=False):
    """Download a single ID from NCBI and store it to a file.

    If the server answers with ``TooManyRequests``, we wait for the requested
    time and retry the request once.

    Args:
        dl_id: ID (string) to download.
        config: Download configuration; provides ``emit`` and
            ``keep_filename`` and is handed to the helper functions.
        filename: Output file name. Used as-is when ``config.keep_filename``
            is set, otherwise passed to ``_generate_filename`` together with
            the request parameters.
        append: Open the output file in append mode instead of overwriting.

    Raises:
        TooManyRequests: If the single retry is throttled again.

    NOTE(review): with ``config.keep_filename`` set and ``filename`` left as
    ``None``, ``open()`` is called with ``None`` - confirm callers always
    pass a file name in that mode.
    """
    # types: string, Config, string, bool -> None
    mode = 'a' if append else 'w'

    url = get_url_by_format(config)
    params = build_params(dl_id, config)

    try:
        r = get_stream(url, params)
    except TooManyRequests as err:
        # Wait as long as the server asked us to, then retry once.
        config.emit(
            "Server requested us to slow down, waiting {} seconds\n".format(
                err.retry_after))
        time.sleep(int(err.retry_after))
        r = get_stream(url, params)
    config.emit("Downloading {}\n".format(r.url))

    if config.keep_filename:
        outfile_name = filename
    else:
        outfile_name = _generate_filename(params, filename)

    with open(outfile_name, mode) as fh:
        _validate_and_write(r, fh, dl_id, config)
def fix_supercontigs(record, config):
    """Fix a record containing a CONTIG entry instead of a seq.

    The record is requested again by its ID, letting the NCBI assemble the
    proper record for us by asking for the right format. If the server
    answers with ``TooManyRequests``, we wait for the requested time and
    retry the request once.

    Args:
        record: Record to fix; only its ``id`` is read.
        config: Download configuration; provides ``emit`` and ``format`` and
            is handed to the URL, parameter and stream helpers.

    Returns:
        The list of records parsed from the downloaded data.

    Raises:
        TooManyRequests: If the single retry is throttled again.
    """
    handle = StringIO()

    dl_id = record.id
    url = get_url_by_format(config)
    params = build_params(dl_id, config)

    try:
        r = get_stream(url, params)
    except TooManyRequests as err:
        # Wait as long as the server asked us to, then retry once.
        config.emit(
            "Server requested us to slow down, waiting {} seconds\n".format(
                err.retry_after))
        time.sleep(int(err.retry_after))
        r = get_stream(url, params)
    config.emit("Downloading {}\n".format(r.url))

    write_stream(r, handle, dl_id, config)

    # Rewind, so Biopython can parse this
    handle.seek(0)
    return list(SeqIO.parse(handle, config.format))
def download_to_file(dl_id, config, filename=None, append=False):
    """Download a single ID from NCBI and store it to a file.

    If the server answers with ``TooManyRequests``, we wait for the requested
    time and retry the request once.

    Args:
        dl_id: ID (string) to download.
        config: Download configuration; provides ``emit`` and
            ``keep_filename`` and is handed to the helper functions.
        filename: Output file name. Used as-is when ``config.keep_filename``
            is set, otherwise passed to ``_generate_filename`` together with
            the request parameters.
        append: Open the output file in append mode instead of overwriting.

    Raises:
        TooManyRequests: If the single retry is throttled again.

    NOTE(review): with ``config.keep_filename`` set and ``filename`` left as
    ``None``, ``open()`` is called with ``None`` - confirm callers always
    pass a file name in that mode.
    """
    # types: string, Config, string, bool -> None
    mode = 'a' if append else 'w'

    url = get_url_by_format(config)
    params = build_params(dl_id, config)

    try:
        r = get_stream(url, params)
    except TooManyRequests as err:
        # The message ends with a newline like every other emitted message,
        # so it does not run into the "Downloading ..." line that follows.
        config.emit(
            "Server requested us to slow down, waiting {} seconds.\n".format(
                err.retry_after))
        time.sleep(int(err.retry_after))
        r = get_stream(url, params)
    config.emit("Downloading {}\n".format(r.url))

    if config.keep_filename:
        outfile_name = filename
    else:
        outfile_name = _generate_filename(params, filename)

    with open(outfile_name, mode) as fh:
        _validate_and_write(r, fh, dl_id, config)