# Imports assumed by the functions below. The basic and ncbi helper modules
# follow the pdm_utils-style layout implied by the calls in this file; the
# GENOME_FOLDER constant and helpers such as get_accessions_to_retrieve,
# check_record_date, save_and_tickets, create_ticket_table,
# process_failed_retrieval, and output_data are defined elsewhere in the
# surrounding package.
import pathlib

from Bio import SeqIO

from pdm_utils.functions import basic, ncbi


def retrieve_records(accession_dict, ncbi_folder, batch_size=200):
    """Retrieve GenBank records."""
    print("\n\nRetrieving records from NCBI")
    genome_folder = pathlib.Path(ncbi_folder, GENOME_FOLDER)
    genome_folder.mkdir()
    retrieval_errors = []
    results = []
    tickets_list = []
    accessions = list(accession_dict.keys())
    mod_accessions = [accession + "[ACCN]" for accession in accessions]

    # When retrieving in batches, first create the list of values indicating
    # which indices of the accessions should be used to create each batch.
    # For instance, if there are five accessions, a batch size of two
    # produces indices = 0, 2, 4.
    batch_indices = basic.create_indices(mod_accessions, batch_size)
    print(f"There are {len(mod_accessions)} GenBank accession(s) to check.")
    for indices in batch_indices:
        start = indices[0]
        stop = indices[1]
        print(f"Checking accessions {start + 1} to {stop}...")
        esearch_term = " | ".join(mod_accessions[start:stop])

        # First use esearch to verify that the accessions are valid.
        search_record = ncbi.run_esearch(db="nucleotide", term=esearch_term,
                                         usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]

        # Keep track of the accessions that could not be located in NCBI.
        # Each accession in the error list is formatted "accession[ACCN]",
        # so the trailing six characters are stripped.
        current_batch_size = stop - start
        if search_count < current_batch_size:
            search_failure = search_record["ErrorList"]["PhraseNotFound"]
            for accession in search_failure:
                retrieval_errors.append(accession[:-6])

        # Now get summaries for these records using esummary.
        summary_records = ncbi.get_summaries(db="nucleotide",
                                             query_key=search_query_key,
                                             webenv=search_webenv)
        results_tuple = get_accessions_to_retrieve(summary_records,
                                                   accession_dict)
        accessions_to_retrieve = results_tuple[0]
        results.extend(results_tuple[1])

        if len(accessions_to_retrieve) > 0:
            # Use efetch to retrieve the records.
            output_list = ncbi.get_records(accessions_to_retrieve,
                                           db="nucleotide",
                                           rettype="gb",
                                           retmode="text")

            # TODO: check_record_date may be redundant. It checks the date
            # within the record, but earlier in the pipeline the docsum date
            # has already been checked. If the docsum date is identical to
            # the date in the record, this check is redundant.
            tup = check_record_date(output_list, accession_dict)
            new_record_list = tup[0]
            results.extend(tup[1])  # List of results dictionaries.

            if len(new_record_list) > 0:
                tickets = save_and_tickets(new_record_list, accession_dict,
                                           genome_folder)
                tickets_list.extend(tickets)

    if len(tickets_list) > 0:
        create_ticket_table(tickets_list, ncbi_folder)

    # Remove the genome folder if it is empty.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()

    # Report the genomes that could not be retrieved.
    failed = process_failed_retrieval(retrieval_errors, accession_dict)
    results.extend(failed)
    return results
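# A minimal sketch of the batching helper used above, for reference only;
# the real implementation lives in the package's basic module and may
# differ. For five items and a batch size of two it returns
# [(0, 2), (2, 4), (4, 5)], matching the "indices = 0, 2, 4" comment and
# the (start, stop) unpacking in the loops above and below.
def _create_indices_sketch(input_list, batch_size):
    indices = []
    for start in range(0, len(input_list), batch_size):
        # stop is exclusive, so the final batch may be smaller.
        stop = min(start + batch_size, len(input_list))
        indices.append((start, stop))
    return indices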
def get_data(output_folder, acc_id_dict, ncbi_cred_dict=None, batch_size=200):
    """Retrieve genomes from GenBank.

    output_folder = Path to where files will be saved.
    acc_id_dict = Dictionary where key = Accession and value = List[PhageIDs]
    """
    # NCBI Bookshelf resource "The E-utilities In-Depth: Parameters, Syntax
    # and More", by Dr. Eric Sayers, recommends that a single request contain
    # no more than about 200 UIDs, so 200 is the default batch size. All
    # Entrez requests must include the user's email address and tool name.
    # A default of None (instead of a mutable {} default) avoids the shared
    # mutable-default pitfall, and .get() tolerates missing credentials.
    if ncbi_cred_dict is None:
        ncbi_cred_dict = {}
    ncbi.set_entrez_credentials(tool=ncbi_cred_dict.get("ncbi_tool"),
                                email=ncbi_cred_dict.get("ncbi_email"),
                                api_key=ncbi_cred_dict.get("ncbi_api_key"))

    # Use esearch to verify that the accessions are valid and efetch to
    # retrieve the records. First, create batches of accessions.
    unique_accession_list = list(acc_id_dict.keys())

    # Add the [ACCN] field to each accession number.
    appended_accessions = \
        [accession + "[ACCN]" for accession in unique_accession_list]

    # When retrieving in batches, first create the list of values indicating
    # which indices of the unique_accession_list should be used to create
    # each batch. For instance, if there are five accessions, a batch size
    # of two produces indices = 0, 2, 4.
    batch_indices = basic.create_indices(unique_accession_list, batch_size)
    print(f"There are {len(unique_accession_list)} GenBank accessions "
          "to check.")
    for indices in batch_indices:
        start = indices[0]
        stop = indices[1]
        print(f"Checking accessions {start + 1} to {stop}...")
        delimiter = " | "
        esearch_term = delimiter.join(appended_accessions[start:stop])

        # Use esearch for each batch of accessions.
        search_record = ncbi.run_esearch(db="nucleotide", term=esearch_term,
                                         usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]

        summary_records = ncbi.get_summaries(db="nucleotide",
                                             query_key=search_query_key,
                                             webenv=search_webenv)
        accessions_to_retrieve = ncbi.get_accessions_to_retrieve(
                                            summary_records)
        if len(accessions_to_retrieve) > 0:
            records = ncbi.get_records(accessions_to_retrieve,
                                       db="nucleotide",
                                       rettype="gb",
                                       retmode="text")
            for record in records:
                output_data(record, acc_id_dict, output_folder)
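# Hypothetical usage of get_data(); the tool name, email, output path, and
# accession mapping below are placeholders, not values from the source.
if __name__ == "__main__":
    creds = {"ncbi_tool": "example_tool",
             "ncbi_email": "user@example.com",
             "ncbi_api_key": None}  # An API key raises the request-rate cap.
    acc_ids = {"NC_001416": ["Lambda"]}  # accession -> list of PhageIDs
    get_data(pathlib.Path("./genbank_output"), acc_ids, creds)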
def retrieve_records(accession_dict, batch_size=200):
    """Retrieve GenBank records.

    First use esearch to verify that the accessions are valid,
    then use efetch to retrieve the records.
    """
    print("\n\nRetrieving records from NCBI")
    retrieved_records = []  # GenBank records that have been retrieved.
    retrieval_errors = []
    tally_not_new = 0  # Tracks records whose docsum date is not new.
    results = []  # Summary of retrieval results.
    accessions = list(accession_dict.keys())
    mod_accessions = [accession + "[ACCN]" for accession in accessions]

    # When retrieving in batches, first create the list of values indicating
    # which indices of the accessions should be used to create each batch.
    # For instance, if there are five accessions, a batch size of two
    # produces indices = 0, 2, 4.
    batch_indices = basic.create_indices(mod_accessions, batch_size)
    print(f"There are {len(mod_accessions)} GenBank accession(s) to check.")
    for indices in batch_indices:
        start = indices[0]
        stop = indices[1]
        print(f"Checking accessions {start + 1} to {stop}...")
        delimiter = " | "
        esearch_term = delimiter.join(mod_accessions[start:stop])

        # Use esearch for each batch of accessions.
        search_record = ncbi.run_esearch(db="nucleotide", term=esearch_term,
                                         usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]

        # Keep track of the accessions that could not be located in NCBI.
        # Each accession in the error list is formatted "accession[ACCN]",
        # so the trailing six characters are stripped.
        current_batch_size = stop - start
        if search_count < current_batch_size:
            search_failure = search_record["ErrorList"]["PhraseNotFound"]
            for accession in search_failure:
                retrieval_errors.append(accession[:-6])

        # Now get summaries for these records using esummary.
        summary_records = ncbi.get_summaries(db="nucleotide",
                                             query_key=search_query_key,
                                             webenv=search_webenv)
        results_tuple = get_accessions_to_retrieve(summary_records,
                                                   accession_dict)
        accessions_to_retrieve = results_tuple[0]
        results.extend(results_tuple[1])
        tally_not_new += len(summary_records) - len(accessions_to_retrieve)

        if len(accessions_to_retrieve) > 0:
            # Use efetch to retrieve the records.
            output_list = ncbi.get_records(accessions_to_retrieve,
                                           db="nucleotide",
                                           rettype="gb",
                                           retmode="text")
            retrieved_records.extend(output_list)

    return (tally_not_new, retrieved_records, retrieval_errors, results)
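# Illustrative (hypothetical) consumer of retrieve_records()'s four-part
# return value; the report wording is an assumption, not from the source.
def summarize_retrieval(accession_dict):
    tally_not_new, records, errors, results = retrieve_records(accession_dict)
    print(f"{len(records)} record(s) retrieved, {len(errors)} accession(s) "
          f"not found, {tally_not_new} record(s) not new.")
    return results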
def get_genbank_data(output_folder, accession_set, ncbi_cred_dict=None):
    """Retrieve genomes from GenBank."""
    # NCBI Bookshelf resource "The E-utilities In-Depth: Parameters, Syntax
    # and More", by Dr. Eric Sayers, recommends that a single request contain
    # no more than about 200 UIDs, so that is used as the batch size. All
    # Entrez requests must include the user's email address and tool name.
    batch_size = 200
    if ncbi_cred_dict is None:
        ncbi_cred_dict = {}
    ncbi.set_entrez_credentials(tool=ncbi_cred_dict.get("ncbi_tool"),
                                email=ncbi_cred_dict.get("ncbi_email"),
                                api_key=ncbi_cred_dict.get("ncbi_api_key"))

    # Use esearch to verify that the accessions are valid and efetch to
    # retrieve the records. First, create batches of accessions.
    unique_accession_list = list(accession_set)

    # Add the [ACCN] field to each accession number.
    appended_accessions = \
        [accession + "[ACCN]" for accession in unique_accession_list]

    # When retrieving in batches, first create the list of values indicating
    # which indices of the unique_accession_list should be used to create
    # each batch. For instance, if there are five accessions, a batch size
    # of two produces indices = 0, 2, 4.
    batch_indices = basic.create_indices(unique_accession_list, batch_size)
    print(f"There are {len(unique_accession_list)} GenBank accessions "
          "to check.")
    for indices in batch_indices:
        batch_index_start = indices[0]
        batch_index_stop = indices[1]
        print("Checking accessions "
              f"{batch_index_start + 1} to {batch_index_stop}...")
        current_batch_size = batch_index_stop - batch_index_start
        delimiter = " | "
        esearch_term = delimiter.join(
            appended_accessions[batch_index_start:batch_index_stop])

        # Use esearch for each batch of accessions.
        search_record = ncbi.run_esearch(db="nucleotide", term=esearch_term,
                                         usehistory="y")
        search_count = int(search_record["Count"])
        search_webenv = search_record["WebEnv"]
        search_query_key = search_record["QueryKey"]

        summary_records = ncbi.get_summaries(db="nucleotide",
                                             query_key=search_query_key,
                                             webenv=search_webenv)
        # The docsum "Caption" field holds the accession without its version.
        accessions_to_retrieve = []
        for doc_sum in summary_records:
            accessions_to_retrieve.append(doc_sum["Caption"])

        if len(accessions_to_retrieve) > 0:
            output_list = ncbi.get_records(accessions_to_retrieve,
                                           db="nucleotide",
                                           rettype="gb",
                                           retmode="text")
            for retrieved_record in output_list:
                ncbi_filename = f"{retrieved_record.name}.gb"
                flatfile_path = pathlib.Path(output_folder, ncbi_filename)
                SeqIO.write(retrieved_record, str(flatfile_path), "genbank")
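# A minimal sketch of what the ncbi.run_esearch and ncbi.get_records
# wrappers used above are assumed to do, written directly against
# Biopython's Entrez and SeqIO modules. The wrapper names are taken from
# the calls above; the bodies are assumptions, and retry logic, batching
# of efetch requests, and rate limiting are omitted.
from Bio import Entrez, SeqIO


def _run_esearch_sketch(db, term, usehistory):
    # Entrez.read() returns a dict-like object exposing "Count", "WebEnv",
    # "QueryKey", and (on partial failure) "ErrorList".
    handle = Entrez.esearch(db=db, term=term, usehistory=usehistory)
    record = Entrez.read(handle)
    handle.close()
    return record


def _get_records_sketch(accession_list, db="nucleotide",
                        rettype="gb", retmode="text"):
    # efetch accepts a comma-separated list of IDs; the response is a
    # stream of GenBank flat files parsed into SeqRecord objects.
    handle = Entrez.efetch(db=db, id=",".join(accession_list),
                           rettype=rettype, retmode=retmode)
    records = list(SeqIO.parse(handle, "genbank"))
    handle.close()
    return records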