def output_genbank_summary(output_folder, results):
    """Save summary of GenBank retrieval results to file."""
    filepath = pathlib.Path(output_folder, GENBANK_RESULTS_TABLE)
    basic.export_data_dict(results, filepath, NCBI_RESULTS_COLUMNS,
                           include_headers=True)
def create_ticket_table(tickets, output_folder):
    """Save tickets associated with files retrieved from GenBank."""
    filepath = pathlib.Path(output_folder, IMPORT_TABLE)
    tickets = convert_tickets_to_dict(tickets)
    basic.export_data_dict(tickets, filepath, IMPORT_COLUMNS,
                           include_headers=True)
def execute_csv_export(db_filter, export_path, folder_path, columns, csv_name,
                       sort=[], raw_bytes=False, verbose=False):
    """Executes csv export of a MySQL database table with select columns.

    :param db_filter: A connected and fully loaded Filter object.
    :type db_filter: Filter
    :param export_path: Path to a dir for file creation.
    :type export_path: Path
    :param folder_path: Path to a top-level dir.
    :type folder_path: Path
    :param columns: SQLAlchemy Column objects to select.
    :type columns: list[Column]
    :param csv_name: Name for the csv file to be written.
    :type csv_name: str
    :param sort: A list of SQLAlchemy Columns to sort by.
    :type sort: list[Column]
    :param raw_bytes: A boolean value to toggle decoding of byte results.
    :type raw_bytes: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    if verbose:
        relative_path = str(export_path.relative_to(folder_path))
        print(f"Preparing {csv_name} export for '{relative_path}'...")

    headers = [db_filter._key.name]
    for column in columns:
        if column.name != db_filter._key.name:
            headers.append(column.name)

    results = db_filter.select(columns)

    if not raw_bytes:
        decode_results(results, columns, verbose=verbose)

    if len(results) == 0:
        print(f"No database entries received for {csv_name}.")
        export_path.rmdir()
    else:
        if verbose:
            print(f"...Writing csv {csv_name}.csv in '{export_path.name}'...")
            print(f"......Database entries retrieved: {len(results)}")
        file_path = export_path.joinpath(f"{csv_name}.csv")
        basic.export_data_dict(results, file_path, headers,
                               include_headers=True)
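# A minimal usage sketch for the Filter-based execute_csv_export() above.
# It is illustrative only: `alchemist` is assumed to be a connected
# AlchemyHandler, the "phage.PhageID"/"phage.Cluster" columns, the PhageID
# values, and the "csv_export" directory name are placeholder choices, and
# the pathlib import is assumed from the module header.
db_filter = Filter(alchemist=alchemist)
db_filter.key = "phage.PhageID"
db_filter.values = ["Trixie", "D29"]          # placeholder PhageIDs

folder_path = pathlib.Path.cwd()
export_path = folder_path.joinpath("csv_export")
export_path.mkdir(exist_ok=True)

columns = db_filter.get_columns(["phage.PhageID", "phage.Cluster"])
execute_csv_export(db_filter, export_path, folder_path, columns,
                   "phage", verbose=True)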
def setUpClass(self):
    test_db_utils.create_filled_test_db()

    self.test_dir = Path(TEST_DIR)
    if self.test_dir.is_dir():
        shutil.rmtree(TEST_DIR)

    self.test_dir.mkdir()

    self.resubmit_form = self.test_dir.joinpath("resubmit_form.txt")
    basic.export_data_dict(TEST_DATA, self.resubmit_form, PF_HEADER,
                           include_headers=True)
def save_files_and_tkts(record_list, accession_dict, output_folder):
    """Save flat files retrieved from GenBank and create import tickets."""
    import_tickets = []
    genome_folder = pathlib.Path(output_folder, GENOMES_DIR)
    genome_folder.mkdir()
    for record in record_list:
        accession = record.name
        accession = accession.split('.')[0]
        gnm = accession_dict[accession]
        ncbi_filename = f"{gnm.name.lower()}__{accession}.gb"
        flatfile_path = pathlib.Path(genome_folder, ncbi_filename)
        SeqIO.write(record, str(flatfile_path), "genbank")

        tkt = ticket.ImportTicket()
        tkt.type = "replace"
        tkt.phage_id = gnm.id
        tkt.data_dict["host_genus"] = gnm.host_genus
        tkt.data_dict["cluster"] = gnm.cluster
        tkt.data_dict["subcluster"] = gnm.subcluster
        tkt.data_dict["annotation_status"] = gnm.annotation_status
        tkt.data_dict["annotation_author"] = gnm.annotation_author
        tkt.description_field = "product"
        # Accession is set to 'parse' to ensure that during import,
        # the file's accession is directly compared to the database
        # record's accession.
        # tkt.data_dict["accession"] = gnm.accession
        tkt.data_dict["accession"] = "parse"
        tkt.eval_mode = "auto"
        # TODO secondary_phage_id data is for old ticket format.
        tkt.data_dict["secondary_phage_id"] = gnm.id
        tkt.data_dict["retrieve_record"] = 1
        import_tickets.append(tkt)

    # Now make the import table.
    if len(import_tickets) > 0:
        filepath = basic.prepare_filepath(output_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(output_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2, filepath2, IMPORT_COLUMNS2,
                                   include_headers=True)
def write_report(data, export_path, header, csv_name="PhamReport",
                 verbose=False):
    """Outputs a csv file of the given data dictionaries to the export dir."""
    if not export_path.is_dir():
        print("Passed in path is not a directory.")
        sys.exit(1)

    file_path = export_path.joinpath(f"{csv_name}.csv")
    if verbose:
        print(f"Writing {file_path.name} in {export_path.name}...")

    basic.export_data_dict(data, file_path, header, include_headers=True)
def test_export_data_dict_1(self):
    """Verify data is exported correctly."""

    list_of_data = [self.tkt_dict1, self.tkt_dict2]
    headers = ["type", "phage_id", "host_genus", "cluster"]
    basic.export_data_dict(list_of_data, self.export_file, headers,
                           include_headers=True)

    exp_success_tkts = []
    with open(self.export_file, 'r') as file:
        file_reader = csv.DictReader(file)
        for row in file_reader:
            exp_success_tkts.append(row)

    with self.subTest():
        self.assertEqual(len(exp_success_tkts), 2)
    with self.subTest():
        self.assertEqual(set(exp_success_tkts[0].keys()), set(headers))
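# The test above exercises the contract of basic.export_data_dict(): write a
# list of dictionaries to a csv file, restricted to the given headers, with an
# optional header row. The snippet below is only an illustrative stand-in
# built on csv.DictWriter to show that contract; it is not the pdm_utils
# implementation.
import csv

def export_data_dict_sketch(list_of_data, file_path, headers,
                            include_headers=False):
    """Write dictionaries to a csv file, keeping only the given headers."""
    with open(file_path, "w", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=headers, extrasaction="ignore")
        if include_headers:
            writer.writeheader()
        for data_dict in list_of_data:
            writer.writerow(data_dict)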
def execute_csv_export(alchemist, export_path, table="phage", values=[],
                       verbose=False):
    """Export a MySQL table to csv, omitting large sequence fields."""
    remove_fields = {"phage": ["Sequence"],
                     "gene": ["Translation"],
                     "domain": [],
                     "gene_domain": [],
                     "pham": [],
                     "pham_color": [],
                     "trna": ["Sequence"],
                     "tmrna": [],
                     "trna_structures": []}

    table_obj = alchemist.get_table(table)

    select_columns = []
    headers = []
    for column in table_obj.columns:
        if column.name not in remove_fields[table]:
            select_columns.append(column)
            headers.append(column.name)

    for column in table_obj.primary_key.columns:
        primary_key = column

    query = querying.build_select(alchemist.graph, select_columns)
    if values:
        query = query.where(primary_key.in_(values))

    results = alchemist.execute(query)

    file_path = export_path.joinpath(f"{table}.csv")
    basic.export_data_dict(results, file_path, headers, include_headers=True)
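# A minimal usage sketch for the whole-table execute_csv_export() variant
# above. `alchemist` is assumed to be a connected AlchemyHandler; the output
# directory name and the PhageID values are placeholders for illustration,
# and the pathlib import is assumed from the module header.
export_path = pathlib.Path.cwd().joinpath("table_export")
export_path.mkdir(exist_ok=True)
execute_csv_export(alchemist, export_path, table="phage",
                   values=["Trixie", "D29"], verbose=False)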
def get_update_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve field updates from PhagesDB."""
    updates_folder = pathlib.Path(output_folder, UPDATES_FOLDER)
    updates_folder.mkdir()

    update_tickets = []
    for gnm_pair in matched_genomes:
        tkt_list = compare_data(gnm_pair)
        update_tickets.extend(tkt_list)

    # Field updates
    if len(update_tickets) > 0:
        print(f"\n\n{len(update_tickets)} field updates are available.")
        filepath = pathlib.Path(updates_folder, UPDATE_TABLE)
        basic.export_data_dict(update_tickets, filepath, UPDATE_COLUMNS,
                               include_headers=True)
    else:
        print("\n\nNo field updates.")

    # Now remove empty folders.
    if len(basic.identify_contents(updates_folder, kind=None)) == 0:
        updates_folder.rmdir()
def execute_resubmit(alchemist, revisions_data_dicts, folder_path, folder_name,
                     filters="", groups=[], verbose=False):
    """Executes the entirety of the genbank resubmit pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param revisions_data_dicts: Data dictionaries containing pham/notes data.
    :type revisions_data_dicts: list[dict]
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param filters: A MySQL-formatted WHERE clause string.
    :type filters: str
    :param groups: A list of MySQL column names to group by.
    :type groups: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    db_filter = Filter(alchemist=alchemist)
    db_filter.key = "gene.PhamID"
    db_filter.add(BASE_CONDITIONALS)

    if filters != "":
        try:
            db_filter.add(filters)
        except:
            print("Please check your syntax for the conditional string:\n"
                  f"{filters}")
            sys.exit(1)

    resubmit_columns = db_filter.get_columns(RESUBMIT_COLUMNS)

    phams = []
    for data_dict in revisions_data_dicts:
        phams.append(data_dict["Pham"])

    db_filter.values = phams

    if verbose:
        print("Creating export folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = {}
    export_db.build_groups_map(db_filter, export_path, conditionals_map,
                               groups=groups, verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning review export...")

    for mapped_path in conditionals_map.keys():
        if verbose:
            print("Retrieving phage data for pham revisions...")
        export_dicts = []
        for data_dict in revisions_data_dicts:
            if verbose:
                print(f"...Retrieving data for pham {data_dict['Pham']}...")

            conditionals = conditionals_map[mapped_path]

            final_call = data_dict["Final Call"]
            if final_call == "Hypothetical Protein":
                final_call = ""
            conditionals.append(querying.build_where_clause(
                                            alchemist.graph,
                                            f"gene.Notes!={final_call}"))

            query = querying.build_select(alchemist.graph, resubmit_columns,
                                          where=conditionals)

            results = querying.execute(alchemist.engine, query,
                                       in_column=db_filter.key,
                                       values=[data_dict["Pham"]])

            for result in results:
                format_resubmit_data(result, data_dict["Final Call"])
                export_dicts.append(result)

        if not export_dicts:
            if verbose:
                print(f"'{mapped_path.name}' data selected for resubmission "
                      "matches the selected call; no resubmission exported...")
            mapped_path.rmdir()
            continue

        export_dicts = sorted(export_dicts,
                              key=lambda export_dict: export_dict["Phage"])

        if verbose:
            print(f"Writing {CSV_NAME} in {mapped_path.name}...")
        file_path = mapped_path.joinpath(CSV_NAME)
        basic.export_data_dict(export_dicts, file_path, RESUBMIT_HEADER,
                               include_headers=True)
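# A minimal driver sketch for execute_resubmit() above. It assumes `alchemist`
# is a connected AlchemyHandler and that the revision csv contains the "Pham"
# and "Final Call" columns read by the function body; the file name, filter
# string, and grouping column are placeholders, and the csv/pathlib imports
# are assumed from the module header.
with open("resubmit_form.txt", "r") as fh:
    revisions_data_dicts = list(csv.DictReader(fh))

execute_resubmit(alchemist, revisions_data_dicts, pathlib.Path.cwd(),
                 "resubmit", filters="phage.Cluster=A",
                 groups=["phage.Cluster"], verbose=True)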
def retrieve_drafts(output_folder, phage_list):
    """Retrieve auto-annotated 'draft' genomes from PECAAN."""
    print(f"\n\nRetrieving {len(phage_list)} new phages from PECAAN")
    genome_folder = pathlib.Path(output_folder, GENOMES_DIR)
    genome_folder.mkdir()

    # Keep track of how many genomes were retrieved from PECAAN
    retrieved_tally = 0
    failed_list = []
    import_tickets = []

    # Iterate through each row in the file
    for new_phage in phage_list:
        pecaan_link = constants.PECAAN_PREFIX + new_phage
        response = phagesdb.retrieve_url_data(pecaan_link)
        if response == "":
            print(f"Error: unable to retrieve {new_phage} draft genome.")
            print(pecaan_link)
            failed_list.append(new_phage)
        else:
            pecaan_filename = f"{new_phage}.txt"
            pecaan_filepath = pathlib.Path(genome_folder, pecaan_filename)
            with pecaan_filepath.open("w") as fh:
                fh.write(response)

            tkt = ticket.ImportTicket()
            tkt.type = "add"
            tkt.phage_id = new_phage
            tkt.data_dict["host_genus"] = "retrieve"
            tkt.data_dict["cluster"] = "retrieve"
            tkt.data_dict["subcluster"] = "retrieve"
            tkt.data_dict["annotation_status"] = "draft"
            tkt.data_dict["annotation_author"] = 1
            tkt.description_field = "product"
            tkt.data_dict["accession"] = "none"
            tkt.eval_mode = "draft"
            # TODO secondary_phage_id data is for old ticket format.
            tkt.data_dict["secondary_phage_id"] = "none"
            tkt.data_dict["retrieve_record"] = 1
            import_tickets.append(tkt)

            print(f"{new_phage} retrieved from PECAAN.")
            retrieved_tally += 1

    # Now make the import table.
    if len(import_tickets) > 0:
        filepath = basic.prepare_filepath(output_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(output_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2, filepath2, IMPORT_COLUMNS2,
                                   include_headers=True)

    # Report results
    if retrieved_tally > 0:
        print(f"{retrieved_tally} phage(s) were successfully retrieved")

    if len(failed_list) > 0:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")
def get_genbank_data(output_folder, genome_dict, ncbi_cred_dict={},
                     genbank_results=False):
    """Run sub-pipeline to retrieve genomes from GenBank."""
    # Flow of the NCBI record retrieval process:
    # 1 Create list of phages to check for updates at NCBI (completed above)
    # 2 Using esearch, verify which accessions are valid
    # 3 Using esummary, get update date for each valid accession
    # 4 Using efetch, retrieve flat files for NCBI records newer than
    #   the MySQL database date
    # 5 Save new records in a folder and create an import table for them

    # Create output folder
    ncbi_folder = pathlib.Path(output_folder, "genbank")
    ncbi_folder.mkdir()

    ncbi_results_list = []

    tallies = {}
    tallies["total"] = len(genome_dict.keys())

    # Iterate through each phage in the MySQL database
    result_tuple1 = sort_by_accession(genome_dict)
    tallies["not_auto_updated"] = result_tuple1[0]
    tallies["no_accession"] = result_tuple1[1]
    tallies["duplicate_accession"] = result_tuple1[2]
    ncbi_results_list.extend(result_tuple1[3])
    unique_accession_dict = result_tuple1[4]

    # More setup variables if NCBI updates are desired. NCBI Bookshelf resource
    # "The E-utilities In-Depth: Parameters, Syntax and More", by Dr. Eric
    # Sayers, recommends that a single request not contain more than about 200
    # UIDS so we will use that as our batch size, and all Entrez requests must
    # include the user's email address and tool name.
    ncbi.set_entrez_credentials(tool=ncbi_cred_dict["ncbi_tool"],
                                email=ncbi_cred_dict["ncbi_email"],
                                api_key=ncbi_cred_dict["ncbi_api_key"])

    results_tuple2 = retrieve_records(unique_accession_dict, batch_size=200)
    tallies["docsum_not_new"] = results_tuple2[0]
    retrieved_record_list = results_tuple2[1]
    retrieval_error_list = results_tuple2[2]
    ncbi_results_list.extend(results_tuple2[3])

    # Report the genomes that could not be retrieved.
    results3 = process_failed_retrieval(retrieval_error_list,
                                        unique_accession_dict)
    ncbi_results_list.extend(results3)
    tallies["retrieval_failure"] = len(retrieval_error_list)

    results_tuple4 = check_record_date(retrieved_record_list,
                                       unique_accession_dict)
    new_record_list = results_tuple4[0]
    ncbi_results_list.extend(results_tuple4[1])

    tallies["retrieved_for_import"] = len(new_record_list)
    tallies["record_not_new"] = (len(retrieved_record_list) -
                                 len(new_record_list))

    if len(new_record_list) > 0:
        save_files_and_tkts(new_record_list, unique_accession_dict,
                            ncbi_folder)

    # Record retrieval results for all phages.
    if genbank_results:
        filepath3 = basic.prepare_filepath(ncbi_folder, "genbank_results.csv")
        basic.export_data_dict(ncbi_results_list, filepath3,
                               NCBI_RESULTS_COLUMNS, include_headers=True)

    # Print summary of script
    tallies["auto_updated"] = tallies["total"] - tallies["not_auto_updated"]
    tallies["accession"] = (tallies["auto_updated"] -
                            tallies["no_accession"] -
                            tallies["duplicate_accession"])

    print("\n\n\nSummary of GenBank data retrieval:")
    print("Of the genomes in the MySQL database:")
    print(f"{tallies['total']:>6}: total")
    print(f"{tallies['not_auto_updated']:>6}: not auto-updated")
    print(f"{tallies['auto_updated']:>6}: auto-updated")

    print("\nOf the auto-updated genomes:")
    print(f"{tallies['no_accession']:>6}: no accession")
    print(f"{tallies['duplicate_accession']:>6}: duplicated accession")
    print(f"{tallies['accession']:>6}: unique accession")

    print("\nOf the auto-updated genomes with unique accessions:")
    print(f"{tallies['retrieval_failure']:>6}: could not be retrieved")
    print(f"{tallies['docsum_not_new']:>6}: retrieved but docsum not new")
    print(f"{tallies['record_not_new']:>6}: retrieved but record not new")
    print(f"{tallies['retrieved_for_import']:>6}: retrieved for import")

    # Now remove empty folders.
    if len(basic.identify_contents(ncbi_folder, kind=None)) == 0:
        ncbi_folder.rmdir()
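# A minimal call sketch for get_genbank_data() above. The credential values
# and output path are placeholders, and `genome_dict` is assumed to already
# hold genome objects loaded from the MySQL database (keyed by phage).
ncbi_creds = {"ncbi_tool": "pdm_utils",
              "ncbi_email": "user@example.com",
              "ncbi_api_key": None}
get_genbank_data(pathlib.Path("/tmp/retrieve"), genome_dict,
                 ncbi_cred_dict=ncbi_creds, genbank_results=True)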
def get_final_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve 'final' genomes from PhagesDB."""

    phagesdb_folder = pathlib.Path(output_folder, "phagesdb")
    phagesdb_folder.mkdir()
    genome_folder = pathlib.Path(phagesdb_folder, GENOMES_DIR)
    genome_folder.mkdir()
    import_tickets = []
    failed_list = []

    # Iterate through each phage in the MySQL database
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Not all phages have associated Genbank-formatted files
        # available on PhagesDB. Check to see if there is a flatfile for
        # this phage. Download the flatfile only if there is a date tag,
        # and only if that date is more recent than the date stored in
        # the MySQL database for that genome. The tagged date only reflects
        # when the file was uploaded into PhagesDB. The date the actual
        # Genbank record was created is stored within the file,
        # and this too could be less recent than the current version in
        # the MySQL database; however, this part gets checked during the
        # import stage.
        set_phagesdb_gnm_date(phagesdb_gnm)
        set_phagesdb_gnm_file(phagesdb_gnm)
        if (phagesdb_gnm.filename != "" and
                phagesdb_gnm.date > mysqldb_gnm.date):
            # Save the file on the hard drive with the same name as
            # stored on PhagesDB
            flatfile_data = phagesdb.retrieve_url_data(phagesdb_gnm.filename)
            if flatfile_data == "":
                failed_list.append(mysqldb_gnm.id)
            else:
                flatfile_filename = phagesdb_gnm.filename.split("/")[-1]
                flatfile_path = pathlib.Path(genome_folder, flatfile_filename)
                with flatfile_path.open("w") as fh:
                    fh.write(flatfile_data)

                # Create the new import ticket
                # Since the PhagesDB phage has been matched to
                # the MySQL database phage, the AnnotationAuthor field
                # could be assigned from the current mysqldb author
                # variable. However, since this genbank-formatted
                # file is acquired through PhagesDB, both the
                # Annotation status is expected to be 'final' and
                # the Annotation author is expected to be 'hatfull'.
                tkt = ticket.ImportTicket()
                tkt.type = "replace"
                tkt.phage_id = mysqldb_gnm.id
                tkt.data_dict["host_genus"] = "retrieve"
                tkt.data_dict["cluster"] = "retrieve"
                tkt.data_dict["subcluster"] = "retrieve"
                tkt.data_dict["annotation_status"] = "final"
                tkt.data_dict["annotation_author"] = 1
                tkt.description_field = "product"
                tkt.data_dict["accession"] = "retrieve"
                tkt.eval_mode = "final"
                # TODO secondary_phage_id data is for old ticket format.
                tkt.data_dict["secondary_phage_id"] = mysqldb_gnm.id
                tkt.data_dict["retrieve_record"] = 1
                import_tickets.append(tkt)

    count1 = len(import_tickets)
    if count1 > 0:
        print(f"\n\n{count1} phage(s) were retrieved from PhagesDB.")
        filepath = basic.prepare_filepath(phagesdb_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(phagesdb_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2, filepath2, IMPORT_COLUMNS2,
                                   include_headers=True)

    if len(failed_list) > 0:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")

    # Now remove empty folders.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()
    if len(basic.identify_contents(phagesdb_folder, kind=None)) == 0:
        phagesdb_folder.rmdir()
def get_update_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve field updates from PhagesDB."""
    update_tickets = []
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Compare Cluster
        if mysqldb_gnm.cluster != phagesdb_gnm.cluster:
            result1 = {"table": "phage",
                       "field": "Cluster",
                       "value": phagesdb_gnm.cluster,
                       "key_name": "PhageID",
                       "key_value": mysqldb_gnm.id}
            update_tickets.append(result1)

        # Compare Subcluster
        if mysqldb_gnm.subcluster != phagesdb_gnm.subcluster:
            result3 = {"table": "phage",
                       "field": "Subcluster",
                       "value": phagesdb_gnm.subcluster,
                       "key_name": "PhageID",
                       "key_value": mysqldb_gnm.id}
            update_tickets.append(result3)

        # Compare Host genus
        if mysqldb_gnm.host_genus != phagesdb_gnm.host_genus:
            result5 = {"table": "phage",
                       "field": "HostGenus",
                       "value": phagesdb_gnm.host_genus,
                       "key_name": "PhageID",
                       "key_value": mysqldb_gnm.id}
            update_tickets.append(result5)

        # Compare Accession
        # If the genome author is not "hatfull", then don't worry about
        # updating the accession. This used to be determined with
        # the status field, but now it is determined with the
        # AnnotationAuthor field.
        if (mysqldb_gnm.accession != phagesdb_gnm.accession and
                mysqldb_gnm.annotation_author == 1):
            result6 = {"table": "phage",
                       "field": "Accession",
                       "value": phagesdb_gnm.accession,
                       "key_name": "PhageID",
                       "key_value": mysqldb_gnm.id}
            update_tickets.append(result6)

    # Field updates
    if len(update_tickets) > 0:
        print("\n\nNew field updates are available.")
        filepath = basic.prepare_filepath(output_folder, "update_table.csv",
                                          folder_name="updates")
        basic.export_data_dict(update_tickets, filepath, UPDATE_COLUMNS,
                               include_headers=True)
    else:
        print("\n\nNo field updates found.")