def retrieve_drafts(output_folder, phage_list):
    """Retrieve auto-annotated 'draft' genomes from PECAAN."""
    # Folder that will receive every downloaded PECAAN genome file.
    draft_dir = pathlib.Path(output_folder, GENOME_FOLDER)
    draft_dir.mkdir()

    unretrieved = []
    draft_tickets = []

    # Attempt a download for every requested phage name.
    for phage_name in phage_list:
        url = constants.PECAAN_PREFIX + phage_name
        data = phagesdb.retrieve_url_data(url)
        if data == "":
            # An empty string signals the download failed.
            print(f"Error: unable to retrieve {phage_name} draft genome.")
            print(url)
            unretrieved.append(phage_name)
            continue
        save_pecaan_file(data, phage_name, draft_dir)
        draft_tickets.append(create_draft_ticket(phage_name))
        print(f"{phage_name} retrieved from PECAAN.")

    if draft_tickets:
        create_ticket_table(draft_tickets, output_folder)
        print(f"{len(draft_tickets)} phage(s) were successfully retrieved")

    if unretrieved:
        print(f"{len(unretrieved)} phage(s) failed to be retrieved:")
        for phage_name in unretrieved:
            print(phage_name)
        input("\n\nPress ENTER to continue.")
def test_retrieve_url_data_2(self):
    """Verify fasta data is not retrieved and an error is produced."""
    result = phagesdb.retrieve_url_data(
                "https://phagesdb.org/media/fastas/L5_x.fasta")
    # A failed retrieval is signalled by an empty string.
    self.assertEqual(result, "")
def test_retrieve_url_data_1(self):
    """Verify fasta data is retrieved and no error is produced."""
    result = phagesdb.retrieve_url_data(
                "https://phagesdb.org/media/fastas/L5.fasta")
    # Only the record header is checked; the sequence body may change.
    self.assertEqual(result[:23], ">Mycobacterium phage L5")
def get_final_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve 'final' genomes from PhagesDB."""
    print("\n\nDownloading genome(s) from PhagesDB.")

    # Working folders: <output>/<PHAGESDB_FOLDER>/<GENOME_FOLDER>
    phagesdb_dir = pathlib.Path(output_folder, PHAGESDB_FOLDER)
    phagesdb_dir.mkdir()
    genome_dir = pathlib.Path(phagesdb_dir, GENOME_FOLDER)
    genome_dir.mkdir()

    tickets = []
    failed = []

    # Check every matched MySQL/PhagesDB genome pair.
    for pair in matched_genomes:
        local_gnm = pair.genome1    # genome as recorded in MySQL
        remote_gnm = pair.genome2   # matching genome on PhagesDB

        # Not all phages have a Genbank-formatted flatfile on PhagesDB.
        # Download only when a file exists AND its tagged upload date is
        # newer than the date stored in MySQL. The tag only reflects when
        # the file reached PhagesDB; the Genbank record's own creation
        # date is inside the file and is re-checked during import.
        set_phagesdb_gnm_date(remote_gnm)
        set_phagesdb_gnm_file(remote_gnm)
        if not (remote_gnm.filename != "" and
                remote_gnm.date > local_gnm.date):
            continue

        # Save the file on disk under the same name used on PhagesDB.
        flatfile = phagesdb.retrieve_url_data(remote_gnm.filename)
        if flatfile == "":
            failed.append(local_gnm.id)
        else:
            save_phagesdb_file(flatfile, remote_gnm, genome_dir)
            tickets.append(create_phagesdb_ticket(local_gnm.id))

    if tickets:
        print(f"\n\n{len(tickets)} genome(s) "
              "were retrieved from PhagesDB.")
        create_ticket_table(tickets, phagesdb_dir)

    if failed:
        print(f"{len(failed)} genome(s) failed to be retrieved:")
        for phage_id in failed:
            print(phage_id)
        input("\n\nPress ENTER to continue.")

    # Remove any folders that ended up empty.
    if len(basic.identify_contents(genome_dir, kind=None)) == 0:
        genome_dir.rmdir()
    if len(basic.identify_contents(phagesdb_dir, kind=None)) == 0:
        phagesdb_dir.rmdir()
def retrieve_drafts(output_folder, phage_list):
    """Retrieve auto-annotated 'draft' genomes from PECAAN.

    Downloads each named phage from PECAAN, saves it as a text file,
    and writes an import table of 'add' tickets for the retrieved genomes.
    """
    print(f"\n\nRetrieving {len(phage_list)} new phages from PECAAN")
    genome_folder = pathlib.Path(output_folder, GENOMES_DIR)
    genome_folder.mkdir()

    # Keep track of how many genomes were retrieved from PECAAN
    retrieved_tally = 0
    failed_list = []
    import_tickets = []

    # Iterate through each row in the file
    for new_phage in phage_list:
        pecaan_link = constants.PECAAN_PREFIX + new_phage
        response = phagesdb.retrieve_url_data(pecaan_link)
        if response == "":
            # An empty string indicates the download failed.
            print(f"Error: unable to retrieve {new_phage} draft genome.")
            print(pecaan_link)
            failed_list.append(new_phage)
        else:
            # Save the retrieved record as <phage>.txt in the genome folder.
            pecaan_filename = f"{new_phage}.txt"
            pecaan_filepath = pathlib.Path(genome_folder, pecaan_filename)
            with pecaan_filepath.open("w") as fh:
                fh.write(response)

            # Build an 'add' import ticket for the downloaded draft genome.
            # 'retrieve' values instruct the import stage to look the data up.
            tkt = ticket.ImportTicket()
            tkt.type = "add"
            tkt.phage_id = new_phage
            tkt.data_dict["host_genus"] = "retrieve"
            tkt.data_dict["cluster"] = "retrieve"
            tkt.data_dict["subcluster"] = "retrieve"
            tkt.data_dict["annotation_status"] = "draft"
            tkt.data_dict["annotation_author"] = 1
            tkt.description_field = "product"
            tkt.data_dict["accession"] = "none"
            tkt.eval_mode = "draft"
            # TODO secondary_phage_id data is for old ticket format.
            tkt.data_dict["secondary_phage_id"] = "none"
            tkt.data_dict["retrieve_record"] = 1
            import_tickets.append(tkt)
            print(f"{new_phage} retrieved from PECAAN.")
            retrieved_tally += 1

    # Now make the import table.
    if len(import_tickets) > 0:
        filepath = basic.prepare_filepath(output_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(output_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2, filepath2,
                                   IMPORT_COLUMNS2, include_headers=True)

    # Report results
    if retrieved_tally > 0:
        print(f"{retrieved_tally} phage(s) were successfully retrieved")
    if len(failed_list) > 0:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")
def get_final_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve 'final' genomes from PhagesDB.

    For each matched MySQL/PhagesDB genome pair, downloads the PhagesDB
    flatfile when it is newer than the MySQL record, and writes an import
    table of 'replace' tickets for the retrieved genomes.
    """
    phagesdb_folder = pathlib.Path(output_folder, "phagesdb")
    phagesdb_folder.mkdir()
    genome_folder = pathlib.Path(phagesdb_folder, GENOMES_DIR)
    genome_folder.mkdir()
    import_tickets = []
    failed_list = []

    # Iterate through each phage in the MySQL database
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Not all phages have associated Genbank-formatted files
        # available on PhagesDB. Check to see if there is a flatfile for
        # this phage. Download the flatfile only if there is a date tag,
        # and only if that date is more recent than the date stored in
        # the MySQL database for that genome. The tagged date only reflects
        # when the file was uploaded into PhagesDB. The date the actual
        # Genbank record was created is stored within the file,
        # and this too could be less recent than the current version in
        # the MySQL database; however, this part gets checked during the
        # import stage.
        set_phagesdb_gnm_date(phagesdb_gnm)
        set_phagesdb_gnm_file(phagesdb_gnm)
        if (phagesdb_gnm.filename != "" and
                phagesdb_gnm.date > mysqldb_gnm.date):
            # Save the file on the hard drive with the same name as
            # stored on PhagesDB
            flatfile_data = phagesdb.retrieve_url_data(phagesdb_gnm.filename)
            if flatfile_data == "":
                # Empty string indicates the download failed.
                failed_list.append(mysqldb_gnm.id)
            else:
                flatfile_filename = phagesdb_gnm.filename.split("/")[-1]
                flatfile_path = pathlib.Path(genome_folder,
                                             flatfile_filename)
                with flatfile_path.open("w") as fh:
                    fh.write(flatfile_data)

                # Create the new import ticket
                # Since the PhagesDB phage has been matched to
                # the MySQL database phage, the AnnotationAuthor field
                # could be assigned from the current mysqldb author
                # variable. However, since this genbank-formatted
                # file is acquired through PhagesDB, both the
                # Annotation status is expected to be 'final' and
                # the Annotation author is expected to be 'hatfull'.
                tkt = ticket.ImportTicket()
                tkt.type = "replace"
                tkt.phage_id = mysqldb_gnm.id
                tkt.data_dict["host_genus"] = "retrieve"
                tkt.data_dict["cluster"] = "retrieve"
                tkt.data_dict["subcluster"] = "retrieve"
                tkt.data_dict["annotation_status"] = "final"
                tkt.data_dict["annotation_author"] = 1
                tkt.description_field = "product"
                tkt.data_dict["accession"] = "retrieve"
                tkt.eval_mode = "final"
                # TODO secondary_phage_id data is for old ticket format.
                tkt.data_dict["secondary_phage_id"] = mysqldb_gnm.id
                tkt.data_dict["retrieve_record"] = 1
                import_tickets.append(tkt)

    count1 = len(import_tickets)
    if count1 > 0:
        print(f"\n\n{count1} phage(s) were retrieved from PhagesDB.")
        filepath = basic.prepare_filepath(phagesdb_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(phagesdb_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2, filepath2,
                                   IMPORT_COLUMNS2, include_headers=True)

    if len(failed_list) > 0:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")

    # Now remove empty folders.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()
    if len(basic.identify_contents(phagesdb_folder, kind=None)) == 0:
        phagesdb_folder.rmdir()