def setUp(self):
    """Create two tickets with distinct PhageIDs, each wrapped in its own bundle."""
    self.ticket1 = ticket.ImportTicket()
    self.ticket1.phage_id = "Trixie"
    self.bundle1 = bundle.Bundle()
    self.bundle1.ticket = self.ticket1

    self.ticket2 = ticket.ImportTicket()
    self.ticket2.phage_id = "L5"
    self.bundle2 = bundle.Bundle()
    self.bundle2.ticket = self.ticket2
def create_phagesdb_ticket(phage_id):
    """Create ImportTicket for PhagesDB genome."""
    # Since the PhagesDB phage has been matched to the MySQL database
    # phage, the AnnotationAuthor field could be assigned from the
    # current mysqldb author variable. However, since this
    # genbank-formatted file is acquired through PhagesDB, the
    # Annotation status is expected to be 'final' and the
    # Annotation author is expected to be 'hatfull'.
    tkt = ticket.ImportTicket()
    tkt.type = "replace"
    tkt.phage_id = phage_id
    tkt.description_field = "product"
    tkt.eval_mode = "final"
    field_data = {
        "host_genus": "retain",       # formerly "retrieve"
        "cluster": "retain",          # formerly "retrieve"
        "subcluster": "retain",       # formerly "retrieve"
        "annotation_status": "final",
        "annotation_author": 1,
        "accession": "retain",        # formerly "retrieve"
        "retrieve_record": 1,
        }
    tkt.data_dict = field_data
    return tkt
def setUp(self):
    """Provide an empty bundle, a flat_file/mysql genome pair, and a ticket."""
    self.bndl = bundle.Bundle()
    self.tkt = ticket.ImportTicket()

    flat_file_gnm = genome.Genome()
    flat_file_gnm.type = "flat_file"
    self.genome1 = flat_file_gnm

    mysql_gnm = genome.Genome()
    mysql_gnm.type = "mysql"
    self.genome2 = mysql_gnm
def parse_import_ticket_data(data_dict):
    """Converts import ticket data to a ImportTicket object.

    :param data_dict:
        A dictionary of data with the following keys:

            0. Import action type
            1. Primary PhageID
            2. Host
            3. Cluster
            4. Subcluster
            5. Status
            6. Annotation Author (int)
            7. Feature field
            8. Accession
            9. Retrieve Record (int)
            10. Eval mode
    :type data_dict: dict
    :returns: A pdm_utils ImportTicket object.
    :rtype: ImportTicket
    """
    ticket_attributes = constants.IMPORT_TABLE_STRUCTURE["valid_ticket"]
    # Keys that are not core ticket attributes hold genome field data.
    other_attributes = data_dict.keys() - ticket_attributes

    tkt = ticket.ImportTicket()
    tkt.data_dict = data_dict
    # Core ticket fields are copied directly onto the ticket object.
    for attr in ticket_attributes:
        setattr(tkt, attr, data_dict[attr])

    # Bin the remaining fields by how their data will be obtained.
    # Any value other than the recognized keywords is literal data to add.
    data_retrieve = set()
    data_retain = set()
    data_add = set()
    data_parse = set()
    for attr in other_attributes:
        attr_value = data_dict[attr]
        if attr_value == "retrieve":
            data_retrieve.add(attr)
        elif attr_value == "retain":
            data_retain.add(attr)
        elif attr_value == "parse":
            data_parse.add(attr)
        else:
            data_add.add(attr)

    tkt.data_retrieve = data_retrieve
    tkt.data_retain = data_retain
    tkt.data_parse = data_parse
    tkt.data_add = data_add
    return tkt
def setUp(self):
    """Assemble a bundle holding a ticket, two genomes (one with CDS,
    source, tRNA and tmRNA features), two genome pairs, and a set of
    correct/error evaluations."""
    self.ticket1 = ticket.ImportTicket()

    self.src1 = source.Source()
    self.src2 = source.Source()
    self.src3 = source.Source()
    self.src1.id = "L5_SRC_1"
    self.src2.id = "L5_SRC_2"
    self.src3.id = "L5_SRC_3"

    self.cds1 = cds.Cds()
    self.cds2 = cds.Cds()
    self.cds3 = cds.Cds()
    self.cds1.id = "L5_CDS_1"
    self.cds2.id = "L5_CDS_2"
    self.cds3.id = "L5_CDS_3"

    self.trna1 = trna.Trna()
    self.trna2 = trna.Trna()
    self.trna3 = trna.Trna()
    self.trna1.id = "L5_TRNA_1"
    self.trna2.id = "L5_TRNA_2"
    self.trna3.id = "L5_TRNA_3"

    self.tmrna1 = tmrna.Tmrna()
    self.tmrna2 = tmrna.Tmrna()
    self.tmrna3 = tmrna.Tmrna()
    self.tmrna1.id = "L5_TMRNA_1"
    self.tmrna2.id = "L5_TMRNA_2"
    self.tmrna3.id = "L5_TMRNA_3"

    # Only the first two features of each type are attached to genome1;
    # the third remains unattached for the tests to use.
    self.genome1 = genome.Genome()
    self.genome1.type = "flat_file"
    self.genome1.cds_features = [self.cds1, self.cds2]
    self.genome1.source_features = [self.src1, self.src2]
    self.genome1.trna_features = [self.trna1, self.trna2]
    self.genome1.tmrna_features = [self.tmrna1, self.tmrna2]

    self.genome2 = genome.Genome()
    self.genome2.type = "mysql"

    self.genome_pair1 = genomepair.GenomePair()
    self.genome_pair2 = genomepair.GenomePair()

    self.bndl = bundle.Bundle()
    self.bndl.ticket = self.ticket1
    self.bndl.genome_dict[self.genome1.type] = self.genome1
    self.bndl.genome_dict[self.genome2.type] = self.genome2
    self.bndl.genome_pair_dict["genome_pair1"] = self.genome_pair1
    self.bndl.genome_pair_dict["genome_pair2"] = self.genome_pair2

    self.eval_correct1 = evaluation.Evaluation(status="correct")
    self.eval_correct2 = evaluation.Evaluation(status="correct")
    self.eval_error1 = evaluation.Evaluation(status="error")
    self.eval_error2 = evaluation.Evaluation(status="error")
def setUp(self):
    """Pair two fresh genomes and stage reference dates for comparisons."""
    self.genome1 = genome.Genome()
    self.genome2 = genome.Genome()
    self.tkt = ticket.ImportTicket()

    pair = genomepair.GenomePair()
    pair.genome1 = self.genome1
    pair.genome2 = self.genome2
    self.genome_pair = pair

    # Two distinct dates, plus a second copy of the later date so tests
    # can compare equal-but-separate datetime objects.
    fmt = '%m/%d/%Y'
    self.date_jan1 = datetime.strptime('1/1/2000', fmt)
    self.date_feb1 = datetime.strptime('2/1/2000', fmt)
    self.date_feb1_b = datetime.strptime('2/1/2000', fmt)
def setUp(self):
    """Create a ticket whose data_dict is fully populated."""
    self.data_dict = {
        "host_genus": "Mycobacterium smegmatis",
        "accession": "ABC123.1",
        "annotation_status": "final",
        "cluster": "A",
        "subcluster": "A2",
        "annotation_author": 1,
        "retrieve_record": 1,
        }
    self.tkt1 = ticket.ImportTicket()
    self.tkt1.phage_id = "Trixie_Draft"
    self.tkt1.data_dict = self.data_dict
def test_identify_duplicates_6(self):
    """Verify two tickets with multiple duplicates
    do generate multiple errors."""
    tkt_a = ticket.ImportTicket()
    tkt_a.id = 1
    tkt_a.type = "replace"
    tkt_a.phage_id = "Trixie"

    tkt_b = ticket.ImportTicket()
    tkt_b.id = 1
    tkt_b.type = "replace"
    tkt_b.phage_id = "Trixie"

    # Both the id and the phage_id collide, so each dupe set has one entry.
    id_dupes, phage_id_dupes = tickets.identify_duplicates(
        [tkt_a, tkt_b], null_set={"none"})
    with self.subTest():
        self.assertEqual(len(id_dupes), 1)
    with self.subTest():
        self.assertEqual(len(phage_id_dupes), 1)
def test_identify_duplicates_2(self):
    """Verify two tickets with 'none' duplicates
    do not generate an error."""
    tkt_a = ticket.ImportTicket()
    tkt_a.id = "none"
    tkt_a.type = "replace"
    tkt_a.phage_id = "none"

    tkt_b = ticket.ImportTicket()
    tkt_b.id = "none"
    tkt_b.type = "replace"
    tkt_b.phage_id = "none"

    # Values in the null set are exempt from duplicate detection.
    id_dupes, phage_id_dupes = tickets.identify_duplicates(
        [tkt_a, tkt_b], null_set={"none"})
    with self.subTest():
        self.assertEqual(len(id_dupes), 0)
    with self.subTest():
        self.assertEqual(len(phage_id_dupes), 0)
def test_identify_duplicates_1(self):
    """Verify no duplicates are produced."""
    tkt_a = ticket.ImportTicket()
    tkt_a.id = 1
    tkt_a.type = "replace"
    tkt_a.phage_id = "Trixie"

    tkt_b = ticket.ImportTicket()
    tkt_b.id = 2
    tkt_b.type = "replace"
    tkt_b.phage_id = "L5"

    # Distinct ids and phage_ids: both dupe sets should be empty.
    id_dupes, phage_id_dupes = tickets.identify_duplicates(
        [tkt_a, tkt_b], null_set={"none"})
    with self.subTest():
        self.assertEqual(len(id_dupes), 0)
    with self.subTest():
        self.assertEqual(len(phage_id_dupes), 0)
def save_files_and_tkts(record_list, accession_dict, output_folder):
    """Save flat files retrieved from GenBank and create import tickets."""
    import_tickets = []
    genome_folder = pathlib.Path(output_folder, GENOMES_DIR)
    genome_folder.mkdir()
    for record in record_list:
        # Strip any version suffix so the accession matches the dict key.
        accession = record.name.split('.')[0]
        gnm = accession_dict[accession]

        # Write the record to disk as a GenBank-formatted flat file.
        ncbi_filename = f"{gnm.name.lower()}__{accession}.gb"
        flatfile_path = pathlib.Path(genome_folder, ncbi_filename)
        SeqIO.write(record, str(flatfile_path), "genbank")

        # Build the companion import ticket for this record.
        tkt = ticket.ImportTicket()
        tkt.type = "replace"
        tkt.phage_id = gnm.id
        tkt.description_field = "product"
        tkt.eval_mode = "auto"
        tkt.data_dict["host_genus"] = gnm.host_genus
        tkt.data_dict["cluster"] = gnm.cluster
        tkt.data_dict["subcluster"] = gnm.subcluster
        tkt.data_dict["annotation_status"] = gnm.annotation_status
        tkt.data_dict["annotation_author"] = gnm.annotation_author
        # Accession is set to 'parse' to ensure that during import,
        # the file's accession is directly compared to the database
        # record's accession.
        tkt.data_dict["accession"] = "parse"
        # TODO secondary_phage_id data is for old ticket format.
        tkt.data_dict["secondary_phage_id"] = gnm.id
        tkt.data_dict["retrieve_record"] = 1
        import_tickets.append(tkt)

    # Now make the import table.
    if import_tickets:
        filepath = basic.prepare_filepath(output_folder,
                                          "legacy_import_table.csv")
        old_style = convert_tickets_to_dict(import_tickets, old_format=True)
        basic.export_data_dict(old_style, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(output_folder,
                                               "import_table.csv")
            new_style = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(new_style, filepath2, IMPORT_COLUMNS2,
                                   include_headers=True)
def create_draft_ticket(name):
    """Create ImportTicket for draft genome."""
    tkt = ticket.ImportTicket()
    tkt.type = "add"
    tkt.phage_id = name
    tkt.description_field = "product"
    tkt.eval_mode = "draft"
    # 'retrieve' marks fields whose values are filled in during import;
    # a draft genome carries no accession.
    draft_data = {
        "host_genus": "retrieve",
        "cluster": "retrieve",
        "subcluster": "retrieve",
        "annotation_status": "draft",
        "annotation_author": 1,
        "accession": "none",
        "retrieve_record": 1,
        }
    tkt.data_dict = draft_data
    return tkt
def create_genbank_ticket(gnm):
    """Create ImportTicket for GenBank record."""
    # Accession is set to 'parse' to ensure that during import,
    # the file's accession is directly compared to the database
    # record's accession.
    tkt = ticket.ImportTicket()
    tkt.type = "replace"
    tkt.phage_id = gnm.id
    tkt.description_field = "product"
    tkt.eval_mode = "auto"
    field_data = {
        "host_genus": "retain",         # formerly gnm.host_genus
        "cluster": "retain",            # formerly gnm.cluster
        "subcluster": "retain",         # formerly gnm.subcluster
        "annotation_status": "retain",  # formerly gnm.annotation_status
        "annotation_author": "retain",  # formerly gnm.annotation_author
        "accession": "parse",
        "retrieve_record": "retain",    # formerly 1
        }
    tkt.data_dict = field_data
    return tkt
def setUp(self):
    """Assemble a bundle with a ticket, two genomes (one carrying CDS and
    source features), two genome pairs, and correct/error evals."""
    self.ticket1 = ticket.ImportTicket()

    self.src1 = source.Source()
    self.src2 = source.Source()
    self.src3 = source.Source()
    self.src1.id = "L5_SRC_1"
    self.src2.id = "L5_SRC_2"
    self.src3.id = "L5_SRC_3"

    self.cds1 = cds.Cds()
    self.cds2 = cds.Cds()
    self.cds3 = cds.Cds()
    self.cds1.id = "L5_CDS_1"
    self.cds2.id = "L5_CDS_2"
    self.cds3.id = "L5_CDS_3"

    # Only the first two features of each type are attached to genome1.
    self.genome1 = genome.Genome()
    self.genome1.type = "flat_file"
    self.genome1.cds_features.extend([self.cds1, self.cds2])
    self.genome1.source_features.extend([self.src1, self.src2])

    self.genome2 = genome.Genome()
    self.genome2.type = "mysql"

    self.genome_pair1 = genomepair.GenomePair()
    self.genome_pair2 = genomepair.GenomePair()

    self.bndl = bundle.Bundle()
    self.bndl.ticket = self.ticket1
    self.bndl.genome_dict[self.genome1.type] = self.genome1
    self.bndl.genome_dict[self.genome2.type] = self.genome2
    self.bndl.genome_pair_dict["genome_pair1"] = self.genome_pair1
    self.bndl.genome_pair_dict["genome_pair2"] = self.genome_pair2

    self.eval_correct1 = eval.Eval(status="correct")
    self.eval_correct2 = eval.Eval(status="correct")
    self.eval_error1 = eval.Eval(status="error")
    self.eval_error2 = eval.Eval(status="error")
def setUp(self):
    """Provide a fresh, unpopulated ticket for each test."""
    # Empty ticket to test simple methods
    self.tkt = ticket.ImportTicket()
def get_final_data(output_folder, matched_genomes):
    """Run sub-pipeline to retrieve 'final' genomes from PhagesDB.

    :param output_folder: directory in which the 'phagesdb' working
        folder (and its genomes subfolder) is created.
    :param matched_genomes: pairs where genome1 is the MySQL record and
        genome2 is the matched PhagesDB record.
    """
    phagesdb_folder = pathlib.Path(output_folder, "phagesdb")
    phagesdb_folder.mkdir()
    genome_folder = pathlib.Path(phagesdb_folder, GENOMES_DIR)
    genome_folder.mkdir()
    import_tickets = []
    failed_list = []

    # Iterate through each phage in the MySQL database
    for gnm_pair in matched_genomes:
        mysqldb_gnm = gnm_pair.genome1
        phagesdb_gnm = gnm_pair.genome2

        # Not all phages have associated Genbank-formatted files
        # available on PhagesDB. Check to see if there is a flatfile for
        # this phage. Download the flatfile only if there is a date tag,
        # and only if that date is more recent than the date stored in
        # the MySQL database for that genome. The tagged date only
        # reflects when the file was uploaded into PhagesDB. The date
        # the actual Genbank record was created is stored within the
        # file, and this too could be less recent than the current
        # version in the MySQL database; however, this part gets checked
        # during the import stage.
        set_phagesdb_gnm_date(phagesdb_gnm)
        set_phagesdb_gnm_file(phagesdb_gnm)
        if (phagesdb_gnm.filename != "" and
                phagesdb_gnm.date > mysqldb_gnm.date):
            # Save the file on the hard drive with the same name as
            # stored on PhagesDB
            flatfile_data = phagesdb.retrieve_url_data(phagesdb_gnm.filename)
            if flatfile_data == "":
                # Empty response indicates the download failed.
                failed_list.append(mysqldb_gnm.id)
            else:
                flatfile_filename = phagesdb_gnm.filename.split("/")[-1]
                flatfile_path = pathlib.Path(genome_folder,
                                             flatfile_filename)
                with flatfile_path.open("w") as fh:
                    fh.write(flatfile_data)
                # Create the new import ticket
                # Since the PhagesDB phage has been matched to
                # the MySQL database phage, the AnnotationAuthor field
                # could be assigned from the current mysqldb author
                # variable. However, since this genbank-formatted
                # file is acquired through PhagesDB, both the
                # Annotation status is expected to be 'final' and
                # the Annotation author is expected to be 'hatfull'.
                tkt = ticket.ImportTicket()
                tkt.type = "replace"
                tkt.phage_id = mysqldb_gnm.id
                tkt.data_dict["host_genus"] = "retrieve"
                tkt.data_dict["cluster"] = "retrieve"
                tkt.data_dict["subcluster"] = "retrieve"
                tkt.data_dict["annotation_status"] = "final"
                tkt.data_dict["annotation_author"] = 1
                tkt.description_field = "product"
                tkt.data_dict["accession"] = "retrieve"
                tkt.eval_mode = "final"
                # TODO secondary_phage_id data is for old ticket format.
                tkt.data_dict["secondary_phage_id"] = mysqldb_gnm.id
                tkt.data_dict["retrieve_record"] = 1
                import_tickets.append(tkt)

    # Write the import table(s) only if at least one ticket was created.
    count1 = len(import_tickets)
    if count1 > 0:
        print(f"\n\n{count1} phage(s) were retrieved from PhagesDB.")
        filepath = basic.prepare_filepath(phagesdb_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(phagesdb_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2, filepath2,
                                   IMPORT_COLUMNS2, include_headers=True)

    # Report any genomes whose flatfile download failed; pauses for user
    # acknowledgement before continuing.
    if len(failed_list) > 0:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")

    # Now remove empty folders.
    if len(basic.identify_contents(genome_folder, kind=None)) == 0:
        genome_folder.rmdir()
    if len(basic.identify_contents(phagesdb_folder, kind=None)) == 0:
        phagesdb_folder.rmdir()
def retrieve_drafts(output_folder, phage_list):
    """Retrieve auto-annotated 'draft' genomes from PECAAN.

    :param output_folder: directory in which the genomes folder is created.
    :param phage_list: names of new phages to request from PECAAN.
    """
    print(f"\n\nRetrieving {len(phage_list)} new phages from PECAAN")
    genome_folder = pathlib.Path(output_folder, GENOMES_DIR)
    genome_folder.mkdir()

    # Keep track of how many genomes were retrieved from PECAAN
    retrieved_tally = 0
    failed_list = []
    import_tickets = []

    # Iterate through each row in the file
    for new_phage in phage_list:
        pecaan_link = constants.PECAAN_PREFIX + new_phage
        response = phagesdb.retrieve_url_data(pecaan_link)
        if response == "":
            # Empty response indicates the download failed.
            print(f"Error: unable to retrieve {new_phage} draft genome.")
            print(pecaan_link)
            failed_list.append(new_phage)
        else:
            # Save the retrieved data to disk.
            pecaan_filename = f"{new_phage}.txt"
            pecaan_filepath = pathlib.Path(genome_folder, pecaan_filename)
            with pecaan_filepath.open("w") as fh:
                fh.write(response)

            # Build the companion import ticket for this draft genome.
            tkt = ticket.ImportTicket()
            tkt.type = "add"
            tkt.phage_id = new_phage
            tkt.data_dict["host_genus"] = "retrieve"
            tkt.data_dict["cluster"] = "retrieve"
            tkt.data_dict["subcluster"] = "retrieve"
            tkt.data_dict["annotation_status"] = "draft"
            tkt.data_dict["annotation_author"] = 1
            tkt.description_field = "product"
            tkt.data_dict["accession"] = "none"
            tkt.eval_mode = "draft"
            # TODO secondary_phage_id data is for old ticket format.
            tkt.data_dict["secondary_phage_id"] = "none"
            tkt.data_dict["retrieve_record"] = 1
            import_tickets.append(tkt)

            print(f"{new_phage} retrieved from PECAAN.")
            retrieved_tally += 1

    # Now make the import table.
    if len(import_tickets) > 0:
        filepath = basic.prepare_filepath(output_folder,
                                          "legacy_import_table.csv")
        import_tickets1 = convert_tickets_to_dict(import_tickets,
                                                  old_format=True)
        basic.export_data_dict(import_tickets1, filepath, IMPORT_COLUMNS1)

        # TODO new dictwriter. Use this block instead of above once the
        # new import script is functioning.
        if BOTH:
            filepath2 = basic.prepare_filepath(output_folder,
                                               "import_table.csv")
            import_tickets2 = convert_tickets_to_dict(import_tickets)
            basic.export_data_dict(import_tickets2, filepath2,
                                   IMPORT_COLUMNS2, include_headers=True)

    # Report results
    if retrieved_tally > 0:
        print(f"{retrieved_tally} phage(s) were successfully retrieved")

    # Failures pause for user acknowledgement before continuing.
    if len(failed_list) > 0:
        print(f"{len(failed_list)} phage(s) failed to be retrieved:")
        for element in failed_list:
            print(element)
        input("\n\nPress ENTER to continue.")