def test_parse_genome_data_4(self):
    """Check CDS-only parsing when no PhageID filter is supplied.

    Three genomes are expected back (Trixie, D29, L5); each one is
    checked for its CDS feature count, and Trixie additionally for its
    sequence.
    """
    genomes = mysqldb.parse_genome_data(
        self.engine,
        phage_query=self.phage_query,
        gene_query=self.gene_query,
    )
    # Key the genomes by id so the checks below do not rely on list order.
    by_id = {gnm.id: gnm for gnm in genomes}

    with self.subTest():
        self.assertEqual(len(genomes), 3)

    # Trixie: sequence plus three CDS features.
    with self.subTest():
        self.assertEqual(by_id["Trixie"].seq, "AATT")
    with self.subTest():
        self.assertEqual(len(by_id["Trixie"].cds_features), 3)
    with self.subTest():
        self.assertEqual(by_id["Trixie"].cds_features[0].genome_length, 4)
    with self.subTest():
        self.assertEqual(by_id["Trixie"].cds_features[1].genome_length, 4)

    # D29: a single CDS feature.
    with self.subTest():
        self.assertEqual(len(by_id["D29"].cds_features), 1)
    with self.subTest():
        self.assertEqual(by_id["D29"].cds_features[0].genome_length, 5)

    # L5: no CDS features.
    with self.subTest():
        self.assertEqual(len(by_id["L5"].cds_features), 0)
def test_parse_genome_data_2(self):
    """Check that an invalid PhageID yields an empty Genome list."""
    genomes = mysqldb.parse_genome_data(
        self.engine,
        phage_id_list=["EagleEye"],
        phage_query=self.phage_query,
    )
    self.assertEqual(len(genomes), 0)
def test_parse_genome_data_3(self):
    """Check one valid PhageID parsed with CDS, tRNA and tmRNA queries.

    The single Trixie genome is checked for its basic attributes and
    for the count and genome_length of each feature type.
    """
    genomes = mysqldb.parse_genome_data(
        self.engine,
        phage_id_list=["Trixie"],
        phage_query=PHAGE_QUERY,
        gene_query=GENE_QUERY,
        trna_query=TRNA_QUERY,
        tmrna_query=TMRNA_QUERY,
    )

    # Genome-level attributes.
    with self.subTest():
        self.assertEqual(len(genomes), 1)
    with self.subTest():
        self.assertEqual(genomes[0].id, "Trixie")
    with self.subTest():
        self.assertEqual(genomes[0].seq, "AATT")
    with self.subTest():
        self.assertEqual(genomes[0].type, "")
    with self.subTest():
        self.assertEqual(genomes[0].date, constants.EMPTY_DATE)

    # CDS features.
    with self.subTest():
        self.assertEqual(len(genomes[0].cds_features), 3)
    with self.subTest():
        self.assertEqual(genomes[0].cds_features[0].genome_length, 4)

    # tRNA features.
    with self.subTest():
        self.assertEqual(len(genomes[0].trna_features), 2)
    with self.subTest():
        self.assertEqual(genomes[0].trna_features[0].genome_length, 4)

    # tmRNA features.
    with self.subTest():
        self.assertEqual(len(genomes[0].tmrna_features), 1)
    with self.subTest():
        self.assertEqual(genomes[0].tmrna_features[0].genome_length, 4)
def get_genome_seqrecords(alchemist, values=[], verbose=False):
    """Parse genomes from the database and convert each to a SeqRecord.

    Args:
        alchemist: Object exposing an ``engine`` attribute, which is
            handed to ``mysqldb.parse_genome_data``.
        values: PhageIDs forwarded as ``phage_id_list``.
            NOTE(review): the default is a shared list object; this
            function does not modify it, it is only passed through.
        verbose: When true, print a progress line per genome.

    Returns:
        A list with one ``flat_files.genome_to_seqrecord`` result per
        parsed genome, in the order the genomes were returned.
    """
    parsed_genomes = mysqldb.parse_genome_data(
        alchemist.engine,
        phage_id_list=values,
        phage_query=PHAGE_QUERY,
        gene_query=GENE_QUERY,
    )

    records = []
    for genome in parsed_genomes:
        # CDS features are processed before the genome is converted.
        process_cds_features(genome)
        if verbose:
            print(f"Converting {genome.name}...")
        record = flat_files.genome_to_seqrecord(genome)
        records.append(record)
    return records
def test_parse_genome_data_4(self):
    """Check CDS, tRNA and tmRNA parsing when no PhageID filter is given.

    Three genomes are expected back (Trixie, D29, L5); each one is
    checked for its CDS, tRNA and tmRNA feature counts.
    """
    genomes = mysqldb.parse_genome_data(
        self.engine,
        phage_query=PHAGE_QUERY,
        gene_query=GENE_QUERY,
        trna_query=TRNA_QUERY,
        tmrna_query=TMRNA_QUERY,
    )
    # Key the genomes by id so the checks below do not rely on list order.
    by_id = {gnm.id: gnm for gnm in genomes}

    with self.subTest():
        self.assertEqual(len(genomes), 3)

    # Trixie: three CDS, two tRNA, one tmRNA.
    with self.subTest():
        self.assertEqual(by_id["Trixie"].seq, "AATT")
    with self.subTest():
        self.assertEqual(len(by_id["Trixie"].cds_features), 3)
    with self.subTest():
        self.assertEqual(by_id["Trixie"].cds_features[0].genome_length, 4)
    with self.subTest():
        self.assertEqual(by_id["Trixie"].cds_features[1].genome_length, 4)
    with self.subTest():
        self.assertEqual(len(by_id["Trixie"].trna_features), 2)
    with self.subTest():
        self.assertEqual(len(by_id["Trixie"].tmrna_features), 1)

    # D29: one CDS, one tRNA, no tmRNA.
    with self.subTest():
        self.assertEqual(len(by_id["D29"].cds_features), 1)
    with self.subTest():
        self.assertEqual(by_id["D29"].cds_features[0].genome_length, 5)
    with self.subTest():
        self.assertEqual(len(by_id["D29"].trna_features), 1)
    with self.subTest():
        self.assertEqual(by_id["D29"].trna_features[0].id, "D29_1")
    with self.subTest():
        self.assertEqual(len(by_id["D29"].tmrna_features), 0)

    # L5: no CDS, no tRNA, one tmRNA.
    with self.subTest():
        self.assertEqual(len(by_id["L5"].cds_features), 0)
    with self.subTest():
        self.assertEqual(len(by_id["L5"].trna_features), 0)
    with self.subTest():
        self.assertEqual(len(by_id["L5"].tmrna_features), 1)
    with self.subTest():
        self.assertEqual(by_id["L5"].tmrna_features[0].id, "L5_1")
def build_id_record_map(alchemist, phageids):
    """Map SeqRecord ids to SeqRecords for the requested PhageIDs.

    Args:
        alchemist: Object exposing an ``engine`` attribute, which is
            handed to ``mysqldb.parse_genome_data``.
        phageids: PhageIDs forwarded as ``phage_id_list``. A falsy value
            (empty or None) short-circuits without touching the database.

    Returns:
        A dict keyed by ``record.id`` whose values are the records
        produced by ``flat_files.genome_to_seqrecord``; empty when
        ``phageids`` is falsy.
    """
    # Nothing requested: skip the database round trip entirely.
    if not phageids:
        return {}

    parsed_genomes = mysqldb.parse_genome_data(
        alchemist.engine,
        phage_id_list=phageids,
        phage_query=PHAGE_QUERY,
        gene_query=GENE_QUERY,
        trna_query=TRNA_QUERY,
        tmrna_query=TMRNA_QUERY,
    )
    converted = map(flat_files.genome_to_seqrecord, parsed_genomes)
    return {record.id: record for record in converted}
def get_single_genome(alchemist, phageid, get_features=False, data_cache=None):
    """Parse and return the genome for a single PhageID.

    Args:
        alchemist: Object exposing an ``engine`` attribute, which is
            handed to ``mysqldb.parse_genome_data``.
        phageid: PhageID to look up; sent as a one-element
            ``phage_id_list``.
        get_features: When true, the gene, tRNA and tmRNA queries are
            supplied as well; otherwise they are passed as None.
        data_cache: Optional mapping; when not None, the genome is stored
            in it under ``phageid``.

    Returns:
        The first element of the ``parse_genome_data`` result.

    Raises:
        IndexError: If ``parse_genome_data`` returns an empty sequence.
    """
    # Feature queries are only supplied when features were requested.
    if get_features:
        feature_queries = (GENE_QUERY, TRNA_QUERY, TMRNA_QUERY)
    else:
        feature_queries = (None, None, None)
    gene_query, trna_query, tmrna_query = feature_queries

    parsed_genomes = mysqldb.parse_genome_data(
        alchemist.engine,
        phage_id_list=[phageid],
        phage_query=PHAGE_QUERY,
        gene_query=gene_query,
        trna_query=trna_query,
        tmrna_query=tmrna_query,
    )
    genome = parsed_genomes[0]

    if data_cache is not None:
        data_cache[phageid] = genome
    return genome
def test_parse_genome_data_1(self):
    """Check that a valid PhageID yields one correctly built Genome.

    Only the phage query is supplied, so no CDS features are expected;
    ``gnm_type`` should be reflected in the genome's ``type``.
    """
    genomes = mysqldb.parse_genome_data(
        self.engine,
        phage_id_list=["L5"],
        phage_query=self.phage_query,
        gnm_type="mysql",
    )

    with self.subTest():
        self.assertEqual(len(genomes), 1)
    with self.subTest():
        self.assertEqual(genomes[0].id, "L5")
    with self.subTest():
        self.assertEqual(genomes[0].seq, "ATCG")
    with self.subTest():
        self.assertEqual(genomes[0].type, "mysql")
    with self.subTest():
        self.assertEqual(genomes[0].date, constants.EMPTY_DATE)
    with self.subTest():
        self.assertEqual(len(genomes[0].cds_features), 0)
def test_parse_genome_data_3(self):
    """Check one valid PhageID parsed together with its CDS features.

    The single Trixie genome is checked for its basic attributes and
    for its CDS feature count and first-feature genome_length.
    """
    genomes = mysqldb.parse_genome_data(
        self.engine,
        phage_id_list=["Trixie"],
        phage_query=self.phage_query,
        gene_query=self.gene_query,
    )

    # Genome-level attributes.
    with self.subTest():
        self.assertEqual(len(genomes), 1)
    with self.subTest():
        self.assertEqual(genomes[0].id, "Trixie")
    with self.subTest():
        self.assertEqual(genomes[0].seq, "AATT")
    with self.subTest():
        self.assertEqual(genomes[0].type, "")
    with self.subTest():
        self.assertEqual(genomes[0].date, constants.EMPTY_DATE)

    # CDS features.
    with self.subTest():
        self.assertEqual(len(genomes[0].cds_features), 3)
    with self.subTest():
        self.assertEqual(genomes[0].cds_features[0].genome_length, 4)
def execute_ffx_export(alchemist, output_path, file_format, db_version,
                       table="phage", values=[], verbose=False):
    """Export genomes from the database as SeqRecords in a flat-file format.

    Genomes are parsed with their genes, converted to SeqRecords, tagged
    with the database version and then handed to ``write_seqrecord``.

    Args:
        alchemist: Object exposing an ``engine`` attribute, which is
            handed to ``mysqldb.parse_genome_data``.
        output_path: Destination forwarded to ``write_seqrecord``.
        file_format: Format forwarded to ``write_seqrecord``.
        db_version: Value passed to ``append_database_version`` for every
            record.
        table: Table to export; only ``"phage"`` is supported.
        values: PhageIDs forwarded as ``phage_id_list``.
            NOTE(review): the default is a shared list object; this
            function does not modify it, it is only passed through.
        verbose: When true, print progress messages.

    Raises:
        ValueError: If ``table`` is anything other than ``"phage"``.
    """
    # Validate once, before any work or output. Previously the check was
    # duplicated and the second ``else`` branch could never be reached.
    if table != "phage":
        raise ValueError(
            f"Unsupported table '{table}'; only 'phage' can be exported.")

    if verbose:
        # The messages used to reference the undefined names ``data_name``
        # and ``sql_handle``, so verbose runs failed with NameError.
        # NOTE(review): assumes the handler exposes the database name as
        # ``database``; falls back to None if it does not - confirm.
        database = getattr(alchemist, "database", None)
        print(f"Retrieving {table} data from {database}...")

    genomes = mysqldb.parse_genome_data(
        alchemist.engine,
        phage_id_list=values,
        phage_query="SELECT * FROM phage",
        gene_query="SELECT * FROM gene",
    )

    if verbose:
        print(f"Converting {table} data to SeqRecord format...")

    seqrecords = []
    for gnm in genomes:
        # CDS SeqFeatures are set up before the genome is converted.
        set_cds_seqfeatures(gnm)
        if verbose:
            print(f"Converting {gnm.name}...")
        seqrecords.append(flat_files.genome_to_seqrecord(gnm))

    if verbose:
        print("Appending database version...")
    for record in seqrecords:
        append_database_version(record, db_version)

    write_seqrecord(seqrecords, file_format, output_path, verbose=verbose)
def main(unparsed_args_list):
    """Run the main retrieve_updates pipeline.

    Args:
        unparsed_args_list: Command line arguments, handed to
            ``parse_args``.

    The process exits with status 1 if the working directory cannot be
    created or if no PhagesDB genome data is parsed.
    """
    # --- Command line arguments and working directory -------------------
    args = parse_args(unparsed_args_list)
    force = args.force_download

    args.output_folder = basic.set_path(
        args.output_folder, kind="dir", expect=True)
    working_dir = pathlib.Path(RESULTS_FOLDER)
    working_path = basic.make_new_dir(
        args.output_folder, working_dir, attempt=50)
    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    # --- Configuration (from file and/or defaults) ----------------------
    config = configfile.build_complete_config(args.config_file)
    mysql_creds = config["mysql"]
    ncbi_creds = config["ncbi"]

    # --- Database connection and schema check ---------------------------
    print("Preparing genome data sets from the MySQL database...")
    alchemist = AlchemyHandler(
        database=args.database,
        username=mysql_creds["user"],
        password=mysql_creds["password"],
    )
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # --- Existing MySQL data, used to decide what needs updating --------
    phage_query = (
        "SELECT PhageID, Name, HostGenus, Status, Cluster, "
        "DateLastModified, Accession, RetrieveRecord, Subcluster, "
        "AnnotationAuthor FROM phage"
    )
    mysqldb_genomes = mysqldb.parse_genome_data(
        engine=engine, phage_query=phage_query, gnm_type="mysqldb")
    engine.dispose()

    mysqldb_genome_dict = {}
    for genome in mysqldb_genomes:
        if force:
            # With the default date, every retrieved record will be newer.
            genome.date = constants.EMPTY_DATE
        mysqldb_genome_dict[genome.id] = genome

    # --- PhagesDB data ---------------------------------------------------
    needs_phagesdb = (args.updates or args.final or args.draft) is True
    if needs_phagesdb:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(
            phagesdb_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            phagesdb_phages_dict, gnm_type="phagesdb", seq=False)

        # Nothing parsed from PhagesDB: there is nothing to compare against.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # Index 0 holds the matched genomes, index 1 the PhagesDB ids
        # without a MySQL match.
        match_result = match_genomes(mysqldb_genome_dict,
                                     phagesdb_genome_dict)
        matched_genomes = match_result[0]
        unmatched_phagesdb_ids = match_result[1]

    # --- Requested retrieval steps ---------------------------------------
    if args.updates is True:
        get_update_data(working_path, matched_genomes)

    if args.final is True:
        get_final_data(working_path, matched_genomes)

    if args.genbank is True:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_creds,
                         args.genbank_results, force=force)

    if args.draft is True:
        if force:
            # Also download every draft genome already in the database.
            drafts = get_matched_drafts(matched_genomes)
            unmatched_phagesdb_ids |= drafts
        get_draft_data(working_path, unmatched_phagesdb_ids)
def main(unparsed_args_list):
    """Run the main retrieve_updates pipeline.

    Args:
        unparsed_args_list: Command line arguments, handed to
            ``parse_args``.

    The process exits with status 1 if the working directory cannot be
    created or if no PhagesDB genome data is parsed.
    """
    # --- Command line arguments and working directory -------------------
    args = parse_args(unparsed_args_list)
    date = time.strftime("%Y%m%d")

    args.output_folder = basic.set_path(
        args.output_folder, kind="dir", expect=True)
    working_dir = pathlib.Path(f"{date}_get_data")
    working_path = basic.make_new_dir(
        args.output_folder, working_dir, attempt=10)
    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # --- Database connection and schema check ---------------------------
    print("Preparing genome data sets from the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # --- Existing MySQL data, used to decide what needs updating --------
    phage_query = (
        "SELECT PhageID, Name, HostGenus, Status, Cluster, "
        "DateLastModified, Accession, RetrieveRecord, Subcluster, "
        "AnnotationAuthor FROM phage"
    )
    mysqldb_genomes = mysqldb.parse_genome_data(
        engine=engine, phage_query=phage_query, gnm_type="mysqldb")
    engine.dispose()

    mysqldb_genome_dict = {}
    for genome in mysqldb_genomes:
        mysqldb_genome_dict[genome.id] = genome

    # --- PhagesDB data ---------------------------------------------------
    needs_phagesdb = (args.updates or args.final or args.draft) is True
    if needs_phagesdb:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(
            phagesdb_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            phagesdb_phages_dict, gnm_type="phagesdb", seq=False)

        # Nothing parsed from PhagesDB: there is nothing to compare against.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # Index 0 holds the matched genomes, index 1 the PhagesDB ids
        # without a MySQL match.
        match_output = match_genomes(mysqldb_genome_dict,
                                     phagesdb_genome_dict)
        matched_genomes = match_output[0]
        unmatched_phagesdb_ids = match_output[1]

    # --- Requested retrieval steps ---------------------------------------
    if args.updates is True:
        get_update_data(working_path, matched_genomes)

    if args.final is True:
        get_final_data(working_path, matched_genomes)

    if args.genbank is True:
        get_genbank_data(working_path, mysqldb_genome_dict,
                         ncbi_cred_dict, args.genbank_results)

    if args.draft is True:
        get_draft_data(working_path, unmatched_phagesdb_ids)

    print("\n\n\nRetrieve updates script completed.")