def execute_s_report_export(alchemist, db_filter, conditionals, export_path,
                            verbose=False):
    """Executes export of summary data for all reviewed phams.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param db_filter: A Filter object loaded with pham values to summarize.
    :param conditionals: MySQL WHERE clause-related conditionals applied
        when retrieving phage data.
    :param export_path: A path to a valid dir for new file creation.
    :type export_path: Path
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    phams = db_filter.values

    if verbose:
        print("Retrieving SummaryReport data...")

    # Count, per phage, how many of the reviewed phams it appears in.
    phages_histogram = {}
    for pham in phams:
        db_filter.values = [pham]
        phages = db_filter.build_values(column="phage.PhageID",
                                        where=conditionals)
        increment_histogram(phages, phages_histogram)

    db_filter.values = phams
    db_filter.transpose("phage.PhageID", set_values=True)
    db_filter.sort("phage.DateLastModified")

    # Most recently modified phages first; keep the top five.
    submitted_sorted_phages = db_filter.values
    submitted_sorted_phages.reverse()
    last_submitted_phages = chunk_list(submitted_sorted_phages, 5)[0]

    # Top five most frequently occurring phages across the phams.
    occurance_sorted_phages = sort_histogram_keys(phages_histogram)
    top_occuring_phages = chunk_list(occurance_sorted_phages, 5)[0]

    version_data = mysqldb_basic.get_first_row_data(alchemist.engine,
                                                    "version")

    if verbose:
        print(f"Writing SummaryReport.txt in {export_path.name}")
    s_path = export_path.joinpath("SummaryReport.txt")
    # Context manager guarantees the report file is closed even on error
    # (original opened/closed the handle manually).
    with open(s_path, "w") as s_file:
        s_file.write(f"Phams reviewed on: {time.strftime('%d-%m-%Y')}\n")
        s_file.write(f"Database reviewed: {alchemist.database}\n")
        s_file.write(f"Schema version: {version_data['SchemaVersion']} "
                     f"Database version: {version_data['Version']}\n\n")
        s_file.write("Phams reviewed using the following base conditionals:\n")
        s_file.write(f" {BASE_CONDITIONALS}\n")
        s_file.write("\n\n")
        s_file.write(f"Most occuring phages: {', '.join(top_occuring_phages)}\n")
        s_file.write(f"Phages recently submitted: "
                     f"{', '.join(last_submitted_phages)}\n")

    # Restore the filter state for downstream use.
    db_filter.values = phams
    db_filter.key = "pham.PhamID"
def execute_make_db(alchemist, db_type, values=None, folder_path=None,
                    folder_name=DEFAULT_FOLDER_NAME, verbose=False,
                    filters="", groups=None, db_name=None, threads=1,
                    use_mpi=False, mol_type=None, hash_index=False,
                    parse_seqids=True, gi_mask=False, mask_data=None,
                    mask_id=None, logfile=None, tax_id=None,
                    tax_id_map=None):
    """Executes creation of an external (hhsuite or blast) database
    from MySQL pham data.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param db_type: Type of database to create ("hhsuite" or "blast").
    :type db_type: str
    :param values: List of values to filter database results.
    :type values: list[str]
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the working folder.
    :type folder_name: str
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param filters: A string of filter expressions.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param db_name: Name for the created database; defaults to the
        connected MySQL database name.
    :type db_name: str
    :param threads: Number of processes/threads to spawn.
    :type threads: int
    :param use_mpi: Toggle MPI usage for hhsuite database creation.
    :type use_mpi: bool

    The remaining keyword parameters are makeblastdb options forwarded
    to blast database creation.
    """
    # Avoid the shared mutable default argument pitfall.
    if groups is None:
        groups = []
    # NOTE(review): mol_type is accepted but unused here — presumably
    # consumed by a caller or reserved; confirm before removing.
    if db_name is None:
        db_name = alchemist.database

    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine,
                                                  "version")

    db_filter = pipelines_basic.build_filter(alchemist, "pham", filters,
                                             values=values, verbose=verbose)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(
        db_filter, working_path, groups=groups, verbose=verbose)

    data_cache = {}
    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        pipelines_basic.create_working_dir(mapped_path)

        if db_type == "hhsuite":
            execute_make_hhsuite_database(alchemist, db_filter.values,
                                          mapped_path, db_name, db_version,
                                          data_cache=data_cache,
                                          threads=threads, verbose=verbose,
                                          use_mpi=use_mpi)
        elif db_type == "blast":
            # BUGFIX: forward the caller's makeblastdb options; the
            # original hard-coded their default values here, silently
            # discarding every option the caller passed in.
            execute_make_blast_database(
                alchemist, db_filter.values, mapped_path, db_name,
                db_version, data_cache=data_cache, verbose=verbose,
                hash_index=hash_index, parse_seqids=parse_seqids,
                gi_mask=gi_mask, mask_data=mask_data, mask_id=mask_id,
                logfile=logfile, tax_id=tax_id, tax_id_map=tax_id_map)
def change_version(engine, amount=1):
    """Change the database version number.

    :param engine: SQLAlchemy Engine object able to connect to a MySQL
        database.
    :type engine: Engine
    :param amount: Amount to increment/decrement version number.
    :type amount: int
    """
    # Read the current version row, bump it, and persist the new value.
    row = mysqldb_basic.get_first_row_data(engine, "version")
    old_version = row["Version"]
    updated_version = old_version + amount
    print(f"Updating version from {old_version} to {updated_version}.")
    engine.execute(f"UPDATE version SET Version = {updated_version}")
def test_get_first_row_data_3(self):
    """Verify dictionary is returned when there are two rows of data."""
    first_phage = test_data_utils.get_trixie_phage_data()
    second_phage = test_data_utils.get_trixie_phage_data()
    first_phage["PhageID"] = "Trixie"
    second_phage["PhageID"] = "L5"
    test_db_utils.insert_data(PHAGE, first_phage)
    test_db_utils.insert_data(PHAGE, second_phage)
    # Get all data from table just to confirm there is more than one row.
    all_data = test_db_utils.get_data(test_db_utils.phage_table_query)

    row_data = mysqldb_basic.get_first_row_data(self.engine, TABLE)

    with self.subTest():
        self.assertEqual(len(all_data), 2)
    with self.subTest():
        self.assertIn(COLUMN, row_data.keys())
def get_schema_version(engine):
    """Identify the schema version of the database.

    Schema version data has not been persisted in every schema version,
    so if schema version data is not found, it is deduced from other
    parts of the schema.

    :param engine: SQLAlchemy Engine object able to connect to a MySQL
        database.
    :type engine: Engine
    :returns: The version of the pdm_utils database schema.
    :rtype: int
    """
    # 1. If the version table does not exist, schema_version = 0.
    # 2. If there is no schema_version or SchemaVersion field,
    #    it is either schema_version = 1 or 2.
    # 3. If AnnotationAuthor, Program, AnnotationQC, and RetrieveRecord
    #    columns are in phage table, schema_version = 2.
    db_tables = mysqldb_basic.get_tables(engine, engine.url.database)
    if "version" not in db_tables:
        return 0

    version_columns = mysqldb_basic.get_first_row_data(engine, "version")
    if "schema_version" in version_columns.keys():
        return version_columns["schema_version"]
    if "SchemaVersion" in version_columns.keys():
        return version_columns["SchemaVersion"]

    # No explicit schema version field: deduce 1 vs 2 from the presence
    # of the schema-2 phage-table columns.
    phage_columns = mysqldb_basic.get_columns(engine, engine.url.database,
                                              "phage")
    expected = {"AnnotationAuthor", "Program",
                "AnnotationQC", "RetrieveRecord"}
    if len(expected - phage_columns) == 0:
        return 2
    return 1
def get_summary_data(alchemist, db_filter, verbose=False):
    """Gather phage and version data used to build a summary report.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param db_filter: A Filter object loaded with pham values.
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :returns: Dictionary with recent phages, recurring phages, and
        database version data.
    :rtype: dict
    """
    pham_values = db_filter.values

    recurring_phages = db_filter.retrieve("phage.PhageID", filter=True)

    # Re-key the filter on phage and order by submission date so that
    # the filter values reflect most recently modified phages.
    db_filter.values = pham_values
    db_filter.transpose("phage.PhageID", set_values=True)
    db_filter.sort("phage.DateLastModified")
    recent_phages = db_filter.values

    version_row = mysqldb_basic.get_first_row_data(alchemist.engine,
                                                   "version")

    summary_data = {"recent_phages": recent_phages,
                    "recurring_phages": recurring_phages,
                    "version_data": version_row}
    format_summary_data(summary_data)

    # Restore the filter to its pham-keyed state for downstream use.
    db_filter.values = pham_values
    db_filter.key = "gene.PhamID"
    return summary_data
def test_get_first_row_data_2(self):
    """Verify dictionary is returned when there is one row of data."""
    trixie_data = test_data_utils.get_trixie_phage_data()
    test_db_utils.insert_data(PHAGE, trixie_data)

    row_data = mysqldb_basic.get_first_row_data(self.engine, TABLE)

    self.assertIn(COLUMN, row_data.keys())
def test_get_first_row_data_1(self):
    """Verify empty dictionary is returned when there is no data."""
    row_data = mysqldb_basic.get_first_row_data(self.engine, TABLE)
    self.assertEqual(len(row_data), 0)
def execute_export(alchemist, folder_path, folder_name, pipeline,
                   values=None, verbose=False, table=DEFAULT_TABLE,
                   filters="", groups=None, sort=None,
                   include_columns=None, exclude_columns=None,
                   sequence_columns=False, raw_bytes=False,
                   concatenate=False):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param pipeline: File type that dictates data processing.
    :type pipeline: str
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param table: MySQL table name.
    :type table: str
    :param filters: A list of lists with filter values, grouped by ORs.
    :type filters: list[list[str]]
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param include_columns: A csv export column selection parameter.
    :type include_columns: list[str]
    :param exclude_columns: A csv export column selection parameter.
    :type exclude_columns: list[str]
    :param sequence_columns: A boolean to toggle inclusion of sequence data.
    :type sequence_columns: bool
    :param raw_bytes: A boolean to toggle conversion of raw SQL bytes.
    :type raw_bytes: bool
    :param concatenate: A boolean to toggle concatenation for SeqRecords.
    :type concatenate: bool
    """
    # Guard against the shared mutable default argument pitfall
    # (original used values=[], groups=[], sort=[], etc.).
    values = [] if values is None else values
    groups = [] if groups is None else groups
    sort = [] if sort is None else sort
    include_columns = [] if include_columns is None else include_columns
    exclude_columns = [] if exclude_columns is None else exclude_columns

    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine,
                                                  "version")

    if pipeline == "csv":
        if verbose:
            print("Processing columns for csv export...")
        csv_columns = filter_csv_columns(alchemist, table,
                                         include_columns=include_columns,
                                         exclude_columns=exclude_columns,
                                         sequence_columns=sequence_columns)

    if pipeline in FILTERABLE_PIPELINES:
        if verbose:
            print("Processing columns for sorting...")
        db_filter = apply_filters(alchemist, table, filters,
                                  verbose=verbose)

    if verbose:
        print("Creating export folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    if pipeline == "sql":
        if verbose:
            print("Writing SQL database file...")
        write_database(alchemist, db_version["Version"], export_path)
    elif pipeline in FILTERABLE_PIPELINES:
        conditionals_map = {}
        build_groups_map(db_filter, export_path, conditionals_map,
                         groups=groups, verbose=verbose)

        if verbose:
            print("Prepared query and path structure, beginning export...")

        for mapped_path in conditionals_map.keys():
            db_filter.reset()
            db_filter.values = values

            conditionals = conditionals_map[mapped_path]
            db_filter.values = db_filter.build_values(where=conditionals)

            if db_filter.hits() == 0:
                print(f"No database entries received from {table} "
                      f"for '{mapped_path}'.")
                continue

            if sort:
                sort_columns = get_sort_columns(alchemist, sort)
                db_filter.sort(sort_columns)

            if pipeline in BIOPYTHON_PIPELINES:
                execute_ffx_export(alchemist, mapped_path, export_path,
                                   db_filter.values, pipeline, db_version,
                                   table, concatenate=concatenate,
                                   verbose=verbose)
            else:
                execute_csv_export(db_filter, mapped_path, export_path,
                                   csv_columns, table,
                                   raw_bytes=raw_bytes, verbose=verbose)
    else:
        print("Unrecognized export pipeline, aborting export")
        sys.exit(1)
def execute_export(alchemist, pipeline, folder_path=None,
                   folder_name=DEFAULT_FOLDER_NAME, values=None,
                   verbose=False, dump=False, force=False,
                   table=DEFAULT_TABLE, filters="", groups=None, sort=None,
                   include_columns=None, exclude_columns=None,
                   sequence_columns=False, raw_bytes=False,
                   concatenate=False, db_name=None, phams_out=False,
                   threads=1):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param pipeline: File type that dictates data processing.
    :type pipeline: str
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param dump: A boolean value to toggle dump in current working dir.
    :type dump: bool
    :param table: MySQL table name.
    :type table: str
    :param filters: A list of lists with filter values, grouped by ORs.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param include_columns: A csv export column selection parameter.
    :type include_columns: list[str]
    :param exclude_columns: A csv export column selection parameter.
    :type exclude_columns: list[str]
    :param sequence_columns: A boolean to toggle inclusion of sequence data.
    :type sequence_columns: bool
    :param raw_bytes: A boolean to toggle conversion of raw SQL bytes.
    :type raw_bytes: bool
    :param concatenate: A boolean to toggle concatenation for SeqRecords.
    :type concatenate: bool
    :param db_name: Name for the exported database file.
    :type db_name: str
    :param phams_out: A boolean to toggle export of pham data.
    :type phams_out: bool
    :param threads: Number of processes/threads to spawn during the pipeline.
    :type threads: int
    """
    # Guard against the shared mutable default argument pitfall
    # (original used groups=[], sort=[], include_columns=[], ...).
    groups = [] if groups is None else groups
    sort = [] if sort is None else sort
    include_columns = [] if include_columns is None else include_columns
    exclude_columns = [] if exclude_columns is None else exclude_columns

    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine,
                                                  "version")

    if pipeline == "csv":
        if verbose:
            print("Processing columns for csv export...")
        csv_columns = filter_csv_columns(alchemist, table,
                                         include_columns=include_columns,
                                         exclude_columns=exclude_columns,
                                         sequence_columns=sequence_columns)

    if pipeline in FILTERABLE_PIPELINES:
        db_filter = pipelines_basic.build_filter(alchemist, table, filters,
                                                 values=values,
                                                 verbose=verbose)
        if sort:
            pipelines_basic.add_sort_columns(db_filter, sort,
                                             verbose=verbose)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      dump=dump,
                                                      force=force)

    data_cache = {}
    if pipeline == "sql":
        execute_sql_export(alchemist, export_path, folder_path, db_version,
                           db_name=db_name, dump=dump, force=force,
                           phams_out=phams_out, threads=threads,
                           verbose=verbose)
    elif pipeline in FILTERABLE_PIPELINES:
        conditionals_map = pipelines_basic.build_groups_map(
            db_filter, export_path, groups=groups, verbose=verbose,
            force=force)

        if verbose:
            print("Prepared query and path structure, beginning export...")

        values = db_filter.values
        for mapped_path in conditionals_map.keys():
            db_filter.reset()
            db_filter.values = values

            conditionals = conditionals_map[mapped_path]
            db_filter.values = db_filter.build_values(where=conditionals)

            if db_filter.hits() == 0:
                print(f"No database entries received from {table} "
                      f"for '{mapped_path}'.")
                continue

            if sort:
                sort_columns = get_sort_columns(alchemist, sort)
                db_filter.sort(sort_columns)

            # When dumping straight into the export path, reuse the
            # folder name as the export name.
            export_name = None
            if dump:
                if mapped_path == export_path:
                    export_name = folder_name

            pipelines_basic.create_working_dir(mapped_path, dump=dump,
                                               force=force)

            if pipeline in BIOPYTHON_PIPELINES + ["tbl"]:
                execute_ffx_export(alchemist, mapped_path, export_path,
                                   db_filter.values, pipeline, db_version,
                                   table, concatenate=concatenate,
                                   data_cache=data_cache,
                                   export_name=export_name,
                                   threads=threads, verbose=verbose,
                                   dump=dump)
            elif pipeline == "csv":
                execute_csv_export(db_filter, mapped_path, export_path,
                                   csv_columns, table, raw_bytes=raw_bytes,
                                   data_cache=data_cache, verbose=verbose,
                                   dump=dump)
    else:
        print("Unrecognized export pipeline, aborting export")
        sys.exit(1)
def execute_get_server_db(
        alchemist, database, url, folder_path=None,
        folder_name=RESULTS_FOLDER, db_name=None, config_file=None,
        verbose=False, subdirectory=None, download_only=False,
        get_fastas=False, get_alns=False, force_pull=False,
        get_version=False, schema_version=None):
    """Executes retrieval (and optionally installation) of a database
    package from a server listing.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param database: Name of the database package to retrieve; when None,
        an interactive selection shell is launched.
    :type database: str
    :param url: Base url of the server database listing.
    :type url: str
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the download folder.
    :type folder_name: str
    :param db_name: Local name to install the database under; defaults
        to the remote database name.
    :type db_name: str
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param subdirectory: Optional subdirectory appended to the url.
    :param download_only: Toggle skipping installation after download.
    :type download_only: bool
    :param force_pull: Toggle pulling regardless of local version.
    :type force_pull: bool
    :param get_version: Require a version file to be present remotely.
    :type get_version: bool
    :param schema_version: Schema version to convert to on install.
    """
    if subdirectory:
        url = "".join([url, str(subdirectory), "/"])

    pool = url_basic.create_pool(pipeline=True)
    if database is None:
        # Interactive selection of a database package from the listing.
        print("Loading get_db interactive environment...")
        cmd = pipeline_shells.GetDBCMD(url, name=alchemist.username,
                                       pool=pool)
        cmd.cmdloop(intro=pipeline_shells.GET_DB_CMD_INTRO)

        if cmd.selected is None:
            return

        database = cmd.selected.name
        pkg_url = "".join([cmd.selected.get_abs_path(), "/"])
    else:
        response = url_basic.pool_request(url, pool=pool, pipeline=True)
        directory_listing = url_basic.get_url_listing_dirs(response)
        if database not in directory_listing:
            print("Requested database is not at the specified url.\n"
                  "Please check the database availability.")
            return

        response.close()
        pkg_url = "".join([url, database, "/"])

    if db_name is None:
        db_name = database

    pkg_response = url_basic.pool_request(pkg_url, pool=pool, pipeline=True)

    sql_file_listing = url_basic.get_url_listing_files(pkg_response, "sql")
    version_file_listing = url_basic.get_url_listing_files(
        pkg_response, "version")

    if not sql_file_listing:
        print("Requested database file package does not have a SQL file.\n"
              "Please check SQL file availability at the specified url.")
        return
    database_filename = sql_file_listing[0]

    if not version_file_listing:
        if get_version:
            # BUGFIX: added the missing space between "have" and "a";
            # the original adjacent literals printed "does not havea".
            print("Requested database file package does not have "
                  "a version file.\nPlease check version file availability "
                  "at the specified url.")
            return
        else:
            version_filename = None
    else:
        version_filename = version_file_listing[0]

    if folder_path is None:
        output_path = pipelines_basic.create_working_path(
            pathlib.Path(DEFAULT_OUTPUT_FOLDER), folder_name)
        # Default output location is disposable: clear any stale copy.
        if output_path.is_dir():
            shutil.rmtree(output_path)
        pipelines_basic.create_working_dir(output_path, force=True)
    else:
        output_path = pipelines_basic.create_working_path(
            folder_path, folder_name)
        pipelines_basic.create_working_dir(output_path)

    # Only look for the version file if one was selected.
    if version_filename is not None:
        version_filepath, status1 = prepare_download(output_path, pkg_url,
                                                     version_filename,
                                                     "version")
        # Context manager ensures the handle is closed (the original
        # opened it without ever closing).
        with version_filepath.open(mode="r") as version_filehandle:
            version = int(version_filehandle.readline().rstrip())
    else:
        status1 = True
        version = 0

    if (not force_pull) and (version > 0):
        if db_name in alchemist.databases:
            alchemist.database = db_name
            alchemist.build_engine()

            curr_schema_version = mysqldb.get_schema_version(
                alchemist.engine)
            if curr_schema_version > 2:
                curr_version_data = mysqldb_basic.get_first_row_data(
                    alchemist.engine, "version")
                curr_version = int(curr_version_data.get("Version", 0))
                if curr_version >= version:
                    print(f"Current database version of {db_name} "
                          "is greater than or equal to the database version "
                          "at the specified listing.\nPlease use "
                          "the --force_pull flag if you would like to "
                          "indiscriminately pull and install a database.")
                    return

    db_filepath, status2 = prepare_download(output_path, pkg_url,
                                            database_filename, "sql",
                                            verbose=verbose)

    if not status1 or not status2:
        print("Unable to download data from server.\n Aborting pipeline.")
        return

    # If downloading from server, user may have selected to not
    # install the database file.
    if (not download_only) and (not get_fastas) and (not get_alns):
        install_db(alchemist, db_name, db_filepath=db_filepath,
                   config_file=config_file, schema_version=schema_version,
                   verbose=verbose)

        # The output folder was only created for downloading from server.
        print("Removing downloaded data.")
        shutil.rmtree(output_path)