Пример #1
0
def execute_s_report_export(alchemist, db_filter, conditionals, export_path, 
                                                    verbose=False):
    """Executes export of summary data for all reviewed phams.

    :param alchemist: A connected and fully build
    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param export_path: A path to a valid dir for new file creation.
    :type export_path: Path
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    phams = db_filter.values

    if verbose:
        print(f"Retrieving SummaryReport data...")
    phages_histogram = {}
 
    for pham in phams:
        db_filter.values = [pham]
        phages = db_filter.build_values(column="phage.PhageID",
                                        where=conditionals)

        increment_histogram(phages, phages_histogram)

    db_filter.values = phams
    db_filter.transpose("phage.PhageID", set_values=True)
    db_filter.sort("phage.DateLastModified")

    submitted_sorted_phages = db_filter.values
    submitted_sorted_phages.reverse()
    last_submitted_phages = chunk_list(submitted_sorted_phages, 5)[0]

    occurance_sorted_phages = sort_histogram_keys(phages_histogram)
    top_occuring_phages = chunk_list(occurance_sorted_phages, 5)[0]

    version_data = mysqldb_basic.get_first_row_data(alchemist.engine, "version")

    if verbose:
        print(f"Writing SummaryReport.txt in {export_path.name}")

    s_path = export_path.joinpath("SummaryReport.txt")
    s_file = open(s_path, "w")

    s_file.write(f"Phams reviewed on: {time.strftime('%d-%m-%Y')}\n")
    s_file.write(f"Database reviewed: {alchemist.database}\n")
    s_file.write(f"Schema version: {version_data['SchemaVersion']} "
                 f"Database version: {version_data['Version']}\n\n") 
    s_file.write(f"Phams reviewed using the following base conditionals:\n")
    s_file.write(f"    {BASE_CONDITIONALS}\n")

    s_file.write(f"\n\n")
    s_file.write(f"Most occuring phages: {', '.join(top_occuring_phages)}\n")
    s_file.write(f"Phages recently submitted: {', '.join(last_submitted_phages)}\n")
    s_file.close()

    db_filter.values = phams
    db_filter.key = "pham.PhamID"
Пример #2
0
def execute_make_db(alchemist, db_type, values=None, folder_path=None,
                    folder_name=DEFAULT_FOLDER_NAME, verbose=False, filters="",
                    groups=[], db_name=None, threads=1, use_mpi=False,
                    mol_type=None, hash_index=False, parse_seqids=True,
                    gi_mask=False, mask_data=None, mask_id=None, logfile=None,
                    tax_id=None, tax_id_map=None):
    if db_name is None:
        db_name = alchemist.database

    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine, "version")

    db_filter = pipelines_basic.build_filter(alchemist, "pham", filters,
                                             values=values,
                                             verbose=verbose)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(
                                            db_filter, working_path,
                                            groups=groups, verbose=verbose)

    data_cache = {}
    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        pipelines_basic.create_working_dir(mapped_path)

        if db_type == "hhsuite":
            execute_make_hhsuite_database(alchemist, db_filter.values,
                                          mapped_path, db_name, db_version,
                                          data_cache=data_cache,
                                          threads=threads, verbose=verbose,
                                          use_mpi=use_mpi)
        elif db_type == "blast":
            execute_make_blast_database(
                    alchemist, db_filter.values, mapped_path, db_name,
                    db_version, data_cache=data_cache, verbose=verbose,
                    hash_index=False, parse_seqids=True, gi_mask=False,
                    mask_data=None, mask_id=None, logfile=None, tax_id=None,
                    tax_id_map=None)
Пример #3
0
def change_version(engine, amount=1):
    """Change the database version number.

    :param engine: SQLAlchemy Engine object able to connect to a MySQL database.
    :type engine: Engine
    :param amount: Amount to increment/decrement version number.
    :type amount: int
    """
    result = mysqldb_basic.get_first_row_data(engine, "version")
    current = result["Version"]
    new = current + amount
    print(f"Updating version from {current} to {new}.")
    statement = (f"UPDATE version SET Version = {new}")
    engine.execute(statement)
Пример #4
0
 def test_get_first_row_data_3(self):
     """Verify dictionary is returned when there are two rows of data."""
     phage_data1 = test_data_utils.get_trixie_phage_data()
     phage_data2 = test_data_utils.get_trixie_phage_data()
     phage_data1["PhageID"] = "Trixie"
     phage_data2["PhageID"] = "L5"
     test_db_utils.insert_data(PHAGE, phage_data1)
     test_db_utils.insert_data(PHAGE, phage_data2)
     # Get all data from table just to confirm there is more than one row.
     all_data = test_db_utils.get_data(test_db_utils.phage_table_query)
     data = mysqldb_basic.get_first_row_data(self.engine, TABLE)
     with self.subTest():
         self.assertEqual(len(all_data), 2)
     with self.subTest():
         self.assertTrue(COLUMN in data.keys())
Пример #5
0
def get_schema_version(engine):
    """Identify the schema version of the database_versions_list.

    Schema version data has not been persisted in every schema version,
    so if schema version data is not found, it is deduced from other
    parts of the schema.

    :param engine: SQLAlchemy Engine object able to connect to a MySQL database.
    :type engine: Engine
    :returns: The version of the pdm_utils database schema.
    :rtype: int
    """
    # 1. If the version table does not exist, schema_version = 0.
    # 2. If there is no schema_version or SchemaVersion field,
    #    it is either schema_version = 1 or 2.
    # 3. If AnnotationAuthor, Program, AnnotationQC, and RetrieveRecord
    #    columns are in phage table, schema_version = 2.

    db_tables = mysqldb_basic.get_tables(engine, engine.url.database)
    if "version" in db_tables:
        version_table = True
    else:
        version_table = False

    if version_table == True:
        version_columns = mysqldb_basic.get_first_row_data(engine, "version")
        if "schema_version" in version_columns.keys():
            schema_version = version_columns["schema_version"]
        elif "SchemaVersion" in version_columns.keys():
            schema_version = version_columns["SchemaVersion"]
        else:
            phage_columns = mysqldb_basic.get_columns(engine,
                                                      engine.url.database,
                                                      "phage")
            expected = {
                "AnnotationAuthor", "Program", "AnnotationQC", "RetrieveRecord"
            }
            diff = expected - phage_columns
            if len(diff) == 0:
                schema_version = 2
            else:
                schema_version = 1
    else:
        schema_version = 0
    return schema_version
Пример #6
0
def get_summary_data(alchemist, db_filter, verbose=False):
    phams = db_filter.values

    phages_data = db_filter.retrieve("phage.PhageID", filter=True)

    db_filter.values = phams
    db_filter.transpose("phage.PhageID", set_values=True)
    db_filter.sort("phage.DateLastModified")

    version_data = mysqldb_basic.get_first_row_data(alchemist.engine,
                                                    "version")

    summary_data = {}
    summary_data["recent_phages"] = db_filter.values
    summary_data["recurring_phages"] = phages_data
    summary_data["version_data"] = version_data

    format_summary_data(summary_data)

    db_filter.values = phams
    db_filter.key = "gene.PhamID"
    return summary_data
Пример #7
0
 def test_get_first_row_data_2(self):
     """Verify dictionary is returned when there is one row of data."""
     phage_data = test_data_utils.get_trixie_phage_data()
     test_db_utils.insert_data(PHAGE, phage_data)
     data = mysqldb_basic.get_first_row_data(self.engine, TABLE)
     self.assertTrue(COLUMN in data.keys())
Пример #8
0
 def test_get_first_row_data_1(self):
     """Verify empty dictionary is returned when there is no data."""
     data = mysqldb_basic.get_first_row_data(self.engine, TABLE)
     self.assertEqual(len(data.keys()), 0)
Пример #9
0
def execute_export(alchemist,
                   folder_path,
                   folder_name,
                   pipeline,
                   values=[],
                   verbose=False,
                   table=DEFAULT_TABLE,
                   filters="",
                   groups=[],
                   sort=[],
                   include_columns=[],
                   exclude_columns=[],
                   sequence_columns=False,
                   raw_bytes=False,
                   concatenate=False):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param pipeline: File type that dictates data processing.
    :type pipeline: str
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param table: MySQL table name.
    :type table: str
    :param filters: A list of lists with filter values, grouped by ORs.
    :type filters: list[list[str]]
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :param include_columns: A csv export column selection parameter.
    :type include_columns: list[str]
    :param exclude_columns: A csv export column selection parameter.
    :type exclude_columns: list[str]
    :param sequence_columns: A boolean to toggle inclusion of sequence data.
    :type sequence_columns: bool
    :param concatenate: A boolean to toggle concaternation for SeqRecords.
    :type concaternate: bool
    """
    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine, "version")

    if pipeline == "csv":
        if verbose:
            print("Processing columns for csv export...")
        csv_columns = filter_csv_columns(alchemist,
                                         table,
                                         include_columns=include_columns,
                                         exclude_columns=exclude_columns,
                                         sequence_columns=sequence_columns)

    if pipeline in FILTERABLE_PIPELINES:
        if verbose:
            print("Processing columns for sorting...")
        db_filter = apply_filters(alchemist, table, filters, verbose=verbose)

    if verbose:
        print("Creating export folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    if pipeline == "sql":
        if verbose:
            print("Writing SQL database file...")
        write_database(alchemist, db_version["Version"], export_path)

    elif pipeline in FILTERABLE_PIPELINES:
        conditionals_map = {}
        build_groups_map(db_filter,
                         export_path,
                         conditionals_map,
                         groups=groups,
                         verbose=verbose)

        if verbose:
            print("Prepared query and path structure, beginning export...")

        for mapped_path in conditionals_map.keys():
            db_filter.reset()
            db_filter.values = values

            conditionals = conditionals_map[mapped_path]
            db_filter.values = db_filter.build_values(where=conditionals)

            if db_filter.hits() == 0:
                print(f"No database entries received from {table} "
                      f"for '{mapped_path}'.")
                continue

            if sort:
                sort_columns = get_sort_columns(alchemist, sort)
                db_filter.sort(sort_columns)

            if pipeline in BIOPYTHON_PIPELINES:
                execute_ffx_export(alchemist,
                                   mapped_path,
                                   export_path,
                                   db_filter.values,
                                   pipeline,
                                   db_version,
                                   table,
                                   concatenate=concatenate,
                                   verbose=verbose)
            else:
                execute_csv_export(db_filter,
                                   mapped_path,
                                   export_path,
                                   csv_columns,
                                   table,
                                   raw_bytes=raw_bytes,
                                   verbose=verbose)
    else:
        print("Unrecognized export pipeline, aborting export")
        sys.exit(1)
Пример #10
0
def execute_export(alchemist, pipeline, folder_path=None,
                   folder_name=DEFAULT_FOLDER_NAME, values=None, verbose=False,
                   dump=False, force=False, table=DEFAULT_TABLE, filters="",
                   groups=[], sort=[], include_columns=[], exclude_columns=[],
                   sequence_columns=False, raw_bytes=False, concatenate=False,
                   db_name=None, phams_out=False, threads=1):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param pipeline: File type that dictates data processing.
    :type pipeline: str
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param force: A boolean to toggle aggresive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param dump: A boolean value to toggle dump in current working dir.
    :type dump: bool
    :param table: MySQL table name.
    :type table: str
    :param filters: A list of lists with filter values, grouped by ORs.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param include_columns: A csv export column selection parameter.
    :type include_columns: list[str]
    :param exclude_columns: A csv export column selection parameter.
    :type exclude_columns: list[str]
    :param sequence_columns: A boolean to toggle inclusion of sequence data.
    :type sequence_columns: bool
    :param concatenate: A boolean to toggle concaternation for SeqRecords.
    :type concaternate: bool
    :param threads: Number of processes/threads to spawn during the pipeline
    :type threads: int
    """
    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine, "version")

    if pipeline == "csv":
        if verbose:
            print("Processing columns for csv export...")
        csv_columns = filter_csv_columns(alchemist, table,
                                         include_columns=include_columns,
                                         exclude_columns=exclude_columns,
                                         sequence_columns=sequence_columns)

    if pipeline in FILTERABLE_PIPELINES:
        db_filter = pipelines_basic.build_filter(alchemist, table, filters,
                                                 values=values,
                                                 verbose=verbose)
        if sort:
            pipelines_basic.add_sort_columns(db_filter, sort, verbose=verbose)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path, folder_name,
                                                      dump=dump, force=force)

    data_cache = {}
    if pipeline == "sql":
        execute_sql_export(alchemist, export_path, folder_path, db_version,
                           db_name=db_name, dump=dump, force=force,
                           phams_out=phams_out, threads=threads,
                           verbose=verbose)
    elif pipeline in FILTERABLE_PIPELINES:
        conditionals_map = pipelines_basic.build_groups_map(
                                                db_filter, export_path,
                                                groups=groups,
                                                verbose=verbose, force=force)

        if verbose:
            print("Prepared query and path structure, beginning export...")

        values = db_filter.values
        for mapped_path in conditionals_map.keys():
            db_filter.reset()
            db_filter.values = values

            conditionals = conditionals_map[mapped_path]
            db_filter.values = db_filter.build_values(where=conditionals)

            if db_filter.hits() == 0:
                print(f"No database entries received from {table} "
                      f"for '{mapped_path}'.")
                continue

            if sort:
                sort_columns = get_sort_columns(alchemist, sort)
                db_filter.sort(sort_columns)

            export_name = None
            if dump:
                if mapped_path == export_path:
                    export_name = folder_name

            pipelines_basic.create_working_dir(mapped_path, dump=dump,
                                               force=force)

            if pipeline in BIOPYTHON_PIPELINES + ["tbl"]:
                execute_ffx_export(alchemist, mapped_path, export_path,
                                   db_filter.values, pipeline, db_version,
                                   table, concatenate=concatenate,
                                   data_cache=data_cache,
                                   export_name=export_name, threads=threads,
                                   verbose=verbose, dump=dump)
            elif pipeline == "csv":
                execute_csv_export(db_filter, mapped_path, export_path,
                                   csv_columns, table, raw_bytes=raw_bytes,
                                   data_cache=data_cache,
                                   verbose=verbose, dump=dump)
    else:
        print("Unrecognized export pipeline, aborting export")
        sys.exit(1)
Пример #11
0
def execute_get_server_db(alchemist,
                          database,
                          url,
                          folder_path=None,
                          folder_name=RESULTS_FOLDER,
                          db_name=None,
                          config_file=None,
                          verbose=False,
                          subdirectory=None,
                          download_only=False,
                          get_fastas=False,
                          get_alns=False,
                          force_pull=False,
                          get_version=False,
                          schema_version=None):

    if subdirectory:
        url = "".join([url, str(subdirectory), "/"])

    pool = url_basic.create_pool(pipeline=True)
    if database is None:
        print("Loading get_db interactive environment...")
        cmd = pipeline_shells.GetDBCMD(url, name=alchemist.username, pool=pool)
        cmd.cmdloop(intro=pipeline_shells.GET_DB_CMD_INTRO)

        if cmd.selected is None:
            return

        database = cmd.selected.name
        pkg_url = "".join([cmd.selected.get_abs_path(), "/"])
    else:
        response = url_basic.pool_request(url, pool=pool, pipeline=True)
        directory_listing = url_basic.get_url_listing_dirs(response)

        if database not in directory_listing:
            print("Requested database is not at the specified url.\n"
                  "Please check the database availability.")
            return

        response.close()
        pkg_url = "".join([url, database, "/"])

    if db_name is None:
        db_name = database

    pkg_response = url_basic.pool_request(pkg_url, pool=pool, pipeline=True)

    sql_file_listing = url_basic.get_url_listing_files(pkg_response, "sql")
    version_file_listing = url_basic.get_url_listing_files(
        pkg_response, "version")

    if not sql_file_listing:
        print("Requested database file package does not have a SQL file.\n"
              "Please check SQL file availability at the specified url.")
        return
    database_filename = sql_file_listing[0]

    if not version_file_listing:
        if get_version:
            print("Requested database file package does not have"
                  "a version file.\nPlease check version file availability "
                  "at the specified url.")
            return
        else:
            version_filename = None
    else:
        version_filename = version_file_listing[0]

    if folder_path is None:
        output_path = pipelines_basic.create_working_path(
            pathlib.Path(DEFAULT_OUTPUT_FOLDER), folder_name)
        if output_path.is_dir():
            shutil.rmtree(output_path)
        pipelines_basic.create_working_dir(output_path, force=True)
    else:
        output_path = pipelines_basic.create_working_path(
            folder_path, folder_name)
        pipelines_basic.create_working_dir(output_path)

    # Only look for version file is selected.
    if version_filename is not None:
        version_filepath, status1 = prepare_download(output_path, pkg_url,
                                                     version_filename,
                                                     "version")
        version_filehandle = version_filepath.open(mode="r")
        version = int(version_filehandle.readline().rstrip())
    else:
        status1 = True
        version = 0

    if (not force_pull) and (version > 0):
        if db_name in alchemist.databases:
            alchemist.database = db_name
            alchemist.build_engine()

            curr_schema_version = mysqldb.get_schema_version(alchemist.engine)
            if curr_schema_version > 2:
                curr_version_data = mysqldb_basic.get_first_row_data(
                    alchemist.engine, "version")
                curr_version = int(curr_version_data.get("Version", 0))
                if curr_version >= version:
                    print(f"Current database version of {db_name} "
                          "is greater than or equal to the database version "
                          "at the specified listing.\nPlease use "
                          "the --force_pull flag if you would like to "
                          "indiscriminately pull and install a database.")
                    return

    db_filepath, status2 = prepare_download(output_path,
                                            pkg_url,
                                            database_filename,
                                            "sql",
                                            verbose=verbose)
    if not status1 or not status2:
        print("Unable to download data from server.\n Aborting pipeline.")
        return

    # If downloading from server, user may have selected to not
    # install the database file.
    if (not download_only) and (not get_fastas) and (not get_alns):
        install_db(alchemist,
                   db_name,
                   db_filepath=db_filepath,
                   config_file=config_file,
                   schema_version=schema_version,
                   verbose=verbose)

        # The output folder was only created for downloading from server.
        print("Removing downloaded data.")
        shutil.rmtree(output_path)