Example #1
def execute_make_db(alchemist, db_type, values=None, folder_path=None,
                    folder_name=DEFAULT_FOLDER_NAME, verbose=False, filters="",
                    groups=[], db_name=None, threads=1, use_mpi=False,
                    mol_type=None, hash_index=False, parse_seqids=True,
                    gi_mask=False, mask_data=None, mask_id=None, logfile=None,
                    tax_id=None, tax_id_map=None):
    if db_name is None:
        db_name = alchemist.database

    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine, "version")

    db_filter = pipelines_basic.build_filter(alchemist, "pham", filters,
                                             values=values,
                                             verbose=verbose)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(
                                            db_filter, working_path,
                                            groups=groups, verbose=verbose)

    data_cache = {}
    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        pipelines_basic.create_working_dir(mapped_path)

        if db_type == "hhsuite":
            execute_make_hhsuite_database(alchemist, db_filter.values,
                                          mapped_path, db_name, db_version,
                                          data_cache=data_cache,
                                          threads=threads, verbose=verbose,
                                          use_mpi=use_mpi)
        elif db_type == "blast":
            execute_make_blast_database(
                    alchemist, db_filter.values, mapped_path, db_name,
                    db_version, data_cache=data_cache, verbose=verbose,
                    hash_index=hash_index, parse_seqids=parse_seqids,
                    gi_mask=gi_mask, mask_data=mask_data, mask_id=mask_id,
                    logfile=logfile, tax_id=tax_id, tax_id_map=tax_id_map)
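
A minimal usage sketch (not from the source): it assumes a pdm_utils AlchemyHandler connected as described in the library's documentation, and the credentials and database name below are placeholders.

# Hypothetical driver for execute_make_db; all credentials are placeholders.
from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler(username="user", password="pass",
                           database="Actinobacteriophage")
alchemist.connect()  # builds the SQLAlchemy engine for the MySQL database

# Build a BLAST database from every pham under ./blast_db/
execute_make_db(alchemist, "blast", folder_name="blast_db", verbose=True)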
Example #2
def execute_pham_review(alchemist,
                        folder_path=None,
                        folder_name=DEFAULT_FOLDER_NAME,
                        no_review=False,
                        values=[],
                        filters="",
                        groups=[],
                        sort=[],
                        s_report=False,
                        gr_reports=False,
                        psr_reports=False,
                        production=False,
                        verbose=False,
                        force=False):
    """Executes the entirety of the pham review pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param no_review: A boolean to skip filtering of phams by discrepancies.
    :type no_review: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param filters: A MySQL formatted WHERE clause string.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param s_report: A boolean to toggle export of a summary report.
    :type s_report: bool
    :param gr_reports: A boolean to toggle export of additional pham info.
    :type gr_reports: bool
    :param psr_reports: A boolean to toggle export of pham summary reports.
    :type psr_reports: bool
    :param production: Toggles additional filters for production-level review.
    :type production: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    db_filter = pipelines_basic.build_filter(alchemist,
                                             "gene.PhamID",
                                             filters,
                                             values=values,
                                             verbose=verbose)
    if production:
        db_filter.add(BASE_CONDITIONALS)
        db_filter.update()
    else:
        conditionals = db_filter.build_where_clauses()
        db_filter.values = db_filter.build_values(where=conditionals)

    if not db_filter.values:
        print("Current settings produced no database hits.")
        sys.exit(1)
    else:
        if verbose:
            print(f"Identified {db_filter.hits()} phams to review...")

    if not no_review:
        review_phams(db_filter, verbose=verbose)

    if sort:
        db_filter.sort(sort)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      force=force)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        export_path,
                                                        groups=groups,
                                                        verbose=verbose,
                                                        force=force)

    if verbose:
        print("Prepared query and path structure, beginning review export...")
    original_phams = db_filter.values
    gr_data_cache = {}
    psr_data_cache = {}
    for mapped_path in conditionals_map.keys():
        conditionals = conditionals_map[mapped_path]
        db_filter.values = original_phams
        db_filter.values = db_filter.build_values(where=conditionals)

        pipelines_basic.create_working_dir(mapped_path, force=force)

        review_data = get_review_data(alchemist, db_filter, verbose=verbose)
        write_report(review_data,
                     mapped_path,
                     REVIEW_HEADER,
                     csv_name="FunctionReport",
                     verbose=verbose)

        if s_report:
            summary_data = get_summary_data(alchemist, db_filter)
            write_summary_report(alchemist,
                                 summary_data,
                                 mapped_path,
                                 verbose=verbose)

        if gr_reports or psr_reports:
            execute_pham_report_export(alchemist,
                                       db_filter,
                                       mapped_path,
                                       gr_reports=gr_reports,
                                       psr_reports=psr_reports,
                                       gr_data_cache=gr_data_cache,
                                       psr_data_cache=psr_data_cache,
                                       verbose=verbose)
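
The per-group loop above snapshots the filter's values and rebuilds them under each group's conditionals. A self-contained sketch of that pattern, using a toy stand-in for the real db_filter:

# ToyFilter is an invented stand-in; the real filter builds SQL WHERE clauses.
class ToyFilter:
    def __init__(self, values):
        self.values = values

    def build_values(self, where=None):
        return [v for v in self.values if where is None or where(v)]

db_filter = ToyFilter([1, 2, 3, 4, 5, 6])
original_phams = db_filter.values
conditionals_map = {"evens": lambda v: v % 2 == 0,
                    "odds": lambda v: v % 2 == 1}

for name, conditional in conditionals_map.items():
    db_filter.values = original_phams  # restore the full value set
    db_filter.values = db_filter.build_values(where=conditional)
    print(name, db_filter.values)      # evens [2, 4, 6] / odds [1, 3, 5]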
Example #3
def execute_build_pan(alchemist,
                      hhdb_path=None,
                      pan_name=None,
                      folder_path=None,
                      folder_name=DEFAULT_FOLDER_NAME,
                      values=None,
                      verbose=False,
                      filters="",
                      groups=[],
                      threads=1,
                      M=50,
                      aD=75,
                      mD=65,
                      B=0.2,
                      PANgraph_out=None):
    db_filter = pipelines_basic.build_filter(alchemist,
                                             "pham",
                                             filters,
                                             values=values)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        working_path,
                                                        groups=groups,
                                                        verbose=verbose)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        if pan_name is None:
            pan_name = folder_name

        pipelines_basic.create_working_dir(mapped_path)
        pan_path = mapped_path.joinpath(".".join([pan_name, "sqlite"]))

        pan_alchemist = pan_handling.build_pan(pan_path)
        pan_alchemist.expire_on_commit = True

        pham_data_dir = mapped_path.joinpath("pham_alns")
        pham_data_dir.mkdir()
        data_maps_tuple = create_pham_alns(alchemist.engine,
                                           db_filter.values,
                                           pham_data_dir,
                                           threads=threads,
                                           M=M,
                                           verbose=verbose)

        build_pan_nodes(pan_alchemist,
                        db_filter.values,
                        data_maps_tuple,
                        threads=threads,
                        verbose=verbose)

        cent_data_dir = mapped_path.joinpath("cent_alns")
        cent_data_dir.mkdir()
        build_pan_neighborhoods(alchemist,
                                pan_alchemist,
                                db_filter.values,
                                cent_data_dir,
                                data_maps_tuple,
                                aD=aD,
                                mD=mD,
                                B=B,
                                threads=threads,
                                verbose=verbose)

        hmm_data_dir = mapped_path.joinpath("pham_hhrs")
        hmm_data_dir.mkdir()

        if hhdb_path is not None:
            # NOTE: town building is not implemented yet, so this branch
            # raises immediately; the profile/town steps below are
            # unreachable until the raise is removed.
            raise NotImplementedError(
                "Town building is not implemented yet... :(")
            if verbose:
                print("...Calculating pham HMM profiles...")
            hmm_path_map = alignment.create_hmms(data_maps_tuple[0],
                                                 name=True,
                                                 M=M,
                                                 threads=threads,
                                                 verbose=verbose)
            build_pan_towns(alchemist,
                            pan_alchemist,
                            hhdb_path,
                            hmm_data_dir,
                            hmm_path_map,
                            threads=threads,
                            verbose=verbose)

        if PANgraph_out is not None:
            pan_graph = pan_handling.to_networkx(pan_alchemist)
            pde_fileio.write_graph(pan_graph,
                                   PANgraph_out,
                                   mapped_path,
                                   pan_name,
                                   edge_weights=PAN_GRAPH_EDGEWEIGHTS)

        shutil.rmtree(Path(TEMP_DIR))
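
One small readability note on the PAN path construction above: joinpath plus a manual ".".join is equivalent to a single f-string path segment, as this self-contained pathlib check shows.

from pathlib import Path

mapped_path = Path("/tmp/pan_example")
pan_name = "Actino"

# The spelling used above...
a = mapped_path.joinpath(".".join([pan_name, "sqlite"]))
# ...is equivalent to the more direct form:
b = mapped_path / f"{pan_name}.sqlite"
assert a == b  # both are /tmp/pan_example/Actino.sqlite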
Example #4
def execute_export(alchemist, pipeline, folder_path=None,
                   folder_name=DEFAULT_FOLDER_NAME, values=None, verbose=False,
                   dump=False, force=False, table=DEFAULT_TABLE, filters="",
                   groups=[], sort=[], include_columns=[], exclude_columns=[],
                   sequence_columns=False, raw_bytes=False, concatenate=False,
                   db_name=None, phams_out=False, threads=1):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param pipeline: File type that dictates data processing.
    :type pipeline: str
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param dump: A boolean value to toggle dump in current working dir.
    :type dump: bool
    :param table: MySQL table name.
    :type table: str
    :param filters: A MySQL formatted WHERE clause string.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param include_columns: A csv export column selection parameter.
    :type include_columns: list[str]
    :param exclude_columns: A csv export column selection parameter.
    :type exclude_columns: list[str]
    :param sequence_columns: A boolean to toggle inclusion of sequence data.
    :type sequence_columns: bool
    :param concatenate: A boolean to toggle concatenation for SeqRecords.
    :type concatenate: bool
    :param threads: Number of processes/threads to spawn during the pipeline
    :type threads: int
    """
    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine, "version")

    if pipeline == "csv":
        if verbose:
            print("Processing columns for csv export...")
        csv_columns = filter_csv_columns(alchemist, table,
                                         include_columns=include_columns,
                                         exclude_columns=exclude_columns,
                                         sequence_columns=sequence_columns)

    if pipeline in FILTERABLE_PIPELINES:
        db_filter = pipelines_basic.build_filter(alchemist, table, filters,
                                                 values=values,
                                                 verbose=verbose)
        if sort:
            pipelines_basic.add_sort_columns(db_filter, sort, verbose=verbose)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path, folder_name,
                                                      dump=dump, force=force)

    data_cache = {}
    if pipeline == "sql":
        execute_sql_export(alchemist, export_path, folder_path, db_version,
                           db_name=db_name, dump=dump, force=force,
                           phams_out=phams_out, threads=threads,
                           verbose=verbose)
    elif pipeline in FILTERABLE_PIPELINES:
        conditionals_map = pipelines_basic.build_groups_map(
                                                db_filter, export_path,
                                                groups=groups,
                                                verbose=verbose, force=force)

        if verbose:
            print("Prepared query and path structure, beginning export...")

        values = db_filter.values
        for mapped_path in conditionals_map.keys():
            db_filter.reset()
            db_filter.values = values

            conditionals = conditionals_map[mapped_path]
            db_filter.values = db_filter.build_values(where=conditionals)

            if db_filter.hits() == 0:
                print(f"No database entries received from {table} "
                      f"for '{mapped_path}'.")
                continue

            if sort:
                sort_columns = get_sort_columns(alchemist, sort)
                db_filter.sort(sort_columns)

            export_name = None
            if dump:
                if mapped_path == export_path:
                    export_name = folder_name

            pipelines_basic.create_working_dir(mapped_path, dump=dump,
                                               force=force)

            if pipeline in BIOPYTHON_PIPELINES + ["tbl"]:
                execute_ffx_export(alchemist, mapped_path, export_path,
                                   db_filter.values, pipeline, db_version,
                                   table, concatenate=concatenate,
                                   data_cache=data_cache,
                                   export_name=export_name, threads=threads,
                                   verbose=verbose, dump=dump)
            elif pipeline == "csv":
                execute_csv_export(db_filter, mapped_path, export_path,
                                   csv_columns, table, raw_bytes=raw_bytes,
                                   data_cache=data_cache,
                                   verbose=verbose, dump=dump)
    else:
        print("Unrecognized export pipeline, aborting export")
        sys.exit(1)
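
A hedged usage sketch, reusing the connected alchemist from the Example #1 sketch; the filter string, group column, and sort column are illustrative placeholders in the MySQL-style syntax the pipeline expects.

# Hypothetical invocation of the csv export pipeline.
execute_export(alchemist, "csv",
               folder_name="cluster_A_export",
               filters="phage.Cluster='A'",
               groups=["phage.Subcluster"],
               sort=["phage.PhageID"],
               verbose=True)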
Example #5
def execute_cluster_db(
                alchemist, folder_path=None,
                folder_name=DEFAULT_FOLDER_NAME, values=None, verbose=False,
                filters="", groups=[], threads=1,
                kmer=DEFAULT_SETTINGS["kmer"],
                sketch=DEFAULT_SETTINGS["sketch"],
                gcs=DEFAULT_SETTINGS["gcs"], ani=DEFAULT_SETTINGS["ani"],
                gcsmax=DEFAULT_SETTINGS["gcsmax"],
                animax=DEFAULT_SETTINGS["animax"],
                gcsS=DEFAULT_SETTINGS["gcsS"], gcsM=DEFAULT_SETTINGS["gcsM"],
                aniS=DEFAULT_SETTINGS["aniS"], aniM=DEFAULT_SETTINGS["aniM"],
                mat_out=False, evaluate=False, subcluster=False,
                cluster_prefix=None):
    db_filter = pipelines_basic.build_filter(alchemist, "phage", filters,
                                             values=values)

    working_path = pipelines_basic.create_working_path(
                                            folder_path, folder_name)
    temp_dir = create_temp_path(TEMP_DIR)
    conditionals_map = pipelines_basic.build_groups_map(
                                            db_filter, working_path,
                                            groups=groups, verbose=verbose)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if verbose:
            print("Querying MySQL database for clustering metadata...")
        cluster_metadata = query_cluster_metadata(db_filter)

        gcs_matrix = calculate_gcs_matrix(alchemist, db_filter.values,
                                          verbose=verbose, cores=threads)

        pipelines_basic.create_working_dir(mapped_path)

        if verbose:
            print("Clustering database genomes...")
        cluster_scheme = gcs_cluster(
                                mapped_path, gcs_matrix,
                                cluster_metadata[0], cluster_metadata[1],
                                gcs=gcs, gcsmax=gcsmax, S=gcsS, M=gcsM,
                                evaluate=evaluate, cores=threads,
                                verbose=verbose,
                                cluster_prefix=cluster_prefix)

        if subcluster:
            sketch_path_map = sketch_genomes(db_filter, temp_dir,
                                             verbose=verbose)

            if verbose:
                print("Subclustering database genomes...")
            ani_subcluster(mapped_path, sketch_path_map, cluster_scheme,
                           cluster_metadata[0], cluster_metadata[1],
                           cluster_metadata[2], cores=threads,
                           verbose=verbose, ani=ani, animax=animax,
                           evaluate=evaluate)

            # Remove the group's working directory if subclustering
            # produced no output files.
            if not any(mapped_path.iterdir()):
                shutil.rmtree(mapped_path)
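
The cleanup step above removes a working directory only if subclustering left it empty. any() over Path.iterdir() short-circuits on the first entry, so the emptiness test is cheap; a self-contained demonstration:

import shutil
import tempfile
from pathlib import Path

work_dir = Path(tempfile.mkdtemp())

if not any(work_dir.iterdir()):  # no entries -> directory is empty
    shutil.rmtree(work_dir)

print(work_dir.exists())  # False: the empty directory was removed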
Example #6
def execute_pham_align(alchemist,
                       folder_path=None,
                       folder_name=DEFAULT_FOLDER_NAME,
                       values=None,
                       filters="",
                       groups=[],
                       file_type="fasta",
                       mat_out=False,
                       tree_out=False,
                       threads=1,
                       verbose=False,
                       dump=False,
                       force=False):
    """Executes the entirety of the pham align pipeline.
    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for working dir creation.
    :type folder_path: Path
    :param folder_name: A name for the working directory.
    :type folder_name: str
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param dump: A boolean value to toggle dump in current working dir.
    :type dump: bool
    :param filters: A MySQL formatted WHERE clause string
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param file_type: Format type of sequence alignment file to export.
    :type file_type: str
    :param mat_out: A boolean to toggle distance matrix file generation.
    :type mat_out: bool
    :param tree_out: A boolean to toggle guidetree file generation.
    :type tree_out: bool
    :param threads: Number of processes to spawn during alignment workflow.
    :type threads: int
    """
    db_filter = pipelines_basic.build_filter(alchemist,
                                             "pham",
                                             filters,
                                             values=values,
                                             verbose=verbose)
    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name,
                                                       dump=dump,
                                                       force=force)

    data_cache = {}
    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        working_path,
                                                        groups=groups,
                                                        verbose=verbose,
                                                        force=force)
    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'")
            continue

        pipelines_basic.create_working_dir(mapped_path, dump=dump, force=force)

        execute_pham_MSA_alignment(alchemist,
                                   mapped_path,
                                   db_filter.values,
                                   data_cache=data_cache,
                                   file_type=file_type,
                                   mat_out=mat_out,
                                   tree_out=tree_out,
                                   threads=threads,
                                   verbose=verbose)
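
A hedged call sketch, again assuming the connected alchemist from the Example #1 sketch; the pham identifiers are placeholders.

# Hypothetical invocation of the pham align pipeline.
execute_pham_align(alchemist,
                   folder_name="pham_alignments",
                   values=["1000", "2002"],  # placeholder pham IDs
                   mat_out=True,             # also emit distance matrices
                   tree_out=True,            # and guide trees
                   verbose=True)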
Example #7
def execute_get_gb_records(alchemist, file_type, folder_path=None,
                           folder_name=DEFAULT_FOLDER_NAME,
                           config=None, values=None, verbose=False,
                           force=False, filters="", groups=[]):
    """Executes the entirety of the get_gb_records pipeline

    :param alchemist: A connected and fully build AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param file_type: File type to be exported.
    :type file_type: str
    :param config: ConfigParser object containing NCBI credentials.
    :type config: ConfigParser
    :param force: A boolean to toggle aggresive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statemtns.
    :type verbose: bool
    :param filters: A List of lists with filter value,grouped by ORs.
    :type filter: str
    :param groups: A list of supported MySQL column names to goup by.
    :type groups: list[str]
    """
    ncbi_creds = {}
    if config is not None:
        ncbi_creds = config["ncbi"]

    db_filter = pipelines_basic.build_filter(alchemist, FILTER_KEY, filters,
                                             values=values, verbose=verbose)

    if verbose:
        print("Creating records folder...")
    records_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name,
                                                       force=force)

    conditionals_map = pipelines_basic.build_groups_map(
                                                db_filter, records_path,
                                                groups=groups, verbose=verbose,
                                                force=force)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        # Create data sets
        if verbose:
            print("Retrieving accessions from the database...")
        accession_data = db_filter.select(["phage.PhageID", "phage.Accession"])

        acc_id_dict = {}
        for data_dict in accession_data:
            accession = data_dict["Accession"]
            if not (accession is None or accession == ""):
                acc_id_dict[accession] = data_dict["PhageID"]

        pipelines_basic.create_working_dir(mapped_path, force=force)
        if len(acc_id_dict.keys()) > 0:
            ncbi_handle = ncbi.get_verified_data_handle(
                                                     acc_id_dict,
                                                     ncbi_cred_dict=ncbi_creds,
                                                     file_type=file_type)

            copy_gb_data(ncbi_handle, acc_id_dict, mapped_path, file_type,
                         verbose=verbose)
        else:
            print(f"There are no records to retrieve for '{mapped_path}'.")
            continue
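
The accession-to-PhageID mapping above skips rows whose Accession is NULL or empty. The same filtering reads naturally as a dict comprehension, shown here with invented stand-in rows:

# Stand-in rows mimicking db_filter.select(["phage.PhageID",
# "phage.Accession"]); the data below is illustrative only.
accession_data = [
    {"PhageID": "PhageA", "Accession": "JN000001"},
    {"PhageID": "PhageB", "Accession": ""},
    {"PhageID": "PhageC", "Accession": None},
]

acc_id_dict = {row["Accession"]: row["PhageID"]
               for row in accession_data
               if row["Accession"]}  # drops both None and "" values

print(acc_id_dict)  # {'JN000001': 'PhageA'}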
Example #8
def execute_remote_revise(alchemist,
                          folder_path=None,
                          folder_name=DEFAULT_FOLDER_NAME,
                          config=None,
                          output_type="p_curation",
                          values=None,
                          filters="",
                          groups=[],
                          verbose=False,
                          force=False):
    ncbi_creds = {}
    if config is not None:
        ncbi_creds = config["ncbi"]

    db_filter = pipelines_basic.build_filter(alchemist,
                                             "phage",
                                             filters,
                                             values=values,
                                             verbose=verbose)
    db_filter.add(BASE_CONDITIONALS)

    revise_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      force=force)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        revise_path,
                                                        groups=groups,
                                                        verbose=verbose,
                                                        force=force)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        pipelines_basic.create_working_dir(mapped_path, force=force)
        build_revise_log_file(mapped_path)

        logger.info(f"pdm_utils version: {VERSION}")
        logger.info(f"Revise run date: {CURRENT_DATE}")
        logger.info(f"Connected to database: {alchemist.database}")

        accession_data = db_filter.select(["phage.PhageID", "phage.Accession"])

        acc_id_dict = {}
        for data_dict in accession_data:
            accession = data_dict["Accession"]
            if not (accession is None or accession == ""):
                acc_id_dict[accession] = data_dict["PhageID"]

        tbl_records = get_tbl_records(acc_id_dict, ncbi_cred_dict=ncbi_creds)

        validated_phages = []
        for tbl_record in tbl_records:
            validated_phages.append(tbl_record.name)

        id_record_map = build_id_record_map(alchemist, validated_phages)

        if output_type == "tbl":
            revised_records = revise_seqrecords(id_record_map,
                                                tbl_records,
                                                verbose=verbose)

            if not revised_records:
                print("No discrepancies detected between "
                      f"local data and GenBank data for '{mapped_path}'.")
                continue

        elif output_type == "p_curation":
            curation_data_dicts = find_product_discrepancies(id_record_map,
                                                             tbl_records,
                                                             verbose=verbose)

            if not curation_data_dicts:
                print("No discrepancies detected between "
                      f"local data and GenBank data for '{mapped_path}'.")
                continue

        if output_type == "tbl":
            fileio.write_feature_table(revised_records,
                                       mapped_path,
                                       verbose=verbose)
        elif output_type == "p_curation":
            file_path = mapped_path.joinpath("revise.csv")
            fileio.export_data_dict(curation_data_dicts,
                                    file_path,
                                    CURATION_HEADER,
                                    include_headers=True)
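
A hedged usage sketch; the ConfigParser is assumed to carry an [ncbi] section with Entrez credentials, as the config handling at the top of the function implies, and every key and value below is a placeholder.

# Hypothetical invocation of the remote revise pipeline.
from configparser import ConfigParser

config = ConfigParser()
config.read_dict({"ncbi": {"ncbi_api_key": "placeholder",
                           "ncbi_email": "user@example.com"}})

execute_remote_revise(alchemist,
                      folder_name="revise_output",
                      config=config,
                      output_type="p_curation",
                      groups=["phage.Cluster"],
                      verbose=True)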
Example #9
def execute_local_revise(alchemist,
                         revisions_file_path,
                         folder_path=None,
                         folder_name=DEFAULT_FOLDER_NAME,
                         config=None,
                         input_type="function_report",
                         output_type="p_curation",
                         production=False,
                         filters="",
                         groups=[],
                         force=False,
                         verbose=False):
    """Executes the entirety of the genbank local revise pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param revisions_file_path: Path to a file containing pham/notes revision
        data.
    :type revisions_file_path: Path
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param input_type: Specifies the file format of the input file.
    :type input_type: str
    :param output_type: Specifies the file format of the outputted file.
    :type output_type: str
    :param production: Toggles additional filters for production-level
        revision.
    :type production: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    """
    keys = INPUT_FILE_KEYS.get(input_type)
    if keys is None:
        raise ValueError(f"Revision input type {input_type} is not supported.")

    revisions_data_dicts = fileio.retrieve_data_dict(revisions_file_path)

    values = []
    for data_dict in revisions_data_dicts:
        values.append(data_dict[keys['data_key']])

    db_filter = pipelines_basic.build_filter(alchemist,
                                             keys['filter_key'],
                                             filters,
                                             values=values,
                                             verbose=verbose)

    if production:
        db_filter.add(BASE_CONDITIONALS)

    revise_columns = db_filter.get_columns(REVISION_COLUMNS)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      force=force)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        export_path,
                                                        force=force,
                                                        groups=groups,
                                                        verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning revise export...")

    for mapped_path in conditionals_map.keys():
        conditionals = conditionals_map[mapped_path]

        if input_type == "function_report":
            export_dicts = use_function_report_data(db_filter,
                                                    revisions_data_dicts,
                                                    revise_columns,
                                                    conditionals,
                                                    verbose=verbose)
        elif input_type == "csv":
            export_dicts = use_csv_data(db_filter,
                                        revisions_data_dicts,
                                        revise_columns,
                                        conditionals,
                                        verbose=verbose)

        if not export_dicts:
            if verbose:
                print(f"'{mapped_path.name}' data selected does not require "
                      "revision; no file exported...")

            continue

        pipelines_basic.create_working_dir(mapped_path, force=force)

        write_revise_file(export_dicts,
                          mapped_path,
                          file_format=output_type,
                          verbose=verbose)
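
The input-type guard at the top of this function is a lookup-table dispatch. A self-contained sketch of the same pattern; the key names below are invented for illustration and are not the real INPUT_FILE_KEYS contents.

# Hypothetical stand-in for the INPUT_FILE_KEYS dispatch table.
INPUT_FILE_KEYS = {
    "function_report": {"data_key": "Pham", "filter_key": "gene.PhamID"},
    "csv": {"data_key": "PhageID", "filter_key": "phage.PhageID"},
}

def resolve_input_keys(input_type):
    keys = INPUT_FILE_KEYS.get(input_type)
    if keys is None:
        raise ValueError(f"Revision input type {input_type} is not supported.")
    return keys

print(resolve_input_keys("csv")["filter_key"])  # phage.PhageID
try:
    resolve_input_keys("xlsx")
except ValueError as e:
    print(e)  # Revision input type xlsx is not supported.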
Example #10
def execute_pham_finder(alchemist, folder_path, folder_name, 
                        adatabase, bdatabase, values=None,
                        filters="", groups=[], sort=[],
                        show_per=False, use_locus=False, verbose=False):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully build AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param adatabase: Name of reference database to source phams from.
    :type adatabase: str
    :param bdatabase: Name of database to find corresponding phams for.
    :type bdatabase: str
    :param values: List of values to filter database results:
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param table: MySQL table name.
    :type table: str
    :param filters: A list of lists with filter values, grouped by ORs.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param show_per: Enables display gene coverage of the corresponding phams.
    :type show_per: bool
    :param use_locus: Toggles conversion between phams using LocusTag instead
    :type use_locus: bool
    """
    if not (adatabase in alchemist.databases and
            bdatabase in alchemist.databases):
        print("User credentials do not have access to both "
             f"databases {adatabase} and {bdatabase}.\n"
              "Please check your database access and try again.")
        sys.exit(1)

    alchemist.database = adatabase
    alchemist.connect()
    a_filter = pipelines_basic.build_filter(alchemist, "gene.PhamID", filters,
                                            values=values, verbose=verbose)

    alchemist.database = bdatabase
    alchemist.connect()
    if use_locus:
        b_filter = pipelines_basic.build_filter(alchemist, "gene.LocusTag", "")
    else:
        b_filter = pipelines_basic.build_filter(alchemist, "gene", "")

    if sort:
        try:
            a_filter.sort(sort)
        except Exception:
            print("Please check your syntax for sorting columns:\n"
                 f"{', '.join(sort)}")
            sys.exit(1)

    if verbose:
        print("Creating pham_finder folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = pipelines_basic.build_groups_map(a_filter,
                                                        export_path,
                                                        groups=groups,
                                                        verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning export...")

    values = a_filter.values
    for mapped_path in conditionals_map.keys():
        a_filter.reset()
        a_filter.values = values

        conditionals = conditionals_map[mapped_path]
        a_filter.values = a_filter.build_values(where=conditionals)
        
        if a_filter.hits() == 0:
            print("No database entries received from gene.PhamID "
                 f"for '{mapped_path}'.")
            shutil.rmtree(mapped_path)
            continue

        if sort:
            sort_columns = get_sort_columns(alchemist, sort)
            a_filter.sort(sort_columns)

        mapped_phams = find_phams(a_filter, b_filter, show_per=show_per)
        if not mapped_phams:
            print("Phams are consistent between the two databases "
                 f"for '{mapped_path}'.")
            shutil.rmtree(mapped_path)
            continue

        out_data_dicts = []
        for ref_pham, corr_phams in mapped_phams.items():
            data_dict = {}
            data_dict[PHAM_FINDER_HEADER[0]] = ref_pham
            data_dict[PHAM_FINDER_HEADER[1]] = corr_phams
            out_data_dicts.append(data_dict)

        file_path = mapped_path.joinpath("PhamMap.csv")
        fileio.export_data_dict(out_data_dicts, file_path, PHAM_FINDER_HEADER,
                                include_headers=True)
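
A hedged usage sketch; both database names are placeholders and must be visible to the same MySQL credentials, per the access check at the top of the function.

# Hypothetical invocation of the pham finder pipeline.
from pathlib import Path

execute_pham_finder(alchemist,
                    Path("./output"), "pham_map",
                    "DatabaseA",   # placeholder reference database
                    "DatabaseB",   # placeholder comparison database
                    show_per=True,
                    verbose=True)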
Example #11
def execute_find_primers(alchemist,
                         folder_path=None,
                         folder_name=DEFAULT_FOLDER_NAME,
                         values=None,
                         filters="",
                         groups=[],
                         verbose=False,
                         threads=4,
                         prc=0.7,
                         dev_net=0,
                         len_oligomer=20,
                         minD=900,
                         maxD=1100,
                         tm_min=52.0,
                         tm_max=58.0,
                         hpn_min=-2000,
                         ho_min=-5000,
                         GC_max=60.0,
                         het_min=-5000,
                         tm_gap=5.0,
                         ta_min=48.0,
                         fwd_in=None,
                         rvs_in=None,
                         ta_max=68.0,
                         mode=0,
                         full_genome=False,
                         soft_cap=None,
                         phams_in=[]):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully build AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path
    :type folder_path: Path
    :param folder_name: A name for the working directory folder
    :type folder_name: str
    :param values: List of values to filter database results
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param filters: A pseudo-SQL WHERE clause string to filter values.
    :type filters: str
    :param groups: A list of SQL column names to filter values.
    :type groups: list[str]
    :param threads: Number of child process workers to utilize
    :type threads: int
    :param prc: Percentage of genomes a pham must exist in to pass prefiltering
    :type prc: float
    :param dev_net: Allowance for the primer positions to pass prefiltering
    :type dev_net: int
    :param len_oligomer: Length of the oligomers used to create the primers
    :type len_oligomer: int
    :param minD: Minimum primer product length to pass primer testing
    :type minD: int
    :param maxD: Maximum primer product length to pass primer testing
    :type maxD: int
    :param tm_min: Minimum primer melting temperature to pass primer testing
    :type tm_min: float
    :param tm_max: Maximum primer melting temperature to pass primer testing
    :type tm_max: float
    :param hpn_min: Minimum hairpin Gibbs free energy to pass primer testing
    :type hpn_min: int
    :param ho_min: Minimum homodimer Gibbs free energy to pass primer testing
    :type ho_min: int
    :param GC_max: Maximum GC content percentage allowed for an oligomer
    :type GC_max: float
    :param het_min: Minimum heterodimer Gibbs free energy to pass testing
    :type het_min: int
    :param tm_gap: Maximum allowed melting temperature gap between oligomers
    :type tm_gap: float
    :param ta_min: Minimum allowed optimal annealing temperature
    :type ta_min: float
    :param ta_max: Maximum allowed optimal annealing temperature
    :type ta_max: float
    :param fwd_in: Fixed forward sequence to find primer pairs for
    :type fwd_in: str
    :param rvs_in: Fixed reverse sequence to find primer pairs for
    :type rvs_in: str
    :param mode: Run mode for find primers analysis
    :type mode: int
    :param soft_cap: Cap limit on number of pairs evaluated after testing
    :type soft_cap: int
    :param phams_in: Phams to evaluate during count min sketch eval of kmers
    :type phams_in: list[str]
    """
    db_filter = pipelines_basic.build_filter(alchemist,
                                             "phage",
                                             filters,
                                             values=values)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        working_path,
                                                        groups=groups,
                                                        verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning primer search...")

    if not TEMP_DIR.is_dir():
        TEMP_DIR.mkdir()
    pickled_results_file = TEMP_DIR.joinpath(PICKLED_FILE_NAME)
    if pickled_results_file.is_file():
        pickled_results_file.unlink()

    values = db_filter.values
    results_map = {}
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.key = "phage"
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]

        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print("No database entries received from phage "
                  f"for '{mapped_path.name}'.")
            continue

        genome_map = {}
        for genome_id in db_filter.values:
            export_db.get_single_genome(alchemist,
                                        genome_id,
                                        data_cache=genome_map)

        if verbose:
            print(f"...Identifying primer pairs for '{mapped_path}'...")

        if full_genome:
            F_results, R_results = find_full_genome_oligomers(
                genome_map,
                verbose=verbose,
                threads=threads,
                prc=prc,
                minD=minD,
                maxD=maxD,
                len_oligomer=len_oligomer,
                tm_min=tm_min,
                tm_max=tm_max,
                hpn_min=hpn_min,
                ho_min=ho_min,
                GC_max=GC_max)
        else:
            pham_gene_map = build_pham_gene_map(db_filter,
                                                conditionals,
                                                phams_in=phams_in,
                                                verbose=verbose)
            if not pham_gene_map:
                print(f"No valid phams found for '{mapped_path}' with current "
                      "settings")
                continue

            F_results, R_results = find_oligomers(alchemist,
                                                  pham_gene_map,
                                                  genome_map,
                                                  verbose=verbose,
                                                  threads=threads,
                                                  prc=prc,
                                                  minD=minD,
                                                  maxD=maxD,
                                                  len_oligomer=len_oligomer,
                                                  tm_min=tm_min,
                                                  tm_max=tm_max,
                                                  hpn_min=hpn_min,
                                                  ho_min=ho_min,
                                                  GC_max=GC_max,
                                                  fwd_in=fwd_in,
                                                  rvs_in=rvs_in)

        if (not F_results) or (not R_results):
            if verbose:
                print(f"No valid oligomers found for '{mapped_path.name}'")
            continue

        if verbose:
            print("...Matching oligomers to create primer pairs...")

        primer_pairs = match_oligomers(F_results,
                                       R_results,
                                       minD=minD,
                                       maxD=maxD,
                                       dev_net=dev_net,
                                       threads=threads)

        if not primer_pairs:
            print(f"No valid primer pairs found for '{mapped_path}' with "
                  "current parameters...")
            continue

        if verbose:
            print(f"...Identified {len(primer_pairs)} valid primer pairs.")

        if verbose:
            print(f"...Testing primer pairs for '{mapped_path}'...")
        primer_pairs = test_primer_pairs(primer_pairs,
                                         genome_map,
                                         threads=threads,
                                         verbose=verbose,
                                         minD=minD,
                                         maxD=maxD,
                                         het_min=het_min,
                                         ta_min=ta_min,
                                         ta_max=ta_max,
                                         tm_gap_max=tm_gap)

        if verbose:
            print(f"...{len(primer_pairs)} passed primer testing.")

        if soft_cap is not None:
            if len(primer_pairs) > soft_cap:
                primer_pairs = primer_pairs[:soft_cap]

        if pickled_results_file.is_file():
            with pickled_results_file.open(mode="rb") as filehandle:
                results_map = pickle.load(filehandle)

        if primer_pairs:
            results_map[mapped_path] = (primer_pairs, genome_map)

        with pickled_results_file.open(mode="wb") as filehandle:
            pickle.dump(results_map, filehandle)

    if pickled_results_file.is_file():
        pickled_results_file.unlink()

    if not results_map:
        print("No primer pairs found with current parameters...")
        sys.exit(1)

    results_map = select_primer_pairs(results_map,
                                      verbose=verbose,
                                      mode=mode,
                                      het_min=het_min)

    for mapped_path, primer_pairs in results_map.items():
        pipelines_basic.create_working_dir(mapped_path)
        file_path = mapped_path.joinpath("primer.txt")
        fileio.write_primer_txt_file(primer_pairs[0][0], file_path)
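
The group loop above checkpoints intermediate results through a pickle file so partial progress survives between iterations. The load-merge-dump core of that pattern, reduced to a self-contained sketch:

import pickle
import tempfile
from pathlib import Path

checkpoint = Path(tempfile.mkdtemp()).joinpath("results.pickle")

for key, value in [("groupA", 1), ("groupB", 2)]:
    results = {}
    if checkpoint.is_file():  # reload everything saved so far
        with checkpoint.open(mode="rb") as fh:
            results = pickle.load(fh)

    results[key] = value  # merge in this iteration's result

    with checkpoint.open(mode="wb") as fh:  # persist the merged map
        pickle.dump(results, fh)

print(results)  # {'groupA': 1, 'groupB': 2}
checkpoint.unlink()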