def execute_make_db(alchemist, db_type, values=None, folder_path=None,
                    folder_name=DEFAULT_FOLDER_NAME, verbose=False,
                    filters="", groups=[], db_name=None, threads=1,
                    use_mpi=False, mol_type=None, hash_index=False,
                    parse_seqids=True, gi_mask=False, mask_data=None,
                    mask_id=None, logfile=None, tax_id=None,
                    tax_id_map=None):
    """Executes the entirety of the make db pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param db_type: Type of sequence database to create ("hhsuite"/"blast").
    :type db_type: str
    """
    if db_name is None:
        db_name = alchemist.database

    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine, "version")

    db_filter = pipelines_basic.build_filter(alchemist, "pham", filters,
                                             values=values, verbose=verbose)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(
                                            db_filter, working_path,
                                            groups=groups, verbose=verbose)

    data_cache = {}
    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        pipelines_basic.create_working_dir(mapped_path)

        if db_type == "hhsuite":
            execute_make_hhsuite_database(alchemist, db_filter.values,
                                          mapped_path, db_name, db_version,
                                          data_cache=data_cache,
                                          threads=threads, verbose=verbose,
                                          use_mpi=use_mpi)
        elif db_type == "blast":
            # Pass the caller's makeblastdb options through instead of
            # hardcoding their default values.
            execute_make_blast_database(
                            alchemist, db_filter.values, mapped_path,
                            db_name, db_version, data_cache=data_cache,
                            verbose=verbose, hash_index=hash_index,
                            parse_seqids=parse_seqids, gi_mask=gi_mask,
                            mask_data=mask_data, mask_id=mask_id,
                            logfile=logfile, tax_id=tax_id,
                            tax_id_map=tax_id_map)
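
# Illustrative usage (not from the source): with `alchemist` as a connected
# AlchemyHandler, build an hhsuite database from all phams using eight
# threads. The folder name below is hypothetical.
#
#   execute_make_db(alchemist, "hhsuite", folder_name="hhsuite_db",
#                   threads=8, use_mpi=False, verbose=True)
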
def execute_pham_review(alchemist, folder_path=None,
                        folder_name=DEFAULT_FOLDER_NAME, no_review=False,
                        values=[], filters="", groups=[], sort=[],
                        s_report=False, gr_reports=False, psr_reports=False,
                        production=False, verbose=False, force=False):
    """Executes the entirety of the pham review pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param no_review: A boolean to skip the filtering of phams by pham
        discrepancies.
    :type no_review: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param filters: A MySQL formatted WHERE clause string.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param s_report: A boolean to toggle export of a summary report.
    :type s_report: bool
    :param gr_reports: A boolean to toggle export of additional pham info.
    :type gr_reports: bool
    :param psr_reports: A boolean to toggle export of pham summary reports.
    :type psr_reports: bool
    :param production: Toggles additional filters for production-level review.
    :type production: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    db_filter = pipelines_basic.build_filter(alchemist, "gene.PhamID",
                                             filters, values=values,
                                             verbose=verbose)
    if production:
        db_filter.add(BASE_CONDITIONALS)
        db_filter.update()
    else:
        conditionals = db_filter.build_where_clauses()
        db_filter.values = db_filter.build_values(where=conditionals)

    if not db_filter.values:
        print("Current settings produced no database hits.")
        sys.exit(1)
    else:
        if verbose:
            print(f"Identified {db_filter.hits()} phams to review...")

    if not no_review:
        review_phams(db_filter, verbose=verbose)

    if sort:
        db_filter.sort(sort)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      force=force)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        export_path,
                                                        groups=groups,
                                                        verbose=verbose,
                                                        force=force)

    if verbose:
        print("Prepared query and path structure, beginning review export...")
    original_phams = db_filter.values
    gr_data_cache = {}
    psr_data_cache = {}
    for mapped_path in conditionals_map.keys():
        conditionals = conditionals_map[mapped_path]
        db_filter.values = original_phams
        db_filter.values = db_filter.build_values(where=conditionals)

        pipelines_basic.create_working_dir(mapped_path, force=force)

        review_data = get_review_data(alchemist, db_filter, verbose=verbose)
        write_report(review_data, mapped_path, REVIEW_HEADER,
                     csv_name="FunctionReport", verbose=verbose)

        if s_report:
            summary_data = get_summary_data(alchemist, db_filter)
            write_summary_report(alchemist, summary_data, mapped_path,
                                 verbose=verbose)

        if gr_reports or psr_reports:
            execute_pham_report_export(alchemist, db_filter, mapped_path,
                                       gr_reports=gr_reports,
                                       psr_reports=psr_reports,
                                       gr_data_cache=gr_data_cache,
                                       psr_data_cache=psr_data_cache,
                                       verbose=verbose)
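
# Illustrative usage (not from the source): review production-quality phams
# and group the FunctionReport files by cluster. The group column is a
# hypothetical example of the expected syntax.
#
#   execute_pham_review(alchemist, production=True,
#                       groups=["phage.Cluster"], s_report=True,
#                       verbose=True)
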
def execute_build_pan(alchemist, hhdb_path=None, pan_name=None,
                      folder_path=None, folder_name=DEFAULT_FOLDER_NAME,
                      values=None, verbose=False, filters="", groups=[],
                      threads=1, M=50, aD=75, mD=65, B=0.2,
                      PANgraph_out=None):
    """Executes the entirety of the build pan pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    """
    db_filter = pipelines_basic.build_filter(alchemist, "pham", filters,
                                             values=values)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        working_path,
                                                        groups=groups,
                                                        verbose=verbose)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        if pan_name is None:
            pan_name = folder_name

        pipelines_basic.create_working_dir(mapped_path)
        pan_path = mapped_path.joinpath(".".join([pan_name, "sqlite"]))

        pan_alchemist = pan_handling.build_pan(pan_path)
        pan_alchemist.expire_on_commit = True

        pham_data_dir = mapped_path.joinpath("pham_alns")
        pham_data_dir.mkdir()
        data_maps_tuple = create_pham_alns(alchemist.engine,
                                           db_filter.values, pham_data_dir,
                                           threads=threads, M=M,
                                           verbose=verbose)

        build_pan_nodes(pan_alchemist, db_filter.values, data_maps_tuple,
                        threads=threads, verbose=verbose)

        cent_data_dir = mapped_path.joinpath("cent_alns")
        cent_data_dir.mkdir()

        build_pan_neighborhoods(alchemist, pan_alchemist, db_filter.values,
                                cent_data_dir, data_maps_tuple,
                                aD=aD, mD=mD, B=B,
                                threads=threads, verbose=verbose)

        hmm_data_dir = mapped_path.joinpath("pham_hhrs")
        hmm_data_dir.mkdir()

        if hhdb_path is not None:
            raise NotImplementedError(
                    "Town building is not implemented yet... :(")

            # Unreachable until town building is implemented.
            if verbose:
                print("...Calculating pham HMM profiles...")
            hmm_path_map = alignment.create_hmms(data_maps_tuple[0],
                                                 name=True, M=M,
                                                 threads=threads,
                                                 verbose=verbose)
            build_pan_towns(alchemist, pan_alchemist, hhdb_path,
                            hmm_data_dir, hmm_path_map,
                            threads=threads, verbose=verbose)

        if PANgraph_out is not None:
            pan_graph = pan_handling.to_networkx(pan_alchemist)
            pde_fileio.write_graph(pan_graph, PANgraph_out,
                                   mapped_path, pan_name,
                                   edge_weights=PAN_GRAPH_EDGEWEIGHTS)

        shutil.rmtree(Path(TEMP_DIR))
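
# Illustrative usage (not from the source): build a PAN over a handful of
# phams and write out a graph file. The pham values and the PANgraph_out
# format value are hypothetical.
#
#   execute_build_pan(alchemist, values=["1", "2", "3"],
#                     PANgraph_out="csv", threads=4, verbose=True)
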
def execute_export(alchemist, pipeline, folder_path=None,
                   folder_name=DEFAULT_FOLDER_NAME, values=None,
                   verbose=False, dump=False, force=False,
                   table=DEFAULT_TABLE, filters="", groups=[], sort=[],
                   include_columns=[], exclude_columns=[],
                   sequence_columns=False, raw_bytes=False,
                   concatenate=False, db_name=None, phams_out=False,
                   threads=1):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param pipeline: File type that dictates data processing.
    :type pipeline: str
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param dump: A boolean value to toggle dump in current working dir.
    :type dump: bool
    :param table: MySQL table name.
    :type table: str
    :param filters: A MySQL formatted WHERE clause string.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param include_columns: A csv export column selection parameter.
    :type include_columns: list[str]
    :param exclude_columns: A csv export column selection parameter.
    :type exclude_columns: list[str]
    :param sequence_columns: A boolean to toggle inclusion of sequence data.
    :type sequence_columns: bool
    :param raw_bytes: A boolean to toggle passing raw bytes data to csv
        export.
    :type raw_bytes: bool
    :param concatenate: A boolean to toggle concatenation for SeqRecords.
    :type concatenate: bool
    :param db_name: A name for the exported SQL database file.
    :type db_name: str
    :param phams_out: A boolean to toggle export of pham data with the sql
        pipeline.
    :type phams_out: bool
    :param threads: Number of processes/threads to spawn during the pipeline.
    :type threads: int
    """
    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb_basic.get_first_row_data(alchemist.engine, "version")

    if pipeline == "csv":
        if verbose:
            print("Processing columns for csv export...")
        csv_columns = filter_csv_columns(alchemist, table,
                                         include_columns=include_columns,
                                         exclude_columns=exclude_columns,
                                         sequence_columns=sequence_columns)

    if pipeline in FILTERABLE_PIPELINES:
        db_filter = pipelines_basic.build_filter(alchemist, table, filters,
                                                 values=values,
                                                 verbose=verbose)
        if sort:
            pipelines_basic.add_sort_columns(db_filter, sort,
                                             verbose=verbose)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      dump=dump, force=force)

    data_cache = {}
    if pipeline == "sql":
        execute_sql_export(alchemist, export_path, folder_path, db_version,
                           db_name=db_name, dump=dump, force=force,
                           phams_out=phams_out, threads=threads,
                           verbose=verbose)
    elif pipeline in FILTERABLE_PIPELINES:
        conditionals_map = pipelines_basic.build_groups_map(
                                            db_filter, export_path,
                                            groups=groups, verbose=verbose,
                                            force=force)

        if verbose:
            print("Prepared query and path structure, beginning export...")
        values = db_filter.values
        for mapped_path in conditionals_map.keys():
            db_filter.reset()
            db_filter.values = values

            conditionals = conditionals_map[mapped_path]
            db_filter.values = db_filter.build_values(where=conditionals)

            if db_filter.hits() == 0:
                print(f"No database entries received from {table} "
                      f"for '{mapped_path}'.")
                continue

            if sort:
                sort_columns = get_sort_columns(alchemist, sort)
                db_filter.sort(sort_columns)

            export_name = None
            if dump:
                if mapped_path == export_path:
                    export_name = folder_name

            pipelines_basic.create_working_dir(mapped_path, dump=dump,
                                               force=force)

            if pipeline in BIOPYTHON_PIPELINES + ["tbl"]:
                execute_ffx_export(alchemist, mapped_path, export_path,
                                   db_filter.values, pipeline, db_version,
                                   table, concatenate=concatenate,
                                   data_cache=data_cache,
                                   export_name=export_name,
                                   threads=threads, verbose=verbose,
                                   dump=dump)
            elif pipeline == "csv":
                execute_csv_export(db_filter, mapped_path, export_path,
                                   csv_columns, table, raw_bytes=raw_bytes,
                                   data_cache=data_cache, verbose=verbose,
                                   dump=dump)
    else:
        print("Unrecognized export pipeline, aborting export")
        sys.exit(1)
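
# Illustrative usage (not from the source): export flat files for one
# cluster, then dump a csv of the phage table. The filter string and
# column name are hypothetical examples of the expected syntax.
#
#   execute_export(alchemist, "gb", filters="phage.Cluster='A'",
#                  verbose=True)
#   execute_export(alchemist, "csv", table="phage",
#                  exclude_columns=["phage.Sequence"], verbose=True)
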
def execute_cluster_db(alchemist, folder_path=None,
                       folder_name=DEFAULT_FOLDER_NAME, values=None,
                       verbose=False, filters="", groups=[], threads=1,
                       kmer=DEFAULT_SETTINGS["kmer"],
                       sketch=DEFAULT_SETTINGS["sketch"],
                       gcs=DEFAULT_SETTINGS["gcs"],
                       ani=DEFAULT_SETTINGS["ani"],
                       gcsmax=DEFAULT_SETTINGS["gcsmax"],
                       animax=DEFAULT_SETTINGS["animax"],
                       gcsS=DEFAULT_SETTINGS["gcsS"],
                       gcsM=DEFAULT_SETTINGS["gcsM"],
                       aniS=DEFAULT_SETTINGS["aniS"],
                       aniM=DEFAULT_SETTINGS["aniM"],
                       mat_out=False, evaluate=False, subcluster=False,
                       cluster_prefix=None):
    """Executes the entirety of the cluster db pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    """
    db_filter = pipelines_basic.build_filter(alchemist, "phage", filters,
                                             values=values)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    temp_dir = create_temp_path(TEMP_DIR)

    conditionals_map = pipelines_basic.build_groups_map(
                                            db_filter, working_path,
                                            groups=groups, verbose=verbose)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if verbose:
            print("Querying MySQL database for clustering metadata...")
        cluster_metadata = query_cluster_metadata(db_filter)

        gcs_matrix = calculate_gcs_matrix(alchemist, db_filter.values,
                                          verbose=verbose, cores=threads)

        pipelines_basic.create_working_dir(mapped_path)

        if verbose:
            print("Clustering database genomes...")
        cluster_scheme = gcs_cluster(mapped_path, gcs_matrix,
                                     cluster_metadata[0],
                                     cluster_metadata[1],
                                     gcs=gcs, gcsmax=gcsmax,
                                     S=gcsS, M=gcsM,
                                     evaluate=evaluate, cores=threads,
                                     verbose=verbose,
                                     cluster_prefix=cluster_prefix)

        if subcluster:
            sketch_path_map = sketch_genomes(db_filter, temp_dir,
                                             verbose=verbose)

            if verbose:
                print("Subclustering database genomes...")
            ani_subcluster(mapped_path, sketch_path_map, cluster_scheme,
                           cluster_metadata[0], cluster_metadata[1],
                           cluster_metadata[2], cores=threads,
                           verbose=verbose, ani=ani, animax=animax,
                           evaluate=evaluate)

        # Remove the working directory if nothing was written to it.
        empty = True
        for _ in mapped_path.iterdir():
            empty = False
            break

        if empty:
            shutil.rmtree(mapped_path)
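
# Illustrative usage (not from the source): recluster all genomes with the
# default GCS/ANI thresholds and subcluster the results; the cluster prefix
# is hypothetical.
#
#   execute_cluster_db(alchemist, threads=8, subcluster=True,
#                      cluster_prefix="Z", verbose=True)
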
def execute_pham_align(alchemist, folder_path=None,
                       folder_name=DEFAULT_FOLDER_NAME, values=None,
                       filters="", groups=[], file_type="fasta",
                       mat_out=False, tree_out=False, threads=1,
                       verbose=False, dump=False, force=False):
    """Executes the entirety of the pham align pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for working dir creation.
    :type folder_path: Path
    :param folder_name: A name for the working directory.
    :type folder_name: str
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param dump: A boolean value to toggle dump in current working dir.
    :type dump: bool
    :param filters: A MySQL formatted WHERE clause string.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param file_type: Format type of sequence alignment file to export.
    :type file_type: str
    :param mat_out: A boolean to toggle distance matrix file generation.
    :type mat_out: bool
    :param tree_out: A boolean to toggle guidetree file generation.
    :type tree_out: bool
    :param threads: Number of processes to spawn during alignment workflow.
    :type threads: int
    """
    db_filter = pipelines_basic.build_filter(alchemist, "pham", filters,
                                             values=values, verbose=verbose)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name,
                                                       dump=dump,
                                                       force=force)

    data_cache = {}
    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        working_path,
                                                        groups=groups,
                                                        verbose=verbose,
                                                        force=force)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'")
            continue

        pipelines_basic.create_working_dir(mapped_path, dump=dump,
                                           force=force)

        execute_pham_MSA_alignment(alchemist, mapped_path,
                                   db_filter.values,
                                   data_cache=data_cache,
                                   file_type=file_type, mat_out=mat_out,
                                   tree_out=tree_out, threads=threads,
                                   verbose=verbose)
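
# Illustrative usage (not from the source): align two specific phams with
# distance matrices and guide trees. The pham identifiers and the
# "clustal" file type are hypothetical example values.
#
#   execute_pham_align(alchemist, values=["4", "1440"],
#                      file_type="clustal", mat_out=True, tree_out=True,
#                      threads=4, verbose=True)
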
def execute_get_gb_records(alchemist, file_type, folder_path=None,
                           folder_name=DEFAULT_FOLDER_NAME, config=None,
                           values=None, verbose=False, force=False,
                           filters="", groups=[]):
    """Executes the entirety of the get_gb_records pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param file_type: File type to be exported.
    :type file_type: str
    :param config: ConfigParser object containing NCBI credentials.
    :type config: ConfigParser
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param filters: A MySQL formatted WHERE clause string.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    """
    ncbi_creds = {}
    if config is not None:
        ncbi_creds = config["ncbi"]

    db_filter = pipelines_basic.build_filter(alchemist, FILTER_KEY, filters,
                                             values=values, verbose=verbose)

    if verbose:
        print("Creating records folder...")
    records_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name,
                                                       force=force)

    conditionals_map = pipelines_basic.build_groups_map(
                                            db_filter, records_path,
                                            groups=groups, verbose=verbose,
                                            force=force)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        # Create data sets
        if verbose:
            print("Retrieving accessions from the database...")
        accession_data = db_filter.select(["phage.PhageID",
                                           "phage.Accession"])

        acc_id_dict = {}
        for data_dict in accession_data:
            accession = data_dict["Accession"]
            if not (accession is None or accession == ""):
                acc_id_dict[accession] = data_dict["PhageID"]

        pipelines_basic.create_working_dir(mapped_path, force=force)

        if len(acc_id_dict.keys()) > 0:
            ncbi_handle = ncbi.get_verified_data_handle(
                                            acc_id_dict,
                                            ncbi_cred_dict=ncbi_creds,
                                            file_type=file_type)

            copy_gb_data(ncbi_handle, acc_id_dict, mapped_path, file_type,
                         verbose=verbose)
        else:
            print(f"There are no records to retrieve for '{mapped_path}'.")
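
# Illustrative usage (not from the source): fetch GenBank flat files for
# draft genomes. The filter string is a hypothetical example of the
# expected syntax, and `config` holds NCBI credentials parsed elsewhere.
#
#   execute_get_gb_records(alchemist, "gb", config=config,
#                          filters="phage.Status='draft'", verbose=True)
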
def execute_remote_revise(alchemist, folder_path=None,
                          folder_name=DEFAULT_FOLDER_NAME, config=None,
                          output_type="p_curation", values=None, filters="",
                          groups=[], verbose=False, force=False):
    """Executes the entirety of the remote revise pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    """
    ncbi_creds = {}
    if config is not None:
        ncbi_creds = config["ncbi"]

    db_filter = pipelines_basic.build_filter(alchemist, "phage", filters,
                                             values=values, verbose=verbose)
    db_filter.add(BASE_CONDITIONALS)

    revise_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      force=force)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        revise_path,
                                                        groups=groups,
                                                        verbose=verbose,
                                                        force=force)

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        db_filter.reset()
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print(f"No database entries received for '{mapped_path}'.")
            continue

        pipelines_basic.create_working_dir(mapped_path, force=force)
        build_revise_log_file(mapped_path)

        logger.info(f"pdm_utils version: {VERSION}")
        logger.info(f"Revise run date: {CURRENT_DATE}")
        logger.info(f"Connected to database: {alchemist.database}")

        accession_data = db_filter.select(["phage.PhageID",
                                           "phage.Accession"])

        acc_id_dict = {}
        for data_dict in accession_data:
            accession = data_dict["Accession"]
            if not (accession is None or accession == ""):
                acc_id_dict[accession] = data_dict["PhageID"]

        tbl_records = get_tbl_records(acc_id_dict, ncbi_cred_dict=ncbi_creds)

        validated_phages = []
        for tbl_record in tbl_records:
            validated_phages.append(tbl_record.name)

        id_record_map = build_id_record_map(alchemist, validated_phages)

        if output_type == "tbl":
            revised_records = revise_seqrecords(id_record_map, tbl_records,
                                                verbose=verbose)

            if not revised_records:
                print("No discrepancies detected between "
                      f"local data and GenBank data for '{mapped_path}'.")
                continue
        elif output_type == "p_curation":
            curation_data_dicts = find_product_discrepancies(
                                            id_record_map, tbl_records,
                                            verbose=verbose)

            if not curation_data_dicts:
                print("No discrepancies detected between "
                      f"local data and GenBank data for '{mapped_path}'.")
                continue

        if output_type == "tbl":
            fileio.write_feature_table(revised_records, mapped_path,
                                       verbose=verbose)
        elif output_type == "p_curation":
            file_path = mapped_path.joinpath("revise.csv")
            fileio.export_data_dict(curation_data_dicts, file_path,
                                    CURATION_HEADER, include_headers=True)
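
# Illustrative usage (not from the source): write product-curation csv
# files for discrepancies between local and GenBank records, grouped by a
# hypothetical column.
#
#   execute_remote_revise(alchemist, config=config,
#                         output_type="p_curation",
#                         groups=["phage.Cluster"], verbose=True)
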
def execute_local_revise(alchemist, revisions_file_path, folder_path=None,
                         folder_name=DEFAULT_FOLDER_NAME, config=None,
                         input_type="function_report",
                         output_type="p_curation", production=False,
                         filters="", groups=[], force=False, verbose=False):
    """Executes the entirety of the genbank local revise pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param revisions_file_path: Path to the file containing pham/notes data.
    :type revisions_file_path: Path
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param input_type: Specifies the file format of the input file.
    :type input_type: str
    :param output_type: Specifies the file format of the outputted file.
    :type output_type: str
    :param production: Toggles additional filters for production-level
        revision.
    :type production: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param force: A boolean to toggle aggressive building of directories.
    :type force: bool
    """
    keys = INPUT_FILE_KEYS.get(input_type)
    if keys is None:
        raise ValueError(f"Revision input type {input_type} "
                         "is not supported.")

    revisions_data_dicts = fileio.retrieve_data_dict(revisions_file_path)

    values = []
    for data_dict in revisions_data_dicts:
        values.append(data_dict[keys['data_key']])

    db_filter = pipelines_basic.build_filter(alchemist, keys['filter_key'],
                                             filters, values=values,
                                             verbose=verbose)

    if production:
        db_filter.add(BASE_CONDITIONALS)

    revise_columns = db_filter.get_columns(REVISION_COLUMNS)

    if verbose:
        print("Creating export folder...")
    export_path = pipelines_basic.create_working_path(folder_path,
                                                      folder_name,
                                                      force=force)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        export_path,
                                                        force=force,
                                                        groups=groups,
                                                        verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning revise export...")
    for mapped_path in conditionals_map.keys():
        conditionals = conditionals_map[mapped_path]

        if input_type == "function_report":
            export_dicts = use_function_report_data(
                                            db_filter, revisions_data_dicts,
                                            revise_columns, conditionals,
                                            verbose=verbose)
        elif input_type == "csv":
            export_dicts = use_csv_data(db_filter, revisions_data_dicts,
                                        revise_columns, conditionals,
                                        verbose=verbose)

        if not export_dicts:
            if verbose:
                print(f"'{mapped_path.name}' data selected does not require "
                      "revision; no file exported...")
            continue

        pipelines_basic.create_working_dir(mapped_path, force=force)

        write_revise_file(export_dicts, mapped_path,
                          file_format=output_type, verbose=verbose)
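
# Illustrative usage (not from the source): turn a FunctionReport csv
# produced by the pham review pipeline into a curation file; the input
# path below is hypothetical.
#
#   execute_local_revise(alchemist, Path("./FunctionReport.csv"),
#                        input_type="function_report",
#                        output_type="p_curation", production=True)
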
def execute_pham_finder(alchemist, folder_path, folder_name,
                        adatabase, bdatabase, values=None, filters="",
                        groups=[], sort=[], show_per=False,
                        use_locus=False, verbose=False):
    """Executes the entirety of the pham finder pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param adatabase: Name of reference database to source phams from.
    :type adatabase: str
    :param bdatabase: Name of database to find corresponding phams for.
    :type bdatabase: str
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param filters: A MySQL formatted WHERE clause string.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param show_per: Enables display of gene coverage of the corresponding
        phams.
    :type show_per: bool
    :param use_locus: Toggles matching of phams by LocusTag instead of the
        default gene key.
    :type use_locus: bool
    """
    if not (adatabase in alchemist.databases and
            bdatabase in alchemist.databases):
        print("User credentials do not have access to both "
              f"databases {adatabase} and {bdatabase}.\n"
              "Please check your database access and try again.")
        sys.exit(1)

    alchemist.database = adatabase
    alchemist.connect()
    a_filter = pipelines_basic.build_filter(alchemist, "gene.PhamID",
                                            filters, values=values,
                                            verbose=verbose)

    alchemist.database = bdatabase
    alchemist.connect()
    if use_locus:
        b_filter = pipelines_basic.build_filter(alchemist, "gene.LocusTag",
                                                "")
    else:
        b_filter = pipelines_basic.build_filter(alchemist, "gene", "")

    if sort:
        try:
            a_filter.sort(sort)
        except Exception:
            print("Please check your syntax for sorting columns:\n"
                  f"{', '.join(sort)}")
            sys.exit(1)

    if verbose:
        print("Creating pham_finder folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = pipelines_basic.build_groups_map(a_filter,
                                                        export_path,
                                                        groups=groups,
                                                        verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning export...")
    values = a_filter.values
    for mapped_path in conditionals_map.keys():
        a_filter.reset()
        a_filter.values = values

        conditionals = conditionals_map[mapped_path]
        a_filter.values = a_filter.build_values(where=conditionals)

        if a_filter.hits() == 0:
            print("No database entries received from gene.PhamID "
                  f"for '{mapped_path}'.")
            shutil.rmtree(mapped_path)
            continue

        if sort:
            sort_columns = get_sort_columns(alchemist, sort)
            a_filter.sort(sort_columns)

        mapped_phams = find_phams(a_filter, b_filter, show_per=show_per)
        if not mapped_phams:
            print("Phams are consistent between the two databases "
                  f"for '{mapped_path}'.")
            shutil.rmtree(mapped_path)
            continue

        out_data_dicts = []
        for ref_pham, corr_phams in mapped_phams.items():
            data_dict = {}
            data_dict[PHAM_FINDER_HEADER[0]] = ref_pham
            data_dict[PHAM_FINDER_HEADER[1]] = corr_phams
            out_data_dicts.append(data_dict)

        file_path = mapped_path.joinpath("PhamMap.csv")
        fileio.export_data_dict(out_data_dicts, file_path,
                                PHAM_FINDER_HEADER, include_headers=True)
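
# Illustrative usage (not from the source): map phams from one database
# version to another; the database names and output location below are
# hypothetical.
#
#   execute_pham_finder(alchemist, Path("."), "pham_map",
#                       "Actino_Draft_v1", "Actino_Draft_v2",
#                       show_per=True, verbose=True)
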
def execute_find_primers(alchemist, folder_path=None,
                         folder_name=DEFAULT_FOLDER_NAME, values=None,
                         filters="", groups=[], verbose=False, threads=4,
                         prc=0.7, dev_net=0, len_oligomer=20, minD=900,
                         maxD=1100, tm_min=52.0, tm_max=58.0,
                         hpn_min=-2000, ho_min=-5000, GC_max=60.0,
                         het_min=-5000, tm_gap=5.0, ta_min=48.0,
                         fwd_in=None, rvs_in=None, ta_max=68.0, mode=0,
                         full_genome=False, soft_cap=None, phams_in=[]):
    """Executes the entirety of the find primers pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the working directory folder.
    :type folder_name: str
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param filters: A pseudo-SQL WHERE clause string to filter values.
    :type filters: str
    :param groups: A list of SQL column names to filter values.
    :type groups: list[str]
    :param threads: Number of child process workers to utilize.
    :type threads: int
    :param prc: Percentage of genomes a pham must exist in to pass
        prefiltering.
    :type prc: float
    :param dev_net: Allowance for the primer positions to pass prefiltering.
    :type dev_net: int
    :param len_oligomer: Length of the oligomers used to create the primers.
    :type len_oligomer: int
    :param minD: Minimum primer product length to pass primer testing.
    :type minD: int
    :param maxD: Maximum primer product length to pass primer testing.
    :type maxD: int
    :param tm_min: Minimum primer melting temperature to pass primer testing.
    :type tm_min: float
    :param tm_max: Maximum primer melting temperature to pass primer testing.
    :type tm_max: float
    :param hpn_min: Minimum hairpin Gibbs free energy to pass primer testing.
    :type hpn_min: int
    :param ho_min: Minimum homodimer Gibbs free energy to pass primer
        testing.
    :type ho_min: int
    :param GC_max: Maximum GC content percentage allowed for an oligomer.
    :type GC_max: float
    :param het_min: Minimum heterodimer Gibbs free energy to pass testing.
    :type het_min: int
    :param tm_gap: Maximum allowed melting temperature gap between oligomers.
    :type tm_gap: float
    :param ta_min: Minimum allowed optimal annealing temperature.
    :type ta_min: float
    :param ta_max: Maximum allowed optimal annealing temperature.
    :type ta_max: float
    :param fwd_in: Fixed forward sequence to find primer pairs for.
    :type fwd_in: str
    :param rvs_in: Fixed reverse sequence to find primer pairs for.
    :type rvs_in: str
    :param mode: Run mode for find primers analysis.
    :type mode: int
    :param full_genome: A boolean to toggle oligomer search over full
        genome sequences instead of conserved phams.
    :type full_genome: bool
    :param soft_cap: Cap limit on number of pairs evaluated after testing.
    :type soft_cap: int
    :param phams_in: Phams to evaluate during count min sketch eval of kmers.
    :type phams_in: list[str]
    """
    db_filter = pipelines_basic.build_filter(alchemist, "phage", filters,
                                             values=values)

    working_path = pipelines_basic.create_working_path(folder_path,
                                                       folder_name)

    conditionals_map = pipelines_basic.build_groups_map(db_filter,
                                                        working_path,
                                                        groups=groups,
                                                        verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning primer search...")

    if not TEMP_DIR.is_dir():
        TEMP_DIR.mkdir()

    pickled_results_file = TEMP_DIR.joinpath(PICKLED_FILE_NAME)
    if pickled_results_file.is_file():
        pickled_results_file.unlink()

    values = db_filter.values
    for mapped_path in conditionals_map.keys():
        results_map = {}
        db_filter.reset()
        db_filter.key = "phage"
        db_filter.values = values

        conditionals = conditionals_map[mapped_path]
        db_filter.values = db_filter.build_values(where=conditionals)

        if db_filter.hits() == 0:
            print("No database entries received from phage "
                  f"for '{mapped_path.name}'.")
            continue

        genome_map = {}
        for genome_id in db_filter.values:
            export_db.get_single_genome(alchemist, genome_id,
                                        data_cache=genome_map)

        if verbose:
            print(f"...Identifying primer pairs for '{mapped_path}'...")

        if full_genome:
            F_results, R_results = find_full_genome_oligomers(
                                genome_map, verbose=verbose,
                                threads=threads, prc=prc,
                                minD=minD, maxD=maxD,
                                len_oligomer=len_oligomer,
                                tm_min=tm_min, tm_max=tm_max,
                                hpn_min=hpn_min, ho_min=ho_min,
                                GC_max=GC_max)
        else:
            pham_gene_map = build_pham_gene_map(db_filter, conditionals,
                                                phams_in=phams_in,
                                                verbose=verbose)

            if not pham_gene_map:
                print(f"No valid phams found for '{mapped_path}' with "
                      "current settings")
                continue

            F_results, R_results = find_oligomers(
                                alchemist, pham_gene_map, genome_map,
                                verbose=verbose, threads=threads, prc=prc,
                                minD=minD, maxD=maxD,
                                len_oligomer=len_oligomer,
                                tm_min=tm_min, tm_max=tm_max,
                                hpn_min=hpn_min, ho_min=ho_min,
                                GC_max=GC_max,
                                fwd_in=fwd_in, rvs_in=rvs_in)

        if (not F_results) or (not R_results):
            if verbose:
                print(f"No valid oligomers found for '{mapped_path.name}'")
            continue

        if verbose:
            print("...Matching oligomers to create primer pairs...")
        primer_pairs = match_oligomers(F_results, R_results,
                                       minD=minD, maxD=maxD,
                                       dev_net=dev_net, threads=threads)

        if not primer_pairs:
            print(f"No valid primer pairs found for '{mapped_path}' with "
                  "current parameters...")
            continue

        if verbose:
            print(f"...Identified {len(primer_pairs)} valid primer pairs.")

        if verbose:
            print(f"...Testing primer pairs for '{mapped_path}'...")
        primer_pairs = test_primer_pairs(primer_pairs, genome_map,
                                         threads=threads, verbose=verbose,
                                         minD=minD, maxD=maxD,
                                         het_min=het_min, ta_min=ta_min,
                                         ta_max=ta_max, tm_gap_max=tm_gap)

        if verbose:
            print(f"...{len(primer_pairs)} passed primer testing.")

        if soft_cap is not None:
            if len(primer_pairs) > soft_cap:
                primer_pairs = primer_pairs[:soft_cap]

        # Persist intermediate results between groups via a pickled cache.
        if pickled_results_file.is_file():
            with pickled_results_file.open(mode="rb") as filehandle:
                results_map = pickle.load(filehandle)

        if primer_pairs:
            results_map[mapped_path] = (primer_pairs, genome_map)

        with pickled_results_file.open(mode="wb") as filehandle:
            pickle.dump(results_map, filehandle)

    if pickled_results_file.is_file():
        pickled_results_file.unlink()

    if not results_map:
        print("No primer pairs found with current parameters...")

    results_map = select_primer_pairs(results_map, verbose=verbose,
                                      mode=mode, het_min=het_min)

    for mapped_path, primer_pairs in results_map.items():
        pipelines_basic.create_working_dir(mapped_path)
        file_path = mapped_path.joinpath("primer.txt")

        fileio.write_primer_txt_file(primer_pairs[0][0], file_path)
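
# Illustrative usage (not from the source): search for primer pairs within
# a single cluster with a slightly widened product-size window; the filter
# string is a hypothetical example of the expected syntax.
#
#   execute_find_primers(alchemist, filters="phage.Cluster='A'",
#                        minD=800, maxD=1200, threads=8, verbose=True)
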