def execute_resubmit(alchemist, revisions_data_dicts, folder_path,
                     folder_name, filters="", groups=[], verbose=False):
    """Executes the entirety of the genbank resubmit pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param revisions_data_dicts: Data dictionaries containing pham/notes data.
    :type revisions_data_dicts: list[dict]
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param filters: A string of filter values, grouped by ORs.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    db_filter = Filter(alchemist=alchemist)
    db_filter.key = "gene.PhamID"
    db_filter.add(BASE_CONDITIONALS)

    if filters != "":
        try:
            db_filter.add(filters)
        except:
            print("Please check your syntax for the conditional string:\n"
                  f"{filters}")

    resubmit_columns = db_filter.get_columns(RESUBMIT_COLUMNS)

    phams = []
    for data_dict in revisions_data_dicts:
        phams.append(data_dict["Pham"])

    db_filter.values = phams

    if verbose:
        print("Creating export folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = {}
    export_db.build_groups_map(db_filter, export_path, conditionals_map,
                               groups=groups, verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning review export...")

    for mapped_path in conditionals_map.keys():
        if verbose:
            print("Retrieving phage data for pham revisions...")
        export_dicts = []
        for data_dict in revisions_data_dicts:
            if verbose:
                print(f"...Retrieving data for pham {data_dict['Pham']}...")

            conditionals = conditionals_map[mapped_path]

            final_call = data_dict["Final Call"]
            if final_call == "Hypothetical Protein":
                final_call = ""
            conditionals.append(
                        querying.build_where_clause(alchemist.graph,
                                                    f"gene.Notes!={final_call}"))

            query = querying.build_select(alchemist.graph, resubmit_columns,
                                          where=conditionals)

            results = querying.execute(alchemist.engine, query,
                                       in_column=db_filter.key,
                                       values=[data_dict["Pham"]])

            for result in results:
                format_resubmit_data(result, data_dict["Final Call"])
                export_dicts.append(result)

        if not export_dicts:
            if verbose:
                print(f"'{mapped_path.name}' data selected for resubmission "
                      "matches the selected call; no resubmission exported...")
            mapped_path.rmdir()
            continue

        export_dicts = sorted(export_dicts,
                              key=lambda export_dict: export_dict["Phage"])

        if verbose:
            print(f"Writing {CSV_NAME} in {mapped_path.name}...")
        file_path = mapped_path.joinpath(CSV_NAME)
        basic.export_data_dict(export_dicts, file_path, RESUBMIT_HEADER,
                               include_headers=True)
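
# A minimal usage sketch for execute_resubmit (illustrative, not from the
# source): the database name, revision values, and filter string below are
# assumptions chosen for demonstration only. Revision dictionaries mirror
# the keys read above ("Pham" and "Final Call").
def _example_execute_resubmit():
    from pathlib import Path
    from pdm_utils.classes.alchemyhandler import AlchemyHandler

    alchemist = AlchemyHandler(database="Actinobacteriophage")
    alchemist.connect()
    revisions = [{"Pham": 1234, "Final Call": "terminase"}]
    execute_resubmit(alchemist, revisions, Path.cwd(), "resubmit_example",
                     filters="phage.Cluster=A", verbose=True)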
def execute_review(alchemist, folder_path, folder_name,
                   review=True, values=[], filters="", groups=[], sort=[],
                   g_reports=False, s_report=False, verbose=False):
    """Executes the entirety of the pham review pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param review: A boolean to toggle filtering of phams by pham discrepancies.
    :type review: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param filters: A string of filter values, grouped by ORs.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param g_reports: A boolean to toggle export of additional pham information.
    :type g_reports: bool
    :param s_report: A boolean to toggle export of a summary report.
    :type s_report: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    db_filter = Filter(alchemist=alchemist)
    db_filter.key = "gene.PhamID"

    if values:
        db_filter.values = values
        if verbose:
            print(f"Identified {len(values)} phams to review...")

    if filters != "":
        try:
            db_filter.add(filters)
        except:
            print("Please check your syntax for the conditional string:\n"
                  f"{filters}")
            sys.exit(1)
        finally:
            db_filter.update()
            db_filter._filters = []
            db_filter._updated = False
            db_filter._or_index = -1

    db_filter.add(BASE_CONDITIONALS)
    db_filter.update()

    if not db_filter.values:
        print("Current settings produced no database hits.")
        sys.exit(1)

    if review:
        review_phams(db_filter, verbose=verbose)

    if sort:
        db_filter.sort(sort)

    if verbose:
        print("Creating export folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = {}
    export_db.build_groups_map(db_filter, export_path, conditionals_map,
                               groups=groups, verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning review export...")

    original_phams = db_filter.values
    total_g_data = {}
    for mapped_path in conditionals_map.keys():
        conditionals = conditionals_map[mapped_path]
        db_filter.values = original_phams
        db_filter.values = db_filter.build_values(where=conditionals)

        pf_data = get_pf_data(alchemist, db_filter, verbose=verbose)
        write_report(pf_data, mapped_path, PF_HEADER,
                     csv_name="FunctionReport", verbose=verbose)

        if g_reports:
            execute_g_report_export(alchemist, db_filter, mapped_path,
                                    total_g_data=total_g_data,
                                    verbose=verbose)

        if s_report:
            execute_s_report_export(alchemist, db_filter, conditionals,
                                    mapped_path, verbose=verbose)
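
# A minimal usage sketch for execute_review (illustrative; the database
# name, filter syntax, and column names are assumptions): reviews phams for
# discrepant annotations, grouped by cluster, with both optional reports
# enabled.
def _example_execute_review():
    from pathlib import Path
    from pdm_utils.classes.alchemyhandler import AlchemyHandler

    alchemist = AlchemyHandler(database="Actinobacteriophage")
    alchemist.connect()
    execute_review(alchemist, Path.cwd(), "review_example",
                   filters="phage.Cluster=A", groups=["phage.Cluster"],
                   sort=["gene.PhamID"], g_reports=True, s_report=True,
                   verbose=True)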
def main(argument_list):
    """Run the find_domains pipeline.

    :param argument_list: Command line arguments for the pipeline.
    :type argument_list: list[str]
    """
    # Set up the argument parser and use it to parse argument_list.
    cdd_parser = setup_argparser()
    args = cdd_parser.parse_args(argument_list)

    # Store arguments in more easily accessible variables.
    database = args.database
    cdd_dir = expand_path(args.cdd)
    cdd_name = learn_cdd_name(cdd_dir)
    threads = args.threads
    evalue = args.evalue
    rpsblast = args.rpsblast
    tmp_dir = args.tmp_dir
    output_folder = args.output_folder
    reset = args.reset
    batch_size = args.batch_size

    # Set up directory.
    output_folder = basic.set_path(output_folder, kind="dir", expect=True)
    results_folder = pathlib.Path(RESULTS_FOLDER)
    results_path = basic.make_new_dir(output_folder, results_folder,
                                      attempt=50)
    if results_path is None:
        print("Unable to create output_folder.")
        sys.exit(1)

    log_file = pathlib.Path(results_path, MAIN_LOG_FILE)

    # Set up root logger.
    logging.basicConfig(filename=log_file, filemode="w", level=logging.DEBUG,
                        format="pdm_utils find_domains: %(levelname)s: %(message)s")
    logger.info(f"pdm_utils version: {VERSION}")
    logger.info(f"CDD run date: {constants.CURRENT_DATE}")
    logger.info(f"Command line arguments: {' '.join(argument_list)}")
    logger.info(f"Results directory: {results_path}")

    # Early exit if either 1) cdd_name == "" or 2) no rpsblast given and we
    # are unable to find one.
    if cdd_name == "":
        msg = (f"Unable to learn CDD database name. Make sure the files in "
               f"{cdd_dir} all have the same basename.")
        logger.error(msg)
        print(msg)
        return

    # Get the rpsblast command and path.
    if rpsblast == "":
        command = get_rpsblast_command()
        rpsblast = get_rpsblast_path(command)

    # Verify database connection and schema compatibility.
    alchemist = AlchemyHandler(database=database)
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    logger.info(f"Connected to database: {database}.")
    mysqldb.check_schema_compatibility(engine, "the find_domains pipeline")
    logger.info("Schema version is compatible.")
    logger.info("Command line arguments verified.")

    if reset:
        logger.info("Clearing all domain data currently in the database.")
        clear_domain_data(engine)

    # Get gene data that needs to be processed, in dict format where
    # key = column name and value = stored value.
    cdd_genes = mysqldb_basic.query_dict_list(engine, GET_GENES_FOR_CDD)
    msg = f"{len(cdd_genes)} genes to search for conserved domains..."
    logger.info(msg)
    print(msg)

    # Only run the pipeline if there are genes returned that need it.
    if len(cdd_genes) > 0:
        log_gene_ids(cdd_genes)
        make_tempdir(tmp_dir)

        # Identify unique translations to process, mapped to GeneIDs.
        cds_trans_dict = create_cds_translation_dict(cdd_genes)
        unique_trans = list(cds_trans_dict.keys())
        msg = (f"{len(unique_trans)} unique translations "
               "to search for conserved domains...")
        logger.info(msg)
        print(msg)

        # Process translations in batches. Otherwise, searching could take
        # so long that the MySQL connection closes, resulting in one or more
        # transaction errors.
        batch_indices = basic.create_indices(unique_trans, batch_size)
        total_rolled_back = 0
        for indices in batch_indices:
            start = indices[0]
            stop = indices[1]
            msg = f"Processing translations {start + 1} to {stop}..."
            logger.info(msg)
            print(msg)
            sublist = unique_trans[start:stop]
            batch_rolled_back = search_translations(
                                    rpsblast, cdd_name, tmp_dir, evalue,
                                    threads, engine, sublist, cds_trans_dict)
            total_rolled_back += batch_rolled_back

        search_summary(total_rolled_back)

    engine.dispose()
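
# The batch loop above relies on basic.create_indices() to produce
# (start, stop) slice bounds. A self-contained sketch of that batching
# pattern (an illustrative re-implementation, not the pdm_utils function):
def _batch_indices_sketch(items, batch_size):
    """Return (start, stop) slice bounds covering items in fixed-size chunks."""
    return [(start, min(start + batch_size, len(items)))
            for start in range(0, len(items), batch_size)]

# e.g. _batch_indices_sketch(list(range(10)), 4) -> [(0, 4), (4, 8), (8, 10)]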
def main(unparsed_args_list):
    """Run main retrieve_updates pipeline."""
    # Parse command line arguments.
    args = parse_args(unparsed_args_list)
    date = time.strftime("%Y%m%d")
    args.output_folder = basic.set_path(args.output_folder, kind="dir",
                                        expect=True)

    working_dir = pathlib.Path(f"{date}_get_data")
    working_path = basic.make_new_dir(args.output_folder, working_dir,
                                      attempt=10)

    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # Verify database connection and schema compatibility.
    print("Preparing genome data sets from the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # Get existing data from MySQL to determine what needs to be updated.
    query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
             "DateLastModified, Accession, RetrieveRecord, Subcluster, "
             "AnnotationAuthor FROM phage")
    mysqldb_genome_list = mysqldb.parse_genome_data(engine=engine,
                                                    phage_query=query,
                                                    gnm_type="mysqldb")
    engine.dispose()
    mysqldb_genome_dict = {}
    for gnm in mysqldb_genome_list:
        mysqldb_genome_dict[gnm.id] = gnm

    # Get data from PhagesDB.
    if args.updates or args.final or args.draft:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(
                                    phagesdb_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
                                    phagesdb_phages_dict,
                                    gnm_type="phagesdb",
                                    seq=False)

        # Exit if no phage data was retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # match_genomes() returns matched genomes and unmatched PhagesDB ids.
        match_output = match_genomes(mysqldb_genome_dict,
                                     phagesdb_genome_dict)
        matched_genomes = match_output[0]
        unmatched_phagesdb_ids = match_output[1]

    if args.updates:
        get_update_data(working_path, matched_genomes)

    if args.final:
        get_final_data(working_path, matched_genomes)

    if args.genbank:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_cred_dict,
                         args.genbank_results)

    if args.draft:
        get_draft_data(working_path, unmatched_phagesdb_ids)

    print("\n\n\nRetrieve updates script completed.")
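
# The PhagesDB records are re-keyed by name via basic.convert_list_to_dict()
# above. A self-contained sketch of that re-keying step (illustrative, not
# the pdm_utils implementation):
def _convert_list_to_dict_sketch(data_list, key_field):
    """Index a list of dictionaries by one of their fields."""
    return {data[key_field]: data for data in data_list}

# e.g. _convert_list_to_dict_sketch([{"phage_name": "Trixie"}], "phage_name")
# -> {"Trixie": {"phage_name": "Trixie"}}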
def main(unparsed_args_list):
    """Run main retrieve_updates pipeline."""
    # Parse command line arguments.
    args = parse_args(unparsed_args_list)
    force = args.force_download
    args.output_folder = basic.set_path(args.output_folder, kind="dir",
                                        expect=True)
    working_dir = pathlib.Path(RESULTS_FOLDER)
    working_path = basic.make_new_dir(args.output_folder, working_dir,
                                      attempt=50)

    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # Verify database connection and schema compatibility.
    print("Preparing genome data sets from the MySQL database...")
    alchemist = AlchemyHandler(database=args.database)
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # Get existing data from MySQL to determine what needs to be updated.
    query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
             "DateLastModified, Accession, RetrieveRecord, Subcluster, "
             "AnnotationAuthor FROM phage")
    mysqldb_genome_list = mysqldb.parse_genome_data(engine=engine,
                                                    phage_query=query,
                                                    gnm_type="mysqldb")
    engine.dispose()
    mysqldb_genome_dict = {}
    for gnm in mysqldb_genome_list:
        # With the default date, the date of all records retrieved will be
        # newer, so every record gets re-downloaded.
        if force:
            gnm.date = constants.EMPTY_DATE
        mysqldb_genome_dict[gnm.id] = gnm

    # Get data from PhagesDB.
    if args.updates or args.final or args.draft:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(
                                    phagesdb_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
                                    phagesdb_phages_dict,
                                    gnm_type="phagesdb",
                                    seq=False)

        # Exit if no phage data was retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # match_genomes() returns matched genomes and unmatched PhagesDB ids.
        tup = match_genomes(mysqldb_genome_dict, phagesdb_genome_dict)
        matched_genomes = tup[0]
        unmatched_phagesdb_ids = tup[1]

    if args.updates:
        get_update_data(working_path, matched_genomes)

    if args.final:
        get_final_data(working_path, matched_genomes)

    if args.genbank:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_cred_dict,
                         args.genbank_results)

    if args.draft:
        if force:
            # Add all draft genomes currently in the database to the list of
            # draft genomes to be downloaded.
            drafts = get_matched_drafts(matched_genomes)
            unmatched_phagesdb_ids |= drafts
        get_draft_data(working_path, unmatched_phagesdb_ids)
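
# The force-download path above works by resetting each stored date to a
# sentinel "empty" date, so every retrieved record compares as newer and is
# re-downloaded. A minimal sketch, assuming objects with a .date attribute;
# the sentinel value mirrors the intent of constants.EMPTY_DATE, but the
# exact value is an assumption.
import datetime

def _force_redownload_sketch(genomes, empty_date=datetime.datetime(1, 1, 1)):
    for gnm in genomes:
        gnm.date = empty_date
    return genomes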
def execute_export(alchemist, output_path, output_name, values=[],
                   verbose=False, csv_export=False, ffile_export=None,
                   db_export=False, table="phage", filters=[], groups=[]):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param output_path: Path to a valid dir for new dir creation.
    :type output_path: Path
    :param output_name: A name for the export folder.
    :type output_name: str
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param csv_export: A boolean value to toggle csv export.
    :type csv_export: bool
    :param ffile_export: A SeqIO-supported file format to toggle ffile export.
    :type ffile_export: str
    :param db_export: A boolean value to toggle database export.
    :type db_export: bool
    :param table: MySQL table name.
    :type table: str
    :param filters: A list of lists with filter values, grouped by ORs.
    :type filters: list[list[str]]
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    """
    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb.get_version_table_data(alchemist.engine)

    if verbose:
        print("Creating export folder...")
    export_path = output_path.joinpath(output_name)
    export_path = basic.make_new_dir(output_path, export_path, attempt=50)

    if db_export:
        if verbose:
            print("Writing SQL database file...")
        write_database(alchemist, db_version["Version"], export_path)
    elif csv_export or ffile_export is not None:
        table_obj = alchemist.get_table(table)
        for column in table_obj.primary_key.columns:
            primary_key = column

        db_filter = Filter(alchemist=alchemist, key=primary_key)
        db_filter.values = values

        for or_filters in filters:
            for filter_ in or_filters:
                db_filter.add(filter_)
        db_filter.update()

        if filters and not db_filter.values:
            return

        values_map = {}
        if groups:
            build_groups_map(db_filter, export_path, groups=groups,
                             values_map=values_map, verbose=verbose)
        else:
            values_map.update({export_path: db_filter.values})

        for export_path in values_map.keys():
            values = values_map[export_path]
            if csv_export:
                execute_csv_export(alchemist, export_path, table=table,
                                   values=values, verbose=verbose)
            elif ffile_export is not None:
                execute_ffx_export(alchemist, export_path, ffile_export,
                                   db_version, table=table, values=values,
                                   verbose=verbose)
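
# A minimal usage sketch for execute_export (illustrative; the database
# name and filter values are assumptions): exports the phage table to csv,
# restricted by an OR-grouped filter list and grouped by cluster.
def _example_execute_export():
    from pathlib import Path
    from pdm_utils.classes.alchemyhandler import AlchemyHandler

    alchemist = AlchemyHandler(database="Actinobacteriophage")
    alchemist.connect()
    execute_export(alchemist, Path.cwd(), "export_example",
                   csv_export=True, table="phage",
                   filters=[["phage.Cluster=A"], ["phage.Cluster=B"]],
                   groups=["phage.Cluster"], verbose=True)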
def execute_pham_finder(alchemist, folder_path, folder_name,
                        adatabase, bdatabase, values=None,
                        filters="", groups=[], sort=[],
                        show_per=False, use_locus=False, verbose=False):
    """Executes the entirety of the pham finder pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param adatabase: Name of reference database to source phams from.
    :type adatabase: str
    :param bdatabase: Name of database to find corresponding phams for.
    :type bdatabase: str
    :param values: List of values to filter database results.
    :type values: list[str]
    :param filters: A string of filter values, grouped by ORs.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param show_per: Enables display of gene coverage of the corresponding phams.
    :type show_per: bool
    :param use_locus: Toggles conversion between phams using LocusTag
        instead of gene identifiers.
    :type use_locus: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    if not (adatabase in alchemist.databases and
            bdatabase in alchemist.databases):
        print("User credentials do not have access to both "
              f"databases {adatabase} and {bdatabase}.\n"
              "Please check your database access and try again.")
        sys.exit(1)

    alchemist.database = adatabase
    alchemist.connect()
    a_filter = pipelines_basic.build_filter(alchemist, "gene.PhamID",
                                            filters, values=values,
                                            verbose=verbose)

    alchemist.database = bdatabase
    alchemist.connect()
    if use_locus:
        b_filter = pipelines_basic.build_filter(alchemist, "gene.LocusTag", "")
    else:
        b_filter = pipelines_basic.build_filter(alchemist, "gene", "")

    if sort:
        try:
            a_filter.sort(sort)
        except:
            print("Please check your syntax for sorting columns:\n"
                  f"{', '.join(sort)}")
            sys.exit(1)

    if verbose:
        print("Creating pham_finder folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = {}
    pipelines_basic.build_groups_map(a_filter, export_path, conditionals_map,
                                     groups=groups, verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning export...")

    values = a_filter.values
    for mapped_path in conditionals_map.keys():
        a_filter.reset()
        a_filter.values = values

        conditionals = conditionals_map[mapped_path]
        a_filter.values = a_filter.build_values(where=conditionals)

        if a_filter.hits() == 0:
            print("No database entries received from gene.PhamID "
                  f"for '{mapped_path}'.")
            shutil.rmtree(mapped_path)
            continue

        if sort:
            sort_columns = get_sort_columns(alchemist, sort)
            a_filter.sort(sort_columns)

        mapped_phams = find_phams(a_filter, b_filter, show_per=show_per)
        if not mapped_phams:
            print("Phams are consistent between the two databases "
                  f"for '{mapped_path}'.")
            shutil.rmtree(mapped_path)
            continue

        out_data_dicts = []
        for ref_pham, corr_phams in mapped_phams.items():
            data_dict = {}
            data_dict[PHAM_FINDER_HEADER[0]] = ref_pham
            data_dict[PHAM_FINDER_HEADER[1]] = corr_phams
            out_data_dicts.append(data_dict)

        file_path = mapped_path.joinpath("PhamMap.csv")
        fileio.export_data_dict(out_data_dicts, file_path,
                                PHAM_FINDER_HEADER, include_headers=True)
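
# A minimal usage sketch for execute_pham_finder (illustrative; both
# database names are assumptions): maps phams in one database version to
# their counterparts in another and writes a PhamMap.csv per group.
def _example_execute_pham_finder():
    from pathlib import Path
    from pdm_utils.classes.alchemyhandler import AlchemyHandler

    alchemist = AlchemyHandler()
    alchemist.connect()
    execute_pham_finder(alchemist, Path.cwd(), "pham_finder_example",
                        "Actinobacteriophage", "Actinobacteriophage_v2",
                        show_per=True, verbose=True)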
def main(unparsed_args_list):
    """Run the get_db pipeline.

    The database data can be retrieved from three places:
    The server, which needs to be downloaded to a new folder.
    A local file, in which no download and no new folder are needed.
    The empty schema stored within pdm_utils, in which no download,
    new folder, or local file are needed.
    """
    args = parse_args(unparsed_args_list)

    # Set values that are shared between all three options.
    database = args.database
    option = args.option

    install = True
    schema_version = None
    db_filepath = None

    if option == "file":
        db_filepath = basic.set_path(args.filename, kind="file", expect=True)
    elif option == "new":
        schema_version = args.schema_version
    else:
        # option must be "server"
        server_url = args.url
        version_file = args.version
        output_folder = basic.set_path(args.output_folder, kind="dir",
                                       expect=True)
        download = True
        remove = True
        results_folder = pathlib.Path(RESULTS_FOLDER)
        results_path = basic.make_new_dir(output_folder, results_folder,
                                          attempt=50)
        if args.download_only:
            install = False
            remove = False

        if results_path is None:
            print("Unable to create results folder.")
            sys.exit(1)
        else:
            # Only look for the version file if selected.
            if version_file:
                version_filepath, status1 = prepare_download(
                                                results_path, server_url,
                                                database, "version")
            else:
                status1 = True

            db_filepath, status2 = prepare_download(results_path, server_url,
                                                    database, "sql")

        if not status1 or not status2:
            print("Unable to download data from server.")
            sys.exit(1)

    # If downloading from the server, the user may have selected to not
    # install the database file.
    if install:
        install_db(database, db_filepath=db_filepath,
                   schema_version=schema_version)

    # The output folder was only created for downloading from the server.
    if option == "server":
        if remove:
            print("Removing downloaded data.")
            shutil.rmtree(results_path)
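
# Every pipeline above creates its output directory through
# basic.make_new_dir(..., attempt=N). A self-contained sketch of that
# numbered-directory pattern (illustrative; the exact suffix scheme used by
# pdm_utils is an assumption):
import pathlib

def _make_new_dir_sketch(output_dir, new_dir, attempt=50):
    """Try new_dir, then new_dir_1 ... new_dir_(attempt-1); return the first
    path successfully created, or None if all candidates already exist."""
    for i in range(attempt):
        name = new_dir.name if i == 0 else f"{new_dir.name}_{i}"
        candidate = pathlib.Path(output_dir, name)
        if not candidate.exists():
            candidate.mkdir()
            return candidate
    return None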