Example #1
def execute_resubmit(alchemist,
                     revisions_data_dicts,
                     folder_path,
                     folder_name,
                     filters="",
                     groups=[],
                     verbose=False):
    """Executes the entirety of the genbank resubmit pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param revisions_data_dicts: Data dictionaries containing pham/notes data.
    :type revisions_data_dicts: list[dict]
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param filters: A string of conditional filters to add to the query.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    db_filter = Filter(alchemist=alchemist)
    db_filter.key = "gene.PhamID"
    db_filter.add(BASE_CONDITIONALS)

    if filters != "":
        try:
            db_filter.add(filters)
        except Exception:
            print("Please check your syntax for the conditional string:\n"
                  f"{filters}")
            sys.exit(1)

    resubmit_columns = db_filter.get_columns(RESUBMIT_COLUMNS)

    phams = [data_dict["Pham"] for data_dict in revisions_data_dicts]

    db_filter.values = phams

    if verbose:
        print("Creating export folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = {}
    export_db.build_groups_map(db_filter,
                               export_path,
                               conditionals_map,
                               groups=groups,
                               verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning resubmit export...")

    for mapped_path in conditionals_map.keys():
        if verbose:
            print("Retreiving phage data for pham revisions...")
        export_dicts = []
        for data_dict in revisions_data_dicts:
            if verbose:
                print(f"...Retrieving data for pham {data_dict['Pham']}...")

            # Copy so the appended Notes clause does not leak across phams.
            conditionals = conditionals_map[mapped_path].copy()

            final_call = data_dict["Final Call"]
            if final_call == "Hypothetical Protein":
                final_call = ""
            conditionals.append(
                querying.build_where_clause(alchemist.graph,
                                            f"gene.Notes!={final_call}"))

            query = querying.build_select(alchemist.graph,
                                          resubmit_columns,
                                          where=conditionals)

            results = querying.execute(alchemist.engine,
                                       query,
                                       in_column=db_filter.key,
                                       values=[data_dict["Pham"]])

            for result in results:
                format_resubmit_data(result, data_dict["Final Call"])
                export_dicts.append(result)

        if not export_dicts:
            if verbose:
                print("'{mapped_path.name}' data selected for resubmision "
                      "matches selected call; no resubmision exported...")

            mapped_path.rmdir()
            continue

        export_dicts = sorted(export_dicts,
                              key=lambda export_dict: export_dict["Phage"])

        if verbose:
            print(f"Writing {CSV_NAME} in {mapped_path.name}...")
        file_path = mapped_path.joinpath(CSV_NAME)
        basic.export_data_dict(export_dicts,
                               file_path,
                               RESUBMIT_HEADER,
                               include_headers=True)
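A minimal invocation sketch for the pipeline above, assuming a reachable
database and revision dictionaries shaped as the docstring describes. The
database name, pham ID, and final call below are illustrative only, and the
AlchemyHandler import path is assumed from pdm_utils' layout.

from pathlib import Path

from pdm_utils.classes.alchemyhandler import AlchemyHandler

alchemist = AlchemyHandler(database="Actinobacteriophage")  # illustrative name
alchemist.connect(pipeline=True)

# Each revision dict pairs a pham with the annotation it should carry.
revisions = [{"Pham": 40481, "Final Call": "terminase"}]  # illustrative data
execute_resubmit(alchemist, revisions, Path.cwd(), "resubmit_results",
                 verbose=True)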
Example #2
def execute_review(alchemist, folder_path, folder_name,
                   review=True, values=[],
                   filters="", groups=[], sort=[],
                   g_reports=False, s_report=False,
                   verbose=False):
    """Executes the entirety of the pham review pipeline.
    
    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param review: A boolean to toggle filtering of phams by pham discrepancies.
    :type review: bool
    :param values: List of values to filter database results.
    :type values: list[str]
    :param filters: A string of conditional filters to add to the query.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param g_reports: A boolean to toggle export of additional pham information.
    :type g_reports: bool
    :param s_report: A boolean to toggle export of a summary report.
    :type s_report: bool
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    """
    db_filter = Filter(alchemist=alchemist)
    db_filter.key = "gene.PhamID"

    if values:
        db_filter.values = values

    if verbose:
        print(f"Identified {len(values)} phams to review...")
           
    if filters != "":
        try:
            db_filter.add(filters)
        except Exception:
            print("Please check your syntax for the conditional string:\n"
                  f"{filters}")
            sys.exit(1)
        finally:
            db_filter.update()

    # Reset the filter's conditionals directly, keeping the values retrieved
    # above, before applying the base conditionals.
    db_filter._filters = []
    db_filter._updated = False
    db_filter._or_index = -1

    db_filter.add(BASE_CONDITIONALS)
    db_filter.update()

    if not db_filter.values:
        print("Current settings produced no database hits.")
        sys.exit(1)

    if review: 
        review_phams(db_filter, verbose=verbose)

    if sort:
        db_filter.sort(sort)

    if verbose:
        print("Creating export folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = {}
    export_db.build_groups_map(db_filter, export_path, conditionals_map,
                               groups=groups, verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning review export...")
    original_phams = db_filter.values
    total_g_data = {}
    for mapped_path in conditionals_map.keys():
        conditionals = conditionals_map[mapped_path]
        db_filter.values = original_phams
        db_filter.values = db_filter.build_values(where=conditionals)

        pf_data = get_pf_data(alchemist, db_filter, verbose=verbose)
        write_report(pf_data, mapped_path, PF_HEADER,
                     csv_name="FunctionReport",
                     verbose=verbose)

        if g_reports:
            execute_g_report_export(alchemist, db_filter, mapped_path,
                                    total_g_data=total_g_data,
                                    verbose=verbose)

        if s_report:
            execute_s_report_export(alchemist, db_filter, conditionals,
                                    mapped_path, verbose=verbose)
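As above, a hedged invocation sketch for execute_review; alchemist is the
connected AlchemyHandler from the previous sketch, and the filter, group,
and sort values are illustrative only.

from pathlib import Path

execute_review(alchemist, Path.cwd(), "review_results",
               filters="phage.Cluster=A",  # illustrative conditional string
               groups=["phage.Cluster"],   # one subfolder per cluster
               sort=["gene.PhamID"],
               g_reports=True, verbose=True)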
Example #3
def main(argument_list):
    """Run the find_domains pipeline.

    :param argument_list: Command line arguments for the pipeline.
    :type argument_list: list[str]
    """
    # Setup argument parser
    cdd_parser = setup_argparser()

    # Use argument parser to parse argument_list
    args = cdd_parser.parse_args(argument_list)

    # Store arguments in more easily accessible variables
    database = args.database
    cdd_dir = expand_path(args.cdd)
    cdd_name = learn_cdd_name(cdd_dir)
    threads = args.threads
    evalue = args.evalue
    rpsblast = args.rpsblast
    tmp_dir = args.tmp_dir
    output_folder = args.output_folder
    reset = args.reset
    batch_size = args.batch_size

    # Set up directory.
    output_folder = basic.set_path(output_folder, kind="dir", expect=True)
    results_folder = pathlib.Path(RESULTS_FOLDER)
    results_path = basic.make_new_dir(output_folder, results_folder,
                                      attempt=50)
    if results_path is None:
        print("Unable to create output_folder.")
        sys.exit(1)

    log_file = pathlib.Path(results_path, MAIN_LOG_FILE)

    # Set up root logger.
    logging.basicConfig(filename=log_file, filemode="w", level=logging.DEBUG,
                        format="pdm_utils find_domains: %(levelname)s: %(message)s")
    logger.info(f"pdm_utils version: {VERSION}")
    logger.info(f"CDD run date: {constants.CURRENT_DATE}")
    logger.info(f"Command line arguments: {' '.join(argument_list)}")
    logger.info(f"Results directory: {results_path}")

    # Early exit if the CDD database name could not be determined; the
    # rpsblast lookup below handles a missing rpsblast path.
    if cdd_name == "":
        msg = ("Unable to learn CDD database name. Make sure the files in "
               f"{cdd_dir} all have the same basename.")
        logger.error(msg)
        print(msg)
        return

    # Get the rpsblast command and path.
    if rpsblast == "":
        command = get_rpsblast_command()
        rpsblast = get_rpsblast_path(command)

    # Verify database connection and schema compatibility.
    alchemist = AlchemyHandler(database=database)
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    logger.info(f"Connected to database: {database}.")
    mysqldb.check_schema_compatibility(engine, "the find_domains pipeline")
    logger.info(f"Schema version is compatible.")
    logger.info("Command line arguments verified.")

    if reset:
        logger.info("Clearing all domain data currently in the database.")
        clear_domain_data(engine)

    # Get gene data that needs to be processed
    # in dict format where key = column name, value = stored value.
    cdd_genes = mysqldb_basic.query_dict_list(engine, GET_GENES_FOR_CDD)
    msg = f"{len(cdd_genes)} genes to search for conserved domains..."
    logger.info(msg)
    print(msg)

    # Only run the pipeline if there are genes returned that need it
    if len(cdd_genes) > 0:

        log_gene_ids(cdd_genes)
        make_tempdir(tmp_dir)

        # Identify unique translations to process mapped to GeneIDs.
        cds_trans_dict = create_cds_translation_dict(cdd_genes)

        unique_trans = list(cds_trans_dict.keys())
        msg = (f"{len(unique_trans)} unique translations "
               "to search for conserved domains...")
        logger.info(msg)
        print(msg)

        # Process translations in batches. Otherwise, searching could take
        # so long that MySQL connection closes resulting in 1 or more
        # transaction errors.
        batch_indices = basic.create_indices(unique_trans, batch_size)
        total_rolled_back = 0
        for indices in batch_indices:
            start = indices[0]
            stop = indices[1]
            msg = f"Processing translations {start + 1} to {stop}..."
            logger.info(msg)
            print(msg)
            sublist = unique_trans[start:stop]
            batch_rolled_back = search_translations(
                                    rpsblast, cdd_name, tmp_dir, evalue,
                                    threads, engine, sublist, cds_trans_dict)
            total_rolled_back += batch_rolled_back

        search_summary(total_rolled_back)
        engine.dispose()

    return
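The batch loop above relies on basic.create_indices to split the unique
translations into slice boundaries. A minimal stand-in with the presumed
(start, stop) contract, for illustration only:

def create_indices(input_list, batch_size):
    """Return (start, stop) slice boundaries that cover input_list."""
    length = len(input_list)
    return [(start, min(start + batch_size, length))
            for start in range(0, length, batch_size)]

# create_indices(list(range(10)), 4) -> [(0, 4), (4, 8), (8, 10)]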
Example #4
def main(unparsed_args_list):
    """Run main retrieve_updates pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    date = time.strftime("%Y%m%d")

    args.output_folder = basic.set_path(args.output_folder,
                                        kind="dir",
                                        expect=True)

    working_dir = pathlib.Path(f"{date}_get_data")
    working_path = basic.make_new_dir(args.output_folder,
                                      working_dir,
                                      attempt=10)

    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # Verify database connection and schema compatibility.
    print("Preparing genome data sets from the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # Get existing data from MySQL to determine what needs to be updated.
    query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
             "DateLastModified, Accession, RetrieveRecord, Subcluster, "
             "AnnotationAuthor FROM phage")

    mysqldb_genome_list = mysqldb.parse_genome_data(engine=engine,
                                                    phage_query=query,
                                                    gnm_type="mysqldb")
    engine.dispose()
    mysqldb_genome_dict = {}
    for gnm in mysqldb_genome_list:
        mysqldb_genome_dict[gnm.id] = gnm

    # Get data from PhagesDB
    if args.updates or args.final or args.draft:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(
            phagesdb_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            phagesdb_phages_dict, gnm_type="phagesdb", seq=False)

        # Exit if no phage data was retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # match_genomes returns matched genomes and unmatched PhagesDB ids.
        matched_genomes, unmatched_phagesdb_ids = match_genomes(
            mysqldb_genome_dict, phagesdb_genome_dict)

    if args.updates:
        get_update_data(working_path, matched_genomes)
    if args.final:
        get_final_data(working_path, matched_genomes)
    if args.genbank:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_cred_dict,
                         args.genbank_results)
    if args.draft:
        get_draft_data(working_path, unmatched_phagesdb_ids)
    print("\n\n\nRetrieve updates script completed.")
Example #5
def main(unparsed_args_list):
    """Run main retrieve_updates pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    force = args.force_download
    args.output_folder = basic.set_path(args.output_folder,
                                        kind="dir",
                                        expect=True)
    working_dir = pathlib.Path(RESULTS_FOLDER)
    working_path = basic.make_new_dir(args.output_folder,
                                      working_dir,
                                      attempt=50)

    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # Verify database connection and schema compatibility.
    print("Preparing genome data sets from the MySQL database...")
    alchemist = AlchemyHandler(database=args.database)
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # Get existing data from MySQL to determine what needs to be updated.
    query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
             "DateLastModified, Accession, RetrieveRecord, Subcluster, "
             "AnnotationAuthor FROM phage")

    mysqldb_genome_list = mysqldb.parse_genome_data(engine=engine,
                                                    phage_query=query,
                                                    gnm_type="mysqldb")
    engine.dispose()
    mysqldb_genome_dict = {}
    for gnm in mysqldb_genome_list:
        # With default date, the date of all records retrieved will be newer.
        if force:
            gnm.date = constants.EMPTY_DATE
        mysqldb_genome_dict[gnm.id] = gnm

    # Get data from PhagesDB
    if args.updates or args.final or args.draft:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(
            phagesdb_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            phagesdb_phages_dict, gnm_type="phagesdb", seq=False)

        # Exit if no phage data was retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # match_genomes returns matched genomes and unmatched PhagesDB ids.
        matched_genomes, unmatched_phagesdb_ids = match_genomes(
            mysqldb_genome_dict, phagesdb_genome_dict)

    if args.updates:
        get_update_data(working_path, matched_genomes)
    if args.final:
        get_final_data(working_path, matched_genomes)
    if args.genbank:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_cred_dict,
                         args.genbank_results)
    if args.draft:
        if force:
            # Add all draft genomes currently in database to the list of
            # draft genomes to be downloaded.
            drafts = get_matched_drafts(matched_genomes)
            unmatched_phagesdb_ids |= drafts
        get_draft_data(working_path, unmatched_phagesdb_ids)
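The force flag works by clearing each stored genome's date, so any record
retrieved later compares as newer and is always flagged for update. A
compressed sketch of that comparison; EMPTY_DATE here is a stand-in for
constants.EMPTY_DATE:

import datetime

EMPTY_DATE = datetime.datetime(1, 1, 1)         # stand-in sentinel
stored_date = EMPTY_DATE                        # what force assigns
retrieved_date = datetime.datetime(2020, 5, 1)  # any real record date
assert retrieved_date > stored_date             # the update is always taken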
Example #6
def execute_export(alchemist,
                   output_path,
                   output_name,
                   values=[],
                   verbose=False,
                   csv_export=False,
                   ffile_export=None,
                   db_export=False,
                   table="phage",
                   filters=[],
                   groups=[]):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully built AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param output_path: Path to a valid dir for new dir creation.
    :type output_path: Path
    :param output_name: A name for the export folder.
    :type output_name: str
    :param values: List of values to filter database results.
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param csv_export: A boolean value to toggle csv export.
    :type csv_export: bool
    :param ffile_export: A SeqIO-supported file format to toggle ffile export.
    :type ffile_export: str
    :param db_export: A boolean value to toggle database export.
    :type db_export: bool
    :param table: MySQL table name to export from.
    :type table: str
    :param filters: A list of lists with filter values, grouped by ORs.
    :type filters: list[list[str]]
    :param groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    """

    if verbose:
        print("Retrieving database version...")
    db_version = mysqldb.get_version_table_data(alchemist.engine)

    if verbose:
        print("Creating export folder...")

    export_path = output_path.joinpath(output_name)
    export_path = basic.make_new_dir(output_path, export_path, attempt=50)

    if db_export:
        if verbose:
            print("Writing SQL database file...")
        write_database(alchemist, db_version["Version"], export_path)

    elif csv_export or ffile_export is not None:
        table_obj = alchemist.get_table(table)
        # Grab the table's primary key column (the last, if the key is compound).
        for column in table_obj.primary_key.columns:
            primary_key = column

        db_filter = Filter(alchemist=alchemist, key=primary_key)
        db_filter.values = values

        for or_filters in filters:
            for filter_value in or_filters:
                db_filter.add(filter_value)

        db_filter.update()

        if filters and not db_filter.values:
            return

        values_map = {}
        if groups:
            build_groups_map(db_filter,
                             export_path,
                             groups=groups,
                             values_map=values_map,
                             verbose=verbose)
        else:
            values_map.update({export_path: db_filter.values})

        for export_path in values_map.keys():
            values = values_map[export_path]

            if csv_export:
                execute_csv_export(alchemist,
                                   export_path,
                                   table=table,
                                   values=values,
                                   verbose=verbose)

            elif ffile_export is not None:
                execute_ffx_export(alchemist,
                                   export_path,
                                   ffile_export,
                                   db_version,
                                   table=table,
                                   values=values,
                                   verbose=verbose)
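A hedged invocation sketch for execute_export; alchemist is a connected
AlchemyHandler as in the earlier sketches, and the table, filter, and group
values are illustrative only. Note that filters here is a list of OR-grouped
lists, unlike the conditional strings taken by the other pipelines.

from pathlib import Path

execute_export(alchemist, Path.cwd(), "export_results",
               csv_export=True, table="phage",
               filters=[["phage.Cluster=A"]],  # one OR-group of AND filters
               groups=["phage.Subcluster"],
               verbose=True)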
Example #7
def execute_pham_finder(alchemist, folder_path, folder_name, 
                        adatabase, bdatabase, values=None,
                        filters="", groups=[], sort=[],
                        show_per=False, use_locus=False, verbose=False):
    """Executes the entirety of the file export pipeline.

    :param alchemist: A connected and fully build AlchemyHandler object.
    :type alchemist: AlchemyHandler
    :param folder_path: Path to a valid dir for new dir creation.
    :type folder_path: Path
    :param folder_name: A name for the export folder.
    :type folder_name: str
    :param adatabase: Name of reference database to source phams from.
    :type adatabase: str
    :param bdatabase: Name of database to find corresponding phams for.
    :type bdatabase: str
    :param values: List of values to filter database results:
    :type values: list[str]
    :param verbose: A boolean value to toggle progress print statements.
    :type verbose: bool
    :param table: MySQL table name.
    :type table: str
    :param filters: A list of lists with filter values, grouped by ORs.
    :type filters: str
    :param groups: A list of supported MySQL column names to group by.
    :type groups: A list of supported MySQL column names to group by.
    :type groups: list[str]
    :param sort: A list of supported MySQL column names to sort by.
    :type sort: list[str]
    :param show_per: Enables display gene coverage of the corresponding phams.
    :type show_per: bool
    :param use_locus: Toggles conversion between phams using LocusTag instead
    :type use_locus: bool
    """
    if not (adatabase in alchemist.databases and
            bdatabase in alchemist.databases):
        print("User credentials do not have access to both "
              f"databases {adatabase} and {bdatabase}.\n"
              "Please check your database access and try again.")
        sys.exit(1)

    alchemist.database = adatabase
    alchemist.connect()
    a_filter = pipelines_basic.build_filter(alchemist, "gene.PhamID", filters,
                                            values=values, verbose=verbose)

    alchemist.database = bdatabase
    alchemist.connect()
    if use_locus:
        b_filter = pipelines_basic.build_filter(alchemist, "gene.LocusTag", "")
    else:
        b_filter = pipelines_basic.build_filter(alchemist, "gene", "")

    if sort:
        try:
            a_filter.sort(sort)
        except Exception:
            print("Please check your syntax for sorting columns:\n"
                  f"{', '.join(sort)}")
            sys.exit(1)

    if verbose:
        print("Creating pham_finder folder...")
    export_path = folder_path.joinpath(folder_name)
    export_path = basic.make_new_dir(folder_path, export_path, attempt=50)

    conditionals_map = {}
    pipelines_basic.build_groups_map(a_filter, export_path, conditionals_map,
                                     groups=groups, verbose=verbose)

    if verbose:
        print("Prepared query and path structure, beginning export...")

    values = a_filter.values
    for mapped_path in conditionals_map.keys():
        a_filter.reset()
        a_filter.values = values

        conditionals = conditionals_map[mapped_path]
        a_filter.values = a_filter.build_values(where=conditionals)
        
        if a_filter.hits() == 0:
            print("No database entries received from gene.PhamID "
                 f"for '{mapped_path}'.")
            shutil.rmtree(mapped_path)
            continue

        if sort:
            sort_columns = get_sort_columns(alchemist, sort)
            a_filter.sort(sort_columns)

        mapped_phams = find_phams(a_filter, b_filter, show_per=show_per)
        if not mapped_phams:
            print("Phams are consistent between the two databases "
                 f"for '{mapped_path}'.")
            shutil.rmtree(mapped_path)
            continue

        out_data_dicts = []
        for ref_pham, corr_phams in mapped_phams.items():
            data_dict = {}
            data_dict[PHAM_FINDER_HEADER[0]] = ref_pham
            data_dict[PHAM_FINDER_HEADER[1]] = corr_phams
            out_data_dicts.append(data_dict)

        file_path = mapped_path.joinpath("PhamMap.csv")
        fileio.export_data_dict(out_data_dicts, file_path, PHAM_FINDER_HEADER,
                                include_headers=True)
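A hedged invocation sketch for execute_pham_finder; alchemist is an
AlchemyHandler built as in the earlier sketches, and both database names
are illustrative and must be accessible to the connected user.

from pathlib import Path

execute_pham_finder(alchemist, Path.cwd(), "pham_finder_results",
                    "Actinobacteriophage", "Actino_Draft",  # illustrative
                    show_per=True, verbose=True)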
Example #8
def main(unparsed_args_list):
    """Run the get_db pipeline.

    The database data can be retrieved from three places:
    The server, which needs to be downloaded to a new folder.
    A local file, in which no download and no new folder are needed.
    The empty schema stored within pdm_utils, in which no download, new folder,
    or local file are needed.
    """
    args = parse_args(unparsed_args_list)

    # Set values that are shared between all three options.
    database = args.database
    option = args.option

    install = True
    schema_version = None
    db_filepath = None

    if option == "file":
        db_filepath = basic.set_path(args.filename, kind="file", expect=True)
    elif option == "new":
        schema_version = args.schema_version
    else:
        # option must be "server"
        server_url = args.url
        version_file = args.version
        output_folder = basic.set_path(args.output_folder,
                                       kind="dir",
                                       expect=True)
        download = True
        remove = True
        results_folder = pathlib.Path(RESULTS_FOLDER)
        results_path = basic.make_new_dir(output_folder,
                                          results_folder,
                                          attempt=50)
        if args.download_only:
            install = False
            remove = False

        if results_path is None:
            print("Unable to create results folder.")
            sys.exit(1)
        else:
            # Only look for the version file if selected.
            if version_file:
                version_filepath, status1 = prepare_download(
                    results_path, server_url, database, "version")
            else:
                status1 = True

            db_filepath, status2 = prepare_download(results_path, server_url,
                                                    database, "sql")
        if not status1 or not status2:
            print("Unable to download data from server.")
            sys.exit(1)

    # If downloading from server, user may have selected to not
    # install the database file.
    if install:
        install_db(database,
                   db_filepath=db_filepath,
                   schema_version=schema_version)

    # The output folder was only created for downloading from server.
    if option == "server":
        if remove:
            print("Removing downloaded data.")
            shutil.rmtree(results_path)
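prepare_download above returns a (filepath, status) pair for each requested
file. A minimal stand-in with the presumed contract, for illustration only;
a real implementation would fetch the file from server_url:

import pathlib

def prepare_download(dest_dir, server_url, database, extension):
    """Build the target path and report whether the download succeeded."""
    filepath = pathlib.Path(dest_dir, f"{database}.{extension}")
    success = True  # placeholder; the real pipeline downloads here
    return filepath, success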