Example #1
def get_files(directory, file, ignore):
    """
    Get the list of file(s) that need to be uploaded.

    :param directory: (optional) directory containing files for upload
    :type directory: pathlib.Path
    :param file: (optional) file to upload
    :type file: pathlib.Path
    :param ignore: file(s) to ignore during upload process
    :type ignore: set
    :return: file_list
    """
    file_list = []

    if directory is not None:
        directory = basic.set_path(directory, kind="dir", expect=True)
        folder_files = basic.identify_contents(directory,
                                               kind="file",
                                               ignore_set=ignore)
        file_list.extend(folder_files)

    if file is not None:
        file = basic.set_path(file, kind="file", expect=True)
        file_list.append(file)

    return file_list
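A hedged usage sketch: the call below assumes set_path expands '~' and validates each path (as the tests further down suggest), and the paths themselves are purely illustrative.

import pathlib

# Hypothetical paths: gather files from a folder plus one extra file,
# skipping macOS metadata during the upload scan.
to_upload = get_files(pathlib.Path("~/uploads"),
                      pathlib.Path("~/extra/database.sql"),
                      ignore={".DS_Store"})
for path in to_upload:
    print(path)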
Example #2
def main(unparsed_args_list):
    """Run the get_db pipeline.

    The database data can be retrieved from three places:
    The server, which needs to be downloaded to a new folder.
    A local file, in which no download and no new folder are needed.
    The empty schema stored within pdm_utils, in which no download, new folder,
    or local file are needed.
    """
    args = parse_args(unparsed_args_list)

    # Unpack the args object and set additional derived values.
    database = args.database
    option = args.option
    install = True
    schema_version = None
    db_filepath = None

    if option == "file":
        db_filepath = basic.set_path(args.filename, kind="file", expect=True)
    elif option == "new":
        schema_version = args.schema_version
    else:
        # option must be "server"
        output_folder = basic.set_path(args.output_folder, kind="dir", expect=True)
        download = True
        remove = True
        results_folder = pathlib.Path(RESULTS_FOLDER)
        results_path = basic.make_new_dir(output_folder, results_folder, attempt=50)
        if args.download_only:
            install = False
            remove = False

        if results_path is None:
            print("Unable to create results folder.")
            sys.exit(1)
        else:
            version_filepath, status1 = prepare_download(results_path,
                                            constants.DB_WEBSITE,
                                            args.database, "version")
            db_filepath, status2 = prepare_download(results_path,
                                            constants.DB_WEBSITE,
                                            args.database, "sql")
        if not status1 or not status2:
            print("Unable to download data from server.")
            sys.exit(1)

    # If downloading from server, user may have selected to not
    # install the database file.
    if install:
        install_db(database, db_filepath=db_filepath, schema_version=schema_version)

    # The output folder was only created for downloading from server.
    if option == "server":
        if remove:
            print("Removing downloaded data.")
            shutil.rmtree(results_path)
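The args object is assumed to carry database, option, filename, schema_version, output_folder, and download_only attributes. One way parse_args could satisfy those accesses is with subparsers; this is only a sketch inferred from the code above, not the pipeline's actual parser.

import argparse

def parse_args_sketch(unparsed_args_list):
    # Subcommand names mirror the "file"/"new"/"server" checks above.
    parser = argparse.ArgumentParser(description="Run the get_db pipeline.")
    parser.add_argument("database")
    subparsers = parser.add_subparsers(dest="option", required=True)
    file_parser = subparsers.add_parser("file")
    file_parser.add_argument("filename")
    new_parser = subparsers.add_parser("new")
    new_parser.add_argument("schema_version")
    server_parser = subparsers.add_parser("server")
    server_parser.add_argument("output_folder")
    server_parser.add_argument("--download_only", action="store_true")
    return parser.parse_args(unparsed_args_list)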
Example #3
def main(unparsed_args_list):
    """Run the push_db pipeline."""
    args = parse_args(unparsed_args_list)

    file_list = []
    if args.directory is not None:
        args.directory = basic.set_path(args.directory,
                                        kind="dir",
                                        expect=True)
        folder_files = basic.identify_contents(args.directory,
                                               kind="file",
                                               ignore_set=set([".DS_Store"]))
        file_list.extend(folder_files)
    if args.file is not None:
        args.file = basic.set_path(args.file, kind="file", expect=True)
        file_list.append(args.file)

    status = True
    if len(file_list) == 0:
        print("There are no files to upload.")
        status = False

    if status:
        server.set_log_file(str(args.log_file))
        transport = server.get_transport(constants.DB_HOST)
        if transport is None:
            status = False

    if status:
        sftp = server.setup_sftp_conn(transport, attempts=3)
        if sftp is None:
            status = False

    success = []
    fail = []
    if status:
        for local_filepath in file_list:
            print(f"Uploading {local_filepath.name}...")
            remote_filepath = pathlib.Path(constants.DB_HOST_DIR,
                                           local_filepath.name)
            result = server.upload_file(sftp, str(local_filepath),
                                        str(remote_filepath))
            if result:
                success.append(local_filepath.name)
            else:
                fail.append(local_filepath.name)
        sftp.close()
        transport.close()

    if len(fail) > 0:
        print("The following files were not uploaded:")
        for file in fail:
            print(file)
Example #4
def main(unparsed_args_list):
    """Run main get_gb_records pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    date = time.strftime("%Y%m%d")
    args.output_folder = basic.set_path(args.output_folder,
                                        kind="dir",
                                        expect=True)

    working_dir = pathlib.Path(f"{date}_get_gb_records")
    working_path = basic.make_new_dir(args.output_folder,
                                      working_dir,
                                      attempt=10)

    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)
    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the get_gb_records pipeline")

    # Create data sets
    print("Retrieving accessions from the database...")
    accessions = mysqldb.get_distinct_data(engine, "phage", "Accession")
    engine.dispose()
    if "" in accessions:
        accessions.remove("")
    if None in accessions:
        accessions.remove(None)

    get_genbank_data(working_path, accessions, ncbi_cred_dict)
Example #5
def test_set_path_4(self, verify_path2_mock):
    """Verify '..' directory resolution."""
    test_file = Path("/dir1/dir2/../file.txt")
    verify_path2_mock.return_value = (True, None)
    output = basic.set_path(test_file, kind="file", expect=True)
    exp = Path("/dir1/file.txt")
    self.assertEqual(output, exp)
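Read together, the set_path tests pin down the behavior the pipelines above rely on: '~' expansion, '..' resolution, and an exit when an expected path is missing. A minimal reconstruction under those assumptions follows; the real pdm_utils function delegates the check to verify_path2 (mocked in these tests), so a plain existence test stands in for it here.

import sys
from pathlib import Path

def set_path_sketch(path, kind=None, expect=True):
    # '~' expansion and '..' resolution, per test_set_path_4/5.
    path = path.expanduser().resolve()
    if expect:
        # Stand-in for verify_path2: check existence and kind.
        valid = path.is_file() if kind == "file" else path.is_dir()
        if not valid:
            print(f"The {kind} '{path}' is not valid.")
            sys.exit(1)  # test_set_path_2 asserts that sys.exit is called
    return path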
Example #6
def main(unparsed_args):
    """Runs the complete update pipeline."""
    args = parse_args(unparsed_args)

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the update pipeline")

    if args.version:
        mysqldb.change_version(engine)
        print("Database version updated.")

    if args.ticket_table is not None:
        update_table_path = basic.set_path(args.ticket_table,
                                           kind="file",
                                           expect=True)

        # Iterate through the tickets and process them sequentially.
        list_of_update_tickets = []
        with update_table_path.open(mode='r') as f:
            file_reader = csv.DictReader(f)
            for row in file_reader:
                list_of_update_tickets.append(row)

        # Variables to be used for end summary
        processed = 0
        succeeded = 0
        failed = 0

        for ticket in list_of_update_tickets:
            conn = engine.connect()
            # Pass the raw db_api connection.
            handler = RandomFieldUpdateHandler(conn.connection)
            handler.table = ticket["table"]          # which table will be updated?
            handler.field = ticket["field"]          # which field will be updated?
            handler.value = ticket["value"]          # what value will be put in that field?
            handler.key_name = ticket["key_name"]    # how will we know which row is the right one?
            handler.key_value = ticket["key_value"]  # how will we know which row is the right one?
            handler.validate_ticket()  # make sure all handler attributes are valid
            status = handler.execute_ticket()  # do what was requested
            if status == 1:
                processed += 1
                succeeded += 1
            else:
                processed += 1
                failed += 1

        engine.dispose()
        print("\nDone iterating through tickets.")
        if succeeded > 0:
            print(f"{succeeded} / {processed} tickets successfully handled.")
        if failed > 0:
            print(f"{failed} / {processed} tickets failed to be handled.")
Example #7
def convert_file_path(path: str):
    """Function to convert argparse input to a working file path.

    :param path: A string to be converted into a Path object.
    :type path: str
    :returns: A Path object converted from the input string.
    :rtype: Path
    """
    return basic.set_path(Path(path), kind="file")
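Since the docstring frames this as an argparse converter, it is presumably wired in as a type callable; the flag name below is hypothetical.

import argparse

parser = argparse.ArgumentParser()
# argparse calls convert_file_path on the raw string, so downstream code
# receives a validated pathlib.Path instead of a str.
parser.add_argument("--ticket_table", type=convert_file_path)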
Example #8
def create_empty_config_file(dir, file, null_value):
    """Create an empty config file with all available settings."""
    output_path = basic.set_path(dir, kind="dir", expect=True)
    config_path = basic.make_new_file(output_path, file, "txt", attempt=50)
    if config_path is None:
        print("Unable to create config file. File already exists.")
    else:
        parser = default_parser(null_value)
        write_config(parser, config_path)
Example #9
def main(unparsed_args_list):
    """Run main get_gb_records pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)

    # Filters input: phage.Status=draft AND phage.HostGenus=Mycobacterium
    # Args structure: [['phage.Status=draft'], ['phage.HostGenus=Mycobacterium']]
    filters = args.filters
    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)
    output_folder = basic.set_path(args.output_folder, kind="dir", expect=True)
    working_dir = pathlib.Path(RESULTS_FOLDER)
    working_path = basic.make_new_dir(output_folder, working_dir, attempt=50)
    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")
    alchemist = AlchemyHandler(database=args.database)
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the get_gb_records pipeline")

    # Get SQLAlchemy metadata Table object
    # table_obj.primary_key.columns is a
    # SQLAlchemy ColumnCollection iterable object
    # Set primary key = 'phage.PhageID'
    alchemist.build_metadata()
    table = querying.get_table(alchemist.metadata, TARGET_TABLE)
    for column in table.primary_key.columns:
        primary_key = column

    # Create filter object and then add command line filter strings
    db_filter = Filter(alchemist=alchemist, key=primary_key)
    db_filter.values = []

    # Attempt to add filters and exit if needed.
    add_filters(db_filter, filters)

    # Performs the query
    db_filter.update()

    # db_filter.values now contains list of PhageIDs that pass the filters.
    # Get the accessions associated with these PhageIDs.
    keep_set = set(db_filter.values)

    # Create data sets
    print("Retrieving accessions from the database...")
    query = construct_accession_query(keep_set)
    list_of_dicts = mysqldb_basic.query_dict_list(engine, query)
    id_acc_dict = get_id_acc_dict(list_of_dicts)
    acc_id_dict = get_acc_id_dict(id_acc_dict)
    engine.dispose()
    if acc_id_dict:
        get_data(working_path, acc_id_dict, ncbi_cred_dict)
    else:
        print("There are no records to retrieve.")
Example #10
def test_set_path_5(self, verify_path2_mock):
    """Verify home directory expansion and '..' directory resolution."""
    home = Path("~")
    home = home.expanduser()
    test_file = Path("~/dir1/dir2/../file.txt")
    verify_path2_mock.return_value = (True, None)
    output = basic.set_path(test_file, kind="file", expect=True)
    exp = Path(home, "dir1/file.txt")
    self.assertEqual(output, exp)
Example #11
def main(unparsed_args):
    """Runs the complete update pipeline."""
    args = parse_args(unparsed_args[2:])

    # Verify database connection and schema compatibility.
    print("Connecting to the MySQL database...")

    # Create config object with data obtained from file and/or defaults.
    config = configfile.build_complete_config(args.config_file)
    mysql_creds = config["mysql"]
    alchemist = AlchemyHandler(database=args.database,
                               username=mysql_creds["user"],
                               password=mysql_creds["password"])
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the update pipeline")

    if args.version is True:
        mysqldb.change_version(engine)
        print("Database version updated.")

    if args.ticket_table is not None:
        update_table_path = basic.set_path(args.ticket_table,
                                           kind="file",
                                           expect=True)

        # Iterate through the tickets and process them sequentially.
        list_of_update_tickets = []
        with update_table_path.open(mode='r') as f:
            file_reader = csv.DictReader(f)
            for row in file_reader:
                list_of_update_tickets.append(row)

        # Variables to be used for end summary
        processed = 0
        succeeded = 0
        failed = 0

        for ticket in list_of_update_tickets:
            status = update_field(alchemist, ticket)

            if status == 1:
                processed += 1
                succeeded += 1
            else:
                processed += 1
                failed += 1

        engine.dispose()
        print("\nDone iterating through tickets.")
        if succeeded > 0:
            print(f"{succeeded} / {processed} tickets successfully handled.")
        if failed > 0:
            print(f"{failed} / {processed} tickets failed to be handled.")
Example #12
def test_set_path_1(self):
    """Verify output when file exists and is expected to exist."""
    self.file.touch()
    # Since we are using tempfile, there is an added quirk:
    # the tempfile path may be a symlink, so passing it through set_path
    # will resolve the symlink, changing the path and breaking the test.
    self.file = self.file.resolve()
    output = basic.set_path(self.file, kind="file", expect=True)
    with self.subTest():
        self.assertIsInstance(output, Path)
    with self.subTest():
        self.assertEqual(str(self.file), str(output))
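The comment above implies self.file is created under a tempfile-backed directory in setUp. A plausible harness, with names that are assumptions rather than the actual test module:

import tempfile
import unittest
from pathlib import Path

class TestSetPath(unittest.TestCase):  # class name is an assumption
    def setUp(self):
        # On macOS the tempdir path may be a symlink, which is the quirk
        # the test above works around by calling resolve() first.
        self.tmpdir = tempfile.TemporaryDirectory()
        self.file = Path(self.tmpdir.name, "test.txt")  # filename assumed

    def tearDown(self):
        self.tmpdir.cleanup()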
Example #13
def parse_config(file, parser=None):
    """Get parameters from config file."""
    filepath = basic.set_path(file, kind="file", expect=True)
    if parser is None:
        parser = configparser.ConfigParser()

    try:
        parser.read(filepath)
    except configparser.Error:
        print("Unable to parse config file")
        sys.exit(1)
    else:
        return parser
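A short usage sketch relying only on stdlib configparser behavior; the file path and [mysql] section here are hypothetical, though other examples on this page do read a "mysql" section via build_complete_config.

from pathlib import Path

parser = parse_config(Path("~/pdm_utils.ini"))  # hypothetical path
if parser.has_section("mysql"):
    user = parser.get("mysql", "user", fallback=None)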
Example #14
def execute_get_file_db(alchemist,
                        database,
                        filename,
                        config_file=None,
                        schema_version=None,
                        verbose=False):
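    """Validate a local database file path and install the database."""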
    db_filepath = basic.set_path(filename, kind="file", expect=True)

    install_db(alchemist,
               database,
               db_filepath=db_filepath,
               config_file=config_file,
               schema_version=schema_version,
               verbose=verbose)
Example #15
def get_ncbi_creds(filename):
    """Get NCBI credentials from a file.

    :param filename: Path to config file containing NCBI login credentials.
    :type filename: Path
    :return: Dictionary of NCBI login credentials.
    :rtype: dict
    """
    ncbi_cred_dict = {}
    ncbi_cred_dict["ncbi_api_key"] = None
    ncbi_cred_dict["ncbi_email"] = None
    ncbi_cred_dict["ncbi_tool"] = None

    if filename is not None:
        filepath = basic.set_path(filename, kind="file", expect=True)
        config_dict = basic.parse_config_file(filepath)
        try:
            ncbi_cred_dict["ncbi_api_key"] = config_dict["ncbi_api_key"]
            ncbi_cred_dict["ncbi_email"] = config_dict["ncbi_email"]
            ncbi_cred_dict["ncbi_tool"] = config_dict["ncbi_tool"]
        except KeyError:
            print(f"Unable to parse NCBI credentials from {filepath.name}")
    return ncbi_cred_dict
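Because the dictionary is pre-seeded before the file is read, the return shape is stable even without a credentials file:

# With no filename, all three keys are present but map to None.
creds = get_ncbi_creds(None)
assert creds == {"ncbi_api_key": None, "ncbi_email": None, "ncbi_tool": None}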
Example #16
def main(unparsed_args_list):
    """Run main retrieve_updates pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    force = args.force_download
    args.output_folder = basic.set_path(args.output_folder,
                                        kind="dir",
                                        expect=True)
    working_dir = pathlib.Path(RESULTS_FOLDER)
    working_path = basic.make_new_dir(args.output_folder,
                                      working_dir,
                                      attempt=50)

    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    # Create config object with data obtained from file and/or defaults.
    config = configfile.build_complete_config(args.config_file)
    mysql_creds = config["mysql"]
    ncbi_creds = config["ncbi"]

    # Verify database connection and schema compatibility.
    print("Preparing genome data sets from the MySQL database...")
    alchemist = AlchemyHandler(database=args.database,
                               username=mysql_creds["user"],
                               password=mysql_creds["password"])
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # Get existing data from MySQL to determine what needs to be updated.
    query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
             "DateLastModified, Accession, RetrieveRecord, Subcluster, "
             "AnnotationAuthor FROM phage")

    mysqldb_genome_list = mysqldb.parse_genome_data(engine=engine,
                                                    phage_query=query,
                                                    gnm_type="mysqldb")
    engine.dispose()
    mysqldb_genome_dict = {}
    for gnm in mysqldb_genome_list:
        # With default date, the date of all records retrieved will be newer.
        if force:
            gnm.date = constants.EMPTY_DATE
        mysqldb_genome_dict[gnm.id] = gnm

    # Get data from PhagesDB
    if args.updates or args.final or args.draft:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(
            phagesdb_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            phagesdb_phages_dict, gnm_type="phagesdb", seq=False)

        # Exit if all phage data wasn't retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # Returns a list of tuples.
        tup = match_genomes(mysqldb_genome_dict, phagesdb_genome_dict)
        matched_genomes = tup[0]
        unmatched_phagesdb_ids = tup[1]

    if args.updates is True:
        get_update_data(working_path, matched_genomes)
    if args.final is True:
        get_final_data(working_path, matched_genomes)
    if args.genbank is True:
        get_genbank_data(working_path,
                         mysqldb_genome_dict,
                         ncbi_creds,
                         args.genbank_results,
                         force=force)
    if args.draft is True:
        if force:
            # Add all draft genomes currently in database to the list of
            # draft genomes to be downloaded.
            drafts = get_matched_drafts(matched_genomes)
            unmatched_phagesdb_ids |= drafts
        get_draft_data(working_path, unmatched_phagesdb_ids)
Example #17
def main(argument_list):
    """
    :param argument_list:
    :return:
    """
    # Setup argument parser
    cdd_parser = setup_argparser()

    # Use argument parser to parse argument_list
    args = cdd_parser.parse_args(argument_list)

    # Store arguments in more easily accessible variables
    database = args.db
    cdd_dir = expand_path(args.dir)
    cdd_name = learn_cdd_name(cdd_dir)
    threads = args.threads
    evalue = args.evalue
    rpsblast = args.rpsblast
    tmp_dir = args.tmp_dir
    output_folder = args.output_folder
    log_file = args.log_file
    reset = args.reset

    # Set up directory.
    output_folder = basic.set_path(output_folder, kind="dir", expect=True)
    results_folder = pathlib.Path(RESULTS_FOLDER)
    results_path = basic.make_new_dir(output_folder,
                                      results_folder,
                                      attempt=10)
    if results_path is None:
        print("Unable to create output_folder.")
        sys.exit(1)

    log_file = pathlib.Path(results_path, log_file)

    # Set up root logger.
    logging.basicConfig(
        filename=log_file,
        filemode="w",
        level=logging.DEBUG,
        format="pdm_utils find_domains: %(levelname)s: %(message)s")
    logger.info(f"pdm_utils version: {VERSION}")
    logger.info(f"CDD run date: {constants.CURRENT_DATE}")
    logger.info(f"Command line arguments: {' '.join(argument_list)}")
    logger.info(f"Results directory: {results_path}")

    # Early exit if either 1) cdd_name == "" or 2) no rpsblast given and we are
    # unable to find one
    if cdd_name == "":
        msg = (f"Unable to learn CDD database name. Make sure the files in "
               f"{cdd_dir} all have the same basename.")
        logger.error(msg)
        print(msg)
        return

    # Get the rpsblast command and path.
    if rpsblast == "":
        command = get_rpsblast_command()
        rpsblast = get_rpsblast_path(command)

    # Verify database connection and schema compatibility.
    engine = mysqldb.connect_to_db(database)
    logger.info(f"Connected to database: {database}.")
    mysqldb.check_schema_compatibility(engine, "the find_domains pipeline")
    logger.info(f"Schema version is compatible.")
    logger.info("Command line arguments verified.")

    if reset:
        logger.info("Clearing all domain data currently in the database.")
        clear_domain_data(engine)

    # Get gene data that needs to be processed
    # in dict format where key = column name, value = stored value.
    # result = engine.execute(GET_GENES_FOR_CDD)
    cdd_genes = mysqldb.query_dict_list(engine, GET_GENES_FOR_CDD)
    msg = f"{len(cdd_genes)} genes to search for conserved domains..."
    logger.info(msg)
    print(msg)

    # Only run the pipeline if there are genes returned that need it
    if len(cdd_genes) > 0:
        log_gene_ids(cdd_genes)

        # Create temp_dir
        make_tempdir(tmp_dir)

        # TODO dev
        # translations = get_unique_translations(cdd_genes)

        # Build jobs list
        jobs = []

        # TODO dev
        # translation_id = 0
        # for translation in translations:
        #     translation_id += 1
        #     jobs.append((rpsblast, cdd_name, tmp_dir, evalue,
        #                  translation_id, translation))

        for cdd_gene in cdd_genes:
            jobs.append((rpsblast, cdd_name, tmp_dir, evalue,
                         cdd_gene["GeneID"], cdd_gene["Translation"]))

        results = parallelize(jobs, threads, search_and_process)
        print("\n")

        # TODO dev
        # results_dict = create_results_dict(results)
        # map_results_to_genes(cdd_genes, results_dict)

        insert_domain_data(engine, results)
        engine.dispose()
    return
Example #18
def main(argument_list):
    """
    :param argument_list:
    :return:
    """
    # Setup argument parser
    cdd_parser = setup_argparser()

    # Use argument parser to parse argument_list
    args = cdd_parser.parse_args(argument_list)

    # Store arguments in more easily accessible variables
    database = args.database
    cdd_dir = expand_path(args.cdd)
    cdd_name = learn_cdd_name(cdd_dir)
    threads = args.threads
    evalue = args.evalue
    rpsblast = args.rpsblast
    tmp_dir = args.tmp_dir
    output_folder = args.output_folder
    reset = args.reset
    batch_size = args.batch_size

    # Set up directory.
    output_folder = basic.set_path(output_folder, kind="dir", expect=True)
    results_folder = pathlib.Path(RESULTS_FOLDER)
    results_path = basic.make_new_dir(output_folder, results_folder,
                                      attempt=50)
    if results_path is None:
        print("Unable to create output_folder.")
        sys.exit(1)

    log_file = pathlib.Path(results_path, MAIN_LOG_FILE)

    # Set up root logger.
    logging.basicConfig(filename=log_file, filemode="w", level=logging.DEBUG,
                        format="pdm_utils find_domains: %(levelname)s: %(message)s")
    logger.info(f"pdm_utils version: {VERSION}")
    logger.info(f"CDD run date: {constants.CURRENT_DATE}")
    logger.info(f"Command line arguments: {' '.join(argument_list)}")
    logger.info(f"Results directory: {results_path}")

    # Early exit if either 1) cdd_name == "" or 2) no rpsblast given and we are
    # unable to find one
    if cdd_name == "":
        msg = (f"Unable to learn CDD database name. Make sure the files in "
              f"{cdd_dir} all have the same basename.")
        logger.error(msg)
        print(msg)
        return

    # Get the rpsblast command and path.
    if rpsblast == "":
        command = get_rpsblast_command()
        rpsblast = get_rpsblast_path(command)

    # Verify database connection and schema compatibility.
    alchemist = AlchemyHandler(database=database)
    alchemist.connect(pipeline=True)
    engine = alchemist.engine
    logger.info(f"Connected to database: {database}.")
    mysqldb.check_schema_compatibility(engine, "the find_domains pipeline")
    logger.info(f"Schema version is compatible.")
    logger.info("Command line arguments verified.")

    if reset:
        logger.info("Clearing all domain data currently in the database.")
        clear_domain_data(engine)

    # Get gene data that needs to be processed
    # in dict format where key = column name, value = stored value.
    cdd_genes = mysqldb_basic.query_dict_list(engine, GET_GENES_FOR_CDD)
    msg = f"{len(cdd_genes)} genes to search for conserved domains..."
    logger.info(msg)
    print(msg)

    # Only run the pipeline if there are genes returned that need it
    if len(cdd_genes) > 0:

        log_gene_ids(cdd_genes)
        make_tempdir(tmp_dir)

        # Identify unique translations to process mapped to GeneIDs.
        cds_trans_dict = create_cds_translation_dict(cdd_genes)

        unique_trans = list(cds_trans_dict.keys())
        msg = (f"{len(unique_trans)} unique translations "
               "to search for conserved domains...")
        logger.info(msg)
        print(msg)

        # Process translations in batches. Otherwise, searching could take
        # so long that MySQL connection closes resulting in 1 or more
        # transaction errors.
        batch_indices = basic.create_indices(unique_trans, batch_size)
        total_rolled_back = 0
        for indices in batch_indices:
            start = indices[0]
            stop = indices[1]
            msg = f"Processing translations {start + 1} to {stop}..."
            logger.info(msg)
            print(msg)
            sublist = unique_trans[start:stop]
            batch_rolled_back = search_translations(
                                    rpsblast, cdd_name, tmp_dir, evalue,
                                    threads, engine, sublist, cds_trans_dict)
            total_rolled_back += batch_rolled_back

        search_summary(total_rolled_back)
        engine.dispose()

    return
Example #19
def main(unparsed_args_list):
    """Run main retrieve_updates pipeline."""
    # Parse command line arguments
    args = parse_args(unparsed_args_list)
    date = time.strftime("%Y%m%d")

    args.output_folder = basic.set_path(args.output_folder,
                                        kind="dir",
                                        expect=True)

    working_dir = pathlib.Path(f"{date}_get_data")
    working_path = basic.make_new_dir(args.output_folder,
                                      working_dir,
                                      attempt=10)

    if working_path is None:
        print(f"Invalid working directory '{working_dir}'")
        sys.exit(1)

    ncbi_cred_dict = ncbi.get_ncbi_creds(args.ncbi_credentials_file)

    # Verify database connection and schema compatibility.
    print("Preparing genome data sets from the MySQL database...")
    engine = mysqldb.connect_to_db(args.database)
    mysqldb.check_schema_compatibility(engine, "the get_data pipeline")

    # Get existing data from MySQL to determine what needs to be updated.
    query = ("SELECT PhageID, Name, HostGenus, Status, Cluster, "
             "DateLastModified, Accession, RetrieveRecord, Subcluster, "
             "AnnotationAuthor FROM phage")

    mysqldb_genome_list = mysqldb.parse_genome_data(engine=engine,
                                                    phage_query=query,
                                                    gnm_type="mysqldb")
    engine.dispose()
    mysqldb_genome_dict = {}
    for gnm in mysqldb_genome_list:
        mysqldb_genome_dict[gnm.id] = gnm

    # Get data from PhagesDB
    if args.updates or args.final or args.draft:
        print("Retrieving data from PhagesDB...")
        phagesdb_phages = phagesdb.get_phagesdb_data(constants.API_SEQUENCED)
        phagesdb_phages_dict = basic.convert_list_to_dict(
            phagesdb_phages, "phage_name")
        phagesdb_genome_dict = phagesdb.parse_genomes_dict(
            phagesdb_phages_dict, gnm_type="phagesdb", seq=False)

        # Exit if all phage data wasn't retrieved.
        if len(phagesdb_genome_dict) == 0:
            sys.exit(1)

        # Returns a list of tuples.
        match_output = match_genomes(mysqldb_genome_dict, phagesdb_genome_dict)
        matched_genomes = match_output[0]
        unmatched_phagesdb_ids = match_output[1]

    if args.updates is True:
        get_update_data(working_path, matched_genomes)
    if args.final is True:
        get_final_data(working_path, matched_genomes)
    if args.genbank is True:
        get_genbank_data(working_path, mysqldb_genome_dict, ncbi_cred_dict,
                         args.genbank_results)
    if args.draft is True:
        get_draft_data(working_path, unmatched_phagesdb_ids)
    print("\n\n\nRetrieve updates script completed.")
Example #20
def test_set_path_2(self, sys_exit_mock):
    """Verify the script exits when the file does not exist
    but is expected to exist."""
    output = basic.set_path(self.file, kind="file", expect=True)
    self.assertTrue(sys_exit_mock.called)
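The sys_exit_mock parameter implies the test patches sys.exit where the basic module sees it. A plausible decorator setup, with the patch target an assumption:

from unittest.mock import patch

@patch("pdm_utils.functions.basic.sys.exit")  # assumed patch target
def test_set_path_2(self, sys_exit_mock):
    ...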