Пример #1
0
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Parse arguments, set up logging and run the CAZyme prediction tools.

    :param argv: optional argument list; when given it is handed to build_parser,
        otherwise build_parser is called without arguments
    :param logger: optional logger; when None the module logger is retrieved and
        config_logger is called with the parsed arguments

    Return nothing.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so any
    # use of argv has to happen inside build_parser -- confirm against build_parser.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only configure logging here when the caller did not supply a logger
    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # Verify the current working directory so the CAZyme prediction tools can be accessed
    check_cwd()

    # Anything other than stdout is treated as an output directory that must be prepared
    writes_to_stdout = args.output is sys.stdout
    if not writes_to_stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Run the prediction tools; the returned predictions are not used by this function
    get_predictions(args)

    logger.info("Program finished, and no terminating.")
Пример #2
0
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Parse arguments, build the logger and output directory, then read the configuration.

    :param argv: optional argument list; when given it is handed to build_parser,
        otherwise build_parser is called without arguments
    :param logger: optional logger; when None the logger returned by
        config_logger(args) is used

    Return nothing. Exits with status 1 when args.input is None or does not exist.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so any
    # use of argv has to happen inside build_parser -- confirm against build_parser.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Build a logger only when the caller did not pass one in
    if logger is None:
        logger = config_logger(args)

    # The configuration file is mandatory: stop early when it is absent
    config_missing = args.input is None or not os.path.exists(args.input)
    if config_missing:
        logger.error("No configuration file found. Terminating.")
        sys.exit(1)

    # Anything other than stdout is treated as an output directory for the FASTA files
    if args.outdir is not sys.stdout:
        make_output_directory(args.outdir, logger, args.force, args.nodelete)

    # Hand over to the script's main routine
    read_configuration(args, logger)
Пример #3
0
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Parse arguments, set up logging and run the statistical evaluation of predictions.

    :param argv: optional argument list; when given it is handed to build_parser,
        otherwise build_parser is called without arguments
    :param logger: optional logger; when None the module logger is retrieved and
        config_logger is called with the parsed arguments

    Return nothing.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so any
    # use of argv has to happen inside build_parser -- confirm against build_parser.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only configure logging here when the caller did not supply a logger
    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # Anything other than stdout is treated as an output directory that must be prepared
    writes_to_stdout = args.output is sys.stdout
    if not writes_to_stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Load the CAZy dict, then gather the predictions found under the input path
    cazy_dict = get_cazy_dict(args.cazy)
    predictions = get_predictions(args.input)  # USED IN R EVALUATION

    # Statistical evaluation of the predictions against the CAZy dict
    stats.evaluate_performance(predictions, cazy_dict, args)

    # The family-frequency step is optional
    if args.fam_freq is None:
        return

    # Frequency of each CAZy family across all test sets, stamped with today's date
    time_stamp = datetime.now().strftime("%Y_%m_%d")
    stats.get_fam_freq(args, cazy_dict, time_stamp)  # USED IN R EVALUATION
Пример #4
0
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Create a test set of CAZymes and non-CAZymes for each configured genomic assembly.

    :param argv: optional argument list; when given it is handed to build_parser,
        otherwise build_parser is called without arguments
    :param logger: optional logger; when None config_logger is called with the
        parsed arguments and the module logger is used

    Return nothing. Test sets are written out via write_out_test_set.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so any
    # use of argv has to happen inside build_parser -- confirm against build_parser.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only fall back to the module logger when the caller did not supply one,
    # so a logger passed in is no longer silently replaced
    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)

    Entrez.email = args.email

    make_output_directory(args.output, args.force, args.nodelete)

    # get the YAML file containing the genomic assemblies to be used for creating test sets
    assembly_dict = retrieve_assemblies_dict(args.yaml)

    # get dict containing the genomic assemblies of all CAZymes in CAZy
    cazy_dict = get_cazy_dict(args.cazy)

    temp_alignment_dir = args.output / "temp_alignment_dir"

    # create a test set for each genomic assembly
    for txid in tqdm(assembly_dict, desc="Parsing assemblies in config file"):
        for assembly in assembly_dict[txid]:
            # wipe the temporary dir clean before handling the next assembly
            prepare_output_dir(temp_alignment_dir)

            # download genomic assembly
            assembly_path = get_genomic_assembly(assembly, txid)

            # create a FASTA file containing all protein sequences in the genomic assembly
            fasta_path = get_protein_seqs(assembly_path, assembly, txid)

            # differentiate between CAZymes and non-CAZymes and get the test set of CAZymes
            selected_cazymes, cazyme_fasta, non_cazymes, noncazyme_fasta = differentiate_cazymes_and_noncazymes(
                cazy_dict,
                fasta_path,
                temp_alignment_dir,
            )

            # no CAZymes selected for this assembly: nothing to write, move on
            if selected_cazymes is None:
                continue

            alignment_df = align_cazymes_and_noncazymes(
                cazyme_fasta, noncazyme_fasta, temp_alignment_dir)
            # no alignment produced for this assembly: skip it
            if alignment_df is None:
                continue

            final_fasta = compile_output_file_path(fasta_path)

            write_out_test_set(selected_cazymes, non_cazymes, alignment_df,
                               final_fasta)

    # Delete the temporary alignment dir. It is only touched inside the loop above,
    # so it may not exist when no assembly was processed; a missing directory
    # needs no clean-up and must not crash an otherwise finished run.
    try:
        shutil.rmtree(temp_alignment_dir)
    except FileNotFoundError:
        pass
Пример #5
0
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Set up logger, parser and output directory, then retrieve genome data from NCBI.

    Builds the species table from the input file, adds the NCBI accession numbers
    for each row and writes the resulting dataframe out.

    :param argv: optional argument list; when given it is handed to build_parser,
        otherwise build_parser is called without arguments
    :param logger: optional logger; when None config_logger is called with the
        parsed arguments and the module logger is used

    Return nothing. Exits with status 1 when no user email was provided.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so any
    # use of argv has to happen inside build_parser -- confirm against build_parser.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only fall back to the module logger when the caller did not supply one,
    # so a logger passed in is no longer silently replaced
    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)
    logger.info("Run initated")

    # The user's email address is mandatory; it is set on Entrez below
    if args.user is None:
        logger.error(
            "No user email provided. Email MUST be provided. Terminating programme"
        )
        sys.exit(1)
    Entrez.email = args.user

    # If specified output directory for genomic files, create output directory
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Create dataframe storing genus, species and NCBI Taxonomy ID, called 'species_table'
    species_table = parse_input_file(args.input_file, args.retries)

    # Pull down accession numbers row by row (axis=1); args is forwarded to each call
    species_table["NCBI Accession Numbers"] = species_table.apply(
        get_accession_numbers, args=(args, ), axis=1)
    logger.info("Generated species table")

    # Write out dataframe: to the given path, or straight to stdout as CSV
    if args.dataframe is not sys.stdout:
        write_out_dataframe(species_table, args.dataframe, args.force)
    else:
        species_table.to_csv(args.dataframe)

    # Program finished
    logger.info("Program finished and exiting")
Пример #6
0
def main(argv: Optional[List[str]] = None,
         logger: Optional[logging.Logger] = None):
    """Coordinate the retrieval of protein annotations from GenBank (.gbff) files.

    Builds the parser, logger and output directory, creates the protein annotation
    dataframe from the input dataframe, optionally writes it out, and writes each
    of its rows to FASTA.

    :param argv: optional argument list; when given it is handed to build_parser,
        otherwise build_parser is called without arguments
    :param logger: optional logger; when None config_logger is called with the
        parsed arguments and the module logger is used

    Return nothing.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so any
    # use of argv has to happen inside build_parser -- confirm against build_parser.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    # Only fall back to the module logger when the caller did not supply one,
    # so a logger passed in is no longer silently replaced
    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)

    # If specified output directory, create output directory to write FASTA files to
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Open input dataframe: first row is the header, first column is the index
    logger.info("Opening input dataframe %s", args.input_df)
    input_df = pd.read_csv(args.input_df, header=0, index_col=0)

    # Build dataframe
    protein_annotation_df = create_dataframe(input_df, args)

    # Write out dataframe
    if args.output_df is not None:
        write_out_dataframe(protein_annotation_df, args.output_df, args.force)

    # Write out FASTA files, one call per dataframe row. The loop variable is
    # supplied by range(), so no manual counter initialisation/increment is needed.
    row_count = len(protein_annotation_df["Genus"])
    for row_position in tqdm(range(row_count), desc="Writing protein to FASTA"):
        df_row = protein_annotation_df.iloc[row_position]
        write_fasta(df_row, logger, args)

    logger.info("Programme finsihed. Terminating.")
Пример #7
0
def get_predictions(args):
    """Run the prediction tools on every input FASTA file and collect the queries.

    :param args: parser object; args.output, args.force and args.nodelete are read
        here and args is passed on to get_fasta_paths

    Return list of Proteome class objects, one per FASTA file.
    """
    proteomes = []

    # one Proteome query per FASTA file found for the given arguments
    for fasta_path in tqdm(get_fasta_paths(args), desc="Invoking tools for FASTA file"):
        # source of the protein sequences and the species taxonomy ID for this file
        source = get_protein_source(fasta_path)
        taxonomy_id = get_tax_id(fasta_path)

        stamp = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")

        # the taxonomy ID is only included in the directory name when one is available
        if taxonomy_id is None:
            prefix = "cazyme_predictions"
        else:
            prefix = f"cazyme_predictions_{taxonomy_id}"
        outdir_name = f"{prefix}_{source}_{stamp}"

        # per-file output directory inside the user specified parent directory
        file_outdir = args.output / outdir_name
        make_output_directory(file_outdir, args.force, args.nodelete)

        # record the query made to the prediction tools
        query = Proteome(fasta_path, taxonomy_id, source, {"dir": file_outdir})

        # run the tools, then replace the stored dir with the path they report back
        query.prediction_paths["dir"] = invoke_prediction_tools(query)

        proteomes.append(query)

    return proteomes
Пример #8
0
def test_make_existing_dir(testing_output_dir):
    """Check make_output_directory can be called for a directory that already exists."""
    target_dir = testing_output_dir / "test_dir"
    # the first call ensures the directory is present, the second one runs against it
    for _ in range(2):
        file_io.make_output_directory(target_dir, True, True)
Пример #9
0
def test_output_dir_creation_nd_false(making_output_dir):
    """Call make_output_directory with both flags set to True."""
    # NOTE(review): the flags appear to be (force, nodelete), going by how main()
    # callers pass them; the "nd_false" in the test name does not match the True
    # passed as the last argument -- confirm which case is intended.
    force = nodelete = True
    file_io.make_output_directory(making_output_dir, force, nodelete)