def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up parser and logger, and coordinate prediction of CAZymes.

    :param argv: optional list of command-line arguments; when None the
        process's own command line is parsed
    :param logger: optional logger; when None a module-level logger is
        created and configured from the parsed arguments
    """
    # Check if a namespace wasn't passed; if not, parse the command line
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    # Initiate logger
    # Note: log file only created if specified at cmdline
    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # check current working directory, to make sure can access the CAZyme prediction tools
    check_cwd()

    # If an output directory was specified for genomic files, create it
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # invoke prediction tools and build prediction Proteome instances
    get_predictions(args)

    # fixed typo in the final log message ("no terminating" -> "now terminating")
    logger.info("Program finished, and now terminating.")
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up parser, loggers, and IO directories, then invoke the script's main function.

    :param argv: optional list of command-line arguments; when None the
        process's own command line is parsed
    :param logger: optional logger; when None one is built by config_logger
    """
    # Check if a namespace wasn't passed; if not, parse the command line
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    # Initiate logger
    # Note: log file only created if specified at cmdline
    if logger is None:
        logger = config_logger(args)

    # Check the configuration file is present
    # (idiomatic truth test replaces `os.path.exists(...) is False`)
    if args.input is None or not os.path.exists(args.input):
        logger.error("No configuration file found. Terminating.")
        sys.exit(1)

    # If an output directory was specified, create it to write FASTA files to
    if args.outdir is not sys.stdout:
        make_output_directory(args.outdir, logger, args.force, args.nodelete)

    # Invoke the script's main function
    read_configuration(args, logger)
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up parser and logger, and coordinate prediction of CAZymes."""
    # Parse the command line unless an argument list was supplied
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    # Configure logging (a log file is only written if requested at the cmdline)
    if logger is None:
        logger = logging.getLogger(__name__)
        config_logger(args)

    # Create the output directory for genomic files when one was specified
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # load the CAZy dict
    cazy_dict = get_cazy_dict(args.cazy)

    # collect the paths to all prediction dirs
    predictions = get_predictions(args.input)  # USED IN R EVALUATION

    # run the statistical evaluations
    stats.evaluate_performance(predictions, cazy_dict, args)

    if args.fam_freq is not None:
        # frequency of each CAZy family across all test sets
        time_stamp = datetime.now().strftime("%Y_%m_%d")
        stats.get_fam_freq(args, cazy_dict, time_stamp)  # USED IN R EVALUATION
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Build parser and logger, then create a CAZyme test set per genomic assembly.

    For every assembly listed in the YAML config: download the assembly,
    extract its protein sequences, separate CAZymes from non-CAZymes using
    the CAZy dict, align them, and write out the final test-set FASTA file.

    :param argv: optional list of command-line arguments; when None the
        process's own command line is parsed
    :param logger: optional logger; when None a module-level logger is
        created and configured from the parsed arguments
    """
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)

    Entrez.email = args.email

    make_output_directory(args.output, args.force, args.nodelete)

    # get the YAML file containing the genomic assemblies to be used for creating test sets
    assembly_dict = retrieve_assemblies_dict(args.yaml)

    # get dict containing the genomic assemblies of all CAZymes in CAZy
    cazy_dict = get_cazy_dict(args.cazy)

    temp_alignment_dir = args.output / "temp_alignment_dir"

    # create a test set for each genomic assembly
    for txid in tqdm(assembly_dict, desc="Parsing assemblies in config file"):
        for assembly in assembly_dict[txid]:
            # wipe temp dir clean
            prepare_output_dir(temp_alignment_dir)

            # download genomic assembly
            assembly_path = get_genomic_assembly(assembly, txid)

            # create a FASTA file containing all protein sequences in the genomic assembly
            fasta_path = get_protein_seqs(assembly_path, assembly, txid)

            # differentiate between CAZymes and non-CAZymes and get test set of 100 CAZymes
            selected_cazymes, cazyme_fasta, non_cazymes, noncazyme_fasta = differentiate_cazymes_and_noncazymes(
                cazy_dict,
                fasta_path,
                temp_alignment_dir,
            )
            if selected_cazymes is None:
                # could not build a CAZyme selection for this assembly; skip it
                continue

            alignment_df = align_cazymes_and_noncazymes(
                cazyme_fasta, noncazyme_fasta, temp_alignment_dir)
            if alignment_df is None:
                # alignment failed for this assembly; skip it
                continue

            final_fasta = compile_output_file_path(fasta_path)
            write_out_test_set(selected_cazymes, non_cazymes, alignment_df, final_fasta)

    # delete the temporary alignment dir
    shutil.rmtree(temp_alignment_dir)
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Set up loggers, parsers and directories for retrieval of genomes from NCBI.

    Then retrieve taxonomy data and GenBank files from NCBI.
    Return GenBank (.gbff) files and dataframe of taxonomy data.

    :param argv: optional list of command-line arguments; when None the
        process's own command line is parsed
    :param logger: optional logger; when None a module-level logger is
        created and configured from the parsed arguments
    """
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)
    # fixed typo in log message ("initated" -> "initiated")
    logger.info("Run initiated")

    # Add user's email address from parser; Entrez requires one for NCBI queries
    if args.user is None:
        logger.error(
            "No user email provided. Email MUST be provided. Terminating programme"
        )
        sys.exit(1)
    else:
        Entrez.email = args.user

    # If an output directory was specified for genomic files, create it
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Invoke main usage of programme
    # Create dataframe storing genus, species and NCBI Taxonomy ID, called 'species_table'
    species_table = parse_input_file(args.input_file, args.retries)

    # Pull down accession numbers and GenBank files (if not disabled)
    species_table["NCBI Accession Numbers"] = species_table.apply(
        get_accession_numbers, args=(args, ), axis=1)
    logger.info("Generated species table")

    # Write out dataframe (to file when a path was given, otherwise to stdout)
    if args.dataframe is not sys.stdout:
        write_out_dataframe(species_table, args.dataframe, args.force)
    else:
        species_table.to_csv(args.dataframe)

    # Program finished
    logger.info("Program finished and exiting")
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Coordinate the retrieval of protein annotations from GenBank (.gbff) files.

    Including building parser, logger and output directory.
    Return dataframe of protein data.

    :param argv: optional list of command-line arguments; when None the
        process's own command line is parsed
    :param logger: optional logger; when None a module-level logger is
        created and configured from the parsed arguments
    """
    if argv is None:
        parser = build_parser()
        args = parser.parse_args()
    else:
        parser = build_parser(argv)
        args = parser.parse_args()

    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)

    # If an output directory was specified, create it to write FASTA files to
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Open input dataframe
    logger.info("Opening input dataframe %s", args.input_df)
    input_df = pd.read_csv(args.input_df, header=0, index_col=0)

    # Build dataframe
    protein_annotation_df = create_dataframe(input_df, args)

    # Write out dataframe
    if args.output_df is not None:
        write_out_dataframe(protein_annotation_df, args.output_df, args.force)

    # Write out FASTA files
    # (removed dead `index = 0` / `index += 1` — the for-loop reassigns `index`
    # at the top of every iteration, so the manual increment had no effect)
    for index in tqdm(range(len(protein_annotation_df["Genus"])), desc="Writing protein to FASTA"):
        df_row = protein_annotation_df.iloc[index]
        write_fasta(df_row, logger, args)

    # fixed typo in log message ("finsihed" -> "finished")
    logger.info("Programme finished. Terminating.")
def get_predictions(args):
    """Build prediction queries and invoke prediction tools for each query.

    :param args: parser object

    Return list of Proteome class objects, queries to prediction tools.
    """
    # every FASTA file found in the input directory
    fasta_files = get_fasta_paths(args)

    # one Proteome instance per FASTA file is collected here
    queries = []

    # invoke dbCAN, CUPP and eCAMI for each FASTA file
    for fasta in tqdm(fasta_files, desc="Invoking tools for FASTA file"):
        # where the protein sequences came from, plus the species taxonomy ID
        protein_source = get_protein_source(fasta)
        tax_id = get_tax_id(fasta)
        time_stamp = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")

        # directory that will hold the prediction output and statistical evaluation
        if tax_id is None:
            outdir_name = f"cazyme_predictions_{protein_source}_{time_stamp}"
        else:
            outdir_name = f"cazyme_predictions_{tax_id}_{protein_source}_{time_stamp}"

        # per-FASTA output dir inside the user-specified parent directory
        target_dir = args.output / outdir_name
        make_output_directory(target_dir, args.force, args.nodelete)

        # record the query made to the prediction tools
        query = Proteome(fasta, tax_id, protein_source, {"dir": target_dir})

        # run the tools and store the full path to their output dir
        query.prediction_paths["dir"] = invoke_prediction_tools(query)
        queries.append(query)

    return queries
def test_make_existing_dir(testing_output_dir):
    """Test creation of output dir when it already exists."""
    target = testing_output_dir / "test_dir"
    # calling twice guarantees the directory already exists on the second call
    for _ in range(2):
        file_io.make_output_directory(target, True, True)
def test_output_dir_creation_nd_false(making_output_dir):
    """Test creation of output dir when nodelete is true"""
    # NOTE(review): the test name says nd_false, but nodelete is passed as
    # True here (matching the docstring) — confirm which was intended
    file_io.make_output_directory(making_output_dir, True, True)