def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Drive the retrieval of genome data from NCBI from start to finish.

    Steps, in order: build the parser and parse the arguments, configure
    logging, register the user's email address with Entrez, create the output
    directory (unless output goes to stdout), build the species table from the
    input file, add the "NCBI Accession Numbers" column and write the table
    out.

    :param argv: forwarded to ``build_parser`` when it is not None; otherwise
        ``build_parser`` is called with no arguments
    :param logger: logger to report progress with; when None, logging is set
        up through ``config_logger`` and the module logger is used

    Nothing is returned. The process exits with status 1 when no user email
    was supplied.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so
    # how argv is actually consumed depends on build_parser -- confirm there.
    parser = build_parser() if argv is None else build_parser(argv)
    args = parser.parse_args()

    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)
    logger.info("Run initated")

    # An email address is mandatory: hand it to Entrez, or abort the run
    if args.user is not None:
        Entrez.email = args.user
    else:
        logger.error(
            "No user email provided. Email MUST be provided. Terminating programme"
        )
        sys.exit(1)

    # An output directory is only needed when genomic files do not go to stdout
    output_is_stdout = args.output is sys.stdout
    if not output_is_stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Table of genus, species and NCBI Taxonomy ID built from the input file
    species_table = parse_input_file(args.input_file, args.retries)

    # One get_accession_numbers call per row; per the original comment this
    # also pulls down the GenBank files unless that is disabled
    accession_numbers = species_table.apply(
        get_accession_numbers, args=(args,), axis=1
    )
    species_table["NCBI Accession Numbers"] = accession_numbers
    logger.info("Generated species table")

    # Write the table straight to stdout, or through the file-writing helper
    if args.dataframe is sys.stdout:
        species_table.to_csv(args.dataframe)
    else:
        write_out_dataframe(species_table, args.dataframe, args.force)

    logger.info("Program finished and exiting")
def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = None):
    """Coordinate the retrieval of protein annotations from GenBank (.gbff) files.

    Builds the parser and logger, creates the output directory when output is
    not stdout, reads the input dataframe from ``args.input_df``, builds the
    protein annotation dataframe, optionally writes it to ``args.output_df``
    and then calls ``write_fasta`` once for every row.

    :param argv: forwarded to ``build_parser`` when it is not None; otherwise
        ``build_parser`` is called with no arguments
    :param logger: logger to report progress with; when None, logging is set
        up through ``config_logger`` and the module logger is used

    Nothing is returned.
    """
    # NOTE(review): parse_args() is called without arguments in both cases, so
    # how argv is actually consumed depends on build_parser -- confirm there.
    if argv is None:
        parser = build_parser()
    else:
        parser = build_parser(argv)
    args = parser.parse_args()

    if logger is None:
        config_logger(args)
        logger = logging.getLogger(__name__)

    # If an output directory was specified, create it for the FASTA files
    if args.output is not sys.stdout:
        make_output_directory(args.output, args.force, args.nodelete)

    # Open input dataframe
    logger.info("Opening input dataframe %s", args.input_df)
    input_df = pd.read_csv(args.input_df, header=0, index_col=0)

    # Build dataframe
    protein_annotation_df = create_dataframe(input_df, args)

    # Write out dataframe
    if args.output_df is not None:
        write_out_dataframe(protein_annotation_df, args.output_df, args.force)

    # Write out FASTA files, one write_fasta call per dataframe row.
    # The loop variable is supplied by range(); the original's manual
    # "index = 0" / "index += 1" bookkeeping had no effect and was removed.
    row_count = len(protein_annotation_df["Genus"])
    for row_number in tqdm(range(row_count), desc="Writing protein to FASTA"):
        df_row = protein_annotation_df.iloc[row_number]
        write_fasta(df_row, logger, args)

    logger.info("Programme finsihed. Terminating.")
def test_writing_df_f_false(testing_df, df_output_file):
    """Call write_out_dataframe on the test dataframe with force set to False."""
    force = False
    file_io.write_out_dataframe(testing_df, df_output_file, force)
def test_writing_df_no_df(df_output_file):
    """Call write_out_dataframe with None in place of the dataframe."""
    missing_df = None
    force = False
    file_io.write_out_dataframe(missing_df, df_output_file, force)