def make_outdir(outdir: Path, force: bool, noclobber: bool) -> None:
    """Create the output directory, honouring the force/noclobber options.

    :param outdir:  Path, path to output directory
    :param force:  bool, True if an existing directory will be reused
    :param noclobber:  bool, True if existing files are not overwritten

    Outcomes, by situation:

    - outdir doesn't exist: outdir (and any missing parents) is created
    - outdir exists, no --force: Path.mkdir raises FileExistsError
    - outdir exists, --force only: the existing directory tree is removed
      and an empty directory is created in its place
    - outdir exists, --force --noclobber: the existing directory tree is
      left in place and reused

    So long as the outdir is created with this function, other code need
    only check args.noclobber to decide how to proceed when a file exists.
    """
    logger = logging.getLogger(__name__)
    logger.info("Creating output directory %s", outdir)

    # Only a forced run may remove an existing tree, and only when the user
    # has not asked for existing content to be preserved
    remove_existing = force and outdir.is_dir() and noclobber is False

    if force:
        logger.warning(termcolor("Output directory overwrite forced", "red"))
    if remove_existing:
        logger.warning(termcolor("Clobbering existing directory %s", "red"), outdir)
        shutil.rmtree(outdir)

    # exist_ok follows force: without --force an existing directory is an error
    outdir.mkdir(parents=True, exist_ok=force)
def add_log_headers():
    """Add citation and dependency acknowledgement headers to log output.

    Every line of the module-level CITATION_INFO sequence is written at INFO
    level, followed by the citations for the third-party bioinformatics tools
    listed below.
    """
    logger = logging.getLogger(__name__)

    # Add citation information to log
    logger.info(termcolor("CITATION INFO", bold=True))
    for line in CITATION_INFO:
        logger.info(line)

    # Add dependency citations
    logger.info(termcolor("DEPENDENCIES", bold=True))
    dep_citations = [
        "The authors of pyani gratefully acknowledge its dependence on",
        "the following bioinformatics software:",
        f"\t{termcolor('MUMmer3', 'cyan')}: S. Kurtz, A. Phillippy, A.L. Delcher, M. Smoot, M. Shumway,",
        "\tC. Antonescu, and S.L. Salzberg (2004), 'Versatile and open software",
        "\tfor comparing large genomes' Genome Biology 5:R12",
        f"\t{termcolor('BLAST+', 'cyan')}: Camacho C., Coulouris G., Avagyan V., Ma N., Papadopoulos J.,",
        "\tBealer K., & Madden T.L. (2008) 'BLAST+: architecture and applications.'",
        "\tBMC Bioinformatics 10:421.",
        f"\t{termcolor('BLAST', 'cyan')}: Altschul, S.F., Madden, T.L., Schäffer, A.A., Zhang, J.,",
        "\tZhang, Z., Miller, W. & Lipman, D.J. (1997) 'Gapped BLAST and PSI-BLAST:",
        "\ta new generation of protein database search programs.' Nucleic Acids Res.",
        "\t25:3389-3402",
        # The first author's surname had been garbled to "C**k" in this string
        f"\t{termcolor('Biopython', 'cyan')}: Cock PA, Antao T, Chang JT, Chapman BA, Cox CJ, Dalke A,",
        "\tFriedberg I, Hamelryck T, Kauff F, Wilczynski B and de Hoon MJL",
        "\t(2009) Biopython: freely available Python tools for computational",
        "\tmolecular biology and bioinformatics. Bioinformatics, 25, 1422-1423",
    ]
    for line in dep_citations:
        logger.info(line)
def subcmd_download(args: Namespace) -> int:
    """Download assembled genomes in subtree of passed NCBI taxon ID.

    :param args:  Namespace, command-line arguments

    Unless this is a dry run, the output directory is prepared, the genomes
    are downloaded, and the collected class and label entries are written to
    files in the output directory (existing files are kept when
    args.noclobber is set). Skipped downloads are reported as warnings.
    Always returns 0.
    """
    logger = logging.getLogger(__name__)
    logger.info(termcolor("Downloading genomes from NCBI", "red"))

    def write_entries(fname, entries, announce_msg, refuse_msg):
        """Write entries, one per line, to fname in the output directory."""
        target = args.outdir / fname
        logger.info(announce_msg, target)
        if target.exists() and args.noclobber:
            logger.warning(refuse_msg, target)
            return
        with open(target, "w") as ofh:
            ofh.write("\n".join(entries) + "\n")

    # A dry run touches nothing on disk; otherwise prepare the output
    # directory, respecting force/noclobber
    if not args.dryrun:
        make_outdir(args.outdir, args.force, args.noclobber)
    else:
        logger.warning(
            termcolor("Dry run only: will not overwrite or download", "cyan")
        )

    api_key = configure_entrez(args)  # set up email/get API key
    asm_dict = get_tax_asm_dict(args)  # assembly UIDs for download, by taxon

    # Download contigs and hashes for each assembly UID, collecting class and
    # label information for each downloaded genome, plus skipped downloads
    classes, labels, skippedlist = download_data(args, api_key, asm_dict)

    # Write class and label files
    if not args.dryrun:
        write_entries(
            args.classfname,
            classes,
            "Writing classes file to %s",
            "Class file %s exists, not overwriting",
        )
        write_entries(
            args.labelfname,
            labels,
            "Writing labels file to %s",
            "Labels file %s exists, not overwriting",
        )

    # Report skipped genome list
    if skippedlist:
        logger.warning(
            termcolor("%s genome downloads were skipped", "red"), len(skippedlist)
        )
        for skipped in skippedlist:
            details = (
                f"taxon id: {skipped.taxon_id}"
                f"\n\taccession: {skipped.accession}"
                f"\n\tURL: {skipped.url}"
                f"\n\tsource: {skipped.dltype}"
            )
            logger.warning(
                "%s %s:\n\t%s", skipped.organism, skipped.strain, details
            )

    return 0
def download_data(
    args: Namespace,
    api_key: Optional[str],
    asm_dict: Dict[str, List],
) -> Tuple[List, List, List]:
    """Download the accessions indicated in the passed dictionary.

    :param args:  Namespace of command-line arguments
    :param api_key:  str, API key for NCBI downloads
    :param asm_dict:  dictionary of assembly UIDs to download, keyed by taxID

    Returns lists of information about downloaded genome classes and labels,
    and a list of skipped downloads (as Skipped objects). When args.dryrun
    is set, eSummary data is still retrieved for each UID but nothing is
    downloaded, so the class and label lists stay empty.
    """
    logger = logging.getLogger(__name__)
    classes, labels, skippedlist = [], [], []

    for tid, uids in asm_dict.items():
        # Report the taxon ID being processed (this message previously
        # logged the list of assembly UIDs in place of the taxon ID)
        logger.info(termcolor("Downloading contigs for Taxon ID %s", "blue"), tid)
        for uid in uids:
            # Obtain eSummary for each assembly UID
            logger.info(
                termcolor("Retrieving eSummary information for UID %s", "cyan"), uid
            )
            esummary, filestem = download.get_ncbi_esummary(
                uid, args.retries, api_key
            )
            uid_class = download.get_ncbi_classification(esummary)
            logger.debug(
                "eSummary information (%s):\n\t%s",
                filestem,
                dl_info_to_str(esummary, uid_class),
            )
            if args.dryrun:
                logger.warning(
                    "(dry-run) skipping download of %s",
                    esummary["AssemblyAccession"],
                )
                continue

            # Download genome for UID, and extract compressed files
            dlstatus, skipped_genomes = download_genome(
                args, filestem, tid, uid, uid_class
            )
            skippedlist.extend(skipped_genomes)
            if not dlstatus.skipped:
                extract_genomes(args, dlstatus, esummary)
                labeltxt, classtxt = hash_genomes(
                    args, dlstatus, filestem, uid_class
                )
                classes.append(classtxt)
                labels.append(labeltxt)
                logger.info(
                    "Label and class file entries\n\tLabel: %s\n\tClass: %s",
                    labeltxt,
                    classtxt,
                )

    return classes, labels, skippedlist
def get_ncbi_esummary(asm_uid, retries, api_key=None) -> Tuple:
    """Obtain full eSummary info for the passed assembly UID.

    :param asm_uid:  assembly UID to query in the "assembly" database
    :param retries:  retry count, passed through to entrez_esummary
    :param api_key:  optional NCBI API key, passed through to entrez_esummary

    Returns a (data, filestem) tuple, where data is the first document
    summary in the eSummary result and filestem is derived from it by
    extract_filestem.

    Raises NCBIDownloadException if the eSummary result does not contain a
    document summary.
    """
    logger = logging.getLogger(__name__)

    # Obtain full eSummary data for the assembly
    summary = entrez_esummary(
        retries=retries, db="assembly", id=asm_uid, report="full", api_key=api_key
    )

    # Extract filestem from assembly data
    try:
        data = summary["DocumentSummarySet"]["DocumentSummary"][0]
    except (IndexError, KeyError) as exc:
        # Something has gone awry with the download
        logger.warning(
            termcolor("Could not get eSummary for UID %s", "red"),
            asm_uid,
            exc_info=True,
        )
        # Chain explicitly so the lookup failure is recorded as the cause
        raise NCBIDownloadException(
            f"Could not get NCBI eSummary for UID {asm_uid}"
        ) from exc

    filestem = extract_filestem(data)
    return (data, filestem)
def subcmd_plot(args: Namespace) -> int:
    """Produce graphical output for an analysis.

    :param args:  Namespace of command-line arguments

    Graphical output is generated by write_run_heatmaps for every run ID in
    the comma-separated args.run_id, in each of the comma-separated
    args.formats. Output represents the ANI analysis results as a heatmap,
    or heatmap with dendrogram. Always returns 0.
    """
    logger = logging.getLogger(__name__)

    # Tell the user what is about to happen, and make sure the output
    # location exists
    logger.info(termcolor("Generating graphical output for analyses", "red"))
    logger.info("Writing output to: %s", args.outdir)
    os.makedirs(args.outdir, exist_ok=True)
    logger.info("Rendering method: %s", args.method)

    # Open a session on the results database
    logger.debug("Activating session for database: %s", args.dbpath)
    db_session = pyani_orm.get_session(args.dbpath)

    # Output formats arrive as a comma-separated string
    requested_formats = args.formats.split(",")
    logger.debug("Requested output formats: %s", requested_formats)

    # Run IDs also arrive comma-separated, and must be integers
    requested_runs = list(map(int, args.run_id.split(",")))
    logger.debug("Generating graphics for runs: %s", requested_runs)

    for requested_run in requested_runs:
        write_run_heatmaps(requested_run, db_session, requested_formats, args)

    return 0
def run_main(argv: Optional[List[str]] = None) -> int:
    """Run main process for pyani.py script.

    :param argv:  optional list of command-line arguments; when None, the
        arguments are taken from the process command line (sys.argv)

    Returns the value returned by the selected subcommand, or 0 when no
    arguments were supplied (in which case only the pyani version is
    written to stderr).
    """
    # If we need to (i.e. arguments aren't passed), parse the command-line
    if argv is None:
        args = parse_cmdline()
        supplied_args = sys.argv[1:]
    else:
        args = parse_cmdline(argv)
        supplied_args = argv

    # Catch execution with no arguments. The check is made against the
    # arguments actually in effect: previously sys.argv was inspected even
    # when argv was passed explicitly, so a programmatic call could be
    # short-circuited depending on how the host process was started.
    # NOTE(review): assumes an explicit argv omits the program name, as
    # sys.argv[1:] does - confirm against parse_cmdline
    if not supplied_args:
        sys.stderr.write("pyani version: {0}\n".format(__version__))
        return 0

    # Set up logging
    time0 = time.time()
    logger = logging.getLogger(__name__)
    config_logger(args)

    # Boilerplate for log
    logger.info("Processed arguments: %s", args)
    # The recorded command line is that of the running process, even when
    # argv was passed explicitly
    args.cmdline = " ".join(sys.argv)
    logger.info("command-line: %s", args.cmdline)
    add_log_headers()

    # Run the subcommand
    returnval = args.func(args)
    logger.info(
        termcolor("Completed. Time taken: %.3f", bold=True), (time.time() - time0)
    )
    return returnval
def add_log_headers():
    """Add citation and dependency acknowledgement headers to log output.

    The pyani citation is written at INFO level, followed by the citations
    for the third-party bioinformatics tools listed below.
    """
    logger = logging.getLogger(__name__)

    # Add citation information to log
    logger.info(termcolor("CITATION INFO", bold=True))
    pyani_citation = [
        termcolor(
            "If you use pyani in your work, please cite the following publication:",
            "green",
        ),
        termcolor(
            "\tPritchard, L., Glover, R. H., Humphris, S., Elphinstone, J. G.,",
            "yellow",
        ),
        termcolor(
            "\t& Toth, I.K. (2016) 'Genomics and taxonomy in diagnostics for", "yellow"
        ),
        termcolor(
            "\tfood security: soft-rotting enterobacterial plant pathogens.'", "yellow"
        ),
        termcolor(
            "\tAnalytical Methods, 8(1), 12–24. http://doi.org/10.1039/C5AY02550H",
            "yellow",
        ),
    ]
    for line in pyani_citation:
        logger.info(line)

    # Add dependency citations
    logger.info(termcolor("DEPENDENCIES", bold=True))
    dep_citations = [
        "The authors of pyani gratefully acknowledge its dependence on",
        "the following bioinformatics software:",
        f"\t{termcolor('MUMmer3', 'cyan')}: S. Kurtz, A. Phillippy, A.L. Delcher, M. Smoot, M. Shumway,",
        "\tC. Antonescu, and S.L. Salzberg (2004), 'Versatile and open software",
        "\tfor comparing large genomes' Genome Biology 5:R12",
        f"\t{termcolor('BLAST+', 'cyan')}: Camacho C., Coulouris G., Avagyan V., Ma N., Papadopoulos J.,",
        "\tBealer K., & Madden T.L. (2008) 'BLAST+: architecture and applications.'",
        "\tBMC Bioinformatics 10:421.",
        f"\t{termcolor('BLAST', 'cyan')}: Altschul, S.F., Madden, T.L., Schäffer, A.A., Zhang, J.,",
        "\tZhang, Z., Miller, W. & Lipman, D.J. (1997) 'Gapped BLAST and PSI-BLAST:",
        "\ta new generation of protein database search programs.' Nucleic Acids Res.",
        "\t25:3389-3402",
        # The first author's surname had been garbled to "C**k" in this string
        f"\t{termcolor('Biopython', 'cyan')}: Cock PA, Antao T, Chang JT, Chapman BA, Cox CJ, Dalke A,",
        "\tFriedberg I, Hamelryck T, Kauff F, Wilczynski B and de Hoon MJL",
        "\t(2009) Biopython: freely available Python tools for computational",
        "\tmolecular biology and bioinformatics. Bioinformatics, 25, 1422-1423",
    ]
    for line in dep_citations:
        logger.info(line)
def get_tax_asm_dict(args: Namespace) -> Dict[str, List]:
    """Return dictionary of assembly UIDs to download, keyed by taxID.

    :param args:  Namespace of command-line arguments

    The taxon specification in args.taxon is split into individual taxon
    IDs, and download.make_asm_dict is asked for the assembly UIDs of each
    (using args.retries). A per-taxon summary is logged at DEBUG level.
    """
    logger = logging.getLogger(__name__)

    requested_taxa = download.split_taxa(args.taxon)
    logger.info(termcolor("Taxon IDs received: %s", "blue"), requested_taxa)

    assemblies = download.make_asm_dict(requested_taxa, args.retries)

    # Summarise what was found for each queried taxon
    for taxon, assembly_uids in assemblies.items():
        logger.debug(
            "Taxon ID summary\n\tQuery: %s\n\tasm count: %s\n\tUIDs: %s",
            taxon,
            len(assembly_uids),
            assembly_uids,
        )

    return assemblies
def download_genome_and_hash(
    outdir: Path,
    timeout: int,
    dlfiledata: DLFileData,
    dltype: str = "RefSeq",
    disable_tqdm: bool = False,
) -> namedlist:
    """Download genome and accompanying MD5 hash from NCBI.

    :param outdir:  Path to output directory for downloads
    :param timeout:  int: timeout for download attempt
    :param dlfiledata:  namedtuple of info for file to download
    :param dltype:  reference database to use: RefSeq or GenBank
    :param disable_tqdm:  disable progress bar

    A single download is attempted for the requested database. For GenBank,
    a leading "GCF_" in the filestem is replaced with "GCA_"; any other
    dltype uses the filestem unchanged. Falling back from RefSeq to GenBank
    is left to the caller.

    Download errors are handled gracefully: if the returned status carries
    an error, it is logged and the status is marked as skipped rather than
    an exception being raised. The status object is returned in all cases.
    """
    logger = logging.getLogger(__name__)

    # Pick the filestem appropriate to the requested database
    stem = (
        re.sub("^GCF_", "GCA_", dlfiledata.filestem)
        if dltype == "GenBank"
        else dlfiledata.filestem
    )

    status = retrieve_genome_and_hash(
        stem,
        dlfiledata.suffix,
        dlfiledata.ftpstem,
        outdir,
        timeout,
        disable_tqdm,
    )

    # Pylint is confused by the content of the status object (a namedlist)
    # pylint: disable=no-member
    if status.error is None:
        return status

    # The download failed: report it and flag the genome as skipped
    logger.warning(termcolor("%s download failed: skipping!", "magenta"), dltype)
    logger.debug("Exception raised:\n%s", status.error)
    status.skipped = True
    return status
def run_anim_jobs(joblist: List[ComparisonJob], args: Namespace) -> None:
    """Pass ANIm nucmer jobs to the scheduler.

    :param joblist:  list of ComparisonJob namedtuples
    :param args:  command-line arguments for the run

    The scheduler named in args.scheduler ("multiprocessing" or "sge",
    matched case-insensitively) receives the .job attribute of every entry
    in joblist.

    Raises PyaniException if the multiprocessing run reports a failure, and
    SystemError if the scheduler name is not recognised.
    """
    logger = logging.getLogger(__name__)
    logger.debug("Scheduler: %s", args.scheduler)

    # Normalise once so both scheduler names are matched the same way
    # (previously only "sge" was compared case-insensitively)
    scheduler = args.scheduler.lower()
    jobs = [_.job for _ in joblist]

    if scheduler == "multiprocessing":
        logger.info("Running jobs with multiprocessing")
        if not args.workers:
            logger.debug("(using maximum number of worker threads)")
        else:
            logger.debug("(using %d worker threads, if available)", args.workers)
        cumval = run_mp.run_dependency_graph(jobs, workers=args.workers)
        if cumval > 0:
            logger.error(
                "At least one NUCmer comparison failed. Please investigate (exiting)"
            )
            raise PyaniException("Multiprocessing run failed in ANIm")
        logger.info("Multiprocessing run completed without error")
    elif scheduler == "sge":
        logger.info("Running jobs with SGE")
        logger.debug("Setting jobarray group size to %d", args.sgegroupsize)
        logger.debug("Joblist contains %d jobs", len(joblist))
        run_sge.run_dependency_graph(
            jobs,
            jgprefix=args.jobprefix,
            sgegroupsize=args.sgegroupsize,
            sgeargs=args.sgeargs,
        )
    else:
        logger.error(termcolor("Scheduler %s not recognised", "red"), args.scheduler)
        # Exception type kept as-is so existing callers are unaffected
        raise SystemError(1)
def subcmd_report(args: Namespace) -> int:
    """Present report on ANI results and/or database contents.

    :param args:  Namespace, command-line arguments

    The report subcommand takes any of several long options that do one of two
    things:

    1. perform a single action.
    2. set a parameter/format

    These will typically take an output path to a file or directory into which
    the report will be written (whatever form it takes). By default, text
    output is written in plain text format, but for some outputs this can
    be modified by an 'excel' or 'html' format specifier, which writes outputs
    in that format, where possible.

    Each of the following options, when set on args, produces one report:
    show_runs, show_genomes, show_runs_genomes, show_genomes_runs,
    run_results (comma-separated run IDs) and run_matrices (comma-separated
    run IDs). Always returns 0.
    """
    logger = logging.getLogger(__name__)

    # Output formats will apply across all tabular data requested
    # Expect comma-separated format arguments, and turn them into an iterable
    formats = process_formats(args)
    logger.info(termcolor("Creating report output in formats: %s", "red"), formats)

    # Declare which database is being used, and connect to session
    logger.debug("Using database: %s", args.dbpath)
    session = pyani_orm.get_session(args.dbpath)

    # Report runs in the database
    if args.show_runs:
        statement = session.query(Run.run_id, Run.name, Run.method, Run.date,
                                  Run.cmdline).statement
        headers = ["run ID", "name", "method", "date run", "command-line"]
        report(args, session, formats, ReportParams("runs", statement, headers))

    # Report genomes in the database
    if args.show_genomes:
        statement = session.query(
            Genome.genome_id,
            Genome.description,
            Genome.path,
            Genome.genome_hash,
            Genome.length,
        ).statement
        headers = [
            "genome ID", "description", "path", "MD5 hash", "genome length"
        ]
        report(args, session, formats, ReportParams("genomes", statement, headers))

    # Report table of all genomes used for each run
    # (rows are ordered by run ID, then genome ID)
    if args.show_runs_genomes:
        statement = (session.query(
            Run.run_id,
            Run.name,
            Run.method,
            Run.date,
            Genome.genome_id,
            Genome.description,
            Genome.path,
            Genome.genome_hash,
            Label.label,
            Label.class_label,
        ).join(rungenome, Genome.genome_id == rungenome.c.genome_id).join(
            Label,
            and_(Genome.genome_id == Label.genome_id, Run.run_id == Label.run_id),
        ).order_by(Run.run_id, Genome.genome_id).statement)
        headers = [
            "run ID",
            "run name",
            "method",
            "date run",
            "genome ID",
            "genome description",
            "genome path",
            "genome hash",
            "genome label",
            "genome class",
        ]
        report(
            args,
            session,
            formats,
            ReportParams("runs_genomes", statement, headers),
        )

    # Report table of all runs in which a genome is involved
    # (rows are ordered by genome ID, then run ID)
    if args.show_genomes_runs:
        statement = (session.query(
            Genome.genome_id,
            Run.run_id,
            Genome.description,
            Genome.path,
            Genome.genome_hash,
            Label.label,
            Label.class_label,
            Run.name,
            Run.method,
            Run.date,
        ).join(rungenome, Run.run_id == rungenome.c.run_id).join(
            Label,
            and_(Genome.genome_id == Label.genome_id, Run.run_id == Label.run_id),
        ).order_by(Genome.genome_id, Run.run_id).statement)
        headers = [
            "genome ID",
            "run ID",
            "genome description",
            "genome path",
            "genome hash",
            "genome label",
            "genome class",
            "run name",
            "method",
            "date run",
        ]
        report(
            args,
            session,
            formats,
            ReportParams("genomes_runs", statement, headers),
        )

    # Report table of comparison results for the indicated runs
    if args.run_results:
        run_ids = [run_id.strip() for run_id in args.run_results.split(",")]
        logger.debug("Attempting to write results tables for runs: %s", run_ids)
        for run_id in run_ids:
            logger.debug("Processing run ID %s", run_id)
            # Genome is joined twice (once as query, once as subject), so
            # each role needs its own alias
            genome_query = aliased(Genome, name="genome_query")
            genome_subject = aliased(Genome, name="genome_subject")
            # NOTE(review): no join condition between Comparison and Run is
            # visible in this query, only the filter on Run.run_id - confirm
            # that the output is restricted to comparisons from this run
            statement = (session.query(
                Comparison.comparison_id,
                Comparison.query_id,
                genome_query.description,
                Comparison.subject_id,
                genome_subject.description,
                Comparison.identity,
                Comparison.cov_query,
                Comparison.cov_subject,
                Comparison.aln_length,
                Comparison.sim_errs,
                Comparison.program,
                Comparison.version,
                Comparison.fragsize,
                Comparison.maxmatch,
                Run.run_id,
            ).join(
                genome_query,
                Comparison.query_id == genome_query.genome_id).join(
                    genome_subject,
                    Comparison.subject_id == genome_subject.genome_id).filter(
                        Run.run_id == run_id).statement)
            headers = [
                "Comparison ID",
                "Query ID",
                "Query description",
                "Subject ID",
                "Subject description",
                "% identity",
                "% query coverage",
                "% subject coverage",
                "alignment length",
                "similarity errors",
                "program",
                "version",
                "fragment size",
                "maxmatch",
                "Run ID",
            ]
            report(
                args,
                session,
                formats,
                ReportParams(f"results_{run_id}", statement, headers),
            )

    # Report matrices of comparison results for the indicated runs
    # For ANIm, all results other than coverage are symmetric matrices,
    # so we only get results in the forward direction.
    # As we need to pull down the matrices as Pandas dataframes by reading from
    # JSON, we don't bother with a helper function like report(), and write out
    # our matrices directly, here
    if args.run_matrices:
        for run_id in [
                run_id.strip() for run_id in args.run_matrices.split(",")
        ]:
            logger.debug("Extracting matrices for run %s", run_id)
            # NOTE(review): .first() presumably yields None when no run
            # matches, which would make the attribute accesses below fail -
            # confirm how unknown run IDs should be handled
            run = session.query(Run).filter(Run.run_id == run_id).first()
            matlabel_dict = get_matrix_labels_for_run(session, run_id)
            # Each tuple is (name, data, graphic_args), matching the
            # MatrixData attributes read below
            for matdata in [
                    MatrixData(*_) for _ in [
                        ("identity", run.df_identity, {
                            "colour_num": 0.95
                        }),
                        ("coverage", run.df_coverage, {
                            "colour_num": 0.95
                        }),
                        ("aln_lengths", run.df_alnlength, {}),
                        ("sim_errors", run.df_simerrors, {}),
                        ("hadamard", run.df_hadamard, {}),
                    ]
            ]:
                logger.debug("Writing %s results", matdata.name)
                matrix = pd.read_json(matdata.data)
                # Matrix rows and columns are labelled if there's a label dictionary,
                # and take the dataframe index otherwise
                matrix = label_results_matrix(matrix, matlabel_dict)
                # Output path stem is <outdir>/matrix_<matrix name>_<run ID>
                pyani_report.write_dbtable(
                    matrix,
                    Path("_".join([
                        str(args.outdir / "matrix"), matdata.name,
                        str(run_id)
                    ])),
                    formats,
                    show_index=True,
                    **matdata.graphic_args,
                )

    return 0
def subcmd_anib(args: Namespace) -> None:
    """Perform ANIb on all genome files in an input directory.

    :param args:  Namespace, command-line arguments

    Finds ANI by the ANIb method, as described in Goris J, Konstantinidis KT,
    Klappenbach JA, Coenye T, Vandamme P, et al. (2007) DNA-DNA hybridization
    values and their relationship to whole-genome sequence similarities. Int
    J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0.

    All FASTA format files (selected by suffix) in the input directory are
    fragmented into (by default 1020nt) consecutive sections, and a BLAST+
    database constructed from the whole genome input. The BLAST+ blastn tool
    is then used to query each set of fragments against each BLAST+ database,
    in turn.

    For each query, the BLAST+ .tab output is parsed to obtain alignment
    length, identity and similarity error count. Alignments below a threshold
    are not included in the calculation (this introduces systematic bias with
    respect to ANIm). The results are processed to calculate the ANI
    percentages, coverage, and similarity error.

    The calculated values are stored in the local SQLite3 database.

    This subcommand is not yet complete: after the input genomes have been
    registered and fragmented, NotImplementedError is raised.

    Raises SystemExit if the database cannot be reached or updated, and
    SystemError if the output directory cannot be created.
    """
    logger = logging.getLogger(__name__)
    # Announce the analysis (this message previously named ANIm)
    logger.info(termcolor("Running ANIb analysis", "red"))

    # Get BLAST+ version - this will be used in the database entries
    blastn_version = anib.get_version(args.blastn_exe)
    logger.info(termcolor("BLAST+ blastn version: %s", "cyan"), blastn_version)

    # Use provided name, or make new one for this analysis
    start_time = datetime.datetime.now()
    name = args.name or "_".join(["ANIb", start_time.isoformat()])
    logger.info("Analysis name: %s", name)

    # Connect to existing database (which may be "clean" or have old analyses)
    logger.debug("Connecting to database %s", args.dbpath)
    try:
        session = get_session(args.dbpath)
    except Exception:
        logger.error(
            "Could not connect to database %s (exiting)", args.dbpath, exc_info=True
        )
        raise SystemExit(1)

    # Add information about this run to the database
    logger.debug("Adding run info to database %s...", args.dbpath)
    try:
        run = add_run(
            session,
            method="ANIb",
            cmdline=args.cmdline,
            date=start_time,
            status="started",
            name=name,
        )
    except PyaniORMException:
        logger.error("Could not add run to the database (exiting)", exc_info=True)
        raise SystemExit(1)
    logger.debug("\t...added run ID: %s to the database", run)

    # Identify input files for comparison, and populate the database
    logger.debug("Adding files for %s to database...", run)
    try:
        genome_ids = add_run_genomes(
            session, run, args.indir, args.classes, args.labels
        )
    except PyaniORMException:
        logger.error(
            "Could not add genomes to database for run %s (exiting)",
            run,
            exc_info=True,
        )
        # Exit here, as for the other database failures: falling through
        # would hit an unbound genome_ids on the next line
        raise SystemExit(1)
    logger.debug("\t...added genome IDs: %s", genome_ids)

    # Get list of genomes for this analysis from the database
    logger.info("Compiling genomes for comparison")
    genomes = run.genomes.all()
    logger.debug("\tCollected %s genomes for this run", len(genomes))

    # Create output directories. We create the main parent directory
    # (args.outdir), but also subdirectories for the fragment files and the
    # BLAST databases
    logger.debug("Creating output directory %s", args.outdir)
    try:
        os.makedirs(args.outdir, exist_ok=True)
    except IOError:
        logger.error(
            "Could not create output directory %s (exiting)",
            args.outdir,
            exc_info=True,
        )
        # Exception type kept as-is so existing callers are unaffected
        raise SystemError(1)
    fragdir = Path(str(args.outdir)) / "fragments"
    blastdbdir = Path(str(args.outdir)) / "blastdbs"
    logger.debug("\t...creating subdirectories")
    os.makedirs(fragdir, exist_ok=True)
    os.makedirs(blastdbdir, exist_ok=True)

    # Create a new sequence fragment file and a new BLAST+ database for each
    # input genome, and add this data to the database as a row in BlastDB
    logger.info("Creating input sequence fragment files")
    for genome in genomes:
        fragpath, fraglengths = fragment_fasta_file(
            Path(str(genome.path)), Path(str(fragdir)), args.fragsize
        )
        logger.debug(
            "\tWrote fragment file %s (%s fragments)", fragpath, len(fraglengths)
        )
        # TODO: record the fragment file and BLAST+ database in the database:
        # blastdb = add_blastdb(
        #     session, genome, run, fragpath, dbpath, fraglengths, dbcmd
        # )

    # The remainder of the ANIb workflow is not yet functional
    raise NotImplementedError

    # NOTE: everything below is currently unreachable; it is retained as the
    # outline of the remaining workflow

    # Generate all pair permutations of genome IDs as a list of
    # (Genome, Genome) tuples
    logger.info(
        "Compiling pairwise comparisons (this can take time for large datasets)..."
    )
    comparisons = list(permutations(tqdm(genomes, disable=args.disable_tqdm), 2))
    logger.info(
        "\t...total parwise comparisons to be performed: %s", len(comparisons)
    )

    # Check for existing comparisons; if one has already been done (for the same
    # software package, version, and setting) we add the comparison to this run,
    # but remove it from the list of comparisons to be performed
    logger.info("Checking database for existing comparison data...")
    comparisons_to_run = filter_existing_comparisons(
        session, run, comparisons, "blastn", blastn_version, args.fragsize, None
    )
    logger.info(
        "\t...after check, still need to run %s comparisons",
        len(comparisons_to_run),
    )

    # If there are no comparisons to run, update the Run matrices and exit
    # from this function
    if not comparisons_to_run:
        logger.info(
            termcolor(
                "All comparison results present in database (skipping comparisons)",
                "magenta",
            )
        )
        logger.info("Updating summary matrices with existing results")
        update_comparison_matrices(session, run)
        return

    # If we are in recovery mode, we are salvaging output from a previous
    # run, and do not necessarily need to rerun all the jobs. In this case,
    # we prepare a list of output files we want to recover from the results
    # in the output directory.
    if args.recovery:
        logger.warning("Entering recovery mode...")
        logger.debug(
            "\tIn this mode, existing comparison output from %s is reused",
            args.outdir,
        )
        existingfiles = collect_existing_output(args.outdir, "blastn", args)
        logger.debug(
            "\tIdentified %s existing output files for reuse", len(existingfiles)
        )
    else:
        existingfiles = None
        logger.debug("\tIdentified no existing output files")

    # Split the input genome files into contiguous fragments of the specified
    # size, as described in Goris et al. We create a new directory to hold
    # sequence fragments, away from the main genomes
    logger.info("Splitting input genome files into %snt fragments...", args.fragsize)
    fragdir = Path(args.outdir) / "fragments"
    os.makedirs(fragdir, exist_ok=True)
    fragfiles, fraglens = anib.fragment_fasta_files(
        [Path(str(_.path)) for _ in genomes],
        Path(args.outdir) / "fragments",
        args.fragsize,
    )
    logger.debug("...wrote %s fragment files to %s", len(fragfiles), fragdir)

    # Create list of BLASTN jobs for each comparison still to be performed
    logger.info("Creating blastn jobs for ANIb...")
    joblist = generate_joblist(
        comparisons_to_run, existingfiles, fragfiles, fraglens, args
    )
    logger.debug("...created %s blastn jobs", len(joblist))

    raise NotImplementedError
def subcmd_anim(args: Namespace) -> None:
    """Perform ANIm on all genome files in an input directory.

    :param args:  Namespace, command-line arguments

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (whose path must
    be provided).

    For each pairwise comparison, the NUCmer .delta file output is parsed to
    obtain an alignment length and similarity error count for every unique
    region alignment between the two organisms, as represented by
    sequences in the FASTA files. These are processed to calculated aligned
    sequence lengths, average nucleotide identity (ANI) percentages, coverage
    (aligned percentage of whole genome - forward direction), and similarity
    error count for each pairwise comparison.

    The calculated values are deposited in the SQLite3 database being used for
    the analysis.

    For each pairwise comparison the NUCmer output is stored in the output
    directory for long enough to extract summary information, but for each run
    the output is gzip compressed. Once all runs are complete, the outputs
    for each comparison are concatenated into a single gzip archive.

    Raises SystemExit if the database cannot be reached or updated, and
    SystemError if the output directory cannot be created.
    """
    # Create logger
    logger = logging.getLogger(__name__)

    # Announce the analysis
    logger.info(termcolor("Running ANIm analysis", bold=True))

    # Get current nucmer version
    nucmer_version = anim.get_version(args.nucmer_exe)
    logger.info(termcolor("MUMMer nucmer version: %s", "cyan"), nucmer_version)

    # Use the provided name or make one for the analysis
    start_time = datetime.datetime.now()
    name = args.name or "_".join(["ANIm", start_time.isoformat()])
    logger.info(termcolor("Analysis name: %s", "cyan"), name)

    # Get connection to existing database. This may or may not have data
    logger.debug("Connecting to database %s", args.dbpath)
    try:
        session = get_session(args.dbpath)
    except Exception:
        logger.error(
            "Could not connect to database %s (exiting)", args.dbpath, exc_info=True
        )
        raise SystemExit(1)

    # Add information about this run to the database
    logger.debug("Adding run info to database %s...", args.dbpath)
    try:
        run = add_run(
            session,
            method="ANIm",
            cmdline=args.cmdline,
            date=start_time,
            status="started",
            name=name,
        )
    except PyaniORMException:
        # Report the analysis name: the run object is never bound when
        # add_run fails, so referring to it here would raise
        # UnboundLocalError and mask the real error
        logger.error(
            "Could not add run %s to the database (exiting)", name, exc_info=True
        )
        raise SystemExit(1)
    logger.debug("...added run ID: %s to the database", run)

    # Identify input files for comparison, and populate the database
    logger.debug("Adding genomes for run %s to database...", run)
    try:
        genome_ids = add_run_genomes(
            session, run, args.indir, args.classes, args.labels
        )
    except PyaniORMException:
        logger.error(
            "Could not add genomes to database for run %s (exiting)",
            run,
            exc_info=True,
        )
        raise SystemExit(1)
    logger.debug("\t...added genome IDs: %s", genome_ids)

    # Generate commandlines for NUCmer analysis and output compression
    logger.info("Generating ANIm command-lines")
    deltadir = args.outdir / pyani_config.ALIGNDIR["ANIm"]
    logger.debug("NUCmer output will be written temporarily to %s", deltadir)

    # Create output directories
    logger.debug("Creating output directory %s", deltadir)
    try:
        deltadir.mkdir(exist_ok=True, parents=True)
    except IOError:
        logger.error(
            "Could not create output directory %s (exiting)", deltadir, exc_info=True
        )
        # Exception type kept as-is so existing callers are unaffected
        raise SystemError(1)

    # Get list of genome IDs for this analysis from the database
    logger.info("Compiling genomes for comparison")
    genomes = run.genomes.all()
    logger.debug("Collected %s genomes for this run", len(genomes))

    # Generate all pair combinations of genome IDs as a list of
    # (Genome, Genome) tuples
    logger.info(
        "Compiling pairwise comparisons (this can take time for large datasets)..."
    )
    comparisons = list(combinations(tqdm(genomes, disable=args.disable_tqdm), 2))
    logger.info(
        "\t...total parwise comparisons to be performed: %s", len(comparisons)
    )

    # Check for existing comparisons; if one has been done (for the same
    # software package, version, and setting) we add the comparison to this run,
    # but remove it from the list of comparisons to be performed
    logger.info("Checking database for existing comparison data...")
    comparisons_to_run = filter_existing_comparisons(
        session, run, comparisons, "nucmer", nucmer_version, None, args.maxmatch
    )
    logger.info(
        "\t...after check, still need to run %s comparisons",
        len(comparisons_to_run),
    )

    # If there are no comparisons to run, update the Run matrices and exit
    # from this function
    if not comparisons_to_run:
        logger.info(
            termcolor(
                "All comparison results present in database (skipping comparisons)",
                "magenta",
            )
        )
        logger.info("Updating summary matrices with existing results")
        update_comparison_matrices(session, run)
        return

    # If we are in recovery mode, we are salvaging output from a previous
    # run, and do not necessarily need to rerun all the jobs. In this case,
    # we prepare a list of output files we want to recover from the results
    # in the output directory.
    if args.recovery:
        logger.warning("Entering recovery mode")
        logger.debug(
            "\tIn this mode, existing comparison output from %s is reused", deltadir
        )
        existingfiles = collect_existing_output(deltadir, "nucmer", args)
        logger.debug(
            "\tIdentified %s existing output files for reuse", len(existingfiles)
        )
    else:
        existingfiles = list()
        logger.debug("\tIdentified no existing output files")

    # Create list of NUCmer jobs for each comparison still to be performed
    logger.info("Creating NUCmer jobs for ANIm")
    joblist = generate_joblist(comparisons_to_run, existingfiles, args)
    logger.debug(
        "Generated %s jobs, %s comparisons", len(joblist), len(comparisons_to_run)
    )

    # Pass jobs to appropriate scheduler
    logger.debug("Passing %s jobs to %s...", len(joblist), args.scheduler)
    run_anim_jobs(joblist, args)
    logger.info("...jobs complete")

    # Process output and add results to database
    # This requires us to drop out of threading/multiprocessing: Python's SQLite3
    # interface doesn't allow sharing connections and cursors
    logger.info("Adding comparison results to database...")
    update_comparison_results(joblist, run, session, nucmer_version, args)
    update_comparison_matrices(session, run)
    logger.info("...database updated.")
def download_genome(args: Namespace, filestem: str, tid: str, uid: str, uid_class):
    """Download single genome data to output directory.

    The RefSeq assembly is requested first; the GenBank assembly is only
    requested if the RefSeq download is reported as skipped. When either
    download succeeds, the downloaded file is checked against its MD5 hash
    and the outcome is logged (a failed check is reported as a warning, it
    does not raise).

    :param args:  Namespace, command-line arguments (``outdir``, ``timeout``
        and ``disable_tqdm`` are read here)
    :param filestem:  str, output filestem
    :param tid:  str, taxonID
    :param uid:  str, assembly UID
    :param uid_class:  classification data for the assembly; its
        ``organism`` and ``strain`` attributes are recorded for any
        skipped download

    Returns a tuple ``(dlstatus, skippedlist)`` where ``dlstatus`` is the
    status object from the last download attempted (GenBank if RefSeq was
    skipped, RefSeq otherwise) and ``skippedlist`` holds one ``Skipped``
    record per skipped attempt.
    """
    logger = logging.getLogger(__name__)

    skippedlist = []
    dlfiledata = download.DLFileData(
        filestem, "ftp://ftp.ncbi.nlm.nih.gov/genomes/all", "genomic.fna.gz"
    )
    logger.info("Retrieving URLs for %s", filestem)

    # Try RefSeq first
    dlstatus = download.download_genome_and_hash(
        args.outdir,
        args.timeout,
        dlfiledata,
        dltype="RefSeq",
        disable_tqdm=args.disable_tqdm,
    )
    # Pylint is confused by the content of dlstatus (a namedlist)
    downloaded = not dlstatus.skipped  # pylint: disable=no-member

    if not downloaded:
        skippedlist.append(
            Skipped(
                tid,
                uid,
                uid_class.organism,
                uid_class.strain,
                dlstatus.url,  # pylint: disable=no-member
                "RefSeq",
            )
        )
        # RefSeq fails, so try the GenBank assembly
        logger.warning(
            termcolor("RefSeq failed. Trying GenBank alternative assembly", "magenta")
        )
        dlstatus = download.download_genome_and_hash(
            args.outdir,
            args.timeout,
            dlfiledata,
            dltype="GenBank",
            disable_tqdm=args.disable_tqdm,
        )
        downloaded = not dlstatus.skipped  # pylint: disable=no-member
        if not downloaded:
            skippedlist.append(
                Skipped(
                    tid,
                    uid,
                    uid_class.organism,
                    uid_class.strain,
                    dlstatus.url,
                    "GenBank",
                )
            )
            logger.warning(termcolor("GenBank failed.", "magenta"))

    if downloaded:
        # One of the downloads worked: report information
        logger.debug("Downloaded from URL: %s", dlstatus.url)
        logger.debug("Wrote assembly to: %s", dlstatus.outfname)
        logger.debug("Wrote MD5 hashes to: %s", dlstatus.outfhash)

        # Check hash for the download
        hashstatus = download.check_hash(dlstatus.outfname, dlstatus.outfhash)
        logger.debug("Local MD5 hash: %s", hashstatus.localhash)
        # Report the hash read from the NCBI hash file, not the local hash
        # a second time.
        # NOTE(review): assumes the hash status exposes the NCBI-side value
        # as `filehash` - confirm against download.check_hash
        logger.debug("NCBI MD5 hash: %s", hashstatus.filehash)
        if hashstatus.passed:
            logger.info(termcolor("MD5 hash check passed", "green"))
        else:
            logger.warning("MD5 hash check failed. Please check and retry.")

    return dlstatus, skippedlist
def subcmd_download(args: Namespace) -> int:
    """Download assembled genomes in subtree of passed NCBI taxon ID.

    For each taxon ID in ``args.taxon`` the assembly UIDs are collected, and
    for each UID the eSummary is retrieved and reported. Unless this is a
    dry run, the RefSeq assembly is downloaded (falling back to GenBank if
    RefSeq is skipped), checked against its MD5 hash, extracted, optionally
    relabelled for Kraken, and hashed locally. Class and label files are
    then written to the output directory, and any skipped downloads are
    reported to the logger.

    In a dry run nothing is downloaded and no class/label files are written.

    :param args:  Namespace, command-line arguments

    Returns 0 on completion.
    """
    # Create logger
    logger = logging.getLogger(__name__)
    logger.info(termcolor("Downloading genomes from NCBI", "red"))

    # Create output directory, respecting force/noclobber
    if args.dryrun:
        logger.warning(
            termcolor("Dry run only: will not overwrite or download", "cyan")
        )
    else:
        tools.make_outdir(args.outdir, args.force, args.noclobber)

    # Set Entrez email
    download.set_ncbi_email(args.email)
    logger.info("Setting Entrez email address: %s", args.email)

    # Parse Entrez API key, if provided
    api_path = args.api_keypath.expanduser()
    if api_path.is_file():
        api_key = download.parse_api_key(api_path)
        logger.info("API key recovered from %s", api_path)
    else:
        logger.warning("API path %s not a valid file. Not using API key.", api_path)
        api_key = None

    # Get list of taxon IDs to download
    taxon_ids = download.split_taxa(args.taxon)
    logger.info(termcolor("Taxon IDs received: %s", "blue"), taxon_ids)

    # Get assembly UIDs for each taxon
    asm_dict = tools.make_asm_dict(taxon_ids, args.retries)
    for tid, uids in asm_dict.items():
        logger.debug(
            "Taxon ID summary\n\tQuery: %s\n\tasm count: %s\n\tUIDs: %s",
            tid,
            len(uids),
            uids,
        )

    # Compile outputs to write class and label files, and a list of
    # skipped downloads (and define a helper tuple for collating skipped
    # genome information)
    classes = []
    labels = []
    skippedlist = []
    Skipped = namedtuple("Skipped", "taxon_id accession organism strain url dltype")

    # Download contigs and hashes for each assembly UID in the list
    # On completion of this loop, each assembly in the list will either be
    # downloaded or skipped (with skipped genome information preserved in
    # skippedlist), and class/label info will be collated, ready for writing
    # to file.
    # Summary information is reported to the logger for each eSummary that
    # can be recovered
    for tid, uids in asm_dict.items():
        # Report the taxon ID named in the message (not the list of UIDs)
        logger.info(termcolor("Downloading contigs for Taxon ID %s", "blue"), tid)
        for uid in uids:
            # Obtain eSummary
            logger.info(
                termcolor("Retrieving eSummary information for UID %s", "cyan"), uid
            )
            esummary, filestem = download.get_ncbi_esummary(
                uid, args.retries, api_key
            )
            uid_class = download.get_ncbi_classification(esummary)

            # Report summary
            outstr = "\n\t".join(
                [
                    f"Species Taxid: {esummary['SpeciesTaxid']}",
                    f"TaxID: {esummary['Taxid']}",
                    f"Accession: {esummary['AssemblyAccession']}",
                    f"Name: {esummary['AssemblyName']}",
                    f"Organism: {uid_class.organism}",
                    f"Genus: {uid_class.genus}",
                    f"Species: {uid_class.species}",
                    f"Strain: {uid_class.strain}",
                ]
            )
            logger.debug("eSummary information:\n\t%s", outstr)
            if args.dryrun:
                logger.warning(
                    "(dry-run) skipping download of %s", esummary["AssemblyAccession"]
                )
                continue

            # Obtain URLs, trying the RefSeq filestem first, then GenBank if
            # there's a failure
            dlfiledata = tools.DLFileData(
                filestem, "ftp://ftp.ncbi.nlm.nih.gov/genomes/all", "genomic.fna.gz"
            )
            logger.info("Retrieving URLs for %s", filestem)
            # Try RefSeq first
            dlstatus = tools.download_genome_and_hash(
                args,
                dlfiledata,
                dltype="RefSeq",
                disable_tqdm=args.disable_tqdm,
            )
            # RefSeq failed, try GenBank
            # Pylint is confused by the content of dlstatus (a namedlist)
            if dlstatus.skipped:  # pylint: disable=no-member
                skippedlist.append(
                    Skipped(
                        tid,
                        uid,
                        uid_class.organism,
                        uid_class.strain,
                        dlstatus.url,  # pylint: disable=no-member
                        "RefSeq",
                    )
                )
                logger.warning("RefSeq failed. Trying GenBank alternative assembly")
                # Try GenBank assembly
                dlstatus = tools.download_genome_and_hash(
                    args,
                    dlfiledata,
                    dltype="GenBank",
                    disable_tqdm=args.disable_tqdm,
                )
                # Pylint is confused by the content of dlstatus (a namedlist)
                if dlstatus.skipped:  # pylint: disable=no-member
                    skippedlist.append(
                        Skipped(
                            tid,
                            uid,
                            uid_class.organism,
                            uid_class.strain,
                            dlstatus.url,
                            "GenBank",
                        )
                    )
                    logger.warning("GenBank failed.")
                    continue  # Move straight on to the next download

            # One of the downloads worked: report information
            logger.debug("Downloaded from URL: %s", dlstatus.url)
            logger.debug("Wrote assembly to: %s", dlstatus.outfname)
            logger.debug("Wrote MD5 hashes to: %s", dlstatus.outfhash)

            # Check hash for the download
            hashstatus = download.check_hash(dlstatus.outfname, dlstatus.outfhash)
            logger.debug("Local MD5 hash: %s", hashstatus.localhash)
            # Report the hash read from the NCBI hash file, not the local
            # hash a second time.
            # NOTE(review): assumes the hash status exposes the NCBI-side
            # value as `filehash` - confirm against download.check_hash
            logger.debug("NCBI MD5 hash: %s", hashstatus.filehash)
            if hashstatus.passed:
                logger.info(termcolor("MD5 hash check passed", "green"))
            else:
                logger.warning("MD5 hash check failed. Please check and retry.")

            # Extract downloaded files
            ename = dlstatus.outfname.with_suffix("")  # should strip only last suffix
            if ename.exists() and args.noclobber:
                logger.warning("Output file %s exists, not extracting", ename)
            else:
                logger.debug("Extracting archive %s to %s", dlstatus.outfname, ename)
                download.extract_contigs(dlstatus.outfname, ename)

            # Modify sequence ID header if Kraken option active
            if args.kraken:
                logger.warning("Modifying downloaded sequence for Kraken compatibility")
                seqdata = list(SeqIO.parse(ename, "fasta"))
                logger.debug("Modifying %s", ename)
                for seq in seqdata:
                    seq.id = "|".join(
                        [seq.id, "kraken:taxid", esummary["SpeciesTaxid"]]
                    )
                SeqIO.write(seqdata, ename, "fasta")

            # Create MD5 hash for the downloaded contigs
            logger.debug("Creating local MD5 hash for %s", ename)
            hashfname = ename.with_suffix(".md5")
            datahash = download.create_hash(ename)
            logger.debug("Writing hash to %s", hashfname)
            with open(hashfname, "w") as hfh:
                hfh.write("\t".join([datahash, str(ename)]) + "\n")

            # Make label/class text
            labeltxt, classtxt = download.create_labels(uid_class, filestem, datahash)
            classes.append(classtxt)
            labels.append(labeltxt)
            logger.info(
                "Label and class file entries\n\tLabel: %s\n\tClass: %s",
                labeltxt,
                classtxt,
            )

    # Write class and label files
    if args.dryrun:
        # The output directory is not created in a dry run, and nothing was
        # downloaded, so writing here would either fail or overwrite
        # existing class/label files with an empty entry.
        logger.warning("(dry-run) skipping writing of class and label files")
    else:
        classfname = args.outdir / args.classfname
        logger.info("Writing classes file to %s", classfname)
        if classfname.exists() and args.noclobber:
            logger.warning("Class file %s exists, not overwriting", classfname)
        else:
            with open(classfname, "w") as ofh:
                ofh.write("\n".join(classes) + "\n")

        labelfname = args.outdir / args.labelfname
        logger.info("Writing labels file to %s", labelfname)
        if labelfname.exists() and args.noclobber:
            logger.warning("Labels file %s exists, not overwriting", labelfname)
        else:
            with open(labelfname, "w") as ofh:
                ofh.write("\n".join(labels) + "\n")

    # Report skipped genome list
    if skippedlist:
        logger.warning(
            termcolor("%s genome downloads were skipped", "red"), len(skippedlist)
        )
        for skipped in skippedlist:
            outstr = "\n\t".join(
                [
                    f"taxon id: {skipped.taxon_id}",
                    f"accession: {skipped.accession}",
                    f"URL: {skipped.url}",
                    f"source: {skipped.dltype}",
                ]
            )
            logger.warning("%s %s:\n\t%s", skipped.organism, skipped.strain, outstr)

    return 0
import logging import sys import time from typing import List, Optional from pyani.logger import config_logger from pyani.pyani_tools import termcolor from .parsers import parse_cmdline from .. import __version__ CITATION_INFO = [ termcolor( "If you use pyani in your work, please cite the following publication:", "green", ), termcolor( "\tPritchard, L., Glover, R. H., Humphris, S., Elphinstone, J. G.,", "yellow", ), termcolor( "\t& Toth, I.K. (2016) 'Genomics and taxonomy in diagnostics for", "yellow"), termcolor( "\tfood security: soft-rotting enterobacterial plant pathogens.'", "yellow"), termcolor( "\tAnalytical Methods, 8(1), 12–24. http://doi.org/10.1039/C5AY02550H", "yellow", ),