# NOTE: these excerpts are drawn from several pyani modules; the imports below
# cover the names used in them (module layout assumed, so adjust to match your
# checkout). Helpers such as entrez_retry, extract_filestem,
# retrieve_asm_contigs, create_hash and the NCBIDownloadException class are
# assumed to be defined alongside get_ncbi_asm in its own module.
import logging
import re

from argparse import Namespace
from collections import namedtuple
from logging import Logger
from typing import Tuple

from Bio import Entrez, SeqIO

from pyani import download, pyani_files
from pyani.scripts import tools


def subcmd_index(args: Namespace, logger: Logger) -> int:
    """Generate a file with the MD5 hash for each genome in an input directory.

    :param args: Namespace, received command-line arguments
    :param logger: logging object

    Identify the genome files in the input directory, and generate a single
    MD5 for each, so that <genome>.fna produces <genome>.md5. Genome files
    (FASTA) are identified by their extension.
    """
    # Get list of FASTA files in the input directory
    logger.info("Scanning directory %s for FASTA files", args.indir)
    fpaths = pyani_files.get_fasta_paths(args.indir)
    logger.info("Found FASTA files:\n%s", "\n".join([f"\t{fpath}" for fpath in fpaths]))

    # Lists of class/label information
    classes = []
    labels = []

    # Create MD5 hash for each file, if needed
    for fpath in fpaths:
        hashfname = fpath.with_suffix(".md5")
        if hashfname.is_file():
            logger.info("%s already indexed (using existing hash)", fpath)
            with open(hashfname, "r") as ifh:
                datahash = ifh.readline().split()[0]
        else:
            # Write an .md5 hash file
            datahash = download.create_hash(fpath)
            logger.info("Writing hash to %s", hashfname)
            with open(hashfname, "w") as hfh:
                hfh.write(f"{datahash}\t{fpath}\n")

        # Parse the file and get the label/class information from the first
        # sequence description (no need to read the whole file into memory)
        with open(fpath, "r") as sfh:
            label = next(SeqIO.parse(sfh, "fasta")).description.split(" ", 1)[-1]
        labels.append("\t".join([datahash, fpath.stem, label]))
        classes.append("\t".join([datahash, fpath.stem, label]))

    # Write class and label files
    classfname = args.indir / args.classfname
    logger.info("Writing classes file to %s", classfname)
    if classfname.exists():
        logger.warning("Class file %s exists, not overwriting", classfname)
    else:
        with open(classfname, "w") as ofh:
            ofh.write("\n".join(classes) + "\n")

    labelfname = args.indir / args.labelfname
    logger.info("Writing labels file to %s", labelfname)
    if labelfname.exists():
        logger.warning("Labels file %s exists, not overwriting", labelfname)
    else:
        with open(labelfname, "w") as ofh:
            ofh.write("\n".join(labels) + "\n")

    return 0
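# For reference, a minimal sketch of what download.create_hash is doing for
# the indexing step above: a chunked MD5 digest over the file contents (not
# the path). The helper name _md5_of_file is hypothetical and shown only to
# make the hashing contract concrete; the code above uses the real
# download.create_hash.
def _md5_of_file(fname) -> str:
    """Return the MD5 hex digest of a file's contents, read in 64 kB chunks."""
    import hashlib

    fhash = hashlib.md5()
    with open(fname, "rb") as ifh:
        # Read in fixed-size chunks so large genome files need not fit in memory
        for chunk in iter(lambda: ifh.read(65536), b""):
            fhash.update(chunk)
    return fhash.hexdigest()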
def hash_genomes(
    args: Namespace, dlstatus: download.DLStatus, filestem: str, uid_class
) -> Tuple[str, str]:
    """Hash genome files in passed dlstatus.

    :param args: Namespace of command-line arguments
    :param dlstatus: download.DLStatus, status of the genome download
    :param filestem: str, filestem for output
    :param uid_class: classification namedtuple for the assembly
        (organism/genus/species/strain)
    """
    logger = logging.getLogger(__name__)

    # Create MD5 hash for the downloaded contigs
    ename = dlstatus.outfname.with_suffix("")  # should strip only last suffix
    logger.debug("Creating local MD5 hash for %s", ename)
    hashfname = ename.with_suffix(".md5")
    datahash = download.create_hash(ename)
    logger.debug("Writing hash to %s", hashfname)
    with open(hashfname, "w") as hfh:
        hfh.write("\t".join([datahash, str(ename)]) + "\n")

    # Make label/class text
    labeltxt, classtxt = download.create_labels(uid_class, filestem, datahash)
    return labeltxt, classtxt
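# Note on the "should strip only last suffix" comment above: Path.with_suffix("")
# removes just the final extension, so a gzipped download loses only ".gz" and
# keeps its FASTA extension. A quick illustration of this standard pathlib
# behaviour (the filename here is made up):
#
#     >>> from pathlib import Path
#     >>> Path("GCF_000091085.2_genomic.fna.gz").with_suffix("")
#     PosixPath('GCF_000091085.2_genomic.fna')
#     >>> Path("GCF_000091085.2_genomic.fna").with_suffix(".md5")
#     PosixPath('GCF_000091085.2_genomic.md5')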
def get_ncbi_asm(args, asm_uid, fmt="fasta"):
    """Return the NCBI AssemblyAccession and AssemblyName for an assembly.

    :param args: Namespace, command-line arguments
    :param asm_uid: NCBI assembly UID
    :param fmt: str, format in which to retrieve assembly information

    Also returns organism data for the class/label files, and the accession,
    so we can track whether downloads fail because only the most recent
    version is available.

    AssemblyAccession and AssemblyName are data fields in the eSummary record,
    and correspond to downloadable files for each assembly at

    ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GC[AF]/nnn/nnn/nnn/<AA>_<AN>

    where <AA> is AssemblyAccession and <AN> is AssemblyName; the choice of
    GCA vs GCF, and the three values of nnn, are taken from <AA>.
    """
    logger = logging.getLogger(__name__)
    logger.info("Identifying assembly information from NCBI for %s", asm_uid)

    # Obtain full eSummary data for the assembly
    summary = Entrez.read(
        entrez_retry(args, Entrez.esummary, db="assembly", id=asm_uid, report="full"),
        validate=False,
    )

    # Extract filestem from assembly data
    data = summary["DocumentSummarySet"]["DocumentSummary"][0]
    filestem = extract_filestem(data)

    # Report interesting things from the summary for those interested
    logger.info("\tOrganism: %s", data["Organism"])
    logger.info("\tTaxid: %s", data["SpeciesTaxid"])
    logger.info("\tAccession: %s", data["AssemblyAccession"])
    logger.info("\tName: %s", data["AssemblyName"])
    # NOTE: Maybe parse out the assembly stats here, in future?

    # Get class and label text
    organism = data["SpeciesName"]
    try:
        strain = data["Biosource"]["InfraspeciesList"][0]["Sub_value"]
    except (KeyError, IndexError):
        # We consider this an error/incompleteness in the NCBI metadata
        strain = ""

    # Download and extract genome assembly
    hash_md5 = None
    try:
        fastafname = retrieve_asm_contigs(args, filestem, fmt=fmt)
        hash_md5 = create_hash(fastafname)
    except NCBIDownloadException:
        # This is a little hacky. Sometimes, RefSeq assemblies are
        # suppressed (presumably because they are non-redundant), but
        # the GenBank assembly persists. In those cases, we *assume*
        # (because it may not be true) that the corresponding GenBank
        # sequence shares the same accession number, except that GCF
        # is replaced by GCA
        gbfilestem = re.sub("^GCF_", "GCA_", filestem)
        logger.warning("Could not download %s, trying %s", filestem, gbfilestem)
        try:
            fastafname = retrieve_asm_contigs(args, gbfilestem, fmt=fmt)
            hash_md5 = create_hash(fastafname)
        except NCBIDownloadException:
            fastafname = None

    # Create label and class strings
    genus, species = organism.split(" ", 1)
    lbltxt = "%s\t%s_genomic\t%s %s %s" % (
        hash_md5,
        filestem,
        genus[0] + ".",
        species,
        strain,
    )
    clstxt = "%s\t%s_genomic\t%s" % (hash_md5, filestem, organism)
    logger.info("\tLabel: %s", lbltxt)
    logger.info("\tClass: %s", clstxt)

    return (fastafname, clstxt, lbltxt, data["AssemblyAccession"])
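# A sketch of the FTP path logic the docstring above describes: the GC[AF]
# prefix and the three nnn directories all come from AssemblyAccession. The
# helper _asm_url is hypothetical and for illustration only; the real logic
# lives in extract_filestem/retrieve_asm_contigs.
def _asm_url(filestem: str) -> str:
    """Build the NCBI FTP directory URL for an assembly filestem.

    For a filestem like ``GCF_000091085.2_ASM9108v2`` the accession part is
    ``GCF_000091085.2``; the three ``nnn`` directories are the nine digits
    after the underscore, split in threes: 000/091/085.
    """
    accession = "_".join(filestem.split("_", 2)[:2])  # e.g. GCF_000091085.2
    digits = accession.split("_")[1].split(".")[0]  # e.g. 000091085
    triples = [digits[idx : idx + 3] for idx in range(0, 9, 3)]  # 000, 091, 085
    return "/".join(
        ["ftp://ftp.ncbi.nlm.nih.gov/genomes/all", accession[:3]] + triples + [filestem]
    )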
def subcmd_download(args: Namespace, logger: Logger) -> int:
    """Download assembled genomes in subtree of passed NCBI taxon ID.

    :param args: Namespace, command-line arguments
    :param logger: logging object
    """
    # Create output directory, respecting force/noclobber
    if not args.dryrun:
        tools.make_outdir(args.outdir, args.force, args.noclobber, logger)
    else:
        logger.warning("Dry run only: will not overwrite or download")

    # Set Entrez email
    download.set_ncbi_email(args.email)
    logger.info(f"Setting Entrez email address: {args.email}")

    # Parse Entrez API key, if provided
    api_path = args.api_keypath.expanduser()
    if not api_path.is_file():
        logger.warning(f"API path {api_path} not a valid file. Not using API key.")
        api_key = None
    else:
        api_key = download.parse_api_key(api_path)
        logger.info(f"API key recovered from {api_path}")

    # Get list of taxon IDs to download
    taxon_ids = download.split_taxa(args.taxon)
    logger.info(f"Taxon IDs received: {taxon_ids}")

    # Get assembly UIDs for each taxon
    asm_dict = tools.make_asm_dict(taxon_ids, args.retries)
    for tid, uids in asm_dict.items():
        logger.info(
            f"Taxon ID summary\n\tQuery: {tid}\n\tasm count: {len(uids)}\n\tUIDs: {uids}"
        )

    # Compile outputs to write class and label files, and a list of
    # skipped downloads (and define a helper tuple for collating skipped
    # genome information)
    classes = []
    labels = []
    skippedlist = []
    Skipped = namedtuple("Skipped", "taxon_id accession organism strain url dltype")

    # Download contigs and hashes for each assembly UID in the list.
    # On completion of this loop, each assembly in the list will either be
    # downloaded or skipped (with skipped genome information preserved in
    # skippedlist), and class/label info will be collated, ready for writing
    # to file.
    # Summary information is reported to the logger for each eSummary that
    # can be recovered.
    for tid, uids in asm_dict.items():
        logger.info(f"Downloading contigs for Taxon ID {tid}")
        for uid in uids:
            # Obtain eSummary
            logger.info(f"Get eSummary information for UID {uid}")
            esummary, filestem = download.get_ncbi_esummary(uid, args.retries, api_key)
            uid_class = download.get_ncbi_classification(esummary)

            # Report summary
            outstr = "\n\t".join(
                [
                    f"Species Taxid: {esummary['SpeciesTaxid']}",
                    f"TaxID: {esummary['Taxid']}",
                    f"Accession: {esummary['AssemblyAccession']}",
                    f"Name: {esummary['AssemblyName']}",
                    f"Organism: {uid_class.organism}",
                    f"Genus: {uid_class.genus}",
                    f"Species: {uid_class.species}",
                    f"Strain: {uid_class.strain}",
                ]
            )
            logger.info(f"eSummary information:\n\t{outstr}")
            if args.dryrun:
                logger.warning(
                    f"(dry-run) skipping download of {esummary['AssemblyAccession']}"
                )
                continue

            # Obtain URLs, trying the RefSeq filestem first, then GenBank if
            # there's a failure
            dlfiledata = tools.DLFileData(
                filestem, "ftp://ftp.ncbi.nlm.nih.gov/genomes/all", "genomic.fna.gz"
            )
            logger.info(f"Retrieving URLs for {filestem}")
            # Try RefSeq first
            dlstatus = tools.download_genome_and_hash(
                args,
                logger,
                dlfiledata,
                dltype="RefSeq",
                disable_tqdm=args.disable_tqdm,
            )
            # If RefSeq failed, try GenBank
            # Pylint is confused by the content of dlstatus (a namedlist)
            if dlstatus.skipped:  # pylint: disable=no-member
                skippedlist.append(
                    Skipped(
                        tid,
                        uid,
                        uid_class.organism,
                        uid_class.strain,
                        dlstatus.url,  # pylint: disable=no-member
                        "RefSeq",
                    )
                )
                logger.warning("RefSeq failed. Trying GenBank alternative assembly")
                # Try GenBank assembly
                dlstatus = tools.download_genome_and_hash(
                    args,
                    logger,
                    dlfiledata,
                    dltype="GenBank",
                    disable_tqdm=args.disable_tqdm,
                )
                # Pylint is confused by the content of dlstatus (a namedlist)
                if dlstatus.skipped:  # pylint: disable=no-member
                    skippedlist.append(
                        Skipped(
                            tid,
                            uid,
                            uid_class.organism,
                            uid_class.strain,
                            dlstatus.url,
                            "GenBank",
                        )
                    )
                    logger.warning("GenBank failed.")
                    continue  # Move straight on to the next download

            # One of the downloads worked: report information
            logger.info(f"Downloaded from URL: {dlstatus.url}")
            logger.info(f"Wrote assembly to: {dlstatus.outfname}")
            logger.info(f"Wrote MD5 hashes to: {dlstatus.outfhash}")

            # Check hash for the download
            hashstatus = download.check_hash(dlstatus.outfname, dlstatus.outfhash)
            logger.info(f"Local MD5 hash: {hashstatus.localhash}")
            logger.info(f"NCBI MD5 hash: {hashstatus.filehash}")
            if hashstatus.passed:
                logger.info("MD5 hash check passed")
            else:
                logger.warning("MD5 hash check failed. Please check and retry.")

            # Extract downloaded files
            ename = dlstatus.outfname.with_suffix("")  # should strip only last suffix
            if ename.exists() and args.noclobber:
                logger.warning(f"Output file {ename} exists, not extracting")
            else:
                logger.info(f"Extracting archive {dlstatus.outfname} to {ename}")
                download.extract_contigs(dlstatus.outfname, ename)

            # Modify sequence ID header if Kraken option active
            if args.kraken:
                logger.warning("Modifying downloaded sequence for Kraken compatibility")
                seqdata = list(SeqIO.parse(ename, "fasta"))
                logger.info(f"Modifying {ename}")
                for seq in seqdata:
                    seq.id = "|".join([seq.id, "kraken:taxid", esummary["SpeciesTaxid"]])
                SeqIO.write(seqdata, ename, "fasta")

            # Create MD5 hash for the downloaded contigs
            logger.info(f"Creating local MD5 hash for {ename}")
            hashfname = ename.with_suffix(".md5")
            datahash = download.create_hash(ename)
            logger.info(f"Writing hash to {hashfname}")
            with open(hashfname, "w") as hfh:
                hfh.write("\t".join([datahash, str(ename)]) + "\n")

            # Make label/class text
            labeltxt, classtxt = download.create_labels(uid_class, filestem, datahash)
            classes.append(classtxt)
            labels.append(labeltxt)
            logger.info(
                f"Label and class file entries\n\tLabel: {labeltxt}\n\tClass: {classtxt}"
            )

    # Write class and label files
    classfname = args.outdir / args.classfname
    logger.info(f"Writing classes file to {classfname}")
    if classfname.exists() and args.noclobber:
        logger.warning(f"Class file {classfname} exists, not overwriting")
    else:
        with open(classfname, "w") as ofh:
            ofh.write("\n".join(classes) + "\n")

    labelfname = args.outdir / args.labelfname
    logger.info(f"Writing labels file to {labelfname}")
    if labelfname.exists() and args.noclobber:
        logger.warning(f"Labels file {labelfname} exists, not overwriting")
    else:
        with open(labelfname, "w") as ofh:
            ofh.write("\n".join(labels) + "\n")

    # Report skipped genome list
    if skippedlist:
        logger.warning(f"{len(skippedlist)} genome downloads were skipped")
        for skipped in skippedlist:
            outstr = "\n\t".join(
                [
                    f"taxon id: {skipped.taxon_id}",
                    f"accession: {skipped.accession}",
                    f"URL: {skipped.url}",
                    f"source: {skipped.dltype}",
                ]
            )
            logger.warning(f"{skipped.organism} {skipped.strain}:\n\t{outstr}")

    return 0
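# A minimal sketch of the hash check performed in subcmd_download above,
# assuming the NCBI hash file uses md5sum format (hash, whitespace, filename
# per line). _check_hash is a hypothetical, simplified stand-in for
# download.check_hash: it returns a plain bool rather than a status tuple
# with localhash/filehash/passed fields.
def _check_hash(local_fname, ncbi_hashfile) -> bool:
    """Return True if the local file's MD5 matches the NCBI-supplied hash."""
    import hashlib
    from pathlib import Path

    # Chunked MD5 of the downloaded file, as sketched earlier
    fhash = hashlib.md5()
    with open(local_fname, "rb") as ifh:
        for chunk in iter(lambda: ifh.read(65536), b""):
            fhash.update(chunk)
    localhash = fhash.hexdigest()

    # Find the matching entry in the NCBI hash file by filename
    filehash = ""
    with open(ncbi_hashfile, "r") as ifh:
        for line in ifh:
            fields = line.strip().split()
            if len(fields) >= 2 and Path(fields[1]).name == Path(local_fname).name:
                filehash = fields[0]
                break
    return localhash == filehash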