Пример #1
0
    def download_markers(self, options: Iterable) -> None:
        """Download markers database files and amend user config to reflect this.

        For each option the file is fetched from the url stored under the
        `database_urls` section, written to disk, registered in the `markers`
        section and verified against the remote md5 checksum. After all
        options are processed `self.press_hmms()` is called.

        Parameters
        ----------
        options : iterable
            iterable containing options in 'markers' section to download.

        Returns
        -------
        NoneType
            Will update provided `options` in `self.config`.

        Raises
        ------
        ConnectionError
            marker file download failed.
        ChecksumMismatchError
            md5 of the downloaded marker file differs from the remote md5.

        """
        for option in options:
            # First retrieve the markers file url from `option` in `database_urls`
            url = self.config.get("database_urls", option)
            if self.config.has_option("markers", option):
                # Respect a path the user already configured.
                outfpath = self.config.get("markers", option)
            else:
                outfname = os.path.basename(url)
                outfpath = os.path.join(self.markers_dir, outfname)

            if self.dryrun:
                # Only record where the file would be written.
                logger.debug(f"UPDATE: (markers,{option}): {outfpath}")
                self.config.set("markers", option, outfpath)
                continue

            # Fetch first and only open `outfpath` for writing once the
            # response is known to be good. Opening the file up front would
            # truncate an existing markers file when the download fails.
            with requests.Session() as session:
                resp = session.get(url)
                if not resp.ok:
                    raise ConnectionError(f"Failed to retrieve {url}")
                contents = resp.text
            with open(outfpath, "w") as fh:
                fh.write(contents)
            self.config.set("markers", option, outfpath)

            # Write the local md5 next to the file, then compare it with the
            # remote md5 to confirm the transfer was not corrupted.
            checksum_outfpath = f"{outfpath}.md5"
            write_checksum(outfpath, checksum_outfpath)
            current_checksum = read_checksum(checksum_outfpath)
            current_hash, __ = current_checksum.split()
            remote_checksum = self.get_remote_checksum("markers", option)
            remote_hash, __ = remote_checksum.split()
            if current_hash != remote_hash:
                raise ChecksumMismatchError(f"{option} download failed")
        self.press_hmms()
Пример #2
0
    def compare_checksums(self, section: str = None) -> Dict[str, Dict]:
        """Collect database options whose local md5 differs from the remote md5.

        Every option of the inspected section(s) that is listed in
        `Databases.SECTIONS` is compared against its remote checksum. The
        local hash is read from the `<file>.md5` sidecar when it exists,
        otherwise it is calculated from the file itself. Options for which
        neither the file nor its sidecar exist are skipped, as are options
        whose remote checksum could not be retrieved.

        Parameters
        ----------
        section : str, optional
            Section of the config to inspect. When omitted, every section in
            `Databases.SECTIONS` is inspected.

        Returns
        -------
        dict
            {section: {option, option, ...}, section: {...}, ...} holding the
            options whose checksums do not match.

        """
        # nodes.dmp, names.dmp and merged.dmp are all in taxdump.tar.gz
        taxdump_members = {"nodes", "names", "merged"}
        targets = [section] if section else Databases.SECTIONS.keys()
        mismatches = {}
        taxdump_done = False
        for sect in targets:
            for opt in self.config.options(sect):
                # Skip user added options not required by Autometa
                if opt not in Databases.SECTIONS.get(sect):
                    continue
                if opt in taxdump_members:
                    opt = "taxdump"
                path = self.config.get(sect, opt)
                md5_path = f"{path}.md5"
                # Nothing to compare when neither file nor sidecar is present.
                if not (os.path.exists(path) or os.path.exists(md5_path)):
                    continue
                # The tarball only needs to be verified once, not per member.
                if taxdump_done and opt == "taxdump":
                    continue
                if os.path.exists(md5_path):
                    local_checksum = read_checksum(md5_path)
                else:
                    local_checksum = calc_checksum(path)
                local_hash, __ = local_checksum.split()
                try:
                    remote_hash, __ = self.get_remote_checksum(sect, opt).split()
                except ConnectionError as err:
                    # A failed lookup says nothing about the file's validity.
                    logger.warning(err)
                    continue
                if opt == "taxdump":
                    taxdump_done = True
                if local_hash == remote_hash:
                    logger.debug(f"{opt} checksums match, skipping...")
                    continue
                mismatches.setdefault(sect, set()).add(opt)
        # Log invalid options
        for sect, opts in mismatches.items():
            for opt in opts:
                logger.debug(f"INVALID: ({sect},{opt})")
        return mismatches
Пример #3
0
    def download_ncbi_files(self, options: Iterable) -> None:
        """Download NCBI database files.

        Each option is transferred with rsync from the url stored under the
        `database_urls` section, registered in the `ncbi` section and verified
        against the remote md5 checksum. Afterwards the taxdump tarball is
        extracted and/or the nr database is formatted if they were requested.

        When `self.dryrun` is set, the config is updated for every option but
        nothing is downloaded, extracted or formatted.

        Parameters
        ----------
        options : iterable
            iterable containing options in 'ncbi' section to download.

        Returns
        -------
        NoneType
            Will update provided `options` in `self.config`.

        Raises
        ------
        subprocess.CalledProcessError
            NCBI file download with rsync failed.
        ChecksumMismatchError
            NCBI file checksums do not match after file transfer.

        """
        # s.t. set methods are available
        options = set(options)
        # nodes, names and merged all live inside the taxdump tarball, so a
        # request for any of them becomes a single request for `taxdump`
        # (see self.extract_taxdump).
        taxdump_files = {"nodes", "names", "merged"}
        if options & taxdump_files:
            options = (options - taxdump_files) | {"taxdump"}

        for option in options:
            ftp_fullpath = self.config.get("database_urls", option)

            if (self.config.has_option("ncbi", option)
                    and self.config.get("ncbi", option) is not None):
                # Respect a path the user already configured.
                outfpath = self.config.get("ncbi", option)
            else:
                outfname = os.path.basename(ftp_fullpath)
                outfpath = os.path.join(self.ncbi_dir, outfname)

            logger.debug(f"UPDATE: (ncbi,{option}): {outfpath}")
            self.config.set("ncbi", option, outfpath)

            if self.dryrun:
                # Keep iterating so that *every* requested option is recorded
                # in the config; returning here would stop after the first.
                continue

            # Only the first occurrence (the url scheme) is swapped to rsync.
            rsync_fpath = ftp_fullpath.replace("ftp", "rsync", 1)
            cmd = ["rsync", "--quiet", "--archive", rsync_fpath, outfpath]
            logger.debug(f"starting {option} download")
            subprocess.run(cmd,
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL,
                           check=True)
            # Write the local md5 next to the file, then compare it with the
            # remote md5 to confirm the transfer was not corrupted.
            checksum_outfpath = f"{outfpath}.md5"
            write_checksum(outfpath, checksum_outfpath)
            current_checksum = read_checksum(checksum_outfpath)
            current_hash, __ = current_checksum.split()
            remote_checksum = self.get_remote_checksum("ncbi", option)
            remote_hash, __ = remote_checksum.split()
            if current_hash != remote_hash:
                raise ChecksumMismatchError(f"{option} download failed")

        if self.dryrun:
            # Nothing was downloaded, so there is nothing to extract or format.
            return
        if "taxdump" in options:
            self.extract_taxdump()
        if "nr" in options:
            self.format_nr()
Пример #4
0
    def format_nr(self) -> None:
        """Construct a diamond formatted database (nr.dmnd) from `nr` option
        in `ncbi` section in user config.

        Three checksum comparisons are evaluated in order (nr.dmnd.md5,
        nr.gz.md5, remote nr.gz.md5). If any of them matches, the existing
        database is kept and the config is pointed at it. Otherwise the
        database is (re)built, but only when `self.update` is truthy.

        NOTE: The checksum 'nr.dmnd.md5' will only be generated if nr.dmnd
        construction is successful. If the provided `nr` option in `ncbi` is
        'nr.gz' the database will be removed after successful database
        formatting.

        Returns
        -------
        NoneType
            config updated option:'nr' in section:'ncbi'.

        """
        db_infpath = self.config.get("ncbi", "nr")
        db_infpath_md5 = f"{db_infpath}.md5"
        # Swap only a trailing '.gz' for '.dmnd'. A blanket str.replace would
        # also rewrite any '.gz' found in a directory name along the path.
        # Paths that do not end in '.gz' are left unchanged.
        gz_suffix = ".gz"
        if db_infpath.endswith(gz_suffix):
            db_outfpath = f"{db_infpath[:-len(gz_suffix)]}.dmnd"
        else:
            db_outfpath = db_infpath

        db_outfpath_exists = os.path.exists(db_outfpath)
        if db_outfpath_exists:
            # Only bound when the database exists; every later use is guarded
            # by `db_outfpath_exists`.
            db_outfpath_hash, __ = calc_checksum(db_outfpath).split()

        remote_checksum_matches = False
        current_nr_checksum_matches = False
        # Check database and database checksum is up-to-date
        if os.path.exists(db_infpath_md5) and db_outfpath_exists:
            current_hash, __ = read_checksum(db_infpath_md5).split()
            remote_hash, __ = self.get_remote_checksum("ncbi", "nr").split()
            # Is the current db md5 up-to-date with the remote db md5?
            remote_checksum_matches = remote_hash == current_hash
            # Does the current db md5 match the calc'd db checksum?
            current_nr_checksum_matches = db_outfpath_hash == current_hash

        db_outfpath_md5 = f"{db_outfpath}.md5"
        db_outfpath_md5_checksum_matches = False
        if os.path.exists(db_outfpath_md5) and db_outfpath_exists:
            db_outfpath_md5_hash, __ = read_checksum(db_outfpath_md5).split()
            db_outfpath_md5_checksum_matches = (
                db_outfpath_hash == db_outfpath_md5_hash)

        checksum_checks = ["nr.dmnd.md5", "nr.gz.md5", "remote nr.gz.md5"]
        checksum_matches = [
            db_outfpath_md5_checksum_matches,
            current_nr_checksum_matches,
            remote_checksum_matches,
        ]
        for checksum_match, checksum_check in zip(checksum_matches,
                                                  checksum_checks):
            # The first matching checksum means the database can be kept.
            if checksum_match:
                logger.debug(f"{checksum_check} checksum matches, skipping...")
                self.config.set("ncbi", "nr", db_outfpath)
                logger.debug(f"set ncbi nr: {db_outfpath}")
                return
            # Only update out-of-date db files if user wants to update via self.update
            # NOTE(review): when no check matched and `self.update` is falsy
            # this returns without building, even if nr.dmnd does not exist
            # yet, and leaves the config untouched -- confirm this is intended.
            if not self.update and checksum_check == "remote nr.gz.md5":
                return

        diamond.makedatabase(fasta=db_infpath,
                             database=db_outfpath,
                             nproc=self.nproc)
        # Write checksum for nr.dmnd
        write_checksum(db_outfpath, db_outfpath_md5)

        if os.path.basename(db_infpath) == "nr.gz":
            # nr.gz will be removed after successful nr.dmnd construction
            os.remove(db_infpath)

        self.config.set("ncbi", "nr", db_outfpath)
        logger.debug(f"set ncbi nr: {db_outfpath}")