Example #1
    def __init__(self):
        self.logger = SimpleLogger()
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.db_repositories = {}
Example #2
    def __init__(self):
        self.logger = SimpleLogger()
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.cverepo_store = CveRepoStore()
        self.repos = set()
        self.db_lastmodified = {}
Example #3
    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.certs_tmp_directory = None
        self.certs_files = {}
Example #4
    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.cverepo_store = CveRepoStore()
        self.repos = set()
        self.db_lastmodified = {}
        self.year_since = int(os.getenv('YEAR_SINCE', DEFAULT_YEAR_SINCE))
Example #5
class RepositoryController:
    """
    Class for importing/syncing a set of repositories into the DB.
    First, repomd files from all repositories are downloaded and parsed.
    Second, primary and updateinfo repodata from repositories needing an update are downloaded, parsed and imported.
    """
    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.certs_tmp_directory = None
        self.certs_files = {}

    def _get_certs_tuple(self, name):
        if name in self.certs_files:
            return self.certs_files[name]["ca_cert"], self.certs_files[name]["cert"], self.certs_files[name]["key"]
        return None, None, None

    def _download_repomds(self):
        download_items = []
        for repository in self.repositories:
            repomd_url = urljoin(repository.repo_url, REPOMD_PATH)
            repository.tmp_directory = tempfile.mkdtemp(prefix="repo-")
            ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
            item = DownloadItem(
                source_url=repomd_url,
                target_path=os.path.join(repository.tmp_directory, "repomd.xml"),
                ca_cert=ca_cert,
                cert=cert,
                key=key
            )
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _read_repomds(self):
        """Reads all downloaded repomd files. Checks if their download failed and checks if their metadata are
           newer than metadata currently in DB.
        """
        # Fetch current list of repositories from DB
        db_repositories = self.repo_store.list_repositories()
        for repository in self.repositories:
            repomd_path = os.path.join(repository.tmp_directory, "repomd.xml")
            repomd = RepoMD(repomd_path)
            # Was repository already synced before?
            repository_key = (repository.content_set, repository.basearch, repository.releasever)
            if repository_key in db_repositories:
                db_revision = db_repositories[repository_key]["revision"]
            else:
                db_revision = None
            downloaded_revision = repomd.get_revision()
            # Repository is synced for the first time or has newer revision
            if db_revision is None or downloaded_revision > db_revision:
                repository.repomd = repomd
            else:
                self.logger.info("Downloaded repo %s (%s) is not newer than repo in DB (%s).",
                                 ", ".join(filter(None, repository_key)), str(downloaded_revision), str(db_revision))

    def _repo_download_failed(self, repo, failed_items):
        failed = False
        for md_path in list(repo.md_files.values()) + [REPOMD_PATH]:
            local_path = os.path.join(repo.tmp_directory, os.path.basename(md_path))
            if local_path in failed_items:
                failed = True
                self.logger.warning("Download failed: %s (HTTP CODE %d)", urljoin(repo.repo_url, md_path),
                                    failed_items[local_path])
        return failed

    def _download_metadata(self, batch):
        download_items = []
        for repository in batch:
            # primary_db has higher priority, use primary.xml if not found
            try:
                repository.md_files["primary_db"] = repository.repomd.get_metadata("primary_db")["location"]
            except RepoMDTypeNotFound:
                repository.md_files["primary"] = repository.repomd.get_metadata("primary")["location"]
            # updateinfo.xml may be missing completely
            try:
                repository.md_files["updateinfo"] = repository.repomd.get_metadata("updateinfo")["location"]
            except RepoMDTypeNotFound:
                pass
            try:
                repository.md_files["modules"] = repository.repomd.get_metadata("modules")["location"]
            except RepoMDTypeNotFound:
                pass

            # queue metadata files for download
            for md_location in repository.md_files.values():
                ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
                item = DownloadItem(
                    source_url=urljoin(repository.repo_url, md_location),
                    target_path=os.path.join(repository.tmp_directory, os.path.basename(md_location)),
                    ca_cert=ca_cert,
                    cert=cert,
                    key=key
                )
                download_items.append(item)
                self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _unpack_metadata(self, batch):
        for repository in batch:
            for md_type in repository.md_files:
                self.unpacker.add(os.path.join(repository.tmp_directory,
                                               os.path.basename(repository.md_files[md_type])))
                # FIXME: this should be done in different place?
                repository.md_files[md_type] = os.path.join(
                    repository.tmp_directory,
                    os.path.basename(repository.md_files[md_type])).rsplit(".", maxsplit=1)[0]
        self.unpacker.run()

    def clean_repodata(self, batch):
        """Clean downloaded repodata of all repositories in batch."""
        for repository in batch:
            if repository.tmp_directory:
                shutil.rmtree(repository.tmp_directory)
                repository.tmp_directory = None
            self.repositories.remove(repository)

    def _clean_certificate_cache(self):
        if self.certs_tmp_directory:
            shutil.rmtree(self.certs_tmp_directory)
            self.certs_tmp_directory = None
            self.certs_files = {}

    def add_db_repositories(self):
        """Queue all previously imported repositories."""
        repos = self.repo_store.list_repositories()
        for (content_set, basearch, releasever), repo_dict in repos.items():
            # Reference content_set_label -> content set id
            self.repo_store.content_set_to_db_id[content_set] = repo_dict["content_set_id"]
            self.repositories.add(Repository(repo_dict["url"], content_set, basearch, releasever,
                                             cert_name=repo_dict["cert_name"], ca_cert=repo_dict["ca_cert"],
                                             cert=repo_dict["cert"], key=repo_dict["key"]))

    def add_repository(self, repo_url, content_set, basearch, releasever,
                       cert_name=None, ca_cert=None, cert=None, key=None):
        """Queue repository to import/check updates."""
        repo_url = repo_url.strip()
        if not repo_url.endswith("/"):
            repo_url += "/"
        self.repositories.add(Repository(repo_url, content_set, basearch, releasever, cert_name=cert_name,
                                         ca_cert=ca_cert, cert=cert, key=key))

    def _write_certificate_cache(self):
        certs = {}
        for repository in self.repositories:
            if repository.cert_name:
                certs[repository.cert_name] = {"ca_cert": repository.ca_cert, "cert": repository.cert,
                                               "key": repository.key}
        if certs:
            self.certs_tmp_directory = tempfile.mkdtemp(prefix="certs-")
            for cert_name in certs:
                self.certs_files[cert_name] = {}
                for cert_type in ["ca_cert", "cert", "key"]:
                    # Cert is not None
                    if certs[cert_name][cert_type]:
                        cert_path = os.path.join(self.certs_tmp_directory, "%s.%s" % (cert_name, cert_type))
                        with open(cert_path, "w") as cert_file:
                            cert_file.write(certs[cert_name][cert_type])
                        self.certs_files[cert_name][cert_type] = cert_path
                    else:
                        self.certs_files[cert_name][cert_type] = None

    def _find_content_sets_by_regex(self, content_set_regex):
        if not content_set_regex.startswith('^'):
            content_set_regex = '^' + content_set_regex

        if not content_set_regex.endswith('$'):
            content_set_regex = content_set_regex + '$'

        return [content_set_label for content_set_label in self.repo_store.content_set_to_db_id
                if re.match(content_set_regex, content_set_label)]

    def delete_content_set(self, content_set_regex):
        """Deletes content sets described by given regex from DB."""
        for content_set_label in self._find_content_sets_by_regex(content_set_regex):
            self.logger.info("Deleting content set: %s", content_set_label)
            self.repo_store.delete_content_set(content_set_label)
        self.repo_store.cleanup_unused_data()

    def import_repositories(self):
        """Create or update repository records in the DB."""
        self.logger.info("Importing %d repositories.", len(self.repositories))
        for repository in self.repositories:
            self.repo_store.import_repository(repository)

    def store(self):
        """Sync all queued repositories. Process repositories in batches due to disk space and memory usage."""
        self.logger.info("Checking %d repositories.", len(self.repositories))

        self._write_certificate_cache()

        # Download all repomd files first
        failed = self._download_repomds()
        if failed:
            self.logger.warning("%d repomd.xml files failed to download.", len(failed))
            failed_repos = [repo for repo in self.repositories if self._repo_download_failed(repo, failed)]
            self.clean_repodata(failed_repos)

        self._read_repomds()
        # Filter all repositories without repomd attribute set (failed download, downloaded repomd is not newer)
        batches = BatchList()
        to_skip = []
        for repository in self.repositories:
            if repository.repomd:
                batches.add_item(repository)
            else:
                to_skip.append(repository)
        self.clean_repodata(to_skip)
        self.logger.info("%d repositories skipped.", len(to_skip))
        self.logger.info("Syncing %d repositories.", sum(len(l) for l in batches))

        # Download and process repositories in batches (unpacked metadata files can consume a lot of disk space)
        for batch in batches:
            failed = self._download_metadata(batch)
            if failed:
                self.logger.warning("%d metadata files failed to download.", len(failed))
                failed_repos = [repo for repo in batch if self._repo_download_failed(repo, failed)]
                self.clean_repodata(failed_repos)
                batch = [repo for repo in batch if repo not in failed_repos]
            self._unpack_metadata(batch)
            for repository in batch:
                repository.load_metadata()
                self.repo_store.store(repository)
                repository.unload_metadata()
            self.clean_repodata(batch)

        self.repo_store.cleanup_unused_data()
        self._clean_certificate_cache()
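
A minimal driving sequence for this controller, as an illustrative sketch only: the repository URL, content set label, basearch and releasever below are hypothetical placeholders, and the Repository/RepositoryStore dependencies are assumed to be importable from the surrounding project.

controller = RepositoryController()
# Re-queue everything already imported into the DB
controller.add_db_repositories()
# Register one new repository (hypothetical URL and content set label)
controller.add_repository("https://cdn.example.com/content/dist/rhel/8/x86_64/baseos/os/",
                          "example-baseos-rpms", "x86_64", "8")
controller.import_repositories()  # create/update repository records in the DB
controller.store()                # download repomd files and sync changed repositories in batches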
Example #6
class RepositoryController:
    """
    Class for importing/syncing a set of repositories into the DB.
    First, repomd files from all repositories are downloaded and parsed.
    Second, primary and updateinfo repodata from repositories needing an update are downloaded, parsed and imported.
    """

    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.certs_tmp_directory = None
        self.certs_files = {}

    def _get_certs_tuple(self, name):
        if name in self.certs_files:
            return self.certs_files[name]["ca_cert"], self.certs_files[name]["cert"], self.certs_files[name]["key"]
        return None, None, None

    def _download_repomds(self):
        download_items = []
        certs_tmp_dict = {}
        for repository in self.repositories:
            repomd_url = urljoin(repository.repo_url, REPOMD_PATH)
            repository.tmp_directory = tempfile.mkdtemp(prefix="repo-")
            ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
            # Check certificate expiration date
            if repository.cert_name:
                certs_tmp_dict[repository.cert_name] = cert

            item = DownloadItem(
                source_url=repomd_url,
                target_path=os.path.join(repository.tmp_directory, "repomd.xml"),
                ca_cert=ca_cert,
                cert=cert,
                key=key
            )
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)

        for cert_name, cert in certs_tmp_dict.items():
            self._check_cert_expiration_date(cert_name, cert)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _check_cert_expiration_date(self, cert_name, cert):
        try:
            # Load certificate
            loaded_cert = crypto.load_certificate(crypto.FILETYPE_PEM, cert)
            # Get expiration date and parse it to datetime object
            valid_to_dt = datetime.strptime(loaded_cert.get_notAfter().decode("utf-8"), "%Y%m%d%H%M%SZ")
            expire_in_days_td = (valid_to_dt - datetime.utcnow()).days
            expire_tuple = (valid_to_dt, expire_in_days_td)
            if 30 >= expire_in_days_td > 0:
                self.logger.warning('Certificate %s will expire in %s days.', cert_name, expire_in_days_td)
                msg = prepare_msg_for_slack(cert_name, 'Reposcan CDN certificate will expire soon', expire_tuple)
                send_slack_notification(msg)
            elif expire_in_days_td <= 0:
                self.logger.warning('Certificate %s expired!', cert_name)
                msg = prepare_msg_for_slack(cert_name, 'Reposcan CDN certificate expired', expire_tuple)
                send_slack_notification(msg)
        except crypto.Error:
            self.logger.warning('Certificate not provided or incorrect: %s', cert_name if cert_name else 'None')
            msg = prepare_msg_for_slack(cert_name, 'Reposcan CDN certificate not provided or incorrect')
            send_slack_notification(msg)

    def _read_repomds(self):
        """Reads all downloaded repomd files. Checks if their download failed and checks if their metadata are
           newer than metadata currently in DB.
        """
        # Fetch current list of repositories from DB
        db_repositories = self.repo_store.list_repositories()
        for repository in self.repositories:
            repomd_path = os.path.join(repository.tmp_directory, "repomd.xml")
            repomd = RepoMD(repomd_path)
            # Was repository already synced before?
            repository_key = (repository.content_set, repository.basearch, repository.releasever)
            if repository_key in db_repositories:
                db_revision = db_repositories[repository_key]["revision"]
            else:
                db_revision = None
            downloaded_revision = repomd.get_revision()
            # Repository is synced for the first time or has newer revision
            if db_revision is None or downloaded_revision > db_revision:
                repository.repomd = repomd
            else:
                self.logger.debug("Downloaded repo %s (%s) is not newer than repo in DB (%s).",
                                  ", ".join(filter(None, repository_key)), str(downloaded_revision), str(db_revision))

    def _repo_download_failed(self, repo, failed_items):
        failed = False
        for md_path in list(repo.md_files.values()) + [REPOMD_PATH]:
            local_path = os.path.join(repo.tmp_directory, os.path.basename(md_path))
            if local_path in failed_items:
                failed = True
                # Download errors with no HTTP code are logged in downloader, deduplicate error msgs
                if failed_items[local_path] > 0:
                    self.logger.warning("Download failed: LABEL: %s URL: %s (HTTP CODE %d)",
                                        repo.content_set, urljoin(repo.repo_url, md_path),
                                        failed_items[local_path])
                    FAILED_REPO_WITH_HTTP_CODE.labels(failed_items[local_path]).inc()
        return failed

    def _download_metadata(self, batch):
        download_items = []
        for repository in batch:
            # primary_db has higher priority, use primary.xml if not found
            try:
                repository.md_files["primary_db"] = repository.repomd.get_metadata("primary_db")["location"]
            except RepoMDTypeNotFound:
                repository.md_files["primary"] = repository.repomd.get_metadata("primary")["location"]
            # updateinfo.xml may be missing completely
            try:
                repository.md_files["updateinfo"] = repository.repomd.get_metadata("updateinfo")["location"]
            except RepoMDTypeNotFound:
                pass
            try:
                repository.md_files["modules"] = repository.repomd.get_metadata("modules")["location"]
            except RepoMDTypeNotFound:
                pass

            # queue metadata files for download
            for md_location in repository.md_files.values():
                ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
                item = DownloadItem(
                    source_url=urljoin(repository.repo_url, md_location),
                    target_path=os.path.join(repository.tmp_directory, os.path.basename(md_location)),
                    ca_cert=ca_cert,
                    cert=cert,
                    key=key
                )
                download_items.append(item)
                self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _unpack_metadata(self, batch):
        for repository in batch:
            for md_type in repository.md_files:
                self.unpacker.add(os.path.join(repository.tmp_directory,
                                               os.path.basename(repository.md_files[md_type])))
                # FIXME: this should be done in different place?
                repository.md_files[md_type] = os.path.join(
                    repository.tmp_directory,
                    os.path.basename(repository.md_files[md_type])).rsplit(".", maxsplit=1)[0]
        self.unpacker.run()

    def clean_repodata(self, batch):
        """Clean downloaded repodata of all repositories in batch."""
        for repository in batch:
            if repository.tmp_directory:
                shutil.rmtree(repository.tmp_directory)
                repository.tmp_directory = None
            self.repositories.remove(repository)

    def _clean_certificate_cache(self):
        if self.certs_tmp_directory:
            shutil.rmtree(self.certs_tmp_directory)
            self.certs_tmp_directory = None
            self.certs_files = {}

    def add_db_repositories(self):
        """Queue all previously imported repositories."""
        repos = self.repo_store.list_repositories()
        for (content_set, basearch, releasever), repo_dict in repos.items():
            # Reference content_set_label -> content set id
            self.repo_store.content_set_to_db_id[content_set] = repo_dict["content_set_id"]
            self.repositories.add(Repository(repo_dict["url"], content_set, basearch, releasever,
                                             cert_name=repo_dict["cert_name"], ca_cert=repo_dict["ca_cert"],
                                             cert=repo_dict["cert"], key=repo_dict["key"]))

    def add_repository(self, repo_url, content_set, basearch, releasever,
                       cert_name=None, ca_cert=None, cert=None, key=None):
        """Queue repository to import/check updates."""
        repo_url = repo_url.strip()
        if not repo_url.endswith("/"):
            repo_url += "/"
        self.repositories.add(Repository(repo_url, content_set, basearch, releasever, cert_name=cert_name,
                                         ca_cert=ca_cert, cert=cert, key=key))

    def _write_certificate_cache(self):
        certs = {}
        for repository in self.repositories:
            if repository.cert_name:
                certs[repository.cert_name] = {"ca_cert": repository.ca_cert, "cert": repository.cert,
                                               "key": repository.key}
        if certs:
            self.certs_tmp_directory = tempfile.mkdtemp(prefix="certs-")
            for cert_name in certs:
                self.certs_files[cert_name] = {}
                for cert_type in ["ca_cert", "cert", "key"]:
                    # Cert is not None
                    if certs[cert_name][cert_type]:
                        cert_path = os.path.join(self.certs_tmp_directory, "%s.%s" % (cert_name, cert_type))
                        with open(cert_path, "w") as cert_file:
                            cert_file.write(certs[cert_name][cert_type])
                        self.certs_files[cert_name][cert_type] = cert_path
                    else:
                        self.certs_files[cert_name][cert_type] = None

    def _find_content_sets_by_regex(self, content_set_regex):
        if not content_set_regex.startswith('^'):
            content_set_regex = '^' + content_set_regex

        if not content_set_regex.endswith('$'):
            content_set_regex = content_set_regex + '$'

        return [content_set_label for content_set_label in self.repo_store.content_set_to_db_id
                if re.match(content_set_regex, content_set_label)]

    def delete_content_set(self, content_set_regex):
        """Deletes content sets described by given regex from DB."""
        for content_set_label in self._find_content_sets_by_regex(content_set_regex):
            self.logger.info("Deleting content set: %s", content_set_label)
            self.repo_store.delete_content_set(content_set_label)
        self.repo_store.cleanup_unused_data()

    def import_repositories(self):
        """Create or update repository records in the DB."""
        self.logger.info("Importing %d repositories.", len(self.repositories))
        failures = 0
        for repository in self.repositories:
            try:
                self.repo_store.import_repository(repository)
            except Exception:  # pylint: disable=broad-except
                failures += 1
        if failures > 0:
            self.logger.warning("Failed to import %d repositories.", failures)
            FAILED_IMPORT_REPO.inc(failures)

    def store(self):
        """Sync all queued repositories. Process repositories in batches due to disk space and memory usage."""
        self.logger.info("Checking %d repositories.", len(self.repositories))

        self._write_certificate_cache()

        # Download all repomd files first
        failed = self._download_repomds()
        if failed:
            FAILED_REPOMD.inc(len(failed))
            failed_repos = [repo for repo in self.repositories if self._repo_download_failed(repo, failed)]
            self.logger.warning("%d repomd.xml files failed to download.", len(failed))
            self.clean_repodata(failed_repos)

        self._read_repomds()
        # Filter all repositories without repomd attribute set (downloaded repomd is not newer)
        batches = BatchList()
        up_to_date = []

        def md_size(repomd, data_type):
            try:
                mdata = repomd.get_metadata(data_type)
                # open-size is not present for uncompressed files
                return int(mdata.get('size', 0)) + int(mdata.get('open-size', '0'))
            except RepoMDTypeNotFound:
                return 0

        for repository in self.repositories:
            if repository.repomd:

                repo_size = md_size(repository.repomd, 'primary_db')
                # If we use primary_db, we don't even download primary data xml
                if repo_size == 0:
                    repo_size += md_size(repository.repomd, 'primary')

                repo_size += md_size(repository.repomd, 'updateinfo')
                repo_size += md_size(repository.repomd, 'modules')

                batches.add_item(repository, repo_size)
            else:
                up_to_date.append(repository)

        self.clean_repodata(up_to_date)
        self.logger.info("%d repositories are up to date.", len(up_to_date))
        total_repositories = batches.get_total_items()
        completed_repositories = 0
        self.logger.info("%d repositories need to be synced.", total_repositories)

        # Download and process repositories in batches (unpacked metadata files can consume a lot of disk space)
        try:
            for batch in batches:
                self.logger.info("Syncing a batch of %d repositories", len(batch))
                try:
                    failed = self._download_metadata(batch)
                    if failed:
                        self.logger.warning("%d metadata files failed to download.", len(failed))
                        failed_repos = [repo for repo in batch if self._repo_download_failed(repo, failed)]
                        self.clean_repodata(failed_repos)
                        batch = [repo for repo in batch if repo not in failed_repos]
                    self._unpack_metadata(batch)
                    for repository in batch:
                        repository.load_metadata()
                        completed_repositories += 1
                        self.logger.info("Syncing repository: %s [%s/%s]", ", ".join(
                            filter(None, (repository.content_set, repository.basearch, repository.releasever))),
                                         completed_repositories, total_repositories)
                        self.repo_store.store(repository)
                        repository.unload_metadata()
                finally:
                    self.clean_repodata(batch)
        finally:
            self.repo_store.cleanup_unused_data()
            self._clean_certificate_cache()
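
The certificate expiration check in this variant builds on pyOpenSSL; a small self-contained sketch of the same date arithmetic, assuming the pyOpenSSL package is available and with the PEM string left as a placeholder:

from datetime import datetime

from OpenSSL import crypto

def days_until_expiry(pem_cert):
    """Return how many days remain before the PEM certificate expires (negative if already expired)."""
    loaded_cert = crypto.load_certificate(crypto.FILETYPE_PEM, pem_cert)
    # get_notAfter() returns bytes such as b"20301231235959Z"
    valid_to_dt = datetime.strptime(loaded_cert.get_notAfter().decode("utf-8"), "%Y%m%d%H%M%SZ")
    return (valid_to_dt - datetime.utcnow()).days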
Example #7
class RepositoryController:
    """
    Class for importing/syncing a set of repositories into the DB.
    First, repomd files from all repositories are downloaded and parsed.
    Second, primary and updateinfo repodata from repositories needing an update are downloaded, parsed and imported.
    """
    def __init__(self):
        self.logger = SimpleLogger()
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.db_repositories = {}

    def _download_repomds(self):
        download_items = []
        for repository in self.repositories:
            repomd_url = urljoin(repository.repo_url, REPOMD_PATH)
            repository.tmp_directory = tempfile.mkdtemp(prefix="repo-")
            item = DownloadItem(
                source_url=repomd_url,
                target_path=os.path.join(repository.tmp_directory, "repomd.xml")
            )
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _read_repomds(self, failed):
        """Reads all downloaded repomd files. Checks if their download failed and checks if their metadata are
           newer than metadata currently in DB.
        """
        for repository in self.repositories:
            repomd_path = os.path.join(repository.tmp_directory, "repomd.xml")
            if repomd_path not in failed:
                repomd = RepoMD(repomd_path)
                # Was repository already synced before?
                if repository.repo_url in self.db_repositories:
                    db_revision = self.db_repositories[repository.repo_url]["revision"]
                else:
                    db_revision = None
                downloaded_revision = datetime.fromtimestamp(repomd.get_revision(), tz=timezone.utc)
                # Repository is synced for the first time or has newer revision
                if db_revision is None or downloaded_revision > db_revision:
                    repository.repomd = repomd
                else:
                    self.logger.log("Downloaded repo %s (%s) is not newer than repo in DB (%s)." %
                                    (repository.repo_url, str(downloaded_revision), str(db_revision)))
            else:
                self.logger.log("Download failed: %s (HTTP CODE %d)" % (urljoin(repository.repo_url, REPOMD_PATH),
                                                                        failed[repomd_path]))

    def _download_metadata(self, batch):
        for repository in batch:
            # primary_db has higher priority, use primary.xml if not found
            try:
                repository.md_files["primary_db"] = repository.repomd.get_metadata("primary_db")["location"]
            except RepoMDTypeNotFound:
                repository.md_files["primary"] = repository.repomd.get_metadata("primary")["location"]
            # updateinfo.xml may be missing completely
            try:
                repository.md_files["updateinfo"] = repository.repomd.get_metadata("updateinfo")["location"]
            except RepoMDTypeNotFound:
                pass

            # queue metadata files for download
            for md_location in repository.md_files.values():
                self.downloader.add(DownloadItem(
                    source_url=urljoin(repository.repo_url, md_location),
                    target_path=os.path.join(repository.tmp_directory, os.path.basename(md_location))
                ))
        self.downloader.run()

    def _unpack_metadata(self, batch):
        for repository in batch:
            for md_type in repository.md_files:
                self.unpacker.add(os.path.join(repository.tmp_directory,
                                               os.path.basename(repository.md_files[md_type])))
                # FIXME: this should be done in different place?
                repository.md_files[md_type] = os.path.join(
                    repository.tmp_directory,
                    os.path.basename(repository.md_files[md_type])).rsplit(".", maxsplit=1)[0]
        self.unpacker.run()

    def clean_repodata(self, batch):
        """Clean downloaded repodata of all repositories in batch."""
        for repository in batch:
            if repository.tmp_directory:
                shutil.rmtree(repository.tmp_directory)
                repository.tmp_directory = None
            self.repositories.remove(repository)

    def add_repository(self, repo_url):
        """Queue repository to import/check updates."""
        repo_url = repo_url.strip()
        if not repo_url.endswith("/"):
            repo_url += "/"
        self.repositories.add(Repository(repo_url))

    def store(self):
        """Sync all queued repositories. Process repositories in batches due to disk space and memory usage."""
        self.logger.log("Checking %d repositories." % len(self.repositories))

        # Fetch current list of repositories from DB
        self.db_repositories = self.repo_store.list_repositories()

        # Download all repomd files first
        failed = self._download_repomds()
        self.logger.log("%d repomd.xml files failed to download." % len(failed))
        self._read_repomds(failed)

        # Filter all repositories without repomd attribute set (failed download, downloaded repomd is not newer)
        batches = BatchList()
        to_skip = []
        for repository in self.repositories:
            if repository.repomd:
                batches.add_item(repository)
            else:
                to_skip.append(repository)
        self.clean_repodata(to_skip)
        self.logger.log("%d repositories skipped." % len(to_skip))
        self.logger.log("Syncing %d repositories." % sum(len(l) for l in batches))

        # Download and process repositories in batches (unpacked metadata files can consume a lot of disk space)
        for batch in batches:
            self._download_metadata(batch)
            self._unpack_metadata(batch)
            for repository in batch:
                repository.load_metadata()
                self.repo_store.store(repository)
                repository.unload_metadata()
            self.clean_repodata(batch)
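
BatchList itself is not shown in these examples; judging only from how it is used here (add_item() queues a repository and iteration yields lists of repositories), a minimal stand-in could look like the following, with the batch size being purely an assumption:

class BatchList:
    """Hypothetical minimal BatchList: groups queued items into fixed-size batches."""

    def __init__(self, batch_size=50):  # the default batch size is an assumption
        self.batch_size = batch_size
        self.batches = []

    def add_item(self, item):
        # Start a new batch when there is none yet or the current one is full
        if not self.batches or len(self.batches[-1]) >= self.batch_size:
            self.batches.append([])
        self.batches[-1].append(item)

    def __iter__(self):
        return iter(self.batches)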
Example #8
class CveRepoController:
    """
    Controls import/sync of CVE lists into the DB.
    """
    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.cverepo_store = CveRepoStore()
        self.repos = set()
        self.db_lastmodified = {}
        self.year_since = int(os.getenv('YEAR_SINCE', DEFAULT_YEAR_SINCE))

    def _download_meta(self):
        download_items = []
        for repo in self.repos:
            repo.tmp_directory = tempfile.mkdtemp(prefix="cverepo-")
            item = DownloadItem(source_url=repo.meta_url(),
                                target_path=repo.meta_tmp())
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {
            item.target_path: item.status_code
            for item in download_items
            if item.status_code not in VALID_HTTP_CODES
        }

    def _read_meta(self, failed):
        """Reads downloaded meta files and checks for updates."""
        for repo in self.repos:
            meta_path = repo.meta_tmp()
            if meta_path not in failed:
                meta = CveMeta(meta_path)
                # already synced before?
                db_lastmodified = parse_datetime(
                    self.db_lastmodified.get(repo.label, None))
                meta_lastmodified = parse_datetime(meta.get_lastmodified())
                # synced for the first time or has newer revision
                if (db_lastmodified is None or meta_lastmodified is None
                        or meta_lastmodified > db_lastmodified):
                    repo.meta = meta
                else:
                    self.logger.info(
                        "Cve list '%s' has not been updated (since %s).",
                        repo.label, str(db_lastmodified))
            else:
                FAILED_NIST.inc()
                self.logger.warning("Download failed: %s (HTTP CODE %d)",
                                    repo.meta_url(), failed[meta_path])

    def _download_json(self, batch):
        for repo in batch:
            self.downloader.add(
                DownloadItem(source_url=repo.json_url(),
                             target_path=repo.json_tmpgz()))
        self.downloader.run()

    def _unpack_json(self, batch):
        for repo in batch:
            self.unpacker.add(repo.json_tmpgz())
        self.unpacker.run()

    def clean_repo(self, batch):
        """Clean downloaded files for given batch."""
        for repo in batch:
            if repo.tmp_directory:
                shutil.rmtree(repo.tmp_directory)
                repo.tmp_directory = None
            self.repos.remove(repo)

    def add_repos(self):
        """Generate urls for CVE lists to download."""
        # Fetch current list of repositories from DB
        self.db_lastmodified = self.cverepo_store.list_lastmodified()

        # CVE files for single years should be used only for initial load
        labels = [
            str(y) for y in range(self.year_since,
                                  int(time.strftime("%Y")) + 1)
        ]
        for label in labels:
            if label not in self.db_lastmodified:
                self.repos.add(CveRepo(label))

        # always import incremental changes
        labels = ['recent', 'modified']
        for label in labels:
            self.repos.add(CveRepo(label))

    def store(self):
        """Sync all queued CVE lists. Runs in batches due to disk space and memory usage."""
        self.logger.info("Checking %d CVE lists.", len(self.repos))

        # Download all meta files first
        failed = self._download_meta()
        if failed:
            FAILED_NIST.inc()
            self.logger.warning("%d meta files failed to download.",
                                len(failed))
        self._read_meta(failed)

        # filter out failed / unchanged lists
        batches = BatchList()
        to_skip = []
        for repo in self.repos:
            if repo.meta:
                batches.add_item(repo)
            else:
                to_skip.append(repo)
        self.clean_repo(to_skip)
        self.logger.info("%d CVE lists skipped.", len(to_skip))
        self.logger.info("Syncing %d CVE lists.", sum(len(l) for l in batches))

        # Download and process repositories in batches (unpacked metadata files can consume a lot of disk space)
        for batch in batches:
            try:
                self._download_json(batch)
                self._unpack_json(batch)
                for repo in sorted(batch, key=lambda repo: repo.label):
                    repo.load_json()
                    self.cverepo_store.store(repo)
                    repo.unload_json()
            finally:
                self.clean_repo(batch)
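
A short usage sketch for the CVE controller above, assuming the YEAR_SINCE environment variable (or DEFAULT_YEAR_SINCE) determines how far back the initial load reaches:

controller = CveRepoController()
controller.add_repos()   # queue per-year lists missing from the DB plus 'recent' and 'modified'
controller.store()       # download meta files, then sync changed CVE lists in batches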