Example No. 1
class CpeController:
    """
    Controls import/sync of CPE metadata into the DB.
    """
    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.cpe_store = CpeStore()
        self.tmp_directory = tempfile.mkdtemp(prefix="cpe-")

    def _cpe_dict_path(self):
        return os.path.join(self.tmp_directory, 'cpe-dictionary.xml')

    def _repo_mapping_path(self):
        return os.path.join(self.tmp_directory, 'repository-to-cpe.json')

    def _download(self):
        cpe_dict_item = DownloadItem(source_url=CPE_DICT_URL,
                                     target_path=self._cpe_dict_path())
        repo_mapping_item = DownloadItem(source_url=REPO_TO_CPE_URL,
                                         target_path=self._repo_mapping_path())
        download_items = [cpe_dict_item, repo_mapping_item]
        for item in download_items:
            self.downloader.add(item)
        self.downloader.run()
        return {
            item.target_path: item.status_code
            for item in download_items
            if item.status_code not in VALID_HTTP_CODES
        }

    def _load(self):
        cpe_dict = CpeDict(self._cpe_dict_path())
        with open(self._repo_mapping_path(), 'r') as repo_mapping_file:
            repo_mapping = json.load(repo_mapping_file)
        return cpe_dict, repo_mapping

    def clean(self):
        """Clean downloaded files for given batch."""
        if self.tmp_directory:
            shutil.rmtree(self.tmp_directory)
            self.tmp_directory = None

    def store(self):
        """Sync CPE metadata."""
        self.logger.info("Checking CPE metadata.")
        try:
            # Download all files first
            failed = self._download()
            for path in failed:
                FAILED_CPE_METADATA.inc()
                self.logger.warning("Download failed: %s (HTTP CODE %d)", path,
                                    failed[path])

            cpe_dict, repo_mapping = self._load()
            self.cpe_store.store(cpe_dict, repo_mapping)
        finally:
            self.clean()
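A minimal driver sketch for the controller above; it assumes CpeController and its module-level dependencies (get_logger, FileDownloader, CpeStore, CPE_DICT_URL, REPO_TO_CPE_URL) are importable from the same module:

controller = CpeController()
controller.store()   # download, parse and store CPE data; tmp files are cleaned up in the finally block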
Example No. 2
class RepositoryController:
    """
    Class for importing/syncing a set of repositories into the DB.
    First, repomd.xml files from all repositories are downloaded and parsed.
    Second, primary and updateinfo repodata from repositories needing an update are downloaded, parsed and imported.
    """
    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.certs_tmp_directory = None
        self.certs_files = {}

    def _get_certs_tuple(self, name):
        if name in self.certs_files:
            return self.certs_files[name]["ca_cert"], self.certs_files[name]["cert"], self.certs_files[name]["key"]
        return None, None, None

    def _download_repomds(self):
        download_items = []
        for repository in self.repositories:
            repomd_url = urljoin(repository.repo_url, REPOMD_PATH)
            repository.tmp_directory = tempfile.mkdtemp(prefix="repo-")
            ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
            item = DownloadItem(
                source_url=repomd_url,
                target_path=os.path.join(repository.tmp_directory, "repomd.xml"),
                ca_cert=ca_cert,
                cert=cert,
                key=key
            )
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _read_repomds(self):
        """Reads all downloaded repomd files. Checks if their download failed and checks if their metadata are
           newer than metadata currently in DB.
        """
        # Fetch current list of repositories from DB
        db_repositories = self.repo_store.list_repositories()
        for repository in self.repositories:
            repomd_path = os.path.join(repository.tmp_directory, "repomd.xml")
            repomd = RepoMD(repomd_path)
            # Was repository already synced before?
            repository_key = (repository.content_set, repository.basearch, repository.releasever)
            if repository_key in db_repositories:
                db_revision = db_repositories[repository_key]["revision"]
            else:
                db_revision = None
            downloaded_revision = repomd.get_revision()
            # Repository is synced for the first time or has newer revision
            if db_revision is None or downloaded_revision > db_revision:
                repository.repomd = repomd
            else:
                self.logger.info("Downloaded repo %s (%s) is not newer than repo in DB (%s).",
                                 ", ".join(filter(None, repository_key)), str(downloaded_revision), str(db_revision))

    def _repo_download_failed(self, repo, failed_items):
        failed = False
        for md_path in list(repo.md_files.values()) + [REPOMD_PATH]:
            local_path = os.path.join(repo.tmp_directory, os.path.basename(md_path))
            if local_path in failed_items:
                failed = True
                self.logger.warning("Download failed: %s (HTTP CODE %d)", urljoin(repo.repo_url, md_path),
                                    failed_items[local_path])
        return failed

    def _download_metadata(self, batch):
        download_items = []
        for repository in batch:
            # primary_db has higher priority, use primary.xml if not found
            try:
                repository.md_files["primary_db"] = repository.repomd.get_metadata("primary_db")["location"]
            except RepoMDTypeNotFound:
                repository.md_files["primary"] = repository.repomd.get_metadata("primary")["location"]
            # updateinfo.xml may be missing completely
            try:
                repository.md_files["updateinfo"] = repository.repomd.get_metadata("updateinfo")["location"]
            except RepoMDTypeNotFound:
                pass
            try:
                repository.md_files["modules"] = repository.repomd.get_metadata("modules")["location"]
            except RepoMDTypeNotFound:
                pass

            # queue metadata files for download
            for md_location in repository.md_files.values():
                ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
                item = DownloadItem(
                    source_url=urljoin(repository.repo_url, md_location),
                    target_path=os.path.join(repository.tmp_directory, os.path.basename(md_location)),
                    ca_cert=ca_cert,
                    cert=cert,
                    key=key
                )
                download_items.append(item)
                self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _unpack_metadata(self, batch):
        for repository in batch:
            for md_type in repository.md_files:
                self.unpacker.add(os.path.join(repository.tmp_directory,
                                               os.path.basename(repository.md_files[md_type])))
                # FIXME: should this be done in a different place?
                repository.md_files[md_type] = os.path.join(
                    repository.tmp_directory,
                    os.path.basename(repository.md_files[md_type])).rsplit(".", maxsplit=1)[0]
        self.unpacker.run()

    def clean_repodata(self, batch):
        """Clean downloaded repodata of all repositories in batch."""
        for repository in batch:
            if repository.tmp_directory:
                shutil.rmtree(repository.tmp_directory)
                repository.tmp_directory = None
            self.repositories.remove(repository)

    def _clean_certificate_cache(self):
        if self.certs_tmp_directory:
            shutil.rmtree(self.certs_tmp_directory)
            self.certs_tmp_directory = None
            self.certs_files = {}

    def add_db_repositories(self):
        """Queue all previously imported repositories."""
        repos = self.repo_store.list_repositories()
        for (content_set, basearch, releasever), repo_dict in repos.items():
            # Reference content_set_label -> content set id
            self.repo_store.content_set_to_db_id[content_set] = repo_dict["content_set_id"]
            self.repositories.add(Repository(repo_dict["url"], content_set, basearch, releasever,
                                             cert_name=repo_dict["cert_name"], ca_cert=repo_dict["ca_cert"],
                                             cert=repo_dict["cert"], key=repo_dict["key"]))

    def add_repository(self, repo_url, content_set, basearch, releasever,
                       cert_name=None, ca_cert=None, cert=None, key=None):
        """Queue repository to import/check updates."""
        repo_url = repo_url.strip()
        if not repo_url.endswith("/"):
            repo_url += "/"
        self.repositories.add(Repository(repo_url, content_set, basearch, releasever, cert_name=cert_name,
                                         ca_cert=ca_cert, cert=cert, key=key))

    def _write_certificate_cache(self):
        certs = {}
        for repository in self.repositories:
            if repository.cert_name:
                certs[repository.cert_name] = {"ca_cert": repository.ca_cert, "cert": repository.cert,
                                               "key": repository.key}
        if certs:
            self.certs_tmp_directory = tempfile.mkdtemp(prefix="certs-")
            for cert_name in certs:
                self.certs_files[cert_name] = {}
                for cert_type in ["ca_cert", "cert", "key"]:
                    # Cert is not None
                    if certs[cert_name][cert_type]:
                        cert_path = os.path.join(self.certs_tmp_directory, "%s.%s" % (cert_name, cert_type))
                        with open(cert_path, "w") as cert_file:
                            cert_file.write(certs[cert_name][cert_type])
                        self.certs_files[cert_name][cert_type] = cert_path
                    else:
                        self.certs_files[cert_name][cert_type] = None

    def _find_content_sets_by_regex(self, content_set_regex):
        if not content_set_regex.startswith('^'):
            content_set_regex = '^' + content_set_regex

        if not content_set_regex.endswith('$'):
            content_set_regex = content_set_regex + '$'

        return [content_set_label for content_set_label in self.repo_store.content_set_to_db_id
                if re.match(content_set_regex, content_set_label)]

    def delete_content_set(self, content_set_regex):
        """Deletes content sets described by given regex from DB."""
        for content_set_label in self._find_content_sets_by_regex(content_set_regex):
            self.logger.info("Deleting content set: %s", content_set_label)
            self.repo_store.delete_content_set(content_set_label)
        self.repo_store.cleanup_unused_data()

    def import_repositories(self):
        """Create or update repository records in the DB."""
        self.logger.info("Importing %d repositories.", len(self.repositories))
        for repository in self.repositories:
            self.repo_store.import_repository(repository)

    def store(self):
        """Sync all queued repositories. Process repositories in batches due to disk space and memory usage."""
        self.logger.info("Checking %d repositories.", len(self.repositories))

        self._write_certificate_cache()

        # Download all repomd files first
        failed = self._download_repomds()
        if failed:
            self.logger.warning("%d repomd.xml files failed to download.", len(failed))
            failed_repos = [repo for repo in self.repositories if self._repo_download_failed(repo, failed)]
            self.clean_repodata(failed_repos)

        self._read_repomds()
        # Filter all repositories without repomd attribute set (failed download, downloaded repomd is not newer)
        batches = BatchList()
        to_skip = []
        for repository in self.repositories:
            if repository.repomd:
                batches.add_item(repository)
            else:
                to_skip.append(repository)
        self.clean_repodata(to_skip)
        self.logger.info("%d repositories skipped.", len(to_skip))
        self.logger.info("Syncing %d repositories.", sum(len(l) for l in batches))

        # Download and process repositories in batches (unpacked metadata files can consume a lot of disk space)
        for batch in batches:
            failed = self._download_metadata(batch)
            if failed:
                self.logger.warning("%d metadata files failed to download.", len(failed))
                failed_repos = [repo for repo in batch if self._repo_download_failed(repo, failed)]
                self.clean_repodata(failed_repos)
                batch = [repo for repo in batch if repo not in failed_repos]
            self._unpack_metadata(batch)
            for repository in batch:
                repository.load_metadata()
                self.repo_store.store(repository)
                repository.unload_metadata()
            self.clean_repodata(batch)

        self.repo_store.cleanup_unused_data()
        self._clean_certificate_cache()
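A hedged usage sketch for this controller; the repository URL, content set label, basearch and releasever below are made up for illustration:

controller = RepositoryController()
controller.add_db_repositories()   # re-queue repositories already present in the DB
controller.add_repository("https://cdn.example.com/content/os/x86_64/",
                          "example-content-set", "x86_64", "8")
controller.import_repositories()   # create or update repository records in the DB
controller.store()                 # download repomd.xml files, then sync changed repositories in batches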
Example No. 3
class CvemapController:
    """
    Controls import/sync of CVE map into the DB.
    """
    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.cvemap_store = CvemapStore()
        self.updated = False
        self.lastmodified = None
        self.tmp_directory = tempfile.mkdtemp(prefix="cvemap-")

    def _tmp_head(self):
        return os.path.join(self.tmp_directory, 'cvemap.head')

    def _tmp_xml(self):
        return os.path.join(self.tmp_directory, 'cvemap.xml')

    def _download_head(self):
        item = DownloadItem(source_url=URL,
                            target_path=self._tmp_head()
                           )
        download_items = [item]
        self.downloader.add(item)
        self.downloader.run(headers_only=True)
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _read_head(self, failed):
        """Reads the downloaded header file and checks for updates."""
        header_path = self._tmp_head()
        if not failed:
            header = CvemapHead(header_path)

            # already synced before?
            db_lastmodified = parse_datetime(self.cvemap_store.lastmodified())
            self.lastmodified = parse_datetime(header.get_lastmodified())
            # synced for the first time or has newer revision
            if (db_lastmodified is None
                    or self.lastmodified is None
                    or self.lastmodified > db_lastmodified):
                self.updated = True
            else:
                self.logger.info("CVE map has not been updated (since %s).",
                                 str(db_lastmodified))
        else:
            FAILED_CVEMAP.inc()
            self.logger.warning("Download failed: %s (HTTP CODE %d)", URL, failed[header_path])

    def _download_xml(self):
        self.downloader.add(DownloadItem(source_url=URL,
                                         target_path=self._tmp_xml()))
        self.downloader.run()

    def _load_xml(self, lastmodified):
        return CvemapBody(self._tmp_xml(), lastmodified)

    def clean(self):
        """Clean downloaded files for given batch."""
        if self.tmp_directory:
            shutil.rmtree(self.tmp_directory)
            self.tmp_directory = None

    def store(self):
        """Sync CVE map."""
        self.logger.info("Checking CVE map.")

        # Download the cvemap header first
        failed = self._download_head()
        if failed:
            FAILED_CVEMAP.inc()
            self.logger.warning("Cve map failed to download.")
        self._read_head(failed)

        try:
            if self.updated:
                # Download and process cvemap
                self._download_xml()
                cvemap = self._load_xml(self.lastmodified)
                self.cvemap_store.store(cvemap)
        finally:
            self.clean()
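Driving the CVE map sync is a single call, assuming the module-level URL, FAILED_CVEMAP and CvemapStore objects used above are configured; a minimal sketch:

controller = CvemapController()
controller.store()   # headers-only check of Last-Modified first; full cvemap.xml download only when newer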
Example No. 4
class RepositoryController:
    """
    Class for importing/syncing a set of repositories into the DB.
    First, repomd.xml files from all repositories are downloaded and parsed.
    Second, primary and updateinfo repodata from repositories needing an update are downloaded, parsed and imported.
    """

    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.certs_tmp_directory = None
        self.certs_files = {}

    def _get_certs_tuple(self, name):
        if name in self.certs_files:
            return self.certs_files[name]["ca_cert"], self.certs_files[name]["cert"], self.certs_files[name]["key"]
        return None, None, None

    def _download_repomds(self):
        download_items = []
        certs_tmp_dict = {}
        for repository in self.repositories:
            repomd_url = urljoin(repository.repo_url, REPOMD_PATH)
            repository.tmp_directory = tempfile.mkdtemp(prefix="repo-")
            ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
            # Check certificate expiration date
            if repository.cert_name:
                certs_tmp_dict[repository.cert_name] = cert

            item = DownloadItem(
                source_url=repomd_url,
                target_path=os.path.join(repository.tmp_directory, "repomd.xml"),
                ca_cert=ca_cert,
                cert=cert,
                key=key
            )
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)

        for cert_name, cert in certs_tmp_dict.items():
            self._check_cert_expiration_date(cert_name, cert)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _check_cert_expiration_date(self, cert_name, cert):
        try:
            # Load certificate
            loaded_cert = crypto.load_certificate(crypto.FILETYPE_PEM, cert)
            # Get expiration date and parse it into a datetime object
            valid_to_dt = datetime.strptime(loaded_cert.get_notAfter().decode("utf-8"), "%Y%m%d%H%M%SZ")
            expire_in_days = (valid_to_dt - datetime.utcnow()).days
            expire_tuple = (valid_to_dt, expire_in_days)
            if 30 >= expire_in_days > 0:
                self.logger.warning('Certificate %s will expire in %s days', cert_name, expire_in_days)
                msg = prepare_msg_for_slack(cert_name, 'Reposcan CDN certificate will expire soon', expire_tuple)
                send_slack_notification(msg)
            elif expire_in_days <= 0:
                self.logger.warning('Certificate %s expired!', cert_name)
                msg = prepare_msg_for_slack(cert_name, 'Reposcan CDN certificate expired', expire_tuple)
                send_slack_notification(msg)
        except crypto.Error:
            self.logger.warning('Certificate not provided or incorrect: %s', cert_name if cert_name else 'None')
            msg = prepare_msg_for_slack(cert_name, 'Reposcan CDN certificate not provided or incorrect')
            send_slack_notification(msg)

    def _read_repomds(self):
        """Reads all downloaded repomd files. Checks if their download failed and checks if their metadata are
           newer than metadata currently in DB.
        """
        # Fetch current list of repositories from DB
        db_repositories = self.repo_store.list_repositories()
        for repository in self.repositories:
            repomd_path = os.path.join(repository.tmp_directory, "repomd.xml")
            repomd = RepoMD(repomd_path)
            # Was repository already synced before?
            repository_key = (repository.content_set, repository.basearch, repository.releasever)
            if repository_key in db_repositories:
                db_revision = db_repositories[repository_key]["revision"]
            else:
                db_revision = None
            downloaded_revision = repomd.get_revision()
            # Repository is synced for the first time or has newer revision
            if db_revision is None or downloaded_revision > db_revision:
                repository.repomd = repomd
            else:
                self.logger.debug("Downloaded repo %s (%s) is not newer than repo in DB (%s).",
                                  ", ".join(filter(None, repository_key)), str(downloaded_revision), str(db_revision))

    def _repo_download_failed(self, repo, failed_items):
        failed = False
        for md_path in list(repo.md_files.values()) + [REPOMD_PATH]:
            local_path = os.path.join(repo.tmp_directory, os.path.basename(md_path))
            if local_path in failed_items:
                failed = True
                # Download errors with no HTTP code are logged in downloader, deduplicate error msgs
                if failed_items[local_path] > 0:
                    self.logger.warning("Download failed: LABEL: %s URL: %s (HTTP CODE %d)",
                                        repo.content_set, urljoin(repo.repo_url, md_path),
                                        failed_items[local_path])
                    FAILED_REPO_WITH_HTTP_CODE.labels(failed_items[local_path]).inc()
        return failed

    def _download_metadata(self, batch):
        download_items = []
        for repository in batch:
            # primary_db has higher priority, use primary.xml if not found
            try:
                repository.md_files["primary_db"] = repository.repomd.get_metadata("primary_db")["location"]
            except RepoMDTypeNotFound:
                repository.md_files["primary"] = repository.repomd.get_metadata("primary")["location"]
            # updateinfo.xml may be missing completely
            try:
                repository.md_files["updateinfo"] = repository.repomd.get_metadata("updateinfo")["location"]
            except RepoMDTypeNotFound:
                pass
            try:
                repository.md_files["modules"] = repository.repomd.get_metadata("modules")["location"]
            except RepoMDTypeNotFound:
                pass

            # queue metadata files for download
            for md_location in repository.md_files.values():
                ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
                item = DownloadItem(
                    source_url=urljoin(repository.repo_url, md_location),
                    target_path=os.path.join(repository.tmp_directory, os.path.basename(md_location)),
                    ca_cert=ca_cert,
                    cert=cert,
                    key=key
                )
                download_items.append(item)
                self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _unpack_metadata(self, batch):
        for repository in batch:
            for md_type in repository.md_files:
                self.unpacker.add(os.path.join(repository.tmp_directory,
                                               os.path.basename(repository.md_files[md_type])))
                # FIXME: should this be done in a different place?
                repository.md_files[md_type] = os.path.join(
                    repository.tmp_directory,
                    os.path.basename(repository.md_files[md_type])).rsplit(".", maxsplit=1)[0]
        self.unpacker.run()

    def clean_repodata(self, batch):
        """Clean downloaded repodata of all repositories in batch."""
        for repository in batch:
            if repository.tmp_directory:
                shutil.rmtree(repository.tmp_directory)
                repository.tmp_directory = None
            self.repositories.remove(repository)

    def _clean_certificate_cache(self):
        if self.certs_tmp_directory:
            shutil.rmtree(self.certs_tmp_directory)
            self.certs_tmp_directory = None
            self.certs_files = {}

    def add_db_repositories(self):
        """Queue all previously imported repositories."""
        repos = self.repo_store.list_repositories()
        for (content_set, basearch, releasever), repo_dict in repos.items():
            # Reference content_set_label -> content set id
            self.repo_store.content_set_to_db_id[content_set] = repo_dict["content_set_id"]
            self.repositories.add(Repository(repo_dict["url"], content_set, basearch, releasever,
                                             cert_name=repo_dict["cert_name"], ca_cert=repo_dict["ca_cert"],
                                             cert=repo_dict["cert"], key=repo_dict["key"]))

    def add_repository(self, repo_url, content_set, basearch, releasever,
                       cert_name=None, ca_cert=None, cert=None, key=None):
        """Queue repository to import/check updates."""
        repo_url = repo_url.strip()
        if not repo_url.endswith("/"):
            repo_url += "/"
        self.repositories.add(Repository(repo_url, content_set, basearch, releasever, cert_name=cert_name,
                                         ca_cert=ca_cert, cert=cert, key=key))

    def _write_certificate_cache(self):
        certs = {}
        for repository in self.repositories:
            if repository.cert_name:
                certs[repository.cert_name] = {"ca_cert": repository.ca_cert, "cert": repository.cert,
                                               "key": repository.key}
        if certs:
            self.certs_tmp_directory = tempfile.mkdtemp(prefix="certs-")
            for cert_name in certs:
                self.certs_files[cert_name] = {}
                for cert_type in ["ca_cert", "cert", "key"]:
                    # Cert is not None
                    if certs[cert_name][cert_type]:
                        cert_path = os.path.join(self.certs_tmp_directory, "%s.%s" % (cert_name, cert_type))
                        with open(cert_path, "w") as cert_file:
                            cert_file.write(certs[cert_name][cert_type])
                        self.certs_files[cert_name][cert_type] = cert_path
                    else:
                        self.certs_files[cert_name][cert_type] = None

    def _find_content_sets_by_regex(self, content_set_regex):
        if not content_set_regex.startswith('^'):
            content_set_regex = '^' + content_set_regex

        if not content_set_regex.endswith('$'):
            content_set_regex = content_set_regex + '$'

        return [content_set_label for content_set_label in self.repo_store.content_set_to_db_id
                if re.match(content_set_regex, content_set_label)]

    def delete_content_set(self, content_set_regex):
        """Deletes content sets described by given regex from DB."""
        for content_set_label in self._find_content_sets_by_regex(content_set_regex):
            self.logger.info("Deleting content set: %s", content_set_label)
            self.repo_store.delete_content_set(content_set_label)
        self.repo_store.cleanup_unused_data()

    def import_repositories(self):
        """Create or update repository records in the DB."""
        self.logger.info("Importing %d repositories.", len(self.repositories))
        failures = 0
        for repository in self.repositories:
            try:
                self.repo_store.import_repository(repository)
            except Exception:  # pylint: disable=broad-except
                failures += 1
        if failures > 0:
            self.logger.warning("Failed to import %d repositories.", failures)
            FAILED_IMPORT_REPO.inc(failures)

    def store(self):
        """Sync all queued repositories. Process repositories in batches due to disk space and memory usage."""
        self.logger.info("Checking %d repositories.", len(self.repositories))

        self._write_certificate_cache()

        # Download all repomd files first
        failed = self._download_repomds()
        if failed:
            FAILED_REPOMD.inc(len(failed))
            failed_repos = [repo for repo in self.repositories if self._repo_download_failed(repo, failed)]
            self.logger.warning("%d repomd.xml files failed to download.", len(failed))
            self.clean_repodata(failed_repos)

        self._read_repomds()
        # Filter all repositories without repomd attribute set (downloaded repomd is not newer)
        batches = BatchList()
        up_to_date = []

        def md_size(repomd, data_type):
            try:
                mdata = repomd.get_metadata(data_type)
                # open-size is not present for uncompressed files
                return int(mdata.get('size', 0)) + int(mdata.get('open-size', '0'))
            except RepoMDTypeNotFound:
                return 0

        for repository in self.repositories:
            if repository.repomd:

                repo_size = md_size(repository.repomd, 'primary_db')
                # If we use primary_db, we don't even download primary data xml
                if repo_size == 0:
                    repo_size += md_size(repository.repomd, 'primary')

                repo_size += md_size(repository.repomd, 'updateinfo')
                repo_size += md_size(repository.repomd, 'modules')

                batches.add_item(repository, repo_size)
            else:
                up_to_date.append(repository)

        self.clean_repodata(up_to_date)
        self.logger.info("%d repositories are up to date.", len(up_to_date))
        total_repositories = batches.get_total_items()
        completed_repositories = 0
        self.logger.info("%d repositories need to be synced.", total_repositories)

        # Download and process repositories in batches (unpacked metadata files can consume a lot of disk space)
        try:
            for batch in batches:
                self.logger.info("Syncing a batch of %d repositories", len(batch))
                try:
                    failed = self._download_metadata(batch)
                    if failed:
                        self.logger.warning("%d metadata files failed to download.", len(failed))
                        failed_repos = [repo for repo in batch if self._repo_download_failed(repo, failed)]
                        self.clean_repodata(failed_repos)
                        batch = [repo for repo in batch if repo not in failed_repos]
                    self._unpack_metadata(batch)
                    for repository in batch:
                        repository.load_metadata()
                        completed_repositories += 1
                        self.logger.info("Syncing repository: %s [%s/%s]", ", ".join(
                            filter(None, (repository.content_set, repository.basearch, repository.releasever))),
                                         completed_repositories, total_repositories)
                        self.repo_store.store(repository)
                        repository.unload_metadata()
                finally:
                    self.clean_repodata(batch)
        finally:
            self.repo_store.cleanup_unused_data()
            self._clean_certificate_cache()
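The certificate check in _check_cert_expiration_date reduces to parsing the notAfter field of a PEM certificate and comparing it with the current UTC time. A standalone sketch of the same idea, assuming pyOpenSSL is available; the helper name is hypothetical:

from datetime import datetime

from OpenSSL import crypto

def days_until_expiry(pem_text):
    # Whole days until the certificate expires (negative once it has expired).
    loaded_cert = crypto.load_certificate(crypto.FILETYPE_PEM, pem_text)
    valid_to = datetime.strptime(loaded_cert.get_notAfter().decode("utf-8"), "%Y%m%d%H%M%SZ")
    return (valid_to - datetime.utcnow()).days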
Example No. 5
class RepositoryController:
    """
    Class for importing/syncing a set of repositories into the DB.
    First, repomd.xml files from all repositories are downloaded and parsed.
    Second, primary and updateinfo repodata from repositories needing an update are downloaded, parsed and imported.
    """
    def __init__(self):
        self.logger = SimpleLogger()
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.db_repositories = {}

    def _download_repomds(self):
        download_items = []
        for repository in self.repositories:
            repomd_url = urljoin(repository.repo_url, REPOMD_PATH)
            repository.tmp_directory = tempfile.mkdtemp(prefix="repo-")
            item = DownloadItem(
                source_url=repomd_url,
                target_path=os.path.join(repository.tmp_directory, "repomd.xml")
            )
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _read_repomds(self, failed):
        """Reads all downloaded repomd files. Checks if their download failed and checks if their metadata are
           newer than metadata currently in DB.
        """
        for repository in self.repositories:
            repomd_path = os.path.join(repository.tmp_directory, "repomd.xml")
            if repomd_path not in failed:
                repomd = RepoMD(repomd_path)
                # Was repository already synced before?
                if repository.repo_url in self.db_repositories:
                    db_revision = self.db_repositories[repository.repo_url]["revision"]
                else:
                    db_revision = None
                downloaded_revision = datetime.fromtimestamp(repomd.get_revision(), tz=timezone.utc)
                # Repository is synced for the first time or has newer revision
                if db_revision is None or downloaded_revision > db_revision:
                    repository.repomd = repomd
                else:
                    self.logger.log("Downloaded repo %s (%s) is not newer than repo in DB (%s)." %
                                    (repository.repo_url, str(downloaded_revision), str(db_revision)))
            else:
                self.logger.log("Download failed: %s (HTTP CODE %d)" % (urljoin(repository.repo_url, REPOMD_PATH),
                                                                        failed[repomd_path]))

    def _download_metadata(self, batch):
        for repository in batch:
            # primary_db has higher priority, use primary.xml if not found
            try:
                repository.md_files["primary_db"] = repository.repomd.get_metadata("primary_db")["location"]
            except RepoMDTypeNotFound:
                repository.md_files["primary"] = repository.repomd.get_metadata("primary")["location"]
            # updateinfo.xml may be missing completely
            try:
                repository.md_files["updateinfo"] = repository.repomd.get_metadata("updateinfo")["location"]
            except RepoMDTypeNotFound:
                pass

            # queue metadata files for download
            for md_location in repository.md_files.values():
                self.downloader.add(DownloadItem(
                    source_url=urljoin(repository.repo_url, md_location),
                    target_path=os.path.join(repository.tmp_directory, os.path.basename(md_location))
                ))
        self.downloader.run()

    def _unpack_metadata(self, batch):
        for repository in batch:
            for md_type in repository.md_files:
                self.unpacker.add(os.path.join(repository.tmp_directory,
                                               os.path.basename(repository.md_files[md_type])))
                # FIXME: should this be done in a different place?
                repository.md_files[md_type] = os.path.join(
                    repository.tmp_directory,
                    os.path.basename(repository.md_files[md_type])).rsplit(".", maxsplit=1)[0]
        self.unpacker.run()

    def clean_repodata(self, batch):
        """Clean downloaded repodata of all repositories in batch."""
        for repository in batch:
            if repository.tmp_directory:
                shutil.rmtree(repository.tmp_directory)
                repository.tmp_directory = None
            self.repositories.remove(repository)

    def add_repository(self, repo_url):
        """Queue repository to import/check updates."""
        repo_url = repo_url.strip()
        if not repo_url.endswith("/"):
            repo_url += "/"
        self.repositories.add(Repository(repo_url))

    def store(self):
        """Sync all queued repositories. Process repositories in batches due to disk space and memory usage."""
        self.logger.log("Checking %d repositories." % len(self.repositories))

        # Fetch current list of repositories from DB
        self.db_repositories = self.repo_store.list_repositories()

        # Download all repomd files first
        failed = self._download_repomds()
        self.logger.log("%d repomd.xml files failed to download." % len(failed))
        self._read_repomds(failed)

        # Filter all repositories without repomd attribute set (failed download, downloaded repomd is not newer)
        batches = BatchList()
        to_skip = []
        for repository in self.repositories:
            if repository.repomd:
                batches.add_item(repository)
            else:
                to_skip.append(repository)
        self.clean_repodata(to_skip)
        self.logger.log("%d repositories skipped." % len(to_skip))
        self.logger.log("Syncing %d repositories." % sum(len(l) for l in batches))

        # Download and process repositories in batches (unpacked metadata files can consume a lot of disk space)
        for batch in batches:
            self._download_metadata(batch)
            self._unpack_metadata(batch)
            for repository in batch:
                repository.load_metadata()
                self.repo_store.store(repository)
                repository.unload_metadata()
            self.clean_repodata(batch)
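This earlier variant has no certificate handling or size-based batching, so a usage sketch is just two calls; the mirror URL is made up:

controller = RepositoryController()
controller.add_repository("https://mirror.example.org/repo/")
controller.store()   # sync repositories whose repomd revision is newer than the revision in the DB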
Example No. 6
class CveRepoController:
    """
    Controls import/sync of CVE lists into the DB.
    """
    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.cverepo_store = CveRepoStore()
        self.repos = set()
        self.db_lastmodified = {}
        self.year_since = int(os.getenv('YEAR_SINCE', DEFAULT_YEAR_SINCE))

    def _download_meta(self):
        download_items = []
        for repo in self.repos:
            repo.tmp_directory = tempfile.mkdtemp(prefix="cverepo-")
            item = DownloadItem(source_url=repo.meta_url(),
                                target_path=repo.meta_tmp())
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {
            item.target_path: item.status_code
            for item in download_items
            if item.status_code not in VALID_HTTP_CODES
        }

    def _read_meta(self, failed):
        """Reads downloaded meta files and checks for updates."""
        for repo in self.repos:
            meta_path = repo.meta_tmp()
            if meta_path not in failed:
                meta = CveMeta(meta_path)
                # already synced before?
                db_lastmodified = parse_datetime(
                    self.db_lastmodified.get(repo.label, None))
                meta_lastmodified = parse_datetime(meta.get_lastmodified())
                # synced for the first time or has newer revision
                if (db_lastmodified is None or meta_lastmodified is None
                        or meta_lastmodified > db_lastmodified):
                    repo.meta = meta
                else:
                    self.logger.info(
                        "Cve list '%s' has not been updated (since %s).",
                        repo.label, str(db_lastmodified))
            else:
                FAILED_NIST.inc()
                self.logger.warning("Download failed: %s (HTTP CODE %d)",
                                    repo.meta_url(), failed[meta_path])

    def _download_json(self, batch):
        for repo in batch:
            self.downloader.add(
                DownloadItem(source_url=repo.json_url(),
                             target_path=repo.json_tmpgz()))
        self.downloader.run()

    def _unpack_json(self, batch):
        for repo in batch:
            self.unpacker.add(repo.json_tmpgz())
        self.unpacker.run()

    def clean_repo(self, batch):
        """Clean downloaded files for given batch."""
        for repo in batch:
            if repo.tmp_directory:
                shutil.rmtree(repo.tmp_directory)
                repo.tmp_directory = None
            self.repos.remove(repo)

    def add_repos(self):
        """Generate urls for CVE lists to download."""
        # Fetch current list of repositories from DB
        self.db_lastmodified = self.cverepo_store.list_lastmodified()

        # CVE files for single years should be used only for initial load
        labels = [
            str(y) for y in range(self.year_since,
                                  int(time.strftime("%Y")) + 1)
        ]
        for label in labels:
            if label not in self.db_lastmodified:
                self.repos.add(CveRepo(label))

        # always import incremental changes
        labels = ['recent', 'modified']
        for label in labels:
            self.repos.add(CveRepo(label))

    def store(self):
        """Sync all queued CVE lists. Runs in batches due to disk space and memory usage."""
        self.logger.info("Checking %d CVE lists.", len(self.repos))

        # Download all meta files first
        failed = self._download_meta()
        if failed:
            FAILED_NIST.inc()
            self.logger.warning("%d meta files failed to download.",
                                len(failed))
        self._read_meta(failed)

        # filter out failed / unchanged lists
        batches = BatchList()
        to_skip = []
        for repo in self.repos:
            if repo.meta:
                batches.add_item(repo)
            else:
                to_skip.append(repo)
        self.clean_repo(to_skip)
        self.logger.info("%d CVE lists skipped.", len(to_skip))
        self.logger.info("Syncing %d CVE lists.", sum(len(l) for l in batches))

        # Download and process CVE lists in batches (unpacked files can consume a lot of disk space)
        for batch in batches:
            try:
                self._download_json(batch)
                self._unpack_json(batch)
                for repo in sorted(batch, key=lambda repo: repo.label):
                    repo.load_json()
                    self.cverepo_store.store(repo)
                    repo.unload_json()
            finally:
                self.clean_repo(batch)
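A hedged usage sketch for the CVE list sync above; it assumes CveRepo builds the feed URLs for the yearly, 'recent' and 'modified' lists and that YEAR_SINCE falls back to DEFAULT_YEAR_SINCE unless overridden in the environment:

controller = CveRepoController()
controller.add_repos()   # queue yearly lists missing from the DB plus 'recent' and 'modified'
controller.store()       # download meta files, then sync changed lists in batches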