Example #1
    def __init__(self, repomgr: RepositoryManager, statedir: str, parseddir: str, safety_checks: bool = True) -> None:
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)
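
ClassFactory itself is internal to repology and never shown in these examples. Judging only from how it is called here (a module path, a superclass filter, and spawn_with_known_args(name, args) / SpawnWithKnownArgs), a minimal sketch of what such a factory might do; every detail below is an assumption, not repology's actual implementation:

import importlib
import inspect


class ClassFactorySketch:
    """Hypothetical ClassFactory-like helper; assumed behaviour only."""

    def __init__(self, modulename, superclass):
        module = importlib.import_module(modulename)
        # collect concrete subclasses of `superclass` visible in the module
        self._classes = {
            name: cls
            for name, cls in inspect.getmembers(module, inspect.isclass)
            if issubclass(cls, superclass) and cls is not superclass
        }

    def spawn_with_known_args(self, name, args):
        cls = self._classes[name]
        # pass through only the kwargs the constructor actually declares
        known = set(inspect.signature(cls.__init__).parameters) - {'self'}
        return cls(**{key: value for key, value in args.items() if key in known})
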
Example #2
    def __init__(self, repomgr, statedir, parseddir, safety_checks=True):
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)
Example #3
    def __init__(self,
                 repomgr,
                 statedir,
                 fetch_retries=3,
                 fetch_retry_delay=30,
                 safety_checks=True):
        self.repomgr = repomgr
        self.statedir = statedir
        self.fetch_retries = fetch_retries
        self.fetch_retry_delay = fetch_retry_delay
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers',
                                            superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers',
                                           superclass=Parser)
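
Example #3 adds fetch_retries and fetch_retry_delay to the constructor but stops before they are used. A hypothetical sketch of the retry loop such parameters would typically drive (fetch_with_retries and its wiring are illustrative, not repology code):

import time


def fetch_with_retries(fetcher, path, update, logger, retries=3, delay=30):
    """Illustrative retry wrapper around Fetcher.fetch (not part of repology)."""
    for attempt in range(1, retries + 1):
        try:
            return fetcher.fetch(path, update=update, logger=logger)
        except Exception as err:
            if attempt == retries:
                raise  # out of attempts, propagate the last error
            logger.log('fetch failed ({}), retry {}/{} in {}s'.format(
                err, attempt, retries, delay))
            time.sleep(delay)
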
Example #4
class RepositoryProcessor:
    def __init__(self, repomgr, statedir, safety_checks=True):
        self.repomgr = repomgr
        self.statedir = statedir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers',
                                            superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers',
                                           superclass=Parser)

    def __GetRepoPath(self, repository):
        return os.path.join(self.statedir, repository['name'] + '.state')

    def __GetSourcePath(self, repository, source):
        return os.path.join(self.__GetRepoPath(repository),
                            source['name'].replace('/', '_'))

    def __GetSerializedPath(self, repository):
        return os.path.join(self.statedir, repository['name'] + '.packages')

    def __CheckRepositoryOutdatedness(self, repository, logger):
        if 'valid_till' in repository and datetime.date.today() >= repository['valid_till']:
            logger.log(
                'repository {} has reached EoL, consider updating configs'.format(
                    repository['name']),
                severity=Logger.WARNING)

    # Private methods which provide single actions on sources
    def __FetchSource(self, update, repository, source, logger):
        if 'fetcher' not in source:
            logger.log('fetching source {} not supported'.format(
                source['name']))
            return

        logger.log('fetching source {} started'.format(source['name']))

        self.fetcher_factory.SpawnWithKnownArgs(
            source['fetcher'],
            source).fetch(self.__GetSourcePath(repository, source),
                          update=update,
                          logger=logger.GetIndented())

        logger.log('fetching source {} complete'.format(source['name']))

    def _iter_parse_source(self, repository, source, logger):
        def postprocess_parsed_packages(packages_iter):
            for package in packages_iter:
                if isinstance(package, PackageMaker):
                    # unwrap packagemaker
                    if not package.check_sanity(True):
                        continue

                    package = package.unwrap()
                else:
                    # XXX: compatibility shim for parsers still returning raw packages
                    if not package.name:
                        raise InconsistentPackage(
                            'encountered package with no name')

                    if not package.version:
                        # XXX: this currently fires on kdepim in dports; it's pretty fatal on
                        # one hand, but shouldn't stop whole repo from updating on another. In
                        # future, it should be logged as some kind of very serious repository
                        # update error
                        logger.log('package with empty version: {}'.format(
                            package.name),
                                   severity=Logger.ERROR)
                        continue

                # fill subrepos
                if 'subrepo' in source:
                    package.subrepo = source['subrepo']

                # fill default maintainer
                if not package.maintainers:
                    if 'default_maintainer' in repository:
                        package.maintainers = [
                            repository['default_maintainer']
                        ]
                    else:
                        package.maintainers = [
                            'fallback-mnt-{}@repology'.format(
                                repository['name'])
                        ]

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.SpawnWithKnownArgs(
                source['parser'],
                source).iter_parse(self.__GetSourcePath(repository, source),
                                   PackageFactory(logger)))

    # Private methods which provide single actions on repos
    def __Fetch(self, update, repository, logger):
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        for source in repository['sources']:
            if not os.path.isdir(self.__GetRepoPath(repository)):
                os.mkdir(self.__GetRepoPath(repository))
            self.__FetchSource(update, repository, source,
                               logger.GetIndented())

        logger.log('fetching complete')

    def _parse(self, repository, logger):
        logger.log('parsing started')

        packages = []

        for source in repository['sources']:
            logger.log('parsing source {} started'.format(source['name']))

            packages.extend(
                self._iter_parse_source(repository, source,
                                        logger.GetIndented()))

            logger.log('parsing source {} complete'.format(source['name']))

        logger.log('parsing complete, {} packages, deduplicating'.format(
            len(packages)))

        packages = PackagesetDeduplicate(packages)

        if self.safety_checks and len(packages) < repository['minpackages']:
            raise TooLittlePackages(len(packages), repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(len(packages)))

        return packages

    def __Transform(self, packages, transformer, repository, logger):
        logger.log('processing started')
        sanitylogger = logger.GetIndented()
        for package in packages:
            package.repo = repository['name']
            package.family = repository['family']

            if repository.get('shadow', False):
                package.shadow = True

            if transformer:
                transformer.Process(package)

            # strip leading project name from flavor
            def strip_flavor(flavor):
                if flavor.startswith(package.effname + '-'):
                    return flavor[len(package.effname) + 1:]
                return flavor

            package.flavors = sorted(set(map(strip_flavor, package.flavors)))

            try:
                package.CheckSanity(transformed=transformer is not None)
            except PackageSanityCheckFailure as err:
                sanitylogger.log('sanity error: {}'.format(err),
                                 severity=Logger.ERROR)
                raise
            except PackageSanityCheckProblem as err:
                sanitylogger.log('sanity warning: {}'.format(err),
                                 severity=Logger.WARNING)

            package.Normalize()

        # XXX: in future, ignored packages will not be dropped here, but
        # ignored in summary and version calculations, while still shown in
        # the package listing
        packages = [
            package for package in packages
            if not package.HasFlag(PackageFlags.remove)
        ]

        logger.log('processing complete, {} packages, deduplicating'.format(
            len(packages)))

        packages = PackagesetDeduplicate(packages)

        if transformer:
            logger.log('processing complete, {} packages, sorting'.format(
                len(packages)))

            packages = sorted(packages, key=lambda package: package.effname)

        logger.log('processing complete, {} packages'.format(len(packages)))

        return packages

    def __Serialize(self, packages, path, repository, logger):
        tmppath = path + '.tmp'

        logger.log('saving started')
        with open(tmppath, 'wb') as outfile:
            pickler = pickle.Pickler(outfile, protocol=pickle.HIGHEST_PROTOCOL)
            pickler.fast = True  # deprecated, but I don't see any alternatives
            pickler.dump(len(packages))
            for package in packages:
                pickler.dump(package)
        os.replace(tmppath, path)
        logger.log('saving complete, {} packages'.format(len(packages)))

    def __Deserialize(self, path, repository, logger):
        packages = []
        logger.log('loading started')
        with open(path, 'rb') as infile:
            unpickler = pickle.Unpickler(infile)
            numpackages = unpickler.load()
            packages = [unpickler.load() for _ in range(numpackages)]
            if packages and not packages[0].CheckFormat():
                raise StateFileFormatCheckProblem(path)
        logger.log('loading complete, {} packages'.format(len(packages)))

        return packages

    class StreamDeserializer:
        def __init__(self, path, logger):
            try:
                self.unpickler = pickle.Unpickler(open(path, 'rb'))
                self.count = self.unpickler.load()
            except FileNotFoundError:
                logger.log(
                    'parsed package data file {} does not exist, treating repository as empty'
                    .format(path),
                    severity=Logger.ERROR)
                self.count = 0

            self.current = None

            self.Get()

            if self.current and not self.current.CheckFormat():
                raise StateFileFormatCheckProblem(path)

        def Peek(self):
            return self.current

        def EOF(self):
            return self.current is None

        def Get(self):
            current = self.current
            if self.count == 0:
                self.current = None
            else:
                self.current = self.unpickler.load()
                self.count -= 1
            return current

    # Single repo methods
    def Fetch(self, reponame, update=True, logger=NoopLogger()):
        repository = self.repomgr.GetRepository(reponame)

        self.__CheckRepositoryOutdatedness(repository, logger)

        self.__Fetch(update, repository, logger)

    def Parse(self, reponame, transformer, logger=NoopLogger()):
        repository = self.repomgr.GetRepository(reponame)

        packages = self._parse(repository, logger)
        packages = self.__Transform(packages, transformer, repository, logger)

        return packages

    def ParseAndSerialize(self, reponame, transformer, logger=NoopLogger()):
        repository = self.repomgr.GetRepository(reponame)

        packages = self._parse(repository, logger)
        packages = self.__Transform(packages, transformer, repository, logger)
        self.__Serialize(packages, self.__GetSerializedPath(repository),
                         repository, logger)

        return packages

    def Deserialize(self, reponame, logger=NoopLogger()):
        repository = self.repomgr.GetRepository(reponame)

        return self.__Deserialize(self.__GetSerializedPath(repository),
                                  repository, logger)

    # Multi repo methods
    def ParseMulti(self,
                   reponames=None,
                   transformer=None,
                   logger=NoopLogger()):
        packages = []

        for repo in self.repomgr.GetRepositories(reponames):
            packages += self.Parse(repo['name'],
                                   transformer=transformer,
                                   logger=logger.GetPrefixed(repo['name'] +
                                                             ': '))

        return packages

    def DeserializeMulti(self, reponames=None, logger=NoopLogger()):
        packages = []

        for repo in self.repomgr.GetRepositories(reponames):
            packages += self.Deserialize(
                repo['name'], logger=logger.GetPrefixed(repo['name'] + ': '))

        return packages

    def StreamDeserializeMulti(self, reponames=None, logger=NoopLogger()):
        deserializers = []
        for repo in self.repomgr.GetRepositories(reponames):
            deserializers.append(
                self.StreamDeserializer(self.__GetSerializedPath(repo),
                                        logger))

        while True:
            # remove EOFed repos
            deserializers = [ds for ds in deserializers if not ds.EOF()]

            # stop when all deserializers are empty
            if not deserializers:
                break

            # find lowest key (effname)
            thiskey = deserializers[0].Peek().effname
            for ds in deserializers[1:]:
                thiskey = min(thiskey, ds.Peek().effname)

            # fetch all packages with given key from all deserializers
            packageset = []
            for ds in deserializers:
                while not ds.EOF() and ds.Peek().effname == thiskey:
                    packageset.append(ds.Get())

            yield packageset
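
StreamDeserializeMulti above merges N effname-sorted streams by rescanning every deserializer for the lowest key on each step. The same grouping can be written with the standard library; a sketch assuming each stream is an iterator over Package objects already sorted by effname (iter_packages adapts a StreamDeserializer into such an iterator):

import heapq
import itertools
from operator import attrgetter


def iter_packages(deserializer):
    # adapt a StreamDeserializer into a plain iterator
    while not deserializer.EOF():
        yield deserializer.Get()


def stream_merge_packagesets(streams):
    """Yield lists of packages sharing an effname, merged from sorted streams."""
    merged = heapq.merge(*streams, key=attrgetter('effname'))
    for _, group in itertools.groupby(merged, key=attrgetter('effname')):
        yield list(group)
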
Example #5
class RepositoryProcessor:
    def __init__(self,
                 repomgr: RepositoryManager,
                 statedir: str,
                 parseddir: str,
                 safety_checks: bool = True) -> None:
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers',
                                            superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers',
                                           superclass=Parser)

    def _get_state_path(self, repository: RepositoryMetadata) -> str:
        return os.path.join(self.statedir, repository['name'] + '.state')

    def _get_state_source_path(self, repository: RepositoryMetadata,
                               source: RepositoryMetadata) -> str:
        return os.path.join(self._get_state_path(repository),
                            source['name'].replace('/', '_'))

    def _get_parsed_path(self, repository: RepositoryMetadata) -> str:
        return os.path.join(self.parseddir, repository['name'] + '.parsed')

    def _get_parsed_chunk_paths(self,
                                repository: RepositoryMetadata) -> List[str]:
        dirpath = self._get_parsed_path(repository)
        return [
            os.path.join(dirpath, filename) for filename in os.listdir(dirpath)
        ] if os.path.isdir(dirpath) else []

    # source level private methods
    def _fetch_source(self, repository: RepositoryMetadata, update: bool,
                      source: RepositoryMetadata, logger: Logger) -> bool:
        if 'fetcher' not in source:
            logger.log('fetching source {} not supported'.format(
                source['name']))
            return False

        logger.log('fetching source {} started'.format(source['name']))

        fetcher: Fetcher = self.fetcher_factory.spawn_with_known_args(
            source['fetcher'], source)

        have_changes = fetcher.fetch(
            self._get_state_source_path(repository, source),
            update=update,
            logger=logger.get_indented())

        logger.log('fetching source {} complete'.format(source['name']) +
                   ('' if have_changes else ' (no changes)'))

        return have_changes

    def _iter_parse_source(self, repository: RepositoryMetadata,
                           source: RepositoryMetadata,
                           transformer: Optional[PackageTransformer],
                           logger: Logger) -> Iterator[Package]:
        def postprocess_parsed_packages(
                packages_iter: Iterable[PackageMaker]) -> Iterator[Package]:
            for packagemaker in packages_iter:
                try:
                    package = packagemaker.spawn(
                        repo=repository['name'],
                        family=repository['family'],
                        subrepo=source.get('subrepo'),
                        shadow=repository.get('shadow', False),
                        default_maintainer=repository.get(
                            'default_maintainer'),
                    )
                except RuntimeError as e:
                    packagemaker.log(str(e), Logger.ERROR)
                    raise

                # transform
                if transformer:
                    transformer.process(package)

                # skip removed packages
                if package.has_flag(PackageFlags.REMOVE):
                    continue

                # postprocess flavors
                def strip_flavor(flavor: str) -> str:
                    return flavor.removeprefix(package.effname + '-')

                package.flavors = sorted(
                    set(map(strip_flavor, package.flavors)))

                # XXX: arch is not used anywhere yet, and until #711 is implemented,
                # it just introduces package duplicates; it's a crude solution, but
                # just drop it here
                package.arch = None

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.spawn_with_known_args(
                source['parser'], source).iter_parse(
                    self._get_state_source_path(repository, source),
                    PackageFactory(logger), transformer))

    def _iter_parse_all_sources(self, repository: RepositoryMetadata,
                                transformer: Optional[PackageTransformer],
                                logger: Logger) -> Iterator[Package]:
        for source in repository['sources']:
            logger.log('parsing source {} started'.format(source['name']))
            yield from self._iter_parse_source(repository, source, transformer,
                                               logger.get_indented())
            logger.log('parsing source {} complete'.format(source['name']))

    # repository level private methods
    def _fetch(self, repository: RepositoryMetadata, update: bool,
               logger: Logger) -> bool:
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        have_changes = False
        for source in repository['sources']:
            if not os.path.isdir(self._get_state_path(repository)):
                os.mkdir(self._get_state_path(repository))
            have_changes |= self._fetch_source(repository, update, source,
                                               logger.get_indented())

        logger.log('fetching complete' +
                   ('' if have_changes else ' (no changes)'))

        return have_changes

    def _parse(self, repository: RepositoryMetadata,
               transformer: Optional[PackageTransformer],
               logger: Logger) -> None:
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        with AtomicDir(self._get_parsed_path(repository)) as state_dir:
            serializer = ChunkedSerializer(state_dir.get_path(),
                                           MAX_PACKAGES_PER_CHUNK)

            serializer.serialize(
                self._iter_parse_all_sources(repository, transformer, logger))

            if self.safety_checks and serializer.get_num_packages() < repository['minpackages']:
                raise TooLittlePackages(serializer.get_num_packages(),
                                        repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(
            serializer.get_num_packages()))

    # public methods
    def fetch(self,
              reponames: RepositoryNameList,
              update: bool = True,
              logger: Logger = NoopLogger()) -> bool:
        have_changes = False

        for repository in self.repomgr.get_repositories(reponames):
            have_changes |= self._fetch(repository, update, logger)

        return have_changes

    def parse(
        self,
        reponames: RepositoryNameList,
        transformer: Optional[PackageTransformer] = None,
        logger: Logger = NoopLogger()
    ) -> None:
        for repository in self.repomgr.get_repositories(reponames):
            self._parse(repository, transformer, logger)

    def iter_parse(
        self,
        reponames: RepositoryNameList,
        transformer: Optional[PackageTransformer] = None,
        logger: Logger = NoopLogger()
    ) -> Iterator[Package]:
        for repository in self.repomgr.get_repositories(reponames):
            yield from self._iter_parse_all_sources(repository, transformer,
                                                    logger)

    def iter_parsed(
        self,
        reponames: Optional[RepositoryNameList] = None,
        logger: Logger = NoopLogger()
    ) -> Iterator[List[Package]]:
        sources: List[str] = []
        for repository in self.repomgr.get_repositories(reponames):
            repo_sources = self._get_parsed_chunk_paths(repository)

            if not repo_sources:
                logger.log(
                    'parsed packages for repository {} are missing, treating repository as empty'
                    .format(repository['desc']),
                    severity=Logger.WARNING)

            sources.extend(repo_sources)

        if sources:
            yield from map(packageset_deduplicate, heap_deserialize(sources))
        else:
            logger.log('no parsed packages found', severity=Logger.ERROR)
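
A hedged usage sketch of the public API in this example. The RepositoryManager constructor arguments and directory names are assumptions; the fetch/parse/iter_parsed calls follow the signatures shown above:

# Hypothetical wiring; 'repos.d', 'state' and 'parsed' are placeholder paths.
repomgr = RepositoryManager('repos.d')
processor = RepositoryProcessor(repomgr, statedir='state', parseddir='parsed')

if processor.fetch(['freebsd'], update=True):  # True when any source changed
    processor.parse(['freebsd'])               # writes parsed chunks to disk

for packageset in processor.iter_parsed(['freebsd']):
    # each packageset is a deduplicated list of Package objects
    # sharing the same effname
    pass
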
Example #6
# Copyright (C) 2016 Dmitry Marakasov <*****@*****.**>
#
# This file is part of repology
#
# repology is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# repology is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with repology.  If not, see <http://www.gnu.org/licenses/>.

from repology.moduleutils import ClassFactory

Factory = ClassFactory(__name__, __file__, 'Parser')
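
This module-level Factory is presumably what the SpawnWithKnownArgs calls in the other examples resolve parser names against. A usage sketch; the parser class name and the source dict are illustrative, only the SpawnWithKnownArgs call itself is attested above:

# Illustrative only: spawn a parser registered in this module by class name,
# passing along only the constructor arguments it knows about.
source = {'name': 'main', 'parser': 'ExampleParser', 'path': 'packages.json'}
parser = Factory.SpawnWithKnownArgs(source['parser'], source)
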
Example #7
class RepositoryProcessor:
    def __init__(self, repomgr, statedir, parseddir, safety_checks=True):
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers',
                                            superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers',
                                           superclass=Parser)

    def _get_state_path(self, repository):
        return os.path.join(self.statedir, repository['name'] + '.state')

    def _get_state_source_path(self, repository, source):
        return os.path.join(self._get_state_path(repository),
                            source['name'].replace('/', '_'))

    def _get_parsed_path(self, repository):
        return os.path.join(self.parseddir, repository['name'] + '.parsed')

    def _get_parsed_chunk_paths(self, repository):
        dirpath = self._get_parsed_path(repository)
        return [
            os.path.join(dirpath, filename) for filename in os.listdir(dirpath)
        ] if os.path.isdir(dirpath) else []

    # source level private methods
    def _fetch_source(self, repository, update, source, logger):
        if 'fetcher' not in source:
            logger.log('fetching source {} not supported'.format(
                source['name']))
            return

        logger.log('fetching source {} started'.format(source['name']))

        self.fetcher_factory.SpawnWithKnownArgs(
            source['fetcher'],
            source).fetch(self._get_state_source_path(repository, source),
                          update=update,
                          logger=logger.GetIndented())

        logger.log('fetching source {} complete'.format(source['name']))

    def _iter_parse_source(self, repository, source, transformer, logger):
        def postprocess_parsed_packages(packages_iter):
            for package in packages_iter:
                # unwrap packagemaker
                if not package.check_sanity(verbose=True):
                    continue

                package = package.unwrap()

                # fill repository-specific fields
                package.repo = repository['name']
                package.family = repository['family']

                if 'subrepo' in source:
                    package.subrepo = source['subrepo']

                if repository.get('shadow', False):
                    package.shadow = True

                if not package.maintainers:
                    if 'default_maintainer' in repository:
                        package.maintainers = [
                            repository['default_maintainer']
                        ]
                    else:
                        package.maintainers = [
                            'fallback-mnt-{}@repology'.format(
                                repository['name'])
                        ]

                # transform
                if transformer:
                    transformer.process(package)

                # skip removed packages
                if package.HasFlag(PackageFlags.remove):
                    continue

                # postprocess
                def strip_flavor(flavor):
                    if flavor.startswith(package.effname + '-'):
                        return flavor[len(package.effname) + 1:]
                    return flavor

                package.flavors = sorted(
                    set(map(strip_flavor, package.flavors)))

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.SpawnWithKnownArgs(
                source['parser'], source).iter_parse(
                    self._get_state_source_path(repository, source),
                    PackageFactory(logger)))

    def _iter_parse_all_sources(self, repository, transformer, logger):
        for source in repository['sources']:
            logger.log('parsing source {} started'.format(source['name']))
            yield from self._iter_parse_source(repository, source, transformer,
                                               logger.GetIndented())
            logger.log('parsing source {} complete'.format(source['name']))

    # repository level private methods
    def _fetch(self, repository, update, logger):
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        for source in repository['sources']:
            if not os.path.isdir(self._get_state_path(repository)):
                os.mkdir(self._get_state_path(repository))
            self._fetch_source(repository, update, source,
                               logger.GetIndented())

        logger.log('fetching complete')

    def _parse(self, repository, transformer, logger):
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        packages = []
        chunknum = 0
        num_packages = 0

        def flush_packages():
            nonlocal packages, chunknum

            if packages:
                packages = sorted(packages,
                                  key=lambda package: package.effname)
                serialize(packages, os.path.join(state_dir, str(chunknum)))
                packages = []
                chunknum += 1

        with atomic_dir(self._get_parsed_path(repository)) as state_dir:
            for package in self._iter_parse_all_sources(
                    repository, transformer, logger):
                packages.append(package)
                num_packages += 1

                if len(packages) >= MAX_PACKAGES_PER_CHUNK:
                    flush_packages()

            flush_packages()

        if self.safety_checks and num_packages < repository['minpackages']:
            raise TooLittlePackages(num_packages, repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(num_packages))

    # public methods
    def fetch(self, reponames, update=True, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            self._fetch(repository, update, logger)

    def parse(self, reponames, transformer=None, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            self._parse(repository, transformer, logger)

    def iter_parse(self, reponames, transformer=None, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            yield from self._iter_parse_all_sources(repository, transformer,
                                                    logger)

    def iter_parsed(self, reponames=None, logger=NoopLogger()):
        def get_sources():
            for repository in self.repomgr.GetRepositories(reponames):
                sources = self._get_parsed_chunk_paths(repository)
                if not sources:
                    logger.log(
                        'parsed packages for repository {} are missing, treating repository as empty'
                        .format(repository['desc']),
                        severity=Logger.ERROR)
                yield from sources

        with heap_deserializer(get_sources(),
                               lambda package: package.effname) as heap:
            for packageset in heap():
                packageset = PackagesetDeduplicate(packageset)

                yield packageset
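
Examples #5 and #8 wrap this example's manual flush_packages chunking in a ChunkedSerializer object. A minimal sketch of what such a wrapper could look like, reusing the serialize helper and os module from this example; it mirrors the logic above and is not the actual repology class:

import os


class ChunkedSerializerSketch:
    """Sketch of a ChunkedSerializer-style helper (not the repology class)."""

    def __init__(self, path, chunk_size):
        self._path = path
        self._chunk_size = chunk_size
        self._buffer = []
        self._chunknum = 0
        self._num_packages = 0

    def _flush(self):
        if self._buffer:
            self._buffer.sort(key=lambda package: package.effname)
            serialize(self._buffer, os.path.join(self._path, str(self._chunknum)))
            self._buffer = []
            self._chunknum += 1

    def serialize(self, packages_iter):
        for package in packages_iter:
            self._buffer.append(package)
            self._num_packages += 1
            if len(self._buffer) >= self._chunk_size:
                self._flush()
        self._flush()

    def get_num_packages(self):
        return self._num_packages
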
Example #8
class RepositoryProcessor:
    def __init__(self,
                 repomgr: RepositoryManager,
                 statedir: str,
                 parseddir: str,
                 safety_checks: bool = True) -> None:
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers',
                                            superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers',
                                           superclass=Parser)

    def _get_state_path(self, repository: RepositoryMetadata) -> str:
        return os.path.join(self.statedir, repository['name'] + '.state')

    def _get_state_source_path(self, repository: RepositoryMetadata,
                               source: RepositoryMetadata) -> str:
        return os.path.join(self._get_state_path(repository),
                            source['name'].replace('/', '_'))

    def _get_parsed_path(self, repository: RepositoryMetadata) -> str:
        return os.path.join(self.parseddir, repository['name'] + '.parsed')

    def _get_parsed_chunk_paths(self,
                                repository: RepositoryMetadata) -> List[str]:
        dirpath = self._get_parsed_path(repository)
        return [
            os.path.join(dirpath, filename) for filename in os.listdir(dirpath)
        ] if os.path.isdir(dirpath) else []

    # source level private methods
    def _fetch_source(self, repository: RepositoryMetadata, update: bool,
                      source: RepositoryMetadata, logger: Logger) -> bool:
        if 'fetcher' not in source:
            logger.log('fetching source {} not supported'.format(
                source['name']))
            return False

        logger.log('fetching source {} started'.format(source['name']))

        fetcher: Fetcher = self.fetcher_factory.spawn_with_known_args(
            source['fetcher'], source)

        have_changes = fetcher.fetch(
            self._get_state_source_path(repository, source),
            update=update,
            logger=logger.get_indented())

        logger.log('fetching source {} complete'.format(source['name']) +
                   ('' if have_changes else ' (no changes)'))

        return have_changes

    def _iter_parse_source(self, repository: RepositoryMetadata,
                           source: RepositoryMetadata,
                           transformer: Optional[PackageTransformer],
                           logger: Logger) -> Iterator[Package]:
        def postprocess_parsed_packages(
                packages_iter: Iterable[PackageMaker]) -> Iterator[Package]:
            for packagemaker in packages_iter:
                # unwrap packagemaker
                if not packagemaker.check_sanity(verbose=True):
                    continue

                package = packagemaker.unwrap()

                # fill repository-specific fields
                package.repo = repository['name']
                package.family = repository['family']

                if 'subrepo' in source:
                    package.subrepo = source['subrepo']

                if repository.get('shadow', False):
                    package.shadow = True

                if not package.maintainers:
                    default_maintainer = repository.get(
                        'default_maintainer', True)
                    if isinstance(default_maintainer, str):
                        package.maintainers = [
                            repository['default_maintainer']
                        ]
                    elif default_maintainer:
                        package.maintainers = [
                            'fallback-mnt-{}@repology'.format(
                                repository['name'])
                        ]

                # transform
                if transformer:
                    transformer.process(package)

                # skip removed packages
                if package.HasFlag(PackageFlags.remove):
                    continue

                # postprocess
                def strip_flavor(flavor: str) -> str:
                    if flavor.startswith(package.effname + '-'):
                        return flavor[len(package.effname) + 1:]
                    return flavor

                package.flavors = sorted(
                    set(map(strip_flavor, package.flavors)))

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.spawn_with_known_args(
                source['parser'], source).iter_parse(
                    self._get_state_source_path(repository, source),
                    PackageFactory(logger), transformer))

    def _iter_parse_all_sources(self, repository: RepositoryMetadata,
                                transformer: Optional[PackageTransformer],
                                logger: Logger) -> Iterator[Package]:
        for source in repository['sources']:
            logger.log('parsing source {} started'.format(source['name']))
            yield from self._iter_parse_source(repository, source, transformer,
                                               logger.get_indented())
            logger.log('parsing source {} complete'.format(source['name']))

    # repository level private methods
    def _fetch(self, repository: RepositoryMetadata, update: bool,
               logger: Logger) -> bool:
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        have_changes = False
        for source in repository['sources']:
            if not os.path.isdir(self._get_state_path(repository)):
                os.mkdir(self._get_state_path(repository))
            have_changes |= self._fetch_source(repository, update, source,
                                               logger.get_indented())

        logger.log('fetching complete' +
                   ('' if have_changes else ' (no changes)'))

        return have_changes

    def _parse(self, repository: RepositoryMetadata,
               transformer: Optional[PackageTransformer],
               logger: Logger) -> None:
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        with AtomicDir(self._get_parsed_path(repository)) as state_dir:
            serializer = ChunkedSerializer(state_dir.get_path(),
                                           MAX_PACKAGES_PER_CHUNK)

            serializer.serialize(
                self._iter_parse_all_sources(repository, transformer, logger))

            if self.safety_checks and serializer.get_num_packages() < repository['minpackages']:
                raise TooLittlePackages(serializer.get_num_packages(),
                                        repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(
            serializer.get_num_packages()))

    # public methods
    def fetch(self,
              reponames: RepositoryNameList,
              update: bool = True,
              logger: Logger = NoopLogger()) -> bool:
        have_changes = False

        for repository in self.repomgr.get_repositories(reponames):
            have_changes |= self._fetch(repository, update, logger)

        return have_changes

    def parse(
        self,
        reponames: RepositoryNameList,
        transformer: Optional[PackageTransformer] = None,
        logger: Logger = NoopLogger()
    ) -> None:
        for repository in self.repomgr.get_repositories(reponames):
            self._parse(repository, transformer, logger)

    def iter_parse(
        self,
        reponames: RepositoryNameList,
        transformer: Optional[PackageTransformer] = None,
        logger: Logger = NoopLogger()
    ) -> Iterator[Package]:
        for repository in self.repomgr.get_repositories(reponames):
            yield from self._iter_parse_all_sources(repository, transformer,
                                                    logger)

    def iter_parsed(
        self,
        reponames: Optional[RepositoryNameList] = None,
        logger: Logger = NoopLogger()
    ) -> Iterator[List[Package]]:
        def get_sources():
            for repository in self.repomgr.get_repositories(reponames):
                sources = self._get_parsed_chunk_paths(repository)
                if not sources:
                    logger.log(
                        'parsed packages for repository {} are missing, treating repository as empty'
                        .format(repository['desc']),
                        severity=Logger.ERROR)
                yield from sources

        with heap_deserializer(get_sources(),
                               lambda package: package.effname) as heap:
            for packageset in heap():
                packageset = PackagesetDeduplicate(packageset)

                yield packageset
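
Unlike Example #5, this variant treats default_maintainer as a tri-state: a string sets the maintainer explicitly, a missing or truthy value synthesizes a fallback address, and an explicit false disables the fallback. A self-contained restatement of that logic as a sketch:

def resolve_default_maintainers(repository):
    """Restates the tri-state logic from Example #8 (sketch, not repology code)."""
    default_maintainer = repository.get('default_maintainer', True)
    if isinstance(default_maintainer, str):
        return [default_maintainer]  # explicit address configured
    if default_maintainer:
        return ['fallback-mnt-{}@repology'.format(repository['name'])]
    return []  # fallback explicitly disabled


assert resolve_default_maintainers({'name': 'r'}) == ['fallback-mnt-r@repology']
assert resolve_default_maintainers({'name': 'r', 'default_maintainer': 'a@b'}) == ['a@b']
assert resolve_default_maintainers({'name': 'r', 'default_maintainer': False}) == []
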
Example #9
class RepositoryProcessor:
    def __init__(self, repomgr, statedir, parseddir, safety_checks=True):
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)

    def _get_state_path(self, repository):
        return os.path.join(self.statedir, repository['name'] + '.state')

    def _get_state_source_path(self, repository, source):
        return os.path.join(self._get_state_path(repository), source['name'].replace('/', '_'))

    def _get_parsed_path(self, repository):
        return os.path.join(self.parseddir, repository['name'] + '.parsed')

    def _get_parsed_chunk_paths(self, repository):
        dirpath = self._get_parsed_path(repository)
        return [
            os.path.join(dirpath, filename)
            for filename in os.listdir(dirpath)
        ] if os.path.isdir(dirpath) else []

    # source level private methods
    def _fetch_source(self, repository, update, source, logger):
        if 'fetcher' not in source:
            logger.log('fetching source {} not supported'.format(source['name']))
            return

        logger.log('fetching source {} started'.format(source['name']))

        self.fetcher_factory.SpawnWithKnownArgs(
            source['fetcher'], source
        ).fetch(
            self._get_state_source_path(repository, source),
            update=update,
            logger=logger.GetIndented()
        )

        logger.log('fetching source {} complete'.format(source['name']))

    def _iter_parse_source(self, repository, source, transformer, logger):
        def postprocess_parsed_packages(packages_iter):
            for package in packages_iter:
                if isinstance(package, PackageMaker):
                    # unwrap packagemaker
                    if not package.check_sanity(True):
                        continue

                    package = package.unwrap()
                else:
                    # XXX: compatibility shim for parsers still returning raw packages
                    if not package.name:
                        raise InconsistentPackage('encountered package with no name')

                    if not package.version:
                        # XXX: this currently fires on kdepim in dports; it's pretty fatal on
                        # one hand, but shouldn't stop whole repo from updating on another. In
                        # future, it should be logged as some kind of very serious repository
                        # update error
                        logger.log('package with empty version: {}'.format(package.name), severity=Logger.ERROR)
                        continue

                # fill repository-specific fields
                package.repo = repository['name']
                package.family = repository['family']

                if 'subrepo' in source:
                    package.subrepo = source['subrepo']

                if repository.get('shadow', False):
                    package.shadow = True

                if not package.maintainers:
                    if 'default_maintainer' in repository:
                        package.maintainers = [repository['default_maintainer']]
                    else:
                        package.maintainers = ['fallback-mnt-{}@repology'.format(repository['name'])]

                # transform
                if transformer:
                    transformer.process(package)

                # skip removed packages
                if package.HasFlag(PackageFlags.remove):
                    continue

                # postprocess
                def strip_flavor(flavor):
                    if flavor.startswith(package.effname + '-'):
                        return flavor[len(package.effname) + 1:]
                    return flavor

                package.flavors = sorted(set(map(strip_flavor, package.flavors)))

                # legacy sanity checking
                try:
                    package.CheckSanity(transformed=transformer is not None)
                except PackageSanityCheckFailure as err:
                    logger.log('sanity error: {}'.format(err), severity=Logger.ERROR)
                    raise
                except PackageSanityCheckProblem as err:
                    logger.log('sanity warning: {}'.format(err), severity=Logger.WARNING)

                package.Normalize()

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.SpawnWithKnownArgs(
                source['parser'], source
            ).iter_parse(
                self._get_state_source_path(repository, source),
                PackageFactory(logger)
            )
        )

    def _iter_parse_all_sources(self, repository, transformer, logger):
        for source in repository['sources']:
            logger.log('parsing source {} started'.format(source['name']))
            yield from self._iter_parse_source(repository, source, transformer, logger.GetIndented())
            logger.log('parsing source {} complete'.format(source['name']))

    # repository level private methods
    def _fetch(self, repository, update, logger):
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        for source in repository['sources']:
            if not os.path.isdir(self._get_state_path(repository)):
                os.mkdir(self._get_state_path(repository))
            self._fetch_source(repository, update, source, logger.GetIndented())

        logger.log('fetching complete')

    def _parse(self, repository, transformer, logger):
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        packages = []
        chunknum = 0
        num_packages = 0

        def flush_packages():
            nonlocal packages, chunknum

            if packages:
                packages = sorted(packages, key=lambda package: package.effname)
                serialize(packages, os.path.join(state_dir, str(chunknum)))
                packages = []
                chunknum += 1

        with atomic_dir(self._get_parsed_path(repository)) as state_dir:
            for package in self._iter_parse_all_sources(repository, transformer, logger):
                packages.append(package)
                num_packages += 1

                if len(packages) >= MAX_PACKAGES_PER_CHUNK:
                    flush_packages()

            flush_packages()

        if self.safety_checks and num_packages < repository['minpackages']:
            raise TooLittlePackages(num_packages, repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(num_packages))

    # public methods
    def fetch(self, reponames, update=True, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            self._fetch(repository, update, logger)

    def parse(self, reponames, transformer=None, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            self._parse(repository, transformer, logger)

    def iter_parse(self, reponames, transformer=None, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            yield from self._iter_parse_all_sources(repository, transformer, logger)

    def iter_parsed(self, reponames=None, logger=NoopLogger()):
        def get_sources():
            for repository in self.repomgr.GetRepositories(reponames):
                sources = self._get_parsed_chunk_paths(repository)
                if not sources:
                    logger.log('parsed packages for repository {} are missing, treating repository as empty'.format(repository['desc']), severity=Logger.ERROR)
                yield from sources

        with heap_deserializer(get_sources(), lambda package: package.effname) as heap:
            for packageset in heap():
                packageset = PackagesetDeduplicate(packageset)

                yield packageset
Example #10
class RepositoryProcessor:
    def __init__(self, repomgr: RepositoryManager, statedir: str, parseddir: str, safety_checks: bool = True) -> None:
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)

    def _get_state_path(self, repository: Repository) -> str:
        return os.path.join(self.statedir, repository.name + '.state')

    def _get_state_source_path(self, repository: Repository, source: Source) -> str:
        return os.path.join(self._get_state_path(repository), source.name.replace('/', '_'))

    def _get_parsed_path(self, repository: Repository) -> str:
        return os.path.join(self.parseddir, repository.name + '.parsed')

    def _get_parsed_chunk_paths(self, repository: Repository) -> list[str]:
        dirpath = self._get_parsed_path(repository)
        return [
            os.path.join(dirpath, filename)
            for filename in os.listdir(dirpath)
        ] if os.path.isdir(dirpath) else []

    # source level private methods
    def _fetch_source(self, repository: Repository, update: bool, source: Source, logger: Logger) -> bool:
        logger.log(f'fetching source {source.name} started')

        fetcher: Fetcher = self.fetcher_factory.spawn_with_known_args(
            source.fetcher['class'],
            source.fetcher
        )

        have_changes = fetcher.fetch(
            self._get_state_source_path(repository, source),
            update=update,
            logger=logger.get_indented()
        )

        logger.log(f'fetching source {source.name} complete' + ('' if have_changes else ' (no changes)'))

        return have_changes

    def _iter_parse_source(
        self,
        repository: Repository,
        source: Source,
        transformer: PackageTransformer | None,
        maintainermgr: MaintainerManager | None,
        logger: Logger
    ) -> Iterator[Package]:
        def postprocess_parsed_packages(packages_iter: Iterable[PackageMaker]) -> Iterator[Package]:
            for packagemaker in packages_iter:
                try:
                    package = packagemaker.spawn(
                        repo=repository.name,
                        family=repository.family,
                        subrepo=source.subrepo,
                        shadow=repository.shadow,
                        default_maintainer=repository.default_maintainer,
                    )
                except RuntimeError as e:
                    packagemaker.log(str(e), Logger.ERROR)
                    raise

                # transform
                if transformer:
                    transformer.process(package)

                # skip removed packages
                if package.has_flag(PackageFlags.REMOVE):
                    continue

                # postprocess flavors
                def strip_flavor(flavor: str) -> str:
                    return flavor.removeprefix(package.effname + '-')

                package.flavors = sorted(set(map(strip_flavor, package.flavors)))

                # add packagelinks
                packagelinks: list[tuple[int, str]] = []
                for pkglink in source.packagelinks + repository.packagelinks:
                    link_type = pkglink.type
                    try:
                        packagelinks.extend(
                            (link_type, url)
                            for url in format_package_links(package, pkglink.url)
                        )
                    except Exception as e:
                        packagemaker.log(f'cannot spawn package link from template "{pkglink.url}": {str(e)}', Logger.ERROR)
                        raise

                if package.links is None:
                    package.links = packagelinks
                else:
                    seen = set(package.links)
                    package.links.extend(link for link in packagelinks if link not in seen)

                # postprocess maintainers
                if maintainermgr and package.maintainers:
                    package.maintainers = [maintainer for maintainer in package.maintainers if not maintainermgr.is_hidden(maintainer)]

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.spawn_with_known_args(
                source.parser['class'],
                source.parser
            ).iter_parse(
                self._get_state_source_path(repository, source),
                PackageFactory(logger)
            )
        )

    def _iter_parse_all_sources(
        self,
        repository: Repository,
        transformer: PackageTransformer | None,
        maintainermgr: MaintainerManager | None,
        logger: Logger
    ) -> Iterator[Package]:
        for source in repository.sources:
            logger.log(f'parsing source {source.name} started')
            yield from self._iter_parse_source(repository, source, transformer, maintainermgr, logger.get_indented())
            logger.log(f'parsing source {source.name} complete')

    # repository level private methods
    def _fetch(self, repository: Repository, update: bool, logger: Logger) -> bool:
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        have_changes = False
        for source in repository.sources:
            if not os.path.isdir(self._get_state_path(repository)):
                os.mkdir(self._get_state_path(repository))
            have_changes |= self._fetch_source(repository, update, source, logger.get_indented())

        logger.log('fetching complete' + ('' if have_changes else ' (no changes)'))

        return have_changes

    def _parse(
        self,
        repository: Repository,
        transformer: PackageTransformer | None,
        maintainermgr: MaintainerManager | None,
        logger: Logger
    ) -> None:
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        with AtomicDir(self._get_parsed_path(repository)) as state_dir:
            serializer = ChunkedSerializer(state_dir.get_path(), MAX_PACKAGES_PER_CHUNK)

            serializer.serialize(self._iter_parse_all_sources(repository, transformer, maintainermgr, logger))

            if self.safety_checks and serializer.get_num_packages() < repository.minpackages:
                raise TooLittlePackages(serializer.get_num_packages(), repository.minpackages)

        logger.log('parsing complete, {} packages'.format(serializer.get_num_packages()))

    # public methods
    def fetch(self, reponames: RepositoryNameList, update: bool = True, logger: Logger = NoopLogger()) -> bool:
        have_changes = False

        for repository in self.repomgr.get_repositories(reponames):
            have_changes |= self._fetch(repository, update, logger)

        return have_changes

    def parse(
        self,
        reponames: RepositoryNameList,
        transformer: PackageTransformer | None = None,
        maintainermgr: MaintainerManager | None = None,
        logger: Logger = NoopLogger()
    ) -> None:
        for repository in self.repomgr.get_repositories(reponames):
            self._parse(repository, transformer, maintainermgr, logger)

    def iter_parse(
        self,
        reponames: RepositoryNameList,
        transformer: PackageTransformer | None = None,
        maintainermgr: MaintainerManager | None = None,
        logger: Logger = NoopLogger()
    ) -> Iterator[Package]:
        for repository in self.repomgr.get_repositories(reponames):
            yield from self._iter_parse_all_sources(repository, transformer, maintainermgr, logger)

    def iter_parsed(self, reponames: RepositoryNameList | None = None, logger: Logger = NoopLogger()) -> Iterator[list[Package]]:
        sources: list[str] = []
        for repository in self.repomgr.get_repositories(reponames):
            repo_sources = self._get_parsed_chunk_paths(repository)

            if not repo_sources:
                logger.log(f'parsed packages for repository {repository.desc} are missing, treating repository as empty', severity=Logger.WARNING)

            sources.extend(repo_sources)

        if sources:
            yield from map(packageset_deduplicate, heap_deserialize(sources))
        else:
            logger.log('no parsed packages found', severity=Logger.ERROR)
Example #11
# Copyright (C) 2016-2017 Dmitry Marakasov <*****@*****.**>
#
# This file is part of repology
#
# repology is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# repology is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with repology.  If not, see <http://www.gnu.org/licenses/>.

from repology.moduleutils import ClassFactory

Factory = ClassFactory(__name__, __file__, 'Fetcher')