def __init__(self, repomgr: RepositoryManager, statedir: str, parseddir: str, safety_checks: bool = True) -> None:
    self.repomgr = repomgr
    self.statedir = statedir
    self.parseddir = parseddir
    self.safety_checks = safety_checks

    self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
    self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)
def __init__(self, repomgr, statedir, parseddir, safety_checks=True):
    self.repomgr = repomgr
    self.statedir = statedir
    self.parseddir = parseddir
    self.safety_checks = safety_checks

    self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
    self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)
def __init__(self, repomgr, statedir, fetch_retries=3, fetch_retry_delay=30, safety_checks=True):
    self.repomgr = repomgr
    self.statedir = statedir
    self.fetch_retries = fetch_retries
    self.fetch_retry_delay = fetch_retry_delay
    self.safety_checks = safety_checks

    self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
    self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)
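# The variant above stores fetch_retries/fetch_retry_delay, but this snippet
# does not show where they are consumed. A minimal sketch of how such
# parameters could drive a retry loop around a single source fetch;
# fetch_once and the RuntimeError failure type are illustrative assumptions,
# not the project's actual code.
import time


def fetch_with_retries(fetch_once, fetch_retries=3, fetch_retry_delay=30, logger=None):
    for attempt in range(1, fetch_retries + 1):
        try:
            return fetch_once()
        except RuntimeError as e:  # assumed fetcher failure type
            if logger is not None:
                logger.log('fetch attempt {} failed: {}'.format(attempt, e))
            if attempt == fetch_retries:
                raise  # out of retries, propagate the last error
            time.sleep(fetch_retry_delay)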
class RepositoryProcessor:
    def __init__(self, repomgr, statedir, safety_checks=True):
        self.repomgr = repomgr
        self.statedir = statedir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)

    def __GetRepoPath(self, repository):
        return os.path.join(self.statedir, repository['name'] + '.state')

    def __GetSourcePath(self, repository, source):
        return os.path.join(self.__GetRepoPath(repository), source['name'].replace('/', '_'))

    def __GetSerializedPath(self, repository):
        return os.path.join(self.statedir, repository['name'] + '.packages')

    def __CheckRepositoryOutdatedness(self, repository, logger):
        if 'valid_till' in repository and datetime.date.today() >= repository['valid_till']:
            logger.log('repository {} has reached EoL, consider updating configs'.format(repository['name']), severity=Logger.WARNING)

    # Private methods which provide single actions on sources
    def __FetchSource(self, update, repository, source, logger):
        if 'fetcher' not in source:
            logger.log('fetching source {} not supported'.format(source['name']))
            return

        logger.log('fetching source {} started'.format(source['name']))

        self.fetcher_factory.SpawnWithKnownArgs(source['fetcher'], source).fetch(
            self.__GetSourcePath(repository, source),
            update=update,
            logger=logger.GetIndented()
        )

        logger.log('fetching source {} complete'.format(source['name']))

    def _iter_parse_source(self, repository, source, logger):
        def postprocess_parsed_packages(packages_iter):
            for package in packages_iter:
                if isinstance(package, PackageMaker):
                    # unwrap packagemaker
                    if not package.check_sanity(True):
                        continue
                    package = package.unwrap()
                else:
                    # XXX: compatibility shim for parsers still returning raw packages
                    if not package.name:
                        raise InconsistentPackage('encountered package with no name')

                    if not package.version:
                        # XXX: this currently fires on kdepim in dports; it's pretty fatal on
                        # one hand, but shouldn't stop whole repo from updating on another. In
                        # future, it should be logged as some kind of very serious repository
                        # update error
                        logger.log('package with empty version: {}'.format(package.name), severity=Logger.ERROR)
                        continue

                # fill subrepos
                if 'subrepo' in source:
                    package.subrepo = source['subrepo']

                # fill default maintainer
                if not package.maintainers:
                    if 'default_maintainer' in repository:
                        package.maintainers = [repository['default_maintainer']]
                    else:
                        package.maintainers = ['fallback-mnt-{}@repology'.format(repository['name'])]

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.SpawnWithKnownArgs(source['parser'], source).iter_parse(
                self.__GetSourcePath(repository, source),
                PackageFactory(logger)
            )
        )

    # Private methods which provide single actions on repos
    def __Fetch(self, update, repository, logger):
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        for source in repository['sources']:
            if not os.path.isdir(self.__GetRepoPath(repository)):
                os.mkdir(self.__GetRepoPath(repository))
            self.__FetchSource(update, repository, source, logger.GetIndented())

        logger.log('fetching complete')

    def _parse(self, repository, logger):
        logger.log('parsing started')

        packages = []

        for source in repository['sources']:
            logger.log('parsing source {} started'.format(source['name']))
            packages.extend(self._iter_parse_source(repository, source, logger.GetIndented()))
            logger.log('parsing source {} complete'.format(source['name']))

        logger.log('parsing complete, {} packages, deduplicating'.format(len(packages)))

        packages = PackagesetDeduplicate(packages)

        if self.safety_checks and len(packages) < repository['minpackages']:
            raise TooLittlePackages(len(packages), repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(len(packages)))

        return packages

    def __Transform(self, packages, transformer, repository, logger):
        logger.log('processing started')
        sanitylogger = logger.GetIndented()
        for package in packages:
            package.repo = repository['name']
            package.family = repository['family']

            if repository.get('shadow', False):
                package.shadow = True

            if transformer:
                transformer.Process(package)

            # strip leading project name from flavor
            def strip_flavor(flavor):
                if flavor.startswith(package.effname + '-'):
                    return flavor[len(package.effname) + 1:]
                return flavor

            package.flavors = sorted(set(map(strip_flavor, package.flavors)))

            try:
                package.CheckSanity(transformed=transformer is not None)
            except PackageSanityCheckFailure as err:
                sanitylogger.log('sanity error: {}'.format(err), severity=Logger.ERROR)
                raise
            except PackageSanityCheckProblem as err:
                sanitylogger.log('sanity warning: {}'.format(err), severity=Logger.WARNING)

            package.Normalize()

        # XXX: in future, ignored packages will not be dropped here, but
        # ignored in summary and version calculations, but shown in
        # package listing
        packages = [package for package in packages if not package.HasFlag(PackageFlags.remove)]

        logger.log('processing complete, {} packages, deduplicating'.format(len(packages)))

        packages = PackagesetDeduplicate(packages)

        if transformer:
            logger.log('processing complete, {} packages, sorting'.format(len(packages)))
            packages = sorted(packages, key=lambda package: package.effname)

        logger.log('processing complete, {} packages'.format(len(packages)))

        return packages

    def __Serialize(self, packages, path, repository, logger):
        tmppath = path + '.tmp'

        logger.log('saving started')

        with open(tmppath, 'wb') as outfile:
            pickler = pickle.Pickler(outfile, protocol=pickle.HIGHEST_PROTOCOL)
            pickler.fast = True  # deprecated, but I don't see any alternatives
            pickler.dump(len(packages))
            for package in packages:
                pickler.dump(package)

        os.replace(tmppath, path)

        logger.log('saving complete, {} packages'.format(len(packages)))

    def __Deserialize(self, path, repository, logger):
        packages = []

        logger.log('loading started')
        with open(path, 'rb') as infile:
            unpickler = pickle.Unpickler(infile)
            numpackages = unpickler.load()
            packages = [unpickler.load() for num in range(0, numpackages)]
            if packages and not packages[0].CheckFormat():
                raise StateFileFormatCheckProblem(path)

        logger.log('loading complete, {} packages'.format(len(packages)))

        return packages

    class StreamDeserializer:
        def __init__(self, path, logger):
            try:
                self.unpickler = pickle.Unpickler(open(path, 'rb'))
                self.count = self.unpickler.load()
            except FileNotFoundError:
                logger.log('parsed package data file {} does not exist, treating repository as empty'.format(path), severity=Logger.ERROR)
                self.count = 0

            self.current = None

            self.Get()

            if self.current and not self.current.CheckFormat():
                raise StateFileFormatCheckProblem(path)

        def Peek(self):
            return self.current

        def EOF(self):
            return self.current is None

        def Get(self):
            current = self.current

            if self.count == 0:
                self.current = None
            else:
                self.current = self.unpickler.load()
                self.count -= 1

            return current

    # Single repo methods
    def Fetch(self, reponame, update=True, logger=NoopLogger()):
        repository = self.repomgr.GetRepository(reponame)

        self.__CheckRepositoryOutdatedness(repository, logger)

        self.__Fetch(update, repository, logger)

    def Parse(self, reponame, transformer, logger=NoopLogger()):
        repository = self.repomgr.GetRepository(reponame)

        packages = self._parse(repository, logger)
        packages = self.__Transform(packages, transformer, repository, logger)

        return packages

    def ParseAndSerialize(self, reponame, transformer, logger=NoopLogger()):
        repository = self.repomgr.GetRepository(reponame)

        packages = self._parse(repository, logger)
        packages = self.__Transform(packages, transformer, repository, logger)
        self.__Serialize(packages, self.__GetSerializedPath(repository), repository, logger)

        return packages

    def Deserialize(self, reponame, logger=NoopLogger()):
        repository = self.repomgr.GetRepository(reponame)

        return self.__Deserialize(self.__GetSerializedPath(repository), repository, logger)

    # Multi repo methods
    def ParseMulti(self, reponames=None, transformer=None, logger=NoopLogger()):
        packages = []

        for repo in self.repomgr.GetRepositories(reponames):
            packages += self.Parse(repo['name'], transformer=transformer, logger=logger.GetPrefixed(repo['name'] + ': '))

        return packages

    def DeserializeMulti(self, reponames=None, logger=NoopLogger()):
        packages = []

        for repo in self.repomgr.GetRepositories(reponames):
            packages += self.Deserialize(repo['name'], logger=logger.GetPrefixed(repo['name'] + ': '))

        return packages

    def StreamDeserializeMulti(self, reponames=None, logger=NoopLogger()):
        deserializers = []
        for repo in self.repomgr.GetRepositories(reponames):
            deserializers.append(self.StreamDeserializer(self.__GetSerializedPath(repo), logger))

        while True:
            # remove EOFed repos
            deserializers = [ds for ds in deserializers if not ds.EOF()]

            # stop when all deserializers are empty
            if not deserializers:
                break

            # find lowest key (effname)
            thiskey = deserializers[0].Peek().effname
            for ds in deserializers[1:]:
                thiskey = min(thiskey, ds.Peek().effname)

            # fetch all packages with given key from all deserializers
            packageset = []
            for ds in deserializers:
                while not ds.EOF() and ds.Peek().effname == thiskey:
                    packageset.append(ds.Get())

            yield packageset
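# StreamDeserializeMulti above merges per-repository streams that are already
# sorted by effname, emitting one packageset per distinct effname. A minimal
# equivalent sketch over plain sorted iterables using only the standard
# library; the Pkg namedtuple is illustrative, not the real Package class.
import heapq
import itertools
from collections import namedtuple

Pkg = namedtuple('Pkg', ['effname', 'version'])


def merge_packagesets(*sorted_streams):
    # heapq.merge lazily k-way merges the already-sorted streams;
    # groupby then batches consecutive packages sharing an effname
    merged = heapq.merge(*sorted_streams, key=lambda pkg: pkg.effname)
    for _, group in itertools.groupby(merged, key=lambda pkg: pkg.effname):
        yield list(group)


# usage: each input stream must already be sorted by effname
a = [Pkg('bash', '5.0'), Pkg('zsh', '5.8')]
b = [Pkg('bash', '5.1'), Pkg('fish', '3.1')]
assert [ps[0].effname for ps in merge_packagesets(a, b)] == ['bash', 'fish', 'zsh']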
class RepositoryProcessor:
    def __init__(self, repomgr: RepositoryManager, statedir: str, parseddir: str, safety_checks: bool = True) -> None:
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)

    def _get_state_path(self, repository: RepositoryMetadata) -> str:
        return os.path.join(self.statedir, repository['name'] + '.state')

    def _get_state_source_path(self, repository: RepositoryMetadata, source: RepositoryMetadata) -> str:
        return os.path.join(self._get_state_path(repository), source['name'].replace('/', '_'))

    def _get_parsed_path(self, repository: RepositoryMetadata) -> str:
        return os.path.join(self.parseddir, repository['name'] + '.parsed')

    def _get_parsed_chunk_paths(self, repository: RepositoryMetadata) -> List[str]:
        dirpath = self._get_parsed_path(repository)
        return [
            os.path.join(dirpath, filename)
            for filename in os.listdir(dirpath)
        ] if os.path.isdir(dirpath) else []

    # source level private methods
    def _fetch_source(self, repository: RepositoryMetadata, update: bool, source: RepositoryMetadata, logger: Logger) -> bool:
        if 'fetcher' not in source:
            logger.log('fetching source {} not supported'.format(source['name']))
            return False

        logger.log('fetching source {} started'.format(source['name']))

        fetcher: Fetcher = self.fetcher_factory.spawn_with_known_args(source['fetcher'], source)

        have_changes = fetcher.fetch(
            self._get_state_source_path(repository, source),
            update=update,
            logger=logger.get_indented()
        )

        logger.log('fetching source {} complete'.format(source['name']) + ('' if have_changes else ' (no changes)'))

        return have_changes

    def _iter_parse_source(self, repository: RepositoryMetadata, source: RepositoryMetadata, transformer: Optional[PackageTransformer], logger: Logger) -> Iterator[Package]:
        def postprocess_parsed_packages(packages_iter: Iterable[PackageMaker]) -> Iterator[Package]:
            for packagemaker in packages_iter:
                try:
                    package = packagemaker.spawn(
                        repo=repository['name'],
                        family=repository['family'],
                        subrepo=source.get('subrepo'),
                        shadow=repository.get('shadow', False),
                        default_maintainer=repository.get('default_maintainer'),
                    )
                except RuntimeError as e:
                    packagemaker.log(str(e), Logger.ERROR)
                    raise

                # transform
                if transformer:
                    transformer.process(package)

                # skip removed packages
                if package.has_flag(PackageFlags.REMOVE):
                    continue

                # postprocess flavors
                def strip_flavor(flavor: str) -> str:
                    return flavor.removeprefix(package.effname + '-')

                package.flavors = sorted(set(map(strip_flavor, package.flavors)))

                # XXX: arch is not used anywhere yet, and until #711 is implemented,
                # it just introduces package duplicates; it's a crude solution, but
                # just drop it here
                package.arch = None

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.spawn_with_known_args(source['parser'], source).iter_parse(
                self._get_state_source_path(repository, source),
                PackageFactory(logger),
                transformer
            )
        )

    def _iter_parse_all_sources(self, repository: RepositoryMetadata, transformer: Optional[PackageTransformer], logger: Logger) -> Iterator[Package]:
        for source in repository['sources']:
            logger.log('parsing source {} started'.format(source['name']))
            yield from self._iter_parse_source(repository, source, transformer, logger.get_indented())
            logger.log('parsing source {} complete'.format(source['name']))

    # repository level private methods
    def _fetch(self, repository: RepositoryMetadata, update: bool, logger: Logger) -> bool:
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        have_changes = False

        for source in repository['sources']:
            if not os.path.isdir(self._get_state_path(repository)):
                os.mkdir(self._get_state_path(repository))
            have_changes |= self._fetch_source(repository, update, source, logger.get_indented())

        logger.log('fetching complete' + ('' if have_changes else ' (no changes)'))

        return have_changes

    def _parse(self, repository: RepositoryMetadata, transformer: Optional[PackageTransformer], logger: Logger) -> None:
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        with AtomicDir(self._get_parsed_path(repository)) as state_dir:
            serializer = ChunkedSerializer(state_dir.get_path(), MAX_PACKAGES_PER_CHUNK)

            serializer.serialize(self._iter_parse_all_sources(repository, transformer, logger))

            if self.safety_checks and serializer.get_num_packages() < repository['minpackages']:
                raise TooLittlePackages(serializer.get_num_packages(), repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(serializer.get_num_packages()))

    # public methods
    def fetch(self, reponames: RepositoryNameList, update: bool = True, logger: Logger = NoopLogger()) -> bool:
        have_changes = False

        for repository in self.repomgr.get_repositories(reponames):
            have_changes |= self._fetch(repository, update, logger)

        return have_changes

    def parse(self, reponames: RepositoryNameList, transformer: Optional[PackageTransformer] = None, logger: Logger = NoopLogger()) -> None:
        for repository in self.repomgr.get_repositories(reponames):
            self._parse(repository, transformer, logger)

    def iter_parse(self, reponames: RepositoryNameList, transformer: Optional[PackageTransformer] = None, logger: Logger = NoopLogger()) -> Iterator[Package]:
        for repository in self.repomgr.get_repositories(reponames):
            yield from self._iter_parse_all_sources(repository, transformer, logger)

    def iter_parsed(self, reponames: Optional[RepositoryNameList] = None, logger: Logger = NoopLogger()) -> Iterator[List[Package]]:
        sources: List[str] = []

        for repository in self.repomgr.get_repositories(reponames):
            repo_sources = self._get_parsed_chunk_paths(repository)

            if not repo_sources:
                logger.log('parsed packages for repository {} are missing, treating repository as empty'.format(repository['desc']), severity=Logger.WARNING)

            sources.extend(repo_sources)

        if sources:
            yield from map(packageset_deduplicate, heap_deserialize(sources))
        else:
            logger.log('no parsed packages found', severity=Logger.ERROR)
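# A usage sketch of the public API above: fetch state for some repositories,
# parse it into chunked files, then stream deduplicated packagesets back.
# The directory paths here are illustrative assumptions.
def update_and_iterate(repomgr, reponames):
    processor = RepositoryProcessor(repomgr, 'state', 'parsed')

    # parse only when fetching actually brought changes
    if processor.fetch(reponames, update=True):
        processor.parse(reponames)

    # iter_parsed yields one list of Package objects per effname
    yield from processor.iter_parsed(reponames)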
# Copyright (C) 2016 Dmitry Marakasov <*****@*****.**>
#
# This file is part of repology
#
# repology is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# repology is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with repology. If not, see <http://www.gnu.org/licenses/>.

from repology.moduleutils import ClassFactory

Factory = ClassFactory(__name__, __file__, 'Parser')
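# ClassFactory is repology's plugin loader; the call above registers every
# class in this package whose name ends in 'Parser'. A simplified sketch of
# the general idea (an illustrative reimplementation, not the actual
# repology.moduleutils code):
import importlib
import inspect
import pkgutil


def collect_classes(package_name, suffix):
    registry = {}
    package = importlib.import_module(package_name)
    # walk the package's modules and pick up matching classes
    for modinfo in pkgutil.iter_modules(package.__path__):
        module = importlib.import_module(package_name + '.' + modinfo.name)
        for name, cls in inspect.getmembers(module, inspect.isclass):
            if name.endswith(suffix):
                registry[name] = cls
    return registry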
class RepositoryProcessor:
    def __init__(self, repomgr, statedir, parseddir, safety_checks=True):
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)

    def _get_state_path(self, repository):
        return os.path.join(self.statedir, repository['name'] + '.state')

    def _get_state_source_path(self, repository, source):
        return os.path.join(self._get_state_path(repository), source['name'].replace('/', '_'))

    def _get_parsed_path(self, repository):
        return os.path.join(self.parseddir, repository['name'] + '.parsed')

    def _get_parsed_chunk_paths(self, repository):
        dirpath = self._get_parsed_path(repository)
        return [
            os.path.join(dirpath, filename)
            for filename in os.listdir(dirpath)
        ] if os.path.isdir(dirpath) else []

    # source level private methods
    def _fetch_source(self, repository, update, source, logger):
        if 'fetcher' not in source:
            logger.log('fetching source {} not supported'.format(source['name']))
            return

        logger.log('fetching source {} started'.format(source['name']))

        self.fetcher_factory.SpawnWithKnownArgs(source['fetcher'], source).fetch(
            self._get_state_source_path(repository, source),
            update=update,
            logger=logger.GetIndented()
        )

        logger.log('fetching source {} complete'.format(source['name']))

    def _iter_parse_source(self, repository, source, transformer, logger):
        def postprocess_parsed_packages(packages_iter):
            for package in packages_iter:
                # unwrap packagemaker
                if not package.check_sanity(verbose=True):
                    continue

                package = package.unwrap()

                # fill repository-specific fields
                package.repo = repository['name']
                package.family = repository['family']

                if 'subrepo' in source:
                    package.subrepo = source['subrepo']

                if repository.get('shadow', False):
                    package.shadow = True

                if not package.maintainers:
                    if 'default_maintainer' in repository:
                        package.maintainers = [repository['default_maintainer']]
                    else:
                        package.maintainers = ['fallback-mnt-{}@repology'.format(repository['name'])]

                # transform
                if transformer:
                    transformer.process(package)

                # skip removed packages
                if package.HasFlag(PackageFlags.remove):
                    continue

                # postprocess
                def strip_flavor(flavor):
                    if flavor.startswith(package.effname + '-'):
                        return flavor[len(package.effname) + 1:]
                    return flavor

                package.flavors = sorted(set(map(strip_flavor, package.flavors)))

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.SpawnWithKnownArgs(source['parser'], source).iter_parse(
                self._get_state_source_path(repository, source),
                PackageFactory(logger)
            )
        )

    def _iter_parse_all_sources(self, repository, transformer, logger):
        for source in repository['sources']:
            logger.log('parsing source {} started'.format(source['name']))
            yield from self._iter_parse_source(repository, source, transformer, logger.GetIndented())
            logger.log('parsing source {} complete'.format(source['name']))

    # repository level private methods
    def _fetch(self, repository, update, logger):
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        for source in repository['sources']:
            if not os.path.isdir(self._get_state_path(repository)):
                os.mkdir(self._get_state_path(repository))
            self._fetch_source(repository, update, source, logger.GetIndented())

        logger.log('fetching complete')

    def _parse(self, repository, transformer, logger):
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        packages = []
        chunknum = 0
        num_packages = 0

        def flush_packages():
            nonlocal packages, chunknum

            if packages:
                packages = sorted(packages, key=lambda package: package.effname)
                serialize(packages, os.path.join(state_dir, str(chunknum)))
                packages = []
                chunknum += 1

        with atomic_dir(self._get_parsed_path(repository)) as state_dir:
            for package in self._iter_parse_all_sources(repository, transformer, logger):
                packages.append(package)
                num_packages += 1

                if len(packages) >= MAX_PACKAGES_PER_CHUNK:
                    flush_packages()

            flush_packages()

        if self.safety_checks and num_packages < repository['minpackages']:
            raise TooLittlePackages(num_packages, repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(num_packages))

    # public methods
    def fetch(self, reponames, update=True, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            self._fetch(repository, update, logger)

    def parse(self, reponames, transformer=None, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            self._parse(repository, transformer, logger)

    def iter_parse(self, reponames, transformer=None, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            yield from self._iter_parse_all_sources(repository, transformer, logger)

    def iter_parsed(self, reponames=None, logger=NoopLogger()):
        def get_sources():
            for repository in self.repomgr.GetRepositories(reponames):
                sources = self._get_parsed_chunk_paths(repository)
                if not sources:
                    logger.log('parsed packages for repository {} are missing, treating repository as empty'.format(repository['desc']), severity=Logger.ERROR)
                yield from sources

        with heap_deserializer(get_sources(), lambda package: package.effname) as heap:
            for packageset in heap():
                packageset = PackagesetDeduplicate(packageset)
                yield packageset
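# serialize() used by flush_packages above is an external helper. A plausible
# minimal implementation, modeled on the count-prefixed pickle stream the
# older __Serialize method used (an assumption, not the verified on-disk
# format):
import pickle


def serialize(packages, path):
    with open(path, 'wb') as outfile:
        pickler = pickle.Pickler(outfile, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(len(packages))  # count prefix lets readers stream N loads
        for package in packages:
            pickler.dump(package)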
class RepositoryProcessor:
    def __init__(self, repomgr: RepositoryManager, statedir: str, parseddir: str, safety_checks: bool = True) -> None:
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)

    def _get_state_path(self, repository: RepositoryMetadata) -> str:
        return os.path.join(self.statedir, repository['name'] + '.state')

    def _get_state_source_path(self, repository: RepositoryMetadata, source: RepositoryMetadata) -> str:
        return os.path.join(self._get_state_path(repository), source['name'].replace('/', '_'))

    def _get_parsed_path(self, repository: RepositoryMetadata) -> str:
        return os.path.join(self.parseddir, repository['name'] + '.parsed')

    def _get_parsed_chunk_paths(self, repository: RepositoryMetadata) -> List[str]:
        dirpath = self._get_parsed_path(repository)
        return [
            os.path.join(dirpath, filename)
            for filename in os.listdir(dirpath)
        ] if os.path.isdir(dirpath) else []

    # source level private methods
    def _fetch_source(self, repository: RepositoryMetadata, update: bool, source: RepositoryMetadata, logger: Logger) -> bool:
        if 'fetcher' not in source:
            logger.log('fetching source {} not supported'.format(source['name']))
            return False

        logger.log('fetching source {} started'.format(source['name']))

        fetcher: Fetcher = self.fetcher_factory.spawn_with_known_args(source['fetcher'], source)

        have_changes = fetcher.fetch(
            self._get_state_source_path(repository, source),
            update=update,
            logger=logger.get_indented()
        )

        logger.log('fetching source {} complete'.format(source['name']) + ('' if have_changes else ' (no changes)'))

        return have_changes

    def _iter_parse_source(self, repository: RepositoryMetadata, source: RepositoryMetadata, transformer: Optional[PackageTransformer], logger: Logger) -> Iterator[Package]:
        def postprocess_parsed_packages(packages_iter: Iterable[PackageMaker]) -> Iterator[Package]:
            for packagemaker in packages_iter:
                # unwrap packagemaker
                if not packagemaker.check_sanity(verbose=True):
                    continue

                package = packagemaker.unwrap()

                # fill repository-specific fields
                package.repo = repository['name']
                package.family = repository['family']

                if 'subrepo' in source:
                    package.subrepo = source['subrepo']

                if repository.get('shadow', False):
                    package.shadow = True

                if not package.maintainers:
                    default_maintainer = repository.get('default_maintainer', True)
                    if isinstance(default_maintainer, str):
                        package.maintainers = [repository['default_maintainer']]
                    elif default_maintainer:
                        package.maintainers = ['fallback-mnt-{}@repology'.format(repository['name'])]

                # transform
                if transformer:
                    transformer.process(package)

                # skip removed packages
                if package.HasFlag(PackageFlags.remove):
                    continue

                # postprocess
                def strip_flavor(flavor: str) -> str:
                    if flavor.startswith(package.effname + '-'):
                        return flavor[len(package.effname) + 1:]
                    return flavor

                package.flavors = sorted(set(map(strip_flavor, package.flavors)))

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.spawn_with_known_args(source['parser'], source).iter_parse(
                self._get_state_source_path(repository, source),
                PackageFactory(logger),
                transformer
            )
        )

    def _iter_parse_all_sources(self, repository: RepositoryMetadata, transformer: Optional[PackageTransformer], logger: Logger) -> Iterator[Package]:
        for source in repository['sources']:
            logger.log('parsing source {} started'.format(source['name']))
            yield from self._iter_parse_source(repository, source, transformer, logger.get_indented())
            logger.log('parsing source {} complete'.format(source['name']))

    # repository level private methods
    def _fetch(self, repository: RepositoryMetadata, update: bool, logger: Logger) -> bool:
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        have_changes = False

        for source in repository['sources']:
            if not os.path.isdir(self._get_state_path(repository)):
                os.mkdir(self._get_state_path(repository))
            have_changes |= self._fetch_source(repository, update, source, logger.get_indented())

        logger.log('fetching complete' + ('' if have_changes else ' (no changes)'))

        return have_changes

    def _parse(self, repository: RepositoryMetadata, transformer: Optional[PackageTransformer], logger: Logger) -> None:
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        with AtomicDir(self._get_parsed_path(repository)) as state_dir:
            serializer = ChunkedSerializer(state_dir.get_path(), MAX_PACKAGES_PER_CHUNK)

            serializer.serialize(self._iter_parse_all_sources(repository, transformer, logger))

            if self.safety_checks and serializer.get_num_packages() < repository['minpackages']:
                raise TooLittlePackages(serializer.get_num_packages(), repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(serializer.get_num_packages()))

    # public methods
    def fetch(self, reponames: RepositoryNameList, update: bool = True, logger: Logger = NoopLogger()) -> bool:
        have_changes = False

        for repository in self.repomgr.get_repositories(reponames):
            have_changes |= self._fetch(repository, update, logger)

        return have_changes

    def parse(self, reponames: RepositoryNameList, transformer: Optional[PackageTransformer] = None, logger: Logger = NoopLogger()) -> None:
        for repository in self.repomgr.get_repositories(reponames):
            self._parse(repository, transformer, logger)

    def iter_parse(self, reponames: RepositoryNameList, transformer: Optional[PackageTransformer] = None, logger: Logger = NoopLogger()) -> Iterator[Package]:
        for repository in self.repomgr.get_repositories(reponames):
            yield from self._iter_parse_all_sources(repository, transformer, logger)

    def iter_parsed(self, reponames: Optional[RepositoryNameList] = None, logger: Logger = NoopLogger()) -> Iterator[List[Package]]:
        def get_sources():
            for repository in self.repomgr.get_repositories(reponames):
                sources = self._get_parsed_chunk_paths(repository)
                if not sources:
                    logger.log('parsed packages for repository {} are missing, treating repository as empty'.format(repository['desc']), severity=Logger.ERROR)
                yield from sources

        with heap_deserializer(get_sources(), lambda package: package.effname) as heap:
            for packageset in heap():
                packageset = PackagesetDeduplicate(packageset)
                yield packageset
class RepositoryProcessor:
    def __init__(self, repomgr, statedir, parseddir, safety_checks=True):
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)

    def _get_state_path(self, repository):
        return os.path.join(self.statedir, repository['name'] + '.state')

    def _get_state_source_path(self, repository, source):
        return os.path.join(self._get_state_path(repository), source['name'].replace('/', '_'))

    def _get_parsed_path(self, repository):
        return os.path.join(self.parseddir, repository['name'] + '.parsed')

    def _get_parsed_chunk_paths(self, repository):
        dirpath = self._get_parsed_path(repository)
        return [
            os.path.join(dirpath, filename)
            for filename in os.listdir(dirpath)
        ] if os.path.isdir(dirpath) else []

    # source level private methods
    def _fetch_source(self, repository, update, source, logger):
        if 'fetcher' not in source:
            logger.log('fetching source {} not supported'.format(source['name']))
            return

        logger.log('fetching source {} started'.format(source['name']))

        self.fetcher_factory.SpawnWithKnownArgs(
            source['fetcher'], source
        ).fetch(
            self._get_state_source_path(repository, source),
            update=update,
            logger=logger.GetIndented()
        )

        logger.log('fetching source {} complete'.format(source['name']))

    def _iter_parse_source(self, repository, source, transformer, logger):
        def postprocess_parsed_packages(packages_iter):
            for package in packages_iter:
                if isinstance(package, PackageMaker):
                    # unwrap packagemaker
                    if not package.check_sanity(True):
                        continue
                    package = package.unwrap()
                else:
                    # XXX: compatibility shim for parsers still returning raw packages
                    if not package.name:
                        raise InconsistentPackage('encountered package with no name')

                    if not package.version:
                        # XXX: this currently fires on kdepim in dports; it's pretty fatal on
                        # one hand, but shouldn't stop whole repo from updating on another. In
                        # future, it should be logged as some kind of very serious repository
                        # update error
                        logger.log('package with empty version: {}'.format(package.name), severity=Logger.ERROR)
                        continue

                # fill repository-specific fields
                package.repo = repository['name']
                package.family = repository['family']

                if 'subrepo' in source:
                    package.subrepo = source['subrepo']

                if repository.get('shadow', False):
                    package.shadow = True

                if not package.maintainers:
                    if 'default_maintainer' in repository:
                        package.maintainers = [repository['default_maintainer']]
                    else:
                        package.maintainers = ['fallback-mnt-{}@repology'.format(repository['name'])]

                # transform
                if transformer:
                    transformer.process(package)

                # skip removed packages
                if package.HasFlag(PackageFlags.remove):
                    continue

                # postprocess
                def strip_flavor(flavor):
                    if flavor.startswith(package.effname + '-'):
                        return flavor[len(package.effname) + 1:]
                    return flavor

                package.flavors = sorted(set(map(strip_flavor, package.flavors)))

                # legacy sanity checking
                try:
                    package.CheckSanity(transformed=transformer is not None)
                except PackageSanityCheckFailure as err:
                    logger.log('sanity error: {}'.format(err), severity=Logger.ERROR)
                    raise
                except PackageSanityCheckProblem as err:
                    logger.log('sanity warning: {}'.format(err), severity=Logger.WARNING)

                package.Normalize()

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.SpawnWithKnownArgs(
                source['parser'], source
            ).iter_parse(
                self._get_state_source_path(repository, source),
                PackageFactory(logger)
            )
        )

    def _iter_parse_all_sources(self, repository, transformer, logger):
        for source in repository['sources']:
            logger.log('parsing source {} started'.format(source['name']))
            yield from self._iter_parse_source(repository, source, transformer, logger.GetIndented())
            logger.log('parsing source {} complete'.format(source['name']))

    # repository level private methods
    def _fetch(self, repository, update, logger):
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        for source in repository['sources']:
            if not os.path.isdir(self._get_state_path(repository)):
                os.mkdir(self._get_state_path(repository))
            self._fetch_source(repository, update, source, logger.GetIndented())

        logger.log('fetching complete')

    def _parse(self, repository, transformer, logger):
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        packages = []
        chunknum = 0
        num_packages = 0

        def flush_packages():
            nonlocal packages, chunknum

            if packages:
                packages = sorted(packages, key=lambda package: package.effname)
                serialize(packages, os.path.join(state_dir, str(chunknum)))
                packages = []
                chunknum += 1

        with atomic_dir(self._get_parsed_path(repository)) as state_dir:
            for package in self._iter_parse_all_sources(repository, transformer, logger):
                packages.append(package)
                num_packages += 1

                if len(packages) >= MAX_PACKAGES_PER_CHUNK:
                    flush_packages()

            flush_packages()

        if self.safety_checks and num_packages < repository['minpackages']:
            raise TooLittlePackages(num_packages, repository['minpackages'])

        logger.log('parsing complete, {} packages'.format(num_packages))

    # public methods
    def fetch(self, reponames, update=True, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            self._fetch(repository, update, logger)

    def parse(self, reponames, transformer=None, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            self._parse(repository, transformer, logger)

    def iter_parse(self, reponames, transformer=None, logger=NoopLogger()):
        for repository in self.repomgr.GetRepositories(reponames):
            yield from self._iter_parse_all_sources(repository, transformer, logger)

    def iter_parsed(self, reponames=None, logger=NoopLogger()):
        def get_sources():
            for repository in self.repomgr.GetRepositories(reponames):
                sources = self._get_parsed_chunk_paths(repository)
                if not sources:
                    logger.log('parsed packages for repository {} are missing, treating repository as empty'.format(repository['desc']), severity=Logger.ERROR)
                yield from sources

        with heap_deserializer(get_sources(), lambda package: package.effname) as heap:
            for packageset in heap():
                packageset = PackagesetDeduplicate(packageset)
                yield packageset
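# atomic_dir/AtomicDir is an external helper: the parsed state is built in a
# temporary directory and swapped into place only on success, so readers never
# observe a half-written state. A simplified sketch under that assumption;
# note the remove-then-rename swap below is not truly atomic, and the real
# helper presumably handles crash windows more carefully.
import contextlib
import os
import shutil


@contextlib.contextmanager
def atomic_dir(path):
    tmppath = path + '.tmp'
    if os.path.exists(tmppath):
        shutil.rmtree(tmppath)  # discard leftovers from an interrupted run
    os.mkdir(tmppath)
    yield tmppath
    if os.path.exists(path):
        shutil.rmtree(path)
    os.rename(tmppath, path)  # publish the completed state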
class RepositoryProcessor:
    def __init__(self, repomgr: RepositoryManager, statedir: str, parseddir: str, safety_checks: bool = True) -> None:
        self.repomgr = repomgr
        self.statedir = statedir
        self.parseddir = parseddir
        self.safety_checks = safety_checks

        self.fetcher_factory = ClassFactory('repology.fetchers.fetchers', superclass=Fetcher)
        self.parser_factory = ClassFactory('repology.parsers.parsers', superclass=Parser)

    def _get_state_path(self, repository: Repository) -> str:
        return os.path.join(self.statedir, repository.name + '.state')

    def _get_state_source_path(self, repository: Repository, source: Source) -> str:
        return os.path.join(self._get_state_path(repository), source.name.replace('/', '_'))

    def _get_parsed_path(self, repository: Repository) -> str:
        return os.path.join(self.parseddir, repository.name + '.parsed')

    def _get_parsed_chunk_paths(self, repository: Repository) -> list[str]:
        dirpath = self._get_parsed_path(repository)
        return [
            os.path.join(dirpath, filename)
            for filename in os.listdir(dirpath)
        ] if os.path.isdir(dirpath) else []

    # source level private methods
    def _fetch_source(self, repository: Repository, update: bool, source: Source, logger: Logger) -> bool:
        logger.log(f'fetching source {source.name} started')

        fetcher: Fetcher = self.fetcher_factory.spawn_with_known_args(
            source.fetcher['class'],
            source.fetcher
        )

        have_changes = fetcher.fetch(
            self._get_state_source_path(repository, source),
            update=update,
            logger=logger.get_indented()
        )

        logger.log(f'fetching source {source.name} complete' + ('' if have_changes else ' (no changes)'))

        return have_changes

    def _iter_parse_source(
        self,
        repository: Repository,
        source: Source,
        transformer: PackageTransformer | None,
        maintainermgr: MaintainerManager | None,
        logger: Logger
    ) -> Iterator[Package]:
        def postprocess_parsed_packages(packages_iter: Iterable[PackageMaker]) -> Iterator[Package]:
            for packagemaker in packages_iter:
                try:
                    package = packagemaker.spawn(
                        repo=repository.name,
                        family=repository.family,
                        subrepo=source.subrepo,
                        shadow=repository.shadow,
                        default_maintainer=repository.default_maintainer,
                    )
                except RuntimeError as e:
                    packagemaker.log(str(e), Logger.ERROR)
                    raise

                # transform
                if transformer:
                    transformer.process(package)

                # skip removed packages
                if package.has_flag(PackageFlags.REMOVE):
                    continue

                # postprocess flavors
                def strip_flavor(flavor: str) -> str:
                    return flavor.removeprefix(package.effname + '-')

                package.flavors = sorted(set(map(strip_flavor, package.flavors)))

                # add packagelinks
                packagelinks: list[tuple[int, str]] = []
                for pkglink in source.packagelinks + repository.packagelinks:
                    link_type = pkglink.type
                    try:
                        packagelinks.extend(
                            (link_type, url)
                            for url in format_package_links(package, pkglink.url)
                        )
                    except Exception as e:
                        packagemaker.log(f'cannot spawn package link from template "{pkglink.url}": {str(e)}', Logger.ERROR)
                        raise

                if package.links is None:
                    package.links = packagelinks
                else:
                    seen = set(package.links)
                    package.links.extend(link for link in packagelinks if link not in seen)

                # postprocess maintainers
                if maintainermgr and package.maintainers:
                    package.maintainers = [maintainer for maintainer in package.maintainers if not maintainermgr.is_hidden(maintainer)]

                yield package

        return postprocess_parsed_packages(
            self.parser_factory.spawn_with_known_args(
                source.parser['class'],
                source.parser
            ).iter_parse(
                self._get_state_source_path(repository, source),
                PackageFactory(logger)
            )
        )

    def _iter_parse_all_sources(
        self,
        repository: Repository,
        transformer: PackageTransformer | None,
        maintainermgr: MaintainerManager | None,
        logger: Logger
    ) -> Iterator[Package]:
        for source in repository.sources:
            logger.log(f'parsing source {source.name} started')
            yield from self._iter_parse_source(repository, source, transformer, maintainermgr, logger.get_indented())
            logger.log(f'parsing source {source.name} complete')

    # repository level private methods
    def _fetch(self, repository: Repository, update: bool, logger: Logger) -> bool:
        logger.log('fetching started')

        if not os.path.isdir(self.statedir):
            os.mkdir(self.statedir)

        have_changes = False

        for source in repository.sources:
            if not os.path.isdir(self._get_state_path(repository)):
                os.mkdir(self._get_state_path(repository))
            have_changes |= self._fetch_source(repository, update, source, logger.get_indented())

        logger.log('fetching complete' + ('' if have_changes else ' (no changes)'))

        return have_changes

    def _parse(
        self,
        repository: Repository,
        transformer: PackageTransformer | None,
        maintainermgr: MaintainerManager | None,
        logger: Logger
    ) -> None:
        logger.log('parsing started')

        if not os.path.isdir(self.parseddir):
            os.mkdir(self.parseddir)

        with AtomicDir(self._get_parsed_path(repository)) as state_dir:
            serializer = ChunkedSerializer(state_dir.get_path(), MAX_PACKAGES_PER_CHUNK)

            serializer.serialize(self._iter_parse_all_sources(repository, transformer, maintainermgr, logger))

            if self.safety_checks and serializer.get_num_packages() < repository.minpackages:
                raise TooLittlePackages(serializer.get_num_packages(), repository.minpackages)

        logger.log('parsing complete, {} packages'.format(serializer.get_num_packages()))

    # public methods
    def fetch(self, reponames: RepositoryNameList, update: bool = True, logger: Logger = NoopLogger()) -> bool:
        have_changes = False

        for repository in self.repomgr.get_repositories(reponames):
            have_changes |= self._fetch(repository, update, logger)

        return have_changes

    def parse(
        self,
        reponames: RepositoryNameList,
        transformer: PackageTransformer | None = None,
        maintainermgr: MaintainerManager | None = None,
        logger: Logger = NoopLogger()
    ) -> None:
        for repository in self.repomgr.get_repositories(reponames):
            self._parse(repository, transformer, maintainermgr, logger)

    def iter_parse(
        self,
        reponames: RepositoryNameList,
        transformer: PackageTransformer | None = None,
        maintainermgr: MaintainerManager | None = None,
        logger: Logger = NoopLogger()
    ) -> Iterator[Package]:
        for repository in self.repomgr.get_repositories(reponames):
            yield from self._iter_parse_all_sources(repository, transformer, maintainermgr, logger)

    def iter_parsed(self, reponames: RepositoryNameList | None = None, logger: Logger = NoopLogger()) -> Iterator[list[Package]]:
        sources: list[str] = []

        for repository in self.repomgr.get_repositories(reponames):
            repo_sources = self._get_parsed_chunk_paths(repository)

            if not repo_sources:
                logger.log(f'parsed packages for repository {repository.desc} are missing, treating repository as empty', severity=Logger.WARNING)

            sources.extend(repo_sources)

        if sources:
            yield from map(packageset_deduplicate, heap_deserialize(sources))
        else:
            logger.log('no parsed packages found', severity=Logger.ERROR)
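# format_package_links above is an external template expander. A hedged sketch
# of the behavior the packagelinks loop relies on: substituting package fields
# into a URL template to produce zero or more concrete URLs. The {name}
# placeholder syntax and the skip-on-missing-field rule are assumptions for
# illustration only.
def format_links(fields, url_template):
    try:
        yield url_template.format(**fields)
    except KeyError:
        # a template referencing a field this package lacks yields no link
        return


# usage with a hypothetical package field mapping
assert list(format_links({'name': 'bash'}, 'https://example.org/{name}')) == ['https://example.org/bash']
assert list(format_links({}, 'https://example.org/{name}')) == []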
# Copyright (C) 2016-2017 Dmitry Marakasov <*****@*****.**>
#
# This file is part of repology
#
# repology is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# repology is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with repology. If not, see <http://www.gnu.org/licenses/>.

from repology.moduleutils import ClassFactory

Factory = ClassFactory(__name__, __file__, 'Fetcher')