import os
import sys
import glob
import gzip
import yaml
import tarfile
import traceback
import multiprocessing as mp
from functools import partial
from collections import defaultdict

# Project-local helpers used below (DataCache, MetadataExtractor, IconHandler,
# UbuntuLangpackHandler, ContentsListIconFinder, load_generator_config,
# read_packages_dict_from_file, parse_contents_file, get_pkg_id,
# get_dep11_header, safe_move_file, extract_metadata, log) come from the
# generator's own modules; their import lines are not part of this excerpt.


class DEP11Generator:
    def __init__(self):
        pass

    def initialize(self, dep11_dir):
        dep11_dir = os.path.abspath(dep11_dir)

        conf = load_generator_config(dep11_dir)
        if not conf:
            return False

        self._dep11_url = conf.get("MediaBaseUrl")
        self._icon_sizes = conf.get("IconSizes")
        if not self._icon_sizes:
            self._icon_sizes = ["128x128", "64x64"]

        self._archive_root = conf.get("ArchiveRoot")

        cache_dir = os.path.join(dep11_dir, "cache")
        if conf.get("CacheDir"):
            cache_dir = conf.get("CacheDir")

        self._export_dir = os.path.join(dep11_dir, "export")
        if conf.get("ExportDir"):
            self._export_dir = conf.get("ExportDir")

        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        if not os.path.exists(self._export_dir):
            os.makedirs(self._export_dir)

        self._suites_data = conf['Suites']

        self._distro_name = conf.get("DistroName")
        if not self._distro_name:
            self._distro_name = "Debian"

        # initialize our on-disk metadata pool
        self._cache = DataCache(self._get_media_dir())
        ret = self._cache.open(cache_dir)

        os.chdir(dep11_dir)
        return ret

    def _get_media_dir(self):
        mdir = os.path.join(self._export_dir, "media")
        if not os.path.exists(mdir):
            os.makedirs(mdir)
        return mdir

    def _get_packages_for(self, suite, component, arch):
        return read_packages_dict_from_file(self._archive_root, suite, component, arch).values()

    def make_icon_tar(self, suitename, component, pkglist):
        ''' Generate icons-%(size).tar.gz '''
        dep11_mediadir = self._get_media_dir()
        names_seen = set()
        tar_location = os.path.join(self._export_dir, "data", suitename, component)

        size_tars = dict()

        for pkg in pkglist:
            pkid = get_pkg_id(pkg['name'], pkg['version'], pkg['arch'])

            gids = self._cache.get_cpt_gids_for_pkg(pkid)
            if not gids:
                # no component global-ids == no icons to add to the tarball
                continue

            for gid in gids:
                for size in self._icon_sizes:
                    icon_location_glob = os.path.join(dep11_mediadir, component, gid, "icons", size, "*.png")

                    tar = None
                    if size not in size_tars:
                        icon_tar_fname = os.path.join(tar_location, "icons-%s.tar.gz" % (size))
                        size_tars[size] = tarfile.open(icon_tar_fname+".new", "w:gz")
                    tar = size_tars[size]

                    for filename in glob.glob(icon_location_glob):
                        icon_name = os.path.basename(filename)
                        if size+"/"+icon_name in names_seen:
                            continue
                        tar.add(filename, arcname=icon_name)
                        names_seen.add(size+"/"+icon_name)

        for tar in size_tars.values():
            tar.close()
            # FIXME Ugly....
            safe_move_file(tar.name, tar.name.replace(".new", ""))

    def process_suite(self, suite_name):
        ''' Extract new metadata for a given suite. '''

        suite = self._suites_data.get(suite_name)
        if not suite:
            log.error("Suite '%s' not found!" % (suite_name))
            return False

        dep11_mediadir = self._get_media_dir()

        # We need 'forkserver' as startup method to prevent deadlocks on join()
        # Something in the extractor is doing weird things, makes joining impossible
        # when using simple fork as startup method.
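        # Note that multiprocessing.set_start_method() may only be called once
        # per interpreter; a second call raises RuntimeError.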
        mp.set_start_method('forkserver')

        for component in suite['components']:
            all_cpt_pkgs = list()
            for arch in suite['architectures']:
                pkglist = self._get_packages_for(suite_name, component, arch)

                # compile a list of packages that we need to look into
                pkgs_todo = dict()
                for pkg in pkglist:
                    pkid = get_pkg_id(pkg['name'], pkg['version'], pkg['arch'])

                    # check if we scanned the package already
                    if self._cache.package_exists(pkid):
                        continue
                    pkgs_todo[pkid] = pkg

                # set up metadata extractor
                iconf = ContentsListIconFinder(suite_name, component, arch, self._archive_root)
                mde = MetadataExtractor(suite_name, component, self._icon_sizes, self._cache, iconf)

                # Multiprocessing can't cope with LMDB open in the cache,
                # but instead of throwing an error or doing something else
                # that makes debugging easier, it just silently skips each
                # multiprocessing task. Stupid thing.
                # (remember to re-open the cache later)
                self._cache.close()

                # set up multiprocessing
                with mp.Pool(maxtasksperchild=16) as pool:
                    def handle_results(message):
                        log.info(message)

                    def handle_error(e):
                        traceback.print_exception(type(e), e, e.__traceback__)
                        log.error(str(e))
                        pool.terminate()
                        sys.exit(5)

                    log.info("Processing %i packages in %s/%s/%s" % (len(pkgs_todo), suite_name, component, arch))
                    for pkid, pkg in pkgs_todo.items():
                        package_fname = os.path.join(self._archive_root, pkg['filename'])
                        if not os.path.exists(package_fname):
                            log.warning('Package not found: %s' % (package_fname))
                            continue
                        pool.apply_async(extract_metadata,
                                         (mde, suite_name, pkg['name'], package_fname,
                                          pkg['version'], pkg['arch'], pkid),
                                         callback=handle_results, error_callback=handle_error)
                    pool.close()
                    pool.join()

                # reopen the cache, we need it
                self._cache.reopen()

                hints_dir = os.path.join(self._export_dir, "hints", suite_name, component)
                if not os.path.exists(hints_dir):
                    os.makedirs(hints_dir)
                dep11_dir = os.path.join(self._export_dir, "data", suite_name, component)
                if not os.path.exists(dep11_dir):
                    os.makedirs(dep11_dir)

                # now write data to disk
                hints_fname = os.path.join(hints_dir, "DEP11Hints_%s.yml.gz" % (arch))
                data_fname = os.path.join(dep11_dir, "Components-%s.yml.gz" % (arch))

                hints_f = gzip.open(hints_fname+".new", 'wb')
                data_f = gzip.open(data_fname+".new", 'wb')

                dep11_header = get_dep11_header(suite_name, component, os.path.join(self._dep11_url, component))
                data_f.write(bytes(dep11_header, 'utf-8'))

                for pkg in pkglist:
                    pkid = get_pkg_id(pkg['name'], pkg['version'], pkg['arch'])
                    data = self._cache.get_metadata_for_pkg(pkid)
                    if data:
                        data_f.write(bytes(data, 'utf-8'))
                    hint = self._cache.get_hints(pkid)
                    if hint:
                        hints_f.write(bytes(hint, 'utf-8'))

                data_f.close()
                safe_move_file(data_fname+".new", data_fname)

                hints_f.close()
                safe_move_file(hints_fname+".new", hints_fname)

                all_cpt_pkgs.extend(pkglist)

            # create icon tarball
            self.make_icon_tar(suite_name, component, all_cpt_pkgs)

            log.info("Completed metadata extraction for suite %s/%s" % (suite_name, component))

    def expire_cache(self):
        pkgids = set()
        for suite_name in self._suites_data:
            suite = self._suites_data[suite_name]
            for component in suite['components']:
                for arch in suite['architectures']:
                    pkglist = self._get_packages_for(suite_name, component, arch)
                    for pkg in pkglist:
                        pkid = get_pkg_id(pkg['name'], pkg['version'], pkg['arch'])
                        pkgids.add(pkid)

        # clean cache
        oldpkgs = self._cache.get_packages_not_in_set(pkgids)
        for pkid in oldpkgs:
            pkid = str(pkid, 'utf-8')
            self._cache.remove_package(pkid)
        # ensure we don't leave cruft
        self._cache.remove_orphaned_components()

    def remove_processed(self, suite_name):
        ''' Delete information about processed packages, to reprocess them later. '''

        suite = self._suites_data.get(suite_name)
        if not suite:
            log.error("Suite '%s' not found!" % (suite_name))
            return False

        for component in suite['components']:
            all_cpt_pkgs = list()
            for arch in suite['architectures']:
                pkglist = self._get_packages_for(suite_name, component, arch)
                for pkg in pkglist:
                    package_fname = os.path.join(self._archive_root, pkg['filename'])
                    pkid = get_pkg_id(pkg['name'], pkg['version'], pkg['arch'])

                    # we ignore packages without any interesting metadata here
                    if self._cache.is_ignored(pkid):
                        continue
                    self._cache.remove_package(pkid)

        # drop all components which don't have packages
        self._cache.remove_orphaned_components()
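
# For illustration only: a hypothetical configuration mapping of the shape that
# load_generator_config() is assumed to return for the classes in this file.
# The key names are exactly those looked up above and below (MediaBaseUrl,
# IconSizes, ArchiveRoot, CacheDir, ExportDir, DistroName, RepositoryName,
# Suites, baseSuite, dataPriority, useIconTheme, components, architectures);
# every value is an invented placeholder, not taken from any real setup.
_EXAMPLE_GENERATOR_CONF = {
    'MediaBaseUrl': 'https://metadata.example.org/media',  # -> self._dep11_url
    'IconSizes': ['128x128', '64x64'],                     # same as the built-in default
    'ArchiveRoot': '/srv/mirror/debian',                   # -> self._archive_root
    'CacheDir': '/srv/dep11/cache',                        # optional, defaults to <dep11_dir>/cache
    'ExportDir': '/srv/dep11/export',                      # optional, defaults to <dep11_dir>/export
    'DistroName': 'Debian',                                # optional, "Debian" is the fallback
    'RepositoryName': 'Debian',                            # only read by the extended class further below
    'Suites': {
        'stable': {
            'components': ['main', 'contrib'],
            'architectures': ['amd64', 'i386'],
            'dataPriority': 0,                             # only read by the extended class further below
            'useIconTheme': 'Adwaita',                     # optional, extended class only
        },
        'stable-backports': {
            'baseSuite': 'stable',                         # optional, extended class only
            'components': ['main'],
            'architectures': ['amd64'],
        },
    },
}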
class DEP11Generator:
    def __init__(self):
        pass

    def initialize(self, dep11_dir):
        dep11_dir = os.path.abspath(dep11_dir)

        conf = load_generator_config(dep11_dir)
        if not conf:
            return False

        self._all_pkgs = defaultdict(partial(defaultdict, partial(defaultdict, list)))

        self._dep11_url = conf.get("MediaBaseUrl")
        self._icon_sizes = conf.get("IconSizes")
        if not self._icon_sizes:
            self._icon_sizes = ["128x128", "64x64"]

        self._archive_root = conf.get("ArchiveRoot")

        cache_dir = os.path.join(dep11_dir, "cache")
        if conf.get("CacheDir"):
            cache_dir = conf.get("CacheDir")

        self._export_dir = os.path.join(dep11_dir, "export")
        if conf.get("ExportDir"):
            self._export_dir = conf.get("ExportDir")

        self._langpack_dir = os.path.join(dep11_dir, "langpacks")

        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        if not os.path.exists(self._export_dir):
            os.makedirs(self._export_dir)
        if not os.path.exists(self._langpack_dir):
            os.makedirs(self._langpack_dir)

        self._suites_data = conf['Suites']

        self._distro_name = conf.get("DistroName")
        if not self._distro_name:
            self._distro_name = "Debian"

        # the RepositoryName property is only interesting for
        # 3rd-party repositories using this generator, which don't want
        # to conflict with the main distro repository data.
        self._repo_name = conf.get("RepositoryName")
        if not self._repo_name:
            self._repo_name = self._distro_name

        # initialize our on-disk metadata pool
        self._cache = DataCache(self._get_media_dir())
        ret = self._cache.open(cache_dir)

        os.chdir(dep11_dir)
        return ret

    def _get_media_dir(self):
        mdir = os.path.join(self._export_dir, "media")
        if not os.path.exists(mdir):
            os.makedirs(mdir)
        return mdir

    def _get_packages_for(self, suite, component, arch, with_desc=True):
        return read_packages_dict_from_file(self._archive_root, suite, component, arch,
                                            with_description=with_desc).values()

    def make_icon_tar(self, suitename, component, pkglist):
        ''' Generate icons-%(size).tar.gz '''
        dep11_mediadir = self._get_media_dir()
        names_seen = set()
        tar_location = os.path.join(self._export_dir, "data", suitename, component)

        size_tars = dict()

        for pkg in pkglist:
            pkid = pkg.pkid

            gids = self._cache.get_cpt_gids_for_pkg(pkid)
            if not gids:
                # no component global-ids == no icons to add to the tarball
                continue

            for gid in gids:
                for size in self._icon_sizes:
                    icon_location_glob = os.path.join(dep11_mediadir, "*", gid, "icons", size, "*.png")

                    tar = None
                    if size not in size_tars:
                        icon_tar_fname = os.path.join(tar_location, "icons-%s.tar.gz" % (size))
                        size_tars[size] = tarfile.open(icon_tar_fname+".new", "w:gz")
                    tar = size_tars[size]

                    for filename in glob.glob(icon_location_glob):
                        icon_name = os.path.basename(filename)
                        if size+"/"+icon_name in names_seen:
                            continue
                        tar.add(filename, arcname=icon_name)
                        names_seen.add(size+"/"+icon_name)

        for tar in size_tars.values():
            tar.close()
            # FIXME Ugly....
            safe_move_file(tar.name, tar.name.replace(".new", ""))

    def process_suite(self, suite_name):
        ''' Extract new metadata for a given suite. '''

        suite = self._suites_data.get(suite_name)
        if not suite:
            log.error("Suite '%s' not found!" % (suite_name))
            return False

        base_suite_name = suite.get('baseSuite')
        base_suite = self._suites_data.get(base_suite_name) if base_suite_name else None

        # We need 'forkserver' as startup method to prevent deadlocks on join()
        # Something in the extractor is doing weird things, makes joining impossible
        # when using simple fork as startup method.
        mp.set_start_method('forkserver')

        for component in suite['components']:
            for arch in suite['architectures']:
                self._all_pkgs[suite_name][component][arch] = \
                    self._get_packages_for(suite_name, component, arch)

        if base_suite:
            for component in base_suite['components']:
                for arch in base_suite['architectures']:
                    self._all_pkgs[base_suite_name][component][arch] = \
                        self._get_packages_for(base_suite_name, component, arch)

        langpacks = None

        for component in suite['components']:
            all_cpt_pkgs = list()
            new_components = False
            for arch in suite['architectures']:
                pkglist = self._all_pkgs[suite_name][component][arch]
                suite_component_arch = "%s/%s/%s" % (suite_name, component, arch)

                dep11_dir = os.path.join(self._export_dir, "data", suite_name, component)
                data_fname = os.path.join(dep11_dir, "Components-%s.yml.gz" % (arch))

                last_seen_pkgs = set()
                try:
                    for y in yaml.load_all(gzip.open(data_fname, 'r')):
                        if 'Package' in y:
                            last_seen_pkgs.add(y['Package'])
                except FileNotFoundError:
                    pass

                # compile a list of packages that we need to look into
                pkgs_todo = dict()
                for pkg in pkglist:
                    pkid = pkg.pkid
                    last_seen_pkgs.discard(pkg.name)

                    # check if we scanned the package already
                    if self._cache.package_exists(pkid):
                        if not self._cache.package_in_suite(pkid, suite_component_arch) and not self._cache.is_ignored(pkid):
                            log.info("Seen %s before, but not in %s" % (pkid, suite_component_arch))
                            self._cache.add_package_to_suite(pkid, suite_component_arch)
                            new_components = True
                        continue
                    pkgs_todo[pkid] = pkg

                # some packages have been removed
                if last_seen_pkgs:
                    for pkg in last_seen_pkgs:
                        self._cache.remove_package_from_suite(pkid, suite_component_arch)
                    new_components = True

                dep11_header = get_dep11_header(self._repo_name, suite_name, component,
                                                os.path.join(self._dep11_url, component),
                                                suite.get('dataPriority', 0))

                if not os.path.exists(dep11_dir):
                    os.makedirs(dep11_dir)

                if not pkgs_todo and not new_components:
                    if not os.path.exists(data_fname):
                        log.info("No packages to process for %s, but %s doesn't exist, so writing with header only."
                                 % (suite_component_arch, data_fname))
                        data_f = gzip.open(data_fname, 'wb')
                        data_f.write(bytes(dep11_header, 'utf-8'))
                        data_f.close()
                    else:
                        log.info("Skipped %s, no new packages to process." % suite_component_arch)
                    continue

                if pkgs_todo:
                    # set up metadata extractor
                    icon_theme = suite.get('useIconTheme')
                    iconh = IconHandler(suite_name, component, arch, self._archive_root,
                                        icon_theme, base_suite_name=suite.get('baseSuite'))
                    iconh.set_wanted_icon_sizes(self._icon_sizes)
                    if not langpacks:
                        langpacks = UbuntuLangpackHandler(suite, suite_name, self._all_pkgs,
                                                          self._langpack_dir, self._cache)
                    mde = MetadataExtractor(suite_name, component, arch,
                                            self._cache, iconh, langpacks)

                    # Multiprocessing can't cope with LMDB open in the cache,
                    # but instead of throwing an error or doing something else
                    # that makes debugging easier, it just silently skips each
                    # multiprocessing task. Stupid thing.
                    # (remember to re-open the cache later)
                    self._cache.close()

                    # set up multiprocessing
                    with mp.Pool(maxtasksperchild=24) as pool:
                        count = 1

                        def handle_results(result):
                            nonlocal count
                            nonlocal new_components
                            (message, any_components) = result
                            new_components = new_components or any_components
                            log.info(message.format(count, len(pkgs_todo)))
                            count += 1

                        def handle_error(e):
                            traceback.print_exception(type(e), e, e.__traceback__)
                            log.error(str(e))
                            pool.terminate()
                            sys.exit(5)

                        log.info("Processing %i packages in %s" % (len(pkgs_todo), suite_component_arch))
                        for pkid, pkg in pkgs_todo.items():
                            package_fname = os.path.join(self._archive_root, pkg.filename)
                            if not os.path.exists(package_fname):
                                log.warning('Package not found: %s' % (package_fname))
                                continue
                            pkg.filename = package_fname
                            pool.apply_async(extract_metadata,
                                             (mde, suite_name, pkg),
                                             callback=handle_results, error_callback=handle_error)
                        pool.close()
                        pool.join()

                    # reopen the cache, we need it
                    self._cache.reopen()

                hints_dir = os.path.join(self._export_dir, "hints", suite_name, component)
                if not os.path.exists(hints_dir):
                    os.makedirs(hints_dir)

                hints_fname = os.path.join(hints_dir, "DEP11Hints_%s.yml.gz" % (arch))
                hints_f = gzip.open(hints_fname+".new", 'wb')

                if not new_components and os.path.exists(data_fname):
                    log.info("Skipping %s, no components in any of the new packages.", suite_component_arch)
                else:
                    # now write data to disk
                    data_f = gzip.open(data_fname+".new", 'wb')
                    data_f.write(bytes(dep11_header, 'utf-8'))

                for pkg in pkglist:
                    pkid = pkg.pkid
                    if new_components:
                        data = self._cache.get_metadata_for_pkg(pkid)
                        if data:
                            data_f.write(bytes(data, 'utf-8'))
                    hint = self._cache.get_hints(pkid)
                    if hint:
                        hints_f.write(bytes(hint, 'utf-8'))

                if new_components:
                    data_f.close()
                    safe_move_file(data_fname+".new", data_fname)

                hints_f.close()
                safe_move_file(hints_fname+".new", hints_fname)

                all_cpt_pkgs.extend(pkglist)

            # create icon tarball
            self.make_icon_tar(suite_name, component, all_cpt_pkgs)

            log.info("Completed metadata extraction for suite %s/%s" % (suite_name, component))

    def expire_cache(self):
        pkgids = set()
        for suite_name in self._suites_data:
            suite = self._suites_data[suite_name]
            for component in suite['components']:
                for arch in suite['architectures']:
                    pkglist = self._get_packages_for(suite_name, component, arch, with_desc=False)
                    for pkg in pkglist:
                        pkgids.add(pkg.pkid)

        # clean cache
        oldpkgs = self._cache.get_packages_not_in_set(pkgids)
        for pkid in oldpkgs:
            pkid = str(pkid, 'utf-8')
            self._cache.remove_package(pkid)

        # ensure we don't leave cruft, drop orphaned components (cpts w/o pkg)
        self._cache.remove_orphaned_components()
        # drop orphaned media (media w/o registered cpt)
        self._cache.remove_orphaned_media()

    def remove_processed(self, suite_name):
        ''' Delete information about processed packages, to reprocess them later. '''

        suite = self._suites_data.get(suite_name)
        if not suite:
            log.error("Suite '%s' not found!" % (suite_name))
            return False

        for component in suite['components']:
            for arch in suite['architectures']:
                pkglist = self._get_packages_for(suite_name, component, arch, with_desc=False)
                for pkg in pkglist:
                    pkid = pkg.pkid

                    # we ignore packages without any interesting metadata here
                    if self._cache.is_ignored(pkid):
                        continue
                    if not self._cache.package_exists(pkid):
                        continue

                    self._cache.remove_package(pkid)

        # drop all components which don't have packages
        self._cache.remove_orphaned_components()
        self._cache.remove_orphaned_media()

    def forget_package(self, pkid):
        ''' Delete all information about a package in the cache. '''
        if '/' in pkid:
            if not self._cache.package_exists(pkid):
                print("Package with ID '%s' does not exist." % (pkid))
                return
            self._cache.remove_package(pkid)
        else:
            log.info("Removing all packages with name {}".format(pkid))
            ret = self._cache.delete_package_by_name(pkid)
            if not ret:
                print("Unable to remove packages matching name '%s'." % (pkid))
                return

        # drop all components which don't have packages
        self._cache.remove_orphaned_components()

    def show_info(self, pkgname):
        ''' Show some details we know about a package. '''

        print("{}:".format(pkgname))
        for pkva, info in self._cache.get_info(pkgname):
            print(" {}".format(pkva))
            for e in info:
                print("  | -> {}".format(str(e)))

    def prepopulate_cache(self, suite_name):
        ''' Check which packages we can definitely ignore based on their
        contents in the Contents.gz file. This is useful when e.g.
        bootstrapping new suites / architectures. '''

        suite = self._suites_data.get(suite_name)
        if not suite:
            log.error("Suite '%s' not found!" % (suite_name))
            return False

        for component in suite['components']:
            for arch in suite['architectures']:
                pkid_filelist = dict()
                for fname, pkg in parse_contents_file(self._archive_root, suite_name, component, arch):
                    if not pkid_filelist.get(pkg.pkid):
                        pkid_filelist[pkg.pkid] = list()
                    pkid_filelist[pkg.pkid].append(fname)

                for pkid, filelist in pkid_filelist.items():
                    ignore = True
                    for f in filelist:
                        if 'usr/share/applications/' in f:
                            ignore = False
                            break
                        if 'usr/share/metainfo/' in f:
                            ignore = False
                            break
                        if 'usr/share/appdata/' in f:
                            ignore = False
                            break
                    if not ignore:
                        continue

                    if self._cache.is_ignored(pkid):
                        log.info("Package is already ignored: {}".format(pkid))
                    elif self._cache.package_exists(pkid):
                        log.warning("Tried to ignore package which actually exists and has data: {}".format(pkid))
                    else:
                        log.info("Ignoring package: {}".format(pkid))
                        self._cache.set_package_ignore(pkid)
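
# A minimal, hypothetical driver for the generator class above. The project's
# real command-line entry point is not part of this excerpt; this sketch only
# shows the call order the methods above are designed for.
def _example_run(dep11_dir, suite_name):
    gen = DEP11Generator()
    if not gen.initialize(dep11_dir):  # load config, create dirs, open the cache
        raise SystemExit(2)
    gen.process_suite(suite_name)      # extract metadata, write YAML and icon tarballs
    gen.expire_cache()                 # drop cached data for packages no longer in the archive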