def scan_packages(repository, packages_file=None, cache=None):
    """
    A reimplementation of the ``dpkg-scanpackages -m`` command in Python.

    Updates a ``Packages`` file based on the Debian package archive(s) found
    in the given directory. Uses :py:class:`.PackageCache` to (optionally)
    speed up the process significantly by caching package metadata and hashes
    on disk. This explains why this function can be much faster than
    ``dpkg-scanpackages -m``.

    :param repository: The pathname of a directory containing Debian
                       package archives (a string).
    :param packages_file: The pathname of the ``Packages`` file to update
                          (a string). Defaults to the ``Packages`` file in
                          the given directory.
    :param cache: The :py:class:`.PackageCache` to use (defaults to ``None``).
    """
    # By default the `Packages' file inside the repository is updated.
    if not packages_file:
        packages_file = os.path.join(repository, 'Packages')
    # Update the `Packages' file.
    timer = Timer()
    package_archives = glob.glob(os.path.join(repository, '*.deb'))
    num_packages = len(package_archives)
    spinner = Spinner(total=num_packages)
    with open(packages_file, 'wb') as handle:
        for i, archive in enumerate(optimize_order(package_archives), start=1):
            fields = dict(inspect_package_fields(archive, cache=cache))
            fields.update(get_packages_entry(archive, cache=cache))
            deb822_dict = unparse_control_fields(fields)
            deb822_dict.dump(handle)
            handle.write(b'\n')
            spinner.step(label="Scanning package metadata", progress=i)
    spinner.clear()
    logger.debug("Wrote %i entries to output Packages file in %s.",
                 num_packages, timer)
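
# Hedged usage sketch (not part of the public API): one way to wire
# scan_packages() into a small publishing step. The repository path is
# hypothetical, and compressing the result with the standard library is an
# assumption about how the output will be served (apt clients typically
# fetch ``Packages.gz``).
def _demo_scan_packages(repository='/srv/apt/incoming'):
    """Illustrative only: regenerate and compress the ``Packages`` file."""
    import gzip
    import shutil
    scan_packages(repository)
    packages_file = os.path.join(repository, 'Packages')
    # Write Packages.gz next to the uncompressed file.
    with open(packages_file, 'rb') as source:
        with gzip.open(packages_file + '.gz', 'wb') as target:
            shutil.copyfileobj(source, target)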
def check_duplicate_files(dependency_set, cache=None):
    """
    Check a collection of Debian package archives for conflicts.

    Looks for duplicate files in unrelated package archives. Ignores groups
    of packages that have their 'Provides' and 'Conflicts' fields set to a
    common value. Other variants of 'Conflicts' are not supported yet.

    Because this analysis involves both the package control file fields and
    the pathnames of files installed by packages it can be slow. To make it
    faster you can use the :py:class:`.PackageCache`.

    :param dependency_set: A list of filenames (strings) of ``*.deb`` files.
    :param cache: The :py:class:`.PackageCache` to use (defaults to ``None``).
    :raises: :py:class:`exceptions.ValueError` when fewer than two package
             archives are given (the duplicate check obviously only works if
             there are packages to compare :-).
    :raises: :py:class:`DuplicateFilesFound` when duplicate files are found
             within a group of package archives.
    """
    timer = Timer()
    dependency_set = list(map(parse_filename, dependency_set))
    # Make sure we have something useful to work with.
    num_archives = len(dependency_set)
    if num_archives < 2:
        msg = "To check for duplicate files you need to provide two or more package archives! (%i given)"
        raise ValueError(msg % num_archives)
    # Build up a global map of all files contained in the given package archives.
    global_contents = collections.defaultdict(set)
    global_fields = {}
    spinner = Spinner(total=num_archives)
    logger.info("Checking for duplicate files in %i package archives ..", num_archives)
    for i, archive in enumerate(optimize_order(dependency_set), start=1):
        spinner.step(label="Scanning %i package archives" % num_archives, progress=i)
        fields, contents = inspect_package(archive.filename, cache=cache)
        global_fields[archive.filename] = fields
        for pathname, stat in contents.items():
            # Directories can legitimately be shared between packages,
            # so only non-directory entries are indexed.
            if not stat.permissions.startswith('d'):
                global_contents[pathname].add(archive)
    spinner.clear()
    # Count the number of duplicate files between sets of conflicting packages
    # for more user friendly reporting.
    duplicate_files = collections.defaultdict(lambda: dict(count=0, filenames=[]))
    for pathname, packages in global_contents.items():
        if len(packages) > 1:
            # Override the sort key to be the filename because we don't need
            # to properly sort by version (which is slow on large collections).
            key = tuple(sorted(packages, key=lambda p: p.filename))
            duplicate_files[key]['count'] += 1
            duplicate_files[key]['filenames'].append(pathname)
    for packages, information in sorted(duplicate_files.items()):
        # Never report multiple versions of the same package.
        if len(set(package.name for package in packages)) == 1:
            duplicate_files.pop(packages)
            continue

        # We check for one common case where it's easy to guarantee that
        # we're not dealing with broken packages: All of the packages have
        # marked each other as conflicting via the combination of the
        # fields `Provides:' and `Conflicts:'.
        def find_virtual_name(field_name):
            package_names = set()
            for archive in packages:
                field = global_fields[archive.filename].get(field_name)
                if field:
                    package_names |= field.names
                else:
                    return
            if len(package_names) == 1:
                return list(package_names)[0]

        marked_conflicts = find_virtual_name('Conflicts')
        marked_provides = find_virtual_name('Provides')
        if marked_conflicts and marked_conflicts == marked_provides:
            duplicate_files.pop(packages)
    # Boring string formatting, trying to find a way to clearly present conflicts.
    summary = []
    for packages, information in sorted(duplicate_files.items()):
        block = []
        conflicts = pluralize(information['count'], 'conflict', 'conflicts')
        block.append("Found %s between %i packages:\n" % (conflicts, len(packages)))
        for i, package in enumerate(sorted(packages), start=1):
            block.append(" %i. %s\n" % (i, package.filename))
        block.append("These packages contain %s:\n" % conflicts)
        for i, filename in enumerate(sorted(information['filenames']), start=1):
            block.append(" %i. %s\n" % (i, filename))
        summary.append(''.join(block))
    if summary:
        archives_involved = set(itertools.chain.from_iterable(duplicate_files.keys()))
        files = pluralize(len(duplicate_files), 'duplicate file', 'duplicate files')
        archives = pluralize(len(archives_involved), 'package archive', 'package archives')
        summary.insert(0, "Found %s in %s!\n" % (files, archives))
        summary.append(compact("""
            Hint: If the package contents are correct you can resolve these
            conflicts by marking the packages as conflicting. You do this by
            adding the 'Conflicts' and 'Provides' fields and setting them to
            a common value. That should silence this message.
        """))
        delimiter = '%s\n' % ('-' * 79)
        raise DuplicateFilesFound(delimiter.join(summary))
    else:
        logger.info("No conflicting files found (took %s).", timer)
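
# How the duplicate detection above works, as a self-contained sketch: build
# an inverted index from pathname to the set of archives that ship it, then
# keep only the pathnames claimed by more than one archive. The helper name
# and the toy data below are made up for illustration.
def _find_shared_paths(contents_by_archive):
    """Illustrative only: map pathnames to the archives containing them."""
    index = {}
    for archive, pathnames in contents_by_archive.items():
        for pathname in pathnames:
            index.setdefault(pathname, set()).add(archive)
    # Only pathnames owned by two or more archives are potential conflicts.
    return {p: owners for p, owners in index.items() if len(owners) > 1}

# For example the (fabricated) input
#   {'a_1.0_all.deb': {'/usr/bin/tool', '/usr/share/doc/a/README'},
#    'b_1.0_all.deb': {'/usr/bin/tool'}}
# yields {'/usr/bin/tool': {'a_1.0_all.deb', 'b_1.0_all.deb'}}.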
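
# Hedged usage sketch (not part of the public API): run the duplicate check
# over every archive in a directory and log the report instead of letting
# the exception propagate. The repository path is hypothetical.
def _demo_check_duplicate_files(repository='/srv/apt/incoming'):
    """Illustrative only: report file conflicts between archives."""
    archives = glob.glob(os.path.join(repository, '*.deb'))
    try:
        check_duplicate_files(archives)
    except ValueError:
        # Fewer than two archives: nothing to compare.
        logger.info("Not enough archives to compare (%i found).", len(archives))
    except DuplicateFilesFound as report:
        logger.error("Conflicting files detected:\n%s", report)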