def test_directory_to_objects_ignore_name_case(self):
    directory = Directory.from_disk(
        path=self.tmpdir_name,
        dir_filter=from_disk.ignore_named_directories(
            [b"symLiNks"], case_sensitive=False
        ),
    )

    for name, value in self.contents.items():
        self.assertContentEqual(directory[b"contents/" + name], value)

    for name in self.specials:
        self.assertContentEqual(
            directory[b"specials/" + name],
            self.empty_content,
        )

    self.assertEqual(
        directory[b"empty1/empty2"].get_data(),
        self.empty_directory,
    )

    # the symlinks directory was filtered out despite the case mismatch
    with self.assertRaisesRegex(KeyError, "b'symlinks'"):
        directory[b"symlinks"]

    objs = directory.collect()

    self.assertCountEqual(["content", "directory"], objs)

    self.assertEqual(len(objs["directory"]), 5)
    self.assertEqual(len(objs["content"]), len(self.contents) + 1)
def jar_dirs(datadir, tmp_path):
    """Build from_disk.Directory models from two extracted source JARs."""
    jar_1_path = os.path.join(datadir, "https_maven.org", "sprova4j-0.1.0-sources.jar")
    jar_2_path = os.path.join(datadir, "https_maven.org", "sprova4j-0.1.1-sources.jar")
    jar_1_extract_path = os.path.join(tmp_path, "jar_1")
    jar_2_extract_path = os.path.join(tmp_path, "jar_2")

    uncompress(jar_1_path, jar_1_extract_path)
    uncompress(jar_2_path, jar_2_extract_path)

    jar_1_dir = Directory.from_disk(path=jar_1_extract_path.encode())
    jar_2_dir = Directory.from_disk(path=jar_2_extract_path.encode())

    return [jar_1_dir, jar_2_dir]
def test_directory_to_objects_ignore_empty(self):
    directory = Directory.from_disk(
        path=self.tmpdir_name, dir_filter=from_disk.ignore_empty_directories
    )

    for name, value in self.contents.items():
        self.assertContentEqual(directory[b"contents/" + name], value)

    for name, value in self.symlinks.items():
        self.assertContentEqual(directory[b"symlinks/" + name], value)

    for name in self.specials:
        self.assertContentEqual(
            directory[b"specials/" + name],
            self.empty_content,
        )

    # empty directories have been ignored recursively
    with self.assertRaisesRegex(KeyError, "b'empty1'"):
        directory[b"empty1"]
    with self.assertRaisesRegex(KeyError, "b'empty1'"):
        directory[b"empty1/empty2"]

    objs = directory.collect()

    self.assertCountEqual(["content", "directory"], objs)

    self.assertEqual(len(objs["directory"]), 4)
    self.assertEqual(
        len(objs["content"]), len(self.contents) + len(self.symlinks) + 1
    )
def fetch_data(self):
    """Retrieve and uncompress the archive, then fetch objects from the
    tarball. The actual ingestion takes place in the :meth:`store_data`
    implementation below.

    """
    url = self.get_tarball_url_to_retrieve()
    filepath, hashes = self.client.download(url)
    nature = tarball.uncompress(filepath, self.dir_path)

    dir_path = self.dir_path.encode('utf-8')
    directory = Directory.from_disk(path=dir_path, save_path=True)
    objects = directory.collect()
    if 'content' not in objects:
        objects['content'] = {}
    if 'directory' not in objects:
        objects['directory'] = {}

    # compute the full revision (with ids)
    revision = self.build_revision(filepath, nature, hashes)
    revision = revision_from(directory.hash, revision)
    objects['revision'] = {
        revision['id']: revision,
    }

    snapshot = self.build_snapshot(revision)
    objects['snapshot'] = {
        snapshot['id']: snapshot
    }

    self.objects = objects
def list_objs(self, *, dir_path, revision, release, branch_name):
    """List all objects from dir_path.

    Args:
        dir_path (bytes): the directory to list
        revision (dict): revision dictionary representation
        release (dict): release dictionary representation
        branch_name (str): branch name

    Returns:
        dict: a mapping from object types ('content', 'directory',
        'revision', 'release', 'snapshot') to a dictionary mapping each
        object's id to the object

    """
    log_id = str(uuid.uuid4())
    sdir_path = dir_path.decode('utf-8')
    log_data = {
        'swh_type': 'dir_list_objs_end',
        'swh_repo': sdir_path,
        'swh_id': log_id,
    }
    self.log.debug("Started listing {swh_repo}".format(**log_data),
                   extra=log_data)

    directory = Directory.from_disk(path=dir_path, save_path=True)
    objects = directory.collect()
    if 'content' not in objects:
        objects['content'] = {}
    if 'directory' not in objects:
        objects['directory'] = {}

    full_rev = revision_from(directory.hash, revision)
    rev_id = full_rev['id']
    objects['revision'] = {rev_id: full_rev}

    objects['release'] = {}
    if release and 'name' in release:
        full_rel = release_from(rev_id, release)
        objects['release'][full_rel['id']] = full_rel

    snapshot = snapshot_from(rev_id, branch_name)
    objects['snapshot'] = {snapshot['id']: snapshot}

    log_data.update({
        'swh_num_%s' % key: len(values)
        for key, values in objects.items()
    })
    self.log.debug(("Done listing the objects in {swh_repo}: "
                    "{swh_num_content} contents, "
                    "{swh_num_directory} directories, "
                    "{swh_num_revision} revisions, "
                    "{swh_num_release} releases, "
                    "{swh_num_snapshot} snapshot").format(**log_data),
                   extra=log_data)

    return objects
def model_of_dir(
    path: bytes, exclude_patterns: Optional[Iterable[bytes]] = None
) -> Directory:
    from swh.model.from_disk import accept_all_directories, ignore_directories_patterns

    dir_filter = (
        ignore_directories_patterns(path, exclude_patterns)
        if exclude_patterns
        else accept_all_directories
    )

    return Directory.from_disk(path=path, dir_filter=dir_filter)
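# Hedged usage sketch (not part of the original sources): build the model of a
# small throwaway tree while excluding a ".git"-like subdirectory. The tree
# layout is made up for illustration, and the assumption is that
# exclude_patterns takes glob-style byte patterns relative to the root, as
# accepted by ignore_directories_patterns.
import os
import tempfile

with tempfile.TemporaryDirectory() as d:
    os.mkdir(os.path.join(d, ".git"))
    with open(os.path.join(d, "README"), "w") as f:
        f.write("hello\n")

    model = model_of_dir(d.encode(), exclude_patterns=[b".git"])
    print(sorted(model.keys()))  # expected: [b'README'], b'.git' filtered out
    print(model.swhid())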
def test_contents_match(self):
    directory = Directory.from_disk(
        path=os.path.join(self.tmpdir_name, b"sample-folder")
    )

    for name, expected in self.tarball_contents.items():
        obj = directory[name]
        if isinstance(obj, Content):
            self.assertContentEqual(obj, expected)
        elif isinstance(obj, Directory):
            self.assertDirectoryEqual(obj, expected)
        else:
            raise self.failureException("Unknown type for %s" % obj)
def test_directory_entry_order(self):
    with tempfile.TemporaryDirectory() as dirname:
        dirname = os.fsencode(dirname)
        open(os.path.join(dirname, b"foo."), "a")
        open(os.path.join(dirname, b"foo0"), "a")
        os.mkdir(os.path.join(dirname, b"foo"))

        directory = Directory.from_disk(path=dirname)

        assert [entry["name"] for entry in directory.entries] == [
            b"foo.",
            b"foo",
            b"foo0",
        ]
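# Why the expected order above is [b"foo.", b"foo", b"foo0"]: like git,
# swh.model sorts directory entries as if directory names carried a trailing
# slash, so b"foo." (0x2e) < b"foo/" (0x2f) < b"foo0" (0x30). This standalone
# sketch is an illustration written for this document, not the library's
# actual sorting code.
def entry_sort_key(name: bytes, is_dir: bool) -> bytes:
    return name + b"/" if is_dir else name

entries = [(b"foo.", False), (b"foo0", False), (b"foo", True)]
assert [n for n, _ in sorted(entries, key=lambda e: entry_sort_key(*e))] == [
    b"foo.",
    b"foo",
    b"foo0",
]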
def test_iter_directory(self):
    """Iterating over a from_disk.Directory should yield its full tree of
    objects"""
    directory = Directory.from_disk(
        path=os.path.join(self.tmpdir_name, b"sample-folder")
    )
    contents, skipped_contents, directories = from_disk.iter_directory(directory)

    expected_nb = defaultdict(int)
    for name in self.tarball_contents.keys():
        obj = directory[name]
        expected_nb[obj.object_type] += 1

    assert len(contents) == expected_nb["content"] and len(contents) > 0
    assert len(skipped_contents) == 0
    assert len(directories) == expected_nb["directory"] and len(directories) > 0
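# Hedged standalone sketch of the same API outside a test (the path below is a
# hypothetical example value): from_disk.iter_directory flattens a
# from_disk.Directory into the three object lists a loader would send to
# storage.
from swh.model import from_disk

tree = from_disk.Directory.from_disk(path=b"/srv/src/project")
contents, skipped_contents, directories = from_disk.iter_directory(tree)
print(len(contents), len(skipped_contents), len(directories))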
def process_package(package):
    """Process a source package into its constituent components.

    The source package will be decompressed in a temporary directory.

    Args:
        package (dict): a dict with the following keys:

            - name: source package name
            - version: source package version
            - dsc: the full path of the package's DSC file.

    Returns:
        tuple: A tuple with three elements:

        - directory: the from_disk.Directory model of the package's
          root directory
        - metadata: the metadata from get_package_metadata
        - tempdir: the temporary directory the package was decompressed in

    Raises:
        FileNotFoundError: if the dsc file does not exist
        PackageExtractionFailed: if package extraction failed

    """
    log.info("Processing package %s_%s" %
             (package['name'], str(package['version'])),
             extra={
                 'swh_type': 'deb_process_start',
                 'swh_name': package['name'],
                 'swh_version': str(package['version']),
             })

    tempdir = download_package(package)
    dsc, debdir = extract_package(package, tempdir)

    directory = Directory.from_disk(path=os.fsencode(debdir), save_path=True)
    metadata = get_package_metadata(package, dsc, debdir)

    return directory, metadata, tempdir
def swh_hash_data_at_revision(
        self, revision: int) -> Tuple[Dict, DirectoryFromDisk]:
    """Compute the information at a given svn revision. This is expected to
    be used for checks only.

    Returns:
        The tuple (commit dictionary, targeted directory object).

    """
    # Update disk representation of the repository at revision id
    local_dirname, local_url = self.export_temporary(revision)

    # Compute the current hashes on disk
    directory = DirectoryFromDisk.from_disk(
        path=local_url, max_content_length=self.max_content_length)

    # Retrieve the commit information for revision
    commit = self.commit_info(revision)

    # Clean export directory
    self.clean_fs(local_dirname)

    return commit, directory
def test_directory_to_objects(self):
    directory = Directory.from_disk(path=self.tmpdir_name)

    for name, value in self.contents.items():
        self.assertContentEqual(directory[b"contents/" + name], value)

    for name, value in self.symlinks.items():
        self.assertContentEqual(directory[b"symlinks/" + name], value)

    for name in self.specials:
        self.assertContentEqual(
            directory[b"specials/" + name],
            self.empty_content,
        )

    self.assertEqual(
        directory[b"empty1/empty2"].get_data(),
        self.empty_directory,
    )

    # Raise on non-existent file
    with self.assertRaisesRegex(KeyError, "b'nonexistent'"):
        directory[b"empty1/nonexistent"]

    # Raise on non-existent directory
    with self.assertRaisesRegex(KeyError, "b'nonexistentdir'"):
        directory[b"nonexistentdir/file"]

    objs = directory.collect()

    self.assertCountEqual(["content", "directory"], objs)

    self.assertEqual(len(objs["directory"]), 6)
    self.assertEqual(
        len(objs["content"]), len(self.contents) + len(self.symlinks) + 1
    )
def test_directory_swhid(self):
    directory_swhid = "swh:1:dir:" + hash_to_hex(self.empty_directory["id"])
    directory = Directory.from_disk(path=self.tmpdir_name)
    assert str(directory.swhid()) == directory_swhid
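# Hedged companion sketch (not from the test suite): the SWHID of an empty
# directory is deterministic and matches git's well-known empty-tree hash,
# which is the value self.empty_directory["id"] holds above.
import tempfile

from swh.model.from_disk import Directory

with tempfile.TemporaryDirectory() as d:
    empty = Directory.from_disk(path=d.encode())
    print(empty.swhid())  # swh:1:dir:4b825dc642cb6eb9a060e54bf8d69288fbee4904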
                        out.append(spindent + "}]")
                elif key in ALGORITHMS | {"id", "target"}:
                    format_hash(value, indent=indent)
                elif isinstance(value, DentryPerms):
                    out.append(str(value))
                else:
                    out.append(repr(value))
            out.append(",\n")

    spindent = " " * indent
    out.append(spindent + "%s = {\n" % varname)
    format_dict_items(data, indent=4 + indent)
    out.append(spindent + "}")

    return "".join(out)


if __name__ == "__main__":
    if not sys.argv[1:]:
        print("Usage: %s dir1 dir2" % sys.argv[0], file=sys.stderr)
        exit(2)

    for dirname in sys.argv[1:]:
        basename = os.path.basename(dirname)
        varname = "expected_%s" % basename
        testdata = generate_from_directory(
            varname, Directory.from_disk(path=os.fsencode(dirname)), indent=8
        )
        print(testdata)
        print()