Exemplo n.º 1
0
    def test_directory_repr(self):
        """repr() of a directory includes its hash and every entry name."""
        names = [b"a", b"b", b"c"]
        d = Directory()
        for name in names:
            d[name] = Directory()

        rendered = repr(d)
        # The directory's own hash is shown in hex form.
        self.assertIn(hash_to_hex(d.hash), rendered)
        # Every entry name appears in the rendering.
        for name in names:
            self.assertIn(str(name), rendered)
Exemplo n.º 2
0
    def test_directory_del_nested(self):
        """Deleting nested paths fails on the missing component, else works."""
        d = Directory()
        d[b"a"] = Directory()
        d[b"a/b"] = Directory()

        # Deleting below an existing leaf fails on the absent component...
        with self.assertRaisesRegex(KeyError, "b'c'"):
            del d[b"a/b/c"]

        # ...as does deleting through a nonexistent intermediate directory.
        with self.assertRaisesRegex(KeyError, "b'level2'"):
            del d[b"a/level2/c"]

        # Deleting a real nested entry leaves the parent empty.
        del d[b"a/b"]
        self.assertEqual(d[b"a"].get_data(), self.empty_directory)
Exemplo n.º 3
0
    def test_directory_contains(self):
        """Membership checks traverse slash-separated nested paths."""
        d = Directory()
        d[b"a"] = Directory()
        d[b"a/b"] = Directory()
        d[b"a/b/c"] = Directory()
        d[b"a/b/c/d"] = Content()

        # Every prefix of the created chain is contained...
        for present in (b"a", b"a/b", b"a/b/c", b"a/b/c/d"):
            self.assertIn(present, d)

        # ...while paths rooted at a missing top-level entry are not.
        for absent in (b"b", b"b/c", b"b/c/d"):
            self.assertNotIn(absent, d)
Exemplo n.º 4
0
    def test_directory_to_objects_ignore_name_case(self):
        """Name filtering is case-insensitive when case_sensitive=False."""
        directory = Directory.from_disk(
            path=self.tmpdir_name,
            dir_filter=from_disk.ignore_named_directories(
                [b"symLiNks"], case_sensitive=False
            ),
        )

        # Regular contents and specials are untouched by the filter.
        for name, value in self.contents.items():
            self.assertContentEqual(directory[b"contents/" + name], value)
        for name in self.specials:
            self.assertContentEqual(
                directory[b"specials/" + name],
                self.empty_content,
            )

        self.assertEqual(
            directory[b"empty1/empty2"].get_data(),
            self.empty_directory,
        )

        # The symlinks directory was dropped despite the case difference.
        with self.assertRaisesRegex(KeyError, "b'symlinks'"):
            directory[b"symlinks"]

        objs = directory.collect()
        self.assertCountEqual(["content", "directory"], objs)
        self.assertEqual(len(objs["directory"]), 5)
        self.assertEqual(len(objs["content"]), len(self.contents) + 1)
def jar_dirs(datadir, tmp_path):
    """Uncompress both sample sprova4j source jars and model them from disk.

    Returns a list [Directory for 0.1.0, Directory for 0.1.1].
    """
    modeled = []
    for index, version in enumerate(("0.1.0", "0.1.1"), start=1):
        jar_path = os.path.join(
            datadir, "https_maven.org", "sprova4j-%s-sources.jar" % version
        )
        extract_path = os.path.join(tmp_path, "jar_%d" % index)
        uncompress(jar_path, extract_path)
        modeled.append(Directory.from_disk(path=extract_path.encode()))

    return modeled
Exemplo n.º 6
0
    def fetch_data(self):
        """Retrieve, uncompress archive and fetch objects from the tarball.
           The actual ingestion takes place in the :meth:`store_data`
           implementation below.

        """
        url = self.get_tarball_url_to_retrieve()
        filepath, hashes = self.client.download(url)
        nature = tarball.uncompress(filepath, self.dir_path)

        # Model the uncompressed tree and gather its objects.
        directory = Directory.from_disk(
            path=self.dir_path.encode('utf-8'), save_path=True
        )
        objects = directory.collect()
        # Guarantee both base keys exist, even for a degenerate tree.
        objects.setdefault('content', {})
        objects.setdefault('directory', {})

        # compute the full revision (with ids)
        revision = self.build_revision(filepath, nature, hashes)
        revision = revision_from(directory.hash, revision)
        objects['revision'] = {revision['id']: revision}

        snapshot = self.build_snapshot(revision)
        objects['snapshot'] = {snapshot['id']: snapshot}

        self.objects = objects
Exemplo n.º 7
0
    def list_objs(self, *, dir_path, revision, release, branch_name):
        """List all objects from dir_path.

        Args:
            dir_path (str): the directory to list
            revision (dict): revision dictionary representation
            release (dict): release dictionary representation
            branch_name (str): branch name

        Returns:
            dict: a mapping from object types ('content', 'directory',
            'revision', 'release', 'snapshot') with a dictionary
            mapping each object's id to the object

        """
        log_id = str(uuid.uuid4())
        sdir_path = dir_path.decode('utf-8')

        log_data = {
            'swh_type': 'dir_list_objs_end',
            'swh_repo': sdir_path,
            'swh_id': log_id,
        }

        self.log.debug("Started listing {swh_repo}".format(**log_data),
                       extra=log_data)

        # Model the on-disk tree and collect its contents/directories.
        directory = Directory.from_disk(path=dir_path, save_path=True)
        objects = directory.collect()
        # Both base keys must be present even when the tree is empty.
        for key in ('content', 'directory'):
            objects.setdefault(key, {})

        # Attach the revision, completed with its computed id.
        full_rev = revision_from(directory.hash, revision)
        rev_id = full_rev['id']
        objects['revision'] = {rev_id: full_rev}

        # A release is only built when one was requested by name.
        objects['release'] = {}
        if release and 'name' in release:
            full_rel = release_from(rev_id, release)
            objects['release'][full_rel['id']] = full_rel

        snapshot = snapshot_from(rev_id, branch_name)
        objects['snapshot'] = {snapshot['id']: snapshot}

        # Record per-type object counts for the closing log line.
        for key, values in objects.items():
            log_data['swh_num_%s' % key] = len(values)

        self.log.debug(("Done listing the objects in {swh_repo}: "
                        "{swh_num_content} contents, "
                        "{swh_num_directory} directories, "
                        "{swh_num_revision} revisions, "
                        "{swh_num_release} releases, "
                        "{swh_num_snapshot} snapshot").format(**log_data),
                       extra=log_data)

        return objects
Exemplo n.º 8
0
    def test_directory_to_objects_ignore_empty(self):
        """Empty directories are pruned recursively by ignore_empty_directories."""
        directory = Directory.from_disk(
            path=self.tmpdir_name, dir_filter=from_disk.ignore_empty_directories
        )

        # Non-empty parts of the tree are kept intact.
        for name, value in self.contents.items():
            self.assertContentEqual(directory[b"contents/" + name], value)
        for name, value in self.symlinks.items():
            self.assertContentEqual(directory[b"symlinks/" + name], value)
        for name in self.specials:
            self.assertContentEqual(
                directory[b"specials/" + name],
                self.empty_content,
            )

        # empty directories have been ignored recursively: both the empty
        # root and anything beneath it are gone.
        for path in (b"empty1", b"empty1/empty2"):
            with self.assertRaisesRegex(KeyError, "b'empty1'"):
                directory[path]

        objs = directory.collect()
        self.assertCountEqual(["content", "directory"], objs)
        self.assertEqual(len(objs["directory"]), 4)
        self.assertEqual(
            len(objs["content"]), len(self.contents) + len(self.symlinks) + 1
        )
Exemplo n.º 9
0
def model_of_dir(path: bytes,
                 exclude_patterns: "Iterable[bytes] | None" = None) -> Directory:
    """Model ``path`` as a Directory, optionally pruning matching directories.

    Args:
        path: filesystem path of the directory to model
        exclude_patterns: directory-name patterns to skip; when falsy, every
            directory is accepted

    Returns:
        the Directory modeled from disk
    """
    # Defer the swh import so this helper stays cheap to import.
    from swh.model.from_disk import accept_all_directories, ignore_directories_patterns

    # A falsy pattern collection means "accept everything"; the annotation is
    # now explicitly optional (PEP 484 forbids the implicit-Optional
    # `Iterable[bytes] = None` the previous signature used).
    dir_filter = (ignore_directories_patterns(path, exclude_patterns)
                  if exclude_patterns else accept_all_directories)

    return Directory.from_disk(path=path, dir_filter=dir_filter)
Exemplo n.º 10
0
    def test_contents_match(self):
        """Every node modeled from the tarball matches its expected fixture."""
        directory = Directory.from_disk(
            path=os.path.join(self.tmpdir_name, b"sample-folder")
        )

        for name, expected in self.tarball_contents.items():
            node = directory[name]
            # Dispatch the comparison on the modeled node's type.
            if isinstance(node, Content):
                self.assertContentEqual(node, expected)
            elif isinstance(node, Directory):
                self.assertDirectoryEqual(node, expected)
            else:
                raise self.failureException("Unknown type for %s" % node)
Exemplo n.º 11
0
 def _get_directory_data(source_tree: Directory, nodes_data: MerkleNodeInfo,
                         directory_data: Dict):
     """Recursively fill ``directory_data`` with per-directory content counts.

     For each sub-directory of ``source_tree``, maps its path (made relative
     to ``root_path``) to the (total, known) content counts computed by
     ``directory_content``, then recurses into children that contain further
     directories.

     NOTE(review): ``root_path`` is not defined in this scope — presumably a
     module-level global or a missing parameter; verify at the call site.
     """
     # Keep only the directory-typed children of source_tree.
     directories = list(
         filter(
             lambda n: n.object_type == "directory",
             map(lambda n: n[1], source_tree.items()),
         ))
     for node in directories:
         directory_info = directory_content(node, nodes_data)
         rel_path = Path(node.data["path"].decode()).relative_to(
             Path(root_path))
         directory_data[rel_path] = directory_info
         # Recurse only when the child has nested directories of its own.
         if has_dirs(node):
             _get_directory_data(node, nodes_data, directory_data)
Exemplo n.º 12
0
    def test_directory_entry_order(self):
        """Entries are ordered with the directory "foo" between "foo." and
        "foo0" (the fixture names only differ in the byte after "foo")."""
        with tempfile.TemporaryDirectory() as dirname:
            dirname = os.fsencode(dirname)
            # Close the created files immediately instead of leaking the
            # descriptors until interpreter exit (the original left both
            # file objects open).
            open(os.path.join(dirname, b"foo."), "a").close()
            open(os.path.join(dirname, b"foo0"), "a").close()
            os.mkdir(os.path.join(dirname, b"foo"))

            directory = Directory.from_disk(path=dirname)

        assert [entry["name"] for entry in directory.entries] == [
            b"foo.",
            b"foo",
            b"foo0",
        ]
Exemplo n.º 13
0
    def test_iter_directory(self):
        """Iter from_disk.directory should yield the full arborescence tree"""
        directory = Directory.from_disk(
            path=os.path.join(self.tmpdir_name, b"sample-folder")
        )

        contents, skipped_contents, directories = from_disk.iter_directory(directory)

        # Tally the expected object count per type from the fixture paths.
        counts = defaultdict(int)
        for name in self.tarball_contents:
            counts[directory[name].object_type] += 1

        assert len(contents) == counts["content"] and len(contents) > 0
        assert len(skipped_contents) == 0
        assert len(directories) == counts["directory"] and len(directories) > 0
Exemplo n.º 14
0
def init_merkle_node_info(source_tree: Directory, data: MerkleNodeInfo,
                          info: set):
    """Populate the MerkleNodeInfo with the SWHIDs of the given source tree and the
       attributes that will be stored.

    Args:
        source_tree: root Directory whose whole tree is walked
        data: mapping from SWHID to the per-node attribute dict to populate
        info: names of the attributes to track for every node

    Raises:
        ValueError: if ``info`` is empty or contains an attribute name not in
            ``SUPPORTED_INFO``.
    """
    if not info:
        # ValueError (an API-misuse signal) instead of the bare Exception the
        # original raised; still caught by any `except Exception` caller.
        raise ValueError("Data initialization requires node attributes values.")
    nodes_info: Dict[str, Optional[str]] = {}
    for ainfo in info:
        if ainfo not in SUPPORTED_INFO:
            raise ValueError(f"Information {ainfo} is not supported.")
        nodes_info[ainfo] = None

    for node in source_tree.iter_tree():
        # Each node gets its own copy so attribute values can diverge per node.
        data[node.swhid()] = nodes_info.copy()  # type: ignore
Exemplo n.º 15
0
def directory_content(node: Directory,
                      nodes_data: MerkleNodeInfo) -> Tuple[int, int]:
    """Count known contents inside the given directory.

    Returns:
     A tuple with the total number of contents inside the directory and the number
     of known contents.
    """
    # Direct children of `node` that are contents (not sub-directories).
    contents = [
        child for _, child in node.items() if child.object_type == "content"
    ]
    # Of those, count the ones already marked known in nodes_data.
    known_cnt = sum(
        1 for child in contents if nodes_data[child.swhid()]["known"]
    )

    return (len(contents), known_cnt)
Exemplo n.º 16
0
def process_package(package):
    """Process a source package into its constituent components.

    The source package will be decompressed in a temporary directory.

    Args:
        package (dict): a dict with the following keys:

            - name: source package name
            - version: source package version
            - dsc: the full path of the package's DSC file.

    Returns:
        tuple: A tuple (directory, metadata, tempdir) with:

        - directory: the package tree modeled from disk
          (``Directory.from_disk`` with ``save_path=True``)
        - metadata: the metadata from get_package_metadata
        - tempdir: the temporary directory the package was downloaded and
          extracted into; the caller is responsible for cleaning it up

    Raises:
        FileNotFoundError: if the dsc file does not exist
        PackageExtractionFailed: if package extraction failed

    """
    log.info("Processing package %s_%s" %
             (package['name'], str(package['version'])),
             extra={
                 'swh_type': 'deb_process_start',
                 'swh_name': package['name'],
                 'swh_version': str(package['version']),
             })

    tempdir = download_package(package)
    dsc, debdir = extract_package(package, tempdir)

    directory = Directory.from_disk(path=os.fsencode(debdir), save_path=True)
    metadata = get_package_metadata(package, dsc, debdir)

    return directory, metadata, tempdir
Exemplo n.º 17
0
    def swh_hash_data_at_revision(
            self, revision: int) -> Tuple[Dict, DirectoryFromDisk]:
        """Compute the information at a given svn revision. This is expected to be used
        for checks only.

        Returns:
            The tuple (commit dictionary, targeted directory object).

        """
        # Update disk representation of the repository at revision id
        local_dirname, local_url = self.export_temporary(revision)
        # Compute the current hashes on disk
        directory = DirectoryFromDisk.from_disk(
            path=local_url, max_content_length=self.max_content_length)

        # Retrieve the commit information for revision
        commit = self.commit_info(revision)

        # Clean export directory
        self.clean_fs(local_dirname)

        return commit, directory
Exemplo n.º 18
0
    def test_directory_to_objects(self):
        """An unfiltered from_disk model exposes every fixture entry."""
        directory = Directory.from_disk(path=self.tmpdir_name)

        # Contents, symlinks and specials are all reachable by path.
        for name, value in self.contents.items():
            self.assertContentEqual(directory[b"contents/" + name], value)
        for name, value in self.symlinks.items():
            self.assertContentEqual(directory[b"symlinks/" + name], value)
        for name in self.specials:
            self.assertContentEqual(
                directory[b"specials/" + name],
                self.empty_content,
            )

        self.assertEqual(
            directory[b"empty1/empty2"].get_data(),
            self.empty_directory,
        )

        # Raise on non existent file / non existent directory components.
        missing = [
            (b"empty1/nonexistent", "b'nonexistent'"),
            (b"nonexistentdir/file", "b'nonexistentdir'"),
        ]
        for path, pattern in missing:
            with self.assertRaisesRegex(KeyError, pattern):
                directory[path]

        objs = directory.collect()
        self.assertCountEqual(["content", "directory"], objs)
        self.assertEqual(len(objs["directory"]), 6)
        self.assertEqual(
            len(objs["content"]), len(self.contents) + len(self.symlinks) + 1
        )
Exemplo n.º 19
0
 def test_directory_del_wrong_type(self):
     """del with a non-bytes key is rejected with ValueError."""
     d = Directory()
     for bad_key in ("foo", 42):
         with self.assertRaisesRegex(ValueError, "bytes Directory entry"):
             del d[bad_key]
Exemplo n.º 20
0
 def test_directory_set_wrong_type(self):
     """Only Content or Directory values may be stored as entries."""
     d = Directory()
     # An arbitrary object is rejected with ValueError.
     with self.assertRaisesRegex(ValueError, "Content or Directory"):
         d[b"entry"] = object()
Exemplo n.º 21
0
 def test_directory_set_empty_name(self):
     """Entries cannot be created under an empty (or slash-only) name."""
     d = Directory()
     for empty_name in (b"", b"/"):
         with self.assertRaisesRegex(ValueError, "must have a name"):
             d[empty_name] = Directory()
Exemplo n.º 22
0
    def test_directory_set_nul_in_name(self):
        """Entry names containing nul bytes are rejected."""
        d = Directory()
        with self.assertRaisesRegex(ValueError, "nul bytes"):
            d[b"\x00\x01"] = Directory()
Exemplo n.º 23
0
 def test_directory_set_wrong_type_name(self):
     """Assigning under a non-bytes name is rejected with ValueError."""
     d = Directory()
     for bad_name in ("foo", 42):
         with self.assertRaisesRegex(ValueError, "bytes Directory entry"):
             d[bad_name] = Directory()
Exemplo n.º 24
0
def has_dirs(node: Directory) -> bool:
    """Check if the given directory has other directories inside."""
    # Short-circuits on the first directory-typed child.
    return any(
        isinstance(child, Directory) for _, child in node.items()
    )
Exemplo n.º 25
0
 def test_directory_swhid(self):
     """The SWHID of the modeled directory matches the known empty-dir id."""
     expected = "swh:1:dir:" + hash_to_hex(self.empty_directory["id"])
     directory = Directory.from_disk(path=self.tmpdir_name)
     assert str(directory.swhid()) == expected
Exemplo n.º 26
0
 def test_directory_access_self(self):
     """Empty and slash-only paths resolve to the directory itself."""
     d = Directory()
     for path in (b"", b"/", b"//"):
         self.assertIs(d, d[path])
Exemplo n.º 27
0
    def test_directory_access_nested(self):
        """A nested empty directory is reachable via its slash-joined path."""
        d = Directory()
        d[b"a"] = Directory()
        d[b"a/b"] = Directory()

        # The leaf is empty, so its data equals the empty-directory fixture.
        self.assertEqual(d[b"a/b"].get_data(), self.empty_directory)
                        out.append(spindent + "}]")
                elif key in ALGORITHMS | {"id", "target"}:
                    format_hash(value, indent=indent)
                elif isinstance(value, DentryPerms):
                    out.append(str(value))
                else:
                    out.append(repr(value))
            out.append(",\n")

    spindent = " " * indent
    out.append(spindent + "%s = {\n" % varname)
    format_dict_items(data, indent=4 + indent)
    out.append(spindent + "}")

    return "".join(out)


if __name__ == "__main__":
    if not sys.argv[1:]:
        print("Usage: %s dir1 dir2" % sys.argv[0], file=sys.stderr)
        # sys.exit instead of the site-provided exit() builtin: always
        # available, including under `python -S` or frozen interpreters.
        sys.exit(2)

    # For each directory argument, print a generated "expected_<basename>"
    # test-fixture variable modeling the directory from disk.
    for dirname in sys.argv[1:]:
        basename = os.path.basename(dirname)
        varname = "expected_%s" % basename
        testdata = generate_from_directory(
            varname, Directory.from_disk(path=os.fsencode(dirname)), indent=8
        )
        print(testdata)
        print()
Exemplo n.º 29
0
 def test_directory_access_wrong_type(self):
     """Lookup with a non-bytes key is rejected with ValueError."""
     d = Directory()
     for bad_key in ("foo", 42):
         with self.assertRaisesRegex(ValueError, "bytes from Directory"):
             d[bad_key]