Example #1
def _check_debian_loading(swh_storage, packages):
    loader = DebianLoader(
        swh_storage,
        URL,
        packages=packages,
    )

    actual_load_status = loader.load()
    expected_snapshot_id = "474c0e3d5796d15363031c333533527d659c559e"
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id,
    }

    assert_last_visit_matches(
        swh_storage,
        URL,
        status="full",
        type="deb",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    expected_snapshot = Snapshot(
        id=hash_to_bytes(expected_snapshot_id),
        branches={
            b"releases/stretch/contrib/0.7.2-3":
            SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes(
                    "de96ae3d3e136f5c1709117059e2a2c05b8ee5ae"),
            ),
            b"releases/buster/contrib/0.7.2-4":
            SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes(
                    "11824484c585319302ea4fde4917faf78dfb1973"),
            ),
        },
    )

    check_snapshot(expected_snapshot, swh_storage)
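
A detail worth noting about examples like this one: the hard-coded expected_snapshot_id is intrinsic, i.e. derived from the snapshot's branches, so the equality check only passes when the loader produced exactly the branches listed in expected_snapshot. Below is a minimal sketch of that relationship, assuming the swh.model APIs already used above (the one-branch snapshot here yields its own id, not the one asserted in the test):

from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.model.model import Snapshot, SnapshotBranch, TargetType

# A snapshot's id is computed from its branches when not given explicitly,
# so any change in a branch name or target changes the snapshot id.
snapshot = Snapshot(
    branches={
        b"releases/stretch/contrib/0.7.2-3": SnapshotBranch(
            target_type=TargetType.RELEASE,
            target=hash_to_bytes("de96ae3d3e136f5c1709117059e2a2c05b8ee5ae"),
        ),
    },
)
print(hash_to_hex(snapshot.id))  # hex form, comparable to a hard-coded expected id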
Example #2
def test_npm_artifact_with_no_upload_time(swh_storage, requests_mock_datadir):
    """With no time upload, artifact is skipped"""
    package = "jammit-no-time"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    # no branch as the only artifact has no upload time
    expected_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    assert actual_load_status == {
        "status": "uneventful",
        "snapshot_id": expected_snapshot.id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="partial", type="npm", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)
Example #3
def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir):
    """Skip artifact with no intrinsic metadata during ingestion"""
    package = "nativescript-telerik-analytics"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    # no branch as one artifact without any intrinsic metadata
    expected_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot.id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)
Example #4
def test_origin_browse_directory_branch_with_non_resolvable_revision(
    client, archive_data, new_origin, unknown_revision
):
    branch_name = "master"
    snapshot = Snapshot(
        branches={
            branch_name.encode(): SnapshotBranch(
                target=hash_to_bytes(unknown_revision), target_type=TargetType.REVISION,
            )
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=now(), type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory",
        query_params={"origin_url": new_origin.url, "branch": branch_name},
    )

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    assert_contains(
        resp, f"Revision {unknown_revision } could not be found in the archive."
    )
Example #5
def test_lookup_snapshot_branch_names_filtering_paginated(
        archive_data, directory, revision):
    pattern = "foo"
    nb_branches_by_target_type = 10
    branches = {}
    for i in range(nb_branches_by_target_type):
        branches[f"branch/directory/bar{i}".encode()] = SnapshotBranch(
            target=hash_to_bytes(directory),
            target_type=TargetType.DIRECTORY,
        )
        branches[f"branch/revision/bar{i}".encode()] = SnapshotBranch(
            target=hash_to_bytes(revision),
            target_type=TargetType.REVISION,
        )
        branches[f"branch/directory/{pattern}{i}".encode()] = SnapshotBranch(
            target=hash_to_bytes(directory),
            target_type=TargetType.DIRECTORY,
        )
        branches[f"branch/revision/{pattern}{i}".encode()] = SnapshotBranch(
            target=hash_to_bytes(revision),
            target_type=TargetType.REVISION,
        )

    snapshot = Snapshot(branches=branches)
    archive_data.snapshot_add([snapshot])

    branches_count = nb_branches_by_target_type // 2

    for target_type in (
            DIRECTORY,
            REVISION,
    ):
        partial_branches = archive.lookup_snapshot(
            hash_to_hex(snapshot.id),
            target_types=[target_type],
            branches_count=branches_count,
            branch_name_include_substring=pattern,
        )
        branches = partial_branches["branches"]

        assert len(branches) == branches_count
        for branch_name, branch_data in branches.items():
            assert pattern in branch_name
            assert branch_data["target_type"] == target_type
        for i in range(branches_count):
            assert f"branch/{target_type}/{pattern}{i}" in branches
        assert (partial_branches["next_branch"] ==
                f"branch/{target_type}/{pattern}{branches_count}")

        partial_branches = archive.lookup_snapshot(
            hash_to_hex(snapshot.id),
            target_types=[target_type],
            branches_from=partial_branches["next_branch"],
            branch_name_include_substring=pattern,
        )
        branches = partial_branches["branches"]

        assert len(branches) == branches_count
        for branch_name, branch_data in branches.items():
            assert pattern in branch_name
            assert branch_data["target_type"] == target_type
        for i in range(branches_count, 2 * branches_count):
            assert f"branch/{target_type}/{pattern}{i}" in branches
        assert partial_branches["next_branch"] is None
Example #6
class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
    """Mostly the same loading scenario but with a ``parent_origin`` different from the
    ``origin``; as if the ``origin`` was a forge-fork of ``parent_origin``, detected
    by the metadata loader.

    Even though it walks slightly different code paths, the end result should stay the same.

    """
    @pytest.fixture(autouse=True)
    def init(self, swh_storage, datadir, tmp_path, mocker):
        archive_name = "testrepo"
        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
        tmp_path = str(tmp_path)
        self.repo_url = prepare_repository_from_archive(archive_path,
                                                        archive_name,
                                                        tmp_path=tmp_path)
        self.destination_path = os.path.join(tmp_path, archive_name)

        self.fetcher = MagicMock()
        self.fetcher.get_origin_metadata.return_value = []
        self.fetcher.get_parent_origins.return_value = [
            Origin(url=f"base://{self.repo_url}")
        ]
        self.fetcher_cls = MagicMock(return_value=self.fetcher)
        self.fetcher_cls.SUPPORTED_LISTERS = ["fake-lister"]
        mocker.patch(
            "swh.loader.core.metadata_fetchers._fetchers",
            return_value=[self.fetcher_cls],
        )

        self.loader = GitLoader(
            MagicMock(wraps=swh_storage),
            self.repo_url,
            lister_name="fake-lister",
            lister_instance_name="",
        )
        self.repo = dulwich.repo.Repo(self.destination_path)

    def test_no_previous_snapshot(self, mocker):
        statsd_report = mocker.patch.object(self.loader.statsd, "_report")
        res = self.loader.load()
        assert res == {"status": "eventful"}

        self.fetcher_cls.assert_called_once_with(
            credentials={},
            lister_name="fake-lister",
            lister_instance_name="",
            origin=Origin(url=self.repo_url),
        )
        self.fetcher.get_parent_origins.assert_called_once_with()

        # First tries the same origin
        assert self.loader.storage.origin_visit_get_latest.mock_calls == [
            call(
                self.repo_url,
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
            # As it does not already have a snapshot, fall back to the parent origin
            call(
                f"base://{self.repo_url}",
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
        ]

        # TODO: assert "incremental" is added to constant tags before these
        # metrics are sent
        assert [
            c for c in statsd_report.mock_calls if c[1][0].startswith("git_")
        ] == [
            call("git_total", "c", 1, {}, 1),
            call("git_ignored_refs_percent", "h", 0.0, {}, 1),
            call("git_known_refs_percent", "h", 0.0, {}, 1),
        ]
        assert self.loader.statsd.constant_tags == {
            "visit_type": "git",
            "incremental_enabled": True,
            "has_parent_snapshot": False,
            "has_previous_snapshot": False,
            "has_parent_origins": True,
        }

    def test_load_incremental(self, mocker):
        statsd_report = mocker.patch.object(self.loader.statsd, "_report")

        snapshot_id = b"\x01" * 20
        now = datetime.datetime.now(tz=datetime.timezone.utc)

        def ovgl(origin_url, allowed_statuses, require_snapshot, type):
            if origin_url == f"base://{self.repo_url}":
                return OriginVisit(origin=origin_url,
                                   visit=42,
                                   date=now,
                                   type="git")
            else:
                return None

        self.loader.storage.origin_visit_get_latest.side_effect = ovgl
        self.loader.storage.origin_visit_status_get_latest.return_value = (
            OriginVisitStatus(
                origin=f"base://{self.repo_url}",
                visit=42,
                snapshot=snapshot_id,
                date=now,
                status="full",
            ))
        self.loader.storage.snapshot_get_branches.return_value = {
            "id": snapshot_id,
            "branches": {
                b"refs/heads/master": SNAPSHOT1.branches[b"refs/heads/master"]
            },
            "next_branch": None,
        }

        res = self.loader.load()
        assert res == {"status": "eventful"}

        self.fetcher_cls.assert_called_once_with(
            credentials={},
            lister_name="fake-lister",
            lister_instance_name="",
            origin=Origin(url=self.repo_url),
        )
        self.fetcher.get_parent_origins.assert_called_once_with()

        # First tries the same origin
        assert self.loader.storage.origin_visit_get_latest.mock_calls == [
            call(
                self.repo_url,
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
            # As it does not already have a snapshot, fall back to the parent origin
            call(
                f"base://{self.repo_url}",
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
        ]

        # TODO: assert "incremental*" is added to constant tags before these
        # metrics are sent
        assert [
            c for c in statsd_report.mock_calls if c[1][0].startswith("git_")
        ] == [
            call("git_total", "c", 1, {}, 1),
            call("git_ignored_refs_percent", "h", 0.0, {}, 1),
            call("git_known_refs_percent", "h", 0.25, {}, 1),
        ]
        assert self.loader.statsd.constant_tags == {
            "visit_type": "git",
            "incremental_enabled": True,
            "has_parent_snapshot": True,
            "has_previous_snapshot": False,
            "has_parent_origins": True,
        }

        self.fetcher.reset_mock()
        self.fetcher_cls.reset_mock()
        if sys.version_info >= (3, 9, 0):
            self.loader.storage.reset_mock(return_value=True, side_effect=True)
        else:
            # Reimplement https://github.com/python/cpython/commit/aef7dc89879d099dc704bd8037b8a7686fb72838  # noqa
            # for old Python versions:
            def reset_mock(m):
                m.reset_mock(return_value=True, side_effect=True)
                for child in m._mock_children.values():
                    reset_mock(child)

            reset_mock(self.loader.storage)
        statsd_report.reset_mock()

        # Load again
        res = self.loader.load()
        assert res == {"status": "uneventful"}

        self.fetcher_cls.assert_called_once_with(
            credentials={},
            lister_name="fake-lister",
            lister_instance_name="",
            origin=Origin(url=self.repo_url),
        )
        self.fetcher.get_parent_origins.assert_not_called()

        assert self.loader.storage.origin_visit_get_latest.mock_calls == [
            # Tries the same origin, and finds a snapshot
            call(
                self.repo_url,
                type=None,
                allowed_statuses=None,
                require_snapshot=True,
            ),
            # also fetches the parent, in case the origin was rebased on the parent
            # since the last visit
            call(
                f"base://{self.repo_url}",
                type=None,
                allowed_statuses=None,
                require_snapshot=True,
            ),
        ]

        # TODO: assert "incremental*" is added to constant tags before these
        # metrics are sent
        assert [
            c for c in statsd_report.mock_calls if c[1][0].startswith("git_")
        ] == [
            call("git_total", "c", 1, {}, 1),
            call("git_ignored_refs_percent", "h", 0.0, {}, 1),
            call("git_known_refs_percent", "h", 1.0, {}, 1),
        ]
        assert self.loader.statsd.constant_tags == {
            "visit_type": "git",
            "incremental_enabled": True,
            "has_parent_snapshot":
            False,  # Because we reset the mock since last time
            "has_previous_snapshot": True,
            "has_parent_origins": True,
        }

    @pytest.mark.parametrize(
        "parent_snapshot,previous_snapshot,expected_git_known_refs_percent",
        [
            pytest.param(
                Snapshot(branches={
                    b"refs/heads/master":
                    SNAPSHOT1.branches[b"refs/heads/master"]
                }),
                Snapshot(branches={}),
                0.25,
                id="partial-parent-and-empty-previous",
            ),
            pytest.param(
                SNAPSHOT1,
                Snapshot(branches={
                    b"refs/heads/master":
                    SNAPSHOT1.branches[b"refs/heads/master"]
                }),
                1.0,
                id="full-parent-and-partial-previous",
            ),
        ],
    )
    def test_load_incremental_from(
        self,
        parent_snapshot,
        previous_snapshot,
        expected_git_known_refs_percent,
        mocker,
    ):
        """Snapshot of parent origin has all branches, but previous snapshot was
        empty."""
        statsd_report = mocker.patch.object(self.loader.statsd, "_report")

        now = datetime.datetime.now(tz=datetime.timezone.utc)

        self.loader.storage.snapshot_add([parent_snapshot, previous_snapshot])
        self.loader.storage.origin_add(
            [Origin(url=f"base://{self.repo_url}"),
             Origin(url=self.repo_url)])
        self.loader.storage.origin_visit_add([
            OriginVisit(
                origin=f"base://{self.repo_url}",
                visit=42,
                date=now - datetime.timedelta(seconds=-1),
                type="git",
            ),
            OriginVisit(
                origin=self.repo_url,
                visit=42,
                date=now - datetime.timedelta(seconds=-1),
                type="git",
            ),
        ])
        self.loader.storage.origin_visit_status_add([
            OriginVisitStatus(
                origin=f"base://{self.repo_url}",
                visit=42,
                type="git",
                snapshot=parent_snapshot.id,
                date=now,
                status="full",
            ),
            OriginVisitStatus(
                origin=self.repo_url,
                visit=42,
                type="git",
                snapshot=previous_snapshot.id,
                date=now,
                status="full",
            ),
        ])
        self.loader.storage.flush()

        res = self.loader.load()
        assert res == {"status": "eventful"}

        self.fetcher_cls.assert_called_once_with(
            credentials={},
            lister_name="fake-lister",
            lister_instance_name="",
            origin=Origin(url=self.repo_url),
        )
        self.fetcher.get_parent_origins.assert_called_once_with()

        # First tries the same origin
        assert self.loader.storage.origin_visit_get_latest.mock_calls == [
            call(
                self.repo_url,
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
            # As it does not already have a snapshot, fall back to the parent origin
            call(
                f"base://{self.repo_url}",
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
        ]

        assert self.loader.statsd.constant_tags == {
            "visit_type": "git",
            "incremental_enabled": True,
            "has_parent_snapshot": True,
            "has_previous_snapshot": True,
            "has_parent_origins": True,
        }
        assert [
            c for c in statsd_report.mock_calls if c[1][0].startswith("git_")
        ] == [
            call("git_total", "c", 1, {}, 1),
            call("git_ignored_refs_percent", "h", 0.0, {}, 1),
            call("git_known_refs_percent", "h",
                 expected_git_known_refs_percent, {}, 1),
        ]
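
The assertions on origin_visit_get_latest.mock_calls in the tests above all encode the same lookup order: the loader first checks the origin itself for a previous snapshot, and only then falls back to the parent origins reported by the metadata fetcher. Here is a simplified, hedged sketch of that fallback; find_base_visit is an illustrative helper, not the actual GitLoader implementation:

from typing import List, Optional

from swh.model.model import Origin, OriginVisit


def find_base_visit(
    storage, origin_url: str, parent_origins: List[Origin]
) -> Optional[OriginVisit]:
    # Try the origin first, then each parent origin (e.g. the forge the
    # origin was forked from), returning the first latest visit that has
    # a snapshot attached.
    for url in [origin_url] + [parent.url for parent in parent_origins]:
        visit = storage.origin_visit_get_latest(
            url,
            allowed_statuses=None,
            require_snapshot=True,
            type=None,
        )
        if visit is not None:
            return visit
    return None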
Example #7
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
    package = "org"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    release_id = "d38cc0b571cd41f3c85513864e049766b42032a7"
    versions = [
        ("0.0.2", release_id),
        ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"),
        ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.4", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.2",
        message=b"Synthetic release for NPM source package org version 0.0.2\n",
        target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"mooz <*****@*****.**>",
            name=b"mooz",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    contents = swh_storage.content_get(_expected_new_contents_first_visit)
    count = sum(0 if content is None else 1 for content in contents)
    assert count == len(_expected_new_contents_first_visit)

    assert (
        list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == []
    )

    assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == []

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://npmjs.com/",
    )

    for (version_name, release_id) in versions:
        release = swh_storage.release_get([hash_to_bytes(release_id)])[0]
        assert release.target_type == ModelObjectType.DIRECTORY
        directory_id = release.target
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY,
            object_id=directory_id,
        )
        release_swhid = CoreSWHID(
            object_type=ObjectType.RELEASE,
            object_id=hash_to_bytes(release_id),
        )
        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.npm.loader.NpmLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="replicate-npm-package-json",
                metadata=json.dumps(
                    json.loads(org_api_info)["versions"][version_name]
                ).encode(),
                origin="https://www.npmjs.com/package/org",
                release=release_swhid,
            )
        ]
        assert swh_storage.raw_extrinsic_metadata_get(
            directory_swhid,
            metadata_authority,
        ) == PagedResult(
            next_page_token=None,
            results=expected_metadata,
        )

    stats = get_stats(swh_storage)

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #8
    def get_snapshot(self) -> Snapshot:
        """Get the snapshot for the current visit.

        The main complexity of this function is mapping target objects to their
        types, as the `refs` dictionaries returned by the git server only give
        us the identifiers for the target objects, and not their types.

        The loader itself only knows the types of the objects that it has
        fetched from the server (as it has parsed them while loading them to
        the archive). As we only fetched an increment between the previous
        snapshot and the current state of the server, we are missing the type
        information for the objects that would already have been referenced by
        the previous snapshot, and that the git server didn't send us. We infer
        the type of these objects from the previous snapshot.

        """
        branches: Dict[bytes, Optional[SnapshotBranch]] = {}

        unfetched_refs: Dict[bytes, bytes] = {}

        # Retrieve types from the objects loaded by the current loader
        for ref_name, ref_object in self.remote_refs.items():
            if ref_name in self.symbolic_refs:
                continue
            target = hashutil.hash_to_bytes(ref_object.decode())
            target_type = self.ref_object_types.get(ref_object)
            if target_type:
                branches[ref_name] = SnapshotBranch(
                    target=target, target_type=target_type
                )
            else:
                # The object pointed at by this ref was not fetched, supposedly
                # because it existed in the base snapshot. We record it here,
                # and we can get it from the base snapshot later.
                unfetched_refs[ref_name] = target

        dangling_branches = {}
        # Handle symbolic references as alias branches
        for ref_name, target in self.symbolic_refs.items():
            branches[ref_name] = SnapshotBranch(
                target_type=TargetType.ALIAS,
                target=target,
            )
            if target not in branches and target not in unfetched_refs:
                # This handles the case where the pointer is "dangling".
                # There's a chance that a further symbolic reference
                # overrides this default value, which is totally fine.
                dangling_branches[target] = ref_name
                branches[target] = None

        if unfetched_refs:
            # Handle inference of object types from the contents of the
            # previous snapshot
            unknown_objects = {}

            base_snapshot_reverse_branches = {
                branch.target: branch
                for base_snapshot in reversed(self.base_snapshots)
                for branch in base_snapshot.branches.values()
                if branch and branch.target_type != TargetType.ALIAS
            }
            assert all(
                base_snapshot_reverse_branches[branch.target] == branch
                for branch in self.prev_snapshot.branches.values()
                if branch and branch.target_type != TargetType.ALIAS
            ), "base_snapshot_reverse_branches is not a superset of prev_snapshot"

            for ref_name, target in unfetched_refs.items():
                branch = base_snapshot_reverse_branches.get(target)
                branches[ref_name] = branch
                if not branch:
                    unknown_objects[ref_name] = target

            if unknown_objects:
                # This object was referenced by the server; We did not fetch
                # it, and we do not know it from the previous snapshot. This is
                # likely a bug in the loader.
                raise RuntimeError(
                    "Unknown objects referenced by remote refs: %s"
                    % (
                        ", ".join(
                            f"{name.decode()}: {hashutil.hash_to_hex(obj)}"
                            for name, obj in unknown_objects.items()
                        )
                    )
                )

        utils.warn_dangling_branches(
            branches, dangling_branches, logger, self.origin.url
        )

        self.snapshot = Snapshot(branches=branches)
        return self.snapshot
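
The type-inference step described in the docstring above boils down to a reverse lookup: branches of previously recorded snapshots already carry a target_type, so an unfetched ref can reuse the branch recorded for the same target. Below is a standalone sketch of that idea, using only the model classes shown above; resolve_unfetched_refs is illustrative and not part of the loader API:

from typing import Dict, List, Optional

from swh.model.model import Snapshot, SnapshotBranch, TargetType


def resolve_unfetched_refs(
    base_snapshots: List[Snapshot], unfetched_refs: Dict[bytes, bytes]
) -> Dict[bytes, Optional[SnapshotBranch]]:
    # Map each known target to its branch; iterate in reverse so that more
    # recent base snapshots take precedence, as in get_snapshot() above.
    reverse_branches: Dict[bytes, SnapshotBranch] = {
        branch.target: branch
        for snapshot in reversed(base_snapshots)
        for branch in snapshot.branches.values()
        if branch and branch.target_type != TargetType.ALIAS
    }
    # Targets that cannot be resolved map to None; the loader treats those
    # as a bug and raises RuntimeError.
    return {
        ref_name: reverse_branches.get(target)
        for ref_name, target in unfetched_refs.items()
    }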
Example #9
    synthetic=True,
)

SNAPSHOT = Snapshot(
    id=hash_to_bytes("2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"),
    branches={
        b"release/0.1.0":
        SnapshotBranch(
            target=RELEASE.id,
            target_type=TargetType.RELEASE,
        ),
        b"HEAD":
        SnapshotBranch(
            target=REVISION.id,
            target_type=TargetType.REVISION,
        ),
        b"alias":
        SnapshotBranch(
            target=b"HEAD",
            target_type=TargetType.ALIAS,
        ),
        b"evaluation":
        SnapshotBranch(  # branch deliberately not present in storage
            target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"),
            target_type=TargetType.REVISION,
        ),
    },
)


@pytest.fixture
Example #10
def test_load_extids() -> None:
    """Checks PackageLoader.load() skips iff it should, and writes (only)
    the new ExtIDs"""
    storage = get_storage("memory")

    dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                          object_id=b"e" * 20)

    rels = [
        Release(
            name=f"v{i}.0".encode(),
            message=b"blah\n",
            target=dir_swhid.object_id,
            target_type=ModelObjectType.DIRECTORY,
            synthetic=True,
        ) for i in (1, 2, 3, 4)
    ]
    storage.release_add(rels[0:3])

    origin = "http://example.org"
    rel1_swhid = rels[0].swhid()
    rel2_swhid = rels[1].swhid()
    rel3_swhid = rels[2].swhid()
    rel4_swhid = rels[3].swhid()

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
    ])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel3_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel4_swhid.object_id, dir_swhid.object_id),
        autospec=True,
    ).start()

    loader.load()

    assert loader._load_release.mock_calls == [  # type: ignore
        # v1.0: not loaded because there is already its (extid_type, extid, rel)
        #       in the storage.
        # v2.0: loaded, because there is already a similar extid, but different type
        call(
            StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
            Origin(url=origin),
        ),
        # v3.0: loaded despite having an (extid_type, extid) in storage, because
        #       the target of the extid is not in the previous snapshot
        call(
            StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
            Origin(url=origin),
        ),
        # v4.0: loaded, because there isn't its extid
        call(
            StubPackageInfo(origin, "example-v4.0.tar", "v4.0"),
            Origin(url=origin),
        ),
    ]

    # then check the snapshot has all the branches.
    # versions 2.0 to 4.0 all point to rel4_swhid (instead of the value of the last
    # snapshot), because they had to be loaded (mismatched extid), and the mocked
    # _load_release always returns rel4_swhid.
    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
            b"branch-v4.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
            rel3_swhid.object_id,
            rel4_swhid.object_id,
        ],
    )

    assert set(extids) == {
        # What we inserted at the beginning of the test:
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
        # Added by the loader:
        ExtID("extid-type1", b"extid-of-v2.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v4.0", rel4_swhid),
    }
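
The skip/reload behaviour this test pins down can be summarised as a single predicate: a version is skipped only if its (extid_type, extid) pair is already recorded and the release it points to is still referenced by the previous snapshot. Here is a hedged sketch of that rule, with plain dictionaries standing in for the storage ExtID lookup; should_skip_version is illustrative, not the PackageLoader API:

from typing import Dict, Set, Tuple

# Known bindings, e.g. {("extid-type1", b"extid-of-v1.0"): release_id_bytes}
KnownExtids = Dict[Tuple[str, bytes], bytes]


def should_skip_version(
    extid_type: str,
    extid: bytes,
    known_extids: KnownExtids,
    previous_snapshot_targets: Set[bytes],
) -> bool:
    # Mirrors the comments in the test above: v1.0 is skipped (same type and
    # extid, target still in the snapshot); v2.0 is reloaded (extid recorded
    # under a different type); v3.0 is reloaded (target not referenced by the
    # previous snapshot); v4.0 is reloaded (no extid recorded at all).
    target = known_extids.get((extid_type, extid))
    return target is not None and target in previous_snapshot_targets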
Example #11
    def test_original_malformed_objects(self, swh_storage,
                                        cook_extract_snapshot):
        """Tests that objects that were originally malformed:

        * are still interpreted somewhat correctly (if the loader could make sense of
          them), especially that they still have links to children
        * have their original manifest in the bundle
        """
        date = TimestampWithTimezone.from_numeric_offset(
            Timestamp(1643819927, 0), 0, False)

        content = Content.from_data(b"foo")
        swh_storage.content_add([content])

        # disordered
        # fmt: off
        malformed_dir_manifest = (b"" + b"100644 file2\x00" +
                                  content.sha1_git + b"100644 file1\x00" +
                                  content.sha1_git)
        # fmt: on
        directory = Directory(
            entries=(
                DirectoryEntry(name=b"file1",
                               type="file",
                               perms=0o100644,
                               target=content.sha1_git),
                DirectoryEntry(name=b"file2",
                               type="file",
                               perms=0o100644,
                               target=content.sha1_git),
            ),
            raw_manifest=f"tree {len(malformed_dir_manifest)}\x00".encode() +
            malformed_dir_manifest,
        )
        swh_storage.directory_add([directory])

        # 'committer' and 'author' swapped
        # fmt: off
        malformed_rev_manifest = (
            b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n" +
            b"committer me <*****@*****.**> 1643819927 +0000\n" +
            b"author me <*****@*****.**> 1643819927 +0000\n" + b"\n" +
            b"rev")
        # fmt: on
        revision = Revision(
            message=b"rev",
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            committer=Person.from_fullname(b"me <*****@*****.**>"),
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=directory.id,
            synthetic=True,
            raw_manifest=f"commit {len(malformed_rev_manifest)}\x00".encode() +
            malformed_rev_manifest,
        )
        swh_storage.revision_add([revision])

        # 'tag' and 'tagger' swapped
        # fmt: off
        malformed_rel_manifest = (
            b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n" +
            b"type commit\n" +
            b"tagger me <*****@*****.**> 1643819927 +0000\n" +
            b"tag v1.1.0\n")
        # fmt: on

        release = Release(
            name=b"v1.1.0",
            message=None,
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            target=revision.id,
            target_type=ModelObjectType.REVISION,
            synthetic=True,
            raw_manifest=f"tag {len(malformed_rel_manifest)}\x00".encode() +
            malformed_rel_manifest,
        )
        swh_storage.release_add([release])

        snapshot = Snapshot(
            branches={
                b"refs/tags/v1.1.0":
                SnapshotBranch(target=release.id,
                               target_type=TargetType.RELEASE),
                b"HEAD":
                SnapshotBranch(target=revision.id,
                               target_type=TargetType.REVISION),
            })
        swh_storage.snapshot_add([snapshot])

        with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p):
            tag = ert.repo[b"refs/tags/v1.1.0"]
            assert tag.as_raw_string() == malformed_rel_manifest

            commit = ert.repo[tag.object[1]]
            assert commit.as_raw_string() == malformed_rev_manifest

            tree = ert.repo[commit.tree]
            assert tree.as_raw_string() == malformed_dir_manifest
Example #12
    return config


class OriginHeadTestIndexer(OriginHeadIndexer):
    """Specific indexer whose configuration is enough to satisfy the
       indexing tests.
    """
    def persist_index_computations(self, results):
        self.results = results


SAMPLE_SNAPSHOT = Snapshot(branches={
    b"foo":
    None,
    b"HEAD":
    SnapshotBranch(
        target_type=TargetType.ALIAS,
        target=b"foo",
    ),
}, )


class OriginHead(unittest.TestCase):
    @pytest.fixture(autouse=True)
    def init(self, swh_config):
        super().setUp()
        self.indexer = OriginHeadTestIndexer()
        self.indexer.catch_exceptions = False
        fill_storage(self.indexer.storage)

    def test_git(self):
Example #13
SNAPSHOT1 = Snapshot(
    id=hash_to_bytes("a23699280a82a043f8c0994cf1631b568f716f95"),
    branches={
        b"HEAD": SnapshotBranch(
            target=b"refs/heads/master",
            target_type=TargetType.ALIAS,
        ),
        b"refs/heads/master": SnapshotBranch(
            target=hash_to_bytes("2f01f5ca7e391a2f08905990277faf81e709a649"),
            target_type=TargetType.REVISION,
        ),
        b"refs/heads/branch1": SnapshotBranch(
            target=hash_to_bytes("b0a77609903f767a2fd3d769904ef9ef68468b87"),
            target_type=TargetType.REVISION,
        ),
        b"refs/heads/branch2": SnapshotBranch(
            target=hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"),
            target_type=TargetType.REVISION,
        ),
        b"refs/tags/branch2-after-delete": SnapshotBranch(
            target=hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"),
            target_type=TargetType.REVISION,
        ),
        b"refs/tags/branch2-before-delete": SnapshotBranch(
            target=hash_to_bytes("1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b"),
            target_type=TargetType.REVISION,
        ),
    },
)
Example #14
            name=b"file1.ext",
            perms=0o644,
            type="file",
            target=b"\x11" * 20,
        ), ),
        raw_manifest=(
            b"tree 34\x00" + b"00644 file1.ext\x00"  # added two leading zeros
            + b"\x11" * 20),
    ),
]

SNAPSHOTS = [
    Snapshot(
        id=hash_to_bytes("9e78d7105c5e0f886487511e2a92377b4ee4c32a"),
        branches={
            b"master":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=REVISIONS[0].id)
        },
    ),
    Snapshot(
        id=hash_to_bytes("0e7f84ede9a254f2cd55649ad5240783f557e65f"),
        branches={
            b"target/revision":
            SnapshotBranch(
                target_type=TargetType.REVISION,
                target=REVISIONS[0].id,
            ),
            b"target/alias":
            SnapshotBranch(target_type=TargetType.ALIAS,
                           target=b"target/revision"),
            b"target/directory":
Example #15
def test_loader_two_visits(swh_storage, requests_mock_datadir_visits):
    """To ensure there is only one origin, but two visits, two revisions
    and two snapshots are created.

    The first visit creates a snapshot containing one tarball. The
    second visit creates a snapshot containing the same tarball and
    another tarball.

    """
    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": SNAPSHOT1.id.hex()
    }

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="partial",
        type="nixguix",
        snapshot=SNAPSHOT1.id,
    )

    check_snapshot(SNAPSHOT1, storage=swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    expected_snapshot_id_hex = "c1983a0a3f647548e1fb92f30339da6848fe9f7a"
    expected_snapshot_id = hash_to_bytes(expected_snapshot_id_hex)
    assert load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id_hex,
    }

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="partial",
        type="nixguix",
        snapshot=expected_snapshot_id,
    )

    # This ensures visits are incremental. Indeed, when an url is requested
    # a second time, the requests_mock_datadir_visits fixture serves the
    # data file ending with `_visit1`.
    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"evaluation":
            SnapshotBranch(
                target=hash_to_bytes(
                    "602140776b2ce6c9159bcf52ada73a297c063d5e"),
                target_type=TargetType.REVISION,
            ),
            b"https://github.com/owner-1/repository-1/revision-1.tgz":
            SnapshotBranch(
                target=hash_to_bytes(
                    "df7811b9644ed8ef088e2e7add62ed32b0bab15f"),
                target_type=TargetType.RELEASE,
            ),
            b"https://github.com/owner-2/repository-1/revision-1.tgz":
            SnapshotBranch(
                target=hash_to_bytes(
                    "5cc0115cd643902b837cb6cfbc9f5865bc5a7cb2"),
                target_type=TargetType.RELEASE,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 2,
        "directory": 5,
        "origin": 1,
        "origin_visit": 2,
        "release": 3,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 2,
    } == stats
                         "nixpkgs-swh_sources.json"),
            "rb",
    ) as f:
        return f.read()


SNAPSHOT1 = Snapshot(
    id=hash_to_bytes("fafcfe32016d018bd892114fce211f37a36a092a"),
    branches={
        b"evaluation":
        SnapshotBranch(
            target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"),
            target_type=TargetType.REVISION,
        ),
        b"https://github.com/owner-1/repository-1/revision-1.tgz":
        SnapshotBranch(
            target=hash_to_bytes("df7811b9644ed8ef088e2e7add62ed32b0bab15f"),
            target_type=TargetType.RELEASE,
        ),
        b"https://github.com/owner-3/repository-1/revision-1.tgz":
        SnapshotBranch(
            target=hash_to_bytes("dc7dc10a664396d5c88adc56352904db231bde14"),
            target_type=TargetType.RELEASE,
        ),
    },
)


def check_snapshot(snapshot: Snapshot, storage: StorageInterface):
    # The `evaluation` branch is allowed to be unresolvable. It's possible that,
    # at nixguix visit time, it has not been visited yet (the git loader is in
    # charge of its visit for now). For more details, check the
Example #17
def test_graph_revisions(swh_storage, up_to_date_graph, root_object, tag,
                         weird_branches):
    r"""
    Build objects::

                                     snp
                                    /|||\
                                   / ||| \
                        rel2 <----°  /|\  \----> rel4
                         |          / | \         |
                         v         /  v  \        v
          rev1  <------ rev2 <----°  dir4 \      rel3
           |             |            |    \      |
           v             v            v     \     |
          dir1          dir2         dir3   |     |
           |           /   |          |     |     |
           v          /    v          v     v     v
          cnt1  <----°    cnt2       cnt3  cnt4  cnt5

    If up_to_date_graph is true, then swh-graph contains all objects.
    Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph.

    If tag is False, rel2 is excluded.

    If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded.
    """
    from swh.graph.naive_client import NaiveClient as GraphClient

    # Create objects:

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc))
    author = Person.from_fullname(b"Foo <*****@*****.**>")
    cnt1 = Content.from_data(b"correct")
    cnt2 = Content.from_data(b"horse")
    cnt3 = Content.from_data(b"battery")
    cnt4 = Content.from_data(b"staple")
    cnt5 = Content.from_data(b"Tr0ub4dor&3")
    dir1 = Directory(entries=(DirectoryEntry(
        name=b"file1",
        type="file",
        perms=DentryPerms.content,
        target=cnt1.sha1_git,
    ), ))
    dir2 = Directory(entries=(
        DirectoryEntry(
            name=b"file1",
            type="file",
            perms=DentryPerms.content,
            target=cnt1.sha1_git,
        ),
        DirectoryEntry(
            name=b"file2",
            type="file",
            perms=DentryPerms.content,
            target=cnt2.sha1_git,
        ),
    ))
    dir3 = Directory(entries=(DirectoryEntry(
        name=b"file3",
        type="file",
        perms=DentryPerms.content,
        target=cnt3.sha1_git,
    ), ))
    dir4 = Directory(entries=(DirectoryEntry(
        name=b"directory3",
        type="dir",
        perms=DentryPerms.directory,
        target=dir3.id,
    ), ))
    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )
    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir2.id,
        parents=(rev1.id, ),
        type=RevisionType.GIT,
        synthetic=True,
    )

    rel2 = Release(
        name=b"1.0.0",
        message=b"tag2",
        target_type=ObjectType.REVISION,
        target=rev2.id,
        synthetic=True,
    )
    rel3 = Release(
        name=b"1.0.0-blob",
        message=b"tagged-blob",
        target_type=ObjectType.CONTENT,
        target=cnt5.sha1_git,
        synthetic=True,
    )
    rel4 = Release(
        name=b"1.0.0-weird",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel3.id,
        synthetic=True,
    )
    rel5 = Release(
        name=b"1.0.0:weirdname",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel2.id,
        synthetic=True,
    )

    # Create snapshot:

    branches = {
        b"refs/heads/master":
        SnapshotBranch(target=rev2.id, target_type=TargetType.REVISION),
    }
    if tag:
        branches[b"refs/tags/1.0.0"] = SnapshotBranch(
            target=rel2.id, target_type=TargetType.RELEASE)
    if weird_branches:
        branches[b"refs/heads/tree-ref"] = SnapshotBranch(
            target=dir4.id, target_type=TargetType.DIRECTORY)
        branches[b"refs/heads/blob-ref"] = SnapshotBranch(
            target=cnt4.sha1_git, target_type=TargetType.CONTENT)
        branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch(
            target=rel4.id, target_type=TargetType.RELEASE)
    snp = Snapshot(branches=branches)

    # "Fill" swh-graph

    if up_to_date_graph:
        nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (rev1, dir1),
            (rev2, dir2),
            (rev2, rev1),
            (snp, rev2),
        ]
        if tag:
            nodes.append(rel2)
            edges.append((rel2, rev2))
            edges.append((snp, rel2))
        if weird_branches:
            nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5])
            edges.extend([
                (dir3, cnt3),
                (dir4, dir3),
                (snp, dir4),
                (snp, cnt4),
                (snp, rel4),
                (rel4, rel3),
                (rel3, cnt5),
                (rel5, rev2),
            ])
    else:
        nodes = [cnt1, cnt2, cnt3, dir1, dir2, dir3, rev1]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (dir3, cnt3),
            (rev1, dir1),
        ]
        if tag:
            nodes.append(rel2)
        if weird_branches:
            nodes.extend([cnt3, dir3])
            edges.extend([(dir3, cnt3)])

    nodes = [str(n.swhid()) for n in nodes]
    edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges]

    # Add all objects to storage
    swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5])
    swh_storage.directory_add([dir1, dir2, dir3, dir4])
    swh_storage.revision_add([rev1, rev2])
    swh_storage.release_add([rel2, rel3, rel4, rel5])
    swh_storage.snapshot_add([snp])

    # Add spy on swh_storage, to make sure revision_log is not called
    # (the graph must be used instead)
    swh_storage = unittest.mock.MagicMock(wraps=swh_storage)

    # Add all objects to graph
    swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))

    # Cook
    backend = InMemoryVaultBackend()
    cooked_swhid = {
        RootObjects.SNAPSHOT: snp.swhid(),
        RootObjects.REVISION: rev2.swhid(),
        RootObjects.RELEASE: rel2.swhid(),
        RootObjects.WEIRD_RELEASE: rel5.swhid(),
    }[root_object]
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    if weird_branches:
        # git-fsck now rejects refs pointing to trees and blobs,
        # but some old git repos have them.
        cooker.use_fsck = False

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION):
            log_head = "master"
        elif root_object == RootObjects.RELEASE:
            log_head = "1.0.0"
        elif root_object == RootObjects.WEIRD_RELEASE:
            log_head = "release"
        else:
            assert False, root_object

        output = subprocess.check_output([
            "git",
            "-C",
            f"{tempdir}/{cooked_swhid}.git",
            "log",
            "--format=oneline",
            "--decorate=",
            log_head,
        ])

        assert output.decode(
        ) == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"

    # Make sure the graph was used instead of swh_storage.revision_log
    if root_object == RootObjects.SNAPSHOT:
        if up_to_date_graph:
            # The graph has everything, so the first call succeeds and returns
            # all objects transitively pointed by the snapshot
            swh_graph.visit_nodes.assert_has_calls([
                unittest.mock.call(str(snp.swhid()),
                                   edges="snp:*,rel:*,rev:rev"),
            ])
        else:
            # The graph does not have everything, so the first call returns nothing.
            # However, the second call (on the top rev) succeeds and returns
            # all objects but the rev and the rel
            swh_graph.visit_nodes.assert_has_calls([
                unittest.mock.call(str(snp.swhid()),
                                   edges="snp:*,rel:*,rev:rev"),
                unittest.mock.call(str(rev2.swhid()), edges="rev:rev"),
            ])
    elif root_object in (
            RootObjects.REVISION,
            RootObjects.RELEASE,
            RootObjects.WEIRD_RELEASE,
    ):
        swh_graph.visit_nodes.assert_has_calls(
            [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")])
    else:
        assert False, root_object

    if up_to_date_graph:
        swh_storage.revision_log.assert_not_called()
        swh_storage.revision_shortlog.assert_not_called()
    else:
        swh_storage.revision_log.assert_called()
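
The graph assertions above go through swh.graph.naive_client.NaiveClient, an in-memory graph built from SWHID strings that answers visit_nodes queries with an edge-type filter. Here is a minimal usage sketch based on the calls exercised by this test; the SWHIDs below are placeholders, not real archived objects:

from swh.graph.naive_client import NaiveClient

# Two revisions and a directory, identified by placeholder SWHIDs.
rev_a = "swh:1:rev:" + "11" * 20
rev_b = "swh:1:rev:" + "22" * 20
dir_a = "swh:1:dir:" + "33" * 20

client = NaiveClient(
    nodes=[rev_a, rev_b, dir_a],
    edges=[(rev_a, rev_b), (rev_b, dir_a)],  # rev_a -> rev_b -> dir_a
)

# Restricting traversal to revision-to-revision edges stops before the
# directory; this is how the cooker walks a revision history without
# calling swh_storage.revision_log.
print(list(client.visit_nodes(rev_a, edges="rev:rev")))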
Example #18
def test_pypi_good_origin():
    """Tests loading a revision whose origin we can find"""

    source_original_artifact = {
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "date": "2014-05-07T22:03:00",
        "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
        "size": 46644,
        "sha256":
        "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
        "blake2s256":
        "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        "archive_type": "tar",
    }

    dest_original_artifacts = [{
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "archive_type": "tar",
        "length": 46644,
        "checksums": {
            "sha1":
            "3289269f75b4111dd00eaea53e00330db9a1db12",
            "sha256":
            "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
            "sha1_git":
            "1e5c38014731242cfa8594839bcba8a0c4e158c5",
            "blake2s256":
            "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        },
    }]

    revision_id = b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2"
    row = {
        "id":
        revision_id,
        "directory":
        DIRECTORY_ID,
        "date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "committer_date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "type":
        "tar",
        "message":
        b"0.1.32",
        "metadata": {
            "original_artifact": source_original_artifact
        },
    }

    origin_url = "https://pypi.org/project/PyPDFLite/"

    storage = get_storage("memory")

    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=1,
            date=now(),
            status="partial",
            snapshot=snapshot_id,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=snapshot_id,
            branches={
                b"foo":
                SnapshotBranch(
                    target_type=TargetType.REVISION,
                    target=revision_id,
                )
            },
        )
    ])
    storage.metadata_authority_add([
        attr.evolve(PYPI_AUTHORITY, metadata={}),
        attr.evolve(SWH_AUTHORITY, metadata={}),
    ])
    storage.metadata_fetcher_add([FETCHER])
    deposit_cur = None
    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

    revision_swhid = CoreSWHID.from_string(
        "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2")

    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=PYPI_AUTHORITY,
    ) == PagedResult(
        results=[],
        next_page_token=None,
    )
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=SWH_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2014,
                    5,
                    7,
                    22,
                    3,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(dest_original_artifacts).encode(),
                origin=origin_url,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )

def test_ignore_displayname(swh_storage, use_graph):
    """Tests that the original authorship information is used instead of the
    configured display names; otherwise objects would not match their hash,
    and git-fsck/git-clone would fail.

    This tests both with and without swh-graph, as both configurations use different
    code paths to fetch revisions.
    """

    date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0),
                                                     0, False)
    legacy_person = Person.from_fullname(b"old me <*****@*****.**>")
    current_person = Person.from_fullname(b"me <*****@*****.**>")

    content = Content.from_data(b"foo")
    swh_storage.content_add([content])

    directory = Directory(
        entries=(DirectoryEntry(name=b"file1",
                                type="file",
                                perms=0o100644,
                                target=content.sha1_git), ), )
    swh_storage.directory_add([directory])

    revision = Revision(
        message=b"rev",
        author=legacy_person,
        date=date,
        committer=legacy_person,
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        synthetic=True,
    )
    swh_storage.revision_add([revision])

    release = Release(
        name=b"v1.1.0",
        message=None,
        author=legacy_person,
        date=date,
        target=revision.id,
        target_type=ObjectType.REVISION,
        synthetic=True,
    )
    swh_storage.release_add([release])

    snapshot = Snapshot(
        branches={
            b"refs/tags/v1.1.0":
            SnapshotBranch(target=release.id, target_type=TargetType.RELEASE),
            b"HEAD":
            SnapshotBranch(target=revision.id,
                           target_type=TargetType.REVISION),
        })
    swh_storage.snapshot_add([snapshot])

    # Add all objects to graph
    if use_graph:
        from swh.graph.naive_client import NaiveClient as GraphClient

        nodes = [
            str(x.swhid())
            for x in [content, directory, revision, release, snapshot]
        ]
        edges = [(str(x.swhid()), str(y.swhid())) for (x, y) in [
            (directory, content),
            (revision, directory),
            (release, revision),
            (snapshot, release),
            (snapshot, revision),
        ]]
        swh_graph = unittest.mock.Mock(
            wraps=GraphClient(nodes=nodes, edges=edges))
    else:
        swh_graph = None

    # Set a display name
    with swh_storage.db() as db:
        with db.transaction() as cur:
            cur.execute(
                "UPDATE person set displayname = %s where fullname = %s",
                (current_person.fullname, legacy_person.fullname),
            )

    # Check that the display name was applied in the storage
    assert swh_storage.revision_get([revision.id])[0] == attr.evolve(
        revision,
        author=current_person,
        committer=current_person,
    )

    # Cook
    cooked_swhid = snapshot.swhid()
    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        # If we are here, it means git-fsck succeeded when called by cooker.cook(),
        # so we already know the original person was used. Let's double-check.

        repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git")

        tag = repo[b"refs/tags/v1.1.0"]
        assert tag.tagger == legacy_person.fullname

        commit = repo[tag.object[1]]
        assert commit.author == legacy_person.fullname
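
As a side note, here is a small hedged sketch (not part of the original test) of how
one might list every ref dulwich sees in such an extracted bare repository;
repo_path stands in for the f"{tempdir}/{cooked_swhid}.git" path used above:

import dulwich.repo

def list_refs(repo_path):
    # Return a mapping of ref names to object ids, e.g. to verify that both
    # b"refs/tags/v1.1.0" and b"HEAD" ended up in the cooked bundle.
    repo = dulwich.repo.Repo(repo_path)
    return repo.get_refs()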
Exemplo n.º 20
0
def test_pypi_release_metadata_structure(
    swh_storage, requests_mock_datadir, _0805nexter_api_info
):
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_release_id = hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68")

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=expected_release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)

    release = swh_storage.release_get([expected_release_id])[0]
    assert release is not None

    release_swhid = CoreSWHID(
        object_type=ObjectType.RELEASE, object_id=expected_release_id
    )
    directory_swhid = ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=release.target
    )
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://pypi.org/",
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=directory_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.pypi.loader.PyPILoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="pypi-project-json",
            metadata=json.dumps(
                json.loads(_0805nexter_api_info)["releases"]["1.2.0"][0]
            ).encode(),
            origin=url,
            release=release_swhid,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        directory_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
Exemplo n.º 21
0
def test_load_upgrade_from_revision_extids(caplog):
    """Tests that, when loading incrementally based on a snapshot made by an old
    version of the loader, the loader will convert revisions to releases
    and add them to the storage.

    Also checks that, if an extid exists pointing to a non-existent revision
    (which should never happen, but you never know...), the release is loaded from
    scratch."""

    storage = get_storage("memory")

    origin = "http://example.org"
    dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"d" * 20)
    dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"e" * 20)

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc))
    person = Person.from_fullname(b"Jane Doe <*****@*****.**>")

    rev1 = Revision(
        message=b"blah",
        author=person,
        date=date,
        committer=person,
        committer_date=date,
        directory=dir1_swhid.object_id,
        type=RevisionType.TAR,
        synthetic=True,
    )

    rel1 = Release(
        name=b"v1.0",
        message=b"blah\n",
        author=person,
        date=date,
        target=dir1_swhid.object_id,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
    )

    rev1_swhid = rev1.swhid()
    rel1_swhid = rel1.swhid()
    rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION,
                           object_id=b"b" * 20)
    rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0),
        ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0),
    ])
    storage.revision_add([rev1])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev2_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel2_swhid.object_id, dir2_swhid.object_id),
        autospec=True,
    ).start()
    patch.object(
        loader,
        "get_versions",
        return_value=["v1.0", "v2.0", "v3.0"],
        autospec=True,
    ).start()

    caplog.set_level(logging.ERROR)

    loader.load()

    assert len(caplog.records) == 1
    (record, ) = caplog.records
    assert record.levelname == "ERROR"
    assert "Failed to upgrade branch branch-v2.0" in record.message

    assert loader._load_release.mock_calls == [
        # v1.0: not loaded because there is already a revision matching it
        # v2.0: loaded, as the revision is missing from the storage even though there
        #       is an extid
        call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
             Origin(url=origin)),
        # v3.0: loaded (did not exist yet)
        call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
             Origin(url=origin)),
    ]

    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
        ],
    )

    assert set(extids) == {
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid),
    }
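
For reference, a minimal sketch (assuming ExtID and CoreSWHID come from
swh.model.model and swh.model.swhids, as in the tests above) of how an external
identifier is tied to a target SWHID:

from swh.model.model import ExtID
from swh.model.swhids import CoreSWHID, ObjectType

# Hypothetical 20-byte release id, for illustration only.
release_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

# An ExtID maps an external identifier (here a made-up package version tag)
# to a Software Heritage object; extid_version distinguishes id schemes.
extid = ExtID(
    extid_type="extid-type1",
    extid=b"extid-of-v2.0",
    target=release_swhid,
    extid_version=0,
)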
Exemplo n.º 22
0
def test_pypi_multiple_visits_with_no_change(swh_storage, requests_mock_datadir):
    """Multiple visits with no changes results in 1 same snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    snapshot_id = hash_to_bytes("3dd50c1a0e48a7625cf1427e3190a65b787c774e")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": snapshot_id.hex(),
    }
    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=snapshot_id
    )

    expected_snapshot = Snapshot(
        id=snapshot_id,
        branches={
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)

    assert {
        "content": 6,
        "directory": 4,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    actual_load_status2 = loader.load()
    assert actual_load_status2 == {
        "status": "uneventful",
        "snapshot_id": actual_load_status2["snapshot_id"],
    }

    visit_status2 = assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi"
    )

    stats2 = get_stats(swh_storage)
    expected_stats2 = stats.copy()
    expected_stats2["origin_visit"] = 1 + 1
    assert expected_stats2 == stats2

    # same snapshot
    assert visit_status2.snapshot == snapshot_id
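
As an aside, the hard-coded snapshot ids asserted above are intrinsic identifiers
computed from the branches; a rough sketch (assuming the same swh.model classes
used throughout these tests):

from swh.model.model import Snapshot, SnapshotBranch, TargetType

# When no explicit id is given, swh.model derives it from the branch contents,
# so rebuilding the same branches yields the same snapshot identifier.
snapshot = Snapshot(
    branches={
        b"HEAD": SnapshotBranch(
            target=b"releases/1.2.0",
            target_type=TargetType.ALIAS,
        ),
    },
)
print(snapshot.id.hex())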
Exemplo n.º 23
0
def test_check_snapshot_failures(swh_storage):
    """Failure scenarios:

    0. snapshot parameter is not a snapshot
    1. snapshot id is correct but branches mismatched
    2. snapshot id is not correct, it's not found in the storage
    3. snapshot references an alias which does not exist
    4. snapshot is found in storage, targeted revision does not exist
    5. snapshot is found in storage, targeted revision exists but the directory the
       revision targets does not exist
    6. snapshot is found in storage, targeted revision exists, the directory targeted
       by the revision exists, but the content targeted by the directory does not exist
    7. snapshot is found in storage, targeted release does not exist

    """
    snap_id_hex = "2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"
    snapshot = Snapshot(
        id=hash_to_bytes(snap_id_hex),
        branches={
            b"master":
            SnapshotBranch(
                target=hash_to_bytes(hash_hex),
                target_type=TargetType.REVISION,
            ),
        },
    )

    s = swh_storage.snapshot_add([snapshot])
    assert s == {
        "snapshot:add": 1,
    }

    unexpected_snapshot = Snapshot(
        branches={
            b"tip":
            SnapshotBranch(  # wrong branch
                target=hash_to_bytes(hash_hex),
                target_type=TargetType.RELEASE)
        }, )

    # 0. not a Snapshot object, raise!
    with pytest.raises(
            AssertionError,
            match="argument 'expected_snapshot' must be a snapshot"):
        check_snapshot(ORIGIN_VISIT, swh_storage)

    # 1. snapshot id is correct but branches mismatched
    with pytest.raises(
            AssertionError):  # sadly debian build raises only assertion
        check_snapshot(attr.evolve(unexpected_snapshot, id=snapshot.id),
                       swh_storage)

    # 2. snapshot id is not correct, it's not found in the storage
    wrong_snap_id = hash_to_bytes("999666f535f882bc7f9a18fb16c9ad27fda7bab7")
    with pytest.raises(AssertionError, match="is not found"):
        check_snapshot(attr.evolve(unexpected_snapshot, id=wrong_snap_id),
                       swh_storage)

    # 3. snapshot references an inexistent alias
    snapshot0 = Snapshot(
        id=hash_to_bytes("123666f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"alias":
            SnapshotBranch(
                target=b"HEAD",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    swh_storage.snapshot_add([snapshot0])

    with pytest.raises(InconsistentAliasBranchError,
                       match="Alias branch HEAD"):
        check_snapshot(snapshot0, swh_storage)

    # 4. snapshot is found in storage, targeted revision does not exist

    rev_not_found = list(swh_storage.revision_missing([REVISION.id]))
    assert len(rev_not_found) == 1

    snapshot1 = Snapshot(
        id=hash_to_bytes("456666f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"alias":
            SnapshotBranch(
                target=b"HEAD",
                target_type=TargetType.ALIAS,
            ),
            b"HEAD":
            SnapshotBranch(
                target=REVISION.id,
                target_type=TargetType.REVISION,
            ),
        },
    )

    swh_storage.snapshot_add([snapshot1])

    with pytest.raises(InexistentObjectsError, match="Branch/Revision"):
        check_snapshot(snapshot1, swh_storage)

    # 5. snapshot is found in storage, targeted revision exists but the directory the
    # revision targets does not exist

    swh_storage.revision_add([REVISION])

    dir_not_found = list(swh_storage.directory_missing([REVISION.directory]))
    assert len(dir_not_found) == 1

    snapshot2 = Snapshot(
        id=hash_to_bytes("987123f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"alias":
            SnapshotBranch(
                target=b"HEAD",
                target_type=TargetType.ALIAS,
            ),
            b"HEAD":
            SnapshotBranch(
                target=REVISION.id,
                target_type=TargetType.REVISION,
            ),
        },
    )

    swh_storage.snapshot_add([snapshot2])
    with pytest.raises(InexistentObjectsError, match="Missing directories"):
        check_snapshot(snapshot2, swh_storage)

    assert DIRECTORY.id == REVISION.directory
    swh_storage.directory_add([DIRECTORY])

    # 6. snapshot is found in storage, targeted revision exists, the directory targeted
    # by the revision exists, but the content targeted by the directory does not exist

    assert DIRECTORY.entries[0].target == CONTENT.sha1_git
    not_found = list(
        swh_storage.content_missing_per_sha1_git([CONTENT.sha1_git]))
    assert len(not_found) == 1

    swh_storage.directory_add([DIRECTORY])

    snapshot3 = Snapshot(
        id=hash_to_bytes("091456f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"alias":
            SnapshotBranch(
                target=b"HEAD",
                target_type=TargetType.ALIAS,
            ),
            b"HEAD":
            SnapshotBranch(
                target=REVISION.id,
                target_type=TargetType.REVISION,
            ),
        },
    )

    swh_storage.snapshot_add([snapshot3])
    with pytest.raises(InexistentObjectsError, match="Missing content(s)"):
        check_snapshot(snapshot3, swh_storage)

    # 7. snapshot is found in storage, targeted release does not exist

    # release targets the revisions which exists
    assert RELEASE.target == REVISION.id

    snapshot4 = Snapshot(
        id=hash_to_bytes("789666f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"alias":
            SnapshotBranch(
                target=b"HEAD",
                target_type=TargetType.ALIAS,
            ),
            b"HEAD":
            SnapshotBranch(
                target=REVISION.id,
                target_type=TargetType.REVISION,
            ),
            b"release/0.1.0":
            SnapshotBranch(
                target=RELEASE.id,
                target_type=TargetType.RELEASE,
            ),
        },
    )

    swh_storage.snapshot_add([snapshot4])

    with pytest.raises(InexistentObjectsError, match="Branch/Release"):
        check_snapshot(snapshot4, swh_storage)
Exemplo n.º 24
0
def test_pypi_incremental_visit(swh_storage, requests_mock_datadir_visits):
    """With prior visit, 2nd load will result with a different snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    visit1_actual_load_status = loader.load()
    visit1_stats = get_stats(swh_storage)
    expected_snapshot_id = hash_to_bytes("3dd50c1a0e48a7625cf1427e3190a65b787c774e")
    assert visit1_actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id
    )

    assert {
        "content": 6,
        "directory": 4,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == visit1_stats

    # Reset internal state
    del loader._cached__raw_info
    del loader._cached_info

    visit2_actual_load_status = loader.load()
    visit2_stats = get_stats(swh_storage)

    assert visit2_actual_load_status["status"] == "eventful", visit2_actual_load_status
    expected_snapshot_id2 = hash_to_bytes("77febe6ff0faf6cc00dd015a6c9763579a9fb6c7")
    assert visit2_actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id2.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id2
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id2,
        branches={
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.3.0": SnapshotBranch(
                target=hash_to_bytes("a21b09cbec8e31f47307f196bb1f939effc26e11"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.3.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)

    assert {
        "content": 6 + 1,  # 1 more content
        "directory": 4 + 2,  # 2 more directories
        "origin": 1,
        "origin_visit": 1 + 1,
        "release": 2 + 1,  # 1 more release
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1 + 1,  # 1 more snapshot
    } == visit2_stats

    urls = [
        m.url
        for m in requests_mock_datadir_visits.request_history
        if m.url.startswith("https://files.pythonhosted.org")
    ]
    # visited each artifact once across 2 visits
    assert len(urls) == len(set(urls))
Exemplo n.º 25
0
class StorageData:
    """Data model objects to use within tests."""

    content = Content(
        data=b"42\n",
        length=3,
        sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
        sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        sha256=hash_to_bytes(
            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
        ),
        blake2s256=hash_to_bytes(
            "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
        ),
        status="visible",
    )
    content2 = Content(
        data=b"4242\n",
        length=5,
        sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
        sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
        sha256=hash_to_bytes(
            "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
        ),
        blake2s256=hash_to_bytes(
            "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
        ),
        status="visible",
    )
    content3 = Content(
        data=b"424242\n",
        length=7,
        sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
        sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
        sha256=hash_to_bytes(
            "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
        ),
        blake2s256=hash_to_bytes(
            "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
        ),
        status="visible",
        ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
    )
    contents: Tuple[Content, ...] = (content, content2, content3)

    skipped_content = SkippedContent(
        length=1024 * 1024 * 200,
        sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
        sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
        origin="file:///dev/zero",
    )
    skipped_content2 = SkippedContent(
        length=1024 * 1024 * 300,
        sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
        sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
    )
    skipped_contents: Tuple[SkippedContent,
                            ...] = (skipped_content, skipped_content2)

    directory5 = Directory(
        id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
        entries=(),
    )
    directory = Directory(
        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar\xc3",
                type="dir",
                target=directory5.id,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
    )
    directory2 = Directory(
        id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
        entries=tuple([
            DirectoryEntry(
                name=b"oof",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            )
        ], ),
    )
    directory3 = Directory(
        id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"subdir",
                type="dir",
                target=directory.id,
                perms=from_disk.DentryPerms.directory,
            ),
            DirectoryEntry(
                name=b"hello",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
        ], ),
    )
    directory4 = Directory(
        id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"),
        entries=tuple([
            DirectoryEntry(
                name=b"subdir1",
                type="dir",
                target=directory3.id,
                perms=from_disk.DentryPerms.directory,
            )
        ], ),
    )

    directory6 = Directory(
        id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=b"\x00" * 20,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar",
                type="dir",
                target=b"\x01" * 20,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
        raw_manifest=(
            b"tree 61\x00"
            b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"  # noqa
            b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"  # noqa
        ),
    )

    directories: Tuple[Directory, ...] = (
        directory2,
        directory,
        directory3,
        directory4,
        directory5,
        directory6,
    )

    revision = Revision(
        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
        },
        extra_headers=(
            (b"gpgsig", b"test123"),
            (b"mergetag", b"foo\\bar"),
            (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
        ),
        synthetic=True,
    )
    revision2 = Revision(
        id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    revision3 = Revision(
        id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id, revision2.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=True,
    )
    revision4 = Revision(
        id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([revision3.id]),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    git_revisions: Tuple[Revision,
                         ...] = (revision, revision2, revision3, revision4)

    hg_revision = Revision(
        id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
            "node": "a316dfb434af2b451c1f393496b7eaeda343f543",
        },
        extra_headers=(),
        synthetic=True,
    )
    hg_revision2 = Revision(
        id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")), ),
        synthetic=False,
    )
    hg_revision3 = Revision(
        id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id, hg_revision2.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")), ),
        synthetic=True,
    )
    hg_revision4 = Revision(
        id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([hg_revision3.id]),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")), ),
        synthetic=False,
    )
    hg_revisions: Tuple[Revision, ...] = (
        hg_revision,
        hg_revision2,
        hg_revision3,
        hg_revision4,
    )
    revisions: Tuple[Revision, ...] = git_revisions + hg_revisions

    origins: Tuple[Origin, ...] = (
        Origin(url="https://github.com/user1/repo1"),
        Origin(url="https://github.com/user2/repo1"),
        Origin(url="https://github.com/user3/repo1"),
        Origin(url="https://gitlab.com/user1/repo1"),
        Origin(url="https://gitlab.com/user2/repo1"),
        Origin(url="https://forge.softwareheritage.org/source/repo1"),
        Origin(url="https://example.рф/🏛️.txt"),
    )
    origin, origin2 = origins[:2]

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="http://hal.inria.example.com/",
    )
    metadata_authority2 = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="http://wikidata.example.com/",
    )
    authorities: Tuple[MetadataAuthority, ...] = (
        metadata_authority,
        metadata_authority2,
    )

    metadata_fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )
    metadata_fetcher2 = MetadataFetcher(
        name="swh-example",
        version="0.0.1",
    )
    fetchers: Tuple[MetadataFetcher,
                    ...] = (metadata_fetcher, metadata_fetcher2)

    date_visit1 = datetime.datetime(
        2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit2 = datetime.datetime(
        2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit3 = datetime.datetime(
        2018, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)

    type_visit1 = "git"
    type_visit2 = "hg"
    type_visit3 = "deb"

    origin_visit = OriginVisit(
        origin=origin.url,
        visit=1,
        date=date_visit1,
        type=type_visit1,
    )
    origin_visit2 = OriginVisit(
        origin=origin.url,
        visit=2,
        date=date_visit2,
        type=type_visit1,
    )
    origin_visit3 = OriginVisit(
        origin=origin2.url,
        visit=1,
        date=date_visit1,
        type=type_visit2,
    )
    origin_visits: Tuple[OriginVisit, ...] = (
        origin_visit,
        origin_visit2,
        origin_visit3,
    )

    release = Release(
        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
        name=b"v0.0.1",
        author=Person(
            name=b"olasd",
            email=b"*****@*****.**",
            fullname=b"olasd <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0042",
        ),
        target=revision.id,
        target_type=ObjectType.REVISION,
        message=b"synthetic release",
        synthetic=True,
    )
    release2 = Release(
        id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision2.id,
        target_type=ObjectType.REVISION,
        message=b"v0.0.2\nMisc performance improvements + bug fixes",
        synthetic=False,
    )
    release3 = Release(
        id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision3.id,
        target_type=ObjectType.REVISION,
        message=b"yet another synthetic release",
        synthetic=True,
    )

    releases: Tuple[Release, ...] = (release, release2, release3)

    snapshot = Snapshot(
        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
        branches={
            b"master":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    empty_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    complete_snapshot = Snapshot(
        id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"),
        branches={
            b"directory":
            SnapshotBranch(
                target=directory.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"directory2":
            SnapshotBranch(
                target=directory2.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"content":
            SnapshotBranch(
                target=content.sha1_git,
                target_type=TargetType.CONTENT,
            ),
            b"alias":
            SnapshotBranch(
                target=b"revision",
                target_type=TargetType.ALIAS,
            ),
            b"revision":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
            b"release":
            SnapshotBranch(
                target=release.id,
                target_type=TargetType.RELEASE,
            ),
            b"snapshot":
            SnapshotBranch(
                target=empty_snapshot.id,
                target_type=TargetType.SNAPSHOT,
            ),
            b"dangling":
            None,
        },
    )

    snapshots: Tuple[Snapshot,
                     ...] = (snapshot, empty_snapshot, complete_snapshot)

    content_metadata1 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin.url,
        discovery_date=datetime.datetime(
            2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    content_metadata2 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin2.url,
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="yaml",
        metadata=b"foo: bar",
    )
    content_metadata3 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
        origin=origin.url,
        visit=42,
        snapshot=snapshot.swhid(),
        release=release.swhid(),
        revision=revision.swhid(),
        directory=directory.swhid(),
        path=b"/foo/bar",
    )

    content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        content_metadata1,
        content_metadata2,
        content_metadata3,
    )

    origin_metadata1 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    origin_metadata2 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata3 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )

    origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        origin_metadata1,
        origin_metadata2,
        origin_metadata3,
    )

    extid1 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=revision.id),
        extid_type="git",
        extid=revision.id,
    )

    extid2 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=hg_revision.id),
        extid_type="mercurial",
        extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"),
    )

    extid3 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory.id),
        extid_type="directory",
        extid=b"something",
    )
    extid4 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory2.id),
        extid_type="directory",
        extid=b"something",
        extid_version=2,
    )

    extids: Tuple[ExtID, ...] = (
        extid1,
        extid2,
        extid3,
        extid4,
    )
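
A rough usage sketch, not part of the original fixture class: these objects can be
pushed into an in-memory storage the same way the tests above do, assuming
get_storage from swh.storage:

from swh.storage import get_storage

storage = get_storage("memory")
storage.origin_add(list(StorageData.origins))
storage.content_add(list(StorageData.contents))
storage.directory_add(list(StorageData.directories))
storage.revision_add(list(StorageData.revisions))
storage.release_add(list(StorageData.releases))
storage.snapshot_add(list(StorageData.snapshots))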
Exemplo n.º 26
0
def get(ids):
    return [
        Snapshot.from_dict(
            remove_keys(swh_storage.snapshot_get(ids[0]),
                        ("next_branch", )))
    ]
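
remove_keys is a helper from the surrounding test module, not shown here; a
stand-in definition (an assumption, purely for illustration) could be:

def remove_keys(d, keys):
    # Drop the given keys before rebuilding the model object, since
    # snapshot_get() returns pagination fields such as "next_branch" that
    # Snapshot.from_dict() would not expect.
    return {k: v for k, v in d.items() if k not in keys}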
Exemplo n.º 27
0
def test_npm_loader_duplicate_shasum(swh_storage, requests_mock_datadir):
    """Test with two versions that have exactly the same tarball"""
    package = "org_version_mismatch"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("ac867a4c22ba4e22a022d319f309714477412a5a")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    beta_release_id = "e6d5490a02ac2a8dcd49702f9ccd5a64c90a46f1"
    release_id = "f6985f437e28db6eb1b7533230e05ed99f2c91f0"
    versions = [
        ("0.0.3-beta", beta_release_id),
        ("0.0.3", release_id),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.3", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(beta_release_id)])[0] == Release(
        name=b"0.0.3-beta",
        message=(
            b"Synthetic release for NPM source package org_version_mismatch "
            b"version 0.0.3-beta\n"
        ),
        target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(beta_release_id),
    )

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.3",
        message=(
            b"Synthetic release for NPM source package org_version_mismatch "
            b"version 0.0.3\n"
        ),
        target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 55, 45, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    # Check incremental re-load keeps it unchanged

    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status == {
        "status": "uneventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )
Exemplo n.º 28
0
def test_arch_loader_load_one_version(datadir, requests_mock_datadir,
                                      swh_storage):
    loader = ArchLoader(
        swh_storage,
        url=EXPECTED_PACKAGES[1]["url"],
        artifacts=EXPECTED_PACKAGES[1]["artifacts"],
    )
    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_snapshot_id = "4020d0a278027550e336b5481a4159a913c91aa4"
    expected_release_id = "7681098c9e381f9cc8bd1724d57eeee2182982dc"

    assert expected_snapshot_id == actual_load_status["snapshot_id"]

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz":
            SnapshotBranch(
                target=hash_to_bytes(expected_release_id),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD":
            SnapshotBranch(
                target=b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 1,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    assert swh_storage.release_get([
        hash_to_bytes(expected_release_id)
    ])[0] == Release(
        name=b"1.12-1",
        message=b"Synthetic release for Arch Linux source package gzip version "
        b"1.12-1\n\nGNU compression utility\n",
        target=hash_to_bytes("bd742aaf422953a1f7a5e084ec4a7477491d63fb"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(
            b"Arch Linux ARM Build System <*****@*****.**>"),
        date=TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00"),
        id=hash_to_bytes(expected_release_id),
    )

    assert_last_visit_matches(
        swh_storage,
        url=EXPECTED_PACKAGES[1]["url"],
        status="full",
        type="arch",
        snapshot=expected_snapshot.id,
    )
Exemplo n.º 29
0
def test_arch_loader_load_n_versions(datadir, requests_mock_datadir,
                                     swh_storage):

    loader = ArchLoader(
        swh_storage,
        url=EXPECTED_PACKAGES[0]["url"],
        artifacts=EXPECTED_PACKAGES[0]["artifacts"],
    )
    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_snapshot_id = "832139d69a91edffcc3a96cca11deaf9255041c3"

    assert expected_snapshot_id == actual_load_status["snapshot_id"]

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"releases/1:1.3_20190211-1/"
            b"dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz":
            SnapshotBranch(
                target=hash_to_bytes(
                    "37efb727ff8bb8fbf92518aa8fe5fff2ad427d06"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1:1.3_20220414-1/"
            b"dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst":
            SnapshotBranch(
                target=hash_to_bytes(
                    "020d3f5627df7474f257fd04f1ede4415296e265"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD":
            SnapshotBranch(
                target=
                b"releases/1:1.3_20220414-1/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst",
                target_type=TargetType.ALIAS,
            ),
        },
    )

    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 2,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    assert_last_visit_matches(
        swh_storage,
        url=EXPECTED_PACKAGES[0]["url"],
        status="full",
        type="arch",
        snapshot=expected_snapshot.id,
    )
Exemplo n.º 30
0
    parents=(),
)

REVISIONS = [REVISION]

SNAPSHOTS = [
    Snapshot(
        id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"),
        branches={
            b"refs/heads/add-revision-origin-cache": SnapshotBranch(
                target=b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0s\xe7/\xe9l\x1e',
                target_type=TargetType.REVISION,
            ),
            b"refs/head/master": SnapshotBranch(
                target=b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm",
                target_type=TargetType.REVISION,
            ),
            b"HEAD": SnapshotBranch(
                target=b"refs/head/master", target_type=TargetType.ALIAS
            ),
            b"refs/tags/v0.0.103": SnapshotBranch(
                target=b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+\x0f\xdd',
                target_type=TargetType.RELEASE,
            ),
        },
    ),
    Snapshot(
        id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"),
        branches={
            b"3DLDF-1.1.4.tar.gz": SnapshotBranch(
                target=b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc"G\x99\x11',
                target_type=TargetType.REVISION,