def _check_debian_loading(swh_storage, packages):
    loader = DebianLoader(
        swh_storage,
        URL,
        packages=packages,
    )

    actual_load_status = loader.load()
    expected_snapshot_id = "474c0e3d5796d15363031c333533527d659c559e"
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id,
    }

    assert_last_visit_matches(
        swh_storage,
        URL,
        status="full",
        type="deb",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    expected_snapshot = Snapshot(
        id=hash_to_bytes(expected_snapshot_id),
        branches={
            b"releases/stretch/contrib/0.7.2-3": SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes("de96ae3d3e136f5c1709117059e2a2c05b8ee5ae"),
            ),
            b"releases/buster/contrib/0.7.2-4": SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes("11824484c585319302ea4fde4917faf78dfb1973"),
            ),
        },
    )

    check_snapshot(expected_snapshot, swh_storage)
def test_npm_artifact_with_no_upload_time(swh_storage, requests_mock_datadir):
    """With no upload time, the artifact is skipped"""
    package = "jammit-no-time"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    # no branch, as the single artifact has no upload time and is skipped
    expected_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    assert actual_load_status == {
        "status": "uneventful",
        "snapshot_id": expected_snapshot.id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="partial", type="npm", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)
def test_npm_artifact_with_no_intrinsic_metadata(swh_storage, requests_mock_datadir):
    """Skip artifact with no intrinsic metadata during ingestion"""
    package = "nativescript-telerik-analytics"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    # no branch as one artifact without any intrinsic metadata
    expected_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot.id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)
def test_origin_browse_directory_branch_with_non_resolvable_revision(
    client, archive_data, new_origin, unknown_revision
):
    branch_name = "master"
    snapshot = Snapshot(
        branches={
            branch_name.encode(): SnapshotBranch(
                target=hash_to_bytes(unknown_revision),
                target_type=TargetType.REVISION,
            )
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=now(), type="git")]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory",
        query_params={"origin_url": new_origin.url, "branch": branch_name},
    )

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    assert_contains(
        resp, f"Revision {unknown_revision} could not be found in the archive."
    )
def test_lookup_snapshot_branch_names_filtering_paginated(
    archive_data, directory, revision
):
    pattern = "foo"
    nb_branches_by_target_type = 10
    branches = {}
    for i in range(nb_branches_by_target_type):
        branches[f"branch/directory/bar{i}".encode()] = SnapshotBranch(
            target=hash_to_bytes(directory),
            target_type=TargetType.DIRECTORY,
        )
        branches[f"branch/revision/bar{i}".encode()] = SnapshotBranch(
            target=hash_to_bytes(revision),
            target_type=TargetType.REVISION,
        )
        branches[f"branch/directory/{pattern}{i}".encode()] = SnapshotBranch(
            target=hash_to_bytes(directory),
            target_type=TargetType.DIRECTORY,
        )
        branches[f"branch/revision/{pattern}{i}".encode()] = SnapshotBranch(
            target=hash_to_bytes(revision),
            target_type=TargetType.REVISION,
        )

    snapshot = Snapshot(branches=branches)
    archive_data.snapshot_add([snapshot])

    branches_count = nb_branches_by_target_type // 2

    for target_type in (
        DIRECTORY,
        REVISION,
    ):
        partial_branches = archive.lookup_snapshot(
            hash_to_hex(snapshot.id),
            target_types=[target_type],
            branches_count=branches_count,
            branch_name_include_substring=pattern,
        )
        branches = partial_branches["branches"]

        assert len(branches) == branches_count
        for branch_name, branch_data in branches.items():
            assert pattern in branch_name
            assert branch_data["target_type"] == target_type
        for i in range(branches_count):
            assert f"branch/{target_type}/{pattern}{i}" in branches
        assert (
            partial_branches["next_branch"]
            == f"branch/{target_type}/{pattern}{branches_count}"
        )

        partial_branches = archive.lookup_snapshot(
            hash_to_hex(snapshot.id),
            target_types=[target_type],
            branches_from=partial_branches["next_branch"],
            branch_name_include_substring=pattern,
        )
        branches = partial_branches["branches"]

        assert len(branches) == branches_count
        for branch_name, branch_data in branches.items():
            assert pattern in branch_name
            assert branch_data["target_type"] == target_type
        for i in range(branches_count, 2 * branches_count):
            assert f"branch/{target_type}/{pattern}{i}" in branches
        assert partial_branches["next_branch"] is None
class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound): """Mostly the same loading scenario but with a ``parent_origin`` different from the ``origin``; as if the ``origin`` was a forge-fork of ``parent_origin``, detected by the metadata loader. To walk slightly different paths, the end result should stay the same. """ @pytest.fixture(autouse=True) def init(self, swh_storage, datadir, tmp_path, mocker): archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") tmp_path = str(tmp_path) self.repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path=tmp_path) self.destination_path = os.path.join(tmp_path, archive_name) self.fetcher = MagicMock() self.fetcher.get_origin_metadata.return_value = [] self.fetcher.get_parent_origins.return_value = [ Origin(url=f"base://{self.repo_url}") ] self.fetcher_cls = MagicMock(return_value=self.fetcher) self.fetcher_cls.SUPPORTED_LISTERS = ["fake-lister"] mocker.patch( "swh.loader.core.metadata_fetchers._fetchers", return_value=[self.fetcher_cls], ) self.loader = GitLoader( MagicMock(wraps=swh_storage), self.repo_url, lister_name="fake-lister", lister_instance_name="", ) self.repo = dulwich.repo.Repo(self.destination_path) def test_no_previous_snapshot(self, mocker): statsd_report = mocker.patch.object(self.loader.statsd, "_report") res = self.loader.load() assert res == {"status": "eventful"} self.fetcher_cls.assert_called_once_with( credentials={}, lister_name="fake-lister", lister_instance_name="", origin=Origin(url=self.repo_url), ) self.fetcher.get_parent_origins.assert_called_once_with() # First tries the same origin assert self.loader.storage.origin_visit_get_latest.mock_calls == [ call( self.repo_url, allowed_statuses=None, require_snapshot=True, type=None, ), # As it does not already have a snapshot, fall back to the parent origin call( f"base://{self.repo_url}", allowed_statuses=None, require_snapshot=True, type=None, ), ] # TODO: assert "incremental" is added to constant tags before these # metrics are sent assert [ c for c in statsd_report.mock_calls if c[1][0].startswith("git_") ] == [ call("git_total", "c", 1, {}, 1), call("git_ignored_refs_percent", "h", 0.0, {}, 1), call("git_known_refs_percent", "h", 0.0, {}, 1), ] assert self.loader.statsd.constant_tags == { "visit_type": "git", "incremental_enabled": True, "has_parent_snapshot": False, "has_previous_snapshot": False, "has_parent_origins": True, } def test_load_incremental(self, mocker): statsd_report = mocker.patch.object(self.loader.statsd, "_report") snapshot_id = b"\x01" * 20 now = datetime.datetime.now(tz=datetime.timezone.utc) def ovgl(origin_url, allowed_statuses, require_snapshot, type): if origin_url == f"base://{self.repo_url}": return OriginVisit(origin=origin_url, visit=42, date=now, type="git") else: return None self.loader.storage.origin_visit_get_latest.side_effect = ovgl self.loader.storage.origin_visit_status_get_latest.return_value = ( OriginVisitStatus( origin=f"base://{self.repo_url}", visit=42, snapshot=snapshot_id, date=now, status="full", )) self.loader.storage.snapshot_get_branches.return_value = { "id": snapshot_id, "branches": { b"refs/heads/master": SNAPSHOT1.branches[b"refs/heads/master"] }, "next_branch": None, } res = self.loader.load() assert res == {"status": "eventful"} self.fetcher_cls.assert_called_once_with( credentials={}, lister_name="fake-lister", lister_instance_name="", origin=Origin(url=self.repo_url), ) self.fetcher.get_parent_origins.assert_called_once_with() # First tries the same origin 
assert self.loader.storage.origin_visit_get_latest.mock_calls == [ call( self.repo_url, allowed_statuses=None, require_snapshot=True, type=None, ), # As it does not already have a snapshot, fall back to the parent origin call( f"base://{self.repo_url}", allowed_statuses=None, require_snapshot=True, type=None, ), ] # TODO: assert "incremental*" is added to constant tags before these # metrics are sent assert [ c for c in statsd_report.mock_calls if c[1][0].startswith("git_") ] == [ call("git_total", "c", 1, {}, 1), call("git_ignored_refs_percent", "h", 0.0, {}, 1), call("git_known_refs_percent", "h", 0.25, {}, 1), ] assert self.loader.statsd.constant_tags == { "visit_type": "git", "incremental_enabled": True, "has_parent_snapshot": True, "has_previous_snapshot": False, "has_parent_origins": True, } self.fetcher.reset_mock() self.fetcher_cls.reset_mock() if sys.version_info >= (3, 9, 0): self.loader.storage.reset_mock(return_value=True, side_effect=True) else: # Reimplement https://github.com/python/cpython/commit/aef7dc89879d099dc704bd8037b8a7686fb72838 # noqa # for old Python versions: def reset_mock(m): m.reset_mock(return_value=True, side_effect=True) for child in m._mock_children.values(): reset_mock(child) reset_mock(self.loader.storage) statsd_report.reset_mock() # Load again res = self.loader.load() assert res == {"status": "uneventful"} self.fetcher_cls.assert_called_once_with( credentials={}, lister_name="fake-lister", lister_instance_name="", origin=Origin(url=self.repo_url), ) self.fetcher.get_parent_origins.assert_not_called() assert self.loader.storage.origin_visit_get_latest.mock_calls == [ # Tries the same origin, and finds a snapshot call( self.repo_url, type=None, allowed_statuses=None, require_snapshot=True, ), # also fetches the parent, in case the origin was rebased on the parent # since the last visit call( f"base://{self.repo_url}", type=None, allowed_statuses=None, require_snapshot=True, ), ] # TODO: assert "incremental*" is added to constant tags before these # metrics are sent assert [ c for c in statsd_report.mock_calls if c[1][0].startswith("git_") ] == [ call("git_total", "c", 1, {}, 1), call("git_ignored_refs_percent", "h", 0.0, {}, 1), call("git_known_refs_percent", "h", 1.0, {}, 1), ] assert self.loader.statsd.constant_tags == { "visit_type": "git", "incremental_enabled": True, "has_parent_snapshot": False, # Because we reset the mock since last time "has_previous_snapshot": True, "has_parent_origins": True, } @pytest.mark.parametrize( "parent_snapshot,previous_snapshot,expected_git_known_refs_percent", [ pytest.param( Snapshot(branches={ b"refs/heads/master": SNAPSHOT1.branches[b"refs/heads/master"] }), Snapshot(branches={}), 0.25, id="partial-parent-and-empty-previous", ), pytest.param( SNAPSHOT1, Snapshot(branches={ b"refs/heads/master": SNAPSHOT1.branches[b"refs/heads/master"] }), 1.0, id="full-parent-and-partial-previous", ), ], ) def test_load_incremental_from( self, parent_snapshot, previous_snapshot, expected_git_known_refs_percent, mocker, ): """Snapshot of parent origin has all branches, but previous snapshot was empty.""" statsd_report = mocker.patch.object(self.loader.statsd, "_report") now = datetime.datetime.now(tz=datetime.timezone.utc) self.loader.storage.snapshot_add([parent_snapshot, previous_snapshot]) self.loader.storage.origin_add( [Origin(url=f"base://{self.repo_url}"), Origin(url=self.repo_url)]) self.loader.storage.origin_visit_add([ OriginVisit( origin=f"base://{self.repo_url}", visit=42, date=now - datetime.timedelta(seconds=-1), 
type="git", ), OriginVisit( origin=self.repo_url, visit=42, date=now - datetime.timedelta(seconds=-1), type="git", ), ]) self.loader.storage.origin_visit_status_add([ OriginVisitStatus( origin=f"base://{self.repo_url}", visit=42, type="git", snapshot=parent_snapshot.id, date=now, status="full", ), OriginVisitStatus( origin=self.repo_url, visit=42, type="git", snapshot=previous_snapshot.id, date=now, status="full", ), ]) self.loader.storage.flush() res = self.loader.load() assert res == {"status": "eventful"} self.fetcher_cls.assert_called_once_with( credentials={}, lister_name="fake-lister", lister_instance_name="", origin=Origin(url=self.repo_url), ) self.fetcher.get_parent_origins.assert_called_once_with() # First tries the same origin assert self.loader.storage.origin_visit_get_latest.mock_calls == [ call( self.repo_url, allowed_statuses=None, require_snapshot=True, type=None, ), # As it does not already have a snapshot, fall back to the parent origin call( f"base://{self.repo_url}", allowed_statuses=None, require_snapshot=True, type=None, ), ] assert self.loader.statsd.constant_tags == { "visit_type": "git", "incremental_enabled": True, "has_parent_snapshot": True, "has_previous_snapshot": True, "has_parent_origins": True, } assert [ c for c in statsd_report.mock_calls if c[1][0].startswith("git_") ] == [ call("git_total", "c", 1, {}, 1), call("git_ignored_refs_percent", "h", 0.0, {}, 1), call("git_known_refs_percent", "h", expected_git_known_refs_percent, {}, 1), ]
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info): package = "org" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) release_id = "d38cc0b571cd41f3c85513864e049766b42032a7" versions = [ ("0.0.2", release_id), ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"), ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"), ] expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target=b"releases/0.0.4", target_type=TargetType.ALIAS ), **{ b"releases/" + version_name.encode(): SnapshotBranch( target=hash_to_bytes(version_id), target_type=TargetType.RELEASE, ) for (version_name, version_id) in versions }, }, ) check_snapshot(expected_snapshot, swh_storage) assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release( name=b"0.0.2", message=b"Synthetic release for NPM source package org version 0.0.2\n", target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"), target_type=ModelObjectType.DIRECTORY, synthetic=True, author=Person( fullname=b"mooz <*****@*****.**>", name=b"mooz", email=b"*****@*****.**", ), date=TimestampWithTimezone.from_datetime( datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc) ), id=hash_to_bytes(release_id), ) contents = swh_storage.content_get(_expected_new_contents_first_visit) count = sum(0 if content is None else 1 for content in contents) assert count == len(_expected_new_contents_first_visit) assert ( list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == [] ) assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == [] metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", ) for (version_name, release_id) in versions: release = swh_storage.release_get([hash_to_bytes(release_id)])[0] assert release.target_type == ModelObjectType.DIRECTORY directory_id = release.target directory_swhid = ExtendedSWHID( object_type=ExtendedObjectType.DIRECTORY, object_id=directory_id, ) release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=hash_to_bytes(release_id), ) expected_metadata = [ RawExtrinsicMetadata( target=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.npm.loader.NpmLoader", version=__version__, ), discovery_date=loader.visit_date, format="replicate-npm-package-json", metadata=json.dumps( json.loads(org_api_info)["versions"][version_name] ).encode(), origin="https://www.npmjs.com/package/org", release=release_swhid, ) ] assert swh_storage.raw_extrinsic_metadata_get( directory_swhid, metadata_authority, ) == PagedResult( next_page_token=None, results=expected_metadata, ) stats = get_stats(swh_storage) assert { "content": len(_expected_new_contents_first_visit), "directory": len(_expected_new_directories_first_visit), "origin": 1, "origin_visit": 1, "release": len(_expected_new_releases_first_visit), "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats
def get_snapshot(self) -> Snapshot: """Get the snapshot for the current visit. The main complexity of this function is mapping target objects to their types, as the `refs` dictionaries returned by the git server only give us the identifiers for the target objects, and not their types. The loader itself only knows the types of the objects that it has fetched from the server (as it has parsed them while loading them to the archive). As we only fetched an increment between the previous snapshot and the current state of the server, we are missing the type information for the objects that would already have been referenced by the previous snapshot, and that the git server didn't send us. We infer the type of these objects from the previous snapshot. """ branches: Dict[bytes, Optional[SnapshotBranch]] = {} unfetched_refs: Dict[bytes, bytes] = {} # Retrieve types from the objects loaded by the current loader for ref_name, ref_object in self.remote_refs.items(): if ref_name in self.symbolic_refs: continue target = hashutil.hash_to_bytes(ref_object.decode()) target_type = self.ref_object_types.get(ref_object) if target_type: branches[ref_name] = SnapshotBranch( target=target, target_type=target_type ) else: # The object pointed at by this ref was not fetched, supposedly # because it existed in the base snapshot. We record it here, # and we can get it from the base snapshot later. unfetched_refs[ref_name] = target dangling_branches = {} # Handle symbolic references as alias branches for ref_name, target in self.symbolic_refs.items(): branches[ref_name] = SnapshotBranch( target_type=TargetType.ALIAS, target=target, ) if target not in branches and target not in unfetched_refs: # This handles the case where the pointer is "dangling". # There's a chance that a further symbolic reference # override this default value, which is totally fine. dangling_branches[target] = ref_name branches[target] = None if unfetched_refs: # Handle inference of object types from the contents of the # previous snapshot unknown_objects = {} base_snapshot_reverse_branches = { branch.target: branch for base_snapshot in reversed(self.base_snapshots) for branch in base_snapshot.branches.values() if branch and branch.target_type != TargetType.ALIAS } assert all( base_snapshot_reverse_branches[branch.target] == branch for branch in self.prev_snapshot.branches.values() if branch and branch.target_type != TargetType.ALIAS ), "base_snapshot_reverse_branches is not a superset of prev_snapshot" for ref_name, target in unfetched_refs.items(): branch = base_snapshot_reverse_branches.get(target) branches[ref_name] = branch if not branch: unknown_objects[ref_name] = target if unknown_objects: # This object was referenced by the server; We did not fetch # it, and we do not know it from the previous snapshot. This is # likely a bug in the loader. raise RuntimeError( "Unknown objects referenced by remote refs: %s" % ( ", ".join( f"{name.decode()}: {hashutil.hash_to_hex(obj)}" for name, obj in unknown_objects.items() ) ) ) utils.warn_dangling_branches( branches, dangling_branches, logger, self.origin.url ) self.snapshot = Snapshot(branches=branches) return self.snapshot
    synthetic=True,
)

SNAPSHOT = Snapshot(
    id=hash_to_bytes("2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"),
    branches={
        b"release/0.1.0": SnapshotBranch(
            target=RELEASE.id,
            target_type=TargetType.RELEASE,
        ),
        b"HEAD": SnapshotBranch(
            target=REVISION.id,
            target_type=TargetType.REVISION,
        ),
        b"alias": SnapshotBranch(
            target=b"HEAD",
            target_type=TargetType.ALIAS,
        ),
        b"evaluation": SnapshotBranch(
            # branch deliberately targeting an object that does not exist in storage
            target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"),
            target_type=TargetType.REVISION,
        ),
    },
)


@pytest.fixture
def test_load_extids() -> None: """Checks PackageLoader.load() skips iff it should, and writes (only) the new ExtIDs""" storage = get_storage("memory") dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20) rels = [ Release( name=f"v{i}.0".encode(), message=b"blah\n", target=dir_swhid.object_id, target_type=ModelObjectType.DIRECTORY, synthetic=True, ) for i in (1, 2, 3, 4) ] storage.release_add(rels[0:3]) origin = "http://example.org" rel1_swhid = rels[0].swhid() rel2_swhid = rels[1].swhid() rel3_swhid = rels[2].swhid() rel4_swhid = rels[3].swhid() # Results of a previous load storage.extid_add([ ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid), ]) last_snapshot = Snapshot( branches={ b"v1.0": SnapshotBranch(target_type=TargetType.RELEASE, target=rel1_swhid.object_id), b"v2.0": SnapshotBranch(target_type=TargetType.RELEASE, target=rel2_swhid.object_id), b"v3.0": SnapshotBranch(target_type=TargetType.RELEASE, target=rel3_swhid.object_id), }) storage.snapshot_add([last_snapshot]) date = datetime.datetime.now(tz=datetime.timezone.utc) storage.origin_add([Origin(url=origin)]) storage.origin_visit_add([ OriginVisit(origin="http://example.org", visit=1, date=date, type="tar") ]) storage.origin_visit_status_add([ OriginVisitStatus( origin=origin, visit=1, status="full", date=date, snapshot=last_snapshot.id, ) ]) loader = StubPackageLoader(storage, "http://example.org") patch.object( loader, "_load_release", return_value=(rel4_swhid.object_id, dir_swhid.object_id), autospec=True, ).start() loader.load() assert loader._load_release.mock_calls == [ # type: ignore # v1.0: not loaded because there is already its (extid_type, extid, rel) # in the storage. # v2.0: loaded, because there is already a similar extid, but different type call( StubPackageInfo(origin, "example-v2.0.tar", "v2.0"), Origin(url=origin), ), # v3.0: loaded despite having an (extid_type, extid) in storage, because # the target of the extid is not in the previous snapshot call( StubPackageInfo(origin, "example-v3.0.tar", "v3.0"), Origin(url=origin), ), # v4.0: loaded, because there isn't its extid call( StubPackageInfo(origin, "example-v4.0.tar", "v4.0"), Origin(url=origin), ), ] # then check the snapshot has all the branches. # versions 2.0 to 4.0 all point to rel4_swhid (instead of the value of the last # snapshot), because they had to be loaded (mismatched extid), and the mocked # _load_release always returns rel4_swhid. snapshot = Snapshot( branches={ b"branch-v1.0": SnapshotBranch(target_type=TargetType.RELEASE, target=rel1_swhid.object_id), b"branch-v2.0": SnapshotBranch(target_type=TargetType.RELEASE, target=rel4_swhid.object_id), b"branch-v3.0": SnapshotBranch(target_type=TargetType.RELEASE, target=rel4_swhid.object_id), b"branch-v4.0": SnapshotBranch(target_type=TargetType.RELEASE, target=rel4_swhid.object_id), }) assert snapshot_get_latest(storage, origin) == snapshot extids = storage.extid_get_from_target( ObjectType.RELEASE, [ rel1_swhid.object_id, rel2_swhid.object_id, rel3_swhid.object_id, rel4_swhid.object_id, ], ) assert set(extids) == { # What we inserted at the beginning of the test: ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid), # Added by the loader: ExtID("extid-type1", b"extid-of-v2.0", rel4_swhid), ExtID("extid-type2", b"extid-of-v3.0", rel4_swhid), ExtID("extid-type2", b"extid-of-v4.0", rel4_swhid), }
def test_original_malformed_objects(self, swh_storage, cook_extract_snapshot): """Tests that objects that were originally malformed: * are still interpreted somewhat correctly (if the loader could make sense of them), especially that they still have links to children * have their original manifest in the bundle """ date = TimestampWithTimezone.from_numeric_offset( Timestamp(1643819927, 0), 0, False) content = Content.from_data(b"foo") swh_storage.content_add([content]) # disordered # fmt: off malformed_dir_manifest = (b"" + b"100644 file2\x00" + content.sha1_git + b"100644 file1\x00" + content.sha1_git) # fmt: on directory = Directory( entries=( DirectoryEntry(name=b"file1", type="file", perms=0o100644, target=content.sha1_git), DirectoryEntry(name=b"file2", type="file", perms=0o100644, target=content.sha1_git), ), raw_manifest=f"tree {len(malformed_dir_manifest)}\x00".encode() + malformed_dir_manifest, ) swh_storage.directory_add([directory]) # 'committer' and 'author' swapped # fmt: off malformed_rev_manifest = ( b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n" + b"committer me <*****@*****.**> 1643819927 +0000\n" + b"author me <*****@*****.**> 1643819927 +0000\n" + b"\n" + b"rev") # fmt: on revision = Revision( message=b"rev", author=Person.from_fullname(b"me <*****@*****.**>"), date=date, committer=Person.from_fullname(b"me <*****@*****.**>"), committer_date=date, parents=(), type=RevisionType.GIT, directory=directory.id, synthetic=True, raw_manifest=f"commit {len(malformed_rev_manifest)}\x00".encode() + malformed_rev_manifest, ) swh_storage.revision_add([revision]) # 'tag' and 'tagger' swapped # fmt: off malformed_rel_manifest = ( b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n" + b"type commit\n" + b"tagger me <*****@*****.**> 1643819927 +0000\n" + b"tag v1.1.0\n") # fmt: on release = Release( name=b"v1.1.0", message=None, author=Person.from_fullname(b"me <*****@*****.**>"), date=date, target=revision.id, target_type=ModelObjectType.REVISION, synthetic=True, raw_manifest=f"tag {len(malformed_rel_manifest)}\x00".encode() + malformed_rel_manifest, ) swh_storage.release_add([release]) snapshot = Snapshot( branches={ b"refs/tags/v1.1.0": SnapshotBranch(target=release.id, target_type=TargetType.RELEASE), b"HEAD": SnapshotBranch(target=revision.id, target_type=TargetType.REVISION), }) swh_storage.snapshot_add([snapshot]) with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p): tag = ert.repo[b"refs/tags/v1.1.0"] assert tag.as_raw_string() == malformed_rel_manifest commit = ert.repo[tag.object[1]] assert commit.as_raw_string() == malformed_rev_manifest tree = ert.repo[commit.tree] assert tree.as_raw_string() == malformed_dir_manifest
    return config


class OriginHeadTestIndexer(OriginHeadIndexer):
    """Specific indexer whose configuration is enough to satisfy the indexing
    tests.
    """

    def persist_index_computations(self, results):
        self.results = results


SAMPLE_SNAPSHOT = Snapshot(
    branches={
        b"foo": None,
        b"HEAD": SnapshotBranch(
            target_type=TargetType.ALIAS,
            target=b"foo",
        ),
    },
)


class OriginHead(unittest.TestCase):
    @pytest.fixture(autouse=True)
    def init(self, swh_config):
        super().setUp()
        self.indexer = OriginHeadTestIndexer()
        self.indexer.catch_exceptions = False
        fill_storage(self.indexer.storage)

    def test_git(self):
SNAPSHOT1 = Snapshot(
    id=hash_to_bytes("a23699280a82a043f8c0994cf1631b568f716f95"),
    branches={
        b"HEAD": SnapshotBranch(
            target=b"refs/heads/master",
            target_type=TargetType.ALIAS,
        ),
        b"refs/heads/master": SnapshotBranch(
            target=hash_to_bytes("2f01f5ca7e391a2f08905990277faf81e709a649"),
            target_type=TargetType.REVISION,
        ),
        b"refs/heads/branch1": SnapshotBranch(
            target=hash_to_bytes("b0a77609903f767a2fd3d769904ef9ef68468b87"),
            target_type=TargetType.REVISION,
        ),
        b"refs/heads/branch2": SnapshotBranch(
            target=hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"),
            target_type=TargetType.REVISION,
        ),
        b"refs/tags/branch2-after-delete": SnapshotBranch(
            target=hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"),
            target_type=TargetType.REVISION,
        ),
        b"refs/tags/branch2-before-delete": SnapshotBranch(
            target=hash_to_bytes("1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b"),
            target_type=TargetType.REVISION,
        ),
    },
)
name=b"file1.ext", perms=0o644, type="file", target=b"\x11" * 20, ), ), raw_manifest=( b"tree 34\x00" + b"00644 file1.ext\x00" # added two leading zeros + b"\x11" * 20), ), ] SNAPSHOTS = [ Snapshot( id=hash_to_bytes("9e78d7105c5e0f886487511e2a92377b4ee4c32a"), branches={ b"master": SnapshotBranch(target_type=TargetType.REVISION, target=REVISIONS[0].id) }, ), Snapshot( id=hash_to_bytes("0e7f84ede9a254f2cd55649ad5240783f557e65f"), branches={ b"target/revision": SnapshotBranch( target_type=TargetType.REVISION, target=REVISIONS[0].id, ), b"target/alias": SnapshotBranch(target_type=TargetType.ALIAS, target=b"target/revision"), b"target/directory":
def test_loader_two_visits(swh_storage, requests_mock_datadir_visits): """To ensure there is only one origin, but two visits, two revisions and two snapshots are created. The first visit creates a snapshot containing one tarball. The second visit creates a snapshot containing the same tarball and another tarball. """ loader = NixGuixLoader(swh_storage, sources_url) load_status = loader.load() assert load_status == { "status": "eventful", "snapshot_id": SNAPSHOT1.id.hex() } assert_last_visit_matches( swh_storage, sources_url, status="partial", type="nixguix", snapshot=SNAPSHOT1.id, ) check_snapshot(SNAPSHOT1, storage=swh_storage) stats = get_stats(swh_storage) assert { "content": 1, "directory": 3, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats loader = NixGuixLoader(swh_storage, sources_url) load_status = loader.load() expected_snapshot_id_hex = "c1983a0a3f647548e1fb92f30339da6848fe9f7a" expected_snapshot_id = hash_to_bytes(expected_snapshot_id_hex) assert load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id_hex, } assert_last_visit_matches( swh_storage, sources_url, status="partial", type="nixguix", snapshot=expected_snapshot_id, ) # This ensures visits are incremental. Indeed, if we request a # second time an url, because of the requests_mock_datadir_visits # fixture, the file has to end with `_visit1`. expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes( "602140776b2ce6c9159bcf52ada73a297c063d5e"), target_type=TargetType.REVISION, ), b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( target=hash_to_bytes( "df7811b9644ed8ef088e2e7add62ed32b0bab15f"), target_type=TargetType.RELEASE, ), b"https://github.com/owner-2/repository-1/revision-1.tgz": SnapshotBranch( target=hash_to_bytes( "5cc0115cd643902b837cb6cfbc9f5865bc5a7cb2"), target_type=TargetType.RELEASE, ), }, ) check_snapshot(expected_snapshot, storage=swh_storage) stats = get_stats(swh_storage) assert { "content": 2, "directory": 5, "origin": 1, "origin_visit": 2, "release": 3, "revision": 0, "skipped_content": 0, "snapshot": 2, } == stats
"nixpkgs-swh_sources.json"), "rb", ) as f: return f.read() SNAPSHOT1 = Snapshot( id=hash_to_bytes("fafcfe32016d018bd892114fce211f37a36a092a"), branches={ b"evaluation": SnapshotBranch( target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"), target_type=TargetType.REVISION, ), b"https://github.com/owner-1/repository-1/revision-1.tgz": SnapshotBranch( target=hash_to_bytes("df7811b9644ed8ef088e2e7add62ed32b0bab15f"), target_type=TargetType.RELEASE, ), b"https://github.com/owner-3/repository-1/revision-1.tgz": SnapshotBranch( target=hash_to_bytes("dc7dc10a664396d5c88adc56352904db231bde14"), target_type=TargetType.RELEASE, ), }, ) def check_snapshot(snapshot: Snapshot, storage: StorageInterface): # The `evaluation` branch is allowed to be unresolvable. It's possible at current # nixguix visit time, it is not yet visited (the git loader is in charge of its # visit for now). For more details, check the
def test_graph_revisions(swh_storage, up_to_date_graph, root_object, tag, weird_branches): r""" Build objects:: snp /|||\ / ||| \ rel2 <----° /|\ \----> rel4 | / | \ | v / v \ v rev1 <------ rev2 <----° dir4 \ rel3 | | | \ | v v v \ | dir1 dir2 dir3 | | | / | | | | v / v v v v cnt1 <----° cnt2 cnt3 cnt4 cnt5 If up_to_date_graph is true, then swh-graph contains all objects. Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph. If tag is False, rel2 is excluded. If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded. """ from swh.graph.naive_client import NaiveClient as GraphClient # Create objects: date = TimestampWithTimezone.from_datetime( datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)) author = Person.from_fullname(b"Foo <*****@*****.**>") cnt1 = Content.from_data(b"correct") cnt2 = Content.from_data(b"horse") cnt3 = Content.from_data(b"battery") cnt4 = Content.from_data(b"staple") cnt5 = Content.from_data(b"Tr0ub4dor&3") dir1 = Directory(entries=(DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), )) dir2 = Directory(entries=( DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), DirectoryEntry( name=b"file2", type="file", perms=DentryPerms.content, target=cnt2.sha1_git, ), )) dir3 = Directory(entries=(DirectoryEntry( name=b"file3", type="file", perms=DentryPerms.content, target=cnt3.sha1_git, ), )) dir4 = Directory(entries=(DirectoryEntry( name=b"directory3", type="dir", perms=DentryPerms.directory, target=dir3.id, ), )) rev1 = Revision( message=b"msg1", date=date, committer_date=date, author=author, committer=author, directory=dir1.id, type=RevisionType.GIT, synthetic=True, ) rev2 = Revision( message=b"msg2", date=date, committer_date=date, author=author, committer=author, directory=dir2.id, parents=(rev1.id, ), type=RevisionType.GIT, synthetic=True, ) rel2 = Release( name=b"1.0.0", message=b"tag2", target_type=ObjectType.REVISION, target=rev2.id, synthetic=True, ) rel3 = Release( name=b"1.0.0-blob", message=b"tagged-blob", target_type=ObjectType.CONTENT, target=cnt5.sha1_git, synthetic=True, ) rel4 = Release( name=b"1.0.0-weird", message=b"weird release", target_type=ObjectType.RELEASE, target=rel3.id, synthetic=True, ) rel5 = Release( name=b"1.0.0:weirdname", message=b"weird release", target_type=ObjectType.RELEASE, target=rel2.id, synthetic=True, ) # Create snapshot: branches = { b"refs/heads/master": SnapshotBranch(target=rev2.id, target_type=TargetType.REVISION), } if tag: branches[b"refs/tags/1.0.0"] = SnapshotBranch( target=rel2.id, target_type=TargetType.RELEASE) if weird_branches: branches[b"refs/heads/tree-ref"] = SnapshotBranch( target=dir4.id, target_type=TargetType.DIRECTORY) branches[b"refs/heads/blob-ref"] = SnapshotBranch( target=cnt4.sha1_git, target_type=TargetType.CONTENT) branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch( target=rel4.id, target_type=TargetType.RELEASE) snp = Snapshot(branches=branches) # "Fill" swh-graph if up_to_date_graph: nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp] edges = [ (dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (rev1, dir1), (rev2, dir2), (rev2, rev1), (snp, rev2), ] if tag: nodes.append(rel2) edges.append((rel2, rev2)) edges.append((snp, rel2)) if weird_branches: nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5]) edges.extend([ (dir3, cnt3), (dir4, dir3), (snp, dir4), (snp, cnt4), (snp, rel4), (rel4, rel3), (rel3, cnt5), (rel5, rev2), ]) else: nodes = [cnt1, cnt2, cnt3, 
dir1, dir2, dir3, rev1] edges = [ (dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (dir3, cnt3), (rev1, dir1), ] if tag: nodes.append(rel2) if weird_branches: nodes.extend([cnt3, dir3]) edges.extend([(dir3, cnt3)]) nodes = [str(n.swhid()) for n in nodes] edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges] # Add all objects to storage swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5]) swh_storage.directory_add([dir1, dir2, dir3, dir4]) swh_storage.revision_add([rev1, rev2]) swh_storage.release_add([rel2, rel3, rel4, rel5]) swh_storage.snapshot_add([snp]) # Add spy on swh_storage, to make sure revision_log is not called # (the graph must be used instead) swh_storage = unittest.mock.MagicMock(wraps=swh_storage) # Add all objects to graph swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges)) # Cook backend = InMemoryVaultBackend() cooked_swhid = { RootObjects.SNAPSHOT: snp.swhid(), RootObjects.REVISION: rev2.swhid(), RootObjects.RELEASE: rel2.swhid(), RootObjects.WEIRD_RELEASE: rel5.swhid(), }[root_object] cooker = GitBareCooker( cooked_swhid, backend=backend, storage=swh_storage, graph=swh_graph, ) if weird_branches: # git-fsck now rejects refs pointing to trees and blobs, # but some old git repos have them. cooker.use_fsck = False cooker.cook() # Get bundle bundle = backend.fetch("git_bare", cooked_swhid) # Extract bundle and make sure both revisions are in it with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: tf.extractall(tempdir) if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION): log_head = "master" elif root_object == RootObjects.RELEASE: log_head = "1.0.0" elif root_object == RootObjects.WEIRD_RELEASE: log_head = "release" else: assert False, root_object output = subprocess.check_output([ "git", "-C", f"{tempdir}/{cooked_swhid}.git", "log", "--format=oneline", "--decorate=", log_head, ]) assert output.decode( ) == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n" # Make sure the graph was used instead of swh_storage.revision_log if root_object == RootObjects.SNAPSHOT: if up_to_date_graph: # The graph has everything, so the first call succeeds and returns # all objects transitively pointed by the snapshot swh_graph.visit_nodes.assert_has_calls([ unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"), ]) else: # The graph does not have everything, so the first call returns nothing. # However, the second call (on the top rev) succeeds and returns # all objects but the rev and the rel swh_graph.visit_nodes.assert_has_calls([ unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"), unittest.mock.call(str(rev2.swhid()), edges="rev:rev"), ]) elif root_object in ( RootObjects.REVISION, RootObjects.RELEASE, RootObjects.WEIRD_RELEASE, ): swh_graph.visit_nodes.assert_has_calls( [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")]) else: assert False, root_object if up_to_date_graph: swh_storage.revision_log.assert_not_called() swh_storage.revision_shortlog.assert_not_called() else: swh_storage.revision_log.assert_called()
def test_pypi_good_origin(): """Tests loading a revision whose origin we can find""" source_original_artifact = { "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz", "date": "2014-05-07T22:03:00", "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12", "size": 46644, "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824", "filename": "PyPDFLite-0.1.32.tar.gz", "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5", "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385", "archive_type": "tar", } dest_original_artifacts = [{ "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz", "filename": "PyPDFLite-0.1.32.tar.gz", "archive_type": "tar", "length": 46644, "checksums": { "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12", "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824", "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5", "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385", }, }] revision_id = b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2" row = { "id": revision_id, "directory": DIRECTORY_ID, "date": datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc), "committer_date": datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc), "type": "tar", "message": b"0.1.32", "metadata": { "original_artifact": source_original_artifact }, } origin_url = "https://pypi.org/project/PyPDFLite/" storage = get_storage("memory") snapshot_id = b"42" * 10 storage.origin_add([Origin(url=origin_url)]) storage.origin_visit_add( [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")]) storage.origin_visit_status_add([ OriginVisitStatus( origin=origin_url, visit=1, date=now(), status="partial", snapshot=snapshot_id, ) ]) storage.snapshot_add([ Snapshot( id=snapshot_id, branches={ b"foo": SnapshotBranch( target_type=TargetType.REVISION, target=revision_id, ) }, ) ]) storage.metadata_authority_add([ attr.evolve(PYPI_AUTHORITY, metadata={}), attr.evolve(SWH_AUTHORITY, metadata={}), ]) storage.metadata_fetcher_add([FETCHER]) deposit_cur = None handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False) revision_swhid = CoreSWHID.from_string( "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2") assert storage.raw_extrinsic_metadata_get( DIRECTORY_SWHID, authority=PYPI_AUTHORITY, ) == PagedResult( results=[], next_page_token=None, ) assert storage.raw_extrinsic_metadata_get( DIRECTORY_SWHID, authority=SWH_AUTHORITY, ) == PagedResult( results=[ RawExtrinsicMetadata( target=DIRECTORY_SWHID, discovery_date=datetime.datetime( 2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc, ), authority=SWH_AUTHORITY, fetcher=FETCHER, format="original-artifacts-json", metadata=json.dumps(dest_original_artifacts).encode(), origin=origin_url, revision=revision_swhid, ), ], next_page_token=None, )
def test_ignore_displayname(swh_storage, use_graph): """Tests the original authorship information is used instead of configured display names; otherwise objects would not match their hash, and git-fsck/git-clone would fail. This tests both with and without swh-graph, as both configurations use different code paths to fetch revisions. """ date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0), 0, False) legacy_person = Person.from_fullname(b"old me <*****@*****.**>") current_person = Person.from_fullname(b"me <*****@*****.**>") content = Content.from_data(b"foo") swh_storage.content_add([content]) directory = Directory( entries=(DirectoryEntry(name=b"file1", type="file", perms=0o100644, target=content.sha1_git), ), ) swh_storage.directory_add([directory]) revision = Revision( message=b"rev", author=legacy_person, date=date, committer=legacy_person, committer_date=date, parents=(), type=RevisionType.GIT, directory=directory.id, synthetic=True, ) swh_storage.revision_add([revision]) release = Release( name=b"v1.1.0", message=None, author=legacy_person, date=date, target=revision.id, target_type=ObjectType.REVISION, synthetic=True, ) swh_storage.release_add([release]) snapshot = Snapshot( branches={ b"refs/tags/v1.1.0": SnapshotBranch(target=release.id, target_type=TargetType.RELEASE), b"HEAD": SnapshotBranch(target=revision.id, target_type=TargetType.REVISION), }) swh_storage.snapshot_add([snapshot]) # Add all objects to graph if use_graph: from swh.graph.naive_client import NaiveClient as GraphClient nodes = [ str(x.swhid()) for x in [content, directory, revision, release, snapshot] ] edges = [(str(x.swhid()), str(y.swhid())) for (x, y) in [ (directory, content), (revision, directory), (release, revision), (snapshot, release), (snapshot, revision), ]] swh_graph = unittest.mock.Mock( wraps=GraphClient(nodes=nodes, edges=edges)) else: swh_graph = None # Set a display name with swh_storage.db() as db: with db.transaction() as cur: cur.execute( "UPDATE person set displayname = %s where fullname = %s", (current_person.fullname, legacy_person.fullname), ) # Check the display name did apply in the storage assert swh_storage.revision_get([revision.id])[0] == attr.evolve( revision, author=current_person, committer=current_person, ) # Cook cooked_swhid = snapshot.swhid() backend = InMemoryVaultBackend() cooker = GitBareCooker( cooked_swhid, backend=backend, storage=swh_storage, graph=swh_graph, ) cooker.cook() # Get bundle bundle = backend.fetch("git_bare", cooked_swhid) # Extract bundle and make sure both revisions are in it with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: tf.extractall(tempdir) # If we are here, it means git-fsck succeeded when called by cooker.cook(), # so we already know the original person was used. Let's double-check. repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git") tag = repo[b"refs/tags/v1.1.0"] assert tag.tagger == legacy_person.fullname commit = repo[tag.object[1]] assert commit.author == legacy_person.fullname
def test_pypi_release_metadata_structure(
    swh_storage, requests_mock_datadir, _0805nexter_api_info
):
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_release_id = hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68")

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=expected_release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)

    release = swh_storage.release_get([expected_release_id])[0]
    assert release is not None

    release_swhid = CoreSWHID(
        object_type=ObjectType.RELEASE, object_id=expected_release_id
    )
    directory_swhid = ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=release.target
    )
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://pypi.org/",
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=directory_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.pypi.loader.PyPILoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="pypi-project-json",
            metadata=json.dumps(
                json.loads(_0805nexter_api_info)["releases"]["1.2.0"][0]
            ).encode(),
            origin=url,
            release=release_swhid,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        directory_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
def test_load_upgrade_from_revision_extids(caplog): """Tests that, when loading incrementally based on a snapshot made by an old version of the loader, the loader will convert revisions to releases and add them to the storage. Also checks that, if an extid exists pointing to a non-existent revision (which should never happen, but you never know...), the release is loaded from scratch.""" storage = get_storage("memory") origin = "http://example.org" dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"d" * 20) dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20) date = TimestampWithTimezone.from_datetime( datetime.datetime.now(tz=datetime.timezone.utc)) person = Person.from_fullname(b"Jane Doe <*****@*****.**>") rev1 = Revision( message=b"blah", author=person, date=date, committer=person, committer_date=date, directory=dir1_swhid.object_id, type=RevisionType.TAR, synthetic=True, ) rel1 = Release( name=b"v1.0", message=b"blah\n", author=person, date=date, target=dir1_swhid.object_id, target_type=ModelObjectType.DIRECTORY, synthetic=True, ) rev1_swhid = rev1.swhid() rel1_swhid = rel1.swhid() rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=b"b" * 20) rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20) # Results of a previous load storage.extid_add([ ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0), ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0), ]) storage.revision_add([rev1]) last_snapshot = Snapshot( branches={ b"v1.0": SnapshotBranch(target_type=TargetType.REVISION, target=rev1_swhid.object_id), b"v2.0": SnapshotBranch(target_type=TargetType.REVISION, target=rev2_swhid.object_id), }) storage.snapshot_add([last_snapshot]) date = datetime.datetime.now(tz=datetime.timezone.utc) storage.origin_add([Origin(url=origin)]) storage.origin_visit_add([ OriginVisit(origin="http://example.org", visit=1, date=date, type="tar") ]) storage.origin_visit_status_add([ OriginVisitStatus( origin=origin, visit=1, status="full", date=date, snapshot=last_snapshot.id, ) ]) loader = StubPackageLoader(storage, "http://example.org") patch.object( loader, "_load_release", return_value=(rel2_swhid.object_id, dir2_swhid.object_id), autospec=True, ).start() patch.object( loader, "get_versions", return_value=["v1.0", "v2.0", "v3.0"], autospec=True, ).start() caplog.set_level(logging.ERROR) loader.load() assert len(caplog.records) == 1 (record, ) = caplog.records assert record.levelname == "ERROR" assert "Failed to upgrade branch branch-v2.0" in record.message assert loader._load_release.mock_calls == [ # v1.0: not loaded because there is already a revision matching it # v2.0: loaded, as the revision is missing from the storage even though there # is an extid call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"), Origin(url=origin)), # v3.0: loaded (did not exist yet) call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"), Origin(url=origin)), ] snapshot = Snapshot( branches={ b"branch-v1.0": SnapshotBranch(target_type=TargetType.RELEASE, target=rel1_swhid.object_id), b"branch-v2.0": SnapshotBranch(target_type=TargetType.RELEASE, target=rel2_swhid.object_id), b"branch-v3.0": SnapshotBranch(target_type=TargetType.RELEASE, target=rel2_swhid.object_id), }) assert snapshot_get_latest(storage, origin) == snapshot extids = storage.extid_get_from_target( ObjectType.RELEASE, [ rel1_swhid.object_id, rel2_swhid.object_id, ], ) assert set(extids) == { ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid), ExtID("extid-type1", 
b"extid-of-v2.0", rel2_swhid), ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid), }
def test_pypi_multiple_visits_with_no_change(swh_storage, requests_mock_datadir):
    """Multiple visits with no changes result in the same snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    snapshot_id = hash_to_bytes("3dd50c1a0e48a7625cf1427e3190a65b787c774e")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": snapshot_id.hex(),
    }
    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=snapshot_id
    )

    expected_snapshot = Snapshot(
        id=snapshot_id,
        branches={
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)

    assert {
        "content": 6,
        "directory": 4,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    actual_load_status2 = loader.load()
    assert actual_load_status2 == {
        "status": "uneventful",
        "snapshot_id": actual_load_status2["snapshot_id"],
    }

    visit_status2 = assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi"
    )

    stats2 = get_stats(swh_storage)
    expected_stats2 = stats.copy()
    expected_stats2["origin_visit"] = 1 + 1
    assert expected_stats2 == stats2

    # same snapshot
    assert visit_status2.snapshot == snapshot_id
def test_check_snapshot_failures(swh_storage): """Failure scenarios: 0. snapshot parameter is not a snapshot 1. snapshot id is correct but branches mismatched 2. snapshot id is not correct, it's not found in the storage 3. snapshot reference an alias which does not exist 4. snapshot is found in storage, targeted revision does not exist 5. snapshot is found in storage, targeted revision exists but the directory the revision targets does not exist 6. snapshot is found in storage, target revision exists, targeted directory by the revision exist. Content targeted by the directory does not exist. 7. snapshot is found in storage, targeted release does not exist """ snap_id_hex = "2498dbf535f882bc7f9a18fb16c9ad27fda7bab7" snapshot = Snapshot( id=hash_to_bytes(snap_id_hex), branches={ b"master": SnapshotBranch( target=hash_to_bytes(hash_hex), target_type=TargetType.REVISION, ), }, ) s = swh_storage.snapshot_add([snapshot]) assert s == { "snapshot:add": 1, } unexpected_snapshot = Snapshot( branches={ b"tip": SnapshotBranch( # wrong branch target=hash_to_bytes(hash_hex), target_type=TargetType.RELEASE) }, ) # 0. not a Snapshot object, raise! with pytest.raises( AssertionError, match="argument 'expected_snapshot' must be a snapshot"): check_snapshot(ORIGIN_VISIT, swh_storage) # 1. snapshot id is correct but branches mismatched with pytest.raises( AssertionError): # sadly debian build raises only assertion check_snapshot(attr.evolve(unexpected_snapshot, id=snapshot.id), swh_storage) # 2. snapshot id is not correct, it's not found in the storage wrong_snap_id = hash_to_bytes("999666f535f882bc7f9a18fb16c9ad27fda7bab7") with pytest.raises(AssertionError, match="is not found"): check_snapshot(attr.evolve(unexpected_snapshot, id=wrong_snap_id), swh_storage) # 3. snapshot references an inexistent alias snapshot0 = Snapshot( id=hash_to_bytes("123666f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch( target=b"HEAD", target_type=TargetType.ALIAS, ), }, ) swh_storage.snapshot_add([snapshot0]) with pytest.raises(InconsistentAliasBranchError, match="Alias branch HEAD"): check_snapshot(snapshot0, swh_storage) # 4. snapshot is found in storage, targeted revision does not exist rev_not_found = list(swh_storage.revision_missing([REVISION.id])) assert len(rev_not_found) == 1 snapshot1 = Snapshot( id=hash_to_bytes("456666f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch( target=b"HEAD", target_type=TargetType.ALIAS, ), b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ), }, ) swh_storage.snapshot_add([snapshot1]) with pytest.raises(InexistentObjectsError, match="Branch/Revision"): check_snapshot(snapshot1, swh_storage) # 5. snapshot is found in storage, targeted revision exists but the directory the # revision targets does not exist swh_storage.revision_add([REVISION]) dir_not_found = list(swh_storage.directory_missing([REVISION.directory])) assert len(dir_not_found) == 1 snapshot2 = Snapshot( id=hash_to_bytes("987123f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch( target=b"HEAD", target_type=TargetType.ALIAS, ), b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ), }, ) swh_storage.snapshot_add([snapshot2]) with pytest.raises(InexistentObjectsError, match="Missing directories"): check_snapshot(snapshot2, swh_storage) assert DIRECTORY.id == REVISION.directory swh_storage.directory_add([DIRECTORY]) # 6. 
snapshot is found in storage, target revision exists, targeted directory by the # revision exist. Content targeted by the directory does not exist. assert DIRECTORY.entries[0].target == CONTENT.sha1_git not_found = list( swh_storage.content_missing_per_sha1_git([CONTENT.sha1_git])) assert len(not_found) == 1 swh_storage.directory_add([DIRECTORY]) snapshot3 = Snapshot( id=hash_to_bytes("091456f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch( target=b"HEAD", target_type=TargetType.ALIAS, ), b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ), }, ) swh_storage.snapshot_add([snapshot3]) with pytest.raises(InexistentObjectsError, match="Missing content(s)"): check_snapshot(snapshot3, swh_storage) # 7. snapshot is found in storage, targeted release does not exist # release targets the revisions which exists assert RELEASE.target == REVISION.id snapshot4 = Snapshot( id=hash_to_bytes("789666f535f882bc7f9a18fb16c9ad27fda7bab7"), branches={ b"alias": SnapshotBranch( target=b"HEAD", target_type=TargetType.ALIAS, ), b"HEAD": SnapshotBranch( target=REVISION.id, target_type=TargetType.REVISION, ), b"release/0.1.0": SnapshotBranch( target=RELEASE.id, target_type=TargetType.RELEASE, ), }, ) swh_storage.snapshot_add([snapshot4]) with pytest.raises(InexistentObjectsError, match="Branch/Release"): check_snapshot(snapshot4, swh_storage)
def test_pypi_incremental_visit(swh_storage, requests_mock_datadir_visits): """With prior visit, 2nd load will result with a different snapshot""" url = "https://pypi.org/project/0805nexter" loader = PyPILoader(swh_storage, url) visit1_actual_load_status = loader.load() visit1_stats = get_stats(swh_storage) expected_snapshot_id = hash_to_bytes("3dd50c1a0e48a7625cf1427e3190a65b787c774e") assert visit1_actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id ) assert { "content": 6, "directory": 4, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == visit1_stats # Reset internal state del loader._cached__raw_info del loader._cached_info visit2_actual_load_status = loader.load() visit2_stats = get_stats(swh_storage) assert visit2_actual_load_status["status"] == "eventful", visit2_actual_load_status expected_snapshot_id2 = hash_to_bytes("77febe6ff0faf6cc00dd015a6c9763579a9fb6c7") assert visit2_actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id2.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id2 ) expected_snapshot = Snapshot( id=expected_snapshot_id2, branches={ b"releases/1.1.0": SnapshotBranch( target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"), target_type=TargetType.RELEASE, ), b"releases/1.2.0": SnapshotBranch( target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"), target_type=TargetType.RELEASE, ), b"releases/1.3.0": SnapshotBranch( target=hash_to_bytes("a21b09cbec8e31f47307f196bb1f939effc26e11"), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/1.3.0", target_type=TargetType.ALIAS, ), }, ) assert_last_visit_matches( swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id ) check_snapshot(expected_snapshot, swh_storage) assert { "content": 6 + 1, # 1 more content "directory": 4 + 2, # 2 more directories "origin": 1, "origin_visit": 1 + 1, "release": 2 + 1, # 1 more release "revision": 0, "skipped_content": 0, "snapshot": 1 + 1, # 1 more snapshot } == visit2_stats urls = [ m.url for m in requests_mock_datadir_visits.request_history if m.url.startswith("https://files.pythonhosted.org") ] # visited each artifact once across 2 visits assert len(urls) == len(set(urls))
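# The visit2_stats assertion above encodes the second visit as absolute counters
# ("6 + 1", "4 + 2", ...). A hypothetical helper (not part of the existing test
# utilities) expressing the same check as a delta between two get_stats() results:
def assert_stats_delta(before: dict, after: dict, expected_delta: dict) -> None:
    """Check that counters grew from `before` to `after` by exactly `expected_delta`.

    Keys missing from `expected_delta` are expected to be unchanged.
    """
    for key, value in after.items():
        assert value - before.get(key, 0) == expected_delta.get(key, 0), key

# With the values asserted above, this would be called as:
# assert_stats_delta(visit1_stats, visit2_stats,
#                    {"content": 1, "directory": 2, "origin_visit": 1,
#                     "release": 1, "snapshot": 1})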
class StorageData: """Data model objects to use within tests.""" content = Content( data=b"42\n", length=3, sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"), sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), sha256=hash_to_bytes( "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0" ), blake2s256=hash_to_bytes( "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d" ), status="visible", ) content2 = Content( data=b"4242\n", length=5, sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"), sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"), sha256=hash_to_bytes( "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd" ), blake2s256=hash_to_bytes( "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d" ), status="visible", ) content3 = Content( data=b"424242\n", length=7, sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"), sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"), sha256=hash_to_bytes( "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36" ), blake2s256=hash_to_bytes( "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11" ), status="visible", ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc), ) contents: Tuple[Content, ...] = (content, content2, content3) skipped_content = SkippedContent( length=1024 * 1024 * 200, sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"), sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"), sha256=hash_to_bytes( "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a" ), blake2s256=hash_to_bytes( "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b" ), reason="Content too long", status="absent", origin="file:///dev/zero", ) skipped_content2 = SkippedContent( length=1024 * 1024 * 300, sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"), sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"), sha256=hash_to_bytes( "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a" ), blake2s256=hash_to_bytes( "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b" ), reason="Content too long", status="absent", ) skipped_contents: Tuple[SkippedContent, ...] 
= (skipped_content, skipped_content2) directory5 = Directory( id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"), entries=(), ) directory = Directory( id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"), entries=tuple([ DirectoryEntry( name=b"foo", type="file", target=content.sha1_git, perms=from_disk.DentryPerms.content, ), DirectoryEntry( name=b"bar\xc3", type="dir", target=directory5.id, perms=from_disk.DentryPerms.directory, ), ], ), ) directory2 = Directory( id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"), entries=tuple([ DirectoryEntry( name=b"oof", type="file", target=content2.sha1_git, perms=from_disk.DentryPerms.content, ) ], ), ) directory3 = Directory( id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"), entries=tuple([ DirectoryEntry( name=b"foo", type="file", target=content.sha1_git, perms=from_disk.DentryPerms.content, ), DirectoryEntry( name=b"subdir", type="dir", target=directory.id, perms=from_disk.DentryPerms.directory, ), DirectoryEntry( name=b"hello", type="file", target=content2.sha1_git, perms=from_disk.DentryPerms.content, ), ], ), ) directory4 = Directory( id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"), entries=tuple([ DirectoryEntry( name=b"subdir1", type="dir", target=directory3.id, perms=from_disk.DentryPerms.directory, ) ], ), ) directory6 = Directory( id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"), entries=tuple([ DirectoryEntry( name=b"foo", type="file", target=b"\x00" * 20, perms=from_disk.DentryPerms.content, ), DirectoryEntry( name=b"bar", type="dir", target=b"\x01" * 20, perms=from_disk.DentryPerms.directory, ), ], ), raw_manifest=( b"tree 61\x00" b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" # noqa b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01" # noqa ), ) directories: Tuple[Directory, ...] 
= ( directory2, directory, directory3, directory4, directory5, directory6, ) revision = Revision( id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"), message=b"hello", author=Person( name=b"Nicolas Dandrimont", email=b"*****@*****.**", fullname=b"Nicolas Dandrimont <*****@*****.**> ", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0), offset_bytes=b"+0200", ), committer=Person( name=b"St\xc3fano Zacchiroli", email=b"*****@*****.**", fullname=b"St\xc3fano Zacchiroli <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1123456789, microseconds=0), offset_bytes=b"+0200", ), parents=(), type=RevisionType.GIT, directory=directory.id, metadata={ "checksums": { "sha1": "tarball-sha1", "sha256": "tarball-sha256", }, "signed-off-by": "some-dude", }, extra_headers=( (b"gpgsig", b"test123"), (b"mergetag", b"foo\\bar"), (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"), ), synthetic=True, ) revision2 = Revision( id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"), message=b"hello again", author=Person( name=b"Roberto Dicosmo", email=b"*****@*****.**", fullname=b"Roberto Dicosmo <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1123456789, microseconds=220000, ), offset_bytes=b"+0000", ), parents=tuple([revision.id]), type=RevisionType.GIT, directory=directory2.id, metadata=None, extra_headers=(), synthetic=False, ) revision3 = Revision( id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"), message=b"a simple revision with no parents this time", author=Person( name=b"Roberto Dicosmo", email=b"*****@*****.**", fullname=b"Roberto Dicosmo <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1127351742, microseconds=220000, ), offset_bytes=b"+0000", ), parents=tuple([revision.id, revision2.id]), type=RevisionType.GIT, directory=directory2.id, metadata=None, extra_headers=(), synthetic=True, ) revision4 = Revision( id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"), message=b"parent of self.revision2", author=Person( name=b"me", email=b"*****@*****.**", fullname=b"me <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"committer-dude", email=b"*****@*****.**", fullname=b"committer-dude <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1244567843, microseconds=220000, ), offset_bytes=b"-1200", ), parents=tuple([revision3.id]), type=RevisionType.GIT, directory=directory.id, metadata=None, extra_headers=(), synthetic=False, ) git_revisions: Tuple[Revision, ...] 
= (revision, revision2, revision3, revision4) hg_revision = Revision( id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"), message=b"hello", author=Person( name=b"Nicolas Dandrimont", email=b"*****@*****.**", fullname=b"Nicolas Dandrimont <*****@*****.**> ", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0), offset_bytes=b"+0200", ), committer=Person( name=b"St\xc3fano Zacchiroli", email=b"*****@*****.**", fullname=b"St\xc3fano Zacchiroli <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1123456789, microseconds=0), offset_bytes=b"+0200", ), parents=(), type=RevisionType.MERCURIAL, directory=directory.id, metadata={ "checksums": { "sha1": "tarball-sha1", "sha256": "tarball-sha256", }, "signed-off-by": "some-dude", "node": "a316dfb434af2b451c1f393496b7eaeda343f543", }, extra_headers=(), synthetic=True, ) hg_revision2 = Revision( id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"), message=b"hello again", author=Person( name=b"Roberto Dicosmo", email=b"*****@*****.**", fullname=b"Roberto Dicosmo <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1123456789, microseconds=220000, ), offset_bytes=b"+0000", ), parents=tuple([hg_revision.id]), type=RevisionType.MERCURIAL, directory=directory2.id, metadata=None, extra_headers=( (b"node", hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")), ), synthetic=False, ) hg_revision3 = Revision( id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"), message=b"a simple revision with no parents this time", author=Person( name=b"Roberto Dicosmo", email=b"*****@*****.**", fullname=b"Roberto Dicosmo <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1127351742, microseconds=220000, ), offset_bytes=b"+0000", ), parents=tuple([hg_revision.id, hg_revision2.id]), type=RevisionType.MERCURIAL, directory=directory2.id, metadata=None, extra_headers=( (b"node", hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")), ), synthetic=True, ) hg_revision4 = Revision( id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"), message=b"parent of self.revision2", author=Person( name=b"me", email=b"*****@*****.**", fullname=b"me <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"committer-dude", email=b"*****@*****.**", fullname=b"committer-dude <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1244567843, microseconds=220000, ), offset_bytes=b"-1200", ), parents=tuple([hg_revision3.id]), type=RevisionType.MERCURIAL, directory=directory.id, metadata=None, extra_headers=( (b"node", hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")), ), synthetic=False, ) hg_revisions: Tuple[Revision, ...] = ( hg_revision, hg_revision2, hg_revision3, hg_revision4, ) revisions: Tuple[Revision, ...] = git_revisions + hg_revisions origins: Tuple[Origin, ...] 
= ( Origin(url="https://github.com/user1/repo1"), Origin(url="https://github.com/user2/repo1"), Origin(url="https://github.com/user3/repo1"), Origin(url="https://gitlab.com/user1/repo1"), Origin(url="https://gitlab.com/user2/repo1"), Origin(url="https://forge.softwareheritage.org/source/repo1"), Origin(url="https://example.рф/🏛️.txt"), ) origin, origin2 = origins[:2] metadata_authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="http://hal.inria.example.com/", ) metadata_authority2 = MetadataAuthority( type=MetadataAuthorityType.REGISTRY, url="http://wikidata.example.com/", ) authorities: Tuple[MetadataAuthority, ...] = ( metadata_authority, metadata_authority2, ) metadata_fetcher = MetadataFetcher( name="swh-deposit", version="0.0.1", ) metadata_fetcher2 = MetadataFetcher( name="swh-example", version="0.0.1", ) fetchers: Tuple[MetadataFetcher, ...] = (metadata_fetcher, metadata_fetcher2) date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc) date_visit2 = datetime.datetime(2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc) date_visit3 = datetime.datetime(2018, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc) type_visit1 = "git" type_visit2 = "hg" type_visit3 = "deb" origin_visit = OriginVisit( origin=origin.url, visit=1, date=date_visit1, type=type_visit1, ) origin_visit2 = OriginVisit( origin=origin.url, visit=2, date=date_visit2, type=type_visit1, ) origin_visit3 = OriginVisit( origin=origin2.url, visit=1, date=date_visit1, type=type_visit2, ) origin_visits: Tuple[OriginVisit, ...] = ( origin_visit, origin_visit2, origin_visit3, ) release = Release( id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"), name=b"v0.0.1", author=Person( name=b"olasd", email=b"*****@*****.**", fullname=b"olasd <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0), offset_bytes=b"+0042", ), target=revision.id, target_type=ObjectType.REVISION, message=b"synthetic release", synthetic=True, ) release2 = Release( id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"), name=b"v0.0.2", author=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1634366813, microseconds=0), offset_bytes=b"-0200", ), target=revision2.id, target_type=ObjectType.REVISION, message=b"v0.0.2\nMisc performance improvements + bug fixes", synthetic=False, ) release3 = Release( id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"), name=b"v0.0.2", author=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1634366813, microseconds=0), offset_bytes=b"-0200", ), target=revision3.id, target_type=ObjectType.REVISION, message=b"yet another synthetic release", synthetic=True, ) releases: Tuple[Release, ...] 
= (release, release2, release3) snapshot = Snapshot( id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"), branches={ b"master": SnapshotBranch( target=revision.id, target_type=TargetType.REVISION, ), }, ) empty_snapshot = Snapshot( id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={}, ) complete_snapshot = Snapshot( id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"), branches={ b"directory": SnapshotBranch( target=directory.id, target_type=TargetType.DIRECTORY, ), b"directory2": SnapshotBranch( target=directory2.id, target_type=TargetType.DIRECTORY, ), b"content": SnapshotBranch( target=content.sha1_git, target_type=TargetType.CONTENT, ), b"alias": SnapshotBranch( target=b"revision", target_type=TargetType.ALIAS, ), b"revision": SnapshotBranch( target=revision.id, target_type=TargetType.REVISION, ), b"release": SnapshotBranch( target=release.id, target_type=TargetType.RELEASE, ), b"snapshot": SnapshotBranch( target=empty_snapshot.id, target_type=TargetType.SNAPSHOT, ), b"dangling": None, }, ) snapshots: Tuple[Snapshot, ...] = (snapshot, empty_snapshot, complete_snapshot) content_metadata1 = RawExtrinsicMetadata( target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git), origin=origin.url, discovery_date=datetime.datetime(2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc), authority=metadata_authority, fetcher=metadata_fetcher, format="json", metadata=b'{"foo": "bar"}', ) content_metadata2 = RawExtrinsicMetadata( target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git), origin=origin2.url, discovery_date=datetime.datetime(2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc), authority=metadata_authority, fetcher=metadata_fetcher, format="yaml", metadata=b"foo: bar", ) content_metadata3 = RawExtrinsicMetadata( target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git), discovery_date=datetime.datetime(2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc), authority=attr.evolve(metadata_authority2, metadata=None), fetcher=attr.evolve(metadata_fetcher2, metadata=None), format="yaml", metadata=b"foo: bar", origin=origin.url, visit=42, snapshot=snapshot.swhid(), release=release.swhid(), revision=revision.swhid(), directory=directory.swhid(), path=b"/foo/bar", ) content_metadata: Tuple[RawExtrinsicMetadata, ...] = ( content_metadata1, content_metadata2, content_metadata3, ) origin_metadata1 = RawExtrinsicMetadata( target=Origin(origin.url).swhid(), discovery_date=datetime.datetime(2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc), authority=attr.evolve(metadata_authority, metadata=None), fetcher=attr.evolve(metadata_fetcher, metadata=None), format="json", metadata=b'{"foo": "bar"}', ) origin_metadata2 = RawExtrinsicMetadata( target=Origin(origin.url).swhid(), discovery_date=datetime.datetime(2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc), authority=attr.evolve(metadata_authority, metadata=None), fetcher=attr.evolve(metadata_fetcher, metadata=None), format="yaml", metadata=b"foo: bar", ) origin_metadata3 = RawExtrinsicMetadata( target=Origin(origin.url).swhid(), discovery_date=datetime.datetime(2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc), authority=attr.evolve(metadata_authority2, metadata=None), fetcher=attr.evolve(metadata_fetcher2, metadata=None), format="yaml", metadata=b"foo: bar", ) origin_metadata: Tuple[RawExtrinsicMetadata, ...] 
= ( origin_metadata1, origin_metadata2, origin_metadata3, ) extid1 = ExtID( target=CoreSWHID(object_type=SwhidObjectType.REVISION, object_id=revision.id), extid_type="git", extid=revision.id, ) extid2 = ExtID( target=CoreSWHID(object_type=SwhidObjectType.REVISION, object_id=hg_revision.id), extid_type="mercurial", extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"), ) extid3 = ExtID( target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=directory.id), extid_type="directory", extid=b"something", ) extid4 = ExtID( target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=directory2.id), extid_type="directory", extid=b"something", extid_version=2, ) extids: Tuple[ExtID, ...] = ( extid1, extid2, extid3, extid4, )
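# StorageData is plain data; a fixture still has to push it into a storage instance.
# A minimal sketch, assuming the usual swh.storage `*_add` endpoints (the same ones
# exercised by the tests above) and inserting objects in dependency order so that
# later objects never reference missing ones:
def populate_storage(storage, data: StorageData = StorageData()) -> None:
    """Insert the fixture objects into `storage` in dependency order."""
    storage.content_add(list(data.contents))
    storage.skipped_content_add(list(data.skipped_contents))
    storage.directory_add(list(data.directories))
    storage.revision_add(list(data.revisions))
    storage.release_add(list(data.releases))
    storage.snapshot_add(list(data.snapshots))
    storage.origin_add(list(data.origins))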
def get(ids):
    return [
        Snapshot.from_dict(
            remove_keys(swh_storage.snapshot_get(ids[0]), ("next_branch",))
        )
    ]
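# `remove_keys` strips the pagination cursor ("next_branch") that snapshot_get() adds
# to the plain snapshot dict before it is fed back into Snapshot.from_dict(). Its
# definition is not shown here; a minimal sketch of what such a helper could look like
# (hypothetical, purely illustrative):
def remove_keys(d: dict, keys: tuple) -> dict:
    """Return a shallow copy of `d` without the given keys."""
    return {k: v for k, v in d.items() if k not in keys}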
def test_npm_loader_duplicate_shasum(swh_storage, requests_mock_datadir): """Test with two versions that have exactly the same tarball""" package = "org_version_mismatch" url = package_url(package) loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() expected_snapshot_id = hash_to_bytes("ac867a4c22ba4e22a022d319f309714477412a5a") assert actual_load_status == { "status": "eventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id ) beta_release_id = "e6d5490a02ac2a8dcd49702f9ccd5a64c90a46f1" release_id = "f6985f437e28db6eb1b7533230e05ed99f2c91f0" versions = [ ("0.0.3-beta", beta_release_id), ("0.0.3", release_id), ] expected_snapshot = Snapshot( id=expected_snapshot_id, branches={ b"HEAD": SnapshotBranch( target=b"releases/0.0.3", target_type=TargetType.ALIAS ), **{ b"releases/" + version_name.encode(): SnapshotBranch( target=hash_to_bytes(version_id), target_type=TargetType.RELEASE, ) for (version_name, version_id) in versions }, }, ) check_snapshot(expected_snapshot, swh_storage) assert swh_storage.release_get([hash_to_bytes(beta_release_id)])[0] == Release( name=b"0.0.3-beta", message=( b"Synthetic release for NPM source package org_version_mismatch " b"version 0.0.3-beta\n" ), target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"), target_type=ModelObjectType.DIRECTORY, synthetic=True, author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"), date=TimestampWithTimezone.from_datetime( datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc) ), id=hash_to_bytes(beta_release_id), ) assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release( name=b"0.0.3", message=( b"Synthetic release for NPM source package org_version_mismatch " b"version 0.0.3\n" ), target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"), target_type=ModelObjectType.DIRECTORY, synthetic=True, author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"), date=TimestampWithTimezone.from_datetime( datetime.datetime(2014, 1, 1, 15, 55, 45, tzinfo=datetime.timezone.utc) ), id=hash_to_bytes(release_id), ) # Check incremental re-load keeps it unchanged loader = NpmLoader(swh_storage, url) actual_load_status = loader.load() assert actual_load_status == { "status": "uneventful", "snapshot_id": expected_snapshot_id.hex(), } assert_last_visit_matches( swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id )
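# The final block above re-runs the loader and expects an "uneventful" status with the
# same snapshot id. A hypothetical refactoring of that pattern into a reusable helper
# (same calls as in the test, just factored out):
def assert_uneventful_reload(storage, loader, url, visit_type, expected_snapshot_id):
    """Re-run `loader` and check it reproduces the same snapshot without new objects."""
    status = loader.load()
    assert status == {
        "status": "uneventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }
    assert_last_visit_matches(
        storage, url, status="full", type=visit_type, snapshot=expected_snapshot_id
    )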
def test_arch_loader_load_one_version(datadir, requests_mock_datadir, swh_storage): loader = ArchLoader( swh_storage, url=EXPECTED_PACKAGES[1]["url"], artifacts=EXPECTED_PACKAGES[1]["artifacts"], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None expected_snapshot_id = "4020d0a278027550e336b5481a4159a913c91aa4" expected_release_id = "7681098c9e381f9cc8bd1724d57eeee2182982dc" assert expected_snapshot_id == actual_load_status["snapshot_id"] expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz": SnapshotBranch( target=hash_to_bytes(expected_release_id), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target=b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 1, "directory": 1, "origin": 1, "origin_visit": 1, "release": 1, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert swh_storage.release_get([ hash_to_bytes(expected_release_id) ])[0] == Release( name=b"1.12-1", message=b"Synthetic release for Arch Linux source package gzip version " b"1.12-1\n\nGNU compression utility\n", target=hash_to_bytes("bd742aaf422953a1f7a5e084ec4a7477491d63fb"), target_type=ObjectType.DIRECTORY, synthetic=True, author=Person.from_fullname( b"Arch Linux ARM Build System <*****@*****.**>"), date=TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00"), id=hash_to_bytes(expected_release_id), ) assert_last_visit_matches( swh_storage, url=EXPECTED_PACKAGES[1]["url"], status="full", type="arch", snapshot=expected_snapshot.id, )
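# Both Arch tests build release branch names as "releases/<version>/<artifact filename>"
# (see the expected snapshots above and below). A tiny sketch of that naming rule,
# purely for illustration (hypothetical helper, not the loader's actual code):
def arch_release_branch(version: str, filename: str) -> bytes:
    return f"releases/{version}/{filename}".encode()

# e.g. arch_release_branch("1.12-1", "gzip-1.12-1-aarch64.pkg.tar.xz")
#      == b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz"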
def test_arch_loader_load_n_versions(datadir, requests_mock_datadir, swh_storage): loader = ArchLoader( swh_storage, url=EXPECTED_PACKAGES[0]["url"], artifacts=EXPECTED_PACKAGES[0]["artifacts"], ) actual_load_status = loader.load() assert actual_load_status["status"] == "eventful" assert actual_load_status["snapshot_id"] is not None expected_snapshot_id = "832139d69a91edffcc3a96cca11deaf9255041c3" assert expected_snapshot_id == actual_load_status["snapshot_id"] expected_snapshot = Snapshot( id=hash_to_bytes(actual_load_status["snapshot_id"]), branches={ b"releases/1:1.3_20190211-1/" b"dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz": SnapshotBranch( target=hash_to_bytes( "37efb727ff8bb8fbf92518aa8fe5fff2ad427d06"), target_type=TargetType.RELEASE, ), b"releases/1:1.3_20220414-1/" b"dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst": SnapshotBranch( target=hash_to_bytes( "020d3f5627df7474f257fd04f1ede4415296e265"), target_type=TargetType.RELEASE, ), b"HEAD": SnapshotBranch( target= b"releases/1:1.3_20220414-1/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst", target_type=TargetType.ALIAS, ), }, ) check_snapshot(expected_snapshot, swh_storage) stats = get_stats(swh_storage) assert { "content": 2, "directory": 2, "origin": 1, "origin_visit": 1, "release": 2, "revision": 0, "skipped_content": 0, "snapshot": 1, } == stats assert_last_visit_matches( swh_storage, url=EXPECTED_PACKAGES[0]["url"], status="full", type="arch", snapshot=expected_snapshot.id, )
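# test_arch_loader_load_one_version and test_arch_loader_load_n_versions share the same
# skeleton and differ mainly in which EXPECTED_PACKAGES entry they use. A hypothetical
# parametrized smoke-test variant (sketch only; the detailed snapshot, stats and release
# assertions above would still need per-package expected values):
import pytest


@pytest.mark.parametrize("package_index", [0, 1])
def test_arch_loader_load_smoke(datadir, requests_mock_datadir, swh_storage, package_index):
    package = EXPECTED_PACKAGES[package_index]
    loader = ArchLoader(swh_storage, url=package["url"], artifacts=package["artifacts"])
    status = loader.load()
    assert status["status"] == "eventful"
    assert_last_visit_matches(
        swh_storage,
        url=package["url"],
        status="full",
        type="arch",
        snapshot=hash_to_bytes(status["snapshot_id"]),
    )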
parents=(), ) REVISIONS = [REVISION] SNAPSHOTS = [ Snapshot( id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"), branches={ b"refs/heads/add-revision-origin-cache": SnapshotBranch( target=b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0s\xe7/\xe9l\x1e', target_type=TargetType.REVISION, ), b"refs/head/master": SnapshotBranch( target=b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm", target_type=TargetType.REVISION, ), b"HEAD": SnapshotBranch( target=b"refs/head/master", target_type=TargetType.ALIAS ), b"refs/tags/v0.0.103": SnapshotBranch( target=b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+\x0f\xdd', target_type=TargetType.RELEASE, ), }, ), Snapshot( id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"), branches={ b"3DLDF-1.1.4.tar.gz": SnapshotBranch( target=b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc"G\x99\x11', target_type=TargetType.REVISION,