def new_revision(draw):
    """
    Hypothesis strategy returning random raw swh revision data
    not ingested into the test archive.
    """
    return Revision(
        directory=draw(sha1().map(hash_to_bytes)),
        author=draw(new_person()),
        committer=draw(new_person()),
        message=draw(text(min_size=20, max_size=100).map(lambda t: t.encode())),
        date=TimestampWithTimezone.from_datetime(draw(new_swh_date())),
        committer_date=TimestampWithTimezone.from_datetime(draw(new_swh_date())),
        synthetic=False,
        type=RevisionType.GIT,
    )
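# A minimal usage sketch for the strategy above, assuming it is decorated with
# hypothesis.strategies.composite (not shown here) so that calling
# new_revision() yields a strategy; the test name is hypothetical.
from hypothesis import given


@given(new_revision())
def test_new_revision_roundtrips(revision):
    # Model objects serialize losslessly, so a dict round-trip is identity.
    assert Revision.from_dict(revision.to_dict()) == revision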
def test_commit_to_revision_with_extra_headers_mergetag(self):
    sha1 = b"3ab3da4bf0f81407be16969df09cd1c8af9ac703"

    revision = converters.dulwich_commit_to_revision(self.repo[sha1])
    expected_revision = Revision(
        id=hash_to_bytes(sha1.decode()),
        directory=bytes.fromhex("faa4b64a841ca3e3f07d6501caebda2e3e8e544e"),
        type=RevisionType.GIT,
        committer=Person(
            name=b"David Douard",
            fullname=b"David Douard <*****@*****.**>",
            email=b"*****@*****.**",
        ),
        author=Person(
            name=b"David Douard",
            fullname=b"David Douard <*****@*****.**>",
            email=b"*****@*****.**",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1594138183,
                microseconds=0,
            ),
            offset_bytes=b"+0200",
        ),
        message=b"Merge tag 'v0.0.1' into readme\n\nv0.0.1\n",
        metadata=None,
        extra_headers=((b"encoding", b"ISO-8859-15"), (b"mergetag", MERGETAG)),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1594138183,
                microseconds=0,
            ),
            offset_bytes=b"+0200",
        ),
        parents=(
            bytes.fromhex("322f5bc915e50fc25e85226b5a182bded0e98e4b"),
            bytes.fromhex("9768d0b576dbaaecd80abedad6dfd0d72f1476da"),
        ),
        synthetic=False,
    )

    assert revision == expected_revision
def test_svn_date_to_swh_date_epoch():
    """Empty date should be EPOCH (timestamp and offset at 0)."""
    # It should return 0, epoch
    default_tstz = TimestampWithTimezone(
        timestamp=Timestamp(seconds=0, microseconds=0),
        offset_bytes=b"+0000",
    )

    assert converters.svn_date_to_swh_date("") == default_tstz
    assert converters.svn_date_to_swh_date(None) == default_tstz
def test_normalize_timestamp_datetime(
    date, seconds, tz, offset, offset_bytes, microsecond
):
    date = date.astimezone(tz).replace(microsecond=microsecond)
    assert TimestampWithTimezone.from_dict(date).to_dict() == {
        "timestamp": {"seconds": seconds, "microseconds": microsecond},
        "offset_bytes": offset_bytes,
    }
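# Worked example of the normalization exercised above, assuming
# swh.model.model.TimestampWithTimezone; the datetime value is arbitrary.
dt = datetime.datetime(
    2015, 1, 1, 12, 0, 0, tzinfo=datetime.timezone(datetime.timedelta(hours=2))
)
# 2015-01-01T12:00:00+02:00 is 10:00:00 UTC, i.e. 1420106400 epoch seconds,
# and the +02:00 offset is rendered as b"+0200".
assert TimestampWithTimezone.from_dict(dt).to_dict() == {
    "timestamp": {"seconds": 1420106400, "microseconds": 0},
    "offset_bytes": b"+0200",
}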
def test_commit_to_revision_with_extra_headers(self):
    sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b"

    revision = converters.dulwich_commit_to_revision(self.repo[sha1])
    expected_revision = Revision(
        id=hash_to_bytes(sha1.decode()),
        directory=bytes.fromhex("f8ec06e4ed7b9fff4918a0241a48023143f30000"),
        type=RevisionType.GIT,
        committer=Person(
            name=b"David Douard",
            fullname=b"David Douard <*****@*****.**>",
            email=b"*****@*****.**",
        ),
        author=Person(
            name=b"David Douard",
            fullname=b"David Douard <*****@*****.**>",
            email=b"*****@*****.**",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1594137902,
                microseconds=0,
            ),
            offset_bytes=b"+0200",
        ),
        message=b"Am\xe9lioration du fichier READM\xa4\n",
        metadata=None,
        extra_headers=((b"encoding", b"ISO-8859-15"), (b"gpgsig", GPGSIG)),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1594136900,
                microseconds=0,
            ),
            offset_bytes=b"+0200",
        ),
        parents=(bytes.fromhex("c730509025c6e81947102b2d77bc4dc1cade9489"),),
        synthetic=False,
    )

    assert revision == expected_revision
def test_commit_to_revision(self):
    sha1 = b"9768d0b576dbaaecd80abedad6dfd0d72f1476da"

    revision = converters.dulwich_commit_to_revision(self.repo[sha1])
    expected_revision = Revision(
        id=hash_to_bytes("9768d0b576dbaaecd80abedad6dfd0d72f1476da"),
        directory=b"\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca",
        type=RevisionType.GIT,
        committer=Person(
            name=b"Stefano Zacchiroli",
            fullname=b"Stefano Zacchiroli <*****@*****.**>",
            email=b"*****@*****.**",
        ),
        author=Person(
            name=b"Stefano Zacchiroli",
            fullname=b"Stefano Zacchiroli <*****@*****.**>",
            email=b"*****@*****.**",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1443083765,
                microseconds=0,
            ),
            offset_bytes=b"+0200",
        ),
        message=b"add submodule dependency\n",
        metadata=None,
        extra_headers=(),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1443083765,
                microseconds=0,
            ),
            offset_bytes=b"+0200",
        ),
        parents=(b"\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r",),
        synthetic=False,
    )

    assert revision == expected_revision
def test_api_revision_directory_ok_returns_revision(
    api_client, archive_data, revision, person, date
):
    rev_path = "foo"
    _dir = Directory(
        entries=(
            DirectoryEntry(
                name=rev_path.encode(),
                type="rev",
                target=hash_to_bytes(revision),
                perms=DentryPerms.revision,
            ),
        )
    )
    archive_data.directory_add([_dir])

    rev = Revision(
        directory=_dir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([rev])

    revision_id = hash_to_hex(rev.id)
    rev_data = archive_data.revision_get(revision)
    url = reverse(
        "api-1-revision-directory",
        {"sha1_git": revision_id, "dir_path": rev_path},
    )
    rv = check_api_get_responses(api_client, url, status_code=200)
    assert rv.data == {
        "content": enrich_revision(rev_data, request=rv.wsgi_request),
        "path": rev_path,
        "type": "rev",
        "revision": revision_id,
    }
def dulwich_tsinfo_to_timestamp(
    timestamp,
    timezone: int,
    timezone_neg_utc: bool,
    timezone_bytes: Optional[bytes],
) -> TimestampWithTimezone:
    """Convert the dulwich timestamp information to a structure compatible with
    Software Heritage."""
    ts = Timestamp(
        seconds=int(timestamp),
        microseconds=0,
    )
    if timezone_bytes is None:
        # Failed to parse from the raw manifest, fallback to what Dulwich managed to
        # parse.
        return TimestampWithTimezone.from_numeric_offset(
            timestamp=ts,
            offset=timezone // 60,
            negative_utc=timezone_neg_utc,
        )
    else:
        return TimestampWithTimezone(timestamp=ts, offset_bytes=timezone_bytes)
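# Illustration of both branches above (values arbitrary): when the raw offset
# bytes survived parsing they are preserved verbatim; otherwise the offset is
# rebuilt from dulwich's numeric fields (seconds east of UTC).
tstz = dulwich_tsinfo_to_timestamp(1594137902, 7200, False, b"+0200")
assert tstz.offset_bytes == b"+0200"
tstz = dulwich_tsinfo_to_timestamp(1594137902, 7200, False, None)
assert tstz.offset_bytes == b"+0200"  # rebuilt from 7200s east of UTC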
def build_release(
    self, p_info: MavenPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Release]:
    msg = f"Synthetic release for archive at {p_info.url}\n".encode("utf-8")
    normalized_time = TimestampWithTimezone.from_datetime(p_info.time)
    return Release(
        name=p_info.version.encode(),
        message=msg,
        date=normalized_time,
        author=EMPTY_AUTHOR,
        target=directory,
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
    )
def test_revision_submodule(
    self, swh_storage, cook_extract_revision, ingest_target_revision
):
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)
    )

    target_rev = Revision(
        message=b"target_rev",
        author=Person.from_fullname(b"me <*****@*****.**>"),
        date=date,
        committer=Person.from_fullname(b"me <*****@*****.**>"),
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=bytes.fromhex("3333333333333333333333333333333333333333"),
        metadata={},
        synthetic=True,
    )
    if ingest_target_revision:
        swh_storage.revision_add([target_rev])

    dir = Directory(
        entries=(
            DirectoryEntry(
                name=b"submodule",
                type="rev",
                target=target_rev.id,
                perms=0o160000,
            ),
        ),
    )
    swh_storage.directory_add([dir])

    rev = Revision(
        message=b"msg",
        author=Person.from_fullname(b"me <*****@*****.**>"),
        date=date,
        committer=Person.from_fullname(b"me <*****@*****.**>"),
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=dir.id,
        metadata={},
        synthetic=True,
    )
    swh_storage.revision_add([rev])

    with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p):
        ert.checkout(b"HEAD")
        pattern = b"160000 submodule\x00%s" % target_rev.id
        tree = ert.repo[b"HEAD"].tree
        assert pattern in ert.repo[tree].as_raw_string()
def test_dulwich_tag_to_release_author_and_date(self):
    sha = hash_to_bytes("fc1e6a4f1e37e93e28e78560e73efd0b12f616ef")
    tagger = b"hey dude <*****@*****.**>"
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    message = b"some release message"

    date = int(
        datetime.datetime(2007, 12, 5, tzinfo=datetime.timezone.utc).timestamp()
    )

    tag = dulwich.objects.Tag()
    tag.name = b"blah"
    tag.object = (dulwich.objects.Commit, target)
    tag.message = message
    tag.signature = None
    tag.tagger = tagger
    tag.tag_time = date
    tag.tag_timezone = 0
    assert tag.sha().digest() == sha

    # when
    actual_release = converters.dulwich_tag_to_release(tag)

    # then
    expected_release = Release(
        author=Person(
            email=b"*****@*****.**",
            fullname=b"hey dude <*****@*****.**>",
            name=b"hey dude",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1196812800,
                microseconds=0,
            ),
            offset_bytes=b"+0000",
        ),
        id=sha,
        message=message,
        metadata=None,
        name=b"blah",
        synthetic=False,
        target=hash_to_bytes(target.decode()),
        target_type=ObjectType.REVISION,
    )

    assert actual_release == expected_release
def test_dulwich_tag_to_release_author_zero_date(self):
    # to reproduce bug T815 (fixed)
    sha = hash_to_bytes("6cc1deff5cdcd853428bb63b937f43dd2566c36f")
    tagger = b"hey dude <*****@*****.**>"
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    message = b"some release message"

    date = int(
        datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc).timestamp()
    )

    tag = dulwich.objects.Tag()
    tag.name = b"blah"
    tag.object = (dulwich.objects.Commit, target)
    tag.message = message
    tag.signature = None
    tag.tagger = tagger
    tag.tag_time = date
    tag.tag_timezone = 0
    assert tag.sha().digest() == sha

    # when
    actual_release = converters.dulwich_tag_to_release(tag)

    # then
    expected_release = Release(
        author=Person(
            email=b"*****@*****.**",
            fullname=b"hey dude <*****@*****.**>",
            name=b"hey dude",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=0,
                microseconds=0,
            ),
            offset_bytes=b"+0000",
        ),
        id=sha,
        message=message,
        metadata=None,
        name=b"blah",
        synthetic=False,
        target=hash_to_bytes(target.decode()),
        target_type=ObjectType.REVISION,
    )

    assert actual_release == expected_release
def svn_date_to_swh_date(strdate: Optional[bytes]) -> TimestampWithTimezone:
    """Convert a string date to an swh one.

    Args:
        strdate: A string representing a date with format like
            ``b'YYYY-mm-DDTHH:MM:SS.800722Z'``

    Returns:
        An swh date format

    """
    if not strdate:  # either None or empty string
        dt = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    else:
        dt = iso8601.parse_date(strdate.decode("ascii"))
        assert dt.tzinfo is not None, strdate
    return TimestampWithTimezone.from_datetime(dt)
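# Example behaviour of the converter above, reusing the docstring's date
# format (the exact date is arbitrary):
tstz = svn_date_to_swh_date(b"2011-05-31T06:04:39.800722Z")
assert tstz.timestamp.seconds == 1306821879  # 2011-05-31T06:04:39 UTC
assert tstz.timestamp.microseconds == 800722
assert tstz.offset_bytes == b"+0000"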
def test_from_release():
    """Converting a release model object to a dict should be ok"""
    ts = int(
        datetime.datetime(
            2015, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ).timestamp()
    )
    release_input = Release(
        id=hashutil.hash_to_bytes("aad23fa492a0c5fed0708a6703be875448c86884"),
        target=hashutil.hash_to_bytes("5e46d564378afc44b31bb89f99d5675195fbdf67"),
        target_type=ObjectType.REVISION,
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=ts, microseconds=0),
            offset_bytes=b"+0000",
        ),
        author=Person(
            name=b"author name",
            fullname=b"Author Name author@email",
            email=b"author@email",
        ),
        name=b"v0.0.1",
        message=b"some comment on release",
        synthetic=True,
    )

    expected_release = {
        "id": "aad23fa492a0c5fed0708a6703be875448c86884",
        "target": "5e46d564378afc44b31bb89f99d5675195fbdf67",
        "target_type": "revision",
        "date": "2015-01-01T22:00:00+00:00",
        "author": {
            "name": "author name",
            "fullname": "Author Name author@email",
            "email": "author@email",
        },
        "name": "v0.0.1",
        "message": "some comment on release",
        "synthetic": True,
    }

    actual_release = converters.from_release(release_input)

    assert actual_release == expected_release
def _make_stub_directory_revision(self, dir_id: Sha1Git) -> Sha1Git:
    author = Person.from_fullname(
        b"swh-vault, git-bare cooker <*****@*****.**>"
    )
    dt = datetime.datetime.now(tz=datetime.timezone.utc)
    dt = dt.replace(microsecond=0)  # not supported by git
    date = TimestampWithTimezone.from_datetime(dt)

    revision = Revision(
        author=author,
        committer=author,
        date=date,
        committer_date=date,
        message=b"Initial commit",
        type=RevisionType.GIT,
        directory=self.obj_id,
        synthetic=True,
    )
    self.write_revision_node(revision)

    return revision.id
def build_release(
    self, p_info: ArchPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Release]:
    intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path))
    author = Person.from_fullname(intrinsic_metadata["packager"].encode())
    description = intrinsic_metadata["pkgdesc"]

    message = (
        f"Synthetic release for Arch Linux source package {p_info.name} "
        f"version {p_info.version}\n\n"
        f"{description}\n"
    )
    return Release(
        name=p_info.version.encode(),
        author=author,
        date=TimestampWithTimezone.from_iso8601(p_info.last_modified),
        message=message.encode(),
        target_type=ObjectType.DIRECTORY,
        target=directory,
        synthetic=True,
    )
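# Sketch of the timestamp helper used above, with the same ISO 8601 value the
# Arch loader test further below asserts on:
tstz = TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00")
assert tstz.timestamp.seconds == 1649365694  # 2022-04-07T21:08:14 UTC
assert tstz.offset_bytes == b"+0000"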
def build_release(
    self, p_info: NpmPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Release]:
    # Metadata from NPM is not intrinsic to tarballs.
    # This means two package versions can have the same tarball, but different
    # metadata. To avoid mixing up releases, every field used to build the
    # release object must be part of NpmPackageInfo.MANIFEST_FORMAT.
    i_metadata = extract_intrinsic_metadata(uncompressed_path)
    if not i_metadata:
        return None
    author = extract_npm_package_author(i_metadata)
    assert self.package_name == p_info.package_name
    msg = (
        f"Synthetic release for NPM source package {p_info.package_name} "
        f"version {p_info.version}\n"
    )

    if p_info.date is None:
        url = p_info.url
        artifact_name = os.path.basename(url)
        raise ValueError(
            "Origin %s: Cannot determine upload time for artifact %s."
            % (p_info.url, artifact_name)
        )

    date = TimestampWithTimezone.from_iso8601(p_info.date)

    # FIXME: this is to remain bug-compatible with earlier versions:
    date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0))

    r = Release(
        name=p_info.version.encode(),
        message=msg.encode(),
        author=author,
        date=date,
        target=directory,
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
    )
    return r
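# Illustration of the bug-compatibility step above: parsed microseconds are
# discarded so release hashes match those of earlier loader versions (the
# date literal here is arbitrary).
date = TimestampWithTimezone.from_iso8601("2014-01-01T15:40:33.211Z")
assert date.timestamp.microseconds == 211000
date = attr.evolve(date, timestamp=attr.evolve(date.timestamp, microseconds=0))
assert date.timestamp.microseconds == 0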
def build_release(
    self, p_info: CratesPackageInfo, uncompressed_path: str, directory: Sha1Git
) -> Optional[Release]:
    # Extract intrinsic metadata from dir_path/Cargo.toml
    name = p_info.name
    version = p_info.version
    dir_path = Path(uncompressed_path, f"{name}-{version}")
    i_metadata_raw = extract_intrinsic_metadata(dir_path)
    # Get only corresponding key of IntrinsicPackageMetadata
    i_metadata_keys = [k for k in IntrinsicPackageMetadata.__annotations__.keys()]
    # We use data only from "package" entry
    i_metadata = {
        k: v for k, v in i_metadata_raw["package"].items() if k in i_metadata_keys
    }
    p_info.i_metadata = IntrinsicPackageMetadata(**i_metadata)  # type: ignore[misc]

    author = extract_author(p_info)
    description = extract_description(p_info)
    message = (
        f"Synthetic release for Crate source package {p_info.name} "
        f"version {p_info.version}\n\n"
        f"{description}\n"
    )
    # The only way to get a value for updated_at is through extrinsic metadata
    updated_at = p_info.e_metadata_version.get("updated_at")

    return Release(
        name=version.encode(),
        author=author,
        date=TimestampWithTimezone.from_iso8601(updated_at),
        message=message.encode(),
        target_type=ObjectType.DIRECTORY,
        target=directory,
        synthetic=True,
    )
def test_weird_commit(self):
    """Checks raw_manifest is set when the commit cannot fit the data model"""

    # Well-formed manifest
    raw_manifest = (
        b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
        b"author Foo <*****@*****.**> 1640191028 +0200\n"
        b"committer Foo <*****@*****.**> 1640191028 +0200\n\n"
        b"some commit message"
    )
    commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest)
    date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1640191028, microseconds=0),
        offset_bytes=b"+0200",
    )
    assert converters.dulwich_commit_to_revision(commit) == Revision(
        message=b"some commit message",
        directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
        synthetic=False,
        author=Person.from_fullname(b"Foo <*****@*****.**>"),
        committer=Person.from_fullname(b"Foo <*****@*****.**>"),
        date=date,
        committer_date=date,
        type=RevisionType.GIT,
        raw_manifest=None,
    )

    # Mess with the offset
    raw_manifest2 = raw_manifest.replace(b"+0200", b"+200")
    commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest2)
    date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1640191028, microseconds=0),
        offset_bytes=b"+200",
    )
    assert converters.dulwich_commit_to_revision(commit) == Revision(
        message=b"some commit message",
        directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
        synthetic=False,
        author=Person.from_fullname(b"Foo <*****@*****.**>"),
        committer=Person.from_fullname(b"Foo <*****@*****.**>"),
        date=date,
        committer_date=date,
        type=RevisionType.GIT,
        raw_manifest=None,
    )

    # Mess with the rest of the manifest
    raw_manifest2 = raw_manifest.replace(
        b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce",
        b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE",
    )
    commit = dulwich.objects.Commit.from_raw_string(b"commit", raw_manifest2)
    date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1640191028, microseconds=0),
        offset_bytes=b"+0200",
    )
    assert converters.dulwich_commit_to_revision(commit) == Revision(
        message=b"some commit message",
        directory=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
        synthetic=False,
        author=Person.from_fullname(b"Foo <*****@*****.**>"),
        committer=Person.from_fullname(b"Foo <*****@*****.**>"),
        date=date,
        committer_date=date,
        type=RevisionType.GIT,
        raw_manifest=b"commit 161\x00" + raw_manifest2,
    )
f"{REPO_BASE_URL}al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom", ] REL_MSGS = ( b"Synthetic release for archive at https://repo1.maven.org/maven2/al/aldi/" b"sprova4j/0.1.0/sprova4j-0.1.0-sources.jar\n", b"Synthetic release for archive at https://repo1.maven.org/maven2/al/aldi/" b"sprova4j/0.1.1/sprova4j-0.1.1-sources.jar\n", ) REL_DATES = ( TimestampWithTimezone.from_datetime( datetime.datetime(2021, 7, 12, 19, 6, 59, 335000, tzinfo=datetime.timezone.utc)), TimestampWithTimezone.from_datetime( datetime.datetime(2021, 7, 12, 19, 37, 5, 534000, tzinfo=datetime.timezone.utc)), )
def test_arch_loader_load_one_version(datadir, requests_mock_datadir, swh_storage):
    loader = ArchLoader(
        swh_storage,
        url=EXPECTED_PACKAGES[1]["url"],
        artifacts=EXPECTED_PACKAGES[1]["artifacts"],
    )
    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_snapshot_id = "4020d0a278027550e336b5481a4159a913c91aa4"
    expected_release_id = "7681098c9e381f9cc8bd1724d57eeee2182982dc"

    assert expected_snapshot_id == actual_load_status["snapshot_id"]

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz": SnapshotBranch(
                target=hash_to_bytes(expected_release_id),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 1,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    assert swh_storage.release_get([hash_to_bytes(expected_release_id)])[0] == Release(
        name=b"1.12-1",
        message=b"Synthetic release for Arch Linux source package gzip version "
        b"1.12-1\n\nGNU compression utility\n",
        target=hash_to_bytes("bd742aaf422953a1f7a5e084ec4a7477491d63fb"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(
            b"Arch Linux ARM Build System <*****@*****.**>"
        ),
        date=TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00"),
        id=hash_to_bytes(expected_release_id),
    )

    assert_last_visit_matches(
        swh_storage,
        url=EXPECTED_PACKAGES[1]["url"],
        status="full",
        type="arch",
        snapshot=expected_snapshot.id,
    )
type="file", target=CONTENT.sha1_git, perms=DentryPerms.content, ) ]), ) REVISION = Revision( id=hash_to_bytes("066b1b62dbfa033362092af468bf6cfabec230e7"), message=b"hello", author=Person( name=b"Nicolas Dandrimont", email=b"*****@*****.**", fullname=b"Nicolas Dandrimont <*****@*****.**> ", ), date=TimestampWithTimezone(Timestamp(1234567890, 0), offset_bytes=b"+0200"), committer=Person( name=b"St\xc3fano Zacchiroli", email=b"*****@*****.**", fullname=b"St\xc3fano Zacchiroli <*****@*****.**>", ), committer_date=TimestampWithTimezone(Timestamp(1123456789, 0), offset_bytes=b"-0000"), parents=(), type=RevisionType.GIT, directory=DIRECTORY.id, metadata={ "checksums": { "sha1": "tarball-sha1", "sha256": "tarball-sha256", },
def test_load_upgrade_from_revision_extids(caplog):
    """Tests that, when loading incrementally based on a snapshot made by an old
    version of the loader, the loader will convert revisions to releases
    and add them to the storage.

    Also checks that, if an extid exists pointing to a non-existent revision
    (which should never happen, but you never know...), the release is loaded
    from scratch."""

    storage = get_storage("memory")

    origin = "http://example.org"
    dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"d" * 20)
    dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20)

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc)
    )
    person = Person.from_fullname(b"Jane Doe <*****@*****.**>")

    rev1 = Revision(
        message=b"blah",
        author=person,
        date=date,
        committer=person,
        committer_date=date,
        directory=dir1_swhid.object_id,
        type=RevisionType.TAR,
        synthetic=True,
    )
    rel1 = Release(
        name=b"v1.0",
        message=b"blah\n",
        author=person,
        date=date,
        target=dir1_swhid.object_id,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
    )

    rev1_swhid = rev1.swhid()
    rel1_swhid = rel1.swhid()
    rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=b"b" * 20)
    rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

    # Results of a previous load
    storage.extid_add(
        [
            ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0),
            ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0),
        ]
    )
    storage.revision_add([rev1])
    last_snapshot = Snapshot(
        branches={
            b"v1.0": SnapshotBranch(
                target_type=TargetType.REVISION, target=rev1_swhid.object_id
            ),
            b"v2.0": SnapshotBranch(
                target_type=TargetType.REVISION, target=rev2_swhid.object_id
            ),
        }
    )
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add(
        [OriginVisit(origin="http://example.org", visit=1, date=date, type="tar")]
    )
    storage.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=origin,
                visit=1,
                status="full",
                date=date,
                snapshot=last_snapshot.id,
            )
        ]
    )

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel2_swhid.object_id, dir2_swhid.object_id),
        autospec=True,
    ).start()
    patch.object(
        loader,
        "get_versions",
        return_value=["v1.0", "v2.0", "v3.0"],
        autospec=True,
    ).start()

    caplog.set_level(logging.ERROR)

    loader.load()

    assert len(caplog.records) == 1
    (record,) = caplog.records
    assert record.levelname == "ERROR"
    assert "Failed to upgrade branch branch-v2.0" in record.message

    assert loader._load_release.mock_calls == [
        # v1.0: not loaded because there is already a revision matching it
        # v2.0: loaded, as the revision is missing from the storage even though there
        # is an extid
        call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"), Origin(url=origin)),
        # v3.0: loaded (did not exist yet)
        call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"), Origin(url=origin)),
    ]

    snapshot = Snapshot(
        branches={
            b"branch-v1.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel1_swhid.object_id
            ),
            b"branch-v2.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel2_swhid.object_id
            ),
            b"branch-v3.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel2_swhid.object_id
            ),
        }
    )
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
        ],
    )

    assert set(extids) == {
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid),
    }
def test_ignore_displayname(swh_storage, use_graph):
    """Tests the original authorship information is used instead of
    configured display names; otherwise objects would not match their hash,
    and git-fsck/git-clone would fail.

    This tests both with and without swh-graph, as both configurations use
    different code paths to fetch revisions.
    """
    date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0), 0, False)
    legacy_person = Person.from_fullname(b"old me <*****@*****.**>")
    current_person = Person.from_fullname(b"me <*****@*****.**>")

    content = Content.from_data(b"foo")
    swh_storage.content_add([content])

    directory = Directory(
        entries=(
            DirectoryEntry(
                name=b"file1", type="file", perms=0o100644, target=content.sha1_git
            ),
        ),
    )
    swh_storage.directory_add([directory])

    revision = Revision(
        message=b"rev",
        author=legacy_person,
        date=date,
        committer=legacy_person,
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        synthetic=True,
    )
    swh_storage.revision_add([revision])

    release = Release(
        name=b"v1.1.0",
        message=None,
        author=legacy_person,
        date=date,
        target=revision.id,
        target_type=ObjectType.REVISION,
        synthetic=True,
    )
    swh_storage.release_add([release])

    snapshot = Snapshot(
        branches={
            b"refs/tags/v1.1.0": SnapshotBranch(
                target=release.id, target_type=TargetType.RELEASE
            ),
            b"HEAD": SnapshotBranch(
                target=revision.id, target_type=TargetType.REVISION
            ),
        }
    )
    swh_storage.snapshot_add([snapshot])

    # Add all objects to graph
    if use_graph:
        from swh.graph.naive_client import NaiveClient as GraphClient

        nodes = [
            str(x.swhid()) for x in [content, directory, revision, release, snapshot]
        ]
        edges = [
            (str(x.swhid()), str(y.swhid()))
            for (x, y) in [
                (directory, content),
                (revision, directory),
                (release, revision),
                (snapshot, release),
                (snapshot, revision),
            ]
        ]
        swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))
    else:
        swh_graph = None

    # Set a display name
    with swh_storage.db() as db:
        with db.transaction() as cur:
            cur.execute(
                "UPDATE person set displayname = %s where fullname = %s",
                (current_person.fullname, legacy_person.fullname),
            )

    # Check the display name did apply in the storage
    assert swh_storage.revision_get([revision.id])[0] == attr.evolve(
        revision,
        author=current_person,
        committer=current_person,
    )

    # Cook
    cooked_swhid = snapshot.swhid()
    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        # If we are here, it means git-fsck succeeded when called by cooker.cook(),
        # so we already know the original person was used. Let's double-check.

        repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git")

        tag = repo[b"refs/tags/v1.1.0"]
        assert tag.tagger == legacy_person.fullname

        commit = repo[tag.object[1]]
        assert commit.author == legacy_person.fullname
def test_weird_tag(self):
    """Checks raw_manifest is set when the tag cannot fit the data model"""

    # Well-formed manifest
    raw_manifest = (
        b"object 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
        b"type commit\n"
        b"tag blah\n"
        b"tagger Foo <*****@*****.**> 1640191027 +0200\n\n"
        b"some release message"
    )
    tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest)
    assert converters.dulwich_tag_to_release(tag) == Release(
        name=b"blah",
        message=b"some release message",
        target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
        target_type=ObjectType.REVISION,
        synthetic=False,
        author=Person.from_fullname(b"Foo <*****@*****.**>"),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1640191027, microseconds=0),
            offset_bytes=b"+0200",
        ),
        raw_manifest=None,
    )

    # Mess with the offset (negative UTC)
    raw_manifest2 = raw_manifest.replace(b"+0200", b"-0000")
    tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2)
    assert converters.dulwich_tag_to_release(tag) == Release(
        name=b"blah",
        message=b"some release message",
        target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
        target_type=ObjectType.REVISION,
        synthetic=False,
        author=Person.from_fullname(b"Foo <*****@*****.**>"),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1640191027, microseconds=0),
            offset_bytes=b"-0000",
        ),
    )

    # Mess with the offset (other)
    raw_manifest2 = raw_manifest.replace(b"+0200", b"+200")
    tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2)
    assert converters.dulwich_tag_to_release(tag) == Release(
        name=b"blah",
        message=b"some release message",
        target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
        target_type=ObjectType.REVISION,
        synthetic=False,
        author=Person.from_fullname(b"Foo <*****@*****.**>"),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1640191027, microseconds=0),
            offset_bytes=b"+200",
        ),
    )

    # Mess with the rest of the manifest
    raw_manifest2 = raw_manifest.replace(
        b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce",
        b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE",
    )
    tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2)
    assert converters.dulwich_tag_to_release(tag) == Release(
        name=b"blah",
        message=b"some release message",
        target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
        target_type=ObjectType.REVISION,
        synthetic=False,
        author=Person.from_fullname(b"Foo <*****@*****.**>"),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1640191027, microseconds=0),
            offset_bytes=b"+0200",
        ),
        raw_manifest=b"tag 136\x00" + raw_manifest2,
    )
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
    package = "org"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    release_id = "d38cc0b571cd41f3c85513864e049766b42032a7"
    versions = [
        ("0.0.2", release_id),
        ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"),
        ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.4", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.2",
        message=b"Synthetic release for NPM source package org version 0.0.2\n",
        target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"mooz <*****@*****.**>",
            name=b"mooz",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    contents = swh_storage.content_get(_expected_new_contents_first_visit)
    count = sum(0 if content is None else 1 for content in contents)
    assert count == len(_expected_new_contents_first_visit)

    assert (
        list(swh_storage.directory_missing(_expected_new_directories_first_visit))
        == []
    )
    assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == []

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://npmjs.com/",
    )

    for (version_name, release_id) in versions:
        release = swh_storage.release_get([hash_to_bytes(release_id)])[0]
        assert release.target_type == ModelObjectType.DIRECTORY
        directory_id = release.target
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY,
            object_id=directory_id,
        )
        release_swhid = CoreSWHID(
            object_type=ObjectType.RELEASE,
            object_id=hash_to_bytes(release_id),
        )
        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.npm.loader.NpmLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="replicate-npm-package-json",
                metadata=json.dumps(
                    json.loads(org_api_info)["versions"][version_name]
                ).encode(),
                origin="https://www.npmjs.com/package/org",
                release=release_swhid,
            )
        ]
        assert swh_storage.raw_extrinsic_metadata_get(
            directory_swhid,
            metadata_authority,
        ) == PagedResult(
            next_page_token=None,
            results=expected_metadata,
        )

    stats = get_stats(swh_storage)

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
class StorageData:
    """Data model objects to use within tests."""

    content = Content(
        data=b"42\n",
        length=3,
        sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
        sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        sha256=hash_to_bytes(
            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
        ),
        blake2s256=hash_to_bytes(
            "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
        ),
        status="visible",
    )
    content2 = Content(
        data=b"4242\n",
        length=5,
        sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
        sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
        sha256=hash_to_bytes(
            "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
        ),
        blake2s256=hash_to_bytes(
            "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
        ),
        status="visible",
    )
    content3 = Content(
        data=b"424242\n",
        length=7,
        sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
        sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
        sha256=hash_to_bytes(
            "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
        ),
        blake2s256=hash_to_bytes(
            "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
        ),
        status="visible",
        ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
    )
    contents: Tuple[Content, ...] = (content, content2, content3)

    skipped_content = SkippedContent(
        length=1024 * 1024 * 200,
        sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
        sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
        origin="file:///dev/zero",
    )
    skipped_content2 = SkippedContent(
        length=1024 * 1024 * 300,
        sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
        sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
    )
    skipped_contents: Tuple[SkippedContent, ...] = (
        skipped_content,
        skipped_content2,
    )

    directory5 = Directory(
        id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
        entries=(),
    )
    directory = Directory(
        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
        entries=tuple(
            [
                DirectoryEntry(
                    name=b"foo",
                    type="file",
                    target=content.sha1_git,
                    perms=from_disk.DentryPerms.content,
                ),
                DirectoryEntry(
                    name=b"bar\xc3",
                    type="dir",
                    target=directory5.id,
                    perms=from_disk.DentryPerms.directory,
                ),
            ],
        ),
    )
    directory2 = Directory(
        id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
        entries=tuple(
            [
                DirectoryEntry(
                    name=b"oof",
                    type="file",
                    target=content2.sha1_git,
                    perms=from_disk.DentryPerms.content,
                )
            ],
        ),
    )
    directory3 = Directory(
        id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"),
        entries=tuple(
            [
                DirectoryEntry(
                    name=b"foo",
                    type="file",
                    target=content.sha1_git,
                    perms=from_disk.DentryPerms.content,
                ),
                DirectoryEntry(
                    name=b"subdir",
                    type="dir",
                    target=directory.id,
                    perms=from_disk.DentryPerms.directory,
                ),
                DirectoryEntry(
                    name=b"hello",
                    type="file",
                    target=content2.sha1_git,
                    perms=from_disk.DentryPerms.content,
                ),
            ],
        ),
    )
    directory4 = Directory(
        id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"),
        entries=tuple(
            [
                DirectoryEntry(
                    name=b"subdir1",
                    type="dir",
                    target=directory3.id,
                    perms=from_disk.DentryPerms.directory,
                )
            ],
        ),
    )
    directory6 = Directory(
        id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"),
        entries=tuple(
            [
                DirectoryEntry(
                    name=b"foo",
                    type="file",
                    target=b"\x00" * 20,
                    perms=from_disk.DentryPerms.content,
                ),
                DirectoryEntry(
                    name=b"bar",
                    type="dir",
                    target=b"\x01" * 20,
                    perms=from_disk.DentryPerms.directory,
                ),
            ],
        ),
        raw_manifest=(
            b"tree 61\x00"
            b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"  # noqa
            b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"  # noqa
        ),
    )
    directories: Tuple[Directory, ...] = (
        directory2,
        directory,
        directory3,
        directory4,
        directory5,
        directory6,
    )

    revision = Revision(
        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
        },
        extra_headers=(
            (b"gpgsig", b"test123"),
            (b"mergetag", b"foo\\bar"),
            (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
        ),
        synthetic=True,
    )
    revision2 = Revision(
        id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    revision3 = Revision(
        id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id, revision2.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=True,
    )
    revision4 = Revision(
        id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([revision3.id]),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    git_revisions: Tuple[Revision, ...] = (
        revision,
        revision2,
        revision3,
        revision4,
    )

    hg_revision = Revision(
        id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
            "node": "a316dfb434af2b451c1f393496b7eaeda343f543",
        },
        extra_headers=(),
        synthetic=True,
    )
    hg_revision2 = Revision(
        id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")),
        ),
        synthetic=False,
    )
    hg_revision3 = Revision(
        id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id, hg_revision2.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")),
        ),
        synthetic=True,
    )
    hg_revision4 = Revision(
        id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([hg_revision3.id]),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")),
        ),
        synthetic=False,
    )
    hg_revisions: Tuple[Revision, ...] = (
        hg_revision,
        hg_revision2,
        hg_revision3,
        hg_revision4,
    )
    revisions: Tuple[Revision, ...] = git_revisions + hg_revisions

    origins: Tuple[Origin, ...] = (
        Origin(url="https://github.com/user1/repo1"),
        Origin(url="https://github.com/user2/repo1"),
        Origin(url="https://github.com/user3/repo1"),
        Origin(url="https://gitlab.com/user1/repo1"),
        Origin(url="https://gitlab.com/user2/repo1"),
        Origin(url="https://forge.softwareheritage.org/source/repo1"),
        Origin(url="https://example.рф/🏛️.txt"),
    )
    origin, origin2 = origins[:2]

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="http://hal.inria.example.com/",
    )
    metadata_authority2 = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="http://wikidata.example.com/",
    )
    authorities: Tuple[MetadataAuthority, ...] = (
        metadata_authority,
        metadata_authority2,
    )

    metadata_fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )
    metadata_fetcher2 = MetadataFetcher(
        name="swh-example",
        version="0.0.1",
    )
    fetchers: Tuple[MetadataFetcher, ...] = (metadata_fetcher, metadata_fetcher2)

    date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit2 = datetime.datetime(2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit3 = datetime.datetime(2018, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)

    type_visit1 = "git"
    type_visit2 = "hg"
    type_visit3 = "deb"

    origin_visit = OriginVisit(
        origin=origin.url,
        visit=1,
        date=date_visit1,
        type=type_visit1,
    )
    origin_visit2 = OriginVisit(
        origin=origin.url,
        visit=2,
        date=date_visit2,
        type=type_visit1,
    )
    origin_visit3 = OriginVisit(
        origin=origin2.url,
        visit=1,
        date=date_visit1,
        type=type_visit2,
    )
    origin_visits: Tuple[OriginVisit, ...] = (
        origin_visit,
        origin_visit2,
        origin_visit3,
    )

    release = Release(
        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
        name=b"v0.0.1",
        author=Person(
            name=b"olasd",
            email=b"*****@*****.**",
            fullname=b"olasd <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0042",
        ),
        target=revision.id,
        target_type=ObjectType.REVISION,
        message=b"synthetic release",
        synthetic=True,
    )
    release2 = Release(
        id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision2.id,
        target_type=ObjectType.REVISION,
        message=b"v0.0.2\nMisc performance improvements + bug fixes",
        synthetic=False,
    )
    release3 = Release(
        id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision3.id,
        target_type=ObjectType.REVISION,
        message=b"yet another synthetic release",
        synthetic=True,
    )
    releases: Tuple[Release, ...] = (
        release,
        release2,
        release3,
    )

    snapshot = Snapshot(
        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
        branches={
            b"master": SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    empty_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    complete_snapshot = Snapshot(
        id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"),
        branches={
            b"directory": SnapshotBranch(
                target=directory.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"directory2": SnapshotBranch(
                target=directory2.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"content": SnapshotBranch(
                target=content.sha1_git,
                target_type=TargetType.CONTENT,
            ),
            b"alias": SnapshotBranch(
                target=b"revision",
                target_type=TargetType.ALIAS,
            ),
            b"revision": SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
            b"release": SnapshotBranch(
                target=release.id,
                target_type=TargetType.RELEASE,
            ),
            b"snapshot": SnapshotBranch(
                target=empty_snapshot.id,
                target_type=TargetType.SNAPSHOT,
            ),
            b"dangling": None,
        },
    )
    snapshots: Tuple[Snapshot, ...] = (snapshot, empty_snapshot, complete_snapshot)

    content_metadata1 = RawExtrinsicMetadata(
        target=ExtendedSWHID(
            object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git
        ),
        origin=origin.url,
        discovery_date=datetime.datetime(
            2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    content_metadata2 = RawExtrinsicMetadata(
        target=ExtendedSWHID(
            object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git
        ),
        origin=origin2.url,
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="yaml",
        metadata=b"foo: bar",
    )
    content_metadata3 = RawExtrinsicMetadata(
        target=ExtendedSWHID(
            object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git
        ),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
        origin=origin.url,
        visit=42,
        snapshot=snapshot.swhid(),
        release=release.swhid(),
        revision=revision.swhid(),
        directory=directory.swhid(),
        path=b"/foo/bar",
    )
    content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        content_metadata1,
        content_metadata2,
        content_metadata3,
    )

    origin_metadata1 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    origin_metadata2 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata3 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        origin_metadata1,
        origin_metadata2,
        origin_metadata3,
    )

    extid1 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION, object_id=revision.id),
        extid_type="git",
        extid=revision.id,
    )
    extid2 = ExtID(
        target=CoreSWHID(
            object_type=SwhidObjectType.REVISION, object_id=hg_revision.id
        ),
        extid_type="mercurial",
        extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"),
    )
    extid3 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=directory.id),
        extid_type="directory",
        extid=b"something",
    )
    extid4 = ExtID(
        target=CoreSWHID(
            object_type=SwhidObjectType.DIRECTORY, object_id=directory2.id
        ),
        extid_type="directory",
        extid=b"something",
        extid_version=2,
    )
    extids: Tuple[ExtID, ...] = (
        extid1,
        extid2,
        extid3,
        extid4,
    )
def test_graph_revisions(
    swh_storage, up_to_date_graph, root_object, tag, weird_branches
):
    r"""
    Build objects::

                                snp
                               /|||\
                              / ||| \
                   rel2 <----°  /|\  \----> rel4
                    |          / | \          |
                    v         /  v  \         v
          rev1 <------ rev2 <°  dir4  \      rel3
           |            |        |     \       |
           v            v        v      \      |
          dir1         dir2     dir3     |     |
           |          /  |       |       |     |
           |         /   v       v       v     v
           v        /
          cnt1 <---°    cnt2    cnt3    cnt4  cnt5

    If up_to_date_graph is true, then swh-graph contains all objects.
    Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph.

    If tag is False, rel2 is excluded.

    If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded.
    """
    from swh.graph.naive_client import NaiveClient as GraphClient

    # Create objects:

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)
    )
    author = Person.from_fullname(b"Foo <*****@*****.**>")
    cnt1 = Content.from_data(b"correct")
    cnt2 = Content.from_data(b"horse")
    cnt3 = Content.from_data(b"battery")
    cnt4 = Content.from_data(b"staple")
    cnt5 = Content.from_data(b"Tr0ub4dor&3")
    dir1 = Directory(
        entries=(
            DirectoryEntry(
                name=b"file1",
                type="file",
                perms=DentryPerms.content,
                target=cnt1.sha1_git,
            ),
        )
    )
    dir2 = Directory(
        entries=(
            DirectoryEntry(
                name=b"file1",
                type="file",
                perms=DentryPerms.content,
                target=cnt1.sha1_git,
            ),
            DirectoryEntry(
                name=b"file2",
                type="file",
                perms=DentryPerms.content,
                target=cnt2.sha1_git,
            ),
        )
    )
    dir3 = Directory(
        entries=(
            DirectoryEntry(
                name=b"file3",
                type="file",
                perms=DentryPerms.content,
                target=cnt3.sha1_git,
            ),
        )
    )
    dir4 = Directory(
        entries=(
            DirectoryEntry(
                name=b"directory3",
                type="dir",
                perms=DentryPerms.directory,
                target=dir3.id,
            ),
        )
    )
    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )
    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir2.id,
        parents=(rev1.id,),
        type=RevisionType.GIT,
        synthetic=True,
    )

    rel2 = Release(
        name=b"1.0.0",
        message=b"tag2",
        target_type=ObjectType.REVISION,
        target=rev2.id,
        synthetic=True,
    )
    rel3 = Release(
        name=b"1.0.0-blob",
        message=b"tagged-blob",
        target_type=ObjectType.CONTENT,
        target=cnt5.sha1_git,
        synthetic=True,
    )
    rel4 = Release(
        name=b"1.0.0-weird",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel3.id,
        synthetic=True,
    )
    rel5 = Release(
        name=b"1.0.0:weirdname",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel2.id,
        synthetic=True,
    )

    # Create snapshot:

    branches = {
        b"refs/heads/master": SnapshotBranch(
            target=rev2.id, target_type=TargetType.REVISION
        ),
    }
    if tag:
        branches[b"refs/tags/1.0.0"] = SnapshotBranch(
            target=rel2.id, target_type=TargetType.RELEASE
        )
    if weird_branches:
        branches[b"refs/heads/tree-ref"] = SnapshotBranch(
            target=dir4.id, target_type=TargetType.DIRECTORY
        )
        branches[b"refs/heads/blob-ref"] = SnapshotBranch(
            target=cnt4.sha1_git, target_type=TargetType.CONTENT
        )
        branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch(
            target=rel4.id, target_type=TargetType.RELEASE
        )
    snp = Snapshot(branches=branches)

    # "Fill" swh-graph

    if up_to_date_graph:
        nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (rev1, dir1),
            (rev2, dir2),
            (rev2, rev1),
            (snp, rev2),
        ]
        if tag:
            nodes.append(rel2)
            edges.append((rel2, rev2))
            edges.append((snp, rel2))
        if weird_branches:
            nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5])
            edges.extend(
                [
                    (dir3, cnt3),
                    (dir4, dir3),
                    (snp, dir4),
                    (snp, cnt4),
                    (snp, rel4),
                    (rel4, rel3),
                    (rel3, cnt5),
                    (rel5, rev2),
                ]
            )
    else:
        nodes = [
            cnt1,
            cnt2,
            cnt3,
            dir1,
            dir2,
            dir3,
            rev1,
        ]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (dir3, cnt3),
            (rev1, dir1),
        ]
        if tag:
            nodes.append(rel2)
        if weird_branches:
            nodes.extend([cnt3, dir3])
            edges.extend([(dir3, cnt3)])

    nodes = [str(n.swhid()) for n in nodes]
    edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges]

    # Add all objects to storage
    swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5])
    swh_storage.directory_add([dir1, dir2, dir3, dir4])
    swh_storage.revision_add([rev1, rev2])
    swh_storage.release_add([rel2, rel3, rel4, rel5])
    swh_storage.snapshot_add([snp])

    # Add spy on swh_storage, to make sure revision_log is not called
    # (the graph must be used instead)
    swh_storage = unittest.mock.MagicMock(wraps=swh_storage)

    # Add all objects to graph
    swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))

    # Cook
    backend = InMemoryVaultBackend()
    cooked_swhid = {
        RootObjects.SNAPSHOT: snp.swhid(),
        RootObjects.REVISION: rev2.swhid(),
        RootObjects.RELEASE: rel2.swhid(),
        RootObjects.WEIRD_RELEASE: rel5.swhid(),
    }[root_object]
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    if weird_branches:
        # git-fsck now rejects refs pointing to trees and blobs,
        # but some old git repos have them.
        cooker.use_fsck = False

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION):
            log_head = "master"
        elif root_object == RootObjects.RELEASE:
            log_head = "1.0.0"
        elif root_object == RootObjects.WEIRD_RELEASE:
            log_head = "release"
        else:
            assert False, root_object

        output = subprocess.check_output(
            [
                "git",
                "-C",
                f"{tempdir}/{cooked_swhid}.git",
                "log",
                "--format=oneline",
                "--decorate=",
                log_head,
            ]
        )

        assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"

    # Make sure the graph was used instead of swh_storage.revision_log
    if root_object == RootObjects.SNAPSHOT:
        if up_to_date_graph:
            # The graph has everything, so the first call succeeds and returns
            # all objects transitively pointed by the snapshot
            swh_graph.visit_nodes.assert_has_calls(
                [
                    unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"),
                ]
            )
        else:
            # The graph does not have everything, so the first call returns nothing.
            # However, the second call (on the top rev) succeeds and returns
            # all objects but the rev and the rel
            swh_graph.visit_nodes.assert_has_calls(
                [
                    unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"),
                    unittest.mock.call(str(rev2.swhid()), edges="rev:rev"),
                ]
            )
    elif root_object in (
        RootObjects.REVISION,
        RootObjects.RELEASE,
        RootObjects.WEIRD_RELEASE,
    ):
        swh_graph.visit_nodes.assert_has_calls(
            [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")]
        )
    else:
        assert False, root_object

    if up_to_date_graph:
        swh_storage.revision_log.assert_not_called()
        swh_storage.revision_shortlog.assert_not_called()
    else:
        swh_storage.revision_log.assert_called()
def test_npm_loader_duplicate_shasum(swh_storage, requests_mock_datadir):
    """Test with two versions that have exactly the same tarball"""
    package = "org_version_mismatch"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("ac867a4c22ba4e22a022d319f309714477412a5a")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    beta_release_id = "e6d5490a02ac2a8dcd49702f9ccd5a64c90a46f1"
    release_id = "f6985f437e28db6eb1b7533230e05ed99f2c91f0"
    versions = [
        ("0.0.3-beta", beta_release_id),
        ("0.0.3", release_id),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.3", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(beta_release_id)])[0] == Release(
        name=b"0.0.3-beta",
        message=(
            b"Synthetic release for NPM source package org_version_mismatch "
            b"version 0.0.3-beta\n"
        ),
        target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(beta_release_id),
    )

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.3",
        message=(
            b"Synthetic release for NPM source package org_version_mismatch "
            b"version 0.0.3\n"
        ),
        target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 55, 45, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    # Check incremental re-load keeps it unchanged
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status == {
        "status": "uneventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )
def test_checksum_mismatch(swh_storage, mismatch_on):
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)
    )
    author = Person.from_fullname(b"Foo <*****@*****.**>")

    wrong_hash = b"\x12\x34" * 10

    cnt1 = Content.from_data(b"Tr0ub4dor&3")
    if mismatch_on == "content":
        cnt1 = attr.evolve(cnt1, sha1_git=wrong_hash)

    dir1 = Directory(
        entries=(
            DirectoryEntry(
                name=b"file1",
                type="file",
                perms=DentryPerms.content,
                target=cnt1.sha1_git,
            ),
        )
    )
    if mismatch_on == "directory":
        dir1 = attr.evolve(dir1, id=wrong_hash)

    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )
    if mismatch_on == "revision1":
        rev1 = attr.evolve(rev1, id=wrong_hash)

    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        parents=(rev1.id,),
        type=RevisionType.GIT,
        synthetic=True,
    )
    if mismatch_on == "revision2":
        rev2 = attr.evolve(rev2, id=wrong_hash)

    cooked_swhid = rev2.swhid()

    swh_storage.content_add([cnt1])
    swh_storage.directory_add([dir1])
    swh_storage.revision_add([rev1, rev2])

    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=None,
    )

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if mismatch_on != "revision2":
            # git-log fails if the head revision is corrupted
            # TODO: we need to find a way to make this somewhat usable
            output = subprocess.check_output(
                [
                    "git",
                    "-C",
                    f"{tempdir}/{cooked_swhid}.git",
                    "log",
                    "--format=oneline",
                    "--decorate=",
                ]
            )

            assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"