def test_weird_tree(self):
    """Tests a tree with entries in the wrong order"""
    raw_manifest = (
        b"0644 file2\x00"
        b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce"
        b"0644 file1\x00"
        b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce"
    )
    tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_manifest)

    assert converters.dulwich_tree_to_directory(tree) == Directory(
        entries=(
            # in alphabetical order, as it should be
            DirectoryEntry(
                name=b"file1",
                type="file",
                target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
                perms=0o644,
            ),
            DirectoryEntry(
                name=b"file2",
                type="file",
                target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
                perms=0o644,
            ),
        ),
        raw_manifest=b"tree 62\x00" + raw_manifest,
    )
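# A quick sanity check of the framing above: b"tree 62\x00" is the standard git
# object header (type name, space, payload length, NUL byte), and 62 is indeed
# the length of the two serialized entries: each one is 11 bytes of
# "mode name\x00" plus a 20-byte binary sha1.
entry_size = len(b"0644 file2\x00") + 20
assert 2 * entry_size == 62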
def test_revision_metadata_indexer_single_root_dir(self):
    metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
    fill_obj_storage(metadata_indexer.objstorage)
    fill_storage(metadata_indexer.storage)

    # Add a parent directory, which is the only directory at the root
    # of the revision
    rev = REVISION
    assert rev.directory == DIRECTORY2.id

    directory = Directory(
        entries=(
            DirectoryEntry(
                name=b"foobar-1.0.0",
                type="dir",
                target=rev.directory,
                perms=16384,
            ),
        ),
    )
    assert directory.id is not None
    metadata_indexer.storage.directory_add([directory])

    new_rev_dict = {**rev.to_dict(), "directory": directory.id}
    new_rev_dict.pop("id")
    new_rev = Revision.from_dict(new_rev_dict)
    metadata_indexer.storage.revision_add([new_rev])

    tool = metadata_indexer.idx_storage.indexer_configuration_get(
        {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
    )
    assert tool is not None

    metadata_indexer.idx_storage.content_metadata_add(
        [
            ContentMetadataRow(
                id=DIRECTORY2.entries[0].target,
                indexer_configuration_id=tool["id"],
                metadata=YARN_PARSER_METADATA,
            )
        ]
    )

    metadata_indexer.run([new_rev.id])

    results = list(
        metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
    )
    expected_results = [
        RevisionIntrinsicMetadataRow(
            id=new_rev.id,
            tool=TRANSLATOR_TOOL,
            metadata=YARN_PARSER_METADATA,
            mappings=["npm"],
        )
    ]
    for result in results:
        del result.tool["id"]

    # then
    self.assertEqual(results, expected_results)
def test_tree_perms(self):
    entries = [
        (b"blob_100644", 0o100644, "file"),
        (b"blob_100664", 0o100664, "file"),
        (b"blob_100666", 0o100666, "file"),
        (b"blob_120000", 0o120000, "file"),
        (b"commit_160644", 0o160644, "rev"),
        (b"commit_160664", 0o160664, "rev"),
        (b"commit_160666", 0o160666, "rev"),
        (b"commit_normal", 0o160000, "rev"),
        (b"tree_040644", 0o040644, "dir"),
        (b"tree_040664", 0o040664, "dir"),
        (b"tree_040666", 0o040666, "dir"),
        (b"tree_normal", 0o040000, "dir"),
    ]

    tree = dulwich.objects.Tree()
    for (name, mode, _) in entries:
        tree.add(name, mode, b"00" * 20)

    assert converters.dulwich_tree_to_directory(tree) == Directory(
        entries=tuple(
            DirectoryEntry(type=type, perms=perms, name=name, target=b"\x00" * 20)
            for (name, perms, type) in entries
        )
    )
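# The type column above mirrors the converter's mode handling (see
# dulwich_tree_to_directory below): git encodes submodules as mode 0o160000
# ("gitlink") and trees as 0o040000. A minimal sketch of that mapping,
# assuming those conventional values for the two masks:
COMMIT_MODE_MASK = 0o160000
TREE_MODE_MASK = 0o040000

def mode_to_type(mode: int) -> str:
    """Map a raw git mode to a swh DirectoryEntry type."""
    if mode & COMMIT_MODE_MASK == COMMIT_MODE_MASK:
        return "rev"  # submodule (gitlink)
    elif mode & TREE_MODE_MASK == TREE_MODE_MASK:
        return "dir"
    else:
        return "file"  # regular files and symlinks alike

assert mode_to_type(0o160644) == "rev"
assert mode_to_type(0o040664) == "dir"
assert mode_to_type(0o120000) == "file"  # symlink: no tree/commit bits set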
def test_lookup_directory_with_revision_unknown_content(archive_data, new_revision):
    unknown_content_ = random_content()

    dir_path = "README.md"

    # A directory that points to unknown content
    dir = Directory(
        entries=(
            DirectoryEntry(
                name=bytes(dir_path.encode("utf-8")),
                type="file",
                target=hash_to_bytes(unknown_content_["sha1_git"]),
                perms=DentryPerms.content,
            ),
        )
    )

    # Create a revision that points to a directory,
    # which points to unknown content
    new_revision = new_revision.to_dict()
    new_revision["directory"] = dir.id
    del new_revision["id"]
    new_revision = Revision.from_dict(new_revision)

    # Add the directory and revision in mem
    archive_data.directory_add([dir])
    archive_data.revision_add([new_revision])
    new_revision_id = hash_to_hex(new_revision.id)
    with pytest.raises(NotFoundExc) as e:
        archive.lookup_directory_with_revision(new_revision_id, dir_path)
    assert e.match("Content not found for revision %s" % new_revision_id)
def _directory_with_entries(self, sample_data, nb_entries):
    """Returns a dir with ``nb_entries``, all pointing to the same content"""
    return Directory(
        entries=tuple(
            DirectoryEntry(
                name=f"file{i:10}".encode(),
                type="file",
                target=sample_data.content.sha1_git,
                perms=from_disk.DentryPerms.directory,
            )
            for i in range(nb_entries)
        )
    )
def test_revision_submodule(
    self, swh_storage, cook_extract_revision, ingest_target_revision
):
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)
    )

    target_rev = Revision(
        message=b"target_rev",
        author=Person.from_fullname(b"me <*****@*****.**>"),
        date=date,
        committer=Person.from_fullname(b"me <*****@*****.**>"),
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=bytes.fromhex("3333333333333333333333333333333333333333"),
        metadata={},
        synthetic=True,
    )
    if ingest_target_revision:
        swh_storage.revision_add([target_rev])

    dir = Directory(
        entries=(
            DirectoryEntry(
                name=b"submodule",
                type="rev",
                target=target_rev.id,
                perms=0o160000,
            ),
        ),
    )
    swh_storage.directory_add([dir])

    rev = Revision(
        message=b"msg",
        author=Person.from_fullname(b"me <*****@*****.**>"),
        date=date,
        committer=Person.from_fullname(b"me <*****@*****.**>"),
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=dir.id,
        metadata={},
        synthetic=True,
    )
    swh_storage.revision_add([rev])

    with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p):
        ert.checkout(b"HEAD")
        pattern = b"160000 submodule\x00%s" % target_rev.id
        tree = ert.repo[b"HEAD"].tree
        assert pattern in ert.repo[tree].as_raw_string()
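# The pattern asserted above is exactly how git lays out a gitlink entry in a
# raw tree object: octal mode, space, entry name, NUL, then the 20-byte
# *binary* target id (not hex). A minimal sketch of that serialization
# (raw_tree_entry is an illustrative helper, not part of the code above):
def raw_tree_entry(mode: int, name: bytes, target: bytes) -> bytes:
    assert len(target) == 20  # binary sha1
    return b"%o %s\x00%s" % (mode, name, target)

assert raw_tree_entry(0o160000, b"submodule", b"\x01" * 20) == (
    b"160000 submodule\x00" + b"\x01" * 20
)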
def test_directory_large(swh_storage):
    expected_directory = Directory(
        entries=tuple(
            DirectoryEntry(
                name=f"entry{i:04}".encode(),
                type="file",
                target=b"\x00" * 20,
                perms=0o000664,
            )
            for i in range(10)
        )
    )

    swh_storage.directory_add([expected_directory])

    returned_directory = directory_get(swh_storage, expected_directory.id)

    assert returned_directory.id == expected_directory.id
    assert set(returned_directory.entries) == set(expected_directory.entries)
    assert returned_directory.raw_manifest == expected_directory.raw_manifest
def get(ids):
    return [
        Directory(
            id=ids[0],
            entries=tuple(
                map(
                    lambda entry: DirectoryEntry(
                        name=entry["name"],
                        type=entry["type"],
                        target=entry["sha1_git"],
                        perms=entry["perms"],
                    ),
                    swh_storage.directory_ls(ids[0]),
                )
            ),
        )
    ]
def test_directory_revision_data(self, swh_storage):
    target_rev = "0e8a3ad980ec179856012b7eecf4327e99cd44cd"

    dir = Directory(
        entries=(
            DirectoryEntry(
                name=b"submodule",
                type="rev",
                target=hashutil.hash_to_bytes(target_rev),
                perms=0o100644,
            ),
        ),
    )
    swh_storage.directory_add([dir])

    with cook_extract_directory_dircooker(swh_storage, dir.swhid(), fsck=False) as p:
        assert (p / "submodule").is_symlink()
        assert os.readlink(str(p / "submodule")) == target_rev
def dulwich_tree_to_directory(obj: ShaFile) -> Directory:
    """Format a tree as a directory"""
    if obj.type_name != b"tree":
        raise ValueError("Argument is not a tree.")
    tree = cast(Tree, obj)

    entries = []
    for entry in tree.iteritems():
        if entry.mode & COMMIT_MODE_MASK == COMMIT_MODE_MASK:
            type_ = "rev"
        elif entry.mode & TREE_MODE_MASK == TREE_MODE_MASK:
            type_ = "dir"
        else:
            type_ = "file"

        entries.append(
            DirectoryEntry(
                type=type_,
                perms=entry.mode,
                name=entry.path,
                target=hash_to_bytes(entry.sha.decode("ascii")),
            )
        )

    dir_ = Directory(
        id=tree.sha().digest(),
        entries=tuple(entries),
    )

    if dir_.compute_hash() != dir_.id:
        expected_id = dir_.id
        actual_id = dir_.compute_hash()
        logger.warning(
            "Expected directory to have id %s, but got %s. Recording raw_manifest.",
            hash_to_hex(expected_id),
            hash_to_hex(actual_id),
        )
        raw_string = tree.as_raw_string()
        dir_ = attr.evolve(
            dir_,
            raw_manifest=git_object_header("tree", len(raw_string)) + raw_string,
        )

    check_id(dir_)
    return dir_
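# A minimal usage sketch for the converter above, building the tree with
# dulwich's real API. Tree.add takes the target as a 40-byte hex sha1, since
# dulwich stores targets in hex form; the value here is arbitrary.
import dulwich.objects

tree = dulwich.objects.Tree()
tree.add(b"README.md", 0o100644, b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce")
directory = dulwich_tree_to_directory(tree)
assert directory.entries[0].name == b"README.md"
assert directory.entries[0].type == "file"
assert directory.raw_manifest is None  # only set when hashes mismatch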
def directory_converter(db: BaseDb, directory_d: Dict[str, Any]) -> Directory:
    """Convert directory from the flat representation to swh model
    compatible objects.

    """
    columns = ["target", "name", "perms"]
    query_template = """
    select %(columns)s
    from directory_entry_%(type)s
    where id in %%s
    """

    types = ["file", "dir", "rev"]

    entries = []
    with db.cursor() as cur:
        for type in types:
            ids = directory_d.pop("%s_entries" % type)
            if not ids:
                continue
            query = query_template % {
                "columns": ",".join(columns),
                "type": type,
            }
            cur.execute(query, (tuple(ids),))
            for row in cur:
                entry_d = dict(zip(columns, row))
                entry = DirectoryEntry(
                    name=entry_d["name"],
                    type=type,
                    target=entry_d["target"],
                    perms=entry_d["perms"],
                )
                entries.append(entry)

    return Directory(
        id=directory_d["id"],
        entries=tuple(entries),
        raw_manifest=directory_d["raw_manifest"],
    )
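# For reference, the flat representation consumed above is a plain dict; the
# key names follow from the pop()/lookup calls in the function, while the
# entry ids below are purely illustrative (they index into the
# directory_entry_{file,dir,rev} tables):
directory_d = {
    "id": b"\x00" * 20,      # the directory's sha1_git
    "raw_manifest": None,    # or the original tree bytes, when recorded
    "file_entries": [1, 2],  # ids of rows in directory_entry_file
    "dir_entries": [],       # ids of rows in directory_entry_dir
    "rev_entries": [],       # ids of rows in directory_entry_rev
}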
def test_api_revision_directory_ok_returns_revision(
    api_client, archive_data, revision, person, date
):
    rev_path = "foo"
    _dir = Directory(
        entries=(
            DirectoryEntry(
                name=rev_path.encode(),
                type="rev",
                target=hash_to_bytes(revision),
                perms=DentryPerms.revision,
            ),
        )
    )
    archive_data.directory_add([_dir])

    rev = Revision(
        directory=_dir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([rev])

    revision_id = hash_to_hex(rev.id)
    rev_data = archive_data.revision_get(revision)
    url = reverse(
        "api-1-revision-directory",
        {"sha1_git": revision_id, "dir_path": rev_path},
    )
    rv = check_api_get_responses(api_client, url, status_code=200)

    assert rv.data == {
        "content": enrich_revision(rev_data, request=rv.wsgi_request),
        "path": rev_path,
        "type": "rev",
        "revision": revision_id,
    }
    length=3,
    sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
    sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
    sha256=hash_to_bytes(
        "673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a"
    ),
    blake2s256=hash_to_bytes(
        "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
    ),
    status="visible",
)

DIRECTORY = Directory(
    id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"),
    entries=tuple(
        [
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=CONTENT.sha1_git,
                perms=DentryPerms.content,
            )
        ]
    ),
)

REVISION = Revision(
    id=hash_to_bytes("066b1b62dbfa033362092af468bf6cfabec230e7"),
    message=b"hello",
    author=Person(
        name=b"Nicolas Dandrimont",
        email=b"*****@*****.**",
        fullname=b"Nicolas Dandrimont <*****@*****.**> ",
    ),
    date=TimestampWithTimezone(Timestamp(1234567890, 0), offset_bytes=b"+0200"),
def test_sub_directory_view_origin_context(
    client, archive_data, empty_directory, person, date
):
    origin_url = "test_sub_directory_view_origin_context"
    subdir = Directory(
        entries=(
            DirectoryEntry(
                name=b"foo",
                type="dir",
                target=hash_to_bytes(empty_directory),
                perms=DentryPerms.directory,
            ),
            DirectoryEntry(
                name=b"bar",
                type="dir",
                target=hash_to_bytes(empty_directory),
                perms=DentryPerms.directory,
            ),
        )
    )
    parentdir = Directory(
        entries=(
            DirectoryEntry(
                name=b"baz",
                type="dir",
                target=subdir.id,
                perms=DentryPerms.directory,
            ),
        )
    )
    archive_data.directory_add([subdir, parentdir])

    revision = Revision(
        directory=parentdir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([revision])

    snapshot = Snapshot(
        branches={
            b"HEAD": SnapshotBranch(
                target="refs/head/master".encode(),
                target_type=TargetType.ALIAS,
            ),
            b"refs/head/master": SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        }
    )
    archive_data.snapshot_add([snapshot])

    archive_data.origin_add([Origin(url=origin_url)])
    date = now()
    visit = OriginVisit(origin=origin_url, date=date, type="git")
    visit = archive_data.origin_visit_add([visit])[0]
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=date,
        status="full",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    dir_content = archive_data.directory_ls(hash_to_hex(parentdir.id))
    subdir = dir_content[0]
    subdir_content = archive_data.directory_ls(subdir["target"])
    _directory_view_checks(
        client,
        hash_to_hex(parentdir.id),
        subdir_content,
        subdir["name"],
        origin_url,
        hash_to_hex(snapshot.id),
        hash_to_hex(revision.id),
    )
def test_ignore_displayname(swh_storage, use_graph):
    """Tests that the original authorship information is used instead of
    configured display names; otherwise objects would not match their hash,
    and git-fsck/git-clone would fail.

    This tests both with and without swh-graph, as the two configurations use
    different code paths to fetch revisions.
    """
    date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0), 0, False)
    legacy_person = Person.from_fullname(b"old me <*****@*****.**>")
    current_person = Person.from_fullname(b"me <*****@*****.**>")

    content = Content.from_data(b"foo")
    swh_storage.content_add([content])

    directory = Directory(
        entries=(
            DirectoryEntry(
                name=b"file1", type="file", perms=0o100644, target=content.sha1_git
            ),
        ),
    )
    swh_storage.directory_add([directory])

    revision = Revision(
        message=b"rev",
        author=legacy_person,
        date=date,
        committer=legacy_person,
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        synthetic=True,
    )
    swh_storage.revision_add([revision])

    release = Release(
        name=b"v1.1.0",
        message=None,
        author=legacy_person,
        date=date,
        target=revision.id,
        target_type=ObjectType.REVISION,
        synthetic=True,
    )
    swh_storage.release_add([release])

    snapshot = Snapshot(
        branches={
            b"refs/tags/v1.1.0": SnapshotBranch(
                target=release.id, target_type=TargetType.RELEASE
            ),
            b"HEAD": SnapshotBranch(
                target=revision.id, target_type=TargetType.REVISION
            ),
        }
    )
    swh_storage.snapshot_add([snapshot])

    # Add all objects to graph
    if use_graph:
        from swh.graph.naive_client import NaiveClient as GraphClient

        nodes = [
            str(x.swhid()) for x in [content, directory, revision, release, snapshot]
        ]
        edges = [
            (str(x.swhid()), str(y.swhid()))
            for (x, y) in [
                (directory, content),
                (revision, directory),
                (release, revision),
                (snapshot, release),
                (snapshot, revision),
            ]
        ]
        swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))
    else:
        swh_graph = None

    # Set a display name
    with swh_storage.db() as db:
        with db.transaction() as cur:
            cur.execute(
                "UPDATE person SET displayname = %s WHERE fullname = %s",
                (current_person.fullname, legacy_person.fullname),
            )

    # Check that the display name did apply in the storage
    assert swh_storage.revision_get([revision.id])[0] == attr.evolve(
        revision,
        author=current_person,
        committer=current_person,
    )

    # Cook
    cooked_swhid = snapshot.swhid()
    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )
    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract the bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        # If we are here, it means git-fsck succeeded when called by cooker.cook(),
        # so we already know the original person was used. Let's double-check.
        repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git")

        tag = repo[b"refs/tags/v1.1.0"]
        assert tag.tagger == legacy_person.fullname

        commit = repo[tag.object[1]]
        assert commit.author == legacy_person.fullname
def test_checksum_mismatch(swh_storage, mismatch_on):
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)
    )
    author = Person.from_fullname(b"Foo <*****@*****.**>")

    wrong_hash = b"\x12\x34" * 10

    cnt1 = Content.from_data(b"Tr0ub4dor&3")
    if mismatch_on == "content":
        cnt1 = attr.evolve(cnt1, sha1_git=wrong_hash)

    dir1 = Directory(
        entries=(
            DirectoryEntry(
                name=b"file1",
                type="file",
                perms=DentryPerms.content,
                target=cnt1.sha1_git,
            ),
        )
    )
    if mismatch_on == "directory":
        dir1 = attr.evolve(dir1, id=wrong_hash)

    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )
    if mismatch_on == "revision1":
        rev1 = attr.evolve(rev1, id=wrong_hash)

    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        parents=(rev1.id,),
        type=RevisionType.GIT,
        synthetic=True,
    )
    if mismatch_on == "revision2":
        rev2 = attr.evolve(rev2, id=wrong_hash)

    cooked_swhid = rev2.swhid()

    swh_storage.content_add([cnt1])
    swh_storage.directory_add([dir1])
    swh_storage.revision_add([rev1, rev2])

    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=None,
    )
    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract the bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if mismatch_on != "revision2":
            # git-log fails if the head revision is corrupted
            # TODO: we need to find a way to make this somewhat usable
            output = subprocess.check_output(
                [
                    "git",
                    "-C",
                    f"{tempdir}/{cooked_swhid}.git",
                    "log",
                    "--format=oneline",
                    "--decorate=",
                ]
            )

            assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"
def test_graph_revisions(swh_storage, up_to_date_graph, root_object, tag, weird_branches):
    r"""
    Build objects::

                                 snp
                                /|||\
                               / ||| \
                     rel2 <----° /|\ \----> rel4
                      |         / | \        |
                      v        /  v  \       v
              rev1 <------ rev2 <----° dir4 \ rel3
               |            |           |    \  |
               v            v           v     \ |
              dir1         dir2        dir3   | |
               |          / |           |     | |
               v         /  v           v     v v
              cnt1 <----°  cnt2        cnt3 cnt4 cnt5

    If up_to_date_graph is true, then swh-graph contains all objects.
    Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph.

    If tag is False, rel2 is excluded.

    If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded.
    """
    from swh.graph.naive_client import NaiveClient as GraphClient

    # Create objects:
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)
    )
    author = Person.from_fullname(b"Foo <*****@*****.**>")
    cnt1 = Content.from_data(b"correct")
    cnt2 = Content.from_data(b"horse")
    cnt3 = Content.from_data(b"battery")
    cnt4 = Content.from_data(b"staple")
    cnt5 = Content.from_data(b"Tr0ub4dor&3")
    dir1 = Directory(
        entries=(
            DirectoryEntry(
                name=b"file1",
                type="file",
                perms=DentryPerms.content,
                target=cnt1.sha1_git,
            ),
        )
    )
    dir2 = Directory(
        entries=(
            DirectoryEntry(
                name=b"file1",
                type="file",
                perms=DentryPerms.content,
                target=cnt1.sha1_git,
            ),
            DirectoryEntry(
                name=b"file2",
                type="file",
                perms=DentryPerms.content,
                target=cnt2.sha1_git,
            ),
        )
    )
    dir3 = Directory(
        entries=(
            DirectoryEntry(
                name=b"file3",
                type="file",
                perms=DentryPerms.content,
                target=cnt3.sha1_git,
            ),
        )
    )
    dir4 = Directory(
        entries=(
            DirectoryEntry(
                name=b"directory3",
                type="dir",
                perms=DentryPerms.directory,
                target=dir3.id,
            ),
        )
    )
    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )
    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir2.id,
        parents=(rev1.id,),
        type=RevisionType.GIT,
        synthetic=True,
    )

    rel2 = Release(
        name=b"1.0.0",
        message=b"tag2",
        target_type=ObjectType.REVISION,
        target=rev2.id,
        synthetic=True,
    )
    rel3 = Release(
        name=b"1.0.0-blob",
        message=b"tagged-blob",
        target_type=ObjectType.CONTENT,
        target=cnt5.sha1_git,
        synthetic=True,
    )
    rel4 = Release(
        name=b"1.0.0-weird",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel3.id,
        synthetic=True,
    )
    rel5 = Release(
        name=b"1.0.0:weirdname",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel2.id,
        synthetic=True,
    )

    # Create snapshot:
    branches = {
        b"refs/heads/master": SnapshotBranch(
            target=rev2.id, target_type=TargetType.REVISION
        ),
    }
    if tag:
        branches[b"refs/tags/1.0.0"] = SnapshotBranch(
            target=rel2.id, target_type=TargetType.RELEASE
        )
    if weird_branches:
        branches[b"refs/heads/tree-ref"] = SnapshotBranch(
            target=dir4.id, target_type=TargetType.DIRECTORY
        )
        branches[b"refs/heads/blob-ref"] = SnapshotBranch(
            target=cnt4.sha1_git, target_type=TargetType.CONTENT
        )
        branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch(
            target=rel4.id, target_type=TargetType.RELEASE
        )
    snp = Snapshot(branches=branches)

    # "Fill" swh-graph
    if up_to_date_graph:
        nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (rev1, dir1),
            (rev2, dir2),
            (rev2, rev1),
            (snp, rev2),
        ]
        if tag:
            nodes.append(rel2)
            edges.append((rel2, rev2))
            edges.append((snp, rel2))
        if weird_branches:
            nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5])
            edges.extend(
                [
                    (dir3, cnt3),
                    (dir4, dir3),
                    (snp, dir4),
                    (snp, cnt4),
                    (snp, rel4),
                    (rel4, rel3),
                    (rel3, cnt5),
                    (rel5, rev2),
                ]
            )
    else:
        nodes = [cnt1, cnt2, cnt3, dir1, dir2, dir3, rev1]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (dir3, cnt3),
            (rev1, dir1),
        ]
        if tag:
            nodes.append(rel2)
        if weird_branches:
            nodes.extend([cnt3, dir3])
            edges.extend([(dir3, cnt3)])

    nodes = [str(n.swhid()) for n in nodes]
    edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges]

    # Add all objects to storage
    swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5])
    swh_storage.directory_add([dir1, dir2, dir3, dir4])
    swh_storage.revision_add([rev1, rev2])
    swh_storage.release_add([rel2, rel3, rel4, rel5])
    swh_storage.snapshot_add([snp])

    # Add spy on swh_storage, to make sure revision_log is not called
    # (the graph must be used instead)
    swh_storage = unittest.mock.MagicMock(wraps=swh_storage)

    # Add all objects to graph
    swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))

    # Cook
    backend = InMemoryVaultBackend()
    cooked_swhid = {
        RootObjects.SNAPSHOT: snp.swhid(),
        RootObjects.REVISION: rev2.swhid(),
        RootObjects.RELEASE: rel2.swhid(),
        RootObjects.WEIRD_RELEASE: rel5.swhid(),
    }[root_object]
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    if weird_branches:
        # git-fsck now rejects refs pointing to trees and blobs,
        # but some old git repos have them.
        cooker.use_fsck = False

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract the bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION):
            log_head = "master"
        elif root_object == RootObjects.RELEASE:
            log_head = "1.0.0"
        elif root_object == RootObjects.WEIRD_RELEASE:
            log_head = "release"
        else:
            assert False, root_object

        output = subprocess.check_output(
            [
                "git",
                "-C",
                f"{tempdir}/{cooked_swhid}.git",
                "log",
                "--format=oneline",
                "--decorate=",
                log_head,
            ]
        )

        assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"

    # Make sure the graph was used instead of swh_storage.revision_log
    if root_object == RootObjects.SNAPSHOT:
        if up_to_date_graph:
            # The graph has everything, so the first call succeeds and returns
            # all objects transitively pointed by the snapshot
            swh_graph.visit_nodes.assert_has_calls(
                [
                    unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"),
                ]
            )
        else:
            # The graph does not have everything, so the first call returns nothing.
            # However, the second call (on the top rev) succeeds and returns
            # all objects but the rev and the rel
            swh_graph.visit_nodes.assert_has_calls(
                [
                    unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"),
                    unittest.mock.call(str(rev2.swhid()), edges="rev:rev"),
                ]
            )
    elif root_object in (
        RootObjects.REVISION,
        RootObjects.RELEASE,
        RootObjects.WEIRD_RELEASE,
    ):
        swh_graph.visit_nodes.assert_has_calls(
            [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")]
        )
    else:
        assert False, root_object

    if up_to_date_graph:
        swh_storage.revision_log.assert_not_called()
        swh_storage.revision_shortlog.assert_not_called()
    else:
        swh_storage.revision_log.assert_called()
{"type": "git", "origin": ORIGINS[0].url}, {"type": "ftp", "origin": ORIGINS[1].url}, {"type": "deposit", "origin": ORIGINS[2].url}, {"type": "pypi", "origin": ORIGINS[3].url}, {"type": "svn", "origin": ORIGINS[4].url}, {"type": "git", "origin": ORIGINS[5].url}, {"type": "git", "origin": ORIGINS[6].url}, ] DIRECTORY = Directory( id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), entries=( DirectoryEntry( name=b"index.js", type="file", target=hash_to_bytes("01c9379dfc33803963d07c1ccc748d3fe4c96bb5"), perms=0o100644, ), DirectoryEntry( name=b"package.json", type="file", target=hash_to_bytes("26a9f72a7c87cc9205725cfd879f514ff4f3d8d5"), perms=0o100644, ), DirectoryEntry( name=b".github", type="dir", target=Directory(entries=()).id, perms=0o040000, ), ),
type="hg", status="partial", snapshot=hash_to_bytes("0e7f84ede9a254f2cd55649ad5240783f557e65f"), metadata=None, ), ] DIRECTORIES = [ Directory(id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"), entries=()), Directory( id=hash_to_bytes("87b339104f7dc2a8163dec988445e3987995545f"), entries=( DirectoryEntry( name=b"file1.ext", perms=0o644, type="file", target=CONTENTS[0].sha1_git, ), DirectoryEntry( name=b"dir1", perms=0o755, type="dir", target=hash_to_bytes( "4b825dc642cb6eb9a060e54bf8d69288fbee4904"), ), DirectoryEntry( name=b"subprepo1", perms=0o160000, type="rev", target=REVISIONS[1].id, ),
class StorageData:
    """Data model objects to use within tests."""

    content = Content(
        data=b"42\n",
        length=3,
        sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
        sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        sha256=hash_to_bytes(
            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
        ),
        blake2s256=hash_to_bytes(
            "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
        ),
        status="visible",
    )
    content2 = Content(
        data=b"4242\n",
        length=5,
        sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
        sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
        sha256=hash_to_bytes(
            "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
        ),
        blake2s256=hash_to_bytes(
            "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
        ),
        status="visible",
    )
    content3 = Content(
        data=b"424242\n",
        length=7,
        sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
        sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
        sha256=hash_to_bytes(
            "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
        ),
        blake2s256=hash_to_bytes(
            "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
        ),
        status="visible",
        ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
    )
    contents: Tuple[Content, ...] = (content, content2, content3)

    skipped_content = SkippedContent(
        length=1024 * 1024 * 200,
        sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
        sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
        origin="file:///dev/zero",
    )
    skipped_content2 = SkippedContent(
        length=1024 * 1024 * 300,
        sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
        sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
    )
    skipped_contents: Tuple[SkippedContent, ...] = (skipped_content, skipped_content2)

    directory5 = Directory(
        id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
        entries=(),
    )
    directory = Directory(
        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
        entries=tuple(
            [
                DirectoryEntry(
                    name=b"foo",
                    type="file",
                    target=content.sha1_git,
                    perms=from_disk.DentryPerms.content,
                ),
                DirectoryEntry(
                    name=b"bar\xc3",
                    type="dir",
                    target=directory5.id,
                    perms=from_disk.DentryPerms.directory,
                ),
            ],
        ),
    )
    directory2 = Directory(
        id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
        entries=tuple(
            [
                DirectoryEntry(
                    name=b"oof",
                    type="file",
                    target=content2.sha1_git,
                    perms=from_disk.DentryPerms.content,
                )
            ],
        ),
    )
    directory3 = Directory(
        id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"),
        entries=tuple(
            [
                DirectoryEntry(
                    name=b"foo",
                    type="file",
                    target=content.sha1_git,
                    perms=from_disk.DentryPerms.content,
                ),
                DirectoryEntry(
                    name=b"subdir",
                    type="dir",
                    target=directory.id,
                    perms=from_disk.DentryPerms.directory,
                ),
                DirectoryEntry(
                    name=b"hello",
                    type="file",
                    target=content2.sha1_git,
                    perms=from_disk.DentryPerms.content,
                ),
            ],
        ),
    )
    directory4 = Directory(
        id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"),
        entries=tuple(
            [
                DirectoryEntry(
                    name=b"subdir1",
                    type="dir",
                    target=directory3.id,
                    perms=from_disk.DentryPerms.directory,
                )
            ],
        ),
    )
    directory6 = Directory(
        id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"),
        entries=tuple(
            [
                DirectoryEntry(
                    name=b"foo",
                    type="file",
                    target=b"\x00" * 20,
                    perms=from_disk.DentryPerms.content,
                ),
                DirectoryEntry(
                    name=b"bar",
                    type="dir",
                    target=b"\x01" * 20,
                    perms=from_disk.DentryPerms.directory,
                ),
            ],
        ),
        raw_manifest=(
            b"tree 61\x00"
            b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"  # noqa
            b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"  # noqa
        ),
    )
    directories: Tuple[Directory, ...] = (
        directory2,
        directory,
        directory3,
        directory4,
        directory5,
        directory6,
    )

    revision = Revision(
        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
        },
        extra_headers=(
            (b"gpgsig", b"test123"),
            (b"mergetag", b"foo\\bar"),
            (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
        ),
        synthetic=True,
    )
    revision2 = Revision(
        id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567843, microseconds=220000),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=220000),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    revision3 = Revision(
        id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567843, microseconds=220000),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1127351742, microseconds=220000),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id, revision2.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=True,
    )
    revision4 = Revision(
        id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567843, microseconds=220000),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1244567843, microseconds=220000),
            offset_bytes=b"-1200",
        ),
        parents=tuple([revision3.id]),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    git_revisions: Tuple[Revision, ...] = (revision, revision2, revision3, revision4)

    hg_revision = Revision(
        id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
            "node": "a316dfb434af2b451c1f393496b7eaeda343f543",
        },
        extra_headers=(),
        synthetic=True,
    )
    hg_revision2 = Revision(
        id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567843, microseconds=220000),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=220000),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")),
        ),
        synthetic=False,
    )
    hg_revision3 = Revision(
        id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567843, microseconds=220000),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1127351742, microseconds=220000),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id, hg_revision2.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")),
        ),
        synthetic=True,
    )
    hg_revision4 = Revision(
        id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567843, microseconds=220000),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1244567843, microseconds=220000),
            offset_bytes=b"-1200",
        ),
        parents=tuple([hg_revision3.id]),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")),
        ),
        synthetic=False,
    )
    hg_revisions: Tuple[Revision, ...] = (
        hg_revision,
        hg_revision2,
        hg_revision3,
        hg_revision4,
    )
    revisions: Tuple[Revision, ...] = git_revisions + hg_revisions

    origins: Tuple[Origin, ...] = (
        Origin(url="https://github.com/user1/repo1"),
        Origin(url="https://github.com/user2/repo1"),
        Origin(url="https://github.com/user3/repo1"),
        Origin(url="https://gitlab.com/user1/repo1"),
        Origin(url="https://gitlab.com/user2/repo1"),
        Origin(url="https://forge.softwareheritage.org/source/repo1"),
        Origin(url="https://example.рф/🏛️.txt"),
    )
    origin, origin2 = origins[:2]

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="http://hal.inria.example.com/",
    )
    metadata_authority2 = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="http://wikidata.example.com/",
    )
    authorities: Tuple[MetadataAuthority, ...] = (
        metadata_authority,
        metadata_authority2,
    )

    metadata_fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )
    metadata_fetcher2 = MetadataFetcher(
        name="swh-example",
        version="0.0.1",
    )
    fetchers: Tuple[MetadataFetcher, ...] = (metadata_fetcher, metadata_fetcher2)

    date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit2 = datetime.datetime(2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit3 = datetime.datetime(2018, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)

    type_visit1 = "git"
    type_visit2 = "hg"
    type_visit3 = "deb"

    origin_visit = OriginVisit(
        origin=origin.url,
        visit=1,
        date=date_visit1,
        type=type_visit1,
    )
    origin_visit2 = OriginVisit(
        origin=origin.url,
        visit=2,
        date=date_visit2,
        type=type_visit1,
    )
    origin_visit3 = OriginVisit(
        origin=origin2.url,
        visit=1,
        date=date_visit1,
        type=type_visit2,
    )
    origin_visits: Tuple[OriginVisit, ...] = (
        origin_visit,
        origin_visit2,
        origin_visit3,
    )

    release = Release(
        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
        name=b"v0.0.1",
        author=Person(
            name=b"olasd",
            email=b"*****@*****.**",
            fullname=b"olasd <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0042",
        ),
        target=revision.id,
        target_type=ObjectType.REVISION,
        message=b"synthetic release",
        synthetic=True,
    )
    release2 = Release(
        id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision2.id,
        target_type=ObjectType.REVISION,
        message=b"v0.0.2\nMisc performance improvements + bug fixes",
        synthetic=False,
    )
    release3 = Release(
        id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision3.id,
        target_type=ObjectType.REVISION,
        message=b"yet another synthetic release",
        synthetic=True,
    )
    releases: Tuple[Release, ...] = (release, release2, release3)

    snapshot = Snapshot(
        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
        branches={
            b"master": SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    empty_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    complete_snapshot = Snapshot(
        id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"),
        branches={
            b"directory": SnapshotBranch(
                target=directory.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"directory2": SnapshotBranch(
                target=directory2.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"content": SnapshotBranch(
                target=content.sha1_git,
                target_type=TargetType.CONTENT,
            ),
            b"alias": SnapshotBranch(
                target=b"revision",
                target_type=TargetType.ALIAS,
            ),
            b"revision": SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
            b"release": SnapshotBranch(
                target=release.id,
                target_type=TargetType.RELEASE,
            ),
            b"snapshot": SnapshotBranch(
                target=empty_snapshot.id,
                target_type=TargetType.SNAPSHOT,
            ),
            b"dangling": None,
        },
    )
    snapshots: Tuple[Snapshot, ...] = (snapshot, empty_snapshot, complete_snapshot)

    content_metadata1 = RawExtrinsicMetadata(
        target=ExtendedSWHID(
            object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git
        ),
        origin=origin.url,
        discovery_date=datetime.datetime(
            2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    content_metadata2 = RawExtrinsicMetadata(
        target=ExtendedSWHID(
            object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git
        ),
        origin=origin2.url,
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="yaml",
        metadata=b"foo: bar",
    )
    content_metadata3 = RawExtrinsicMetadata(
        target=ExtendedSWHID(
            object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git
        ),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
        origin=origin.url,
        visit=42,
        snapshot=snapshot.swhid(),
        release=release.swhid(),
        revision=revision.swhid(),
        directory=directory.swhid(),
        path=b"/foo/bar",
    )
    content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        content_metadata1,
        content_metadata2,
        content_metadata3,
    )

    origin_metadata1 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    origin_metadata2 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata3 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        origin_metadata1,
        origin_metadata2,
        origin_metadata3,
    )

    extid1 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION, object_id=revision.id),
        extid_type="git",
        extid=revision.id,
    )
    extid2 = ExtID(
        target=CoreSWHID(
            object_type=SwhidObjectType.REVISION, object_id=hg_revision.id
        ),
        extid_type="mercurial",
        extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"),
    )
    extid3 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=directory.id),
        extid_type="directory",
        extid=b"something",
    )
    extid4 = ExtID(
        target=CoreSWHID(
            object_type=SwhidObjectType.DIRECTORY, object_id=directory2.id
        ),
        extid_type="directory",
        extid=b"something",
        extid_version=2,
    )
    extids: Tuple[ExtID, ...] = (
        extid1,
        extid2,
        extid3,
        extid4,
    )
def test_original_malformed_objects(self, swh_storage, cook_extract_snapshot):
    """Tests that objects that were originally malformed:

    * are still interpreted somewhat correctly (if the loader could make
      sense of them), especially that they still have links to children
    * have their original manifest in the bundle
    """
    date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643819927, 0), 0, False)

    content = Content.from_data(b"foo")
    swh_storage.content_add([content])

    # disordered
    # fmt: off
    malformed_dir_manifest = (
        b""
        + b"100644 file2\x00" + content.sha1_git
        + b"100644 file1\x00" + content.sha1_git
    )
    # fmt: on
    directory = Directory(
        entries=(
            DirectoryEntry(
                name=b"file1", type="file", perms=0o100644, target=content.sha1_git
            ),
            DirectoryEntry(
                name=b"file2", type="file", perms=0o100644, target=content.sha1_git
            ),
        ),
        raw_manifest=f"tree {len(malformed_dir_manifest)}\x00".encode()
        + malformed_dir_manifest,
    )
    swh_storage.directory_add([directory])

    # 'committer' and 'author' swapped
    # fmt: off
    malformed_rev_manifest = (
        b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n"
        + b"committer me <*****@*****.**> 1643819927 +0000\n"
        + b"author me <*****@*****.**> 1643819927 +0000\n"
        + b"\n"
        + b"rev"
    )
    # fmt: on
    revision = Revision(
        message=b"rev",
        author=Person.from_fullname(b"me <*****@*****.**>"),
        date=date,
        committer=Person.from_fullname(b"me <*****@*****.**>"),
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        synthetic=True,
        raw_manifest=f"commit {len(malformed_rev_manifest)}\x00".encode()
        + malformed_rev_manifest,
    )
    swh_storage.revision_add([revision])

    # 'tag' and 'tagger' swapped
    # fmt: off
    malformed_rel_manifest = (
        b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n"
        + b"type commit\n"
        + b"tagger me <*****@*****.**> 1643819927 +0000\n"
        + b"tag v1.1.0\n"
    )
    # fmt: on
    release = Release(
        name=b"v1.1.0",
        message=None,
        author=Person.from_fullname(b"me <*****@*****.**>"),
        date=date,
        target=revision.id,
        target_type=ModelObjectType.REVISION,
        synthetic=True,
        raw_manifest=f"tag {len(malformed_rel_manifest)}\x00".encode()
        + malformed_rel_manifest,
    )
    swh_storage.release_add([release])

    snapshot = Snapshot(
        branches={
            b"refs/tags/v1.1.0": SnapshotBranch(
                target=release.id, target_type=TargetType.RELEASE
            ),
            b"HEAD": SnapshotBranch(
                target=revision.id, target_type=TargetType.REVISION
            ),
        }
    )
    swh_storage.snapshot_add([snapshot])

    with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p):
        tag = ert.repo[b"refs/tags/v1.1.0"]
        assert tag.as_raw_string() == malformed_rel_manifest

        commit = ert.repo[tag.object[1]]
        assert commit.as_raw_string() == malformed_rev_manifest

        tree = ert.repo[commit.tree]
        assert tree.as_raw_string() == malformed_dir_manifest