def complete_deposit(sample_archive, deposit_collection, authenticated_client):
    """Returns a completed deposit (load success)"""
    deposit = create_deposit(
        authenticated_client,
        deposit_collection.name,
        sample_archive,
        external_id="external-id-complete",
        deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS,
    )
    origin = "https://hal.archives-ouvertes.fr/hal-01727745"
    directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b"
    release_id = hash_to_bytes("548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10")
    snapshot_id = hash_to_bytes("e5e82d064a9c3df7464223042e0c55d72ccff7f0")
    deposit.swhid = f"swh:1:dir:{directory_id}"
    deposit.swhid_context = str(
        QualifiedSWHID(
            object_type=ObjectType.DIRECTORY,
            object_id=hash_to_bytes(directory_id),
            origin=origin,
            visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id),
            anchor=CoreSWHID(object_type=ObjectType.RELEASE, object_id=release_id),
            path=b"/",
        )
    )
    deposit.save()
    return deposit
async def lookup(self, name: str) -> Optional[FuseEntry]:
    # On the fly mounting of a new artifact
    try:
        if name.endswith(JSON_SUFFIX):
            swhid = CoreSWHID.from_string(name[: -len(JSON_SUFFIX)])
            return self.create_child(
                MetaEntry,
                name=f"{swhid}{JSON_SUFFIX}",
                mode=int(EntryMode.RDONLY_FILE),
                swhid=swhid,
            )
        else:
            swhid = CoreSWHID.from_string(name)
            await self.fuse.get_metadata(swhid)
            return self.create_child(
                OBJTYPE_GETTERS[swhid.object_type],
                name=str(swhid),
                mode=int(
                    EntryMode.RDONLY_FILE
                    if swhid.object_type == ObjectType.CONTENT
                    else EntryMode.RDONLY_DIR
                ),
                swhid=swhid,
            )
    except ValidationError:
        return None
def test_get_snapshot(web_api_client, web_api_mock):
    # small snapshot, the one from the Web API doc
    swhid = CoreSWHID.from_string("swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a")
    obj = web_api_client.get(swhid)

    assert len(obj) == 4
    assert obj["refs/heads/master"]["target_type"] == "revision"
    assert obj["refs/heads/master"]["target"] == CoreSWHID.from_string(
        "swh:1:rev:83c20a6a63a7ebc1a549d367bc07a61b926cecf3"
    )
    assert obj["refs/tags/dpkt-1.7"]["target_type"] == "revision"
    assert obj["refs/tags/dpkt-1.7"]["target"] == CoreSWHID.from_string(
        "swh:1:rev:0c9dbfbc0974ec8ac1d8253aa1092366a03633a8"
    )
def process_put(
    self,
    request,
    headers: ParsedRequestHeaders,
    collection_name: str,
    deposit: Deposit,
) -> None:
    """Update the deposit with status and SWHIDs

    Returns:
        204 No content
        400 Bad request if checks fail

    """
    data = request.data
    status = data["status"]
    deposit.status = status
    if status == DEPOSIT_STATUS_LOAD_SUCCESS:
        origin_url = data["origin_url"]
        directory_id = data["directory_id"]
        release_id = data["release_id"]
        dir_id = CoreSWHID(
            object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(directory_id)
        )
        snp_id = CoreSWHID(
            object_type=ObjectType.SNAPSHOT,
            object_id=hash_to_bytes(data["snapshot_id"]),
        )
        rel_id = CoreSWHID(
            object_type=ObjectType.RELEASE, object_id=hash_to_bytes(release_id)
        )

        deposit.swhid = str(dir_id)
        # new id with contextual information
        deposit.swhid_context = str(
            QualifiedSWHID(
                object_type=ObjectType.DIRECTORY,
                object_id=hash_to_bytes(directory_id),
                origin=origin_url,
                visit=snp_id,
                anchor=rel_id,
                path=b"/",  # QualifiedSWHID paths are bytes, as in complete_deposit above
            )
        )
    else:  # rejected
        deposit.status = status

    if "status_detail" in data:
        deposit.status_detail = data["status_detail"]

    deposit.save()
def test_get_directory(web_api_client, web_api_mock):
    swhid = CoreSWHID.from_string("swh:1:dir:977fc4b98c0e85816348cebd3b12026407c368b6")
    obj = web_api_client.get(swhid)

    assert len(obj) == 35  # number of directory entries
    assert all(map(lambda entry: entry["dir_id"] == swhid, obj))
    dir_entry = obj[0]
    assert dir_entry["type"] == "file"
    assert dir_entry["target"] == CoreSWHID.from_string(
        "swh:1:cnt:58471109208922c9ee8c4b06135725f03ed16814"
    )
    assert dir_entry["name"] == ".bzrignore"
    assert dir_entry["length"] == 582

    assert obj == web_api_client.directory(swhid)
def test_get_release(web_api_client, web_api_mock):
    swhid = CoreSWHID.from_string("swh:1:rel:b9db10d00835e9a43e2eebef2db1d04d4ae82342")
    obj = web_api_client.get(swhid)

    assert obj["id"] == swhid
    assert obj["author"]["fullname"] == "Paul Tagliamonte <*****@*****.**>"
    assert obj["author"]["name"] == "Paul Tagliamonte"
    assert obj["date"] == parse_date("2013-07-06T19:34:11-04:00")
    assert obj["name"] == "0.9.9"
    assert obj["target_type"] == "revision"
    assert obj["target"] == CoreSWHID.from_string(
        "swh:1:rev:e005cb773c769436709ca6a1d625dc784dbc1636"
    )
    assert not obj["synthetic"]

    assert obj == web_api_client.release(swhid)
def test_generate_table_body(source_tree):
    chart_path = b"/bar/barfoo"
    dir_path = source_tree[b"/bar/barfoo"].data["path"].decode()
    nodes_data = MerkleNodeInfo()
    # CoreSWHID of 'another-quote.org'
    known_cnt_swhid = CoreSWHID(
        object_type=ObjectType.CONTENT,
        object_id=b"\x136\x93\xb1%\xba\xd2\xb4\xac1\x855\xb8I\x01\xeb\xb1\xf6\xb68",
    )
    nodes_data[known_cnt_swhid] = {"known": True}

    generated_body = generate_table_body(chart_path, source_tree, nodes_data)

    expected_body = [
        html.Tbody(
            [
                html.Tr(
                    [
                        html.Td("✔"),
                        html.Td(
                            html.A(
                                children="another-quote.org",
                                href=f"file://{dir_path}/another-quote.org",
                            )
                        ),
                        html.Td("swh:1:cnt:133693b125bad2b4ac318535b84901ebb1f6b638"),
                    ]
                ),
            ]
        )
    ]
    # workaround: dash_html_components.__eq__ checks for object identity only
    assert str(generated_body) == str(expected_body)
def to_swhid(object_type: Union[str, ObjectType], s: Any) -> CoreSWHID:
    if isinstance(object_type, str):
        parsed_object_type = ObjectType[object_type.upper()]
    else:
        parsed_object_type = object_type
    return CoreSWHID(object_type=parsed_object_type, object_id=hash_to_bytes(s))
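# Minimal usage sketch for to_swhid: both an ObjectType value and its
# case-insensitive string name resolve to the same CoreSWHID. The hex id
# below is an illustrative placeholder, not a real archive object.
from swh.model.swhids import ObjectType

_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b"
assert to_swhid("directory", _id) == to_swhid(ObjectType.DIRECTORY, _id)
assert str(to_swhid("directory", _id)) == f"swh:1:dir:{_id}"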
def iter(
    self, swhid: SWHIDish, typify: bool = True, **req_args
) -> Iterator[Dict[str, Any]]:
    """Stream over the information about an object of any kind

    Streaming variant of get()

    """
    if isinstance(swhid, str):
        obj_type = CoreSWHID.from_string(swhid).object_type
    else:
        obj_type = swhid.object_type
    if obj_type == SNAPSHOT:
        yield from self.snapshot(swhid, typify)
    elif obj_type == REVISION:
        yield from [self.revision(swhid, typify)]
    elif obj_type == RELEASE:
        yield from [self.release(swhid, typify)]
    elif obj_type == DIRECTORY:
        yield from self.directory(swhid, typify)
    elif obj_type == CONTENT:
        yield from [self.content(swhid, typify)]
    else:
        raise ValueError(f"invalid object type: {obj_type}")
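# Hedged usage sketch for iter(), assuming it is the streaming method of
# swh.web.client.client.WebAPIClient (its snapshot()/directory() calls suggest
# so); the snapshot SWHID reuses the one from test_get_snapshot above.
from swh.web.client.client import WebAPIClient

client = WebAPIClient("https://archive.softwareheritage.org/api/1/")
for chunk in client.iter("swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a"):
    # consume the object's data as it is streamed instead of buffering it all
    print(chunk)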
def test_snapshot_filtered_objects(self, git_loader, cook_extract_snapshot):
    (loader, main_rev_id) = self.load_repo_filtered_objects(git_loader)

    snp_id = loader.loaded_snapshot_id
    swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id)

    with cook_extract_snapshot(loader.storage, swhid) as (ert, p):
        self.check_revision_filtered_objects(ert, p, main_rev_id)
def convert(self, value, param, ctx) -> CoreSWHID:
    from swh.model.exceptions import ValidationError

    try:
        return CoreSWHID.from_string(value)
    except ValidationError as e:
        self.fail(f'"{value}" is not a valid core SWHID: {e}', param, ctx)
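# Hedged sketch of how such a convert() hook is typically wired up: as the
# convert() method of a custom click.ParamType. "SWHIDParamType" and the
# sample command below are illustrative names, not taken from the snippets here.
import click

from swh.model.exceptions import ValidationError
from swh.model.swhids import CoreSWHID


class SWHIDParamType(click.ParamType):
    """Click parameter type that parses its value into a CoreSWHID."""

    name = "swhid"

    def convert(self, value, param, ctx) -> CoreSWHID:
        try:
            return CoreSWHID.from_string(value)
        except ValidationError as e:
            self.fail(f'"{value}" is not a valid core SWHID: {e}', param, ctx)


@click.command()
@click.argument("swhid", type=SWHIDParamType())
def show(swhid: CoreSWHID) -> None:
    # click hands the command an already-parsed CoreSWHID
    click.echo(f"object type: {swhid.object_type.name}")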
def load_repo_simple(self, git_loader):
    #
    # 1--2--3--4--5--6--7
    #
    repo = TestRepo()
    with repo as rp:
        (rp / "file1").write_text(TEST_CONTENT)
        repo.commit("add file1")

        (rp / "file2").write_text(TEST_CONTENT)
        repo.commit("add file2")

        (rp / "dir1/dir2").mkdir(parents=True)
        (rp / "dir1/dir2/file").write_text(TEST_CONTENT)

        (rp / "bin1").write_bytes(TEST_EXECUTABLE)
        (rp / "bin1").chmod(0o755)
        repo.commit("add bin1")

        (rp / "link1").symlink_to("file1")
        repo.commit("link link1 to file1")

        (rp / "file2").unlink()
        repo.commit("remove file2")

        (rp / "bin1").rename(rp / "bin")
        repo.commit("rename bin1 to bin")

        loader = git_loader(str(rp))
        loader.load()
        obj_id_hex = repo.repo.refs[b"HEAD"].decode()
        obj_id = hashutil.hash_to_bytes(obj_id_hex)
        swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)
    return (loader, swhid)
def load_repo_two_heads(self, git_loader):
    #
    #    1---2----4      <-- master and b1
    #         \
    #          ----3     <-- b2
    #
    repo = TestRepo()
    with repo as rp:
        (rp / "file1").write_text(TEST_CONTENT)
        repo.commit("Add file1")

        (rp / "file2").write_text(TEST_CONTENT)
        c2 = repo.commit("Add file2")
        repo.repo.refs[b"refs/heads/b2"] = c2  # branch b2 from master

        (rp / "file3").write_text(TEST_CONTENT)
        repo.commit("add file3", ref=b"refs/heads/b2")

        (rp / "file4").write_text(TEST_CONTENT)
        c4 = repo.commit("add file4", ref=b"refs/heads/master")
        repo.repo.refs[b"refs/heads/b1"] = c4  # branch b1 from master

        obj_id_hex = repo.repo.refs[b"HEAD"].decode()
        obj_id = hashutil.hash_to_bytes(obj_id_hex)
        swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)

        loader = git_loader(str(rp))
        loader.load()
    return (loader, swhid)
def batch_progress(self, batch_id: int, db=None, cur=None) -> Dict[str, Any]:
    cur.execute(
        """SELECT vault_bundle.id as id,
                  type, swhid, task_id, task_status, sticky,
                  ts_created, ts_done, ts_last_access, progress_msg
           FROM vault_batch_bundle
           LEFT JOIN vault_bundle ON vault_bundle.id = bundle_id
           WHERE batch_id = %s""",
        (batch_id,),
    )
    bundles = cur.fetchall()
    if not bundles:
        raise NotFoundExc(f"Batch {batch_id} does not exist.")

    for bundle in bundles:
        bundle["swhid"] = CoreSWHID.from_string(bundle["swhid"])

    counter = collections.Counter(b["status"] for b in bundles)
    res = {
        "bundles": bundles,
        "total": len(bundles),
        **{k: 0 for k in ("new", "pending", "done", "failed")},
        **dict(counter),
    }
    return res
def load_repo_triple_merge(self, git_loader):
    #
    #       .---.---5
    #      /   /   /
    #     2   3   4
    #    /   /   /
    # 1---.---.
    #
    repo = TestRepo()
    with repo as rp:
        (rp / "file1").write_text(TEST_CONTENT)
        c1 = repo.commit("Commit 1")
        repo.repo.refs[b"refs/heads/b1"] = c1
        repo.repo.refs[b"refs/heads/b2"] = c1

        repo.commit("Commit 2")
        c3 = repo.commit("Commit 3", ref=b"refs/heads/b1")
        c4 = repo.commit("Commit 4", ref=b"refs/heads/b2")
        repo.merge([c3, c4])

        obj_id_hex = repo.repo.refs[b"HEAD"].decode()
        obj_id = hashutil.hash_to_bytes(obj_id_hex)
        swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)

        loader = git_loader(str(rp))
        loader.load()
    return (loader, swhid)
def load_repo_two_double_fork_merge(self, git_loader):
    #
    #     2---4---6
    #    /   /   /
    #   1---3---5
    #
    repo = TestRepo()
    with repo as rp:
        (rp / "file1").write_text(TEST_CONTENT)
        c1 = repo.commit("Add file1")  # create commit 1
        repo.repo.refs[b"refs/heads/c1"] = c1  # branch c1 from master

        (rp / "file2").write_text(TEST_CONTENT)
        repo.commit("Add file2")  # create commit 2

        (rp / "file3").write_text(TEST_CONTENT)
        c3 = repo.commit("Add file3", ref=b"refs/heads/c1")  # create commit 3 on c1
        repo.repo.refs[b"refs/heads/c3"] = c3  # branch c3 from c1

        repo.merge([c3])  # create commit 4

        (rp / "file5").write_text(TEST_CONTENT)
        c5 = repo.commit("Add file5", ref=b"refs/heads/c3")  # create commit 5 on c3

        repo.merge([c5])  # create commit 6

        obj_id_hex = repo.repo.refs[b"HEAD"].decode()
        obj_id = hashutil.hash_to_bytes(obj_id_hex)
        swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=obj_id)

        loader = git_loader(str(rp))
        loader.load()
    return (loader, swhid)
def test_lazybfs_policy(
    live_server, aiosession, event_loop, source_tree_policy, tmp_requests
):
    open(tmp_requests, "w").close()
    api_url = url_for("index", _external=True)

    nodes_data = MerkleNodeInfo()
    init_merkle_node_info(source_tree_policy, nodes_data, {"known"})
    policy = LazyBFS(source_tree_policy, nodes_data)
    client = Client(api_url, aiosession)
    event_loop.run_until_complete(policy.run(client))

    backend_swhids_requests = get_backend_swhids_order(tmp_requests)

    assert (
        backend_swhids_requests[0]
        == "swh:1:dir:fe8cd7076bef324eb8865f818ef08617879022ce"
    )

    # the second request must contain 3 SWHIDs related to directories and one content
    dir_count, cnt_count = 0, 0
    for swhid in backend_swhids_requests[1:5]:
        if CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY:
            dir_count += 1
        else:
            cnt_count += 1

    assert dir_count == 3
    assert cnt_count == 1

    # the last swhid must be a content related to the unknown directory
    # "sample-folder-policy/toexclude"
    assert (
        backend_swhids_requests[5]
        == "swh:1:cnt:5f1cfce26640056bed3710cfaf3062a6a326a119"
    )
def identify_directory(path: Path) -> CoreSWHID:
    """Return the SWHID of the given path."""
    return CoreSWHID.from_string(
        identify_object(
            "directory", follow_symlinks=True, exclude_patterns=[b".hg"], obj=str(path)
        )
    )
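# Hedged usage sketch: identify_directory computes the directory SWHID of a
# local checkout, assuming identify_object wraps swh.model's on-disk
# identification (the b".hg" exclusion suggests a Mercurial working copy).
# The path below is an illustrative placeholder.
from pathlib import Path

from swh.model.swhids import ObjectType

swhid = identify_directory(Path("/tmp/my-checkout"))
assert swhid.object_type == ObjectType.DIRECTORY
print(swhid)  # e.g. swh:1:dir:<40 hex digits>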
def generate_archive_web_api(
    swhid: CoreSWHID, raw: bool = False, recursive: bool = False
) -> None:
    # Already in mock archive
    if swhid in METADATA and not raw:
        return

    url = swhid_to_web_url(swhid, raw)

    data = get_from_api(url)
    if not raw:
        data = json.loads(data)

    MOCK_ARCHIVE[url] = data
    METADATA[swhid] = data

    # Retrieve the additional data needed by some artifact types (e.g., a
    # content's blob data, a release's target, etc.)
    if recursive:
        if swhid.object_type == ObjectType.CONTENT:
            generate_archive_web_api(swhid, raw=True)
        elif swhid.object_type == ObjectType.RELEASE:
            target_type = METADATA[swhid]["target_type"]
            target_id = METADATA[swhid]["target"]
            target = CoreSWHID(
                object_type=ObjectType[target_type.upper()],
                object_id=hash_to_bytes(target_id),
            )
            generate_archive_web_api(target, recursive=True)
def test_directory_simple(self, git_loader, cook_extract_directory):
    repo = TestRepo()
    with repo as rp:
        (rp / "file").write_text(TEST_CONTENT)
        (rp / "executable").write_bytes(TEST_EXECUTABLE)
        (rp / "executable").chmod(0o755)
        (rp / "link").symlink_to("file")
        (rp / "dir1/dir2").mkdir(parents=True)
        (rp / "dir1/dir2/file").write_text(TEST_CONTENT)
        c = repo.commit()
        loader = git_loader(str(rp))
        loader.load()
        obj_id_hex = repo.repo[c].tree.decode()
        obj_id = hashutil.hash_to_bytes(obj_id_hex)
        swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=obj_id)

    with cook_extract_directory(loader.storage, swhid) as p:
        assert (p / "file").stat().st_mode == 0o100644
        assert (p / "file").read_text() == TEST_CONTENT
        assert (p / "executable").stat().st_mode == 0o100755
        assert (p / "executable").read_bytes() == TEST_EXECUTABLE
        assert (p / "link").is_symlink()
        assert os.readlink(str(p / "link")) == "file"
        assert (p / "dir1/dir2/file").stat().st_mode == 0o100644
        assert (p / "dir1/dir2/file").read_text() == TEST_CONTENT

        directory = from_disk.Directory.from_disk(path=bytes(p))
        assert obj_id_hex == hashutil.hash_to_hex(directory.hash)
def load_metadata(
    storage,
    revision_id,
    directory_id,
    discovery_date: datetime.datetime,
    metadata: Dict[str, Any],
    format: str,
    authority: MetadataAuthority,
    origin: Optional[str],
    dry_run: bool,
):
    """Does the actual loading to swh-storage."""
    directory_swhid = ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=directory_id
    )
    revision_swhid = CoreSWHID(
        object_type=ObjectType.REVISION, object_id=revision_id
    )
    obj = RawExtrinsicMetadata(
        target=directory_swhid,
        discovery_date=discovery_date,
        authority=authority,
        fetcher=FETCHER,
        format=format,
        metadata=json.dumps(metadata).encode(),
        origin=origin,
        revision=revision_swhid,
    )
    if not dry_run:
        storage.raw_extrinsic_metadata_add([obj])
def test_maven_loader_extrinsic_metadata(
    swh_storage, expected_releases, expected_json_metadata, expected_pom_metadata
):
    """With no prior visit, loading a jar ends up with 1 snapshot.

    Extrinsic metadata is the pom file associated with the source jar.

    """
    loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS)
    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    for i, expected_release in enumerate(expected_releases):
        expected_release_id = expected_release.id
        release = swh_storage.release_get([expected_release_id])[0]
        assert release is not None

        release_swhid = CoreSWHID(
            object_type=ObjectType.RELEASE, object_id=expected_release_id
        )
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY, object_id=release.target
        )
        metadata_authority = MetadataAuthority(
            type=MetadataAuthorityType.FORGE,
            url=REPO_BASE_URL,
        )
        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.maven.loader.MavenLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="maven-pom",
                metadata=expected_pom_metadata[i],
                origin=MVN_ORIGIN_URL,
                release=release_swhid,
            ),
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.maven.loader.MavenLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="maven-json",
                metadata=json.dumps(expected_json_metadata[i]).encode(),
                origin=MVN_ORIGIN_URL,
                release=release_swhid,
            ),
        ]

        res = swh_storage.raw_extrinsic_metadata_get(
            directory_swhid, metadata_authority
        )
        assert res.next_page_token is None
        assert set(res.results) == set(expected_metadata)
def test_CoreSWHID_validation_error(ns, version, type, id):
    with pytest.raises(ValidationError):
        CoreSWHID(
            namespace=ns,
            scheme_version=version,
            object_type=type,
            object_id=_x(id),
        )
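# Hedged sketch of inputs that should make CoreSWHID raise ValidationError,
# illustrating what the (ns, version, type, id) parameters above may look
# like; these concrete values are assumptions, not the original parametrize
# list of the test module.
from swh.model.exceptions import ValidationError
from swh.model.swhids import CoreSWHID, ObjectType

for ns, version, type_, id_ in [
    ("foo", 1, ObjectType.CONTENT, b"\x94" * 20),  # invalid namespace
    ("swh", 2, ObjectType.CONTENT, b"\x94" * 20),  # unsupported scheme version
    ("swh", 1, ObjectType.CONTENT, b"\x94" * 2),   # object id is not 20 bytes
]:
    try:
        CoreSWHID(
            namespace=ns, scheme_version=version, object_type=type_, object_id=id_
        )
    except ValidationError:
        pass  # expected for every tuple above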
def _get_object_id_hex(swhidish: SWHIDish) -> str:
    """Parse string or SWHID and return the hex value of the object_id"""
    if isinstance(swhidish, str):
        swhid = CoreSWHID.from_string(swhidish)
    else:
        swhid = swhidish
    return hash_to_hex(swhid.object_id)
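# Minimal usage sketch: _get_object_id_hex accepts either a SWHID string or
# an already-parsed CoreSWHID and returns the bare 40-character hex object id
# (the revision SWHID reuses one from test_get_snapshot above).
from swh.model.swhids import CoreSWHID

s = "swh:1:rev:83c20a6a63a7ebc1a549d367bc07a61b926cecf3"
assert _get_object_id_hex(s) == "83c20a6a63a7ebc1a549d367bc07a61b926cecf3"
assert _get_object_id_hex(CoreSWHID.from_string(s)) == _get_object_id_hex(s)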
def convert(self, value, param, ctx):
    from swh.model.exceptions import ValidationError
    from swh.model.swhids import CoreSWHID

    try:
        return CoreSWHID.from_string(value)
    except ValidationError:
        self.fail(f"expected core SWHID, got {value!r}", param, ctx)
def test_snapshot_two_double_fork_merge(self, git_loader, cook_extract_snapshot):
    (loader, main_rev_id) = self.load_repo_two_double_fork_merge(git_loader)

    snp_id = loader.loaded_snapshot_id
    swhid = CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snp_id)

    with cook_extract_snapshot(loader.storage, swhid) as (ert, p):
        self.check_revision_two_double_fork_merge(ert, p, main_rev_id)
        self.check_snapshot_two_double_fork_merge(ert, p, main_rev_id)
def test_file_priority_policy(
    live_server, aiosession, event_loop, source_tree_policy, tmp_requests
):
    open(tmp_requests, "w").close()
    api_url = url_for("index", _external=True)

    nodes_data = MerkleNodeInfo()
    init_merkle_node_info(source_tree_policy, nodes_data, {"known"})
    policy = FilePriority(source_tree_policy, nodes_data)
    client = Client(api_url, aiosession)
    event_loop.run_until_complete(policy.run(client))

    backend_swhids_requests = get_backend_swhids_order(tmp_requests)

    for swhid in backend_swhids_requests[0:4]:
        assert CoreSWHID.from_string(swhid).object_type == ObjectType.CONTENT

    for swhid in backend_swhids_requests[5:]:
        assert CoreSWHID.from_string(swhid).object_type == ObjectType.DIRECTORY
async def get_cached_swhids(self) -> AsyncGenerator[CoreSWHID, None]:
    """Yield all previously cached SWHIDs"""
    # Use the metadata db since it should always contain all accessed SWHIDs
    metadata_cursor = await self.metadata.conn.execute(
        "select swhid from metadata_cache"
    )
    swhids = await metadata_cursor.fetchall()
    for raw_swhid in swhids:
        yield CoreSWHID.from_string(raw_swhid[0])
async def unlink(self, name: str) -> None:
    try:
        if name.endswith(JSON_SUFFIX):
            name = name[: -len(JSON_SUFFIX)]
        swhid = CoreSWHID.from_string(name)
        await self.fuse.cache.metadata.remove(swhid)
        await self.fuse.cache.blob.remove(swhid)
    except ValidationError:
        raise
def test_get_last_visit(web_api_client, web_api_mock):
    visit = web_api_client.last_visit("https://github.com/NixOS/nixpkgs")

    assert visit is not None
    timestamp = parse_date("2021-09-02 20:20:31.231786+00:00")
    assert visit["date"] == timestamp

    snapshot_swhid = "swh:1:snp:6e1fe7858066ff1a6905080ac6503a3a12b84f59"
    assert visit["snapshot"] == CoreSWHID.from_string(snapshot_swhid)