def test_multiple_open_heads(swh_storage, datadir, tmp_path):
    archive_name = "multiple-heads"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)

    loader = HgLoader(
        storage=swh_storage,
        url=repo_url,
    )

    actual_load_status = loader.load()
    assert actual_load_status == {"status": "eventful"}

    assert_last_visit_matches(swh_storage, repo_url, status="full", type="hg")

    snapshot = snapshot_get_latest(swh_storage, repo_url)
    expected_branches = [
        b"HEAD",
        b"branch-heads/default/0",
        b"branch-heads/default/1",
        b"branch-tip/default",
    ]
    assert sorted(snapshot.branches.keys()) == expected_branches

    # Check that we don't load anything the second time
    loader = HgLoader(
        storage=swh_storage,
        url=repo_url,
    )
    actual_load_status = loader.load()
    assert actual_load_status == {"status": "uneventful"}
def _latest_snapshot_revision(
    self,
    origin_url: str,
) -> Optional[Tuple[Snapshot, Revision]]:
    """Look for the latest snapshot of an origin and the revision it targets,
    and return them if any.

    Args:
        origin_url: Origin identifier

    Returns:
        Tuple of the latest Snapshot from the previous visit and its targeted
        revision if any, or None otherwise.

    """
    storage = self.storage
    latest_snapshot = snapshot_get_latest(storage, origin_url)
    if not latest_snapshot:
        return None
    branches = latest_snapshot.branches
    if not branches:
        return None
    branch = branches.get(DEFAULT_BRANCH)
    if not branch:
        return None
    if branch.target_type != TargetType.REVISION:
        return None
    swh_id = branch.target

    revision = storage.revision_get([swh_id])[0]
    if not revision:
        return None
    return latest_snapshot, revision
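# A minimal sketch (not part of the actual loader API) of how an incremental
# loader might consume _latest_snapshot_revision; the method name `prepare` and
# the `_last_snapshot_id`/`_last_root_directory` attributes are hypothetical,
# for illustration only.
def prepare(self, origin_url: str) -> None:
    latest = self._latest_snapshot_revision(origin_url)
    if latest is not None:
        snapshot, revision = latest
        # Resume from the previously archived state rather than reloading
        # the whole history from scratch.
        self._last_snapshot_id = snapshot.id
        self._last_root_directory = revision.directory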
def test_load_repo_check_extids_write_version(swh_storage, datadir, tmp_path):
    """ExtIDs should be stored with a given version when loading is done"""
    archive_name = "hello"
    archive_path = Path(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    hg_strip(repo_url.replace("file://", ""), "tip")

    loader = HgLoader(swh_storage, repo_url)
    assert loader.load() == {"status": "eventful"}

    # Ensure we write ExtIDs to a specific version.
    snapshot = snapshot_get_latest(swh_storage, repo_url)

    # First, filter out revisions from that snapshot
    revision_ids = [
        branch.target
        for branch in snapshot.branches.values()
        if branch.target_type == TargetType.REVISION
    ]
    assert len(revision_ids) > 0

    # Those revisions should have their associated ExtID version set to EXTID_VERSION
    extids = swh_storage.extid_get_from_target(ObjectType.REVISION, revision_ids)
    assert len(extids) == len(revision_ids)
    for extid in extids:
        assert extid.extid_version == EXTID_VERSION
def _partial_copy_storage(
    old_storage, origin_url: str, mechanism: str, copy_revisions: bool
):
    """Create a new storage, and only copy ExtIDs or head revisions to it."""
    new_storage = get_storage(cls="memory")
    snapshot = snapshot_get_latest(old_storage, origin_url)
    assert snapshot
    heads = [branch.target for branch in snapshot.branches.values()]

    if mechanism == "extid":
        extids = old_storage.extid_get_from_target(ObjectType.REVISION, heads)
        new_storage.extid_add(extids)
        if copy_revisions:
            # copy revisions, but erase their metadata to make sure the loader doesn't
            # fallback to revision.metadata["nodeid"]
            revisions = [
                attr.evolve(rev, metadata={})
                for rev in old_storage.revision_get(heads)
                if rev
            ]
            new_storage.revision_add(revisions)
    else:
        assert mechanism == "same storage"
        return old_storage

    # copy origin, visit, status
    new_storage.origin_add(old_storage.origin_get([origin_url]))
    visit = old_storage.origin_visit_get_latest(origin_url)
    new_storage.origin_visit_add([visit])
    statuses = old_storage.origin_visit_status_get(origin_url, visit.visit).results
    new_storage.origin_visit_status_add(statuses)
    new_storage.snapshot_add([snapshot])

    return new_storage
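# A minimal usage sketch, assuming pytest fixtures like the ones above; the
# test name and the expected "uneventful" second load are assumptions about
# the loader's incremental behavior, not assertions from this test module.
@pytest.mark.parametrize("mechanism", ["extid", "same storage"])
def test_second_load_is_uneventful(swh_storage, repo_url, mechanism):
    assert HgLoader(swh_storage, repo_url).load() == {"status": "eventful"}
    # With only ExtIDs copied (or the same storage reused), the loader should
    # recognize already-archived changesets and skip them.
    storage = _partial_copy_storage(
        swh_storage, repo_url, mechanism=mechanism, copy_revisions=True
    )
    assert HgLoader(storage, repo_url).load() == {"status": "uneventful"}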
def _get_origin_dfs_revisions_walker():
    tests_data = get_tests_data()
    storage = tests_data["storage"]
    origin = random.choice(tests_data["origins"][:-1])
    snapshot = snapshot_get_latest(storage, origin["url"])
    if snapshot.branches[b"HEAD"].target_type.value == "alias":
        target = snapshot.branches[b"HEAD"].target
        head = snapshot.branches[target].target
    else:
        head = snapshot.branches[b"HEAD"].target
    return get_revisions_walker("dfs", storage, head)
def test_visit_repository_with_transplant_operations(swh_storage, datadir, tmp_path):
    """Visiting a mercurial repository with transplant operations within should yield a
    snapshot as well.

    """
    archive_name = "transplant"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
    loader = HgLoader(
        swh_storage,
        url=repo_url,
        visit_date=VISIT_DATE,
    )

    # load hg repository
    actual_load_status = loader.load()
    assert actual_load_status == {"status": "eventful"}

    # collect swh revisions
    assert_last_visit_matches(
        loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full"
    )

    revisions = []
    snapshot = snapshot_get_latest(loader.storage, repo_url)
    for branch in snapshot.branches.values():
        if branch.target_type.value != "revision":
            continue
        revisions.append(branch.target)

    # extract original changesets info and the transplant sources
    hg_changesets = set()
    transplant_sources = set()
    for rev in loader.storage.revision_log(revisions):
        extids = list(
            loader.storage.extid_get_from_target(ObjectType.REVISION, [rev["id"]])
        )
        assert len(extids) == 1
        hg_changesets.add(hash_to_hex(extids[0].extid))
        for k, v in rev["extra_headers"]:
            if k == b"transplant_source":
                transplant_sources.add(v.decode("ascii"))

    # check extracted data are valid
    assert len(hg_changesets) > 0
    assert len(transplant_sources) > 0
    assert transplant_sources <= hg_changesets
def origin_with_pull_request_branches():
    """
    Hypothesis strategy returning a random origin with pull request
    branches ingested into the test archive.
    """
    ret = []
    tests_data = get_tests_data()
    storage = tests_data["storage"]
    origins = storage.origin_list(limit=1000)
    for origin in origins.results:
        snapshot = snapshot_get_latest(storage, origin.url)
        if any(b"refs/pull/" in b for b in snapshot.branches):
            ret.append(origin)
    return sampled_from(ret)
def test_snapshot_get_latest_none(swh_storage, sample_data):
    """Retrieving the latest snapshot of an unknown origin, or of an origin
    without a snapshot, should yield no result

    """
    # unknown origin so None
    assert snapshot_get_latest(swh_storage, "unknown-origin") is None

    # no snapshot on origin visit so None
    origin = sample_data.origin
    swh_storage.origin_add([origin])
    origin_visit, origin_visit2 = sample_data.origin_visits[:2]
    assert origin_visit.origin == origin.url

    swh_storage.origin_visit_add([origin_visit])
    assert snapshot_get_latest(swh_storage, origin.url) is None

    ov1 = swh_storage.origin_visit_get_latest(origin.url)
    assert ov1 is not None

    # visit references a snapshot but the snapshot does not exist in backend for some
    # reason
    complete_snapshot = sample_data.snapshots[2]
    swh_storage.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=origin.url,
                visit=ov1.visit,
                date=origin_visit2.date,
                status="partial",
                snapshot=complete_snapshot.id,
            )
        ]
    )
    # so we do not find it
    assert snapshot_get_latest(swh_storage, origin.url) is None
    assert snapshot_get_latest(swh_storage, origin.url, branches_count=1) is None
def origin_with_releases():
    """
    Hypothesis strategy returning a random origin with releases
    ingested into the test archive.
    """
    ret = []
    tests_data = get_tests_data()
    for origin in tests_data["origins"]:
        snapshot = snapshot_get_latest(tests_data["storage"], origin["url"])
        if any(b.target_type.value == "release" for b in snapshot.branches.values()):
            ret.append(origin)
    return sampled_from(ret)
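# A minimal sketch of how Hypothesis strategies like origin_with_releases and
# origin_with_pull_request_branches above are consumed in tests; the test body
# here is a hypothetical illustration, not an actual test from this module.
from hypothesis import given

@given(origin_with_releases())
def test_origin_has_release_branch(origin):
    # The strategy guarantees at least one release branch in the latest snapshot.
    snapshot = snapshot_get_latest(get_tests_data()["storage"], origin["url"])
    assert any(b.target_type.value == "release" for b in snapshot.branches.values())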
def lookup_latest_origin_snapshot(
    origin: str, allowed_statuses: Optional[List[str]] = None
) -> Optional[Dict[str, Any]]:
    """Return information about the latest snapshot of an origin.

    .. warning:: At most 1000 branches contained in the snapshot
        will be returned for performance reasons.

    Args:
        origin: URL of the origin
        allowed_statuses: list of visit statuses considered
            to find the latest snapshot for the visit. For instance,
            ``allowed_statuses=['full']``
            will only consider visits that have successfully run to completion.

    Returns:
        A dict filled with the snapshot content.
    """
    snp = snapshot_get_latest(
        storage, origin, allowed_statuses=allowed_statuses, branches_count=1000
    )
    return converters.from_snapshot(snp.to_dict()) if snp is not None else None
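# A minimal usage sketch, assuming the module-level `storage` object relied on
# by lookup_latest_origin_snapshot is configured, and that the converted dict
# keeps "id" and "branches" keys (an assumption about converters.from_snapshot):
snapshot = lookup_latest_origin_snapshot(
    "https://example.org/repo", allowed_statuses=["full"]
)
if snapshot is not None:
    print(snapshot["id"], len(snapshot["branches"]))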
def _init_tests_data():
    # To hold reference to the memory storage
    storage = get_storage("memory")

    # Create search instance
    search = get_search("memory")
    search.initialize()
    search.origin_update({"url": origin["url"]} for origin in _TEST_ORIGINS)

    # Create indexer storage instance that will be shared by indexers
    idx_storage = get_indexer_storage("memory")

    # Declare a test tool for origin intrinsic metadata tests
    idx_tool = idx_storage.indexer_configuration_add([INDEXER_TOOL])[0]
    INDEXER_TOOL["id"] = idx_tool["id"]

    # Load git repositories from archives
    for origin in _TEST_ORIGINS:
        for i, archive_ in enumerate(origin["archives"]):
            if i > 0:
                # ensure visit dates will be different when simulating
                # multiple visits of an origin
                time.sleep(1)
            origin_repo_archive = os.path.join(
                os.path.dirname(__file__), "resources/repos/%s" % archive_
            )
            loader = GitLoaderFromArchive(
                storage,
                origin["url"],
                archive_path=origin_repo_archive,
            )

            result = loader.load()
            assert result["status"] == "eventful"

        ori = storage.origin_get([origin["url"]])[0]
        origin.update(ori.to_dict())  # add an 'id' key if enabled
        search.origin_update(
            [{"url": origin["url"], "has_visits": True, "visit_types": ["git"]}]
        )

    for i in range(250):
        _add_origin(
            storage, search, origin_url=f"https://many.origins/{i+1}", visit_type="tar"
        )

    sha1s: Set[Sha1] = set()
    directories = set()
    revisions = set()
    releases = set()
    snapshots = set()
    content_path = {}

    # Get all objects loaded into the test archive
    common_metadata = {ORIGIN_METADATA_KEY: ORIGIN_METADATA_VALUE}
    for origin in _TEST_ORIGINS:
        snp = snapshot_get_latest(storage, origin["url"])
        snapshots.add(hash_to_hex(snp.id))
        for branch_name, branch_data in snp.branches.items():
            target_type = branch_data.target_type.value
            if target_type == "revision":
                revisions.add(branch_data.target)
                if b"master" in branch_name:
                    # Add some origin intrinsic metadata for tests
                    metadata = common_metadata
                    metadata.update(origin.get("metadata", {}))
                    origin_metadata = OriginIntrinsicMetadataRow(
                        id=origin["url"],
                        from_revision=branch_data.target,
                        indexer_configuration_id=idx_tool["id"],
                        metadata=metadata,
                        mappings=[],
                    )
                    idx_storage.origin_intrinsic_metadata_add([origin_metadata])
                    search.origin_update(
                        [{"url": origin["url"], "intrinsic_metadata": metadata}]
                    )
                    ORIGIN_MASTER_REVISION[origin["url"]] = hash_to_hex(
                        branch_data.target
                    )
            elif target_type == "release":
                release = storage.release_get([branch_data.target])[0]
                revisions.add(release.target)
                releases.add(hash_to_hex(branch_data.target))

    for rev_log in storage.revision_shortlog(set(revisions)):
        rev_id = rev_log[0]
        revisions.add(rev_id)

    for rev in storage.revision_get(revisions):
        if rev is None:
            continue
        dir_id = rev.directory
        directories.add(hash_to_hex(dir_id))
        for entry in dir_iterator(storage, dir_id):
            if entry["type"] == "file":
                sha1s.add(entry["sha1"])
                content_path[entry["sha1"]] = "/".join(
                    [hash_to_hex(dir_id), entry["path"].decode("utf-8")]
                )
            elif entry["type"] == "dir":
                directories.add(hash_to_hex(entry["target"]))

    _add_extra_contents(storage, sha1s)

    # Get all checksums for each content
    result: List[Optional[Content]] = storage.content_get(list(sha1s))

    contents: List[Dict] = []
    for content in result:
        assert content is not None
        sha1 = hash_to_hex(content.sha1)
        content_metadata = {
            algo: hash_to_hex(getattr(content, algo)) for algo in DEFAULT_ALGORITHMS
        }

        path = ""
        if content.sha1 in content_path:
            path = content_path[content.sha1]

        cnt_data = storage.content_get_data(content.sha1)
        assert cnt_data is not None
        mimetype, encoding = get_mimetype_and_encoding_for_content(cnt_data)
        _, _, cnt_data = _re_encode_content(mimetype, encoding, cnt_data)
        content_display_data = prepare_content_for_display(cnt_data, mimetype, path)

        content_metadata.update(
            {
                "path": path,
                "mimetype": mimetype,
                "encoding": encoding,
                "hljs_language": content_display_data["language"],
                "data": content_display_data["content_data"],
            }
        )
        _contents[sha1] = content_metadata
        contents.append(content_metadata)

    # Add the empty directory to the test archive
    storage.directory_add([Directory(entries=())])

    # Add empty content to the test archive
    storage.content_add([Content.from_data(data=b"")])

    # Add fake git origin with pull request branches
    _add_origin(
        storage,
        search,
        origin_url="https://git.example.org/project",
        snapshot_branches={
            b"refs/heads/master": {
                "target_type": "revision",
                "target": next(iter(revisions)),
            },
            **{
                f"refs/pull/{i}".encode(): {
                    "target_type": "revision",
                    "target": next(iter(revisions)),
                }
                for i in range(300)
            },
        },
    )

    # Return tests data
    return {
        "search": search,
        "storage": storage,
        "idx_storage": idx_storage,
        "origins": _TEST_ORIGINS,
        "contents": contents,
        "directories": list(directories),
        "releases": list(releases),
        "revisions": list(map(hash_to_hex, revisions)),
        "snapshots": list(snapshots),
        "generated_checksums": set(),
    }
def snapshot_get_latest(self, origin_url):
    snp = snapshot_get_latest(self.storage, origin_url)
    # Guard against origins without a snapshot, which would otherwise raise an
    # AttributeError on snp.to_dict()
    return converters.from_snapshot(snp.to_dict()) if snp is not None else None
def get_full_snapshot(self, origin_url) -> Optional[Snapshot]:
    return snapshot_get_latest(self.storage, origin_url)
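# The two helpers above differ only in their return type: the snapshot_get_latest
# method converts the snapshot to a plain dict suitable for API responses, while
# get_full_snapshot returns the raw model object. A minimal usage sketch, where
# `archive` is a hypothetical instance exposing both methods:
snp_dict = archive.snapshot_get_latest("https://example.org/repo")  # dict or None
snp_model = archive.get_full_snapshot("https://example.org/repo")  # Snapshot or None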
def test_load_upgrade_from_revision_extids(caplog):
    """Tests that, when loading incrementally based on a snapshot made by an old
    version of the loader, the loader will convert revisions to releases
    and add them to the storage.

    Also checks that, if an extid exists pointing to a non-existent revision
    (which should never happen, but you never know...), the release is loaded from
    scratch."""

    storage = get_storage("memory")

    origin = "http://example.org"
    dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"d" * 20)
    dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20)

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc)
    )
    person = Person.from_fullname(b"Jane Doe <*****@*****.**>")

    rev1 = Revision(
        message=b"blah",
        author=person,
        date=date,
        committer=person,
        committer_date=date,
        directory=dir1_swhid.object_id,
        type=RevisionType.TAR,
        synthetic=True,
    )
    rel1 = Release(
        name=b"v1.0",
        message=b"blah\n",
        author=person,
        date=date,
        target=dir1_swhid.object_id,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
    )

    rev1_swhid = rev1.swhid()
    rel1_swhid = rel1.swhid()
    rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=b"b" * 20)
    rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

    # Results of a previous load
    storage.extid_add(
        [
            ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0),
            ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0),
        ]
    )
    storage.revision_add([rev1])

    last_snapshot = Snapshot(
        branches={
            b"v1.0": SnapshotBranch(
                target_type=TargetType.REVISION, target=rev1_swhid.object_id
            ),
            b"v2.0": SnapshotBranch(
                target_type=TargetType.REVISION, target=rev2_swhid.object_id
            ),
        }
    )
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add(
        [OriginVisit(origin="http://example.org", visit=1, date=date, type="tar")]
    )
    storage.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=origin,
                visit=1,
                status="full",
                date=date,
                snapshot=last_snapshot.id,
            )
        ]
    )

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel2_swhid.object_id, dir2_swhid.object_id),
        autospec=True,
    ).start()
    patch.object(
        loader,
        "get_versions",
        return_value=["v1.0", "v2.0", "v3.0"],
        autospec=True,
    ).start()

    caplog.set_level(logging.ERROR)

    loader.load()

    assert len(caplog.records) == 1
    (record,) = caplog.records
    assert record.levelname == "ERROR"
    assert "Failed to upgrade branch branch-v2.0" in record.message

    assert loader._load_release.mock_calls == [
        # v1.0: not loaded because there is already a revision matching it
        # v2.0: loaded, as the revision is missing from the storage even though there
        # is an extid
        call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"), Origin(url=origin)),
        # v3.0: loaded (did not exist yet)
        call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"), Origin(url=origin)),
    ]

    snapshot = Snapshot(
        branches={
            b"branch-v1.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel1_swhid.object_id
            ),
            b"branch-v2.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel2_swhid.object_id
            ),
            b"branch-v3.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel2_swhid.object_id
            ),
        }
    )
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
        ],
    )
    assert set(extids) == {
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid),
    }
def test_load_extids() -> None:
    """Checks PackageLoader.load() skips iff it should, and writes (only)
    the new ExtIDs"""
    storage = get_storage("memory")

    dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY, object_id=b"e" * 20)

    rels = [
        Release(
            name=f"v{i}.0".encode(),
            message=b"blah\n",
            target=dir_swhid.object_id,
            target_type=ModelObjectType.DIRECTORY,
            synthetic=True,
        )
        for i in (1, 2, 3, 4)
    ]
    storage.release_add(rels[0:3])

    origin = "http://example.org"
    rel1_swhid = rels[0].swhid()
    rel2_swhid = rels[1].swhid()
    rel3_swhid = rels[2].swhid()
    rel4_swhid = rels[3].swhid()

    # Results of a previous load
    storage.extid_add(
        [
            ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
            ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
        ]
    )
    last_snapshot = Snapshot(
        branches={
            b"v1.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel1_swhid.object_id
            ),
            b"v2.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel2_swhid.object_id
            ),
            b"v3.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel3_swhid.object_id
            ),
        }
    )
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add(
        [OriginVisit(origin="http://example.org", visit=1, date=date, type="tar")]
    )
    storage.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=origin,
                visit=1,
                status="full",
                date=date,
                snapshot=last_snapshot.id,
            )
        ]
    )

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel4_swhid.object_id, dir_swhid.object_id),
        autospec=True,
    ).start()

    loader.load()

    assert loader._load_release.mock_calls == [  # type: ignore
        # v1.0: not loaded because there is already its (extid_type, extid, rel)
        # in the storage.
        # v2.0: loaded, because there is already a similar extid, but different type
        call(
            StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
            Origin(url=origin),
        ),
        # v3.0: loaded despite having an (extid_type, extid) in storage, because
        # the target of the extid is not in the previous snapshot
        call(
            StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
            Origin(url=origin),
        ),
        # v4.0: loaded, because there isn't its extid
        call(
            StubPackageInfo(origin, "example-v4.0.tar", "v4.0"),
            Origin(url=origin),
        ),
    ]

    # then check the snapshot has all the branches.
    # versions 2.0 to 4.0 all point to rel4_swhid (instead of the value of the last
    # snapshot), because they had to be loaded (mismatched extid), and the mocked
    # _load_release always returns rel4_swhid.
    snapshot = Snapshot(
        branches={
            b"branch-v1.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel1_swhid.object_id
            ),
            b"branch-v2.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel4_swhid.object_id
            ),
            b"branch-v3.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel4_swhid.object_id
            ),
            b"branch-v4.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel4_swhid.object_id
            ),
        }
    )
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
            rel3_swhid.object_id,
            rel4_swhid.object_id,
        ],
    )
    assert set(extids) == {
        # What we inserted at the beginning of the test:
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
        # Added by the loader:
        ExtID("extid-type1", b"extid-of-v2.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v4.0", rel4_swhid),
    }
def test_snapshot_get_latest(swh_storage, sample_data):
    origin = sample_data.origin
    swh_storage.origin_add([origin])

    visit1, visit2 = sample_data.origin_visits[:2]
    assert visit1.origin == origin.url

    swh_storage.origin_visit_add([visit1])
    ov1 = swh_storage.origin_visit_get_latest(origin.url)

    # Add snapshot to visit1, latest snapshot = visit 1 snapshot
    complete_snapshot = sample_data.snapshots[2]
    swh_storage.snapshot_add([complete_snapshot])

    swh_storage.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=origin.url,
                visit=ov1.visit,
                date=visit2.date,
                status="partial",
                snapshot=None,
            )
        ]
    )
    assert visit1.date < visit2.date

    # no snapshot associated to the visit, so None
    actual_snapshot = snapshot_get_latest(
        swh_storage, origin.url, allowed_statuses=["partial"]
    )
    assert actual_snapshot is None

    date_now = now()
    assert visit2.date < date_now
    swh_storage.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=ov1.origin,
                visit=ov1.visit,
                date=date_now,
                type=ov1.type,
                status="full",
                snapshot=complete_snapshot.id,
            )
        ]
    )

    swh_storage.origin_visit_add(
        [
            OriginVisit(
                origin=origin.url,
                date=now(),
                type=visit1.type,
            )
        ]
    )

    actual_snapshot = snapshot_get_latest(swh_storage, origin.url)
    assert actual_snapshot is not None
    assert actual_snapshot == complete_snapshot

    actual_snapshot = snapshot_get_latest(swh_storage, origin.url, branches_count=1)
    assert actual_snapshot is not None
    assert actual_snapshot.id == complete_snapshot.id
    assert len(actual_snapshot.branches.values()) == 1

    with pytest.raises(ValueError, match="branches_count must be a positive integer"):
        snapshot_get_latest(swh_storage, origin.url, branches_count="something-wrong")