def load_repo_null_fields(self, git_loader):
    # Our schema doesn't enforce a lot of non-null revision fields. We need
    # to check these cases don't break the cooker.
    repo = TestRepo()
    with repo as rp:
        (rp / "file").write_text(TEST_CONTENT)
        c = repo.commit("initial commit")
        loader = git_loader(str(rp))
        loader.load()
        repo.repo.refs[b"HEAD"].decode()
        dir_id_hex = repo.repo[c].tree.decode()
        dir_id = hashutil.hash_to_bytes(dir_id_hex)

    test_revision = Revision(
        message=b"",
        author=Person(name=None, email=None, fullname=b""),
        date=None,
        committer=Person(name=None, email=None, fullname=b""),
        committer_date=None,
        parents=(),
        type=RevisionType.GIT,
        directory=dir_id,
        metadata={},
        synthetic=True,
    )

    storage = loader.storage
    storage.revision_add([test_revision])
    return (loader, test_revision.swhid())
@contextlib.contextmanager
def cook_extract_directory_gitfast(storage, swhid, fsck=True):
    """Context manager that cooks a revision containing a directory and
    extracts it, using RevisionGitfastCooker"""
    test_repo = TestRepo()
    with test_repo as p:
        date = TimestampWithTimezone.from_datetime(
            datetime.datetime.now(datetime.timezone.utc)
        )
        revision = Revision(
            directory=swhid.object_id,
            message=b"dummy message",
            author=Person.from_fullname(b"someone"),
            committer=Person.from_fullname(b"someone"),
            date=date,
            committer_date=date,
            type=RevisionType.GIT,
            synthetic=False,
        )
        storage.revision_add([revision])

    with cook_stream_revision_gitfast(
        storage, revision.swhid()
    ) as stream, test_repo as p:
        processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
        processor.import_stream(stream)
        test_repo.checkout(b"HEAD")
        shutil.rmtree(p / ".git")
        yield p
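# Hedged usage sketch (not from the original test suite): it assumes a
# `swh_storage` fixture holding the cooked directory and a hypothetical
# `directory_swhid` fixture providing the CoreSWHID of that directory.
def test_cook_extract_directory_gitfast_sketch(swh_storage, directory_swhid):
    with cook_extract_directory_gitfast(swh_storage, directory_swhid) as p:
        # p is the extracted working tree; the temporary .git dir was removed
        assert p.is_dir()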
def test_revision_identifier(self):
    self.assertEqual(
        Revision.from_dict(self.revision).id,
        self.revision["id"],
    )
    self.assertEqual(
        Revision.from_dict(remove_id(self.revision)).id,
        self.revision["id"],
    )
def test_revision_submodule(
    self, swh_storage, cook_extract_revision, ingest_target_revision
):
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)
    )

    target_rev = Revision(
        message=b"target_rev",
        author=Person.from_fullname(b"me <*****@*****.**>"),
        date=date,
        committer=Person.from_fullname(b"me <*****@*****.**>"),
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=bytes.fromhex("3333333333333333333333333333333333333333"),
        metadata={},
        synthetic=True,
    )
    if ingest_target_revision:
        swh_storage.revision_add([target_rev])

    dir = Directory(
        entries=(
            DirectoryEntry(
                name=b"submodule",
                type="rev",
                target=target_rev.id,
                perms=0o160000,
            ),
        ),
    )
    swh_storage.directory_add([dir])

    rev = Revision(
        message=b"msg",
        author=Person.from_fullname(b"me <*****@*****.**>"),
        date=date,
        committer=Person.from_fullname(b"me <*****@*****.**>"),
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=dir.id,
        metadata={},
        synthetic=True,
    )
    swh_storage.revision_add([rev])

    with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p):
        ert.checkout(b"HEAD")
        pattern = b"160000 submodule\x00%s" % target_rev.id
        tree = ert.repo[b"HEAD"].tree
        assert pattern in ert.repo[tree].as_raw_string()
def custom_deserializer(object_type, msg):
    assert object_type == "revision"
    obj = kafka_to_value(msg)
    # filter out the first revision
    if obj["id"] == revisions[0].id:
        return None
    return Revision.from_dict(obj)
def test_lookup_directory_with_revision_unknown_content(archive_data, new_revision):
    unknown_content_ = random_content()

    dir_path = "README.md"

    # A directory that points to unknown content
    dir = Directory(
        entries=(
            DirectoryEntry(
                name=bytes(dir_path.encode("utf-8")),
                type="file",
                target=hash_to_bytes(unknown_content_["sha1_git"]),
                perms=DentryPerms.content,
            ),
        )
    )

    # Create a revision that points to a directory
    # which points to unknown content
    new_revision = new_revision.to_dict()
    new_revision["directory"] = dir.id
    del new_revision["id"]
    new_revision = Revision.from_dict(new_revision)

    # Add the directory and revision in mem
    archive_data.directory_add([dir])
    archive_data.revision_add([new_revision])
    new_revision_id = hash_to_hex(new_revision.id)
    with pytest.raises(NotFoundExc) as e:
        archive.lookup_directory_with_revision(new_revision_id, dir_path)
    assert e.match("Content not found for revision %s" % new_revision_id)
def test_revision_metadata_indexer_single_root_dir(self):
    metadata_indexer = RevisionMetadataIndexer(config=REVISION_METADATA_CONFIG)
    fill_obj_storage(metadata_indexer.objstorage)
    fill_storage(metadata_indexer.storage)

    # Add a parent directory, that is the only directory at the root
    # of the revision
    rev = REVISION
    assert rev.directory == DIRECTORY2.id

    directory = Directory(
        entries=(
            DirectoryEntry(
                name=b"foobar-1.0.0",
                type="dir",
                target=rev.directory,
                perms=16384,
            ),
        ),
    )
    assert directory.id is not None
    metadata_indexer.storage.directory_add([directory])

    new_rev_dict = {**rev.to_dict(), "directory": directory.id}
    new_rev_dict.pop("id")
    new_rev = Revision.from_dict(new_rev_dict)
    metadata_indexer.storage.revision_add([new_rev])

    tool = metadata_indexer.idx_storage.indexer_configuration_get(
        {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()}
    )
    assert tool is not None

    metadata_indexer.idx_storage.content_metadata_add(
        [
            ContentMetadataRow(
                id=DIRECTORY2.entries[0].target,
                indexer_configuration_id=tool["id"],
                metadata=YARN_PARSER_METADATA,
            )
        ]
    )

    metadata_indexer.run([new_rev.id])

    results = list(
        metadata_indexer.idx_storage.revision_intrinsic_metadata_get([new_rev.id])
    )

    expected_results = [
        RevisionIntrinsicMetadataRow(
            id=new_rev.id,
            tool=TRANSLATOR_TOOL,
            metadata=YARN_PARSER_METADATA,
            mappings=["npm"],
        )
    ]

    for result in results:
        del result.tool["id"]

    # then
    self.assertEqual(results, expected_results)
def test_revision_metadata_display(archive_data, client, directory, person, date):
    metadata = {"foo": "bar"}
    revision = Revision(
        directory=hash_to_bytes(directory),
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
        metadata=metadata,
    )
    archive_data.revision_add([revision])

    url = reverse("browse-revision", url_args={"sha1_git": hash_to_hex(revision.id)})

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/revision.html"
    )
    assert_contains(resp, "swh-metadata-popover")
    assert_contains(resp, escape(json.dumps(metadata, indent=4)))
def test_lookup_revision_invalid_msg(archive_data, new_revision):
    new_revision = new_revision.to_dict()
    new_revision["message"] = b"elegant fix for bug \xff"
    archive_data.revision_add([Revision.from_dict(new_revision)])

    revision = archive.lookup_revision(hash_to_hex(new_revision["id"]))
    assert revision["message"] == "elegant fix for bug \\xff"
    assert revision["decoding_failures"] == ["message"]
def test_db_to_revision():
    # when
    actual_revision = converters.db_to_revision(
        {
            "id": b"revision-id",
            "date": None,
            "date_offset": None,
            "date_neg_utc_offset": None,
            "date_offset_bytes": None,
            "committer_date": None,
            "committer_date_offset": None,
            "committer_date_neg_utc_offset": None,
            "committer_date_offset_bytes": None,
            "type": "git",
            "directory": b"dir-sha1",
            "message": b"commit message",
            "author_fullname": b"auth-name <auth-email>",
            "author_name": b"auth-name",
            "author_email": b"auth-email",
            "committer_fullname": b"comm-name <comm-email>",
            "committer_name": b"comm-name",
            "committer_email": b"comm-email",
            "metadata": {},
            "synthetic": False,
            "extra_headers": (),
            "raw_manifest": None,
            "parents": [b"123", b"456"],
        }
    )

    # then
    assert actual_revision == Revision(
        id=b"revision-id",
        author=Person(
            fullname=b"auth-name <auth-email>",
            name=b"auth-name",
            email=b"auth-email",
        ),
        date=None,
        committer=Person(
            fullname=b"comm-name <comm-email>",
            name=b"comm-name",
            email=b"comm-email",
        ),
        committer_date=None,
        type=RevisionType.GIT,
        directory=b"dir-sha1",
        message=b"commit message",
        metadata={},
        synthetic=False,
        extra_headers=(),
        parents=(b"123", b"456"),
    )
def push_revision_subgraph(self, obj_id: Sha1Git) -> None:
    """Fetches the graph of revisions induced by the given ``obj_id`` and adds
    them to ``self._rev_stack``.

    If swh-graph is not available, this requires fetching the revisions
    themselves, so they are directly loaded instead."""
    loaded_from_graph = False

    if self.graph:
        from swh.graph.client import GraphArgumentException

        # First, try to cook using swh-graph, as it is more efficient than
        # swh-storage for querying the history
        obj_swhid = CoreSWHID(
            object_type=ObjectType.REVISION,
            object_id=obj_id,
        )
        try:
            revision_ids = (
                swhid.object_id
                for swhid in map(
                    CoreSWHID.from_string,
                    self.graph.visit_nodes(str(obj_swhid), edges="rev:rev"),
                )
            )
            self._push(self._rev_stack, revision_ids)
        except GraphArgumentException as e:
            logger.info(
                "Revision %s not found in swh-graph, falling back to fetching "
                "history using swh-storage. %s",
                hash_to_hex(obj_id),
                e.args[0],
            )
        else:
            loaded_from_graph = True

    if not loaded_from_graph:
        # If swh-graph is not available, or the revision is not yet in
        # swh-graph, fall back to self.storage.revision_log.
        # self.storage.revision_log also gives us the full revisions,
        # so we load them right now instead of just pushing them on the stack.
        walker = DFSRevisionsWalker(
            self.storage, obj_id, state=self._walker_state, ignore_displayname=True
        )
        for revision in walker:
            self.write_revision_node(Revision.from_dict(revision))
            self.nb_loaded += 1
            self._push(self._dir_stack, [revision["directory"]])
        # Save the state, so the next call to the walker won't return the same
        # revisions
        self._walker_state = walker.export_state()
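# Illustrative sketch, not part of the cooker class: the same "swh-graph first,
# swh-storage fallback" pattern as push_revision_subgraph above, written as a
# standalone generator. `graph_client` (a swh.graph RemoteGraphClient) and
# `storage` (an swh-storage interface) are assumed to be provided by the caller.
from swh.graph.client import GraphArgumentException
from swh.model.swhids import CoreSWHID, ObjectType


def iter_history_revision_ids(graph_client, storage, rev_id: bytes):
    """Yield ids of revisions reachable from ``rev_id`` through rev->rev edges."""
    swhid = CoreSWHID(object_type=ObjectType.REVISION, object_id=rev_id)
    try:
        # swh-graph only returns SWHIDs, which is cheap but leaves fetching
        # the revision objects themselves to the caller
        for node in graph_client.visit_nodes(str(swhid), edges="rev:rev"):
            yield CoreSWHID.from_string(node).object_id
    except GraphArgumentException:
        # the revision is not (yet) in swh-graph: walk the history with
        # swh-storage instead, which yields full revision dicts
        for revision in storage.revision_log([rev_id]):
            if revision is not None:
                yield revision["id"]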
def revision_from_db(
    db_revision: RevisionRow, parents: Tuple[Sha1Git, ...]
) -> Revision:
    revision = db_revision.to_dict()
    metadata = json.loads(revision.pop("metadata", None))
    extra_headers = revision.pop("extra_headers", ())
    if not extra_headers and metadata and "extra_headers" in metadata:
        extra_headers = metadata.pop("extra_headers")
    if extra_headers is None:
        extra_headers = ()
    return Revision(
        parents=parents,
        type=RevisionType(revision.pop("type")),
        metadata=metadata,
        extra_headers=extra_headers,
        **revision,
    )
@composite
def new_revision(draw):
    """
    Hypothesis strategy returning random raw swh revision data
    not ingested into the test archive.
    """
    return Revision(
        directory=draw(sha1().map(hash_to_bytes)),
        author=draw(new_person()),
        committer=draw(new_person()),
        message=draw(text(min_size=20, max_size=100).map(lambda t: t.encode())),
        date=TimestampWithTimezone.from_datetime(draw(new_swh_date())),
        committer_date=TimestampWithTimezone.from_datetime(draw(new_swh_date())),
        synthetic=False,
        type=RevisionType.GIT,
    )
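# Hedged usage sketch for the strategy above, assuming it is registered with
# Hypothesis' @composite decorator (as shown) so that new_revision() returns
# a strategy.
from hypothesis import given

from swh.model.model import Revision


@given(new_revision())
def test_new_revision_strategy_sketch(revision):
    # every drawn value is a fully-formed swh.model Revision with a computed id
    assert isinstance(revision, Revision)
    assert len(revision.id) == 20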
def test_client_stop_after_objects(
    kafka_prefix: str, kafka_consumer_group: str, kafka_server: str, count: int
):
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    # Fill Kafka
    revisions = cast(List[Revision], TEST_OBJECTS["revision"])
    for rev in revisions:
        producer.produce(
            topic=kafka_prefix + ".revision",
            key=rev.id,
            value=value_to_kafka(rev.to_dict()),
        )
    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=False,
        stop_after_objects=count,
    )
    worker_fn = MagicMock()
    client.process(worker_fn)

    # this code below is not pretty, but needed since we have to deal with
    # dicts (so no set) which can have values that are list vs tuple, and we do
    # not know for sure how many calls of the worker_fn will happen during the
    # consumption of the topic...
    worker_fn.assert_called()
    revs = []  # list of (unique) rev dicts we got from the client
    for call in worker_fn.call_args_list:
        callrevs = call[0][0]["revision"]
        for rev in callrevs:
            assert Revision.from_dict(rev) in revisions
            if rev not in revs:
                revs.append(rev)
    assert len(revs) == count
def identify_revision(
    hg: Hg,
    rev: Optional[bytes] = None,
    node_id_2_swhid: Optional[Dict[bytes, CoreSWHID]] = None,
) -> Iterator[RevisionIdentity]:
    """Return the repository revision identities.

    Args:
        hg: A `Hg` repository instance
        rev: An optional revision or Mercurial revsets (See `hg help revsets`)
            If not provided all the repository revisions will be computed.
        node_id_2_swhid: An optional cache mapping hg node ids to SWHIDs
            It will be updated in place with new mappings.
    """
    from swh.model.model import Revision

    if node_id_2_swhid is None:
        node_id_2_swhid = {}

    for revision in hg.log(rev):
        data = revision.to_dict()

        hg.up(revision.node_id)
        directory_swhid = identify_directory(hg.root())
        data["directory"] = directory_swhid.object_id

        parents = []
        for parent in data["parents"]:
            if parent not in node_id_2_swhid:
                parent_revision = next(identify_revision(hg, parent, node_id_2_swhid))
                node_id_2_swhid[parent] = parent_revision.swhid
            assert node_id_2_swhid[parent].object_type == ObjectType.REVISION
            parents.append(node_id_2_swhid[parent].object_id)
        data["parents"] = parents

        revision_swhid = Revision.from_dict(data).swhid()
        node_id_2_swhid[revision.node_id] = revision_swhid

        yield RevisionIdentity(
            swhid=revision_swhid,
            node_id=revision.node_id,
            directory_swhid=directory_swhid,
        )
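# Hedged usage sketch for identify_revision: the `Hg(repo_path)` constructor
# call and the printed layout are assumptions, not part of the documented API.
from typing import Dict

from swh.model.swhids import CoreSWHID


def print_revision_identities(repo_path: str) -> None:
    hg = Hg(repo_path)  # assumed wrapper around a local Mercurial repository
    cache: Dict[bytes, CoreSWHID] = {}
    for identity in identify_revision(hg, node_id_2_swhid=cache):
        # each RevisionIdentity pairs a Mercurial node id with the SWHIDs of
        # the revision and of its root directory
        print(identity.node_id, identity.swhid, identity.directory_swhid)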
def _make_stub_directory_revision(self, dir_id: Sha1Git) -> Sha1Git:
    author = Person.from_fullname(b"swh-vault, git-bare cooker <*****@*****.**>")
    dt = datetime.datetime.now(tz=datetime.timezone.utc)
    dt = dt.replace(microsecond=0)  # not supported by git
    date = TimestampWithTimezone.from_datetime(dt)

    revision = Revision(
        author=author,
        committer=author,
        date=date,
        committer_date=date,
        message=b"Initial commit",
        type=RevisionType.GIT,
        directory=self.obj_id,
        synthetic=True,
    )
    self.write_revision_node(revision)

    return revision.id
def test_api_revision_directory_ok_returns_revision(
    api_client, archive_data, revision, person, date
):
    rev_path = "foo"

    _dir = Directory(
        entries=(
            DirectoryEntry(
                name=rev_path.encode(),
                type="rev",
                target=hash_to_bytes(revision),
                perms=DentryPerms.revision,
            ),
        )
    )
    archive_data.directory_add([_dir])

    rev = Revision(
        directory=_dir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([rev])

    revision_id = hash_to_hex(rev.id)
    rev_data = archive_data.revision_get(revision)
    url = reverse(
        "api-1-revision-directory",
        {"sha1_git": revision_id, "dir_path": rev_path},
    )
    rv = check_api_get_responses(api_client, url, status_code=200)
    assert rv.data == {
        "content": enrich_revision(rev_data, request=rv.wsgi_request),
        "path": rev_path,
        "type": "rev",
        "revision": revision_id,
    }
def build_swh_revision(
    rev: int,
    commit: Dict,
    repo_uuid: bytes,
    dir_id: bytes,
    parents: Sequence[bytes],
) -> Revision:
    """Given a svn revision, build a swh revision.

    This adds an 'extra-headers' entry with the repository's uuid and the
    svn revision.

    Args:
        rev: the svn revision number
        commit: the commit data: revision id, date, author, and message
        repo_uuid: The repository's uuid
        dir_id: the tree's hash identifier
        parents: the revision's parents identifiers

    Returns:
        The swh revision model object.
    """
    author = commit["author_name"]
    msg = commit["message"]
    date = commit["author_date"]

    extra_headers: Tuple[Tuple[bytes, bytes], ...] = (
        (b"svn_repo_uuid", repo_uuid),
        (b"svn_revision", str(rev).encode()),
    )

    return Revision(
        type=RevisionType.SUBVERSION,
        date=date,
        committer_date=date,
        directory=dir_id,
        message=msg,
        author=author,
        committer=author,
        synthetic=True,
        extra_headers=extra_headers,
        parents=tuple(parents),
    )
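# Hedged usage sketch for build_swh_revision: the commit dict keys follow the
# docstring above, and all values (author, uuid, directory id) are placeholders
# built with swh.model helpers, not real repository data.
import datetime

from swh.model.model import Person, RevisionType, TimestampWithTimezone

example_commit = {
    "author_name": Person.from_fullname(b"Jane Doe <jane@example.org>"),
    "author_date": TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc)
    ),
    "message": b"Import of an example svn revision",
}
example_revision = build_swh_revision(
    rev=42,
    commit=example_commit,
    repo_uuid=b"3187e211-bb14-4c82-9596-0b59d67cd7f4",  # placeholder uuid
    dir_id=bytes.fromhex("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),  # empty tree
    parents=[],
)
assert example_revision.type == RevisionType.SUBVERSION
assert (b"svn_revision", b"42") in example_revision.extra_headers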