def cook_extract_directory_gitfast(storage, swhid, fsck=True):
    """Cook a revision wrapping a directory as a git fast-export stream and
    extract it into a working tree.

    A throwaway revision whose directory is ``swhid.object_id`` is built and
    added to ``storage``. That revision is then cooked through
    ``cook_stream_revision_gitfast`` and the resulting stream is imported into
    a ``TestRepo``, which is checked out at ``HEAD`` and stripped of its
    ``.git`` directory before its path is yielded.

    Args:
        storage: object exposing ``revision_add``; also passed to the cooker.
        swhid: identifier whose ``object_id`` becomes the revision's directory.
        fsck: not read anywhere in this function body.

    Yields:
        The path obtained by entering ``test_repo``, once ``.git`` is removed.

    NOTE(review): this is a generator; presumably it is wrapped with
    ``contextlib.contextmanager`` outside of this view -- confirm.
    """
    test_repo = TestRepo()
    # First pass through the repo context: only used while the dummy revision
    # is created and registered in the storage.
    with test_repo as p:
        date = TimestampWithTimezone.from_datetime(
            datetime.datetime.now(datetime.timezone.utc))
        revision = Revision(
            directory=swhid.object_id,
            message=b"dummy message",
            author=Person.from_fullname(b"someone"),
            committer=Person.from_fullname(b"someone"),
            date=date,
            committer_date=date,
            type=RevisionType.GIT,
            synthetic=False,
        )
        storage.revision_add([revision])

    # Second pass: the repo context is entered again, together with the
    # cooked stream, and stays open while the caller consumes the yield.
    with cook_stream_revision_gitfast(
            storage, revision.swhid()) as stream, test_repo as p:
        processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
        processor.import_stream(stream)
        test_repo.checkout(b"HEAD")
        # Drop the git metadata so only the checked-out files remain.
        shutil.rmtree(p / ".git")
        yield p
def load_repo_null_fields(self, git_loader):
    """Load a one-commit repository and add a revision with empty fields.

    The schema leaves many revision fields nullable; this builds such a
    revision (no dates, empty author/committer) on top of the tree of a real
    commit so callers can check that the cooker copes with it.

    Returns:
        A ``(loader, swhid)`` pair for the loader used and the added revision.
    """
    test_repo = TestRepo()
    with test_repo as repo_path:
        target_file = repo_path / "file"
        target_file.write_text(TEST_CONTENT)
        commit_id = test_repo.commit("initial commit")

        loader = git_loader(str(repo_path))
        loader.load()

        test_repo.repo.refs[b"HEAD"].decode()
        tree_hex = test_repo.repo[commit_id].tree.decode()
        dir_id = hashutil.hash_to_bytes(tree_hex)

    revision_fields = dict(
        message=b"",
        author=Person(name=None, email=None, fullname=b""),
        date=None,
        committer=Person(name=None, email=None, fullname=b""),
        committer_date=None,
        parents=(),
        type=RevisionType.GIT,
        directory=dir_id,
        metadata={},
        synthetic=True,
    )
    null_revision = Revision(**revision_fields)

    loader.storage.revision_add([null_revision])
    return loader, null_revision.swhid()
def test_db_to_revision():
    """A flat database row is converted into the equivalent ``Revision``."""
    db_row = {
        "id": b"revision-id",
        "date": None,
        "date_offset": None,
        "date_neg_utc_offset": None,
        "date_offset_bytes": None,
        "committer_date": None,
        "committer_date_offset": None,
        "committer_date_neg_utc_offset": None,
        "committer_date_offset_bytes": None,
        "type": "git",
        "directory": b"dir-sha1",
        "message": b"commit message",
        "author_fullname": b"auth-name <auth-email>",
        "author_name": b"auth-name",
        "author_email": b"auth-email",
        "committer_fullname": b"comm-name <comm-email>",
        "committer_name": b"comm-name",
        "committer_email": b"comm-email",
        "metadata": {},
        "synthetic": False,
        "extra_headers": (),
        "raw_manifest": None,
        "parents": [b"123", b"456"],
    }

    # when
    actual_revision = converters.db_to_revision(db_row)

    # then
    expected_author = Person(
        fullname=b"auth-name <auth-email>",
        name=b"auth-name",
        email=b"auth-email",
    )
    expected_committer = Person(
        fullname=b"comm-name <comm-email>",
        name=b"comm-name",
        email=b"comm-email",
    )
    expected_revision = Revision(
        id=b"revision-id",
        author=expected_author,
        date=None,
        committer=expected_committer,
        committer_date=None,
        type=RevisionType.GIT,
        directory=b"dir-sha1",
        message=b"commit message",
        metadata={},
        synthetic=False,
        extra_headers=(),
        parents=(b"123", b"456"),
    )
    assert actual_revision == expected_revision
def test_revision_submodule(self, swh_storage, cook_extract_revision,
                            ingest_target_revision):
    """A ``rev`` directory entry is cooked as a ``160000`` submodule entry.

    The target revision is only added to the storage when
    ``ingest_target_revision`` is truthy; the cooked tree must reference it
    either way.
    """
    now = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)
    date = TimestampWithTimezone.from_datetime(now)
    me = b"me <*****@*****.**>"

    target_rev = Revision(
        message=b"target_rev",
        author=Person.from_fullname(me),
        date=date,
        committer=Person.from_fullname(me),
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=bytes.fromhex("3333333333333333333333333333333333333333"),
        metadata={},
        synthetic=True,
    )
    if ingest_target_revision:
        swh_storage.revision_add([target_rev])

    submodule_entry = DirectoryEntry(
        name=b"submodule",
        type="rev",
        target=target_rev.id,
        perms=0o160000,
    )
    directory = Directory(entries=(submodule_entry,))
    swh_storage.directory_add([directory])

    rev = Revision(
        message=b"msg",
        author=Person.from_fullname(me),
        date=date,
        committer=Person.from_fullname(me),
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={},
        synthetic=True,
    )
    swh_storage.revision_add([rev])

    with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p):
        ert.checkout(b"HEAD")
        # Raw git tree entry: mode, name, NUL, then the target id.
        pattern = b"160000 submodule\x00%s" % target_rev.id
        tree = ert.repo[b"HEAD"].tree
        raw_tree = ert.repo[tree].as_raw_string()
        assert pattern in raw_tree
def test_commit_without_manifest(self):
    """A hand-built dulwich commit is still converted by
    ``dulwich_commit_to_revision``, with no ``id`` given in the expected
    revision."""
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    message = b"some commit message"
    fullname = b"Foo <*****@*****.**>"
    seconds = 1641980946
    tz_offset = 3600

    commit = dulwich.objects.Commit()
    commit.tree = target
    commit.message = message
    commit.author = fullname
    commit.committer = fullname
    commit.author_time = seconds
    commit.commit_time = seconds
    commit.author_timezone = tz_offset
    commit.commit_timezone = tz_offset

    author = Person(fullname=b"Foo <*****@*****.**>",
                    name=b"Foo",
                    email=b"*****@*****.**")
    expected_revision = Revision(
        message=b"some commit message",
        author=author,
        committer=author,
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1641980946, microseconds=0),
            offset_bytes=b"+0100",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1641980946, microseconds=0),
            offset_bytes=b"+0100",
        ),
        type=RevisionType.GIT,
        directory=hash_to_bytes(target.decode()),
        synthetic=False,
        metadata=None,
        parents=(),
    )

    assert converters.dulwich_commit_to_revision(commit) == expected_revision
def parse_author(author) -> Person:
    """Build a ``Person`` from a mapping of text fields.

    Args:
        author: mapping with ``fullname``, ``name`` and ``email`` string values.

    Returns:
        A ``Person`` whose three fields are the UTF-8 encoded input values.
    """
    encoded = {
        field: author[field].encode("utf-8")
        for field in ("fullname", "name", "email")
    }
    return Person(**encoded)
def author(data: Dict) -> Person:
    """Given a dict of project/release artifact information (coming from
    PyPI), returns an author subset.

    The fullname is ``"<name> <<email>>"`` when an email is present, and the
    bare name otherwise. When neither yields a non-empty fullname,
    ``EMPTY_AUTHOR`` is returned.

    Args:
        data (dict): Representing either artifact information or
                     release information; read through the ``author`` and
                     ``author_email`` keys.

    Returns:
        swh-model Person with UTF-8 encoded fields (``name``/``email`` stay
        ``None`` when absent from ``data``).

    """
    name = data.get("author")
    email = data.get("author_email")

    if email:
        # A missing name must not leak into the fullname as the literal text
        # "None"; format it like an empty name instead.
        fullname = "%s <%s>" % (name or "", email)
    else:
        fullname = name

    if not fullname:
        return EMPTY_AUTHOR

    return Person(
        fullname=fullname.encode("utf-8"),
        name=None if name is None else name.encode("utf-8"),
        email=None if email is None else email.encode("utf-8"),
    )
def test_svn_author_to_swh_person_no_email():
    """Without an email part, name and fullname both equal the input."""
    expected_person = Person.from_dict(
        {"fullname": b"tony", "name": b"tony", "email": None}
    )

    assert converters.svn_author_to_swh_person(b"tony") == expected_person
def test_commit_to_revision_with_extra_headers_mergetag(self):
    """A merge commit carrying ``encoding`` and ``mergetag`` headers keeps
    both as extra headers after conversion."""
    sha1 = b"3ab3da4bf0f81407be16969df09cd1c8af9ac703"

    revision = converters.dulwich_commit_to_revision(self.repo[sha1])

    # Author and committer are the same person, with the same timestamp.
    person = Person(
        name=b"David Douard",
        fullname=b"David Douard <*****@*****.**>",
        email=b"*****@*****.**",
    )
    commit_date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1594138183, microseconds=0),
        offset_bytes=b"+0200",
    )
    parent_ids = tuple(
        bytes.fromhex(hex_id)
        for hex_id in (
            "322f5bc915e50fc25e85226b5a182bded0e98e4b",
            "9768d0b576dbaaecd80abedad6dfd0d72f1476da",
        )
    )
    expected_revision = Revision(
        id=hash_to_bytes(sha1.decode()),
        directory=bytes.fromhex("faa4b64a841ca3e3f07d6501caebda2e3e8e544e"),
        type=RevisionType.GIT,
        committer=person,
        author=person,
        committer_date=commit_date,
        message=b"Merge tag 'v0.0.1' into readme\n\nv0.0.1\n",
        metadata=None,
        extra_headers=(
            (b"encoding", b"ISO-8859-15"),
            (b"mergetag", MERGETAG),
        ),
        date=commit_date,
        parents=parent_ids,
        synthetic=False,
    )

    assert revision == expected_revision
def test_svn_author_to_swh_person():
    """A ``name <email>`` author yields name, email and fullname."""
    raw_author = b"tony <ynot@dagobah>"
    expected_person = Person.from_dict(
        {
            "fullname": b"tony <ynot@dagobah>",
            "name": b"tony",
            "email": b"ynot@dagobah",
        }
    )

    assert converters.svn_author_to_swh_person(raw_author) == expected_person
def test_build_swh_revision_default():
    """The built revision carries the svn repository uuid and revision
    number as extra headers, and reuses the author as committer.

    """
    dir_id = hash_to_bytes("d6e08e19159f77983242877c373c75222d5ae9dd")
    date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1088108379, microseconds=0),
        offset_bytes=b"+0000",
    )
    commit = {
        "author_name": Person(
            name=b"theo", email=b"theo@uuid", fullname=b"theo <theo@uuid>"
        ),
        "message": b"commit message",
        "author_date": date,
    }

    actual_rev = converters.build_swh_revision(
        repo_uuid=b"uuid",
        dir_id=dir_id,
        commit=commit,
        rev=10,
        parents=(),
    )

    person_dict = {
        "name": b"theo",
        "email": b"theo@uuid",
        "fullname": b"theo <theo@uuid>",
    }
    expected_rev = Revision.from_dict(
        {
            "date": date.to_dict(),
            "committer_date": date.to_dict(),
            "type": "svn",
            "directory": dir_id,
            "message": b"commit message",
            "author": dict(person_dict),
            "committer": dict(person_dict),
            "synthetic": True,
            "extra_headers": (
                (b"svn_repo_uuid", b"uuid"),
                (b"svn_revision", b"10"),
            ),
            "parents": (),
        }
    )

    assert actual_rev == expected_rev
def test_revision_get_displayname_behavior(self, swh_storage, sample_data):
    """``revision_get`` returns the display name unless told to ignore it."""
    first, second = sample_data.revisions[:2]

    # Give both revisions known authors and committers, then re-hash them.
    first = attr.evolve(
        first,
        author=Person.from_fullname(b"author1 <*****@*****.**>"),
        committer=Person.from_fullname(b"committer1 <*****@*****.**>"),
    )
    first = attr.evolve(first, id=first.compute_hash())

    second = attr.evolve(
        second,
        author=Person.from_fullname(b"author2 <*****@*****.**>"),
        committer=Person.from_fullname(b"committer2 <*****@*****.**>"),
    )
    second = attr.evolve(second, id=second.compute_hash())

    assert swh_storage.revision_add([first, second]) == {"revision:add": 2}

    ids = [first.id, second.id]

    # No display name set yet: stored values come back as-is.
    assert swh_storage.revision_get(ids) == [first, second]

    displayname = b"Display Name <*****@*****.**>"
    update_query = "UPDATE person set displayname = %s where fullname = %s"
    with db_transaction(swh_storage) as (_, cur):
        cur.execute(update_query, (displayname, first.author.fullname))

    # Display name set on the first author only.
    first_with_displayname = attr.evolve(
        first, author=Person.from_fullname(displayname)
    )
    assert swh_storage.revision_get(ids) == [first_with_displayname, second]

    # Explicitly ignoring display names restores the stored authors.
    raw_revisions = swh_storage.revision_get(ids, ignore_displayname=True)
    assert raw_revisions == [first, second]
def test_revision_log_displayname_behavior(self, swh_storage, sample_data):
    """``revision_log`` returns the display name unless told to ignore it."""
    parent, child = sample_data.revisions[:2]

    # Known authors/committers, with child -[parent]-> parent, then re-hash.
    parent = attr.evolve(
        parent,
        author=Person.from_fullname(b"author1 <*****@*****.**>"),
        committer=Person.from_fullname(b"committer1 <*****@*****.**>"),
    )
    parent = attr.evolve(parent, id=parent.compute_hash())

    child = attr.evolve(
        child,
        parents=(parent.id,),
        author=Person.from_fullname(b"author2 <*****@*****.**>"),
        committer=Person.from_fullname(b"committer2 <*****@*****.**>"),
    )
    child = attr.evolve(child, id=child.compute_hash())

    assert swh_storage.revision_add([parent, child]) == {"revision:add": 2}

    # No display name set yet: the log holds the stored values.
    log = swh_storage.revision_log([child.id])
    assert list(log) == [child.to_dict(), parent.to_dict()]

    displayname = b"Display Name <*****@*****.**>"
    update_query = "UPDATE person set displayname = %s where fullname = %s"
    with db_transaction(swh_storage) as (_, cur):
        cur.execute(update_query, (displayname, parent.author.fullname))

    # Display name set on the parent's author only.
    log = swh_storage.revision_log([child.id])
    parent_with_displayname = attr.evolve(
        parent, author=Person.from_fullname(displayname)
    )
    assert list(log) == [child.to_dict(), parent_with_displayname.to_dict()]

    # Explicitly ignoring display names restores the stored authors.
    log = swh_storage.revision_log([child.id], ignore_displayname=True)
    assert list(log) == [child.to_dict(), parent.to_dict()]
def test_commit_to_revision_with_extra_headers(self):
    """A signed commit with an ``encoding`` header keeps ``encoding`` and
    ``gpgsig`` as extra headers, and its message bytes untouched."""
    sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b"

    revision = converters.dulwich_commit_to_revision(self.repo[sha1])

    # Author and committer are the same person; their dates differ.
    person = Person(
        name=b"David Douard",
        fullname=b"David Douard <*****@*****.**>",
        email=b"*****@*****.**",
    )
    authored_at = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1594136900, microseconds=0),
        offset_bytes=b"+0200",
    )
    committed_at = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1594137902, microseconds=0),
        offset_bytes=b"+0200",
    )
    expected_revision = Revision(
        id=hash_to_bytes(sha1.decode()),
        directory=bytes.fromhex("f8ec06e4ed7b9fff4918a0241a48023143f30000"),
        type=RevisionType.GIT,
        committer=person,
        author=person,
        committer_date=committed_at,
        message=b"Am\xe9lioration du fichier READM\xa4\n",
        metadata=None,
        extra_headers=(
            (b"encoding", b"ISO-8859-15"),
            (b"gpgsig", GPGSIG),
        ),
        date=authored_at,
        parents=(bytes.fromhex("c730509025c6e81947102b2d77bc4dc1cade9489"),),
        synthetic=False,
    )

    assert revision == expected_revision
def test_svn_author_to_swh_person_empty_person():
    """An empty author yields an empty fullname and no name or email.

    """
    expected_person = Person.from_dict(
        {"fullname": b"", "name": None, "email": None}
    )

    assert converters.svn_author_to_swh_person(b"") == expected_person
def test_commit_to_revision(self):
    """A plain commit (no extra headers) converts to the expected revision."""
    sha1 = b"9768d0b576dbaaecd80abedad6dfd0d72f1476da"

    revision = converters.dulwich_commit_to_revision(self.repo[sha1])

    # Author and committer are the same person, with the same timestamp.
    person = Person(
        name=b"Stefano Zacchiroli",
        fullname=b"Stefano Zacchiroli <*****@*****.**>",
        email=b"*****@*****.**",
    )
    commit_date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1443083765, microseconds=0),
        offset_bytes=b"+0200",
    )
    expected_revision = Revision(
        id=hash_to_bytes(sha1.decode()),
        directory=b"\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca",
        type=RevisionType.GIT,
        committer=person,
        author=person,
        committer_date=commit_date,
        message=b"add submodule dependency\n",
        metadata=None,
        extra_headers=(),
        date=commit_date,
        parents=(b"\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r",),
        synthetic=False,
    )

    assert revision == expected_revision
def svn_author_to_swh_person(author: Optional[bytes]) -> Person:
    """Turn an svn author into an swh person.

    The author bytes are passed as-is to ``Person.from_fullname``; a missing
    or empty author is treated as the empty byte-string.

    Args:
        author: the svn author (in bytes), possibly ``None``

    Returns:
        a Person

    """
    fullname = author if author else b""
    return Person.from_fullname(fullname)
def get_package_info(
        self, version: str) -> Iterator[Tuple[str, OpamPackageInfo]]:
    """Yield the single ``(name, OpamPackageInfo)`` pair for ``version``.

    The source url, authors and maintainer are read through
    ``get_enclosed_single_line_field``; the raw output of the opam show
    command is attached as extrinsic metadata.

    Raises:
        ValueError: when the ``url.src:`` field is missing.
    """
    url = self.get_enclosed_single_line_field("url.src:", version)
    if url is None:
        raise ValueError(
            f"can't get field url.src: for version {version} of package"
            f" {self.opam_package} (at url {self.origin.url}) from `opam show`"
        )

    # Authors become the author, the maintainer becomes the committer; a
    # missing field maps to an empty fullname.
    authors_field = self.get_enclosed_single_line_field("authors:", version)
    if authors_field is None:
        author_fullname = b""
    else:
        author_fullname = str.encode(authors_field)
    author = Person.from_fullname(author_fullname)

    maintainer_field = self.get_enclosed_single_line_field(
        "maintainer:", version)
    if maintainer_field is None:
        committer_fullname = b""
    else:
        committer_fullname = str.encode(maintainer_field)
    committer = Person.from_fullname(committer_fullname)

    command = self._opam_show_args(version) + ["--raw"]
    with Popen(command, stdout=PIPE) as proc:
        assert proc.stdout is not None
        metadata = proc.stdout.read()

    raw_definition = RawExtrinsicMetadataCore(
        metadata=metadata,
        format="opam-package-definition",
    )
    package_info = OpamPackageInfo(
        url=url,
        filename=None,
        author=author,
        committer=committer,
        version=version,
        directory_extrinsic_metadata=[raw_definition],
    )
    yield self.get_package_name(version), package_info
def prepare_person(person: Mapping[str, str]) -> Person:
    """Encode every value of a person dict and build a ``Person`` from it.

    Args:
        person: mapping of field name to text value

    Returns:
        A person ready for storage, with all values UTF-8 encoded

    """
    encoded_fields = {}
    for field, text in person.items():
        encoded_fields[field] = text.encode("utf-8")
    return Person.from_dict(encoded_fields)
def extract_author(p_info: CratesPackageInfo) -> Person:
    """Extract package author from intrinsic metadata and return it as a
    `Person` model.

    Args:
        p_info: CratesPackageInfo that should contains i_metadata entries

    Returns:
        Only one author (Person) of the package. Currently limited by internal
        detail of the swh stack (see T3887). When the ``authors`` entry is
        empty, a Person built from an empty fullname is returned.
    """
    authors = p_info.i_metadata["authors"]
    # TODO: here we have a list of author, see T3887
    # An empty author list must not raise IndexError: fall back to an empty
    # fullname.
    fullname = authors[0] if authors else ""
    return Person.from_fullname(fullname.encode())
def test_pypi_author_empty_email():
    """An empty email is kept as ``b""`` and the fullname is the bare name."""
    data = {"author": "i-am-groot", "author_email": ""}

    expected_author = Person(
        fullname=b"i-am-groot",
        name=b"i-am-groot",
        email=b"",
    )

    assert author(data) == expected_author
def test_pypi_author_empty_name():
    """An empty name still produces a ``" <email>"`` fullname."""
    data = {"author": "", "author_email": "*****@*****.**"}

    expected_author = Person(
        fullname=b" <*****@*****.**>",
        name=b"",
        email=b"*****@*****.**",
    )

    assert author(data) == expected_author
def test_pypi_author_malformed_2():
    """List-looking author and email strings are passed through verbatim."""
    data = {
        "author": "[marie, jeanne]",
        "author_email": "[marie@some, jeanne@thing]",
    }

    expected_author = Person(
        fullname=b"[marie, jeanne] <[marie@some, jeanne@thing]>",
        name=b"[marie, jeanne]",
        email=b"[marie@some, jeanne@thing]",
    )

    assert author(data) == expected_author
def db_to_author(fullname: Optional[bytes], name: Optional[bytes],
                 email: Optional[bytes]) -> Optional[Person]:
    """Build a swh-model author from its database columns.

    Args:
        fullname (bytes): the author's fullname
        name (bytes): the author's name
        email (bytes): the author's email

    Returns:
        ``None`` if 'fullname' is None; otherwise a Person object, parsed
        from the fullname when both 'name' and 'email' are None.

    """
    if fullname is None:
        return None

    has_parsed_fields = name is not None or email is not None
    if has_parsed_fields:
        return Person(fullname=fullname, name=name, email=email)

    # Neither name nor email is stored: parse the fullname again.
    return Person.from_fullname(fullname)
def test_debian_prepare_person():
    """Every text field of the person dict comes back as bytes."""
    person = {
        "name": "Someone Name",
        "email": "*****@*****.**",
        "fullname": "Someone Name <*****@*****.**>",
    }

    expected_author = Person(
        name=b"Someone Name",
        email=b"*****@*****.**",
        fullname=b"Someone Name <*****@*****.**>",
    )

    assert prepare_person(person) == expected_author
def test_pypi_author_malformed():
    """A list-looking author string with no email is kept verbatim."""
    data = {"author": "['pierre', 'paul', 'jacques']", "author_email": None}

    expected_author = Person(
        fullname=b"['pierre', 'paul', 'jacques']",
        name=b"['pierre', 'paul', 'jacques']",
        email=None,
    )

    assert author(data) == expected_author
def build_release(
    self,
    p_info: BasePackageInfo,
    uncompressed_path: str,
    directory: Sha1Git,
):
    """Build a release named after ``p_info.version``.

    The release always targets ``DIRECTORY_ID``, with an empty message, an
    empty author and no date; ``uncompressed_path`` and ``directory`` are not
    read.
    """
    release_name = p_info.version.encode()
    empty_author = Person.from_fullname(b"")
    return Release(
        name=release_name,
        message=b"",
        author=empty_author,
        date=None,
        target=DIRECTORY_ID,
        target_type=ObjectType.DIRECTORY,
        synthetic=False,
    )
def new_person(draw):
    """
    Hypothesis strategy returning random raw swh person data.

    A name of 5 to 30 characters (code points 0 to 255) is drawn; the email
    and fullname are derived from it.

    Args:
        draw: callable used to draw a value from a strategy.

    Returns:
        A Person whose name, email and fullname are the encoded strings.
    """
    name = draw(
        text(
            min_size=5,
            max_size=30,
            alphabet=characters(min_codepoint=0, max_codepoint=255),
        ))
    # The format string needs a conversion specifier: without one,
    # `"..." % name` raises TypeError ("not all arguments converted").
    email = "%s@example.org" % name
    return Person(
        name=name.encode(),
        email=email.encode(),
        fullname=("%s <%s>" % (name, email)).encode(),
    )
def test_dulwich_tag_to_release_author_and_date(self):
    """A tag with a tagger and a date converts to a release carrying both."""
    sha = hash_to_bytes("fc1e6a4f1e37e93e28e78560e73efd0b12f616ef")
    tagger = b"hey dude <*****@*****.**>"
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    message = b"some release message"

    tagged_at = datetime.datetime(2007, 12, 5, tzinfo=datetime.timezone.utc)
    date = int(tagged_at.timestamp())

    tag = dulwich.objects.Tag()
    tag.name = b"blah"
    tag.object = (dulwich.objects.Commit, target)
    tag.message = message
    tag.signature = None
    tag.tagger = tagger
    tag.tag_time = date
    tag.tag_timezone = 0
    # Guard: the hand-built tag must hash to the expected id.
    assert tag.sha().digest() == sha

    # when
    actual_release = converters.dulwich_tag_to_release(tag)

    # then
    expected_author = Person(
        email=b"*****@*****.**",
        fullname=b"hey dude <*****@*****.**>",
        name=b"hey dude",
    )
    expected_date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1196812800, microseconds=0),
        offset_bytes=b"+0000",
    )
    expected_release = Release(
        author=expected_author,
        date=expected_date,
        id=sha,
        message=message,
        metadata=None,
        name=b"blah",
        synthetic=False,
        target=hash_to_bytes(target.decode()),
        target_type=ObjectType.REVISION,
    )

    assert actual_release == expected_release
def test_dulwich_tag_to_release_author_zero_date(self):
    """A tag dated at the epoch keeps a zero timestamp rather than no date
    (reproduces bug T815, fixed)."""
    sha = hash_to_bytes("6cc1deff5cdcd853428bb63b937f43dd2566c36f")
    tagger = b"hey dude <*****@*****.**>"
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    message = b"some release message"

    epoch = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    date = int(epoch.timestamp())

    tag = dulwich.objects.Tag()
    tag.name = b"blah"
    tag.object = (dulwich.objects.Commit, target)
    tag.message = message
    tag.signature = None
    tag.tagger = tagger
    tag.tag_time = date
    tag.tag_timezone = 0
    # Guard: the hand-built tag must hash to the expected id.
    assert tag.sha().digest() == sha

    # when
    actual_release = converters.dulwich_tag_to_release(tag)

    # then
    expected_author = Person(
        email=b"*****@*****.**",
        fullname=b"hey dude <*****@*****.**>",
        name=b"hey dude",
    )
    expected_date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=0, microseconds=0),
        offset_bytes=b"+0000",
    )
    expected_release = Release(
        author=expected_author,
        date=expected_date,
        id=sha,
        message=message,
        metadata=None,
        name=b"blah",
        synthetic=False,
        target=hash_to_bytes(target.decode()),
        target_type=ObjectType.REVISION,
    )

    assert actual_release == expected_release