Example #1
@composite
def new_revision(draw):
    """
    Hypothesis strategy returning random raw swh revision data
    not ingested into the test archive.
    """
    return Revision(
        directory=draw(sha1().map(hash_to_bytes)),
        author=draw(new_person()),
        committer=draw(new_person()),
        message=draw(
            text(min_size=20, max_size=100).map(lambda t: t.encode())),
        date=TimestampWithTimezone.from_datetime(draw(new_swh_date())),
        committer_date=TimestampWithTimezone.from_datetime(draw(
            new_swh_date())),
        synthetic=False,
        type=RevisionType.GIT,
    )
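
A usage sketch (not part of the scraped example): with the @composite decorator applied, the strategy above plugs into @given like any other Hypothesis strategy; RevisionType is assumed to be in scope as in the excerpt.

from hypothesis import given

@given(new_revision())
def test_new_revision_is_valid(revision):
    # Drawn revisions are valid swh.model Revision objects of type GIT.
    assert revision.type == RevisionType.GIT
    assert revision.to_dict()["message"]  # the strategy draws a non-empty message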
Example #2
    def test_commit_to_revision_with_extra_headers_mergetag(self):
        sha1 = b"3ab3da4bf0f81407be16969df09cd1c8af9ac703"

        revision = converters.dulwich_commit_to_revision(self.repo[sha1])
        expected_revision = Revision(
            id=hash_to_bytes(sha1.decode()),
            directory=bytes.fromhex(
                "faa4b64a841ca3e3f07d6501caebda2e3e8e544e"),
            type=RevisionType.GIT,
            committer=Person(
                name=b"David Douard",
                fullname=b"David Douard <*****@*****.**>",
                email=b"*****@*****.**",
            ),
            author=Person(
                name=b"David Douard",
                fullname=b"David Douard <*****@*****.**>",
                email=b"*****@*****.**",
            ),
            committer_date=TimestampWithTimezone(
                timestamp=Timestamp(
                    seconds=1594138183,
                    microseconds=0,
                ),
                offset_bytes=b"+0200",
            ),
            message=b"Merge tag 'v0.0.1' into readme\n\nv0.0.1\n",
            metadata=None,
            extra_headers=((b"encoding", b"ISO-8859-15"), (b"mergetag",
                                                           MERGETAG)),
            date=TimestampWithTimezone(
                timestamp=Timestamp(
                    seconds=1594138183,
                    microseconds=0,
                ),
                offset_bytes=b"+0200",
            ),
            parents=(
                bytes.fromhex("322f5bc915e50fc25e85226b5a182bded0e98e4b"),
                bytes.fromhex("9768d0b576dbaaecd80abedad6dfd0d72f1476da"),
            ),
            synthetic=False,
        )

        assert revision == expected_revision
Example #3
def test_svn_date_to_swh_date_epoch():
    """Empty date should be EPOCH (timestamp and offset at 0)."""
    # It should return 0, epoch
    default_tstz = TimestampWithTimezone(timestamp=Timestamp(seconds=0,
                                                             microseconds=0),
                                         offset_bytes=b"+0000")

    assert converters.svn_date_to_swh_date("") == default_tstz
    assert converters.svn_date_to_swh_date(None) == default_tstz
Example #4
def test_normalize_timestamp_datetime(date, seconds, tz, offset, offset_bytes,
                                      microsecond):
    date = date.astimezone(tz).replace(microsecond=microsecond)
    assert TimestampWithTimezone.from_dict(date).to_dict() == {
        "timestamp": {
            "seconds": seconds,
            "microseconds": microsecond
        },
        "offset_bytes": offset_bytes,
    }
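
A concrete instance of the normalization this parametrized test checks (a hedged sketch; assumes swh.model.model is importable):

import datetime
from swh.model.model import TimestampWithTimezone

dt = datetime.datetime(2015, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc)
assert TimestampWithTimezone.from_dict(dt).to_dict() == {
    "timestamp": {"seconds": 1420149600, "microseconds": 0},
    "offset_bytes": b"+0000",
}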
Example #5
    def test_commit_to_revision_with_extra_headers(self):
        sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b"

        revision = converters.dulwich_commit_to_revision(self.repo[sha1])
        expected_revision = Revision(
            id=hash_to_bytes(sha1.decode()),
            directory=bytes.fromhex(
                "f8ec06e4ed7b9fff4918a0241a48023143f30000"),
            type=RevisionType.GIT,
            committer=Person(
                name=b"David Douard",
                fullname=b"David Douard <*****@*****.**>",
                email=b"*****@*****.**",
            ),
            author=Person(
                name=b"David Douard",
                fullname=b"David Douard <*****@*****.**>",
                email=b"*****@*****.**",
            ),
            committer_date=TimestampWithTimezone(
                timestamp=Timestamp(
                    seconds=1594137902,
                    microseconds=0,
                ),
                offset_bytes=b"+0200",
            ),
            message=b"Am\xe9lioration du fichier READM\xa4\n",
            metadata=None,
            extra_headers=((b"encoding", b"ISO-8859-15"), (b"gpgsig", GPGSIG)),
            date=TimestampWithTimezone(
                timestamp=Timestamp(
                    seconds=1594136900,
                    microseconds=0,
                ),
                offset_bytes=b"+0200",
            ),
            parents=(
                bytes.fromhex("c730509025c6e81947102b2d77bc4dc1cade9489"), ),
            synthetic=False,
        )

        assert revision == expected_revision
Example #6
    def test_commit_to_revision(self):
        sha1 = b"9768d0b576dbaaecd80abedad6dfd0d72f1476da"

        revision = converters.dulwich_commit_to_revision(self.repo[sha1])
        expected_revision = Revision(
            id=hash_to_bytes("9768d0b576dbaaecd80abedad6dfd0d72f1476da"),
            directory=b"\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca",
            type=RevisionType.GIT,
            committer=Person(
                name=b"Stefano Zacchiroli",
                fullname=b"Stefano Zacchiroli <*****@*****.**>",
                email=b"*****@*****.**",
            ),
            author=Person(
                name=b"Stefano Zacchiroli",
                fullname=b"Stefano Zacchiroli <*****@*****.**>",
                email=b"*****@*****.**",
            ),
            committer_date=TimestampWithTimezone(
                timestamp=Timestamp(
                    seconds=1443083765,
                    microseconds=0,
                ),
                offset_bytes=b"+0200",
            ),
            message=b"add submodule dependency\n",
            metadata=None,
            extra_headers=(),
            date=TimestampWithTimezone(
                timestamp=Timestamp(
                    seconds=1443083765,
                    microseconds=0,
                ),
                offset_bytes=b"+0200",
            ),
            parents=(
                b"\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r", ),
            synthetic=False,
        )

        assert revision == expected_revision
Example #7
def test_api_revision_directory_ok_returns_revision(api_client, archive_data,
                                                    revision, person, date):
    rev_path = "foo"
    _dir = Directory(entries=(DirectoryEntry(
        name=rev_path.encode(),
        type="rev",
        target=hash_to_bytes(revision),
        perms=DentryPerms.revision,
    ), ))
    archive_data.directory_add([_dir])

    rev = Revision(
        directory=_dir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([rev])

    revision_id = hash_to_hex(rev.id)
    rev_data = archive_data.revision_get(revision)
    url = reverse(
        "api-1-revision-directory",
        {
            "sha1_git": revision_id,
            "dir_path": rev_path
        },
    )
    rv = check_api_get_responses(api_client, url, status_code=200)

    assert rv.data == {
        "content": enrich_revision(rev_data, request=rv.wsgi_request),
        "path": rev_path,
        "type": "rev",
        "revision": revision_id,
    }
Example #8
def dulwich_tsinfo_to_timestamp(
    timestamp,
    timezone: int,
    timezone_neg_utc: bool,
    timezone_bytes: Optional[bytes],
) -> TimestampWithTimezone:
    """Convert the dulwich timestamp information to a structure compatible with
    Software Heritage."""
    ts = Timestamp(
        seconds=int(timestamp),
        microseconds=0,
    )
    if timezone_bytes is None:
        # Failed to parse from the raw manifest, fallback to what Dulwich managed to
        # parse.
        return TimestampWithTimezone.from_numeric_offset(
            timestamp=ts,
            offset=timezone // 60,
            negative_utc=timezone_neg_utc,
        )
    else:
        return TimestampWithTimezone(timestamp=ts, offset_bytes=timezone_bytes)
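
A small sketch of the two branches above (hedged; assumes swh.model.model):

from swh.model.model import Timestamp, TimestampWithTimezone

ts = Timestamp(seconds=1594137902, microseconds=0)

# Fallback branch: dulwich reports the offset in seconds east of UTC,
# so 7200 seconds becomes 120 minutes, rendered as b"+0200".
fallback = TimestampWithTimezone.from_numeric_offset(
    timestamp=ts, offset=7200 // 60, negative_utc=False)

# Verbatim branch: the offset bytes recovered from the raw manifest.
verbatim = TimestampWithTimezone(timestamp=ts, offset_bytes=b"+0200")

assert fallback == verbatim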
Example #9
    def build_release(self, p_info: MavenPackageInfo, uncompressed_path: str,
                      directory: Sha1Git) -> Optional[Release]:
        msg = f"Synthetic release for archive at {p_info.url}\n".encode(
            "utf-8")
        normalized_time = TimestampWithTimezone.from_datetime(p_info.time)
        return Release(
            name=p_info.version.encode(),
            message=msg,
            date=normalized_time,
            author=EMPTY_AUTHOR,
            target=directory,
            target_type=ObjectType.DIRECTORY,
            synthetic=True,
        )
Example #10
    def test_revision_submodule(self, swh_storage, cook_extract_revision,
                                ingest_target_revision):
        date = TimestampWithTimezone.from_datetime(
            datetime.datetime.now(
                datetime.timezone.utc).replace(microsecond=0))

        target_rev = Revision(
            message=b"target_rev",
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            committer=Person.from_fullname(b"me <*****@*****.**>"),
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=bytes.fromhex(
                "3333333333333333333333333333333333333333"),
            metadata={},
            synthetic=True,
        )
        if ingest_target_revision:
            swh_storage.revision_add([target_rev])

        dir = Directory(entries=(DirectoryEntry(
            name=b"submodule",
            type="rev",
            target=target_rev.id,
            perms=0o160000,
        ), ), )
        swh_storage.directory_add([dir])

        rev = Revision(
            message=b"msg",
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            committer=Person.from_fullname(b"me <*****@*****.**>"),
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=dir.id,
            metadata={},
            synthetic=True,
        )
        swh_storage.revision_add([rev])

        with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p):
            ert.checkout(b"HEAD")
            pattern = b"160000 submodule\x00%s" % target_rev.id
            tree = ert.repo[b"HEAD"].tree
            assert pattern in ert.repo[tree].as_raw_string()
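
A sketch of the raw git tree-entry layout the final assertion relies on: "<octal mode> <name>\x00<20-byte binary sha1>", where mode 160000 marks a submodule (gitlink); the values below are illustrative, not from the test.

mode, name, target = 0o160000, b"submodule", b"\x33" * 20
entry = b"%o %b\x00%b" % (mode, name, target)
assert entry == b"160000 submodule\x00" + target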
Example #11
    def test_dulwich_tag_to_release_author_and_date(self):
        sha = hash_to_bytes("fc1e6a4f1e37e93e28e78560e73efd0b12f616ef")
        tagger = b"hey dude <*****@*****.**>"
        target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
        message = b"some release message"

        date = int(
            datetime.datetime(2007, 12, 5,
                              tzinfo=datetime.timezone.utc).timestamp())

        tag = dulwich.objects.Tag()
        tag.name = b"blah"
        tag.object = (dulwich.objects.Commit, target)
        tag.message = message
        tag.signature = None
        tag.tagger = tagger
        tag.tag_time = date
        tag.tag_timezone = 0
        assert tag.sha().digest() == sha

        # when
        actual_release = converters.dulwich_tag_to_release(tag)

        # then
        expected_release = Release(
            author=Person(
                email=b"*****@*****.**",
                fullname=b"hey dude <*****@*****.**>",
                name=b"hey dude",
            ),
            date=TimestampWithTimezone(
                timestamp=Timestamp(
                    seconds=1196812800,
                    microseconds=0,
                ),
                offset_bytes=b"+0000",
            ),
            id=sha,
            message=message,
            metadata=None,
            name=b"blah",
            synthetic=False,
            target=hash_to_bytes(target.decode()),
            target_type=ObjectType.REVISION,
        )

        assert actual_release == expected_release
Example #12
    def test_dulwich_tag_to_release_author_zero_date(self):
        # to reproduce bug T815 (fixed)
        sha = hash_to_bytes("6cc1deff5cdcd853428bb63b937f43dd2566c36f")
        tagger = b"hey dude <*****@*****.**>"
        target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
        message = b"some release message"
        date = int(
            datetime.datetime(1970, 1, 1,
                              tzinfo=datetime.timezone.utc).timestamp())
        tag = dulwich.objects.Tag()
        tag.name = b"blah"
        tag.object = (dulwich.objects.Commit, target)
        tag.message = message
        tag.signature = None
        tag.tagger = tagger
        tag.tag_time = date
        tag.tag_timezone = 0
        assert tag.sha().digest() == sha

        # when
        actual_release = converters.dulwich_tag_to_release(tag)

        # then
        expected_release = Release(
            author=Person(
                email=b"*****@*****.**",
                fullname=b"hey dude <*****@*****.**>",
                name=b"hey dude",
            ),
            date=TimestampWithTimezone(
                timestamp=Timestamp(
                    seconds=0,
                    microseconds=0,
                ),
                offset_bytes=b"+0000",
            ),
            id=sha,
            message=message,
            metadata=None,
            name=b"blah",
            synthetic=False,
            target=hash_to_bytes(target.decode()),
            target_type=ObjectType.REVISION,
        )

        assert actual_release == expected_release
Example #13
def svn_date_to_swh_date(strdate: Optional[bytes]) -> TimestampWithTimezone:
    """Convert a string date to an swh one.

    Args:
        strdate: A string representing a date with format like
        ``b'YYYY-mm-DDTHH:MM:SS.800722Z'``

    Returns:
        An swh date format

    """
    if not strdate:  # either None or empty string
        dt = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    else:
        dt = iso8601.parse_date(strdate.decode("ascii"))
        assert dt.tzinfo is not None, strdate
    return TimestampWithTimezone.from_datetime(dt)
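
Hedged example values for this converter (the epoch seconds are computed by hand):

tstz = svn_date_to_swh_date(b"2011-05-31T06:04:39.800722Z")
assert tstz.timestamp.seconds == 1306821879
assert tstz.timestamp.microseconds == 800722
assert tstz.offset_bytes == b"+0000"

# Empty/None inputs collapse to the UTC epoch, as tested in Example #3.
assert svn_date_to_swh_date(None).timestamp.seconds == 0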
Example #14
def test_from_release():
    """Convert release model object to a dict should be ok"""
    ts = int(
        datetime.datetime(2015, 1, 1, 22, 0, 0,
                          tzinfo=datetime.timezone.utc).timestamp())
    release_input = Release(
        id=hashutil.hash_to_bytes("aad23fa492a0c5fed0708a6703be875448c86884"),
        target=hashutil.hash_to_bytes(
            "5e46d564378afc44b31bb89f99d5675195fbdf67"),
        target_type=ObjectType.REVISION,
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=ts, microseconds=0),
            offset_bytes=b"+0000",
        ),
        author=Person(
            name=b"author name",
            fullname=b"Author Name author@email",
            email=b"author@email",
        ),
        name=b"v0.0.1",
        message=b"some comment on release",
        synthetic=True,
    )

    expected_release = {
        "id": "aad23fa492a0c5fed0708a6703be875448c86884",
        "target": "5e46d564378afc44b31bb89f99d5675195fbdf67",
        "target_type": "revision",
        "date": "2015-01-01T22:00:00+00:00",
        "author": {
            "name": "author name",
            "fullname": "Author Name author@email",
            "email": "author@email",
        },
        "name": "v0.0.1",
        "message": "some comment on release",
        "target_type": "revision",
        "synthetic": True,
    }

    actual_release = converters.from_release(release_input)

    assert actual_release == expected_release
Example #15
    def _make_stub_directory_revision(self, dir_id: Sha1Git) -> Sha1Git:
        author = Person.from_fullname(
            b"swh-vault, git-bare cooker <*****@*****.**>")
        dt = datetime.datetime.now(tz=datetime.timezone.utc)
        dt = dt.replace(microsecond=0)  # not supported by git
        date = TimestampWithTimezone.from_datetime(dt)

        revision = Revision(
            author=author,
            committer=author,
            date=date,
            committer_date=date,
            message=b"Initial commit",
            type=RevisionType.GIT,
            directory=self.obj_id,
            synthetic=True,
        )
        self.write_revision_node(revision)

        return revision.id
Example #16
    def build_release(self, p_info: ArchPackageInfo, uncompressed_path: str,
                      directory: Sha1Git) -> Optional[Release]:
        intrinsic_metadata = extract_intrinsic_metadata(
            Path(uncompressed_path))
        author = Person.from_fullname(intrinsic_metadata["packager"].encode())
        description = intrinsic_metadata["pkgdesc"]

        message = (
            f"Synthetic release for Arch Linux source package {p_info.name} "
            f"version {p_info.version}\n\n"
            f"{description}\n")
        return Release(
            name=p_info.version.encode(),
            author=author,
            date=TimestampWithTimezone.from_iso8601(p_info.last_modified),
            message=message.encode(),
            target_type=ObjectType.DIRECTORY,
            target=directory,
            synthetic=True,
        )
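
A hedged sketch of TimestampWithTimezone.from_iso8601, which keeps the written UTC offset in offset_bytes (the epoch value is computed by hand):

tstz = TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00")
assert tstz.timestamp.seconds == 1649365694
assert tstz.offset_bytes == b"+0000"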
Example #17
    def build_release(self, p_info: NpmPackageInfo, uncompressed_path: str,
                      directory: Sha1Git) -> Optional[Release]:
        # Metadata from NPM is not intrinsic to tarballs.
        # This means two package versions can have the same tarball, but different
        # metadata. To avoid mixing up releases, every field used to build the
        # release object must be part of NpmPackageInfo.MANIFEST_FORMAT.
        i_metadata = extract_intrinsic_metadata(uncompressed_path)
        if not i_metadata:
            return None
        author = extract_npm_package_author(i_metadata)
        assert self.package_name == p_info.package_name
        msg = (
            f"Synthetic release for NPM source package {p_info.package_name} "
            f"version {p_info.version}\n")

        if p_info.date is None:
            url = p_info.url
            artifact_name = os.path.basename(url)
            raise ValueError(
                "Origin %s: Cannot determine upload time for artifact %s." %
                (p_info.url, artifact_name))

        date = TimestampWithTimezone.from_iso8601(p_info.date)

        # FIXME: this is to remain bug-compatible with earlier versions:
        date = attr.evolve(date,
                           timestamp=attr.evolve(date.timestamp,
                                                 microseconds=0))

        r = Release(
            name=p_info.version.encode(),
            message=msg.encode(),
            author=author,
            date=date,
            target=directory,
            target_type=ObjectType.DIRECTORY,
            synthetic=True,
        )
        return r
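
What the attr.evolve() dance above does: rebuild the frozen attrs objects with microseconds forced to 0 while leaving the offset untouched (a sketch with an illustrative value):

date = TimestampWithTimezone(
    timestamp=Timestamp(seconds=1388590833, microseconds=123456),
    offset_bytes=b"+0000",
)
truncated = attr.evolve(
    date, timestamp=attr.evolve(date.timestamp, microseconds=0))
assert truncated.timestamp.seconds == 1388590833
assert truncated.timestamp.microseconds == 0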
Example #18
    def build_release(self, p_info: CratesPackageInfo, uncompressed_path: str,
                      directory: Sha1Git) -> Optional[Release]:
        # Extract intrinsic metadata from dir_path/Cargo.toml
        name = p_info.name
        version = p_info.version
        dir_path = Path(uncompressed_path, f"{name}-{version}")
        i_metadata_raw = extract_intrinsic_metadata(dir_path)
        # Keep only the keys defined on IntrinsicPackageMetadata
        i_metadata_keys = list(IntrinsicPackageMetadata.__annotations__)
        # Use only the data from the "package" entry
        i_metadata = {
            k: v
            for k, v in i_metadata_raw["package"].items()
            if k in i_metadata_keys
        }
        p_info.i_metadata = IntrinsicPackageMetadata(
            **i_metadata)  # type: ignore[misc]

        author = extract_author(p_info)
        description = extract_description(p_info)
        message = (f"Synthetic release for Crate source package {p_info.name} "
                   f"version {p_info.version}\n\n"
                   f"{description}\n")
        # The only way to get a value for updated_at is through extrinsic metadata
        updated_at = p_info.e_metadata_version.get("updated_at")

        return Release(
            name=version.encode(),
            author=author,
            date=TimestampWithTimezone.from_iso8601(updated_at),
            message=message.encode(),
            target_type=ObjectType.DIRECTORY,
            target=directory,
            synthetic=True,
        )
Example #19
    def test_weird_commit(self):
        """Checks raw_manifest is set when the commit cannot fit the data model"""

        # Well-formed manifest
        raw_manifest = (b"tree 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
                        b"author Foo <*****@*****.**> 1640191028 +0200\n"
                        b"committer Foo <*****@*****.**> 1640191028 +0200\n\n"
                        b"some commit message")
        commit = dulwich.objects.Commit.from_raw_string(
            b"commit", raw_manifest)
        date = TimestampWithTimezone(
            timestamp=Timestamp(seconds=1640191028, microseconds=0),
            offset_bytes=b"+0200",
        )
        assert converters.dulwich_commit_to_revision(commit) == Revision(
            message=b"some commit message",
            directory=hash_to_bytes(
                "641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
            synthetic=False,
            author=Person.from_fullname(b"Foo <*****@*****.**>", ),
            committer=Person.from_fullname(b"Foo <*****@*****.**>", ),
            date=date,
            committer_date=date,
            type=RevisionType.GIT,
            raw_manifest=None,
        )

        # Mess with the offset
        raw_manifest2 = raw_manifest.replace(b"+0200", b"+200")
        commit = dulwich.objects.Commit.from_raw_string(
            b"commit", raw_manifest2)
        date = TimestampWithTimezone(
            timestamp=Timestamp(seconds=1640191028, microseconds=0),
            offset_bytes=b"+200",
        )
        assert converters.dulwich_commit_to_revision(commit) == Revision(
            message=b"some commit message",
            directory=hash_to_bytes(
                "641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
            synthetic=False,
            author=Person.from_fullname(b"Foo <*****@*****.**>", ),
            committer=Person.from_fullname(b"Foo <*****@*****.**>", ),
            date=date,
            committer_date=date,
            type=RevisionType.GIT,
            raw_manifest=None,
        )

        # Mess with the rest of the manifest
        raw_manifest2 = raw_manifest.replace(
            b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce",
            b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE",
        )
        commit = dulwich.objects.Commit.from_raw_string(
            b"commit", raw_manifest2)
        date = TimestampWithTimezone(
            timestamp=Timestamp(seconds=1640191028, microseconds=0),
            offset_bytes=b"+0200",
        )
        assert converters.dulwich_commit_to_revision(commit) == Revision(
            message=b"some commit message",
            directory=hash_to_bytes(
                "641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
            synthetic=False,
            author=Person.from_fullname(b"Foo <*****@*****.**>", ),
            committer=Person.from_fullname(b"Foo <*****@*****.**>", ),
            date=date,
            committer_date=date,
            type=RevisionType.GIT,
            raw_manifest=b"commit 161\x00" + raw_manifest2,
        )
Example #20
    f"{REPO_BASE_URL}al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom",
]

REL_MSGS = (
    b"Synthetic release for archive at https://repo1.maven.org/maven2/al/aldi/"
    b"sprova4j/0.1.0/sprova4j-0.1.0-sources.jar\n",
    b"Synthetic release for archive at https://repo1.maven.org/maven2/al/aldi/"
    b"sprova4j/0.1.1/sprova4j-0.1.1-sources.jar\n",
)

REL_DATES = (
    TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 7, 12, 19, 6, 59, 335000,
                          tzinfo=datetime.timezone.utc)),
    TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 7, 12, 19, 37, 5, 534000,
                          tzinfo=datetime.timezone.utc)),
)
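
from_datetime keeps sub-second precision; this hedged check matches the first REL_DATES entry above:

tstz = TimestampWithTimezone.from_datetime(
    datetime.datetime(2021, 7, 12, 19, 6, 59, 335000,
                      tzinfo=datetime.timezone.utc))
assert tstz.timestamp.microseconds == 335000
assert tstz.offset_bytes == b"+0000"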
Example #21
def test_arch_loader_load_one_version(datadir, requests_mock_datadir,
                                      swh_storage):
    loader = ArchLoader(
        swh_storage,
        url=EXPECTED_PACKAGES[1]["url"],
        artifacts=EXPECTED_PACKAGES[1]["artifacts"],
    )
    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_snapshot_id = "4020d0a278027550e336b5481a4159a913c91aa4"
    expected_release_id = "7681098c9e381f9cc8bd1724d57eeee2182982dc"

    assert expected_snapshot_id == actual_load_status["snapshot_id"]

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz":
            SnapshotBranch(
                target=hash_to_bytes(expected_release_id),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD":
            SnapshotBranch(
                target=b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 1,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    assert swh_storage.release_get([
        hash_to_bytes(expected_release_id)
    ])[0] == Release(
        name=b"1.12-1",
        message=b"Synthetic release for Arch Linux source package gzip version "
        b"1.12-1\n\nGNU compression utility\n",
        target=hash_to_bytes("bd742aaf422953a1f7a5e084ec4a7477491d63fb"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(
            b"Arch Linux ARM Build System <*****@*****.**>"),
        date=TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00"),
        id=hash_to_bytes(expected_release_id),
    )

    assert_last_visit_matches(
        swh_storage,
        url=EXPECTED_PACKAGES[1]["url"],
        status="full",
        type="arch",
        snapshot=expected_snapshot.id,
    )
Example #22
            type="file",
            target=CONTENT.sha1_git,
            perms=DentryPerms.content,
        )
    ]),
)

REVISION = Revision(
    id=hash_to_bytes("066b1b62dbfa033362092af468bf6cfabec230e7"),
    message=b"hello",
    author=Person(
        name=b"Nicolas Dandrimont",
        email=b"*****@*****.**",
        fullname=b"Nicolas Dandrimont <*****@*****.**> ",
    ),
    date=TimestampWithTimezone(Timestamp(1234567890, 0),
                               offset_bytes=b"+0200"),
    committer=Person(
        name=b"St\xc3fano Zacchiroli",
        email=b"*****@*****.**",
        fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
    ),
    committer_date=TimestampWithTimezone(Timestamp(1123456789, 0),
                                         offset_bytes=b"-0000"),
    parents=(),
    type=RevisionType.GIT,
    directory=DIRECTORY.id,
    metadata={
        "checksums": {
            "sha1": "tarball-sha1",
            "sha256": "tarball-sha256",
        },
Example #23
def test_load_upgrade_from_revision_extids(caplog):
    """Tests that, when loading incrementally based on a snapshot made by an old
    version of the loader, the loader will convert revisions to releases
    and add them to the storage.

    Also checks that, if an extid exists pointing to a non-existent revision
    (which should never happen, but you never know...), the release is loaded from
    scratch."""

    storage = get_storage("memory")

    origin = "http://example.org"
    dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"d" * 20)
    dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"e" * 20)

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc))
    person = Person.from_fullname(b"Jane Doe <*****@*****.**>")

    rev1 = Revision(
        message=b"blah",
        author=person,
        date=date,
        committer=person,
        committer_date=date,
        directory=dir1_swhid.object_id,
        type=RevisionType.TAR,
        synthetic=True,
    )

    rel1 = Release(
        name=b"v1.0",
        message=b"blah\n",
        author=person,
        date=date,
        target=dir1_swhid.object_id,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
    )

    rev1_swhid = rev1.swhid()
    rel1_swhid = rel1.swhid()
    rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION,
                           object_id=b"b" * 20)
    rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0),
        ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0),
    ])
    storage.revision_add([rev1])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev2_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel2_swhid.object_id, dir2_swhid.object_id),
        autospec=True,
    ).start()
    patch.object(
        loader,
        "get_versions",
        return_value=["v1.0", "v2.0", "v3.0"],
        autospec=True,
    ).start()

    caplog.set_level(logging.ERROR)

    loader.load()

    assert len(caplog.records) == 1
    (record, ) = caplog.records
    assert record.levelname == "ERROR"
    assert "Failed to upgrade branch branch-v2.0" in record.message

    assert loader._load_release.mock_calls == [
        # v1.0: not loaded because there is already a revision matching it
        # v2.0: loaded, as the revision is missing from the storage even though there
        #       is an extid
        call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
             Origin(url=origin)),
        # v3.0: loaded (did not exist yet)
        call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
             Origin(url=origin)),
    ]

    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
        ],
    )

    assert set(extids) == {
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid),
    }
Example #24
def test_ignore_displayname(swh_storage, use_graph):
    """Tests the original authorship information is used instead of
    configured display names; otherwise objects would not match their hash,
    and git-fsck/git-clone would fail.

    This tests both with and without swh-graph, as both configurations use different
    code paths to fetch revisions.
    """

    date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0),
                                                     0, False)
    legacy_person = Person.from_fullname(b"old me <*****@*****.**>")
    current_person = Person.from_fullname(b"me <*****@*****.**>")

    content = Content.from_data(b"foo")
    swh_storage.content_add([content])

    directory = Directory(
        entries=(DirectoryEntry(name=b"file1",
                                type="file",
                                perms=0o100644,
                                target=content.sha1_git), ), )
    swh_storage.directory_add([directory])

    revision = Revision(
        message=b"rev",
        author=legacy_person,
        date=date,
        committer=legacy_person,
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        synthetic=True,
    )
    swh_storage.revision_add([revision])

    release = Release(
        name=b"v1.1.0",
        message=None,
        author=legacy_person,
        date=date,
        target=revision.id,
        target_type=ObjectType.REVISION,
        synthetic=True,
    )
    swh_storage.release_add([release])

    snapshot = Snapshot(
        branches={
            b"refs/tags/v1.1.0":
            SnapshotBranch(target=release.id, target_type=TargetType.RELEASE),
            b"HEAD":
            SnapshotBranch(target=revision.id,
                           target_type=TargetType.REVISION),
        })
    swh_storage.snapshot_add([snapshot])

    # Add all objects to graph
    if use_graph:
        from swh.graph.naive_client import NaiveClient as GraphClient

        nodes = [
            str(x.swhid())
            for x in [content, directory, revision, release, snapshot]
        ]
        edges = [(str(x.swhid()), str(y.swhid())) for (x, y) in [
            (directory, content),
            (revision, directory),
            (release, revision),
            (snapshot, release),
            (snapshot, revision),
        ]]
        swh_graph = unittest.mock.Mock(
            wraps=GraphClient(nodes=nodes, edges=edges))
    else:
        swh_graph = None

    # Set a display name
    with swh_storage.db() as db:
        with db.transaction() as cur:
            cur.execute(
                "UPDATE person set displayname = %s where fullname = %s",
                (current_person.fullname, legacy_person.fullname),
            )

    # Check the display name did apply in the storage
    assert swh_storage.revision_get([revision.id])[0] == attr.evolve(
        revision,
        author=current_person,
        committer=current_person,
    )

    # Cook
    cooked_swhid = snapshot.swhid()
    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        # If we are here, it means git-fsck succeeded when called by cooker.cook(),
        # so we already know the original person was used. Let's double-check.

        repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git")

        tag = repo[b"refs/tags/v1.1.0"]
        assert tag.tagger == legacy_person.fullname

        commit = repo[tag.object[1]]
        assert commit.author == legacy_person.fullname
Example #25
    def test_weird_tag(self):
        """Checks raw_manifest is set when the tag cannot fit the data model"""

        # Well-formed manifest
        raw_manifest = (b"object 641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce\n"
                        b"type commit\n"
                        b"tag blah\n"
                        b"tagger Foo <*****@*****.**> 1640191027 +0200\n\n"
                        b"some release message")
        tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest)
        assert converters.dulwich_tag_to_release(tag) == Release(
            name=b"blah",
            message=b"some release message",
            target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
            target_type=ObjectType.REVISION,
            synthetic=False,
            author=Person.from_fullname(b"Foo <*****@*****.**>", ),
            date=TimestampWithTimezone(
                timestamp=Timestamp(seconds=1640191027, microseconds=0),
                offset_bytes=b"+0200",
            ),
            raw_manifest=None,
        )

        # Mess with the offset (negative UTC)
        raw_manifest2 = raw_manifest.replace(b"+0200", b"-0000")
        tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2)
        assert converters.dulwich_tag_to_release(tag) == Release(
            name=b"blah",
            message=b"some release message",
            target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
            target_type=ObjectType.REVISION,
            synthetic=False,
            author=Person.from_fullname(b"Foo <*****@*****.**>", ),
            date=TimestampWithTimezone(
                timestamp=Timestamp(seconds=1640191027, microseconds=0),
                offset_bytes=b"-0000",
            ),
        )

        # Mess with the offset (other)
        raw_manifest2 = raw_manifest.replace(b"+0200", b"+200")
        tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2)
        assert converters.dulwich_tag_to_release(tag) == Release(
            name=b"blah",
            message=b"some release message",
            target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
            target_type=ObjectType.REVISION,
            synthetic=False,
            author=Person.from_fullname(b"Foo <*****@*****.**>", ),
            date=TimestampWithTimezone(
                timestamp=Timestamp(seconds=1640191027, microseconds=0),
                offset_bytes=b"+200",
            ),
        )

        # Mess with the rest of the manifest
        raw_manifest2 = raw_manifest.replace(
            b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce",
            b"641FB6E08DDB2E4FD096DCF18E80B894BF7E25CE",
        )
        tag = dulwich.objects.Tag.from_raw_string(b"tag", raw_manifest2)
        assert converters.dulwich_tag_to_release(tag) == Release(
            name=b"blah",
            message=b"some release message",
            target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
            target_type=ObjectType.REVISION,
            synthetic=False,
            author=Person.from_fullname(b"Foo <*****@*****.**>", ),
            date=TimestampWithTimezone(
                timestamp=Timestamp(seconds=1640191027, microseconds=0),
                offset_bytes=b"+0200",
            ),
            raw_manifest=b"tag 136\x00" + raw_manifest2,
        )
Example #26
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
    package = "org"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    release_id = "d38cc0b571cd41f3c85513864e049766b42032a7"
    versions = [
        ("0.0.2", release_id),
        ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"),
        ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.4", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.2",
        message=b"Synthetic release for NPM source package org version 0.0.2\n",
        target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"mooz <*****@*****.**>",
            name=b"mooz",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    contents = swh_storage.content_get(_expected_new_contents_first_visit)
    count = sum(0 if content is None else 1 for content in contents)
    assert count == len(_expected_new_contents_first_visit)

    assert (
        list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == []
    )

    assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == []

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://npmjs.com/",
    )

    for (version_name, release_id) in versions:
        release = swh_storage.release_get([hash_to_bytes(release_id)])[0]
        assert release.target_type == ModelObjectType.DIRECTORY
        directory_id = release.target
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY,
            object_id=directory_id,
        )
        release_swhid = CoreSWHID(
            object_type=ObjectType.RELEASE,
            object_id=hash_to_bytes(release_id),
        )
        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.npm.loader.NpmLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="replicate-npm-package-json",
                metadata=json.dumps(
                    json.loads(org_api_info)["versions"][version_name]
                ).encode(),
                origin="https://www.npmjs.com/package/org",
                release=release_swhid,
            )
        ]
        assert swh_storage.raw_extrinsic_metadata_get(
            directory_swhid,
            metadata_authority,
        ) == PagedResult(
            next_page_token=None,
            results=expected_metadata,
        )

    stats = get_stats(swh_storage)

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #27
class StorageData:
    """Data model objects to use within tests."""

    content = Content(
        data=b"42\n",
        length=3,
        sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
        sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        sha256=hash_to_bytes(
            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
        ),
        blake2s256=hash_to_bytes(
            "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
        ),
        status="visible",
    )
    content2 = Content(
        data=b"4242\n",
        length=5,
        sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
        sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
        sha256=hash_to_bytes(
            "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
        ),
        blake2s256=hash_to_bytes(
            "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
        ),
        status="visible",
    )
    content3 = Content(
        data=b"424242\n",
        length=7,
        sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
        sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
        sha256=hash_to_bytes(
            "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
        ),
        blake2s256=hash_to_bytes(
            "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
        ),
        status="visible",
        ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
    )
    contents: Tuple[Content, ...] = (content, content2, content3)

    skipped_content = SkippedContent(
        length=1024 * 1024 * 200,
        sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
        sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
        origin="file:///dev/zero",
    )
    skipped_content2 = SkippedContent(
        length=1024 * 1024 * 300,
        sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
        sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
    )
    skipped_contents: Tuple[SkippedContent,
                            ...] = (skipped_content, skipped_content2)

    directory5 = Directory(
        id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
        entries=(),
    )
    directory = Directory(
        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar\xc3",
                type="dir",
                target=directory5.id,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
    )
    directory2 = Directory(
        id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
        entries=tuple([
            DirectoryEntry(
                name=b"oof",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            )
        ], ),
    )
    directory3 = Directory(
        id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"subdir",
                type="dir",
                target=directory.id,
                perms=from_disk.DentryPerms.directory,
            ),
            DirectoryEntry(
                name=b"hello",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
        ], ),
    )
    directory4 = Directory(
        id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"),
        entries=tuple([
            DirectoryEntry(
                name=b"subdir1",
                type="dir",
                target=directory3.id,
                perms=from_disk.DentryPerms.directory,
            )
        ], ),
    )

    directory6 = Directory(
        id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=b"\x00" * 20,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar",
                type="dir",
                target=b"\x01" * 20,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
        raw_manifest=(
            b"tree 61\x00"
            b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"  # noqa
            b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"  # noqa
        ),
    )

    directories: Tuple[Directory, ...] = (
        directory2,
        directory,
        directory3,
        directory4,
        directory5,
        directory6,
    )

    revision = Revision(
        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
        },
        extra_headers=(
            (b"gpgsig", b"test123"),
            (b"mergetag", b"foo\\bar"),
            (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
        ),
        synthetic=True,
    )
    revision2 = Revision(
        id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    revision3 = Revision(
        id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id, revision2.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=True,
    )
    revision4 = Revision(
        id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([revision3.id]),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    git_revisions: Tuple[Revision,
                         ...] = (revision, revision2, revision3, revision4)

    hg_revision = Revision(
        id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
            "node": "a316dfb434af2b451c1f393496b7eaeda343f543",
        },
        extra_headers=(),
        synthetic=True,
    )
    hg_revision2 = Revision(
        id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=(hg_revision.id,),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")),
        ),
        synthetic=False,
    )
    hg_revision3 = Revision(
        id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=(hg_revision.id, hg_revision2.id),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")),
        ),
        synthetic=True,
    )
    hg_revision4 = Revision(
        id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=(hg_revision3.id,),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")),
        ),
        synthetic=False,
    )
    hg_revisions: Tuple[Revision, ...] = (
        hg_revision,
        hg_revision2,
        hg_revision3,
        hg_revision4,
    )
    revisions: Tuple[Revision, ...] = git_revisions + hg_revisions

    origins: Tuple[Origin, ...] = (
        Origin(url="https://github.com/user1/repo1"),
        Origin(url="https://github.com/user2/repo1"),
        Origin(url="https://github.com/user3/repo1"),
        Origin(url="https://gitlab.com/user1/repo1"),
        Origin(url="https://gitlab.com/user2/repo1"),
        Origin(url="https://forge.softwareheritage.org/source/repo1"),
        Origin(url="https://example.рф/🏛️.txt"),
    )
    origin, origin2 = origins[:2]

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="http://hal.inria.example.com/",
    )
    metadata_authority2 = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="http://wikidata.example.com/",
    )
    authorities: Tuple[MetadataAuthority, ...] = (
        metadata_authority,
        metadata_authority2,
    )

    metadata_fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )
    metadata_fetcher2 = MetadataFetcher(
        name="swh-example",
        version="0.0.1",
    )
    fetchers: Tuple[MetadataFetcher, ...] = (metadata_fetcher, metadata_fetcher2)

    date_visit1 = datetime.datetime(
        2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit2 = datetime.datetime(
        2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit3 = datetime.datetime(
        2018, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)

    type_visit1 = "git"
    type_visit2 = "hg"
    type_visit3 = "deb"

    origin_visit = OriginVisit(
        origin=origin.url,
        visit=1,
        date=date_visit1,
        type=type_visit1,
    )
    origin_visit2 = OriginVisit(
        origin=origin.url,
        visit=2,
        date=date_visit2,
        type=type_visit1,
    )
    origin_visit3 = OriginVisit(
        origin=origin2.url,
        visit=1,
        date=date_visit1,
        type=type_visit2,
    )
    origin_visits: Tuple[OriginVisit, ...] = (
        origin_visit,
        origin_visit2,
        origin_visit3,
    )

    release = Release(
        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
        name=b"v0.0.1",
        author=Person(
            name=b"olasd",
            email=b"*****@*****.**",
            fullname=b"olasd <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0042",
        ),
        target=revision.id,
        target_type=ObjectType.REVISION,
        message=b"synthetic release",
        synthetic=True,
    )
    release2 = Release(
        id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision2.id,
        target_type=ObjectType.REVISION,
        message=b"v0.0.2\nMisc performance improvements + bug fixes",
        synthetic=False,
    )
    release3 = Release(
        id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision3.id,
        target_type=ObjectType.REVISION,
        message=b"yet another synthetic release",
        synthetic=True,
    )

    releases: Tuple[Release, ...] = (release, release2, release3)

    snapshot = Snapshot(
        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
        branches={
            b"master":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    empty_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    complete_snapshot = Snapshot(
        id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"),
        branches={
            b"directory":
            SnapshotBranch(
                target=directory.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"directory2":
            SnapshotBranch(
                target=directory2.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"content":
            SnapshotBranch(
                target=content.sha1_git,
                target_type=TargetType.CONTENT,
            ),
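            # An alias branch targets another branch *name*, not an object id: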
            b"alias":
            SnapshotBranch(
                target=b"revision",
                target_type=TargetType.ALIAS,
            ),
            b"revision":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
            b"release":
            SnapshotBranch(
                target=release.id,
                target_type=TargetType.RELEASE,
            ),
            b"snapshot":
            SnapshotBranch(
                target=empty_snapshot.id,
                target_type=TargetType.SNAPSHOT,
            ),
            b"dangling":
            None,
        },
    )

    snapshots: Tuple[Snapshot, ...] = (snapshot, empty_snapshot, complete_snapshot)

    content_metadata1 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin.url,
        discovery_date=datetime.datetime(
            2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    content_metadata2 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin2.url,
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="yaml",
        metadata=b"foo: bar",
    )
    content_metadata3 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
        origin=origin.url,
        visit=42,
        snapshot=snapshot.swhid(),
        release=release.swhid(),
        revision=revision.swhid(),
        directory=directory.swhid(),
        path=b"/foo/bar",
    )

    content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        content_metadata1,
        content_metadata2,
        content_metadata3,
    )

    origin_metadata1 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    origin_metadata2 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata3 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )

    origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        origin_metadata1,
        origin_metadata2,
        origin_metadata3,
    )

    extid1 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=revision.id),
        extid_type="git",
        extid=revision.id,
    )

    extid2 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=hg_revision.id),
        extid_type="mercurial",
        extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"),
    )

    extid3 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory.id),
        extid_type="directory",
        extid=b"something",
    )
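    # Same extid_type and extid value as extid3, distinguished by extid_version=2.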
    extid4 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory2.id),
        extid_type="directory",
        extid=b"something",
        extid_version=2,
    )

    extids: Tuple[ExtID, ...] = (
        extid1,
        extid2,
        extid3,
        extid4,
    )
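
The fixture definitions end here. As a usage note, these objects can be loaded
into a storage backend through the corresponding *_add endpoints. A minimal
sketch, assuming the attributes above are collected on a class such as
StorageData (hypothetical name, mirroring swh.storage.tests.storage_data) and
that the in-memory backend is acceptable:

from swh.storage import get_storage

storage = get_storage("memory")  # in-memory backend, for illustration only
data = StorageData()  # hypothetical instance of the fixture class above
storage.revision_add(list(data.revisions))  # the git + hg revisions defined above
storage.origin_add(list(data.origins))
storage.release_add(list(data.releases))
storage.snapshot_add(list(data.snapshots))
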
def test_graph_revisions(swh_storage, up_to_date_graph, root_object, tag,
                         weird_branches):
    r"""
    Build objects::

                                     snp
                                    /|||\
                                   / ||| \
                        rel2 <----°  /|\  \----> rel4
                         |          / | \         |
                         v         /  v  \        v
          rev1  <------ rev2 <----°  dir4 \      rel3
           |             |            |    \      |
           v             v            v     \     |
          dir1          dir2         dir3   |     |
           |           /   |          |     |     |
           v          /    v          v     v     v
          cnt1  <----°    cnt2       cnt3  cnt4  cnt5

    If up_to_date_graph is True, then swh-graph contains all objects.
    Otherwise, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph.

    If tag is False, rel2 is excluded.

    If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded.
    """
    from swh.graph.naive_client import NaiveClient as GraphClient

    # Create objects:

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc))
    author = Person.from_fullname(b"Foo <*****@*****.**>")
    cnt1 = Content.from_data(b"correct")
    cnt2 = Content.from_data(b"horse")
    cnt3 = Content.from_data(b"battery")
    cnt4 = Content.from_data(b"staple")
    cnt5 = Content.from_data(b"Tr0ub4dor&3")
    dir1 = Directory(entries=(DirectoryEntry(
        name=b"file1",
        type="file",
        perms=DentryPerms.content,
        target=cnt1.sha1_git,
    ), ))
    dir2 = Directory(entries=(
        DirectoryEntry(
            name=b"file1",
            type="file",
            perms=DentryPerms.content,
            target=cnt1.sha1_git,
        ),
        DirectoryEntry(
            name=b"file2",
            type="file",
            perms=DentryPerms.content,
            target=cnt2.sha1_git,
        ),
    ))
    dir3 = Directory(entries=(DirectoryEntry(
        name=b"file3",
        type="file",
        perms=DentryPerms.content,
        target=cnt3.sha1_git,
    ), ))
    dir4 = Directory(entries=(DirectoryEntry(
        name=b"directory3",
        type="dir",
        perms=DentryPerms.directory,
        target=dir3.id,
    ), ))
    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )
    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir2.id,
        parents=(rev1.id, ),
        type=RevisionType.GIT,
        synthetic=True,
    )

    rel2 = Release(
        name=b"1.0.0",
        message=b"tag2",
        target_type=ObjectType.REVISION,
        target=rev2.id,
        synthetic=True,
    )
    rel3 = Release(
        name=b"1.0.0-blob",
        message=b"tagged-blob",
        target_type=ObjectType.CONTENT,
        target=cnt5.sha1_git,
        synthetic=True,
    )
    rel4 = Release(
        name=b"1.0.0-weird",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel3.id,
        synthetic=True,
    )
    rel5 = Release(
        name=b"1.0.0:weirdname",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel2.id,
        synthetic=True,
    )

    # Create snapshot:

    branches = {
        b"refs/heads/master":
        SnapshotBranch(target=rev2.id, target_type=TargetType.REVISION),
    }
    if tag:
        branches[b"refs/tags/1.0.0"] = SnapshotBranch(
            target=rel2.id, target_type=TargetType.RELEASE)
    if weird_branches:
        branches[b"refs/heads/tree-ref"] = SnapshotBranch(
            target=dir4.id, target_type=TargetType.DIRECTORY)
        branches[b"refs/heads/blob-ref"] = SnapshotBranch(
            target=cnt4.sha1_git, target_type=TargetType.CONTENT)
        branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch(
            target=rel4.id, target_type=TargetType.RELEASE)
    snp = Snapshot(branches=branches)

    # "Fill" swh-graph

    if up_to_date_graph:
        nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (rev1, dir1),
            (rev2, dir2),
            (rev2, rev1),
            (snp, rev2),
        ]
        if tag:
            nodes.append(rel2)
            edges.append((rel2, rev2))
            edges.append((snp, rel2))
        if weird_branches:
            nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5])
            edges.extend([
                (dir3, cnt3),
                (dir4, dir3),
                (snp, dir4),
                (snp, cnt4),
                (snp, rel4),
                (rel4, rel3),
                (rel3, cnt5),
                (rel5, rev2),
            ])
    else:
        nodes = [cnt1, cnt2, cnt3, dir1, dir2, dir3, rev1]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (dir3, cnt3),
            (rev1, dir1),
        ]
        if tag:
            nodes.append(rel2)
        if weird_branches:
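            # Note: cnt3, dir3 and the (dir3, cnt3) edge are already present
            # in the base lists above.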
            nodes.extend([cnt3, dir3])
            edges.extend([(dir3, cnt3)])

    nodes = [str(n.swhid()) for n in nodes]
    edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges]

    # Add all objects to storage
    swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5])
    swh_storage.directory_add([dir1, dir2, dir3, dir4])
    swh_storage.revision_add([rev1, rev2])
    swh_storage.release_add([rel2, rel3, rel4, rel5])
    swh_storage.snapshot_add([snp])

    # Add spy on swh_storage, to make sure revision_log is not called
    # (the graph must be used instead)
    swh_storage = unittest.mock.MagicMock(wraps=swh_storage)

    # Add all objects to graph
    swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))

    # Cook
    backend = InMemoryVaultBackend()
    cooked_swhid = {
        RootObjects.SNAPSHOT: snp.swhid(),
        RootObjects.REVISION: rev2.swhid(),
        RootObjects.RELEASE: rel2.swhid(),
        RootObjects.WEIRD_RELEASE: rel5.swhid(),
    }[root_object]
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    if weird_branches:
        # git-fsck now rejects refs pointing to trees and blobs,
        # but some old git repos have them.
        cooker.use_fsck = False

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION):
            log_head = "master"
        elif root_object == RootObjects.RELEASE:
            log_head = "1.0.0"
        elif root_object == RootObjects.WEIRD_RELEASE:
            log_head = "release"
        else:
            assert False, root_object

        output = subprocess.check_output([
            "git",
            "-C",
            f"{tempdir}/{cooked_swhid}.git",
            "log",
            "--format=oneline",
            "--decorate=",
            log_head,
        ])

        assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"

    # Make sure the graph was used instead of swh_storage.revision_log
    if root_object == RootObjects.SNAPSHOT:
        if up_to_date_graph:
            # The graph has everything, so the first call succeeds and returns
            # all objects transitively pointed by the snapshot
            swh_graph.visit_nodes.assert_has_calls([
                unittest.mock.call(str(snp.swhid()),
                                   edges="snp:*,rel:*,rev:rev"),
            ])
        else:
            # The graph does not have everything, so the first call returns nothing.
            # However, the second call (on the top rev) succeeds and returns
            # all objects but the rev and the rel
            swh_graph.visit_nodes.assert_has_calls([
                unittest.mock.call(str(snp.swhid()),
                                   edges="snp:*,rel:*,rev:rev"),
                unittest.mock.call(str(rev2.swhid()), edges="rev:rev"),
            ])
    elif root_object in (
            RootObjects.REVISION,
            RootObjects.RELEASE,
            RootObjects.WEIRD_RELEASE,
    ):
        swh_graph.visit_nodes.assert_has_calls(
            [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")])
    else:
        assert False, root_object

    if up_to_date_graph:
        swh_storage.revision_log.assert_not_called()
        swh_storage.revision_shortlog.assert_not_called()
    else:
        swh_storage.revision_log.assert_called()
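
A side note on the graph client mocked above: NaiveClient (from
swh.graph.naive_client) is a pure-Python stand-in for the swh-graph RPC API,
and the edges= pattern is what restricts traversal to revision ancestry. A
small sketch with made-up SWHIDs:

from swh.graph.naive_client import NaiveClient

rev_a = "swh:1:rev:" + "aa" * 20
rev_b = "swh:1:rev:" + "bb" * 20
dir_c = "swh:1:dir:" + "cc" * 20

client = NaiveClient(nodes=[rev_a, rev_b, dir_c],
                     edges=[(rev_b, rev_a), (rev_b, dir_c)])

# Only rev->rev edges are followed, so the directory is never reached:
visited = set(client.visit_nodes(rev_b, edges="rev:rev"))
assert rev_a in visited
assert dir_c not in visited
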
Example #29
def test_npm_loader_duplicate_shasum(swh_storage, requests_mock_datadir):
    """Test with two versions that have exactly the same tarball"""
    package = "org_version_mismatch"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("ac867a4c22ba4e22a022d319f309714477412a5a")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    beta_release_id = "e6d5490a02ac2a8dcd49702f9ccd5a64c90a46f1"
    release_id = "f6985f437e28db6eb1b7533230e05ed99f2c91f0"
    versions = [
        ("0.0.3-beta", beta_release_id),
        ("0.0.3", release_id),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.3", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(beta_release_id)])[0] == Release(
        name=b"0.0.3-beta",
        message=(
            b"Synthetic release for NPM source package org_version_mismatch "
            b"version 0.0.3-beta\n"
        ),
        target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(beta_release_id),
    )

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.3",
        message=(
            b"Synthetic release for NPM source package org_version_mismatch "
            b"version 0.0.3\n"
        ),
        target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 55, 45, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    # Check incremental re-load keeps it unchanged

    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status == {
        "status": "uneventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )
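
Two swh.model helpers do the heavy lifting in the expected Release objects
above: Person.from_fullname, which splits a raw b"Name <email>" byte string
into its parts, and TimestampWithTimezone.from_datetime. A quick illustration
with made-up values:

import datetime

from swh.model.model import Person, TimestampWithTimezone

author = Person.from_fullname(b"Jane Doe <jane@example.com>")
assert author.name == b"Jane Doe"
assert author.email == b"jane@example.com"

date = TimestampWithTimezone.from_datetime(
    datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc))
assert date.timestamp.seconds == 1388590833  # epoch seconds for that datetime
assert date.offset_bytes == b"+0000"  # a UTC input yields a +0000 offset
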
def test_checksum_mismatch(swh_storage, mismatch_on):
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc))
    author = Person.from_fullname(b"Foo <*****@*****.**>")

    wrong_hash = b"\x12\x34" * 10  # 20 bytes: a syntactically valid sha1 that matches nothing

    cnt1 = Content.from_data(b"Tr0ub4dor&3")
    if mismatch_on == "content":
        cnt1 = attr.evolve(cnt1, sha1_git=wrong_hash)

    dir1 = Directory(entries=(DirectoryEntry(
        name=b"file1",
        type="file",
        perms=DentryPerms.content,
        target=cnt1.sha1_git,
    ), ))

    if mismatch_on == "directory":
        dir1 = attr.evolve(dir1, id=wrong_hash)

    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )

    if mismatch_on == "revision1":
        rev1 = attr.evolve(rev1, id=wrong_hash)

    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        parents=(rev1.id, ),
        type=RevisionType.GIT,
        synthetic=True,
    )

    if mismatch_on == "revision2":
        rev2 = attr.evolve(rev2, id=wrong_hash)

    cooked_swhid = rev2.swhid()

    swh_storage.content_add([cnt1])
    swh_storage.directory_add([dir1])
    swh_storage.revision_add([rev1, rev2])

    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=None,
    )

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if mismatch_on != "revision2":
            # git-log fails if the head revision is corrupted
            # TODO: we need to find a way to make this somewhat usable
            output = subprocess.check_output([
                "git",
                "-C",
                f"{tempdir}/{cooked_swhid}.git",
                "log",
                "--format=oneline",
                "--decorate=",
            ])

            assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"
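
The mismatch_on branches above all rely on the same trick: swh.model objects
are frozen attrs classes, so a "corrupted" object is built by copying it with
one hash field swapped out via attr.evolve. A minimal sketch:

import attr

from swh.model.model import Content

cnt = Content.from_data(b"Tr0ub4dor&3")
bad = attr.evolve(cnt, sha1_git=b"\x12\x34" * 10)  # same payload, wrong id
assert bad.sha1_git != cnt.sha1_git  # the id no longer matches the data
assert bad.data == cnt.data  # the content bytes themselves are untouched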