Example #1
    def load_repo_null_fields(self, git_loader):
        # Our schema leaves many revision fields nullable. We need to check
        # that these cases don't break the cooker.
        repo = TestRepo()
        with repo as rp:
            (rp / "file").write_text(TEST_CONTENT)
            c = repo.commit("initial commit")
            loader = git_loader(str(rp))
            loader.load()
            repo.repo.refs[b"HEAD"].decode()
            dir_id_hex = repo.repo[c].tree.decode()
            dir_id = hashutil.hash_to_bytes(dir_id_hex)

        test_revision = Revision(
            message=b"",
            author=Person(name=None, email=None, fullname=b""),
            date=None,
            committer=Person(name=None, email=None, fullname=b""),
            committer_date=None,
            parents=(),
            type=RevisionType.GIT,
            directory=dir_id,
            metadata={},
            synthetic=True,
        )

        storage = loader.storage
        storage.revision_add([test_revision])
        return (loader, test_revision.swhid())
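A hedged companion sketch (not part of the original listing): the helper returns the loader and the SWHID of the null-field revision, which a test in the same class could cook and extract through the `cook_extract_revision` fixture used in Example #3 below:

    def test_revision_null_fields(self, git_loader, cook_extract_revision):
        # Cooking must not crash merely because nullable fields are null.
        loader, swhid = self.load_repo_null_fields(git_loader)
        with cook_extract_revision(loader.storage, swhid) as (ert, p):
            ert.checkout(b"HEAD")
            assert (p / "file").read_text() == TEST_CONTENT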
Example #2
@contextlib.contextmanager  # the function yields, so it must be wrapped as a context manager
def cook_extract_directory_gitfast(storage, swhid, fsck=True):
    """Context manager that cooks a revision containing a directory and
    extracts it, using RevisionGitfastCooker"""
    test_repo = TestRepo()
    with test_repo as p:
        date = TimestampWithTimezone.from_datetime(
            datetime.datetime.now(datetime.timezone.utc))
        revision = Revision(
            directory=swhid.object_id,
            message=b"dummy message",
            author=Person.from_fullname(b"someone"),
            committer=Person.from_fullname(b"someone"),
            date=date,
            committer_date=date,
            type=RevisionType.GIT,
            synthetic=False,
        )
        storage.revision_add([revision])

    with cook_stream_revision_gitfast(
            storage, revision.swhid()) as stream, test_repo as p:
        processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
        processor.import_stream(stream)
        test_repo.checkout(b"HEAD")
        shutil.rmtree(p / ".git")
        yield p
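A minimal usage sketch, assuming a `swh_storage` fixture and the SWHID of an already-ingested directory (`dir_swhid`); neither name comes from the snippet itself:

dir_swhid = ...  # CoreSWHID of a directory already present in swh_storage
with cook_extract_directory_gitfast(swh_storage, dir_swhid) as p:
    # p points at the extracted worktree; ".git" was removed above.
    assert (p / "file").exists()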
Example #3
    def test_revision_submodule(self, swh_storage, cook_extract_revision,
                                ingest_target_revision):
        date = TimestampWithTimezone.from_datetime(
            datetime.datetime.now(
                datetime.timezone.utc).replace(microsecond=0))

        target_rev = Revision(
            message=b"target_rev",
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            committer=Person.from_fullname(b"me <*****@*****.**>"),
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=bytes.fromhex(
                "3333333333333333333333333333333333333333"),
            metadata={},
            synthetic=True,
        )
        if ingest_target_revision:
            swh_storage.revision_add([target_rev])

        dir = Directory(
            entries=(
                DirectoryEntry(
                    name=b"submodule",
                    type="rev",
                    target=target_rev.id,
                    perms=0o160000,
                ),
            )
        )
        swh_storage.directory_add([dir])

        rev = Revision(
            message=b"msg",
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            committer=Person.from_fullname(b"me <*****@*****.**>"),
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=dir.id,
            metadata={},
            synthetic=True,
        )
        swh_storage.revision_add([rev])

        with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p):
            ert.checkout(b"HEAD")
            pattern = b"160000 submodule\x00%s" % target_rev.id
            tree = ert.repo[b"HEAD"].tree
            assert pattern in ert.repo[tree].as_raw_string()
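The final assertion relies on git's raw tree encoding: each entry is serialized as "<octal mode> <name>\x00<20-byte binary sha1>", and mode 160000 marks a gitlink (submodule). An illustrative parser for the first entry of a raw tree (a sketch, not part of the test suite):

def parse_first_tree_entry(raw: bytes):
    # Split b"<mode> <name>\x00<sha1>..." into its three parts.
    header, _, rest = raw.partition(b"\x00")
    mode, name = header.split(b" ", 1)
    return int(mode, 8), name, rest[:20]  # perms, entry name, binary sha1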
Example #4
def test_load_upgrade_from_revision_extids(caplog):
    """Tests that, when loading incrementally based on a snapshot made by an old
    version of the loader, the loader will convert revisions to releases
    and add them to the storage.

    Also checks that, if an extid exists pointing to a non-existent revision
    (which should never happen, but you never know...), the release is loaded from
    scratch."""

    storage = get_storage("memory")

    origin = "http://example.org"
    dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"d" * 20)
    dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"e" * 20)

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc))
    person = Person.from_fullname(b"Jane Doe <*****@*****.**>")

    rev1 = Revision(
        message=b"blah",
        author=person,
        date=date,
        committer=person,
        committer_date=date,
        directory=dir1_swhid.object_id,
        type=RevisionType.TAR,
        synthetic=True,
    )

    rel1 = Release(
        name=b"v1.0",
        message=b"blah\n",
        author=person,
        date=date,
        target=dir1_swhid.object_id,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
    )

    rev1_swhid = rev1.swhid()
    rel1_swhid = rel1.swhid()
    rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION,
                           object_id=b"b" * 20)
    rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0),
        ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0),
    ])
    storage.revision_add([rev1])
    last_snapshot = Snapshot(
        branches={
            b"v1.0": SnapshotBranch(
                target_type=TargetType.REVISION, target=rev1_swhid.object_id
            ),
            b"v2.0": SnapshotBranch(
                target_type=TargetType.REVISION, target=rev2_swhid.object_id
            ),
        }
    )
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin, visit=1, date=date, type="tar")]
    )
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, origin)
    patch.object(
        loader,
        "_load_release",
        return_value=(rel2_swhid.object_id, dir2_swhid.object_id),
        autospec=True,
    ).start()
    patch.object(
        loader,
        "get_versions",
        return_value=["v1.0", "v2.0", "v3.0"],
        autospec=True,
    ).start()

    caplog.set_level(logging.ERROR)

    loader.load()

    assert len(caplog.records) == 1
    (record, ) = caplog.records
    assert record.levelname == "ERROR"
    assert "Failed to upgrade branch branch-v2.0" in record.message

    assert loader._load_release.mock_calls == [
        # v1.0: not loaded because there is already a revision matching it
        # v2.0: loaded, as the revision is missing from the storage even though there
        #       is an extid
        call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
             Origin(url=origin)),
        # v3.0: loaded (did not exist yet)
        call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
             Origin(url=origin)),
    ]

    snapshot = Snapshot(
        branches={
            b"branch-v1.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel1_swhid.object_id
            ),
            b"branch-v2.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel2_swhid.object_id
            ),
            b"branch-v3.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel2_swhid.object_id
            ),
        }
    )
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
        ],
    )

    assert set(extids) == {
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid),
    }
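The `mismatch_on` argument of the next test is presumably supplied by a pytest parametrize decorator along these lines (an assumption; the decorator is not part of the snippet):

@pytest.mark.parametrize(
    "mismatch_on", ["content", "directory", "revision1", "revision2"])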
def test_checksum_mismatch(swh_storage, mismatch_on):
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc))
    author = Person.from_fullname(b"Foo <*****@*****.**>")

    wrong_hash = b"\x12\x34" * 10

    cnt1 = Content.from_data(b"Tr0ub4dor&3")
    if mismatch_on == "content":
        cnt1 = attr.evolve(cnt1, sha1_git=wrong_hash)

    dir1 = Directory(entries=(DirectoryEntry(
        name=b"file1",
        type="file",
        perms=DentryPerms.content,
        target=cnt1.sha1_git,
    ), ))

    if mismatch_on == "directory":
        dir1 = attr.evolve(dir1, id=wrong_hash)

    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )

    if mismatch_on == "revision1":
        rev1 = attr.evolve(rev1, id=wrong_hash)

    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        parents=(rev1.id, ),
        type=RevisionType.GIT,
        synthetic=True,
    )

    if mismatch_on == "revision2":
        rev2 = attr.evolve(rev2, id=wrong_hash)

    cooked_swhid = rev2.swhid()

    swh_storage.content_add([cnt1])
    swh_storage.directory_add([dir1])
    swh_storage.revision_add([rev1, rev2])

    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=None,
    )

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if mismatch_on != "revision2":
            # git-log fails if the head revision is corrupted
            # TODO: we need to find a way to make this somewhat usable
            output = subprocess.check_output([
                "git",
                "-C",
                f"{tempdir}/{cooked_swhid}.git",
                "log",
                "--format=oneline",
                "--decorate=",
            ])

            assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"
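Likewise, the four flags of `test_graph_revisions` below look pytest-parametrized; a hedged guess at the decorators (not shown in the snippet, and assuming `RootObjects` is an enum):

@pytest.mark.parametrize("up_to_date_graph", [True, False])
@pytest.mark.parametrize("root_object", list(RootObjects))
@pytest.mark.parametrize("tag", [True, False])
@pytest.mark.parametrize("weird_branches", [True, False])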
def test_graph_revisions(swh_storage, up_to_date_graph, root_object, tag,
                         weird_branches):
    r"""
    Build objects::

                                     snp
                                    /|||\
                                   / ||| \
                        rel2 <----°  /|\  \----> rel4
                         |          / | \         |
                         v         /  v  \        v
          rev1  <------ rev2 <----°  dir4 \      rel3
           |             |            |    \      |
           v             v            v     \     |
          dir1          dir2         dir3   |     |
           |           /   |          |     |     |
           v          /    v          v     v     v
          cnt1  <----°    cnt2       cnt3  cnt4  cnt5

    If up_to_date_graph is true, then swh-graph contains all objects.
    Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph.

    If tag is False, rel2 is excluded.

    If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded.
    """
    from swh.graph.naive_client import NaiveClient as GraphClient

    # Create objects:

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc))
    author = Person.from_fullname(b"Foo <*****@*****.**>")
    cnt1 = Content.from_data(b"correct")
    cnt2 = Content.from_data(b"horse")
    cnt3 = Content.from_data(b"battery")
    cnt4 = Content.from_data(b"staple")
    cnt5 = Content.from_data(b"Tr0ub4dor&3")
    dir1 = Directory(entries=(DirectoryEntry(
        name=b"file1",
        type="file",
        perms=DentryPerms.content,
        target=cnt1.sha1_git,
    ), ))
    dir2 = Directory(entries=(
        DirectoryEntry(
            name=b"file1",
            type="file",
            perms=DentryPerms.content,
            target=cnt1.sha1_git,
        ),
        DirectoryEntry(
            name=b"file2",
            type="file",
            perms=DentryPerms.content,
            target=cnt2.sha1_git,
        ),
    ))
    dir3 = Directory(entries=(DirectoryEntry(
        name=b"file3",
        type="file",
        perms=DentryPerms.content,
        target=cnt3.sha1_git,
    ), ))
    dir4 = Directory(entries=(DirectoryEntry(
        name=b"directory3",
        type="dir",
        perms=DentryPerms.directory,
        target=dir3.id,
    ), ))
    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )
    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir2.id,
        parents=(rev1.id, ),
        type=RevisionType.GIT,
        synthetic=True,
    )

    rel2 = Release(
        name=b"1.0.0",
        message=b"tag2",
        target_type=ObjectType.REVISION,
        target=rev2.id,
        synthetic=True,
    )
    rel3 = Release(
        name=b"1.0.0-blob",
        message=b"tagged-blob",
        target_type=ObjectType.CONTENT,
        target=cnt5.sha1_git,
        synthetic=True,
    )
    rel4 = Release(
        name=b"1.0.0-weird",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel3.id,
        synthetic=True,
    )
    rel5 = Release(
        name=b"1.0.0:weirdname",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel2.id,
        synthetic=True,
    )

    # Create snapshot:

    branches = {
        b"refs/heads/master": SnapshotBranch(
            target=rev2.id, target_type=TargetType.REVISION
        ),
    }
    if tag:
        branches[b"refs/tags/1.0.0"] = SnapshotBranch(
            target=rel2.id, target_type=TargetType.RELEASE)
    if weird_branches:
        branches[b"refs/heads/tree-ref"] = SnapshotBranch(
            target=dir4.id, target_type=TargetType.DIRECTORY)
        branches[b"refs/heads/blob-ref"] = SnapshotBranch(
            target=cnt4.sha1_git, target_type=TargetType.CONTENT)
        branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch(
            target=rel4.id, target_type=TargetType.RELEASE)
    snp = Snapshot(branches=branches)

    # "Fill" swh-graph

    if up_to_date_graph:
        nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (rev1, dir1),
            (rev2, dir2),
            (rev2, rev1),
            (snp, rev2),
        ]
        if tag:
            nodes.append(rel2)
            edges.append((rel2, rev2))
            edges.append((snp, rel2))
        if weird_branches:
            nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5])
            edges.extend([
                (dir3, cnt3),
                (dir4, dir3),
                (snp, dir4),
                (snp, cnt4),
                (snp, rel4),
                (rel4, rel3),
                (rel3, cnt5),
                (rel5, rev2),
            ])
    else:
        nodes = [cnt1, cnt2, cnt3, dir1, dir2, dir3, rev1]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (dir3, cnt3),
            (rev1, dir1),
        ]
        if tag:
            nodes.append(rel2)
        if weird_branches:
            nodes.extend([cnt3, dir3])
            edges.extend([(dir3, cnt3)])

    nodes = [str(n.swhid()) for n in nodes]
    edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges]

    # Add all objects to storage
    swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5])
    swh_storage.directory_add([dir1, dir2, dir3, dir4])
    swh_storage.revision_add([rev1, rev2])
    swh_storage.release_add([rel2, rel3, rel4, rel5])
    swh_storage.snapshot_add([snp])

    # Add spy on swh_storage, to make sure revision_log is not called
    # (the graph must be used instead)
    swh_storage = unittest.mock.MagicMock(wraps=swh_storage)

    # Add all objects to graph
    swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))

    # Cook
    backend = InMemoryVaultBackend()
    cooked_swhid = {
        RootObjects.SNAPSHOT: snp.swhid(),
        RootObjects.REVISION: rev2.swhid(),
        RootObjects.RELEASE: rel2.swhid(),
        RootObjects.WEIRD_RELEASE: rel5.swhid(),
    }[root_object]
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    if weird_branches:
        # git-fsck now rejects refs pointing to trees and blobs,
        # but some old git repos have them.
        cooker.use_fsck = False

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION):
            log_head = "master"
        elif root_object == RootObjects.RELEASE:
            log_head = "1.0.0"
        elif root_object == RootObjects.WEIRD_RELEASE:
            log_head = "release"
        else:
            assert False, root_object

        output = subprocess.check_output([
            "git",
            "-C",
            f"{tempdir}/{cooked_swhid}.git",
            "log",
            "--format=oneline",
            "--decorate=",
            log_head,
        ])

        assert output.decode() == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"

    # Make sure the graph was used instead of swh_storage.revision_log
    if root_object == RootObjects.SNAPSHOT:
        if up_to_date_graph:
            # The graph has everything, so the first call succeeds and returns
            # all objects transitively reachable from the snapshot.
            swh_graph.visit_nodes.assert_has_calls([
                unittest.mock.call(str(snp.swhid()),
                                   edges="snp:*,rel:*,rev:rev"),
            ])
        else:
            # The graph does not have everything, so the first call returns nothing.
            # However, the second call (on the top rev) succeeds and returns
            # all objects but the rev and the rel
            swh_graph.visit_nodes.assert_has_calls([
                unittest.mock.call(str(snp.swhid()),
                                   edges="snp:*,rel:*,rev:rev"),
                unittest.mock.call(str(rev2.swhid()), edges="rev:rev"),
            ])
    elif root_object in (
            RootObjects.REVISION,
            RootObjects.RELEASE,
            RootObjects.WEIRD_RELEASE,
    ):
        swh_graph.visit_nodes.assert_has_calls(
            [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")])
    else:
        assert False, root_object

    if up_to_date_graph:
        swh_storage.revision_log.assert_not_called()
        swh_storage.revision_shortlog.assert_not_called()
    else:
        swh_storage.revision_log.assert_called()
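For reference, the NaiveClient wrapped above consumes plain SWHID strings, and the cooker drives it through visit_nodes() as asserted earlier. A standalone sketch with made-up identifiers (assuming visit_nodes returns an iterable of SWHID strings, as that usage suggests):

from swh.graph.naive_client import NaiveClient

rev = "swh:1:rev:" + "00" * 20
dir_ = "swh:1:dir:" + "11" * 20
client = NaiveClient(nodes=[rev, dir_], edges=[(rev, dir_)])
# Walk only rev->dir edges starting from the revision, as the cooker does.
print(list(client.visit_nodes(rev, edges="rev:dir")))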