def test_origin_visit_stats_upsert_batch(self, swh_scheduler) -> None:
        """Batch upsert is ok"""
        visit_stats = [
            OriginVisitStats(
                url="foo",
                visit_type="git",
                last_eventful=utcnow(),
                last_uneventful=None,
                last_failed=None,
                last_notfound=None,
                last_snapshot=hash_to_bytes(
                    "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
            ),
            OriginVisitStats(
                url="bar",
                visit_type="git",
                last_eventful=None,
                last_uneventful=utcnow(),
                last_notfound=None,
                last_failed=None,
                last_snapshot=hash_to_bytes(
                    "fffcc0710eb6cf9efd5b920a8453e1e07157bfff"),
            ),
        ]

        swh_scheduler.origin_visit_stats_upsert(visit_stats)

        for visit_stat in swh_scheduler.origin_visit_stats_get([
            (vs.url, vs.visit_type) for vs in visit_stats
        ]):
            assert visit_stat is not None
Example #2
    def diff_revisions(
        self,
        rev_from,
        rev_to,
        from_dir_model,
        to_dir_model,
        expected_changes,
        mock_get_dir,
        mock_get_rev,
    ):

        rev_from_bytes = hash_to_bytes(rev_from)
        rev_to_bytes = hash_to_bytes(rev_to)

        def _get_rev(*args, **kwargs):
            if args[1] == rev_from_bytes:
                return {"directory": from_dir_model["target"]}
            else:
                return {"directory": to_dir_model["target"]}

        def _get_dir(*args, **kwargs):
            from_dir = from_dir_model.get_hash_data(args[1])
            to_dir = to_dir_model.get_hash_data(args[1])
            return from_dir if from_dir is not None else to_dir

        mock_get_rev.side_effect = _get_rev
        mock_get_dir.side_effect = _get_dir

        changes = diff.diff_revisions(
            None, rev_from_bytes, rev_to_bytes, track_renaming=True
        )

        self.assertEqual(changes, expected_changes)
def test_hash_collision_exception():
    hex_hash_id = "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
    hash_id = hashutil.hash_to_bytes(hex_hash_id)

    content = {
        "blake2s256":
        hashutil.hash_to_bytes(
            "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
        ),
        "sha1_git":
        hashutil.hash_to_bytes("ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
        "sha256":
        hashutil.hash_to_bytes(
            "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
        ),
        "sha1":
        hash_id,
    }

    exc = HashCollision("sha1", hash_id, [content])

    assert exc.algo == "sha1"
    assert exc.hash_id == hex_hash_id
    assert exc.colliding_contents == [content_hex_hashes(content)]

    assert exc.colliding_content_hashes() == [content]
def test_journal_client_origin_visit_status_from_journal_last_successful(
        swh_scheduler):
    visit_statuses = [
        {
            "origin": "bar",
            "visit": 1,
            "status": "partial",
            "date": utcnow(),
            "type": "git",
            "snapshot":
            hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        },
        {
            "origin": "foo",
            "visit": 1,
            "status": "full",
            "date": DATE1,
            "type": "git",
            "snapshot":
            hash_to_bytes("eeecc0710eb6cf9efd5b920a8453e1e07157bfff"),
        },
        {
            "origin": "foo",
            "visit": 2,
            "status": "partial",
            "date": DATE2,
            "type": "git",
            "snapshot":
            hash_to_bytes("aaacc0710eb6cf9efd5b920a8453e1e07157baaa"),
        },
        {
            "origin": "foo",
            "visit": 3,
            "status": "full",
            "date": DATE3,
            "type": "git",
            "snapshot":
            hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
        },
    ]

    process_journal_objects({"origin_visit_status": visit_statuses},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo",
                                                                       "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_successful=DATE3,
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes(
                "dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
            next_position_offset=0,
            successive_visits=3,
        ),
    )
Example #5
def fill_idx_storage(idx_storage: IndexerStorageInterface,
                     nb_rows: int) -> List[int]:
    tools: List[Dict[str, Any]] = [{
        "tool_name": "tool %d" % i,
        "tool_version": "0.0.1",
        "tool_configuration": {},
    } for i in range(2)]
    tools = idx_storage.indexer_configuration_add(tools)

    origin_metadata = [
        OriginIntrinsicMetadataRow(
            id="file://dev/%04d" % origin_id,
            from_revision=hash_to_bytes("abcd{:0>36}".format(origin_id)),
            indexer_configuration_id=tools[origin_id % 2]["id"],
            metadata={"name": "origin %d" % origin_id},
            mappings=["mapping%d" % (origin_id % 10)],
        ) for origin_id in range(nb_rows)
    ]
    revision_metadata = [
        RevisionIntrinsicMetadataRow(
            id=hash_to_bytes("abcd{:0>36}".format(origin_id)),
            indexer_configuration_id=tools[origin_id % 2]["id"],
            metadata={"name": "origin %d" % origin_id},
            mappings=["mapping%d" % (origin_id % 10)],
        ) for origin_id in range(nb_rows)
    ]

    idx_storage.revision_intrinsic_metadata_add(revision_metadata)
    idx_storage.origin_intrinsic_metadata_add(origin_metadata)

    return [tool["id"] for tool in tools]
Example #6
def complete_deposit(sample_archive, deposit_collection, authenticated_client):
    """Returns a completed deposit (load success)"""
    deposit = create_deposit(
        authenticated_client,
        deposit_collection.name,
        sample_archive,
        external_id="external-id-complete",
        deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS,
    )
    origin = "https://hal.archives-ouvertes.fr/hal-01727745"
    directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b"
    release_id = hash_to_bytes("548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10")
    snapshot_id = hash_to_bytes("e5e82d064a9c3df7464223042e0c55d72ccff7f0")
    deposit.swhid = f"swh:1:dir:{directory_id}"
    deposit.swhid_context = str(
        QualifiedSWHID(
            object_type=ObjectType.DIRECTORY,
            object_id=hash_to_bytes(directory_id),
            origin=origin,
            visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id),
            anchor=CoreSWHID(object_type=ObjectType.RELEASE, object_id=release_id),
            path=b"/",
        )
    )
    deposit.save()
    return deposit
def check_revisions_ordering(
    mocker,
    rev_walker_type,
    expected_result,
    truncated_history,
    revisions_list=_revisions_list,
):
    storage = mocker.patch("swh.storage.postgresql.storage.Storage")

    if not truncated_history:
        storage.revision_log.return_value = revisions_list
    else:
        revs_lists_truncated = [
            None if hash_to_hex(rev["id"]) == _rev_missing else rev
            for rev in revisions_list
        ]

        storage.revision_log.return_value = revs_lists_truncated

    revs_walker = get_revisions_walker(
        rev_walker_type, storage, hash_to_bytes(_rev_start)
    )

    assert list(map(hash_to_bytes, expected_result)) == [
        rev["id"] for rev in revs_walker
    ]

    assert revs_walker.is_history_truncated() == truncated_history

    if truncated_history:
        missing_revs = revs_walker.missing_revisions()
        assert missing_revs == {hash_to_bytes(_rev_missing)}
    else:
        assert revs_walker.missing_revisions() == set()
    def setUp(self):
        super().setUp()
        # replace actual license computation with a mock
        self.orig_compute_license = fossology_license.compute_license
        fossology_license.compute_license = mock_compute_license

        self.indexer = FossologyLicenseIndexer(CONFIG)
        self.indexer.catch_exceptions = False
        self.idx_storage = self.indexer.idx_storage
        fill_storage(self.indexer.storage)
        fill_obj_storage(self.indexer.objstorage)

        self.id0 = "01c9379dfc33803963d07c1ccc748d3fe4c96bb5"
        self.id1 = "688a5ef812c53907562fe379d4b3851e69c7cb15"
        self.id2 = "da39a3ee5e6b4b0d3255bfef95601890afd80709"  # empty content

        tool = {
            k.replace("tool_", ""): v
            for (k, v) in self.indexer.tool.items()
        }
        # then
        self.expected_results = [
            *[
                ContentLicenseRow(
                    id=hash_to_bytes(self.id0), tool=tool, license=license)
                for license in SHA1_TO_LICENSES[self.id0]
            ],
            *[
                ContentLicenseRow(
                    id=hash_to_bytes(self.id1), tool=tool, license=license)
                for license in SHA1_TO_LICENSES[self.id1]
            ],
            *[],  # self.id2
        ]
    def test_dulwich_tag_to_release_no_author_no_date(self):
        sha = hash_to_bytes("f6e367357b446bd1315276de5e88ba3d0d99e136")
        target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
        message = b"some release message"
        tag = dulwich.objects.Tag()
        tag.name = b"blah"
        tag.object = (dulwich.objects.Commit, target)
        tag.message = message
        tag.signature = None
        tag.tagger = None
        tag.tag_time = None
        tag.tag_timezone = None
        assert tag.sha().digest() == sha

        # when
        actual_release = converters.dulwich_tag_to_release(tag)

        # then
        expected_release = Release(
            author=None,
            date=None,
            id=sha,
            message=message,
            metadata=None,
            name=b"blah",
            synthetic=False,
            target=hash_to_bytes(target.decode()),
            target_type=ObjectType.REVISION,
        )

        assert actual_release == expected_release
    def test_dulwich_tag_to_release_signature(self):
        target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
        message = b"some release message"
        sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71")
        tag = dulwich.objects.Tag()
        tag.name = b"blah"
        tag.object = (dulwich.objects.Commit, target)
        tag.message = message
        tag.signature = GPGSIG
        tag.tagger = None
        tag.tag_time = None
        tag.tag_timezone = None
        assert tag.sha().digest() == sha

        # when
        actual_release = converters.dulwich_tag_to_release(tag)

        # then
        expected_release = Release(
            author=None,
            date=None,
            id=sha,
            message=message + GPGSIG,
            metadata=None,
            name=b"blah",
            synthetic=False,
            target=hash_to_bytes(target.decode()),
            target_type=ObjectType.REVISION,
        )

        assert actual_release == expected_release
def test_journal_client_origin_visit_status_duplicated_messages(swh_scheduler):
    """A duplicated message must be ignored"""
    visit_status = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE1,
        "type": "git",
        "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
    }

    process_journal_objects({"origin_visit_status": [visit_status]},
                            scheduler=swh_scheduler)

    process_journal_objects({"origin_visit_status": [visit_status]},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo",
                                                                       "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_successful=DATE1,
            last_visit=DATE1,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes(
                "aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
            successive_visits=1,
        ),
    )
Example #12
def test_npm_artifact_use_mtime_if_no_time(swh_storage, requests_mock_datadir):
    """With no time upload, artifact is skipped"""
    package = "jammit-express"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("33b8f105d48ce16b6c59158af660e0cc78bcbef4")

    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    # artifact is used
    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target_type=TargetType.ALIAS, target=b"releases/0.0.1"
            ),
            b"releases/0.0.1": SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes("3e3b800570869fa9b3dbc302500553e62400cc06"),
            ),
        },
    )

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)
    def test_weird_tree(self):
        """Tests a tree with entries the wrong order"""

        raw_manifest = (
            b"0644 file2\x00"
            b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce"
            b"0644 file1\x00"
            b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce"
        )

        tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_manifest)

        assert converters.dulwich_tree_to_directory(tree) == Directory(
            entries=(
                # in alphabetical order, as it should be
                DirectoryEntry(
                    name=b"file1",
                    type="file",
                    target=hash_to_bytes(
                        "641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
                    perms=0o644,
                ),
                DirectoryEntry(
                    name=b"file2",
                    type="file",
                    target=hash_to_bytes(
                        "641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
                    perms=0o644,
                ),
            ),
            raw_manifest=b"tree 62\x00" + raw_manifest,
        )
Example #14
def test_pypi_visit_1_release_with_2_artifacts(swh_storage, requests_mock_datadir):
    """With no prior visit, load a pypi project ends up with 1 snapshot"""
    url = "https://pypi.org/project/nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("1394b2e59351a944cc763bd9d26d90ce8e8121a8")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"releases/1.1.0/nexter-1.1.0.zip": SnapshotBranch(
                target=hash_to_bytes("f7d43faeb65b64d3faa67e4f46559db57d26b9a4"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.1.0/nexter-1.1.0.tar.gz": SnapshotBranch(
                target=hash_to_bytes("732bb9dc087e6015884daaebb8b82559be729b5a"),
                target_type=TargetType.RELEASE,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)
    def test_load(self):
        with requests_mock.Mocker() as m:
            for file_ in self.files.values():
                path = os.path.join(RESOURCES_PATH, file_['name'])
                with open(path, 'rb') as fd:
                    m.get(file_['uri'], content=fd.read())
            self._load()

        self.assertCountSnapshots(1)
        self.assertCountReleases(0)  # FIXME: Why not 1?
        self.assertCountRevisions(1)
        self.assertCountDirectories(14)
        self.assertCountContents(315)

        # Check the root dir was loaded, and contains 'src/'
        root_hash = 'c906789049d2327a69b81cca6a1c1737321c836f'
        ls_root = list(self.storage.directory_ls(hash_to_bytes(root_hash)))
        src_dirs = [x for x in ls_root if x['name'] == b'src']
        self.assertEqual(len(src_dirs), 1, src_dirs)

        # Check 'src/hello.c' exists
        src_dir_hash = src_dirs[0]['target']
        ls_src = list(self.storage.directory_ls(src_dir_hash))
        hello_c = [x for x in ls_src if x['name'] == b'hello.c']
        self.assertEqual(len(hello_c), 1, hello_c)

        # Check 'src/hello.c' was loaded and has the right hash
        hello_c_hash = 'b60a061ac9dd25b29d57b756b5959aadc1fe6386'
        self.assertEqual(hello_c[0]['sha1'], hash_to_bytes(hello_c_hash))

        missing = list(
            self.storage.content_missing([{
                'sha1': hash_to_bytes(hello_c_hash)
            }]))
        self.assertEqual(missing, [])
Example #16
    def test_load_empty_tree(self):
        empty_dir_id = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

        # Check the empty tree does not already exist for some reason
        # (it would make this test pointless)
        assert list(
            self.loader.storage.directory_missing([hash_to_bytes(empty_dir_id)])
        ) == [hash_to_bytes(empty_dir_id)]

        empty_tree = dulwich.objects.Tree()
        assert empty_tree.id.decode() == empty_dir_id
        self.repo.object_store.add_object(empty_tree)

        self.repo.do_commit(b"remove all bugs\n", tree=empty_tree.id)

        res = self.loader.load()
        assert res == {"status": "eventful"}

        assert (
            list(self.loader.storage.directory_missing([hash_to_bytes(empty_dir_id)]))
            == []
        )
        results = self.loader.storage.directory_get_entries(hash_to_bytes(empty_dir_id))
        assert results.next_page_token is None
        assert results.results == []
def test_eoferror(swh_storage, requests_mock_datadir):
    """Load a truncated archive which is invalid to make the uncompress
    function raising the exception EOFError. We then check if a
    snapshot is created, meaning this error is well managed.

    """
    sources = (
        "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json"  # noqa
    )
    loader = NixGuixLoader(swh_storage, sources)
    loader.load()

    expected_snapshot = Snapshot(
        id=hash_to_bytes("4257fa2350168c6bfec726a06452ea27a2c0cb33"),
        branches={
            b"evaluation":
            SnapshotBranch(
                target=hash_to_bytes(
                    "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"),
                target_type=TargetType.REVISION,
            ),
        },
    )

    check_snapshot(expected_snapshot, storage=swh_storage)
Example #18
def test_api_vault_cook_notfound(api_client, mocker, directory, revision,
                                 unknown_directory, unknown_revision):
    mock_vault = mocker.patch("swh.web.common.archive.vault")
    mock_vault.cook.side_effect = NotFoundExc("object not found")
    mock_vault.fetch.side_effect = NotFoundExc("cooked archive not found")
    mock_vault.progress.side_effect = NotFoundExc("cooking request not found")

    for obj_type, obj_id in (
        ("directory", directory),
        ("revision_gitfast", revision),
    ):

        obj_name = obj_type.split("_")[0]

        url = reverse(
            f"api-1-vault-cook-{obj_type}",
            url_args={f"{obj_type[:3]}_id": obj_id},
        )

        rv = check_api_get_responses(api_client, url, status_code=404)

        assert rv.data["exception"] == "NotFoundExc"
        assert (rv.data["reason"] ==
                f"Cooking of {obj_name} '{obj_id}' was never requested.")
        mock_vault.progress.assert_called_with(obj_type,
                                               hashutil.hash_to_bytes(obj_id))

    for obj_type, obj_id in (
        ("directory", unknown_directory),
        ("revision_gitfast", unknown_revision),
    ):
        obj_name = obj_type.split("_")[0]

        url = reverse(f"api-1-vault-cook-{obj_type}",
                      url_args={f"{obj_type[:3]}_id": obj_id})
        rv = check_api_post_responses(api_client,
                                      url,
                                      data=None,
                                      status_code=404)

        assert rv.data["exception"] == "NotFoundExc"
        assert rv.data["reason"] == f"{obj_name.title()} '{obj_id}' not found."
        mock_vault.cook.assert_called_with(obj_type,
                                           hashutil.hash_to_bytes(obj_id),
                                           email=None)

        fetch_url = reverse(
            f"api-1-vault-fetch-{obj_type}",
            url_args={f"{obj_type[:3]}_id": obj_id},
        )

        rv = check_api_get_responses(api_client, fetch_url, status_code=404)
        assert rv.data["exception"] == "NotFoundExc"
        assert (rv.data["reason"] ==
                f"Cooked archive for {obj_name} '{obj_id}' not found.")
        mock_vault.fetch.assert_called_with(obj_type,
                                            hashutil.hash_to_bytes(obj_id))
Example #19
    def test_api_release_target_type_not_a_revision(self, new_rel1, new_rel2,
                                                    new_rel3, content,
                                                    directory, release):

        for new_rel_id, target_type, target in (
                (new_rel1, 'content', content),
                (new_rel2, 'directory', directory),
                (new_rel3, 'release', release)):

            if target_type == 'content':
                target = target['sha1_git']

            sample_release = {
                'author': {
                    'email': b'*****@*****.**',
                    'fullname': b'author <*****@*****.**>',
                    'name': b'author'
                },
                'date': {
                    'timestamp': int(datetime.now().timestamp()),
                    'offset': 0,
                    'negative_utc': False,
                },
                'id': hash_to_bytes(new_rel_id),
                'message': b'sample release message',
                'name': b'sample release',
                'synthetic': False,
                'target': hash_to_bytes(target),
                'target_type': target_type
            }

            self.storage.release_add([sample_release])

            url = reverse('api-release', url_args={'sha1_git': new_rel_id})

            rv = self.client.get(url)

            expected_release = self.release_get(new_rel_id)

            author_id = expected_release['author']['id']
            author_url = reverse('api-person',
                                 url_args={'person_id': author_id})

            if target_type == 'content':
                url_args = {'q': 'sha1_git:%s' % target}
            else:
                url_args = {'sha1_git': target}

            target_url = reverse('api-%s' % target_type,
                                 url_args=url_args)
            expected_release['author_url'] = author_url
            expected_release['target_url'] = target_url

            self.assertEqual(rv.status_code, 200)
            self.assertEqual(rv['Content-Type'], 'application/json')
            self.assertEqual(rv.data, expected_release)
Example #20
    def test_from_release(self):
        release_input = {
            'id': hashutil.hash_to_bytes(
                'aad23fa492a0c5fed0708a6703be875448c86884'),
            'target': hashutil.hash_to_bytes(
                '5e46d564378afc44b31bb89f99d5675195fbdf67'),
            'target_type': 'revision',
            'date': {
                'timestamp': datetime.datetime(
                    2015, 1, 1, 22, 0, 0,
                    tzinfo=datetime.timezone.utc).timestamp(),
                'offset': 0,
                'negative_utc': False,
            },
            'author': {
                'name': b'author name',
                'fullname': b'Author Name author@email',
                'email': b'author@email',
            },
            'name': b'v0.0.1',
            'message': b'some comment on release',
            'synthetic': True,
        }

        expected_release = {
            'id': 'aad23fa492a0c5fed0708a6703be875448c86884',
            'target': '5e46d564378afc44b31bb89f99d5675195fbdf67',
            'target_type': 'revision',
            'date': '2015-01-01T22:00:00+00:00',
            'author': {
                'name': 'author name',
                'fullname': 'Author Name author@email',
                'email': 'author@email',
            },
            'name': 'v0.0.1',
            'message': 'some comment on release',
            'synthetic': True,
        }

        # when
        actual_release = converters.from_release(release_input)

        # then
        self.assertEqual(actual_release, expected_release)
Example #21
def test_api_vault_cook(api_client, mocker, directory, revision):
    mock_archive = mocker.patch("swh.web.api.views.vault.archive")

    for obj_type, obj_id in (
        ("directory", directory),
        ("revision_gitfast", revision),
    ):

        fetch_url = reverse(
            f"api-1-vault-fetch-{obj_type}",
            url_args={f"{obj_type[:3]}_id": obj_id},
        )
        stub_cook = {
            "type": obj_type,
            "progress_msg": None,
            "task_id": 1,
            "task_status": "done",
            "object_id": obj_id,
        }
        stub_fetch = b"content"

        mock_archive.vault_cook.return_value = stub_cook
        mock_archive.vault_fetch.return_value = stub_fetch

        email = "*****@*****.**"
        url = reverse(
            f"api-1-vault-cook-{obj_type}",
            url_args={f"{obj_type[:3]}_id": obj_id},
            query_params={"email": email},
        )

        rv = check_api_post_responses(api_client,
                                      url,
                                      data=None,
                                      status_code=200)
        assert rv.data == {
            "fetch_url": rv.wsgi_request.build_absolute_uri(fetch_url),
            "obj_type": obj_type,
            "progress_message": None,
            "id": 1,
            "status": "done",
            "obj_id": obj_id,
        }
        mock_archive.vault_cook.assert_called_with(
            obj_type, hashutil.hash_to_bytes(obj_id), email)

        rv = check_http_get_response(api_client, fetch_url, status_code=200)
        assert rv["Content-Type"] == "application/gzip"
        assert rv.content == stub_fetch
        mock_archive.vault_fetch.assert_called_with(
            obj_type, hashutil.hash_to_bytes(obj_id))
Example #22
def test_pypi_visit_with_missing_artifact(
    swh_storage, requests_mock_datadir_missing_one
):
    """Load a pypi project with some missing artifacts ends up with 1 snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("00785a38479abe5fbfa402df96be26d2ddf89c97")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage,
        url,
        status="partial",
        type="pypi",
        snapshot=expected_snapshot_id,
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=swh_storage)

    stats = get_stats(swh_storage)

    assert {
        "content": 3,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
    def process_put(
        self,
        request,
        headers: ParsedRequestHeaders,
        collection_name: str,
        deposit: Deposit,
    ) -> None:
        """Update the deposit with status and SWHIDs

        Returns:
            204 No content
            400 Bad request if checks fail

        """
        data = request.data

        status = data["status"]
        deposit.status = status
        if status == DEPOSIT_STATUS_LOAD_SUCCESS:
            origin_url = data["origin_url"]
            directory_id = data["directory_id"]
            release_id = data["release_id"]
            dir_id = CoreSWHID(object_type=ObjectType.DIRECTORY,
                               object_id=hash_to_bytes(directory_id))
            snp_id = CoreSWHID(
                object_type=ObjectType.SNAPSHOT,
                object_id=hash_to_bytes(data["snapshot_id"]),
            )
            rel_id = CoreSWHID(object_type=ObjectType.RELEASE,
                               object_id=hash_to_bytes(release_id))

            deposit.swhid = str(dir_id)
            # new id with contextual information
            deposit.swhid_context = str(
                QualifiedSWHID(
                    object_type=ObjectType.DIRECTORY,
                    object_id=hash_to_bytes(directory_id),
                    origin=origin_url,
                    visit=snp_id,
                    anchor=rel_id,
                    path="/",
                ))
        else:  # rejected
            deposit.status = status

        if "status_detail" in data:
            deposit.status_detail = data["status_detail"]

        deposit.save()
Example #24
def test_lookup_missing_hashes_some_present(archive_data, content, directory):
    missing_rev = random_sha1()
    missing_rel = random_sha1()
    missing_snp = random_sha1()

    grouped_swhids = {
        CONTENT: [hash_to_bytes(content["sha1_git"])],
        DIRECTORY: [hash_to_bytes(directory)],
        REVISION: [hash_to_bytes(missing_rev)],
        RELEASE: [hash_to_bytes(missing_rel)],
        SNAPSHOT: [hash_to_bytes(missing_snp)],
    }

    actual_result = archive.lookup_missing_hashes(grouped_swhids)

    assert actual_result == {missing_rev, missing_rel, missing_snp}
    def test_commit_without_manifest(self):
        """Tests a Revision can still be produced when the manifest is not
        understood by the custom parser in dulwich_commit_to_revision."""
        target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
        message = b"some commit message"
        author = Person(fullname=b"Foo <*****@*****.**>",
                        name=b"Foo",
                        email=b"*****@*****.**")
        commit = dulwich.objects.Commit()
        commit.tree = target
        commit.message = message
        commit.author = commit.committer = b"Foo <*****@*****.**>"
        commit.author_time = commit.commit_time = 1641980946
        commit.author_timezone = commit.commit_timezone = 3600
        assert converters.dulwich_commit_to_revision(commit) == Revision(
            message=b"some commit message",
            author=author,
            committer=author,
            date=TimestampWithTimezone(
                timestamp=Timestamp(seconds=1641980946, microseconds=0),
                offset_bytes=b"+0100",
            ),
            committer_date=TimestampWithTimezone(
                timestamp=Timestamp(seconds=1641980946, microseconds=0),
                offset_bytes=b"+0100",
            ),
            type=RevisionType.GIT,
            directory=hash_to_bytes(target.decode()),
            synthetic=False,
            metadata=None,
            parents=(),
        )
    def test_corrupt_commit(self, attribute):
        sha = hash_to_bytes("3f0ac5a6d15d89cf928209a57334e3b77c5651b9")
        target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
        message = b"some commit message"
        commit = dulwich.objects.Commit()
        commit.tree = target
        commit.message = message
        commit.gpgsig = GPGSIG
        commit.author = commit.committer = b"Foo <*****@*****.**>"
        commit.author_time = commit.commit_time = 1641980946
        commit.author_timezone = commit.commit_timezone = 3600
        converters.dulwich_commit_to_revision(commit)
        assert commit.sha().digest() == sha

        original_sha = commit.sha()

        setattr(commit, attribute, b"abcde")
        commit.sha()  # reset commit._needs_serialization
        commit._sha = original_sha  # force the wrong hash

        with pytest.raises(converters.HashMismatch):
            converters.dulwich_commit_to_revision(commit)

        if attribute == "_gpgsig":
            setattr(commit, attribute, None)
            commit.sha()  # reset commit._needs_serialization
            commit._sha = original_sha  # force the wrong hash
            with pytest.raises(converters.HashMismatch):
                converters.dulwich_commit_to_revision(commit)
    def test_corrupt_tag(self, attribute):
        sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71")
        target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
        message = b"some release message"
        tag = dulwich.objects.Tag()
        tag.name = b"blah"
        tag.object = (dulwich.objects.Commit, target)
        tag.message = message
        tag.signature = GPGSIG
        tag.tagger = None
        tag.tag_time = None
        tag.tag_timezone = None
        assert tag.sha().digest() == sha
        converters.dulwich_tag_to_release(tag)

        original_sha = tag.sha()

        setattr(tag, attribute, b"abcde")
        tag.sha()  # reset tag._needs_serialization
        tag._sha = original_sha  # force the wrong hash
        with pytest.raises(converters.HashMismatch):
            converters.dulwich_tag_to_release(tag)

        if attribute == "signature":
            setattr(tag, attribute, None)
            tag.sha()  # reset tag._needs_serialization
            tag._sha = original_sha  # force the wrong hash
            with pytest.raises(converters.HashMismatch):
                converters.dulwich_tag_to_release(tag)
Example #28
    def revision_log(self, rev_id, limit=None):
        rev_id_bytes = hash_to_bytes(rev_id)
        return list(
            map(
                converters.from_revision,
                self.storage.revision_log([rev_id_bytes], limit=limit),
            ))
Example #29
    def content_find(self, content: Dict[str, Any]) -> Dict[str, Any]:
        cnt_ids_bytes = {
            algo_hash: hash_to_bytes(content[algo_hash])
            for algo_hash in ALGORITHMS if content.get(algo_hash)
        }
        cnt = self.storage.content_find(cnt_ids_bytes)
        return converters.from_content(cnt[0].to_dict()) if cnt else cnt
Example #30
    def _list_contents_to_index(self, partition_id: int, nb_partitions: int,
                                indexed: Set[Sha1]) -> Iterable[Sha1]:
        """Compute from storage the new contents to index in the partition_id . The already
           indexed contents are skipped.

        Args:
            partition_id: Index of the partition to fetch data from
            nb_partitions: Total number of partitions
            indexed: Set of contents already indexed.

        Yields:
            Sha1 id (bytes) of contents to index

        """
        if not isinstance(partition_id, int) or not isinstance(
                nb_partitions, int):
            raise TypeError(
                f"identifiers must be int, not {partition_id!r} and {nb_partitions!r}."
            )
        next_page_token = None
        while True:
            result = self.storage.content_get_partition(
                partition_id, nb_partitions, page_token=next_page_token)
            contents = result.results
            for c in contents:
                _id = hashutil.hash_to_bytes(c.sha1)
                if _id in indexed:
                    continue
                yield _id
            next_page_token = result.next_page_token
            if next_page_token is None:
                break