Example #1
def test_stream_results_kwarg(stream_results):
    def paged_results(page_token):
        assert False, "should not be called"

    with pytest.raises(TypeError):
        actual_data = stream_results(paged_results, page_token=42)
        list(actual_data)
Example #2
def test_stream_results_pagination():
    input_data = [
        {
            "url": "something"
        },
        {
            "url": "something2"
        },
    ]
    input_data2 = [
        {
            "url": "something3"
        },
    ]
    input_data3 = [
        {
            "url": "something4"
        },
    ]

    def page_results2(page_token=None) -> TestPagedResult:
        result_per_token = {
            None: TestPagedResult(results=input_data, next_page_token=b"two"),
            b"two": TestPagedResult(results=input_data2,
                                    next_page_token=b"three"),
            b"three": TestPagedResult(results=input_data3,
                                      next_page_token=None),
        }
        return result_per_token[page_token]

    # multiple calls happen under the hood to follow the pagination
    actual_data = stream_results(page_results2)
    assert list(actual_data) == input_data + input_data2 + input_data3
Example #3
def test_stream_results_no_result():
    def paged_results(page_token) -> TestPagedResult:
        return TestPagedResult(results=[], next_page_token=None)

    # only 1 call, no pagination
    actual_data = stream_results(paged_results)
    assert list(actual_data) == []
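None of the snippets above show `stream_results` or `TestPagedResult` themselves. As a rough sketch reconstructed from how the tests use them (the real helper lives in swh.core and may differ in details), `TestPagedResult` can be read as a small container holding one page of results plus the token for the next page, and `stream_results` as a generator that keeps calling the paged function with the previous page's `next_page_token` until that token is None:

# Minimal sketch reconstructed from the tests above; not the actual
# swh.core implementation.
from dataclasses import dataclass, field
from typing import Any, Callable, Iterable, List, Optional


@dataclass
class TestPagedResult:
    """One page of results plus the token needed to request the next page."""

    results: List[Any] = field(default_factory=list)
    next_page_token: Optional[bytes] = None


def stream_results(f: Callable[..., TestPagedResult],
                   *args, **kwargs) -> Iterable[Any]:
    # page_token is driven by this generator itself, hence the TypeError
    # expected in Example #1 when a caller tries to pass it explicitly.
    if "page_token" in kwargs:
        raise TypeError("stream_results manages page_token internally")
    page_token = None
    while True:
        page = f(*args, page_token=page_token, **kwargs)
        yield from page.results
        page_token = page.next_page_token
        if page_token is None:
            break

Under that reading, Example #1 fails because `page_token` is managed internally, Example #2 walks three pages, and Example #3 stops after a single empty page.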
Example #4
    def projects_last_modified(self) -> ProjectsLastModifiedCache:
        if not self.incremental:
            # No point in loading the previous results if we're doing a full run
            return {}
        if self._project_last_modified is not None:
            return self._project_last_modified
        # We know there will be at least that many origins
        stream = stream_results(self.scheduler.get_listed_origins,
                                self.lister_obj.id,
                                limit=300_000)
        listed_origins = dict()
        # Projects can have slashes in them if they're subprojects, but the
        # mountpoint (last component) cannot.
        url_match = re.compile(
            r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*")
        for origin in stream:
            url = origin.url
            match = url_match.match(url)
            assert match is not None
            matches = match.groupdict()
            namespace = matches["namespace"]
            project = matches["project"]
            # "Last modified" dates are the same across all VCS (tools, even)
            # within a project or subproject. An assertion here would be overkill.
            last_modified = origin.last_update
            assert last_modified is not None
            listed_origins[(namespace, project)] = last_modified.date()

        self._project_last_modified = listed_origins
        return listed_origins
Example #5
def test_fill_test_data(swh_scheduler):
    for task_type in TASK_TYPES.values():
        swh_scheduler.create_task_type(task_type)

    simulator.fill_test_data(swh_scheduler, num_origins=NUM_ORIGINS)

    origins = list(stream_results(swh_scheduler.get_listed_origins))
    assert len(origins) == NUM_ORIGINS

    res = swh_scheduler.search_tasks()
    assert len(res) == NUM_ORIGINS
Example #6
    def test_content_get_partition_murmur3_collision(self, swh_storage, mocker,
                                                     sample_data):
        """The Murmur3 token is used as link from index tables to the main table; and
        non-matching contents with colliding murmur3-hash are filtered-out when reading
        the main table.

        This test checks that the content_get_partition endpoint returns all
        contents, even the collisions.

        """
        called = 0

        rows: Dict[int, Dict] = {}
        for tok, content in enumerate(sample_data.contents):
            cont = attr.evolve(content, data=None, ctime=now())
            row_d = {**cont.to_dict(), "tok": tok}
            rows[tok] = row_d

        # For any queried token range, return all known rows

        def mock_content_get_token_range(range_start, range_end, limit):
            nonlocal called
            called += 1

            for tok in list(rows.keys()) * 3:  # yield the same tok multiple times
                row_d = dict(rows[tok].items())
                row_d.pop("tok")
                yield (tok, ContentRow(**row_d))

        mocker.patch.object(
            swh_storage._cql_runner,
            "content_get_token_range",
            mock_content_get_token_range,
        )

        actual_results = list(
            stream_results(swh_storage.content_get_partition,
                           partition_id=0,
                           nb_partitions=1))

        assert called > 0

        # everything is listed, even collisions
        assert len(actual_results) == 3 * len(sample_data.contents)
        # as we duplicated the returned results, dropping duplicates should yield
        # the original length
        assert len(set(actual_results)) == len(sample_data.contents)
Example #7
def test_stream_results_no_pagination():
    input_data = [
        {
            "url": "something"
        },
        {
            "url": "something2"
        },
    ]

    def paged_results(page_token) -> TestPagedResult:
        return TestPagedResult(results=input_data, next_page_token=None)

    # only 1 call, no pagination
    actual_data = stream_results(paged_results)
    assert list(actual_data) == input_data
Example #8
    def projects_last_modified(self) -> ProjectsLastModifiedCache:
        if not self.incremental:
            # No point in loading the previous results if we're doing a full run
            return {}
        if self._project_last_modified is not None:
            return self._project_last_modified
        # We know there will be at least that many origins
        stream = stream_results(self.scheduler.get_listed_origins,
                                self.lister_obj.id,
                                limit=300_000)
        listed_origins = dict()
        # Projects can have slashes in them if they're subprojects, but the
        # mountpoint (last component) cannot.
        url_match = re.compile(
            r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*")
        bzr_url_match = re.compile(
            r"http://(?P<project>[^/]+).bzr.sourceforge.net/bzr/([^/]+)")
        cvs_url_match = re.compile(
            r"rsync://a.cvs.sourceforge.net/cvsroot/(?P<project>.+)/([^/]+)")

        for origin in stream:
            url = origin.url
            match = url_match.match(url)
            if match is None:
                # Could be a bzr or cvs special endpoint
                bzr_match = bzr_url_match.match(url)
                cvs_match = cvs_url_match.match(url)
                matches = None
                if bzr_match is not None:
                    matches = bzr_match.groupdict()
                elif cvs_match is not None:
                    matches = cvs_match.groupdict()
                assert matches
                project = matches["project"]
                namespace = "p"  # no special namespacing for bzr and cvs projects
            else:
                matches = match.groupdict()
                namespace = matches["namespace"]
                project = matches["project"]
            # "Last modified" dates are the same across all VCS (tools, even)
            # within a project or subproject. An assertion here would be overkill.
            last_modified = origin.last_update
            assert last_modified is not None
            listed_origins[(namespace, project)] = last_modified.date()

        self._project_last_modified = listed_origins
        return listed_origins
Example #9
    def assert_results_ok(self, partition_id, nb_partitions, actual_results):
        expected_ids = [
            c.sha1
            for c in stream_results(
                self.indexer.storage.content_get_partition,
                partition_id=partition_id,
                nb_partitions=nb_partitions,
            )
        ]

        actual_results = list(actual_results)
        for indexed_data in actual_results:
            _id = indexed_data.id
            assert _id in expected_ids

            _tool_id = indexed_data.indexer_configuration_id
            assert _tool_id == self.indexer.tool["id"]
Example #10
    def indexed_contents_in_partition(
        self,
        partition_id: int,
        nb_partitions: int,
    ) -> Iterable[Sha1]:
        """Retrieve indexed content ids within partition_id.

        Args:
            partition_id: Index of the partition to fetch
            nb_partitions: Total number of partitions to split into
        """
        return stream_results(
            self.idx_storage.content_mimetype_get_partition,
            self.tool["id"],
            partition_id,
            nb_partitions,
        )
Example #11
    def indexed_contents_in_partition(
            self,
            partition_id: int,
            nb_partitions: int,
            page_token: Optional[str] = None) -> Iterable[Sha1]:
        """Retrieve indexed content id within the partition id

        Args:
            partition_id: Index of the partition to fetch
            nb_partitions: Total number of partitions to split into
            page_token: opaque token used for pagination
        """
        return stream_results(
            self.idx_storage.content_fossology_license_get_partition,
            self.tool["id"],
            partition_id,
            nb_partitions,
        )
Example #12
def test_task_schedule_origins_with_limit(swh_scheduler, storage):
    """Tests support of extra keyword-arguments."""
    _fill_storage_with_origins(storage, 50)
    limit = 20
    expected_origins = list(islice(stream_results(storage.origin_list), limit))
    nb_origins = len(expected_origins)

    assert nb_origins == limit
    max_task_size = 5
    nb_tasks, remainder = divmod(nb_origins, max_task_size)
    assert remainder == 0  # made the numbers go round

    result = invoke(
        swh_scheduler,
        False,
        [
            "task",
            "schedule_origins",
            "swh-test-ping",
            "--batch-size",
            max_task_size,
            "--limit",
            limit,
        ],
    )

    # Check the output
    expected = rf"""
Scheduled {nb_tasks} tasks \({nb_origins} origins\).
Done.
""".lstrip()
    assert result.exit_code == 0, result.output
    assert re.fullmatch(expected, result.output,
                        re.MULTILINE), repr(result.output)

    tasks = swh_scheduler.search_tasks()
    _assert_origin_tasks_contraints(tasks, max_task_size, nb_origins,
                                    expected_origins)