예제 #1
0
def test_task_lister_gitlab(
    task_name,
    incremental,
    swh_scheduler_celery_app,
    swh_scheduler_celery_worker,
    mocker,
):
    """Send a GitLab lister task and check how the mocked lister is used.

    ``task_name`` is appended to ``swh.lister.gitlab.tasks.`` to select the
    task, and ``incremental`` is the flag expected to be forwarded to
    ``GitLabLister.from_configfile``. Both are supplied from outside this
    function (presumably by a parametrize decorator -- TODO confirm).
    """
    stats = ListerStats(pages=10, origins=200)
    mock_lister = mocker.patch("swh.lister.gitlab.tasks.GitLabLister")
    # from_configfile hands back the patched class itself, so a single mock
    # records both the configuration call and the run() call.
    mock_lister.from_configfile.return_value = mock_lister
    # Reuse ``stats`` so the mocked return value and the expected task result
    # cannot drift apart.
    mock_lister.run.return_value = stats

    kwargs = dict(url="https://gitweb.torproject.org/")
    res = swh_scheduler_celery_app.send_task(
        f"swh.lister.gitlab.tasks.{task_name}",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    mock_lister.from_configfile.assert_called_once_with(
        incremental=incremental, **kwargs
    )
    mock_lister.run.assert_called_once_with()
    assert res.result == stats.dict()
예제 #2
0
def test_crates_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Run the crates lister task against a mocked ``CratesLister``."""
    expected_stats = ListerStats(pages=42, origins=42)

    # Patch the lister class referenced by the tasks module; from_configfile
    # returns the mock itself so one object records every call.
    crates_mock = mocker.patch("swh.lister.crates.tasks.CratesLister")
    crates_mock.from_configfile.return_value = crates_mock
    crates_mock.run.return_value = expected_stats

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.crates.tasks.CratesListerTask"
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()
    assert task_result.result == expected_stats.dict()

    crates_mock.from_configfile.assert_called_once_with()
    crates_mock.run.assert_called_once_with()
예제 #3
0
def test_lister_gitlab(datadir, swh_scheduler, requests_mock):
    """Check a single-page listing of the ``gitlab.com`` instance."""
    instance = "gitlab.com"
    lister = GitLabLister(swh_scheduler, url=api_url(instance), instance=instance)

    page_payload = gitlab_page_response(datadir, instance, 1)

    # A single response is registered, without any "Link" header.
    requests_mock.get(
        lister.page_url(),
        [{"json": page_payload}],
        additional_matcher=_match_request,
    )

    stats = lister.run()
    origin_count = len(page_payload)
    assert stats == ListerStats(pages=1, origins=origin_count)

    recorded = lister.scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(recorded) == origin_count

    expected_prefix = f"https://{instance}"
    for origin in recorded:
        assert origin.visit_type == "git"
        assert origin.url.startswith(expected_prefix)
        assert origin.last_update is not None
예제 #4
0
def test_lister_gitlab_http_error_500(swh_scheduler, requests_mock, datadir):
    """Check the listing stats when the second page answers HTTP 500.

    Pages 1 and 3 return data while page 2 returns a 500 status; the expected
    stats count two pages and the origins of pages 1 and 3 only.
    """
    instance = "gite.lirmm.fr"
    lister = GitLabLister(swh_scheduler, url=api_url(instance), instance=instance)

    first_url = lister.page_url()
    first_payload = gitlab_page_response(datadir, instance, 1)
    failing_url = lister.page_url(lister.per_page)
    third_url = lister.page_url(2 * lister.per_page)
    third_payload = gitlab_page_response(datadir, instance, 3)

    # Page 1 advertises page 2 through its "Link" header.
    next_link = {"Link": f"<{failing_url}>; rel=next"}
    requests_mock.get(
        first_url,
        [{"json": first_payload, "headers": next_link}],
        additional_matcher=_match_request,
    )
    # Page 2 is the broken one.
    requests_mock.get(
        failing_url,
        [{"status_code": 500}],
        additional_matcher=_match_request,
    )
    requests_mock.get(
        third_url,
        [{"json": third_payload}],
        additional_matcher=_match_request,
    )

    stats = lister.run()

    origin_count = len(first_payload) + len(third_payload)
    assert stats == ListerStats(pages=2, origins=origin_count)
예제 #5
0
def test_ratelimit_once_recovery(
    swh_scheduler,
    caplog,
    requests_ratelimited,
    num_ratelimit,
    monkeypatch_sleep_calls,
    lister_credentials,
):
    """Verify the lister completes its run after being rate-limited once."""
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    github_lister = GitHubLister(
        scheduler=swh_scheduler, credentials=lister_credentials
    )

    # Every page must have been consumed despite the rate limiting.
    stats = github_lister.run()
    assert stats == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT)

    # Gather the first log argument of each "Using authentication token" record.
    token_users = [
        record.args[0]
        for record in caplog.records
        if "Using authentication token" in record.message
    ]

    # One more token is used than the number of rate-limited requests seen.
    assert len(token_users) == 1 + num_ratelimit

    # A single one-second sleep is recorded between the token uses.
    assert monkeypatch_sleep_calls == [1]
예제 #6
0
def test_ratelimit_reset_sleep(
    swh_scheduler,
    caplog,
    requests_ratelimited,
    monkeypatch_sleep_calls,
    num_before_ratelimit,
    ratelimit_reset,
    github_credentials,
    lister_credentials,
):
    """Verify the sleep pattern and log message when the lister, given
    authentication tokens, is rate-limited."""
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    github_lister = GitHubLister(
        scheduler=swh_scheduler, credentials=lister_credentials
    )

    stats = github_lister.run()
    assert stats == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT)

    # One 1-second sleep per credential switch, followed by a final sleep
    # lasting until ratelimit_reset + 1.
    expected_sleep_calls = [1] * len(github_credentials)
    expected_sleep_calls.append(ratelimit_reset + 1)
    assert monkeypatch_sleep_calls == expected_sleep_calls

    # An INFO record must mention that all tokens are exhausted.
    assert any(
        record.levelname == "INFO"
        and "Rate limits exhausted for all tokens" in record.message
        for record in caplog.records
    )
예제 #7
0
def test_incremental(swh_scheduler, caplog, requests_mocker) -> None:
    """Run the GitHub lister from a pre-set ``last_seen_id`` and check results."""
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    # Pretend the first 2000 origins were already seen.
    already_seen = 2000
    remaining = ORIGIN_COUNT - already_seen

    # Store that position as the lister state in the scheduler backend.
    set_lister_state(swh_scheduler, {"last_seen_id": already_seen})

    # No id range is given to the lister here.
    stats = GitHubLister(scheduler=swh_scheduler).run()

    # A trailing partial page counts as one extra page.
    page_count, leftover = divmod(remaining, GitHubLister.PAGE_SIZE)
    if leftover:
        page_count += 1

    assert stats == ListerStats(pages=page_count, origins=remaining)

    # Ask for one more than expected so that extra origins would show up.
    listing = swh_scheduler.get_listed_origins(limit=remaining + 1)
    assert len(listing.results) == remaining
    assert listing.next_page_token is None

    lister_data = get_lister_data(swh_scheduler)
    assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT}

    check_origin_4321(swh_scheduler, lister_data)
    check_origin_5555(swh_scheduler, lister_data)
예제 #8
0
def test_launchpad_full_listing_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """Run the full Launchpad lister task against a mocked ``LaunchpadLister``."""
    expected_stats = ListerStats(pages=1, origins=28000)

    # from_configfile returns the patched class itself so one mock sees all calls.
    launchpad_mock = mocker.patch("swh.lister.launchpad.tasks.LaunchpadLister")
    launchpad_mock.from_configfile.return_value = launchpad_mock
    launchpad_mock.run.return_value = expected_stats

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.launchpad.tasks.FullLaunchpadLister"
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()
    assert task_result.result == expected_stats.dict()

    launchpad_mock.from_configfile.assert_called_once_with()
    launchpad_mock.run.assert_called_once_with()
예제 #9
0
def test_full_lister_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """Run the npm lister task and expect ``incremental=False`` to be passed."""
    expected_stats = ListerStats(pages=10, origins=900)

    # The patched class returns itself from from_configfile.
    npm_mock = mocker.patch("swh.lister.npm.tasks.NpmLister")
    npm_mock.from_configfile.return_value = npm_mock
    npm_mock.run.return_value = expected_stats

    task_name = "swh.lister.npm.tasks.NpmListerTask"
    task_result = swh_scheduler_celery_app.send_task(task_name)
    assert task_result
    task_result.wait()
    assert task_result.successful()

    npm_mock.from_configfile.assert_called_once_with(incremental=False)
    npm_mock.run.assert_called_once_with()
    assert task_result.result == expected_stats.dict()
예제 #10
0
def test_incremental_listing(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """Run the incremental SourceForge lister task against a mocked lister."""
    expected_stats = ListerStats(pages=1, origins=90)

    # ``lister_module`` is defined outside this function and names the patch target.
    sourceforge_mock = mocker.patch(lister_module)
    sourceforge_mock.from_configfile.return_value = sourceforge_mock
    sourceforge_mock.run.return_value = expected_stats

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.sourceforge.tasks.IncrementalSourceForgeLister"
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()

    # Only the call counts are checked here, not the arguments.
    sourceforge_mock.from_configfile.assert_called_once()
    sourceforge_mock.run.assert_called_once()
    assert task_result.result == expected_stats.dict()
예제 #11
0
def test_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Run the Packagist lister task against a mocked ``PackagistLister``."""
    expected_stats = ListerStats(pages=1, origins=286500)

    # from_configfile returns the patched class itself so one mock sees all calls.
    packagist_mock = mocker.patch("swh.lister.packagist.tasks.PackagistLister")
    packagist_mock.from_configfile.return_value = packagist_mock
    packagist_mock.run.return_value = expected_stats

    task_name = "swh.lister.packagist.tasks.PackagistListerTask"
    task_result = swh_scheduler_celery_app.send_task(task_name)
    assert task_result
    task_result.wait()
    assert task_result.successful()
    assert task_result.result == expected_stats.dict()

    packagist_mock.from_configfile.assert_called_once_with()
    packagist_mock.run.assert_called_once_with()
예제 #12
0
def test_lister_gitlab_heptapod(datadir, swh_scheduler, requests_mock):
    """Check that ``hg`` and ``hg_git`` entries are all listed as ``hg`` origins."""
    name = "heptapod"
    instance = "foss.heptapod.net"
    lister = GitLabLister(
        swh_scheduler,
        url=api_url(instance),
        name=name,
        instance=instance,
    )
    # The ``name`` argument is expected to show up as the lister name.
    assert lister.LISTER_NAME == name

    page_payload = gitlab_page_response(datadir, instance, 1)

    requests_mock.get(
        lister.page_url(),
        [{"json": page_payload}],
        additional_matcher=_match_request,
    )

    stats = lister.run()
    origin_count = len(page_payload)

    # Sanity check on the fixture data: only mercurial-flavoured entries.
    allowed_vcs = ("hg", "hg_git")
    for project in page_payload:
        assert project["vcs_type"] in allowed_vcs

    assert stats == ListerStats(pages=1, origins=origin_count)

    recorded = lister.scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(recorded) == origin_count

    expected_prefix = f"https://{instance}"
    for origin in recorded:
        assert origin.visit_type == "hg"
        assert origin.url.startswith(expected_prefix)
        assert origin.last_update is not None
예제 #13
0
def test_lister_cgit_with_base_git_url(
    url, base_git_url, expected_nb_origins, requests_mock_datadir, swh_scheduler
):
    """Check that listed urls start with ``base_git_url`` rather than ``url``."""
    cgit = CGitLister(swh_scheduler, url=url, base_git_url=base_git_url)

    run_stats = cgit.run()
    assert run_stats == ListerStats(pages=1, origins=expected_nb_origins)

    # Every parsed origin must have been recorded in the scheduler.
    recorded = swh_scheduler.get_listed_origins(cgit.lister_obj.id).results
    assert len(recorded) == expected_nb_origins

    # Each recorded origin is a git origin rooted at base_git_url.
    for origin in recorded:
        assert origin.visit_type == "git"
        assert origin.url.startswith(base_git_url)
        assert not origin.url.startswith(
            url
        ), f"url should be mapped to {base_git_url}"
예제 #14
0
def test_lister_gitlab_with_pages(swh_scheduler, requests_mock, datadir):
    """Check a two-page listing where page 1 links to page 2."""
    instance = "gite.lirmm.fr"
    lister = GitLabLister(swh_scheduler, url=api_url(instance))

    first_payload = gitlab_page_response(datadir, instance, 1)
    second_payload = gitlab_page_response(datadir, instance, 2)

    # Page 1 advertises page 2 through its "Link" header.
    first_response = {
        "json": first_payload,
        "headers": {"Link": f"<{lister.page_url(2)}>; rel=next"},
    }
    requests_mock.get(
        lister.page_url(),
        [first_response],
        additional_matcher=_match_request,
    )

    # Page 2 carries no "Link" header.
    requests_mock.get(
        lister.page_url(2),
        [{"json": second_payload}],
        additional_matcher=_match_request,
    )

    stats = lister.run()

    origin_count = len(first_payload) + len(second_payload)
    assert stats == ListerStats(pages=2, origins=origin_count)

    recorded = lister.scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(recorded) == origin_count

    expected_prefix = f"https://{instance}"
    for origin in recorded:
        assert origin.visit_type == "git"
        assert origin.url.startswith(expected_prefix)
        assert origin.last_update is not None
예제 #15
0
def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler):
    """Check a three-page cgit listing and the User-Agent of each request."""
    cgit = CGitLister(swh_scheduler, url="https://git.tizen/cgit/")

    run_stats = cgit.run()

    origin_count = 16
    assert run_stats == ListerStats(pages=3, origins=origin_count)

    # Every parsed origin must have been recorded in the scheduler.
    recorded = swh_scheduler.get_listed_origins(cgit.lister_obj.id).results
    assert len(recorded) == origin_count

    # All recorded origins are git repositories on the expected host.
    for origin in recorded:
        assert origin.visit_type == "git"
        assert origin.url.startswith("https://git.tizen")

    # At least one request was sent, and each one identifies the lister
    # and its version in the User-Agent header.
    history = requests_mock_datadir.request_history
    assert len(history) != 0
    for sent in history:
        assert "User-Agent" in sent.headers
        agent = sent.headers["User-Agent"]
        assert "Software Heritage Lister" in agent
        assert __version__ in agent
예제 #16
0
def test_lister_cgit_run_populates_last_update(requests_mock_datadir, swh_scheduler):
    """Check ``last_update`` is set except for a known list of origin urls."""
    # Origins expected to come back without a last_update value.
    undated_urls = [
        f"https://git.tizen.org/cgit/{repo_name}"
        for repo_name in ("All-Projects", "All-Users", "Lock-Projects")
    ]

    cgit = CGitLister(swh_scheduler, url="https://git.tizen/cgit")

    run_stats = cgit.run()

    origin_count = 16
    assert run_stats == ListerStats(pages=3, origins=origin_count)

    # Every parsed origin must have been recorded in the scheduler.
    recorded = swh_scheduler.get_listed_origins(cgit.lister_obj.id).results
    assert len(recorded) == origin_count

    # last_update is absent exactly for the urls listed above.
    for origin in recorded:
        if origin.url not in undated_urls:
            assert origin.last_update is not None
        else:
            assert origin.last_update is None
예제 #17
0
def test_lister_gitlab_rate_limit(swh_scheduler, requests_mock, datadir, mocker):
    """Check the lister gets through two rate-limited answers on page 2."""
    instance = "gite.lirmm.fr"
    lister = GitLabLister(swh_scheduler, url=api_url(instance), instance=instance)

    first_url = lister.page_url()
    first_payload = gitlab_page_response(datadir, instance, 1)
    second_url = lister.page_url(2)
    second_payload = gitlab_page_response(datadir, instance, 2)

    # Page 1 advertises page 2 through its "Link" header.
    requests_mock.get(
        first_url,
        [{"json": first_payload, "headers": {"Link": f"<{second_url}>; rel=next"}}],
        additional_matcher=_match_request,
    )

    # Page 2 answers "forbidden" with no remaining quota twice, then succeeds.
    second_page_answers = [
        {
            "status_code": codes.forbidden,
            "headers": {"RateLimit-Remaining": "0"},
        }
        for _ in range(2)
    ]
    second_page_answers.append({"json": second_payload})
    requests_mock.get(
        second_url,
        second_page_answers,
        additional_matcher=_match_request,
    )

    # Patch the retry's sleep so the test does not actually wait.
    mock_sleep = mocker.patch.object(lister.get_page_result.retry, "sleep")

    stats = lister.run()

    origin_count = len(first_payload) + len(second_payload)
    assert stats == ListerStats(pages=2, origins=origin_count)

    assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE])
예제 #18
0
def test_lister_cgit_get_origin_from_repo_failing(
    requests_mock_datadir_missing_url, swh_scheduler
):
    """Expect 15 origins over 3 pages with the ``missing_url`` request fixture."""
    cgit = CGitLister(swh_scheduler, url="https://git.tizen/cgit/")

    run_stats = cgit.run()

    origin_count = 15
    assert run_stats == ListerStats(pages=3, origins=origin_count)
예제 #19
0
def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Run the PyPI lister task using the mocked ``lister`` fixture."""
    # The mock returns itself from from_configfile and reports empty stats.
    lister.run.return_value = ListerStats(pages=1, origins=0)
    lister.from_configfile.return_value = lister

    task_name = "swh.lister.pypi.tasks.PyPIListerTask"
    task_result = swh_scheduler_celery_app.send_task(task_name)
    assert task_result
    task_result.wait()
    assert task_result.successful()

    lister.from_configfile.assert_called_once_with()
    lister.run.assert_called_once_with()
예제 #20
0
def test_phabricator_lister_task(
    lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
):
    """Run the full Phabricator lister task and check the forwarded kwargs."""
    expected_stats = ListerStats(pages=2, origins=200)

    # The mocked lister returns itself from from_configfile.
    lister.from_configfile.return_value = lister
    lister.run.return_value = expected_stats

    task_params = dict(
        url="https://forge.softwareheritage.org",
        instance="swh",
        api_token=None,
    )

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.phabricator.tasks.FullPhabricatorLister",
        kwargs=task_params,
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()
    assert task_result.result == expected_stats.dict()

    # The task parameters must reach from_configfile unchanged.
    lister.from_configfile.assert_called_once_with(**task_params)
예제 #21
0
def test_incremental(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Run the incremental GitHub lister task using the mocked ``lister``."""
    # Prepare the mock: it returns itself from from_configfile, carries a
    # fresh state object and reports fixed stats from run().
    lister.from_configfile.return_value = lister
    lister.state = GitHubListerState()
    lister.run.return_value = ListerStats(pages=5, origins=5000)

    task_name = "swh.lister.github.tasks.IncrementalGitHubLister"
    task_result = swh_scheduler_celery_app.send_task(task_name)
    assert task_result
    task_result.wait()
    assert task_result.successful()

    lister.from_configfile.assert_called_once_with()
예제 #22
0
def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None:
    """Check an anonymous lister lists nothing and warns when rate-limited."""
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    # Built without credentials, the lister must report anonymous mode.
    github_lister = GitHubLister(scheduler=swh_scheduler)
    assert github_lister.anonymous
    assert "using anonymous mode" in caplog.records[-1].message
    # Drop the construction-time records before running.
    caplog.clear()

    stats = github_lister.run()
    assert stats == ListerStats(pages=0, origins=0)

    final_record = caplog.records[-1]
    assert final_record.levelname == "WARNING"
    assert "No X-Ratelimit-Reset value found in responses" in final_record.message
예제 #23
0
def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Run the Debian lister task and check the forwarded kwargs and result."""
    expected_stats = ListerStats(pages=12, origins=35618)

    # The mocked lister returns itself from from_configfile.
    lister.from_configfile.return_value = lister
    lister.run.return_value = expected_stats

    task_kwargs = {
        "mirror_url": "http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/",
        "distribution": "Ubuntu",
        "suites": ["xenial", "bionic", "focal"],
        "components": ["main", "multiverse", "restricted", "universe"],
    }

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.debian.tasks.DebianListerTask",
        kwargs=task_kwargs,
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()

    lister.from_configfile.assert_called_once_with(**task_kwargs)
    lister.run.assert_called_once_with()

    assert task_result.result == expected_stats.dict()
예제 #24
0
def test_range(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Run the range GitHub lister task and check the id bounds are forwarded."""
    # The mock returns itself both when called and from from_configfile.
    lister.return_value = lister
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=5, origins=5000)

    id_range = {"first_id": 12, "last_id": 42}
    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.github.tasks.RangeGitHubLister",
        kwargs=id_range,
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()

    lister.from_configfile.assert_called_once_with(first_id=12, last_id=42)
    lister.run.assert_called_once_with()
예제 #25
0
def test_relister(swh_scheduler, caplog, requests_mocker) -> None:
    """Check that listing an explicit id range leaves the stored state alone."""
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    # Canary state: listing an id range is not supposed to modify it.
    canary_state = {"last_seen_id": 123}
    set_lister_state(swh_scheduler, canary_state)

    # List the origins whose ids lie between 10 and 1011.
    stats = GitHubLister(scheduler=swh_scheduler, first_id=10, last_id=1011).run()

    # Two pages holding 2000 origins in total are expected.
    assert stats == ListerStats(pages=2, origins=2000)

    # The stored state must still be the canary value.
    stored = get_lister_data(swh_scheduler)
    assert stored.current_state == {"last_seen_id": 123}
예제 #26
0
def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Run the full BitBucket relister task using the mocked ``lister``."""
    # The mocked lister returns itself from from_configfile.
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=5, origins=5000)

    task_kwargs = {
        "page_size": 100,
        "username": "******",
        "password": "******",
    }
    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.bitbucket.tasks.FullBitBucketRelister",
        kwargs=task_kwargs,
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()

    # Only the number of run() calls is checked.
    lister.run.assert_called_once()
예제 #27
0
def test_full_listing(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Run the full Tuleap lister task against a mocked ``TuleapLister``."""
    # from_configfile returns the patched class itself so one mock sees all calls.
    tuleap_mock = mocker.patch("swh.lister.tuleap.tasks.TuleapLister")
    tuleap_mock.from_configfile.return_value = tuleap_mock
    tuleap_mock.run.return_value = ListerStats(pages=10, origins=500)

    task_kwargs = {"url": "https://tuleap.net"}
    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.tuleap.tasks.FullTuleapLister",
        kwargs=task_kwargs,
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()

    tuleap_mock.from_configfile.assert_called_once_with(**task_kwargs)
    tuleap_mock.run.assert_called_once_with()
예제 #28
0
def test_from_empty_state(
    swh_scheduler, caplog, requests_mocker: requests_mock.Mocker
) -> None:
    """Run the GitHub lister with no state pre-set by this test."""
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    # No id range is given to the lister here.
    stats = GitHubLister(scheduler=swh_scheduler).run()

    assert stats == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT)

    # Ask for one more than expected so that extra origins would show up.
    listing = swh_scheduler.get_listed_origins(limit=ORIGIN_COUNT + 1)
    assert len(listing.results) == ORIGIN_COUNT
    assert listing.next_page_token is None

    stored = get_lister_data(swh_scheduler)
    assert stored.current_state == {"last_seen_id": ORIGIN_COUNT}

    check_origin_4321(swh_scheduler, stored)
    check_origin_5555(swh_scheduler, stored)
예제 #29
0
def test_cgit_lister_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """Run the cgit lister task and check the kwargs reach ``from_configfile``."""
    # Patch the lister class referenced by the tasks module; from_configfile
    # returns the mock itself so one object records every call.
    cgit_mock = mocker.patch("swh.lister.cgit.tasks.CGitLister")
    cgit_mock.from_configfile.return_value = cgit_mock
    cgit_mock.run.return_value = ListerStats(pages=10, origins=500)

    task_kwargs = {
        "url": "https://git.kernel.org/",
        "instance": "kernel",
        "base_git_url": None,
    }

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.cgit.tasks.CGitListerTask",
        kwargs=task_kwargs,
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()

    cgit_mock.from_configfile.assert_called_once_with(**task_kwargs)
    cgit_mock.run.assert_called_once_with()
예제 #30
0
def test_full_listing_params(
    lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
):
    """Run the full Gitea relister task and check all kwargs are forwarded."""
    # The mocked lister returns itself from from_configfile.
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    task_kwargs = {
        "url": "https://0xacab.org/api/v4",
        "instance": "0xacab",
        "api_token": "test",
        "page_size": 50,
    }
    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.gitea.tasks.FullGiteaRelister",
        kwargs=task_kwargs,
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()

    lister.from_configfile.assert_called_once_with(**task_kwargs)
    lister.run.assert_called_once_with()