Пример #1
0
def test_ratelimit_once_recovery(
    swh_scheduler,
    caplog,
    requests_ratelimited,
    num_ratelimit,
    monkeypatch_sleep_calls,
    lister_credentials,
):
    """Ensure a full listing still completes after the rate-limit is hit.

    The test checks three things: the run reports every page and origin,
    the number of "Using authentication token" log messages is one more
    than ``num_ratelimit``, and the recorded sleep calls are exactly ``[1]``.
    """
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    stats = GitHubLister(
        scheduler=swh_scheduler, credentials=lister_credentials
    ).run()
    # All pages must have been consumed despite the rate limiting.
    assert stats == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT)

    # First log argument of every token-selection message, in log order.
    used_tokens = [
        entry.args[0]
        for entry in caplog.records
        if "Using authentication token" in entry.message
    ]

    # One token is used up front, plus one per rate-limited request.
    assert len(used_tokens) == 1 + num_ratelimit

    # A single one-second sleep is expected between the token uses.
    assert monkeypatch_sleep_calls == [1]
Пример #2
0
def test_incremental(swh_scheduler, caplog, requests_mocker) -> None:
    """Check a run that starts from a pre-existing ``last_seen_id`` state.

    The stored state is bumped to 2000 before the lister is created; the run
    is then expected to report only the remaining origins, and to leave the
    state at ``ORIGIN_COUNT``.
    """
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    # Origins considered already seen, and what is left to list after them.
    already_seen = 2000
    remaining = ORIGIN_COUNT - already_seen

    # Store the bumped last_seen_id in the scheduler backend.
    set_lister_state(swh_scheduler, {"last_seen_id": already_seen})

    # No id bounds are given, so the lister runs in incremental mode.
    stats = GitHubLister(scheduler=swh_scheduler).run()

    # A trailing partial page counts as one extra page.
    whole_pages, leftover = divmod(remaining, GitHubLister.PAGE_SIZE)
    page_count = whole_pages + 1 if leftover else whole_pages

    assert stats == ListerStats(pages=page_count, origins=remaining)

    # Ask for one more than expected so any surplus origin would show up.
    origins_page = swh_scheduler.get_listed_origins(limit=remaining + 1)
    assert len(origins_page.results) == remaining
    assert origins_page.next_page_token is None

    lister_data = get_lister_data(swh_scheduler)
    assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT}

    check_origin_4321(swh_scheduler, lister_data)
    check_origin_5555(swh_scheduler, lister_data)
Пример #3
0
def test_ratelimit_reset_sleep(
    swh_scheduler,
    caplog,
    requests_ratelimited,
    monkeypatch_sleep_calls,
    num_before_ratelimit,
    ratelimit_reset,
    github_credentials,
    lister_credentials,
):
    """Verify rate-limit handling when authentication tokens are supplied.

    The run must still report every page and origin, the recorded sleep
    calls must match the expected sequence, and an INFO message announcing
    that all tokens are exhausted must have been logged.
    """
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    stats = GitHubLister(
        scheduler=swh_scheduler, credentials=lister_credentials
    ).run()
    assert stats == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT)

    # One 1-second sleep per credential change, followed by a final sleep
    # lasting until ratelimit_reset + 1.
    per_credential_sleeps = [1] * len(github_credentials)
    assert monkeypatch_sleep_calls == per_credential_sleeps + [ratelimit_reset + 1]

    # At least one INFO record must report that every token is exhausted.
    assert any(
        entry.levelname == "INFO"
        and "Rate limits exhausted for all tokens" in entry.message
        for entry in caplog.records
    )
Пример #4
0
def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None:
    """Check a lister created without credentials against rate-limited requests.

    Creation must flag the lister as anonymous and log it; the run is then
    expected to report no pages and no origins, ending on a WARNING about
    the missing X-Ratelimit-Reset value.
    """
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    anonymous_lister = GitHubLister(scheduler=swh_scheduler)
    assert anonymous_lister.anonymous
    creation_log = caplog.records[-1]
    assert "using anonymous mode" in creation_log.message
    # Drop the creation logs so only records from the run remain.
    caplog.clear()

    stats = anonymous_lister.run()
    assert stats == ListerStats(pages=0, origins=0)

    final_log = caplog.records[-1]
    assert final_log.levelname == "WARNING"
    assert "No X-Ratelimit-Reset value found in responses" in final_log.message
Пример #5
0
def list_github_full(self, split: Optional[int] = None) -> str:
    """Full update of GitHub

    The id space from 0 up to the lister's stored ``last_seen_id`` is cut
    into consecutive ``(first_id, last_id)`` ranges. The ranges are shuffled
    and dispatched as a ``group`` of ``_range_github_lister`` subtasks.

    It's not to be called for an initial listing.

    Args:
        split: step between two consecutive range bounds. ``None`` (or any
            other falsy value such as ``0``) falls back to ``GROUP_SPLIT``.

    Returns:
        The ``id`` of the result object obtained by calling the group.

    """
    lister = GitHubLister.from_configfile()
    last_index = lister.state.last_seen_id

    step = split or GROUP_SPLIT
    bounds = list(range(0, last_index + 1, step))
    # Make sure the final range reaches last_index when it does not fall
    # exactly on a step boundary.
    if bounds[-1] != last_index:
        bounds.append(last_index)

    # Pair each bound with its successor to get (first_id, last_id) ranges.
    ranges = list(zip(bounds[:-1], bounds[1:]))
    random.shuffle(ranges)
    promise = group(
        _range_github_lister.s(first_id=minv, last_id=maxv)
        for minv, maxv in ranges
    )()
    # Pass the arguments to the logger instead of pre-formatting, so the
    # message is only built when DEBUG logging is actually enabled.
    self.log.debug("%s OK (spawned %s subtasks)", self.name, len(ranges))
    try:
        promise.save()  # so that we can restore the GroupResult in tests
    except (NotImplementedError, AttributeError):
        self.log.info("Unable to call save_group with current result backend.")
    # FIXME: what to do in terms of return here?
    return promise.id
Пример #6
0
def test_relister(swh_scheduler, caplog, requests_mocker) -> None:
    """Check that listing an explicit id range leaves the stored state alone.

    A canary ``last_seen_id`` is stored first; after listing with
    ``first_id``/``last_id`` bounds, the run must report two pages of 2000
    origins in total and the canary must still be in place.
    """
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    # Canary value: the mode under test is not supposed to modify it.
    canary_state = {"last_seen_id": 123}
    set_lister_state(swh_scheduler, canary_state)

    # "Relisting" mode: list origins between id 10 and 1011.
    stats = GitHubLister(
        scheduler=swh_scheduler, first_id=10, last_id=1011
    ).run()

    # Two full pages of results are expected.
    assert stats == ListerStats(pages=2, origins=2000)

    # The stored state must be exactly what was set before the run.
    stored = get_lister_data(swh_scheduler)
    assert stored.current_state == {"last_seen_id": 123}
Пример #7
0
def test_from_empty_state(swh_scheduler, caplog,
                          requests_mocker: requests_mock.Mocker) -> None:
    """Check a run started without any previously stored lister state.

    The run must report every page and origin, the scheduler must hold
    exactly ``ORIGIN_COUNT`` listed origins, and the stored state must end
    up with ``last_seen_id`` equal to ``ORIGIN_COUNT``.
    """
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    # No id bounds are given, so the lister runs in incremental mode.
    stats = GitHubLister(scheduler=swh_scheduler).run()
    assert stats == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT)

    # Ask for one more than expected so any surplus origin would show up.
    origins_page = swh_scheduler.get_listed_origins(limit=ORIGIN_COUNT + 1)
    assert len(origins_page.results) == ORIGIN_COUNT
    assert origins_page.next_page_token is None

    lister_data = get_lister_data(swh_scheduler)
    assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT}

    check_origin_4321(swh_scheduler, lister_data)
    check_origin_5555(swh_scheduler, lister_data)
Пример #8
0
def test_authenticated_credentials(
    swh_scheduler, caplog, github_credentials, lister_credentials, all_tokens
):
    """Check the credentials state of a lister created with credentials.

    The lister must start at token index 0, expose credentials that match
    ``github_credentials`` once sorted by username, and carry an
    Authorization header built from one of ``all_tokens``.
    """
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)
    assert lister.token_index == 0

    # Compare after sorting by username; the lister's own order is not checked.
    by_username = sorted(lister.credentials, key=lambda cred: cred["username"])
    assert by_username == github_credentials

    # Any of the known tokens is acceptable as the session's header value.
    acceptable_headers = ["token %s" % token for token in all_tokens]
    assert lister.session.headers["Authorization"] in acceptable_headers
Пример #9
0
def new_lister(api_baseurl="https://api.github.com", **kw):
    """Build a ``GitHubLister`` for ``api_baseurl``.

    Any extra keyword arguments are forwarded unchanged to the constructor.
    """
    lister = GitHubLister(api_baseurl=api_baseurl, **kw)
    return lister
Пример #10
0
def _range_github_lister(first_id: int, last_id: int) -> Dict[str, int]:
    """Run a config-file-built lister over ``first_id``..``last_id``.

    Returns the ``dict()`` form of what the run reports.
    """
    stats = GitHubLister.from_configfile(
        first_id=first_id, last_id=last_id
    ).run()
    return stats.dict()
Пример #11
0
def list_github_incremental() -> Dict[str, int]:
    """Incremental update of GitHub.

    Builds the lister from its configuration file with no id bounds, runs
    it, and returns the ``dict()`` form of what the run reports.
    """
    stats = GitHubLister.from_configfile().run()
    return stats.dict()