예제 #1
0
def test_github_session_ratelimit_reset_sleep(
    caplog,
    requests_ratelimited,
    monkeypatch_sleep_calls,
    num_before_ratelimit,
    num_ratelimit,
    ratelimit_reset,
    github_credentials,
):
    """GitHubSession should sleep until the rate-limit reset once all
    authentication tokens are exhausted."""
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    session = GitHubSession(
        credentials=github_credentials, user_agent="GitHub Session Test"
    )

    request_url = f"{HTTP_GITHUB_API_URL}?per_page=1000&since=10"
    for _ in range(num_ratelimit):
        session.request(request_url)

    # One 1-second sleep per credentials change, followed by a single sleep
    # lasting until ratelimit_reset + 1
    expected_sleep_calls = [1] * len(github_credentials)
    expected_sleep_calls.append(ratelimit_reset + 1)
    assert monkeypatch_sleep_calls == expected_sleep_calls

    # The exhaustion of every token must have been reported at INFO level
    exhaustion_logged = any(
        record.levelname == "INFO"
        and "Rate limits exhausted for all tokens" in record.message
        for record in caplog.records
    )
    assert exhaustion_logged is True
예제 #2
0
def test_github_session_ratelimit_once_recovery(
    caplog,
    requests_ratelimited,
    num_ratelimit,
    monkeypatch_sleep_calls,
    github_credentials,
):
    """A GitHubSession that is rate-limited once must still complete the request."""
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    session = GitHubSession(
        credentials=github_credentials, user_agent="GitHub Session Test"
    )

    response = session.request(f"{HTTP_GITHUB_API_URL}?per_page=1000&since=10")
    assert response.status_code == 200

    # First log argument of every "Using authentication token" record
    token_users = [
        record.args[0]
        for record in caplog.records
        if "Using authentication token" in record.message
    ]

    # One more token was used than the number of rate-limited requests seen
    assert len(token_users) == num_ratelimit + 1

    # A single one-second sleep happened between the token uses
    assert monkeypatch_sleep_calls == [1]
예제 #3
0
    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        index_url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
        incremental: bool = True,
    ):
        """Lister class for Maven repositories.

        Args:
            scheduler: scheduler interface, forwarded to the parent lister.
            url: main URL of the Maven repository, i.e. url of the base index
                used to fetch maven artifacts. For Maven central use
                https://repo1.maven.org/maven2/
            index_url: the URL to download the exported text indexes from.
                Would typically be a local host running the export docker image.
                See README.md in this directory for more information.
            instance: Name of maven instance. Defaults to url's network location
                if unset.
            credentials: forwarded to the parent lister; the resulting
                ``self.credentials`` is then handed to the GitHub session.
            incremental: bool, defaults to True. Defines if incremental listing
                is activated or not.

        """
        self.BASE_URL = url
        self.INDEX_URL = index_url
        self.incremental = incremental

        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )

        # HTTP session used to query the Maven repository itself.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "User-Agent": USER_AGENT,
            }
        )

        # Jar origins already built, indexed by origin URL.
        self.jar_origins: Dict[str, ListedOrigin] = {}
        # Created after super().__init__() because it reads self.credentials.
        self.github_session = GitHubSession(
            credentials=self.credentials, user_agent=USER_AGENT
        )
예제 #4
0
def test_get_canonical_github_origin_url(
    user_repo, expected_url, requests_mock, github_credentials
):
    """The canonical github origin is returned when it exists, None otherwise"""
    for protocol in ("https", "git", "http"):
        html_input_url = _url_github_html(user_repo, protocol=protocol)
        html_url = _url_github_html(user_repo)
        api_url = _url_github_api(_sanitize_github_url(user_repo))

        # Mocked API answer: 404 with an empty body when no canonical url is
        # expected, 200 with the sanitized html url otherwise
        if expected_url is None:
            status_code, response = 404, {}
        else:
            status_code = 200
            response = {"html_url": _sanitize_github_url(html_url)}

        requests_mock.get(api_url, [{"status_code": status_code, "json": response}])

        # Function call, anonymous
        anonymous_url = get_canonical_github_origin_url(html_input_url)
        assert anonymous_url == expected_url

        # Function call, with credentials
        authenticated_url = get_canonical_github_origin_url(
            html_input_url, credentials=github_credentials
        )
        assert authenticated_url == expected_url

        # Session method, anonymous
        anonymous_session = GitHubSession(user_agent="GitHub Session Test")
        assert anonymous_session.get_canonical_url(html_input_url) == expected_url

        # Session method, with credentials
        authenticated_session = GitHubSession(
            user_agent="GitHub Session Test", credentials=github_credentials
        )
        assert authenticated_session.get_canonical_url(html_input_url) == expected_url
예제 #5
0
def test_github_session_anonymous_session():
    """GitHubSession without credentials should be anonymous and set its headers"""
    # Must be a plain string: a stray trailing comma here would silently turn the
    # value into a 1-tuple, which is not a valid HTTP header value.
    user_agent = "GitHub Session Test"
    github_session = GitHubSession(
        user_agent=user_agent,
    )
    assert github_session.anonymous is True

    actual_headers = github_session.session.headers
    assert actual_headers["Accept"] == "application/vnd.github.v3+json"
    assert actual_headers["User-Agent"] == user_agent
예제 #6
0
def test_get_canonical_github_origin_url_not_gh_origin():
    """An origin url that is not a github one must be returned unchanged"""
    non_github_url = "https://example.org"

    # Through the standalone function
    assert get_canonical_github_origin_url(non_github_url) == non_github_url

    # Through an anonymous session
    session = GitHubSession(user_agent="GitHub Session Test")
    assert session.get_canonical_url(non_github_url) == non_github_url
예제 #7
0
    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: CredentialsType = None,
        first_id: Optional[int] = None,
        last_id: Optional[int] = None,
    ):
        """Initialise the lister state and its GitHub session.

        Args:
            scheduler: forwarded to the parent lister constructor.
            credentials: forwarded to the parent lister constructor.
            first_id: stored on the instance; relisting is on when it is set.
            last_id: stored on the instance; relisting is on when it is set.
        """
        super().__init__(
            scheduler=scheduler,
            url=self.API_URL,
            instance="github",
            credentials=credentials,
        )

        self.first_id, self.last_id = first_id, last_id

        # Relisting mode is active as soon as either bound was provided
        self.relisting = not (first_id is None and last_id is None)

        self.github_session = GitHubSession(
            user_agent=USER_AGENT, credentials=self.credentials
        )
예제 #8
0
def test_github_session_authenticated_credentials(
    caplog, github_credentials, all_tokens
):
    """In authenticated mode, GitHubSession must carry an Authorization header"""
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    session = GitHubSession("GitHub Session Test", credentials=github_credentials)

    assert session.anonymous is False
    assert session.token_index == 0

    # Compare the session credentials, ordered by username, with the fixture
    credentials_by_username = sorted(
        session.credentials, key=lambda credential: credential["username"]
    )
    assert credentials_by_username == github_credentials

    # The header must be built from one of the known tokens
    accepted_authorizations = [f"token {token}" for token in all_tokens]
    assert session.session.headers["Authorization"] in accepted_authorizations
예제 #9
0
class MavenLister(Lister[MavenListerState, RepoPage]):
    """List origins from a Maven repository.

    Maven Central provides artifacts for Java builds.
    It includes POM files and source archives, which we download to get
    the source code of artifacts and links to their scm repository.

    This lister yields origins of types: git/svn/hg or whatever the Artifacts
    use as repository type, plus maven types for the maven loader (tgz, jar)."""

    LISTER_NAME = "maven"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        index_url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
        incremental: bool = True,
    ):
        """Lister class for Maven repositories.

        Args:
            scheduler: scheduler interface, forwarded to the parent lister.
            url: main URL of the Maven repository, i.e. url of the base index
                used to fetch maven artifacts. For Maven central use
                https://repo1.maven.org/maven2/
            index_url: the URL to download the exported text indexes from.
                Would typically be a local host running the export docker image.
                See README.md in this directory for more information.
                It must be set before calling get_pages().
            instance: Name of maven instance. Defaults to url's network location
                if unset.
            credentials: forwarded to the parent lister; the resulting
                ``self.credentials`` is then handed to the GitHub session.
            incremental: bool, defaults to True. Defines if incremental listing
                is activated or not.

        """
        self.BASE_URL = url
        self.INDEX_URL = index_url
        self.incremental = incremental

        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )

        # HTTP session used to fetch pom files from the Maven repository.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "User-Agent": USER_AGENT,
            }
        )

        # Jar origins already built, indexed by origin URL.
        self.jar_origins: Dict[str, ListedOrigin] = {}
        # Created after super().__init__() because it reads self.credentials.
        self.github_session = GitHubSession(
            credentials=self.credentials, user_agent=USER_AGENT
        )

    def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
        """Build a MavenListerState from its dict representation."""
        return MavenListerState(**d)

    def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
        """Convert a MavenListerState into a plain dict."""
        return asdict(state)

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
        """Fetch ``url`` with the lister HTTP session.

        Args:
            url: the URL to fetch.
            params: query parameters sent along with the request.

        Raises:
            requests.HTTPError: through ``raise_for_status()`` on an error status.

        Returns:
            The HTTP response.

        """
        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[RepoPage]:
        """Retrieve and parse exported maven indexes to
        identify all pom files and src archives.

        Source archives ("maven" pages) are yielded while the index is being
        read; pom files are fetched afterwards and yield "scm" pages.
        """

        # Example of returned RepoPage's:
        # [
        #   {
        #     "type": "maven",
        #     "url": "https://maven.xwiki.org/..-5.4.2-sources.jar",
        #     "time": 1626109619335,
        #     "gid": "org.xwiki.platform",
        #     "aid": "xwiki-platform-wikistream-events-xwiki",
        #     "version": "5.4.2"
        #   },
        #   {
        #     "type": "scm",
        #     "url": "scm:git:git://github.com/openengsb/openengsb-framework.git",
        #     "project": "openengsb-framework",
        #   },
        #   ...
        # ]

        # Download the main text index file.
        logger.info("Downloading computed index from %s.", self.INDEX_URL)
        assert self.INDEX_URL is not None
        response = requests.get(self.INDEX_URL, stream=True)
        if response.status_code != 200:
            logger.error("Index %s not found, stopping", self.INDEX_URL)
            response.raise_for_status()

        # Prepare regexes to parse index exports.

        # Parse doc id.
        # Example line: "doc 13"
        re_doc = re.compile(r"^doc (?P<doc>\d+)$")

        # Parse gid, aid, version, classifier, extension.
        # Example line: "    value al.aldi|sprova4j|0.1.0|sources|jar"
        re_val = re.compile(
            r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|"
            + r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$"
        )

        # Parse last modification time.
        # Example line: "    value jar|1626109619335|14316|2|2|0|jar"
        re_time = re.compile(
            r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)"
            + r"\|([^|]+)\|([^|]+)$"
        )

        # Read file line by line and process it
        out_pom: Dict = {}
        jar_src: Dict = {}
        doc_id: int = 0
        jar_src["doc"] = None
        url_src = None

        iterator = response.iter_lines(chunk_size=1024)
        for line_bytes in iterator:
            # Read the index text export and get URLs and SCMs.
            line = line_bytes.decode(errors="ignore")
            m_doc = re_doc.match(line)
            if m_doc is not None:
                doc_id = int(m_doc.group("doc"))
                # jar_src["doc"] contains the id of the current document, whatever
                # its type (scm or jar).
                jar_src["doc"] = doc_id
            else:
                m_val = re_val.match(line)
                if m_val is not None:
                    (gid, aid, version, classifier, ext) = m_val.groups()
                    ext = ext.strip()
                    path = "/".join(gid.split("."))
                    if classifier == "NA" and ext.lower() == "pom":
                        # If incremental mode, we don't record any line that is
                        # before our last recorded doc id.
                        if (
                            self.incremental
                            and self.state
                            and self.state.last_seen_pom
                            and self.state.last_seen_pom >= doc_id
                        ):
                            continue
                        url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}"
                        url_pom = urljoin(
                            self.BASE_URL,
                            url_path,
                        )
                        out_pom[url_pom] = doc_id
                    elif (
                        classifier.lower() == "sources" or ("src" in classifier)
                    ) and ext.lower() in ("zip", "jar"):
                        url_path = (
                            f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}"
                        )
                        # Remember the archive; it is only yielded once its
                        # modification time line has been read.
                        url_src = urljoin(self.BASE_URL, url_path)
                        jar_src["gid"] = gid
                        jar_src["aid"] = aid
                        jar_src["version"] = version
                else:
                    m_time = re_time.match(line)
                    if m_time is not None and url_src is not None:
                        time = m_time.group("mtime")
                        jar_src["time"] = int(time)
                        artifact_metadata_d = {
                            "type": "maven",
                            "url": url_src,
                            **jar_src,
                        }
                        logger.debug(
                            "* Yielding jar %s: %s", url_src, artifact_metadata_d
                        )
                        yield artifact_metadata_d
                        # Reset so that later time lines do not re-yield it.
                        url_src = None

        logger.info("Found %s poms.", len(out_pom))

        # Now fetch pom files and scan them for scm info.

        logger.info("Fetching poms..")
        for pom in out_pom:
            try:
                response = self.page_request(pom, {})
                project = xmltodict.parse(response.content)
                project_d = project.get("project", {})
                scm_d = project_d.get("scm")
                if scm_d is not None:
                    connection = scm_d.get("connection")
                    if connection is not None:
                        artifact_metadata_d = {
                            "type": "scm",
                            "doc": out_pom[pom],
                            "url": connection,
                        }
                        logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d)
                        yield artifact_metadata_d
                    else:
                        logger.debug("No scm.connection in pom %s", pom)
                else:
                    logger.debug("No scm in pom %s", pom)
            except requests.HTTPError:
                logger.warning(
                    "POM info page could not be fetched, skipping project '%s'",
                    pom,
                )
            except xmltodict.expat.ExpatError as error:
                logger.info("Could not parse POM %s XML: %s. Next.", pom, error)

    def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
        """Retrieve scm origin out of the page information. Only called when type of the
        page is scm.

        Try and detect an scm/vcs repository. Note that official format is in the form:
        scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put
        the repo url (without the "scm:type"), so we have to check against the content
        to extract the type and url properly.

        Raises
            AssertionError when the type of the page is not 'scm'

        Returns
            ListedOrigin with proper canonical scm url (for github) if any is found,
            None otherwise.

        """

        assert page["type"] == "scm"
        visit_type: Optional[str] = None
        url: Optional[str] = None
        m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
        if m_scm is None:
            return None

        scm_type = m_scm.group("type")
        if scm_type and scm_type in SUPPORTED_SCM_TYPES:
            url = m_scm.group("url")
            visit_type = scm_type
        elif page["url"].endswith(".git"):
            # The regex above guarantees the "scm:" prefix. Slice it off rather
            # than using str.lstrip("scm:"), which strips any leading run of the
            # characters "s", "c", "m" and ":" and would mangle urls such as
            # "scm:ssh://host/repo.git" into "h://host/repo.git".
            url = page["url"][len("scm:") :]
            visit_type = "git"
        else:
            return None

        if url and visit_type == "git":
            # Non-github urls will be returned as is, github ones will be canonical ones
            url = self.github_session.get_canonical_url(url)

        if not url:
            return None

        assert visit_type is not None
        assert self.lister_obj.id is not None
        return ListedOrigin(
            lister_id=self.lister_obj.id,
            url=url,
            visit_type=visit_type,
        )

    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Convert a page of Maven repositories into a list of ListedOrigins."""
        if page["type"] == "scm":
            listed_origin = self.get_scm(page)
            if listed_origin:
                yield listed_origin
        else:
            # Origin is gathering source archives:
            last_update_dt = None
            last_update_iso = ""
            try:
                # Drop the last three digits to get seconds; the index timestamp
                # looks like epoch milliseconds (e.g. 1626109619335).
                last_update_seconds = str(page["time"])[:-3]
                last_update_dt = datetime.fromtimestamp(int(last_update_seconds))
                last_update_dt = last_update_dt.astimezone(timezone.utc)
            except (OverflowError, ValueError):
                logger.warning("- Failed to convert datetime %s.", last_update_seconds)
            if last_update_dt:
                last_update_iso = last_update_dt.isoformat()

            # Origin URL will target page holding sources for all versions of
            # an artifactId (package name) inside a groupId (namespace)
            path = "/".join(page["gid"].split("."))
            origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}")

            # Artifact description passed to the loader; the "doc" index id is
            # left out and "time" is replaced by its ISO representation.
            artifact = {
                **{k: v for k, v in page.items() if k != "doc"},
                "time": last_update_iso,
                "base_url": self.BASE_URL,
            }

            if origin_url not in self.jar_origins:
                # Create ListedOrigin instance if we did not see that origin yet
                assert self.lister_obj.id is not None
                jar_origin = ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=origin_url,
                    visit_type=page["type"],
                    last_update=last_update_dt,
                    extra_loader_arguments={"artifacts": [artifact]},
                )
                self.jar_origins[origin_url] = jar_origin
            else:
                # Update list of source artifacts for that origin otherwise
                jar_origin = self.jar_origins[origin_url]
                artifacts = jar_origin.extra_loader_arguments["artifacts"]
                if artifact not in artifacts:
                    artifacts.append(artifact)

            # Keep the most recent modification time seen for that origin.
            if (
                jar_origin.last_update
                and last_update_dt
                and last_update_dt > jar_origin.last_update
            ):
                jar_origin.last_update = last_update_dt

            if not self.incremental or (
                self.state and page["doc"] > self.state.last_seen_doc
            ):
                # Yield origin with updated source artifacts, multiple instances of
                # ListedOrigin for the same origin URL but with different artifacts
                # list will be sent to the scheduler but it will deduplicate them and
                # take the latest one to upsert in database
                yield jar_origin

    def commit_page(self, page: RepoPage) -> None:
        """Update currently stored state using the latest listed doc.

        Note: this is a noop for full listing mode

        """
        if self.incremental and self.state:
            # We need to differentiate the two state counters according
            # to the type of origin.
            if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc:
                self.state.last_seen_doc = page["doc"]
            elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom:
                self.state.last_seen_doc = page["doc"]
                self.state.last_seen_pom = page["doc"]

    def finalize(self) -> None:
        """Finalize the lister state, set update if any progress has been made.

        Note: this is a noop for full listing mode

        """
        if self.incremental and self.state:
            last_seen_doc = self.state.last_seen_doc
            last_seen_pom = self.state.last_seen_pom

            # Compare against the state currently stored in the scheduler and
            # only flag an update when one of the two counters moved forward.
            scheduler_state = self.get_state_from_scheduler()
            if last_seen_doc and last_seen_pom:
                if (scheduler_state.last_seen_doc < last_seen_doc) or (
                    scheduler_state.last_seen_pom < last_seen_pom
                ):
                    self.updated = True
예제 #10
0
class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
    """List origins from GitHub.

    By default, the lister runs in incremental mode: it lists all repositories,
    starting with the `last_seen_id` stored in the scheduler backend.

    Providing the `first_id` and `last_id` arguments enables the "relisting" mode: in
    that mode, the lister finds the origins present in the range **excluding**
    `first_id` and **including** `last_id`. In this mode, the lister can overrun the
    `last_id`: it will always record all the origins seen in a given page. As the lister
    is fully idempotent, this is not a practical problem. Once relisting completes, the
    lister state in the scheduler backend is not updated.

    When the config contains a set of credentials, we shuffle this list at the beginning
    of the listing. To follow GitHub's `abuse rate limit policy`_, we keep using the
    same token over and over again, until its rate limit runs out. Once that happens, we
    switch to the next token over in our shuffled list.

    When a request fails with a rate limit exception for all tokens, we pause the
    listing until the largest value for X-Ratelimit-Reset over all tokens.

    When the credentials aren't set in the lister config, the lister can run in
    anonymous mode too (e.g. for testing purposes).

    .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits


    Args:
      first_id: the id of the first repo to list
      last_id: stop listing after seeing a repo with an id higher than this value.

    """  # noqa: B950

    LISTER_NAME = "github"

    API_URL = "https://api.github.com/repositories"
    PAGE_SIZE = 1000

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: CredentialsType = None,
        first_id: Optional[int] = None,
        last_id: Optional[int] = None,
    ):
        """Initialise the lister state and its GitHub session."""
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=self.API_URL,
            instance="github",
        )

        self.first_id = first_id
        self.last_id = last_id

        # Relisting mode is enabled as soon as one of the bounds is provided.
        self.relisting = self.first_id is not None or self.last_id is not None

        self.github_session = GitHubSession(
            credentials=self.credentials, user_agent=USER_AGENT
        )

    def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState:
        """Build a GitHubListerState from its dict representation."""
        return GitHubListerState(**d)

    def state_to_dict(self, state: GitHubListerState) -> Dict[str, Any]:
        """Convert a GitHubListerState into a plain dict."""
        return asdict(state)

    def get_pages(self) -> Iterator[List[Dict[str, Any]]]:
        """Yield pages of repositories, following the `next` links of the API.

        Listing starts after `first_id` when it is set, after the stored
        `last_seen_id` otherwise, and stops once `last_id` is reached, when no
        `next` link is returned, or when a request cannot be completed.
        """
        current_id = 0
        if self.first_id is not None:
            current_id = self.first_id
        elif self.state is not None:
            current_id = self.state.last_seen_id

        current_url = f"{self.API_URL}?since={current_id}&per_page={self.PAGE_SIZE}"

        while self.last_id is None or current_id < self.last_id:
            logger.debug("Getting page %s", current_url)

            try:
                response = self.github_session.request(current_url)
            except MissingRateLimitReset:
                # Give up
                break

            # We've successfully retrieved a (non-ratelimited) `response`. We
            # still need to check it for validity.

            if response.status_code != 200:
                logger.warning(
                    "Got unexpected status_code %s: %s",
                    response.status_code,
                    response.content,
                )
                break

            yield response.json()

            if "next" not in response.links:
                # No `next` link, we've reached the end of the world
                logger.debug(
                    "No next link found in the response headers, all caught up"
                )
                break

            # GitHub strongly advises to use the next link directly. We still
            # parse it to get the id of the last repository we've reached so
            # far.
            next_url = response.links["next"]["url"]
            parsed_url = urlparse(next_url)
            if not parsed_url.query:
                logger.warning("Failed to parse url %s", next_url)
                break

            parsed_query = parse_qs(parsed_url.query)
            current_id = int(parsed_query["since"][0])
            current_url = next_url

    def get_origins_from_page(
        self, page: List[Dict[str, Any]]
    ) -> Iterator[ListedOrigin]:
        """Convert a page of GitHub repositories into a list of ListedOrigins.

        This records the html_url, as well as the pushed_at value if it exists.
        """
        assert self.lister_obj.id is not None

        # html urls already yielded for this page, to skip duplicates.
        seen_in_page: Set[str] = set()

        for repo in page:
            if not repo:
                # null repositories in listings happen sometimes...
                continue

            if repo["html_url"] in seen_in_page:
                continue
            seen_in_page.add(repo["html_url"])

            pushed_at_str = repo.get("pushed_at")
            pushed_at: Optional[datetime.datetime] = None
            if pushed_at_str:
                pushed_at = iso8601.parse_date(pushed_at_str)

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=repo["html_url"],
                visit_type="git",
                last_update=pushed_at,
            )

    def commit_page(self, page: List[Dict[str, Any]]):
        """Update the currently stored state using the latest listed page"""
        if self.relisting:
            # Don't update internal state when relisting
            return

        if not page:
            # Sometimes, when you reach the end of the world, GitHub returns an empty
            # page of repositories
            return

        # Null repositories happen in listings (see get_origins_from_page), so
        # use the last non-null entry instead of blindly indexing page[-1], which
        # would raise a TypeError on a trailing null.
        last_id = next((repo["id"] for repo in reversed(page) if repo), None)
        if last_id is None:
            # The page only holds null repositories: nothing to record
            return

        if last_id > self.state.last_seen_id:
            self.state.last_seen_id = last_id

    def finalize(self):
        """Flag the lister state as updated when the listing made progress.

        Note: this is a noop in relisting mode
        """
        if self.relisting:
            return

        # Pull fresh lister state from the scheduler backend
        scheduler_state = self.get_state_from_scheduler()

        # Update the lister state in the backend only if the last seen id of
        # the current run is higher than that stored in the database.
        if self.state.last_seen_id > scheduler_state.last_seen_id:
            self.updated = True