예제 #1
0
    def normalize_version(self, component: Component, info: dict,
                          releases: dict) -> bool:
        """
        Normalize a PyPI metadata blob to our schema and save it
        to the database.

        Params:
            purl: The PackageURL we're tying this all to.
            data: The version-specific sub-tree from the Pypi registry.
            top_level: The full tree from the Pypi registry.
        """
        if component is None:
            raise ValueError("Missing component.")
        if info is None:
            raise ValueError("Missing info.")
        if releases is None:
            raise ValueError("Missing release.")

        version_str = info.get("version")
        version, created = ComponentVersion.objects.get_or_create(
            component=component,
            version=version_str)  # type: ComponentVersion, bool

        version.update_metadata(MetadataType.SOURCE, "data-source", "pypi.org")

        if created:
            logger.debug("Adding PyPI: %s@%s", component.name, version_str)
        else:
            logger.debug("Reloading PyPI: %s@%s", component.name, version_str)

        version.description = info.get("description") or info.get(
            "summary") or ""

        # Add author and maintainer information
        version.add_maintainer(info.get("author_email"), info.get("author"))
        version.add_maintainer(info.get("maintainer_email"),
                               info.get("maintainer"))

        # Add relevant URLs test
        urls = []
        if url := check_url(get_complex(info, "home_page")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
예제 #2
0
class PyPIImporter(BaseImporter):
    """Imports a PyPI package into OpenSSFMetric.

    This function is time-consuming, making one HTTP call per version of the
    project being imported. As such, it should not be called from a process that
    services web calls.
    """

    PYPI_API_ENDPOINT = "https://pypi.org/pypi"

    def import_component(self, component_purl: PackageURL) -> int:
        """Import all versions of a component specified by component_purl.

        The versionless JSON endpoint is fetched once to discover the list of
        releases; each release is then fetched from its own endpoint and
        handed to ``normalize_version``.

        Args:
            component_purl: Identifies the package; only its ``name`` is used.

        Returns:
            The number of versions imported, either as new or as updates to
            existing content in the database. ``0`` is returned when the
            package-level endpoint does not answer with HTTP 200.

        Raises:
            ValueError: If ``component_purl`` is falsy or the package-level
                response carries no ``info`` field.
            requests.RequestException: If a request cannot be completed
                (for example a connection failure or a timeout).
        """
        if not component_purl:
            raise ValueError("No component specified.")

        # Bound every HTTP call so a stalled connection cannot block the
        # import forever; requests applies no timeout unless asked to.
        timeout_seconds = 30

        package_name = component_purl.name
        endpoint = f"{self.PYPI_API_ENDPOINT}/{package_name}/json"

        response = requests.get(endpoint, timeout=timeout_seconds)
        if response.status_code != 200:
            logger.warning("Error retrieving endpoint %s, status=%d", endpoint,
                           response.status_code)
            return 0

        result = response.json()

        # Common metadata for all releases
        info = result.get("info")
        if info is None:
            logger.warning('The "info" field is missing the metadata JSON.')
            raise ValueError(f'Missing "info" field in {package_name}')

        # Remove everything but the purl type and component name
        purl = PackageURL(type="pypi", name=package_name)
        component, _ = Component.objects.get_or_create(
            component_purl=str(purl))

        num_imported = 0

        # Add or update the top-level component
        self.normalize_component(component, info)

        # Only the version strings are needed; "or {}" also covers a
        # "releases" key that is present but null.
        for version in result.get("releases") or {}:
            logger.debug("Importing: %s@%s", purl, version)

            # Load the endpoint for this specific version
            # This is needed because the versionless API endpoint gives back an info
            # dictionary that contains information for only the latest version.
            endpoint = f"{self.PYPI_API_ENDPOINT}/{package_name}/{version}/json"
            response = requests.get(endpoint, timeout=timeout_seconds)
            if response.status_code != 200:
                logger.warning("Error retrieving endpoint %s, status=%d",
                               endpoint, response.status_code)
                continue

            version_result = response.json()

            version_info = version_result.get("info")
            # A missing or null "releases" mapping yields None here, which
            # normalize_version rejects with a ValueError that is logged below.
            version_releases = (version_result.get("releases")
                                or {}).get(version)

            try:
                if self.normalize_version(component, version_info,
                                          version_releases):
                    num_imported += 1
            except ValueError:
                logger.warning("Unable to normalize %s", purl, exc_info=True)

        return num_imported

    def normalize_component(self, component: Component, info: dict) -> bool:
        """Copy package-level fields from a PyPI ``info`` dictionary.

        Sets the component's name and update timestamp, tags its data source
        as ``pypi.org`` and saves it.

        Args:
            component: The component record to update.
            info: The ``info`` dictionary from the PyPI API response.

        Returns:
            Always ``True``.

        Raises:
            ValueError: If ``component`` or ``info`` is ``None``.
        """
        required = (
            (component, "Missing component."),
            (info, "Missing info."),
        )
        for value, message in required:
            if value is None:
                raise ValueError(message)

        package_name = info.get("name")
        # NOTE(review): datetime.now() without a tz argument yields a naive
        # timestamp -- confirm that is what updated_dt expects.
        refreshed_at = datetime.datetime.now()

        component.name = package_name
        component.updated_dt = refreshed_at
        component.update_metadata(MetadataType.SOURCE, "data-source",
                                  "pypi.org")
        component.save()

        return True

    def normalize_version(self, component: Component, info: dict,
                          releases: dict) -> bool:
        """
        Normalize a PyPI metadata blob to our schema and save it
        to the database.

        Params:
            purl: The PackageURL we're tying this all to.
            data: The version-specific sub-tree from the Pypi registry.
            top_level: The full tree from the Pypi registry.
        """
        if component is None:
            raise ValueError("Missing component.")
        if info is None:
            raise ValueError("Missing info.")
        if releases is None:
            raise ValueError("Missing release.")

        version_str = info.get("version")
        version, created = ComponentVersion.objects.get_or_create(
            component=component,
            version=version_str)  # type: ComponentVersion, bool

        version.update_metadata(MetadataType.SOURCE, "data-source", "pypi.org")

        if created:
            logger.debug("Adding PyPI: %s@%s", component.name, version_str)
        else:
            logger.debug("Reloading PyPI: %s@%s", component.name, version_str)

        version.description = info.get("description") or info.get(
            "summary") or ""

        # Add author and maintainer information
        version.add_maintainer(info.get("author_email"), info.get("author"))
        version.add_maintainer(info.get("maintainer_email"),
                               info.get("maintainer"))

        # Add relevant URLs test
        urls = []
        if url := check_url(get_complex(info, "home_page")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
        if url := check_url(get_complex(info, "project_urls.Homepage")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
예제 #3
0
    def normalize_version(self, component: Component, data: dict,
                          top_data: dict) -> bool:
        """
        Normalize a NPM metadata blob to our schema and save it
        to the database.

        Params:
            component: the Component we're tying this all to.
            data: the version data from the NPM registry.
        """
        if component is None:
            raise ValueError("Missing component.")
        if data is None:
            raise ValueError("Missing data.")
        if top_data is None:
            raise ValueError("Missing top_data.")

        version, created = ComponentVersion.objects.get_or_create(
            component=component,
            version=data.get("version"),
        )  # type: ComponentVersion, bool

        # Data Source
        component.update_metadata(MetadataType.SOURCE, "data-source",
                                  "registry.npmjs.org")

        if created:
            logger.debug("Adding NPM: %s@%s", component.name,
                         data.get("version"))
        else:
            logger.debug("Reloading NPM: %s@%s", component.name,
                         data.get("version"))

        version.description = data.get("description", "")

        # Add author and maintainer information
        maintainers = get_complex(data, "contributors", []) + get_complex(
            data, "maintainers", [])
        authors = get_complex(data, "author")
        if isinstance(authors, dict):
            maintainers += [authors]
        elif isinstance(authors, str):
            maintainers += [{"name": authors}]
        elif isinstance(authors, list):
            maintainers += authors
        else:
            logger.warning("Invalid type for 'authors': %s, %s", type(authors),
                           authors)

        maintainers = [m for m in maintainers if m != []]

        for maintainer in maintainers:
            version.add_maintainer(maintainer.get("email"),
                                   maintainer.get("name"))

        # Add relevant URLs test
        urls = []
        if url := check_url(get_complex(data, "homepage")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
예제 #4
0
        # Add author and maintainer information
        version.add_maintainer(info.get("author_email"), info.get("author"))
        version.add_maintainer(info.get("maintainer_email"),
                               info.get("maintainer"))

        # Add relevant URLs test
        urls = []
        if url := check_url(get_complex(info, "home_page")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
        if url := check_url(get_complex(info, "project_urls.Homepage")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
        if url := check_url(get_complex(info, "project_urls.Download")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.DOWNLOAD,
                                          url=url)[0])
        if url := check_url(get_complex(info, "docs_url")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.DOCUMENTATION,
                                          url=url)[0])
        if url := check_url(get_complex(info, "project_urls.Documentation")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.DOCUMENTATION,
                                          url=url)[0])
        if url := check_url(get_complex(info, "bugtrack_url")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.ISSUE_TRACKER,
                                          url=url)[0])
예제 #5
0
class NPMImporter(BaseImporter):
    """Imports an NPM package into OpenSSFMetric.
    """

    NPM_API_ENDPOINT = "https://registry.npmjs.org"

    def import_component(self, component_purl: PackageURL) -> int:
        """Import all versions of a component specified by component_purl.

        Args:
            component_purl: Identifies the package. ``name`` is the package
                name; ``namespace``, when set, is treated as the scope of a
                scoped package.

        Returns:
            The number of versions imported, either as new or as updates to
            existing content in the database.

        Raises:
            ValueError: If ``component_purl`` is falsy.
            requests.HTTPError: If the NPM registry answers with an error
                status.
            requests.RequestException: If the request cannot be completed
                (for example a connection failure or a timeout).
        """
        if not component_purl:
            raise ValueError("No component specified.")

        # A scoped package is addressed as "<scope>%2F<name>". Using the bare
        # name would fetch a different, unscoped package of the same name.
        # NOTE(review): assumes the namespace already carries the leading "@"
        # (as in the purl spec's npm examples) -- confirm against callers.
        scope = component_purl.namespace
        if scope:
            package_path = f"{scope}%2F{component_purl.name}"
        else:
            package_path = component_purl.name
        endpoint = f"{self.NPM_API_ENDPOINT}/{package_path}"

        # Bound the call so a stalled connection cannot block the import
        # forever; requests applies no timeout unless asked to.
        response = requests.get(endpoint, timeout=30)
        response.raise_for_status()
        result = response.json()

        # Remove everything but the purl type, scope and component name
        purl = PackageURL(type="npm",
                          namespace=scope,
                          name=component_purl.name)
        component, _ = Component.objects.get_or_create(
            component_purl=str(purl))

        num_imported = 0

        # Add or update the top-level component
        self.normalize_component(component, result)

        # Iterate through each release; "or {}" also covers a "versions" key
        # that is present but null.
        for version, version_info in (result.get("versions") or {}).items():
            logger.debug("Importing: %s@%s", purl, version)

            try:
                if self.normalize_version(component, version_info, result):
                    num_imported += 1
            except ValueError:
                logger.warning("Unable to normalize %s", purl, exc_info=True)

        return num_imported

    def normalize_component(self, component: Component, data: dict) -> bool:
        """Copy package-level fields from an NPM API response onto a component.

        Sets the component's name and update timestamp, tags its data source
        as ``registry.npmjs.org`` and saves it.

        Args:
            component: The component record to update.
            data: The dictionary returned by the NPM API for the package.

        Returns:
            Always ``True``.

        Raises:
            ValueError: If ``component`` or ``data`` is ``None``.
        """
        required = (
            (component, "Missing component."),
            (data, "Missing data."),
        )
        for value, message in required:
            if value is None:
                raise ValueError(message)

        package_name = data.get("name")
        # NOTE(review): datetime.now() without a tz argument yields a naive
        # timestamp -- confirm that is what updated_dt expects.
        refreshed_at = datetime.datetime.now()

        component.name = package_name
        component.updated_dt = refreshed_at
        component.update_metadata(MetadataType.SOURCE, "data-source",
                                  "registry.npmjs.org")
        component.save()

        return True

    def normalize_version(self, component: Component, data: dict,
                          top_data: dict) -> bool:
        """
        Normalize a NPM metadata blob to our schema and save it
        to the database.

        Params:
            component: the Component we're tying this all to.
            data: the version data from the NPM registry.
        """
        if component is None:
            raise ValueError("Missing component.")
        if data is None:
            raise ValueError("Missing data.")
        if top_data is None:
            raise ValueError("Missing top_data.")

        version, created = ComponentVersion.objects.get_or_create(
            component=component,
            version=data.get("version"),
        )  # type: ComponentVersion, bool

        # Data Source
        component.update_metadata(MetadataType.SOURCE, "data-source",
                                  "registry.npmjs.org")

        if created:
            logger.debug("Adding NPM: %s@%s", component.name,
                         data.get("version"))
        else:
            logger.debug("Reloading NPM: %s@%s", component.name,
                         data.get("version"))

        version.description = data.get("description", "")

        # Add author and maintainer information
        maintainers = get_complex(data, "contributors", []) + get_complex(
            data, "maintainers", [])
        authors = get_complex(data, "author")
        if isinstance(authors, dict):
            maintainers += [authors]
        elif isinstance(authors, str):
            maintainers += [{"name": authors}]
        elif isinstance(authors, list):
            maintainers += authors
        else:
            logger.warning("Invalid type for 'authors': %s, %s", type(authors),
                           authors)

        maintainers = [m for m in maintainers if m != []]

        for maintainer in maintainers:
            version.add_maintainer(maintainer.get("email"),
                                   maintainer.get("name"))

        # Add relevant URLs test
        urls = []
        if url := check_url(get_complex(data, "homepage")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
        if url := check_url(get_complex(data, "bugs.url")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.ISSUE_TRACKER,
                                          url=url)[0])
예제 #6
0
        for maintainer in maintainers:
            version.add_maintainer(maintainer.get("email"),
                                   maintainer.get("name"))

        # Add relevant URLs test
        urls = []
        if url := check_url(get_complex(data, "homepage")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
        if url := check_url(get_complex(data, "bugs.url")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.ISSUE_TRACKER,
                                          url=url)[0])
        if url := check_url(get_complex(data, "repository.url")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.SOURCE_REPO,
                                          url=url)[0])
        version.urls.add(*urls)

        # Declared dependencies
        dependencies = []
        if data.get("dependencies"):
            for dependency, _ in data.get("dependencies", {}).items():
                dependencies.append(dependency)
            version.update_metadata(MetadataType.SOURCE, "dependencies",
                                    dependencies)

        dependencies = []
        if data.get("devDependencies"):
class GitHubImporter(BaseImporter):
    """Imports a GitHub repository into OpenSSFMetric.
    """

    GITHUB_API_ENDPOINT = "https://api.github.com"

    def __init__(self, **kwargs):
        """Initialise the importer and verify a GitHub token is configured.

        Args:
            **kwargs: Forwarded unchanged to the parent initialiser.

        Raises:
            Exception: If ``self.config`` has no ``GITHUB_TOKEN`` key.
        """
        super().__init__(**kwargs)

        has_token = "GITHUB_TOKEN" in self.config
        if has_token:
            return

        # Only the presence of the key is checked, not its value.
        raise Exception(
            "GITHUB_TOKEN not defined, unable to import without it.")

    def import_component(self, component_purl: PackageURL) -> int:
        """Import all releases of the repository named by component_purl.

        Args:
            component_purl: Identifies the repository; ``namespace`` is used
                as the owner and ``name`` as the repository name.

        Returns:
            The number of releases imported, either as new or as updates to
            existing content in the database.

        Raises:
            ValueError: If ``component_purl`` is falsy or lacks a namespace
                or a name.

        Errors raised by the GitHub client while fetching the repository or
        its releases are not caught here and propagate to the caller.
        """
        if not component_purl:
            raise ValueError("No component specified.")

        owner = component_purl.namespace
        repo_name = component_purl.name
        # Without this guard a missing owner would be formatted as the
        # literal "None/<name>" and looked up as if it were a real repository.
        if not owner or not repo_name:
            raise ValueError(
                "GitHub component requires both a namespace and a name.")

        github = Github(self.config.get("GITHUB_TOKEN"))  # type: Github

        repo = github.get_repo(f"{owner}/{repo_name}")

        # Remove everything but the purl type, namespace and component name
        purl = PackageURL(type="github", namespace=owner, name=repo_name)
        component, _ = Component.objects.get_or_create(
            component_purl=str(purl))

        num_imported = 0

        # Add or update the top-level component
        self.normalize_component(component, repo)

        # Iterate through each release
        for release in repo.get_releases():  # type: GitRelease
            logger.debug("Importing: %s@%s", purl, release.tag_name)

            try:
                if self.normalize_release(component, release, repo):
                    num_imported += 1
            except ValueError:
                logger.warning("Unable to normalize %s", purl, exc_info=True)

        return num_imported

    def normalize_component(self, component: Component,
                            data: Repository) -> bool:
        """Copy repository-level fields from a GitHub repository object.

        Sets the component's name and update timestamp, records several
        repository attributes as SOURCE metadata and saves the component.

        Args:
            component: The component record to update.
            data: The repository object returned by the GitHub client.

        Returns:
            Always ``True``.

        Raises:
            ValueError: If ``component`` or ``data`` is ``None``.
        """
        required = (
            (component, "Missing component."),
            (data, "Missing data."),
        )
        for value, message in required:
            if value is None:
                raise ValueError(message)

        def record(key, value):
            # Every entry written here belongs to the SOURCE metadata type.
            component.update_metadata(MetadataType.SOURCE, key, value)

        component.name = data.full_name
        component.updated_dt = timezone.now()
        record("data-source", "api.github.com")

        # @TODO There is a lot of metadata that we can add here.

        record("is-fork", data.fork)
        record("forks-count", data.forks_count)
        # NOTE(review): this raises AttributeError if pushed_at is ever
        # None -- confirm whether the client can return None here.
        record("push.latest", data.pushed_at.isoformat())
        record("size", data.size)

        component.save()
        return True

    def normalize_release(self, component: Component, data: GitRelease,
                          top_data: Repository) -> bool:
        """
        Normalize GitHub data to our schema and save it to the database.

        Params:
            component: the Component we're tying this all to.
            data: the version data from the NPM registry.
        """
        if component is None:
            raise ValueError("Missing component.")
        if data is None:
            raise ValueError("Missing data.")
        if top_data is None:
            raise ValueError("Missing top_data.")

        version, created = ComponentVersion.objects.get_or_create(
            component=component,
            version=data.tag_name,
        )  # type: ComponentVersion, bool

        # Data Source
        version.update_metadata(MetadataType.SOURCE, "data-source",
                                "api.github.com")

        if created:
            logger.debug("Adding GitHub: %s@%s", component.name, data.tag_name)
        else:
            logger.debug("Reloading GitHub: %s@%s", component.name,
                         data.tag_name)

        version.description = data.body
        version.maintainers.clear()

        author = data.author
        maintainer, _ = Maintainer.objects.get_or_create(
            metadata__SOURCE__contains={
                "scoped-username.github": author.login
            })  # type: Maintainer, bool

        maintainer.add_name(author.name)
        maintainer.add_email(author.email)

        maintainer.update_metadata(MetadataType.SOURCE, "twitter_username",
                                   author.twitter_username)
        maintainer.update_metadata(MetadataType.SOURCE, "avatar_url",
                                   author.avatar_url)
        maintainer.save()
        version.maintainers.add(maintainer)

        # Additional maintainers
        year_ago = timezone.now() - datetime.timedelta(days=365)
        year_ago = year_ago.replace(hour=0, minute=0, second=0,
                                    microsecond=0)  # cache-friendly
        commits = top_data.get_commits(since=year_ago)
        seen_commits = set()
        for commit in commits:
            if commit.author.login in seen_commits:
                continue
            seen_commits.add(commit.author.login)

            maintainer, _ = Maintainer.objects.get_or_create(
                metadata__SOURCE__contains={
                    "scoped-username.github": commit.author.login
                })

            maintainer.add_name(commit.author.name)
            maintainer.add_email(commit.author.email)
            maintainer.update_metadata(MetadataType.SOURCE, "avatar_url",
                                       commit.author.avatar_url)
            maintainer.save()
            version.maintainers.add(maintainer)

        # Add relevant URLs test
        urls = []
        if url := check_url(top_data.homepage):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])

        if top_data.has_issues:
            if url := check_url(top_data.html_url + "/issues"):
                urls.append(
                    Url.objects.get_or_create(url_type=UrlType.ISSUE_TRACKER,
                                              url=url)[0])
    def normalize_release(self, component: Component, data: GitRelease,
                          top_data: Repository) -> bool:
        """
        Normalize GitHub data to our schema and save it to the database.

        Params:
            component: the Component we're tying this all to.
            data: the version data from the NPM registry.
        """
        if component is None:
            raise ValueError("Missing component.")
        if data is None:
            raise ValueError("Missing data.")
        if top_data is None:
            raise ValueError("Missing top_data.")

        version, created = ComponentVersion.objects.get_or_create(
            component=component,
            version=data.tag_name,
        )  # type: ComponentVersion, bool

        # Data Source
        version.update_metadata(MetadataType.SOURCE, "data-source",
                                "api.github.com")

        if created:
            logger.debug("Adding GitHub: %s@%s", component.name, data.tag_name)
        else:
            logger.debug("Reloading GitHub: %s@%s", component.name,
                         data.tag_name)

        version.description = data.body
        version.maintainers.clear()

        author = data.author
        maintainer, _ = Maintainer.objects.get_or_create(
            metadata__SOURCE__contains={
                "scoped-username.github": author.login
            })  # type: Maintainer, bool

        maintainer.add_name(author.name)
        maintainer.add_email(author.email)

        maintainer.update_metadata(MetadataType.SOURCE, "twitter_username",
                                   author.twitter_username)
        maintainer.update_metadata(MetadataType.SOURCE, "avatar_url",
                                   author.avatar_url)
        maintainer.save()
        version.maintainers.add(maintainer)

        # Additional maintainers
        year_ago = timezone.now() - datetime.timedelta(days=365)
        year_ago = year_ago.replace(hour=0, minute=0, second=0,
                                    microsecond=0)  # cache-friendly
        commits = top_data.get_commits(since=year_ago)
        seen_commits = set()
        for commit in commits:
            if commit.author.login in seen_commits:
                continue
            seen_commits.add(commit.author.login)

            maintainer, _ = Maintainer.objects.get_or_create(
                metadata__SOURCE__contains={
                    "scoped-username.github": commit.author.login
                })

            maintainer.add_name(commit.author.name)
            maintainer.add_email(commit.author.email)
            maintainer.update_metadata(MetadataType.SOURCE, "avatar_url",
                                       commit.author.avatar_url)
            maintainer.save()
            version.maintainers.add(maintainer)

        # Add relevant URLs test
        urls = []
        if url := check_url(top_data.homepage):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
            version.maintainers.add(maintainer)

        # Add relevant URLs test
        urls = []
        if url := check_url(top_data.homepage):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])

        if top_data.has_issues:
            if url := check_url(top_data.html_url + "/issues"):
                urls.append(
                    Url.objects.get_or_create(url_type=UrlType.ISSUE_TRACKER,
                                              url=url)[0])

        if url := check_url(top_data.clone_url):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.SOURCE_REPO,
                                          url=url)[0])
        version.urls.add(*urls)

        # Deprecation notices
        version.update_metadata(MetadataType.SOURCE, "deprecation-notice",
                                top_data.archived)

        # Release-specific data
        for asset in data.get_assets():
            filename = os.path.basename(
                urlparse(asset.browser_download_url).path)
            artifact, created = Artifact.objects.get_or_create(
                component_version=version,