# NOTE(review): This span is an overlapping extraction artifact — it duplicates
# the beginning of PyPIImporter.normalize_version (which appears again, in
# context, later in the file) and is cut off mid-function. Code reproduced
# unchanged; formatting and documentation only.
def normalize_version(self, component: Component, info: dict,
                      releases: dict) -> bool:
    """
    Normalize a PyPI metadata blob to our schema and save it to the database.

    Params:
        component: The Component we're tying this all to.
        info: The version-specific "info" sub-tree from the PyPI registry.
        releases: The release sub-tree from the PyPI registry.
    """
    if component is None:
        raise ValueError("Missing component.")
    if info is None:
        raise ValueError("Missing info.")
    if releases is None:
        raise ValueError("Missing release.")
    version_str = info.get("version")
    version, created = ComponentVersion.objects.get_or_create(
        component=component,
        version=version_str)  # type: ComponentVersion, bool
    version.update_metadata(MetadataType.SOURCE, "data-source", "pypi.org")
    if created:
        logger.debug("Adding PyPI: %s@%s", component.name, version_str)
    else:
        logger.debug("Reloading PyPI: %s@%s", component.name, version_str)
    # Prefer the long description; fall back to the summary, then empty.
    version.description = info.get("description") or info.get(
        "summary") or ""
    # Add author and maintainer information
    version.add_maintainer(info.get("author_email"), info.get("author"))
    version.add_maintainer(info.get("maintainer_email"),
                           info.get("maintainer"))
    # Add relevant URLs
    urls = []
    if url := check_url(get_complex(info, "home_page")):
        urls.append(
            Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                      url=url)[0])
    # NOTE(review): fragment ends here — the function continues in the
    # later, in-context copy of normalize_version.
class PyPIImporter(BaseImporter):
    """Imports a PyPI package into OpenSSFMetric.

    This function is time-consuming, making one HTTP call per version of the
    project being imported. As such, it should not be called from a process
    that services web calls.
    """
    # Base URL of the PyPI JSON API.
    PYPI_API_ENDPOINT = "https://pypi.org/pypi"

    def import_component(self, component_purl: PackageURL) -> bool:
        """Import all versions of a component specified by component_purl.

        Returns:
            The number of components imported, either as new or as updates to
            existing content in the database.

        Raises:
            HTTPError if we were unable to connect to the PyPI endpoint.
            ValueError if any other error caused us to not be able to import
            anything.
        """
        # NOTE(review): annotated -> bool but returns False on HTTP error and
        # an int count otherwise; consider returning 0 and annotating -> int.
        if not component_purl:
            raise ValueError("No component specified.")
        endpoint = f"{self.PYPI_API_ENDPOINT}/{component_purl.name}/json"
        response = requests.get(endpoint)
        if response.status_code != 200:
            logger.warning("Error retrieving endpoint %s, status=%d",
                           endpoint, response.status_code)
            return False
        result = response.json()
        # Common metadata for all releases
        info = result.get("info")
        if info is None:
            logger.warning('The "info" field is missing the metadata JSON.')
            raise ValueError(f'Missing "info" field in {component_purl.name}')
        # Remove everything but the purl type and component name
        purl = PackageURL(type="pypi", name=component_purl.name)
        component, _ = Component.objects.get_or_create(
            component_purl=str(purl))
        num_imported = 0
        # Add or update the top-level component
        self.normalize_component(component, info)
        # Iterate through each release
        for version, _ in result.get("releases", {}).items():
            logger.debug("Importing: %s@%s", purl, version)
            # Load the endpoint for this specific version
            # This is needed because the versionless API endpoint gives back
            # an info dictionary that contains information for only the
            # latest version.
            endpoint = f"{self.PYPI_API_ENDPOINT}/{component_purl.name}/{version}/json"
            response = requests.get(endpoint)
            if response.status_code != 200:
                logger.warning("Error retrieving endpoint %s, status=%d",
                               endpoint, response.status_code)
                continue
            version_result = response.json()
            version_info = version_result.get("info")
            version_releases = version_result.get("releases", {}).get(version)
            try:
                if self.normalize_version(component, version_info,
                                          version_releases):
                    num_imported += 1
            except ValueError:
                logger.warning("Unable to normalize %s", purl, exc_info=True)
        return num_imported

    def normalize_component(self, component: Component, info: dict) -> bool:
        """Load a component from the 'info' dictionary in the PyPI API response."""
        if component is None:
            raise ValueError("Missing component.")
        if info is None:
            raise ValueError("Missing info.")
        component.name = info.get("name")
        # NOTE(review): naive local timestamp; the GitHub importer uses the
        # timezone-aware timezone.now() — confirm which the schema expects.
        component.updated_dt = datetime.datetime.now()
        component.update_metadata(MetadataType.SOURCE, "data-source",
                                  "pypi.org")
        component.save()
        return True

    def normalize_version(self, component: Component, info: dict,
                          releases: dict) -> bool:
        """
        Normalize a PyPI metadata blob to our schema and save it to the database.

        Params:
            component: The Component we're tying this all to.
            info: The version-specific "info" sub-tree from the PyPI registry.
            releases: The release sub-tree from the PyPI registry.
        """
        if component is None:
            raise ValueError("Missing component.")
        if info is None:
            raise ValueError("Missing info.")
        if releases is None:
            raise ValueError("Missing release.")
        version_str = info.get("version")
        version, created = ComponentVersion.objects.get_or_create(
            component=component,
            version=version_str)  # type: ComponentVersion, bool
        version.update_metadata(MetadataType.SOURCE, "data-source",
                                "pypi.org")
        if created:
            logger.debug("Adding PyPI: %s@%s", component.name, version_str)
        else:
            logger.debug("Reloading PyPI: %s@%s", component.name, version_str)
        # Prefer the long description; fall back to the summary, then empty.
        version.description = info.get("description") or info.get(
            "summary") or ""
        # Add author and maintainer information
        version.add_maintainer(info.get("author_email"), info.get("author"))
        version.add_maintainer(info.get("maintainer_email"),
                               info.get("maintainer"))
        # Add relevant URLs
        urls = []
        if url := check_url(get_complex(info, "home_page")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
        if url := check_url(get_complex(info, "project_urls.Homepage")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
        # NOTE(review): chunk is truncated here — the remainder of this
        # method (more URL types, etc.) continues past this extract.
# NOTE(review): Overlapping extraction artifact — duplicates the beginning of
# NPMImporter.normalize_version (which appears again, in context, later in
# the file) and is cut off mid-function. Code reproduced unchanged.
def normalize_version(self, component: Component, data: dict,
                      top_data: dict) -> bool:
    """
    Normalize a NPM metadata blob to our schema and save it to the database.

    Params:
        component: the Component we're tying this all to.
        data: the version data from the NPM registry.
        top_data: the full package document from the NPM registry.
    """
    if component is None:
        raise ValueError("Missing component.")
    if data is None:
        raise ValueError("Missing data.")
    if top_data is None:
        raise ValueError("Missing top_data.")
    version, created = ComponentVersion.objects.get_or_create(
        component=component,
        version=data.get("version"),
    )  # type: ComponentVersion, bool
    # Data Source
    # NOTE(review): this updates `component`, not `version` — the PyPI and
    # GitHub importers tag the version here; likely should be
    # version.update_metadata. Confirm before changing.
    component.update_metadata(MetadataType.SOURCE, "data-source",
                              "registry.npmjs.org")
    if created:
        logger.debug("Adding NPM: %s@%s", component.name,
                     data.get("version"))
    else:
        logger.debug("Reloading NPM: %s@%s", component.name,
                     data.get("version"))
    version.description = data.get("description", "")
    # Add author and maintainer information
    maintainers = get_complex(data, "contributors", []) + get_complex(
        data, "maintainers", [])
    # "author" may be a dict, a bare name string, or a list of either.
    authors = get_complex(data, "author")
    if isinstance(authors, dict):
        maintainers += [authors]
    elif isinstance(authors, str):
        maintainers += [{"name": authors}]
    elif isinstance(authors, list):
        maintainers += authors
    else:
        logger.warning("Invalid type for 'authors': %s, %s", type(authors),
                       authors)
    maintainers = [m for m in maintainers if m != []]
    for maintainer in maintainers:
        version.add_maintainer(maintainer.get("email"),
                               maintainer.get("name"))
    # Add relevant URLs
    urls = []
    if url := check_url(get_complex(data, "homepage")):
        urls.append(
            Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                      url=url)[0])
    # NOTE(review): fragment ends here — continues in the in-context copy.
# Add author and maintainer information version.add_maintainer(info.get("author_email"), info.get("author")) version.add_maintainer(info.get("maintainer_email"), info.get("maintainer")) # Add relevant URLs test urls = [] if url := check_url(get_complex(info, "home_page")): urls.append( Url.objects.get_or_create(url_type=UrlType.HOME_PAGE, url=url)[0]) if url := check_url(get_complex(info, "project_urls.Homepage")): urls.append( Url.objects.get_or_create(url_type=UrlType.HOME_PAGE, url=url)[0]) if url := check_url(get_complex(info, "project_urls.Download")): urls.append( Url.objects.get_or_create(url_type=UrlType.DOWNLOAD, url=url)[0]) if url := check_url(get_complex(info, "docs_url")): urls.append( Url.objects.get_or_create(url_type=UrlType.DOCUMENTATION, url=url)[0]) if url := check_url(get_complex(info, "project_urls.Documentation")): urls.append( Url.objects.get_or_create(url_type=UrlType.DOCUMENTATION, url=url)[0]) if url := check_url(get_complex(info, "bugtrack_url")): urls.append( Url.objects.get_or_create(url_type=UrlType.ISSUE_TRACKER, url=url)[0])
class NPMImporter(BaseImporter):
    """Imports an NPM package into OpenSSFMetric.
    """
    # Base URL of the NPM registry API.
    NPM_API_ENDPOINT = "https://registry.npmjs.org"

    def import_component(self, component_purl: PackageURL) -> bool:
        """Import all versions of a component specified by component_purl.

        Returns:
            The number of components imported, either as new or as updates to
            existing content in the database.

        Raises:
            HTTPError if we were unable to connect to the NPM endpoint.
            ValueError if any other error caused us to not be able to import
            anything.
        """
        # NOTE(review): annotated -> bool but returns an int count; consider
        # annotating -> int for consistency with the docstring.
        if not component_purl:
            raise ValueError("No component specified.")
        endpoint = f"{self.NPM_API_ENDPOINT}/{component_purl.name}"
        response = requests.get(endpoint)
        response.raise_for_status()
        result = response.json()
        # Remove everything but the purl type and component name
        purl = PackageURL(type="npm", name=component_purl.name)
        component, _ = Component.objects.get_or_create(
            component_purl=str(purl))
        num_imported = 0
        # Add or update the top-level component
        self.normalize_component(component, result)
        # Iterate through each release; unlike PyPI, the NPM document
        # already contains full per-version metadata, so no extra HTTP
        # calls are needed here.
        for version, version_info in result.get("versions", {}).items():
            logger.debug("Importing: %s@%s", purl, version)
            try:
                if self.normalize_version(component, version_info, result):
                    num_imported += 1
            except ValueError:
                logger.warning("Unable to normalize %s", purl, exc_info=True)
        return num_imported

    def normalize_component(self, component: Component, data: dict) -> bool:
        """Load a component from the result of the NPM API response."""
        if component is None:
            raise ValueError("Missing component.")
        if data is None:
            raise ValueError("Missing data.")
        component.name = data.get("name")
        # NOTE(review): naive local timestamp; the GitHub importer uses the
        # timezone-aware timezone.now() — confirm which the schema expects.
        component.updated_dt = datetime.datetime.now()
        component.update_metadata(MetadataType.SOURCE, "data-source",
                                  "registry.npmjs.org")
        component.save()
        return True

    def normalize_version(self, component: Component, data: dict,
                          top_data: dict) -> bool:
        """
        Normalize a NPM metadata blob to our schema and save it to the database.

        Params:
            component: the Component we're tying this all to.
            data: the version data from the NPM registry.
            top_data: the full package document from the NPM registry.
        """
        if component is None:
            raise ValueError("Missing component.")
        if data is None:
            raise ValueError("Missing data.")
        if top_data is None:
            raise ValueError("Missing top_data.")
        version, created = ComponentVersion.objects.get_or_create(
            component=component,
            version=data.get("version"),
        )  # type: ComponentVersion, bool
        # Data Source
        # NOTE(review): this updates `component`, not `version` — the PyPI
        # and GitHub importers tag the version here; likely should be
        # version.update_metadata. Confirm before changing.
        component.update_metadata(MetadataType.SOURCE, "data-source",
                                  "registry.npmjs.org")
        if created:
            logger.debug("Adding NPM: %s@%s", component.name,
                         data.get("version"))
        else:
            logger.debug("Reloading NPM: %s@%s", component.name,
                         data.get("version"))
        version.description = data.get("description", "")
        # Add author and maintainer information
        maintainers = get_complex(data, "contributors", []) + get_complex(
            data, "maintainers", [])
        # "author" may be a dict, a bare name string, or a list of either.
        authors = get_complex(data, "author")
        if isinstance(authors, dict):
            maintainers += [authors]
        elif isinstance(authors, str):
            maintainers += [{"name": authors}]
        elif isinstance(authors, list):
            maintainers += authors
        else:
            logger.warning("Invalid type for 'authors': %s, %s",
                           type(authors), authors)
        maintainers = [m for m in maintainers if m != []]
        for maintainer in maintainers:
            version.add_maintainer(maintainer.get("email"),
                                   maintainer.get("name"))
        # Add relevant URLs
        urls = []
        if url := check_url(get_complex(data, "homepage")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
        if url := check_url(get_complex(data, "bugs.url")):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.ISSUE_TRACKER,
                                          url=url)[0])
        # NOTE(review): chunk is truncated here — the remainder of this
        # method continues past this extract.
for maintainer in maintainers: version.add_maintainer(maintainer.get("email"), maintainer.get("name")) # Add relevant URLs test urls = [] if url := check_url(get_complex(data, "homepage")): urls.append( Url.objects.get_or_create(url_type=UrlType.HOME_PAGE, url=url)[0]) if url := check_url(get_complex(data, "bugs.url")): urls.append( Url.objects.get_or_create(url_type=UrlType.ISSUE_TRACKER, url=url)[0]) if url := check_url(get_complex(data, "repository.url")): urls.append( Url.objects.get_or_create(url_type=UrlType.SOURCE_REPO, url=url)[0]) version.urls.add(*urls) # Declared dependencies dependencies = [] if data.get("dependencies"): for dependency, _ in data.get("dependencies", {}).items(): dependencies.append(dependency) version.update_metadata(MetadataType.SOURCE, "dependencies", dependencies) dependencies = [] if data.get("devDependencies"):
class GitHubImporter(BaseImporter):
    """Imports a GitHub repository into OpenSSFMetric.
    """
    GITHUB_API_ENDPOINT = "https://api.github.com"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Fail fast: every import requires an authenticated GitHub client.
        # NOTE(review): a generic Exception is hard for callers to catch
        # narrowly; ValueError or a project error type would be clearer.
        if "GITHUB_TOKEN" not in self.config:
            raise Exception(
                "GITHUB_TOKEN not defined, unable to import without it.")

    def import_component(self, component_purl: PackageURL) -> bool:
        """Import all versions of a component specified by component_purl.

        Returns:
            The number of components imported, either as new or as updates to
            existing content in the database.

        Raises:
            HTTPError if we were unable to connect to the GitHub endpoint.
            ValueError if any other error caused us to not be able to import
            anything.
        """
        # NOTE(review): annotated -> bool but returns an int count; consider
        # annotating -> int for consistency with the docstring.
        if not component_purl:
            raise ValueError("No component specified.")
        github = Github(
            self.config.get("GITHUB_TOKEN"),
            # base_url=self.GITHUB_API_ENDPOINT,
            # per_page=100,
            # retry=3,
        )  # type: Github
        repo = github.get_repo(
            f"{component_purl.namespace}/{component_purl.name}")
        # Remove everything but the purl type and component name
        purl = PackageURL(type="github",
                          namespace=component_purl.namespace,
                          name=component_purl.name)
        component, _ = Component.objects.get_or_create(
            component_purl=str(purl))
        num_imported = 0
        # Add or update the top-level component
        self.normalize_component(component, repo)
        # Iterate through each release
        for release in repo.get_releases():  # type: GitRelease
            logger.debug("Importing: %s@%s", purl, release.tag_name)
            try:
                if self.normalize_release(component, release, repo):
                    num_imported += 1
            except ValueError:
                logger.warning("Unable to normalize %s", purl, exc_info=True)
        return num_imported

    def normalize_component(self, component: Component,
                            data: Repository) -> bool:
        """Load a component from the result of the GitHub API response."""
        if component is None:
            raise ValueError("Missing component.")
        if data is None:
            raise ValueError("Missing data.")
        component.name = data.full_name
        component.updated_dt = timezone.now()
        component.update_metadata(MetadataType.SOURCE,
                                  "data-source", "api.github.com")
        # @TODO There is a lot of metadata that we can add here.
        # Last Updated
        component.update_metadata(MetadataType.SOURCE, "is-fork", data.fork)
        component.update_metadata(MetadataType.SOURCE, "forks-count",
                                  data.forks_count)
        component.update_metadata(MetadataType.SOURCE, "push.latest",
                                  data.pushed_at.isoformat())
        component.update_metadata(MetadataType.SOURCE, "size", data.size)
        component.save()
        return True

    def normalize_release(self, component: Component, data: GitRelease,
                          top_data: Repository) -> bool:
        """
        Normalize GitHub data to our schema and save it to the database.

        Params:
            component: the Component we're tying this all to.
            data: the release data from the GitHub API.
            top_data: the Repository the release belongs to.
        """
        if component is None:
            raise ValueError("Missing component.")
        if data is None:
            raise ValueError("Missing data.")
        if top_data is None:
            raise ValueError("Missing top_data.")
        version, created = ComponentVersion.objects.get_or_create(
            component=component,
            version=data.tag_name,
        )  # type: ComponentVersion, bool
        # Data Source
        version.update_metadata(MetadataType.SOURCE, "data-source",
                                "api.github.com")
        if created:
            logger.debug("Adding GitHub: %s@%s", component.name,
                         data.tag_name)
        else:
            logger.debug("Reloading GitHub: %s@%s", component.name,
                         data.tag_name)
        version.description = data.body
        # Rebuild the maintainer set from scratch on every (re)load.
        version.maintainers.clear()
        author = data.author
        # Maintainers are keyed by their GitHub login in the metadata JSON.
        maintainer, _ = Maintainer.objects.get_or_create(
            metadata__SOURCE__contains={
                "scoped-username.github": author.login
            })  # type: Maintainer, bool
        maintainer.add_name(author.name)
        maintainer.add_email(author.email)
        maintainer.update_metadata(MetadataType.SOURCE, "twitter_username",
                                   author.twitter_username)
        maintainer.update_metadata(MetadataType.SOURCE, "avatar_url",
                                   author.avatar_url)
        maintainer.save()
        version.maintainers.add(maintainer)
        # Additional maintainers: anyone who committed in the last year.
        year_ago = timezone.now() - datetime.timedelta(days=365)
        year_ago = year_ago.replace(hour=0, minute=0, second=0,
                                    microsecond=0)  # cache-friendly
        commits = top_data.get_commits(since=year_ago)
        seen_commits = set()
        for commit in commits:
            # NOTE(review): commit.author may be None for commits with no
            # linked GitHub account — this would raise AttributeError.
            # TODO confirm and guard.
            if commit.author.login in seen_commits:
                continue
            seen_commits.add(commit.author.login)
            maintainer, _ = Maintainer.objects.get_or_create(
                metadata__SOURCE__contains={
                    "scoped-username.github": commit.author.login
                })
            maintainer.add_name(commit.author.name)
            maintainer.add_email(commit.author.email)
            maintainer.update_metadata(MetadataType.SOURCE, "avatar_url",
                                       commit.author.avatar_url)
            maintainer.save()
            version.maintainers.add(maintainer)
        # Add relevant URLs
        urls = []
        if url := check_url(top_data.homepage):
            urls.append(
                Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                          url=url)[0])
        if top_data.has_issues:
            if url := check_url(top_data.html_url + "/issues"):
                urls.append(
                    Url.objects.get_or_create(url_type=UrlType.ISSUE_TRACKER,
                                              url=url)[0])
        # NOTE(review): chunk is truncated here — the remainder of this
        # method continues past this extract.
# NOTE(review): Overlapping extraction artifact — duplicates
# GitHubImporter.normalize_release (which also appears in context above) and
# is cut off mid-function. Code reproduced unchanged, reformatted only.
def normalize_release(self, component: Component, data: GitRelease,
                      top_data: Repository) -> bool:
    """
    Normalize GitHub data to our schema and save it to the database.

    Params:
        component: the Component we're tying this all to.
        data: the release data from the GitHub API.
        top_data: the Repository the release belongs to.
    """
    if component is None:
        raise ValueError("Missing component.")
    if data is None:
        raise ValueError("Missing data.")
    if top_data is None:
        raise ValueError("Missing top_data.")
    version, created = ComponentVersion.objects.get_or_create(
        component=component,
        version=data.tag_name,
    )  # type: ComponentVersion, bool
    # Data Source
    version.update_metadata(MetadataType.SOURCE, "data-source",
                            "api.github.com")
    if created:
        logger.debug("Adding GitHub: %s@%s", component.name, data.tag_name)
    else:
        logger.debug("Reloading GitHub: %s@%s", component.name,
                     data.tag_name)
    version.description = data.body
    # Rebuild the maintainer set from scratch on every (re)load.
    version.maintainers.clear()
    author = data.author
    maintainer, _ = Maintainer.objects.get_or_create(
        metadata__SOURCE__contains={
            "scoped-username.github": author.login
        })  # type: Maintainer, bool
    maintainer.add_name(author.name)
    maintainer.add_email(author.email)
    maintainer.update_metadata(MetadataType.SOURCE, "twitter_username",
                               author.twitter_username)
    maintainer.update_metadata(MetadataType.SOURCE, "avatar_url",
                               author.avatar_url)
    maintainer.save()
    version.maintainers.add(maintainer)
    # Additional maintainers: anyone who committed in the last year.
    year_ago = timezone.now() - datetime.timedelta(days=365)
    year_ago = year_ago.replace(hour=0, minute=0, second=0,
                                microsecond=0)  # cache-friendly
    commits = top_data.get_commits(since=year_ago)
    seen_commits = set()
    for commit in commits:
        if commit.author.login in seen_commits:
            continue
        seen_commits.add(commit.author.login)
        maintainer, _ = Maintainer.objects.get_or_create(
            metadata__SOURCE__contains={
                "scoped-username.github": commit.author.login
            })
        maintainer.add_name(commit.author.name)
        maintainer.add_email(commit.author.email)
        maintainer.update_metadata(MetadataType.SOURCE, "avatar_url",
                                   commit.author.avatar_url)
        maintainer.save()
        version.maintainers.add(maintainer)
    # Add relevant URLs
    urls = []
    if url := check_url(top_data.homepage):
        urls.append(
            Url.objects.get_or_create(url_type=UrlType.HOME_PAGE,
                                      url=url)[0])
    # NOTE(review): fragment ends here — continues in the next fragment.
version.maintainers.add(maintainer) # Add relevant URLs test urls = [] if url := check_url(top_data.homepage): urls.append( Url.objects.get_or_create(url_type=UrlType.HOME_PAGE, url=url)[0]) if top_data.has_issues: if url := check_url(top_data.html_url + "/issues"): urls.append( Url.objects.get_or_create(url_type=UrlType.ISSUE_TRACKER, url=url)[0]) if url := check_url(top_data.clone_url): urls.append( Url.objects.get_or_create(url_type=UrlType.SOURCE_REPO, url=url)[0]) version.urls.add(*urls) # Deprecation notices version.update_metadata(MetadataType.SOURCE, "deprecation-notice", top_data.archived) # Release-specific data for asset in data.get_assets(): filename = os.path.basename( urlparse(asset.browser_download_url).path) artifact, created = Artifact.objects.get_or_create( component_version=version,