def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]:
    """Yield a ``ListedOrigin`` for each git repository in ``page``.

    Entries whose URL is not https, and consecutive duplicates (launchpadlib
    lists the last modified repository twice), are skipped.
    """
    assert self.lister_obj.id is not None

    previous_url = None
    for repository in page:
        url = repository.git_https_url

        # filter out origins with invalid URL or origin previously listed
        # (last modified repository will be listed twice by launchpadlib)
        if not url.startswith("https://") or url == previous_url:
            continue

        modified = repository.date_last_modified
        self.date_last_modified = modified

        logger.debug("Found origin %s last updated on %s", url, modified)
        previous_url = url

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type="git",
            url=url,
            last_update=modified,
        )
def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
    """Yield one "tar" ``ListedOrigin`` per distinct origin URL in ``page``."""
    assert self.lister_obj.id is not None

    already_listed = set()
    for package_info in page:
        origin_url, artifact_url = compute_origin_urls(package_info)

        # prevent multiple listing of an origin,
        # most recent version will be listed first
        if origin_url in already_listed:
            continue
        already_listed.add(origin_url)

        artifact = {"url": artifact_url, "version": package_info["Version"]}

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=origin_url,
            visit_type="tar",
            last_update=parse_packaged_date(package_info),
            extra_loader_arguments={"artifacts": [artifact]},
        )
def deposit_listed_origin(deposit_lister):
    """Build a sample "deposit" ``ListedOrigin`` tied to ``deposit_lister``."""
    origin_kwargs = {
        "lister_id": deposit_lister.id,
        "url": "https://example.org/project",
        "visit_type": "deposit",
        "extra_loader_arguments": {"deposit_id": "some-d-id"},
    }
    return ListedOrigin(**origin_kwargs)
def get_origins_from_page(
    self, page: List[Dict[str, Any]]) -> Iterator[ListedOrigin]:
    """Convert a page of Npm repositories into a list of ListedOrigin."""
    assert self.lister_obj.id is not None

    for package in page:
        doc = package["doc"]

        # no source code to archive here
        if not doc.get("versions", {}):
            continue

        name = doc["name"]
        latest_version = doc.get("dist-tags", {}).get("latest", "")

        timestamps = doc.get("time", {})
        last_update = None
        if latest_version in timestamps:
            last_update = iso8601.parse_date(timestamps[latest_version])

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=self.PACKAGE_URL_TEMPLATE.format(package_name=name),
            visit_type="npm",
            last_update=last_update,
        )
def get_origins_from_page(
    self, page: List[Dict[str, Any]]) -> Iterator[ListedOrigin]:
    """Convert a page of GitHub repositories into a list of ListedOrigins.

    This records the html_url, as well as the pushed_at value if it exists.
    """
    assert self.lister_obj.id is not None

    already_seen: Set[str] = set()

    for repository in page:
        # null repositories in listings happen sometimes...
        if not repository:
            continue

        html_url = repository["html_url"]
        if html_url in already_seen:
            continue
        already_seen.add(html_url)

        raw_pushed_at = repository.get("pushed_at")
        pushed_at: Optional[datetime.datetime] = (
            iso8601.parse_date(raw_pushed_at) if raw_pushed_at else None
        )

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=html_url,
            visit_type="git",
            last_update=pushed_at,
        )
def generate_listed_origin(
    lister_id: uuid.UUID, now: Optional[datetime] = None
) -> ListedOrigin:
    """Returns a globally unique new origin.

    Seed the `last_update` value according to the OriginModel and the passed
    timestamp.

    Arguments:
      lister_id: id of the lister that generated this origin
      now: time of listing, to emulate last_update (defaults to
        :func:`datetime.now`)
    """
    global _nb_generated_origins
    _nb_generated_origins += 1
    # The URL pattern below only has room for six digits; guard against
    # runaway generation.
    assert _nb_generated_origins < 10**6, "Too many origins!"

    if now is None:
        now = datetime.now(tz=timezone.utc)

    url = f"https://example.com/{_nb_generated_origins:06d}.git"
    visit_type = "test-git"
    origin = OriginModel(visit_type, url)

    return ListedOrigin(
        lister_id=lister_id,
        url=url,
        visit_type=visit_type,
        last_update=origin.get_last_update(now),
    )
def debian_listed_origin(debian_lister):
    """Build a sample "debian" ``ListedOrigin`` tied to ``debian_lister``."""
    origin_kwargs = {
        "lister_id": debian_lister.id,
        "url": "https://debian.example.org/package",
        "visit_type": "debian",
        "extra_loader_arguments": {"packages": {}},
    }
    return ListedOrigin(**origin_kwargs)
def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
    """Yield a ``ListedOrigin`` per repository, skipping those without a URL."""
    assert self.lister_obj.id is not None

    for repository in page:
        repo_url = get_repo_url(repository["attachments"]["uris"]["uris"])

        if repo_url is None:
            # pick the most readable identifier available for the warning
            display_name: Optional[str] = None
            for key in ("shortName", "name", "callsign"):
                display_name = repository["fields"].get(key)
                if display_name:
                    break
            logger.warning(
                "No valid url for repository [%s] (phid=%s)",
                display_name or repository["phid"],
                repository["phid"],
            )
            continue

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=repo_url,
            visit_type=repository["fields"]["vcs"],
            # The "dateUpdated" field returned by the Phabricator API only refers to
            # the repository metadata; We can't use it for our purposes.
            last_update=None,
        )
def get_origins_from_page(
    self, page: CratesListerPage) -> Iterator[ListedOrigin]:
    """Iterate on all crate pages and yield ListedOrigin instances."""
    assert self.lister_obj.id is not None

    url = self.CRATE_API_URL_PATTERN.format(crate=page[0]["name"])
    last_update = page[0]["last_update"]
    artifacts = []

    for version in page:
        # last path segment of the crate file URL, e.g. "<name>-<version>.crate"
        filename = urlparse(version["crate_file"]).path.split("/")[-1]
        # Build an artifact entry following original-artifacts-json specification
        # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json  # noqa: B950
        artifact = {
            # fix: the computed filename was previously unused and a literal
            # placeholder string was stored instead
            "filename": filename,
            "checksums": {
                "sha256": f"{version['checksum']}",
            },
            "url": version["crate_file"],
            "version": version["version"],
        }
        artifacts.append(artifact)

    yield ListedOrigin(
        lister_id=self.lister_obj.id,
        visit_type=self.VISIT_TYPE,
        url=url,
        last_update=last_update,
        extra_loader_arguments={
            "artifacts": artifacts,
        },
    )
def opam_listed_origin(opam_lister):
    """Build a sample "opam" ``ListedOrigin`` from ``OPAM_LOADER_ARGS``."""
    loader_args = dict(OPAM_LOADER_ARGS)
    # the "url" entry becomes the origin URL; everything else is loader config
    url = loader_args.pop("url")
    return ListedOrigin(
        lister_id=opam_lister.id,
        url=url,
        visit_type="opam",
        extra_loader_arguments=loader_args,
    )
def maven_listed_origin(maven_lister):
    """Build a sample "maven" ``ListedOrigin`` from ``MVN_ARTIFACTS``."""
    origin_kwargs = {
        "lister_id": maven_lister.id,
        "url": MVN_ARTIFACTS[0]["url"],
        "visit_type": "maven",
        "extra_loader_arguments": {"artifacts": MVN_ARTIFACTS},
    }
    return ListedOrigin(**origin_kwargs)
def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
    """Convert a page of OpamLister repositories into a list of ListedOrigins"""
    assert self.lister_obj.id is not None
    # a page is just a package name
    url = "opam+{}/packages/{}/".format(self.url, page)
    yield ListedOrigin(
        lister_id=self.lister_obj.id,
        visit_type="opam",
        url=url,
        last_update=None,
    )
def archive_listed_origin(archive_lister):
    """Build a sample "tar" ``ListedOrigin`` tied to ``archive_lister``."""
    origin_kwargs = {
        "lister_id": archive_lister.id,
        "url": "https://example.org/archives",
        "visit_type": "tar",
        "extra_loader_arguments": {
            "artifacts": [],
            "snapshot_append": True,
        },
    }
    return ListedOrigin(**origin_kwargs)
def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
    """Convert a page of Tuleap repositories into a list of ListedOrigins."""
    assert self.lister_obj.id is not None
    modified = iso8601.parse_date(page["last_update_date"])
    yield ListedOrigin(
        lister_id=self.lister_obj.id,
        url=page["uri"],
        visit_type=page["type"],
        last_update=modified,
    )
def get_origins_from_page(
    self, page: SourceForgeListerPage) -> Iterator[ListedOrigin]:
    """Turn a page of SourceForge hits into (disabled) ``ListedOrigin``s."""
    assert self.lister_obj.id is not None
    for project in page:
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=project.vcs.value,
            url=project.url,
            last_update=iso8601.parse_date(project.last_modified),
            enabled=False,
        )
def get_origins_from_page(
    self, page_result: PageResult) -> Iterator[ListedOrigin]:
    """Yield a "git" ``ListedOrigin`` for every repository of the page."""
    assert self.lister_obj.id is not None
    for repository in page_result.repositories or []:
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=repository["http_url_to_repo"],
            visit_type="git",
            last_update=iso8601.parse_date(repository["last_activity_at"]),
        )
def get_origins_from_page(
    self, packages: PackageListPage) -> Iterator[ListedOrigin]:
    """Convert a page of PyPI repositories into a list of ListedOrigins."""
    assert self.lister_obj.id is not None
    for package_url, modified in packages:
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=package_url,
            visit_type="pypi",
            last_update=modified,
        )
def get_origins_from_page(self, page: RepoListPage) -> Iterator[ListedOrigin]:
    """Convert a page of Gitea repositories into a list of ListedOrigins."""
    assert self.lister_obj.id is not None
    for repository in page:
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=repository["clone_url"],
            visit_type="git",
            last_update=iso8601.parse_date(repository["updated_at"]),
        )
def get_origins_from_page(
    self, packages_name: PackageListPage) -> Iterator[ListedOrigin]:
    """Convert a page of PyPI repositories into a list of ListedOrigins."""
    assert self.lister_obj.id is not None
    for name in packages_name:
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=self.PACKAGE_URL.format(package_name=name),
            visit_type="pypi",
            last_update=None,  # available on PyPI JSON API
        )
def get_origins_from_page(self, page: ArchListerPage) -> Iterator[ListedOrigin]:
    """Iterate on all arch pages and yield ListedOrigin instances."""
    assert self.lister_obj.id is not None
    for package in page:
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=package["url"],
            last_update=package["last_modified"],
            extra_loader_arguments={"artifacts": package["versions"]},
        )
def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
    """Retrieve scm origin out of the page information. Only called when type of the
    page is scm.

    Try and detect an scm/vcs repository. Note that official format is in the form:
    scm:{type}:git://example.org/{user}/{repo}.git but some projects directly put
    the repo url (without the "scm:type"), so we have to check against the content
    to extract the type and url properly.

    Raises
        AssertionError when the type of the page is not 'scm'

    Returns
        ListedOrigin with proper canonical scm url (for github) if any is found,
        None otherwise.

    """
    assert page["type"] == "scm"
    visit_type: Optional[str] = None
    url: Optional[str] = None
    m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
    if m_scm is None:
        return None

    scm_type = m_scm.group("type")
    if scm_type and scm_type in SUPPORTED_SCM_TYPES:
        url = m_scm.group("url")
        visit_type = scm_type
    elif page["url"].endswith(".git"):
        # Fix: str.lstrip("scm:") strips any leading run of the characters
        # "s", "c", "m", ":" (a character set, not a prefix), which could
        # eat the beginning of the actual URL; strip the exact prefix instead.
        raw_url = page["url"]
        url = raw_url[len("scm:"):] if raw_url.startswith("scm:") else raw_url
        visit_type = "git"
    else:
        return None

    if url and visit_type == "git":
        # Non-github urls will be returned as is, github ones will be canonical ones
        url = self.github_session.get_canonical_url(url)

    if not url:
        return None

    assert visit_type is not None
    assert self.lister_obj.id is not None
    return ListedOrigin(
        lister_id=self.lister_obj.id,
        url=url,
        visit_type=visit_type,
    )
def get_origins_from_page(
    self, page: List[Dict[str, Any]]) -> Iterator[ListedOrigin]:
    """Convert a page of Bitbucket repositories into a list of ListedOrigins."""
    assert self.lister_obj.id is not None
    for repository in page:
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=repository["links"]["clone"][0]["href"],
            visit_type=repository["scm"],
            last_update=iso8601.parse_date(repository["updated_on"]),
        )
def get_origins_from_page(
    self, repositories: Repositories) -> Iterator[ListedOrigin]:
    """Convert a page of cgit repositories into a list of ListedOrigins."""
    assert self.lister_obj.id is not None
    for repository in repositories:
        # prefer the advertised git URL; fall back to scraping the repo page
        url = repository["git_url"] or self._get_origin_from_repository_url(
            repository["url"]
        )
        if url is None:
            continue
        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=url,
            visit_type="git",
            last_update=_parse_last_updated_date(repository),
        )
def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
    """Convert a page of OpamLister repositories into a list of ListedOrigins"""
    assert self.lister_obj.id is not None
    # a page is just a package name
    origin_url = f"opam+{self.url}/packages/{page}/"
    loader_arguments = {
        "opam_root": self.opam_root,
        "opam_instance": self.instance,
        "opam_url": self.url,
        "opam_package": page,
    }
    yield ListedOrigin(
        lister_id=self.lister_obj.id,
        visit_type="opam",
        url=origin_url,
        last_update=None,
        extra_loader_arguments=loader_arguments,
    )
def get_origins_from_page(
    self, page: NewForgeListerPage) -> Iterator[ListedOrigin]:
    """Convert a page of NewForgeLister repositories into a list of ListedOrigins

    NOTE(review): this looks like tutorial/template code — the ``...``
    (Ellipsis) placeholders below must be replaced with real values before
    this lister can be used.
    """
    assert self.lister_obj.id is not None

    for element in page:
        yield ListedOrigin(
            # Required. Should use this value.
            lister_id=self.lister_obj.id,
            # Required. Visit type of the currently processed origin
            visit_type=self.VISIT_TYPE,
            # Required. URL corresponding to the origin for loaders to ingest
            url=...,
            # Should get it if the service provides it and if it induces no
            # substantial additional processing cost
            last_update=...,
        )
def listed_origins_by_type(
    stored_lister: Lister, visit_types: List[str]
) -> Dict[str, List[ListedOrigin]]:
    """A fixed list of `ListedOrigin`s, for each `visit_type`."""
    count_per_type = 1000
    assert stored_lister.id

    result: Dict[str, List[ListedOrigin]] = {}
    for type_index, visit_type in enumerate(visit_types):
        # microsecond field encodes a unique, ordered value per origin
        result[visit_type] = [
            ListedOrigin(
                lister_id=stored_lister.id,
                url=f"https://{visit_type}.example.com/{i:04d}",
                visit_type=visit_type,
                last_update=datetime(
                    2020, 6, 15, 16, 0, 0,
                    type_index * count_per_type + i,
                    tzinfo=timezone.utc,
                ),
            )
            for i in range(count_per_type)
        ]
    return result
def test_journal_client_origin_visit_status_after_grab_next_visits(
        swh_scheduler, stored_lister):
    """Ensure OriginVisitStat entries created in the db as a result of calling
    grab_next_visits() do not mess the OriginVisitStats upsert mechanism.
    """
    # One listed origin per distinct (url, visit type) pair found in the
    # VISIT_STATUSES_2 fixture.
    listed_origins = [
        ListedOrigin(lister_id=stored_lister.id, url=url, visit_type=visit_type)
        for (url, visit_type) in set(
            (v["origin"], v["type"]) for v in VISIT_STATUSES_2)
    ]
    swh_scheduler.record_listed_origins(listed_origins)

    # Grab only "git" visits; record the time window in which last_scheduled
    # must fall.
    before = utcnow()
    swh_scheduler.grab_next_visits(visit_type="git", count=10,
                                   policy="oldest_scheduled_first")
    after = utcnow()

    # grab_next_visits only created stats for "git" visits, not "hg" ones.
    assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == []
    assert swh_scheduler.origin_visit_stats_get([("cavabarder", "git")
                                                 ])[0] is not None

    # Replay the journal visit statuses; this must upsert, not clobber, the
    # stats rows created by grab_next_visits above.
    process_journal_objects({"origin_visit_status": VISIT_STATUSES_2},
                            scheduler=swh_scheduler)

    for url in ("cavabarder", "iciaussi"):
        ovs = swh_scheduler.origin_visit_stats_get([(url, "git")])[0]
        assert before <= ovs.last_scheduled <= after

        ovs = swh_scheduler.origin_visit_stats_get([(url, "hg")])[0]
        assert ovs.last_scheduled is None

    # Final stats for ("cavabarder", "git") reflect the last (successful)
    # status from the fixture.
    ovs = swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0]
    assert ovs.last_successful == DATE1 + 5 * ONE_DAY
    assert ovs.last_visit == DATE1 + 5 * ONE_DAY
    assert ovs.last_visit_status == LastVisitStatus.successful
    assert ovs.last_snapshot == hash_to_bytes(
        "5555555555555555555555555555555555555555")
def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]:
    """
    Iterate on all GNU projects and yield ListedOrigin instances.
    """
    assert self.lister_obj.id is not None
    assert self.gnu_tree is not None

    all_artifacts = self.gnu_tree.artifacts

    for name, info in page.items():
        url = info["url"]
        modified = iso8601.parse_date(info["time_modified"])

        logger.debug("Found origin %s last updated on %s", url, modified)

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=url,
            visit_type="tar",
            last_update=modified,
            extra_loader_arguments={"artifacts": all_artifacts[name]},
        )
def get_origins_from_page(
    self, page: LaunchpadPageType) -> Iterator[ListedOrigin]:
    """
    Iterate on all git repositories and yield ListedOrigin instances.
    """
    assert self.lister_obj.id is not None

    vcs_type, repositories = page

    try:
        for repository in repositories:
            url = origin(vcs_type, repository)

            # filter out origins with invalid URL
            if not url.startswith("https://"):
                continue

            modified = repository.date_last_modified
            self.date_last_modified[vcs_type] = modified

            logger.debug(
                "Found origin %s with type %s last updated on %s",
                url,
                vcs_type,
                modified,
            )

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=vcs_type,
                url=url,
                last_update=modified,
            )
    except RestfulError as e:
        # iteration over launchpadlib results can fail mid-listing; log and
        # stop listing this vcs type instead of crashing
        logger.warning("Listing %s origins raised %s", vcs_type, e)
def svn_listed_origin(svn_lister):
    """Build a minimal "svn" ``ListedOrigin`` tied to ``svn_lister``."""
    origin_kwargs = {
        "lister_id": svn_lister.id,
        "url": "svn://example.org/repo",
        "visit_type": "svn",
    }
    return ListedOrigin(**origin_kwargs)