def _assert_runtime_handler_list_resources(
    runtime_kind, expected_crds=None, expected_pods=None, expected_services=None,
):
    """List resources through the kind's runtime handler and verify both the
    k8s API calls that were made and the contents of the response.

    Pod listing is always asserted; CRD and service listing are asserted only
    when the corresponding expectation is provided.
    """
    handler = get_runtime_handler(runtime_kind)
    listed_resources = handler.list_resources()
    group, version, plural = handler._get_crd_info()
    selector = handler._get_default_label_selector()
    get_k8s().v1api.list_namespaced_pod.assert_called_once_with(
        get_k8s().resolve_namespace(), label_selector=selector,
    )
    if expected_crds:
        get_k8s().crdapi.list_namespaced_custom_object.assert_called_once_with(
            group,
            version,
            get_k8s().resolve_namespace(),
            plural,
            label_selector=selector,
        )
    if expected_services:
        get_k8s().v1api.list_namespaced_service.assert_called_once_with(
            get_k8s().resolve_namespace(), label_selector=selector,
        )
    TestRuntimeHandlerBase._assert_list_resources_response(
        listed_resources,
        expected_crds=expected_crds,
        expected_pods=expected_pods,
        expected_services=expected_services,
    )
def custom_setup(self):
    """Prepare spark runtime fixtures: the handler, a CRD dict per phase, and
    the driver/executor pods (pre-built here to keep the tests themselves short).
    """
    self.runtime_handler = get_runtime_handler(RuntimeKinds.spark)
    for attr, crd_status in (
        ("running_crd_dict", self._get_running_crd_status()),
        ("completed_crd_dict", self._get_completed_crd_status()),
        ("failed_crd_dict", self._get_failed_crd_status()),
    ):
        setattr(
            self,
            attr,
            self._generate_sparkjob_crd(self.project, self.run_uid, crd_status),
        )
    # labels shared by the executor and driver pods
    common_labels = {
        "mlrun/class": "spark",
        "mlrun/function": "my-spark-jdbc",
        "mlrun/job": "my-spark-jdbc-2ea432f1",
        "mlrun/name": "my-spark-jdbc",
        "mlrun/project": self.project,
        "mlrun/uid": self.run_uid,
        "mlrun/scrape_metrics": "False",
        "mlrun/tag": "latest",
        "spark-app-selector": "spark-12f88a73cb544ce298deba34947226a4",
        "sparkoperator.k8s.io/app-name": "my-spark-jdbc-2ea432f1",
        "sparkoperator.k8s.io/launched-by-spark-operator": "true",
        "sparkoperator.k8s.io/submission-id": "44343f6b-42ca-41d4-b01a-66052cc5c919",
    }
    self.executor_pod = self._generate_pod(
        "my-spark-jdbc-2ea432f1-1597760338437-exec-1",
        {**common_labels, "spark-exec-id": "1", "spark-role": "executor"},
        PodPhases.running,
    )
    self.driver_pod = self._generate_pod(
        "my-spark-jdbc-2ea432f1-driver",
        {**common_labels, "spark-role": "driver"},
        PodPhases.running,
    )
    self.pod_label_selector = self._generate_get_logger_pods_label_selector(
        self.runtime_handler
    )
def list_runtimes(label_selector: str = None):
    """Return one {"kind", "resources"} entry per runtime kind that has a
    handler, listing each kind's resources filtered by the label selector."""
    return [
        {
            "kind": kind,
            "resources": get_runtime_handler(kind).list_resources(label_selector),
        }
        for kind in RuntimeKinds.runtime_with_handlers()
    ]
def custom_setup(self):
    """Prepare mpijob (v1 CRD) fixtures: the handler, a CRD dict per phase, and
    the launcher/worker pods (pre-built here to keep the tests themselves short).
    """
    config.mpijob_crd_version = MPIJobCRDVersions.v1
    self.runtime_handler = get_runtime_handler(RuntimeKinds.mpijob)
    self.runtime_handler.wait_for_deletion_interval = 0
    for attr, crd_status in (
        ("active_crd_dict", self._get_active_crd_status()),
        ("succeeded_crd_dict", self._get_succeeded_crd_status()),
        ("failed_crd_dict", self._get_failed_crd_status()),
    ):
        setattr(
            self,
            attr,
            self._generate_mpijob_crd(self.project, self.run_uid, crd_status),
        )
    self.no_status_crd_dict = self._generate_mpijob_crd(self.project, self.run_uid)
    # labels shared by the launcher and worker pods
    common_labels = {
        "group-name": "kubeflow.org",
        "mlrun/class": "mpijob",
        "mlrun/function": "trainer",
        "mlrun/job": "trainer-1b019005",
        "mlrun/name": "trainer",
        "mlrun/owner": "iguazio",
        "mlrun/project": self.project,
        "mlrun/scrape-metrics": "True",
        "mlrun/tag": "latest",
        "mlrun/uid": self.run_uid,
        "mpi-job-name": "trainer-1b019005",
    }
    self.launcher_pod = self._generate_pod(
        "trainer-1b019005-launcher",
        {**common_labels, "mpi-job-role": "launcher"},
        PodPhases.running,
    )
    self.worker_pod = self._generate_pod(
        "trainer-1b019005-worker-0",
        {**common_labels, "mpi-job-role": "worker"},
        PodPhases.running,
    )
    self.pod_label_selector = self._generate_get_logger_pods_label_selector(
        self.runtime_handler
    )
def delete_runtimes(
    label_selector: str = None,
    force: bool = False,
    db_session: Session = Depends(deps.get_db_session),
):
    """Delete the runtime resources of every runtime kind that has a handler.

    Returns an empty 204 response on success.
    """
    for runtime_kind in RuntimeKinds.runtime_with_handlers():
        handler = get_runtime_handler(runtime_kind)
        handler.delete_resources(get_db(), db_session, label_selector, force)
    return Response(status_code=status.HTTP_204_NO_CONTENT)
def _cleanup_runtimes():
    """Best-effort deletion of runtime resources for all handled kinds.

    A failure for one kind is logged and ignored so the remaining kinds are
    still cleaned up (consistent with the other periodic-task loops); the DB
    session is always closed.
    """
    logger.debug("Cleaning runtimes")
    db_session = create_session()
    try:
        for kind in RuntimeKinds.runtime_with_handlers():
            try:
                runtime_handler = get_runtime_handler(kind)
                runtime_handler.delete_resources(get_db(), db_session)
            except Exception as exc:
                # don't let one misbehaving handler abort cleanup of the rest
                logger.warning(
                    "Failed deleting resources. Ignoring", exc=str(exc), kind=kind
                )
    finally:
        close_session(db_session)
def delete_runtimes(
    label_selector: str = None,
    force: bool = False,
    grace_period: int = config.runtime_resources_deletion_grace_period,
    db_session: Session = Depends(deps.get_db_session),
):
    """Delete the runtime resources of every runtime kind that has a handler,
    honoring the deletion grace period. Returns an empty 204 response."""
    for runtime_kind in RuntimeKinds.runtime_with_handlers():
        handler = get_runtime_handler(runtime_kind)
        handler.delete_resources(
            get_db(), db_session, label_selector, force, grace_period
        )
    return Response(status_code=HTTPStatus.NO_CONTENT.value)
def get_runtime(kind: str, label_selector: str = None):
    """Return the resources of a single runtime kind.

    Raises a 400 error for kinds that have no runtime handler.
    """
    if kind not in RuntimeKinds.runtime_with_handlers():
        log_and_raise(
            HTTPStatus.BAD_REQUEST.value, kind=kind, err="Invalid runtime kind"
        )
    resources = get_runtime_handler(kind).list_resources(label_selector)
    return {"kind": kind, "resources": resources}
def get_runtime(kind: str, label_selector: str = None):
    """Return the resources of a single runtime kind.

    Raises a 400 error for kinds that have no runtime handler.
    """
    if kind not in RuntimeKinds.runtime_with_handlers():
        log_and_raise(
            status.HTTP_400_BAD_REQUEST, kind=kind, err='Invalid runtime kind'
        )
    resources = get_runtime_handler(kind).list_resources(label_selector)
    return {'kind': kind, 'resources': resources}
def delete_runtime(
    kind: str,
    label_selector: str = None,
    force: bool = False,
    db_session: Session = Depends(deps.get_db_session),
):
    """Delete the resources of a single runtime kind.

    Raises a 400 error for unknown kinds; returns an empty 204 response.
    """
    if kind not in RuntimeKinds.runtime_with_handlers():
        log_and_raise(
            status.HTTP_400_BAD_REQUEST, kind=kind, err='Invalid runtime kind'
        )
    handler = get_runtime_handler(kind)
    handler.delete_resources(get_db(), db_session, label_selector, force)
    return Response(status_code=status.HTTP_204_NO_CONTENT)
def _cleanup_runtimes():
    """Best-effort deletion of runtime resources for all handled kinds.

    A failure for one kind is logged and ignored so the remaining kinds are
    still cleaned up; the DB session is always closed.
    """
    db_session = create_session()
    try:
        for runtime_kind in RuntimeKinds.runtime_with_handlers():
            try:
                get_runtime_handler(runtime_kind).delete_resources(
                    get_db(), db_session
                )
            except Exception as exc:
                logger.warning(
                    "Failed deleting resources. Ignoring",
                    exc=str(exc),
                    kind=runtime_kind,
                )
    finally:
        close_session(db_session)
def _monitor_runs():
    """Best-effort run monitoring for all handled runtime kinds.

    A failure for one kind is logged and ignored so the remaining kinds are
    still monitored; the DB session is always closed.
    """
    db_session = create_session()
    try:
        for runtime_kind in RuntimeKinds.runtime_with_handlers():
            try:
                get_runtime_handler(runtime_kind).monitor_runs(
                    get_db(), db_session
                )
            except Exception as exc:
                logger.warning(
                    "Failed monitoring runs. Ignoring",
                    exc=str(exc),
                    kind=runtime_kind,
                )
    finally:
        close_session(db_session)
def _assert_runtime_handler_list_resources(
    self,
    runtime_kind,
    expected_crds=None,
    expected_pods=None,
    expected_services=None,
    group_by: Optional[mlrun.api.schemas.ListRuntimeResourcesGroupByField] = None,
):
    """List resources through the kind's runtime handler (optionally grouped)
    and verify both the k8s API calls that were made and the response contents.

    Raises NotImplementedError for group-by values other than None / job.
    """
    handler = get_runtime_handler(runtime_kind)
    default_selector = handler._get_default_label_selector()
    if group_by is None:
        project = "*"
        label_selector = default_selector
        assertion_func = TestRuntimeHandlerBase._assert_list_resources_response
    elif group_by == mlrun.api.schemas.ListRuntimeResourcesGroupByField.job:
        project = self.project
        # grouped listing is scoped to the test's project
        label_selector = f"{default_selector},mlrun/project={self.project}"
        assertion_func = (
            TestRuntimeHandlerBase._assert_list_resources_grouped_response
        )
    else:
        raise NotImplementedError("Unsupported group by value")
    resources = handler.list_resources(project, group_by=group_by)
    group, version, plural = handler._get_crd_info()
    get_k8s().v1api.list_namespaced_pod.assert_called_once_with(
        get_k8s().resolve_namespace(), label_selector=label_selector,
    )
    if expected_crds:
        get_k8s().crdapi.list_namespaced_custom_object.assert_called_once_with(
            group,
            version,
            get_k8s().resolve_namespace(),
            plural,
            label_selector=label_selector,
        )
    if expected_services:
        get_k8s().v1api.list_namespaced_service.assert_called_once_with(
            get_k8s().resolve_namespace(), label_selector=label_selector,
        )
    assertion_func(
        resources,
        expected_crds=expected_crds,
        expected_pods=expected_pods,
        expected_services=expected_services,
    )
def delete_runtime(
    kind: str,
    label_selector: str = None,
    force: bool = False,
    grace_period: int = config.runtime_resources_deletion_grace_period,
    db_session: Session = Depends(deps.get_db_session),
):
    """Delete the resources of a single runtime kind, honoring the deletion
    grace period.

    Raises a 400 error for unknown kinds; returns an empty 204 response.
    """
    if kind not in RuntimeKinds.runtime_with_handlers():
        log_and_raise(
            HTTPStatus.BAD_REQUEST.value, kind=kind, err="Invalid runtime kind"
        )
    handler = get_runtime_handler(kind)
    handler.delete_resources(
        get_db(), db_session, label_selector, force, grace_period
    )
    return Response(status_code=HTTPStatus.NO_CONTENT.value)
def custom_setup(self):
    """Prepare mpijob (v1 CRD) fixtures: the handler and a CRD dict per phase
    (pre-built here to keep the tests themselves short); pod listing is mocked
    to return nothing."""
    config.mpijob_crd_version = MPIJobCRDVersions.v1
    self.runtime_handler = get_runtime_handler(RuntimeKinds.mpijob)
    for attr, crd_status in (
        ("active_crd_dict", self._get_active_crd_status()),
        ("succeeded_crd_dict", self._get_succeeded_crd_status()),
        ("failed_crd_dict", self._get_failed_crd_status()),
    ):
        setattr(
            self,
            attr,
            self._generate_mpijob_crd(self.project, self.run_uid, crd_status),
        )
    # there's currently a bug (fix was merged but not released
    # https://github.com/kubeflow/mpi-operator/pull/271) that causes mpijob's
    # pods to not be labeled with the given (MLRun's) labels - this prevents
    # list resources from finding the pods, so we're simulating the same here
    self._mock_list_namespaced_pods([[]])
def custom_setup(self):
    """Prepare job runtime fixtures: the handler plus one pod per relevant
    phase (pre-built here to keep the tests themselves short)."""
    self.runtime_handler = get_runtime_handler(RuntimeKinds.job)
    labels = {
        "mlrun/class": self._get_class_name(),
        "mlrun/function": "my-trainer",
        "mlrun/name": "my-training",
        "mlrun/project": self.project,
        "mlrun/scrape_metrics": "False",
        "mlrun/tag": "latest",
        "mlrun/uid": self.run_uid,
    }
    pod_name = "my-training-j7dtf"
    for attr, phase in (
        ("pending_pod", PodPhases.pending),
        ("running_pod", PodPhases.running),
        ("completed_pod", PodPhases.succeeded),
        ("failed_pod", PodPhases.failed),
    ):
        setattr(self, attr, self._generate_pod(pod_name, labels, phase))
def custom_setup(self):
    """Prepare dask runtime fixtures: the handler, scheduler/worker pods in
    both running and completed phases, and the cluster service (pre-built here
    to keep the tests themselves short)."""
    self.runtime_handler = get_runtime_handler(RuntimeKinds.dask)
    # labels shared by the scheduler pod, worker pod and cluster service
    common_labels = {
        "app": "dask",
        "dask.org/cluster-name": "mlrun-mydask-d7656bc1-0",
        "mlrun/class": "dask",
        "mlrun/function": "mydask",
        "mlrun/project": "default",
        "mlrun/scrape_metrics": "False",
        "mlrun/tag": "latest",
        "user": "******",
    }
    scheduler_labels = {**common_labels, "dask.org/component": "scheduler"}
    worker_labels = {**common_labels, "dask.org/component": "worker"}
    scheduler_pod_name = "mlrun-mydask-d7656bc1-0n4z9z"
    worker_pod_name = "mlrun-mydask-d7656bc1-0pqbnc"
    self.running_scheduler_pod = self._generate_pod(
        scheduler_pod_name, scheduler_labels, PodPhases.running
    )
    self.completed_scheduler_pod = self._generate_pod(
        scheduler_pod_name, scheduler_labels, PodPhases.succeeded
    )
    self.running_worker_pod = self._generate_pod(
        worker_pod_name, worker_labels, PodPhases.running
    )
    self.completed_worker_pod = self._generate_pod(
        worker_pod_name, worker_labels, PodPhases.succeeded
    )
    # the service carries the scheduler's label set
    self.cluster_service = self._generate_service(
        "mlrun-mydask-d7656bc1-0", dict(scheduler_labels)
    )