def get_latest_run_id(experiment_id):
    """
    Get the latest run id for an experiment.

    Parameters
    ----------
    experiment_id : str

    Returns
    -------
    str or None
        The latest run id, or None when the experiment does not exist in KFP.
    """
    try:
        kfp_experiment = kfp_client().get_experiment(experiment_name=experiment_id)
    except ValueError:
        return None

    # lists runs for trainings and deployments of an experiment
    kfp_runs = kfp_client().list_runs(
        page_size="1",
        sort_by="created_at desc",
        experiment_id=kfp_experiment.id,
    )

    # find the latest training run
    latest_run_id = None
    for kfp_run in kfp_runs.runs:
        latest_run_id = kfp_run.id
        break

    return latest_run_id
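
# A minimal usage sketch (not part of the original source): shows the None
# fallback a caller should handle when the experiment has no KFP counterpart.
# The experiment id below is a hypothetical placeholder.
def _example_latest_run(experiment_id="experiment-uuid"):
    run_id = get_latest_run_id(experiment_id)
    if run_id is None:
        print(f"No KFP experiment found for {experiment_id}")
    else:
        print(f"Latest run: {run_id}")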
def retry_run(run_id, experiment_id):
    """
    Retry a run in Kubeflow Pipelines.

    Parameters
    ----------
    run_id : str
    experiment_id : str

    Returns
    -------
    dict
        Retry response confirmation.

    Raises
    ------
    ApiException
    BadRequest
    """
    if run_id == "latest":
        run_id = get_latest_run_id(experiment_id)

    kfp_run = kfp_client().get_run(
        run_id=run_id,
    )

    if kfp_run.run.status == "Failed":
        kfp_client().runs.retry_run(run_id=kfp_run.run.id)
    else:
        raise BadRequest("Not a failed run")

    return {"message": "Run re-initiated successfully"}
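
# Usage sketch (hypothetical ids, not from the original source): retry_run
# raises BadRequest unless the targeted run actually failed, so callers
# should guard for that case.
def _example_retry_latest(experiment_id="experiment-uuid"):
    try:
        response = retry_run(run_id="latest", experiment_id=experiment_id)
        print(response["message"])
    except BadRequest:
        print("Latest run is not in a failed state; nothing to retry.")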
def deploy_monitoring(deployment_id, experiment_id, run_id, task_id, monitoring_id):
    """
    Deploy a service and trigger for monitoring.

    Parameters
    ----------
    deployment_id : str
    experiment_id : str
    run_id : str
    task_id : str
    monitoring_id : str
    """
    @dsl.pipeline(name="Monitoring")
    def monitoring():
        service_name = f"service-{monitoring_id}"
        service = MONITORING_SERVICE.substitute({
            "name": service_name,
            "namespace": KF_PIPELINES_NAMESPACE,
            "experimentId": experiment_id,
            "deploymentId": deployment_id,
            "runId": run_id,
            "configMap": f"configmap-{task_id}",
        })
        service_resource = loads(service)
        monitoring_service = dsl.ResourceOp(
            name=service_name,
            k8s_resource=service_resource,
            success_condition="status.conditions.1.status == True",
        )

        trigger_name = f"trigger-{monitoring_id}"
        trigger = MONITORING_TRIGGER.substitute({
            "name": trigger_name,
            "namespace": KF_PIPELINES_NAMESPACE,
            "deploymentId": deployment_id,
            "service": service_name,
        })
        trigger_resource = loads(trigger)
        dsl.ResourceOp(
            name="monitoring_trigger",
            k8s_resource=trigger_resource,
            success_condition="status.conditions.2.status == True",
        ).after(monitoring_service)

    kfp_client().create_run_from_pipeline_func(
        monitoring,
        {},
        run_name="monitoring",
        namespace=KF_PIPELINES_NAMESPACE,
    )
def start_run(operators, project_id, experiment_id, deployment_id=None, deployment_name=None):
    """
    Start a new run in Kubeflow Pipelines.

    Parameters
    ----------
    operators : list
    project_id : str
    experiment_id : str
    deployment_id : str or None
    deployment_name : str or None

    Returns
    -------
    dict
        The run attributes.
    """
    if len(operators) == 0:
        raise ValueError("At least one operator is required.")

    if deployment_id is None:
        name = f"experiment-{experiment_id}"
    else:
        name = f"deployment-{deployment_id}"

    if not deployment_name:
        deployment_name = deployment_id

    compile_pipeline(name=name,
                     operators=operators,
                     project_id=project_id,
                     experiment_id=experiment_id,
                     deployment_id=deployment_id,
                     deployment_name=deployment_name)

    if deployment_id is not None:
        kfp_experiment = kfp_client().create_experiment(name=deployment_id)
    else:
        kfp_experiment = kfp_client().create_experiment(name=experiment_id)

    tag = datetime.utcnow().strftime("%Y-%m-%d %H-%M-%S")
    job_name = f"{name}-{tag}"
    pipeline_package_path = f"{name}.yaml"

    run = kfp_client().run_pipeline(
        experiment_id=kfp_experiment.id,
        job_name=job_name,
        pipeline_package_path=pipeline_package_path,
    )

    os.remove(pipeline_package_path)

    return get_run(run.id, experiment_id)
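
# Usage sketch (not from the original source): starts an experiment run, i.e.
# no deployment id. "operators" must be a non-empty list of operator models;
# the ids below are hypothetical placeholders.
def _example_start_experiment_run(operators):
    run = start_run(
        operators=operators,
        project_id="project-uuid",
        experiment_id="experiment-uuid",
    )
    # start_run returns the get_run dict for the new run
    print(run["uuid"], run["createdAt"])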
def undeploy_pipeline(resource):
    """
    Undeploy a deployment pipeline.

    Parameters
    ----------
    resource : dict
        A k8s resource which will be submitted to the cluster.
    """
    @dsl.pipeline(name="Undeploy")
    def undeploy():
        dsl.ResourceOp(name="undeploy", k8s_resource=resource, action="delete")

    kfp_client().create_run_from_pipeline_func(
        undeploy,
        {},
        run_name="undeploy",
        namespace=KF_PIPELINES_NAMESPACE,
    )
def list_deployments_runs():
    """
    Retrieve runs associated with a deployment.

    Returns
    -------
    list
        List of deployment runs.
    """
    token = ""
    runs = []

    while True:
        list_runs = kfp_client().list_runs(
            page_token=token, sort_by="created_at desc", page_size=100)

        if list_runs.runs:
            runs_details = get_deployment_details(list_runs.runs)
            runs.extend(runs_details)

            token = list_runs.next_page_token
            if token is None:
                break
        else:
            break

    return runs
def terminate_run(self, deployment_id):
    """
    Terminates a run in Kubeflow Pipelines.

    Parameters
    ----------
    deployment_id : str

    Returns
    -------
    projects.schemas.message.Message

    Raises
    ------
    NotFound
        When deployment run does not exist.
    """
    load_kube_config()
    api = client.CustomObjectsApi()
    custom_objects = api.list_namespaced_custom_object(
        "machinelearning.seldon.io",
        "v1",
        KF_PIPELINES_NAMESPACE,
        "seldondeployments",
    )
    deployments_objects = custom_objects["items"]

    if deployments_objects:
        for deployment in deployments_objects:
            if deployment["metadata"]["name"] == deployment_id:
                undeploy_pipeline(
                    name=deployment["metadata"]["name"],
                    kind=deployment["kind"],
                    namespace=deployment["metadata"]["namespace"],
                )

    deployment_run = get_deployment_runs(deployment_id)
    if not deployment_run:
        raise NotFound(code="RunNotFound", message="The specified run does not exist.")

    kfp_client().runs.delete_run(deployment_run["runId"])

    return schemas.Message(message="Deployment deleted")
def terminate_run(run_id, experiment_id):
    """
    Terminates a run in Kubeflow Pipelines.

    Parameters
    ----------
    run_id : str
    experiment_id : str

    Returns
    -------
    dict
        Deleted response confirmation.

    Raises
    ------
    ApiException
    """
    if run_id == "latest":
        run_id = get_latest_run_id(experiment_id)

    kfp_client().runs.terminate_run(run_id=run_id)
    return {"message": "Run terminated."}
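
# Usage sketch (hypothetical id, not from the original source): "latest" is
# resolved through get_latest_run_id before the terminate call is issued.
def _example_terminate_latest(experiment_id="experiment-uuid"):
    response = terminate_run(run_id="latest", experiment_id=experiment_id)
    print(response["message"])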
def undeploy_pipeline(name, kind, namespace):
    """
    Undeploy a deployment pipeline.

    Parameters
    ----------
    name : str
    kind : str
    namespace : str
    """
    @dsl.pipeline(name="Undeploy")
    def undeploy():
        kubernetes_resource_delete_op(
            name=name,
            kind=kind,
            namespace=namespace,
        )

    kfp_client().create_run_from_pipeline_func(
        undeploy,
        {},
        run_name="undeploy",
    )
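
# Usage sketch (not from the original source): mirrors how terminate_run
# invokes this function for a SeldonDeployment resource. The deployment id
# is a hypothetical placeholder.
def _example_undeploy():
    undeploy_pipeline(
        name="deployment-uuid",
        kind="SeldonDeployment",
        namespace=KF_PIPELINES_NAMESPACE,
    )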
def list_runs(experiment_id):
    """
    Lists all runs of an experiment.

    Parameters
    ----------
    experiment_id : str

    Returns
    -------
    list
        A list of all runs.
    """
    # In order to list runs, we need to find the KFP experiment id.
    # The KFP experiment id is different from PlatIAgro's experiment_id,
    # so calling kfp_client().get_experiment(experiment_name='..') is required first.
    try:
        kfp_experiment = kfp_client().get_experiment(experiment_name=experiment_id)
    except ValueError:
        return []

    # Now, list the runs
    kfp_runs = kfp_client().list_runs(
        page_size="10",
        sort_by="created_at desc",
        experiment_id=kfp_experiment.id,
    )

    runs = []
    for kfp_run in kfp_runs.runs:
        run_id = kfp_run.id
        run = get_run(experiment_id=experiment_id, run_id=run_id)
        runs.append(run)

    return runs
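
# Usage sketch (hypothetical id, not from the original source): each entry is
# the dict produced by get_run, so run uuids and timestamps are available
# directly.
def _example_list_runs(experiment_id="experiment-uuid"):
    for run in list_runs(experiment_id):
        print(run["uuid"], run["createdAt"])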
def get_container_status(experiment_id, operator_id):
    """
    Get operator container status.

    Parameters
    ----------
    experiment_id : str
    operator_id : str

    Returns
    -------
    str
        The container status.
    """
    # always get the status from the latest run
    run_id = get_latest_run_id(experiment_id)

    try:
        kfp_run = kfp_client().get_run(run_id=run_id)

        found_operator = False
        workflow_manifest = json.loads(kfp_run.pipeline_runtime.workflow_manifest)
        workflow_status = workflow_manifest["status"].get("phase")

        if workflow_status in {"Succeeded", "Failed"}:
            status = "Unset"
        else:
            status = "Pending"

        for node in workflow_manifest["status"].get("nodes", {}).values():
            if node["displayName"] == operator_id:
                found_operator = True
                if "message" in node and str(node["message"]) == "terminated":
                    status = "Terminated"
                else:
                    status = str(node["phase"])
                break

        if found_operator and workflow_status == "Failed" and status == "Pending":
            status = "Failed"

        return status
    except ApiValueError:
        return ""
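
# Usage sketch (hypothetical ids, not from the original source): an empty
# string signals the run lookup itself failed (ApiValueError), as opposed to
# a real phase such as "Pending" or "Terminated".
def _example_container_status():
    status = get_container_status("experiment-uuid", "operator-uuid")
    print(status or "status unavailable")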
def get_logs(self, project_id: str, experiment_id: str, run_id: str, operator_id: str):
    """
    Get logs from an experiment run.

    Parameters
    ----------
    project_id : str
    experiment_id : str
    run_id : str
        The run_id. If `run_id=latest`, then returns logs from the latest run_id.
    operator_id : str

    Returns
    -------
    dict
        A dict of logs from a run.
    """
    logs = get_notebook_logs(experiment_id=experiment_id,
                             operator_id=operator_id)

    if not logs:
        # No notebooks or logs were found in the Jupyter API.
        # Search for logs in the operator pod details.
        if run_id == "latest":
            run_id = get_latest_run_id(experiment_id)

        run_details = kfp_client().get_run(run_id=run_id)
        details = loads(run_details.pipeline_runtime.workflow_manifest)
        operator = search_for_pod_info(details, operator_id)

        if operator and operator["status"] == "Failed":
            logs = {
                "exception": operator["message"],
                "traceback": [f"Kernel has died: {operator['message']}"],
            }
        else:
            logs = {"message": "Notebook finished with status completed."}

    return logs
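
# Usage sketch (not from the original source): get_logs is an instance
# method, so "controller" stands in for whatever object owns it; all ids
# below are hypothetical placeholders.
def _example_fetch_logs(controller):
    logs = controller.get_logs(
        project_id="project-uuid",
        experiment_id="experiment-uuid",
        run_id="latest",
        operator_id="operator-uuid",
    )
    print(logs.get("exception", logs.get("message")))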
def make_task_deletion_job(
    task: models.Task,
    all_tasks: List[models.Task],
    namespace: str,
):
    """
    Runs a Kubeflow Pipeline that deletes the volume of a task and removes it from JupyterLab.

    Parameters
    ----------
    task : models.Task
    all_tasks : List[models.Task]
    namespace : str

    Returns
    -------
    RunPipelineResult
    """
    @dsl.pipeline(
        name="Delete Task",
        description="A pipeline that deletes K8s resources associated with a given task.",
    )
    def pipeline_func():
        # Patches JupyterLab to remove volumeMount
        resource_op = patch_notebook_volume_mounts_op(
            tasks=all_tasks, namespace=namespace
        )

        delete_volume_op(name=f"task-{task.uuid}", namespace=namespace).after(
            resource_op
        )

    run_name = f"Delete Task - {task.name}"

    return kfp_client().create_run_from_pipeline_func(
        pipeline_func=pipeline_func,
        arguments={},
        run_name=run_name,
        experiment_name=task.uuid,
    )
def setUp(self):
    self.maxDiff = None

    with open("tests/resources/mocked_deployment.yaml", "r") as file:
        content = file.read()
    content = content.replace("$deploymentId", DEPLOYMENT_ID)
    with open("tests/resources/mocked.yaml", "w") as file:
        file.write(content)

    kfp_experiment = kfp_client().create_experiment(name=DEPLOYMENT_ID)
    kfp_client().run_pipeline(
        experiment_id=kfp_experiment.id,
        job_name=DEPLOYMENT_ID,
        pipeline_package_path="tests/resources/mocked.yaml",
    )

    conn = engine.connect()

    text = (
        "INSERT INTO tasks (uuid, name, description, image, commands, arguments, tags, parameters, experiment_notebook_path, deployment_notebook_path, is_default, created_at, updated_at) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    conn.execute(text, (TASK_ID, 'name', 'desc', IMAGE, None, None, TAGS_JSON, dumps([]), EX_NOTEBOOK_PATH, DEPLOY_NOTEBOOK_PATH, 0, CREATED_AT, UPDATED_AT,))
    conn.execute(text, (TASK_ID_2, 'name', 'desc', IMAGE, None, None, TAGS_JSON, dumps([]), EX_NOTEBOOK_PATH, None, 0, CREATED_AT, UPDATED_AT,))

    text = ("INSERT INTO projects (uuid, name, created_at, updated_at) "
            "VALUES (%s, %s, %s, %s)")
    conn.execute(text, (PROJECT_ID, NAME, CREATED_AT, UPDATED_AT,))

    text = (
        "INSERT INTO experiments (uuid, name, project_id, position, is_active, created_at, updated_at) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)")
    conn.execute(text, (EXPERIMENT_ID, NAME, PROJECT_ID, POSITION, 1, CREATED_AT, UPDATED_AT,))

    text = (
        "INSERT INTO deployments (uuid, name, project_id, experiment_id, position, is_active, created_at, updated_at) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
    conn.execute(text, (DEPLOYMENT_ID, NAME, PROJECT_ID, EXPERIMENT_ID, POSITION, 1, CREATED_AT, UPDATED_AT,))
    conn.execute(text, (DEPLOYMENT_ID_2, NAME, PROJECT_ID, EXPERIMENT_ID, POSITION, 1, CREATED_AT, UPDATED_AT,))

    text = (
        "INSERT INTO operators (uuid, experiment_id, deployment_id, task_id, parameters, dependencies, created_at, updated_at) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
    conn.execute(text, (OPERATOR_ID, EXPERIMENT_ID, DEPLOYMENT_ID, TASK_ID_2, PARAMETERS_JSON, DEP_EMPTY_JSON, CREATED_AT, UPDATED_AT,))
    conn.execute(text, (OPERATOR_ID_2, EXPERIMENT_ID, DEPLOYMENT_ID, TASK_ID, PARAMETERS_JSON, dumps([OPERATOR_ID]), CREATED_AT, UPDATED_AT,))

    conn.close()
def make_task_creation_job(
    task: models.Task,
    all_tasks: List[models.Task],
    namespace: str,
    copy_from: Optional[models.Task] = None,
):
    """
    Runs a Kubeflow Pipeline that creates all resources necessary for a new task.

    Creates a persistent volume, copies files into the volume, and patches the
    notebook server (updates volumes and volumeMounts).

    Parameters
    ----------
    task : models.Task
    all_tasks : List[models.Task]
    namespace : str
    copy_from : models.Task, optional

    Returns
    -------
    RunPipelineResult
    """
    @dsl.pipeline(
        name="Create Task",
        description="A pipeline that creates all resources necessary for a new task.",
    )
    def pipeline_func():
        # Creates a volume for this task
        volume_op_task = create_volume_op(name=f"task-{task.uuid}", namespace=namespace)

        # And a ContainerOp to initialize task contents.
        # Either the contents of another task are copied,
        # or some empty notebooks are copied into the volume
        container_op = create_init_task_container_op(copy_from=copy_from)
        container_op.add_pvolumes(
            {DESTINATION_TASK_VOLUME_MOUNT_PATH: volume_op_task.volume}
        )
        container_op.set_timeout(DEFAULT_TIMEOUT_IN_SECONDS)

        if copy_from:
            # If task is a copy, also adds a volume mount to the source task volume
            volume_op_task = create_volume_op(
                name=f"task-{copy_from.uuid}", namespace=namespace
            )
            container_op.add_pvolumes(
                {SOURCE_TASK_VOLUME_MOUNT_PATH: volume_op_task.volume}
            )

        if task.category == "MONITORING":
            # If it is a "MONITORING" task, creates a configmap using the contents of this task.
            # Knative serving (monitoring service) does not support stateful resources like
            # persistent volumes, so we have to use a configmap to create a volumeMount into
            # the monitoring service.
            # TODO add real task content
            create_configmap_op(task=task, namespace=namespace, content="")

        # Patches JupyterLab to mount new task volume
        patch_notebook_volume_mounts_op(tasks=all_tasks, namespace=namespace).after(
            container_op
        )

    run_name = f"Create Task - {task.name}"

    return kfp_client().create_run_from_pipeline_func(
        pipeline_func=pipeline_func,
        arguments={},
        run_name=run_name,
        experiment_name=task.uuid,
        namespace=namespace,
    )
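
# Usage sketch (not from the original source): creates resources for a
# brand-new task; pass copy_from to clone another task's volume contents
# instead. "task" and "all_tasks" are models.Task objects supplied by the
# caller.
def _example_create_task_job(task, all_tasks):
    result = make_task_creation_job(
        task=task,
        all_tasks=all_tasks,
        namespace=KF_PIPELINES_NAMESPACE,
    )
    return result  # a kfp RunPipelineResult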
def setUp(self):
    self.maxDiff = None

    # Run experiment to succeed
    experiment = kfp_client().create_experiment(name=EXPERIMENT_ID)
    self.run = kfp_client().run_pipeline(
        experiment.id, OPERATOR_ID_2, "tests/resources/mocked_operator_succeed.yaml")

    session = requests.Session()
    session.cookies.update(COOKIES)
    session.headers.update(HEADERS)
    session.hooks = {
        "response": lambda r, *args, **kwargs: r.raise_for_status(),
    }

    conn = engine.connect()

    text = (
        "INSERT INTO projects (uuid, name, description, created_at, updated_at) "
        "VALUES (%s, %s, %s, %s, %s)")
    conn.execute(text, (PROJECT_ID, NAME, DESCRIPTION, CREATED_AT, UPDATED_AT,))

    text = (
        "INSERT INTO experiments (uuid, name, project_id, position, is_active, created_at, updated_at) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)")
    conn.execute(text, (EXPERIMENT_ID, EXPERIMENT_NAME, PROJECT_ID, 0, 1, CREATED_AT, UPDATED_AT,))

    text = (
        "INSERT INTO tasks (uuid, name, description, image, commands, arguments, tags, parameters, experiment_notebook_path, deployment_notebook_path, is_default, created_at, updated_at) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    conn.execute(text, (TASK_ID, NAME, DESCRIPTION, IMAGE, COMMANDS_JSON, ARGUMENTS_JSON, TAGS_JSON, dumps([]), EXPERIMENT_NOTEBOOK_PATH, EXPERIMENT_NOTEBOOK_PATH, 0, CREATED_AT, UPDATED_AT,))

    text = (
        "INSERT INTO operators (uuid, experiment_id, task_id, parameters, position_x, position_y, dependencies, created_at, updated_at) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    conn.execute(text, (OPERATOR_ID, EXPERIMENT_ID, TASK_ID, PARAMETERS_JSON, POSITION_X, POSITION_Y, DEPENDENCIES_OP_ID_JSON, CREATED_AT, UPDATED_AT,))
    conn.execute(text, (OPERATOR_ID_2, EXPERIMENT_ID, TASK_ID, PARAMETERS_JSON, POSITION_X, POSITION_Y, DEPENDENCIES_OP_ID_JSON, CREATED_AT, UPDATED_AT,))

    conn.close()

    # Create the directory and notebook layout expected by the Jupyter API
    session.put(
        url=f"{JUPYTER_ENDPOINT}/api/contents/experiments",
        data=dumps({"type": "directory", "content": None}),
    )
    session.put(
        url=f"{JUPYTER_ENDPOINT}/api/contents/experiments/{EXPERIMENT_ID}",
        data=dumps({"type": "directory", "content": None}),
    )
    session.put(
        url=f"{JUPYTER_ENDPOINT}/api/contents/experiments/{EXPERIMENT_ID}/operators",
        data=dumps({"type": "directory", "content": None}),
    )
    session.put(
        url=f"{JUPYTER_ENDPOINT}/api/contents/experiments/{EXPERIMENT_ID}/operators/{OPERATOR_ID}",
        data=dumps({"type": "directory", "content": None}),
    )
    session.put(
        url=f"{JUPYTER_ENDPOINT}/api/contents/experiments/{EXPERIMENT_ID}/operators/{OPERATOR_ID}/Experiment.ipynb",
        data=dumps({"type": "notebook", "content": loads(SAMPLE_FAILED_NOTEBOOK)}),
    )
    session.put(
        url=f"{JUPYTER_ENDPOINT}/api/contents/experiments/{EXPERIMENT_ID}/operators/{OPERATOR_ID_2}",
        data=dumps({"type": "directory", "content": None}),
    )
    session.put(
        url=f"{JUPYTER_ENDPOINT}/api/contents/experiments/{EXPERIMENT_ID}/operators/{OPERATOR_ID_2}/Experiment.ipynb",
        data=dumps({"type": "notebook", "content": loads(SAMPLE_COMPLETED_NOTEBOOK)}),
    )
def get_run(run_id, experiment_id):
    """
    Details a run in Kubeflow Pipelines.

    Parameters
    ----------
    run_id : str
    experiment_id : str

    Returns
    -------
    dict
        The run attributes.

    Raises
    ------
    ApiException
    ValueError
    """
    if run_id == "latest":
        run_id = get_latest_run_id(experiment_id)

    kfp_run = kfp_client().get_run(
        run_id=run_id,
    )

    workflow_manifest = json.loads(kfp_run.pipeline_runtime.workflow_manifest)

    workflow_status = workflow_manifest["status"].get("phase")
    if workflow_status in {"Succeeded", "Failed"}:
        default_node_status = "Unset"
    else:
        default_node_status = "Pending"

    # initializes all operators with the default status and parameters={}
    template = next(t for t in workflow_manifest["spec"]["templates"] if "dag" in t)
    tasks = (tsk for tsk in template["dag"]["tasks"] if not tsk["name"].startswith("vol-"))
    operators = dict((t["name"], {"status": default_node_status, "parameters": {}}) for t in tasks)

    # set status for each operator
    for node in workflow_manifest["status"].get("nodes", {}).values():
        if node["displayName"] in operators:
            operator_id = node["displayName"]
            operators[operator_id]["status"] = get_status(node)

    # sets taskId and parameters for each operator
    for template in workflow_manifest["spec"]["templates"]:
        operator_id = template["name"]
        if "inputs" in template and "parameters" in template["inputs"]:
            operators[operator_id]["taskId"] = get_task_id(template)
        if "container" in template and "env" in template["container"]:
            operators[operator_id]["parameters"] = get_parameters(template)

    return {
        "uuid": kfp_run.run.id,
        "operators": operators,
        "createdAt": kfp_run.run.created_at,
    }
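
# Usage sketch (hypothetical id, not from the original source): inspects the
# per-operator statuses that get_run extracts from the workflow manifest.
def _example_inspect_run(experiment_id="experiment-uuid"):
    run = get_run(run_id="latest", experiment_id=experiment_id)
    for operator_id, info in run["operators"].items():
        print(operator_id, info["status"])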