# NOTE: standard-library and kubernetes-client imports used by the functions
# below are consolidated here. Project-level helpers and constants
# (load_kube_config, KF_PIPELINES_NAMESPACE, NOTEBOOK_POD_NAME,
# NOTEBOOK_NAMESPACE, NOTEBOOK_CONTAINER_NAME, JUPYTER_WORKSPACE,
# EXCLUDE_CONTAINERS, undeploy_pipeline, handle_container_stream, etc.)
# come from the surrounding project and are assumed to be in scope.
import asyncio
import concurrent.futures
import http
import logging
import os
import tarfile
import time
import warnings
from ast import literal_eval
from concurrent.futures import CancelledError
from tempfile import TemporaryFile
from typing import Optional

from kubernetes import client, watch
from kubernetes.client.rest import ApiException
from kubernetes.stream import stream
from kubernetes.watch import Watch


def list_resource_version(group, version, namespace, plural):
    """
    Determines the resource version the watcher should list from.

    Parameters
    ----------
    group : str
    version : str
    namespace : str
    plural : str

    Returns
    -------
    str
    """
    load_kube_config()
    api = client.CustomObjectsApi()
    r = api.list_namespaced_custom_object(
        group=group,
        version=version,
        namespace=namespace,
        plural=plural,
    )
    return r["metadata"]["resourceVersion"]


def list_workflow_pods(run_id: str):
    """
    Lists pods from a workflow. Returns only pods that ran a platiagro task.

    Parameters
    ----------
    run_id : str

    Returns
    -------
    list
        A list of pods that ran a platiagro task.
    """
    workflows = list_workflows(run_id)
    if len(workflows) == 0:
        return []

    workflow_name = workflows[0]["metadata"]["name"]

    load_kube_config()
    core_api = client.CoreV1Api()
    pod_list = core_api.list_namespaced_pod(
        namespace=KF_PIPELINES_NAMESPACE,
        label_selector=f"workflows.argoproj.io/workflow={workflow_name}",
    ).items

    # Filters by pods that have an annotation "name=...".
    # Only pods that ran a platiagro task have this annotation.
    pod_list = [pod for pod in pod_list if "name" in pod.metadata.annotations]

    return pod_list


def watch_operator(self, deployment_id: Optional[str] = None, experiment_id: Optional[str] = None):
    """
    Streams Kubernetes events for the workflow of the latest run of an
    experiment (or deployment). Yields a message while no run exists yet.

    Parameters
    ----------
    deployment_id : str, optional
    experiment_id : str, optional

    Yields
    ------
    str
    """
    GROUP = "argoproj.io"
    VERSION = "v1alpha1"
    PLURAL = "workflows"

    load_kube_config()
    api = client.CustomObjectsApi()

    while True:
        run_id = get_latest_run_id(experiment_id or deployment_id)
        if not run_id:
            yield "operator not running"
            time.sleep(5)
        else:
            resource_version = list_resource_version(
                group=GROUP,
                version=VERSION,
                namespace=KF_PIPELINES_NAMESPACE,
                plural=PLURAL,
            )
            w = Watch()
            # Renamed from `stream` to avoid shadowing kubernetes.stream.stream
            event_stream = w.stream(
                api.list_namespaced_custom_object,
                group=GROUP,
                version=VERSION,
                namespace=KF_PIPELINES_NAMESPACE,
                plural=PLURAL,
                resource_version=resource_version,
                label_selector=f"pipeline/runid={run_id}",
                pretty="true",
            )
            for streamline in event_stream:
                yield f"Event: {streamline['type']} {streamline['object']['metadata']['name']}"


def list_deployment_pods(deployment_id):
    """
    Lists pods under a deployment.

    Parameters
    ----------
    deployment_id : str

    Returns
    -------
    list
        A list of the deployment's pods.

    Notes
    -----
    Equivalent to
    `kubectl -n KF_PIPELINES_NAMESPACE get pods -l seldon-deployment-id=deployment_id`.
    """
    load_kube_config()
    core_api = client.CoreV1Api()
    pod_list = core_api.list_namespaced_pod(
        namespace=KF_PIPELINES_NAMESPACE,
        label_selector=f"seldon-deployment-id={deployment_id}",
    ).items
    return pod_list


def get_notebook_state():
    """
    Gets the notebook server state.

    Returns
    -------
    bool
        True if the notebook server pod and all of its containers are running.

    Raises
    ------
    InternalServerError
        When the Kubernetes API raises an ApiException.
    """
    load_kube_config()
    v1 = client.CoreV1Api()

    try:
        pod = v1.read_namespaced_pod(
            name=NOTEBOOK_POD_NAME,
            namespace=NOTEBOOK_NAMESPACE,
            _request_timeout=5,
        )
        if pod.status.phase == "Running" \
                and all([c.state.running for c in pod.status.container_statuses]):
            return True
        return False
    except ApiException as e:
        body = literal_eval(e.body)
        message = body["message"]
        raise InternalServerError(
            f"Error while trying to get notebook server state: {message}")


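# Hedged usage sketch: get_notebook_state only reports a point-in-time status,
# so a caller that needs the server to be up typically polls it. The helper
# name, retry count, and interval below are illustrative assumptions, not part
# of the project.
def wait_for_notebook_server(retries=30, interval=2):
    """
    Polls get_notebook_state until the notebook server is running or the
    retries are exhausted. Returns True when the server became ready.
    """
    for _ in range(retries):
        if get_notebook_state():
            return True
        time.sleep(interval)
    return False

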
def run(log_level):
    """
    Watches kubernetes events and saves relevant data.
    """
    load_kube_config()
    api = client.CustomObjectsApi()

    # This env variable is responsible for stopping all running
    # watchers when an exception is caught on any thread.
    os.environ["STOP_THREADS"] = "0"

    logging.basicConfig(level=log_level)

    watchers = [watch_workflows, watch_seldon_deployments]

    # We decided to use a ThreadPoolExecutor to concurrently run our watchers.
    # This is necessary because we couldn't easily catch watcher exceptions
    # with the Python threading library. With this change, we are able to catch
    # exceptions in any of the watchers and immediately terminate all threads.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # `session` is assumed to come from the enclosing module scope
        # (e.g. a database session shared by the watchers).
        futures = [
            executor.submit(watcher, api, session)
            for watcher in watchers
        ]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                os.environ["STOP_THREADS"] = "1"
                logging.error(f"Exception caught: {e}")
                executor.shutdown()


def get_protocol():
    """
    Gets the protocol used by the cluster.

    Returns
    -------
    str
        The protocol.
    """
    load_kube_config()
    v1 = client.CustomObjectsApi()
    gateway = v1.get_namespaced_custom_object(
        group="networking.istio.io",
        version="v1alpha3",
        namespace="kubeflow",
        plural="gateways",
        name="kubeflow-gateway",
    )

    if "tls" in gateway["spec"]["servers"][0]:
        protocol = "https"
    else:
        protocol = "http"

    return protocol


def create_monitoring_task_config_map(task_id, experiment_notebook_content):
    """
    Creates a ConfigMap with the notebook of the given task.

    Parameters
    ----------
    task_id : str
    experiment_notebook_content : str
    """
    config_map_name = f"configmap-{task_id}"

    load_kube_config()
    v1 = client.CoreV1Api()

    body = {
        "metadata": {
            "name": config_map_name,
        },
        "data": {"Experiment.ipynb": experiment_notebook_content},
    }

    v1.create_namespaced_config_map(
        namespace=KF_PIPELINES_NAMESPACE,
        body=body,
    )

    warnings.warn(f"ConfigMap of task {task_id} created!")


def list_project_seldon_deployments(project_id):
    """
    Lists deployments under a project.

    Parameters
    ----------
    project_id : str

    Returns
    -------
    list
        A list of the project's SeldonDeployments.
    """
    load_kube_config()
    custom_api = client.CustomObjectsApi()
    deployments = custom_api.list_namespaced_custom_object(
        group="machinelearning.seldon.io",
        version="v1",
        namespace=KF_PIPELINES_NAMESPACE,
        plural="seldondeployments",
        label_selector=f"projectId={project_id}",
    )["items"]
    return deployments


def copy_files_in_pod(source_path, destination_path):
    """
    Copies files inside a pod.

    Parameters
    ----------
    source_path : str
    destination_path : str
    """
    warnings.warn(f"Copying '{source_path}' to '{destination_path}'...")
    load_kube_config()
    api_instance = client.CoreV1Api()

    # The following command copies source_path recursively,
    # preserving attributes
    exec_command = ["cp", "-a", source_path, destination_path]

    container_stream = stream(
        api_instance.connect_get_namespaced_pod_exec,
        name=NOTEBOOK_POD_NAME,
        namespace=NOTEBOOK_NAMESPACE,
        command=exec_command,
        container=NOTEBOOK_CONTAINER_NAME,
        stderr=True,
        stdin=False,
        stdout=True,
        tty=False,
        _preload_content=False,
    )

    handle_container_stream(container_stream=container_stream)

    warnings.warn(f"Copied '{source_path}' to '{destination_path}'!")


def list_workflows(run_id):
    """
    Lists workflows given a run_id.

    Parameters
    ----------
    run_id : str

    Returns
    -------
    list
        A list of workflows.

    Notes
    -----
    Equivalent to
    `kubectl -n KF_PIPELINES_NAMESPACE get workflow -l pipeline/runid=run_id`.
    """
    load_kube_config()
    custom_api = client.CustomObjectsApi()
    workflows = custom_api.list_namespaced_custom_object(
        group="argoproj.io",
        version="v1alpha1",
        namespace=KF_PIPELINES_NAMESPACE,
        plural="workflows",
        label_selector=f"pipeline/runid={run_id}",
    )["items"]
    return workflows


def undeploy_monitoring(monitoring_id):
    """
    Undeploys the service and trigger of a given monitoring_id.

    Parameters
    ----------
    monitoring_id : str
    """
    load_kube_config()
    api = client.CustomObjectsApi()

    # Undeploy service
    service_name = f"service-{monitoring_id}"
    service_custom_object = api.get_namespaced_custom_object(
        group="serving.knative.dev",
        version="v1alpha1",
        namespace=KF_PIPELINES_NAMESPACE,
        plural="services",
        name=service_name,
    )
    undeploy_pipeline(service_custom_object)

    # Undeploy trigger
    trigger_name = f"trigger-{monitoring_id}"
    trigger_custom_object = api.get_namespaced_custom_object(
        group="eventing.knative.dev",
        version="v1alpha1",
        namespace=KF_PIPELINES_NAMESPACE,
        plural="triggers",
        name=trigger_name,
    )
    undeploy_pipeline(trigger_custom_object)


async def set_notebook_metadata(notebook_path, task_id, experiment_id, operator_id):
    """
    Sets metadata values in a notebook file.

    Parameters
    ----------
    notebook_path : str
    task_id : str
    experiment_id : str
    operator_id : str
    """
    if notebook_path is None:
        return

    warnings.warn(f"Setting metadata in {notebook_path}...")
    load_kube_config()
    api_instance = client.CoreV1Api()

    # The following script sets task_id, experiment_id and operator_id
    # in the metadata of a notebook
    python_script = (
        f"import json; "
        f"f = open('{JUPYTER_WORKSPACE}/{notebook_path}'); "
        f"n = json.load(f); "
        f"n['metadata']['task_id'] = '{task_id}'; "
        f"n['metadata']['experiment_id'] = '{experiment_id}'; "
        f"n['metadata']['operator_id'] = '{operator_id}'; "
        f"f.close(); "
        f"f = open('{JUPYTER_WORKSPACE}/{notebook_path}', 'w'); "
        f"json.dump(n, f); "
        f"f.close()"
    )

    exec_command = [
        "python",
        "-c",
        python_script,
    ]

    container_stream = stream(
        api_instance.connect_get_namespaced_pod_exec,
        name=NOTEBOOK_POD_NAME,
        namespace=NOTEBOOK_NAMESPACE,
        command=exec_command,
        container=NOTEBOOK_CONTAINER_NAME,
        stderr=True,
        stdin=False,
        stdout=True,
        tty=False,
        _preload_content=False,
    )

    while container_stream.is_open():
        container_stream.update(timeout=10)
        if container_stream.peek_stdout():
            warnings.warn("STDOUT: %s" % container_stream.read_stdout())
        if container_stream.peek_stderr():
            warnings.warn("STDERR: %s" % container_stream.read_stderr())
    container_stream.close()

    warnings.warn(f"Set metadata in {notebook_path}!")


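# Hedged usage sketch: set_notebook_metadata is a coroutine, so callers must
# await it (or drive it with asyncio.run). Every id and path below is an
# illustrative placeholder.
async def tag_notebook_example():
    await set_notebook_metadata(
        notebook_path="my-task/Experiment.ipynb",  # hypothetical path
        task_id="task-0",
        experiment_id="experiment-0",
        operator_id="operator-0",
    )

# To run the example outside an event loop:
# asyncio.run(tag_notebook_example())

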
def copy_file_to_pod(filepath, destination_path):
    """
    Copies a local file to a pod in the notebook server.
    Based on this example:
    https://github.com/prafull01/Kubernetes-Utilities/blob/master/kubectl_cp_as_python_client.py

    Parameters
    ----------
    filepath : str
    destination_path : str
    """
    warnings.warn(f"Copying '{filepath}' to '{destination_path}'...")
    load_kube_config()
    api_instance = client.CoreV1Api()

    # The following command extracts the contents of STDIN to /home/jovyan/tasks
    exec_command = ["tar", "xvf", "-", "-C", "/home/jovyan/tasks"]

    container_stream = stream(
        api_instance.connect_get_namespaced_pod_exec,
        name=NOTEBOOK_POD_NAME,
        namespace=NOTEBOOK_NAMESPACE,
        command=exec_command,
        container=NOTEBOOK_CONTAINER_NAME,
        stderr=True,
        stdin=True,
        stdout=True,
        tty=False,
        _preload_content=False,
    )

    with TemporaryFile() as tar_buffer:
        # Prepares an uncompressed tarfile that will be written to STDIN
        with tarfile.open(fileobj=tar_buffer, mode="w") as tar:
            tar.add(filepath, arcname=destination_path)

        # Rewinds to beginning of tarfile
        tar_buffer.seek(0)

        # WARNING:
        # Attempts to write the entire tarfile caused connection errors for
        # large files. The loop below reads/writes small chunks to prevent
        # these errors.
        data = tar_buffer.read(1000000)

        while container_stream.is_open():
            container_stream.update(timeout=10)
            if container_stream.peek_stdout():
                warnings.warn("STDOUT: %s" % container_stream.read_stdout())
            if container_stream.peek_stderr():
                warnings.warn("STDERR: %s" % container_stream.read_stderr())
            if data:
                container_stream.write_stdin(data)
                data = tar_buffer.read(1000000)
            else:
                break

    container_stream.close()
    warnings.warn(f"Copied '{filepath}' to '{destination_path}'!")


def log_stream(self, pod, container):
    """
    Generates a log stream of the given pod's container.

    Whenever the event source is called, a new thread is created for each pod
    to listen for new logs, plus a thread that watches for new pods being
    created. There is a known limitation in the log generation: when the
    client disconnects from the event source, the allocated threads are not
    deallocated, so their memory and processes are not released.

    Parameters
    ----------
    pod : str
    container : str

    Yields
    ------
    str
    """
    load_kube_config()
    v1 = client.CoreV1Api()
    w = Watch()

    pod_name = pod.metadata.name
    namespace = pod.metadata.namespace
    container_name = container.name

    try:
        for streamline in w.stream(
            v1.read_namespaced_pod_log,
            name=pod_name,
            namespace=namespace,
            container=container_name,
            pretty="true",
            tail_lines=0,
            timestamps=True,
        ):
            self.queue.put_nowait(streamline)
    except RuntimeError as e:
        logging.exception(e)
        return
    except asyncio.CancelledError as e:
        logging.exception(e)
        return
    except ApiException as e:
        # Expected behavior when trying to connect to a container
        # that isn't ready yet
        logging.exception(e)
    except CancelledError as e:
        # Expected behavior when trying to cancel a task
        logging.exception(e)
        return


def undeploy_monitoring(monitoring_id):
    """
    Undeploys the service and trigger of a given monitoring_id.

    Parameters
    ----------
    monitoring_id : str

    Raises
    ------
    NotFound
        When monitoring resources do not exist.
    """
    load_kube_config()
    api = client.CustomObjectsApi()

    try:
        # Undeploy service
        service_name = f"service-{monitoring_id}"
        service_custom_object = api.get_namespaced_custom_object(
            group="serving.knative.dev",
            version="v1",
            namespace=KF_PIPELINES_NAMESPACE,
            plural="services",
            name=service_name,
        )
        undeploy_pipeline(
            name=service_custom_object["metadata"]["name"],
            kind=service_custom_object["kind"],
            namespace=service_custom_object["metadata"]["namespace"],
        )

        # Undeploy trigger
        trigger_name = f"trigger-{monitoring_id}"
        trigger_custom_object = api.get_namespaced_custom_object(
            group="eventing.knative.dev",
            version="v1",
            namespace=KF_PIPELINES_NAMESPACE,
            plural="triggers",
            name=trigger_name,
        )
        undeploy_pipeline(
            name=trigger_custom_object["metadata"]["name"],
            kind=trigger_custom_object["kind"],
            namespace=trigger_custom_object["metadata"]["namespace"],
        )
    except ApiException:
        raise NotFound(
            code="MonitoringNotFound",
            message="The specified monitoring does not exist.",
        )


def get_files_from_task(task_name):
    """
    Gets all files inside a task folder in the notebook server.

    Parameters
    ----------
    task_name : str

    Returns
    -------
    str
        File content (a base64-encoded zip archive).
    """
    warnings.warn(f"Zipping contents of task: '{task_name}'")
    load_kube_config()
    api_instance = client.CoreV1Api()

    # Zips the task folder and writes it to stdout as base64
    python_script = (
        f"import os; "
        f"os.chdir('{JUPYTER_WORKSPACE}/{task_name}'); "
        f"os.system('zip -q -r - * | base64'); "
    )
    exec_command = ["python", "-c", python_script]

    container_stream = stream(
        api_instance.connect_get_namespaced_pod_exec,
        name=NOTEBOOK_POD_NAME,
        namespace=NOTEBOOK_NAMESPACE,
        command=exec_command,
        container=NOTEBOOK_CONTAINER_NAME,
        stderr=True,
        stdin=False,
        stdout=True,
        tty=False,
        _preload_content=False,
    )

    zip_file_content = ""
    while container_stream.is_open():
        container_stream.update(timeout=10)
        if container_stream.peek_stdout():
            zip_file_content = container_stream.read_stdout()
            warnings.warn("File content fetched.")
    container_stream.close()

    # The stdout string contains newline characters, which must be removed
    clean_zip_file_content = zip_file_content.replace("\n", "")

    return clean_zip_file_content


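# Hedged usage sketch: get_files_from_task returns a base64-encoded zip
# archive, so a caller typically decodes it before writing to disk. The helper
# name and output filename are illustrative assumptions.
import base64

def save_task_files(task_name, output_path="task.zip"):
    """
    Decodes the base64 payload streamed by get_files_from_task and writes
    it to a local zip file.
    """
    content = get_files_from_task(task_name)
    with open(output_path, "wb") as f:
        f.write(base64.b64decode(content))

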
def get_cluster_ip():
    """
    Retrieves the cluster ip.

    Returns
    -------
    str
        The cluster ip.
    """
    load_kube_config()
    v1 = client.CoreV1Api()
    service = v1.read_namespaced_service(
        name="istio-ingressgateway", namespace="istio-system")
    return service.status.load_balancer.ingress[0].ip


def get_container_logs(pod, container):
    """
    Returns the latest logs of the specified container.

    Parameters
    ----------
    pod : str
    container : str

    Returns
    -------
    str
        Container's logs.

    Raises
    ------
    InternalServerError
        While trying to query the Kubernetes API.
    """
    load_kube_config()
    core_api = client.CoreV1Api()

    try:
        logs = core_api.read_namespaced_pod_log(
            name=pod.metadata.name,
            namespace=KF_PIPELINES_NAMESPACE,
            container=container.name,
            pretty="true",
            tail_lines=512,
            timestamps=True,
        )
        return logs
    except ApiException as e:
        body = literal_eval(e.body)
        message = body["message"]

        if pod.status.reason in IGNORABLE_STATUSES_REASONS:
            return None

        for ignorable_message in IGNORABLE_MESSAGES_KEYTEXTS:
            if ignorable_message in message:
                return None

        raise InternalServerError(
            code="CannotRetrieveContainerLogs",
            message=f"Error while trying to retrieve container's log: {message}",
        )


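# Hedged usage sketch: list_workflow_pods and get_container_logs compose into
# a "logs of a run" helper. The helper name and the shape of the returned
# dicts are illustrative assumptions, not the project's actual schema.
def list_run_logs(run_id):
    """
    Collects the latest logs of every platiagro task container in a run.
    Containers whose logs are not retrievable yet (get_container_logs
    returns None) are skipped.
    """
    logs = []
    for pod in list_workflow_pods(run_id):
        for container in pod.spec.containers:
            text = get_container_logs(pod, container)
            if text is not None:
                logs.append({
                    "pod": pod.metadata.name,
                    "container": container.name,
                    "logs": text,
                })
    return logs

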
def terminate_run(self, deployment_id):
    """
    Terminates a run in Kubeflow Pipelines.

    Parameters
    ----------
    deployment_id : str

    Returns
    -------
    projects.schemas.message.Message

    Raises
    ------
    NotFound
        When the deployment run does not exist.
    """
    load_kube_config()
    api = client.CustomObjectsApi()
    custom_objects = api.list_namespaced_custom_object(
        "machinelearning.seldon.io",
        "v1",
        KF_PIPELINES_NAMESPACE,
        "seldondeployments",
    )
    deployments_objects = custom_objects["items"]

    if deployments_objects:
        for deployment in deployments_objects:
            if deployment["metadata"]["name"] == deployment_id:
                undeploy_pipeline(
                    name=deployment["metadata"]["name"],
                    kind=deployment["kind"],
                    namespace=deployment["metadata"]["namespace"],
                )

    deployment_run = get_deployment_runs(deployment_id)
    if not deployment_run:
        raise NotFound(
            code="RunNotFound",
            message="The specified run does not exist.",
        )

    kfp_client().runs.delete_run(deployment_run["runId"])

    return schemas.Message(message="Deployment deleted")


def list_resource_version():
    """
    Determines the resource version the watcher should list from.

    Returns
    -------
    str
    """
    load_kube_config()
    api = client.CustomObjectsApi()
    r = api.list_namespaced_custom_object(
        group="argoproj.io",
        version="v1alpha1",
        namespace=KF_PIPELINES_NAMESPACE,
        plural="workflows",
    )
    return r["metadata"]["resourceVersion"]


def get_file_from_pod(filepath):
    """
    Gets the content of a file from a pod in the notebook server.

    Parameters
    ----------
    filepath : str

    Returns
    -------
    str
        File content.
    """
    notebook_path = f"{JUPYTER_WORKSPACE}/{filepath}"
    warnings.warn(f"Fetching {notebook_path} from pod...")
    load_kube_config()
    api_instance = client.CoreV1Api()
    exec_command = ["cat", notebook_path]

    container_stream = stream(
        api_instance.connect_get_namespaced_pod_exec,
        name=NOTEBOOK_POD_NAME,
        namespace=NOTEBOOK_NAMESPACE,
        command=exec_command,
        container=NOTEBOOK_CONTAINER_NAME,
        stderr=True,
        stdin=False,
        stdout=True,
        tty=False,
        _preload_content=False,
    )

    file_content = ""
    while container_stream.is_open():
        container_stream.update(timeout=10)
        if container_stream.peek_stdout():
            file_content = container_stream.read_stdout()
            warnings.warn("File content fetched.")
    container_stream.close()

    return file_content


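# Hedged usage sketch: get_file_from_pod returns raw text, so structured files
# must be parsed by the caller -- e.g. a notebook straight into a dict. The
# helper name is an illustrative assumption.
import json

def get_notebook_from_pod(filepath):
    """
    Fetches an .ipynb file from the notebook server pod and parses its JSON.
    """
    return json.loads(get_file_from_pod(filepath))

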
def copy_file_to_pod(filepath, destination_path):
    """
    Copies a local file to a pod in the notebook server.
    Based on this example:
    https://github.com/prafull01/Kubernetes-Utilities/blob/master/kubectl_cp_as_python_client.py

    Parameters
    ----------
    filepath : str
    destination_path : str
    """
    warnings.warn(f"Copying '{filepath}' to '{destination_path}'...")
    load_kube_config()
    api_instance = client.CoreV1Api()

    # The following command extracts the contents of STDIN to /home/jovyan/tasks
    exec_command = ["tar", "xvf", "-", "-C", "/home/jovyan/tasks"]

    container_stream = stream(
        api_instance.connect_get_namespaced_pod_exec,
        name=NOTEBOOK_POD_NAME,
        namespace=NOTEBOOK_NAMESPACE,
        command=exec_command,
        container=NOTEBOOK_CONTAINER_NAME,
        stderr=True,
        stdin=True,
        stdout=True,
        tty=False,
        _preload_content=False,
    )

    with TemporaryFile() as tar_buffer:
        # Prepares an uncompressed tarfile that will be written to STDIN
        with tarfile.open(fileobj=tar_buffer, mode="w") as tar:
            tar.add(filepath, arcname=destination_path)

        # Rewinds to beginning of tarfile
        tar_buffer.seek(0)

        handle_container_stream(container_stream=container_stream, buffer=tar_buffer)

    warnings.warn(f"Copied '{filepath}' to '{destination_path}'!")


def delete_monitoring_task_config_map(task_id):
    """
    Deletes the ConfigMap of the given task_id.

    Parameters
    ----------
    task_id : str
    """
    config_map_name = f"configmap-{task_id}"

    load_kube_config()
    v1 = client.CoreV1Api()

    try:
        v1.delete_namespaced_config_map(
            name=config_map_name,
            namespace=KF_PIPELINES_NAMESPACE,
        )
        warnings.warn(f"ConfigMap of task {task_id} deleted!")
    except ApiException:
        warnings.warn(f"ConfigMap of task {task_id} not found, nothing to delete.")


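# Hedged usage sketch: the create and delete helpers pair naturally into a
# "refresh" operation -- delete tolerates a missing ConfigMap (see the except
# clause above), so the sequence below is safe on first run. The helper name
# is an illustrative assumption.
def refresh_monitoring_task_config_map(task_id, experiment_notebook_content):
    """
    Replaces the ConfigMap of a task with fresh notebook content.
    """
    delete_monitoring_task_config_map(task_id)
    create_monitoring_task_config_map(task_id, experiment_notebook_content)

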
def get_container_logs(pod, container):
    """
    Returns the latest logs of the specified container.

    Parameters
    ----------
    pod : str
    container : str

    Returns
    -------
    str
        Container's logs.

    Raises
    ------
    InternalServerError
        While trying to query the Kubernetes API.
    """
    load_kube_config()
    core_api = client.CoreV1Api()

    try:
        logs = core_api.read_namespaced_pod_log(
            name=pod.metadata.name,
            namespace=KF_PIPELINES_NAMESPACE,
            container=container.name,
            pretty="true",
            tail_lines=512,
            timestamps=True,
        )
        return logs
    except ApiException as e:
        body = literal_eval(e.body)
        message = body["message"]

        if "ContainerCreating" in message:
            return None

        raise InternalServerError(
            f"Error while trying to retrieve container's log: {message}")


def watch_workflow_pods(self, experiment_id: str):
    """
    Watches pods from an experiment's workflow and schedules a log stream
    for each platiagro task container.
    """
    # Known bug:
    # A pod found by the pod worker may not be found by the log worker
    # in the case of experiments
    load_kube_config()
    v1 = client.CoreV1Api()
    w = Watch()

    try:
        for pod in w.stream(
            v1.list_namespaced_pod,
            namespace=KF_PIPELINES_NAMESPACE,
            label_selector=f"experiment-id={experiment_id}",
        ):
            if pod["type"] == "ADDED":
                pod = pod["object"]
                for container in pod.spec.containers:
                    if container.name not in EXCLUDE_CONTAINERS \
                            and "name" in pod.metadata.annotations:
                        self.loop.run_in_executor(self.pool, self.log_stream, pod, container)
    except CancelledError:
        # Expected behavior when trying to cancel a task
        w.stop()
        return


def watch_deployment_pods(self, deployment_id):
    """
    Watches pods from a deployment and schedules a log stream for each
    container that is not excluded.
    """
    load_kube_config()
    v1 = client.CoreV1Api()
    w = Watch()

    try:
        for pod in w.stream(
            v1.list_namespaced_pod,
            namespace=KF_PIPELINES_NAMESPACE,
            label_selector=f"seldon-deployment-id={deployment_id}",
        ):
            if pod["type"] == "ADDED":
                pod = pod["object"]
                for container in pod.spec.containers:
                    if container.name not in EXCLUDE_CONTAINERS:
                        self.loop.run_in_executor(self.pool, self.log_stream, pod, container)
    except CancelledError:
        # Expected behavior when trying to cancel a task
        w.stop()
        return


def run():
    """
    Watches kubernetes events and saves relevant data.
    """
    load_kube_config()
    api = client.CustomObjectsApi()
    w = watch.Watch()

    # When retrieving a collection of resources the response from the server
    # will contain a resourceVersion value that can be used to initiate a watch
    # against the server.
    resource_version = list_resource_version()

    while True:
        stream = w.stream(
            api.list_namespaced_custom_object,
            group="argoproj.io",
            version="v1alpha1",
            namespace=KF_PIPELINES_NAMESPACE,
            plural="workflows",
            resource_version=resource_version,
        )
        try:
            for workflow_manifest in stream:
                logging.info("Event: %s %s " % (
                    workflow_manifest["type"],
                    workflow_manifest["object"]["metadata"]["name"],
                ))
                update_status(workflow_manifest)
        except client.exceptions.ApiException as e:
            # When the requested watch operations fail because the historical
            # version of that resource is not available, clients must handle
            # the case by recognizing the status code 410 Gone, clearing their
            # local cache, performing a list operation, and starting the watch
            # from the resourceVersion returned by that new list operation.
            # See: https://kubernetes.io/docs/reference/using-api/api-concepts/#efficient-detection-of-changes
            if e.status == http.HTTPStatus.GONE:
                resource_version = list_resource_version()


def copy_files_in_pod(source_path, destination_path):
    """
    Copies files inside a pod.

    Parameters
    ----------
    source_path : str
    destination_path : str
    """
    warnings.warn(f"Copying '{source_path}' to '{destination_path}'...")
    load_kube_config()
    api_instance = client.CoreV1Api()

    # The following command copies source_path recursively,
    # preserving attributes
    exec_command = ["cp", "-a", source_path, destination_path]

    container_stream = stream(
        api_instance.connect_get_namespaced_pod_exec,
        name=NOTEBOOK_POD_NAME,
        namespace=NOTEBOOK_NAMESPACE,
        command=exec_command,
        container=NOTEBOOK_CONTAINER_NAME,
        stderr=True,
        stdin=False,
        stdout=True,
        tty=False,
        _preload_content=False,
    )

    while container_stream.is_open():
        container_stream.update(timeout=10)
        if container_stream.peek_stdout():
            warnings.warn("STDOUT: %s" % container_stream.read_stdout())
        if container_stream.peek_stderr():
            warnings.warn("STDERR: %s" % container_stream.read_stderr())
    container_stream.close()

    warnings.warn(f"Copied '{source_path}' to '{destination_path}'!")


def get_cluster_ip():
    """
    Retrieves the cluster ip.

    Returns
    -------
    str
        The cluster ip.
    """
    load_kube_config()
    v1 = client.CoreV1Api()
    service = v1.read_namespaced_service(
        name="istio-ingressgateway", namespace="istio-system")

    if service.status.load_balancer.ingress is None:
        cluster_ip = service.spec.cluster_ip
    else:
        if service.status.load_balancer.ingress[0].hostname:
            cluster_ip = service.status.load_balancer.ingress[0].hostname
        else:
            cluster_ip = service.status.load_balancer.ingress[0].ip

    return os.environ.get("INGRESS_HOST_PORT", cluster_ip)


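# Hedged usage sketch: get_protocol and get_cluster_ip combine into the
# external URL of the cluster's ingress gateway. The helper name is an
# illustrative assumption.
def get_ingress_url():
    """
    Builds the base URL of the istio ingress gateway, e.g. "http://10.0.0.1".
    """
    return f"{get_protocol()}://{get_cluster_ip()}"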