def wait_for_ingress(self, title, test_path=None, method="GET"):
    """Check that the ingress for *title* is reachable over HTTP.

    Performs a single request against the ingress URL (optionally
    extended by *test_path*) and maps failures onto the deployment
    state machine:

    - while ``self.status`` is ``STATUS_DEPLOYING``, transient failures
      (``RemoteDisconnected``, HTTP 404/502/503/504, ``URLError``) raise
      ``deployment_status.StillDeploying`` so the caller retries later;
    - otherwise (or for non-transient HTTP errors) a
      ``deployment_status.DeploymentError`` is raised.
    """
    base_url = self.get_ingress_url(title)
    test_url = urllib.parse.urljoin(base_url, test_path)
    request = urllib.request.Request(test_url, method=method)
    try:
        # Only reachability matters here; close the response right away
        # so the underlying connection is not leaked (the original
        # discarded the response object without closing it).
        urllib.request.urlopen(request).close()
    except http.client.RemoteDisconnected as e:
        self.logger.warning(
            "wait_for_ingress (%s): RemoteDisconnected: %s" % (test_url, e))
        if self.status == deployment_status.STATUS_DEPLOYING:
            raise deployment_status.StillDeploying(
                "Waiting for %s ingress" % title)
        raise deployment_status.DeploymentError("Could not connect: %s" % e)
    except urllib.error.HTTPError as e:
        # HTTPError must be caught before URLError (it is a subclass).
        self.logger.warning(
            "wait_for_ingress (%s): HTTPError: %s" % (test_url, e))
        if self.status == deployment_status.STATUS_DEPLOYING:
            # These codes are expected while pods/ingress are coming up.
            if e.code in (404, 502, 503, 504):
                raise deployment_status.StillDeploying(
                    "Waiting for %s ingress" % title)
        raise deployment_status.DeploymentError("HTTP response: %s" % e)
    except urllib.error.URLError as e:
        self.logger.warning(
            "wait_for_ingress (%s): URLError: %s" % (test_url, e))
        if self.status == deployment_status.STATUS_DEPLOYING:
            raise deployment_status.StillDeploying(
                "Waiting for %s ingress" % title)
        raise deployment_status.DeploymentError("URL error: %s" % e)
def sync_source_code(self, url):
    """Synchronise notebook source code with the running container.

    Downloads the notebook from ``<url>/notebook`` and compares the
    version reported in the ``X-Notebook-Version`` response header with
    the locally stored ``self.algorithm.source_code_version``:

    - remote newer -> store the remote code locally via
      ``self.algorithm.update_source_code``;
    - local newer  -> upload the local code to the container via PUT.

    Raises ``deployment_status.StillDeploying`` while the container is
    not reachable yet (404/502/503/504) and
    ``deployment_status.DeploymentError`` for hard connection failures.
    """
    url = urllib.parse.urljoin(url, "notebook")
    self.logger.warning("notebookurl: %s" % url)
    # HTTP codes that are expected while the container is still starting.
    transient_codes = (404, 502, 503, 504)
    try:
        download_request = urllib.request.Request(url, method="GET")
        download_response = urllib.request.urlopen(download_request,
                                                   timeout=7)
        try:
            container_notebook_version_string = download_response.getheader(
                "X-Notebook-Version")
            if container_notebook_version_string is None:
                # No header -> treat the remote code as older than anything.
                remote_notebook_version = -1
            else:
                remote_notebook_version = int(
                    container_notebook_version_string)
            remote_notebook_code = download_response.read().decode()
        finally:
            # Always release the connection (the original never closed it).
            download_response.close()
    except http.client.RemoteDisconnected as e:
        if self.status == deployment_status.STATUS_DEPLOYING:
            raise deployment_status.StillDeploying(
                "Waiting for connection")
        raise deployment_status.DeploymentError("Could not connect: %s" % e)
    except urllib.error.HTTPError as e:
        if e.code not in transient_codes:
            raise Exception(
                "downloading notebook source failed with code %s" % e.code)
        raise deployment_status.StillDeploying(
            "Waiting for connection to container")
    if remote_notebook_version > self.algorithm.source_code_version:
        self.algorithm.update_source_code(
            remote_notebook_code,
            remote_notebook_version,
        )
        self.logger.info(
            "Received and stored updated source code (version %s)"
            % remote_notebook_version)
    if remote_notebook_version < self.algorithm.source_code_version:
        local_notebook_data = self.algorithm.source_code.encode()
        upload_request = urllib.request.Request(
            url,
            data=local_notebook_data,
            method="PUT",
            headers={
                "X-Notebook-Version": self.algorithm.source_code_version,
                "Content-Type": "application/octet-stream",
            })
        try:
            # The response body (generated python code) is not used;
            # close immediately instead of reading into an unused local.
            urllib.request.urlopen(upload_request).close()
            self.logger.info("Sent source code (version %s)"
                             % self.algorithm.source_code_version)
        except urllib.error.HTTPError as e:
            if e.code not in transient_codes:
                raise Exception("Error uploading sourcecode: %s" % e.code)
            raise deployment_status.StillDeploying(
                "Waiting for connection to container")
def wait_for_endpoints_pod_ip(self, endpoints_name):
    """Return a pod IP from the named Kubernetes endpoints object.

    Reads the endpoints object from the environment's namespace and
    returns the last ready pod address found. Raises
    ``deployment_status.StillDeploying`` while the endpoints object (or
    any ready pod address in it) does not exist yet.
    """
    endpoints = self.core_api.read_namespaced_endpoints(
        endpoints_name,
        self.environment.namespace,
    )
    if not endpoints or not endpoints.subsets:
        raise deployment_status.StillDeploying("Waiting for %s endpoints"
                                               % endpoints_name)
    pod_ip = None
    for subset in endpoints.subsets:
        # subset.addresses is None while the subset only has
        # not-ready addresses; iterating None raised TypeError before.
        for address in subset.addresses or []:
            pod_ip = address.ip
    if not pod_ip:
        raise deployment_status.StillDeploying(
            "Waiting for %s pod endpoint address" % endpoints_name)
    return pod_ip
def sync_source_code(self):
    """Synchronise the algorithm source code with the notebook editor.

    Downloads the notebook from the editor ingress (``_dltk/notebook``)
    and compares its ``X-Notebook-Version`` header with the locally
    stored version: a newer remote version is stored locally, a newer
    local version is uploaded via PUT. A 404 from the editor is treated
    as "no remote notebook yet" (version -1).

    Raises ``deployment_status.StillDeploying`` /
    ``deployment_status.DeploymentError`` for connection problems and
    ``UserFriendlyError`` for non-404 HTTP failures.
    """
    notebook_url = urllib.parse.urljoin(
        self.get_ingress_url("editor"),
        "_dltk/notebook",
    )
    try:
        download_request = urllib.request.Request(notebook_url,
                                                  method="GET")
        download_response = urllib.request.urlopen(download_request)
        try:
            # Check the header BEFORE converting: the original called
            # int(...) first, so a missing header raised
            # "int() argument must not be None" and the explicit
            # "Did not receive notebook version" check was dead code.
            version_header = download_response.getheader(
                "X-Notebook-Version")
            if version_header is None:
                raise Exception("Did not receive notebook version")
            notebook_version = int(version_header)
            notebook_code = download_response.read().decode()
        finally:
            # Always release the connection.
            download_response.close()
    except http.client.RemoteDisconnected as e:
        if self.status == deployment_status.STATUS_DEPLOYING:
            raise deployment_status.StillDeploying("Waiting for connection")
        raise deployment_status.DeploymentError("Could not connect: %s" % e)
    except urllib.error.HTTPError as e:
        if e.code != 404:
            raise UserFriendlyError(
                "failed downloading notebook source: %s" % e.code)
        # 404: the editor has no notebook yet; treat as older than local.
        notebook_version = -1
        notebook_code = ""
    if notebook_version > self.algorithm.source_code_version:
        self.algorithm.update_source_code(
            notebook_code,
            notebook_version,
        )
        logging.info("Received and stored updated source code (version %s)"
                     % notebook_version)
    if notebook_version < self.algorithm.source_code_version:
        notebook_data = self.algorithm.source_code.encode()
        upload_request = urllib.request.Request(
            notebook_url,
            data=notebook_data,
            method="PUT",
            headers={
                "X-Notebook-Version": self.algorithm.source_code_version,
            },
        )
        try:
            urllib.request.urlopen(upload_request).close()
        except urllib.error.HTTPError as e:
            raise UserFriendlyError(
                "error sending new source code to runtime: %s" % e)
        logging.info("Sent source code (version %s)"
                     % self.algorithm.source_code_version)
def deploy_stateful_set(
    self,
    component_name,
    headless_service,
    cpu_count,  # deprecated: forces request == limit
    memory_mb,
    image,
    replicas,
    stateful_set_labels,
    pod_labels,
    ports=None,
    env=None,
    cpu_request=1,
    cpu_limit=None,
):
    """Create or patch the StatefulSet for *component_name*.

    If a StatefulSet matching *stateful_set_labels* exists, its replica
    count, image and CPU/memory requests/limits are compared with the
    desired values; when anything differs the object is patched and
    ``deployment_status.StillDeploying`` is raised so the caller polls
    again. Otherwise a new StatefulSet bound to *headless_service* is
    created.

    Returns the (existing or newly created) StatefulSet object.
    """
    if cpu_count is not None:
        # Legacy path: a single cpu_count sets both request and limit.
        cpu_request_resources = "%s" % cpu_count
        cpu_limit_resources = "%s" % cpu_count
    else:
        if cpu_limit is None:
            cpu_limit = cpu_request
        cpu_request_resources = "%s" % cpu_request
        cpu_limit_resources = "%s" % cpu_limit
    memory_resources = "%sMi" % memory_mb
    stateful_set = self.get_stateful_set(stateful_set_labels)
    if stateful_set:
        changed = False
        if stateful_set.spec.replicas != replicas:
            self.logger.info("replicas changed from %s to %s" %
                             (stateful_set.spec.replicas, replicas))
            stateful_set.spec.replicas = replicas
            changed = True
        for container in stateful_set.spec.template.spec.containers:
            if container.name != component_name:
                continue
            if container.image != image:
                container.image = image
                changed = True
                self.logger.info("image changed")
            if container.resources.requests is None:
                container.resources.requests = {}
            if container.resources.limits is None:
                container.resources.limits = {}
            requests = container.resources.requests
            limits = container.resources.limits
            if "cpu" not in requests or \
                    requests["cpu"] != cpu_request_resources:
                requests["cpu"] = cpu_request_resources
                changed = True
                self.logger.info("cpu requests changed to %s" %
                                 cpu_request_resources)
            if "cpu" not in limits or \
                    limits["cpu"] != cpu_limit_resources:
                limits["cpu"] = cpu_limit_resources
                changed = True
                self.logger.info("cpu limit changed to %s" %
                                 cpu_limit_resources)
            # Compare memory semantically (resources.parse_memory)
            # instead of by raw string: the API server may return a
            # normalized quantity (e.g. "1Gi" for "1024Mi"), which a
            # string compare would treat as a change and re-patch on
            # every poll. deploy_deployment already compares this way.
            if "memory" not in requests or \
                    resources.parse_memory(requests["memory"]) != \
                    resources.parse_memory(memory_resources):
                requests["memory"] = memory_resources
                changed = True
                self.logger.info("memory request changed to %s" %
                                 memory_resources)
            if "memory" not in limits or \
                    resources.parse_memory(limits["memory"]) != \
                    resources.parse_memory(memory_resources):
                limits["memory"] = memory_resources
                changed = True
                self.logger.info("memory limit changed to %s" %
                                 memory_resources)
        if changed:
            self.logger.info("patching stateful_set...")
            self.apps_api.patch_namespaced_stateful_set(
                name=stateful_set.metadata.name,
                namespace=self.environment.namespace,
                body=stateful_set,
            )
            raise deployment_status.StillDeploying(
                "Waiting for %s stateful set being patched" % component_name)
    else:
        self.logger.info("creating %s stateful_set..." % component_name)
        stateful_set = self.apps_api.create_namespaced_stateful_set(
            namespace=self.environment.namespace,
            body=kubernetes_client.V1StatefulSet(
                api_version="apps/v1",
                kind="StatefulSet",
                metadata=kubernetes_client.V1ObjectMeta(
                    name=self.generate_object_name(component_name),
                    namespace=self.environment.namespace,
                    labels=self.generate_object_labels(stateful_set_labels),
                ),
                spec=kubernetes_client.V1StatefulSetSpec(
                    service_name=headless_service.metadata.name,
                    replicas=replicas,
                    selector=kubernetes_client.V1LabelSelector(
                        match_labels=self.generate_object_labels(pod_labels),
                    ),
                    template=kubernetes_client.V1PodTemplateSpec(
                        metadata=kubernetes_client.V1ObjectMeta(
                            labels=self.generate_object_labels(pod_labels),
                        ),
                        spec=kubernetes_client.V1PodSpec(containers=[
                            kubernetes_client.V1Container(
                                name=component_name,
                                image=image,
                                image_pull_policy=self.environment.
                                image_pull_policy,
                                resources=kubernetes_client.
                                V1ResourceRequirements(
                                    requests={
                                        "cpu": cpu_request_resources,
                                        "memory": memory_resources,
                                    },
                                    limits={
                                        "cpu": cpu_limit_resources,
                                        "memory": memory_resources,
                                    },
                                ),
                                env=env,
                                ports=ports,
                            ),
                        ]),
                    ),
                ),
            ),
        )
    return stateful_set
def deploy_deployment(
    self,
    image,
    memory_mb=50,
    cpu_count=None,  # deprecated: forces request == limit
    cpu_request=1,
    cpu_limit=None,
    gpu_request=None,
    replicas=1,
    deployment_labels=None,
    pod_labels=None,
    name_suffix=None,
    container_name=None,
    ports=None,
    env=None,
    volumes=None,
    volume_mounts=None,
    run_as_user=None,
    fs_group=None,
):
    """Create or patch the Deployment running *image*.

    If a Deployment matching *deployment_labels* exists, its replica
    count, image, CPU/memory and GPU requests/limits are compared with
    the desired values; when anything differs the object is patched and
    ``deployment_status.StillDeploying`` is raised so the caller polls
    again. Otherwise a new Deployment is created.

    Returns the (existing or newly created) Deployment object.
    """
    # Replace the original mutable default arguments ([]) with None
    # sentinels, preserving the empty-list semantics.
    if ports is None:
        ports = []
    if volumes is None:
        volumes = []
    if volume_mounts is None:
        volume_mounts = []
    if not container_name:
        if name_suffix:
            container_name = name_suffix
        else:
            container_name = self.algorithm.runtime.name
    # Kubernetes object names must not contain dots.
    container_name = container_name.replace(".", "-")
    if cpu_count is not None:
        cpu_request_resources = "%s" % cpu_count
        cpu_limit_resources = "%s" % cpu_count
    else:
        if cpu_limit is None:
            cpu_limit = cpu_request
        cpu_request_resources = "%s" % cpu_request
        cpu_limit_resources = "%s" % cpu_limit
    memory_resources = "%sMi" % memory_mb
    if gpu_request is not None:
        # GPUs must be requested with request == limit.
        gpu_request_resources = "%s" % gpu_request
        gpu_limit_resources = gpu_request_resources
    else:
        gpu_request_resources = None
        gpu_limit_resources = None
    deployment = self.get_deployment(deployment_labels)
    if deployment:
        changed = False
        if deployment.spec.replicas != replicas:
            self.logger.info("replicas changed from %s to %s" %
                             (deployment.spec.replicas, replicas))
            deployment.spec.replicas = replicas
            changed = True
        for container in deployment.spec.template.spec.containers:
            if container.name != container_name:
                continue
            if container.image != image:
                container.image = image
                changed = True
                self.logger.info("image changed")
            if container.resources.requests is None:
                container.resources.requests = {}
            if container.resources.limits is None:
                container.resources.limits = {}
            requests = container.resources.requests
            limits = container.resources.limits
            if "cpu" not in requests or \
                    requests["cpu"] != cpu_request_resources:
                requests["cpu"] = cpu_request_resources
                changed = True
                self.logger.info(
                    "cpu_requests_resources requests changed to %s" %
                    cpu_request_resources)
            if "cpu" not in limits or \
                    limits["cpu"] != cpu_limit_resources:
                limits["cpu"] = cpu_limit_resources
                changed = True
                self.logger.info(
                    "cpu_limit_resources limits changed to %s" %
                    cpu_limit_resources)
            if "memory" not in requests:
                requests["memory"] = memory_resources
                changed = True
                self.logger.info(
                    "memory_resources was not set. Now set to '%s'" %
                    (memory_resources, ))
            elif resources.parse_memory(requests["memory"]) != \
                    resources.parse_memory(memory_resources):
                # Remember the old value BEFORE overwriting it; the
                # original logged the new value as both "from" and "to".
                old_memory_request = requests["memory"]
                requests["memory"] = memory_resources
                changed = True
                self.logger.info(
                    "memory_resources requests changed from '%s' to '%s'" % (
                        old_memory_request,
                        memory_resources,
                    ))
            if "memory" not in limits:
                limits["memory"] = memory_resources
                changed = True
                self.logger.info(
                    "memory_resources limits was not set. Now set to %s" %
                    memory_resources)
            elif resources.parse_memory(limits["memory"]) != \
                    resources.parse_memory(memory_resources):
                old_memory_limit = limits["memory"]
                limits["memory"] = memory_resources
                changed = True
                self.logger.info(
                    "memory_resources limits changed from %s to %s" % (
                        old_memory_limit,
                        memory_resources,
                    ))
            if "nvidia.com/gpu" not in limits:
                if gpu_limit_resources is not None:
                    limits["nvidia.com/gpu"] = gpu_limit_resources
                    changed = True
                    self.logger.info(
                        "gpu_resources limits was not set. Now set to %s" %
                        gpu_limit_resources)
            elif limits["nvidia.com/gpu"] != gpu_limit_resources:
                if gpu_limit_resources is not None:
                    old_gpu_limit = limits["nvidia.com/gpu"]
                    limits["nvidia.com/gpu"] = gpu_limit_resources
                    changed = True
                    self.logger.info(
                        "gpu_resources limits changed from %s to %s" % (
                            old_gpu_limit,
                            gpu_limit_resources,
                        ))
                else:
                    # GPU no longer requested: drop the limit entirely.
                    self.logger.info(
                        "gpu_resources limits was set to %s but not required anymore"
                        % (limits["nvidia.com/gpu"], ))
                    del limits["nvidia.com/gpu"]
                    changed = True
            if "nvidia.com/gpu" not in requests:
                if gpu_request_resources is not None:
                    requests["nvidia.com/gpu"] = gpu_request_resources
                    changed = True
                    self.logger.info(
                        "gpu_resources requests was not set. Now set to %s" %
                        gpu_request_resources)
            elif requests["nvidia.com/gpu"] != gpu_request_resources:
                if gpu_request_resources is not None:
                    old_gpu_request = requests["nvidia.com/gpu"]
                    requests["nvidia.com/gpu"] = gpu_request_resources
                    changed = True
                    self.logger.info(
                        "gpu_resources requests changed from %s to %s" % (
                            old_gpu_request,
                            gpu_request_resources,
                        ))
                else:
                    self.logger.info(
                        "gpu_resources requests was set to %s but not required anymore"
                        % (requests["nvidia.com/gpu"], ))
                    del requests["nvidia.com/gpu"]
                    changed = True
        if changed:
            self.logger.info("patching deployment...")
            self.apps_api.patch_namespaced_deployment(
                name=deployment.metadata.name,
                namespace=self.environment.namespace,
                body=deployment,
            )
            raise deployment_status.StillDeploying(
                "Waiting for %s deployment being patched" % container_name)
    else:
        resource_requirements = kubernetes_client.V1ResourceRequirements(
            requests={
                "cpu": cpu_request_resources,
                "memory": memory_resources,
            },
            limits={
                "cpu": cpu_limit_resources,
                "memory": memory_resources,
            },
        )
        if gpu_request_resources:
            resource_requirements.requests[
                "nvidia.com/gpu"] = gpu_request_resources
        if gpu_limit_resources:
            resource_requirements.limits[
                "nvidia.com/gpu"] = gpu_limit_resources
        self.logger.info("creating %s deployment..." % container_name)
        deployment = self.apps_api.create_namespaced_deployment(
            namespace=self.environment.namespace,
            body=kubernetes_client.V1Deployment(
                api_version="apps/v1",
                kind="Deployment",
                metadata=kubernetes_client.V1ObjectMeta(
                    name=self.generate_object_name(name_suffix),
                    namespace=self.environment.namespace,
                    labels=self.generate_object_labels(deployment_labels),
                ),
                spec=kubernetes_client.V1DeploymentSpec(
                    replicas=replicas,
                    selector=kubernetes_client.V1LabelSelector(
                        match_labels=self.generate_object_labels(pod_labels),
                    ),
                    template=kubernetes_client.V1PodTemplateSpec(
                        metadata=kubernetes_client.V1ObjectMeta(
                            labels=self.generate_object_labels(pod_labels),
                        ),
                        spec=kubernetes_client.V1PodSpec(
                            containers=[
                                kubernetes_client.V1Container(
                                    name=container_name,
                                    image=image,
                                    image_pull_policy=self.environment.
                                    image_pull_policy,
                                    resources=resource_requirements,
                                    env=env,
                                    ports=ports,
                                    volume_mounts=volume_mounts,
                                ),
                            ],
                            volumes=volumes,
                            security_context=kubernetes_client.
                            V1PodSecurityContext(
                                run_as_user=run_as_user,
                                fs_group=fs_group,
                            ),
                        ),
                    ),
                ),
            ),
        )
    return deployment