def clean_pod_template(pod_template):
    """Normalize a pod template and check for type errors.

    Parameters
    ----------
    pod_template : kubernetes.client.V1Pod
        The pod template to normalize.  The input is never mutated; a deep
        copy is returned.

    Returns
    -------
    kubernetes.client.V1Pod
        A copy with metadata / labels / container env guaranteed non-None
        and the default dask worker tolerations appended.

    Raises
    ------
    TypeError
        If ``pod_template`` is a str or dict instead of a V1Pod object.
    """
    if isinstance(pod_template, str):
        # Bug fix: the message previously ran "got %sIf trying..." together
        # with no separator; add ". " so the hint is readable.
        msg = ('Expected a kubernetes.client.V1Pod object, got %s. '
               'If trying to pass a yaml filename then use '
               'KubeCluster.from_yaml')
        raise TypeError(msg % pod_template)
    if isinstance(pod_template, dict):
        msg = ('Expected a kubernetes.client.V1Pod object, got %s. '
               'If trying to pass a dictionary specification then use '
               'KubeCluster.from_dict')
        raise TypeError(msg % str(pod_template))

    pod_template = copy.deepcopy(pod_template)

    # Make sure metadata / labels / env objects exist, so they can be modified
    # later without a lot of `is None` checks
    if pod_template.metadata is None:
        pod_template.metadata = client.V1ObjectMeta()
    if pod_template.metadata.labels is None:
        pod_template.metadata.labels = {}
    if pod_template.spec.containers[0].env is None:
        pod_template.spec.containers[0].env = []

    # add default tolerations
    tolerations = [
        client.V1Toleration(
            key='k8s.dask.org/dedicated',
            operator='Equal',
            value='worker',
            effect='NoSchedule',
        ),
        # GKE currently does not permit creating taints on a node pool
        # with a `/` in the key field
        client.V1Toleration(
            key='k8s.dask.org_dedicated',
            operator='Equal',
            value='worker',
            effect='NoSchedule',
        ),
    ]

    if pod_template.spec.tolerations is None:
        pod_template.spec.tolerations = tolerations
    else:
        pod_template.spec.tolerations.extend(tolerations)

    return pod_template
def test_pod_spec(self):
    """patch_pod must apply cluster-spec labels, annotations, tolerations,
    node affinity and env, plus a per-pod-type label."""
    cluster_spec = ClusterSpec(cluster_spec_json=test_spec)

    patched = cluster_spec.patch_pod(create_test_pod("test_spec"), "other")
    self.assertEqual(
        patched.metadata.labels["elasticdl.org/app-name"], "elasticdl"
    )
    self.assertEqual(patched.metadata.labels["elasticdl.org/site"], "hangzhou")
    self.assertEqual(
        patched.metadata.annotations["tag.elasticdl.org/optimization"],
        "enabled",
    )
    self.assertEqual(
        patched.spec.tolerations,
        [
            client.V1Toleration(
                effect="NoSchedule",
                key="elasticdl.org/logic-pool",
                operator="Equal",
                value="ElasticDL",
            )
        ],
    )
    requirement = client.V1NodeSelectorRequirement(
        key="elasticdl.org/logic-pool",
        operator="In",
        values=["ElasticDL"],
    )
    self.assertEqual(
        patched.spec.affinity,
        client.V1Affinity(
            node_affinity=client.V1NodeAffinity(
                required_during_scheduling_ignored_during_execution=(
                    client.V1NodeSelector(
                        node_selector_terms=[
                            client.V1NodeSelectorTerm(
                                match_expressions=[requirement]
                            )
                        ]
                    )
                )
            )
        ),
    )
    self.assertEqual(
        patched.spec.containers[0].env,
        [client.V1EnvVar(name="LOG_ENABLED", value="true")],
    )

    # Each pod type carries its own value for the xyz label.
    for pod_type, expected_label in (
        (PodType.MASTER, "Sun"),
        (PodType.WORKER, "Earth"),
        (PodType.PS, "Moon"),
    ):
        patched = cluster_spec.patch_pod(create_test_pod("test_spec"), pod_type)
        self.assertEqual(
            patched.metadata.labels["elasticdl.org/xyz"], expected_label
        )
def _create_deployment(self):
    """Create a single-replica OpenVAS scanner Deployment in self.namespace.

    Returns
    -------
    V1Deployment
        The deployment object returned by the Kubernetes API.
    """
    REPLICAS = 1
    # Bug fix: os.getenv returns a str when the variable is set but the
    # default here was an int; normalise with int() so container_port is
    # always an integer.  Port names are capped at 15 chars, hence uid[-14:].
    container_port = k8s.V1ContainerPort(
        name=self.uid[-14:],
        container_port=int(os.getenv("OPENVAS_OMP_PORT", 9390)))
    resources = k8s.V1ResourceRequirements(
        limits={
            "cpu": KubernetesDeployer.CONTAINER_USE_CPU_LIMIT,
            "memory": KubernetesDeployer.CONTAINER_USE_MEMORY_LIMIT,
        })
    # OpenVAS takes a long time to initialise, hence the generous delays.
    readiness_probe = k8s.V1Probe(
        _exec=k8s.V1ExecAction(
            command=KubernetesDeployer.OPENVAS_HEALTHCHECK_COMMAND),
        initial_delay_seconds=300,
        period_seconds=30,
    )
    liveness_probe = k8s.V1Probe(
        tcp_socket=k8s.V1TCPSocketAction(
            port=container_port.container_port),
        initial_delay_seconds=180,
        period_seconds=30,
        failure_threshold=3,
        timeout_seconds=5,
    )
    container = k8s.V1Container(
        image=KubernetesDeployer.OPENVAS_CONTAINER_IMAGE,
        name=self.uid,
        image_pull_policy="IfNotPresent",
        ports=[container_port],
        resources=resources,
        readiness_probe=readiness_probe,
        liveness_probe=liveness_probe,
    )
    # Allow scheduling onto nodes tainted for scanners.
    toleration = k8s.V1Toleration(effect="NoSchedule",
                                  key="Scanners",
                                  operator="Exists")
    pod_spec = k8s.V1PodSpec(containers=[container],
                             tolerations=[toleration])
    pod_metadata = k8s.V1ObjectMeta(
        name=self.uid,
        labels={"app.kubernetes.io/name": self.uid},
        annotations={
            # Long-running scans must not be interrupted by the autoscaler.
            "cluster-autoscaler.kubernetes.io/safe-to-evict": "false"
        },
    )
    pod_template = k8s.V1PodTemplateSpec(spec=pod_spec,
                                         metadata=pod_metadata)
    selector = k8s.V1LabelSelector(
        match_labels={"app.kubernetes.io/name": self.uid})
    deployment_spec = k8s.V1DeploymentSpec(replicas=REPLICAS,
                                           selector=selector,
                                           template=pod_template)
    deployment_metadata = k8s.V1ObjectMeta(
        name=self.uid,
        labels={"app.kubernetes.io/name": self.uid})
    deployment = k8s.V1Deployment(spec=deployment_spec,
                                  metadata=deployment_metadata)
    return k8s.AppsV1Api(self.client).create_namespaced_deployment(
        self.namespace, deployment)
def create_toleration(toleration_data):
    """Build a V1Toleration from a dict, copying only the keys present.

    ``toleration_seconds`` is coerced to int; the other fields are copied
    verbatim.  Missing keys leave the corresponding attribute untouched.
    """
    toleration = client.V1Toleration()
    for attr in ("effect", "key", "operator", "value"):
        if attr in toleration_data:
            setattr(toleration, attr, toleration_data[attr])
    if "toleration_seconds" in toleration_data:
        toleration.toleration_seconds = int(
            toleration_data["toleration_seconds"])
    return toleration
def _create_deployment(self):
    """Create a single-replica OpenVAS scanner Deployment in self.namespace.

    Returns
    -------
    V1Deployment
        The deployment object returned by the Kubernetes API.
    """
    REPLICAS = 1
    # Bug fix: os.getenv returns a str when the variable is set but the
    # default here was an int; normalise with int() so container_port is
    # always an integer.  Port names are capped at 15 chars, hence uid[-14:].
    container_port = k8s.V1ContainerPort(
        name=self.uid[-14:],
        container_port=int(os.getenv("OPENVAS_OMP_PORT", 9390)))
    resources = k8s.V1ResourceRequirements(
        limits={
            "cpu": KubernetesDeployer.CONTAINER_USE_CPU_LIMIT,
            "memory": KubernetesDeployer.CONTAINER_USE_MEMORY_LIMIT,
        })
    container = k8s.V1Container(
        image=KubernetesDeployer.OPENVAS_CONTAINER_IMAGE,
        name=self.uid,
        image_pull_policy="IfNotPresent",
        ports=[container_port],
        resources=resources,
    )
    # Allow scheduling onto nodes tainted for scanners.
    toleration = k8s.V1Toleration(effect="NoSchedule",
                                  key="Scanners",
                                  operator="Exists")
    pod_spec = k8s.V1PodSpec(containers=[container],
                             tolerations=[toleration])
    pod_metadata = k8s.V1ObjectMeta(
        name=self.uid,
        labels={"app.kubernetes.io/name": self.uid})
    pod_template = k8s.V1PodTemplateSpec(spec=pod_spec,
                                         metadata=pod_metadata)
    selector = k8s.V1LabelSelector(
        match_labels={"app.kubernetes.io/name": self.uid})
    deployment_spec = k8s.V1DeploymentSpec(replicas=REPLICAS,
                                           selector=selector,
                                           template=pod_template)
    deployment_metadata = k8s.V1ObjectMeta(
        name=self.uid,
        labels={"app.kubernetes.io/name": self.uid})
    deployment = k8s.V1Deployment(spec=deployment_spec,
                                  metadata=deployment_metadata)
    return k8s.AppsV1Api(self.client).create_namespaced_deployment(
        self.namespace, deployment)
def submit(self):
    """
    Submit a build pod to create the image for the repository.

    Progress of the build can be monitored by listening for items in
    the Queue passed to the constructor as `q`.
    """
    # Mount the host's docker socket so the builder container can drive the
    # local docker daemon directly.
    volume_mounts = [
        client.V1VolumeMount(mount_path="/var/run/docker.sock", name="docker-socket")
    ]
    docker_socket_path = urlparse(self.docker_host).path
    volumes = [
        client.V1Volume(
            name="docker-socket",
            host_path=client.V1HostPathVolumeSource(
                path=docker_socket_path, type="Socket"),
        )
    ]

    # Optionally mount registry credentials so the built image can be pushed.
    if self.push_secret:
        volume_mounts.append(
            client.V1VolumeMount(mount_path="/root/.docker", name="docker-config"))
        volumes.append(
            client.V1Volume(
                name="docker-config",
                secret=client.V1SecretVolumeSource(
                    secret_name=self.push_secret),
            ))

    env = []
    if self.git_credentials:
        # Forwarded to the builder for cloning private repositories.
        env.append(
            client.V1EnvVar(name="GIT_CREDENTIAL_ENV", value=self.git_credentials))

    self.pod = client.V1Pod(
        metadata=client.V1ObjectMeta(
            name=self.name,
            labels={
                "name": self.name,
                "component": self._component_label,
            },
            annotations={
                "binder-repo": self.repo_url,
            },
        ),
        spec=client.V1PodSpec(
            containers=[
                client.V1Container(
                    image=self.build_image,
                    name="builder",
                    args=self.get_cmd(),
                    volume_mounts=volume_mounts,
                    resources=client.V1ResourceRequirements(
                        limits={"memory": self.memory_limit},
                        requests={"memory": self.memory_request},
                    ),
                    env=env,
                )
            ],
            # Tolerate user-dedicated node taints so builds can run there.
            tolerations=[
                client.V1Toleration(
                    key="hub.jupyter.org/dedicated",
                    operator="Equal",
                    value="user",
                    effect="NoSchedule",
                ),
                # GKE currently does not permit creating taints on a node pool
                # with a `/` in the key field
                client.V1Toleration(
                    key="hub.jupyter.org_dedicated",
                    operator="Equal",
                    value="user",
                    effect="NoSchedule",
                ),
            ],
            node_selector=self.node_selector,
            volumes=volumes,
            restart_policy="Never",
            affinity=self.get_affinity(),
        ),
    )
    try:
        _ = self.api.create_namespaced_pod(
            self.namespace,
            self.pod,
            _request_timeout=KUBE_REQUEST_TIMEOUT,
        )
    except client.rest.ApiException as e:
        if e.status == 409:
            # Someone else created it!
            app_log.info("Build %s already running", self.name)
            pass
        else:
            raise
    else:
        app_log.info("Started build %s", self.name)

    app_log.info("Watching build pod %s", self.name)
    # Re-open the (30s) watch stream until the pod completes or we are asked
    # to stop; progress events are translated from pod phase changes.
    while not self.stop_event.is_set():
        w = watch.Watch()
        try:
            for f in w.stream(
                self.api.list_namespaced_pod,
                self.namespace,
                label_selector=f"name={self.name}",
                timeout_seconds=30,
                _request_timeout=KUBE_REQUEST_TIMEOUT,
            ):
                if f["type"] == "DELETED":
                    # Assume this is a successful completion
                    self.progress(
                        ProgressEvent.Kind.BUILD_STATUS_CHANGE,
                        ProgressEvent.BuildStatus.COMPLETED,
                    )
                    return
                self.pod = f["object"]
                if not self.stop_event.is_set():
                    # Account for all the phases kubernetes pods can be in
                    # Pending, Running, Succeeded, Failed, Unknown
                    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
                    phase = self.pod.status.phase
                    if phase == "Pending":
                        self.progress(
                            ProgressEvent.Kind.BUILD_STATUS_CHANGE,
                            ProgressEvent.BuildStatus.PENDING,
                        )
                    elif phase == "Running":
                        self.progress(
                            ProgressEvent.Kind.BUILD_STATUS_CHANGE,
                            ProgressEvent.BuildStatus.RUNNING,
                        )
                    elif phase == "Succeeded":
                        # Do nothing! We will clean this up, and send a 'Completed' progress event
                        # when the pod has been deleted
                        pass
                    elif phase == "Failed":
                        self.progress(
                            ProgressEvent.Kind.BUILD_STATUS_CHANGE,
                            ProgressEvent.BuildStatus.FAILED,
                        )
                    elif phase == "Unknown":
                        self.progress(
                            ProgressEvent.Kind.BUILD_STATUS_CHANGE,
                            ProgressEvent.BuildStatus.UNKNOWN,
                        )
                    else:
                        # This shouldn't happen, unless k8s introduces new Phase types
                        warnings.warn(
                            f"Found unknown phase {phase} when building {self.name}"
                        )
                # Terminal phases: delete the pod (cleanup); the DELETED event
                # above then reports completion.
                if self.pod.status.phase == "Succeeded":
                    self.cleanup()
                elif self.pod.status.phase == "Failed":
                    self.cleanup()
        except Exception:
            app_log.exception("Error in watch stream for %s", self.name)
            raise
        finally:
            w.stop()
        if self.stop_event.is_set():
            app_log.info("Stopping watch of %s", self.name)
            return
def submit(self):
    """Submit a build pod to create the image for the repository."""
    # Mount the host's docker socket so the builder container can drive the
    # local docker daemon directly.
    volume_mounts = [
        client.V1VolumeMount(mount_path="/var/run/docker.sock", name="docker-socket")
    ]
    docker_socket_path = urlparse(self.docker_host).path
    volumes = [
        client.V1Volume(name="docker-socket",
                        host_path=client.V1HostPathVolumeSource(
                            path=docker_socket_path, type='Socket'))
    ]

    # Optionally mount registry credentials so the built image can be pushed.
    if self.push_secret:
        volume_mounts.append(
            client.V1VolumeMount(mount_path="/root/.docker", name='docker-push-secret'))
        volumes.append(
            client.V1Volume(name='docker-push-secret',
                            secret=client.V1SecretVolumeSource(
                                secret_name=self.push_secret)))

    env = []
    if self.git_credentials:
        # Forwarded to the builder for cloning private repositories.
        env.append(
            client.V1EnvVar(name='GIT_CREDENTIAL_ENV',
                            value=self.git_credentials))

    self.pod = client.V1Pod(
        metadata=client.V1ObjectMeta(
            name=self.name,
            labels={
                "name": self.name,
                "component": self._component_label,
            },
            annotations={
                "binder-repo": self.repo_url,
            },
        ),
        spec=client.V1PodSpec(
            containers=[
                client.V1Container(
                    image=self.build_image,
                    name="builder",
                    args=self.get_cmd(),
                    volume_mounts=volume_mounts,
                    resources=client.V1ResourceRequirements(
                        limits={'memory': self.memory_limit},
                        requests={'memory': self.memory_request},
                    ),
                    env=env)
            ],
            # Tolerate user-dedicated node taints so builds can run there.
            tolerations=[
                client.V1Toleration(
                    key='hub.jupyter.org/dedicated',
                    operator='Equal',
                    value='user',
                    effect='NoSchedule',
                ),
                # GKE currently does not permit creating taints on a node pool
                # with a `/` in the key field
                client.V1Toleration(
                    key='hub.jupyter.org_dedicated',
                    operator='Equal',
                    value='user',
                    effect='NoSchedule',
                ),
            ],
            node_selector=self.node_selector,
            volumes=volumes,
            restart_policy="Never",
            affinity=self.get_affinity()))
    try:
        # Return value is unused; a 409 below means the pod already exists.
        ret = self.api.create_namespaced_pod(
            self.namespace,
            self.pod,
            _request_timeout=KUBE_REQUEST_TIMEOUT,
        )
    except client.rest.ApiException as e:
        if e.status == 409:
            # Someone else created it!
            app_log.info("Build %s already running", self.name)
            pass
        else:
            raise
    else:
        app_log.info("Started build %s", self.name)

    app_log.info("Watching build pod %s", self.name)
    # Re-open the (30s) watch stream until the pod is deleted or we are asked
    # to stop; phase changes are forwarded as 'pod.phasechange' progress.
    while not self.stop_event.is_set():
        w = watch.Watch()
        try:
            for f in w.stream(
                self.api.list_namespaced_pod,
                self.namespace,
                label_selector="name={}".format(self.name),
                timeout_seconds=30,
                _request_timeout=KUBE_REQUEST_TIMEOUT,
            ):
                if f['type'] == 'DELETED':
                    # Pod is gone; report the terminal pseudo-phase and stop.
                    self.progress('pod.phasechange', 'Deleted')
                    return
                self.pod = f['object']
                if not self.stop_event.is_set():
                    self.progress('pod.phasechange', self.pod.status.phase)
                # Terminal phases: delete the pod (cleanup); the DELETED
                # event above then ends the watch.
                if self.pod.status.phase == 'Succeeded':
                    self.cleanup()
                elif self.pod.status.phase == 'Failed':
                    self.cleanup()
        except Exception as e:
            app_log.exception("Error in watch stream for %s", self.name)
            raise
        finally:
            w.stop()
        if self.stop_event.is_set():
            app_log.info("Stopping watch of %s", self.name)
            return
def clean_pod_template(pod_template, match_node_purpose="prefer", pod_type="worker"):
    """Normalize a pod template and check for type errors.

    Parameters
    ----------
    pod_template : kubernetes.client.V1Pod
        Pod template to normalize; a deep copy is returned, the input is
        never mutated.
    match_node_purpose : str, optional
        One of "prefer" (default), "require" or "ignore" — how strongly to
        steer pods onto nodes labelled ``k8s.dask.org/node-purpose=<pod_type>``.
    pod_type : str, optional
        Value used for the default tolerations and node-purpose affinity
        (e.g. "worker").

    Returns
    -------
    kubernetes.client.V1Pod

    Raises
    ------
    TypeError
        If ``pod_template`` is a str or dict rather than a V1Pod.
    ValueError
        If ``match_node_purpose`` is not one of the accepted values.
    """
    if isinstance(pod_template, str):
        # Bug fix: the message previously ran "got %sIf trying..." together
        # with no separator; add ". " so the hint is readable.
        msg = (
            "Expected a kubernetes.client.V1Pod object, got %s. "
            "If trying to pass a yaml filename then use "
            "KubeCluster.from_yaml"
        )
        raise TypeError(msg % pod_template)
    if isinstance(pod_template, dict):
        msg = (
            "Expected a kubernetes.client.V1Pod object, got %s. "
            "If trying to pass a dictionary specification then use "
            "KubeCluster.from_dict"
        )
        raise TypeError(msg % str(pod_template))

    pod_template = copy.deepcopy(pod_template)

    # Make sure metadata / labels / env objects exist, so they can be modified
    # later without a lot of `is None` checks
    if pod_template.metadata is None:
        pod_template.metadata = client.V1ObjectMeta()
    if pod_template.metadata.labels is None:
        pod_template.metadata.labels = {}
    if pod_template.spec.containers[0].env is None:
        pod_template.spec.containers[0].env = []

    # add default tolerations
    tolerations = [
        client.V1Toleration(
            key="k8s.dask.org/dedicated",
            operator="Equal",
            value=pod_type,
            effect="NoSchedule",
        ),
        # GKE currently does not permit creating taints on a node pool
        # with a `/` in the key field
        client.V1Toleration(
            key="k8s.dask.org_dedicated",
            operator="Equal",
            value=pod_type,
            effect="NoSchedule",
        ),
    ]

    if pod_template.spec.tolerations is None:
        pod_template.spec.tolerations = tolerations
    else:
        pod_template.spec.tolerations.extend(tolerations)

    # add default node affinity to k8s.dask.org/node-purpose=<pod_type>
    if match_node_purpose != "ignore":
        # for readability
        affinity = pod_template.spec.affinity

        if affinity is None:
            affinity = client.V1Affinity()
        if affinity.node_affinity is None:
            affinity.node_affinity = client.V1NodeAffinity()

        # a common object for both a preferred and a required node affinity
        node_selector_term = client.V1NodeSelectorTerm(
            match_expressions=[
                client.V1NodeSelectorRequirement(
                    key="k8s.dask.org/node-purpose", operator="In", values=[pod_type]
                )
            ]
        )

        if match_node_purpose == "require":
            if (
                affinity.node_affinity.required_during_scheduling_ignored_during_execution
                is None
            ):
                affinity.node_affinity.required_during_scheduling_ignored_during_execution = client.V1NodeSelector(
                    node_selector_terms=[]
                )
            affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms.append(
                node_selector_term
            )
        elif match_node_purpose == "prefer":
            if (
                affinity.node_affinity.preferred_during_scheduling_ignored_during_execution
                is None
            ):
                affinity.node_affinity.preferred_during_scheduling_ignored_during_execution = (
                    []
                )
            preferred_scheduling_terms = [
                client.V1PreferredSchedulingTerm(
                    preference=node_selector_term, weight=100
                )
            ]
            affinity.node_affinity.preferred_during_scheduling_ignored_during_execution.extend(
                preferred_scheduling_terms
            )
        else:
            raise ValueError(
                'Attribute must be one of "ignore", "prefer", or "require".'
            )
        pod_template.spec.affinity = affinity

    return pod_template
def update_deploy_v2():
    """Flask handler: apply one mutation action to a deployment.

    Reads a JSON body with ``namespace``, ``deploy_name``, ``action`` and
    action-specific fields, builds the matching kubernetes client object
    (toleration / affinity / image / replicas) and delegates the actual
    patch to ``update_deployment_v2``.

    Bug fixes vs. the original: a stray no-op ``client.V1Affinity``
    expression was removed, ``values == None`` (comparison used as a
    statement) is now an assignment, and the add_toleration debug print
    now prints the parsed input ``t`` instead of the still-None outer
    ``toleration`` variable.
    """
    data = json.loads(request.get_data().decode('UTF-8'))
    current_app.logger.debug("接受到的数据:{}".format(data))
    namespace = handle_input(data.get('namespace'))
    deploy_name = handle_input(data.get('deploy_name'))
    action = handle_input(data.get('action'))
    # Only the field(s) relevant to ``action`` are filled in below; the rest
    # remain None and are passed through unchanged.
    image = None
    replicas = None
    toleration = None
    pod_anti_affinity = None
    pod_affinity = None
    node_affinity = None
    labels = None
    if action == "add_pod_anti_affinity":
        print("正在运行{}操作".format(action))
        affinity = handle_input(data.get('pod_anti_affinity'))
        affinity_type = handle_input(affinity.get('type'))
        labelSelector = handle_input(affinity.get('labelSelector'))
        key = handle_input(affinity.get('key'))
        value = handle_input(affinity.get('value'))
        topologyKey = handle_input(affinity.get('topologyKey'))
        if affinity_type == "required":
            if labelSelector == "matchExpressions":
                if not isinstance(value, list):
                    value = [value]
                operator = handle_input(affinity.get('operator'))
                # Only In/NotIn accept a values list.
                if operator != 'In' and operator != 'NotIn':
                    value = None
                print(value)
                label_selector = client.V1LabelSelector(match_expressions=[
                    client.V1LabelSelectorRequirement(
                        key=key, operator=operator, values=value)
                ])
            elif labelSelector == "matchLabels":
                if isinstance(value, list):
                    return jsonify(
                        {"error": "{}模式下不支持values设置为数组".format(labelSelector)})
                label_selector = client.V1LabelSelector(
                    match_labels={key: value})
            else:
                return jsonify(
                    {"error": "不支持{} labelSelector".format(labelSelector)})
            pod_anti_affinity = client.V1PodAntiAffinity(
                required_during_scheduling_ignored_during_execution=[
                    client.V1PodAffinityTerm(label_selector=label_selector,
                                             topology_key=topologyKey)
                ])
            print("添加的互斥调度为:{}".format(pod_anti_affinity))
        elif affinity_type == "preferred":
            weight = string_to_int(handle_input(affinity.get('weight')))
            if weight is None:
                return jsonify(
                    {"error": "{}类型必须设置weight".format(affinity_type)})
            if labelSelector == "matchExpressions":
                if not isinstance(value, list):
                    value = [value]
                operator = handle_input(affinity.get('operator'))
                # Only In/NotIn accept a values list.
                if operator != 'In' and operator != 'NotIn':
                    value = None
                label_selector = client.V1LabelSelector(match_expressions=[
                    client.V1LabelSelectorRequirement(
                        key=key, operator=operator, values=value)
                ])
            elif labelSelector == "matchLabels":
                if isinstance(value, list):
                    return jsonify(
                        {"error": "{}模式下不支持values设置为数组".format(labelSelector)})
                label_selector = client.V1LabelSelector(
                    match_labels={key: value})
            else:
                return jsonify(
                    {"error": "不支持{} labelSelector".format(labelSelector)})
            pod_anti_affinity = client.V1PodAntiAffinity(
                preferred_during_scheduling_ignored_during_execution=[
                    client.V1WeightedPodAffinityTerm(
                        pod_affinity_term=client.V1PodAffinityTerm(
                            label_selector=label_selector,
                            topology_key=topologyKey),
                        weight=weight)
                ])
            print("添加的互斥调度为:{}".format(pod_anti_affinity))
        else:
            return jsonify({"error": "不支持{}这种调度".format(affinity_type)})
    elif action == "delete_pod_anti_affinity":
        print("正在运行{}操作".format(action))
        pass
    elif action == "add_node_affinity":
        current_app.logger.debug("正在运行{}操作".format(action))
        affinity = handle_input(data.get('node_affinity'))
        node_affinity_type = handle_input(affinity.get('type'))
        nodeSelector = handle_input(affinity.get('nodeSelector'))
        key = handle_input(affinity.get('key'))
        value = handle_input(affinity.get('value'))
        operator = handle_input(affinity.get('operator'))
        values = []
        if operator == 'Exists' or operator == 'DoesNotExist':
            # Bug fix: was ``values == None`` (a no-op comparison); these
            # operators must not carry a values list.
            values = None
        else:
            if not isinstance(value, list):
                values.append(value)
            else:
                values = value
        if node_affinity_type == "preferred":
            weight = string_to_int(handle_input(affinity.get('weight')))
            if weight is None:
                return simple_error_handle(
                    "{}类型必须设置weight".format(node_affinity_type))
            preferred_term = []
            if nodeSelector == "matchExpressions":
                match_expressions = []
                expression = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_expressions.append(expression)
                preference = client.V1NodeSelectorTerm(
                    match_expressions=match_expressions)
            # nodeSelector == "matchFields"
            else:
                match_fields = []
                field = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_fields.append(field)
                preference = client.V1NodeSelectorTerm(
                    match_fields=match_fields)
            term = client.V1PreferredSchedulingTerm(
                weight=weight,
                preference=preference,
            )
            preferred_term.append(term)
            node_affinity = client.V1NodeAffinity(
                # appended directly
                preferred_during_scheduling_ignored_during_execution=
                preferred_term)
        elif node_affinity_type == "required":
            current_app.logger.debug(
                "node_affinity_type:{}".format(node_affinity_type))
            node_selector_terms = []
            if nodeSelector == "matchExpressions":
                match_expressions = []
                expression = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_expressions.append(expression)
                term = client.V1NodeSelectorTerm(
                    match_expressions=match_expressions)
            else:
                match_fields = []
                field = client.V1NodeSelectorRequirement(
                    key=key,
                    operator=operator,
                    values=values,
                )
                match_fields.append(field)
                term = client.V1NodeSelectorTerm(match_fields=match_fields)
            node_selector_terms.append(term)
            node_affinity = client.V1NodeAffinity(
                required_during_scheduling_ignored_during_execution=client.
                V1NodeSelector(node_selector_terms=node_selector_terms))
        else:
            return simple_error_handle("不支持{}这种调度".format(node_affinity_type))
    elif action == "delete_node_affinity":
        print("正在运行{}操作".format(action))
        pass
    elif action == "add_toleration":
        print("正在运行{}操作".format(action))
        t = handle_input(data.get("toleration"))
        # Bug fix: previously printed the outer ``toleration`` variable,
        # which is still None at this point.
        print(type(t), t)
        effect = t.get('effect')
        key = t.get('key')
        operator = t.get('operator')
        value = t.get('value')
        toleration_seconds = handle_toleraion_seconds(
            t.get('toleration_seconds'))
        print("toleration_seconds:{}".format(toleration_seconds))
        toleration = client.V1Toleration(effect=effect,
                                         key=key,
                                         operator=operator,
                                         toleration_seconds=toleration_seconds,
                                         value=value)
        print(toleration)
        # NOTE(review): a V1Toleration instance is always truthy, so this
        # guard can never fire - confirm the intended validation.
        if not toleration:
            msg = "{}需要提供toleration(effect,key,operator,value,)".format(action)
            return jsonify({"error": msg})
    elif action == "delete_toleration":
        print("正在运行{}操作".format(action))
        t = handle_input(data.get("toleration"))
        effect = handle_toleration_item(t.get('effect'))
        key = handle_toleration_item(t.get('key'))
        operator = handle_toleration_item(t.get('operator'))
        value = handle_toleration_item(t.get('value'))
        toleration_seconds = handle_toleraion_seconds(
            t.get('toleration_seconds'))
        print("toleration_seconds:{}".format(toleration_seconds))
        # if (effect != None and key != None and operator != None):
        toleration = client.V1Toleration(effect=effect,
                                         key=key,
                                         operator=operator,
                                         toleration_seconds=toleration_seconds,
                                         value=value)
        # NOTE(review): same dead truthiness guard as add_toleration.
        if not toleration:
            msg = "{}需要提供toleration(effect,key,operator,value,)".format(action)
            return jsonify({"error": msg})
    elif action == "add_pod_affinity":
        pass
    elif action == "delete_pod_affinity":
        pass
    elif action == "update_replicas":
        replicas = handle_input(data.get('replicas'))
        if not replicas:
            msg = "{}需要提供replicas".format(action)
            return jsonify({"error": msg})
    elif action == "update_image":
        project = handle_input(data.get('project'))
        env = handle_input(data.get('env'))
        imageRepo = handle_input(data.get('imageRepo'))
        imageName = handle_input(data.get('imageName'))
        imageTag = handle_input(data.get('imageTag'))
        if (imageRepo != None and project != None and env != None
                and imageName != None and imageTag != None):
            image = "{}/{}-{}/{}:{}".format(imageRepo, project, env,
                                            imageName, imageTag)
            print("image值{}".format(image))
        if not image:
            msg = "{}需要提供image".format(action)
            return jsonify({"error": msg})
    elif action == "add_labels":
        pass
    elif action == "delete_labels":
        pass
    else:
        msg = "暂时不支持{}操作".format(action)
        print(msg)
        return jsonify({"error": msg})
    return update_deployment_v2(deploy_name=deploy_name, namespace=namespace,
                                action=action, image=image,
                                replicas=replicas, toleration=toleration,
                                node_affinity=node_affinity,
                                pod_anti_affinity=pod_anti_affinity,
                                pod_affinity=pod_affinity, labels=labels)
def generate_pod():
    """Build a production-style V1Pod fixture for a platform app replica."""
    owner_reference = client.V1OwnerReference(
        api_version="apps/v1",
        kind="ReplicaSet",
        name="platform-app-958795556",
        uid="35ba938b-681d-11eb-a74a-16e1a04d726b",
        controller=True,
        block_owner_deletion=True,
    )
    pod_labels = {
        "app": "platform",
        "chart": "platform",
        "component": "app",
        "heritage": "Helm",
        "pod-template-hash": "958795556",
        "release": "platform-production",
        "version": "1.0.3",
    }
    metadata = client.V1ObjectMeta(
        name="platform-app-958795556-2nqgj",
        namespace="production",
        generate_name="platform-app-958795556-",
        labels=pod_labels,
        owner_references=[owner_reference],
    )
    liveness = client.V1Probe(
        initial_delay_seconds=10,
        timeout_seconds=5,
        period_seconds=10,
        success_threshold=1,
        failure_threshold=6,
        http_get=client.V1HTTPGetAction(path="/health/liveness",
                                        port=3000,
                                        scheme="HTTP"),
    )
    readiness = client.V1Probe(
        initial_delay_seconds=10,
        timeout_seconds=5,
        period_seconds=10,
        success_threshold=2,
        failure_threshold=6,
        # NOTE: the fixture deliberately mirrors the live probe path,
        # including its spelling.
        http_get=client.V1HTTPGetAction(path="/health/readness",
                                        port=3000,
                                        scheme="HTTP"),
    )
    token_mount = client.V1VolumeMount(
        name="default-token-2cg25",
        read_only=True,
        mount_path="/var/run/secrets/kubernetes.io/serviceaccount",
    )
    container = client.V1Container(
        name="app",
        image="platform.azurecr.io/app:master",
        image_pull_policy="Always",
        termination_message_policy="File",
        termination_message_path="/dev/termination-log",
        env=[],
        resources=client.V1ResourceRequirements(
            limits={"cpu": "1200m", "memory": "1Gi"},
            requests={"cpu": "1", "memory": "768Mi"},
        ),
        ports=[client.V1ContainerPort(container_port=3000, protocol="TCP")],
        volume_mounts=[token_mount],
        liveness_probe=liveness,
        readiness_probe=readiness,
    )
    # The two standard node-condition tolerations differ only in their key.
    evict_tolerations = [
        client.V1Toleration(
            key="node.kubernetes.io/" + condition,
            operator="Exists",
            effect="NoExecute",
            toleration_seconds=300,
        )
        for condition in ("not-ready", "unreachable")
    ]
    spec = client.V1PodSpec(
        containers=[container],
        volumes=[
            client.V1Volume(
                name="default-token-2cg25",
                secret=client.V1SecretVolumeSource(
                    secret_name="default-token-2cg25", default_mode=420),
            )
        ],
        restart_policy="Always",
        termination_grace_period_seconds=30,
        dns_policy="ClusterFirst",
        service_account_name="default",
        service_account="default",
        node_name="aks-agentpool-26722002-vmss00039t",
        security_context=client.V1PodSecurityContext(run_as_user=1000,
                                                     fs_group=1000),
        scheduler_name="default-scheduler",
        tolerations=evict_tolerations,
        priority=0,
        enable_service_links=True,
    )
    return client.V1Pod(metadata=metadata, spec=spec)
def submit_job(args, command=None):
    """Create and submit a one-shot Kubernetes Job for the given container."""
    container_image = args.container
    container_name = args.name

    job = client.V1Job(api_version="batch/v1",
                       kind="Job",
                       metadata=client.V1ObjectMeta(name=container_name))
    job.status = client.V1JobStatus()

    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={
            'hugin-job': "1",
            'hugin-job-name': f'{container_name}'
        })
    )

    # Environment variables arrive as NAME=VALUE strings.
    env = []
    if args.environment:
        env = [
            client.V1EnvVar(name=name, value=value)
            for name, value in
            (env_spec.split("=", 1) for env_spec in args.environment)
        ]

    container_kwargs = {
        'name': f"container-{container_name}",
        'image': container_image,
        'env': env,
    }

    tolerations = []
    if args.gpu:
        # GPU jobs must tolerate the GPU node taint and request one device.
        tolerations.append(client.V1Toleration(
            key='nvidia.com/gpu', operator='Exists', effect='NoSchedule'))
        container_kwargs['resources'] = client.V1ResourceRequirements(
            limits={"nvidia.com/gpu": 1})

    if command or args.command:
        container_kwargs['command'] = command if command else args.command

    container = client.V1Container(**container_kwargs)

    pull_secrets = []
    if args.pull_secret is not None:
        pull_secrets.append(
            client.V1LocalObjectReference(name=args.pull_secret))

    spec_kwargs = {
        'containers': [container],
        'restart_policy': 'Never',
        'image_pull_secrets': pull_secrets,
    }
    if tolerations:
        spec_kwargs['tolerations'] = tolerations

    # Pin the job to nodes matching a single key=value selector, if given.
    if args.node_selector is not None:
        parts = args.node_selector.split("=", 1)
        if len(parts) == 2:
            requirement = client.V1NodeSelectorRequirement(
                key=parts[0], operator='In', values=[parts[1]])
            node_selector = client.V1NodeSelector(
                node_selector_terms=[
                    client.V1NodeSelectorTerm(
                        match_expressions=[requirement])
                ])
            spec_kwargs['affinity'] = client.V1Affinity(
                node_affinity=client.V1NodeAffinity(
                    required_during_scheduling_ignored_during_execution=node_selector))

    template.template.spec = client.V1PodSpec(**spec_kwargs)
    job.spec = client.V1JobSpec(ttl_seconds_after_finished=1800,
                                template=template.template)
    try:
        batch_v1.create_namespaced_job("default", job, pretty=True)
    except client.exceptions.ApiException as e:
        logging.critical(f"Failed to start job: {e.reason}")
class K8sJobEngine():
    """Job engine that runs kalytical pipelines as Kubernetes pods.

    The original text of this class was badly garbled (mangled method name,
    broken return annotation, missing ``=``/quotes, misspelled exception,
    pod spec nested inside the container constructor).  This version repairs
    those defects while keeping the original structure and strings.
    """

    def __init__(self):
        # Bug fixes: ``self.__class_name.__name__`` -> ``self.__class__.__name__``
        # and the duplicate ``self.log`` assignment later in __init__ removed.
        self.log = get_logger(self.__class__.__name__)
        if kalytical_config.kalytical_endpoint is None:
            # This is the API endpoint we send back to the pod for a
            # callback/interaction during pipeline running. It may be behind a
            # load balancer/DNS - i.e. it can't communicate with localhost.
            raise ConfigException(
                "Config is missing parameter for kalytical API endpoint!")
        try:
            # Defaults to the service account assigned to the pod.
            config.load_incluster_config()
        except ConfigException:
            # Bug fix: previously caught the misspelled ``CDonfigException``.
            self.log.warn(
                f"Could not load kube configuration from pod! Attempting to configure client with local kubeconfig={config.KUBE_CONFIG_DEFAULT_LOCATION}")
            config.load_kube_config()
        self._k8s_core_client = client.CoreV1Api()
        self._running_job_list = None

    @staticmethod
    def _get_default_container_args() -> list:
        return kalytical_config.k8spodengine_default_container_args

    @staticmethod
    def _get_default_container_uri() -> str:
        return kalytical_config.default_pipeline_image_uri

    async def submit_job(self, header_model: PipelineHeaderModel,
                         exec_uuid: str, source_uuid: str = None,
                         retry_count: int = 0) -> RunningPipelineModel:
        """Create a pod for one pipeline execution and return its model."""
        self.log.info(
            f"Attempting to submit pod for pipeline_uuid={header_model.pipeline_uuid}")
        job_pod = self.marshall_k8s_pod(
            header_model=header_model, exec_uuid=exec_uuid,
            source_uuid=source_uuid, retry_count=retry_count)
        # TODO Handle cases where pod create fails - i.e. resource starvation
        pod_resp = self._k8s_core_client.create_namespaced_pod(
            namespace=kalytical_config.k8spodengine_k8s_namespace,
            body=job_pod)
        # NOTE(review): unmarshall_pod is not defined in this class - confirm
        # it is provided elsewhere or was lost in a refactor.
        return self.unmarshall_pod(pod_obj=pod_resp)

    def marshall_k8s_pod(self, header_model: PipelineHeaderModel,
                         exec_uuid: str, source_uuid: str = None,
                         retry_count: int = 0) -> client.V1Pod:
        """Build the V1Pod describing one pipeline execution attempt.

        Made synchronous (was ``async``) because ``submit_job`` calls it
        without ``await`` and it performs no I/O.
        """
        # Bug fix: str.join takes a single iterable, not three arguments.
        common_job_name = '-'.join(
            [exec_uuid, header_model.pipeline_uuid, str(retry_count)])
        if 'pipeline_args' in header_model.engine_args.keys():
            pipeline_args = header_model.engine_args['pipeline_args']
        else:
            # NOTE(review): original called the undefined
            # ``_get_default_engine_args``; mapped to the defined helper.
            pipeline_args = self._get_default_container_args()
        if 'pipeline_command' in header_model.engine_args.keys():
            pipeline_command = header_model.engine_args['pipeline_command']
        else:
            # NOTE(review): _get_default_container_command is not defined in
            # this class - confirm where it should come from.
            pipeline_command = self._get_default_container_command()
        if 'pipeline_image' in header_model.engine_args.keys():
            pipeline_image = header_model.engine_args['pipeline_image']
        else:
            pipeline_image = self._get_default_container_uri()

        container_spec = client.V1Container(
            name=common_job_name,
            image=pipeline_image,
            args=pipeline_args,
            command=pipeline_command,
            env=[
                client.V1EnvVar(name="PIPELINE_UUID",
                                value=header_model.pipeline_uuid),
                client.V1EnvVar(name="SOURCE_UUID",
                                value=json.dumps(json.dumps(source_uuid))),
                client.V1EnvVar(name="EXEC_UUID", value=exec_uuid),
                client.V1EnvVar(name="RETRY_COUNT", value=str(retry_count)),
                client.V1EnvVar(name="MQ_CALLBACK_URL",
                                value=kalytical_config.mq_url),
                client.V1EnvVar(name="KALYTICAL_AUTH_SECRET",
                                value=kalytical_config.api_secret),
                client.V1EnvVar(name="KALYTICAL_API_ENDPOINT",
                                value=kalytical_config.api_endpoint)
            ],
            # Bug fix: keyword is ``limits``, not ``limit``.
            resources=client.V1ResourceRequirements(
                limits={'cpu': header_model.engine_args['cpu_count'],
                        'memory': header_model.engine_args['memory_gi']}))
        # TODO Tolerations and selectors might not work for a generic use case
        pod_spec = client.V1PodSpec(
            service_account_name=kalytical_config.k8spodengine_svc_account_name,
            # NOTE(review): "kalkytical" looks misspelled but must match the
            # actual node label - confirm before renaming.
            node_selector={
                "kalkytical.k8s.node/workload": "pipeline",
                "beta.kubernetes.io/instance-type":
                    header_model.engine_args['instance_type']},
            tolerations=[client.V1Toleration(
                key="node.kubernetes.io/pipeline",
                operator="Exists",
                effect='NoSchedule')],
            security_context=client.V1PodSecurityContext(fs_group=100),
            restart_policy='Never',
            containers=[container_spec])
        return client.V1Pod(
            spec=pod_spec,
            metadata=client.V1ObjectMeta(
                name=common_job_name,
                labels={"pod_source": "kalytical",
                        "exec_uuid": exec_uuid,
                        "pipeline_uuid": header_model.pipeline_uuid}))
def make_pod_spec( image, labels={}, threads_per_worker=1, env={}, extra_container_config={}, extra_pod_config={}, memory_limit=None, memory_request=None, cpu_limit=None, cpu_request=None, ): """ Create generic pod template from input parameters Examples -------- >>> make_pod_spec(image='daskdev/dask:latest', memory_limit='4G', memory_request='4G') """ args = [ 'dask-worker', '$(DASK_SCHEDULER_ADDRESS)', '--nthreads', str(threads_per_worker), '--death-timeout', '60', ] if memory_limit: args.extend(['--memory-limit', str(memory_limit)]) pod = client.V1Pod( metadata=client.V1ObjectMeta(labels=labels), spec=client.V1PodSpec( restart_policy='Never', containers=[ client.V1Container( name='dask-worker', image=image, args=args, env=[ client.V1EnvVar(name=k, value=v) for k, v in env.items() ], ) ], tolerations=[ client.V1Toleration( key='k8s.dask.org/dedicated', operator='Equal', value='worker', effect='NoSchedule', ), # GKE currently does not permit creating taints on a node pool # with a `/` in the key field client.V1Toleration( key='k8s.dask.org_dedicated', operator='Equal', value='worker', effect='NoSchedule', ), ])) resources = client.V1ResourceRequirements(limits={}, requests={}) if cpu_request: resources.requests['cpu'] = cpu_request if memory_request: resources.requests['memory'] = memory_request if cpu_limit: resources.limits['cpu'] = cpu_limit if memory_limit: resources.limits['memory'] = memory_limit pod.spec.containers[0].resources = resources for key, value in extra_container_config.items(): _set_k8s_attribute(pod.spec.containers[0], key, value) for key, value in extra_pod_config.items(): _set_k8s_attribute(pod.spec, key, value) return pod
def from_runs(cls, id: str, runs: List[Run]):
    """Assemble the k8s resources (deployment, service, ingress) that back a
    tensorboard instance serving the output of *runs*, and return them
    wrapped in a new instance via ``cls(...)``.
    """
    k8s_name = 'tensorboard-' + id
    run_names_hash = K8STensorboardInstance.generate_run_names_hash(runs)

    # One mount per run, exposing <owner>/<name> under the tensorboard logdir.
    volume_mounts = [
        k8s.V1VolumeMount(
            name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
            mount_path=os.path.join(
                cls.TENSORBOARD_CONTAINER_MOUNT_PATH_PREFIX,
                run.owner, run.name),
            sub_path=os.path.join(run.owner, run.name))
        for run in runs
    ]

    # Labels shared by the service and ingress; the deployment additionally
    # carries the hash of the served run names.
    common_labels = {
        'name': k8s_name,
        'type': 'nauta-tensorboard',
        'nauta_app_name': 'tensorboard',
        'id': id
    }
    deployment_labels = {**common_labels, 'runs-hash': run_names_hash}

    tensorboard_command = [
        "tensorboard",
        "--logdir", cls.TENSORBOARD_CONTAINER_MOUNT_PATH_PREFIX,
        "--port", "6006",
        "--host", "127.0.0.1"
    ]

    nauta_config = NautaPlatformConfig.incluster_init()
    tensorboard_image = nauta_config.get_tensorboard_image()
    tensorboard_proxy_image = nauta_config.get_activity_proxy_image()

    # Pin the pod to master nodes (and tolerate the master taint).
    master_affinity = k8s.V1Affinity(
        node_affinity=k8s.V1NodeAffinity(
            required_during_scheduling_ignored_during_execution=(
                k8s.V1NodeSelector(node_selector_terms=[
                    k8s.V1NodeSelectorTerm(match_expressions=[
                        k8s.V1NodeSelectorRequirement(
                            key="master", operator="In", values=["True"])
                    ])
                ]))))

    tensorboard_container = k8s.V1Container(
        name='app',
        image=tensorboard_image,
        command=tensorboard_command,
        volume_mounts=volume_mounts)

    proxy_container = k8s.V1Container(
        name='proxy',
        image=tensorboard_proxy_image,
        ports=[k8s.V1ContainerPort(container_port=80)],
        readiness_probe=k8s.V1Probe(
            period_seconds=5,
            http_get=k8s.V1HTTPGetAction(path='/healthz', port=80)))

    output_volume = k8s.V1Volume(
        name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
        persistent_volume_claim=k8s.V1PersistentVolumeClaimVolumeSource(
            claim_name=cls.EXPERIMENTS_OUTPUT_VOLUME_NAME,
            read_only=True))

    deployment = k8s.V1Deployment(
        api_version='apps/v1',
        kind='Deployment',
        metadata=k8s.V1ObjectMeta(name=k8s_name, labels=deployment_labels),
        spec=k8s.V1DeploymentSpec(
            replicas=1,
            selector=k8s.V1LabelSelector(match_labels=deployment_labels),
            template=k8s.V1PodTemplateSpec(
                metadata=k8s.V1ObjectMeta(labels=deployment_labels),
                spec=k8s.V1PodSpec(
                    tolerations=[
                        k8s.V1Toleration(key='master',
                                         operator='Exists',
                                         effect='NoSchedule')
                    ],
                    affinity=master_affinity,
                    containers=[tensorboard_container, proxy_container],
                    volumes=[output_volume]))))

    service = k8s.V1Service(
        api_version='v1',
        kind='Service',
        metadata=k8s.V1ObjectMeta(name=k8s_name, labels=dict(common_labels)),
        spec=k8s.V1ServiceSpec(
            type='ClusterIP',
            ports=[k8s.V1ServicePort(name='web', port=80, target_port=80)],
            selector=dict(common_labels)))

    ingress = k8s.V1beta1Ingress(
        api_version='extensions/v1beta1',
        kind='Ingress',
        metadata=k8s.V1ObjectMeta(
            name=k8s_name,
            labels=dict(common_labels),
            annotations={
                'nauta.ingress.kubernetes.io/rewrite-target': '/',
                'kubernetes.io/ingress.class': 'nauta-ingress'
            }),
        spec=k8s.V1beta1IngressSpec(rules=[
            k8s.V1beta1IngressRule(
                host='localhost',
                http=k8s.V1beta1HTTPIngressRuleValue(paths=[
                    k8s.V1beta1HTTPIngressPath(
                        path='/tb/' + id + "/",
                        backend=k8s.V1beta1IngressBackend(
                            service_name=k8s_name, service_port=80))
                ]))
        ]))

    return cls(deployment=deployment, service=service, ingress=ingress)