def create_affinity(affinity):
    """Build a ``client.V1Affinity`` from a flexible affinity spec.

    Accepted forms for *affinity*:
      * ``None``            -> returns ``None`` (no affinity)
      * ``str``             -> shorthand for ``[{'mode': <str>}]``
      * ``dict``            -> a single affinity item
      * ``list`` of dicts   -> multiple affinity items

    Each item is normalized via ``parse_affinity_item`` (fills defaults) and
    must end up with ``'mode'`` ('stack' or 'spread') and ``'required'``
    (bool) keys. 'stack' items become pod affinity terms, 'spread' items
    become pod anti-affinity terms; 'required' selects between the
    required/preferred scheduling lists.

    Raises:
        ValueError: if *affinity* is none of the accepted types.
    """
    if affinity is None:
        return None

    if isinstance(affinity, str):
        affinities = [{'mode': affinity}]
    elif isinstance(affinity, dict):
        affinities = [affinity]
    elif isinstance(affinity, list):
        # BUG FIX: this branch used to be `pass`, leaving `affinities = []`
        # and silently discarding every item of a list input. Use the list.
        affinities = affinity
    else:
        raise ValueError('Illegal affinity definition')

    # fill with defaults
    affinities = [parse_affinity_item(item) for item in affinities]

    # sort into required/preferred, affinity/anti-affinity
    stack_req, stack_pref = [], []
    spread_req, spread_pref = [], []
    for item in affinities:
        term = create_affinity_term(item)
        if item['mode'] == 'stack':
            (stack_req if item['required'] else stack_pref).append(term)
        elif item['mode'] == 'spread':
            (spread_req if item['required'] else spread_pref).append(term)
        # NOTE(review): items with any other mode are silently dropped —
        # presumably parse_affinity_item validates mode; confirm.

    return client.V1Affinity(
        pod_affinity=client.V1PodAffinity(
            required_during_scheduling_ignored_during_execution=stack_req,
            preferred_during_scheduling_ignored_during_execution=stack_pref,
        ) if stack_req or stack_pref else None,
        pod_anti_affinity=client.V1PodAntiAffinity(
            required_during_scheduling_ignored_during_execution=spread_req,
            preferred_during_scheduling_ignored_during_execution=spread_pref,
        ) if spread_req or spread_pref else None,
    )
def _generate_affinity(self):
    """Return a fully-populated ``V1Affinity`` example.

    Covers all three affinity kinds: node affinity (one required and one
    preferred node-label rule), pod affinity (required co-location with a
    labelled pod), and pod anti-affinity (preferred avoidance of pods
    carrying a forbidden label).
    """
    # Preferred node rule: weight-1 preference for nodes whose
    # "some_node_label" is one of the possible values.
    preferred_node_term = k8s_client.V1PreferredSchedulingTerm(
        weight=1,
        preference=k8s_client.V1NodeSelectorTerm(
            match_expressions=[
                k8s_client.V1NodeSelectorRequirement(
                    key="some_node_label",
                    operator="In",
                    values=[
                        "possible-label-value-1",
                        "possible-label-value-2",
                    ],
                )
            ]
        ),
    )

    # Required node rule: only schedule on nodes with one of the
    # required label values.
    required_node_selector = k8s_client.V1NodeSelector(
        node_selector_terms=[
            k8s_client.V1NodeSelectorTerm(
                match_expressions=[
                    k8s_client.V1NodeSelectorRequirement(
                        key="some_node_label",
                        operator="In",
                        values=[
                            "required-label-value-1",
                            "required-label-value-2",
                        ],
                    )
                ]
            ),
        ]
    )

    # Required pod co-location: land next to pods matching the label,
    # restricted to two namespaces, keyed on topology "key-1".
    required_pod_term = k8s_client.V1PodAffinityTerm(
        label_selector=k8s_client.V1LabelSelector(
            match_labels={"some-pod-label-key": "some-pod-label-value"}
        ),
        namespaces=["namespace-a", "namespace-b"],
        topology_key="key-1",
    )

    # Preferred pod avoidance: weight-1 preference against pods whose
    # "some_pod_label" carries a forbidden value.
    preferred_anti_term = k8s_client.V1WeightedPodAffinityTerm(
        weight=1,
        pod_affinity_term=k8s_client.V1PodAffinityTerm(
            label_selector=k8s_client.V1LabelSelector(
                match_expressions=[
                    k8s_client.V1LabelSelectorRequirement(
                        key="some_pod_label",
                        operator="NotIn",
                        values=[
                            "forbidden-label-value-1",
                            "forbidden-label-value-2",
                        ],
                    )
                ]
            ),
            namespaces=["namespace-c"],
            topology_key="key-2",
        ),
    )

    return k8s_client.V1Affinity(
        node_affinity=k8s_client.V1NodeAffinity(
            preferred_during_scheduling_ignored_during_execution=[
                preferred_node_term
            ],
            required_during_scheduling_ignored_during_execution=(
                required_node_selector
            ),
        ),
        pod_affinity=k8s_client.V1PodAffinity(
            required_during_scheduling_ignored_during_execution=[
                required_pod_term
            ]
        ),
        pod_anti_affinity=k8s_client.V1PodAntiAffinity(
            preferred_during_scheduling_ignored_during_execution=[
                preferred_anti_term
            ]
        ),
    )
def k8s_deployment_generator(k8s_config: K8sConfiguration):
    """Build the nodemanager Deployment and its logical Container records.

    Assembles: an actuator container, one TF-Serving REST container per CPU
    model, one TF-Serving GPU container per available GPU, init containers
    that download each model, shared volumes, and a pod anti-affinity rule
    spreading replicas across nodes.

    Returns:
        (containers, deployment): the list of logical ``Container`` records
        and the ``client.V1Deployment`` object to submit.
    """
    containers = []
    k8s_containers = []

    # actuator container: needs the host docker socket mounted at /var/run
    k8s_containers.append(
        client.V1Container(
            name="nodemanager-actuator",
            image=k8s_config.actuator_image,
            ports=[
                client.V1ContainerPort(container_port=k8s_config.actuator_port)
            ],
            volume_mounts=[
                client.V1VolumeMount(name="docker-sock", mount_path="/var/run")
            ],
            image_pull_policy=k8s_config.k8s_image_pull_policy))

    # CPU serving containers: one per model, each on its own REST port
    base_port = 8501
    for i, model in enumerate(
            ConfigurationsGenerator.model_list(k8s_config.models)):
        container_name = "nodemanager-rest-cpu-" + str(i + 1)
        k8s_containers.append(
            client.V1Container(
                name=container_name,
                image=k8s_config.tfs_image,
                args=[
                    "--model_config_file=" + k8s_config.tfs_config_file_name,
                    "--rest_api_port=" + str(base_port)
                ],
                ports=[client.V1ContainerPort(container_port=base_port)],
                volume_mounts=[
                    client.V1VolumeMount(name="shared-models",
                                         mount_path=k8s_config.tfs_models_path)
                ]))
        containers.append(
            Container(model=model.name,
                      version=model.version,
                      active=False,
                      container=container_name,
                      node=None,
                      port=base_port,
                      device=Device.CPU,
                      quota=None))
        base_port += 1

    # GPU serving containers: one per GPU, each pinned to a single device
    for gpu in range(k8s_config.available_gpus):
        container_name = "nodemanager-rest-gpu-" + str(gpu + 1)
        k8s_containers.append(
            client.V1Container(
                name=container_name,
                image=k8s_config.tfs_image + "-gpu",
                args=[
                    "--model_config_file=" + k8s_config.tfs_config_file_name,
                    "--rest_api_port=" + str(base_port)
                ],
                ports=[client.V1ContainerPort(container_port=base_port)],
                volume_mounts=[
                    client.V1VolumeMount(name="shared-models",
                                         mount_path=k8s_config.tfs_models_path)
                ],
                env=[
                    # NOTE(review): NVIDIA device indices are usually
                    # 0-based; gpu + 1 skips device 0 — confirm intended.
                    client.V1EnvVar(name="NVIDIA_VISIBLE_DEVICES",
                                    value=str(gpu + 1))
                ]))
        containers.append(
            Container(model="all",
                      version=1,
                      active=False,
                      container=container_name,
                      node=None,
                      port=base_port,
                      device=Device.GPU,
                      quota=None))
        base_port += 1

    # volumes: host docker socket + an emptyDir shared by TFS containers
    volumes = [
        client.V1Volume(
            name="docker-sock",
            host_path=client.V1HostPathVolumeSource(path="/var/run")),
        client.V1Volume(name="shared-models",
                        empty_dir=client.V1EmptyDirVolumeSource())
    ]

    # pod anti-affinity: spread nodemanager pods across nodes.
    # BUG FIX: the anti-affinity rule was previously wrapped in
    # client.V1PodAffinity; the correct model type for the
    # pod_anti_affinity field is client.V1PodAntiAffinity.
    affinity = client.V1Affinity(pod_anti_affinity=client.V1PodAntiAffinity(
        required_during_scheduling_ignored_during_execution=[
            client.V1PodAffinityTerm(topology_key="kubernetes.io/hostname")
        ]))

    # init containers: fetch each model into the shared volume before start
    init_containers = []
    for i, model in enumerate(
            ConfigurationsGenerator.model_list(k8s_config.models)):
        container_name = "tfs-init-" + str(i + 1)
        init_containers.append(
            client.V1Container(
                name=container_name,
                image=k8s_config.tfs_init_image,
                args=[
                    "-f", "/home/models/", "-d",
                    "/home/models/" + model.name, "-c",
                    k8s_config.tfs_config_endpoint, "-m", model.tfs_model_url
                ],
                image_pull_policy=k8s_config.k8s_image_pull_policy,
                volume_mounts=[
                    client.V1VolumeMount(
                        name="shared-models",
                        mount_path=k8s_config.tfs_models_path)
                ]))

    # pod spec
    pod_spec = client.V1PodSpec(containers=k8s_containers,
                                volumes=volumes,
                                affinity=affinity,
                                init_containers=init_containers,
                                host_network=k8s_config.k8s_host_network,
                                dns_policy="Default")

    # pod template spec
    pod_template_spec = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={"run": "nodemanager"}),
        spec=pod_spec)

    # deployment spec
    deployment_spec = client.V1DeploymentSpec(
        selector=client.V1LabelSelector(
            match_labels={"run": "nodemanager"}),
        template=pod_template_spec,
        replicas=k8s_config.initial_replicas)

    # build deployment
    deployment = client.V1Deployment(api_version="apps/v1",
                                     kind="Deployment",
                                     metadata=client.V1ObjectMeta(
                                         name="nodemanager-deploy",
                                         labels={"run": "nodemanager"}),
                                     spec=deployment_spec)

    return containers, deployment
def spawn(self, taskdef: TaskDefinition) -> KubernetesTask:
    """Create a Kubernetes pod for *taskdef* and return it as a task.

    Emits 'prepare' before creation and 'spawn' after; optional
    ``taskdef.affinity`` ({'type': 'spread'|'group', 'label': {...}})
    becomes a preferred pod (anti-)affinity keyed on a pod label that is
    also applied to this pod so sibling tasks can match it.

    Raises:
        ProviderError: if the Kubernetes API is unreachable.
    """
    try:
        self.emit_sync('prepare', taskdef=taskdef)

        volumes, mounts = create_volumes(taskdef.volumes)

        # container definition
        container = client.V1Container(
            name=taskdef.id,
            image=taskdef.image,
            env=self.create_env(taskdef),
            ports=self.create_ports(taskdef),
            image_pull_policy='Always',  # taskdef field??
            resources=client.V1ResourceRequirements(
                requests={
                    'cpu': str(taskdef.cpu or '0'),
                    'memory': str(taskdef.memory or '0'),
                },
                limits={
                    'cpu': str(taskdef.cpu_limit or '0'),
                    'memory': str(taskdef.memory_limit or '0'),
                },
            ),
            volume_mounts=mounts,
        )

        labels = {
            LABEL_TASK_ID: taskdef.id,
            LABEL_PARENT_ID: taskdef.parent,
            **taskdef.meta,
        }

        affinity = None
        if (taskdef.affinity is not None) and (taskdef.affinity != {}):
            # pick the pod label used to match related tasks
            label_spec = taskdef.affinity.get("label")
            if label_spec:
                aff_key = label_spec["key"]
                aff_value = label_spec["value"]
            else:
                aff_key = "cowait_default_affinity_key"
                aff_value = "cowait_default_affinity_value"

            # weighted term matching pods carrying the affinity label,
            # shared by both the 'spread' and 'group' variants
            weighted_term = client.V1WeightedPodAffinityTerm(
                pod_affinity_term=client.V1PodAffinityTerm(
                    label_selector=client.V1LabelSelector(
                        match_expressions=[
                            client.V1LabelSelectorRequirement(
                                key=aff_key,
                                operator="In",
                                values=[aff_value],
                            )
                        ]),
                    topology_key="kubernetes.io/hostname",
                ),
                weight=50)

            if taskdef.affinity["type"] == 'spread':
                # prefer NOT to share a node with matching pods
                affinity = client.V1Affinity(
                    pod_anti_affinity=client.V1PodAntiAffinity(
                        preferred_during_scheduling_ignored_during_execution=[
                            weighted_term
                        ]))
            elif taskdef.affinity["type"] == 'group':
                # BUG FIX: 'group' previously wrapped its V1PodAffinity in
                # V1Affinity(pod_anti_affinity=...), which inverted the
                # intent and spread grouped tasks apart. Co-location
                # belongs in the pod_affinity field.
                affinity = client.V1Affinity(
                    pod_affinity=client.V1PodAffinity(
                        preferred_during_scheduling_ignored_during_execution=[
                            weighted_term
                        ]))
            # unknown types leave affinity as None (matches old behavior)

            # tag this pod so sibling tasks can match on the same label
            labels[aff_key] = aff_value

        pod = self.core.create_namespaced_pod(
            namespace=self.namespace,
            body=client.V1Pod(
                metadata=client.V1ObjectMeta(
                    name=taskdef.id,
                    namespace=self.namespace,
                    labels=labels,
                ),
                spec=client.V1PodSpec(
                    hostname=taskdef.id,
                    restart_policy='Never',
                    image_pull_secrets=self.get_pull_secrets(),
                    volumes=volumes,
                    affinity=affinity,
                    containers=[container],
                    service_account_name=self.service_account,
                ),
            ),
        )

        # wrap & return task
        task = KubernetesTask(self, taskdef, pod)
        self.emit_sync('spawn', task=task)
        return task

    except urllib3.exceptions.MaxRetryError as err:
        raise ProviderError('Kubernetes engine unavailable') from err
def create_run_pod(k8s_settings, run_context):
    """Create and submit the pod that executes a single run.

    The pod prefers nodes hosting other runs (weight 50), and most of all
    nodes hosting runs from the same experiment (weight 100). A required
    node-affinity rule keeps it off "system" node groups, and off "gpu"
    node groups when the run requests no GPUs.

    Returns the submitted ``V1Pod``.
    """
    run_id = run_context.id
    run_name = run_context.run.to_json()["name"]
    env = get_run_pod_env_vars(run_context)
    node_topology_key = "kubernetes.io/hostname"

    def weighted_run_term(weight, match_labels):
        # Preference term for co-locating with pods matching match_labels.
        return k8s_client.V1WeightedPodAffinityTerm(
            weight=weight,
            pod_affinity_term=k8s_client.V1PodAffinityTerm(
                label_selector=k8s_client.V1LabelSelector(
                    match_labels=match_labels,
                ),
                topology_key=node_topology_key,
            ),
        )

    labels = {
        "run-name": run_name,
        "run": run_id,
    }
    # NOTE(taylor): preference to run on nodes with other runs
    weighted_terms = [weighted_run_term(50, {"type": "run"})]

    volumes = []
    volume_mounts = []

    experiment_id = run_context.experiment
    if experiment_id:
        labels["experiment"] = experiment_id
        # NOTE(taylor): highest preference to run on nodes with runs in the
        # same experiment
        weighted_terms.append(
            weighted_run_term(100, {
                "type": "run",
                "experiment": experiment_id,
            }))

    requests = k8s_settings.resources.get("requests") or {}
    limits = k8s_settings.resources.get("limits") or {}

    unacceptable_node_group_types = ["system"]
    # NOTE(taylor): Preventing GPU-less jobs from running on GPU nodes forces
    # the cluster autoscaler to scale up CPU nodes. This prevents a situation
    # where the GPU nodes are not scaled down because they are occupied by
    # CPU workloads. The cluster autoscaler does not know that it should
    # create CPU nodes when the GPUs are unused.
    # TODO(taylor): This could cause unexpected behavior if the cluster has
    # no CPU nodes. Running CPU jobs on GPU nodes could also be an
    # opportunity for more efficient resource utilization, but is avoided
    # for now because the workloads cannot be migrated onto CPU nodes by the
    # cluster autoscaler as mentioned above.
    # NOTE(taylor): Applying a NoSchedule taint to GPU nodes is another way
    # to achieve this behavior, but does not work as well out of the box
    # with clusters that orchestrate doesn't provision. Applying a
    # PreferNoSchedule taint to GPU nodes does not resolve the workload
    # migration issue when there are no CPU nodes.
    gpuless = all(
        float(group.get("nvidia.com/gpu", 0)) == 0
        for group in (requests, limits))
    if gpuless:
        unacceptable_node_group_types.append("gpu")

    node_affinity = k8s_client.V1NodeAffinity(
        required_during_scheduling_ignored_during_execution=(
            k8s_client.V1NodeSelector(node_selector_terms=[
                k8s_client.V1NodeSelectorTerm(match_expressions=[
                    k8s_client.V1NodeSelectorRequirement(
                        key="orchestrate.sigopt.com/node-group-type",
                        operator="NotIn",
                        values=unacceptable_node_group_types,
                    )
                ], )
            ], )),
    )

    run_container = k8s_client.V1Container(
        name="model-runner",
        image=k8s_settings.image,
        resources=k8s_client.V1ResourceRequirements(**k8s_settings.resources),
        image_pull_policy="Always",
        command=[],
        args=k8s_settings.args,
        env=env,
        volume_mounts=volume_mounts,
        tty=True,
    )

    pod = k8s_client.V1Pod(
        metadata=k8s_client.V1ObjectMeta(
            owner_references=k8s_settings.owner_references,
            labels={
                "type": "run",
                **labels,
            },
            name=run_name,
        ),
        spec=k8s_client.V1PodSpec(
            affinity=k8s_client.V1Affinity(
                node_affinity=node_affinity,
                pod_affinity=k8s_client.V1PodAffinity(
                    preferred_during_scheduling_ignored_during_execution=(
                        weighted_terms),
                ),
            ),
            containers=[run_container],
            volumes=volumes,
            restart_policy="Never",
        ),
    )
    k8s_settings.api.create_namespaced_pod(k8s_settings.namespace, pod)
    return pod