def create_cluster(**kwargs):
    """Start a dask ``KubeCluster`` whose worker pods are built from the task inputs.

    Worker count, cores, memory, image and pull secrets are all read from
    the module-level ``task`` object's ``inputs`` mapping.
    """
    inputs = task['inputs']
    n_workers = inputs.get('workers', 0)
    cores = inputs.get('worker_cores', 2)
    mem = inputs.get('worker_memory', 2)
    worker_image = inputs.get('worker_image', 'daskdev/dask:latest')

    # Identical requests and limits pin each worker to a fixed footprint.
    requirements = {
        'cpu': str(cores),
        'memory': str(mem),
    }

    worker_container = client.V1Container(
        name='dask',
        image=worker_image,
        args=[
            'dask-worker',
            '--nthreads', str(cpu_to_threads(cores)),
            '--no-bokeh',
            '--memory-limit', f'{mem}B',
            '--death-timeout', '60',
        ],
        resources=client.V1ResourceRequirements(
            limits=requirements,
            requests=requirements,
        ),
    )

    pull_secrets = [
        client.V1LocalObjectReference(name=secret_name)
        for secret_name in inputs.get('pull_secrets', ['docker'])
    ]

    template = client.V1Pod(
        metadata=client.V1ObjectMeta(
            labels={
                'cowait/task': 'worker-' + task.get('id'),
                'cowait/parent': task.get('id'),
            },
        ),
        spec=client.V1PodSpec(
            restart_policy='Never',
            image_pull_secrets=pull_secrets,
            containers=[worker_container],
        ),
    )

    return KubeCluster(
        pod_template=template,
        n_workers=n_workers,
    )
def deploy_function(function: DaskCluster, secrets=None):
    """Deploy a dask cluster on Kubernetes for the given DaskCluster function.

    Builds a worker pod template from ``function.spec``, optionally exposes
    the scheduler service as a NodePort, starts a ``KubeCluster`` and records
    the scheduler address / node ports on ``function.status``.

    :param function: DaskCluster runtime object describing the cluster spec.
    :param secrets:  unused; kept for interface compatibility.
    :returns: the started ``KubeCluster`` instance.
    :raises ImportError: when dask / dask_kubernetes are not installed.
    """
    try:
        from dask_kubernetes import KubeCluster, make_pod_spec  # noqa: F401
        from dask.distributed import Client, default_client  # noqa: F401
        from kubernetes_asyncio import client
        import dask
    except ImportError as e:
        # BUG FIX: print() does not %-interpolate its arguments, so the old
        # call emitted a literal '%s'; format the message explicitly instead.
        print('missing dask or dask_kubernetes, please run '
              f'"pip install dask distributed dask_kubernetes", {e}')
        # Bare raise preserves the original traceback (vs. `raise e`).
        raise

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or 'daskdev/dask:latest'
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        # NOTE(review): appends the pip requirement spec to the env list —
        # presumably an env entry consumed by the dask image; verify.
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function)
    args = ['dask-worker', '--nthreads', str(spec.nthreads)]
    if spec.args:
        args += spec.args

    container = client.V1Container(
        name='base',
        image=image,
        env=env,
        args=args,
        image_pull_policy=spec.image_pull_policy,
        volume_mounts=spec.volume_mounts,
        resources=spec.resources)
    pod_spec = client.V1PodSpec(
        containers=[container],
        restart_policy='Never',
        volumes=spec.volumes,
        service_account=spec.service_account)
    if spec.image_pull_secret:
        pod_spec.image_pull_secrets = [
            client.V1LocalObjectReference(name=spec.image_pull_secret)]

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        # annotations=meta.annotation),
        spec=pod_spec)

    # Scheduler service template: optionally expose it as a NodePort service.
    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            spec.service_type = 'NodePort'
            svc_temp['spec']['ports'][1]['nodePort'] = spec.node_port
        update_in(svc_temp, 'spec.type', spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set({
        "kubernetes.scheduler-service-template": svc_temp,
        'kubernetes.name': 'mlrun-' + norm_name + '-{uuid}'})

    cluster = KubeCluster(
        pod,
        deploy_mode='remote',
        namespace=namespace,
        scheduler_timeout=spec.scheduler_timeout)

    logger.info('cluster {} started at {}'.format(
        cluster.name, cluster.scheduler_address
    ))
    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name
    if spec.service_type == 'NodePort':
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {'scheduler': ports[0].node_port,
                                      'dashboard': ports[1].node_port}

    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        # No fixed replica count: autoscale between the configured bounds.
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)
    return cluster
def deploy_function(function: DaskCluster, secrets=None):
    # TODO: why is this here :|
    """Deploy a dask cluster on Kubernetes for the given DaskCluster function.

    Builds a worker pod template from ``function.spec`` (forwarding the memory
    limit to ``dask-worker --memory-limit``), optionally exposes the scheduler
    service as a NodePort, starts a ``KubeCluster`` and records the scheduler
    address / node ports on ``function.status``.

    :param function: DaskCluster runtime object describing the cluster spec.
    :param secrets:  unused; kept for interface compatibility.
    :returns: the started ``KubeCluster`` instance.
    :raises ImportError: when dask / dask_kubernetes are not installed.
    """
    try:
        from dask_kubernetes import KubeCluster, make_pod_spec  # noqa: F401
        from dask.distributed import Client, default_client  # noqa: F401
        from kubernetes_asyncio import client
        import dask
    except ImportError as e:
        # BUG FIX: print() does not %-interpolate its arguments, so the old
        # call emitted a literal '%s'; format the message explicitly instead.
        print(
            "missing dask or dask_kubernetes, please run "
            f'"pip install dask distributed dask_kubernetes", {e}'
        )
        # Bare raise preserves the original traceback (vs. `raise e`).
        raise

    spec = function.spec
    meta = function.metadata
    spec.remote = True

    image = function.full_image_path() or "daskdev/dask:latest"
    env = spec.env
    namespace = meta.namespace or config.namespace
    if spec.extra_pip:
        # NOTE(review): appends the pip requirement spec to the env list —
        # presumably an env entry consumed by the dask image; verify.
        env.append(spec.extra_pip)

    pod_labels = get_resource_labels(function, scrape_metrics=False)
    args = ["dask-worker", "--nthreads", str(spec.nthreads)]
    memory_limit = spec.resources.get("limits", {}).get("memory")
    if memory_limit:
        # Cap dask's worker memory manager at the container memory limit.
        args.extend(["--memory-limit", str(memory_limit)])
    if spec.args:
        args.extend(spec.args)

    container = client.V1Container(
        name="base",
        image=image,
        env=env,
        args=args,
        image_pull_policy=spec.image_pull_policy,
        volume_mounts=spec.volume_mounts,
        resources=spec.resources,
    )
    pod_spec = client.V1PodSpec(
        containers=[container],
        restart_policy="Never",
        volumes=spec.volumes,
        service_account=spec.service_account,
    )
    if spec.image_pull_secret:
        pod_spec.image_pull_secrets = [
            client.V1LocalObjectReference(name=spec.image_pull_secret)
        ]

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(namespace=namespace, labels=pod_labels),
        # annotations=meta.annotation),
        spec=pod_spec,
    )

    # Scheduler service template: optionally expose it as a NodePort service.
    svc_temp = dask.config.get("kubernetes.scheduler-service-template")
    if spec.service_type or spec.node_port:
        if spec.node_port:
            spec.service_type = "NodePort"
            svc_temp["spec"]["ports"][1]["nodePort"] = spec.node_port
        update_in(svc_temp, "spec.type", spec.service_type)

    norm_name = normalize_name(meta.name)
    dask.config.set(
        {
            "kubernetes.scheduler-service-template": svc_temp,
            "kubernetes.name": "mlrun-" + norm_name + "-{uuid}",
        }
    )

    cluster = KubeCluster(
        pod,
        deploy_mode="remote",
        namespace=namespace,
        scheduler_timeout=spec.scheduler_timeout,
    )

    logger.info(
        "cluster {} started at {}".format(cluster.name, cluster.scheduler_address)
    )
    function.status.scheduler_address = cluster.scheduler_address
    function.status.cluster_name = cluster.name
    if spec.service_type == "NodePort":
        ports = cluster.scheduler.service.spec.ports
        function.status.node_ports = {
            "scheduler": ports[0].node_port,
            "dashboard": ports[1].node_port,
        }

    if spec.replicas:
        cluster.scale(spec.replicas)
    else:
        # No fixed replica count: autoscale between the configured bounds.
        cluster.adapt(minimum=spec.min_replicas, maximum=spec.max_replicas)
    return cluster
async def run_inner(self, run_info):
    """Schedule a runner pod (and its proxy service) for the given run.

    This does not run the experiment itself: it asks the Kubernetes API to
    create a runner pod, and that pod runs the experiment and updates the
    database directly.
    """
    run_id = run_info['id']
    del run_info

    k8s_config.load_incluster_config()

    pod_name = self._pod_name(run_id)

    # Load configuration from configmap volume
    with open(os.path.join(self.config_dir, 'runner.pod_spec')) as stream:
        pod_spec = yaml.safe_load(stream)
    with open(os.path.join(self.config_dir, 'runner.namespace')) as stream:
        namespace = stream.read().strip()

    # Patch the template: tell the runner container which run to execute
    for spec_container in pod_spec['containers']:
        if spec_container['name'] != 'runner':
            continue
        spec_container['args'] += [str(run_id)]
        # This is mostly used by Tilt
        override_image = os.environ.get('OVERRIDE_RUNNER_IMAGE')
        if override_image:
            spec_container['image'] = override_image

    # Same labels on the pod, the service metadata, and the selector
    run_labels = {
        'app': 'run',
        'run': str(run_id),
    }

    async with k8s_client.ApiClient() as api:
        core = k8s_client.CoreV1Api(api)

        # Create a Kubernetes pod to run
        runner_pod = k8s_client.V1Pod(
            api_version='v1',
            kind='Pod',
            metadata=k8s_client.V1ObjectMeta(
                name=pod_name,
                labels=dict(run_labels),
            ),
            spec=pod_spec,
        )
        await core.create_namespaced_pod(
            namespace=namespace,
            body=runner_pod,
        )
        logger.info("Pod created: %s", pod_name)
        PROM_RUNS.inc()

        # Create a service for proxy connections
        proxy_service = k8s_client.V1Service(
            api_version='v1',
            kind='Service',
            metadata=k8s_client.V1ObjectMeta(
                name=pod_name,
                labels=dict(run_labels),
            ),
            spec=k8s_client.V1ServiceSpec(
                selector=dict(run_labels),
                ports=[
                    k8s_client.V1ServicePort(
                        protocol='TCP',
                        port=5597,
                    ),
                ],
            ),
        )
        await core.create_namespaced_service(
            namespace=namespace,
            body=proxy_service,
        )
        logger.info("Service created: %s", pod_name)
def make_pod_spec(
    image,
    labels=None,
    threads_per_worker=1,
    env=None,
    extra_container_config=None,
    extra_pod_config=None,
    memory_limit=None,
    memory_request=None,
    cpu_limit=None,
    cpu_request=None,
):
    """
    Create generic pod template from input parameters.

    Parameters
    ----------
    image : str
        Container image for the dask worker.
    labels : dict, optional
        Labels attached to the pod metadata.
    threads_per_worker : int
        Forwarded to ``dask-worker --nthreads``.
    env : dict, optional
        Environment variables for the worker container.
    extra_container_config, extra_pod_config : dict, optional
        Free-form attribute overrides applied to the container / pod spec
        via ``_set_k8s_attribute``; they run last, so they can override
        anything set above.
    memory_limit, memory_request, cpu_limit, cpu_request : optional
        Resource limits/requests; ``memory_limit`` is also forwarded to
        ``dask-worker --memory-limit``.

    Examples
    --------
    >>> make_pod_spec(image='daskdev/dask:latest', memory_limit='4G', memory_request='4G')
    """
    # BUG FIX: the previous signature used mutable default arguments
    # (labels={}, env={}, ...), which are shared across calls and can leak
    # mutations from one call into the next. Use None sentinels instead.
    labels = {} if labels is None else labels
    env = {} if env is None else env
    extra_container_config = (
        {} if extra_container_config is None else extra_container_config
    )
    extra_pod_config = {} if extra_pod_config is None else extra_pod_config

    args = [
        "dask-worker",
        "$(DASK_SCHEDULER_ADDRESS)",
        "--nthreads",
        str(threads_per_worker),
        "--death-timeout",
        "60",
    ]
    if memory_limit:
        args.extend(["--memory-limit", str(memory_limit)])

    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(labels=labels),
        spec=client.V1PodSpec(
            restart_policy="Never",
            containers=[
                client.V1Container(
                    name="dask-worker",
                    image=image,
                    args=args,
                    env=[
                        client.V1EnvVar(name=k, value=v)
                        for k, v in env.items()
                    ],
                )
            ],
        ),
    )

    # Only populate the resource fields that were actually requested.
    resources = client.V1ResourceRequirements(limits={}, requests={})
    if cpu_request:
        resources.requests["cpu"] = cpu_request
    if memory_request:
        resources.requests["memory"] = memory_request
    if cpu_limit:
        resources.limits["cpu"] = cpu_limit
    if memory_limit:
        resources.limits["memory"] = memory_limit
    pod.spec.containers[0].resources = resources

    # Apply free-form overrides last so they can touch anything above.
    for key, value in extra_container_config.items():
        _set_k8s_attribute(pod.spec.containers[0], key, value)
    for key, value in extra_pod_config.items():
        _set_k8s_attribute(pod.spec, key, value)
    return pod