def test_default_toleration_preserved(image_name):
    pod_spec = make_pod_spec(
        image=image_name,
        extra_pod_config={
            'tolerations': [
                {
                    'key': 'example.org/toleration',
                    'operator': 'Exists',
                    'effect': 'NoSchedule',
                }
            ],
        },
    )
    tolerations = pod_spec.to_dict()['spec']['tolerations']
    assert {
        'key': 'k8s.dask.org/dedicated',
        'operator': 'Equal',
        'value': 'worker',
        'effect': 'NoSchedule',
    } in tolerations
    assert {
        'key': 'k8s.dask.org_dedicated',
        'operator': 'Equal',
        'value': 'worker',
        'effect': 'NoSchedule',
    } in tolerations
    assert {
        'key': 'example.org/toleration',
        'operator': 'Exists',
        'effect': 'NoSchedule',
    } in tolerations
def test_default_toleration_preserved(image_name):
    pod_spec = clean_pod_template(
        make_pod_spec(
            image=image_name,
            extra_pod_config={
                "tolerations": [
                    {
                        "key": "example.org/toleration",
                        "operator": "Exists",
                        "effect": "NoSchedule",
                    }
                ]
            },
        )
    )
    tolerations = pod_spec.to_dict()["spec"]["tolerations"]
    assert {
        "key": "k8s.dask.org/dedicated",
        "operator": "Equal",
        "value": "worker",
        "effect": "NoSchedule",
        "toleration_seconds": None,
    } in tolerations
    assert {
        "key": "k8s.dask.org_dedicated",
        "operator": "Equal",
        "value": "worker",
        "effect": "NoSchedule",
        "toleration_seconds": None,
    } in tolerations
    assert {
        "key": "example.org/toleration",
        "operator": "Exists",
        "effect": "NoSchedule",
    } in tolerations
def pod_spec(docker_image):
    yield clean_pod_template(
        make_pod_spec(
            image=docker_image,
            extra_container_config={"imagePullPolicy": "IfNotPresent"},
        )
    )
def _build_kubernetes(self):
    from dask_kubernetes import KubeCluster, make_pod_spec

    memory = self.memory_per_worker * self.workers_per_job
    ncpus = self.cores_per_worker * self.workers_per_job
    spec = make_pod_spec(
        image=self.image,
        threads_per_worker=self.tasks_per_worker,
        memory_limit=memory,
        memory_request=memory,
        cpu_limit=ncpus,
        cpu_request=ncpus,
    )
    cluster = KubeCluster(spec, n_workers=self.num_workers, **self.cluster_kwargs)
    return cluster
def executor(self) -> Executor:
    pod_spec = make_pod_spec(
        image=self._image,
        memory_limit=self._pod_memory_limit,
        memory_request=self._pod_memory_request,
        threads_per_worker=self._pod_threads_per_worker,
        cpu_limit=self._pod_cpu_limit,
        cpu_request=self._pod_cpu_request,
        env=self._generate_env(),
    )
    # Advertise one abstract task slot per worker so tasks can be throttled
    # via Dask resource annotations.
    pod_spec.spec.containers[0].args.extend(["--resources", "TASKSLOTS=1"])
    executor = DaskExecutor(
        cluster_class=lambda: KubeCluster(pod_spec, deploy_mode=self._deploy_mode),
        adapt_kwargs={"minimum": self._adapt_min, "maximum": self._adapt_max},
    )
    return executor
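# Why "TASKSLOTS=1" above: each worker advertises a single task slot, so tasks
# annotated with the matching Dask resource run one at a time per worker. In
# Prefect 1.x, tags of the form "dask-resource:KEY=N" are translated by the
# DaskExecutor into Dask resource requests. A minimal sketch (the task body is
# an illustrative assumption, not from the original source):
from prefect import task

@task(tags=["dask-resource:TASKSLOTS=1"])
def heavy_step(x):
    # Consumes the worker's single task slot while it runs.
    return x * 2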
def pod_spec(image_name):
    yield clean_pod_template(
        make_pod_spec(
            image=image_name,
            extra_container_config={"imagePullPolicy": "IfNotPresent"},
        )
    )
def pod_spec(docker_image):
    yield make_pod_spec(
        image=docker_image,
        extra_container_config={"imagePullPolicy": "IfNotPresent"},
    )
# In[ ]:

get_ipython().system('pip freeze')


# In[ ]:

# Dask Kube GPU
import dask
from dask.distributed import Client
from dask_kubernetes import KubeCluster, make_pod_spec

#tag::worker_template_with_gpu[]
worker_template = make_pod_spec(
    image='holdenk/dask:latest',
    memory_limit='8G',
    memory_request='8G',
    cpu_limit=1,
    cpu_request=1,
)
worker_template.spec.containers[0].resources.limits["gpu"] = 1
worker_template.spec.containers[0].resources.requests["gpu"] = 1
worker_template.spec.containers[0].args[0] = "dask-cuda-worker"
# env entries must be name/value mappings, not "KEY=VALUE" strings
worker_template.spec.containers[0].env.append(
    {"name": "NVIDIA_VISIBLE_DEVICES", "value": "ALL"})
# Or append --resources "GPU=2"
#end::worker_template_with_gpu[]

#tag::worker_template_with_label[]
worker_template = make_pod_spec(
    image='holdenk/dask:latest',
    memory_limit='8G',
    memory_request='8G',
    cpu_limit=1,
    cpu_request=1,
)
worker_template.spec.node_selector = "node.kubernetes.io/gpu=gpu"
worker_template.spec.containers[0].args[0] = "dask-cuda-worker"
worker_template.spec.containers[0].env.append(
    {"name": "NVIDIA_VISIBLE_DEVICES", "value": "ALL"})
worker_template.spec.
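# A minimal sketch of the alternative hinted at by the --resources comment
# above: advertise the GPU as an abstract Dask worker resource so individual
# tasks can declare GPU requirements (the worker count here is an illustrative
# assumption, not from the original notebook):
worker_template.spec.containers[0].args.extend(["--resources", "GPU=1"])
cluster = KubeCluster(pod_template=worker_template)
cluster.scale(2)
client = Client(cluster)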
def pod_spec(image_name):
    yield make_pod_spec(
        image=image_name,
        extra_container_config={'imagePullPolicy': 'IfNotPresent'},
    )
def pod_spec(image_name):
    yield make_pod_spec(image=image_name)
def __init__(self, **kwargs):
    """
    Constructor for obtaining a client to be used when Dask is necessary

    1) Cluster with shared file system and ssh capability:
    :param hostnames : - list; list of strings containing the host names or IP addresses
                         of the machines that the user wants to use in their cluster/client
                         (the first hostname will run the scheduler!) [None]
    :param scheduler_file_prefix : - string; prefix used to create the dask scheduler-file.
                                     Must be a mounted path on all the machines.
                                     Necessary if hostnames are provided [$HOME/scheduler-]
    :param logging : - boolean; log scheduler and worker stdout to files within the
                       dask_logs folder [True]

    2) Local cluster:
    :param local_params : - dict; dictionary containing LocalCluster options
                            (see help(LocalCluster) for help) [None]
    :param n_wrks : - int; number of workers to start [1]

    3) PBS cluster:
    :param pbs_params : - dict; dictionary containing PBSCluster options
                          (see help(PBSCluster) for help) [None]
    :param n_jobs : - int; number of jobs to be submitted to the cluster
    :param n_wrks : - int; number of workers per job [1]

    4) LSF cluster:
    :param lsf_params : - dict; dictionary containing LSFCluster options
                          (see help(LSFCluster) for help) [None]
    :param n_jobs : - int; number of jobs to be submitted to the cluster
    :param n_wrks : - int; number of workers per job [1]

    5) SLURM cluster:
    :param slurm_params : - dict; dictionary containing SLURMCluster options
                            (see help(SLURMCluster) for help) [None]
    :param n_jobs : - int; number of jobs to be submitted to the cluster
    :param n_wrks : - int; number of workers per job [1]

    6) Kubernetes cluster:
    :param kube_params : - dict; dictionary containing KubeCluster options
                           (see help(KubeCluster) and help(make_pod_spec) for help) [None]
    :param n_wrks : - int; number of workers to scale the cluster

    Note that by default the Kubernetes pods are created using the Docker image
    "ettore88/occamypy:devel". To change the image to be used, provide the item
    image within the kube_params dictionary.
    """
    hostnames = kwargs.get("hostnames", None)
    local_params = kwargs.get("local_params", None)
    pbs_params = kwargs.get("pbs_params", None)
    lsf_params = kwargs.get("lsf_params", None)
    slurm_params = kwargs.get("slurm_params", None)
    kube_params = kwargs.get("kube_params", None)
    logging = kwargs.get("logging", True)
    ClusterInit = None
    cluster_params = None
    if local_params:
        cluster_params = local_params
        ClusterInit = daskD.LocalCluster
    elif pbs_params:
        cluster_params = pbs_params
        ClusterInit = PBSCluster
    elif lsf_params:
        cluster_params = lsf_params
        ClusterInit = LSFCluster
    elif slurm_params:
        cluster_params = slurm_params
        ClusterInit = SLURMCluster
    # Checking interface to be used
    if hostnames:
        if not isinstance(hostnames, list):
            raise ValueError("User must provide a list with host names")
        scheduler_file_prefix = kwargs.get("scheduler_file_prefix",
                                           os.path.expanduser("~") + "/scheduler-")
        # Random port number
        self.port = ''.join(["1"] + [str(random.randint(0, 9)) for _ in range(3)])
        # Creating logging interface
        stdout_scheduler = DEVNULL
        stdout_workers = [DEVNULL] * len(hostnames)
        if logging:
            # Creating logging folder
            try:
                os.mkdir("dask_logs")
            except OSError:
                pass
            stdout_scheduler = open("dask_logs/dask-scheduler.log", "w")
            stdout_workers = [open("dask_logs/dask-worker-%s.log" % (ii + 1), "w")
                              for ii in range(len(hostnames))]
        # Starting scheduler
        scheduler_file = "%s%s" % (scheduler_file_prefix, self.port) + ".json"
        cmd = ["ssh"] + [hostnames[0]] + \
              ["dask-scheduler"] + ["--scheduler-file"] + [scheduler_file] + \
              ["--port"] + [self.port]
        self.scheduler_proc = subprocess.Popen(cmd, stdout=stdout_scheduler,
                                               stderr=subprocess.STDOUT)
        # Checking if scheduler has started and getting tcp information
        t0 = time()
        while True:
            if os.path.isfile(scheduler_file):
                if get_tcp_info(scheduler_file):
                    break
            # If the dask scheduler has not started within 5 minutes, raise an exception
            if time() - t0 > 300.0:
                raise SystemError("Dask could not start scheduler! Try a different first host name.")
        # Creating dask Client
        self.client = daskD.Client(scheduler_file=scheduler_file)
        # Starting workers on all the other hosts
        self.worker_procs = []
        worker_ips = []
        for ii, hostname in enumerate(hostnames):
            cmd = ["ssh"] + [hostname] + ["dask-worker"] + ["--scheduler-file"] + [scheduler_file]
            # Starting worker
            self.worker_procs.append(subprocess.Popen(cmd, stdout=stdout_workers[ii],
                                                      stderr=subprocess.STDOUT))
            # Obtaining IP address of host for the started worker (necessary to resort workers)
            worker_ips.append(
                subprocess.check_output(
                    ["ssh"] + [hostname] + ["hostname -I"] + ["| awk '{print $1}'"]
                ).rstrip().decode("utf-8"))
        # Waiting until all the requested workers are up and running
        workers = 0
        requested = len(hostnames)
        t0 = time()
        while workers < requested:
            workers = len(self.client.get_worker_logs().keys())
            # If the number of workers is not reached within 5 minutes, raise an exception
            if time() - t0 > 300.0:
                raise SystemError("Dask could not start the requested workers within 5 minutes! "
                                  "Try different hostnames.")
        # Resorting worker IDs according to user-provided list
        self.WorkerIds = []
        wrkIds = list(self.client.get_worker_logs().keys())  # Unsorted worker ids
        wrk_ips = [idw.split(":")[1][2:] for idw in wrkIds]  # Unsorted ip addresses
        for ip in worker_ips:
            idx = wrk_ips.index(ip)
            self.WorkerIds.append(wrkIds[idx])
            wrkIds.pop(idx)
            wrk_ips.pop(idx)
    elif kube_params:
        n_wrks = kwargs.get("n_wrks")
        if "image" not in kube_params:
            kube_params.update({"image": 'ettore88/occamypy:devel'})
        pod_spec = make_pod_spec(**kube_params)
        self.cluster = KubeCluster(pod_spec, deploy_mode="remote")
        self.client, self.WorkerIds = client_startup(self.cluster, n_wrks, n_wrks)
    elif ClusterInit:
        n_wrks = kwargs.get("n_wrks", 1)
        if n_wrks <= 0:
            raise ValueError("n_wrks must be equal or greater than 1!")
        if "local_params" in kwargs:
            # Starting local cluster
            n_jobs = n_wrks
            n_wrks = 1
        else:
            # Starting scheduler-based clusters
            n_jobs = kwargs.get("n_jobs")
            if n_jobs <= 0:
                raise ValueError("n_jobs must be equal or greater than 1!")
            cluster_params.update({"processes": n_wrks})
            if n_wrks > 1:
                # Forcing nanny to be true (otherwise, the dask-worker command will fail)
                cluster_params.update({"nanny": True})
        self.cluster = ClusterInit(**cluster_params)
        self.client, self.WorkerIds = client_startup(self.cluster, n_jobs, n_jobs * n_wrks)
    else:
        raise ValueError("Either hostnames or local_params or pbs/lsf/slurm_params or "
                         "kube_params must be provided!")
    # Closing dask processes
    atexit.register(self.client.shutdown)
dask.config.set({
    "multiprocessing.context": "forkserver",
    "scheduler": "processes",
})
#end::dask_use_forkserver[]


# In[ ]:


#tag::make_dask_k8s_client[]
import dask
from dask.distributed import Client
from dask_kubernetes import KubeCluster, make_pod_spec

worker_template = make_pod_spec(
    image='holdenk/dask:latest',
    memory_limit='8G',
    memory_request='8G',
    cpu_limit=1,
    cpu_request=1,
    extra_container_config={"imagePullPolicy": "Always"},
)
scheduler_template = make_pod_spec(
    image='holdenk/dask:latest',
    memory_limit='4G',
    memory_request='4G',
    cpu_limit=1,
    cpu_request=1,
    extra_container_config={"imagePullPolicy": "Always"},
)

cluster = KubeCluster(pod_template=worker_template,
                      scheduler_pod_template=scheduler_template)
cluster.adapt(minimum=1)  # or create and destroy workers dynamically based on workload

from dask.distributed import Client
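# A plausible continuation (an assumption, not in the excerpt): the trailing
# import suggests the cell finishes by connecting a client to the adaptive
# cluster created above.
client = Client(cluster)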
def determine_workers(
    max_mem_size: int,
    memory_limit: int = 16,
    cpu_limit: int = 2,
    image_repo: str = 'cormorack',
    image_name: str = 'cava-dask',
    image_tag: str = '20210610',
) -> dict:
    """
    Determine dask worker spec and cluster size,
    based on total requested data size

    Parameters
    ----------
    max_mem_size : int
        Max memory requirement for the amount of data requested
    memory_limit : int
        Memory limit for the dask worker, as well as max machine memory
    cpu_limit : int
        CPU limit for the dask worker
    image_repo : str
        Docker image repository for dask worker
    image_name : str
        Docker image name for dask worker
    image_tag : str
        Docker image tag for dask worker

    Returns
    -------
    dict
        Dictionary containing the pod_spec, and min, max for number of workers
    """
    max_workers = int(np.ceil(max_mem_size / memory_limit))
    min_workers = int(np.ceil(max_workers / 10))
    image = f"{image_repo}/{image_name}:{image_tag}"

    # Determine the memory and cpu request sizes
    k8s_mem = memory_limit / 2
    k8s_cpu = cpu_limit / 2
    if max_mem_size < memory_limit:
        k8s_mem = max_mem_size
        k8s_cpu = k8s_cpu / 2

    pod_spec = make_pod_spec(
        image=image,
        labels={'app.kubernetes.io/component': 'cava-dask'},
        memory_limit=f'{memory_limit}GB',
        memory_request=f'{k8s_mem}GB',
        cpu_limit=str(cpu_limit),
        cpu_request=str(k8s_cpu),
        extra_pod_config={
            'nodeSelector': {
                'kops.k8s.io/instancegroup': 'compute'
            },
            'restartPolicy': 'Never',
        },
        extra_container_config={
            'imagePullPolicy': 'IfNotPresent',
            'name': 'cava-dask',
        },
        threads_per_worker=2,
    )
    cleaned_spec = _clean_pod_spec(pod_spec)
    return {
        'min_workers': min_workers,
        'max_workers': max_workers,
        'pod_spec': cleaned_spec,
    }
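# A minimal usage sketch of determine_workers: size an adaptive cluster from a
# requested data volume (the 64 GB figure is an illustrative assumption;
# KubeCluster comes from dask_kubernetes as used elsewhere in this section):
worker_info = determine_workers(max_mem_size=64)
cluster = KubeCluster(worker_info['pod_spec'])
cluster.adapt(
    minimum=worker_info['min_workers'],
    maximum=worker_info['max_workers'],
)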