Example #1
def test_default_toleration_preserved(image_name):
    pod_spec = make_pod_spec(image=image_name,
                             extra_pod_config={
                                 'tolerations': [{
                                     'key': 'example.org/toleration',
                                     'operator': 'Exists',
                                     'effect': 'NoSchedule',
                                 }],
                             })
    tolerations = pod_spec.to_dict()['spec']['tolerations']
    assert {
        'key': 'k8s.dask.org/dedicated',
        'operator': 'Equal',
        'value': 'worker',
        'effect': 'NoSchedule',
    } in tolerations
    assert {
        'key': 'k8s.dask.org_dedicated',
        'operator': 'Equal',
        'value': 'worker',
        'effect': 'NoSchedule',
    } in tolerations
    assert {
        'key': 'example.org/toleration',
        'operator': 'Exists',
        'effect': 'NoSchedule',
    } in tolerations
Example #2
def test_default_toleration_preserved(image_name):
    pod_spec = clean_pod_template(
        make_pod_spec(
            image=image_name,
            extra_pod_config={
                "tolerations": [
                    {
                        "key": "example.org/toleration",
                        "operator": "Exists",
                        "effect": "NoSchedule",
                    }
                ]
            },
        )
    )
    tolerations = pod_spec.to_dict()["spec"]["tolerations"]
    assert {
        "key": "k8s.dask.org/dedicated",
        "operator": "Equal",
        "value": "worker",
        "effect": "NoSchedule",
        "toleration_seconds": None,
    } in tolerations
    assert {
        "key": "k8s.dask.org_dedicated",
        "operator": "Equal",
        "value": "worker",
        "effect": "NoSchedule",
        "toleration_seconds": None,
    } in tolerations
    assert {
        "key": "example.org/toleration",
        "operator": "Exists",
        "effect": "NoSchedule",
    } in tolerations
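
The two tests above check that the default k8s.dask.org tolerations survive when extra ones are supplied through extra_pod_config. Outside a test, the same hook can be used to schedule workers onto tainted nodes. A minimal sketch, assuming the classic (pre-operator) dask_kubernetes API; the image and toleration key are placeholders, not taken from the tests:

from dask_kubernetes import KubeCluster, make_pod_spec

# Hypothetical image and toleration key; match them to your cluster's taints.
pod_spec = make_pod_spec(
    image="daskdev/dask:latest",
    memory_limit="4G",
    memory_request="4G",
    cpu_limit=1,
    cpu_request=1,
    extra_pod_config={
        "tolerations": [
            {"key": "example.org/toleration", "operator": "Exists", "effect": "NoSchedule"},
        ]
    },
)
cluster = KubeCluster(pod_spec)      # classic API: pod template as the first argument
cluster.adapt(minimum=1, maximum=4)  # hypothetical scaling bounds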
Example #3
def pod_spec(docker_image):
    yield clean_pod_template(
        make_pod_spec(
            image=docker_image,
            extra_container_config={"imagePullPolicy": "IfNotPresent"},
        )
    )
Example #4
    def _build_kubernetes(self):
        from dask_kubernetes import KubeCluster, make_pod_spec
        memory = self.memory_per_worker * self.workers_per_job
        ncpus = self.cores_per_worker * self.workers_per_job
        spec = make_pod_spec(image=self.image,
                             threads_per_worker=self.tasks_per_worker,
                             memory_limit=memory,
                             memory_request=memory,
                             cpu_limit=ncpus,
                             cpu_request=ncpus)
        cluster = KubeCluster(spec,
                              n_workers=self.num_workers,
                              **self.cluster_kwargs)

        return cluster
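
Example #4 sizes one pod to cover several workers by multiplying per-worker resources by workers_per_job. A standalone sketch of the same arithmetic with hypothetical numbers; none of the variable names below come from a library, and the image is a placeholder:

from dask_kubernetes import KubeCluster, make_pod_spec

workers_per_job = 4        # hypothetical: workers packed into one pod/job
cores_per_worker = 1
memory_per_worker_gb = 4

pod_spec = make_pod_spec(
    image="daskdev/dask:latest",                        # assumption: any image with dask-worker
    threads_per_worker=1,
    memory_limit=f"{memory_per_worker_gb * workers_per_job}G",
    memory_request=f"{memory_per_worker_gb * workers_per_job}G",
    cpu_limit=cores_per_worker * workers_per_job,
    cpu_request=cores_per_worker * workers_per_job,
)
cluster = KubeCluster(pod_spec, n_workers=2)            # two pods, each sized for workers_per_job workers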
Example #5
    def executor(self) -> Executor:

        pod_spec = make_pod_spec(
            image=self._image,
            memory_limit=self._pod_memory_limit,
            memory_request=self._pod_memory_request,
            threads_per_worker=self._pod_threads_per_worker,
            cpu_limit=self._pod_cpu_limit,
            cpu_request=self._pod_cpu_request,
            env=self._generate_env(),
        )
        pod_spec.spec.containers[0].args.extend(["--resources", "TASKSLOTS=1"])

        executor = DaskExecutor(
            cluster_class=lambda: KubeCluster(pod_spec,
                                              deploy_mode=self._deploy_mode),
            adapt_kwargs={
                "minimum": self._adapt_min,
                "maximum": self._adapt_max
            },
        )

        return executor
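
Example #5 starts each worker with --resources "TASKSLOTS=1", so per-worker concurrency can be capped by having tasks claim a slot. A minimal sketch of how submissions would consume that worker resource, assuming the same pod_spec as above; process_file is a placeholder task:

from dask.distributed import Client
from dask_kubernetes import KubeCluster

def process_file(path):   # placeholder task
    return len(path)

# Assumption: `pod_spec` was built as in the executor above.
cluster = KubeCluster(pod_spec)
client = Client(cluster)

# Each task claims the single TASKSLOTS unit declared via --resources,
# so at most one of these tasks runs on any given worker at a time.
futures = client.map(process_file, ["a.csv", "b.csv", "c.csv"], resources={"TASKSLOTS": 1})
results = client.gather(futures)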
Example #6
def pod_spec(image_name):
    yield clean_pod_template(
        make_pod_spec(
            image=image_name, extra_container_config={"imagePullPolicy": "IfNotPresent"}
        )
    )
Example #7
def pod_spec(docker_image):
    yield make_pod_spec(
        image=docker_image, extra_container_config={"imagePullPolicy": "IfNotPresent"}
    )
Example #8
# Dask Kube GPU
import dask
from dask.distributed import Client
from dask_kubernetes import KubeCluster, make_pod_spec
#tag::worker_template_with_gpu[]
worker_template = make_pod_spec(image='holdenk/dask:latest',
                                memory_limit='8G', memory_request='8G',
                                cpu_limit=1, cpu_request=1)
worker_template.spec.containers[0].resources.limits["gpu"] = 1
worker_template.spec.containers[0].resources.requests["gpu"] = 1
worker_template.spec.containers[0].args[0] = "dask-cuda-worker"
worker_template.spec.containers[0].env.append("NVIDIA_VISIBLE_DEVICES=ALL")
# Or append --resources "GPU=2"
#end::worker_template_with_gpu[]
#tag::worker_template_with_label[]
worker_template = make_pod_spec(image='holdenk/dask:latest',
                                memory_limit='8G', memory_request='8G',
                                cpu_limit=1, cpu_request=1)
worker_template.spec.node_selector = "node.kubernetes.io/gpu=gpu"
worker_template.spec.containers[0].args[0] = "dask-cuda-worker"
worker_template.spec.containers[0].env.append("NVIDIA_VISIBLE_DEVICES=ALL")
#end::worker_template_with_label[]
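
Either GPU-aware template still needs to be handed to a cluster before it does anything. A minimal follow-up sketch using the KubeCluster and Client imports already made above; this is not taken from the notebook itself, and the scaling bounds are made up:

cluster = KubeCluster(worker_template)   # classic API: pod template as the first argument
cluster.adapt(minimum=1, maximum=4)      # hypothetical scaling bounds
client = Client(cluster)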
Example #9
def pod_spec(image_name):
    yield make_pod_spec(
        image=image_name,
        extra_container_config={'imagePullPolicy': 'IfNotPresent'})
Example #10
def pod_spec(image_name):
    yield make_pod_spec(image=image_name)
Example #11
 def __init__(self, **kwargs):
     """
 Constructor for obtaining a client to be used when Dask is necessary
 1) Cluster with shared file system and ssh capability:
 :param hostnames : - list; list of strings containing the host names or IP addresses of the machines that
 the user wants to use in their cluster/client (First hostname will be running the scheduler!) [None]
 :param scheduler_file_prefix : string; prefix to used to create dask scheduler-file.
 :param logging : - boolean; Logging scheduler and worker stdout to files within dask_logs folder [True]
 Must be a mounted path on all the machines. Necessary if hostnames are provided [$HOME/scheduler-]
 2) Local cluster:
 :param local_params : - dict; dictionary containing Local Cluster options (see help(LocalCluster) for help) [None]
 :param n_wrks: - int; number of workers to start [1]
 3) PBS cluster:
 :param pbs_params : - dict; dictionary containing PBS Cluster options (see help(PBSCluster) for help) [None]
 :param n_jobs : - int; number of jobs to be submitted to the cluster
 :param n_wrks: - int; number of workers per job [1]
 4) LSF cluster:
 :param lfs_params : - dict; dictionary containing LSF Cluster options (see help(LSFCluster) for help) [None]
 :param n_jobs : - int; number of jobs to be submitted to the cluster
 :param n_wrks: - int; number of workers per job [1]
 5) SLURM cluster:
 :param slurm_params : - dict; dictionary containing SLURM Cluster options (see help(SLURMCluster) for help) [None]
 :param n_jobs : - int; number of jobs to be submitted to the cluster
 :param n_wrks: - int; number of workers per job [1]
 6) Kubernetes cluster:
 :param kube_params : - dict; dictonary containing KubeCluster options
  (see help(KubeCluster) and help(make_pod_spec) for help) [None]
 :param n_wrks: - int; number of workers to scale the cluster
 Note that by default the Kubernetes pods are created using the Docker image "ettore88/occamypy:devel". To change
 the image to be use, provide the item image within the kube_params dictionary.
 """
     hostnames = kwargs.get("hostnames", None)
     local_params = kwargs.get("local_params", None)
     pbs_params = kwargs.get("pbs_params", None)
     lsf_params = kwargs.get("lsf_params", None)
     slurm_params = kwargs.get("slurm_params", None)
     kube_params = kwargs.get("kube_params", None)
     logging = kwargs.get("logging", True)
     ClusterInit = None
     cluster_params = None
     if local_params:
         cluster_params = local_params
         ClusterInit = daskD.LocalCluster
     elif pbs_params:
         cluster_params = pbs_params
         ClusterInit = PBSCluster
     elif lsf_params:
         cluster_params = lsf_params
         ClusterInit = LSFCluster
     elif slurm_params:
         cluster_params = slurm_params
         ClusterInit = SLURMCluster
     # Checking interface to be used
     if hostnames:
         if not isinstance(hostnames, list):
             raise ValueError("User must provide a list with host names")
         scheduler_file_prefix = kwargs.get("scheduler_file_prefix", os.path.expanduser("~") + "/scheduler-")
         # Random port number
         self.port = ''.join(["1"] + [str(random.randint(0, 9)) for _ in range(3)])
         # Creating logging interface
         stdout_scheduler = DEVNULL
         stdout_workers = [DEVNULL] * len(hostnames)
         if logging:
             # Creating logging folder
             try:
                 os.mkdir("dask_logs")
             except OSError:
                 pass
             stdout_scheduler = open("dask_logs/dask-scheduler.log", "w")
             stdout_workers = [open("dask_logs/dask-worker-%s.log" % (ii + 1), "w") for ii in range(len(hostnames))]
         # Starting scheduler
         scheduler_file = "%s%s" % (scheduler_file_prefix, self.port) + ".json"
         cmd = ["ssh"] + [hostnames[0]] + \
               ["dask-scheduler"] + ["--scheduler-file"] + [scheduler_file] + \
               ["--port"] + [self.port]
         self.scheduler_proc = subprocess.Popen(cmd, stdout=stdout_scheduler, stderr=subprocess.STDOUT)
         # Checking if scheduler has started and getting tcp information
         t0 = time()
         while True:
             if os.path.isfile(scheduler_file):
                 if get_tcp_info(scheduler_file): break
             # If the dask scheduler is not started in 5 minutes raise exception
             if time() - t0 > 300.0:
                 raise SystemError("Dask could not start scheduler! Try different first host name.")
         # Creating dask Client
         self.client = daskD.Client(scheduler_file=scheduler_file)
         # Starting workers on all the other hosts
         self.worker_procs = []
         worker_ips = []
         for ii, hostname in enumerate(hostnames):
             cmd = ["ssh"] + [hostname] + ["dask-worker"] + ["--scheduler-file"] + [scheduler_file]
             # Starting worker
             self.worker_procs.append(subprocess.Popen(cmd, stdout=stdout_workers[ii], stderr=subprocess.STDOUT))
             # Obtaining IP address of host for the started worker (necessary to resort workers)
             worker_ips.append(
                 subprocess.check_output(
                     ["ssh"] + [hostname] + ["hostname -I"] + ["| awk '{print $1}'"]).rstrip().decode("utf-8"))
         # Waiting until all the requested workers are up and running
         workers = 0
         requested = len(hostnames)
         t0 = time()
         while workers < requested:
             workers = len(self.client.get_worker_logs().keys())
             # If the number of workers is not reached in 5 minutes raise exception
             if time() - t0 > 300.0:
                 raise SystemError(
                     "Dask could not start the requested workers within 5 minutes! Try different hostnames.")
         # Resorting worker IDs according to user-provided list
         self.WorkerIds = []
         wrkIds = list(self.client.get_worker_logs().keys())  # Unsorted workers ids
         wrk_ips = [idw.split(":")[1][2:] for idw in wrkIds]  # Unsorted ip addresses
         for ip in worker_ips:
             idx = wrk_ips.index(ip)
             self.WorkerIds.append(wrkIds[idx])
             wrkIds.pop(idx)
             wrk_ips.pop(idx)
     elif kube_params:
         n_wrks = kwargs.get("n_wrks")
         if "image" not in kube_params:
             kube_params.update({"image": 'ettore88/occamypy:devel'})
         pod_spec = make_pod_spec(**kube_params)
         self.cluster = KubeCluster(pod_spec, deploy_mode="remote")
         self.client, self.WorkerIds = client_startup(self.cluster, n_wrks, n_wrks)
     elif ClusterInit:
         n_wrks = kwargs.get("n_wrks", 1)
         if n_wrks <= 0:
             raise ValueError("n_wrks must equal or greater than 1!")
         if "local_params" in kwargs:
             # Starting local cluster
             n_jobs = n_wrks
             n_wrks = 1
         else:
             # Starting scheduler-based clusters
             n_jobs = kwargs.get("n_jobs")
             if n_jobs <= 0:
                 raise ValueError("n_jobs must equal or greater than 1!")
             cluster_params.update({"processes": n_wrks})
             if n_wrks > 1:
                 # forcing nanny to be true (otherwise, dask-worker command will fail)
                 cluster_params.update({"nanny": True})
         self.cluster = ClusterInit(**cluster_params)
         self.client, self.WorkerIds = client_startup(self.cluster, n_jobs, n_jobs * n_wrks)
     else:
         raise ValueError("Either hostnames or local_params or pbs/lsf/slurm_params or kube_params must be "
                          "provided!")
     # Closing dask processes
     atexit.register(self.client.shutdown)
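
In the Kubernetes branch above, kube_params is forwarded verbatim to make_pod_spec, so it can carry any of that function's keyword arguments. A minimal sketch of that path in isolation, with hypothetical resource values and a plain Client standing in for the source's client_startup helper:

from dask.distributed import Client
from dask_kubernetes import KubeCluster, make_pod_spec

kube_params = {
    "image": "ettore88/occamypy:devel",   # the constructor's default image
    "memory_limit": "4G",                 # hypothetical sizing
    "memory_request": "4G",
    "cpu_limit": 1,
    "cpu_request": 1,
}
pod_spec = make_pod_spec(**kube_params)
cluster = KubeCluster(pod_spec, deploy_mode="remote")
cluster.scale(2)                          # corresponds to n_wrks=2
client = Client(cluster)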
Example #12
dask.config.set({
    "multiprocessing.context": "forkserver",
    "scheduler": "processes"
})
#end::dask_use_forkserver[]

#tag::make_dask_k8s_client[]
import dask
from dask.distributed import Client
from dask_kubernetes import KubeCluster, make_pod_spec
worker_template = make_pod_spec(
    image='holdenk/dask:latest',
    memory_limit='8G',
    memory_request='8G',
    cpu_limit=1,
    cpu_request=1,
    extra_container_config={"imagePullPolicy": "Always"})
scheduler_template = make_pod_spec(
    image='holdenk/dask:latest',
    memory_limit='4G',
    memory_request='4G',
    cpu_limit=1,
    cpu_request=1,
    extra_container_config={"imagePullPolicy": "Always"})
cluster = KubeCluster(pod_template=worker_template,
                      scheduler_pod_template=scheduler_template)
cluster.adapt(minimum=1)  # create and destroy workers dynamically based on workload
from dask.distributed import Client
client = Client(cluster)  # connect a client to the adaptive cluster
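
Once the client is attached, any computation submitted through it is served by workers that the adaptive policy above creates on demand. A trivial check, not part of the original notebook:

import dask.array as da

x = da.random.random((10_000, 10_000), chunks=(1_000, 1_000))
print(x.mean().compute())   # triggers worker creation via cluster.adapt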
Example #13
def determine_workers(
    max_mem_size: int,
    memory_limit: int = 16,
    cpu_limit: int = 2,
    image_repo: str = 'cormorack',
    image_name: str = 'cava-dask',
    image_tag: str = '20210610',
) -> dict:
    """
    Determine dask worker spec and cluster size,
    based on total requested data size

    Parameters
    ----------
    max_mem_size: int
        Max memory requirement, in GB, for the amount of data requested
    memory_limit: int
        Memory limit, in GB, for the dask worker, as well as the max machine memory
    cpu_limit: int
        CPU limit for the dask worker
    image_repo: str
        Docker image repository for the dask worker
    image_name: str
        Docker image name for the dask worker
    image_tag: str
        Docker image tag for the dask worker

    Returns
    -------
    dict
        Dictionary containing the pod_spec,
        and min, max for number of workers

    """
    max_workers = int(np.ceil(max_mem_size / memory_limit))
    min_workers = int(np.ceil(max_workers / 10))
    image = f"{image_repo}/{image_name}:{image_tag}"

    # Determine the memory and cpu request sizes
    k8s_mem = memory_limit / 2
    k8s_cpu = cpu_limit / 2
    if max_mem_size < memory_limit:
        k8s_mem = max_mem_size
        k8s_cpu = k8s_cpu / 2

    pod_spec = make_pod_spec(
        image=image,
        labels={'app.kubernetes.io/component': 'cava-dask'},
        memory_limit=f'{memory_limit}GB',
        memory_request=f'{k8s_mem}GB',
        cpu_limit=str(cpu_limit),
        cpu_request=str(k8s_cpu),
        extra_pod_config={
            'nodeSelector': {
                'kops.k8s.io/instancegroup': 'compute'
            },
            'restartPolicy': 'Never',
        },
        extra_container_config={
            'imagePullPolicy': 'IfNotPresent',
            'name': 'cava-dask',
        },
        threads_per_worker=2,
    )

    cleaned_spec = _clean_pod_spec(pod_spec)
    return {
        'min_workers': min_workers,
        'max_workers': max_workers,
        'pod_spec': cleaned_spec,
    }
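
The returned dictionary is presumably consumed by whatever provisions the cluster. A hedged sketch of such a caller, assuming the classic KubeCluster API and an estimated 64 GB of requested data:

from dask.distributed import Client
from dask_kubernetes import KubeCluster

spec = determine_workers(max_mem_size=64)    # e.g. ~64 GB of requested data
cluster = KubeCluster(spec["pod_spec"])
cluster.adapt(minimum=spec["min_workers"], maximum=spec["max_workers"])
client = Client(cluster)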