def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False,
                           nanny=False, diagnostics_port=None, loop=loop,
                           start=False)
    cluster.scheduler.allowed_failures = 1000
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    futures = c.map(slowinc, range(100), delay=0.01)

    start = time()
    while not cluster.scheduler.worker_info:
        yield gen.sleep(0.01)
        assert time() < start + 15

    yield c._gather(futures)
    del futures

    start = time()
    while cluster.workers:
        yield gen.sleep(0.01)
        assert time() < start + 5

    assert not cluster.workers

    yield gen.sleep(0.2)
    assert not cluster.workers

    futures = c.map(slowinc, range(100), delay=0.01)
    yield c._gather(futures)

    yield c._shutdown()
    yield cluster._close()
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False,
                           nanny=False, diagnostics_port=None, loop=loop,
                           start=False)
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    for i in range(20):
        futures = c.map(slowinc, range(100), delay=0.01)
        yield c._gather(futures)
        del futures
        yield gen.sleep(0.1)

    yield c._shutdown()
    yield cluster._close()
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False,
                           processes=False, diagnostics_port=None, loop=loop,
                           start=False)
    try:
        cluster.scheduler.allowed_failures = 1000
        alc = Adaptive(cluster.scheduler, cluster, interval=100)
        c = yield Client(cluster, asynchronous=True, loop=loop)

        futures = c.map(slowinc, range(100), delay=0.01)

        start = time()
        while not cluster.scheduler.worker_info:
            yield gen.sleep(0.01)
            assert time() < start + 15

        yield c._gather(futures)
        del futures

        start = time()
        while cluster.workers:
            yield gen.sleep(0.01)
            assert time() < start + 5

        assert not cluster.workers
        assert not cluster.scheduler.workers

        yield gen.sleep(0.2)
        assert not cluster.workers
        assert not cluster.scheduler.workers

        futures = c.map(slowinc, range(100), delay=0.01)
        yield c._gather(futures)
    finally:
        yield c._close()
        yield cluster._close()
def test_get_scale_up_kwargs(loop):
    with LocalCluster(0, scheduler_port=0, silence_logs=False,
                      diagnostics_port=None, loop=loop) as cluster:
        alc = Adaptive(cluster.scheduler, cluster, interval=100,
                       scale_factor=3)
        assert alc.get_scale_up_kwargs() == {'n': 1}

        with Client(cluster, loop=loop) as c:
            future = c.submit(lambda x: x + 1, 1)
            assert future.result() == 2
            assert c.ncores()
            assert alc.get_scale_up_kwargs() == {'n': 3}
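# The test above pins down the contract that ``get_scale_up_kwargs`` returns a
# dictionary (here ``{'n': ...}``, driven by ``scale_factor``) which Adaptive
# then forwards to the cluster's ``scale_up`` method.  The sketch below is an
# illustrative, hypothetical resource manager showing the two methods such a
# cluster object is expected to expose for Adaptive to drive it; the class name
# and method bodies are assumptions, not part of the distributed API.


class SketchCluster(object):
    """Minimal cluster-like object that Adaptive could scale up and down."""

    def scale_up(self, n, **kwargs):
        # Receives the dict returned by get_scale_up_kwargs(), e.g. n=3 above.
        # A real implementation would launch workers here.
        raise NotImplementedError("launch workers until n are running")

    def scale_down(self, workers):
        # ``workers`` is a list of worker addresses chosen for retirement.
        raise NotImplementedError("terminate the listed workers")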
def test_adaptive_local_cluster_multi_workers():
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False,
                           nanny=False, diagnostics_port=None, loop=loop,
                           start=False)
    cluster.scheduler.allowed_failures = 1000
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    futures = c.map(slowinc, range(100), delay=0.01)

    start = time()
    while not cluster.workers:
        yield gen.sleep(0.01)
        assert time() < start + 5

    yield c._gather(futures)
    del futures

    start = time()
    while cluster.workers:
        yield gen.sleep(0.01)
        assert time() < start + 5

    assert not cluster.workers

    yield gen.sleep(0.2)
    assert not cluster.workers

    futures = c.map(slowinc, range(100), delay=0.01)
    yield c._gather(futures)

    yield c._shutdown()
    yield cluster._close()
def test_min_max():
    loop = IOLoop.current()
    cluster = yield LocalCluster(0, scheduler_port=0, silence_logs=False,
                                 processes=False, diagnostics_port=None,
                                 loop=loop, asynchronous=True)
    yield cluster._start()
    try:
        adapt = Adaptive(cluster.scheduler, cluster, minimum=1, maximum=2,
                         interval=20)
        c = yield Client(cluster, asynchronous=True, loop=loop)

        start = time()
        while not cluster.scheduler.workers:
            yield gen.sleep(0.01)
            assert time() < start + 1

        yield gen.sleep(0.2)
        assert len(cluster.scheduler.workers) == 1
        assert frequencies(pluck(1, adapt.log)) == {'up': 1}

        futures = c.map(slowinc, range(100), delay=0.1)

        start = time()
        while len(cluster.scheduler.workers) < 2:
            yield gen.sleep(0.01)
            assert time() < start + 1

        assert len(cluster.scheduler.workers) == 2
        yield gen.sleep(0.5)
        assert len(cluster.scheduler.workers) == 2
        assert len(cluster.workers) == 2
        assert frequencies(pluck(1, adapt.log)) == {'up': 2}

        del futures

        start = time()
        while len(cluster.scheduler.workers) != 1:
            yield gen.sleep(0.01)
            assert time() < start + 1

        assert frequencies(pluck(1, adapt.log)) == {'up': 2, 'down': 1}
    finally:
        yield c._close()
        yield cluster._close()
def test_adaptive_local_cluster(loop):
    with LocalCluster(0, scheduler_port=0, silence_logs=False,
                      diagnostics_port=None, loop=loop) as cluster:
        alc = Adaptive(cluster.scheduler, cluster, interval=100)
        with Client(cluster, loop=loop) as c:
            assert not c.ncores()
            future = c.submit(lambda x: x + 1, 1)
            assert future.result() == 2
            assert c.ncores()
            sleep(0.1)
            assert c.ncores()  # still there after some time

            del future

            start = time()
            while cluster.scheduler.ncores:
                sleep(0.01)
                assert time() < start + 5

            assert not c.ncores()
def test_avoid_churn():
    """
    We want to avoid creating and deleting workers frequently

    Instead we want to wait a few beats before removing a worker in case the
    user is taking a brief pause between work
    """
    cluster = yield LocalCluster(0, asynchronous=True, processes=False,
                                 scheduler_port=0, silence_logs=False,
                                 diagnostics_port=None)
    client = yield Client(cluster, asynchronous=True)
    try:
        adapt = Adaptive(cluster.scheduler, cluster, interval=20, wait_count=5)

        for i in range(10):
            yield client.submit(slowinc, i, delay=0.040)
            yield gen.sleep(0.040)

        assert frequencies(pluck(1, adapt.log)) == {'up': 1}
    finally:
        yield client._close()
        yield cluster._close()
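# The synchronous, user-facing pattern that these asynchronous tests exercise
# is sketched below.  It mirrors the names used above (LocalCluster, Adaptive,
# tasks submitted with map); ``interval`` and ``wait_count`` are the same
# tuning knobs as in test_avoid_churn and carry illustrative values here, so
# treat this as a sketch rather than a prescribed configuration.

from distributed import Client, LocalCluster
from distributed.deploy import Adaptive

cluster = LocalCluster(0, scheduler_port=0, diagnostics_port=None)
# Ask the scheduler to request workers only when there is pending work, and
# wait a few adaptive cycles before retiring idle workers to avoid churn.
adaptive = Adaptive(cluster.scheduler, cluster, interval=1000, wait_count=5)

client = Client(cluster)
futures = client.map(lambda x: x + 1, range(100))
print(sum(client.gather(futures)))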
class KubeCluster(Cluster):
    """ Launch a Dask cluster on Kubernetes

    This starts a local Dask scheduler and then dynamically launches
    Dask workers on a Kubernetes cluster. The Kubernetes cluster is taken
    to be either the current one on which this code is running, or as a
    fallback, the default one configured in a kubeconfig file.

    **Environments**

    Your worker pod image should have a similar environment to your local
    environment, including versions of Python, dask, cloudpickle, and any
    libraries that you may wish to use (like NumPy, Pandas, or Scikit-Learn).
    See examples below for suggestions on how to manage and check for this.

    **Network**

    Since the Dask scheduler is launched locally, for it to work, we need to
    be able to open network connections between this local node and all the
    worker nodes on the Kubernetes cluster.  If the current process is not
    already on a Kubernetes node, some network configuration will likely be
    required to make this work.

    **Resources**

    Your Kubernetes resource limits and requests should match the
    ``--memory-limit`` and ``--nthreads`` parameters given to the
    ``dask-worker`` command.

    Parameters
    ----------
    pod_template: kubernetes.client.V1PodSpec
        A Kubernetes specification for a Pod for a dask worker.
    name: str (optional)
        Name given to the pods.  Defaults to ``dask-$USER-random``
    namespace: str (optional)
        Namespace in which to launch the workers.  Defaults to current
        namespace if available or "default"
    n_workers: int
        Number of workers on initial launch.
        Use ``scale_up`` to increase this number in the future
    env: Dict[str, str]
        Dictionary of environment variables to pass to worker pod
    host: str
        Listen address for local scheduler.  Defaults to 0.0.0.0
    port: int
        Port of local scheduler
    **kwargs: dict
        Additional keyword arguments to pass to LocalCluster

    Examples
    --------
    >>> from dask_kubernetes import KubeCluster, make_pod_spec
    >>> pod_spec = make_pod_spec(image='daskdev/dask:latest',
    ...                          memory_limit='4G', memory_request='4G',
    ...                          cpu_limit=1, cpu_request=1,
    ...                          env={'EXTRA_PIP_PACKAGES': 'fastparquet git+https://github.com/dask/distributed'})
    >>> cluster = KubeCluster(pod_spec)
    >>> cluster.scale_up(10)

    You can also create clusters with worker pod specifications as dictionaries
    or stored in YAML files

    >>> cluster = KubeCluster.from_yaml('worker-template.yml')
    >>> cluster = KubeCluster.from_dict({...})

    Rather than explicitly setting a number of workers you can also ask the
    cluster to allocate workers dynamically based on current workload

    >>> cluster.adapt()

    You can pass this cluster directly to a Dask client

    >>> from dask.distributed import Client
    >>> client = Client(cluster)

    You can verify that your local environment matches your worker environments
    by calling ``client.get_versions(check=True)``.  This will raise an
    informative error if versions do not match.

    >>> client.get_versions(check=True)

    The ``daskdev/dask`` docker images support ``EXTRA_PIP_PACKAGES``,
    ``EXTRA_APT_PACKAGES`` and ``EXTRA_CONDA_PACKAGES`` environment variables
    to help with small adjustments to the worker environments.  We recommend
    the use of pip over conda in this case due to a much shorter startup time.
    These environment variables can be modified directly from the KubeCluster
    constructor methods using the ``env=`` keyword.  You may list as many
    packages as you like in a single string like the following:

    >>> pip = 'pyarrow gcsfs git+https://github.com/dask/distributed'
    >>> conda = '-c conda-forge scikit-learn'
    >>> KubeCluster.from_yaml(..., env={'EXTRA_PIP_PACKAGES': pip,
    ...                                 'EXTRA_CONDA_PACKAGES': conda})

    You can also start a KubeCluster with no arguments *if* the worker template
    is specified in the Dask config files, either as a full template in
    ``kubernetes.worker-template`` or a path to a YAML file in
    ``kubernetes.worker-template-path``.

    See http://dask.pydata.org/en/latest/configuration.html for more
    information about setting configuration values.::

        $ export DASK_KUBERNETES__WORKER_TEMPLATE_PATH=worker_template.yaml

    >>> cluster = KubeCluster()  # automatically finds 'worker_template.yaml'

    See Also
    --------
    KubeCluster.from_yaml
    KubeCluster.from_dict
    KubeCluster.adapt
    """
    def __init__(
            self,
            pod_template=None,
            name=None,
            namespace=None,
            n_workers=None,
            host=None,
            port=None,
            env=None,
            **kwargs
    ):
        name = name or dask.config.get('kubernetes.name')
        namespace = namespace or dask.config.get('kubernetes.namespace')
        n_workers = n_workers if n_workers is not None else dask.config.get('kubernetes.count.start')
        host = host or dask.config.get('kubernetes.host')
        port = port if port is not None else dask.config.get('kubernetes.port')
        env = env if env is not None else dask.config.get('kubernetes.env')

        if not pod_template and dask.config.get('kubernetes.worker-template', None):
            d = dask.config.get('kubernetes.worker-template')
            pod_template = make_pod_from_dict(d)

        if not pod_template and dask.config.get('kubernetes.worker-template-path', None):
            import yaml
            fn = dask.config.get('kubernetes.worker-template-path')
            fn = fn.format(**os.environ)
            with open(fn) as f:
                d = yaml.safe_load(f)
            pod_template = make_pod_from_dict(d)

        if not pod_template:
            msg = ("Worker pod specification not provided. See KubeCluster "
                   "docstring for ways to specify workers")
            raise ValueError(msg)

        self.cluster = LocalCluster(ip=host or socket.gethostname(),
                                    scheduler_port=port,
                                    n_workers=0, **kwargs)
        try:
            kubernetes.config.load_incluster_config()
        except kubernetes.config.ConfigException:
            kubernetes.config.load_kube_config()

        self.core_api = kubernetes.client.CoreV1Api()

        if namespace is None:
            namespace = _namespace_default()

        name = name.format(user=getpass.getuser(),
                           uuid=str(uuid.uuid4())[:10],
                           **os.environ)
        name = escape(name)

        self.pod_template = clean_pod_template(pod_template)
        # Default labels that can't be overwritten
        self.pod_template.metadata.labels['dask.pydata.org/cluster-name'] = name
        self.pod_template.metadata.labels['user'] = escape(getpass.getuser())
        self.pod_template.metadata.labels['app'] = 'dask'
        self.pod_template.metadata.labels['component'] = 'dask-worker'
        self.pod_template.metadata.namespace = namespace

        self.pod_template.spec.containers[0].env.append(
            kubernetes.client.V1EnvVar(name='DASK_SCHEDULER_ADDRESS',
                                       value=self.scheduler_address)
        )
        if env:
            self.pod_template.spec.containers[0].env.extend([
                kubernetes.client.V1EnvVar(name=k, value=str(v))
                for k, v in env.items()
            ])
        self.pod_template.metadata.generate_name = name

        finalize(self, _cleanup_pods, self.namespace,
                 self.pod_template.metadata.labels)

        if n_workers:
            self.scale(n_workers)

    @classmethod
    def from_dict(cls, pod_spec, **kwargs):
        """ Create cluster with worker pod spec defined by Python dictionary

        Examples
        --------
        >>> spec = {
        ...     'metadata': {},
        ...     'spec': {
        ...         'containers': [{
        ...             'args': ['dask-worker', '$(DASK_SCHEDULER_ADDRESS)',
        ...                      '--nthreads', '1',
        ...                      '--death-timeout', '60'],
        ...             'command': None,
        ...             'image': 'daskdev/dask:latest',
        ...             'name': 'dask-worker',
        ...         }],
        ...         'restartPolicy': 'Never',
        ...     }
        ... }
        >>> cluster = KubeCluster.from_dict(spec, namespace='my-ns')  # doctest: +SKIP

        See Also
        --------
        KubeCluster.from_yaml
        """
        return cls(make_pod_from_dict(pod_spec), **kwargs)

    @classmethod
    def from_yaml(cls, yaml_path, **kwargs):
        """ Create cluster with worker pod spec defined by a YAML file

        We can start a cluster with pods defined in an accompanying YAML file
        like the following:

        .. code-block:: yaml

            kind: Pod
            metadata:
              labels:
                foo: bar
                baz: quux
            spec:
              containers:
              - image: daskdev/dask:latest
                name: dask-worker
                args: [dask-worker, $(DASK_SCHEDULER_ADDRESS), --nthreads, '2', --memory-limit, 8GB]
              restartPolicy: Never

        Examples
        --------
        >>> cluster = KubeCluster.from_yaml('pod.yaml', namespace='my-ns')  # doctest: +SKIP

        See Also
        --------
        KubeCluster.from_dict
        """
        if not yaml:
            raise ImportError("PyYaml is required to use yaml functionality, "
                              "please install it!")
        with open(yaml_path) as f:
            d = yaml.safe_load(f)
            return cls.from_dict(d, **kwargs)

    @property
    def namespace(self):
        return self.pod_template.metadata.namespace

    @property
    def name(self):
        return self.pod_template.metadata.generate_name

    def __repr__(self):
        return 'KubeCluster("%s", workers=%d)' % (self.scheduler.address,
                                                  len(self.pods()))

    @property
    def scheduler(self):
        return self.cluster.scheduler

    @property
    def scheduler_address(self):
        return self.scheduler.address

    def pods(self):
        """ A list of kubernetes pods corresponding to current workers

        See Also
        --------
        KubeCluster.logs
        """
        return self.core_api.list_namespaced_pod(
            self.namespace,
            label_selector=format_labels(self.pod_template.metadata.labels)
        ).items

    def logs(self, pod=None):
        """ Logs from a worker pod

        You can get this pod object from the ``pods`` method.

        If no pod is specified all pod logs will be returned. On large
        clusters this could end up being rather large.

        Parameters
        ----------
        pod: kubernetes.client.V1Pod
            The pod from which we want to collect logs.

        See Also
        --------
        KubeCluster.pods
        Client.get_worker_logs
        """
        if pod is None:
            return {pod.status.pod_ip: self.logs(pod) for pod in self.pods()}

        return self.core_api.read_namespaced_pod_log(pod.metadata.name,
                                                     pod.metadata.namespace)

    def scale(self, n):
        """ Scale cluster to n workers

        Parameters
        ----------
        n: int
            Target number of workers

        Example
        -------
        >>> cluster.scale(10)  # scale cluster to ten workers

        See Also
        --------
        KubeCluster.scale_up
        KubeCluster.scale_down
        """
        pods = self._cleanup_terminated_pods(self.pods())
        if n >= len(pods):
            return self.scale_up(n, pods=pods)
        else:
            n_to_delete = len(pods) - n
            # Before trying to close running workers, check if we can cancel
            # pending pods (in case the kubernetes cluster was too full to
            # provision those pods in the first place).
            running_workers = list(self.scheduler.workers.keys())
            running_ips = set(urlparse(worker).hostname
                              for worker in running_workers)
            pending_pods = [p for p in pods
                            if p.status.pod_ip not in running_ips]
            if pending_pods:
                pending_to_delete = pending_pods[:n_to_delete]
                logger.debug("Deleting pending pods: %s", pending_to_delete)
                self._delete_pods(pending_to_delete)
                n_to_delete = n_to_delete - len(pending_to_delete)
                if n_to_delete <= 0:
                    return

            to_close = select_workers_to_close(self.scheduler, n_to_delete)
            logger.debug("Closing workers: %s", to_close)
            if len(to_close) < len(self.scheduler.workers):
                # Close workers cleanly to migrate any temporary results to
                # remaining workers.
                @gen.coroutine
                def f(to_close):
                    yield self.scheduler.retire_workers(
                        workers=to_close, remove=True, close_workers=True)
                    yield offload(self.scale_down, to_close)

                self.scheduler.loop.add_callback(f, to_close)
                return

            # Terminate all pods without waiting for clean worker shutdown
            self.scale_down(to_close)

    def _delete_pods(self, to_delete):
        for pod in to_delete:
            try:
                self.core_api.delete_namespaced_pod(
                    pod.metadata.name,
                    self.namespace,
                    kubernetes.client.V1DeleteOptions()
                )
                pod_info = pod.metadata.name
                if pod.status.reason:
                    pod_info += ' [{}]'.format(pod.status.reason)
                if pod.status.message:
                    pod_info += ' {}'.format(pod.status.message)
                logger.info('Deleted pod: %s', pod_info)
            except kubernetes.client.rest.ApiException as e:
                # If a pod has already been removed, just ignore the error
                if e.status != 404:
                    raise

    def _cleanup_terminated_pods(self, pods):
        terminated_phases = {'Succeeded', 'Failed'}
        terminated_pods = [p for p in pods
                           if p.status.phase in terminated_phases]
        self._delete_pods(terminated_pods)
        return [p for p in pods if p.status.phase not in terminated_phases]

    def scale_up(self, n, pods=None, **kwargs):
        """
        Make sure we have n dask-workers available for this cluster

        Examples
        --------
        >>> cluster.scale_up(20)  # ask for twenty workers
        """
        maximum = dask.config.get('kubernetes.count.max')
        if maximum is not None and maximum < n:
            logger.info("Tried to scale beyond maximum number of workers "
                        "%d > %d", n, maximum)
            n = maximum
        pods = pods or self._cleanup_terminated_pods(self.pods())
        to_create = n - len(pods)
        new_pods = []
        for i in range(3):
            try:
                for _ in range(to_create):
                    new_pods.append(self.core_api.create_namespaced_pod(
                        self.namespace, self.pod_template))
                    to_create -= 1
                break
            except kubernetes.client.rest.ApiException as e:
                if e.status == 500 and 'ServerTimeout' in e.body:
                    logger.info("Server timeout, retry #%d", i + 1)
                    time.sleep(1)
                    last_exception = e
                    continue
                else:
                    raise
        else:
            raise last_exception

        return new_pods
        # fixme: wait for this to be ready before returning!

    def scale_down(self, workers, pods=None):
        """ Remove the pods for the requested list of workers

        When scale_down is called by the _adapt async loop, the workers are
        assumed to have been cleanly closed first and in-memory data has been
        migrated to the remaining workers.

        Note that when the worker process exits, Kubernetes leaves the pods in
        a 'Succeeded' state that we collect here.

        If some workers have not been closed, we just delete the pods with
        matching ip addresses.

        Parameters
        ----------
        workers: List[str]
            List of addresses of workers to close
        """
        # Get the existing worker pods
        pods = pods or self._cleanup_terminated_pods(self.pods())

        # Work out the list of pods that we are going to delete
        # Each worker to delete is given in the form "tcp://<worker ip>:<port>"
        # Convert this to a set of IPs
        ips = set(urlparse(worker).hostname for worker in workers)
        to_delete = [p for p in pods if p.status.pod_ip in ips]
        if not to_delete:
            return
        self._delete_pods(to_delete)

    def __enter__(self):
        return self

    def close(self):
        """ Close this cluster """
        self.scale_down(self.cluster.scheduler.workers)
        self.cluster.close()

    def __exit__(self, type, value, traceback):
        _cleanup_pods(self.namespace, self.pod_template.metadata.labels)
        self.cluster.__exit__(type, value, traceback)
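# For orientation, the snippet below strings together the entry points
# documented in the class above (from_yaml, adapt/scale, and a Client).  The
# file name 'worker-template.yml' is a placeholder for whatever pod
# specification you supply; treat this as a usage sketch, not a prescribed
# deployment.

from dask.distributed import Client
from dask_kubernetes import KubeCluster

cluster = KubeCluster.from_yaml('worker-template.yml')
cluster.adapt()                    # or cluster.scale(10) for a fixed size

client = Client(cluster)
client.get_versions(check=True)    # confirm workers match the local environment
total = client.submit(sum, range(100)).result()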
def __init__(
        self,
        pod_template=None,
        name=None,
        namespace=None,
        n_workers=None,
        host=None,
        port=None,
        env=None,
        **kwargs
):
    name = name or dask.config.get('kubernetes.name')
    namespace = namespace or dask.config.get('kubernetes.namespace')
    n_workers = n_workers if n_workers is not None else dask.config.get('kubernetes.count.start')
    host = host or dask.config.get('kubernetes.host')
    port = port if port is not None else dask.config.get('kubernetes.port')
    env = env if env is not None else dask.config.get('kubernetes.env')

    if not pod_template and dask.config.get('kubernetes.worker-template', None):
        d = dask.config.get('kubernetes.worker-template')
        pod_template = make_pod_from_dict(d)

    if not pod_template and dask.config.get('kubernetes.worker-template-path', None):
        import yaml
        fn = dask.config.get('kubernetes.worker-template-path')
        fn = fn.format(**os.environ)
        with open(fn) as f:
            d = yaml.safe_load(f)
        pod_template = make_pod_from_dict(d)

    if not pod_template:
        msg = ("Worker pod specification not provided. See KubeCluster "
               "docstring for ways to specify workers")
        raise ValueError(msg)

    self.cluster = LocalCluster(ip=host or socket.gethostname(),
                                scheduler_port=port,
                                n_workers=0, **kwargs)
    try:
        kubernetes.config.load_incluster_config()
    except kubernetes.config.ConfigException:
        kubernetes.config.load_kube_config()

    self.core_api = kubernetes.client.CoreV1Api()

    if namespace is None:
        namespace = _namespace_default()

    name = name.format(user=getpass.getuser(),
                       uuid=str(uuid.uuid4())[:10],
                       **os.environ)
    name = escape(name)

    self.pod_template = clean_pod_template(pod_template)
    # Default labels that can't be overwritten
    self.pod_template.metadata.labels['dask.pydata.org/cluster-name'] = name
    self.pod_template.metadata.labels['user'] = escape(getpass.getuser())
    self.pod_template.metadata.labels['app'] = 'dask'
    self.pod_template.metadata.labels['component'] = 'dask-worker'
    self.pod_template.metadata.namespace = namespace

    self.pod_template.spec.containers[0].env.append(
        kubernetes.client.V1EnvVar(name='DASK_SCHEDULER_ADDRESS',
                                   value=self.scheduler_address)
    )
    if env:
        self.pod_template.spec.containers[0].env.extend([
            kubernetes.client.V1EnvVar(name=k, value=str(v))
            for k, v in env.items()
        ])
    self.pod_template.metadata.generate_name = name

    finalize(self, _cleanup_pods, self.namespace,
             self.pod_template.metadata.labels)

    if n_workers:
        self.scale(n_workers)
class KubeCluster(Cluster):
    """ Launch a Dask cluster on Kubernetes

    This starts a local Dask scheduler and then dynamically launches
    Dask workers on a Kubernetes cluster.

    **Environments**

    Your worker pod image should have a similar environment to your local
    environment, including versions of Python, dask, cloudpickle, and any
    libraries that you may wish to use (like NumPy, Pandas, or Scikit-Learn).
    See examples below for suggestions on how to manage and check for this.

    **Resources**

    Your Kubernetes resource limits and requests should match the
    ``--memory-limit`` and ``--nthreads`` parameters given to the
    ``dask-worker`` command.

    Parameters
    ----------
    pod_template: kubernetes.client.V1PodSpec
        A Kubernetes specification for a Pod for a dask worker.
    name: str (optional)
        Name given to the pods.  Defaults to ``dask-$USER-random``
    namespace: str (optional)
        Namespace in which to launch the workers.  Defaults to current
        namespace if available or "default"
    n_workers: int
        Number of workers on initial launch.
        Use ``scale_up`` to increase this number in the future
    env: Dict[str, str]
        Dictionary of environment variables to pass to worker pod
    host: str
        Listen address for local scheduler.  Defaults to 0.0.0.0
    port: int
        Port of local scheduler
    **kwargs: dict
        Additional keyword arguments to pass to LocalCluster

    Examples
    --------
    >>> from dask_kubernetes import KubeCluster, make_pod_spec
    >>> pod_spec = make_pod_spec(image='daskdev/dask:latest',
    ...                          memory_limit='4G', memory_request='4G',
    ...                          cpu_limit=1, cpu_request=1,
    ...                          env={'EXTRA_PIP_PACKAGES': 'fastparquet git+https://github.com/dask/distributed'})
    >>> cluster = KubeCluster(pod_spec)
    >>> cluster.scale_up(10)

    You can also create clusters with worker pod specifications as dictionaries
    or stored in YAML files

    >>> cluster = KubeCluster.from_yaml('worker-template.yml')
    >>> cluster = KubeCluster.from_dict({...})

    Rather than explicitly setting a number of workers you can also ask the
    cluster to allocate workers dynamically based on current workload

    >>> cluster.adapt()

    You can pass this cluster directly to a Dask client

    >>> from dask.distributed import Client
    >>> client = Client(cluster)

    You can verify that your local environment matches your worker environments
    by calling ``client.get_versions(check=True)``.  This will raise an
    informative error if versions do not match.

    >>> client.get_versions(check=True)

    The ``daskdev/dask`` docker images support ``EXTRA_PIP_PACKAGES``,
    ``EXTRA_APT_PACKAGES`` and ``EXTRA_CONDA_PACKAGES`` environment variables
    to help with small adjustments to the worker environments.  We recommend
    the use of pip over conda in this case due to a much shorter startup time.
    These environment variables can be modified directly from the KubeCluster
    constructor methods using the ``env=`` keyword.  You may list as many
    packages as you like in a single string like the following:

    >>> pip = 'pyarrow gcsfs git+https://github.com/dask/distributed'
    >>> conda = '-c conda-forge scikit-learn'
    >>> KubeCluster.from_yaml(..., env={'EXTRA_PIP_PACKAGES': pip,
    ...                                 'EXTRA_CONDA_PACKAGES': conda})

    You can also start a KubeCluster with no arguments *if* the YAML file
    defining the worker template is referred to in the
    ``DASKERNETES_WORKER_TEMPLATE_PATH`` environment variable::

        $ export DASKERNETES_WORKER_TEMPLATE_PATH=worker_template.yaml

    >>> cluster = KubeCluster()  # automatically finds 'worker_template.yaml'

    See Also
    --------
    KubeCluster.from_yaml
    KubeCluster.from_dict
    KubeCluster.adapt
    """
    def __init__(self, pod_template=None, name=None, namespace=None,
                 n_workers=0, host='0.0.0.0', port=0, env=None, **kwargs):
        if pod_template is None:
            if 'kubernetes-worker-template-path' in config:
                import yaml
                with open(config['kubernetes-worker-template-path']) as f:
                    d = yaml.safe_load(f)
                pod_template = make_pod_from_dict(d)
            else:
                msg = ("Worker pod specification not provided. See "
                       "KubeCluster docstring for ways to specify workers")
                raise ValueError(msg)

        self.cluster = LocalCluster(ip=host or socket.gethostname(),
                                    scheduler_port=port,
                                    n_workers=0, **kwargs)
        try:
            kubernetes.config.load_incluster_config()
        except kubernetes.config.ConfigException:
            kubernetes.config.load_kube_config()

        self.core_api = kubernetes.client.CoreV1Api()

        if namespace is None:
            namespace = _namespace_default()

        if name is None:
            worker_name = config.get('kubernetes-worker-name',
                                     'dask-{user}-{uuid}')
            name = worker_name.format(user=getpass.getuser(),
                                      uuid=str(uuid.uuid4())[:10],
                                      **os.environ)

        self.pod_template = clean_pod_template(pod_template)
        # Default labels that can't be overwritten
        self.pod_template.metadata.labels['dask.pydata.org/cluster-name'] = name
        self.pod_template.metadata.labels['app'] = 'dask'
        self.pod_template.metadata.labels['component'] = 'dask-worker'
        self.pod_template.metadata.namespace = namespace

        self.pod_template.spec.containers[0].env.append(
            kubernetes.client.V1EnvVar(name='DASK_SCHEDULER_ADDRESS',
                                       value=self.scheduler_address))
        if env:
            self.pod_template.spec.containers[0].env.extend([
                kubernetes.client.V1EnvVar(name=k, value=str(v))
                for k, v in env.items()
            ])
        self.pod_template.metadata.generate_name = name

        finalize(self, _cleanup_pods, self.namespace,
                 self.pod_template.metadata.labels)

        if n_workers:
            self.scale(n_workers)

    @classmethod
    def from_dict(cls, pod_spec, **kwargs):
        """ Create cluster with worker pod spec defined by Python dictionary

        Examples
        --------
        >>> spec = {
        ...     'metadata': {},
        ...     'spec': {
        ...         'containers': [{
        ...             'args': ['dask-worker', '$(DASK_SCHEDULER_ADDRESS)',
        ...                      '--nthreads', '1',
        ...                      '--death-timeout', '60'],
        ...             'command': None,
        ...             'image': 'daskdev/dask:latest',
        ...             'name': 'dask-worker',
        ...         }],
        ...         'restartPolicy': 'Never',
        ...     }
        ... }
        >>> cluster = KubeCluster.from_dict(spec, namespace='my-ns')  # doctest: +SKIP

        See Also
        --------
        KubeCluster.from_yaml
        """
        return cls(make_pod_from_dict(pod_spec), **kwargs)

    @classmethod
    def from_yaml(cls, yaml_path, **kwargs):
        """ Create cluster with worker pod spec defined by a YAML file

        We can start a cluster with pods defined in an accompanying YAML file
        like the following:

        .. code-block:: yaml

            kind: Pod
            metadata:
              labels:
                foo: bar
                baz: quux
            spec:
              containers:
              - image: daskdev/dask:latest
                name: dask-worker
                args: [dask-worker, $(DASK_SCHEDULER_ADDRESS), --nthreads, '2', --memory-limit, 8GB]
              restartPolicy: Never

        Examples
        --------
        >>> cluster = KubeCluster.from_yaml('pod.yaml', namespace='my-ns')  # doctest: +SKIP

        See Also
        --------
        KubeCluster.from_dict
        """
        if not yaml:
            raise ImportError("PyYaml is required to use yaml functionality, "
                              "please install it!")
        with open(yaml_path) as f:
            d = yaml.safe_load(f)
            return cls.from_dict(d, **kwargs)

    @property
    def namespace(self):
        return self.pod_template.metadata.namespace

    @property
    def name(self):
        return self.pod_template.metadata.generate_name

    @property
    def scheduler(self):
        return self.cluster.scheduler

    @property
    def scheduler_address(self):
        return self.scheduler.address

    def pods(self):
        """ A list of kubernetes pods corresponding to current workers

        See Also
        --------
        KubeCluster.logs
        """
        return self.core_api.list_namespaced_pod(
            self.namespace,
            label_selector=format_labels(self.pod_template.metadata.labels)
        ).items

    def logs(self, pod):
        """ Logs from a worker pod

        You can get this pod object from the ``pods`` method.

        Parameters
        ----------
        pod: kubernetes.client.V1Pod
            The pod from which we want to collect logs.

        See Also
        --------
        KubeCluster.pods
        Client.get_worker_logs
        """
        return self.core_api.read_namespaced_pod_log(pod.metadata.name,
                                                     pod.metadata.namespace)

    def scale(self, n):
        """ Scale cluster to n workers

        Parameters
        ----------
        n: int
            Target number of workers

        Example
        -------
        >>> cluster.scale(10)  # scale cluster to ten workers

        See Also
        --------
        KubeCluster.scale_up
        KubeCluster.scale_down
        """
        pods = self.pods()
        if n >= len(pods):
            return self.scale_up(n, pods=pods)
        else:
            to_close = select_workers_to_close(self.scheduler, len(pods) - n)
            logger.debug("Closing workers: %s", to_close)
            return self.scale_down(to_close)

    def scale_up(self, n, pods=None, **kwargs):
        """
        Make sure we have n dask-workers available for this cluster

        Examples
        --------
        >>> cluster.scale_up(20)  # ask for twenty workers
        """
        pods = pods or self.pods()

        for i in range(3):
            try:
                out = [
                    self.core_api.create_namespaced_pod(self.namespace,
                                                        self.pod_template)
                    for _ in range(n - len(pods))
                ]
                break
            except kubernetes.client.rest.ApiException as e:
                if e.status == 500 and 'ServerTimeout' in e.body:
                    logger.info("Server timeout, retry #%d", i + 1)
                    time.sleep(1)
                    last_exception = e
                    continue
                else:
                    raise
        else:
            raise last_exception

        return out
        # fixme: wait for this to be ready before returning!

    def scale_down(self, workers):
        """ When the worker process exits, Kubernetes leaves the pods in a
        completed state.  Kill them when we are asked to.

        Parameters
        ----------
        workers: List[str]
            List of addresses of workers to close
        """
        # Get the existing worker pods
        pods = self.pods()

        # Work out pods that we are going to delete
        # Each worker to delete is given in the form "tcp://<worker ip>:<port>"
        # Convert this to a set of IPs
        ips = set(urlparse(worker).hostname for worker in workers)
        to_delete = [
            p for p in pods
            # Every time we run, purge any completed pods as well as the
            # specified ones
            if p.status.phase == 'Succeeded' or p.status.pod_ip in ips
        ]
        if not to_delete:
            return
        for pod in to_delete:
            try:
                self.core_api.delete_namespaced_pod(
                    pod.metadata.name,
                    self.namespace,
                    kubernetes.client.V1DeleteOptions())
                logger.info('Deleted pod: %s', pod.metadata.name)
            except kubernetes.client.rest.ApiException as e:
                # If a pod has already been removed, just ignore the error
                if e.status != 404:
                    raise

    def __enter__(self):
        return self

    def close(self):
        """ Close this cluster """
        self.scale_down(self.cluster.scheduler.workers)
        self.cluster.close()

    def __exit__(self, type, value, traceback):
        _cleanup_pods(self.namespace, self.pod_template.metadata.labels)
        self.cluster.__exit__(type, value, traceback)
def __init__(self, pod_template=None, name=None, namespace=None,
             n_workers=None, host=None, port=None, env=None,
             auth=ClusterAuth.DEFAULT, **kwargs):
    name = name or dask.config.get("kubernetes.name")
    namespace = namespace or dask.config.get("kubernetes.namespace")
    n_workers = (n_workers if n_workers is not None
                 else dask.config.get("kubernetes.count.start"))
    host = host or dask.config.get("kubernetes.host")
    port = port if port is not None else dask.config.get("kubernetes.port")
    env = env if env is not None else dask.config.get("kubernetes.env")

    if not pod_template and dask.config.get("kubernetes.worker-template", None):
        d = dask.config.get("kubernetes.worker-template")
        d = dask.config.expand_environment_variables(d)
        pod_template = make_pod_from_dict(d)

    if not pod_template and dask.config.get("kubernetes.worker-template-path",
                                            None):
        import yaml
        fn = dask.config.get("kubernetes.worker-template-path")
        fn = fn.format(**os.environ)
        with open(fn) as f:
            d = yaml.safe_load(f)
        d = dask.config.expand_environment_variables(d)
        pod_template = make_pod_from_dict(d)

    if not pod_template:
        msg = ("Worker pod specification not provided. See KubeCluster "
               "docstring for ways to specify workers")
        raise ValueError(msg)

    pod_template = clean_pod_template(pod_template)
    ClusterAuth.load_first(auth)

    self.core_api = kubernetes.client.CoreV1Api()

    if namespace is None:
        namespace = _namespace_default()

    name = name.format(user=getpass.getuser(),
                       uuid=str(uuid.uuid4())[:10],
                       **os.environ)
    name = escape(name)

    self.pod_template = pod_template
    # Default labels that can't be overwritten
    self.pod_template.metadata.labels["dask.org/cluster-name"] = name
    self.pod_template.metadata.labels["user"] = escape(getpass.getuser())
    self.pod_template.metadata.labels["app"] = "dask"
    self.pod_template.metadata.labels["component"] = "dask-worker"
    self.pod_template.metadata.namespace = namespace

    self.cluster = LocalCluster(host=host or socket.gethostname(),
                                scheduler_port=port,
                                n_workers=0, **kwargs)

    # TODO: handle any exceptions here, ensure self.cluster is properly
    # cleaned up.
    self.pod_template.spec.containers[0].env.append(
        kubernetes.client.V1EnvVar(name="DASK_SCHEDULER_ADDRESS",
                                   value=self.scheduler_address))
    if env:
        self.pod_template.spec.containers[0].env.extend([
            kubernetes.client.V1EnvVar(name=k, value=str(v))
            for k, v in env.items()
        ])
    self.pod_template.metadata.generate_name = name

    finalize(self, _cleanup_pods, self.namespace,
             self.pod_template.metadata.labels)

    if n_workers:
        try:
            self.scale(n_workers)
        except Exception:
            self.cluster.close()
            raise
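# This constructor resolves every unspecified argument from the ``kubernetes.*``
# Dask configuration, so deployments can be tuned without touching code.  Below
# is a small sketch of overriding two of those keys with dask.config.set; the
# values are placeholders, and any key left unset falls back to the defaults
# read above.

import dask
from dask_kubernetes import KubeCluster

with dask.config.set({'kubernetes.namespace': 'analytics',
                      'kubernetes.worker-template-path': 'worker-template.yml'}):
    cluster = KubeCluster()   # pod template and namespace come from config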