def adapt(self, minimum_cores=None, maximum_cores=None,
          minimum_memory=None, maximum_memory=None, **kwargs):
    """ Turn on adaptivity

    Any adaptive controller already attached to the object is stopped, the
    resulting keyword arguments are merged into the remembered adaptive
    options, and a new ``Adaptive`` instance is created from those options.

    For keyword arguments see dask.distributed.Adaptive

    Instead of the ``minimum`` and ``maximum`` parameters, which apply to the
    number of workers, the bounds can be given as cores or memory when the
    Cluster object implements the jobqueue_worker_spec attribute. An explicit
    ``minimum``/``maximum`` keyword takes precedence over the cores/memory
    variant, and cores take precedence over memory.

    Parameters
    ----------
    minimum_cores: int
        Minimum number of cores for the cluster
    maximum_cores: int
        Maximum number of cores for the cluster
    minimum_memory: str
        Minimum amount of memory for the cluster
    maximum_memory: str
        Maximum amount of memory for the cluster

    Returns
    -------
    The newly created ``Adaptive`` instance (also stored on
    ``self._adaptive``).

    Examples
    --------
    >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')
    >>> cluster.adapt(minimum_cores=24, maximum_cores=96)
    >>> cluster.adapt(minimum_memory='60 GB', maximum_memory='1 TB')
    """
    try:
        self._adaptive.stop()
    except AttributeError:
        # Nothing to stop
        pass

    if not hasattr(self, "_adaptive_options"):
        self._adaptive_options = {}

    # (worker-count keyword, bound expressed in cores, bound expressed in memory)
    bounds = (
        ("minimum", minimum_cores, minimum_memory),
        ("maximum", maximum_cores, maximum_memory),
    )
    for keyword, cores, memory in bounds:
        if keyword in kwargs:
            continue
        if cores is not None:
            kwargs[keyword] = self._get_nb_workers_from_cores(cores)
        elif memory is not None:
            kwargs[keyword] = self._get_nb_workers_from_memory(memory)

    self._adaptive_options.update(kwargs)
    self._adaptive = Adaptive(self.scheduler, self, **self._adaptive_options)
    return self._adaptive
def adapt(self, Adaptive=Adaptive, **kwargs) -> Adaptive:
    """Turn on adaptivity

    Stops any adaptive controller already attached to this object, merges
    ``kwargs`` into the remembered adaptive options and builds a new
    controller of class ``Adaptive`` from the merged options.

    For keyword arguments see dask.distributed.Adaptive

    Parameters
    ----------
    Adaptive : callable
        Class (or factory) called as ``Adaptive(self, **options)``
    **kwargs
        Options merged into ``self._adaptive_options``

    Returns
    -------
    The newly created controller (also stored on ``self._adaptive``).

    Examples
    --------
    >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')
    """
    try:
        self._adaptive.stop()
    except AttributeError:
        # Nothing to stop
        pass

    try:
        options = self._adaptive_options
    except AttributeError:
        options = self._adaptive_options = {}
    options.update(kwargs)

    controller = Adaptive(self, **options)
    self._adaptive = controller
    return controller
class ClusterManager(object):
    """ Intermediate Cluster object that should lead to a real ClusterManager

    This tries to improve upstream Cluster object and underlines needs for
    better decoupling between ClusterManager and Scheduler object

    This currently expects a local Scheduler defined on the object, but should
    eventually only rely on RPC calls on remote or local scheduler.
    It provides common methods and an IPython widget display.

    Clusters inheriting from this class should provide the following:

    1.  A local ``Scheduler`` object at ``.scheduler``. In the future, just
        a URL to local or remote scheduler.
    2.  scale_up and scale_down methods as defined below::

        def scale_up(self, n: int):
            ''' Brings total worker count up to ``n`` '''

        def scale_down(self, workers: List[str], n: int):
            ''' Close the workers with the given addresses or remove pending
                workers to match n running workers.
            '''
    3.  Optionally worker_key: Callable(WorkerState):
            ''' Callable mapping a WorkerState object to a group, see
                Scheduler.workers_to_close
            '''
    4.  jobqueue_worker_spec dict attribute if scale(cores=...) or
        scale(memory=...) can be used by users.
        jobqueue_worker_spec = {'cores': 4, 'memory': '16 GB'}

    This will provide a general ``scale`` method as well as an IPython widget
    for display.

    Things that will need to change for the complete Cluster Manager Design:

    -   ClusterManager:
        -   Use its own event loop, or the notebook one.
        -   Connect to a local or remote Scheduler through RPC, and then
            communicate with it.
        -   Ability to start a local or remote scheduler.
        -   Ability to work with different worker pools: in scale, adaptive,
            jobqueue_worker_spec...
    -   Scheduler
        -   Provide some remote methods:
            -   retire_workers(n: int): close enough workers to have only n
                running at the end. Return the closed workers.
            -   status of connected worker, e.g. scheduler_info()

    Examples
    --------

    >>> class MyCluster(ClusterManager):
    ...     def scale_up(self, n):
    ...         ''' Bring the total worker count up to n '''
    ...         pass
    ...     def scale_down(self, workers, n=None):
    ...         ''' Close the workers with the given addresses '''
    ...         pass

    >>> cluster = MyCluster()
    >>> cluster.scale(5)                       # scale manually
    >>> cluster.adapt(minimum=1, maximum=100)  # scale automatically
    >>> cluster.scale(cores=100)               # scale manually to cores nb
    """

    def __init__(self, adaptive_options=None):
        """ Initialise scaling state

        Parameters
        ----------
        adaptive_options: dict, optional
            Initial keyword arguments remembered for ``adapt``. The mapping
            is copied, so neither the caller's dict nor a shared default is
            modified.
        """
        self._target_scale = 0
        # A mutable default (or the caller's dict) must not be stored as-is:
        # setdefault below writes a method bound to *this* instance into it.
        if adaptive_options is None:
            self._adaptive_options = {}
        else:
            self._adaptive_options = dict(adaptive_options)
        self._adaptive_options.setdefault("worker_key", self.worker_key)

    def adapt(self, minimum_cores=None, maximum_cores=None,
              minimum_memory=None, maximum_memory=None, **kwargs):
        """ Turn on adaptivity

        For keyword arguments see dask.distributed.Adaptive

        Instead of minimum and maximum parameters which apply to the number of
        worker, If Cluster object implements jobqueue_worker_spec attribute,
        one can use the following parameters. An explicit ``minimum`` or
        ``maximum`` keyword takes precedence over the cores/memory variant,
        and cores take precedence over memory.

        Parameters
        ----------
        minimum_cores: int
            Minimum number of cores for the cluster
        maximum_cores: int
            Maximum number of cores for the cluster
        minimum_memory: str
            Minimum amount of memory for the cluster
        maximum_memory: str
            Maximum amount of memory for the cluster

        Returns
        -------
        The newly created ``Adaptive`` instance (also stored on
        ``self._adaptive``).

        Examples
        --------
        >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')
        >>> cluster.adapt(minimum_cores=24, maximum_cores=96)
        >>> cluster.adapt(minimum_memory='60 GB', maximum_memory='1 TB')
        """
        # Stop a previously created controller, if there is one
        with ignoring(AttributeError):
            self._adaptive.stop()
        if not hasattr(self, "_adaptive_options"):
            self._adaptive_options = {}
        if "minimum" not in kwargs:
            if minimum_cores is not None:
                kwargs["minimum"] = self._get_nb_workers_from_cores(
                    minimum_cores)
            elif minimum_memory is not None:
                kwargs["minimum"] = self._get_nb_workers_from_memory(
                    minimum_memory)
        if "maximum" not in kwargs:
            if maximum_cores is not None:
                kwargs["maximum"] = self._get_nb_workers_from_cores(
                    maximum_cores)
            elif maximum_memory is not None:
                kwargs["maximum"] = self._get_nb_workers_from_memory(
                    maximum_memory)
        self._adaptive_options.update(kwargs)
        self._adaptive = Adaptive(self.scheduler, self,
                                  **self._adaptive_options)
        return self._adaptive

    @property
    def scheduler_address(self):
        """ Address of the local scheduler """
        return self.scheduler.address

    @property
    def dashboard_link(self):
        """ Dashboard link built from the scheduler host and dashboard port """
        host = self.scheduler.address.split("://")[1].split(":")[0]
        port = self.scheduler.services["dashboard"].port
        return format_dashboard_link(host, port)

    @gen.coroutine
    def _scale(self, n=None, cores=None, memory=None):
        """ Asynchronously called scale method

        This allows to do every operation with a coherent context

        Raises
        ------
        ValueError
            If not exactly one of ``n``, ``cores``, ``memory`` is given.
        """
        with log_errors():
            if [n, cores, memory].count(None) != 2:
                raise ValueError("One and only one of n, cores, memory kwargs"
                                 " should be used, n={}, cores={}, memory={}"
                                 " provided.".format(n, cores, memory))
            if n is None:
                if cores is not None:
                    n = self._get_nb_workers_from_cores(cores)
                elif memory is not None:
                    n = self._get_nb_workers_from_memory(memory)

            # here we rely on a ClusterManager attribute to retrieve the
            # active and pending workers
            if n == self._target_scale:
                pass
            elif n > self._target_scale:
                self.scale_up(n)
            else:
                # TODO to_close may be empty if some workers are pending
                # This may not be useful to call scheduler methods in this case
                # Scheduler interface here may need to be modified
                to_close = self.scheduler.workers_to_close(
                    n=len(self.scheduler.workers) - n,
                    minimum=n,
                    key=self.worker_key)
                logger.debug("Closing workers: %s", to_close)
                # Should be an RPC call here
                yield self.scheduler.retire_workers(workers=to_close)
                # To close may be empty if just asking to remove pending
                # workers, so we should also give a target number
                self.scale_down(to_close, n)
            self._target_scale = n

    def scale(self, n=None, cores=None, memory=None):
        """ Scale cluster to n workers or to the given number of cores or
        memory

        number of cores and memory are converted into number of workers using
        jobqueue_worker_spec attribute.

        Parameters
        ----------
        n: int
            Target number of workers
        cores: int
            Target number of cores
        memory: str
            Target amount of available memory

        Example
        -------
        >>> cluster.scale(10)  # scale cluster to ten workers
        >>> cluster.scale(cores=100)  # scale cluster to 100 cores
        >>> cluster.scale(memory='1 TB')  # scale cluster to 1 TB memory

        See Also
        --------
        Cluster.scale_up
        Cluster.scale_down
        Cluster.jobqueue_worker_spec
        """
        # TODO we should not rely on scheduler loop here, self should have its
        # own loop
        self.scheduler.loop.add_callback(self._scale, n, cores, memory)

    def _widget_status(self):
        """ Return an HTML summary of worker count, cores and memory """
        workers = len(self.scheduler.workers)
        cores = sum(ws.ncores for ws in self.scheduler.workers.values())
        memory = sum(ws.memory_limit for ws in self.scheduler.workers.values())
        memory = format_bytes(memory)
        text = """
<div>
  <style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
  </style>
  <table style="text-align: right;">
    <tr><th>Workers</th> <td>%d</td></tr>
    <tr><th>Cores</th> <td>%d</td></tr>
    <tr><th>Memory</th> <td>%s</td></tr>
  </table>
</div>
""" % (
            workers,
            cores,
            memory,
        )
        return text

    def _has_worker_spec(self):
        """ Tell whether ``jobqueue_worker_spec`` is usable on this object

        ``hasattr`` is not enough: on Python 3 it only swallows
        ``AttributeError``, while the default property raises
        ``NotImplementedError``.
        """
        try:
            self.jobqueue_worker_spec
        except (AttributeError, NotImplementedError):
            return False
        return True

    def _widget(self):
        """ Create IPython widget for display within a notebook

        The widget is built once and cached on ``self._cached_widget``.
        """
        try:
            return self._cached_widget
        except AttributeError:
            pass

        from ipywidgets import (
            Layout,
            VBox,
            HBox,
            IntText,
            Button,
            HTML,
            Accordion,
            Text,
        )

        layout = Layout(width="150px")

        if "dashboard" in self.scheduler.services:
            link = self.dashboard_link
            link = '<p><b>Dashboard: </b><a href="%s" target="_blank">%s</a></p>\n' % (
                link,
                link,
            )
        else:
            link = ""

        title = "<h2>%s</h2>" % type(self).__name__
        title = HTML(title)
        dashboard = HTML(link)

        status = HTML(self._widget_status(), layout=Layout(min_width="150px"))

        request = IntText(0, description="Workers", layout=layout)
        scale = Button(description="Scale", layout=layout)
        request_cores = IntText(0, description="Cores", layout=layout)
        scale_cores = Button(description="Scale", layout=layout)
        # Default must be a parsable byte quantity (digit zero, not letter O)
        request_memory = Text("0 GB", description="Memory", layout=layout)
        scale_memory = Button(description="Scale", layout=layout)

        minimum = IntText(0, description="Minimum", layout=layout)
        maximum = IntText(0, description="Maximum", layout=layout)
        adapt = Button(description="Adapt", layout=layout)
        minimum_cores = IntText(0, description="Min cores", layout=layout)
        maximum_cores = IntText(0, description="Max cores", layout=layout)
        adapt_cores = Button(description="Adapt", layout=layout)
        minimum_mem = Text("0 GB", description="Min memory", layout=layout)
        maximum_mem = Text("0 GB", description="Max memory", layout=layout)
        adapt_mem = Button(description="Adapt", layout=layout)

        scale_hbox = [HBox([request, scale])]
        adapt_hbox = [HBox([minimum, maximum, adapt])]
        # Cores/memory controls only make sense when a worker spec exists
        if self._has_worker_spec():
            scale_hbox.append(HBox([request_cores, scale_cores]))
            scale_hbox.append(HBox([request_memory, scale_memory]))
            adapt_hbox.append(HBox([minimum_cores, maximum_cores, adapt_cores]))
            adapt_hbox.append(HBox([minimum_mem, maximum_mem, adapt_mem]))

        accordion = Accordion(
            [VBox(scale_hbox), VBox(adapt_hbox)],
            layout=Layout(min_width="500px"))
        accordion.selected_index = None
        accordion.set_title(0, "Manual Scaling")
        accordion.set_title(1, "Adaptive Scaling")

        box = VBox([title, HBox([status, accordion]), dashboard])

        self._cached_widget = box

        def adapt_cb(b):
            self.adapt(minimum=minimum.value, maximum=maximum.value)

        def adapt_cores_cb(b):
            self.adapt(minimum_cores=minimum_cores.value,
                       maximum_cores=maximum_cores.value)

        def adapt_mem_cb(b):
            self.adapt(minimum_memory=minimum_mem.value,
                       maximum_memory=maximum_mem.value)

        adapt.on_click(adapt_cb)
        adapt_cores.on_click(adapt_cores_cb)
        adapt_mem.on_click(adapt_mem_cb)

        def scale_cb(request, kwarg):
            # Build a click handler forwarding the widget value to
            # ``self.scale`` under the given keyword name
            def request_cb(b):
                with log_errors():
                    arg = request.value
                    # Manual scaling switches adaptive mode off
                    with ignoring(AttributeError):
                        self._adaptive.stop()
                    local_kwargs = dict()
                    local_kwargs[kwarg] = arg
                    self.scale(**local_kwargs)

            return request_cb

        scale.on_click(scale_cb(request, "n"))
        scale_cores.on_click(scale_cb(request_cores, "cores"))
        scale_memory.on_click(scale_cb(request_memory, "memory"))

        def update():
            status.value = self._widget_status()

        pc = PeriodicCallback(update, 500, io_loop=self.scheduler.loop)
        self.scheduler.periodic_callbacks["cluster-repr"] = pc
        pc.start()

        return box

    def _ipython_display_(self, **kwargs):
        """ Display the widget in IPython """
        return self._widget()._ipython_display_(**kwargs)

    def worker_key(self, worker_state):
        """ Callable mapping a WorkerState object to a group, see
        Scheduler.workers_to_close

        The default implementation returns ``worker_state`` unchanged.
        """
        return worker_state

    def _get_nb_workers_from_cores(self, cores):
        """ Number of workers needed to provide ``cores`` cores (rounded up) """
        return math.ceil(cores / self.jobqueue_worker_spec["cores"])

    def _get_nb_workers_from_memory(self, memory):
        """ Number of workers needed to provide ``memory`` (rounded up) """
        return math.ceil(
            parse_bytes(memory) /
            parse_bytes(self.jobqueue_worker_spec["memory"]))

    @property
    def jobqueue_worker_spec(self):
        """ single worker process info needed for scaling on cores or memory

        Raises
        ------
        NotImplementedError
            Always, in this base implementation; subclasses supporting
            cores/memory scaling are expected to provide the attribute.
        """
        raise NotImplementedError(
            "{} class does not provide jobqueue_worker_spec "
            "attribute, needed for scaling with "
            "cores or memory kwargs.".format(self.__class__.__name__))
class Cluster(SyncMethodMixin):
    """Superclass for cluster objects

    This class contains common functionality for Dask Cluster manager classes.

    To implement this class, you must provide

    1.  A ``scheduler_comm`` attribute, which is a connection to the scheduler
        following the ``distributed.core.rpc`` API.
    2.  Implement ``scale``, which takes an integer and scales the cluster to
        that many workers, or else set ``_supports_scaling`` to False

    For that, you should get the following:

    1.  A standard ``__repr__``
    2.  A live IPython widget
    3.  Adaptive scaling
    4.  Integration with dask-labextension
    5.  A ``scheduler_info`` attribute which contains an up-to-date copy of
        ``Scheduler.identity()``, which is used for much of the above
    6.  Methods to gather logs
    """

    _supports_scaling = True
    # Class-level entries are merged into (and override) the per-instance
    # ``name``/``type`` info built in ``__init__``
    _cluster_info: dict = {}

    def __init__(
        self,
        asynchronous=False,
        loop=None,
        quiet=False,
        name=None,
        scheduler_sync_interval=1,
    ):
        """Set up loop, bookkeeping attributes and cluster info

        Parameters
        ----------
        asynchronous, loop
            Forwarded to ``LoopRunner``
        quiet : bool
            When true, ``_log`` stores messages without printing them
        name : str, optional
            Cluster name; defaults to the first 8 characters of a random UUID
        scheduler_sync_interval
            Interval between cluster-info syncs, parsed with
            ``parse_timedelta`` (bare numbers are seconds)
        """
        self._loop_runner = LoopRunner(loop=loop, asynchronous=asynchronous)
        self.loop = self._loop_runner.loop

        self.scheduler_info = {"workers": {}}
        self.periodic_callbacks = {}
        self._watch_worker_status_comm = None
        self._watch_worker_status_task = None
        self._cluster_manager_logs = []
        self.quiet = quiet
        self.scheduler_comm = None
        self._adaptive = None
        self._sync_interval = parse_timedelta(
            scheduler_sync_interval, default="seconds"
        )
        self._sync_cluster_info_task = None

        if name is None:
            name = str(uuid.uuid4())[:8]

        # Mask class attribute with instance attribute
        self._cluster_info = {
            "name": name,
            "type": typename(type(self)),
            **type(self)._cluster_info,
        }
        self.status = Status.created

    @property
    def name(self):
        """Cluster name stored in ``_cluster_info``"""
        return self._cluster_info["name"]

    @name.setter
    def name(self, name):
        self._cluster_info["name"] = name

    async def _start(self):
        """Subscribe to worker status updates and start background tasks"""
        comm = await self.scheduler_comm.live_comm()
        await comm.write({"op": "subscribe_worker_status"})
        self.scheduler_info = SchedulerInfo(await comm.read())
        self._watch_worker_status_comm = comm
        self._watch_worker_status_task = asyncio.ensure_future(
            self._watch_worker_status(comm)
        )

        info = await self.scheduler_comm.get_metadata(
            keys=["cluster-manager-info"], default={}
        )
        self._cluster_info.update(info)

        # Start a background task for syncing cluster info with the scheduler
        self._sync_cluster_info_task = asyncio.ensure_future(
            self._sync_cluster_info()
        )

        for pc in self.periodic_callbacks.values():
            pc.start()
        self.status = Status.running

    async def _sync_cluster_info(self):
        """Periodically push ``_cluster_info`` to the scheduler metadata

        Runs while the status is ``Status.running``; failures increase the
        sleep interval up to ten times the base sync interval.
        """
        err_count = 0
        warn_at = 5
        max_interval = 10 * self._sync_interval
        # Loop until the cluster is shutting down. We shouldn't really need
        # this check (the `CancelledError` should be enough), but something
        # deep in the comms code is silencing `CancelledError`s _some_ of the
        # time, resulting in a cancellation not always bubbling back up to
        # here. Relying on the status is fine though, not worth changing.
        while self.status == Status.running:
            try:
                await self.scheduler_comm.set_metadata(
                    keys=["cluster-manager-info"],
                    value=self._cluster_info.copy(),
                )
                err_count = 0
            except asyncio.CancelledError:
                # Task is being closed. When we drop Python < 3.8 we can drop
                # this check (since CancelledError is not a subclass of
                # Exception then).
                break
            except Exception:
                err_count += 1
                # Only warn if multiple subsequent attempts fail, and only once
                # per set of subsequent failed attempts. This way we're not
                # excessively noisy during a connection blip, but we also don't
                # silently fail.
                if err_count == warn_at:
                    logger.warning(
                        "Failed to sync cluster info multiple times - perhaps "
                        "there's a connection issue? Error:",
                        exc_info=True,
                    )
            # Sleep, with error backoff
            interval = min(max_interval, self._sync_interval * 1.5**err_count)
            await asyncio.sleep(interval)

    async def _close(self):
        """Stop adaptivity, background tasks, comms and periodic callbacks"""
        if self.status == Status.closed:
            return

        self.status = Status.closing

        # ``_adaptive`` may be None, in which case there is nothing to stop
        with suppress(AttributeError):
            self._adaptive.stop()

        if self._watch_worker_status_comm:
            await self._watch_worker_status_comm.close()
        if self._watch_worker_status_task:
            await self._watch_worker_status_task

        if self._sync_cluster_info_task:
            self._sync_cluster_info_task.cancel()
            with suppress(asyncio.CancelledError):
                await self._sync_cluster_info_task

        if self.scheduler_comm:
            await self.scheduler_comm.close_rpc()

        for pc in self.periodic_callbacks.values():
            pc.stop()

        self.status = Status.closed

    def close(self, timeout=None):
        """Close the cluster

        Returns a no-op awaitable (asynchronous mode) or ``None`` when the
        cluster is already closed; otherwise the result of running
        ``_close`` through ``self.sync``.
        """
        # If the cluster is already closed, we're already done
        if self.status == Status.closed:
            if self.asynchronous:
                return NoOpAwaitable()
            else:
                return

        with suppress(RuntimeError):  # loop closed during process shutdown
            return self.sync(self._close, callback_timeout=timeout)

    def __del__(self):
        # ``status`` is missing when ``__init__`` raised before assigning it;
        # treat such a half-built object as closed instead of raising
        # AttributeError from the finalizer.
        if getattr(self, "status", Status.closed) != Status.closed:
            with suppress(AttributeError, RuntimeError):  # during closing
                self.loop.add_callback(self.close)

    async def _watch_worker_status(self, comm):
        """Listen to scheduler for updates on adding and removing workers"""
        while True:
            try:
                msgs = await comm.read()
            except OSError:
                break

            with log_errors():
                for op, msg in msgs:
                    self._update_worker_status(op, msg)

        await comm.close()

    def _update_worker_status(self, op, msg):
        """Apply one ``add``/``remove`` message to ``scheduler_info``

        Raises
        ------
        ValueError
            If ``op`` is neither ``"add"`` nor ``"remove"``.
        """
        if op == "add":
            workers = msg.pop("workers")
            self.scheduler_info["workers"].update(workers)
            self.scheduler_info.update(msg)
        elif op == "remove":
            del self.scheduler_info["workers"][msg]
        else:  # pragma: no cover
            raise ValueError("Invalid op", op, msg)

    def adapt(self, Adaptive=Adaptive, **kwargs) -> Adaptive:
        """Turn on adaptivity

        Stops the current adaptive controller (if any), merges ``kwargs`` into
        the remembered adaptive options and creates a new controller.

        For keyword arguments see dask.distributed.Adaptive

        Examples
        --------
        >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')
        """
        with suppress(AttributeError):
            self._adaptive.stop()
        if not hasattr(self, "_adaptive_options"):
            self._adaptive_options = {}
        self._adaptive_options.update(kwargs)
        self._adaptive = Adaptive(self, **self._adaptive_options)
        return self._adaptive

    def scale(self, n: int) -> None:
        """Scale cluster to n workers

        Parameters
        ----------
        n : int
            Target number of workers

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses implement scaling.

        Examples
        --------
        >>> cluster.scale(10)  # scale cluster to ten workers
        """
        raise NotImplementedError()

    def _log(self, log):
        """Log a message.

        Output a message to the user and also store for future retrieval.

        For use in subclasses where initialisation may take a while and it would
        be beneficial to feed back to the user.

        Examples
        --------
        >>> self._log("Submitted job X to batch scheduler")
        """
        self._cluster_manager_logs.append((datetime.datetime.now(), log))
        if not self.quiet:
            print(log)

    async def _get_logs(self, cluster=True, scheduler=True, workers=True):
        """Asynchronous implementation of ``get_logs``"""
        logs = Logs()

        if cluster:
            logs["Cluster"] = Log(
                "\n".join(line[1] for line in self._cluster_manager_logs)
            )

        if scheduler:
            L = await self.scheduler_comm.get_logs()
            logs["Scheduler"] = Log("\n".join(line for level, line in L))

        if workers:
            # ``True`` means "all workers", requested by passing ``None``
            if workers is True:
                workers = None
            d = await self.scheduler_comm.worker_logs(workers=workers)
            for k, v in d.items():
                logs[k] = Log("\n".join(line for level, line in v))

        return logs

    def get_logs(self, cluster=True, scheduler=True, workers=True):
        """Return logs for the cluster, scheduler and workers

        Parameters
        ----------
        cluster : boolean
            Whether or not to collect logs for the cluster manager
        scheduler : boolean
            Whether or not to collect logs for the scheduler
        workers : boolean or Iterable[str], optional
            A list of worker addresses to select.
            Defaults to all workers if `True` or no workers if `False`

        Returns
        -------
        logs: Dict[str]
            A dictionary of logs, with one item for the scheduler and one for
            each worker
        """
        return self.sync(
            self._get_logs, cluster=cluster, scheduler=scheduler, workers=workers
        )

    @_deprecated(use_instead="get_logs")
    def logs(self, *args, **kwargs):
        return self.get_logs(*args, **kwargs)

    @property
    def dashboard_link(self):
        """Dashboard link, or ``""`` when no dashboard service is known"""
        try:
            port = self.scheduler_info["services"]["dashboard"]
        except KeyError:
            return ""
        else:
            host = self.scheduler_address.split("://")[1].split("/")[0].split(":")[0]
            return format_dashboard_link(host, port)

    def _scaling_status(self):
        """Return an HTML table with the scaling mode and worker counts"""
        if self._adaptive and self._adaptive.periodic_callback:
            mode = "Adaptive"
        else:
            mode = "Manual"
        workers = len(self.scheduler_info["workers"])
        if hasattr(self, "worker_spec"):
            # A spec entry with a "group" counts once per group member
            requested = sum(
                1 if "group" not in each else len(each["group"])
                for each in self.worker_spec.values()
            )
        elif hasattr(self, "workers"):
            requested = len(self.workers)
        else:
            requested = workers

        worker_count = workers if workers == requested else f"{workers} / {requested}"
        return f"""
        <table>
            <tr><td style="text-align: left;">Scaling mode: {mode}</td></tr>
            <tr><td style="text-align: left;">Workers: {worker_count}</td></tr>
        </table>
        """

    def _widget(self):
        """Create IPython widget for display within a notebook

        The widget is cached on ``self._cached_widget``; ``None`` is cached
        and returned when ``ipywidgets`` cannot be imported.
        """
        try:
            return self._cached_widget
        except AttributeError:
            pass

        try:
            from ipywidgets import (
                HTML,
                Accordion,
                Button,
                HBox,
                IntText,
                Layout,
                Tab,
                VBox,
            )
        except ImportError:
            self._cached_widget = None
            return None

        layout = Layout(width="150px")

        status = HTML(self._repr_html_())

        if self._supports_scaling:
            request = IntText(0, description="Workers", layout=layout)
            scale = Button(description="Scale", layout=layout)

            minimum = IntText(0, description="Minimum", layout=layout)
            maximum = IntText(0, description="Maximum", layout=layout)
            adapt = Button(description="Adapt", layout=layout)

            accordion = Accordion(
                [HBox([request, scale]), HBox([minimum, maximum, adapt])],
                layout=Layout(min_width="500px"),
            )
            accordion.selected_index = None
            accordion.set_title(0, "Manual Scaling")
            accordion.set_title(1, "Adaptive Scaling")

            def adapt_cb(b):
                self.adapt(minimum=minimum.value, maximum=maximum.value)
                update()

            adapt.on_click(adapt_cb)

            def scale_cb(b):
                with log_errors():
                    n = request.value
                    # Manual scaling switches adaptive mode off
                    with suppress(AttributeError):
                        self._adaptive.stop()
                    self.scale(n)
                    update()

            scale.on_click(scale_cb)
        else:  # pragma: no cover
            accordion = HTML("")

        scale_status = HTML(self._scaling_status())

        tab = Tab()
        tab.children = [status, VBox([scale_status, accordion])]
        tab.set_title(0, "Status")
        tab.set_title(1, "Scaling")

        self._cached_widget = tab

        def update():
            status.value = self._repr_html_()
            scale_status.value = self._scaling_status()

        # NOTE(review): ``default="ms"`` is passed to ``dask.config.get`` (the
        # fallback returned when the key is missing), not to
        # ``parse_timedelta`` - confirm this is intended.
        cluster_repr_interval = parse_timedelta(
            dask.config.get("distributed.deploy.cluster-repr-interval", default="ms")
        )
        pc = PeriodicCallback(update, cluster_repr_interval * 1000)
        self.periodic_callbacks["cluster-repr"] = pc
        pc.start()

        return tab

    def _repr_html_(self, cluster_status=None):
        """Render the ``cluster.html.j2`` template for this cluster"""
        try:
            scheduler_info_repr = self.scheduler_info._repr_html_()
        except AttributeError:
            # ``scheduler_info`` is still the plain dict set in ``__init__``
            scheduler_info_repr = "Scheduler not started yet."

        return get_template("cluster.html.j2").render(
            type=type(self).__name__,
            name=self.name,
            workers=self.scheduler_info["workers"],
            dashboard_link=self.dashboard_link,
            scheduler_info_repr=scheduler_info_repr,
            cluster_status=cluster_status,
        )

    def _ipython_display_(self, **kwargs):
        """Display the widget, falling back to plain text/HTML reprs"""
        widget = self._widget()
        if widget is not None:
            return widget._ipython_display_(**kwargs)
        else:
            from IPython.display import display

            data = {"text/plain": repr(self), "text/html": self._repr_html_()}
            display(data, raw=True)

    def __enter__(self):
        return self.sync(self.__aenter__)

    def __exit__(self, typ, value, traceback):
        return self.sync(self.__aexit__, typ, value, traceback)

    async def __aenter__(self):
        await self
        return self

    async def __aexit__(self, typ, value, traceback):
        f = self.close()
        if isawaitable(f):
            await f

    @property
    def scheduler_address(self) -> str:
        """Scheduler address, or ``"<Not Connected>"`` without a comm"""
        if not self.scheduler_comm:
            return "<Not Connected>"
        return self.scheduler_comm.address

    @property
    def _cluster_class_name(self):
        """``self._name`` when set, otherwise the class name"""
        return getattr(self, "_name", type(self).__name__)

    def __repr__(self):
        text = "%s(%s, %r, workers=%d, threads=%d" % (
            self._cluster_class_name,
            self.name,
            self.scheduler_address,
            len(self.scheduler_info["workers"]),
            sum(w["nthreads"] for w in self.scheduler_info["workers"].values()),
        )

        memory = [w["memory_limit"] for w in self.scheduler_info["workers"].values()]
        # Memory is only reported when every worker has a truthy limit
        if all(memory):
            text += ", memory=" + format_bytes(sum(memory))

        text += ")"
        return text

    @property
    def plan(self):
        return set(self.workers)

    @property
    def requested(self):
        return set(self.workers)

    @property
    def observed(self):
        """Names of the workers currently listed in ``scheduler_info``"""
        return {d["name"] for d in self.scheduler_info["workers"].values()}

    def __eq__(self, other):
        # Equality is by exact type and cluster name
        return type(other) is type(self) and self.name == other.name

    def __hash__(self):
        # Identity-based hash (kept as in the original implementation)
        return id(self)