def _widget(self):
    """ Create an IPython widget for display within a notebook """
    try:
        return self._cached_widget
    except AttributeError:
        pass

    from ipywidgets import Layout, VBox, HBox, IntText, Button, HTML

    client = self._dask_client()

    layout = Layout(width='150px')

    title = HTML('<h2>YarnCluster</h2>')

    status = HTML(self._widget_status(), layout=Layout(min_width='150px'))

    request = IntText(0, description='Workers', layout=layout)
    scale = Button(description='Scale', layout=layout)

    @scale.on_click
    def scale_cb(b):
        with log_errors():
            self.scale(request.value)

    elements = [title, HBox([status, request, scale])]

    if self.dashboard_link is not None:
        link = HTML('<p><b>Dashboard: </b><a href="%s" target="_blank">%s'
                    '</a></p>\n' % (self.dashboard_link, self.dashboard_link))
        elements.append(link)

    self._cached_widget = box = VBox(elements)

    def update():
        status.value = self._widget_status()

    pc = PeriodicCallback(update, 500, io_loop=client.loop)
    pc.start()

    return box
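
# Usage sketch (assumption: as with other dask cluster objects, YarnCluster
# exposes this widget through ``_ipython_display_``, so evaluating the cluster
# object in a notebook cell is enough to render it):
#
#     from dask_yarn import YarnCluster  # import path assumed
#     cluster = YarnCluster()
#     cluster  # renders the status table, a "Workers"/"Scale" row, and the
#              # dashboard link when one is available
#
# Pressing "Scale" simply invokes ``self.scale(request.value)``, so it is
# equivalent to calling ``cluster.scale(n)`` from code.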
class DRMAACluster(Cluster):
    def __init__(self, template=None, cleanup_interval=1000, hostname=None,
                 script=None, preexec_commands=(), copy_script=True,
                 ip='', **kwargs):
        """ Dask workers launched by a DRMAA-compatible cluster

        Parameters
        ----------
        template: dict
            Dictionary specifying options to pass to the DRMAA cluster and
            the worker.  Relevant items are:

            jobName: string
                Name of the job as known by the DRMAA cluster.
            args: list
                Extra string arguments to pass to dask-worker
            outputPath: string
                Path to the dask-worker stdout.  Must start with ':'.
                Defaults to worker.JOBID.TASKID.out in the current directory.
            errorPath: string
                Path to the dask-worker stderr.  Must start with ':'.
                Defaults to worker.JOBID.TASKID.err in the current directory.
            workingDirectory: string
                Where dask-worker runs, defaults to the current directory
            nativeSpecification: string
                Options native to the job scheduler
        cleanup_interval: int
            Time interval in milliseconds between checks for closed workers.
            Defaults to 1000
        hostname: string
            Host on which to start the local scheduler, defaults to localhost
        script: string (optional)
            Path to the dask-worker executable script.  A temporary file will
            be made if none is provided (recommended)
        preexec_commands: tuple (optional)
            Commands to be executed first by the temporary script.
            Cannot be specified at the same time as script.
        copy_script: bool
            Whether to copy the passed script to the current working
            directory.  This is primarily to work around an issue with SGE.
        ip: string
            IP of the scheduler; defaults to the empty string, which listens
            on the host's primary IP address
        **kwargs:
            Additional keyword arguments to be passed to the local scheduler

        Examples
        --------
        >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
        >>> cluster = DRMAACluster()                     # doctest: +SKIP
        >>> cluster.start_workers(10)                    # doctest: +SKIP

        >>> from distributed import Client               # doctest: +SKIP
        >>> client = Client(cluster)                     # doctest: +SKIP

        >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
        >>> future.result()                              # doctest: +SKIP
        11
        """
        self.hostname = hostname or socket.gethostname()
        logger.info("Start local scheduler at %s", self.hostname)
        self.local_cluster = LocalCluster(n_workers=0, ip=ip, **kwargs)

        if script is None:
            # No script provided: generate a temporary worker script and
            # clean it up at interpreter exit
            fn = os.path.abspath(tempfile.mktemp(
                suffix='.sh',
                prefix='dask-worker-script-',
                dir=os.path.curdir,
            ))
            self.script = fn
            self._should_cleanup_script = True

            script_contents = make_job_script(executable=worker_bin_path,
                                              name='%s.%s' % (JOB_ID, TASK_ID),
                                              preexec=preexec_commands)
            with open(fn, 'wt') as f:
                f.write(script_contents)

            @atexit.register
            def remove_script():
                if os.path.exists(fn):
                    os.remove(fn)

            os.chmod(self.script, 0o777)
        else:
            self._should_cleanup_script = False
            if copy_script:
                with ignoring(EnvironmentError):  # may be in the same path
                    shutil.copy(script, os.path.curdir)  # python 2.x returns None
                    script = os.path.join(os.path.curdir, os.path.basename(script))
                    self._should_cleanup_script = True
            self.script = os.path.abspath(script)
            assert not preexec_commands, "Cannot specify both script and preexec_commands"
            # TODO: check that user-provided script is executable

        self.template = merge(default_template,
                              {'remoteCommand': self.script},
                              template or {})

        self._cleanup_callback = PeriodicCallback(callback=self.cleanup_closed_workers,
                                                  callback_time=cleanup_interval,
                                                  io_loop=self.scheduler.loop)
        self._cleanup_callback.start()

        self.workers = {}  # {job-id: WorkerSpec}

    def adapt(self, **kwargs):
        """ Turn on adaptivity

        For keyword arguments see dask_drmaa.adaptive.Adaptive

        Examples
        --------
        >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')  # doctest: +SKIP

        See Also
        --------
        Cluster: an interface for other clusters to inherit from
        """
        from .adaptive import Adaptive
        with ignoring(AttributeError):
            self._adaptive.stop()
        if not hasattr(self, '_adaptive_options'):
            self._adaptive_options = {}
        self._adaptive_options.update(kwargs)
        self._adaptive = Adaptive(
            self, self.scheduler, **self._adaptive_options
        )
        return self._adaptive

    @gen.coroutine
    def _start(self):
        pass

    @property
    def scheduler(self):
        return self.local_cluster.scheduler

    def create_job_template(self, **kwargs):
        template = self.template.copy()
        if kwargs:
            template.update(kwargs)
        template['args'] = [self.scheduler_address] + template['args']

        jt = get_session().createJobTemplate()
        valid_attributes = dir(jt)

        for key, value in template.items():
            if key not in valid_attributes:
                raise ValueError("Invalid job template attribute %s" % key)
            setattr(jt, key, value)

        return jt

    def start_workers(self, n=1, **kwargs):
        if n == 0:
            return

        with log_errors():
            with self.create_job_template(**kwargs) as jt:
                ids = get_session().runBulkJobs(jt, 1, n, 1)
                logger.info("Start %d workers. Job ID: %s", len(ids),
                            ids[0].split('.')[0])
                self.workers.update(
                    {jid: WorkerSpec(job_id=jid, kwargs=kwargs,
                                     stdout=worker_out_path_template % dict(jid=jid, ext='out'),
                                     stderr=worker_out_path_template % dict(jid=jid, ext='err'))
                     for jid in ids})

    @gen.coroutine
    def stop_workers(self, worker_ids, sync=False):
        if isinstance(worker_ids, str):
            worker_ids = [worker_ids]
        elif worker_ids:
            worker_ids = list(worker_ids)
        else:
            return

        # Let the scheduler gracefully retire workers first
        ids_to_ips = {
            v['name']: k for k, v in self.scheduler.worker_info.items()
        }
        worker_ips = [ids_to_ips[wid]
                      for wid in worker_ids
                      if wid in ids_to_ips]
        retired = yield self.scheduler.retire_workers(workers=worker_ips,
                                                      close_workers=True)
        logger.info("Retired workers %s", retired)

        for wid in list(worker_ids):
            try:
                get_session().control(wid, drmaa.JobControlAction.TERMINATE)
            except drmaa.errors.InvalidJobException:
                pass
            try:
                self.workers.pop(wid)
            except KeyError:
                # If we have multiple callers at once, it may have already
                # been popped off
                pass

        logger.info("Stop workers %s", worker_ids)
        if sync:
            get_session().synchronize(worker_ids, dispose=True)

    @gen.coroutine
    def scale_up(self, n, **kwargs):
        yield [self.start_workers(**kwargs)
               for _ in range(n - len(self.workers))]

    @gen.coroutine
    def scale_down(self, workers):
        workers = set(workers)
        yield self.scheduler.retire_workers(workers=workers)

    def close(self):
        logger.info("Closing DRMAA cluster")
        self.stop_workers(self.workers, sync=True)

        self.local_cluster.close()
        if self._should_cleanup_script and os.path.exists(self.script):
            os.remove(self.script)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def cleanup_closed_workers(self):
        for jid in list(self.workers):
            if get_session().jobStatus(jid) in ('closed', 'done'):
                logger.info("Removing closed worker %s", jid)
                del self.workers[jid]

    def __del__(self):
        try:  # best-effort cleanup; never raise from a finalizer
            self.close()
        except Exception:
            pass

    def __str__(self):
        return "<%s: %d workers>" % (self.__class__.__name__,
                                     len(self.workers))

    __repr__ = __str__
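
# Usage sketch tying the pieces together (mirrors the doctests in the class
# docstring; assumes a working DRMAA session is available on this host):
#
#     cluster = DRMAACluster()
#     cluster.start_workers(10)    # submits one bulk job, one task per worker
#     cluster.adapt(minimum=0, maximum=10, interval='500ms')  # or scale adaptively
#     ...
#     cluster.close()              # retires workers and removes the temp script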
def _widget(self):
    """ Create an IPython widget for display within a notebook """
    try:
        return self._cached_widget
    except AttributeError:
        pass

    from ipywidgets import (
        Layout,
        VBox,
        HBox,
        IntText,
        Button,
        HTML,
        Accordion,
        Text,
    )

    layout = Layout(width="150px")

    if "dashboard" in self.scheduler.services:
        link = self.dashboard_link
        link = '<p><b>Dashboard: </b><a href="%s" target="_blank">%s</a></p>\n' % (
            link,
            link,
        )
    else:
        link = ""

    title = "<h2>%s</h2>" % type(self).__name__
    title = HTML(title)
    dashboard = HTML(link)

    status = HTML(self._widget_status(), layout=Layout(min_width="150px"))

    request = IntText(0, description="Workers", layout=layout)
    scale = Button(description="Scale", layout=layout)
    request_cores = IntText(0, description="Cores", layout=layout)
    scale_cores = Button(description="Scale", layout=layout)
    request_memory = Text("0 GB", description="Memory", layout=layout)
    scale_memory = Button(description="Scale", layout=layout)

    minimum = IntText(0, description="Minimum", layout=layout)
    maximum = IntText(0, description="Maximum", layout=layout)
    adapt = Button(description="Adapt", layout=layout)
    minimum_cores = IntText(0, description="Min cores", layout=layout)
    maximum_cores = IntText(0, description="Max cores", layout=layout)
    adapt_cores = Button(description="Adapt", layout=layout)
    minimum_mem = Text("0 GB", description="Min memory", layout=layout)
    maximum_mem = Text("0 GB", description="Max memory", layout=layout)
    adapt_mem = Button(description="Adapt", layout=layout)

    scale_hbox = [HBox([request, scale])]
    adapt_hbox = [HBox([minimum, maximum, adapt])]
    if hasattr(self, "jobqueue_worker_spec"):
        # Only clusters with a worker spec can scale by cores or memory
        scale_hbox.append(HBox([request_cores, scale_cores]))
        scale_hbox.append(HBox([request_memory, scale_memory]))
        adapt_hbox.append(HBox([minimum_cores, maximum_cores, adapt_cores]))
        adapt_hbox.append(HBox([minimum_mem, maximum_mem, adapt_mem]))

    accordion = Accordion(
        [VBox(scale_hbox), VBox(adapt_hbox)], layout=Layout(min_width="500px")
    )
    accordion.selected_index = None
    accordion.set_title(0, "Manual Scaling")
    accordion.set_title(1, "Adaptive Scaling")

    box = VBox([title, HBox([status, accordion]), dashboard])

    self._cached_widget = box

    def adapt_cb(b):
        self.adapt(minimum=minimum.value, maximum=maximum.value)

    def adapt_cores_cb(b):
        self.adapt(minimum_cores=minimum_cores.value,
                   maximum_cores=maximum_cores.value)

    def adapt_mem_cb(b):
        self.adapt(minimum_memory=minimum_mem.value,
                   maximum_memory=maximum_mem.value)

    adapt.on_click(adapt_cb)
    adapt_cores.on_click(adapt_cores_cb)
    adapt_mem.on_click(adapt_mem_cb)

    def scale_cb(request, kwarg):
        def request_cb(b):
            with log_errors():
                arg = request.value
                with ignoring(AttributeError):
                    self._adaptive.stop()
                local_kwargs = dict()
                local_kwargs[kwarg] = arg
                self.scale(**local_kwargs)

        return request_cb

    scale.on_click(scale_cb(request, "n"))
    scale_cores.on_click(scale_cb(request_cores, "cores"))
    scale_memory.on_click(scale_cb(request_memory, "memory"))

    def update():
        status.value = self._widget_status()

    pc = PeriodicCallback(update, 500, io_loop=self.scheduler.loop)
    self.scheduler.periodic_callbacks["cluster-repr"] = pc
    pc.start()

    return box
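
# Usage sketch (assumption: when ``jobqueue_worker_spec`` is present,
# ``self.scale`` accepts ``cores=`` and ``memory=`` keyword arguments, which
# is exactly what the extra widget rows above rely on):
#
#     cluster.scale(n=4)              # what the "Workers" row's Scale button does
#     cluster.scale(cores=48)         # what the "Cores" row's Scale button does
#     cluster.scale(memory="200 GB")  # what the "Memory" row's Scale button does
#
# Each Scale button also stops any running ``self._adaptive`` loop first, so
# manual scaling takes precedence over a previous ``adapt()`` call.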