def worker_core(self, ident, i):
    """ Manage one core on one distributed worker node

    This coroutine listens on worker_queue for the following operations

    **Incoming Messages**:

    - compute-task: call worker.compute(...) on remote node, report when done
    - close: close connection to worker node, report `worker-finished` to
      scheduler

    **Outgoing Messages**:

    - task-finished: sent to scheduler once a task completes
    - task-erred: sent to scheduler when a task errs
    - worker-finished: sent to scheduler in response to a close command
    """
    worker = rpc(ip=ident[0], port=ident[1])
    logger.debug("Start worker core %s, %d", ident, i)

    while True:
        msg = yield self.worker_queues[ident].get()
        if msg['op'] == 'close':
            logger.debug("Worker core receives close message %s, %s",
                         ident, msg)
            break
        if msg['op'] == 'compute-task':
            key = msg['key']
            needed = msg['needed']
            task = msg['task']
            if not istask(task):
                response, content = yield worker.update_data(data={key: task})
                assert response == b'OK', response
                nbytes = content['nbytes'][key]
            else:
                response, content = yield worker.compute(function=execute_task,
                                                         args=(task,),
                                                         needed=needed,
                                                         key=key,
                                                         kwargs={})
                if response == b'OK':
                    nbytes = content['nbytes']
            logger.debug("Compute response from worker %s, %s, %s, %s",
                         ident, key, response, content)
            if response == b'error':
                error, traceback = content
                self.mark_task_erred(key, ident, error, traceback)
            elif response == b'missing-data':
                self.mark_missing_data(content.args, key=key, worker=ident)
            else:
                self.mark_task_finished(key, ident, nbytes)

    yield worker.close(close=True)
    worker.close_streams()
    if msg.get('report', True):
        self.put({'op': 'worker-finished',
                  'worker': ident})
    logger.debug("Close worker core, %s, %d", ident, i)
def worker_core(scheduler_queue, worker_queue, ident, i):
    """ Manage one core on one distributed worker node

    This coroutine listens on worker_queue for the following operations

    **Incoming Messages**:

    - compute-task: call worker.compute(...) on remote node, report when done
    - close: close connection to worker node, report `worker-finished` to
      scheduler

    **Outgoing Messages**:

    - task-finished: sent to scheduler once a task completes
    - task-erred: sent to scheduler when a task errs
    - worker-finished: sent to scheduler in response to a close command
    """
    worker = rpc(ip=ident[0], port=ident[1])
    logger.debug("Start worker core %s, %d", ident, i)

    while True:
        msg = yield worker_queue.get()
        if msg['op'] == 'close':
            break
        if msg['op'] == 'compute-task':
            key = msg['key']
            needed = msg['needed']
            task = msg['task']
            if not istask(task):
                response = yield worker.update_data(data={key: task})
                assert response == b'OK', response
            else:
                response = yield worker.compute(function=_execute_task,
                                                args=(task, {}),
                                                needed=needed,
                                                key=key,
                                                kwargs={})
            if response == b'error':
                err = yield worker.get_data(keys=[key])
                scheduler_queue.put_nowait({'op': 'task-erred',
                                            'key': key,
                                            'worker': ident,
                                            'exception': err[key]})
            elif isinstance(response, KeyError):
                scheduler_queue.put_nowait({'op': 'task-missing-data',
                                            'key': key,
                                            'worker': ident,
                                            'missing': response.args})
            else:
                scheduler_queue.put_nowait({'op': 'task-finished',
                                            'worker': ident,
                                            'key': key})

    yield worker.close(close=True)
    worker.close_streams()
    scheduler_queue.put_nowait({'op': 'worker-finished',
                                'worker': ident})
    logger.debug("Close worker core, %s, %d", ident, i)
def execute_task(task):
    """ Evaluate a nested task """
    if istask(task):
        func, args = task[0], task[1:]
        return func(*map(execute_task, args))
    else:
        return task
def _bottom_up(net, term):
    if not istask(term):
        return net._rewrite(term)
    else:
        new_args = tuple(_bottom_up(net, t) for t in args(term))
        new_term = (head(term),) + new_args
        return net._rewrite(new_term)
def dumps_task(task):
    """ Serialize a dask task

    Returns a dict of bytestrings that can each be loaded with ``loads``

    Examples
    --------
    Either returns a task as a function, args, kwargs dict

    >>> from operator import add
    >>> dumps_task((add, 1))  # doctest: +SKIP
    {'function': b'\x80\x04\x95\x00\x8c\t_operator\x94\x8c\x03add\x94\x93\x94.',
     'args': b'\x80\x04\x95\x07\x00\x00\x00K\x01K\x02\x86\x94.'}

    Or as a single task blob if it can't easily decompose the result.  This
    happens either if the task is highly nested, or if it isn't a task at
    all

    >>> dumps_task(1)  # doctest: +SKIP
    {'task': b'\x80\x04\x95\x03\x00\x00\x00\x00\x00\x00\x00K\x01.'}
    """
    if istask(task):
        if task[0] is apply and not any(map(_maybe_complex, task[2:])):
            d = {'function': dumps_function(task[1]),
                 'args': dumps(task[2])}
            if len(task) == 4:
                d['kwargs'] = dumps(task[3])
            return d
        elif not any(map(_maybe_complex, task[1:])):
            return {'function': dumps_function(task[0]),
                    'args': dumps(task[1:])}
    return {'task': dumps(task)}
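# A hedged usage sketch (not from the source): per the branches above, a
# flat task decomposes into separate 'function' and 'args' blobs, while a
# nested task (one whose arguments contain another task) falls back to a
# single 'task' blob. Assumes `dumps`/`dumps_function` are pickle-style
# serializers, as the docstring's output suggests.
from operator import add

assert set(dumps_task((add, 1, 2))) == {'function', 'args'}
assert set(dumps_task((add, (add, 1, 2), 3))) == {'task'}  # nested -> blob
assert set(dumps_task(1)) == {'task'}                      # literal -> blob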
def to_networkx(d, data_attributes=None, function_attributes=None):
    if data_attributes is None:
        data_attributes = dict()
    if function_attributes is None:
        function_attributes = dict()

    g = nx.DiGraph()

    for k, v in sorted(d.items(), key=lambda x: x[0]):
        g.add_node(k, shape='box', **data_attributes.get(k, dict()))
        if istask(v):
            func, args = v[0], v[1:]
            func_node = make_hashable((v, 'function'))
            g.add_node(func_node,
                       shape='circle',
                       label=name(func),
                       **function_attributes.get(k, dict()))
            g.add_edge(func_node, k)
            for dep in sorted(get_dependencies(d, k)):
                arg2 = make_hashable(dep)
                g.add_node(arg2,
                           label=str(dep),
                           shape='box',
                           **data_attributes.get(dep, dict()))
                g.add_edge(arg2, func_node)
        else:
            v_hash = make_hashable(v)
            if v_hash not in d:
                g.add_node(k, label='%s=%s' % (k, v),
                           **data_attributes.get(k, dict()))
            else:  # alias situation
                g.add_edge(v_hash, k)

    return g
def head(task):
    """Return the top level node of a task"""
    if istask(task):
        return task[0]
    elif isinstance(task, list):
        return list
    else:
        return task
def args(task):
    """Get the arguments for the current task"""
    if istask(task):
        return task[1:]
    elif isinstance(task, list):
        return task
    else:
        return ()
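# A hypothetical illustration of how head/args decompose the three term
# shapes handled above (tasks, lists, and literals); assumes dask's istask
# semantics for the task case.
inc = lambda x: x + 1

assert head((inc, 1)) == inc and args((inc, 1)) == (1,)   # task
assert head([1, 2]) is list and args([1, 2]) == [1, 2]    # list
assert head(5) == 5 and args(5) == ()                     # literal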
def execute_task(task):
    """ Evaluate a nested task

    >>> inc = lambda x: x + 1
    >>> execute_task((inc, 1))
    2

    >>> execute_task((sum, [1, 2, (inc, 3)]))
    7
    """
    if istask(task):
        func, args = task[0], task[1:]
        return func(*map(execute_task, args))
    elif isinstance(task, list):
        return list(map(execute_task, task))
    else:
        return task
def test_istask():
    assert istask((inc, 1))
    assert not istask(1)
    assert not istask((1, 2))
    f = namedtuple('f', ['x', 'y'])
    assert not istask(f(sum, 2))
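# For reference, a minimal sketch of the `istask` predicate these tests
# exercise: a non-empty tuple whose first element is callable. The
# exact-type check is why the namedtuple case above is rejected even
# though namedtuples subclass tuple.
def istask(x):
    return type(x) is tuple and x and callable(x[0])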
def _execute_task(task: tuple):
    if not istask(task):
        return _get_arg(task)
    return spawn(task[0], args=tuple(_get_arg(a) for a in task[1:]))
def worker_core(self, ident, i):
    """ Manage one core on one distributed worker node

    This coroutine listens on worker_queue for the following operations

    **Incoming Messages**:

    - compute-task: call worker.compute(...) on remote node, report when done
    - close: close connection to worker node, report `worker-finished` to
      scheduler

    See Also
    --------
    Scheduler.mark_task_finished
    Scheduler.mark_task_erred
    Scheduler.mark_missing_data
    distributed.worker.Worker.compute
    """
    worker = rpc(ip=ident[0], port=ident[1])
    logger.debug("Start worker core %s, %d", ident, i)

    while True:
        msg = yield self.worker_queues[ident].get()
        if msg["op"] == "close":
            logger.debug("Worker core receives close message %s, %s",
                         ident, msg)
            break
        if msg["op"] == "compute-task":
            key = msg["key"]
            who_has = msg["who_has"]
            task = msg["task"]
            if not istask(task):
                response, content = yield worker.update_data(
                    data={key: task}, report=self.center is not None)
                assert response == b"OK", response
                nbytes = content["nbytes"][key]
            else:
                response, content = yield worker.compute(
                    function=execute_task,
                    args=(task,),
                    who_has=who_has,
                    key=key,
                    kwargs={},
                    report=self.center is not None,
                )
                if response == b"OK":
                    nbytes = content["nbytes"]
            logger.debug("Compute response from worker %s, %s, %s, %s",
                         ident, key, response, content)
            if response == b"error":
                error, traceback = content
                self.mark_task_erred(key, ident, error, traceback)
            elif response == b"missing-data":
                self.mark_missing_data(content.args, key=key, worker=ident)
            else:
                self.mark_task_finished(key, ident, nbytes)

    yield worker.close(close=True)
    worker.close_streams()
    if msg.get("report", True):
        self.put({"op": "worker-finished", "worker": ident})
    logger.debug("Close worker core, %s, %d", ident, i)
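# Hypothetical message shapes, read off the coroutine above: the scheduler
# feeds worker_queues[ident] with 'compute-task' messages and a final
# 'close' message. The who_has value is assumed to map dependency keys to
# the workers holding them; the key names and 'report' flag are taken from
# this variant's code.
inc = lambda x: x + 1
compute_msg = {'op': 'compute-task',
               'key': 'x',
               'task': (inc, 1),            # or a literal, sent via update_data
               'who_has': {'x': set()}}     # assumed key -> worker-address map
close_msg = {'op': 'close', 'report': True}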
def to_graphviz(
    dsk,
    data_attributes=None,
    function_attributes=None,
    rankdir="BT",
    graph_attr=None,
    node_attr=None,
    edge_attr=None,
    collapse_outputs=False,
    verbose=False,
    **kwargs,
):
    data_attributes = data_attributes or {}
    function_attributes = function_attributes or {}
    graph_attr = graph_attr or {}
    node_attr = node_attr or {}
    edge_attr = edge_attr or {}
    graph_attr["rankdir"] = rankdir
    node_attr["fontname"] = "helvetica"
    graph_attr.update(kwargs)
    g = graphviz.Digraph(
        graph_attr=graph_attr, node_attr=node_attr, edge_attr=edge_attr
    )

    seen = set()
    connected = set()

    for k, v in dsk.items():
        k_name = name(k)
        if istask(v):
            func_name = name((k, "function")) if not collapse_outputs else k_name
            if collapse_outputs or func_name not in seen:
                seen.add(func_name)
                attrs = function_attributes.get(k, {}).copy()
                attrs.setdefault("label", key_split(k))
                attrs.setdefault("shape", "circle")
                g.node(func_name, **attrs)
            if not collapse_outputs:
                g.edge(func_name, k_name)
                connected.add(func_name)
                connected.add(k_name)

            for dep in get_dependencies(dsk, k):
                dep_name = name(dep)
                if dep_name not in seen:
                    seen.add(dep_name)
                    attrs = data_attributes.get(dep, {}).copy()
                    attrs.setdefault("label", box_label(dep, verbose))
                    attrs.setdefault("shape", "box")
                    g.node(dep_name, **attrs)
                g.edge(dep_name, func_name)
                connected.add(dep_name)
                connected.add(func_name)

        elif ishashable(v) and v in dsk:
            v_name = name(v)
            g.edge(v_name, k_name)
            connected.add(v_name)
            connected.add(k_name)

        if (not collapse_outputs or k_name in connected) and k_name not in seen:
            seen.add(k_name)
            attrs = data_attributes.get(k, {}).copy()
            attrs.setdefault("label", box_label(k, verbose))
            attrs.setdefault("shape", "box")
            g.node(k_name, **attrs)

    return g
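# A brief usage sketch, assuming the standard `graphviz` Python package:
# the returned Digraph can be rendered to a file or piped to raw bytes.
dsk = {'a': 1, 'b': (sum, ['a', 'a'])}
g = to_graphviz(dsk, rankdir="LR")
g.render('dask-graph', format='png', cleanup=True)  # writes dask-graph.png
png_bytes = g.pipe(format='png')                    # in-memory rendering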
def pprint_task(task, keys, label_size=60):
    """Return a nicely formatted string for a task.

    Parameters
    ----------
    task:
        Value within dask graph to render as text
    keys: iterable
        List of keys within dask graph
    label_size: int (optional)
        Maximum size of output label, defaults to 60

    Examples
    --------
    >>> from operator import add, mul
    >>> dsk = {'a': 1,
    ...        'b': 2,
    ...        'c': (add, 'a', 'b'),
    ...        'd': (add, (mul, 'a', 'b'), 'c'),
    ...        'e': (sum, ['a', 'b', 5]),
    ...        'f': (add,),
    ...        'g': []}

    >>> pprint_task(dsk['c'], dsk)
    'add(_, _)'
    >>> pprint_task(dsk['d'], dsk)
    'add(mul(_, _), _)'
    >>> pprint_task(dsk['e'], dsk)
    'sum([_, _, *])'
    >>> pprint_task(dsk['f'], dsk)
    'add()'
    >>> pprint_task(dsk['g'], dsk)
    '[]'
    """
    if istask(task):
        func = task[0]
        if func is apply:
            head = funcname(task[1])
            tail = ")"
            args = unquote(task[2]) if len(task) > 2 else ()
            kwargs = unquote(task[3]) if len(task) > 3 else {}
        else:
            if hasattr(func, "funcs"):
                head = "(".join(funcname(f) for f in func.funcs)
                tail = ")" * len(func.funcs)
            else:
                head = funcname(task[0])
                tail = ")"
            args = task[1:]
            kwargs = {}
        if args or kwargs:
            label_size2 = int(
                (label_size - len(head) - len(tail)) // (len(args) + len(kwargs))
            )
            pprint = lambda t: pprint_task(t, keys, label_size2)
        if args:
            if label_size2 > 5:
                args = ", ".join(pprint(t) for t in args)
            else:
                args = "..."
        else:
            args = ""
        if kwargs:
            if label_size2 > 5:
                kwargs = ", " + ", ".join(
                    f"{k}={pprint(v)}" for k, v in sorted(kwargs.items())
                )
            else:
                kwargs = ", ..."
        else:
            kwargs = ""
        return f"{head}({args}{kwargs}{tail}"
    elif isinstance(task, list):
        if not task:
            return "[]"
        elif len(task) > 3:
            result = pprint_task(task[:3], keys, label_size)
            return result[:-1] + ", ...]"
        else:
            label_size2 = int((label_size - 2 - 2 * len(task)) // len(task))
            args = ", ".join(pprint_task(t, keys, label_size2) for t in task)
            return f"[{args}]"
    else:
        try:
            if task in keys:
                return "_"
            else:
                return "*"
        except TypeError:
            return "*"
def test_istask():
    assert istask((inc, 1))
    assert not istask(1)
    assert not istask((1, 2))
def _bottom_up(net, term):
    if istask(term):
        term = (head(term),) + tuple(_bottom_up(net, t) for t in args(term))
    elif isinstance(term, list):
        term = [_bottom_up(net, t) for t in args(term)]
    return net._rewrite(term)
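# A usage sketch of the bottom-up traversal, assuming the public
# dask.rewrite API that wraps _bottom_up. RuleSet.rewrite defaults to the
# bottom-up strategy, so inner subterms are rewritten before their parents.
from operator import add, mul

from dask.rewrite import RewriteRule, RuleSet

rs = RuleSet(RewriteRule((add, 'x', 'x'), (mul, 'x', 2), ('x',)))
# Children rewrite first, then the parent matches with x = (mul, 2, 2):
assert rs.rewrite((add, (add, 2, 2), (add, 2, 2))) == (mul, (mul, 2, 2), 2)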
def _rayify_task(
    task,
    key,
    deps,
    ray_presubmit_cbs,
    ray_postsubmit_cbs,
    ray_pretask_cbs,
    ray_posttask_cbs,
):
    """
    Rayifies the given task, submitting it as a Ray task to the Ray cluster.

    Args:
        task (tuple): A Dask graph value, being either a literal, dependency
            key, Dask task, or a list thereof.
        key (str): The Dask graph key for the given task.
        deps (dict): The dependencies of this task.
        ray_presubmit_cbs (callable): Pre-task submission callbacks.
        ray_postsubmit_cbs (callable): Post-task submission callbacks.
        ray_pretask_cbs (callable): Pre-task execution callbacks.
        ray_posttask_cbs (callable): Post-task execution callbacks.

    Returns:
        A literal, a Ray object reference representing a submitted task, or a
        list thereof.
    """
    if isinstance(task, list):
        # Recursively rayify this list. This will still bottom out at the
        # first actual task encountered, inlining any tasks in that task's
        # arguments.
        return [
            _rayify_task(
                t,
                key,
                deps,
                ray_presubmit_cbs,
                ray_postsubmit_cbs,
                ray_pretask_cbs,
                ray_posttask_cbs,
            )
            for t in task
        ]
    elif istask(task):
        # Unpacks and repacks Ray object references and submits the task to
        # the Ray cluster for execution.
        if ray_presubmit_cbs is not None:
            alternate_returns = [cb(task, key, deps) for cb in ray_presubmit_cbs]
            for alternate_return in alternate_returns:
                # We don't submit a Ray task if a presubmit callback returns
                # a non-`None` value; instead we return said value.
                # NOTE: This returns the first non-None presubmit callback
                # return value.
                if alternate_return is not None:
                    return alternate_return

        func, args = task[0], task[1:]
        if func is multiple_return_get:
            return _execute_task(task, deps)
        # If the function's arguments contain nested object references, we
        # must unpack said object references into a flat set of arguments so
        # that Ray properly tracks the object dependencies between Ray tasks.
        arg_object_refs, repack = unpack_object_refs(args, deps)
        # Submit the task using a wrapper function.
        object_refs = dask_task_wrapper.options(
            name=f"dask:{key!s}",
            num_returns=(
                1 if not isinstance(func, MultipleReturnFunc) else func.num_returns
            ),
        ).remote(
            func,
            repack,
            key,
            ray_pretask_cbs,
            ray_posttask_cbs,
            *arg_object_refs,
        )
        if ray_postsubmit_cbs is not None:
            for cb in ray_postsubmit_cbs:
                cb(task, key, deps, object_refs)
        return object_refs
    elif not ishashable(task):
        return task
    elif task in deps:
        return deps[task]
    else:
        return task