def _taskify(arg, dsk):
    """Turn ``arg`` into something that can be placed inside a task.

    Dask collections are run through ``finalize`` and the graph it returns is
    merged into ``dsk`` in place.  Every other value is passed through
    ``quote``.

    Parameters
    ----------
    arg : object
        The argument to convert.
    dsk : object with an ``update`` method
        Graph that receives the collection's graph (presumably a dict).

    Returns
    -------
    The first item returned by ``finalize(arg)`` for dask collections,
    otherwise ``quote(arg)``.
    """
    if not is_dask_collection(arg):
        return quote(arg)

    finalized, graph = finalize(arg)
    dsk.update(graph)
    return finalized
def test_quote():
    """Each quoted literal must come back unchanged when read from a graph."""
    literals = [
        [1, 2, 3],
        (add, 1, 2),
        [1, [2, 3]],
        (add, 1, (add, 2, 3)),
        {"x": "x"},
    ]
    for literal in literals:
        graph = {"x": quote(literal)}
        assert core.get(graph, "x") == literal
def fold(self, binop, combine=None, initial=no_default):
    """ Parallelizable reduction

    Fold is like the builtin function ``reduce`` except that it works in
    parallel.  Two binary operators are involved:

    1.  ``binop``: reduces the elements within each partition
    2.  ``combine``: reduces the per-partition results into one value

    Written sequentially, the computation is:

    >>> intermediates = [reduce(binop, part) for part in partitions]  # doctest: +SKIP
    >>> final = reduce(combine, intermediates)  # doctest: +SKIP

    When ``combine`` is omitted (or falsy), ``binop`` is used for both
    steps, as in this sum:

    >>> def add(x, y):
    ...     return x + y

    >>> b = from_sequence(range(5))
    >>> b.fold(add).compute()  # doctest: +SKIP
    10

    The full form supplies both operators and an initial value.  Note that
    ``initial`` is handed to the per-partition reductions only, not to the
    final ``combine`` step.

    >>> b.fold(binop=add, combine=add, initial=0).compute()  # doctest: +SKIP
    10

    More complex binary operators are also doable

    >>> def add_to_set(acc, x):
    ...     ''' Add new element x to set acc '''
    ...     return acc | set([x])
    >>> b.fold(add_to_set, set.union, initial=set()).compute()  # doctest: +SKIP
    {0, 1, 2, 3, 4}

    See Also
    --------

    Bag.foldby
    """
    partition_name = next(names)
    result_name = next(names)
    initial = quote(initial)

    # The initial value is appended to each partition task only when given.
    extra = () if initial is no_default else (initial,)
    per_partition = {
        (partition_name, i): (reduce, binop, (self.name, i)) + extra
        for i in range(self.npartitions)
    }

    final = {result_name: (reduce, combine or binop, list(per_partition))}
    return Item(merge(self.dask, per_partition, final), result_name)
def _parse_s3_URI(bucket_name, paths):
    """Split an ``s3://`` URI into its bucket name and path.

    Parameters
    ----------
    bucket_name : str
        URI that must start with ``s3://`` (checked with an ``assert``).
    paths : str
        Path pattern.  It is replaced by the path found in the URI only when
        it equals ``'*'`` and the URI carries a path other than ``/``.

    Returns
    -------
    tuple
        ``(bucket_name, paths)``: the unquoted host part of the URI and the
        (possibly replaced) path.
    """
    from ..compatibility import quote, unquote

    scheme = 's3://'
    assert bucket_name.startswith(scheme)

    # Everything after the scheme is quoted before parsing; the pieces that
    # are handed back are unquoted again.
    parsed = urlparse(scheme + quote(bucket_name[len(scheme):]))

    # if path is specified
    if paths == '*' and parsed.path not in ('', '/'):
        paths = unquote(parsed.path[1:])

    return unquote(parsed.hostname), paths
def _parse_s3_URI(bucket_name, paths):
    """Extract the bucket name and path from an ``s3://`` URI.

    Parameters
    ----------
    bucket_name : str
        URI beginning with ``s3://``; an ``assert`` enforces the prefix.
    paths : str
        Path pattern.  Only when this is ``"*"`` and the URI contains a path
        other than ``/`` is it overridden with the path from the URI.

    Returns
    -------
    tuple
        ``(bucket_name, paths)`` where ``bucket_name`` is the unquoted host
        component of the URI.
    """
    from ..compatibility import quote, unquote

    prefix = "s3://"
    assert bucket_name.startswith(prefix)

    remainder = bucket_name[len(prefix):]
    uri = urlparse(prefix + quote(remainder))

    # if path is specified
    uri_has_path = uri.path not in ("", "/")
    if paths == "*" and uri_has_path:
        paths = unquote(uri.path[1:])

    bucket = unquote(uri.hostname)
    return bucket, paths
def distinct(self):
    """ Distinct elements of collection

    Unordered without repeats.  Each partition is first turned into a
    ``set``; the sets are then merged with ``set.union`` into a single
    output partition.

    >>> b = from_sequence(['Alice', 'Bob', 'Alice'])
    >>> sorted(b.distinct())
    ['Alice', 'Bob']
    """
    partition_name = next(names)
    per_partition = {
        (partition_name, i): (set, key)
        for i, key in enumerate(self._keys())
    }

    union_name = next(names)
    # The list of partition keys is quoted before being used as the
    # argument list of ``apply``.
    union = {(union_name, 0): (apply, set.union, quote(list(per_partition)))}

    return type(self)(merge(self.dask, per_partition, union), union_name, 1)
def pluck(self, key, default=no_default):
    """ Select item from all tuples/dicts in collection

    Parameters
    ----------
    key : object
        Key or index to select from every element.  It is passed through
        ``quote`` before being placed in the graph.
    default : object, optional
        When given, it is forwarded as an extra argument to ``pluck`` for
        every partition (presumably the value used for elements that lack
        ``key`` -- confirm against the ``pluck`` helper).

    Returns
    -------
    A new collection of the same type and with the same number of
    partitions as ``self``.

    Examples
    --------
    >>> b = from_sequence([{'name': 'Alice', 'credits': [1, 2, 3]},
    ...                    {'name': 'Bob',   'credits': [10, 20]}])
    >>> list(b.pluck('name'))  # doctest: +SKIP
    ['Alice', 'Bob']
    >>> list(b.pluck('credits').pluck(0))  # doctest: +SKIP
    [1, 10]
    """
    name = next(names)
    key = quote(key)

    # ``no_default`` is a sentinel, so it is detected by identity.  Using
    # ``==`` would invoke the ``__eq__`` of whatever the caller passed as
    # ``default``, which may not return a plain bool (e.g. array-likes).
    extra = () if default is no_default else (default,)

    dsk = {
        (name, i): (list, (pluck, key, (self.name, i)) + extra)
        for i in range(self.npartitions)
    }
    return type(self)(merge(self.dask, dsk), name, self.npartitions)
def _unpack(expr):
    """Register ``expr`` in ``repack_dsk`` and return the key that names it.

    This helper relies on names from its enclosing scope: ``repack_dsk``,
    ``collections``, ``collections_token`` and ``traverse``.

    * A dask collection is keyed by ``tokenize(expr)``.  On first sight it is
      appended to ``collections`` and a ``getitem`` task pointing at its
      position is stored in ``repack_dsk``.
    * With ``traverse`` false, any other value is stored under a fresh uuid
      key as ``quote(expr)``.
    * With ``traverse`` true, lists, tuples, sets, dicts, ``OrderedDict``s and
      dataclass instances are rebuilt as tasks whose members are unpacked
      recursively (iterators are treated as lists).  Any other value is
      returned unchanged and nothing is recorded for it.

    Returns
    -------
    The key under which ``expr`` was registered, or ``expr`` itself for
    values that are not handled while traversing.
    """
    if is_dask_collection(expr):
        key = tokenize(expr)
        if key not in repack_dsk:
            # Position of this collection in ``collections`` once appended.
            repack_dsk[key] = (getitem, collections_token, len(collections))
            collections.append(expr)
        return key

    key = uuid.uuid4().hex

    if not traverse:
        repack_dsk[key] = quote(expr)
        return key

    # Treat iterators like lists
    kind = list if isinstance(expr, Iterator) else type(expr)

    if kind in (list, tuple, set):
        task = (kind, [_unpack(item) for item in expr])
    elif kind in (dict, OrderedDict):
        task = (kind, [[_unpack(k), _unpack(v)] for k, v in expr.items()])
    elif dataclasses.is_dataclass(expr) and not isinstance(expr, type):
        # Rebuild the dataclass by calling its type with keyword arguments.
        field_pairs = [
            [f.name, _unpack(getattr(expr, f.name))]
            for f in dataclasses.fields(expr)
        ]
        task = (apply, kind, (), (dict, field_pairs))
    else:
        return expr

    repack_dsk[key] = task
    return key
def repack(results):
    """Evaluate the repacking graph against ``results``.

    A copy of ``repack_dsk`` (from the enclosing scope) is taken, the quoted
    ``results`` are stored under ``collections_token``, and the graph is
    evaluated for ``out`` with ``simple_get``.  ``repack_dsk`` itself is not
    modified.
    """
    graph = repack_dsk.copy()
    graph[collections_token] = quote(results)
    return simple_get(graph, out)
def test_quote():
    """Quoted lists and task-like tuples must survive ``core.get`` intact."""
    literals = [
        [1, 2, 3],
        (add, 1, 2),
        [1, [2, 3]],
        (add, 1, (add, 2, 3)),
    ]
    for literal in literals:
        graph = {'x': quote(literal)}
        assert core.get(graph, 'x') == literal
def test_quote():
    """Quoted lists and task-like tuples must survive ``get`` intact."""
    literals = [
        [1, 2, 3],
        (add, 1, 2),
        [1, [2, 3]],
        (add, 1, (add, 2, 3)),
    ]
    for literal in literals:
        graph = {'x': quote(literal)}
        assert get(graph, 'x') == literal
def submit(self, func, *args, **kwargs):
    """ Submit a function application to the scheduler

    Parameters
    ----------
    func: callable
    *args:
        Positional arguments for ``func``.
    **kwargs:
        Keyword arguments for ``func``, after ``key``, ``pure`` and
        ``workers`` (below) have been removed.
    key: optional
        Explicit key for the task.  When omitted, a key is built from the
        function name plus either a token of the inputs (``pure=True``) or
        the next value of ``tokens`` (``pure=False``).
    pure: bool (defaults to True)
        Whether or not the function is pure.  Set ``pure=False`` for
        impure functions like ``np.random.random``.
    workers: set, iterable of sets
        A set of worker hostnames on which computations may be performed.
        Leave empty to default to all workers (common case)

    Examples
    --------
    >>> c = executor.submit(add, a, b)  # doctest: +SKIP

    Returns
    -------
    Future
        If ``key`` is already present in ``self.futures`` the future is
        returned without sending anything to the scheduler.

    Raises
    ------
    TypeError
        If ``func`` is not callable.

    See Also
    --------
    distributed.executor.Executor.submit:
    """
    if not callable(func):
        raise TypeError("First input to submit must be a callable function")

    key = kwargs.pop('key', None)
    pure = kwargs.pop('pure', True)
    workers = kwargs.pop('workers', None)

    if key is None:
        prefix = funcname(func) + '-'
        key = prefix + (tokenize(func, kwargs, *args) if pure else next(tokens))

    # Already known: hand back a future for the existing key.
    if key in self.futures:
        return Future(key, self)

    args = quote(args)
    task = (apply, func, args, kwargs) if kwargs else (func,) + args
    restrictions = {} if workers is None else {key: workers}

    if key not in self.futures:
        self.futures[key] = {'event': Event(), 'status': 'waiting'}

    logger.debug("Submit %s(...), %s", funcname(func), key)
    message = {
        'op': 'update-graph',
        'dsk': {key: task},
        'keys': [key],
        'restrictions': restrictions,
    }
    self.scheduler_queue.put_nowait(message)

    return Future(key, self)
def submit(self, func, *args, **kwargs):
    """ Submit a function application to the scheduler

    Parameters
    ----------
    func: callable
    *args:
        Positional arguments for ``func``.
    **kwargs:
        Keyword arguments for ``func``; the ``key``, ``pure`` and
        ``workers`` entries described below are removed first.
    key: optional
        Key to use for the task.  Without it, the key is the function name
        joined with a token of the inputs (``pure=True``) or with the next
        value of ``tokens`` (``pure=False``).
    pure: bool (defaults to True)
        Whether or not the function is pure.  Set ``pure=False`` for
        impure functions like ``np.random.random``.
    workers: set, iterable of sets
        A set of worker hostnames on which computations may be performed.
        Leave empty to default to all workers (common case)

    Examples
    --------
    >>> c = executor.submit(add, a, b)  # doctest: +SKIP

    Returns
    -------
    Future
        When ``key`` already exists in ``self.futures`` no message is sent
        and a future for the existing key is returned.

    Raises
    ------
    TypeError
        If ``func`` is not callable.

    See Also
    --------
    distributed.executor.Executor.submit:
    """
    if not callable(func):
        raise TypeError("First input to submit must be a callable function")

    key = kwargs.pop('key', None)
    pure = kwargs.pop('pure', True)
    workers = kwargs.pop('workers', None)

    if key is None:
        name = funcname(func)
        suffix = tokenize(func, kwargs, *args) if pure else next(tokens)
        key = name + '-' + suffix

    if key in self.futures:
        # Nothing new to schedule for a key we already track.
        return Future(key, self)

    args = quote(args)
    if kwargs:
        task = (apply, func, args, kwargs)
    else:
        task = (func,) + args

    restrictions = {key: workers} if workers is not None else {}

    if key not in self.futures:
        self.futures[key] = {'event': Event(), 'status': 'waiting'}

    logger.debug("Submit %s(...), %s", funcname(func), key)
    self.scheduler_queue.put_nowait({
        'op': 'update-graph',
        'dsk': {key: task},
        'keys': [key],
        'restrictions': restrictions,
    })
    return Future(key, self)
def delayed(obj, name=None, pure=None, nout=None, traverse=True):
    """Wraps a function or object to produce a ``Delayed``.

    ``Delayed`` objects act as proxies for the object they wrap, but all
    operations on them are done lazily by building up a dask graph internally.

    Parameters
    ----------
    obj : object
        The function or object to wrap
    name : string or hashable, optional
        The key to use in the underlying graph for the wrapped object. Defaults
        to hashing content. Note that this only affects the name of the object
        wrapped by this call to delayed, and *not* the output of delayed
        function calls - for that use ``dask_key_name=`` as described below.

        .. note::

           Because this ``name`` is used as the key in task graphs, you should
           ensure that it uniquely identifies ``obj``. If you'd like to provide
           a descriptive name that is still unique, combine the descriptive name
           with :func:`dask.base.tokenize` of ``obj``. See :ref:`graphs` for
           more.

    pure : bool, optional
        Indicates whether calling the resulting ``Delayed`` object is a pure
        operation. If True, arguments to the call are hashed to produce
        deterministic keys. If not provided, the default is to check the global
        ``delayed_pure`` setting, and fallback to ``False`` if unset.
    nout : int, optional
        The number of outputs returned from calling the resulting ``Delayed``
        object. If provided, the ``Delayed`` output of the call can be iterated
        into ``nout`` objects, allowing for unpacking of results. By default
        iteration over ``Delayed`` objects will error. Note, that ``nout=1``
        expects ``obj`` to return a tuple of length 1, and consequently for
        ``nout=0``, ``obj`` should return an empty tuple.
    traverse : bool, optional
        By default dask traverses builtin python collections looking for dask
        objects passed to ``delayed``. For large collections this can be
        expensive. If ``obj`` doesn't contain any dask objects, set
        ``traverse=False`` to avoid doing this traversal.

    Returns
    -------
    Delayed
        ``obj`` itself when it already is a ``Delayed``; a ``DelayedLeaf``
        when unpacking/quoting ``obj`` returned the very same object;
        otherwise a ``Delayed`` backed by a new single-layer graph.

    Raises
    ------
    ValueError
        If ``nout`` is neither ``None`` nor a non-negative ``int``.

    Examples
    --------
    Apply to functions to delay execution:

    >>> from dask import delayed
    >>> def inc(x):
    ...     return x + 1

    >>> inc(10)
    11

    >>> x = delayed(inc, pure=True)(10)
    >>> type(x) == Delayed
    True
    >>> x.compute()
    11

    Can be used as a decorator:

    >>> @delayed(pure=True)
    ... def add(a, b):
    ...     return a + b
    >>> add(1, 2).compute()
    3

    ``delayed`` also accepts an optional keyword ``pure``. If False, then
    subsequent calls will always produce a different ``Delayed``. This is
    useful for non-pure functions (such as ``time`` or ``random``).

    >>> from random import random
    >>> out1 = delayed(random, pure=False)()
    >>> out2 = delayed(random, pure=False)()
    >>> out1.key == out2.key
    False

    If you know a function is pure (output only depends on the input, with no
    global state), then you can set ``pure=True``. This will attempt to apply a
    consistent name to the output, but will fallback on the same behavior of
    ``pure=False`` if this fails.

    >>> @delayed(pure=True)
    ... def add(a, b):
    ...     return a + b
    >>> out1 = add(1, 2)
    >>> out2 = add(1, 2)
    >>> out1.key == out2.key
    True

    Instead of setting ``pure`` as a property of the callable, you can also set
    it contextually using the ``delayed_pure`` setting. Note that this
    influences the *call* and not the *creation* of the callable:

    >>> @delayed
    ... def mul(a, b):
    ...     return a * b
    >>> import dask
    >>> with dask.config.set(delayed_pure=True):
    ...     print(mul(1, 2).key == mul(1, 2).key)
    True
    >>> with dask.config.set(delayed_pure=False):
    ...     print(mul(1, 2).key == mul(1, 2).key)
    False

    The key name of the result of calling a delayed object is determined by
    hashing the arguments by default. To explicitly set the name, you can use
    the ``dask_key_name`` keyword when calling the function:

    >>> add(1, 2)   # doctest: +SKIP
    Delayed('add-3dce7c56edd1ac2614add714086e950f')
    >>> add(1, 2, dask_key_name='three')
    Delayed('three')

    Note that objects with the same key name are assumed to have the same
    result. If you set the names explicitly you should make sure your key names
    are different for different results.

    >>> add(1, 2, dask_key_name='three')
    Delayed('three')
    >>> add(2, 1, dask_key_name='three')
    Delayed('three')
    >>> add(2, 2, dask_key_name='four')
    Delayed('four')

    ``delayed`` can also be applied to objects to make operations on them lazy:

    >>> a = delayed([1, 2, 3])
    >>> isinstance(a, Delayed)
    True
    >>> a.compute()
    [1, 2, 3]

    The key name of a delayed object is hashed by default if ``pure=True`` or
    is generated randomly if ``pure=False`` (default).  To explicitly set the
    name, you can use the ``name`` keyword. To ensure that the key is unique
    you should include the tokenized value as well, or otherwise ensure that
    it's unique:

    >>> from dask.base import tokenize
    >>> data = [1, 2, 3]
    >>> a = delayed(data, name='mylist-' + tokenize(data))
    >>> a  # doctest: +SKIP
    Delayed('mylist-55af65871cb378a4fa6de1660c3e8fb7')

    Delayed results act as a proxy to the underlying object. Many operators
    are supported:

    >>> (a + [1, 2]).compute()
    [1, 2, 3, 1, 2]
    >>> a[1].compute()
    2

    Method and attribute access also works:

    >>> a.count(2).compute()
    1

    Note that if a method doesn't exist, no error will be thrown until runtime:

    >>> res = a.not_a_real_method() # doctest: +SKIP
    >>> res.compute()  # doctest: +SKIP
    AttributeError("'list' object has no attribute 'not_a_real_method'")

    "Magic" methods (e.g. operators and attribute access) are assumed to be
    pure, meaning that subsequent calls must return the same results. This
    behavior is not overrideable through the ``delayed`` call, but can be
    modified using other ways as described below.

    To invoke an impure attribute or operator, you'd need to use it in a
    delayed function with ``pure=False``:

    >>> class Incrementer:
    ...     def __init__(self):
    ...         self._n = 0
    ...     @property
    ...     def n(self):
    ...         self._n += 1
    ...         return self._n
    ...
    >>> x = delayed(Incrementer())
    >>> x.n.key == x.n.key
    True
    >>> get_n = delayed(lambda x: x.n, pure=False)
    >>> get_n(x).key == get_n(x).key
    False

    In contrast, methods are assumed to be impure by default, meaning that
    subsequent calls may return different results. To assume purity, set
    ``pure=True``. This allows sharing of any intermediate values.

    >>> a.count(2, pure=True).key == a.count(2, pure=True).key
    True

    As with function calls, method calls also respect the global
    ``delayed_pure`` setting and support the ``dask_key_name`` keyword:

    >>> a.count(2, dask_key_name="count_2")
    Delayed('count_2')
    >>> import dask
    >>> with dask.config.set(delayed_pure=True):
    ...     print(a.count(2).key == a.count(2).key)
    True
    """
    if isinstance(obj, Delayed):
        return obj

    if is_dask_collection(obj) or traverse:
        task, collections = unpack_collections(obj)
    else:
        task = quote(obj)
        collections = set()

    # Exact type check: only plain ``int`` passes, subclasses such as ``bool``
    # are rejected.
    if not (nout is None or (type(nout) is int and nout >= 0)):
        # An f-string is used instead of ``"%s" % nout`` because the ``%``
        # operator treats a tuple operand as the argument list, so a tuple
        # ``nout`` would raise TypeError instead of this ValueError.
        raise ValueError(
            f"nout must be None or a non-negative integer, got {nout}"
        )

    if task is obj:
        # Nothing was unpacked or quoted: wrap the object directly as a leaf.
        if not name:
            try:
                prefix = obj.__name__
            except AttributeError:
                prefix = type(obj).__name__
            token = tokenize(obj, nout, pure=pure)
            name = f"{prefix}-{token}"
        return DelayedLeaf(obj, name, pure=pure, nout=nout)

    if not name:
        name = f"{type(obj).__name__}-{tokenize(task, pure=pure)}"
    layer = {name: task}
    graph = HighLevelGraph.from_collections(name, layer, dependencies=collections)
    return Delayed(name, graph, nout)