Example #1
def test_stringify():
    obj = "Hello"
    assert stringify(obj) is obj
    obj = b"Hello"
    assert stringify(obj) is obj
    dsk = {"x": 1}

    assert stringify(dsk) == str(dsk)
    assert stringify(dsk, exclusive=()) == dsk

    dsk = {("x", 1): (inc, 1)}
    assert stringify(dsk) == str({("x", 1): (inc, 1)})
    assert stringify(dsk, exclusive=()) == {("x", 1): (inc, 1)}

    dsk = {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))}
    assert stringify(dsk, exclusive=dsk) == {
        ("x", 1): (inc, 1),
        ("x", 2): (inc, str(("x", 1))),
    }

    dsks = [
        {"x": 1},
        {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))},
        {("x", 1): (sum, [1, 2, 3]), ("x", 2): (sum, [("x", 1), ("x", 1)])},
    ]
    for dsk in dsks:
        sdsk = {stringify(k): stringify(v, exclusive=dsk) for k, v in dsk.items()}
        keys = list(dsk)
        skeys = [str(k) for k in keys]
        assert all(isinstance(k, str) for k in sdsk)
        assert get(dsk, keys) == get(sdsk, skeys)

    dsk = {
        ("y", 1): (SubgraphCallable({"x": ("y", 1)}, "x", (("y", 1),)), (("z", 1),))
    }
    dsk = stringify(dsk, exclusive=set(dsk) | {("z", 1)})
    assert dsk[("y", 1)][0].dsk["x"] == "('y', 1)"
    assert dsk[("y", 1)][1][0] == "('z', 1)"
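
For reference, a minimal standalone sketch of the behavior this test exercises (the import path for stringify is assumed for this dask/distributed era; inc is a stand-in for the test helper used above):

from dask.utils import stringify  # import path assumed

def inc(x):  # stand-in for the test helper referenced above
    return x + 1

dsk = {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))}

# Without `exclusive`, the whole mapping is rendered as a single string:
assert stringify(dsk) == str(dsk)

# With `exclusive`, only references to keys in that set are stringified,
# leaving top-level keys and the rest of the task structure intact:
assert stringify(dsk, exclusive=dsk) == {
    ("x", 1): (inc, 1),
    ("x", 2): (inc, "('x', 1)"),
}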
Example #2
async def test_resources_str(c, s, a, b):
    pd = pytest.importorskip("pandas")
    dd = pytest.importorskip("dask.dataframe")

    await a.set_resources(MyRes=1)

    x = dd.from_pandas(pd.DataFrame({"A": [1, 2], "B": [3, 4]}), npartitions=1)
    y = x.apply(lambda row: row.sum(), axis=1, meta=(None, "int64"))
    yy = y.persist(resources={"MyRes": 1})
    await wait(yy)

    ts_first = s.tasks[stringify(y.__dask_keys__()[0])]
    assert ts_first.resource_restrictions == {"MyRes": 1}
    ts_last = s.tasks[stringify(y.__dask_keys__()[-1])]
    assert ts_last.resource_restrictions == {"MyRes": 1}
Example #3
    def cause_of_failure(self, *args, keys=(), **kwargs):
        """
        Return details of the first failed task required by a set of keys

        Parameters
        ----------
        keys : list of keys known to the scheduler

        Returns
        -------
        Dictionary with:
        cause: the key that failed
        task: the definition of that key
        deps: keys that the task depends on
        """
        for key in keys:
            if isinstance(key, list):
                key = tuple(key)  # ensure not a list from msgpack
            key = stringify(key)
            ts = self.scheduler.tasks.get(key)
            if ts is not None and ts.exception_blame is not None:
                cause = ts.exception_blame
                # NOTE: cannot serialize sets
                return {
                    "deps": [dts.key for dts in cause.dependencies],
                    "cause": cause.key,
                    "task": cause.run_spec,
                }
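
For illustration only, a hit on a failed task returns a plain dict shaped like the following (the keys shown here are hypothetical); a list is used for "deps" because, as the inline comment notes, sets cannot be serialized:

# Hypothetical return value for a failure at key "('x', 2)":
# {
#     "deps": ["('x', 1)"],   # keys the failed task depended on
#     "cause": "('x', 2)",    # the key whose task raised
#     "task": ...,            # the run_spec (definition) of that key
# }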
Example #4
    def __dask_distributed_annotations_pack__(
        self,
        annotations: Mapping[str, Any] | None = None
    ) -> Mapping[str, Any] | None:
        """Packs Layer annotations for transmission to scheduler

        Callable annotations are fully expanded over the Layer's keys, while
        other values are transmitted as-is.

        Parameters
        ----------
        annotations : Mapping[str, Any], optional
            Top-level annotations.

        Returns
        -------
        packed_annotations : dict
            Packed annotations.
        """
        annotations = cast(
            "dict[str, Any]",
            toolz.merge(self.annotations or {}, annotations or {}))
        packed = {}
        for a, v in annotations.items():
            if callable(v):
                packed[a] = {stringify(k): v(k) for k in self}
                packed[a]["__expanded_annotations__"] = True
            else:
                packed[a] = v
        return packed
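
A self-contained sketch of the expansion rule this method applies (the function name and sample annotations are illustrative, and plain str() stands in for stringify()): a callable annotation is evaluated once per key and flagged with "__expanded_annotations__", while plain values pass through unchanged.

from typing import Any, Iterable, Mapping

def pack_annotations(annotations: Mapping[str, Any], keys: Iterable[Any]) -> dict:
    # Illustrative re-implementation of the rule used above.
    packed = {}
    for name, value in annotations.items():
        if callable(value):
            # Evaluate the callable per key and mark the result as expanded.
            expanded = {str(k): value(k) for k in keys}
            expanded["__expanded_annotations__"] = True
            packed[name] = expanded
        else:
            packed[name] = value
    return packed

print(pack_annotations({"priority": len, "retries": 2}, ["a", "bb"]))
# {'priority': {'a': 1, 'bb': 2, '__expanded_annotations__': True}, 'retries': 2}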
Example #5
    async def _set(self, value):
        if isinstance(value, Future):
            await self.client.scheduler.variable_set(
                key=stringify(value.key), name=self.name
            )
        else:
            await self.client.scheduler.variable_set(data=value, name=self.name)
Example #6
    async def _put(self, value, timeout=None):
        if isinstance(value, Future):
            await self.client.scheduler.queue_put(
                key=stringify(value.key), timeout=timeout, name=self.name
            )
        else:
            await self.client.scheduler.queue_put(
                data=value, timeout=timeout, name=self.name
            )
Example #7
async def test_compute(c, s):
    da = pytest.importorskip("dask.array")
    x = da.random.random((10, 10), chunks=(5, 5))
    y = da.random.random((10, 10), chunks=(5, 5))

    low = c.compute(x, priority=-1)
    futures = c.map(slowinc, range(10), delay=0.1)
    high = c.compute(y, priority=1)
    async with Worker(s.address, nthreads=1):
        await wait(high)
        assert all(s.processing.values())
        assert s.tasks[stringify(low.key)].state in ("processing", "waiting")
Example #8
    def __init__(self, keys, scheduler, minimum=0, dt=0.1, complete=False):
        self.keys = {k.key if hasattr(k, "key") else k for k in keys}
        self.keys = {stringify(k) for k in self.keys}
        self.scheduler = scheduler
        self.complete = complete
        self._minimum = minimum
        self._dt = dt
        self.last_duration = 0
        self._start_time = default_timer()
        self._running = False
        self.status = None
        self.extra = {}
Example #9
async def test_persist_collections(c, s, a, b):
    da = pytest.importorskip("dask.array")
    x = da.arange(10, chunks=(5,))
    y = x.map_blocks(lambda x: x + 1)
    z = y.map_blocks(lambda x: 2 * x)
    w = z.sum()

    ww, yy = c.persist([w, y], resources={tuple(y.__dask_keys__()): {"A": 1}})

    await wait([ww, yy])

    assert all(stringify(key) in a.data for key in y.__dask_keys__())
Example #10
async def test_persist_collections(c, s, a, b):
    da = pytest.importorskip("dask.array")
    x = da.arange(10, chunks=(5,))
    with dask.annotate(resources={"A": 1}):
        y = x.map_blocks(lambda x: x + 1)
    z = y.map_blocks(lambda x: 2 * x)
    w = z.sum()

    ww, yy = c.persist([w, y], optimize_graph=False)

    await wait([ww, yy])

    assert all(stringify(key) in a.data for key in y.__dask_keys__())
Example #11
async def test_persist(c, s):
    da = pytest.importorskip("dask.array")
    x = da.random.random((10, 10), chunks=(5, 5))
    y = da.random.random((10, 10), chunks=(5, 5))

    low = x.persist(priority=-1)
    futures = c.map(slowinc, range(10), delay=0.1)
    high = y.persist(priority=1)
    async with Worker(s.address, nthreads=1):
        await wait(high)
        assert all(s.processing.values())
        assert all(
            s.tasks[stringify(k)].state in ("processing", "waiting")
            for k in flatten(low.__dask_keys__())
        )
Example #12
def _materialized_layer_pack(
    layer: Layer,
    all_keys,
    known_key_dependencies,
    client,
    client_keys,
):
    from ..client import Future

    dsk = dict(layer)

    # Find aliases not in `client_keys` and substitute all matching keys
    # with their Futures
    values = {
        k: v
        for k, v in dsk.items()
        if isinstance(v, Future) and k not in client_keys
    }
    if values:
        dsk = subs_multiple(dsk, values)

    # Unpack remote data and record its dependencies
    dsk = {k: unpack_remotedata(v, byte_keys=True) for k, v in dsk.items()}
    unpacked_futures = set.union(*[v[1] for v in dsk.values()]) if dsk else set()
    for future in unpacked_futures:
        if future.client is not client:
            raise ValueError(
                "Inputs contain futures that were created by another client.")
        if stringify(future.key) not in client.futures:
            raise CancelledError(stringify(future.key))
    unpacked_futures_deps = {}
    for k, v in dsk.items():
        if len(v[1]):
            unpacked_futures_deps[k] = {f.key for f in v[1]}
    dsk = {k: v[0] for k, v in dsk.items()}

    # Calculate dependencies without re-calculating already known dependencies
    missing_keys = set(dsk.keys()).difference(known_key_dependencies.keys())
    dependencies = {
        k: keys_in_tasks(all_keys, [dsk[k]], as_list=False)
        for k in missing_keys
    }
    for k, v in unpacked_futures_deps.items():
        dependencies[k] = set(dependencies.get(k, ())) | v

    # The scheduler expects all keys to be strings
    dependencies = {
        stringify(k): [stringify(dep) for dep in deps]
        for k, deps in dependencies.items()
    }
    all_keys = all_keys.union(dsk)
    dsk = {
        stringify(k): stringify(v, exclusive=all_keys)
        for k, v in dsk.items()
    }
    dsk = valmap(dumps_task, dsk)
    return {"dsk": dsk, "dependencies": dependencies}
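
A hedged sketch of the alias-substitution step in isolation (subs_multiple is imported from distributed.utils_comm; the graph below is invented): every reference to a substituted key inside a task is replaced by the mapped value.

from distributed.utils_comm import subs_multiple

def inc(x):
    return x + 1

dsk = {"a": (inc, "x"), "b": (sum, ["x", "x"])}
assert subs_multiple(dsk, {"x": 42}) == {"a": (inc, 42), "b": (sum, [42, 42])}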
Example #13
    def put(
        self, comm=None, keys=None, data=None, name=None, override=False, client=None
    ):
        with log_errors():
            if not override and name in self.datasets:
                raise KeyError("Dataset %s already exists" % name)
            self.scheduler.client_desires_keys(keys, "published-%s" % stringify(name))
            self.datasets[name] = {"data": data, "keys": keys}
            return {"status": "OK", "name": name}
Example #14
    def __dask_distributed_annotations_unpack__(
        annotations: MutableMapping[str, Any],
        new_annotations: Mapping[str, Any] | None,
        keys: Iterable[Hashable],
    ) -> None:
        """
        Unpack a set of layer annotations across a set of keys, then merge those
        expanded annotations for the layer into an existing annotations mapping.

        This is not a simple shallow merge because some annotations like retries,
        priority, workers, etc need to be able to retain keys from different layers.

        Parameters
        ----------
        annotations: MutableMapping[str, Any], input/output
            Already unpacked annotations, which are to be updated with the new
            unpacked annotations
        new_annotations: Mapping[str, Any], optional
            New annotations to be unpacked into `annotations`
        keys: Iterable
            All keys in the layer.
        """
        if new_annotations is None:
            return

        expanded = {}
        keys_stringified = False

        # Expand the new annotations across the keyset
        for a, v in new_annotations.items():
            if type(v) is dict and "__expanded_annotations__" in v:
                # Maybe do a destructive update for efficiency?
                v = v.copy()
                del v["__expanded_annotations__"]
                expanded[a] = v
            else:
                if not keys_stringified:
                    keys = [stringify(k) for k in keys]
                    keys_stringified = True

                expanded[a] = dict.fromkeys(keys, v)

        # Merge the expanded annotations with the existing annotations mapping
        for k, v in expanded.items():
            v.update(annotations.get(k, {}))
        annotations.update(expanded)
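
To make the merge order concrete, a small invented walk-through mirroring the code above: a per-key value that was already unpacked wins, and only missing keys pick up the new annotation.

# Invented data; keys are shown already stringified:
annotations = {"retries": {"('x', 1)": 3}}   # previously unpacked layer
new_value = 1                                # plain (non-expanded) annotation
keys = ["('x', 1)", "('x', 2)"]

expanded = {"retries": dict.fromkeys(keys, new_value)}
for k, v in expanded.items():
    v.update(annotations.get(k, {}))         # existing per-key values take precedence
annotations.update(expanded)
print(annotations)
# {'retries': {"('x', 1)": 3, "('x', 2)": 1}}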
Example #15
    def delete(self, comm=None, name=None):
        with log_errors():
            out = self.datasets.pop(name, {"keys": []})
            self.scheduler.client_releases_keys(
                out["keys"], "published-%s" % stringify(name)
            )
Example #16
def unpack_remotedata(o, byte_keys=False, myset=None):
    """Unpack WrappedKey objects from collection

    Returns original collection and set of all found WrappedKey objects

    Examples
    --------
    >>> rd = WrappedKey('mykey')
    >>> unpack_remotedata(1)
    (1, set())
    >>> unpack_remotedata(())
    ((), set())
    >>> unpack_remotedata(rd)
    ('mykey', {WrappedKey('mykey')})
    >>> unpack_remotedata([1, rd])
    ([1, 'mykey'], {WrappedKey('mykey')})
    >>> unpack_remotedata({1: rd})
    ({1: 'mykey'}, {WrappedKey('mykey')})
    >>> unpack_remotedata({1: [rd]})
    ({1: ['mykey']}, {WrappedKey('mykey')})

    Use the ``byte_keys=True`` keyword to force string keys

    >>> rd = WrappedKey(('x', 1))
    >>> unpack_remotedata(rd, byte_keys=True)
    ("('x', 1)", {WrappedKey('('x', 1)')})
    """
    if myset is None:
        myset = set()
        out = unpack_remotedata(o, byte_keys, myset)
        return out, myset

    typ = type(o)

    if typ is tuple:
        if not o:
            return o
        if type(o[0]) is SubgraphCallable:
            sc = o[0]
            futures = set()
            dsk = {
                k: unpack_remotedata(v, byte_keys, futures)
                for k, v in sc.dsk.items()
            }
            args = tuple(unpack_remotedata(i, byte_keys, futures) for i in o[1:])
            if futures:
                myset.update(futures)
                futures = (
                    tuple(stringify(f.key) for f in futures)
                    if byte_keys
                    else tuple(f.key for f in futures)
                )
                inkeys = sc.inkeys + futures
                return (
                    (SubgraphCallable(dsk, sc.outkey, inkeys, sc.name),)
                    + args
                    + futures
                )
            else:
                return o
        else:
            return tuple(unpack_remotedata(item, byte_keys, myset) for item in o)
    if typ in collection_types:
        if not o:
            return o
        outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
        return typ(outs)
    elif typ is dict:
        if o:
            return {
                k: unpack_remotedata(v, byte_keys, myset)
                for k, v in o.items()
            }
        else:
            return o
    elif issubclass(typ, WrappedKey):  # TODO use type is Future
        k = o.key
        if byte_keys:
            k = stringify(k)
        myset.add(o)
        return k
    else:
        return o
Example #17
    def _process_key(self, key):
        if isinstance(key, list):
            key = tuple(key)  # ensure not a list from msgpack
        key = stringify(key)
        return key
Example #18
    def __dask_distributed_pack__(
        self,
        all_hlg_keys: Iterable[Hashable],
        known_key_dependencies: Mapping[Hashable, Set],
        client,
        client_keys: Iterable[Hashable],
    ) -> Any:
        """Pack the layer for scheduler communication in Distributed

        This method should pack its current state and is called by the Client when
        communicating with the Scheduler.
        The Scheduler will then use .__dask_distributed_unpack__(data, ...) to unpack
        the state, materialize the layer, and merge it into the global task graph.

        The returned state must be compatible with Distributed's scheduler, which
        means it must obey the following:
          - Serializable by msgpack (note that msgpack converts lists to tuples)
          - All remote data must be unpacked (see unpack_remotedata())
          - All keys must be converted to strings now or when unpacking
          - All tasks must be serialized (see dumps_task())

        The default implementation materializes the layer, so layers such as
        Blockwise and ShuffleLayer should implement specialized pack and unpack
        functions in order to avoid materialization.

        Parameters
        ----------
        all_hlg_keys: Iterable[Hashable]
            All keys in the high level graph
        known_key_dependencies: Mapping[Hashable, Set]
            Already known dependencies
        client: distributed.Client
            The client calling this function.
        client_keys : Iterable[Hashable]
            List of keys requested by the client.

        Returns
        -------
        state: Object serializable by msgpack
            Scheduler compatible state of the layer
        """
        from distributed.client import Future
        from distributed.utils import CancelledError
        from distributed.utils_comm import subs_multiple, unpack_remotedata
        from distributed.worker import dumps_task

        dsk = dict(self)

        # Find aliases not in `client_keys` and substitute all matching keys
        # with their Futures
        future_aliases = {
            k: v
            for k, v in dsk.items()
            if isinstance(v, Future) and k not in client_keys
        }
        if future_aliases:
            dsk = subs_multiple(dsk, future_aliases)

        # Remove `Future` objects from graph and note any future dependencies
        dsk2 = {}
        fut_deps = {}
        for k, v in dsk.items():
            dsk2[k], futs = unpack_remotedata(v, byte_keys=True)
            if futs:
                fut_deps[k] = futs
        dsk = dsk2

        # Check that any collected futures are valid
        unpacked_futures = set.union(*fut_deps.values()) if fut_deps else set()
        for future in unpacked_futures:
            if future.client is not client:
                raise ValueError(
                    "Inputs contain futures that were created by another client."
                )
            if stringify(future.key) not in client.futures:
                raise CancelledError(stringify(future.key))

        # Calculate dependencies without re-calculating already known dependencies
        # - Start with known dependencies
        dependencies = ensure_dict(known_key_dependencies, copy=True)
        # - Remove aliases for any tasks that depend on both an alias and a future.
        #   These can only be found in the known_key_dependencies cache, since
        #   any dependencies computed in this method would have already had the
        #   aliases removed.
        if future_aliases:
            alias_keys = set(future_aliases)
            dependencies = {k: v - alias_keys for k, v in dependencies.items()}
        # - Add in deps for any missing keys
        missing_keys = dsk.keys() - dependencies.keys()

        dependencies.update(
            (k, keys_in_tasks(all_hlg_keys, [dsk[k]], as_list=False))
            for k in missing_keys
        )
        # - Add in deps for any tasks that depend on futures
        for k, futures in fut_deps.items():
            if futures:
                d = ensure_set(dependencies[k], copy=True)
                d.update(f.key for f in futures)
                dependencies[k] = d

        # The scheduler expects all keys to be strings
        dependencies = {
            stringify(k): {stringify(dep) for dep in deps}
            for k, deps in dependencies.items()
        }

        merged_hlg_keys = all_hlg_keys | dsk.keys()
        dsk = {
            stringify(k): stringify(v, exclusive=merged_hlg_keys)
            for k, v in dsk.items()
        }
        dsk = toolz.valmap(dumps_task, dsk)
        return {"dsk": dsk, "dependencies": dependencies}
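
The string-key requirement from the docstring can be seen in isolation with a minimal sketch (import path for stringify assumed; the graph is invented): tuple keys become their str() form, and `exclusive` restricts the conversion inside values to actual key references.

from dask.utils import stringify  # import path assumed

dsk = {("x", 1): (sum, [("y", 0), 1])}
all_keys = {("x", 1), ("y", 0)}
packed = {stringify(k): stringify(v, exclusive=all_keys) for k, v in dsk.items()}
assert packed == {"('x', 1)": (sum, ["('y', 0)", 1])}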