Example #1
    def __dask_distributed_pack__(self, client):
        from distributed.worker import dumps_function
        from distributed.utils import CancelledError
        from distributed.utils_comm import unpack_remotedata

        keys = tuple(map(blockwise_token, range(len(self.indices))))
        dsk, _ = fuse(self.dsk, [self.output])

        dsk = (SubgraphCallable(dsk, self.output, keys), )
        dsk, dsk_unpacked_futures = unpack_remotedata(dsk, byte_keys=True)

        func = dumps_function(dsk[0])
        func_future_args = dsk[1:]

        indices = list(toolz.concat(self.indices))
        indices, indices_unpacked_futures = unpack_remotedata(indices,
                                                              byte_keys=True)

        # Check the legality of the unpacked futures
        for future in itertools.chain(dsk_unpacked_futures,
                                      indices_unpacked_futures):
            if future.client is not client:
                raise ValueError(
                    "Inputs contain futures that were created by another client."
                )
            if stringify(future.key) not in client.futures:
                raise CancelledError(stringify(future.key))

        # All blockwise tasks will depend on the futures in `indices`
        global_dependencies = tuple(
            stringify(f.key) for f in indices_unpacked_futures)

        ret = {
            "output": self.output,
            "output_indices": self.output_indices,
            "func": func,
            "func_future_args": func_future_args,
            "global_dependencies": global_dependencies,
            "indices": indices,
            "numblocks": self.numblocks,
            "concatenate": self.concatenate,
            "new_axes": self.new_axes,
            "io_subgraph": (self.io_name, self.io_subgraph)
            if self.io_name
            else (None, None),
            "output_blocks": self.output_blocks,
            "dims": self.dims,
        }
        return ret
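Example #1 above packs the fused graph into a single SubgraphCallable parameterized on the placeholder keys produced by blockwise_token, which is what lets the whole blockwise layer travel as one serialized function plus per-block arguments. A rough sketch of what such a callable does, using a toy graph and made-up values rather than anything from the layer itself:

    from operator import add, mul
    from dask.blockwise import blockwise_token
    from dask.optimization import SubgraphCallable

    # Placeholder argument names, like those used for the packed blockwise function
    keys = tuple(map(blockwise_token, range(2)))
    # Toy subgraph computing (a + 1) * b in terms of those placeholders
    dsk = {"out": (mul, (add, keys[0], 1), keys[1])}
    func = SubgraphCallable(dsk, "out", keys)
    assert func(2, 10) == 30  # (2 + 1) * 10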
Example #2
    def __dask_distributed_pack__(
        self,
        all_hlg_keys: Iterable[Hashable],
        known_key_dependencies: Mapping[Hashable, set],
        client,
        client_keys: Iterable[Hashable],
    ) -> Any:
        """Pack the layer for scheduler communication in Distributed

        This method should pack its current state and is called by the Client when
        communicating with the Scheduler.
        The Scheduler will then use .__dask_distributed_unpack__(data, ...) to unpack
        the state, materialize the layer, and merge it into the global task graph.

        The returned state must be compatible with Distributed's scheduler, which
        means it must obey the following:
          - Serializable by msgpack (notice, msgpack converts lists to tuples)
          - All remote data must be unpacked (see unpack_remotedata())
          - All keys must be converted to strings now or when unpacking
          - All tasks must be serialized (see dumps_task())

        The default implementation materializes the layer, so layers such as Blockwise
        and ShuffleLayer should implement specialized pack and unpack functions in
        order to avoid materialization.

        Parameters
        ----------
        all_hlg_keys: Iterable[Hashable]
            All keys in the high level graph
        known_key_dependencies: Mapping[Hashable, set]
            Already known dependencies
        client: distributed.Client
            The client calling this function.
        client_keys : Iterable[Hashable]
            List of keys requested by the client.

        Returns
        -------
        state: Object serializable by msgpack
            Scheduler compatible state of the layer
        """
        from distributed.client import Future
        from distributed.utils import CancelledError
        from distributed.utils_comm import subs_multiple, unpack_remotedata
        from distributed.worker import dumps_task

        dsk = dict(self)

        # Find aliases not in `client_keys` and substitute all matching keys
        # with their Futures
        values = {
            k: v
            for k, v in dsk.items()
            if isinstance(v, Future) and k not in client_keys
        }
        if values:
            dsk = subs_multiple(dsk, values)

        # Unpack remote data and record its dependencies
        dsk = {k: unpack_remotedata(v, byte_keys=True) for k, v in dsk.items()}
        unpacked_futures = set.union(*[v[1] for v in dsk.values()]) if dsk else set()
        for future in unpacked_futures:
            if future.client is not client:
                raise ValueError(
                    "Inputs contain futures that were created by another client."
                )
            if stringify(future.key) not in client.futures:
                raise CancelledError(stringify(future.key))
        unpacked_futures_deps = {}
        for k, v in dsk.items():
            if len(v[1]):
                unpacked_futures_deps[k] = {f.key for f in v[1]}
        dsk = {k: v[0] for k, v in dsk.items()}

        # Calculate dependencies without re-calculating already known dependencies
        missing_keys = dsk.keys() - known_key_dependencies.keys()
        dependencies = {
            k: keys_in_tasks(all_hlg_keys, [dsk[k]], as_list=False)
            for k in missing_keys
        }
        for k, v in unpacked_futures_deps.items():
            dependencies[k] = set(dependencies.get(k, ())) | v
        dependencies.update(known_key_dependencies)

        # The scheduler expects all keys to be strings
        dependencies = {
            stringify(k): {stringify(dep) for dep in deps}
            for k, deps in dependencies.items()
        }

        merged_hlg_keys = all_hlg_keys | dsk.keys()
        dsk = {
            stringify(k): stringify(v, exclusive=merged_hlg_keys)
            for k, v in dsk.items()
        }
        dsk = toolz.valmap(dumps_task, dsk)
        return {"dsk": dsk, "dependencies": dependencies}
Example #3
    def __dask_distributed_pack__(
        self,
        all_hlg_keys: Iterable[Hashable],
        known_key_dependencies: Mapping[Hashable, Set],
        client,
        client_keys: Iterable[Hashable],
    ) -> Any:
        """Pack the layer for scheduler communication in Distributed

        This method should pack its current state and is called by the Client when
        communicating with the Scheduler.
        The Scheduler will then use .__dask_distributed_unpack__(data, ...) to unpack
        the state, materialize the layer, and merge it into the global task graph.

        The returned state must be compatible with Distributed's scheduler, which
        means it must obey the following:
          - Serializable by msgpack (notice, msgpack converts lists to tuples)
          - All remote data must be unpacked (see unpack_remotedata())
          - All keys must be converted to strings now or when unpacking
          - All tasks must be serialized (see dumps_task())

        The default implementation materializes the layer, so layers such as Blockwise
        and ShuffleLayer should implement specialized pack and unpack functions in
        order to avoid materialization.

        Parameters
        ----------
        all_hlg_keys: Iterable[Hashable]
            All keys in the high level graph
        known_key_dependencies: Mapping[Hashable, Set]
            Already known dependencies
        client: distributed.Client
            The client calling this function.
        client_keys : Iterable[Hashable]
            List of keys requested by the client.

        Returns
        -------
        state: Object serializable by msgpack
            Scheduler compatible state of the layer
        """
        from distributed.client import Future
        from distributed.utils import CancelledError
        from distributed.utils_comm import subs_multiple, unpack_remotedata
        from distributed.worker import dumps_task

        dsk = dict(self)

        # Find aliases not in `client_keys` and substitute all matching keys
        # with their Futures
        future_aliases = {
            k: v
            for k, v in dsk.items()
            if isinstance(v, Future) and k not in client_keys
        }
        if future_aliases:
            dsk = subs_multiple(dsk, future_aliases)

        # Remove `Future` objects from graph and note any future dependencies
        dsk2 = {}
        fut_deps = {}
        for k, v in dsk.items():
            dsk2[k], futs = unpack_remotedata(v, byte_keys=True)
            if futs:
                fut_deps[k] = futs
        dsk = dsk2

        # Check that any collected futures are valid
        unpacked_futures = set.union(*fut_deps.values()) if fut_deps else set()
        for future in unpacked_futures:
            if future.client is not client:
                raise ValueError(
                    "Inputs contain futures that were created by another client."
                )
            if stringify(future.key) not in client.futures:
                raise CancelledError(stringify(future.key))

        # Calculate dependencies without re-calculating already known dependencies
        # - Start with known dependencies
        dependencies = ensure_dict(known_key_dependencies, copy=True)
        # - Remove aliases for any tasks that depend on both an alias and a future.
        #   These can only be found in the known_key_dependencies cache, since
        #   any dependencies computed in this method would have already had the
        #   aliases removed.
        if future_aliases:
            alias_keys = set(future_aliases)
            dependencies = {k: v - alias_keys for k, v in dependencies.items()}
        # - Add in deps for any missing keys
        missing_keys = dsk.keys() - dependencies.keys()

        dependencies.update(
            (k, keys_in_tasks(all_hlg_keys, [dsk[k]], as_list=False))
            for k in missing_keys
        )
        # - Add in deps for any tasks that depend on futures
        for k, futures in fut_deps.items():
            if futures:
                d = ensure_set(dependencies[k], copy=True)
                d.update(f.key for f in futures)
                dependencies[k] = d

        # The scheduler expects all keys to be strings
        dependencies = {
            stringify(k): {stringify(dep) for dep in deps}
            for k, deps in dependencies.items()
        }

        merged_hlg_keys = all_hlg_keys | dsk.keys()
        dsk = {
            stringify(k): stringify(v, exclusive=merged_hlg_keys)
            for k, v in dsk.items()
        }
        dsk = toolz.valmap(dumps_task, dsk)
        return {"dsk": dsk, "dependencies": dependencies}
Example #4
    def __dask_distributed_pack__(self, all_hlg_keys, known_key_dependencies,
                                  client, client_keys):
        from distributed.protocol.serialize import import_allowed_module
        from distributed.utils import CancelledError
        from distributed.utils_comm import unpack_remotedata
        from distributed.worker import dumps_function

        keys = tuple(map(blockwise_token, range(len(self.indices))))
        dsk, _ = fuse(self.dsk, [self.output])

        # Embed literals in `dsk`
        keys2 = []
        indices2 = []
        for key, (val, index) in zip(keys, self.indices):
            if index is None:  # Literal
                dsk[key] = val
            else:
                keys2.append(key)
                indices2.append((val, index))

        dsk = (SubgraphCallable(dsk, self.output, tuple(keys2)), )
        dsk, dsk_unpacked_futures = unpack_remotedata(dsk, byte_keys=True)

        func = dumps_function(dsk[0])
        func_future_args = dsk[1:]

        indices = list(toolz.concat(indices2))
        indices, indices_unpacked_futures = unpack_remotedata(indices,
                                                              byte_keys=True)

        # Check the legality of the unpacked futures
        for future in itertools.chain(dsk_unpacked_futures,
                                      indices_unpacked_futures):
            if future.client is not client:
                raise ValueError(
                    "Inputs contain futures that were created by another client."
                )
            if stringify(future.key) not in client.futures:
                raise CancelledError(stringify(future.key))

        # All blockwise tasks will depend on the futures in `indices`
        global_dependencies = {
            stringify(f.key)
            for f in indices_unpacked_futures
        }

        # Handle `io_deps` serialization.
        # If `io_deps[<collection_key>]` is just a dict, we rely
        # entirely on msgpack.  It is up to the `Blockwise` layer to
        # ensure that all arguments are msgpack serializable. To enable
        # more control over serialization, a `BlockwiseIODeps` mapping
        # subclass can be defined with the necessary
        # `__dask_distributed_{pack,unpack}__` methods.
        packed_io_deps = {}
        for name, input_map in self.io_deps.items():
            if isinstance(input_map, tuple):
                # Use the `__dask_distributed_pack__` definition for the
                # specified `BlockwiseIODeps` subclass
                module_name, attr_name = input_map[0].rsplit(".", 1)
                io_dep_map = getattr(import_allowed_module(module_name),
                                     attr_name)
                packed_io_deps[name] = io_dep_map.__dask_distributed_pack__(
                    *input_map)
            else:
                packed_io_deps[name] = input_map

        return {
            "output": self.output,
            "output_indices": self.output_indices,
            "func": func,
            "func_future_args": func_future_args,
            "global_dependencies": global_dependencies,
            "indices": indices,
            "is_list": [isinstance(x, list) for x in indices],
            "numblocks": self.numblocks,
            "concatenate": self.concatenate,
            "new_axes": self.new_axes,
            "output_blocks": self.output_blocks,
            "dims": self.dims,
            "io_deps": packed_io_deps,
        }
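dumps_function in Examples #1 and #4 serializes (and caches) the callable so it can be shipped to the scheduler and workers. A short round-trip sketch, assuming the matching loads_function helper in distributed.worker (worth verifying against your Distributed version):

    from operator import add
    from distributed.worker import dumps_function, loads_function  # loads_function assumed available

    blob = dumps_function(add)   # pickled, cached function bytes
    func = loads_function(blob)  # inverse used on the receiving side
    assert func(1, 2) == 3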