def __dask_distributed_pack__(self, client):
    from distributed.worker import dumps_function
    from distributed.utils import CancelledError
    from distributed.utils_comm import unpack_remotedata

    keys = tuple(map(blockwise_token, range(len(self.indices))))
    dsk, _ = fuse(self.dsk, [self.output])
    dsk = (SubgraphCallable(dsk, self.output, keys),)
    dsk, dsk_unpacked_futures = unpack_remotedata(dsk, byte_keys=True)
    func = dumps_function(dsk[0])
    func_future_args = dsk[1:]

    indices = list(toolz.concat(self.indices))
    indices, indices_unpacked_futures = unpack_remotedata(indices, byte_keys=True)

    # Check the legality of the unpacked futures
    for future in itertools.chain(dsk_unpacked_futures, indices_unpacked_futures):
        if future.client is not client:
            raise ValueError(
                "Inputs contain futures that were created by another client."
            )
        if stringify(future.key) not in client.futures:
            raise CancelledError(stringify(future.key))

    # All blockwise tasks will depend on the futures in `indices`
    global_dependencies = tuple(stringify(f.key) for f in indices_unpacked_futures)

    ret = {
        "output": self.output,
        "output_indices": self.output_indices,
        "func": func,
        "func_future_args": func_future_args,
        "global_dependencies": global_dependencies,
        "indices": indices,
        "numblocks": self.numblocks,
        "concatenate": self.concatenate,
        "new_axes": self.new_axes,
        "io_subgraph": (self.io_name, self.io_subgraph) if self.io_name else (None, None),
        "output_blocks": self.output_blocks,
        "dims": self.dims,
    }
    return ret
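
# Illustrative sketch (not part of the layer): why the fused graph above
# collapses into a single ``SubgraphCallable``.  The placeholder keys produced
# by ``blockwise_token`` become the callable's positional arguments, one per
# entry in ``self.indices``.  A minimal toy, assuming ``SubgraphCallable`` is
# importable from ``dask.optimization`` in a dask version contemporary with
# this code:
from operator import add, mul

from dask.blockwise import blockwise_token
from dask.optimization import SubgraphCallable

_toy_keys = tuple(map(blockwise_token, range(2)))  # two placeholder tokens, one per input
_toy_dsk = {"out": (add, _toy_keys[0], (mul, _toy_keys[1], 2))}
_toy_func = SubgraphCallable(_toy_dsk, "out", _toy_keys)
assert _toy_func(1, 3) == 7  # first token -> 1, second token -> 3, so add(1, mul(3, 2))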
def __dask_distributed_pack__(
    self,
    all_hlg_keys: Iterable[Hashable],
    known_key_dependencies: Mapping[Hashable, set],
    client,
    client_keys: Iterable[Hashable],
) -> Any:
    """Pack the layer for scheduler communication in Distributed

    This method should pack its current state and is called by the Client when
    communicating with the Scheduler.
    The Scheduler will then use .__dask_distributed_unpack__(data, ...) to
    unpack the state, materialize the layer, and merge it into the global
    task graph.

    The returned state must be compatible with Distributed's scheduler, which
    means it must obey the following:
      - Serializable by msgpack (note: msgpack converts lists to tuples)
      - All remote data must be unpacked (see unpack_remotedata())
      - All keys must be converted to strings now or when unpacking
      - All tasks must be serialized (see dumps_task())

    The default implementation materializes the layer; thus, layers such as
    Blockwise and ShuffleLayer should implement specialized pack and unpack
    functions in order to avoid materialization.

    Parameters
    ----------
    all_hlg_keys: Iterable[Hashable]
        All keys in the high level graph
    known_key_dependencies: Mapping[Hashable, set]
        Already known dependencies
    client: distributed.Client
        The client calling this function.
    client_keys : Iterable[Hashable]
        List of keys requested by the client.

    Returns
    -------
    state: Object serializable by msgpack
        Scheduler compatible state of the layer
    """
    from distributed.client import Future
    from distributed.utils import CancelledError
    from distributed.utils_comm import subs_multiple, unpack_remotedata
    from distributed.worker import dumps_task

    dsk = dict(self)

    # Find aliases not in `client_keys` and substitute all matching keys
    # with their Future
    values = {
        k: v
        for k, v in dsk.items()
        if isinstance(v, Future) and k not in client_keys
    }
    if values:
        dsk = subs_multiple(dsk, values)

    # Unpack remote data and record its dependencies
    dsk = {k: unpack_remotedata(v, byte_keys=True) for k, v in dsk.items()}
    unpacked_futures = set.union(*[v[1] for v in dsk.values()]) if dsk else set()
    for future in unpacked_futures:
        if future.client is not client:
            raise ValueError(
                "Inputs contain futures that were created by another client."
            )
        if stringify(future.key) not in client.futures:
            raise CancelledError(stringify(future.key))
    unpacked_futures_deps = {}
    for k, v in dsk.items():
        if len(v[1]):
            unpacked_futures_deps[k] = {f.key for f in v[1]}
    dsk = {k: v[0] for k, v in dsk.items()}

    # Calculate dependencies without re-calculating already known dependencies
    missing_keys = dsk.keys() - known_key_dependencies.keys()
    dependencies = {
        k: keys_in_tasks(all_hlg_keys, [dsk[k]], as_list=False)
        for k in missing_keys
    }
    for k, v in unpacked_futures_deps.items():
        dependencies[k] = set(dependencies.get(k, ())) | v
    dependencies.update(known_key_dependencies)

    # The scheduler expects all keys to be strings
    dependencies = {
        stringify(k): {stringify(dep) for dep in deps}
        for k, deps in dependencies.items()
    }

    merged_hlg_keys = all_hlg_keys | dsk.keys()
    dsk = {
        stringify(k): stringify(v, exclusive=merged_hlg_keys)
        for k, v in dsk.items()
    }
    dsk = toolz.valmap(dumps_task, dsk)
    return {"dsk": dsk, "dependencies": dependencies}
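
# Illustrative sketch (not part of the layer): how the dependency pass above
# behaves on a toy materialized graph.  ``keys_in_tasks`` scans each task for
# any of the candidate keys; composite keys such as ("x", 0) are only later
# flattened to strings via ``stringify`` for the scheduler.  Import path
# assumes a dask version contemporary with this code.
from operator import add

from dask.core import keys_in_tasks

_toy_dsk = {
    ("x", 0): 1,
    ("x", 1): 2,
    ("y", 0): (add, ("x", 0), ("x", 1)),
}
assert keys_in_tasks(set(_toy_dsk), [_toy_dsk[("y", 0)]], as_list=False) == {
    ("x", 0),
    ("x", 1),
}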
def __dask_distributed_pack__(
    self,
    all_hlg_keys: Iterable[Hashable],
    known_key_dependencies: Mapping[Hashable, Set],
    client,
    client_keys: Iterable[Hashable],
) -> Any:
    """Pack the layer for scheduler communication in Distributed

    This method should pack its current state and is called by the Client when
    communicating with the Scheduler.
    The Scheduler will then use .__dask_distributed_unpack__(data, ...) to
    unpack the state, materialize the layer, and merge it into the global
    task graph.

    The returned state must be compatible with Distributed's scheduler, which
    means it must obey the following:
      - Serializable by msgpack (note: msgpack converts lists to tuples)
      - All remote data must be unpacked (see unpack_remotedata())
      - All keys must be converted to strings now or when unpacking
      - All tasks must be serialized (see dumps_task())

    The default implementation materializes the layer; thus, layers such as
    Blockwise and ShuffleLayer should implement specialized pack and unpack
    functions in order to avoid materialization.

    Parameters
    ----------
    all_hlg_keys: Iterable[Hashable]
        All keys in the high level graph
    known_key_dependencies: Mapping[Hashable, Set]
        Already known dependencies
    client: distributed.Client
        The client calling this function.
    client_keys : Iterable[Hashable]
        List of keys requested by the client.

    Returns
    -------
    state: Object serializable by msgpack
        Scheduler compatible state of the layer
    """
    from distributed.client import Future
    from distributed.utils import CancelledError
    from distributed.utils_comm import subs_multiple, unpack_remotedata
    from distributed.worker import dumps_task

    dsk = dict(self)

    # Find aliases not in `client_keys` and substitute all matching keys
    # with their Future
    future_aliases = {
        k: v
        for k, v in dsk.items()
        if isinstance(v, Future) and k not in client_keys
    }
    if future_aliases:
        dsk = subs_multiple(dsk, future_aliases)

    # Remove `Future` objects from graph and note any future dependencies
    dsk2 = {}
    fut_deps = {}
    for k, v in dsk.items():
        dsk2[k], futs = unpack_remotedata(v, byte_keys=True)
        if futs:
            fut_deps[k] = futs
    dsk = dsk2

    # Check that any collected futures are valid
    unpacked_futures = set.union(*fut_deps.values()) if fut_deps else set()
    for future in unpacked_futures:
        if future.client is not client:
            raise ValueError(
                "Inputs contain futures that were created by another client."
            )
        if stringify(future.key) not in client.futures:
            raise CancelledError(stringify(future.key))

    # Calculate dependencies without re-calculating already known dependencies
    # - Start with known dependencies
    dependencies = ensure_dict(known_key_dependencies, copy=True)

    # - Remove aliases for any tasks that depend on both an alias and a future.
    #   These can only be found in the known_key_dependencies cache, since
    #   any dependencies computed in this method would have already had the
    #   aliases removed.
    if future_aliases:
        alias_keys = set(future_aliases)
        dependencies = {k: v - alias_keys for k, v in dependencies.items()}

    # - Add in deps for any missing keys
    missing_keys = dsk.keys() - dependencies.keys()
    dependencies.update(
        (k, keys_in_tasks(all_hlg_keys, [dsk[k]], as_list=False))
        for k in missing_keys
    )

    # - Add in deps for any tasks that depend on futures
    for k, futures in fut_deps.items():
        if futures:
            d = ensure_set(dependencies[k], copy=True)
            d.update(f.key for f in futures)
            dependencies[k] = d

    # The scheduler expects all keys to be strings
    dependencies = {
        stringify(k): {stringify(dep) for dep in deps}
        for k, deps in dependencies.items()
    }

    merged_hlg_keys = all_hlg_keys | dsk.keys()
    dsk = {
        stringify(k): stringify(v, exclusive=merged_hlg_keys)
        for k, v in dsk.items()
    }
    dsk = toolz.valmap(dumps_task, dsk)
    return {"dsk": dsk, "dependencies": dependencies}
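
# Illustrative sketch (toy data only): the alias-removal step above.  Assume a
# key "alias" pointed at a Future and was substituted out of the graph by
# ``subs_multiple``; any cached dependency set that still mentions it has to
# drop it, otherwise the scheduler would see a dependency on a key that no
# longer exists in this layer.
_known_deps = {("y", 0): {("x", 0), "alias"}}
_alias_keys = {"alias"}
assert {k: v - _alias_keys for k, v in _known_deps.items()} == {("y", 0): {("x", 0)}}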
def __dask_distributed_pack__(
    self, all_hlg_keys, known_key_dependencies, client, client_keys
):
    from distributed.protocol.serialize import import_allowed_module
    from distributed.utils import CancelledError
    from distributed.utils_comm import unpack_remotedata
    from distributed.worker import dumps_function

    keys = tuple(map(blockwise_token, range(len(self.indices))))
    dsk, _ = fuse(self.dsk, [self.output])

    # Embed literals in `dsk`
    keys2 = []
    indices2 = []
    for key, (val, index) in zip(keys, self.indices):
        if index is None:  # Literal
            dsk[key] = val
        else:
            keys2.append(key)
            indices2.append((val, index))

    dsk = (SubgraphCallable(dsk, self.output, tuple(keys2)),)
    dsk, dsk_unpacked_futures = unpack_remotedata(dsk, byte_keys=True)
    func = dumps_function(dsk[0])
    func_future_args = dsk[1:]

    indices = list(toolz.concat(indices2))
    indices, indices_unpacked_futures = unpack_remotedata(indices, byte_keys=True)

    # Check the legality of the unpacked futures
    for future in itertools.chain(dsk_unpacked_futures, indices_unpacked_futures):
        if future.client is not client:
            raise ValueError(
                "Inputs contain futures that were created by another client."
            )
        if stringify(future.key) not in client.futures:
            raise CancelledError(stringify(future.key))

    # All blockwise tasks will depend on the futures in `indices`
    global_dependencies = {stringify(f.key) for f in indices_unpacked_futures}

    # Handle `io_deps` serialization.
    # If `io_deps[<collection_key>]` is just a dict, we rely
    # entirely on msgpack. It is up to the `Blockwise` layer to
    # ensure that all arguments are msgpack serializable. To enable
    # more control over serialization, a `BlockwiseIODeps` mapping
    # subclass can be defined with the necessary
    # `__dask_distributed_{pack,unpack}__` methods.
    packed_io_deps = {}
    for name, input_map in self.io_deps.items():
        if isinstance(input_map, tuple):
            # Use the `__dask_distributed_pack__` definition for the
            # specified `BlockwiseIODeps` subclass
            module_name, attr_name = input_map[0].rsplit(".", 1)
            io_dep_map = getattr(import_allowed_module(module_name), attr_name)
            packed_io_deps[name] = io_dep_map.__dask_distributed_pack__(*input_map)
        else:
            packed_io_deps[name] = input_map

    return {
        "output": self.output,
        "output_indices": self.output_indices,
        "func": func,
        "func_future_args": func_future_args,
        "global_dependencies": global_dependencies,
        "indices": indices,
        "is_list": [isinstance(x, list) for x in indices],
        "numblocks": self.numblocks,
        "concatenate": self.concatenate,
        "new_axes": self.new_axes,
        "output_blocks": self.output_blocks,
        "dims": self.dims,
        "io_deps": packed_io_deps,
    }
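
# Illustrative sketch (hypothetical names, not dask API): the shape the
# ``io_deps`` branch above expects when a tuple is supplied.  ``MyIODeps``
# stands in for a user-defined ``BlockwiseIODeps``-style mapping; the first
# tuple element is the fully qualified class name that
# ``import_allowed_module`` + ``getattr`` resolve on the scheduler side, and
# the remaining elements are whatever the class chooses to pack, as long as
# the result is msgpack-serializable.
class MyIODeps:  # hypothetical subclass, for illustration only
    @classmethod
    def __dask_distributed_pack__(cls, qualname, *args):
        # Echo the qualified name so the unpack side knows which class to
        # resolve, and pass the remaining arguments through untouched.
        return (qualname, list(args))


_packed = MyIODeps.__dask_distributed_pack__("mymodule.MyIODeps", {"blocksize": 128})
assert _packed == ("mymodule.MyIODeps", [{"blocksize": 128}])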