def test_ensure_set():
    s = {1}
    assert ensure_set(s) is s

    class myset(set):
        pass

    s2 = ensure_set(s, copy=True)
    s3 = ensure_set(myset(s))
    for si in (s2, s3):
        assert type(si) is set
        assert si is not s
        assert si == s
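# For context, a minimal sketch of an ``ensure_set`` helper consistent with the
# behaviour exercised above. It is inferred from the test, not copied from the
# real helper in dask's utilities: a plain ``set`` passes through untouched
# unless ``copy=True``, while ``set`` subclasses are always converted to a
# plain ``set``. The name ``_ensure_set_sketch`` is hypothetical, chosen to
# avoid shadowing the imported ``ensure_set``.
def _ensure_set_sketch(s, copy=False):
    """Return ``s`` as a plain ``set``, avoiding a copy when possible."""
    if type(s) is set and not copy:
        return s
    return set(s)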
def __dask_distributed_pack__(
    self,
    all_hlg_keys: Iterable[Hashable],
    known_key_dependencies: Mapping[Hashable, Set],
    client,
    client_keys: Iterable[Hashable],
) -> Any:
    """Pack the layer for scheduler communication in Distributed

    This method should pack its current state and is called by the Client when
    communicating with the Scheduler. The Scheduler will then use
    .__dask_distributed_unpack__(data, ...) to unpack the state, materialize
    the layer, and merge it into the global task graph.

    The returned state must be compatible with Distributed's scheduler, which
    means it must obey the following:
      - Serializable by msgpack (notice, msgpack converts lists to tuples)
      - All remote data must be unpacked (see unpack_remotedata())
      - All keys must be converted to strings now or when unpacking
      - All tasks must be serialized (see dumps_task())

    The default implementation materializes the layer; layers such as Blockwise
    and ShuffleLayer should therefore implement specialized pack and unpack
    functions to avoid materialization.

    Parameters
    ----------
    all_hlg_keys: Iterable[Hashable]
        All keys in the high level graph
    known_key_dependencies: Mapping[Hashable, Set]
        Already known dependencies
    client: distributed.Client
        The client calling this function.
    client_keys : Iterable[Hashable]
        List of keys requested by the client.

    Returns
    -------
    state: Object serializable by msgpack
        Scheduler compatible state of the layer
    """
    from distributed.client import Future
    from distributed.utils import CancelledError
    from distributed.utils_comm import subs_multiple, unpack_remotedata
    from distributed.worker import dumps_task

    dsk = dict(self)

    # Find aliases not in `client_keys` and substitute all matching keys
    # with their Futures
    future_aliases = {
        k: v
        for k, v in dsk.items()
        if isinstance(v, Future) and k not in client_keys
    }
    if future_aliases:
        dsk = subs_multiple(dsk, future_aliases)

    # Remove `Future` objects from graph and note any future dependencies
    dsk2 = {}
    fut_deps = {}
    for k, v in dsk.items():
        dsk2[k], futs = unpack_remotedata(v, byte_keys=True)
        if futs:
            fut_deps[k] = futs
    dsk = dsk2

    # Check that any collected futures are valid
    unpacked_futures = set.union(*fut_deps.values()) if fut_deps else set()
    for future in unpacked_futures:
        if future.client is not client:
            raise ValueError(
                "Inputs contain futures that were created by another client."
            )
        if stringify(future.key) not in client.futures:
            raise CancelledError(stringify(future.key))

    # Calculate dependencies without re-calculating already known dependencies
    # - Start with known dependencies
    dependencies = ensure_dict(known_key_dependencies, copy=True)

    # - Remove aliases for any tasks that depend on both an alias and a future.
    #   These can only be found in the known_key_dependencies cache, since
    #   any dependencies computed in this method would have already had the
    #   aliases removed.
    if future_aliases:
        alias_keys = set(future_aliases)
        dependencies = {k: v - alias_keys for k, v in dependencies.items()}

    # - Add in deps for any missing keys
    missing_keys = dsk.keys() - dependencies.keys()
    dependencies.update(
        (k, keys_in_tasks(all_hlg_keys, [dsk[k]], as_list=False))
        for k in missing_keys
    )

    # - Add in deps for any tasks that depend on futures
    for k, futures in fut_deps.items():
        if futures:
            d = ensure_set(dependencies[k], copy=True)
            d.update(f.key for f in futures)
            dependencies[k] = d

    # The scheduler expects all keys to be strings
    dependencies = {
        stringify(k): {stringify(dep) for dep in deps}
        for k, deps in dependencies.items()
    }
    merged_hlg_keys = all_hlg_keys | dsk.keys()
    dsk = {
        stringify(k): stringify(v, exclusive=merged_hlg_keys)
        for k, v in dsk.items()
    }
    dsk = toolz.valmap(dumps_task, dsk)
    return {"dsk": dsk, "dependencies": dependencies}
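
# A standalone sketch (not part of the layer class) of the dependency and key
# stringification steps above, applied to a toy two-task graph. It only uses
# ``keys_in_tasks`` and ``stringify``, which the method itself relies on; the
# toy graph and variable names are made up purely for illustration.
if __name__ == "__main__":
    from operator import add

    from dask.core import keys_in_tasks
    from dask.utils import stringify

    toy_dsk = {
        ("x", 0): 1,
        ("y", 0): (add, ("x", 0), 10),
    }
    toy_keys = set(toy_dsk)

    # Per-key dependencies, as computed above for keys missing from the
    # known-dependencies cache
    toy_deps = {
        k: keys_in_tasks(toy_keys, [v], as_list=False) for k, v in toy_dsk.items()
    }

    # The scheduler expects string keys; ``exclusive`` restricts stringification
    # inside task bodies to actual graph keys so plain literals are left alone.
    print({stringify(k): {stringify(d) for d in v} for k, v in toy_deps.items()})
    print({stringify(k): stringify(v, exclusive=toy_keys) for k, v in toy_dsk.items()})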