async def test_resources_str(c, s, a, b):
    pd = pytest.importorskip("pandas")
    dd = pytest.importorskip("dask.dataframe")

    await a.set_resources(MyRes=1)

    x = dd.from_pandas(pd.DataFrame({"A": [1, 2], "B": [3, 4]}), npartitions=1)
    y = x.apply(lambda row: row.sum(), axis=1, meta=(None, "int64"))
    yy = y.persist(resources={"MyRes": 1})
    await wait(yy)

    ts_first = s.tasks[tokey(y.__dask_keys__()[0])]
    assert ts_first.resource_restrictions == {"MyRes": 1}
    ts_last = s.tasks[tokey(y.__dask_keys__()[-1])]
    assert ts_last.resource_restrictions == {"MyRes": 1}
def put(self, stream=None, keys=None, data=None, name=None, client=None):
    with log_errors():
        if name in self.datasets:
            raise KeyError("Dataset %s already exists" % name)
        self.scheduler.client_desires_keys(keys, "published-%s" % tokey(name))
        self.datasets[name] = {"data": data, "keys": keys}
        return {"status": "OK", "name": name}
def test_compute(c, s, a, b):
    da = pytest.importorskip("dask.array")
    x = da.random.random((10, 10), chunks=(5, 5))
    y = da.random.random((10, 10), chunks=(5, 5))

    low = c.compute(x, priority=-1)
    futures = c.map(slowinc, range(10), delay=0.1)
    high = c.compute(y, priority=1)

    yield wait(high)
    assert all(s.processing.values())
    assert s.tasks[tokey(low.key)].state in ("processing", "waiting")
def test_persist(c, s, a, b):
    da = pytest.importorskip("dask.array")
    x = da.random.random((10, 10), chunks=(5, 5))
    y = da.random.random((10, 10), chunks=(5, 5))

    low = x.persist(priority=-1)
    futures = c.map(slowinc, range(10), delay=0.1)
    high = y.persist(priority=1)

    yield wait(high)
    assert all(s.processing.values())
    assert all(
        s.tasks[tokey(k)].state in ("processing", "waiting")
        for k in flatten(low.__dask_keys__())
    )
def test_persist_collections(c, s, a, b):
    da = pytest.importorskip('dask.array')
    x = da.arange(10, chunks=(5,))
    y = x.map_blocks(lambda x: x + 1)
    z = y.map_blocks(lambda x: 2 * x)
    w = z.sum()

    ww, yy = c.persist([w, y], resources={tuple(y.__dask_keys__()): {'A': 1}})

    yield wait([ww, yy])

    assert all(tokey(key) in a.data for key in y.__dask_keys__())
def test_compute_multidim(c, s, a, b):
    da = pytest.importorskip('dask.array')
    np = pytest.importorskip('numpy')

    x = delayed(np.random.randint)(0, 10, (5, 5))
    y = da.from_delayed(x, (5, 5), int)

    xx = c.compute(x, resources={x: {'A': 1}})
    yy = c.compute(tuple(y.__dask_keys__()), resources={x: {'A': 1}})
    yield wait([xx, yy])

    assert all(tokey(key) in a.data for key in x.__dask_keys__())
    assert all(tokey(key) in a.data for key in y.__dask_keys__())
def key_stringify(task):
    """Convert all keys in `task` to strings.

    This is a fast version of distributed.utils.str_graph()
    that only handles keys of the form: `("a string", ...)`
    """
    from distributed.utils import tokey

    typ = type(task)
    if typ is tuple and task and callable(task[0]):
        return (task[0],) + tuple(key_stringify(x) for x in task[1:])
    if typ is list:
        return [key_stringify(v) for v in task]
    if typ is dict:
        return {k: key_stringify(v) for k, v in task.items()}
    if typ is tuple and task and type(task[0]) is str:
        return tokey(task)
    elif typ is tuple:
        # If the tuple itself isn't a key, check its elements
        return tuple(key_stringify(v) for v in task)
    return task
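key_stringify above leans on tokey to turn tuple keys into strings. The following is a minimal sketch of that behaviour, not taken from the original source, assuming tokey is importable from distributed.utils as the surrounding snippets do:

# Illustrative sketch (assumption, not the library's documented contract):
# tokey maps a scheduler key to its string form.
from distributed.utils import tokey

assert tokey("x") == "x"                  # plain string keys pass through unchanged
assert tokey(("x", 1)) == str(("x", 1))   # tuple keys become their string repr, "('x', 1)"
assert tokey(1) == "1"                    # other simple keys are stringified as well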
def __dask_distributed_unpack__(cls, state, dsk, dependencies):
    from distributed.worker import dumps_task
    from distributed.utils import tokey

    # msgpack will convert lists into tuples, here
    # we convert them back to lists
    if isinstance(state["column"], tuple):
        state["column"] = list(state["column"])
    if "inputs" in state:
        state["inputs"] = list(state["inputs"])

    # Materialize the layer
    raw = dict(cls(**state))

    # Convert all keys to strings and dump tasks
    raw = {tokey(k): key_stringify(v) for k, v in raw.items()}
    dsk.update(toolz.valmap(dumps_task, raw))

    # TODO: use shuffle-knowledge to calculate dependencies more efficiently
    dependencies.update(
        {k: keys_in_tasks(dsk, [v], as_list=True) for k, v in raw.items()}
    )
def test_pack_data_with_key_mapping():
    data = {tokey(('x', 1)): 1}
    assert pack_data((('x', 1), 'y'), data) == (1, 'y')
def delete(self, stream=None, name=None):
    with log_errors():
        out = self.datasets.pop(name, {"keys": []})
        self.scheduler.client_releases_keys(out["keys"], "published-%s" % tokey(name))