def test_async(c, s, a, b):
    x = create_test_data()
    assert not dask.is_dask_collection(x)
    y = x.chunk({'dim2': 4}) + 10
    assert dask.is_dask_collection(y)
    assert dask.is_dask_collection(y.var1)
    assert dask.is_dask_collection(y.var2)

    z = y.persist()
    assert str(z)

    assert dask.is_dask_collection(z)
    assert dask.is_dask_collection(z.var1)
    assert dask.is_dask_collection(z.var2)
    assert len(y.__dask_graph__()) > len(z.__dask_graph__())

    assert not futures_of(y)
    assert futures_of(z)

    future = c.compute(z)
    w = yield future
    assert not dask.is_dask_collection(w)
    assert_allclose(x + 10, w)

    assert s.tasks
def test_publish_bag(s, a, b):
    db = pytest.importorskip('dask.bag')
    c = yield Client((s.ip, s.port), asynchronous=True)
    f = yield Client((s.ip, s.port), asynchronous=True)

    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    assert len(futures_of(bagp)) == 3
    keys = {f.key for f in futures_of(bagp)}
    assert keys == set(bag.dask)

    yield c.publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = yield f.get_dataset('data')
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    assert {f.key for f in result.dask.values()} == {f.key for f in bagp.dask.values()}

    out = yield f.compute(result)
    assert out == [0, 1, 2]

    yield c.close()
    yield f.close()
async def test_publish_bag(s, a, b):
    db = pytest.importorskip("dask.bag")
    c = await Client(s.address, asynchronous=True)
    f = await Client(s.address, asynchronous=True)

    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    assert len(futures_of(bagp)) == 3
    keys = {f.key for f in futures_of(bagp)}
    assert keys == set(bag.dask)

    await c.publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = await f.get_dataset("data")
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    assert {f.key for f in result.dask.values()} == {f.key for f in bagp.dask.values()}

    out = await f.compute(result)
    assert out == [0, 1, 2]

    await c.close()
    await f.close()
def tqdm_dask(futures, **kwargs):
    notebook = is_kernel()
    futures = futures_of(futures)
    if not isinstance(futures, (set, list)):
        futures = [futures]
    if notebook:
        # Return the widget so the notebook renders it
        return TqdmNotebookProgress(futures, **kwargs)
    else:
        # On a console the bar draws as a side effect; nothing is returned
        TqdmProgressBar(futures, **kwargs)
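# Usage sketch (hypothetical), assuming the tqdm-based progress classes above
# are importable and a distributed.Client is running; extra kwargs are passed
# straight through to the tqdm bar.
import dask.array as da
from distributed import Client

client = Client()
y = client.persist(da.ones((1000,), chunks=100) + 1)
tqdm_dask(y)  # notebook widget in a kernel; plain tqdm bar on a console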
async def _get_errored_future(self, future):
    """
    For a given future or collection of futures, return the key of the
    future whose error caused the failure.
    """
    await wait(future)
    futures = [f.key for f in futures_of(future) if f.status == "error"]
    if not futures:
        raise ValueError("No errored futures passed")
    cause_key = await self.scheduler.get_error_cause(keys=futures)
    return cause_key
async def test_async(c, s, a, b):
    """Test asynchronous operations."""
    da = dask.array.arange(0, 25, chunks=5, dtype=float).reshape((5, 5))
    q = ureg.Quantity(da, units_)

    x = q + ureg.Quantity(5, units_)
    y = x.persist()
    assert str(y)

    assert dask.is_dask_collection(y)
    assert len(x.__dask_graph__()) > len(y.__dask_graph__())

    assert not futures_of(x)
    assert futures_of(y)

    future = c.compute(y)
    w = await future
    assert not dask.is_dask_collection(w)

    truth = np.arange(0, 25, dtype=float).reshape((5, 5)) + 5
    assert np.all(truth == w.m)
def release_collection(collection, client=None):
    """
    An explicit unpersist() function for dask collections, for when you
    can't merely release the reference because it is held by a
    downstream persisted() task.

    Copied from:
    - https://github.com/dask/dask/issues/2492
    - https://stackoverflow.com/questions/44797668
    """
    if client is None or isinstance(client, DebugClient):
        return
    for future in futures_of(collection):
        future.release()
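# Usage sketch (hypothetical): persist a collection, use it, then explicitly
# free the futures that back it. Assumes a running distributed.Client;
# DebugClient is whatever offline stub this codebase substitutes for a client.
import dask.array as da
from distributed import Client

client = Client()
x = client.persist(da.ones((1000, 1000), chunks=(100, 100)))
result = x.sum().compute()  # downstream work that held references to x
release_collection(x, client=client)  # drop the persisted futures on the cluster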
def progress(*futures, notebook=None, multi=True, complete=True, **kwargs):
    """Track progress of futures

    This operates differently in the notebook and the console

    * Notebook: This returns immediately, leaving an IPython widget on screen
    * Console: This blocks until the computation completes

    Parameters
    ----------
    futures : Futures
        A list of futures or keys to track
    notebook : bool (optional)
        Running in the notebook or not (defaults to guess)
    multi : bool (optional)
        Track different functions independently (defaults to True)
    complete : bool (optional)
        Track all keys (True) or only keys that have not yet run (False)
        (defaults to True)

    Notes
    -----
    In the notebook, the output of `progress` must be the last statement
    in the cell. Typically, this means calling `progress` at the end of a
    cell.

    Examples
    --------
    >>> progress(futures)  # doctest: +SKIP
    [########################################] | 100% Completed | 1.7s
    """
    futures = futures_of(futures)
    if not isinstance(futures, (set, list)):
        futures = [futures]
    if notebook is None:
        notebook = is_kernel()  # often but not always correct assumption
    if notebook:
        if multi:
            bar = MultiProgressWidget(futures, complete=complete, **kwargs)
        else:
            bar = ProgressWidget(futures, complete=complete, **kwargs)
        return bar
    else:
        TextProgressBar(futures, complete=complete, **kwargs)
def progress(*futures):
    """Track progress of dask computation in a remote cluster.

    LogProgressBar is defined inside here to avoid having to import its
    dependencies if not used.
    """
    # Import distributed only when used
    from distributed.client import futures_of  # pylint: disable=C0415
    from distributed.diagnostics.progressbar import TextProgressBar  # pylint: disable=c0415

    class LogProgressBar(TextProgressBar):
        """Dask progress bar based on logging instead of stdout."""

        last = 0
        logger = logging.getLogger('distributed')

        def _draw_bar(self, remaining, all, **kwargs):  # pylint: disable=W0221,W0622
            done = all - remaining
            frac = (done / all) if all else 0
            if frac > self.last + 0.01:
                self.last = int(frac * 100) / 100
                bar = "#" * int(self.width * frac)
                percent = int(100 * frac)
                time_per_task = self.elapsed / (all - remaining)
                remaining_time = timedelta(seconds=time_per_task * remaining)
                eta = datetime.utcnow() + remaining_time
                elapsed = timedelta(seconds=self.elapsed)
                msg = "[{0:<{1}}] | {2}/{3} ({4}%) Completed | {5} | {6} | {7}".format(
                    bar, self.width, done, all, percent, elapsed, remaining_time, eta
                )
                self.logger.info(msg)
                LOGGER.info(msg)

        def _draw_stop(self, **kwargs):
            pass

    futures = futures_of(futures)
    if not isinstance(futures, (set, list)):
        futures = [futures]
    LogProgressBar(futures)
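# Usage sketch (hypothetical): route progress output through the logging
# machinery instead of stdout, which suits remote clusters where stdout is
# not visible. Assumes a running distributed.Client and the logging-based
# progress() defined above.
import logging
import dask.array as da
from distributed import Client

logging.basicConfig(level=logging.INFO)
client = Client()
total = client.persist(da.ones((10_000,), chunks=1_000).sum())
progress(total)  # emits INFO records like "[####      ] | 4/10 (40%) Completed | ..."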
def _compute(collection, recreate_error_locally=True, **kwargs):
    """This function works exactly as dask.compute, but automatically
    recreates the error locally so that it can be inspected with pdb.
    """
    try:
        client = distributed.Client.current()
    except ValueError as e:
        if "No clients found" not in str(e):
            raise e
        client = None
    if client:
        collection = collection.persist()
        futures = futures_of(collection)
        distributed.wait(futures)
        gathered = gather(futures, recreate_error_locally)
        f, a = collection.__dask_postcompute__()
        res = f(gathered, *a)
        return res
    else:
        return collection.compute(
            recreate_error_locally=recreate_error_locally)
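# Usage sketch (hypothetical): call the helper above instead of .compute() so
# a failing task is re-raised locally and can be stepped through with pdb.
# Assumes an active distributed.Client, so the persist/gather branch is taken.
import dask.array as da
from distributed import Client

client = Client()
x = da.arange(10, chunks=5) + 1
print(_compute(x))  # same result as x.compute(), but errors are debuggable locally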
def progress(*futures):
    futures = futures_of(futures)
    if not isinstance(futures, (set, list)):
        futures = [futures]
    LogProgressBar(futures)
some_bag.npartitions


# In[ ]:


distributed_array.chunks


# In[ ]:


import dask.dataframe as dd

df = dd.from_dask_array(distributed_array)


# In[ ]:


df.index


# In[ ]:


#tag::manual_persist[]
df = df.persist()  # persist() must be called, and its return value kept
# You do a bunch of things on DF
# I'm done!
from distributed.client import futures_of
list(map(lambda x: x.release(), futures_of(df)))
#end::manual_persist[]