Example No. 1
def test_async(c, s, a, b):
    x = create_test_data()
    assert not dask.is_dask_collection(x)
    y = x.chunk({'dim2': 4}) + 10
    assert dask.is_dask_collection(y)
    assert dask.is_dask_collection(y.var1)
    assert dask.is_dask_collection(y.var2)

    z = y.persist()
    assert str(z)

    assert dask.is_dask_collection(z)
    assert dask.is_dask_collection(z.var1)
    assert dask.is_dask_collection(z.var2)
    assert len(y.__dask_graph__()) > len(z.__dask_graph__())

    assert not futures_of(y)
    assert futures_of(z)

    future = c.compute(z)
    w = yield future
    assert not dask.is_dask_collection(w)
    assert_allclose(x + 10, w)

    assert s.tasks
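
These yield-based tests are excerpts: in the distributed test suite they are normally driven by the gen_cluster decorator, which supplies the c, s, a, b arguments (client, scheduler, and two workers) on a temporary cluster. A minimal sketch of the decoration, assuming current distributed.utils_test (the test body here is illustrative):

from distributed.utils_test import gen_cluster

@gen_cluster(client=True)
async def test_something(c, s, a, b):
    # c: Client, s: Scheduler, a and b: Workers on a temporary cluster
    future = c.submit(lambda x: x + 1, 10)
    assert (await future) == 11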
Example No. 2
def test_publish_bag(s, a, b):
    db = pytest.importorskip('dask.bag')
    c = yield Client((s.ip, s.port), asynchronous=True)
    f = yield Client((s.ip, s.port), asynchronous=True)

    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    assert len(futures_of(bagp)) == 3
    keys = {f.key for f in futures_of(bagp)}
    assert keys == set(bag.dask)

    yield c.publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = yield f.get_dataset('data')
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    assert {f.key for f in result.dask.values()} == {f.key for f in bagp.dask.values()}

    out = yield f.compute(result)
    assert out == [0, 1, 2]
    yield c.close()
    yield f.close()
Example No. 3
async def test_publish_bag(s, a, b):
    db = pytest.importorskip("dask.bag")
    c = await Client(s.address, asynchronous=True)
    f = await Client(s.address, asynchronous=True)

    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    assert len(futures_of(bagp)) == 3
    keys = {f.key for f in futures_of(bagp)}
    assert keys == set(bag.dask)

    await c.publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = await f.get_dataset("data")
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    assert {f.key for f in result.dask.values()} == {f.key for f in bagp.dask.values()}

    out = await f.compute(result)
    assert out == [0, 1, 2]
    await c.close()
    await f.close()
Example No. 4
def tqdm_dask(futures, **kwargs):
    notebook = is_kernel()
    futures = futures_of(futures)
    if not isinstance(futures, (set, list)):
        futures = [futures]
    if notebook:
        return TqdmNotebookProgress(futures, **kwargs)
    else:
        # Console bar is created for its side effects (it draws as tasks
        # finish); nothing is returned
        TqdmProgressBar(futures, **kwargs)
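
A hedged usage sketch for the helper above (assumes a running distributed.Client; the array and chunking are arbitrary):

import dask.array as da
from distributed import Client

client = Client()  # starts a local cluster by default
x = da.random.random((2000, 2000), chunks=(500, 500)).sum()
x = client.persist(x)  # kick off computation in the background
tqdm_dask(x)  # attach a tqdm progress bar to the underlying futures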
Example No. 5
async def _get_errored_future(self, future):
    """
    For a given future collection, return the key of the task that
    caused the error.
    """
    await wait(future)
    futures = [f.key for f in futures_of(future) if f.status == "error"]
    if not futures:
        raise ValueError("No errored futures passed")
    cause_key = await self.scheduler.get_error_cause(keys=futures)
    return cause_key
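
This is a private Client helper; from user code the same machinery is reachable through the public recreate_error_locally method. A minimal sketch (the failing function is illustrative):

from distributed import Client

def fail(x):
    raise ValueError(f"bad input: {x!r}")

client = Client()
future = client.submit(fail, 1)
try:
    # Re-runs the failing task in the local process so the original
    # traceback can be stepped through with pdb
    client.recreate_error_locally(future)
except ValueError as err:
    print("reproduced locally:", err)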
Example No. 6
async def test_async(c, s, a, b):
    """Test asynchronous operations."""
    da = dask.array.arange(0, 25, chunks=5, dtype=float).reshape((5, 5))
    q = ureg.Quantity(da, units_)

    x = q + ureg.Quantity(5, units_)
    y = x.persist()
    assert str(y)

    assert dask.is_dask_collection(y)
    assert len(x.__dask_graph__()) > len(y.__dask_graph__())

    assert not futures_of(x)
    assert futures_of(y)

    future = c.compute(y)
    w = await future
    assert not dask.is_dask_collection(w)

    truth = np.arange(0, 25, dtype=float).reshape((5, 5)) + 5
    assert np.all(truth == w.m)
Example No. 7
def release_collection(collection, client=None):
    """
    An explicit unpersist() function for dask collections,
    for when you can't merely release the reference because
    it is held by a downstream persisted() task.

    Copied from:
        - https://github.com/dask/dask/issues/2492
        - https://stackoverflow.com/questions/44797668
    """
    if client is None or isinstance(client, DebugClient):
        return

    for future in futures_of(collection):
        future.release()
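
A short usage sketch (assumes a live distributed.Client; DebugClient in the guard above is project-specific):

import dask.array as da
from distributed import Client

client = Client()
x = client.persist(da.ones((1000, 1000), chunks=(250, 250)))

# ... downstream work may still hold references to x ...

# Explicitly drop the persisted chunks even though `x` is still in scope
release_collection(x, client=client)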
Example No. 8
def progress(*futures, notebook=None, multi=True, complete=True, **kwargs):
    """Track progress of futures

    This operates differently in the notebook and the console

    *  Notebook:  This returns immediately, leaving an IPython widget on screen
    *  Console:  This blocks until the computation completes

    Parameters
    ----------
    futures : Futures
        A list of futures or keys to track
    notebook : bool (optional)
        Running in the notebook or not (defaults to guess)
    multi : bool (optional)
        Track different functions independently (defaults to True)
    complete : bool (optional)
        Track all keys (True) or only keys that have not yet run (False)
        (defaults to True)

    Notes
    -----
    In the notebook, the output of `progress` must be the last statement
    in the cell. Typically, this means calling `progress` at the end of a
    cell.

    Examples
    --------
    >>> progress(futures)  # doctest: +SKIP
    [########################################] | 100% Completed |  1.7s
    """
    futures = futures_of(futures)
    if not isinstance(futures, (set, list)):
        futures = [futures]
    if notebook is None:
        notebook = is_kernel()  # often but not always correct assumption
    if notebook:
        if multi:
            bar = MultiProgressWidget(futures, complete=complete, **kwargs)
        else:
            bar = ProgressWidget(futures, complete=complete, **kwargs)
        return bar
    else:
        TextProgressBar(futures, complete=complete, **kwargs)
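
For context, an end-to-end sketch of driving this helper (the collection and chunk sizes are arbitrary):

import dask.array as da
from distributed import Client

client = Client()
mean = client.persist(da.random.random((10_000, 10_000), chunks=(1_000, 1_000)).mean())
progress(mean)  # widget in a notebook; blocking text bar in a console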
Example No. 9
def progress(*futures):
    """Track progress of dask computation in a remote cluster.

    LogProgressBar is defined inside here to avoid having to import
    its dependencies if not used.
    """
    # Import distributed only when used
    from distributed.client import futures_of  # pylint: disable=C0415
    from distributed.diagnostics.progressbar import TextProgressBar  # pylint: disable=C0415

    class LogProgressBar(TextProgressBar):
        """Dask progress bar based on logging instead of stdout."""

        last = 0
        logger = logging.getLogger('distributed')

        def _draw_bar(self, remaining, all, **kwargs):   # pylint: disable=W0221,W0622
            done = all - remaining
            frac = (done / all) if all else 0

            if frac > self.last + 0.01:
                self.last = int(frac * 100) / 100
                bar = "#" * int(self.width * frac)
                percent = int(100 * frac)

                time_per_task = self.elapsed / (all - remaining)
                remaining_time = timedelta(seconds=time_per_task * remaining)
                eta = datetime.utcnow() + remaining_time

                elapsed = timedelta(seconds=self.elapsed)
                msg = "[{0:<{1}}] | {2}/{3} ({4}%) Completed | {5} | {6} | {7}".format(
                    bar, self.width, done, all, percent, elapsed, remaining_time, eta
                )
                self.logger.info(msg)
                LOGGER.info(msg)

        def _draw_stop(self, **kwargs):
            pass

    futures = futures_of(futures)
    if not isinstance(futures, (set, list)):
        futures = [futures]

    LogProgressBar(futures)
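
A hedged usage note: because this bar reports through the "distributed" logger (and the module-level LOGGER), INFO-level logging must be enabled for any output to appear, e.g.:

import logging

logging.basicConfig(level=logging.INFO)  # let INFO records through
# then, with a persisted dask collection `coll`:
# progress(coll)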
Example No. 10
def _compute(collection, recreate_error_locally=True, **kwargs):
    """This function works exactly as dask.compute, but automatically
    recreates the error locally so that it can be inspected with pdb.
    """
    try:
        client = distributed.Client.current()
    except ValueError as e:
        if "No clients found" not in str(e):
            raise e
        client = None

    if client:
        collection = collection.persist()
        futures = futures_of(collection)
        distributed.wait(futures)
        gathered = gather(futures, recreate_error_locally)

        f, a = collection.__dask_postcompute__()
        res = f(gathered, *a)

        return res
    else:
        return collection.compute(
            recreate_error_locally=recreate_error_locally)
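
A hedged usage sketch (note that gather here is this snippet's own error-recreating helper, not the plain distributed gather; the failing delayed function is illustrative):

import dask

@dask.delayed
def explode():
    raise RuntimeError("boom")

# With a distributed client connected, the RuntimeError is re-raised
# in the local process so it can be inspected with pdb
_compute(explode())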
Example No. 11
def progress(*futures):
    futures = futures_of(futures)
    if not isinstance(futures, (set, list)):
        futures = [futures]

    LogProgressBar(futures)
Example No. 12
some_bag.npartitions

distributed_array.chunks

import dask.dataframe as dd
df = dd.from_dask_array(distributed_array)

df.index

#tag::manual_persist[]
df = df.persist()
# You do a bunch of things on df

# I'm done! Explicitly release the futures backing the persisted df.
from distributed.client import futures_of
list(map(lambda x: x.release(), futures_of(df)))
#end::manual_persist[]
