def test_prefer_cheap_dependent():
    """Compare the scorer costs recorded for a task and its dependent.

    ``'y'`` takes ``'x'`` as an input.  After the graph is run inside a
    ``Cache`` context, the cost stored for ``'x'`` must be strictly lower
    than the cost stored for ``'y'``.
    """
    graph = {
        'x': (f, 0.01, 10),
        'y': (f, 0.000001, 1, 'x'),
    }
    cache = Cache(10000)

    with cache:
        get_sync(graph, 'y')

    costs = cache.cache.scorer.cost
    assert costs['x'] < costs['y']
def test_shuffle():
    """Shuffle ``d`` on column ``b`` into two partitions and check the split."""
    shuffled = shuffle(d, d.b, npartitions=2)
    assert isinstance(shuffled, dd.DataFrame)
    assert shuffled.npartitions == 2

    first = get_sync(shuffled.dask, (shuffled._name, 0))
    second = get_sync(shuffled.dask, (shuffled._name, 1))

    # The two output partitions must not share any value of ``b``.
    overlap = set(first.b) & set(second.b)
    assert not overlap
def test_callback():
    """Running a graph inside a ``Callback`` context triggers its ``_start`` hook."""
    seen = {'start': False}

    class MyCallback(Callback):
        def _start(self, dsk):
            # Closure over ``seen`` so the outer scope can observe the call.
            seen['start'] = True

    graph = {'x': 1}
    with MyCallback():
        get_sync(graph, 'x')

    assert seen['start'] is True
def test_start_callback():
    """The ``_start`` hook of an active ``Callback`` is invoked by ``get_sync``."""
    record = {'fired': False}

    class MyCallback(Callback):
        def _start(self, dsk):
            record['fired'] = True

    callback = MyCallback()
    with callback:
        get_sync({'x': 1}, 'x')

    assert record['fired'] is True
def test_ordering():
    """Tasks keyed ``('x', 0)`` .. ``('x', 9)`` must execute in ascending order.

    Each task records its own number when it runs; the recorded sequence
    has to be sorted already once the dependent task ``'y'`` was computed.
    """
    executed = []

    def record(i):
        executed.append(i)

    dsk = {}
    for i in range(10):
        dsk[('x', i)] = (record, i)

    # ``'y'`` depends on every ``('x', i)`` task, so asking for it runs them all.
    # ``sorted(dsk)`` is evaluated before ``'y'`` is inserted.
    dsk['y'] = (lambda *args: None, sorted(dsk))

    get_sync(dsk, 'y')

    assert executed == sorted(executed)
def test_cache_options():
    """A cache given through ``dask.set_options(cache=...)`` is filled during a run.

    While ``'x'`` is being computed, its dependency ``'y'`` must already be
    present in the supplied ``Chest``.  Returns without checking anything
    when ``chest`` cannot be imported.
    """
    try:
        from chest import Chest
    except ImportError:
        return

    store = Chest()

    def inc2(x):
        # Runs as the ``'x'`` task: ``'y'`` has to be in the store by now.
        assert 'y' in store
        return x + 1

    graph = {'x': (inc2, 'y'), 'y': 1}
    with dask.set_options(cache=store):
        get_sync(graph, 'x')
def test_shuffle(shuffle):
    """Shuffle ``d`` on column ``b`` with the given ``shuffle`` method.

    Checks the result type, that the partition count is unchanged, that the
    first two partitions share no value of ``b``, that the input graph is
    contained in the output graph, and that the result name is the same
    across two identical calls.
    """
    shuffled = shuffle_func(d, d.b, shuffle=shuffle)
    assert isinstance(shuffled, dd.DataFrame)
    assert shuffled.npartitions == d.npartitions

    part0 = get_sync(shuffled.dask, (shuffled._name, 0))
    part1 = get_sync(shuffled.dask, (shuffled._name, 1))

    # The two partitions must not share any value of ``b``.
    shared = set(part0.b) & set(part1.b)
    assert not shared

    # Every key of the input graph is still present in the shuffled graph.
    assert set(d.dask) <= set(shuffled.dask)

    # Two identical calls must produce the same name.
    name_a = shuffle_func(d, d.b)._name
    name_b = shuffle_func(d, d.b)._name
    assert name_a == name_b
def test_start_state_callback():
    """The ``_start_state`` hook receives the graph and the scheduler state.

    Inside the hook the graph must map ``'x'`` to ``1`` and the state's
    ``'cache'`` must hold exactly one entry.
    """
    seen = {'start_state': False}

    class MyCallback(Callback):
        def _start_state(self, dsk, state):
            seen['start_state'] = True
            assert dsk['x'] == 1
            assert len(state['cache']) == 1

    graph = {'x': 1}
    with MyCallback():
        get_sync(graph, 'x')

    assert seen['start_state'] is True
def test_finish_always_called():
    """``_finish`` must run, with ``errored`` truthy, when execution fails.

    Three scenarios are exercised: a task raising ``ZeroDivisionError``
    under ``get_sync``, the same task under ``get_threaded``, and a task
    raising ``KeyboardInterrupt`` under ``get_sync``.
    """
    finished = {'called': False}

    class MyCallback(Callback):
        def _finish(self, dsk, state, errored):
            finished['called'] = True
            assert errored

    failing = {'x': (lambda: 1 / 0, )}

    # `raise_on_exception=True`
    try:
        with MyCallback():
            get_sync(failing, 'x')
    except Exception as exc:
        assert isinstance(exc, ZeroDivisionError)
    assert finished['called']

    # `raise_on_exception=False`
    finished['called'] = False
    try:
        with MyCallback():
            get_threaded(failing, 'x')
    except Exception as exc:
        assert isinstance(exc, ZeroDivisionError)
    assert finished['called']

    # KeyboardInterrupt
    def raise_keyboard():
        raise KeyboardInterrupt()

    interrupted = {'x': (raise_keyboard, )}
    finished['called'] = False
    try:
        with MyCallback():
            get_sync(interrupted, 'x')
    except BaseException as exc:
        # KeyboardInterrupt is not an ``Exception`` subclass, hence the wider catch.
        assert isinstance(exc, KeyboardInterrupt)
    assert finished['called']
def test_finish_always_called():
    """The ``_finish`` hook fires with ``errored`` set for every failing run.

    A task dividing by zero is run under both ``get_sync`` and
    ``get_threaded``; afterwards a task raising ``KeyboardInterrupt`` is
    run under ``get_sync``.  In each case the hook must have been called.
    """
    called = [False]

    class MyCallback(Callback):
        def _finish(self, dsk, state, errored):
            called[0] = True
            assert errored

    def divide_by_zero():
        return 1 / 0

    graph = {'x': (divide_by_zero,)}

    # ``get_sync`` is the `raise_on_exception=True` case and ``get_threaded``
    # the `raise_on_exception=False` case.
    for scheduler in (get_sync, get_threaded):
        called[0] = False
        try:
            with MyCallback():
                scheduler(graph, 'x')
        except Exception as err:
            assert isinstance(err, ZeroDivisionError)
        assert called[0]

    # KeyboardInterrupt
    def raise_keyboard():
        raise KeyboardInterrupt()

    graph = {'x': (raise_keyboard,)}
    called[0] = False
    try:
        with MyCallback():
            get_sync(graph, 'x')
    except BaseException as err:
        assert isinstance(err, KeyboardInterrupt)
    assert called[0]
def test_rearrange(shuffle):
    """``rearrange_by_column`` keeps the partition count and groups equal values.

    A ten-row frame is split into four partitions and rearranged on
    ``y = x % 4`` using the given ``shuffle`` method.  The result must have
    as many partitions as the input, its graph must contain the input
    graph, and every distinct ``y`` value must occur in exactly one output
    partition.
    """
    df = pd.DataFrame({'x': range(10)})
    ddf = dd.from_pandas(df, npartitions=4)
    ddf2 = ddf.assign(y=ddf.x % 4)

    result = rearrange_by_column(ddf2, 'y', max_branch=32, shuffle=shuffle)
    assert result.npartitions == ddf.npartitions
    assert set(ddf.dask).issubset(result.dask)

    # Every value in exactly one partition
    a = result.compute()
    parts = get_sync(result.dask, result._keys())
    for i in a.y.drop_duplicates():
        # ``i in part.y`` would test the Series' index labels rather than
        # its values, so compare against the values explicitly.
        assert sum((part.y == i).any() for part in parts) == 1
def test_divisions(ddf):
    """Check that each computed partition's index respects ``ddf.divisions``.

    Nothing is checked when ``ddf`` has no ``divisions`` or ``index``
    attribute, or when ``ddf.known_divisions`` is falsy.  Partition ``i``
    (all but the last) must have its index within
    ``[divisions[i], divisions[i + 1])``; the last partition may also
    include the final division value.  Empty partitions are skipped.
    """
    checkable = (
        hasattr(ddf, 'divisions')
        and hasattr(ddf, 'index')
        and ddf.known_divisions
    )
    if not checkable:
        return

    parts = get_sync(ddf.dask, ddf._keys())

    for i, part in enumerate(parts[:-1]):
        if not len(part):
            continue
        idx = part.index
        assert idx.min() >= ddf.divisions[i]
        assert idx.max() < ddf.divisions[i + 1]

    last = parts[-1]
    if len(last):
        # The final partition's upper bound is inclusive.
        assert last.index.min() >= ddf.divisions[-2]
        assert last.index.max() <= ddf.divisions[-1]
def assert_divisions(ddf):
    """Assert that the computed partitions of ``ddf`` honour its divisions.

    Returns ``None`` without checking anything when ``ddf`` lacks a
    ``divisions`` or ``index`` attribute, or when ``ddf.known_divisions``
    is falsy.  Otherwise all partitions are computed with ``get_sync`` and
    each non-empty one is checked: for every partition but the last the
    index must lie in ``[divisions[i], divisions[i + 1])``; for the last
    partition the upper bound ``divisions[-1]`` is inclusive.
    """
    for required in ('divisions', 'index'):
        if not hasattr(ddf, required):
            return
    if not ddf.known_divisions:
        return

    computed = get_sync(ddf.dask, ddf._keys())

    for pos, frame in enumerate(computed[:-1]):
        if len(frame):
            lower = ddf.divisions[pos]
            upper = ddf.divisions[pos + 1]
            assert frame.index.min() >= lower
            assert frame.index.max() < upper

    tail = computed[-1]
    if len(tail):
        assert tail.index.min() >= ddf.divisions[-2]
        assert tail.index.max() <= ddf.divisions[-1]