def test_getitem():
    """__getitem__ on a dask DataFrame: single column, column list, boolean mask."""
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                       'B': [9, 8, 7, 6, 5, 4, 3, 2, 1],
                       'C': [True, False, True] * 3},
                      columns=list('ABC'))
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf['A'], df['A'])
    # check cache consistency (cached partial frame matches sliced cache)
    tm.assert_series_equal(ddf['A']._pd, ddf._pd['A'])

    assert eq(ddf[['A', 'B']], df[['A', 'B']])
    tm.assert_frame_equal(ddf[['A', 'B']]._pd, ddf._pd[['A', 'B']])

    # boolean-mask selection, including a mask with different partitioning
    assert eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._pd, ddf._pd.C)
    assert eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    # NOTE(review): these exercise the pandas `df`, not the dask `ddf` —
    # presumably meant to hit ddf's error paths; confirm against upstream.
    assert raises(KeyError, lambda: df['X'])
    assert raises(KeyError, lambda: df[['A', 'X']])
    assert raises(AttributeError, lambda: df.X)

    # not str/unicode column labels
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf[0], df[0])
    assert eq(ddf[[1, 2]], df[[1, 2]])

    assert raises(KeyError, lambda: df[8])
    assert raises(KeyError, lambda: df[[1, 8]])
def test_raises():
    """Invalid arguments to dd.rolling_mean raise the appropriate errors."""
    frame = pd.DataFrame({"a": np.random.randn(25).cumsum(),
                          "b": np.random.randn(25).cumsum()})
    dframe = dd.from_pandas(frame, 3)
    bad_calls = [
        (TypeError, (dframe, 1.5), {}),           # non-integer window
        (ValueError, (dframe, -1), {}),           # negative window
        (NotImplementedError, (dframe, 3), {'freq': 2}),
        (NotImplementedError, (dframe, 3), {'how': "min"}),
    ]
    for exc, args, kwargs in bad_calls:
        assert raises(exc, lambda: dd.rolling_mean(*args, **kwargs))
def test_vindex_errors():
    """Unsupported vindex access patterns raise IndexError."""
    arr = da.ones((5, 5, 5), chunks=(3, 3, 3))
    bad_accesses = [
        lambda: arr.vindex[0],
        lambda: arr.vindex[[1, 2, 3]],
        lambda: arr.vindex[[1, 2, 3], [1, 2, 3], 0],
        lambda: arr.vindex[[1], [1, 2, 3]],
        lambda: arr.vindex[[1, 2, 3], [[1], [2], [3]]],
    ]
    for access in bad_accesses:
        assert raises(IndexError, access)
def test_full_groupby():
    """Groupby error paths on the module-level frame `d`."""
    # grouping on a missing column, or reading a missing attribute of the
    # groupby object, must raise; real columns appear in dir()
    assert raises(Exception, lambda: d.groupby('does_not_exist'))
    assert raises(Exception, lambda: d.groupby('a').does_not_exist)
    assert 'b' in dir(d.groupby('a'))

    # NOTE(review): `func` is defined but never used in this test — it looks
    # like apply() assertions were dropped; confirm intent.
    def func(df):
        df['b'] = df.b - df.b.mean()
        return df
def test_insert():
    """dask's insert matches np.insert for scalar, list, slice and array values."""
    x = np.random.randint(10, size=(10, 10))
    a = from_array(x, chunks=(5, 5))

    y = np.random.randint(10, size=(5, 10))
    b = from_array(y, chunks=(4, 4))

    # scalar index / scalar fill on various (incl. negative) axes
    assert eq(np.insert(x, 0, -1, axis=0), insert(a, 0, -1, axis=0))
    assert eq(np.insert(x, 3, -1, axis=-1), insert(a, 3, -1, axis=-1))
    assert eq(np.insert(x, 5, -1, axis=1), insert(a, 5, -1, axis=1))
    assert eq(np.insert(x, -1, -1, axis=-2), insert(a, -1, -1, axis=-2))

    # list indices, including repeats and negative positions
    assert eq(np.insert(x, [2, 3, 3], -1, axis=1),
              insert(a, [2, 3, 3], -1, axis=1))
    assert eq(np.insert(x, [2, 3, 8, 8, -2, -2], -1, axis=0),
              insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0))

    # slice index and array-valued inserts (dask values via `b`)
    assert eq(np.insert(x, slice(1, 4), -1, axis=1),
              insert(a, slice(1, 4), -1, axis=1))
    assert eq(np.insert(x, [2] * 3 + [5] * 2, y, axis=0),
              insert(a, [2] * 3 + [5] * 2, b, axis=0))
    assert eq(np.insert(x, 0, y[0], axis=1), insert(a, 0, b[0], axis=1))

    # unsorted index lists are unsupported; out-of-range axes raise
    assert raises(NotImplementedError, lambda: insert(a, [4, 2], -1, axis=0))
    assert raises(IndexError, lambda: insert(a, [3], -1, axis=2))
    assert raises(IndexError, lambda: insert(a, [3], -1, axis=-3))

    # graph keys are deterministic for identical calls
    assert same_keys(insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0),
                     insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0))
def test_getitem():
    """__getitem__ on a dask DataFrame: single column, column list, boolean mask."""
    df = pd.DataFrame(
        {"A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
         "B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
         "C": [True, False, True] * 3},
        columns=list("ABC"),
    )
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf["A"], df["A"])
    # check cache consistency (sliced metadata matches metadata of the slice)
    tm.assert_series_equal(ddf["A"]._meta, ddf._meta["A"])

    assert eq(ddf[["A", "B"]], df[["A", "B"]])
    tm.assert_frame_equal(ddf[["A", "B"]]._meta, ddf._meta[["A", "B"]])

    # boolean-mask selection, including a mask with different partitioning
    assert eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)
    assert eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    # NOTE(review): these exercise the pandas `df`, not the dask `ddf` —
    # presumably meant to hit ddf's error paths; confirm against upstream.
    assert raises(KeyError, lambda: df["X"])
    assert raises(KeyError, lambda: df[["A", "X"]])
    assert raises(AttributeError, lambda: df.X)

    # not str/unicode column labels
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf[0], df[0])
    assert eq(ddf[[1, 2]], df[[1, 2]])

    assert raises(KeyError, lambda: df[8])
    assert raises(KeyError, lambda: df[[1, 8]])
def test_rolling_partition_size():
    """Rolling means match pandas for small windows; too-large windows raise."""
    pdf = pd.DataFrame(np.random.randn(50, 2))
    ddf = dd.from_pandas(pdf, npartitions=5)
    # exercise both the frame and a single column
    for pandas_obj, dask_obj in ((pdf, ddf), (pdf[0], ddf[0])):
        for window in (10, 11):
            eq(pandas_obj.rolling(window).mean(),
               dask_obj.rolling(window).mean())
        eq_fail = lambda: dask_obj.rolling(12).mean()
        raises(NotImplementedError, eq_fail)
def test_rolling_functions_raises():
    """dd.rolling_mean rejects bad window, freq and how arguments."""
    frame = pd.DataFrame({'a': np.random.randn(25).cumsum(),
                          'b': np.random.randint(100, size=(25,))})
    dframe = dd.from_pandas(frame, 3)
    # the window must be a non-negative integer
    assert raises(TypeError, lambda: dd.rolling_mean(dframe, 1.5))
    assert raises(ValueError, lambda: dd.rolling_mean(dframe, -1))
    # the freq and how keywords are unsupported
    assert raises(NotImplementedError, lambda: dd.rolling_mean(dframe, 3, freq=2))
    assert raises(NotImplementedError, lambda: dd.rolling_mean(dframe, 3, how='min'))
def test_update():
    """Store.update ingests a graph; rebinding a key to a new value fails."""
    store = Store()
    graph = {'x': 1, 'y': (inc, 'x')}
    store.update(graph)
    assert store['y'] == 2
    # changing an existing key is an error; re-setting the same value is not
    assert raises(Exception, lambda: store.update({'x': 2}))
    assert not raises(Exception, lambda: store.update({'x': 1}))
def test_broadcast_to():
    """broadcast_to matches the chunk-level numpy version; bad shapes raise."""
    source = np.random.randint(10, size=(5, 1, 6))
    darr = from_array(source, chunks=(3, 1, 3))

    for target in ((5, 4, 6), (2, 5, 1, 6), (3, 4, 5, 4, 6)):
        assert eq(chunk.broadcast_to(source, target),
                  broadcast_to(darr, target))

    # shapes incompatible with broadcasting rules must raise
    for bad in ((2, 1, 6), (3,)):
        assert raises(ValueError, lambda: broadcast_to(darr, bad))
def test_partial_by_order():
    """partial_by_order pins positional arguments and reports a useful name."""
    fn = partial_by_order(add, [(1, 20)])
    assert fn(5) == 25
    assert fn.__name__ == 'add(20)'

    fn = partial_by_order(lambda x, y, z: x + y + z, [(1, 10), (2, 15)])
    assert fn(3) == 28
    assert fn.__name__ == '<lambda>(...)'

    # the second argument must be a list of (position, value) pairs
    for bad_spec in (1, [1]):
        assert raises(ValueError, lambda: partial_by_order(add, bad_spec))
def test_full_groupby():
    """groupby-apply on the module frame `d` vs its pandas counterpart `full`."""
    # bad column / attribute access must raise; real columns appear in dir()
    assert raises(Exception, lambda: d.groupby('does_not_exist'))
    assert raises(Exception, lambda: d.groupby('a').does_not_exist)
    assert 'b' in dir(d.groupby('a'))

    def func(df):
        df['b'] = df.b - df.b.mean()
        return df

    assert eq(d.groupby('a').apply(func), full.groupby('a').apply(func))

    # building the same graph twice must produce identical keys
    # (deterministic tokenization)
    assert sorted(d.groupby('a').apply(func).dask) == \
        sorted(d.groupby('a').apply(func).dask)
def test_lu_errors():
    """da.linalg.lu rejects non-2d, non-square, and unevenly-chunked inputs."""
    # np.random.random_integers was deprecated and later removed from numpy;
    # randint(0, 11) draws from the same inclusive [0, 10] range.
    A = np.random.randint(0, 11, (10, 10, 10))
    dA = da.from_array(A, chunks=(5, 5, 5))
    # not 2-dimensional
    assert raises(ValueError, lambda: da.linalg.lu(dA))

    A = np.random.randint(0, 11, (10, 8))
    dA = da.from_array(A, chunks=(5, 4))
    # not square
    assert raises(ValueError, lambda: da.linalg.lu(dA))

    A = np.random.randint(0, 11, (20, 20))
    dA = da.from_array(A, chunks=(5, 4))
    # square matrix, but the chunk grid is not square
    assert raises(ValueError, lambda: da.linalg.lu(dA))
def test_GetFunctionTestCase_class():
    """A broken get implementation fails the suite; core.get passes it."""
    class CustomTestGetFail(GetFunctionTestCase):
        # ignores the graph entirely, so the suite's assertions must fail
        get = staticmethod(lambda x, y: 1)

    failing_case = CustomTestGetFail()
    raises(AssertionError, failing_case.test_get)

    class CustomTestGetPass(GetFunctionTestCase):
        get = staticmethod(core.get)

    passing_case = CustomTestGetPass()
    passing_case.test_get()
def test_GetFunctionTestMixin_class():
    """A broken get implementation fails the mixin suite; core.get passes it."""
    class TestCustomGetFail(GetFunctionTestMixin):
        # ignores the graph entirely, so the suite's assertions must fail
        get = staticmethod(lambda x, y: 1)

    failing_case = TestCustomGetFail()
    raises(AssertionError, failing_case.test_get)

    class TestCustomGetPass(GetFunctionTestMixin):
        get = staticmethod(core.get)

    passing_case = TestCustomGetPass()
    passing_case.test_get()
def test_solve_triangular_errors():
    """solve_triangular rejects non-2d systems and mismatched chunking."""
    # np.random.random_integers was deprecated and later removed from numpy;
    # randint with high+1 draws from the same inclusive range.
    A = np.random.randint(0, 11, (10, 10, 10))
    b = np.random.randint(1, 11, 10)
    dA = da.from_array(A, chunks=(5, 5, 5))
    db = da.from_array(b, chunks=5)
    # A is not 2-dimensional
    assert raises(ValueError, lambda: da.linalg.solve_triangular(dA, db))

    A = np.random.randint(0, 11, (10, 10))
    b = np.random.randint(1, 11, 10)
    dA = da.from_array(A, chunks=(3, 3))
    db = da.from_array(b, chunks=5)
    # chunking of A and b does not line up
    assert raises(ValueError, lambda: da.linalg.solve_triangular(dA, db))
def test_store():
    """store writes dask arrays into preallocated targets and validates args."""
    base = da.ones((4, 4), chunks=(2, 2))
    src_a, src_b = base + 1, base + 2

    out_a = np.empty(shape=(4, 4))
    out_b = np.empty(shape=(4, 4))

    store([src_a, src_b], [out_a, out_b])
    assert (out_a == 2).all()
    assert (out_b == 3).all()

    # mismatched lengths, non-sequence args, and non-dask sources all raise
    assert raises(ValueError, lambda: store([src_a], [out_a, out_b]))
    assert raises(ValueError, lambda: store(out_a, out_a))
    assert raises(ValueError, lambda: store([out_a, out_b], [out_a, out_b]))
def test_dataframe_quantile():
    """DataFrame.quantile: scalar q, list of q, axis=1, and error cases."""
    # column X is for test column order and result division
    df = pd.DataFrame(
        {"A": np.arange(20), "X": np.arange(20, 40),
         "B": np.arange(10, 30), "C": ["a", "b", "c", "d"] * 5},
        columns=["A", "X", "B", "C"],
    )
    ddf = dd.from_pandas(df, 3)

    # default q=0.5 -> single-partition Series over the numeric columns
    result = ddf.quantile()
    assert result.npartitions == 1
    assert result.divisions == ("A", "X")

    result = result.compute()
    assert isinstance(result, pd.Series)
    tm.assert_index_equal(result.index, pd.Index(["A", "X", "B"]))
    # quantiles are approximate: only bounds are checked, not exact values
    assert (result > pd.Series([16, 36, 26], index=["A", "X", "B"])).all()
    assert (result < pd.Series([17, 37, 27], index=["A", "X", "B"])).all()

    # a list of quantiles -> DataFrame indexed by q
    result = ddf.quantile([0.25, 0.75])
    assert result.npartitions == 1
    assert result.divisions == (0.25, 0.75)

    result = result.compute()
    assert isinstance(result, pd.DataFrame)
    tm.assert_index_equal(result.index, pd.Index([0.25, 0.75]))
    tm.assert_index_equal(result.columns, pd.Index(["A", "X", "B"]))

    minexp = pd.DataFrame([[1, 21, 11], [17, 37, 27]],
                          index=[0.25, 0.75], columns=["A", "X", "B"])
    assert (result > minexp).all().all()
    maxexp = pd.DataFrame([[2, 22, 12], [18, 38, 28]],
                          index=[0.25, 0.75], columns=["A", "X", "B"])
    assert (result < maxexp).all().all()

    # row-wise quantile matches pandas; a list of q with axis=1 is invalid
    assert eq(ddf.quantile(axis=1), df.quantile(axis=1))
    assert raises(ValueError, lambda: ddf.quantile([0.25, 0.75], axis=1))
def test_reductions_frame():
    """Frame reductions match pandas on both axes and for ddof settings."""
    assert eq(d.sum(), full.sum())
    assert eq(d.min(), full.min())
    assert eq(d.max(), full.max())
    assert eq(d.count(), full.count())
    assert eq(d.std(), full.std())
    assert eq(d.var(), full.var())
    assert eq(d.std(ddof=0), full.std(ddof=0))
    assert eq(d.var(ddof=0), full.var(ddof=0))
    assert eq(d.mean(), full.mean())

    # axis can be given positionally-numeric or by label
    for axis in [0, 1, 'index', 'columns']:
        assert eq(d.sum(axis=axis), full.sum(axis=axis))
        assert eq(d.min(axis=axis), full.min(axis=axis))
        assert eq(d.max(axis=axis), full.max(axis=axis))
        assert eq(d.count(axis=axis), full.count(axis=axis))
        assert eq(d.std(axis=axis), full.std(axis=axis))
        assert eq(d.var(axis=axis), full.var(axis=axis))
        assert eq(d.std(axis=axis, ddof=0), full.std(axis=axis, ddof=0))
        assert eq(d.var(axis=axis, ddof=0), full.var(axis=axis, ddof=0))
        assert eq(d.mean(axis=axis), full.mean(axis=axis))

    assert raises(ValueError, lambda: d.sum(axis='incorrect').compute())

    # the expected node names must appear in the task graphs
    assert_dask_graph(d.sum(), 'dataframe-sum')
    assert_dask_graph(d.min(), 'dataframe-min')
    assert_dask_graph(d.max(), 'dataframe-max')
    assert_dask_graph(d.count(), 'dataframe-count')
    # std, var and mean are built from sum and count operations
    assert_dask_graph(d.std(), 'dataframe-sum')
    assert_dask_graph(d.std(), 'dataframe-count')
    assert_dask_graph(d.var(), 'dataframe-sum')
    assert_dask_graph(d.var(), 'dataframe-count')
    assert_dask_graph(d.mean(), 'dataframe-sum')
    assert_dask_graph(d.mean(), 'dataframe-count')
def test_series_groupby_errors():
    """Series.groupby errors from dask mirror the pandas messages exactly."""
    s = pd.Series([1, 2, 2, 1, 1])
    ss = dd.from_pandas(s, npartitions=2)

    msg = "Grouper for '1' not 1-dimensional"
    with tm.assertRaisesRegexp(ValueError, msg):
        s.groupby([1, 2])  # pandas
    with tm.assertRaisesRegexp(ValueError, msg):
        ss.groupby([1, 2])  # dask should raise the same error
    msg = "Grouper for '2' not 1-dimensional"
    with tm.assertRaisesRegexp(ValueError, msg):
        s.groupby([2])  # pandas
    with tm.assertRaisesRegexp(ValueError, msg):
        ss.groupby([2])  # dask should raise the same error
    msg = "No group keys passed!"
    with tm.assertRaisesRegexp(ValueError, msg):
        s.groupby([])  # pandas
    with tm.assertRaisesRegexp(ValueError, msg):
        ss.groupby([])  # dask should raise the same error

    # grouping one dask series by another with different partitioning
    sss = dd.from_pandas(s, npartitions=3)
    assert raises(NotImplementedError, lambda: ss.groupby(sss))

    with tm.assertRaises(KeyError):
        s.groupby('x')  # pandas
    with tm.assertRaises(KeyError):
        ss.groupby('x')  # dask should raise the same error
def test_concatenate():
    """concatenate along both axes: shape, chunks, graph-key aliasing, errors."""
    a, b, c = [Array(getem(name, chunks=(2, 3), shape=(4, 6)),
                     name, shape=(4, 6), chunks=(2, 3))
               for name in 'ABC']

    x = concatenate([a, b, c], axis=0)
    assert x.shape == (12, 6)
    assert x.chunks == ((2, 2, 2, 2, 2, 2), (3, 3))
    # output blocks are aliases of the corresponding input blocks
    assert x.dask[(x.name, 0, 1)] == ('A', 0, 1)
    assert x.dask[(x.name, 5, 0)] == ('C', 1, 0)
    assert same_keys(x, concatenate([a, b, c], axis=0))

    y = concatenate([a, b, c], axis=1)
    assert y.shape == (4, 18)
    assert y.chunks == ((2, 2), (3, 3, 3, 3, 3, 3))
    assert y.dask[(y.name, 1, 0)] == ('A', 1, 0)
    assert y.dask[(y.name, 1, 5)] == ('C', 1, 1)
    assert same_keys(y, concatenate([a, b, c], axis=1))

    # the result graph contains the input graphs
    assert set(b.dask.keys()).issubset(y.dask.keys())

    # negative axis is normalized; out-of-range axis raises
    assert concatenate([a, b, c], axis=-1).chunks == \
        concatenate([a, b, c], axis=1).chunks
    assert raises(ValueError, lambda: concatenate([a, b, c], axis=2))
def test_unravel(): x = np.random.randint(10, size=24) # these should use the shortcut for chunks, shape in [(24, (3, 8)), (24, (12, 2)), (6, (4, 6)), (6, (4, 3, 2)), (6, (4, 6, 1)), (((6, 12, 6),), (4, 6))]: a = from_array(x, chunks=chunks) unraveled = unravel(a, shape) assert eq(x.reshape(*shape), unraveled) assert len(unraveled.dask) == len(a.dask) + len(a.chunks[0]) # these cannot for chunks, shape in [(6, (2, 12)), (6, (1, 4, 6)), (6, (2, 1, 12))]: a = from_array(x, chunks=chunks) unraveled = unravel(a, shape) assert eq(x.reshape(*shape), unraveled) assert len(unraveled.dask) > len(a.dask) + len(a.chunks[0]) assert raises(AssertionError, lambda: unravel(unraveled, (3, 8))) assert unravel(a, a.shape) is a
def test_elemwise_with_ndarrays():
    """Mixing dask arrays with plain ndarrays broadcasts like numpy."""
    vec = np.arange(3)
    mat = np.arange(12).reshape(4, 3)

    dvec = from_array(vec, chunks=(3,))
    dmat = from_array(mat, chunks=(2, 3))

    # ndarray on either side of the operator
    assert eq(vec + dvec, 2 * vec)
    assert eq(dvec + vec, 2 * vec)

    assert eq(vec + dmat, vec + mat)
    assert eq(dmat + vec, vec + mat)

    assert eq(dvec + mat, vec + mat)
    assert eq(mat + dvec, vec + mat)

    # Error on shape mismatch
    assert raises(ValueError, lambda: dvec + mat.T)
    assert raises(ValueError, lambda: dvec + np.arange(2))
def test_full_groupby():
    """groupby-apply on a dask frame matches pandas; bad keys raise."""
    pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                        'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]},
                       index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf, npartitions=3)

    # missing column / attribute access must raise; real columns are in dir()
    assert raises(Exception, lambda: pdf.groupby('does_not_exist'))
    assert raises(Exception, lambda: pdf.groupby('a').does_not_exist)
    assert 'b' in dir(pdf.groupby('a'))

    def demean(group):
        group['b'] = group.b - group.b.mean()
        return group

    assert eq(pdf.groupby('a').apply(demean), ddf.groupby('a').apply(demean))
def test_basic():
    """Store computes lazily, caches results, and records access/compute stats."""
    s = Store()
    s['x'] = 1
    s['y'] = (inc, 'x')
    s['z'] = (add, 'x', 'y')
    # only concrete (non-task) assignments count as data
    assert s.data == set(['x'])
    assert s['z'] == 3
    assert 'x' in s.data
    # intermediates are cached after the first computation
    assert s.cache['z'] == 3
    assert s.cache['y'] == 2
    assert len(s.access_times['z']) == 1
    assert len(s.access_times['y']) == 1
    # 'x' was read twice — presumably once for 'y' and once for 'z'
    assert len(s.access_times['x']) == 2
    assert s.compute_time['z'] < 0.1

    # a second read of 'z' hits the cache: no recomputation of dependencies
    cache = s.cache.copy()
    assert s['z'] == 3
    assert s.cache == cache
    assert len(s.access_times['z']) == 2
    assert len(s.access_times['y']) == 1
    assert len(s.access_times['x']) == 2

    # non-key lookups pass through unchanged; list lookups map over keys
    assert s[5] == 5
    assert list(s[['x', 'y']]) == [s['x'], s['y']]

    # rebinding an existing key is forbidden
    def reassign():
        s['x'] = 2
    assert raises(Exception, reassign)
def test_take():
    """take matches numpy for scalar and list indices; a bad axis raises."""
    data = np.arange(400).reshape((20, 20))
    darr = from_array(data, chunks=(5, 5))

    assert eq(np.take(data, 3, axis=0), take(darr, 3, axis=0))
    assert eq(np.take(data, [3, 4, 5], axis=-1), take(darr, [3, 4, 5], axis=-1))

    # axis out of bounds for a 2-d array
    assert raises(ValueError, lambda: take(darr, 3, axis=2))
def test_equivalence_uncomparable():
    """equivalent() falls back to identity when == itself raises."""
    left = Uncomparable()
    right = Uncomparable()

    # direct comparison raises, so equivalence cannot rely on ==
    assert raises(TypeError, lambda: left == right)

    assert equivalent(left, left)
    assert not equivalent(left, right)

    # identity is also respected inside task tuples
    assert equivalent((add, left, 0), (add, left, 0))
    assert not equivalent((add, left, 0), (add, right, 0))
def test_get_stack_limit():
    """A very deep linear graph computes without hitting the recursion limit."""
    # build a 10000-deep chain x0 -> x1 -> ... -> x10000
    d = dict(('x%s' % (i+1), (inc, 'x%s' % i)) for i in range(10000))
    d['x0'] = 0
    assert get(d, 'x10000') == 10000
    # introduce cycle
    d['x5000'] = (inc, 'x5001')
    assert raises(RuntimeError, lambda: get(d, 'x10000'))
    # keys below the cycle remain computable
    assert get(d, 'x4999') == 4999
def test_from_filenames():
    """from_filenames accepts explicit name lists and glob patterns."""
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert set(line.strip() for line in db.from_filenames(fns)) == \
            set('ABCD')
        assert set(line.strip() for line in db.from_filenames('a*.log')) == \
            set('ABCD')

    # a glob pattern matching nothing is an error
    assert raises(ValueError, lambda: db.from_filenames('non-existent-*-path'))
def test_reshape_unknown_dimensions():
    """reshape supports a single -1 wildcard dimension, but never two."""
    source_shapes = [(24,), (2, 12), (2, 3, 4)]
    target_shapes = [(-1,), (2, -1), (-1, 3, 4)]
    for original_shape in source_shapes:
        for new_shape in target_shapes:
            data = np.random.randint(10, size=original_shape)
            darr = from_array(data, 4)
            assert eq(data.reshape(new_shape), darr.reshape(new_shape))

    # two wildcards are ambiguous (darr leaks from the loop, as in numpy tests)
    assert raises(ValueError, lambda: reshape(darr, (-1, -1)))
def test_loc():
    """Label-based .loc slicing on frames and series; division bookkeeping."""
    # sliced divisions are clipped to the requested label range
    assert d.loc[3:8].divisions[0] == 3
    assert d.loc[3:8].divisions[-1] == 8
    assert d.loc[5].divisions == (5, 5)

    assert eq(d.loc[5], full.loc[5])
    assert eq(d.loc[3:8], full.loc[3:8])
    assert eq(d.loc[:8], full.loc[:8])
    assert eq(d.loc[3:], full.loc[3:])

    # same slicing on a single column
    assert eq(d.a.loc[5], full.a.loc[5])
    assert eq(d.a.loc[3:8], full.a.loc[3:8])
    assert eq(d.a.loc[:8], full.a.loc[:8])
    assert eq(d.a.loc[3:], full.a.loc[3:])

    # scalar lookup of a missing label raises; out-of-range slices are empty
    assert raises(KeyError, lambda: d.loc[1000])
    assert eq(d.loc[1000:], full.loc[1000:])
    assert eq(d.loc[-2000:-1000], full.loc[-2000:-1000])

    # graph keys are deterministic and depend on the label requested
    assert sorted(d.loc[5].dask) == sorted(d.loc[5].dask)
    assert sorted(d.loc[5].dask) != sorted(d.loc[6].dask)
def test_unknown_divisions():
    """Frames with unknown divisions support reductions/elemwise but not loc."""
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})}
    # all-None divisions mark the partition boundaries as unknown
    d = dd.DataFrame(dsk, 'x', ['a', 'b'], [None, None, None, None])
    full = d.compute(get=dask.get)

    assert eq(d.a.sum(), full.a.sum())
    assert eq(d.a + d.b + 1, full.a + full.b + 1)

    # label-based access requires known divisions
    assert raises(ValueError, lambda: d.loc[3])
def test_reduction_series_invalid_axis():
    """Series reductions reject axis=1/'columns' just as pandas does."""
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    for axis in [1, 'columns']:
        for s in [ddf1.a, pdf1.a]:  # both must behave the same
            assert raises(ValueError, lambda: s.sum(axis=axis))
            assert raises(ValueError, lambda: s.min(axis=axis))
            assert raises(ValueError, lambda: s.max(axis=axis))
            # only count doesn't have axis keyword
            assert raises(TypeError, lambda: s.count(axis=axis))
            assert raises(ValueError, lambda: s.std(axis=axis))
            assert raises(ValueError, lambda: s.var(axis=axis))
            assert raises(ValueError, lambda: s.mean(axis=axis))
def test_repartition():
    """repartition to new divisions preserves data; invalid divisions raise."""
    def _check_split_data(orig, d):
        """Check data is split properly: concatenating the split partitions
        reproduces the original data."""
        keys = [k for k in d.dask if k[0].startswith('repartition-split')]
        keys = sorted(keys)
        sp = pd.concat([d._get(d.dask, k) for k in keys])
        assert eq(orig, sp)
        assert eq(orig, d)

    df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                      index=[10, 20, 30, 40, 50, 60])
    a = dd.from_pandas(df, 2)
    b = a.repartition(divisions=[10, 20, 50, 60])
    assert b.divisions == (10, 20, 50, 60)
    assert eq(a, b)
    # first output partition holds exactly the rows before the 2nd division
    assert eq(a._get(b.dask, (b._name, 0)), df.iloc[:1])

    for div in [[20, 60], [10, 50], [1],  # first / last element mismatch
                [0, 60], [10, 70],  # do not allow to expand divisions by default
                [10, 50, 20, 60],  # not sorted
                [10, 10, 20, 60]]:  # not unique (last element can be duplicated)
        assert raises(ValueError, lambda: a.repartition(divisions=div))

    # integer index: every partition count and many division layouts round-trip
    pdf = pd.DataFrame(np.random.randn(7, 5), columns=list('abxyz'))
    for p in range(1, 7):
        ddf = dd.from_pandas(pdf, p)
        assert eq(ddf, pdf)
        for div in [[0, 6], [0, 6, 6], [0, 5, 6], [0, 4, 6, 6],
                    [0, 2, 6], [0, 2, 6, 6], [0, 2, 3, 6, 6],
                    [0, 1, 2, 3, 4, 5, 6, 6]]:
            rddf = ddf.repartition(divisions=div)
            _check_split_data(ddf, rddf)
            assert rddf.divisions == tuple(div)
            assert eq(pdf, rddf)

            rds = ddf.x.repartition(divisions=div)
            _check_split_data(ddf.x, rds)
            assert rds.divisions == tuple(div)
            assert eq(pdf.x, rds)

        # expand divisions (allowed only with force=True)
        for div in [[-5, 10], [-2, 3, 5, 6], [0, 4, 5, 9, 10]]:
            rddf = ddf.repartition(divisions=div, force=True)
            _check_split_data(ddf, rddf)
            assert rddf.divisions == tuple(div)
            assert eq(pdf, rddf)

            rds = ddf.x.repartition(divisions=div, force=True)
            _check_split_data(ddf.x, rds)
            assert rds.divisions == tuple(div)
            assert eq(pdf.x, rds)

    # string index: same checks with label divisions
    pdf = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                        'y': [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]},
                       index=list('abcdefghij'))
    for p in range(1, 7):
        ddf = dd.from_pandas(pdf, p)
        assert eq(ddf, pdf)
        for div in [list('aj'), list('ajj'), list('adj'), list('abfj'),
                    list('ahjj'), list('acdj'), list('adfij'),
                    list('abdefgij'), list('abcdefghij')]:
            rddf = ddf.repartition(divisions=div)
            _check_split_data(ddf, rddf)
            assert rddf.divisions == tuple(div)
            assert eq(pdf, rddf)

            rds = ddf.x.repartition(divisions=div)
            _check_split_data(ddf.x, rds)
            assert rds.divisions == tuple(div)
            assert eq(pdf.x, rds)

        # expand divisions (allowed only with force=True)
        for div in [list('Yadijm'), list('acmrxz'), list('Yajz')]:
            rddf = ddf.repartition(divisions=div, force=True)
            _check_split_data(ddf, rddf)
            assert rddf.divisions == tuple(div)
            assert eq(pdf, rddf)

            rds = ddf.x.repartition(divisions=div, force=True)
            _check_split_data(ddf.x, rds)
            assert rds.divisions == tuple(div)
            assert eq(pdf.x, rds)
def test_split_apply_combine_on_series():
    """Groupby aggregations on dask frames/series match the pandas results."""
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 6], 'b': [4, 2, 7]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 4, 6], 'b': [3, 3, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [4, 3, 7], 'b': [1, 1, 3]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # group by a column name, a series, and a derived series
    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(ddf1.groupby(ddkey).a.nunique(),
                  pdf1.groupby(pdkey).a.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())

    # series.groupby(series)
    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(), check_names=False)

    # group by a boolean series derived from each column
    for i in range(8):
        assert eq(ddf1.groupby(ddf1.b > i).a.sum(),
                  pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(ddf1.groupby(ddf1.b > i).a.min(),
                  pdf1.groupby(pdf1.b > i).a.min())
        assert eq(ddf1.groupby(ddf1.b > i).a.max(),
                  pdf1.groupby(pdf1.b > i).a.max())
        assert eq(ddf1.groupby(ddf1.b > i).a.count(),
                  pdf1.groupby(pdf1.b > i).a.count())
        assert eq(ddf1.groupby(ddf1.b > i).a.mean(),
                  pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(ddf1.groupby(ddf1.b > i).a.nunique(),
                  pdf1.groupby(pdf1.b > i).a.nunique())

        assert eq(ddf1.groupby(ddf1.a > i).b.sum(),
                  pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(ddf1.groupby(ddf1.a > i).b.min(),
                  pdf1.groupby(pdf1.a > i).b.min())
        assert eq(ddf1.groupby(ddf1.a > i).b.max(),
                  pdf1.groupby(pdf1.a > i).b.max())
        assert eq(ddf1.groupby(ddf1.a > i).b.count(),
                  pdf1.groupby(pdf1.a > i).b.count())
        assert eq(ddf1.groupby(ddf1.a > i).b.mean(),
                  pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(ddf1.groupby(ddf1.a > i).b.nunique(),
                  pdf1.groupby(pdf1.a > i).b.nunique())

        assert eq(ddf1.groupby(ddf1.b > i).sum(),
                  pdf1.groupby(pdf1.b > i).sum())
        assert eq(ddf1.groupby(ddf1.b > i).min(),
                  pdf1.groupby(pdf1.b > i).min())
        assert eq(ddf1.groupby(ddf1.b > i).max(),
                  pdf1.groupby(pdf1.b > i).max())
        assert eq(ddf1.groupby(ddf1.b > i).count(),
                  pdf1.groupby(pdf1.b > i).count())
        assert eq(ddf1.groupby(ddf1.b > i).mean(),
                  pdf1.groupby(pdf1.b > i).mean())

        assert eq(ddf1.groupby(ddf1.a > i).sum(),
                  pdf1.groupby(pdf1.a > i).sum())
        assert eq(ddf1.groupby(ddf1.a > i).min(),
                  pdf1.groupby(pdf1.a > i).min())
        assert eq(ddf1.groupby(ddf1.a > i).max(),
                  pdf1.groupby(pdf1.a > i).max())
        assert eq(ddf1.groupby(ddf1.a > i).count(),
                  pdf1.groupby(pdf1.a > i).count())
        assert eq(ddf1.groupby(ddf1.a > i).mean(),
                  pdf1.groupby(pdf1.a > i).mean())

    # group by the 'a' column in several forms
    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(ddf1.groupby(ddkey).b.nunique(),
                  pdf1.groupby(pdkey).b.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(),
                  pdf1.groupby(pdkey).mean().astype(float))

    # graph keys are deterministic for identical calls
    assert sorted(ddf1.groupby('b').a.sum().dask) == \
        sorted(ddf1.groupby('b').a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == \
        sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    # mean is built from sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean is built from sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
def test_pluck_with_default():
    """pluck raises on a missing index unless a default is supplied."""
    bag = db.from_sequence(['Hello', '', 'World'])
    # the empty string has no element 0, so plain pluck fails
    assert raises(IndexError, lambda: list(bag.pluck(0)))
    # with a default, missing elements are filled in
    assert list(bag.pluck(0, None)) == ['H', None, 'W']
def check_raises(d, p, func):
    """Assert that *func* raises on both the dask object *d* (after compute)
    and the pandas object *p*."""
    errors = (TypeError, ValueError)
    dask_call = lambda: getattr(d, func)().compute()
    pandas_call = lambda: getattr(p, func)()
    assert raises(errors, dask_call)
    assert raises(errors, pandas_call)
def test_Series():
    """Column access and arithmetic on a column yield dd.Series objects."""
    col = d.a
    assert isinstance(col, dd.Series)
    assert isinstance(col + 1, dd.Series)
    # whole-frame scalar addition is not supported
    assert raises(Exception, lambda: d + 1)
def test_rename_index():
    """Renaming index labels via rename(index=...) is rejected."""
    assert raises(ValueError, lambda: d.rename(index={0: 1}))
def test_attributes():
    """Real columns appear as attributes; unknown names raise AttributeError."""
    visible = dir(d)
    assert 'a' in visible
    assert 'foo' not in visible
    assert raises(AttributeError, lambda: d.foo)
def test_set_index_raises_error_on_bad_input():
    """set_index does not accept a list of columns."""
    pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]})
    dframe = dd.from_pandas(pdf, 2)
    assert raises(NotImplementedError, lambda: dframe.set_index(['a', 'b']))
def test_cant_fft_chunked_axis():
    """fft requires the transformed axis to live in a single chunk."""
    chunked = da.from_array(nparr, chunks=(5, 5))
    # default axis and an explicit axis=0 are both chunked here
    assert raises(ValueError, lambda: fft(chunked))
    assert raises(ValueError, lambda: fft(chunked, axis=0))
def test_rechunk_with_empty_input():
    """An empty chunks mapping is a no-op; an empty tuple is invalid."""
    arr = da.ones((24, 24), chunks=(4, 8))
    assert arr.rechunk(chunks={}).chunks == arr.chunks
    assert raises(ValueError, lambda: arr.rechunk(chunks=()))
def test_from_function_requires_block_args():
    """from_array without a chunks argument is an error."""
    data = np.arange(10)
    assert raises(Exception, lambda: from_array(data))
def test_numpy_compat_is_notimplemented():
    """dask-array + raw ndarray arithmetic raises NotImplementedError here."""
    ndarr = np.arange(10)
    darr = da.from_array(ndarr, chunks=5)
    assert raises(NotImplementedError, lambda: darr + ndarr)
def test_blockdims_from_blockshape():
    """Block shapes divide the array shape, with a short trailing block."""
    expected = ((4, 4, 2), (3, 3, 3, 1))
    assert blockdims_from_blockshape((10, 10), (4, 3)) == expected
    # a None blockshape is invalid
    assert raises(ValueError, lambda: blockdims_from_blockshape((10,), None))
def test_groupby_set_index():
    """groupby with as_index=False is not supported by dask."""
    frame = tm.makeTimeDataFrame()
    dframe = dd.from_pandas(frame, npartitions=2)
    grouper = frame.index.month
    assert raises(NotImplementedError,
                  lambda: dframe.groupby(grouper, as_index=False))
def test_iloc_raises():
    """Positional indexing via iloc is not implemented for dask frames."""
    positional_access = lambda: d.iloc[:5]
    assert raises(NotImplementedError, positional_access)
def test_split_apply_combine_on_series():
    """Exhaustively compare dask groupby aggregations against pandas.

    Grouping keys are exercised as column names, Series, derived Series,
    and boolean Series; aggregations cover sum/min/max/count/mean/size/
    nunique and var/std with ddof in {0, 1, 2}.  Also checks key errors
    for unknown columns and the expected dask graph node labels.
    """
    pdf1 = pd.DataFrame(
        {
            'a': [1, 2, 6, 4, 4, 6, 4, 3, 7],
            'b': [4, 2, 7, 3, 3, 1, 1, 1, 2]
        }, index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf1, npartitions=3)
    ddf1 = ddf

    # group a DataFrame by a column name / Series / derived Series
    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(
            ddf1.groupby(ddkey).a.nunique(), pdf1.groupby(pdkey).a.nunique())
        assert eq(ddf1.groupby(ddkey).a.size(), pdf1.groupby(pdkey).a.size())
        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.groupby(ddkey).a.var(ddof), pdf1.groupby(pdkey).a.var(ddof))
            assert eq(
                ddf1.groupby(ddkey).a.std(ddof), pdf1.groupby(pdkey).a.std(ddof))
        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())
        assert eq(ddf1.groupby(ddkey).size(), pdf1.groupby(pdkey).size())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).var(ddof),
                      pdf1.groupby(pdkey).var(ddof), check_dtype=False)
            assert eq(ddf1.groupby(ddkey).std(ddof),
                      pdf1.groupby(pdkey).std(ddof), check_dtype=False)

    # group a Series by another Series
    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(),
                  check_names=False)
        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.a.groupby(ddkey).var(ddof), pdf1.a.groupby(pdkey).var(ddof))
            assert eq(
                ddf1.a.groupby(ddkey).std(ddof), pdf1.a.groupby(pdkey).std(ddof))

    # group by a boolean Series at every threshold value
    for i in range(8):
        assert eq(
            ddf1.groupby(ddf1.b > i).a.sum(), pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.min(), pdf1.groupby(pdf1.b > i).a.min())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.max(), pdf1.groupby(pdf1.b > i).a.max())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.count(),
            pdf1.groupby(pdf1.b > i).a.count())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.mean(), pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.nunique(),
            pdf1.groupby(pdf1.b > i).a.nunique())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.size(), pdf1.groupby(pdf1.b > i).a.size())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.sum(), pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.min(), pdf1.groupby(pdf1.a > i).b.min())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.max(), pdf1.groupby(pdf1.a > i).b.max())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.count(),
            pdf1.groupby(pdf1.a > i).b.count())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.mean(), pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.nunique(),
            pdf1.groupby(pdf1.a > i).b.nunique())
        assert eq(
            ddf1.groupby(ddf1.b > i).b.size(), pdf1.groupby(pdf1.b > i).b.size())
        assert eq(
            ddf1.groupby(ddf1.b > i).sum(), pdf1.groupby(pdf1.b > i).sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).min(), pdf1.groupby(pdf1.b > i).min())
        assert eq(
            ddf1.groupby(ddf1.b > i).max(), pdf1.groupby(pdf1.b > i).max())
        assert eq(
            ddf1.groupby(ddf1.b > i).count(), pdf1.groupby(pdf1.b > i).count())
        assert eq(
            ddf1.groupby(ddf1.b > i).mean(), pdf1.groupby(pdf1.b > i).mean())
        assert eq(
            ddf1.groupby(ddf1.b > i).size(), pdf1.groupby(pdf1.b > i).size())
        assert eq(
            ddf1.groupby(ddf1.a > i).sum(), pdf1.groupby(pdf1.a > i).sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).min(), pdf1.groupby(pdf1.a > i).min())
        assert eq(
            ddf1.groupby(ddf1.a > i).max(), pdf1.groupby(pdf1.a > i).max())
        assert eq(
            ddf1.groupby(ddf1.a > i).count(), pdf1.groupby(pdf1.a > i).count())
        assert eq(
            ddf1.groupby(ddf1.a > i).mean(), pdf1.groupby(pdf1.a > i).mean())
        assert eq(
            ddf1.groupby(ddf1.a > i).size(), pdf1.groupby(pdf1.a > i).size())
        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.groupby(ddf1.b > i).std(ddof),
                pdf1.groupby(pdf1.b > i).std(ddof))

    # keys derived from column 'a': name, Series, shifted, boolean
    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(
            ddf1.groupby(ddkey).b.nunique(), pdf1.groupby(pdkey).b.nunique())
        assert eq(ddf1.groupby(ddkey).b.size(), pdf1.groupby(pdkey).b.size())
        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(
            ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean().astype(float))
        assert eq(ddf1.groupby(ddkey).size(), pdf1.groupby(pdkey).size())
        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.groupby(ddkey).b.std(ddof), pdf1.groupby(pdkey).b.std(ddof))

    # building the same expression twice must yield identical task keys
    assert (sorted(ddf1.groupby('b').a.sum().dask) == sorted(
        ddf1.groupby('b').a.sum().dask))
    assert (sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == sorted(
        ddf1.groupby(ddf1.a > 3).b.mean().dask))

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.var(), 'series-groupby-var')
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')
    assert_dask_graph(ddf1.groupby('b').a.size(), 'series-groupby-size')
    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
    assert_dask_graph(ddf1.groupby('b').size(), 'dataframe-groupby-size')
def test_series_resample_not_implemented():
    """Resampling to a frequency that does not evenly divide a day raises."""
    idx = pd.date_range(start='20120102', periods=100, freq='T')
    series = pd.Series(range(len(idx)), index=idx)
    dseries = dd.from_pandas(series, npartitions=5)
    # 57 minutes does not evenly divide 24 hours
    assert raises(NotImplementedError, lambda: resample(dseries, '57T'))
def test_reductions_frame(split_every):
    """Compare dask DataFrame reductions against pandas for a 3-partition
    frame, over both axes and ddof variants, then check the expected dask
    graph node labels.

    ``split_every`` is a pytest parameter controlling tree-reduction fan-in.
    """
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6]
        }, index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({
            'a': [4, 5, 6],
            'b': [3, 2, 1]
        }, index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({
            'a': [7, 8, 9],
            'b': [0, 0, 0]
        }, index=[9, 9, 9])
    }
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # whole-frame reductions (default axis)
    assert_eq(ddf1.sum(split_every=split_every), pdf1.sum())
    assert_eq(ddf1.min(split_every=split_every), pdf1.min())
    assert_eq(ddf1.max(split_every=split_every), pdf1.max())
    assert_eq(ddf1.count(split_every=split_every), pdf1.count())
    assert_eq(ddf1.std(split_every=split_every), pdf1.std())
    assert_eq(ddf1.var(split_every=split_every), pdf1.var())
    assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0))
    assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0))
    assert_eq(ddf1.mean(split_every=split_every), pdf1.mean())

    # numeric and named axis spellings must agree with pandas
    for axis in [0, 1, 'index', 'columns']:
        assert_eq(ddf1.sum(axis=axis, split_every=split_every),
                  pdf1.sum(axis=axis))
        assert_eq(ddf1.min(axis=axis, split_every=split_every),
                  pdf1.min(axis=axis))
        assert_eq(ddf1.max(axis=axis, split_every=split_every),
                  pdf1.max(axis=axis))
        assert_eq(ddf1.count(axis=axis, split_every=split_every),
                  pdf1.count(axis=axis))
        assert_eq(ddf1.std(axis=axis, split_every=split_every),
                  pdf1.std(axis=axis))
        assert_eq(ddf1.var(axis=axis, split_every=split_every),
                  pdf1.var(axis=axis))
        assert_eq(ddf1.std(axis=axis, ddof=0, split_every=split_every),
                  pdf1.std(axis=axis, ddof=0))
        assert_eq(ddf1.var(axis=axis, ddof=0, split_every=split_every),
                  pdf1.var(axis=axis, ddof=0))
        assert_eq(ddf1.mean(axis=axis, split_every=split_every),
                  pdf1.mean(axis=axis))

    assert raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.min(split_every=split_every), 'dataframe-min')
    assert_dask_graph(ddf1.max(split_every=split_every), 'dataframe-max')
    assert_dask_graph(ddf1.count(split_every=split_every), 'dataframe-count')
    # std, var, mean consists from sum and count operations
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-count')
    # axis=1
    assert_dask_graph(ddf1.sum(axis=1, split_every=split_every),
                      'dataframe-sum')
    assert_dask_graph(ddf1.min(axis=1, split_every=split_every),
                      'dataframe-min')
    assert_dask_graph(ddf1.max(axis=1, split_every=split_every),
                      'dataframe-max')
    assert_dask_graph(ddf1.count(axis=1, split_every=split_every),
                      'dataframe-count')
    assert_dask_graph(ddf1.std(axis=1, split_every=split_every),
                      'dataframe-std')
    assert_dask_graph(ddf1.var(axis=1, split_every=split_every),
                      'dataframe-var')
    assert_dask_graph(ddf1.mean(axis=1, split_every=split_every),
                      'dataframe-mean')
def test_iloc_raises_attribute_error():
    """Accessing .iloc on this frame raises AttributeError.

    Renamed from ``test_iloc_raises``: an earlier test in this file already
    defines that name, so this duplicate definition silently shadowed it and
    only one of the two tests ever ran (flake8/ruff F811). The distinct name
    lets both tests execute.
    """
    assert raises(AttributeError, lambda: d.iloc[:5])
def test_exceptions_rise_to_top():
    """An exception raised inside a task propagates out of get()."""
    graph = {'x': 1, 'y': (bad, 'x')}
    assert raises(ValueError, lambda: get(graph, 'y'))
def test_broadcast_shapes():
    """broadcast_shapes follows numpy broadcasting rules and raises
    ValueError on incompatible dimensions."""
    assert broadcast_shapes((3, 4, 5), (4, 1), ()) == (3, 4, 5)
    assert broadcast_shapes((3, 1), (1, 4), (4,)) == (3, 4)
    assert broadcast_shapes((3, 1), (), (5, 6, 7, 1, 4)) == (5, 6, 7, 3, 4)
    # trailing dimensions 3 vs 4 (and 2 vs 3) cannot broadcast
    assert raises(ValueError, lambda: broadcast_shapes((3,), (3, 4)))
    assert raises(ValueError, lambda: broadcast_shapes((2, 3), (2, 3, 1)))
def test_frame_series_arithmetic_methods():
    """Compare dask flex-arithmetic methods (add/sub/.../rpow) to pandas
    across frame/frame, series/series, frame/series, scalar and
    repartitioned combinations, including ``fill_value`` and ``axis``
    handling."""
    pdf1 = pd.DataFrame({'A': np.arange(10),
                         'B': [np.nan, 1, 2, 3, 4] * 2,
                         'C': [np.nan] * 10,
                         'D': np.arange(10)},
                        index=list('abcdefghij'), columns=list('ABCD'))
    pdf2 = pd.DataFrame(np.random.randn(10, 4),
                        index=list('abcdefghjk'), columns=list('ABCX'))
    ps1 = pdf1.A
    ps2 = pdf2.A

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 2)
    ds1 = ddf1.A
    ds2 = ddf2.A

    s = dd.core.Scalar({('s', 0): 4}, 's')

    # like-vs-like pairs (frame/frame, series/series, scalars), some
    # repartitioned; el/er are the expected pandas equivalents
    for l, r, el, er in [(ddf1, ddf2, pdf1, pdf2), (ds1, ds2, ps1, ps2),
                         (ddf1.repartition(['a', 'f', 'j']), ddf2, pdf1, pdf2),
                         (ds1.repartition(['a', 'b', 'f', 'j']), ds2, ps1, ps2),
                         (ddf1, ddf2.repartition(['a', 'k']), pdf1, pdf2),
                         (ds1, ds2.repartition(['a', 'b', 'd', 'h', 'k']),
                          ps1, ps2),
                         (ddf1, 3, pdf1, 3), (ds1, 3, ps1, 3),
                         (ddf1, s, pdf1, 4), (ds1, s, ps1, 4)]:
        # l, r may be repartitioned, test whether repartition keeps original
        # data
        assert eq(l, el)
        assert eq(r, er)

        assert eq(l.add(r, fill_value=0), el.add(er, fill_value=0))
        assert eq(l.sub(r, fill_value=0), el.sub(er, fill_value=0))
        assert eq(l.mul(r, fill_value=0), el.mul(er, fill_value=0))
        assert eq(l.div(r, fill_value=0), el.div(er, fill_value=0))
        assert eq(l.truediv(r, fill_value=0), el.truediv(er, fill_value=0))
        assert eq(l.floordiv(r, fill_value=1), el.floordiv(er, fill_value=1))
        assert eq(l.mod(r, fill_value=0), el.mod(er, fill_value=0))
        assert eq(l.pow(r, fill_value=0), el.pow(er, fill_value=0))

        assert eq(l.radd(r, fill_value=0), el.radd(er, fill_value=0))
        assert eq(l.rsub(r, fill_value=0), el.rsub(er, fill_value=0))
        assert eq(l.rmul(r, fill_value=0), el.rmul(er, fill_value=0))
        assert eq(l.rdiv(r, fill_value=0), el.rdiv(er, fill_value=0))
        assert eq(l.rtruediv(r, fill_value=0), el.rtruediv(er, fill_value=0))
        assert eq(l.rfloordiv(r, fill_value=1), el.rfloordiv(er, fill_value=1))
        assert eq(l.rmod(r, fill_value=0), el.rmod(er, fill_value=0))
        assert eq(l.rpow(r, fill_value=0), el.rpow(er, fill_value=0))

    # frame combined with a dask Series
    for l, r, el, er in [(ddf1, ds2, pdf1, ps2),
                         (ddf1, ddf2.X, pdf1, pdf2.X)]:
        assert eq(l, el)
        assert eq(r, er)

        # must specify axis=0 to add Series to each column
        # axis=1 is not supported (add to each row)
        assert eq(l.add(r, axis=0), el.add(er, axis=0))
        assert eq(l.sub(r, axis=0), el.sub(er, axis=0))
        assert eq(l.mul(r, axis=0), el.mul(er, axis=0))
        assert eq(l.div(r, axis=0), el.div(er, axis=0))
        assert eq(l.truediv(r, axis=0), el.truediv(er, axis=0))
        assert eq(l.floordiv(r, axis=0), el.floordiv(er, axis=0))
        assert eq(l.mod(r, axis=0), el.mod(er, axis=0))
        assert eq(l.pow(r, axis=0), el.pow(er, axis=0))

        assert eq(l.radd(r, axis=0), el.radd(er, axis=0))
        assert eq(l.rsub(r, axis=0), el.rsub(er, axis=0))
        assert eq(l.rmul(r, axis=0), el.rmul(er, axis=0))
        assert eq(l.rdiv(r, axis=0), el.rdiv(er, axis=0))
        assert eq(l.rtruediv(r, axis=0), el.rtruediv(er, axis=0))
        assert eq(l.rfloordiv(r, axis=0), el.rfloordiv(er, axis=0))
        assert eq(l.rmod(r, axis=0), el.rmod(er, axis=0))
        assert eq(l.rpow(r, axis=0), el.rpow(er, axis=0))

        assert raises(ValueError, lambda: l.add(r, axis=1))

    # dask frame combined with raw pandas objects: both axes are valid
    for l, r, el, er in [(ddf1, pdf2, pdf1, pdf2), (ddf1, ps2, pdf1, ps2)]:
        assert eq(l, el)
        assert eq(r, er)

        for axis in [0, 1, 'index', 'columns']:
            assert eq(l.add(r, axis=axis), el.add(er, axis=axis))
            assert eq(l.sub(r, axis=axis), el.sub(er, axis=axis))
            assert eq(l.mul(r, axis=axis), el.mul(er, axis=axis))
            assert eq(l.div(r, axis=axis), el.div(er, axis=axis))
            assert eq(l.truediv(r, axis=axis), el.truediv(er, axis=axis))
            assert eq(l.floordiv(r, axis=axis), el.floordiv(er, axis=axis))
            assert eq(l.mod(r, axis=axis), el.mod(er, axis=axis))
            assert eq(l.pow(r, axis=axis), el.pow(er, axis=axis))

            assert eq(l.radd(r, axis=axis), el.radd(er, axis=axis))
            assert eq(l.rsub(r, axis=axis), el.rsub(er, axis=axis))
            assert eq(l.rmul(r, axis=axis), el.rmul(er, axis=axis))
            assert eq(l.rdiv(r, axis=axis), el.rdiv(er, axis=axis))
            assert eq(l.rtruediv(r, axis=axis), el.rtruediv(er, axis=axis))
            assert eq(l.rfloordiv(r, axis=axis), el.rfloordiv(er, axis=axis))
            assert eq(l.rmod(r, axis=axis), el.rmod(er, axis=axis))
            assert eq(l.rpow(r, axis=axis), el.rpow(er, axis=axis))