예제 #1
0
def test_getitem():
    """Column / boolean-mask selection on a dask DataFrame matches pandas.

    Also checks that the cached metadata (``_pd``) of a selection agrees
    with selecting from the parent's metadata.
    """
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                       'B': [9, 8, 7, 6, 5, 4, 3, 2, 1],
                       'C': [True, False, True] * 3},
                      columns=list('ABC'))
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf['A'], df['A'])
    # check cache consistency
    tm.assert_series_equal(ddf['A']._pd, ddf._pd['A'])

    assert eq(ddf[['A', 'B']], df[['A', 'B']])
    tm.assert_frame_equal(ddf[['A', 'B']]._pd, ddf._pd[['A', 'B']])

    assert eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._pd, ddf._pd.C)

    assert eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    # Missing labels must raise on the *dask* frame as well; the original
    # asserted against the pandas frame ``df`` here, which left the dask
    # error paths untested.
    assert raises(KeyError, lambda: ddf['X'])
    assert raises(KeyError, lambda: ddf[['A', 'X']])
    assert raises(AttributeError, lambda: ddf.X)

    # not str/unicode column labels
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf[0], df[0])
    assert eq(ddf[[1, 2]], df[[1, 2]])

    assert raises(KeyError, lambda: ddf[8])
    assert raises(KeyError, lambda: ddf[[1, 8]])
예제 #2
0
def test_raises():
    """``dd.rolling_mean`` rejects bad window / freq / how arguments."""
    frame = pd.DataFrame({"a": np.random.randn(25).cumsum(),
                          "b": np.random.randn(25).cumsum()})
    dframe = dd.from_pandas(frame, 3)

    # Each case: (expected exception, positional args, keyword args).
    for exc, args, kwargs in [
            (TypeError, (1.5,), {}),              # window must be an int
            (ValueError, (-1,), {}),              # window must be non-negative
            (NotImplementedError, (3,), {'freq': 2}),
            (NotImplementedError, (3,), {'how': 'min'}),
    ]:
        assert raises(exc, lambda: dd.rolling_mean(dframe, *args, **kwargs))
예제 #3
0
def test_vindex_errors():
    # ``vindex`` is dask's pointwise fancy-indexing accessor.  Every
    # selection below is malformed in some way (scalar index, nested
    # lists, mismatched list lengths, missing axes) and must raise
    # IndexError rather than silently misbehave.
    d = da.ones((5, 5, 5), chunks=(3, 3, 3))
    assert raises(IndexError, lambda: d.vindex[0])
    assert raises(IndexError, lambda: d.vindex[[1, 2, 3]])
    assert raises(IndexError, lambda: d.vindex[[1, 2, 3], [1, 2, 3], 0])
    assert raises(IndexError, lambda: d.vindex[[1], [1, 2, 3]])
    assert raises(IndexError, lambda: d.vindex[[1, 2, 3], [[1], [2], [3]]])
예제 #4
0
def test_full_groupby():
    # ``d`` is a module-level dask DataFrame fixture defined elsewhere.
    # Invalid keys/attributes raise; real columns appear in dir().
    assert raises(Exception, lambda: d.groupby('does_not_exist'))
    assert raises(Exception, lambda: d.groupby('a').does_not_exist)
    assert 'b' in dir(d.groupby('a'))
    def func(df):
        # Demean column ``b`` within each group (mutates the group frame).
        df['b'] = df.b - df.b.mean()
        return df
    # NOTE(review): ``func`` is defined but never applied in this snippet --
    # it looks truncated; the full test presumably compares
    # d.groupby('a').apply(func) against the pandas result.
예제 #5
0
def test_insert():
    """dask's ``insert`` matches ``np.insert`` for scalar, list, slice and
    array values along positive and negative axes, and validates input."""
    x = np.random.randint(10, size=(10, 10))
    a = from_array(x, chunks=(5, 5))
    y = np.random.randint(10, size=(5, 10))
    b = from_array(y, chunks=(4, 4))

    assert eq(np.insert(x, 0, -1, axis=0), insert(a, 0, -1, axis=0))
    assert eq(np.insert(x, 3, -1, axis=-1), insert(a, 3, -1, axis=-1))
    assert eq(np.insert(x, 5, -1, axis=1), insert(a, 5, -1, axis=1))
    assert eq(np.insert(x, -1, -1, axis=-2), insert(a, -1, -1, axis=-2))
    assert eq(np.insert(x, [2, 3, 3], -1, axis=1),
                 insert(a, [2, 3, 3], -1, axis=1))
    assert eq(np.insert(x, [2, 3, 8, 8, -2, -2], -1, axis=0),
                 insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0))
    assert eq(np.insert(x, slice(1, 4), -1, axis=1),
                 insert(a, slice(1, 4), -1, axis=1))
    # inserting rows from another dask array
    assert eq(np.insert(x, [2] * 3 + [5] * 2, y, axis=0),
                 insert(a, [2] * 3 + [5] * 2, b, axis=0))
    assert eq(np.insert(x, 0, y[0], axis=1),
                 insert(a, 0, b[0], axis=1))
    # unsorted insert positions are not supported by the dask version
    assert raises(NotImplementedError, lambda: insert(a, [4, 2], -1, axis=0))
    # axis out of range for a 2-d array (both positive and negative)
    assert raises(IndexError, lambda: insert(a, [3], -1, axis=2))
    assert raises(IndexError, lambda: insert(a, [3], -1, axis=-3))
    # the same operation must generate the same (deterministic) task keys
    assert same_keys(insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0),
                    insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0))
예제 #6
0
def test_getitem():
    """Column / boolean-mask selection on a dask DataFrame matches pandas,
    and the cached ``_meta`` of a selection agrees with selecting from the
    parent's ``_meta``."""
    df = pd.DataFrame(
        {"A": [1, 2, 3, 4, 5, 6, 7, 8, 9], "B": [9, 8, 7, 6, 5, 4, 3, 2, 1], "C": [True, False, True] * 3},
        columns=list("ABC"),
    )
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf["A"], df["A"])
    # check cache consistency
    tm.assert_series_equal(ddf["A"]._meta, ddf._meta["A"])

    assert eq(ddf[["A", "B"]], df[["A", "B"]])
    tm.assert_frame_equal(ddf[["A", "B"]]._meta, ddf._meta[["A", "B"]])

    assert eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)

    assert eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    # Exercise the error paths on the *dask* frame; the original asserted
    # against the pandas frame ``df``, which left dask's behaviour untested.
    assert raises(KeyError, lambda: ddf["X"])
    assert raises(KeyError, lambda: ddf[["A", "X"]])
    assert raises(AttributeError, lambda: ddf.X)

    # not str/unicode column labels
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf[0], df[0])
    assert eq(ddf[[1, 2]], df[[1, 2]])

    assert raises(KeyError, lambda: ddf[8])
    assert raises(KeyError, lambda: ddf[[1, 8]])
예제 #7
0
File: test_rolling.py  Project: to266/dask
def test_rolling_partition_size():
    """Rolling windows that exceed what a partition can supply are rejected."""
    df = pd.DataFrame(np.random.randn(50, 2))
    ddf = dd.from_pandas(df, npartitions=5)  # 10 rows per partition

    # exercised on both the frame and a single column
    for obj, dobj in [(df, ddf), (df[0], ddf[0])]:
        # windows of 10 and 11 succeed; 12 is too large for a 10-row partition
        eq(obj.rolling(10).mean(), dobj.rolling(10).mean())
        eq(obj.rolling(11).mean(), dobj.rolling(11).mean())
        # NOTE(review): the result of ``raises`` is not asserted here -- fine
        # if ``raises`` is pytest.raises (which raises on failure), vacuous if
        # it is the bool-returning dask helper.  Confirm which one is in scope.
        raises(NotImplementedError, lambda: dobj.rolling(12).mean())
예제 #8
0
def test_rolling_functions_raises():
    """Invalid arguments to ``dd.rolling_mean`` raise the expected errors."""
    frame = pd.DataFrame({'a': np.random.randn(25).cumsum(),
                          'b': np.random.randint(100, size=(25,))})
    dframe = dd.from_pandas(frame, 3)
    # non-integer window
    assert raises(TypeError, lambda: dd.rolling_mean(dframe, 1.5))
    # negative window
    assert raises(ValueError, lambda: dd.rolling_mean(dframe, -1))
    # ``freq`` and ``how`` are unsupported by the dask implementation
    assert raises(NotImplementedError, lambda: dd.rolling_mean(dframe, 3, freq=2))
    assert raises(NotImplementedError, lambda: dd.rolling_mean(dframe, 3, how='min'))
예제 #9
0
File: test_store.py  Project: AhmedGS/dask
def test_update():
    """``Store.update`` merges a graph and rejects conflicting reassignment."""
    s = Store()

    dsk = {'x': 1, 'y': (inc, 'x')}
    s.update(dsk)

    # task (inc, 'x') evaluates to 2 on access
    assert s['y'] == 2

    # rebinding a key to a *different* value is an error...
    assert raises(Exception, lambda: s.update({'x': 2}))
    # ...but re-asserting the same value is allowed
    assert not raises(Exception, lambda: s.update({'x': 1}))
예제 #10
0
def test_broadcast_to():
    """``broadcast_to`` matches the chunk-level numpy implementation and
    rejects shapes that are not valid broadcasts."""
    x = np.random.randint(10, size=(5, 1, 6))
    a = from_array(x, chunks=(3, 1, 3))

    # valid targets: existing dims match or are broadcast from 1; extra
    # leading dims may be added
    for shape in [(5, 4, 6), (2, 5, 1, 6), (3, 4, 5, 4, 6)]:
        assert eq(chunk.broadcast_to(x, shape),
                        broadcast_to(a, shape))

    # shrinking an existing dimension or dropping dimensions is invalid
    assert raises(ValueError, lambda: broadcast_to(a, (2, 1, 6)))
    assert raises(ValueError, lambda: broadcast_to(a, (3,)))
예제 #11
0
def test_partial_by_order():
    """``partial_by_order`` binds arguments at fixed positional indices."""
    # bind 20 at position 1; the call argument fills position 0
    f = partial_by_order(add, [(1, 20)])
    assert f(5) == 25
    assert f.__name__ == 'add(20)'

    # multiple bound positions; 3 fills the remaining slot (position 0)
    f = partial_by_order(lambda x, y, z: x + y + z, [(1, 10), (2, 15)])
    assert f(3) == 28
    assert f.__name__ == '<lambda>(...)'

    # the second argument must be a list of (position, value) pairs
    assert raises(ValueError, lambda: partial_by_order(add, 1))
    assert raises(ValueError, lambda: partial_by_order(add, [1]))
예제 #12
0
def test_full_groupby():
    # invalid keys/attributes raise; real columns show up in dir()
    assert raises(Exception, lambda: d.groupby('does_not_exist'))
    assert raises(Exception, lambda: d.groupby('a').does_not_exist)
    assert 'b' in dir(d.groupby('a'))
    def func(df):
        # demean column b within each group
        df['b'] = df.b - df.b.mean()
        return df
    assert eq(d.groupby('a').apply(func), full.groupby('a').apply(func))

    # building the same graph twice must yield identical (deterministic)
    # task keys
    assert sorted(d.groupby('a').apply(func).dask) == \
           sorted(d.groupby('a').apply(func).dask)
예제 #13
0
File: test_linalg.py  Project: gochoam/dask
def test_lu_errors():
    """``da.linalg.lu`` rejects inputs it cannot factorize."""
    cases = [
        ((10, 10, 10), (5, 5, 5)),  # 3-d input: not a matrix
        ((10, 8), (5, 4)),          # rectangular matrix
        ((20, 20), (5, 4)),         # square matrix but non-square chunks
    ]
    for shape, chunks in cases:
        A = np.random.random_integers(0, 10, shape)
        dA = da.from_array(A, chunks=chunks)
        assert raises(ValueError, lambda: da.linalg.lu(dA))
예제 #14
0
File: test_core.py  Project: datastark/dask
def test_GetFunctionTestCase_class():
    # A scheduler ``get`` that ignores the graph and always returns 1 must
    # make the shared test suite fail with an AssertionError...
    class CustomTestGetFail(GetFunctionTestCase):
        get = staticmethod(lambda x, y: 1)

    custom_testget = CustomTestGetFail()
    raises(AssertionError, custom_testget.test_get)

    # ...while the real ``core.get`` passes the same suite.
    class CustomTestGetPass(GetFunctionTestCase):
        get = staticmethod(core.get)

    custom_testget = CustomTestGetPass()
    custom_testget.test_get()
예제 #15
0
File: test_core.py  Project: ankravch/dask
def test_GetFunctionTestMixin_class():
    # A scheduler ``get`` that ignores the graph and always returns 1 must
    # make the shared mixin's test fail with an AssertionError...
    class TestCustomGetFail(GetFunctionTestMixin):
        get = staticmethod(lambda x, y: 1)

    custom_testget = TestCustomGetFail()
    raises(AssertionError, custom_testget.test_get)

    # ...while the real ``core.get`` passes the same test.
    class TestCustomGetPass(GetFunctionTestMixin):
        get = staticmethod(core.get)

    custom_testget = TestCustomGetPass()
    custom_testget.test_get()
예제 #16
0
File: test_linalg.py  Project: AhmedGS/dask
def test_solve_triangular_errors():
    """``solve_triangular`` rejects non-matrix input and mismatched chunking."""
    # 3-d left-hand side: not a matrix
    dA = da.from_array(np.random.random_integers(0, 10, (10, 10, 10)),
                       chunks=(5, 5, 5))
    db = da.from_array(np.random.random_integers(1, 10, 10), chunks=5)
    assert raises(ValueError, lambda: da.linalg.solve_triangular(dA, db))

    # chunks of A (3) do not line up with chunks of b (5)
    dA = da.from_array(np.random.random_integers(0, 10, (10, 10)),
                       chunks=(3, 3))
    db = da.from_array(np.random.random_integers(1, 10, 10), chunks=5)
    assert raises(ValueError, lambda: da.linalg.solve_triangular(dA, db))
예제 #17
0
def test_store():
    """``store`` writes each dask array into its paired target in place and
    validates the sources/targets arguments."""
    d = da.ones((4, 4), chunks=(2, 2))
    a, b = d + 1, d + 2

    at = np.empty(shape=(4, 4))
    bt = np.empty(shape=(4, 4))

    store([a, b], [at, bt])
    assert (at == 2).all()
    assert (bt == 3).all()

    # sources and targets must be paired 1:1
    assert raises(ValueError, lambda: store([a], [at, bt]))
    # sources must be dask arrays, not plain ndarrays
    assert raises(ValueError, lambda: store(at, at))
    assert raises(ValueError, lambda: store([at, bt], [at, bt]))
예제 #18
0
def test_dataframe_quantile():
    """Frame-wise quantile: column order is preserved, the object column is
    dropped, and only the default quantile is supported along ``axis=1``."""

    # column X is for test column order and result division
    df = pd.DataFrame(
        {"A": np.arange(20), "X": np.arange(20, 40), "B": np.arange(10, 30), "C": ["a", "b", "c", "d"] * 5},
        columns=["A", "X", "B", "C"],
    )
    ddf = dd.from_pandas(df, 3)

    result = ddf.quantile()
    assert result.npartitions == 1
    # divisions of the result are the first/last column labels
    assert result.divisions == ("A", "X")

    result = result.compute()
    assert isinstance(result, pd.Series)
    # object column C is excluded; original column order is kept
    tm.assert_index_equal(result.index, pd.Index(["A", "X", "B"]))
    # the quantile is checked against an open interval rather than an exact
    # value (the dask result is approximate)
    assert (result > pd.Series([16, 36, 26], index=["A", "X", "B"])).all()
    assert (result < pd.Series([17, 37, 27], index=["A", "X", "B"])).all()

    result = ddf.quantile([0.25, 0.75])
    assert result.npartitions == 1
    assert result.divisions == (0.25, 0.75)

    result = result.compute()
    assert isinstance(result, pd.DataFrame)
    tm.assert_index_equal(result.index, pd.Index([0.25, 0.75]))
    tm.assert_index_equal(result.columns, pd.Index(["A", "X", "B"]))
    minexp = pd.DataFrame([[1, 21, 11], [17, 37, 27]], index=[0.25, 0.75], columns=["A", "X", "B"])
    assert (result > minexp).all().all()
    maxexp = pd.DataFrame([[2, 22, 12], [18, 38, 28]], index=[0.25, 0.75], columns=["A", "X", "B"])
    assert (result < maxexp).all().all()

    assert eq(ddf.quantile(axis=1), df.quantile(axis=1))
    # a list of quantiles along axis=1 is not supported
    assert raises(ValueError, lambda: ddf.quantile([0.25, 0.75], axis=1))
예제 #19
0
def test_reductions_frame():
    """Frame reductions agree with pandas for every axis spelling, and the
    generated graphs carry the expected task-name prefixes."""
    assert eq(d.sum(), full.sum())
    assert eq(d.min(), full.min())
    assert eq(d.max(), full.max())
    assert eq(d.count(), full.count())
    assert eq(d.std(), full.std())
    assert eq(d.var(), full.var())
    assert eq(d.std(ddof=0), full.std(ddof=0))
    assert eq(d.var(ddof=0), full.var(ddof=0))
    assert eq(d.mean(), full.mean())

    # both integer and string axis spellings are accepted
    for axis in [0, 1, 'index', 'columns']:
        assert eq(d.sum(axis=axis), full.sum(axis=axis))
        assert eq(d.min(axis=axis), full.min(axis=axis))
        assert eq(d.max(axis=axis), full.max(axis=axis))
        assert eq(d.count(axis=axis), full.count(axis=axis))
        assert eq(d.std(axis=axis), full.std(axis=axis))
        assert eq(d.var(axis=axis), full.var(axis=axis))
        assert eq(d.std(axis=axis, ddof=0), full.std(axis=axis, ddof=0))
        assert eq(d.var(axis=axis, ddof=0), full.var(axis=axis, ddof=0))
        assert eq(d.mean(axis=axis), full.mean(axis=axis))

    # an invalid axis only errors when the graph is computed
    assert raises(ValueError, lambda: d.sum(axis='incorrect').compute())

    assert_dask_graph(d.sum(), 'dataframe-sum')
    assert_dask_graph(d.min(), 'dataframe-min')
    assert_dask_graph(d.max(), 'dataframe-max')
    assert_dask_graph(d.count(), 'dataframe-count')
    # std, var, mean consists from sum and count operations
    assert_dask_graph(d.std(), 'dataframe-sum')
    assert_dask_graph(d.std(), 'dataframe-count')
    assert_dask_graph(d.var(), 'dataframe-sum')
    assert_dask_graph(d.var(), 'dataframe-count')
    assert_dask_graph(d.mean(), 'dataframe-sum')
    assert_dask_graph(d.mean(), 'dataframe-count')
예제 #20
0
def test_series_groupby_errors():
    """Series groupby must raise the same error types and messages as pandas."""
    s = pd.Series([1, 2, 2, 1, 1])

    ss = dd.from_pandas(s, npartitions=2)

    msg = "Grouper for '1' not 1-dimensional"
    with tm.assertRaisesRegexp(ValueError, msg):
        s.groupby([1, 2])  # pandas
    with tm.assertRaisesRegexp(ValueError, msg):
        ss.groupby([1, 2]) # dask should raise the same error
    msg = "Grouper for '2' not 1-dimensional"
    with tm.assertRaisesRegexp(ValueError, msg):
        s.groupby([2])  # pandas
    with tm.assertRaisesRegexp(ValueError, msg):
        ss.groupby([2]) # dask should raise the same error

    msg = "No group keys passed!"
    with tm.assertRaisesRegexp(ValueError, msg):
        s.groupby([])  # pandas
    with tm.assertRaisesRegexp(ValueError, msg):
        ss.groupby([]) # dask should raise the same error

    # grouping one dask Series by another with a different partitioning
    # is not implemented
    sss = dd.from_pandas(s, npartitions=3)
    assert raises(NotImplementedError, lambda: ss.groupby(sss))

    with tm.assertRaises(KeyError):
        s.groupby('x')  # pandas
    with tm.assertRaises(KeyError):
        ss.groupby('x') # dask should raise the same error
예제 #21
0
def test_concatenate():
    """Concatenation along each axis: resulting shape, chunks, task keys,
    and deterministic naming."""
    a, b, c = [Array(getem(name, chunks=(2, 3), shape=(4, 6)),
                     name, shape=(4, 6), chunks=(2, 3))
                for name in 'ABC']

    x = concatenate([a, b, c], axis=0)

    assert x.shape == (12, 6)
    assert x.chunks == ((2, 2, 2, 2, 2, 2), (3, 3))
    # output tasks alias the input chunks directly
    assert x.dask[(x.name, 0, 1)] == ('A', 0, 1)
    assert x.dask[(x.name, 5, 0)] == ('C', 1, 0)
    # rebuilding the same concatenation yields the same keys
    assert same_keys(x, concatenate([a, b, c], axis=0))

    y = concatenate([a, b, c], axis=1)

    assert y.shape == (4, 18)
    assert y.chunks == ((2, 2), (3, 3, 3, 3, 3, 3))
    assert y.dask[(y.name, 1, 0)] == ('A', 1, 0)
    assert y.dask[(y.name, 1, 5)] == ('C', 1, 1)
    assert same_keys(y, concatenate([a, b, c], axis=1))

    # input graphs are embedded in the output graph
    assert set(b.dask.keys()).issubset(y.dask.keys())

    # negative axis is normalised
    assert concatenate([a, b, c], axis=-1).chunks == \
            concatenate([a, b, c], axis=1).chunks

    # axis out of range for 2-d inputs
    assert raises(ValueError, lambda: concatenate([a, b, c], axis=2))
예제 #22
0
def test_unravel():
    """``unravel`` reshapes a 1-d dask array, taking a cheap shortcut when
    chunk boundaries line up with the target shape."""
    x = np.random.randint(10, size=24)

    # these should use the shortcut: exactly one extra task per input chunk
    for chunks, shape in [(24, (3, 8)),
                          (24, (12, 2)),
                          (6, (4, 6)),
                          (6, (4, 3, 2)),
                          (6, (4, 6, 1)),
                          (((6, 12, 6),), (4, 6))]:
        a = from_array(x, chunks=chunks)
        unraveled = unravel(a, shape)
        assert eq(x.reshape(*shape), unraveled)
        assert len(unraveled.dask) == len(a.dask) + len(a.chunks[0])

    # these cannot: chunk boundaries do not align, so more tasks are needed
    for chunks, shape in [(6, (2, 12)),
                          (6, (1, 4, 6)),
                          (6, (2, 1, 12))]:
        a = from_array(x, chunks=chunks)
        unraveled = unravel(a, shape)
        assert eq(x.reshape(*shape), unraveled)
        assert len(unraveled.dask) > len(a.dask) + len(a.chunks[0])

    # NOTE(review): the two asserts below rely on ``a`` and ``unraveled``
    # leaking out of the loop above.
    # ``unraveled`` is multi-dimensional, presumably why this asserts
    assert raises(AssertionError, lambda: unravel(unraveled, (3, 8)))
    # reshaping to the existing shape is a no-op returning the same object
    assert unravel(a, a.shape) is a
예제 #23
0
def test_elemwise_with_ndarrays():
    """Elementwise ops mix dask arrays with raw ndarrays, broadcasting
    exactly as numpy does, and reject incompatible shapes."""
    vec = np.arange(3)
    mat = np.arange(12).reshape(4, 3)
    dvec = from_array(vec, chunks=(3,))
    dmat = from_array(mat, chunks=(2, 3))

    # ndarray on either side of a dask vector
    assert eq(vec + dvec, 2 * vec)
    assert eq(dvec + vec, 2 * vec)

    # broadcasting a vector against a matrix, in every combination
    assert eq(vec + dmat, vec + mat)
    assert eq(dmat + vec, vec + mat)
    assert eq(dvec + mat, vec + mat)
    assert eq(mat + dvec, vec + mat)
    # Error on shape mismatch
    assert raises(ValueError, lambda: dvec + mat.T)
    assert raises(ValueError, lambda: dvec + np.arange(2))
예제 #24
0
def test_full_groupby():
    """Groupby-apply on a dask frame matches the pandas result."""
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                       'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]},
                      index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(df, npartitions=3)

    # invalid keys/attributes raise (checked on the pandas frame);
    # real columns show up in dir()
    assert raises(Exception, lambda: df.groupby('does_not_exist'))
    assert raises(Exception, lambda: df.groupby('a').does_not_exist)
    assert 'b' in dir(df.groupby('a'))

    def func(df):
        # demean column b within each group
        df['b'] = df.b - df.b.mean()
        return df

    assert eq(df.groupby('a').apply(func),
              ddf.groupby('a').apply(func))
예제 #25
0
File: test_store.py  Project: AhmedGS/dask
def test_basic():
    """``Store`` computes tasks lazily, caches results, and records
    access/compute statistics."""
    s = Store()
    s['x'] = 1
    s['y'] = (inc, 'x')
    s['z'] = (add, 'x', 'y')

    # only the literal assignment counts as data before anything is computed
    assert s.data == set(['x'])

    assert s['z'] == 3
    assert 'x' in s.data
    # intermediate results were cached while computing z
    assert s.cache['z'] == 3
    assert s.cache['y'] == 2

    # x was accessed twice (for y and for z)
    assert len(s.access_times['z']) == 1
    assert len(s.access_times['y']) == 1
    assert len(s.access_times['x']) == 2
    assert s.compute_time['z'] < 0.1

    # a second read of z is served from cache: no new y/x accesses
    cache = s.cache.copy()
    assert s['z'] == 3
    assert s.cache == cache
    assert len(s.access_times['z']) == 2
    assert len(s.access_times['y']) == 1
    assert len(s.access_times['x']) == 2

    # non-key lookups pass through; list lookups map over the keys
    assert s[5] == 5
    assert list(s[['x', 'y']]) == [s['x'], s['y']]

    # reassigning an existing key to a different value is an error
    def reassign():
        s['x'] = 2
    assert raises(Exception, reassign)
예제 #26
0
def test_take():
    """``take`` matches ``np.take`` and validates the axis argument."""
    data = np.arange(400).reshape((20, 20))
    darr = from_array(data, chunks=(5, 5))

    assert eq(np.take(data, 3, axis=0), take(darr, 3, axis=0))
    assert eq(np.take(data, [3, 4, 5], axis=-1), take(darr, [3, 4, 5], axis=-1))
    # axis out of bounds for a 2-d array
    assert raises(ValueError, lambda: take(darr, 3, axis=2))
예제 #27
0
def test_equivalence_uncomparable():
    """Objects whose ``==`` raises are handled by ``equivalent`` via identity."""
    left = Uncomparable()
    right = Uncomparable()
    # direct comparison raises...
    assert raises(TypeError, lambda: left == right)
    # ...but equivalent() treats the same object as equivalent to itself
    assert equivalent(left, left)
    assert not equivalent(left, right)
    # and the rule extends into task tuples
    assert equivalent((add, left, 0), (add, left, 0))
    assert not equivalent((add, left, 0), (add, right, 0))
예제 #28
0
def test_get_stack_limit():
    # 10,000-deep linear chain x0 -> x1 -> ... -> x10000; evaluating it must
    # not blow Python's recursion limit.
    d = dict(('x%s' % (i+1), (inc, 'x%s' % i)) for i in range(10000))
    d['x0'] = 0
    assert get(d, 'x10000') == 10000
    # introduce cycle
    d['x5000'] = (inc, 'x5001')
    assert raises(RuntimeError, lambda: get(d, 'x10000'))
    # keys below the cycle remain computable
    assert get(d, 'x4999') == 4999
예제 #29
0
def test_from_filenames():
    """``db.from_filenames`` accepts a list of paths or a glob pattern and
    errors when the pattern matches nothing."""
    with filetexts({'a1.log': 'A\nB', 'a2.log': 'C\nD'}) as fns:
        assert set(line.strip() for line in db.from_filenames(fns)) == \
                set('ABCD')
        assert set(line.strip() for line in db.from_filenames('a*.log')) == \
                set('ABCD')

    # a glob that matches no files is an error, not an empty bag
    assert raises(ValueError, lambda: db.from_filenames('non-existent-*-path'))
예제 #30
0
def test_reshape_unknown_dimensions():
    # a single -1 in the target shape is inferred, exactly as in numpy
    for original_shape in [(24,), (2, 12), (2, 3, 4)]:
        for new_shape in [(-1,), (2, -1), (-1, 3, 4)]:
            x = np.random.randint(10, size=original_shape)
            a = from_array(x, 4)
            assert eq(x.reshape(new_shape), a.reshape(new_shape))

    # more than one -1 is ambiguous (relies on ``a`` leaking from the loop)
    assert raises(ValueError, lambda: reshape(a, (-1, -1)))
예제 #31
0
def test_loc():
    """Label-based selection: divisions are trimmed to the slice, results
    match pandas, and graph keys are deterministic."""
    assert d.loc[3:8].divisions[0] == 3
    assert d.loc[3:8].divisions[-1] == 8

    # a scalar label selection collapses divisions to that label
    assert d.loc[5].divisions == (5, 5)

    assert eq(d.loc[5], full.loc[5])
    assert eq(d.loc[3:8], full.loc[3:8])
    assert eq(d.loc[:8], full.loc[:8])
    assert eq(d.loc[3:], full.loc[3:])

    # same selections on a single column
    assert eq(d.a.loc[5], full.a.loc[5])
    assert eq(d.a.loc[3:8], full.a.loc[3:8])
    assert eq(d.a.loc[:8], full.a.loc[:8])
    assert eq(d.a.loc[3:], full.a.loc[3:])

    # a missing scalar label raises, but out-of-range slices are just empty
    assert raises(KeyError, lambda: d.loc[1000])
    assert eq(d.loc[1000:], full.loc[1000:])
    assert eq(d.loc[-2000:-1000], full.loc[-2000:-1000])

    # identical selections produce identical keys; different ones differ
    assert sorted(d.loc[5].dask) == sorted(d.loc[5].dask)
    assert sorted(d.loc[5].dask) != sorted(d.loc[6].dask)
예제 #32
0
def test_unknown_divisions():
    # three partitions whose divisions are all None, i.e. the index
    # boundaries are unknown
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6]
        }),
        ('x', 1): pd.DataFrame({
            'a': [4, 5, 6],
            'b': [3, 2, 1]
        }),
        ('x', 2): pd.DataFrame({
            'a': [7, 8, 9],
            'b': [0, 0, 0]
        })
    }
    d = dd.DataFrame(dsk, 'x', ['a', 'b'], [None, None, None, None])
    full = d.compute(get=dask.get)

    # reductions and elementwise ops do not need known divisions...
    assert eq(d.a.sum(), full.a.sum())
    assert eq(d.a + d.b + 1, full.a + full.b + 1)

    # ...but label-based selection does
    assert raises(ValueError, lambda: d.loc[3])
예제 #33
0
def test_reduction_series_invalid_axis():
    """Series reductions reject axis=1/'columns' exactly as pandas does."""
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # a column axis is meaningless for a 1-d Series
    for axis in [1, 'columns']:
        for s in [ddf1.a, pdf1.a]: # both must behave the same
            assert raises(ValueError, lambda: s.sum(axis=axis))
            assert raises(ValueError, lambda: s.min(axis=axis))
            assert raises(ValueError, lambda: s.max(axis=axis))
            # only count doesn't have axis keyword
            assert raises(TypeError, lambda: s.count(axis=axis))
            assert raises(ValueError, lambda: s.std(axis=axis))
            assert raises(ValueError, lambda: s.var(axis=axis))
            assert raises(ValueError, lambda: s.mean(axis=axis))
예제 #34
0
def test_repartition():
    """``repartition(divisions=...)`` preserves the data, validates the
    requested divisions, and only expands past the current index range when
    ``force=True``."""
    def _check_split_data(orig, d):
        """Check data is split properly"""
        keys = [k for k in d.dask if k[0].startswith('repartition-split')]
        keys = sorted(keys)
        sp = pd.concat([d._get(d.dask, k) for k in keys])
        assert eq(orig, sp)
        assert eq(orig, d)

    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6],
        'y': list('abdabd')
    },
                      index=[10, 20, 30, 40, 50, 60])
    a = dd.from_pandas(df, 2)

    b = a.repartition(divisions=[10, 20, 50, 60])
    assert b.divisions == (10, 20, 50, 60)
    assert eq(a, b)
    # the first output partition contains only the first row
    assert eq(a._get(b.dask, (b._name, 0)), df.iloc[:1])

    # each of these division lists is invalid without force=True
    for div in [
        [20, 60],
        [10, 50],
        [1],  # first / last element mismatch
        [0, 60],
        [10, 70],  # do not allow to expand divisions by default
        [10, 50, 20, 60],  # not sorted
        [10, 10, 20, 60]
    ]:  # not unique (last element can be duplicated)

        assert raises(ValueError, lambda: a.repartition(divisions=div))

    # integer index: round-trip through every initial partition count
    pdf = pd.DataFrame(np.random.randn(7, 5), columns=list('abxyz'))
    for p in range(1, 7):
        ddf = dd.from_pandas(pdf, p)
        assert eq(ddf, pdf)
        for div in [[0, 6], [0, 6, 6], [0, 5, 6], [0, 4, 6, 6], [0, 2, 6],
                    [0, 2, 6, 6], [0, 2, 3, 6, 6], [0, 1, 2, 3, 4, 5, 6, 6]]:
            rddf = ddf.repartition(divisions=div)
            _check_split_data(ddf, rddf)
            assert rddf.divisions == tuple(div)
            assert eq(pdf, rddf)

            rds = ddf.x.repartition(divisions=div)
            _check_split_data(ddf.x, rds)
            assert rds.divisions == tuple(div)
            assert eq(pdf.x, rds)

        # expand divisions
        for div in [[-5, 10], [-2, 3, 5, 6], [0, 4, 5, 9, 10]]:
            rddf = ddf.repartition(divisions=div, force=True)
            _check_split_data(ddf, rddf)
            assert rddf.divisions == tuple(div)
            assert eq(pdf, rddf)

            rds = ddf.x.repartition(divisions=div, force=True)
            _check_split_data(ddf.x, rds)
            assert rds.divisions == tuple(div)
            assert eq(pdf.x, rds)

    # string index: the same round-trips
    pdf = pd.DataFrame(
        {
            'x': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            'y': [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
        },
        index=list('abcdefghij'))
    for p in range(1, 7):
        ddf = dd.from_pandas(pdf, p)
        assert eq(ddf, pdf)
        for div in [
                list('aj'),
                list('ajj'),
                list('adj'),
                list('abfj'),
                list('ahjj'),
                list('acdj'),
                list('adfij'),
                list('abdefgij'),
                list('abcdefghij')
        ]:
            rddf = ddf.repartition(divisions=div)
            _check_split_data(ddf, rddf)
            assert rddf.divisions == tuple(div)
            assert eq(pdf, rddf)

            rds = ddf.x.repartition(divisions=div)
            _check_split_data(ddf.x, rds)
            assert rds.divisions == tuple(div)
            assert eq(pdf.x, rds)

        # expand divisions
        for div in [list('Yadijm'), list('acmrxz'), list('Yajz')]:
            rddf = ddf.repartition(divisions=div, force=True)
            _check_split_data(ddf, rddf)
            assert rddf.divisions == tuple(div)
            assert eq(pdf, rddf)

            rds = ddf.x.repartition(divisions=div, force=True)
            _check_split_data(ddf.x, rds)
            assert rds.divisions == tuple(div)
            assert eq(pdf.x, rds)
예제 #35
0
def test_split_apply_combine_on_series():
    """Groupby reductions on frames and series match pandas for column-name,
    series, and derived-series keys, and emit the expected graph labels."""
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 6],
            'b': [4, 2, 7]
        },
                               index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({
            'a': [4, 4, 6],
            'b': [3, 3, 1]
        },
                               index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({
            'a': [4, 3, 7],
            'b': [1, 1, 3]
        },
                               index=[9, 9, 9])
    }
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # frame grouped by a column name, a series, and a derived series
    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(
            ddf1.groupby(ddkey).a.nunique(),
            pdf1.groupby(pdkey).a.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())

    # a series grouped by another series
    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(),
                  pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(),
                  pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(),
                  pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(),
                  pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(),
                  check_names=False)

    # grouping by boolean predicates over each column
    for i in range(8):
        assert eq(
            ddf1.groupby(ddf1.b > i).a.sum(),
            pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.min(),
            pdf1.groupby(pdf1.b > i).a.min())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.max(),
            pdf1.groupby(pdf1.b > i).a.max())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.count(),
            pdf1.groupby(pdf1.b > i).a.count())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.mean(),
            pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.nunique(),
            pdf1.groupby(pdf1.b > i).a.nunique())

        assert eq(
            ddf1.groupby(ddf1.a > i).b.sum(),
            pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.min(),
            pdf1.groupby(pdf1.a > i).b.min())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.max(),
            pdf1.groupby(pdf1.a > i).b.max())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.count(),
            pdf1.groupby(pdf1.a > i).b.count())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.mean(),
            pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.nunique(),
            pdf1.groupby(pdf1.a > i).b.nunique())

        assert eq(
            ddf1.groupby(ddf1.b > i).sum(),
            pdf1.groupby(pdf1.b > i).sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).min(),
            pdf1.groupby(pdf1.b > i).min())
        assert eq(
            ddf1.groupby(ddf1.b > i).max(),
            pdf1.groupby(pdf1.b > i).max())
        assert eq(
            ddf1.groupby(ddf1.b > i).count(),
            pdf1.groupby(pdf1.b > i).count())
        assert eq(
            ddf1.groupby(ddf1.b > i).mean(),
            pdf1.groupby(pdf1.b > i).mean())

        assert eq(
            ddf1.groupby(ddf1.a > i).sum(),
            pdf1.groupby(pdf1.a > i).sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).min(),
            pdf1.groupby(pdf1.a > i).min())
        assert eq(
            ddf1.groupby(ddf1.a > i).max(),
            pdf1.groupby(pdf1.a > i).max())
        assert eq(
            ddf1.groupby(ddf1.a > i).count(),
            pdf1.groupby(pdf1.a > i).count())
        assert eq(
            ddf1.groupby(ddf1.a > i).mean(),
            pdf1.groupby(pdf1.a > i).mean())

    # grouping by column 'a' in its various spellings
    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(
            ddf1.groupby(ddkey).b.nunique(),
            pdf1.groupby(pdkey).b.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(
            ddf1.groupby(ddkey).mean(),
            pdf1.groupby(pdkey).mean().astype(float))

    # identical groupby expressions must generate identical task keys
    assert sorted(ddf1.groupby('b').a.sum().dask) == \
           sorted(ddf1.groupby('b').a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == \
           sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
예제 #36
0
File: test_bag.py  Project: fortiema/dask
def test_pluck_with_default():
    """pluck on a missing index fails unless a default value is given."""
    words = db.from_sequence(['Hello', '', 'World'])
    # the empty string has no element 0, so a plain pluck must raise
    assert raises(IndexError, lambda: list(words.pluck(0)))
    # supplying a default fills in the missing element instead
    first_chars = list(words.pluck(0, None))
    assert first_chars == ['H', None, 'W']
예제 #37
0
 def check_raises(d, p, func):
     """Assert that calling *func* on both the dask object *d* (computed)
     and the pandas object *p* raises TypeError or ValueError."""
     expected = (TypeError, ValueError)

     def run_dask():
         return getattr(d, func)().compute()

     def run_pandas():
         return getattr(p, func)()

     assert raises(expected, run_dask)
     assert raises(expected, run_pandas)
예제 #38
0
def test_Series():
    """Column access yields a dd.Series; frame + scalar is rejected."""
    column = d.a
    assert isinstance(column, dd.Series)
    assert isinstance(column + 1, dd.Series)
    # adding a scalar to the whole frame is expected to fail here
    assert raises(Exception, lambda: d + 1)
예제 #39
0
def test_rename_index():
    """Renaming index labels on a dask DataFrame must raise ValueError."""
    def rename_by_index():
        return d.rename(index={0: 1})

    assert raises(ValueError, rename_by_index)
예제 #40
0
def test_attributes():
    """dir() exposes real columns; unknown attributes raise AttributeError."""
    visible = dir(d)
    assert 'a' in visible
    assert 'foo' not in visible
    assert raises(AttributeError, lambda: d.foo)
예제 #41
0
def test_set_index_raises_error_on_bad_input():
    """set_index with a list of columns is not implemented for dask frames."""
    pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]})
    ddf = dd.from_pandas(pdf, 2)

    def set_multi_index():
        return ddf.set_index(['a', 'b'])

    assert raises(NotImplementedError, set_multi_index)
예제 #42
0
def test_cant_fft_chunked_axis():
    """Taking an FFT along an axis that is split into chunks must raise."""
    chunked = da.from_array(nparr, chunks=(5, 5))
    # default axis (last) and axis 0 are both chunked here
    assert raises(ValueError, lambda: fft(chunked))
    assert raises(ValueError, lambda: fft(chunked, axis=0))
예제 #43
0
def test_rechunk_with_empty_input():
    """rechunk with an empty dict keeps chunks; an empty tuple is invalid."""
    arr = da.ones((24, 24), chunks=(4, 8))
    assert arr.rechunk(chunks={}).chunks == arr.chunks
    assert raises(ValueError, lambda: arr.rechunk(chunks=()))
예제 #44
0
def test_from_function_requires_block_args():
    """from_array without chunk information is expected to fail."""
    data = np.arange(10)

    def build_without_chunks():
        return from_array(data)

    assert raises(Exception, build_without_chunks)
예제 #45
0
def test_numpy_compat_is_notimplemented():
    """Binary ops mixing a dask array with a raw numpy array must raise."""
    raw = np.arange(10)
    darr = da.from_array(raw, chunks=5)
    assert raises(NotImplementedError, lambda: darr + raw)
예제 #46
0
def test_blockdims_from_blockshape():
    """Shape divides into chunk runs with a ragged tail; None shape is invalid."""
    expected = ((4, 4, 2), (3, 3, 3, 1))
    assert blockdims_from_blockshape((10, 10), (4, 3)) == expected
    assert raises(ValueError, lambda: blockdims_from_blockshape((10,), None))
예제 #47
0
def test_groupby_set_index():
    """groupby(..., as_index=False) is not implemented for dask frames."""
    pdf = tm.makeTimeDataFrame()
    ddf = dd.from_pandas(pdf, npartitions=2)

    def group_without_index():
        return ddf.groupby(pdf.index.month, as_index=False)

    assert raises(NotImplementedError, group_without_index)
예제 #48
0
def test_iloc_raises():
    """Positional .iloc slicing is not implemented on this dask frame."""
    def take_head():
        return d.iloc[:5]

    assert raises(NotImplementedError, take_head)
예제 #49
0
def test_split_apply_combine_on_series():
    """Groupby aggregations on dask frames/series match pandas.

    Covers grouping by a column name, by a Series, by a derived Series and
    by a boolean Series, for both SeriesGroupBy and DataFrameGroupBy
    aggregations; also checks graph-key determinism, KeyError behaviour for
    bad keys, and the task-name labels in the generated graphs.
    """
    pdf1 = pd.DataFrame(
        {
            'a': [1, 2, 6, 4, 4, 6, 4, 3, 7],
            'b': [4, 2, 7, 3, 3, 1, 1, 1, 2]
        },
        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf1, npartitions=3)
    ddf1 = ddf

    # group by column name / Series / derived Series; compare each dask
    # aggregation against its pandas equivalent
    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(
            ddf1.groupby(ddkey).a.nunique(),
            pdf1.groupby(pdkey).a.nunique())
        assert eq(ddf1.groupby(ddkey).a.size(), pdf1.groupby(pdkey).a.size())
        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.groupby(ddkey).a.var(ddof),
                pdf1.groupby(pdkey).a.var(ddof))
            assert eq(
                ddf1.groupby(ddkey).a.std(ddof),
                pdf1.groupby(pdkey).a.std(ddof))

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())
        assert eq(ddf1.groupby(ddkey).size(), pdf1.groupby(pdkey).size())
        for ddof in [0, 1, 2]:
            # dtypes may differ between dask and pandas var/std results
            assert eq(ddf1.groupby(ddkey).var(ddof),
                      pdf1.groupby(pdkey).var(ddof),
                      check_dtype=False)
            assert eq(ddf1.groupby(ddkey).std(ddof),
                      pdf1.groupby(pdkey).std(ddof),
                      check_dtype=False)

    # Series.groupby with an external key Series; result names differ,
    # hence check_names=False
    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(),
                  pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(),
                  pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(),
                  pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(),
                  pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(),
                  check_names=False)
        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.a.groupby(ddkey).var(ddof),
                pdf1.a.groupby(pdkey).var(ddof))
            assert eq(
                ddf1.a.groupby(ddkey).std(ddof),
                pdf1.a.groupby(pdkey).std(ddof))

    # group by boolean Series derived from each column, over a sweep of
    # thresholds so both group sizes vary
    for i in range(8):
        assert eq(
            ddf1.groupby(ddf1.b > i).a.sum(),
            pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.min(),
            pdf1.groupby(pdf1.b > i).a.min())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.max(),
            pdf1.groupby(pdf1.b > i).a.max())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.count(),
            pdf1.groupby(pdf1.b > i).a.count())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.mean(),
            pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.nunique(),
            pdf1.groupby(pdf1.b > i).a.nunique())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.size(),
            pdf1.groupby(pdf1.b > i).a.size())

        assert eq(
            ddf1.groupby(ddf1.a > i).b.sum(),
            pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.min(),
            pdf1.groupby(pdf1.a > i).b.min())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.max(),
            pdf1.groupby(pdf1.a > i).b.max())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.count(),
            pdf1.groupby(pdf1.a > i).b.count())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.mean(),
            pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.nunique(),
            pdf1.groupby(pdf1.a > i).b.nunique())
        assert eq(
            ddf1.groupby(ddf1.b > i).b.size(),
            pdf1.groupby(pdf1.b > i).b.size())

        assert eq(
            ddf1.groupby(ddf1.b > i).sum(),
            pdf1.groupby(pdf1.b > i).sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).min(),
            pdf1.groupby(pdf1.b > i).min())
        assert eq(
            ddf1.groupby(ddf1.b > i).max(),
            pdf1.groupby(pdf1.b > i).max())
        assert eq(
            ddf1.groupby(ddf1.b > i).count(),
            pdf1.groupby(pdf1.b > i).count())
        assert eq(
            ddf1.groupby(ddf1.b > i).mean(),
            pdf1.groupby(pdf1.b > i).mean())
        assert eq(
            ddf1.groupby(ddf1.b > i).size(),
            pdf1.groupby(pdf1.b > i).size())

        assert eq(
            ddf1.groupby(ddf1.a > i).sum(),
            pdf1.groupby(pdf1.a > i).sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).min(),
            pdf1.groupby(pdf1.a > i).min())
        assert eq(
            ddf1.groupby(ddf1.a > i).max(),
            pdf1.groupby(pdf1.a > i).max())
        assert eq(
            ddf1.groupby(ddf1.a > i).count(),
            pdf1.groupby(pdf1.a > i).count())
        assert eq(
            ddf1.groupby(ddf1.a > i).mean(),
            pdf1.groupby(pdf1.a > i).mean())
        assert eq(
            ddf1.groupby(ddf1.a > i).size(),
            pdf1.groupby(pdf1.a > i).size())

        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.groupby(ddf1.b > i).std(ddof),
                pdf1.groupby(pdf1.b > i).std(ddof))

    # group by column 'a' in its various forms (name, Series, derived,
    # boolean) and aggregate column 'b' / the whole frame
    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(
            ddf1.groupby(ddkey).b.nunique(),
            pdf1.groupby(pdkey).b.nunique())
        assert eq(ddf1.groupby(ddkey).b.size(), pdf1.groupby(pdkey).b.size())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(
            ddf1.groupby(ddkey).mean(),
            pdf1.groupby(pdkey).mean().astype(float))
        assert eq(ddf1.groupby(ddkey).size(), pdf1.groupby(pdkey).size())

        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.groupby(ddkey).b.std(ddof),
                pdf1.groupby(pdkey).b.std(ddof))

    # rebuilding the same expression twice must produce the same graph keys
    # (deterministic tokenization)
    assert (sorted(ddf1.groupby('b').a.sum().dask) == sorted(
        ddf1.groupby('b').a.sum().dask))
    assert (sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == sorted(
        ddf1.groupby(ddf1.a > 3).b.mean().dask))

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.var(), 'series-groupby-var')
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')
    assert_dask_graph(ddf1.groupby('b').a.size(), 'series-groupby-size')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean consists from sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
    assert_dask_graph(ddf1.groupby('b').size(), 'dataframe-groupby-size')
예제 #50
0
def test_series_resample_not_implemented():
    """Resampling at a frequency that does not evenly divide a day fails."""
    index = pd.date_range(start='20120102', periods=100, freq='T')
    series = pd.Series(range(len(index)), index=index)
    dseries = dd.from_pandas(series, npartitions=5)
    # 57 minutes does not evenly divide a day
    assert raises(NotImplementedError, lambda: resample(dseries, '57T'))
예제 #51
0
def test_reductions_frame(split_every):
    """DataFrame reductions on a dask frame match pandas.

    Runs sum/min/max/count/std/var/mean with the given ``split_every``
    fan-in (supplied by a pytest fixture/parametrize), along both axes,
    then verifies the task-name labels in the generated graphs.
    """
    # three partitions with known index divisions [0, 4, 9, 9]
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6]
        },
                               index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({
            'a': [4, 5, 6],
            'b': [3, 2, 1]
        },
                               index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({
            'a': [7, 8, 9],
            'b': [0, 0, 0]
        },
                               index=[9, 9, 9])
    }
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    # the concrete pandas frame serves as the expected reference
    pdf1 = ddf1.compute()

    assert_eq(ddf1.sum(split_every=split_every), pdf1.sum())
    assert_eq(ddf1.min(split_every=split_every), pdf1.min())
    assert_eq(ddf1.max(split_every=split_every), pdf1.max())
    assert_eq(ddf1.count(split_every=split_every), pdf1.count())
    assert_eq(ddf1.std(split_every=split_every), pdf1.std())
    assert_eq(ddf1.var(split_every=split_every), pdf1.var())
    assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0))
    assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0))
    assert_eq(ddf1.mean(split_every=split_every), pdf1.mean())

    # same reductions, explicitly along each axis (numeric and named forms)
    for axis in [0, 1, 'index', 'columns']:
        assert_eq(ddf1.sum(axis=axis, split_every=split_every),
                  pdf1.sum(axis=axis))
        assert_eq(ddf1.min(axis=axis, split_every=split_every),
                  pdf1.min(axis=axis))
        assert_eq(ddf1.max(axis=axis, split_every=split_every),
                  pdf1.max(axis=axis))
        assert_eq(ddf1.count(axis=axis, split_every=split_every),
                  pdf1.count(axis=axis))
        assert_eq(ddf1.std(axis=axis, split_every=split_every),
                  pdf1.std(axis=axis))
        assert_eq(ddf1.var(axis=axis, split_every=split_every),
                  pdf1.var(axis=axis))
        assert_eq(ddf1.std(axis=axis, ddof=0, split_every=split_every),
                  pdf1.std(axis=axis, ddof=0))
        assert_eq(ddf1.var(axis=axis, ddof=0, split_every=split_every),
                  pdf1.var(axis=axis, ddof=0))
        assert_eq(ddf1.mean(axis=axis, split_every=split_every),
                  pdf1.mean(axis=axis))

    # an unrecognised axis value must raise at compute time
    assert raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.min(split_every=split_every), 'dataframe-min')
    assert_dask_graph(ddf1.max(split_every=split_every), 'dataframe-max')
    assert_dask_graph(ddf1.count(split_every=split_every), 'dataframe-count')
    # std, var, mean consists from sum and count operations
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-count')

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1, split_every=split_every),
                      'dataframe-sum')
    assert_dask_graph(ddf1.min(axis=1, split_every=split_every),
                      'dataframe-min')
    assert_dask_graph(ddf1.max(axis=1, split_every=split_every),
                      'dataframe-max')
    assert_dask_graph(ddf1.count(axis=1, split_every=split_every),
                      'dataframe-count')
    assert_dask_graph(ddf1.std(axis=1, split_every=split_every),
                      'dataframe-std')
    assert_dask_graph(ddf1.var(axis=1, split_every=split_every),
                      'dataframe-var')
    assert_dask_graph(ddf1.mean(axis=1, split_every=split_every),
                      'dataframe-mean')
예제 #52
0
def test_iloc_raises():
    """Accessing .iloc on this object raises AttributeError."""
    def take_head():
        return d.iloc[:5]

    assert raises(AttributeError, take_head)
예제 #53
0
def test_exceptions_rise_to_top():
    """An exception raised inside a task propagates out of get()."""
    graph = {'x': 1, 'y': (bad, 'x')}
    assert raises(ValueError, lambda: get(graph, 'y'))
예제 #54
0
def test_broadcast_shapes():
    """broadcast_shapes follows numpy broadcasting and rejects mismatches."""
    assert broadcast_shapes((3, 4, 5), (4, 1), ()) == (3, 4, 5)
    assert broadcast_shapes((3, 1), (1, 4), (4,)) == (3, 4)
    assert broadcast_shapes((3, 1), (), (5, 6, 7, 1, 4)) == (5, 6, 7, 3, 4)
    # incompatible trailing dimensions must raise
    assert raises(ValueError, lambda: broadcast_shapes((3,), (3, 4)))
    assert raises(ValueError, lambda: broadcast_shapes((2, 3), (2, 3, 1)))
예제 #55
0
def test_frame_series_arithmetic_methods():
    pdf1 = pd.DataFrame({'A': np.arange(10),
                         'B': [np.nan, 1, 2, 3, 4] * 2,
                         'C': [np.nan] * 10,
                         'D': np.arange(10)},
                        index=list('abcdefghij'), columns=list('ABCD'))
    pdf2 = pd.DataFrame(np.random.randn(10, 4),
                        index=list('abcdefghjk'), columns=list('ABCX'))
    ps1 = pdf1.A
    ps2 = pdf2.A

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 2)
    ds1 = ddf1.A
    ds2 = ddf2.A

    s = dd.core.Scalar({('s', 0): 4}, 's')

    for l, r, el, er in [(ddf1, ddf2, pdf1, pdf2), (ds1, ds2, ps1, ps2),
                         (ddf1.repartition(['a', 'f', 'j']), ddf2, pdf1, pdf2),
                         (ds1.repartition(['a', 'b', 'f', 'j']), ds2, ps1, ps2),
                         (ddf1, ddf2.repartition(['a', 'k']), pdf1, pdf2),
                         (ds1, ds2.repartition(['a', 'b', 'd', 'h', 'k']), ps1, ps2),
                         (ddf1, 3, pdf1, 3), (ds1, 3, ps1, 3),
                         (ddf1, s, pdf1, 4), (ds1, s, ps1, 4)]:
        # l, r may be repartitioned, test whether repartition keeps original data
        assert eq(l, el)
        assert eq(r, er)

        assert eq(l.add(r, fill_value=0), el.add(er, fill_value=0))
        assert eq(l.sub(r, fill_value=0), el.sub(er, fill_value=0))
        assert eq(l.mul(r, fill_value=0), el.mul(er, fill_value=0))
        assert eq(l.div(r, fill_value=0), el.div(er, fill_value=0))
        assert eq(l.truediv(r, fill_value=0), el.truediv(er, fill_value=0))
        assert eq(l.floordiv(r, fill_value=1), el.floordiv(er, fill_value=1))
        assert eq(l.mod(r, fill_value=0), el.mod(er, fill_value=0))
        assert eq(l.pow(r, fill_value=0), el.pow(er, fill_value=0))

        assert eq(l.radd(r, fill_value=0), el.radd(er, fill_value=0))
        assert eq(l.rsub(r, fill_value=0), el.rsub(er, fill_value=0))
        assert eq(l.rmul(r, fill_value=0), el.rmul(er, fill_value=0))
        assert eq(l.rdiv(r, fill_value=0), el.rdiv(er, fill_value=0))
        assert eq(l.rtruediv(r, fill_value=0), el.rtruediv(er, fill_value=0))
        assert eq(l.rfloordiv(r, fill_value=1), el.rfloordiv(er, fill_value=1))
        assert eq(l.rmod(r, fill_value=0), el.rmod(er, fill_value=0))
        assert eq(l.rpow(r, fill_value=0), el.rpow(er, fill_value=0))

    for l, r, el, er in [(ddf1, ds2, pdf1, ps2), (ddf1, ddf2.X, pdf1, pdf2.X)]:
        assert eq(l, el)
        assert eq(r, er)

        # must specify axis=0 to add Series to each column
        # axis=1 is not supported (add to each row)
        assert eq(l.add(r, axis=0), el.add(er, axis=0))
        assert eq(l.sub(r, axis=0), el.sub(er, axis=0))
        assert eq(l.mul(r, axis=0), el.mul(er, axis=0))
        assert eq(l.div(r, axis=0), el.div(er, axis=0))
        assert eq(l.truediv(r, axis=0), el.truediv(er, axis=0))
        assert eq(l.floordiv(r, axis=0), el.floordiv(er, axis=0))
        assert eq(l.mod(r, axis=0), el.mod(er, axis=0))
        assert eq(l.pow(r, axis=0), el.pow(er, axis=0))

        assert eq(l.radd(r, axis=0), el.radd(er, axis=0))
        assert eq(l.rsub(r, axis=0), el.rsub(er, axis=0))
        assert eq(l.rmul(r, axis=0), el.rmul(er, axis=0))
        assert eq(l.rdiv(r, axis=0), el.rdiv(er, axis=0))
        assert eq(l.rtruediv(r, axis=0), el.rtruediv(er, axis=0))
        assert eq(l.rfloordiv(r, axis=0), el.rfloordiv(er, axis=0))
        assert eq(l.rmod(r, axis=0), el.rmod(er, axis=0))
        assert eq(l.rpow(r, axis=0), el.rpow(er, axis=0))

        assert raises(ValueError, lambda: l.add(r, axis=1))

    for l, r, el, er in [(ddf1, pdf2, pdf1, pdf2), (ddf1, ps2, pdf1, ps2)]:
        assert eq(l, el)
        assert eq(r, er)

        for axis in [0, 1, 'index', 'columns']:
            assert eq(l.add(r, axis=axis), el.add(er, axis=axis))
            assert eq(l.sub(r, axis=axis), el.sub(er, axis=axis))
            assert eq(l.mul(r, axis=axis), el.mul(er, axis=axis))
            assert eq(l.div(r, axis=axis), el.div(er, axis=axis))
            assert eq(l.truediv(r, axis=axis), el.truediv(er, axis=axis))
            assert eq(l.floordiv(r, axis=axis), el.floordiv(er, axis=axis))
            assert eq(l.mod(r, axis=axis), el.mod(er, axis=axis))
            assert eq(l.pow(r, axis=axis), el.pow(er, axis=axis))

            assert eq(l.radd(r, axis=axis), el.radd(er, axis=axis))
            assert eq(l.rsub(r, axis=axis), el.rsub(er, axis=axis))
            assert eq(l.rmul(r, axis=axis), el.rmul(er, axis=axis))
            assert eq(l.rdiv(r, axis=axis), el.rdiv(er, axis=axis))
            assert eq(l.rtruediv(r, axis=axis), el.rtruediv(er, axis=axis))
            assert eq(l.rfloordiv(r, axis=axis), el.rfloordiv(er, axis=axis))
            assert eq(l.rmod(r, axis=axis), el.rmod(er, axis=axis))
            assert eq(l.rpow(r, axis=axis), el.rpow(er, axis=axis))