Example #1
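The snippets below are taken from the dask.dataframe test suite and are shown without their module-level setup. A minimal sketch of the imports that would make them runnable is given here, under the assumption that the remaining helpers (eq, raises, assert_dask_graph, the scipy flag, and the split_every parameter) are fixtures and utilities defined in dask's own test modules of the matching version:

import warnings

import numpy as np
import pandas as pd
import pytest

import dask.dataframe as dd
from dask.dataframe.utils import assert_eq, make_meta

# Assumed: eq, raises, assert_dask_graph, scipy, and split_every are
# provided by the dask test modules (e.g. dask/dataframe/tests/
# test_dataframe.py) and are not standard library or dask public API.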
def test_reductions(split_every):
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6]
        },
                               index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({
            'a': [4, 5, 6],
            'b': [3, 2, 1]
        },
                               index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({
            'a': [7, 8, 9],
            'b': [0, 0, 0]
        },
                               index=[9, 9, 9])
    }
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())
        assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
        assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
        assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        assert_eq(dds.sum(skipna=False, split_every=split_every),
                  pds.sum(skipna=False))
        assert_eq(dds.prod(skipna=False, split_every=split_every),
                  pds.prod(skipna=False))
        assert_eq(dds.min(skipna=False, split_every=split_every),
                  pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every),
                  pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every),
                  pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every),
                  pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every),
                  pds.sem(skipna=False))
        assert_eq(dds.std(skipna=False, ddof=0, split_every=split_every),
                  pds.std(skipna=False, ddof=0))
        assert_eq(dds.var(skipna=False, ddof=0, split_every=split_every),
                  pds.var(skipna=False, ddof=0))
        assert_eq(dds.sem(skipna=False, ddof=0, split_every=split_every),
                  pds.sem(skipna=False, ddof=0))
        assert_eq(dds.mean(skipna=False, split_every=split_every),
                  pds.mean(skipna=False))

    assert_dask_graph(ddf1.b.sum(split_every=split_every), 'series-sum')
    assert_dask_graph(ddf1.b.prod(split_every=split_every), 'series-prod')
    assert_dask_graph(ddf1.b.min(split_every=split_every), 'series-min')
    assert_dask_graph(ddf1.b.max(split_every=split_every), 'series-max')
    assert_dask_graph(ddf1.b.count(split_every=split_every), 'series-count')
    assert_dask_graph(ddf1.b.std(split_every=split_every), 'series-std')
    assert_dask_graph(ddf1.b.var(split_every=split_every), 'series-var')
    assert_dask_graph(ddf1.b.sem(split_every=split_every), 'series-sem')
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every),
                      'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every),
                      'series-var')
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every),
                      'series-sem')
    assert_dask_graph(ddf1.b.mean(split_every=split_every), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every),
                      'drop-duplicates')

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every),
              pd.notnull(pdf1.index).sum())
Example #2
def test_split_apply_combine_on_series():
    pdf1 = pd.DataFrame({'a': [1, 2, 6, 4, 4, 6, 4, 3, 7],
                         'b': [4, 2, 7, 3, 3, 1, 1, 1, 2]},
                        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf1, npartitions=3)
    ddf1 = ddf

    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(ddf1.groupby(ddkey).a.nunique(), pdf1.groupby(pdkey).a.nunique())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).a.var(ddof),
                      pdf1.groupby(pdkey).a.var(ddof))
            assert eq(ddf1.groupby(ddkey).a.std(ddof),
                      pdf1.groupby(pdkey).a.std(ddof))

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).var(ddof),
                      pdf1.groupby(pdkey).var(ddof), check_dtype=False)
            assert eq(ddf1.groupby(ddkey).std(ddof),
                      pdf1.groupby(pdkey).std(ddof), check_dtype=False)

    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(), pdf1.a.groupby(pdkey).nunique(), check_names=False)
        for ddof in [0, 1, 2]:
            assert eq(ddf1.a.groupby(ddkey).var(ddof),
                      pdf1.a.groupby(pdkey).var(ddof))
            assert eq(ddf1.a.groupby(ddkey).std(ddof),
                      pdf1.a.groupby(pdkey).std(ddof))

    for i in range(8):
        assert eq(ddf1.groupby(ddf1.b > i).a.sum(), pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(ddf1.groupby(ddf1.b > i).a.min(), pdf1.groupby(pdf1.b > i).a.min())
        assert eq(ddf1.groupby(ddf1.b > i).a.max(), pdf1.groupby(pdf1.b > i).a.max())
        assert eq(ddf1.groupby(ddf1.b > i).a.count(), pdf1.groupby(pdf1.b > i).a.count())
        assert eq(ddf1.groupby(ddf1.b > i).a.mean(), pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(ddf1.groupby(ddf1.b > i).a.nunique(), pdf1.groupby(pdf1.b > i).a.nunique())

        assert eq(ddf1.groupby(ddf1.a > i).b.sum(), pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(ddf1.groupby(ddf1.a > i).b.min(), pdf1.groupby(pdf1.a > i).b.min())
        assert eq(ddf1.groupby(ddf1.a > i).b.max(), pdf1.groupby(pdf1.a > i).b.max())
        assert eq(ddf1.groupby(ddf1.a > i).b.count(), pdf1.groupby(pdf1.a > i).b.count())
        assert eq(ddf1.groupby(ddf1.a > i).b.mean(), pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(ddf1.groupby(ddf1.a > i).b.nunique(), pdf1.groupby(pdf1.a > i).b.nunique())

        assert eq(ddf1.groupby(ddf1.b > i).sum(), pdf1.groupby(pdf1.b > i).sum())
        assert eq(ddf1.groupby(ddf1.b > i).min(), pdf1.groupby(pdf1.b > i).min())
        assert eq(ddf1.groupby(ddf1.b > i).max(), pdf1.groupby(pdf1.b > i).max())
        assert eq(ddf1.groupby(ddf1.b > i).count(), pdf1.groupby(pdf1.b > i).count())
        assert eq(ddf1.groupby(ddf1.b > i).mean(), pdf1.groupby(pdf1.b > i).mean())

        assert eq(ddf1.groupby(ddf1.a > i).sum(), pdf1.groupby(pdf1.a > i).sum())
        assert eq(ddf1.groupby(ddf1.a > i).min(), pdf1.groupby(pdf1.a > i).min())
        assert eq(ddf1.groupby(ddf1.a > i).max(), pdf1.groupby(pdf1.a > i).max())
        assert eq(ddf1.groupby(ddf1.a > i).count(), pdf1.groupby(pdf1.a > i).count())
        assert eq(ddf1.groupby(ddf1.a > i).mean(), pdf1.groupby(pdf1.a > i).mean())

        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddf1.b > i).std(ddof),
                      pdf1.groupby(pdf1.b > i).std(ddof))

    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(ddf1.groupby(ddkey).b.nunique(), pdf1.groupby(pdkey).b.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean().astype(float))

        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).b.std(ddof),
                      pdf1.groupby(pdkey).b.std(ddof))

    assert sorted(ddf1.groupby('b').a.sum().dask) == \
           sorted(ddf1.groupby('b').a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == \
           sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.var(), 'series-groupby-var')
    # mean consists of sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean consists of sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
Example #3
def test_reductions_frame(split_every):
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 3],
            'b': [4, 5, 6]
        },
                               index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({
            'a': [4, 5, 6],
            'b': [3, 2, 1]
        },
                               index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({
            'a': [7, 8, 9],
            'b': [0, 0, 0]
        },
                               index=[9, 9, 9])
    }
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    assert_eq(ddf1.sum(split_every=split_every), pdf1.sum())
    assert_eq(ddf1.prod(split_every=split_every), pdf1.prod())
    assert_eq(ddf1.min(split_every=split_every), pdf1.min())
    assert_eq(ddf1.max(split_every=split_every), pdf1.max())
    assert_eq(ddf1.count(split_every=split_every), pdf1.count())
    assert_eq(ddf1.std(split_every=split_every), pdf1.std())
    assert_eq(ddf1.var(split_every=split_every), pdf1.var())
    assert_eq(ddf1.sem(split_every=split_every), pdf1.sem())
    assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0))
    assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0))
    assert_eq(ddf1.sem(ddof=0, split_every=split_every), pdf1.sem(ddof=0))
    assert_eq(ddf1.mean(split_every=split_every), pdf1.mean())

    for axis in [0, 1, 'index', 'columns']:
        assert_eq(ddf1.sum(axis=axis, split_every=split_every),
                  pdf1.sum(axis=axis))
        assert_eq(ddf1.prod(axis=axis, split_every=split_every),
                  pdf1.prod(axis=axis))
        assert_eq(ddf1.min(axis=axis, split_every=split_every),
                  pdf1.min(axis=axis))
        assert_eq(ddf1.max(axis=axis, split_every=split_every),
                  pdf1.max(axis=axis))
        assert_eq(ddf1.count(axis=axis, split_every=split_every),
                  pdf1.count(axis=axis))
        assert_eq(ddf1.std(axis=axis, split_every=split_every),
                  pdf1.std(axis=axis))
        assert_eq(ddf1.var(axis=axis, split_every=split_every),
                  pdf1.var(axis=axis))
        assert_eq(ddf1.sem(axis=axis, split_every=split_every),
                  pdf1.sem(axis=axis))
        assert_eq(ddf1.std(axis=axis, ddof=0, split_every=split_every),
                  pdf1.std(axis=axis, ddof=0))
        assert_eq(ddf1.var(axis=axis, ddof=0, split_every=split_every),
                  pdf1.var(axis=axis, ddof=0))
        assert_eq(ddf1.sem(axis=axis, ddof=0, split_every=split_every),
                  pdf1.sem(axis=axis, ddof=0))
        assert_eq(ddf1.mean(axis=axis, split_every=split_every),
                  pdf1.mean(axis=axis))

    pytest.raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.prod(split_every=split_every), 'dataframe-prod')
    assert_dask_graph(ddf1.min(split_every=split_every), 'dataframe-min')
    assert_dask_graph(ddf1.max(split_every=split_every), 'dataframe-max')
    assert_dask_graph(ddf1.count(split_every=split_every), 'dataframe-count')
    # std, var, sem, and mean consist of sum and count operations
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.sem(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.sem(split_every=split_every), 'dataframe-count')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-count')

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1, split_every=split_every),
                      'dataframe-sum')
    assert_dask_graph(ddf1.prod(axis=1, split_every=split_every),
                      'dataframe-prod')
    assert_dask_graph(ddf1.min(axis=1, split_every=split_every),
                      'dataframe-min')
    assert_dask_graph(ddf1.max(axis=1, split_every=split_every),
                      'dataframe-max')
    assert_dask_graph(ddf1.count(axis=1, split_every=split_every),
                      'dataframe-count')
    assert_dask_graph(ddf1.std(axis=1, split_every=split_every),
                      'dataframe-std')
    assert_dask_graph(ddf1.var(axis=1, split_every=split_every),
                      'dataframe-var')
    assert_dask_graph(ddf1.sem(axis=1, split_every=split_every),
                      'dataframe-sem')
    assert_dask_graph(ddf1.mean(axis=1, split_every=split_every),
                      'dataframe-mean')
Example #4
def test_reductions():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)
        assert eq(dds.sum(), pds.sum())
        assert eq(dds.min(), pds.min())
        assert eq(dds.max(), pds.max())
        assert eq(dds.count(), pds.count())
        assert eq(dds.std(), pds.std())
        assert eq(dds.var(), pds.var())
        assert eq(dds.std(ddof=0), pds.std(ddof=0))
        assert eq(dds.var(ddof=0), pds.var(ddof=0))
        assert eq(dds.mean(), pds.mean())
        assert eq(dds.nunique(), pds.nunique())
        assert eq(dds.nbytes, pds.nbytes)

        assert eq(dds.sum(skipna=False), pds.sum(skipna=False))
        assert eq(dds.min(skipna=False), pds.min(skipna=False))
        assert eq(dds.max(skipna=False), pds.max(skipna=False))
        assert eq(dds.std(skipna=False), pds.std(skipna=False))
        assert eq(dds.var(skipna=False), pds.var(skipna=False))
        assert eq(dds.std(skipna=False, ddof=0), pds.std(skipna=False, ddof=0))
        assert eq(dds.var(skipna=False, ddof=0), pds.var(skipna=False, ddof=0))
        assert eq(dds.mean(skipna=False), pds.mean(skipna=False))

    assert_dask_graph(ddf1.b.sum(), 'series-sum')
    assert_dask_graph(ddf1.b.min(), 'series-min')
    assert_dask_graph(ddf1.b.max(), 'series-max')
    assert_dask_graph(ddf1.b.count(), 'series-count')
    assert_dask_graph(ddf1.b.std(), 'series-std')
    assert_dask_graph(ddf1.b.var(), 'series-var')
    assert_dask_graph(ddf1.b.std(ddof=0), 'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0), 'series-var')
    assert_dask_graph(ddf1.b.mean(), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(), 'drop-duplicates')

    eq(ddf1.index.min(), pdf1.index.min())
    eq(ddf1.index.max(), pdf1.index.max())
Example #5
def test_reductions_frame():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    assert eq(ddf1.sum(), pdf1.sum())
    assert eq(ddf1.min(), pdf1.min())
    assert eq(ddf1.max(), pdf1.max())
    assert eq(ddf1.count(), pdf1.count())
    assert eq(ddf1.std(), pdf1.std())
    assert eq(ddf1.var(), pdf1.var())
    assert eq(ddf1.std(ddof=0), pdf1.std(ddof=0))
    assert eq(ddf1.var(ddof=0), pdf1.var(ddof=0))
    assert eq(ddf1.mean(), pdf1.mean())

    for axis in [0, 1, 'index', 'columns']:
        assert eq(ddf1.sum(axis=axis), pdf1.sum(axis=axis))
        assert eq(ddf1.min(axis=axis), pdf1.min(axis=axis))
        assert eq(ddf1.max(axis=axis), pdf1.max(axis=axis))
        assert eq(ddf1.count(axis=axis), pdf1.count(axis=axis))
        assert eq(ddf1.std(axis=axis), pdf1.std(axis=axis))
        assert eq(ddf1.var(axis=axis), pdf1.var(axis=axis))
        assert eq(ddf1.std(axis=axis, ddof=0), pdf1.std(axis=axis, ddof=0))
        assert eq(ddf1.var(axis=axis, ddof=0), pdf1.var(axis=axis, ddof=0))
        assert eq(ddf1.mean(axis=axis), pdf1.mean(axis=axis))

    assert raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(), 'dataframe-sum')
    assert_dask_graph(ddf1.min(), 'dataframe-min')
    assert_dask_graph(ddf1.max(), 'dataframe-max')
    assert_dask_graph(ddf1.count(), 'dataframe-count')
    # std, var, and mean consist of sum and count operations
    assert_dask_graph(ddf1.std(), 'dataframe-sum')
    assert_dask_graph(ddf1.std(), 'dataframe-count')
    assert_dask_graph(ddf1.var(), 'dataframe-sum')
    assert_dask_graph(ddf1.var(), 'dataframe-count')
    assert_dask_graph(ddf1.mean(), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(), 'dataframe-count')

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1), 'dataframe-sum')
    assert_dask_graph(ddf1.min(axis=1), 'dataframe-min')
    assert_dask_graph(ddf1.max(axis=1), 'dataframe-max')
    assert_dask_graph(ddf1.count(axis=1), 'dataframe-count')
    assert_dask_graph(ddf1.std(axis=1), 'dataframe-std')
    assert_dask_graph(ddf1.var(axis=1), 'dataframe-var')
    assert_dask_graph(ddf1.mean(axis=1), 'dataframe-mean')
Example #6
def test_reductions(split_every):
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())
        assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
        assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
        assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        assert_eq(dds.sum(skipna=False, split_every=split_every),
                  pds.sum(skipna=False))
        assert_eq(dds.prod(skipna=False, split_every=split_every),
                  pds.prod(skipna=False))
        assert_eq(dds.min(skipna=False, split_every=split_every),
                  pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every),
                  pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every),
                  pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every),
                  pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every),
                  pds.sem(skipna=False))
        assert_eq(dds.std(skipna=False, ddof=0, split_every=split_every),
                  pds.std(skipna=False, ddof=0))
        assert_eq(dds.var(skipna=False, ddof=0, split_every=split_every),
                  pds.var(skipna=False, ddof=0))
        assert_eq(dds.sem(skipna=False, ddof=0, split_every=split_every),
                  pds.sem(skipna=False, ddof=0))
        assert_eq(dds.mean(skipna=False, split_every=split_every),
                  pds.mean(skipna=False))

    assert_dask_graph(ddf1.b.sum(split_every=split_every), 'series-sum')
    assert_dask_graph(ddf1.b.prod(split_every=split_every), 'series-prod')
    assert_dask_graph(ddf1.b.min(split_every=split_every), 'series-min')
    assert_dask_graph(ddf1.b.max(split_every=split_every), 'series-max')
    assert_dask_graph(ddf1.b.count(split_every=split_every), 'series-count')
    assert_dask_graph(ddf1.b.std(split_every=split_every), 'series-std')
    assert_dask_graph(ddf1.b.var(split_every=split_every), 'series-var')
    assert_dask_graph(ddf1.b.sem(split_every=split_every), 'series-sem')
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every), 'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every), 'series-var')
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every), 'series-sem')
    assert_dask_graph(ddf1.b.mean(split_every=split_every), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every), 'drop-duplicates')

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every), pd.notnull(pdf1.index).sum())
Example #7
def test_split_apply_combine_on_series():
    pdf1 = pd.DataFrame(
        {
            'a': [1, 2, 6, 4, 4, 6, 4, 3, 7],
            'b': [4, 2, 7, 3, 3, 1, 1, 1, 2]
        },
        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf1, npartitions=3)
    ddf1 = ddf

    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(
            ddf1.groupby(ddkey).a.nunique(),
            pdf1.groupby(pdkey).a.nunique())
        assert eq(ddf1.groupby(ddkey).a.size(), pdf1.groupby(pdkey).a.size())
        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.groupby(ddkey).a.var(ddof),
                pdf1.groupby(pdkey).a.var(ddof))
            assert eq(
                ddf1.groupby(ddkey).a.std(ddof),
                pdf1.groupby(pdkey).a.std(ddof))

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())
        assert eq(ddf1.groupby(ddkey).size(), pdf1.groupby(pdkey).size())
        for ddof in [0, 1, 2]:
            assert eq(ddf1.groupby(ddkey).var(ddof),
                      pdf1.groupby(pdkey).var(ddof),
                      check_dtype=False)
            assert eq(ddf1.groupby(ddkey).std(ddof),
                      pdf1.groupby(pdkey).std(ddof),
                      check_dtype=False)

    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(),
                  pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(),
                  pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(),
                  pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(),
                  pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(),
                  check_names=False)
        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.a.groupby(ddkey).var(ddof),
                pdf1.a.groupby(pdkey).var(ddof))
            assert eq(
                ddf1.a.groupby(ddkey).std(ddof),
                pdf1.a.groupby(pdkey).std(ddof))

    for i in range(8):
        assert eq(
            ddf1.groupby(ddf1.b > i).a.sum(),
            pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.min(),
            pdf1.groupby(pdf1.b > i).a.min())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.max(),
            pdf1.groupby(pdf1.b > i).a.max())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.count(),
            pdf1.groupby(pdf1.b > i).a.count())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.mean(),
            pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.nunique(),
            pdf1.groupby(pdf1.b > i).a.nunique())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.size(),
            pdf1.groupby(pdf1.b > i).a.size())

        assert eq(
            ddf1.groupby(ddf1.a > i).b.sum(),
            pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.min(),
            pdf1.groupby(pdf1.a > i).b.min())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.max(),
            pdf1.groupby(pdf1.a > i).b.max())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.count(),
            pdf1.groupby(pdf1.a > i).b.count())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.mean(),
            pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.nunique(),
            pdf1.groupby(pdf1.a > i).b.nunique())
        assert eq(
            ddf1.groupby(ddf1.b > i).b.size(),
            pdf1.groupby(pdf1.b > i).b.size())

        assert eq(
            ddf1.groupby(ddf1.b > i).sum(),
            pdf1.groupby(pdf1.b > i).sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).min(),
            pdf1.groupby(pdf1.b > i).min())
        assert eq(
            ddf1.groupby(ddf1.b > i).max(),
            pdf1.groupby(pdf1.b > i).max())
        assert eq(
            ddf1.groupby(ddf1.b > i).count(),
            pdf1.groupby(pdf1.b > i).count())
        assert eq(
            ddf1.groupby(ddf1.b > i).mean(),
            pdf1.groupby(pdf1.b > i).mean())
        assert eq(
            ddf1.groupby(ddf1.b > i).size(),
            pdf1.groupby(pdf1.b > i).size())

        assert eq(
            ddf1.groupby(ddf1.a > i).sum(),
            pdf1.groupby(pdf1.a > i).sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).min(),
            pdf1.groupby(pdf1.a > i).min())
        assert eq(
            ddf1.groupby(ddf1.a > i).max(),
            pdf1.groupby(pdf1.a > i).max())
        assert eq(
            ddf1.groupby(ddf1.a > i).count(),
            pdf1.groupby(pdf1.a > i).count())
        assert eq(
            ddf1.groupby(ddf1.a > i).mean(),
            pdf1.groupby(pdf1.a > i).mean())
        assert eq(
            ddf1.groupby(ddf1.a > i).size(),
            pdf1.groupby(pdf1.a > i).size())

        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.groupby(ddf1.b > i).std(ddof),
                pdf1.groupby(pdf1.b > i).std(ddof))

    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(
            ddf1.groupby(ddkey).b.nunique(),
            pdf1.groupby(pdkey).b.nunique())
        assert eq(ddf1.groupby(ddkey).b.size(), pdf1.groupby(pdkey).b.size())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(
            ddf1.groupby(ddkey).mean(),
            pdf1.groupby(pdkey).mean().astype(float))
        assert eq(ddf1.groupby(ddkey).size(), pdf1.groupby(pdkey).size())

        for ddof in [0, 1, 2]:
            assert eq(
                ddf1.groupby(ddkey).b.std(ddof),
                pdf1.groupby(pdkey).b.std(ddof))

    assert (sorted(ddf1.groupby('b').a.sum().dask) == sorted(
        ddf1.groupby('b').a.sum().dask))
    assert (sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == sorted(
        ddf1.groupby(ddf1.a > 3).b.mean().dask))

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.var(), 'series-groupby-var')
    # mean consists of sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')
    assert_dask_graph(ddf1.groupby('b').a.size(), 'series-groupby-size')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean consists of sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
    assert_dask_graph(ddf1.groupby('b').size(), 'dataframe-groupby-size')
Example #8
def test_reductions():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)
        assert eq(dds.sum(), pds.sum())
        assert eq(dds.min(), pds.min())
        assert eq(dds.max(), pds.max())
        assert eq(dds.count(), pds.count())
        assert eq(dds.std(), pds.std())
        assert eq(dds.var(), pds.var())
        assert eq(dds.std(ddof=0), pds.std(ddof=0))
        assert eq(dds.var(ddof=0), pds.var(ddof=0))
        assert eq(dds.mean(), pds.mean())
        assert eq(dds.nunique(), pds.nunique())
        assert eq(dds.nbytes, pds.nbytes)

        assert eq(dds.sum(skipna=False), pds.sum(skipna=False))
        assert eq(dds.min(skipna=False), pds.min(skipna=False))
        assert eq(dds.max(skipna=False), pds.max(skipna=False))
        assert eq(dds.std(skipna=False), pds.std(skipna=False))
        assert eq(dds.var(skipna=False), pds.var(skipna=False))
        assert eq(dds.std(skipna=False, ddof=0), pds.std(skipna=False, ddof=0))
        assert eq(dds.var(skipna=False, ddof=0), pds.var(skipna=False, ddof=0))
        assert eq(dds.mean(skipna=False), pds.mean(skipna=False))

    assert_dask_graph(ddf1.b.sum(), 'series-sum')
    assert_dask_graph(ddf1.b.min(), 'series-min')
    assert_dask_graph(ddf1.b.max(), 'series-max')
    assert_dask_graph(ddf1.b.count(), 'series-count')
    assert_dask_graph(ddf1.b.std(), 'series-std(ddof=1)')
    assert_dask_graph(ddf1.b.var(), 'series-var(ddof=1)')
    assert_dask_graph(ddf1.b.std(ddof=0), 'series-std(ddof=0)')
    assert_dask_graph(ddf1.b.var(ddof=0), 'series-var(ddof=0)')
    assert_dask_graph(ddf1.b.mean(), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(), 'drop-duplicates')
Example #9
def test_reductions_frame():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    assert eq(ddf1.sum(), pdf1.sum())
    assert eq(ddf1.min(), pdf1.min())
    assert eq(ddf1.max(), pdf1.max())
    assert eq(ddf1.count(), pdf1.count())
    assert eq(ddf1.std(), pdf1.std())
    assert eq(ddf1.var(), pdf1.var())
    assert eq(ddf1.std(ddof=0), pdf1.std(ddof=0))
    assert eq(ddf1.var(ddof=0), pdf1.var(ddof=0))
    assert eq(ddf1.mean(), pdf1.mean())

    for axis in [0, 1, 'index', 'columns']:
        assert eq(ddf1.sum(axis=axis), pdf1.sum(axis=axis))
        assert eq(ddf1.min(axis=axis), pdf1.min(axis=axis))
        assert eq(ddf1.max(axis=axis), pdf1.max(axis=axis))
        assert eq(ddf1.count(axis=axis), pdf1.count(axis=axis))
        assert eq(ddf1.std(axis=axis), pdf1.std(axis=axis))
        assert eq(ddf1.var(axis=axis), pdf1.var(axis=axis))
        assert eq(ddf1.std(axis=axis, ddof=0), pdf1.std(axis=axis, ddof=0))
        assert eq(ddf1.var(axis=axis, ddof=0), pdf1.var(axis=axis, ddof=0))
        assert eq(ddf1.mean(axis=axis), pdf1.mean(axis=axis))

    assert raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(), 'dataframe-sum')
    assert_dask_graph(ddf1.min(), 'dataframe-min')
    assert_dask_graph(ddf1.max(), 'dataframe-max')
    assert_dask_graph(ddf1.count(), 'dataframe-count')
    # std, var, and mean consist of sum and count operations
    assert_dask_graph(ddf1.std(), 'dataframe-sum')
    assert_dask_graph(ddf1.std(), 'dataframe-count')
    assert_dask_graph(ddf1.var(), 'dataframe-sum')
    assert_dask_graph(ddf1.var(), 'dataframe-count')
    assert_dask_graph(ddf1.mean(), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(), 'dataframe-count')

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1), 'dataframe-sum(axis=1)')
    assert_dask_graph(ddf1.min(axis=1), 'dataframe-min(axis=1)')
    assert_dask_graph(ddf1.max(axis=1), 'dataframe-max(axis=1)')
    assert_dask_graph(ddf1.count(axis=1), 'dataframe-count(axis=1)')
    assert_dask_graph(ddf1.std(axis=1), 'dataframe-std(axis=1, ddof=1)')
    assert_dask_graph(ddf1.var(axis=1), 'dataframe-var(axis=1, ddof=1)')
    assert_dask_graph(ddf1.mean(axis=1), 'dataframe-mean(axis=1)')
Example #10
def test_split_apply_combine_on_series():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 6], 'b': [4, 2, 7]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 4, 6], 'b': [3, 3, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [4, 3, 7], 'b': [1, 1, 3]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(ddf1.groupby(ddkey).a.nunique(), pdf1.groupby(pdkey).a.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())

    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(), pdf1.a.groupby(pdkey).nunique(), check_names=False)

    for i in range(8):
        assert eq(ddf1.groupby(ddf1.b > i).a.sum(), pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(ddf1.groupby(ddf1.b > i).a.min(), pdf1.groupby(pdf1.b > i).a.min())
        assert eq(ddf1.groupby(ddf1.b > i).a.max(), pdf1.groupby(pdf1.b > i).a.max())
        assert eq(ddf1.groupby(ddf1.b > i).a.count(), pdf1.groupby(pdf1.b > i).a.count())
        assert eq(ddf1.groupby(ddf1.b > i).a.mean(), pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(ddf1.groupby(ddf1.b > i).a.nunique(), pdf1.groupby(pdf1.b > i).a.nunique())

        assert eq(ddf1.groupby(ddf1.a > i).b.sum(), pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(ddf1.groupby(ddf1.a > i).b.min(), pdf1.groupby(pdf1.a > i).b.min())
        assert eq(ddf1.groupby(ddf1.a > i).b.max(), pdf1.groupby(pdf1.a > i).b.max())
        assert eq(ddf1.groupby(ddf1.a > i).b.count(), pdf1.groupby(pdf1.a > i).b.count())
        assert eq(ddf1.groupby(ddf1.a > i).b.mean(), pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(ddf1.groupby(ddf1.a > i).b.nunique(), pdf1.groupby(pdf1.a > i).b.nunique())

        assert eq(ddf1.groupby(ddf1.b > i).sum(), pdf1.groupby(pdf1.b > i).sum())
        assert eq(ddf1.groupby(ddf1.b > i).min(), pdf1.groupby(pdf1.b > i).min())
        assert eq(ddf1.groupby(ddf1.b > i).max(), pdf1.groupby(pdf1.b > i).max())
        assert eq(ddf1.groupby(ddf1.b > i).count(), pdf1.groupby(pdf1.b > i).count())
        assert eq(ddf1.groupby(ddf1.b > i).mean(), pdf1.groupby(pdf1.b > i).mean())

        assert eq(ddf1.groupby(ddf1.a > i).sum(), pdf1.groupby(pdf1.a > i).sum())
        assert eq(ddf1.groupby(ddf1.a > i).min(), pdf1.groupby(pdf1.a > i).min())
        assert eq(ddf1.groupby(ddf1.a > i).max(), pdf1.groupby(pdf1.a > i).max())
        assert eq(ddf1.groupby(ddf1.a > i).count(), pdf1.groupby(pdf1.a > i).count())
        assert eq(ddf1.groupby(ddf1.a > i).mean(), pdf1.groupby(pdf1.a > i).mean())

    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(ddf1.groupby(ddkey).b.nunique(), pdf1.groupby(pdkey).b.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean().astype(float))

    assert sorted(ddf1.groupby('b').a.sum().dask) == \
           sorted(ddf1.groupby('b').a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == \
           sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # test raises with incorrect key
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    # mean consists of sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean consists of sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
Example #11
def test_reductions(split_every):
    dsk = {
        ("x", 0): pd.DataFrame(
            {"a": [1, 2, 3], "b": [4, 5, 6], "c": [True, True, False]}, index=[0, 1, 3]
        ),
        ("x", 1): pd.DataFrame(
            {"a": [4, 5, 6], "b": [3, 2, 1], "c": [False, False, False]},
            index=[5, 6, 8],
        ),
        ("x", 2): pd.DataFrame(
            {
                "a": [13094304034, 3489385935, 100006774],
                "b": [0, 0, 0],
                "c": [True, True, True],
            },
            index=[9, 9, 9],
        ),
    }
    meta = make_meta({"a": "i8", "b": "i8", "c": "bool"}, index=pd.Index([], "i8"))
    ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [
        (ddf1.a, pdf1.a),
        (ddf1.b, pdf1.b),
        (ddf1.c, pdf1.c),
        (ddf1["a"], pdf1["a"]),
        (ddf1["b"], pdf1["b"]),
        (nands1, nans1),
        (nands2, nans2),
        (nands3, nans3),
        (boolds, bools),
    ]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())

        if scipy:
            # pandas uses unbiased skew, need to correct for that
            n = pds.shape[0]
            bias_factor = (n * (n - 1)) ** 0.5 / (n - 2)
            assert_eq(dds.skew(), pds.skew() / bias_factor)

        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())

        with warnings.catch_warnings():
            # dask.dataframe should probably filter this, to match pandas, but
            # it seems quite difficult.
            warnings.simplefilter("ignore", RuntimeWarning)
            assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
            assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
            assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        assert_eq(dds.sum(skipna=False, split_every=split_every), pds.sum(skipna=False))
        assert_eq(
            dds.prod(skipna=False, split_every=split_every), pds.prod(skipna=False)
        )
        assert_eq(dds.min(skipna=False, split_every=split_every), pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every), pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every), pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every), pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every), pds.sem(skipna=False))
        assert_eq(
            dds.std(skipna=False, ddof=0, split_every=split_every),
            pds.std(skipna=False, ddof=0),
        )
        assert_eq(
            dds.var(skipna=False, ddof=0, split_every=split_every),
            pds.var(skipna=False, ddof=0),
        )
        assert_eq(
            dds.sem(skipna=False, ddof=0, split_every=split_every),
            pds.sem(skipna=False, ddof=0),
        )
        assert_eq(
            dds.mean(skipna=False, split_every=split_every), pds.mean(skipna=False)
        )

    assert_dask_graph(ddf1.b.sum(split_every=split_every), "series-sum")
    assert_dask_graph(ddf1.b.prod(split_every=split_every), "series-prod")
    assert_dask_graph(ddf1.b.min(split_every=split_every), "series-min")
    assert_dask_graph(ddf1.b.max(split_every=split_every), "series-max")
    assert_dask_graph(ddf1.b.count(split_every=split_every), "series-count")
    assert_dask_graph(ddf1.b.std(split_every=split_every), "series-std")
    assert_dask_graph(ddf1.b.var(split_every=split_every), "series-var")
    assert_dask_graph(ddf1.b.sem(split_every=split_every), "series-sem")
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every), "series-std")
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every), "series-var")
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every), "series-sem")
    assert_dask_graph(ddf1.b.mean(split_every=split_every), "series-mean")
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every), "drop-duplicates")

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every), pd.notnull(pdf1.index).sum())
Example #12
def test_reductions_frame(split_every):
    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]),
        ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]),
        ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]),
    }
    meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
    ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    assert_eq(ddf1.sum(split_every=split_every), pdf1.sum())
    assert_eq(ddf1.prod(split_every=split_every), pdf1.prod())
    assert_eq(ddf1.min(split_every=split_every), pdf1.min())
    assert_eq(ddf1.max(split_every=split_every), pdf1.max())
    assert_eq(ddf1.count(split_every=split_every), pdf1.count())
    assert_eq(ddf1.std(split_every=split_every), pdf1.std())
    assert_eq(ddf1.var(split_every=split_every), pdf1.var())
    assert_eq(ddf1.sem(split_every=split_every), pdf1.sem())
    assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0))
    assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0))
    assert_eq(ddf1.sem(ddof=0, split_every=split_every), pdf1.sem(ddof=0))
    assert_eq(ddf1.mean(split_every=split_every), pdf1.mean())

    for axis in [0, 1, "index", "columns"]:
        assert_eq(ddf1.sum(axis=axis, split_every=split_every), pdf1.sum(axis=axis))
        assert_eq(ddf1.prod(axis=axis, split_every=split_every), pdf1.prod(axis=axis))
        assert_eq(ddf1.min(axis=axis, split_every=split_every), pdf1.min(axis=axis))
        assert_eq(ddf1.max(axis=axis, split_every=split_every), pdf1.max(axis=axis))
        assert_eq(ddf1.count(axis=axis, split_every=split_every), pdf1.count(axis=axis))
        assert_eq(ddf1.std(axis=axis, split_every=split_every), pdf1.std(axis=axis))
        assert_eq(ddf1.var(axis=axis, split_every=split_every), pdf1.var(axis=axis))
        assert_eq(ddf1.sem(axis=axis, split_every=split_every), pdf1.sem(axis=axis))
        assert_eq(
            ddf1.std(axis=axis, ddof=0, split_every=split_every),
            pdf1.std(axis=axis, ddof=0),
        )
        assert_eq(
            ddf1.var(axis=axis, ddof=0, split_every=split_every),
            pdf1.var(axis=axis, ddof=0),
        )
        assert_eq(
            ddf1.sem(axis=axis, ddof=0, split_every=split_every),
            pdf1.sem(axis=axis, ddof=0),
        )
        assert_eq(ddf1.mean(axis=axis, split_every=split_every), pdf1.mean(axis=axis))

    pytest.raises(ValueError, lambda: ddf1.sum(axis="incorrect").compute())

    # axis=0
    assert_dask_graph(ddf1.sum(split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.prod(split_every=split_every), "dataframe-prod")
    assert_dask_graph(ddf1.min(split_every=split_every), "dataframe-min")
    assert_dask_graph(ddf1.max(split_every=split_every), "dataframe-max")
    assert_dask_graph(ddf1.count(split_every=split_every), "dataframe-count")

    # std, var, and sem are built from moment_* operations; mean is built from sum and count
    assert_dask_graph(ddf1.std(split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.std(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.std(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.std(split_every=split_every), "values")

    assert_dask_graph(ddf1.var(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.var(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.var(split_every=split_every), "values")

    assert_dask_graph(ddf1.sem(split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.sem(split_every=split_every), "moment_chunk")
    assert_dask_graph(ddf1.sem(split_every=split_every), "moment_agg")
    assert_dask_graph(ddf1.sem(split_every=split_every), "values")

    assert_dask_graph(ddf1.mean(split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.mean(split_every=split_every), "dataframe-count")
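    # Illustrative addition, assuming numpy is imported as np elsewhere in this file:
    # the kind of moment-based formula behind moment_chunk/moment_agg,
    # var = (sum(x**2) - sum(x)**2 / n) / (n - ddof), reproduces pandas' ddof=1 variance.
    n = pdf1.a.count()
    assert np.isclose((np.sum(pdf1.a ** 2) - pdf1.a.sum() ** 2 / n) / (n - 1), pdf1.a.var())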

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1, split_every=split_every), "dataframe-sum")
    assert_dask_graph(ddf1.prod(axis=1, split_every=split_every), "dataframe-prod")
    assert_dask_graph(ddf1.min(axis=1, split_every=split_every), "dataframe-min")
    assert_dask_graph(ddf1.max(axis=1, split_every=split_every), "dataframe-max")
    assert_dask_graph(ddf1.count(axis=1, split_every=split_every), "dataframe-count")
    assert_dask_graph(ddf1.std(axis=1, split_every=split_every), "dataframe-std")
    assert_dask_graph(ddf1.var(axis=1, split_every=split_every), "dataframe-var")
    assert_dask_graph(ddf1.sem(axis=1, split_every=split_every), "dataframe-sem")
    assert_dask_graph(ddf1.mean(axis=1, split_every=split_every), "dataframe-mean")
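    # Illustrative addition, not part of the original test: an axis=1 reduction returns a
    # Series aligned with the original row index, one value per row.
    assert_eq(ddf1.sum(axis=1, split_every=split_every).index, ddf1.index)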
Example No. 13
def test_split_apply_combine_on_series():
    dsk = {
        ('x', 0): pd.DataFrame({
            'a': [1, 2, 6],
            'b': [4, 2, 7]
        },
                               index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({
            'a': [4, 4, 6],
            'b': [3, 3, 1]
        },
                               index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({
            'a': [4, 3, 7],
            'b': [1, 1, 3]
        },
                               index=[9, 9, 9])
    }
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    for ddkey, pdkey in [('b', 'b'), (ddf1.b, pdf1.b),
                         (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(
            ddf1.groupby(ddkey).a.nunique(),
            pdf1.groupby(pdkey).a.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())

    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(),
                  pdf1.a.groupby(pdkey).sum(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(),
                  pdf1.a.groupby(pdkey).max(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(),
                  pdf1.a.groupby(pdkey).count(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(),
                  pdf1.a.groupby(pdkey).mean(),
                  check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(),
                  pdf1.a.groupby(pdkey).nunique(),
                  check_names=False)

    for i in range(8):
        assert eq(
            ddf1.groupby(ddf1.b > i).a.sum(),
            pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.min(),
            pdf1.groupby(pdf1.b > i).a.min())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.max(),
            pdf1.groupby(pdf1.b > i).a.max())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.count(),
            pdf1.groupby(pdf1.b > i).a.count())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.mean(),
            pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(
            ddf1.groupby(ddf1.b > i).a.nunique(),
            pdf1.groupby(pdf1.b > i).a.nunique())

        assert eq(
            ddf1.groupby(ddf1.a > i).b.sum(),
            pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.min(),
            pdf1.groupby(pdf1.a > i).b.min())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.max(),
            pdf1.groupby(pdf1.a > i).b.max())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.count(),
            pdf1.groupby(pdf1.a > i).b.count())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.mean(),
            pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(
            ddf1.groupby(ddf1.a > i).b.nunique(),
            pdf1.groupby(pdf1.a > i).b.nunique())

        assert eq(
            ddf1.groupby(ddf1.b > i).sum(),
            pdf1.groupby(pdf1.b > i).sum())
        assert eq(
            ddf1.groupby(ddf1.b > i).min(),
            pdf1.groupby(pdf1.b > i).min())
        assert eq(
            ddf1.groupby(ddf1.b > i).max(),
            pdf1.groupby(pdf1.b > i).max())
        assert eq(
            ddf1.groupby(ddf1.b > i).count(),
            pdf1.groupby(pdf1.b > i).count())
        assert eq(
            ddf1.groupby(ddf1.b > i).mean(),
            pdf1.groupby(pdf1.b > i).mean())

        assert eq(
            ddf1.groupby(ddf1.a > i).sum(),
            pdf1.groupby(pdf1.a > i).sum())
        assert eq(
            ddf1.groupby(ddf1.a > i).min(),
            pdf1.groupby(pdf1.a > i).min())
        assert eq(
            ddf1.groupby(ddf1.a > i).max(),
            pdf1.groupby(pdf1.a > i).max())
        assert eq(
            ddf1.groupby(ddf1.a > i).count(),
            pdf1.groupby(pdf1.a > i).count())
        assert eq(
            ddf1.groupby(ddf1.a > i).mean(),
            pdf1.groupby(pdf1.a > i).mean())

    for ddkey, pdkey in [('a', 'a'), (ddf1.a, pdf1.a),
                         (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(
            ddf1.groupby(ddkey).b.nunique(),
            pdf1.groupby(pdkey).b.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(
            ddf1.groupby(ddkey).mean(),
            pdf1.groupby(pdkey).mean().astype(float))

    # graph keys should be deterministic across identical calls
    assert sorted(ddf1.groupby('b').a.sum().dask) == \
           sorted(ddf1.groupby('b').a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == \
           sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # missing keys should raise KeyError
    assert raises(KeyError, lambda: ddf1.groupby('x'))
    assert raises(KeyError, lambda: ddf1.groupby(['a', 'x']))
    assert raises(KeyError, lambda: ddf1.groupby('a')['x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')['b', 'x'])
    assert raises(KeyError, lambda: ddf1.groupby('a')[['b', 'x']])

    # test graph node labels
    assert_dask_graph(ddf1.groupby('b').a.sum(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.min(), 'series-groupby-min')
    assert_dask_graph(ddf1.groupby('b').a.max(), 'series-groupby-max')
    assert_dask_graph(ddf1.groupby('b').a.count(), 'series-groupby-count')
    # mean is computed from sum and count operations
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').a.mean(), 'series-groupby-count')
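    # Illustrative addition, not part of the original test: the groupby mean is the
    # per-group sum divided by the per-group count, hence both graph names above.
    grouped = pdf1.groupby('b').a
    assert ((grouped.sum() / grouped.count()) - grouped.mean()).abs().max() < 1e-12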
    assert_dask_graph(ddf1.groupby('b').a.nunique(), 'series-groupby-nunique')

    assert_dask_graph(ddf1.groupby('b').sum(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').min(), 'dataframe-groupby-min')
    assert_dask_graph(ddf1.groupby('b').max(), 'dataframe-groupby-max')
    assert_dask_graph(ddf1.groupby('b').count(), 'dataframe-groupby-count')
    # mean is computed from sum and count operations
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-sum')
    assert_dask_graph(ddf1.groupby('b').mean(), 'dataframe-groupby-count')
Example No. 14
def test_split_apply_combine_on_series():
    dsk = {
        ("x", 0): pd.DataFrame({"a": [1, 2, 6], "b": [4, 2, 7]}, index=[0, 1, 3]),
        ("x", 1): pd.DataFrame({"a": [4, 4, 6], "b": [3, 3, 1]}, index=[5, 6, 8]),
        ("x", 2): pd.DataFrame({"a": [4, 3, 7], "b": [1, 1, 3]}, index=[9, 9, 9]),
    }
    ddf1 = dd.DataFrame(dsk, "x", ["a", "b"], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    for ddkey, pdkey in [("b", "b"), (ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.groupby(ddkey).a.min(), pdf1.groupby(pdkey).a.min())
        assert eq(ddf1.groupby(ddkey).a.max(), pdf1.groupby(pdkey).a.max())
        assert eq(ddf1.groupby(ddkey).a.count(), pdf1.groupby(pdkey).a.count())
        assert eq(ddf1.groupby(ddkey).a.mean(), pdf1.groupby(pdkey).a.mean())
        assert eq(ddf1.groupby(ddkey).a.nunique(), pdf1.groupby(pdkey).a.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean())

    for ddkey, pdkey in [(ddf1.b, pdf1.b), (ddf1.b + 1, pdf1.b + 1)]:
        assert eq(ddf1.a.groupby(ddkey).sum(), pdf1.a.groupby(pdkey).sum(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).max(), pdf1.a.groupby(pdkey).max(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).count(), pdf1.a.groupby(pdkey).count(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).mean(), pdf1.a.groupby(pdkey).mean(), check_names=False)
        assert eq(ddf1.a.groupby(ddkey).nunique(), pdf1.a.groupby(pdkey).nunique(), check_names=False)

    for i in range(8):
        assert eq(ddf1.groupby(ddf1.b > i).a.sum(), pdf1.groupby(pdf1.b > i).a.sum())
        assert eq(ddf1.groupby(ddf1.b > i).a.min(), pdf1.groupby(pdf1.b > i).a.min())
        assert eq(ddf1.groupby(ddf1.b > i).a.max(), pdf1.groupby(pdf1.b > i).a.max())
        assert eq(ddf1.groupby(ddf1.b > i).a.count(), pdf1.groupby(pdf1.b > i).a.count())
        assert eq(ddf1.groupby(ddf1.b > i).a.mean(), pdf1.groupby(pdf1.b > i).a.mean())
        assert eq(ddf1.groupby(ddf1.b > i).a.nunique(), pdf1.groupby(pdf1.b > i).a.nunique())

        assert eq(ddf1.groupby(ddf1.a > i).b.sum(), pdf1.groupby(pdf1.a > i).b.sum())
        assert eq(ddf1.groupby(ddf1.a > i).b.min(), pdf1.groupby(pdf1.a > i).b.min())
        assert eq(ddf1.groupby(ddf1.a > i).b.max(), pdf1.groupby(pdf1.a > i).b.max())
        assert eq(ddf1.groupby(ddf1.a > i).b.count(), pdf1.groupby(pdf1.a > i).b.count())
        assert eq(ddf1.groupby(ddf1.a > i).b.mean(), pdf1.groupby(pdf1.a > i).b.mean())
        assert eq(ddf1.groupby(ddf1.a > i).b.nunique(), pdf1.groupby(pdf1.a > i).b.nunique())

        assert eq(ddf1.groupby(ddf1.b > i).sum(), pdf1.groupby(pdf1.b > i).sum())
        assert eq(ddf1.groupby(ddf1.b > i).min(), pdf1.groupby(pdf1.b > i).min())
        assert eq(ddf1.groupby(ddf1.b > i).max(), pdf1.groupby(pdf1.b > i).max())
        assert eq(ddf1.groupby(ddf1.b > i).count(), pdf1.groupby(pdf1.b > i).count())
        assert eq(ddf1.groupby(ddf1.b > i).mean(), pdf1.groupby(pdf1.b > i).mean())

        assert eq(ddf1.groupby(ddf1.a > i).sum(), pdf1.groupby(pdf1.a > i).sum())
        assert eq(ddf1.groupby(ddf1.a > i).min(), pdf1.groupby(pdf1.a > i).min())
        assert eq(ddf1.groupby(ddf1.a > i).max(), pdf1.groupby(pdf1.a > i).max())
        assert eq(ddf1.groupby(ddf1.a > i).count(), pdf1.groupby(pdf1.a > i).count())
        assert eq(ddf1.groupby(ddf1.a > i).mean(), pdf1.groupby(pdf1.a > i).mean())
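        # Illustrative addition, not part of the original test: split-apply-combine on a
        # boolean key partitions the rows, so the per-group sums add back up to the total.
        assert pdf1.groupby(pdf1.b > i).a.sum().sum() == pdf1.a.sum()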

    for ddkey, pdkey in [("a", "a"), (ddf1.a, pdf1.a), (ddf1.a + 1, pdf1.a + 1), (ddf1.a > 3, pdf1.a > 3)]:
        assert eq(ddf1.groupby(ddkey).b.sum(), pdf1.groupby(pdkey).b.sum())
        assert eq(ddf1.groupby(ddkey).b.min(), pdf1.groupby(pdkey).b.min())
        assert eq(ddf1.groupby(ddkey).b.max(), pdf1.groupby(pdkey).b.max())
        assert eq(ddf1.groupby(ddkey).b.count(), pdf1.groupby(pdkey).b.count())
        assert eq(ddf1.groupby(ddkey).b.mean(), pdf1.groupby(pdkey).b.mean())
        assert eq(ddf1.groupby(ddkey).b.nunique(), pdf1.groupby(pdkey).b.nunique())

        assert eq(ddf1.groupby(ddkey).sum(), pdf1.groupby(pdkey).sum())
        assert eq(ddf1.groupby(ddkey).min(), pdf1.groupby(pdkey).min())
        assert eq(ddf1.groupby(ddkey).max(), pdf1.groupby(pdkey).max())
        assert eq(ddf1.groupby(ddkey).count(), pdf1.groupby(pdkey).count())
        assert eq(ddf1.groupby(ddkey).mean(), pdf1.groupby(pdkey).mean().astype(float))

    assert sorted(ddf1.groupby("b").a.sum().dask) == sorted(ddf1.groupby("b").a.sum().dask)
    assert sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask) == sorted(ddf1.groupby(ddf1.a > 3).b.mean().dask)

    # missing keys should raise KeyError
    assert raises(KeyError, lambda: ddf1.groupby("x"))
    assert raises(KeyError, lambda: ddf1.groupby(["a", "x"]))
    assert raises(KeyError, lambda: ddf1.groupby("a")["x"])
    assert raises(KeyError, lambda: ddf1.groupby("a")["b", "x"])
    assert raises(KeyError, lambda: ddf1.groupby("a")[["b", "x"]])
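    # Illustrative addition, not part of the original test: selecting existing columns
    # from a groupby with a list works, in contrast to the missing-column cases above.
    assert eq(ddf1.groupby("a")[["b"]].sum(), pdf1.groupby("a")[["b"]].sum())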

    # test graph node labels
    assert_dask_graph(ddf1.groupby("b").a.sum(), "series-groupby-sum")
    assert_dask_graph(ddf1.groupby("b").a.min(), "series-groupby-min")
    assert_dask_graph(ddf1.groupby("b").a.max(), "series-groupby-max")
    assert_dask_graph(ddf1.groupby("b").a.count(), "series-groupby-count")
    # mean is computed from sum and count operations
    assert_dask_graph(ddf1.groupby("b").a.mean(), "series-groupby-sum")
    assert_dask_graph(ddf1.groupby("b").a.mean(), "series-groupby-count")
    assert_dask_graph(ddf1.groupby("b").a.nunique(), "series-groupby-nunique")

    assert_dask_graph(ddf1.groupby("b").sum(), "dataframe-groupby-sum")
    assert_dask_graph(ddf1.groupby("b").min(), "dataframe-groupby-min")
    assert_dask_graph(ddf1.groupby("b").max(), "dataframe-groupby-max")
    assert_dask_graph(ddf1.groupby("b").count(), "dataframe-groupby-count")
    # mean is computed from sum and count operations
    assert_dask_graph(ddf1.groupby("b").mean(), "dataframe-groupby-sum")
    assert_dask_graph(ddf1.groupby("b").mean(), "dataframe-groupby-count")