Пример #1
0
def test_append2():
    """Append dask DataFrames/Series to each other and to pandas objects.

    All frames are built with unknown divisions, so append cannot use index
    information; every dask result is compared against the equivalent
    pure-pandas ``append``. Covers matching columns, mismatched columns, and
    Series-onto-DataFrame appends.
    """
    # ddf1: three partitions, columns 'a'/'b', unknown divisions.
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})}
    meta = make_meta({'a': 'i8', 'b': 'i8'})
    ddf1 = dd.DataFrame(dsk, 'x', meta, [None, None])

    # ddf2: same columns as ddf1, different values.
    dsk = {('y', 0): pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'a': [40, 50, 60], 'b': [30, 20, 10]}),
           ('y', 2): pd.DataFrame({'a': [70, 80, 90], 'b': [0, 0, 0]})}
    ddf2 = dd.DataFrame(dsk, 'y', meta, [None, None])

    # ddf3: shares column 'b' with ddf1 but introduces 'c' (column mismatch).
    dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10]})}
    meta = make_meta({'b': 'i8', 'c': 'i8'})
    ddf3 = dd.DataFrame(dsk, 'y', meta, [None, None])

    # dask + dask, same columns
    assert_eq(ddf1.append(ddf2), ddf1.compute().append(ddf2.compute()))
    assert_eq(ddf2.append(ddf1), ddf2.compute().append(ddf1.compute()))
    # Series + DataFrame
    with pytest.warns(None):
        # RuntimeWarning from pandas on comparing int and str
        assert_eq(ddf1.a.append(ddf2), ddf1.a.compute().append(ddf2.compute()))
        assert_eq(ddf2.a.append(ddf1), ddf2.a.compute().append(ddf1.compute()))

    # different columns
    assert_eq(ddf1.append(ddf3), ddf1.compute().append(ddf3.compute()))
    assert_eq(ddf3.append(ddf1), ddf3.compute().append(ddf1.compute()))
    # Series + DataFrame
    with pytest.warns(None):
        # RuntimeWarning from pandas on comparing int and str
        assert_eq(ddf1.a.append(ddf3), ddf1.a.compute().append(ddf3.compute()))
        assert_eq(ddf3.b.append(ddf1), ddf3.b.compute().append(ddf1.compute()))

    # Dask + pandas
    assert_eq(ddf1.append(ddf2.compute()), ddf1.compute().append(ddf2.compute()))
    assert_eq(ddf2.append(ddf1.compute()), ddf2.compute().append(ddf1.compute()))
    # Series + DataFrame
    with pytest.warns(None):
        # RuntimeWarning from pandas on comparing int and str
        assert_eq(ddf1.a.append(ddf2.compute()), ddf1.a.compute().append(ddf2.compute()))
        assert_eq(ddf2.a.append(ddf1.compute()), ddf2.a.compute().append(ddf1.compute()))

    # different columns
    assert_eq(ddf1.append(ddf3.compute()), ddf1.compute().append(ddf3.compute()))
    assert_eq(ddf3.append(ddf1.compute()), ddf3.compute().append(ddf1.compute()))
    # Series + DataFrame
    with pytest.warns(None):
        # RuntimeWarning from pandas on comparing int and str
        assert_eq(ddf1.a.append(ddf3.compute()), ddf1.a.compute().append(ddf3.compute()))
        assert_eq(ddf3.b.append(ddf1.compute()), ddf3.b.compute().append(ddf1.compute()))
Пример #2
0
def test_reductions_frame():
    """DataFrame reductions agree with pandas on both axes.

    Checks sum/min/max/count/std/var/mean (with and without ``ddof``) on a
    three-partition frame, validates the invalid-axis error path, and
    inspects the dask graph key names generated for each reduction.
    """
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Default axis.
    assert eq(ddf1.sum(), pdf1.sum())
    assert eq(ddf1.min(), pdf1.min())
    assert eq(ddf1.max(), pdf1.max())
    assert eq(ddf1.count(), pdf1.count())
    assert eq(ddf1.std(), pdf1.std())
    assert eq(ddf1.var(), pdf1.var())
    assert eq(ddf1.std(ddof=0), pdf1.std(ddof=0))
    assert eq(ddf1.var(ddof=0), pdf1.var(ddof=0))
    assert eq(ddf1.mean(), pdf1.mean())

    # Both integer and string spellings of each axis must work.
    for axis in [0, 1, 'index', 'columns']:
        assert eq(ddf1.sum(axis=axis), pdf1.sum(axis=axis))
        assert eq(ddf1.min(axis=axis), pdf1.min(axis=axis))
        assert eq(ddf1.max(axis=axis), pdf1.max(axis=axis))
        assert eq(ddf1.count(axis=axis), pdf1.count(axis=axis))
        assert eq(ddf1.std(axis=axis), pdf1.std(axis=axis))
        assert eq(ddf1.var(axis=axis), pdf1.var(axis=axis))
        assert eq(ddf1.std(axis=axis, ddof=0), pdf1.std(axis=axis, ddof=0))
        assert eq(ddf1.var(axis=axis, ddof=0), pdf1.var(axis=axis, ddof=0))
        assert eq(ddf1.mean(axis=axis), pdf1.mean(axis=axis))

    # An unrecognized axis surfaces as a ValueError at compute time.
    assert raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute())

    # axis=0
    assert_dask_graph(ddf1.sum(), 'dataframe-sum')
    assert_dask_graph(ddf1.min(), 'dataframe-min')
    assert_dask_graph(ddf1.max(), 'dataframe-max')
    assert_dask_graph(ddf1.count(), 'dataframe-count')
    # std, var, mean consists from sum and count operations
    assert_dask_graph(ddf1.std(), 'dataframe-sum')
    assert_dask_graph(ddf1.std(), 'dataframe-count')
    assert_dask_graph(ddf1.var(), 'dataframe-sum')
    assert_dask_graph(ddf1.var(), 'dataframe-count')
    assert_dask_graph(ddf1.mean(), 'dataframe-sum')
    assert_dask_graph(ddf1.mean(), 'dataframe-count')

    # axis=1
    assert_dask_graph(ddf1.sum(axis=1), 'dataframe-sum')
    assert_dask_graph(ddf1.min(axis=1), 'dataframe-min')
    assert_dask_graph(ddf1.max(axis=1), 'dataframe-max')
    assert_dask_graph(ddf1.count(axis=1), 'dataframe-count')
    assert_dask_graph(ddf1.std(axis=1), 'dataframe-std')
    assert_dask_graph(ddf1.var(axis=1), 'dataframe-var')
    assert_dask_graph(ddf1.mean(axis=1), 'dataframe-mean')
Пример #3
0
def test_concat2():
    """``dd.concat`` of frames with unknown divisions matches pandas.

    Verifies that concatenating a single frame is a no-op (same object),
    that the result has the combined partition count with all-unknown
    divisions, that values match ``pd.concat`` for both outer and inner
    joins, and that graph construction is deterministic (same key set on
    repeated calls).
    """
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})}
    meta = make_meta({'a': 'i8', 'b': 'i8'})
    a = dd.DataFrame(dsk, 'x', meta, [None, None])
    dsk = {('y', 0): pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'a': [40, 50, 60], 'b': [30, 20, 10]}),
           ('y', 2): pd.DataFrame({'a': [70, 80, 90], 'b': [0, 0, 0]})}
    b = dd.DataFrame(dsk, 'y', meta, [None, None])

    # c: different column set ('b'/'c') from a.
    dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60]}),
           ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10]})}
    meta = make_meta({'b': 'i8', 'c': 'i8'})
    c = dd.DataFrame(dsk, 'y', meta, [None, None])

    # d: different columns AND known divisions.
    dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60],
                                   'd': [70, 80, 90]}),
           ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10],
                                   'd': [90, 80, 70]},
                                  index=[3, 4, 5])}
    meta = make_meta({'b': 'i8', 'c': 'i8', 'd': 'i8'},
                     index=pd.Index([], 'i8'))
    d = dd.DataFrame(dsk, 'y', meta, [0, 3, 5])

    cases = [[a, b], [a, c], [a, d]]
    # Concatenating a single frame returns it unchanged.
    assert dd.concat([a]) is a
    for case in cases:
        result = dd.concat(case)
        pdcase = [_c.compute() for _c in case]

        assert result.npartitions == case[0].npartitions + case[1].npartitions
        # The concatenated result has no division information.
        assert result.divisions == (None, ) * (result.npartitions + 1)
        assert_eq(pd.concat(pdcase), result)
        # Re-concatenating produces the same graph keys (deterministic names).
        assert set(result.dask) == set(dd.concat(case).dask)

        result = dd.concat(case, join='inner')
        assert result.npartitions == case[0].npartitions + case[1].npartitions
        assert result.divisions == (None, ) * (result.npartitions + 1)
        assert_eq(pd.concat(pdcase, join='inner'), result)
        assert set(result.dask) == set(dd.concat(case, join='inner').dask)
Пример #4
0
def test_unknown_categoricals():
    """Category columns with unknown categories still compute correctly.

    Builds a dask frame directly from the module-level ``frames`` fixtures
    with a meta declaring 'w' and 'y' as category dtype, then checks that
    value counts, nunique and groupby operations match the computed pandas
    result.
    """
    graph = {}
    for part, frame in enumerate(frames):
        graph[('unknown', part)] = frame
    meta = make_meta({'v': 'object', 'w': 'category',
                      'x': 'i8', 'y': 'category', 'z': 'f8'})
    ddf = dd.DataFrame(graph, 'unknown', meta, [None] * 4)
    # Materialize once for the pandas reference results.
    expected = ddf.compute()

    # Series-level operations on a category column.
    assert_eq(ddf.w.value_counts(), expected.w.value_counts())
    assert_eq(ddf.w.nunique(), expected.w.nunique())

    # Groupby keyed on the unknown-category column.
    assert_eq(ddf.groupby(ddf.w).sum(), expected.groupby(expected.w).sum())
    assert_eq(ddf.groupby(ddf.w).y.nunique(),
              expected.groupby(expected.w).y.nunique())
    assert_eq(ddf.y.groupby(ddf.w).count(),
              expected.y.groupby(expected.w).count())
Пример #5
0
def test_categorize():
    """``categorize`` converts object columns to categorical dtype.

    Checks a single-column categorize ('a'), that untouched columns keep
    their dtype, that values round-trip through the category cast, that the
    first partition's dtypes match the computed frame, and that a bare
    ``categorize()`` converts every column.
    """
    graph = {('x', 0): pd.DataFrame({'a': ['Alice', 'Bob', 'Alice'],
                                     'b': ['C', 'D', 'E']},
                                    index=[0, 1, 2]),
             ('x', 1): pd.DataFrame({'a': ['Bob', 'Charlie', 'Charlie'],
                                     'b': ['A', 'A', 'B']},
                                    index=[3, 4, 5])}
    meta = make_meta({'a': 'O', 'b': 'O'}, index=pd.Index([], 'i8'))
    ddf = dd.DataFrame(graph, 'x', meta, [0, 3, 5])
    expected = ddf.compute()

    categorized = ddf.categorize('a')
    computed = categorized.compute()
    # Only 'a' was categorized; 'b' stays object.
    assert computed.dtypes['a'] == 'category'
    assert computed.dtypes['b'] == 'O'

    # Casting back to object recovers the original values.
    assert list(computed.a.astype('O')) == list(expected.a)
    # The first partition's dtypes agree with the fully computed result.
    first = ddf._get(categorized.dask, categorized._keys()[:1])[0]
    assert (first.dtypes == computed.dtypes).all()
    # categorize() with no arguments converts all columns.
    assert (ddf.categorize().compute().dtypes == 'category').all()
Пример #6
0
def test_pivot_table_errors():
    """``dd.pivot_table`` rejects invalid arguments with clear messages.

    Exercises every ValueError path: list-valued ``index``/``columns``/
    ``values`` (scalars required), unsupported ``aggfunc`` (list or unknown
    string), a category column with unknown categories, and a non-category
    ``columns`` column.
    """
    df = pd.DataFrame({'A': np.random.choice(list('abc'), size=10),
                       'B': np.random.randn(10),
                       'C': pd.Categorical(np.random.choice(list('abc'), size=10))})
    ddf = dd.from_pandas(df, 2)

    # index/columns/values must be scalar column names, not lists.
    msg = "'index' must be the name of an existing column"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index=['A'], columns='C', values='B')
    assert msg in str(err.value)
    msg = "'columns' must be the name of an existing column"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index='A', columns=['C'], values='B')
    assert msg in str(err.value)
    msg = "'values' must be the name of an existing column"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index='A', columns='C', values=['B'])
    assert msg in str(err.value)

    # aggfunc must be one of the three supported strings.
    msg = "aggfunc must be either 'mean', 'sum' or 'count'"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index='A', columns='C', values='B', aggfunc=['sum'])
    assert msg in str(err.value)

    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index='A', columns='C', values='B', aggfunc='xx')
    assert msg in str(err.value)

    # unknown categories
    ddf._meta = make_meta({'A': object, 'B': float, 'C': 'category'})
    msg = "'columns' must have known categories"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index='A', columns='C', values=['B'])
    assert msg in str(err.value)

    # A plain object 'columns' column (not category dtype) is rejected.
    df = pd.DataFrame({'A': np.random.choice(list('abc'), size=10),
                       'B': np.random.randn(10),
                       'C': np.random.choice(list('abc'), size=10)})
    ddf = dd.from_pandas(df, 2)
    msg = "'columns' must be category dtype"
    with pytest.raises(ValueError) as err:
        dd.pivot_table(ddf, index='A', columns='C', values='B')
    assert msg in str(err.value)
Пример #7
0
def test_get_dummies_errors():
    """``dd.get_dummies`` raises NotImplementedError for unsupported input.

    Two unsupported cases: a non-categorical series, and categorical
    columns whose categories are unknown.
    """
    # Build the fixtures OUTSIDE the ``raises`` blocks: in the original the
    # Series/from_pandas construction lived inside the first block, so any
    # unrelated construction failure would have satisfied the test falsely.
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
    ds = dd.from_pandas(s, 2)

    with pytest.raises(NotImplementedError):
        # not Categorical
        dd.get_dummies(ds)

    # unknown categories
    df = pd.DataFrame({'x': list('abcbc'), 'y': list('bcbcb')})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf._meta = make_meta({'x': 'category', 'y': 'category'})

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=['x', 'y'])

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.x)
Пример #8
0
def test_reduction_series_invalid_axis():
    """Series reductions reject axis=1/'columns', mirroring pandas.

    Runs the same checks against both the dask series and its computed
    pandas counterpart so any behavioral divergence fails the test.
    """
    graph = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                    index=[0, 1, 3]),
             ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                    index=[5, 6, 8]),
             ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                    index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(graph, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # (method name, expected exception) pairs.
    checks = [('sum', ValueError),
              ('min', ValueError),
              ('max', ValueError),
              # only count doesn't have axis keyword
              ('count', TypeError),
              ('std', ValueError),
              ('var', ValueError),
              ('mean', ValueError)]

    for axis in [1, 'columns']:
        for series in (ddf1.a, pdf1.a):  # both must behave the same
            for name, exc in checks:
                method = getattr(series, name)
                # Bind as a default to avoid late-binding surprises.
                assert raises(exc, lambda m=method: m(axis=axis))
Пример #9
0
from dask.dataframe.shuffle import (shuffle,
                                    partitioning_index,
                                    rearrange_by_column,
                                    rearrange_by_divisions,
                                    maybe_buffered_partd)

from dask.async import get_sync
from dask.dataframe.utils import assert_eq, make_meta

# Shared three-partition fixture used by the shuffle tests below.
dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 7]},
                              index=[0, 1, 3]),
       ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [2, 5, 8]},
                              index=[5, 6, 8]),
       ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [3, 6, 9]},
                              index=[9, 9, 9])}
meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
d = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
# Eagerly-computed pandas equivalent for reference comparisons.
full = d.compute()


shuffle_func = shuffle  # conflicts with keyword argument


@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])
def test_shuffle(shuffle):
    """Shuffling on a column preserves container type and partition count.

    Parametrized over both shuffle backends ('disk' and 'tasks'); the
    parameter shadows the imported ``shuffle`` function, hence the
    module-level ``shuffle_func`` alias.
    """
    s = shuffle_func(d, d.b, shuffle=shuffle)
    assert isinstance(s, dd.DataFrame)
    assert s.npartitions == d.npartitions

    # Materialize the first two partitions synchronously; this verifies the
    # per-partition graphs execute without error.
    # NOTE(review): x and y are never asserted on — possibly truncated
    # assertions in the original; confirm against upstream.
    x = get_sync(s.dask, (s._name, 0))
    y = get_sync(s.dask, (s._name, 1))
Пример #10
0
def test_reductions():
    """Series reductions agree with pandas across varied inputs.

    Inputs cover plain integer columns, NaN-heavy series (partial, mostly
    and entirely NaN) and a boolean series. Each reduction is checked with
    default and ``skipna=False`` behavior, then the dask graph key names
    are inspected, and finally index min/max are verified.
    """
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Series with scattered, leading and all-NaN values.
    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)
        assert eq(dds.sum(), pds.sum())
        assert eq(dds.min(), pds.min())
        assert eq(dds.max(), pds.max())
        assert eq(dds.count(), pds.count())
        assert eq(dds.std(), pds.std())
        assert eq(dds.var(), pds.var())
        assert eq(dds.std(ddof=0), pds.std(ddof=0))
        assert eq(dds.var(ddof=0), pds.var(ddof=0))
        assert eq(dds.mean(), pds.mean())
        assert eq(dds.nunique(), pds.nunique())
        assert eq(dds.nbytes, pds.nbytes)

        # skipna=False propagates NaNs just like pandas.
        assert eq(dds.sum(skipna=False), pds.sum(skipna=False))
        assert eq(dds.min(skipna=False), pds.min(skipna=False))
        assert eq(dds.max(skipna=False), pds.max(skipna=False))
        assert eq(dds.std(skipna=False), pds.std(skipna=False))
        assert eq(dds.var(skipna=False), pds.var(skipna=False))
        assert eq(dds.std(skipna=False, ddof=0), pds.std(skipna=False, ddof=0))
        assert eq(dds.var(skipna=False, ddof=0), pds.var(skipna=False, ddof=0))
        assert eq(dds.mean(skipna=False), pds.mean(skipna=False))

    # Graph key names identify the reduction that produced them.
    assert_dask_graph(ddf1.b.sum(), 'series-sum')
    assert_dask_graph(ddf1.b.min(), 'series-min')
    assert_dask_graph(ddf1.b.max(), 'series-max')
    assert_dask_graph(ddf1.b.count(), 'series-count')
    assert_dask_graph(ddf1.b.std(), 'series-std')
    assert_dask_graph(ddf1.b.var(), 'series-var')
    assert_dask_graph(ddf1.b.std(ddof=0), 'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0), 'series-var')
    assert_dask_graph(ddf1.b.mean(), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(), 'drop-duplicates')

    eq(ddf1.index.min(), pdf1.index.min())
    eq(ddf1.index.max(), pdf1.index.max())
Пример #11
0
def test_arithmetics():
    """Elementwise arithmetic between dask/dask and dask/pandas operands.

    First block: operands with identical or repartition-compatible indexes,
    checked via ``check_series_arithmetics``/``check_frame_arithmetics``.
    Second block: operands with genuinely different indexes, where pandas
    raises ValueError on comparison ops, so those are skipped via
    ``allow_comparison_ops=False``.
    """
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    pdf2 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8],
                         'b': [5, 6, 7, 8, 1, 2, 3, 4]})
    pdf3 = pd.DataFrame({'a': [5, 6, 7, 8, 4, 3, 2, 1],
                         'b': [2, 4, 5, 3, 4, 2, 1, 0]})
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    # ddf4 shares ddf1's index layout and divisions.
    dsk4 = {('y', 0): pd.DataFrame({'a': [3, 2, 1], 'b': [7, 8, 9]},
                                   index=[0, 1, 3]),
            ('y', 1): pd.DataFrame({'a': [5, 2, 8], 'b': [4, 2, 3]},
                                   index=[5, 6, 8]),
            ('y', 2): pd.DataFrame({'a': [1, 4, 10], 'b': [1, 0, 5]},
                                   index=[9, 9, 9])}
    ddf4 = dd.DataFrame(dsk4, 'y', meta, [0, 4, 9, 9])
    pdf4 = ddf4.compute()

    # Arithmetics
    # Each case is (dask-left, dask-right, pandas-left, pandas-right);
    # repartitioned variants exercise mismatched division boundaries.
    cases = [(ddf1, ddf1, pdf1, pdf1),
             (ddf1, ddf1.repartition([0, 1, 3, 6, 9]), pdf1, pdf1),
             (ddf2, ddf3, pdf2, pdf3),
             (ddf2.repartition([0, 3, 6, 7]), ddf3.repartition([0, 7]),
              pdf2, pdf3),
             (ddf2.repartition([0, 7]), ddf3.repartition([0, 2, 4, 5, 7]),
              pdf2, pdf3),
             (ddf1, ddf4, pdf1, pdf4),
             (ddf1, ddf4.repartition([0, 9]), pdf1, pdf4),
             (ddf1.repartition([0, 3, 9]), ddf4.repartition([0, 5, 9]),
              pdf1, pdf4),
             # dask + pandas
             (ddf1, pdf4, pdf1, pdf4), (ddf2, pdf3, pdf2, pdf3)]

    for (l, r, el, er) in cases:
        check_series_arithmetics(l.a, r.b, el.a, er.b)
        check_frame_arithmetics(l, r, el, er)

    # different index, pandas raises ValueError in comparison ops

    pdf5 = pd.DataFrame({'a': [3, 2, 1, 5, 2, 8, 1, 4, 10],
                         'b': [7, 8, 9, 4, 2, 3, 1, 0, 5]},
                        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf5 = dd.from_pandas(pdf5, 2)

    pdf6 = pd.DataFrame({'a': [3, 2, 1, 5, 2, 8, 1, 4, 10],
                         'b': [7, 8, 9, 5, 7, 8, 4, 2, 5]},
                        index=[0, 1, 2, 3, 4, 5, 6, 7, 9])
    ddf6 = dd.from_pandas(pdf6, 4)

    # String-indexed frames, including duplicate index labels in pdf7/pdf9.
    pdf7 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8],
                         'b': [5, 6, 7, 8, 1, 2, 3, 4]},
                        index=list('aaabcdeh'))
    pdf8 = pd.DataFrame({'a': [5, 6, 7, 8, 4, 3, 2, 1],
                         'b': [2, 4, 5, 3, 4, 2, 1, 0]},
                        index=list('abcdefgh'))
    ddf7 = dd.from_pandas(pdf7, 3)
    ddf8 = dd.from_pandas(pdf8, 4)

    # Partially-overlapping column sets (only 'b'/'c' shared).
    pdf9 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8],
                         'b': [5, 6, 7, 8, 1, 2, 3, 4],
                         'c': [5, 6, 7, 8, 1, 2, 3, 4]},
                        index=list('aaabcdeh'))
    pdf10 = pd.DataFrame({'b': [5, 6, 7, 8, 4, 3, 2, 1],
                          'c': [2, 4, 5, 3, 4, 2, 1, 0],
                          'd': [2, 4, 5, 3, 4, 2, 1, 0]},
                         index=list('abcdefgh'))
    ddf9 = dd.from_pandas(pdf9, 3)
    ddf10 = dd.from_pandas(pdf10, 4)

    # Arithmetics with different index
    cases = [(ddf5, ddf6, pdf5, pdf6),
             (ddf5.repartition([0, 9]), ddf6, pdf5, pdf6),
             (ddf5.repartition([0, 5, 9]), ddf6.repartition([0, 7, 9]),
              pdf5, pdf6),
             (ddf7, ddf8, pdf7, pdf8),
             (ddf7.repartition(['a', 'c', 'h']), ddf8.repartition(['a', 'h']),
              pdf7, pdf8),
             (ddf7.repartition(['a', 'b', 'e', 'h']),
              ddf8.repartition(['a', 'e', 'h']), pdf7, pdf8),
             (ddf9, ddf10, pdf9, pdf10),
             (ddf9.repartition(['a', 'c', 'h']), ddf10.repartition(['a', 'h']),
              pdf9, pdf10),
             # dask + pandas
             (ddf5, pdf6, pdf5, pdf6), (ddf7, pdf8, pdf7, pdf8),
             (ddf9, pdf10, pdf9, pdf10)]

    for (l, r, el, er) in cases:
        check_series_arithmetics(l.a, r.b, el.a, er.b,
                                 allow_comparison_ops=False)
        check_frame_arithmetics(l, r, el, er,
                                allow_comparison_ops=False)
Пример #12
0
def test_reductions(split_every):
    """Series reductions honor ``split_every`` and still match pandas.

    Same input matrix as the non-parametrized reduction test (integer,
    NaN-heavy and boolean series); every reduction is invoked with the
    parametrized ``split_every`` (tree-reduce fan-in) and compared to the
    plain pandas result. Graph key names and index reductions are checked
    at the end.
    """
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8'))
    ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Scattered, leading and all-NaN series.
    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        assert_eq(dds.sum(split_every=split_every), pds.sum())
        assert_eq(dds.prod(split_every=split_every), pds.prod())
        assert_eq(dds.min(split_every=split_every), pds.min())
        assert_eq(dds.max(split_every=split_every), pds.max())
        assert_eq(dds.count(split_every=split_every), pds.count())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.std(split_every=split_every), pds.std())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.var(split_every=split_every), pds.var())
        with pytest.warns(None):
            # runtime warnings; https://github.com/dask/dask/issues/2381
            assert_eq(dds.sem(split_every=split_every), pds.sem())
        assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0))
        assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0))
        assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0))
        assert_eq(dds.mean(split_every=split_every), pds.mean())
        assert_eq(dds.nunique(split_every=split_every), pds.nunique())

        # skipna=False variants propagate NaNs like pandas.
        assert_eq(dds.sum(skipna=False, split_every=split_every),
                  pds.sum(skipna=False))
        assert_eq(dds.prod(skipna=False, split_every=split_every),
                  pds.prod(skipna=False))
        assert_eq(dds.min(skipna=False, split_every=split_every),
                  pds.min(skipna=False))
        assert_eq(dds.max(skipna=False, split_every=split_every),
                  pds.max(skipna=False))
        assert_eq(dds.std(skipna=False, split_every=split_every),
                  pds.std(skipna=False))
        assert_eq(dds.var(skipna=False, split_every=split_every),
                  pds.var(skipna=False))
        assert_eq(dds.sem(skipna=False, split_every=split_every),
                  pds.sem(skipna=False))
        assert_eq(dds.std(skipna=False, ddof=0, split_every=split_every),
                  pds.std(skipna=False, ddof=0))
        assert_eq(dds.var(skipna=False, ddof=0, split_every=split_every),
                  pds.var(skipna=False, ddof=0))
        assert_eq(dds.sem(skipna=False, ddof=0, split_every=split_every),
                  pds.sem(skipna=False, ddof=0))
        assert_eq(dds.mean(skipna=False, split_every=split_every),
                  pds.mean(skipna=False))

    # Graph key names identify the reduction that produced them.
    assert_dask_graph(ddf1.b.sum(split_every=split_every), 'series-sum')
    assert_dask_graph(ddf1.b.prod(split_every=split_every), 'series-prod')
    assert_dask_graph(ddf1.b.min(split_every=split_every), 'series-min')
    assert_dask_graph(ddf1.b.max(split_every=split_every), 'series-max')
    assert_dask_graph(ddf1.b.count(split_every=split_every), 'series-count')
    assert_dask_graph(ddf1.b.std(split_every=split_every), 'series-std')
    assert_dask_graph(ddf1.b.var(split_every=split_every), 'series-var')
    assert_dask_graph(ddf1.b.sem(split_every=split_every), 'series-sem')
    assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every), 'series-std')
    assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every), 'series-var')
    assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every), 'series-sem')
    assert_dask_graph(ddf1.b.mean(split_every=split_every), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(split_every=split_every), 'drop-duplicates')

    # testing index
    assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min())
    assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max())
    assert_eq(ddf1.index.count(split_every=split_every), pd.notnull(pdf1.index).sum())
Пример #13
0
def test_make_meta():
    """``make_meta`` builds an empty, dtype-correct skeleton for any input.

    Covers pandas DataFrame/Series/Index, dask objects, dict/iterable/tuple
    dtype specifications, explicit indexes, categoricals, scalars,
    timestamps and raw dtype expressions, plus the TypeError path for
    unsupported input.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': list('abc'), 'c': [1., 2., 3.]},
                      index=[10, 20, 30])

    # Pandas dataframe: empty frame, same dtypes, same index type.
    meta = make_meta(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object: the existing _meta is returned (same object).
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta(ddf) is ddf._meta

    # Dict of column -> dtype
    meta = make_meta({'a': 'i8', 'b': 'O', 'c': 'f8'})
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Iterable of (column, dtype) pairs preserves column order.
    meta = make_meta([('a', 'i8'), ('c', 'f8'), ('b', 'O')])
    assert (meta.columns == ['a', 'c', 'b']).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple (name, dtype) yields an empty named Series.
    meta = make_meta(('a', 'i8'))
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == 'i8'
    assert meta.name == 'a'

    # With an explicit index: index type is kept but emptied.
    meta = make_meta({'a': 'i8', 'b': 'i4'}, pd.Int64Index([1, 2], name='foo'))
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0
    meta = make_meta(('a', 'i8'), pd.Int64Index([1, 2], name='foo'))
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0

    # Categoricals get the UNKNOWN_CATEGORIES placeholder category.
    meta = make_meta({'a': 'category'})
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta(('a', 'category'))
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta(np.float64(1.0))
    assert isinstance(meta, np.float64)

    # Python scalar is promoted to the matching numpy scalar type.
    meta = make_meta(1.0)
    assert isinstance(meta, np.float64)

    # Timestamp passes through as-is.
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta(x)
    assert meta is x

    # Dtype expressions yield a numpy scalar of the requested type.
    meta = make_meta('i8')
    assert isinstance(meta, np.int64)
    meta = make_meta(float)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta(np.dtype('bool'))
    assert isinstance(meta, np.bool_)
    # Unsupported input must raise. The original wrote
    # ``assert pytest.raises(TypeError, lambda: make_meta(None))`` — that
    # assert is meaningless because the functional pytest.raises form always
    # returns a truthy ExceptionInfo; use the idiomatic context manager.
    with pytest.raises(TypeError):
        make_meta(None)
Пример #14
0
def test_make_meta():
    """``make_meta`` builds an empty, dtype-correct skeleton for any input.

    Variant of the make_meta test where scalars and NaT map to empty
    numpy arrays of the corresponding dtype rather than scalar instances.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': list('abc'), 'c': [1., 2., 3.]},
                      index=[10, 20, 30])

    # Pandas dataframe: empty frame, same dtypes, same index type.
    meta = make_meta(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object: the existing _meta is returned (same object).
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta(ddf) is ddf._meta

    # Dict of column -> dtype
    meta = make_meta({'a': 'i8', 'b': 'O', 'c': 'f8'})
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Iterable of (column, dtype) pairs preserves column order.
    meta = make_meta([('a', 'i8'), ('c', 'f8'), ('b', 'O')])
    assert (meta.columns == ['a', 'c', 'b']).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple (name, dtype) yields an empty named Series.
    meta = make_meta(('a', 'i8'))
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == 'i8'
    assert meta.name == 'a'

    # With an explicit index: index type is kept but emptied.
    meta = make_meta({'a': 'i8', 'b': 'i4'}, pd.Int64Index([1, 2], name='foo'))
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0
    meta = make_meta(('a', 'i8'), pd.Int64Index([1, 2], name='foo'))
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0

    # Numpy scalar -> empty float64 array.
    meta = make_meta(np.float64(1.0))
    assert isinstance(meta, np.ndarray)
    assert meta.shape == (0,)
    assert meta.dtype == 'f8'

    # Python scalar -> empty array of the promoted dtype.
    meta = make_meta(1.0)
    assert isinstance(meta, np.ndarray)
    assert meta.shape == (0,)
    assert meta.dtype == 'f8'

    # datetime: NaT maps to an empty array of pandas' NaT series dtype.
    meta = make_meta(pd.NaT)
    assert isinstance(meta, np.ndarray)
    assert meta.shape == (0,)
    assert meta.dtype == pd.Series(pd.NaT).dtype
Пример #15
0
import numpy as np

import dask
from dask.utils import raises
import dask.dataframe as dd

from dask.dataframe.core import _coerce_loc_index
from dask.dataframe.utils import eq, make_meta


# Shared three-partition fixture used by the .loc tests below.
dsk = {
    ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]),
    ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]),
    ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]),
}
meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
# Eagerly-computed pandas equivalent for reference comparisons.
full = d.compute()


def test_loc():
    assert d.loc[3:8].divisions[0] == 3
    assert d.loc[3:8].divisions[-1] == 8

    assert d.loc[5].divisions == (5, 5)

    assert eq(d.loc[5], full.loc[5:5])
    assert eq(d.loc[3:8], full.loc[3:8])
    assert eq(d.loc[:8], full.loc[:8])
    assert eq(d.loc[3:], full.loc[3:])