def test_categorize():
    """Check that ``categorize`` converts the selected columns to category dtype.

    A dask DataFrame is assembled from two in-memory pandas frames, column
    ``a`` alone is categorized, and the test then verifies that:

    * ``a`` has dtype ``category`` while ``b`` keeps dtype ``'O'``,
    * casting the categorized ``a`` back to ``'O'`` yields the original values,
    * the result computed for the first key alone carries the same dtypes as
      the fully computed frame,
    * ``categorize()`` with no arguments makes every column categorical.
    """
    first_piece = pd.DataFrame(
        {'a': ['Alice', 'Bob', 'Alice'], 'b': ['C', 'D', 'E']},
        index=[0, 1, 2],
    )
    second_piece = pd.DataFrame(
        {'a': ['Bob', 'Charlie', 'Charlie'], 'b': ['A', 'A', 'B']},
        index=[3, 4, 5],
    )
    graph = {('x', 0): first_piece, ('x', 1): second_piece}
    # NOTE(review): the trailing [3] is presumably the division boundary
    # between the two pieces -- confirm against dd.DataFrame's signature.
    ddf = dd.DataFrame(graph, 'x', ['a', 'b'], [3])

    expected = ddf.compute()
    categorized = ddf.categorize('a')
    result = categorized.compute()

    # Only the requested column changes dtype.
    assert result.dtypes['a'] == 'category'
    assert result.dtypes['b'] == 'O'

    # Categorizing must not alter the values themselves.
    roundtripped = list(result.a.astype('O'))
    assert roundtripped == list(expected.a)

    # Evaluating just the first key must give the same dtypes as the whole.
    leading_keys = categorized._keys()[:1]
    leading_result = get(categorized.dask, leading_keys)[0]
    assert (leading_result.dtypes == result.dtypes).all()

    # Without arguments every column is converted.
    everything = ddf.categorize().compute()
    assert (everything.dtypes == 'category').all()
def test_repartition():
    """Check that ``repartition`` adopts the requested divisions and keeps the data.

    A six-row pandas frame is wrapped with ``dd.from_pandas(..., 2)`` and then
    repartitioned onto the divisions ``[10, 20, 50, 60]``. The test verifies
    that the result reports those divisions as a tuple, that the ``eq`` helper
    (defined outside this block) treats the original and repartitioned
    collections as equal, and that the value stored under key
    ``(name, 0)`` equals the first row of the source frame.
    """
    row_labels = [10, 20, 30, 40, 50, 60]
    columns = {'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')}
    source = pd.DataFrame(columns, index=row_labels)

    original = dd.from_pandas(source, 2)
    new_divisions = [10, 20, 50, 60]
    repartitioned = original.repartition(divisions=new_divisions)

    # Divisions are exposed as a tuple even though a list was passed in.
    assert repartitioned.divisions == (10, 20, 50, 60)

    # Repartitioning must not change the overall contents.
    assert eq(original, repartitioned)

    # With divisions starting 10, 20, the first piece holds only the first row.
    leading_key = (repartitioned._name, 0)
    leading_piece = get(repartitioned.dask, leading_key)
    assert eq(leading_piece, source.iloc[:1])
def test_repartition():
    """Repartition a small frame and confirm divisions and contents.

    Steps:
      1. Build a pandas frame indexed 10..60 and wrap it via
         ``dd.from_pandas`` (second argument ``2``).
      2. Call ``repartition(divisions=[10, 20, 50, 60])``.
      3. Assert the new ``divisions`` attribute, equality with the input
         according to the externally defined ``eq`` helper, and that the
         value computed for key ``(b._name, 0)`` matches ``df.iloc[:1]``.
    """
    frame = pd.DataFrame(
        {
            'x': [1, 2, 3, 4, 5, 6],
            'y': ['a', 'b', 'd', 'a', 'b', 'd'],
        },
        index=[10, 20, 30, 40, 50, 60],
    )
    before = dd.from_pandas(frame, 2)
    after = before.repartition(divisions=[10, 20, 50, 60])

    expected_divisions = (10, 20, 50, 60)
    assert after.divisions == expected_divisions

    # Same data before and after the repartition.
    assert eq(before, after)

    # The piece at position 0 should be exactly the first source row.
    head_piece = get(after.dask, (after._name, 0))
    head_rows = frame.iloc[:1]
    assert eq(head_piece, head_rows)
def test_categorize():
    """Exercise ``categorize`` on one column and on all columns.

    The input is a dask DataFrame named ``'x'`` built from two pandas frames
    keyed ``('x', 0)`` and ``('x', 1)``. After ``categorize('a')`` the test
    expects column ``a`` to be ``category`` and column ``b`` to remain
    ``'O'``, the values of ``a`` to be unchanged, and the dtypes obtained by
    evaluating only the first key to agree with the fully computed result.
    Finally ``categorize()`` with no arguments must yield all-categorical
    dtypes.
    """
    pieces = {
        ('x', 0): pd.DataFrame(
            {'a': ['Alice', 'Bob', 'Alice'], 'b': ['C', 'D', 'E']},
            index=[0, 1, 2],
        ),
        ('x', 1): pd.DataFrame(
            {'a': ['Bob', 'Charlie', 'Charlie'], 'b': ['A', 'A', 'B']},
            index=[3, 4, 5],
        ),
    }
    lazy = dd.DataFrame(pieces, 'x', ['a', 'b'], [3])
    plain = lazy.compute()

    lazy_cat = lazy.categorize('a')
    computed_cat = lazy_cat.compute()

    # Column 'a' is converted; column 'b' is left alone.
    assert computed_cat.dtypes['a'] == 'category'
    assert computed_cat.dtypes['b'] == 'O'

    # Values survive the conversion.
    assert list(computed_cat.a.astype('O')) == list(plain.a)

    # A single-key evaluation has the same dtypes as the full result.
    single_key_results = get(lazy_cat.dask, lazy_cat._keys()[:1])
    dtype_match = single_key_results[0].dtypes == computed_cat.dtypes
    assert dtype_match.all()

    # No-argument form categorizes every column.
    all_cat_dtypes = lazy.categorize().compute().dtypes
    assert (all_cat_dtypes == 'category').all()