def test_apply_infer_columns():
    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]})
    ddf = dd.from_pandas(df, npartitions=2)

    def return_df(x):
        # creates a new DataFrame whose columns are ['sum', 'mean']
        return pd.Series([x.sum(), x.mean()], index=["sum", "mean"])

    # DataFrame to completely different DataFrame
    result = ddf.apply(return_df, axis=1)
    assert isinstance(result, dd.DataFrame)
    tm.assert_index_equal(result.columns, pd.Index(["sum", "mean"]))
    assert eq(result, df.apply(return_df, axis=1))

    # DataFrame to Series
    result = ddf.apply(lambda x: 1, axis=1)
    assert isinstance(result, dd.Series)
    assert result.name is None
    assert eq(result, df.apply(lambda x: 1, axis=1))

    def return_df2(x):
        return pd.Series([x * 2, x * 3], index=["x2", "x3"])

    # Series to completely different DataFrame
    result = ddf.x.apply(return_df2)
    assert isinstance(result, dd.DataFrame)
    tm.assert_index_equal(result.columns, pd.Index(["x2", "x3"]))
    assert eq(result, df.x.apply(return_df2))

    # Series to Series
    result = ddf.x.apply(lambda x: 1)
    assert isinstance(result, dd.Series)
    assert result.name == "x"
    assert eq(result, df.x.apply(lambda x: 1))
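# A minimal companion sketch, not part of the original suite: it assumes the
# `columns=` keyword accepted by ddf.apply elsewhere in this file (see
# test_apply below) and checks that supplying metadata explicitly bypasses
# the column-inference path exercised above.
def test_apply_explicit_metadata_sketch():
    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]})
    ddf = dd.from_pandas(df, npartitions=2)

    # columns=None declares that the result is a Series, so no inference runs
    result = ddf.apply(lambda row: row["x"] + row["y"], axis=1, columns=None)
    assert isinstance(result, dd.Series)
    assert eq(result, df.apply(lambda row: row["x"] + row["y"], axis=1))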
def test_concat(join):
    pdf1 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[1, 2, 3, 4, 6, 7])
    ddf1 = dd.from_pandas(pdf1, 2)
    pdf2 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf2 = dd.from_pandas(pdf2, 2)

    # different columns
    pdf3 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'z': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf3 = dd.from_pandas(pdf3, 2)

    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3)]:
        result = dd.concat([dd1, dd2], join=join)
        expected = pd.concat([pd1, pd2], join=join)
        assert eq(result, expected)

    # test outer only; inner has a problem on the pandas side
    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z)]:
        result = dd.concat([dd1, dd2])
        expected = pd.concat([pd1, pd2])
        assert eq(result, expected)
def test_concat4_interleave_partitions():
    pdf1 = pd.DataFrame(np.random.randn(10, 5),
                        columns=list("ABCDE"), index=list("abcdefghij"))
    pdf2 = pd.DataFrame(np.random.randn(13, 5),
                        columns=list("ABCDE"), index=list("fghijklmnopqr"))
    pdf3 = pd.DataFrame(np.random.randn(13, 6),
                        columns=list("CDEXYZ"), index=list("fghijklmnopqr"))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    msg = ("All inputs have known divisions which cannnot be "
           "concatenated in order. Specify "
           "interleave_partitions=True to ignore order")

    cases = [[ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3],
             [ddf2, ddf1], [ddf2, ddf3],
             [ddf3, ddf1], [ddf3, ddf2]]
    for case in cases:
        pdcase = [c.compute() for c in case]

        with tm.assertRaisesRegexp(ValueError, msg):
            dd.concat(case)

        assert eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))
        assert eq(dd.concat(case, join="inner", interleave_partitions=True),
                  pd.concat(pdcase, join="inner"))

    msg = "'join' must be 'inner' or 'outer'"
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.concat([ddf1, ddf1], join="invalid", interleave_partitions=True)
def _check_split_data(orig, d):
    """Check that data is split properly"""
    keys = [k for k in d.dask if k[0].startswith("repartition-split")]
    keys = sorted(keys)
    sp = pd.concat([d._get(d.dask, k) for k in keys])
    assert eq(orig, sp)
    assert eq(orig, d)
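# A hedged usage sketch, not from the original suite: repartitioning with
# explicit divisions should produce "repartition-split" tasks that the
# helper above can reassemble. The test name and divisions are illustrative.
def test_check_split_data_sketch():
    df = pd.DataFrame({'x': range(10)}, index=range(10))
    ddf = dd.from_pandas(df, npartitions=2)
    _check_split_data(df, ddf.repartition(divisions=[0, 3, 6, 9]))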
def test_header_None():
    with filetexts({'.tmp.1.csv': '1,2',
                    '.tmp.2.csv': '',
                    '.tmp.3.csv': '3,4'}):
        df = read_csv('.tmp.*.csv', header=None)
        expected = pd.DataFrame({0: [1, 3], 1: [2, 4]})
        eq(df.compute().reset_index(drop=True), expected)
def test_join_indexed_dataframe_to_indexed_dataframe():
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 6, 7])
    a = dd.repartition(A, [1, 4, 7])

    B = pd.DataFrame({'y': list('abcdef')}, index=[1, 2, 4, 5, 6, 8])
    b = dd.repartition(B, [1, 2, 5, 8])

    c = join_indexed_dataframes(a, b, how='left')
    assert c.divisions[0] == a.divisions[0]
    assert c.divisions[-1] == max(a.divisions + b.divisions)
    assert eq(c, A.join(B))

    c = join_indexed_dataframes(a, b, how='right')
    assert c.divisions[0] == b.divisions[0]
    assert c.divisions[-1] == b.divisions[-1]
    assert eq(c, A.join(B, how='right'))

    c = join_indexed_dataframes(a, b, how='inner')
    assert c.divisions[0] == 1
    assert c.divisions[-1] == max(a.divisions + b.divisions)
    assert eq(c.compute(), A.join(B, how='inner'))

    c = join_indexed_dataframes(a, b, how='outer')
    assert c.divisions[0] == 1
    assert c.divisions[-1] == 8
    assert eq(c.compute(), A.join(B, how='outer'))

    assert (sorted(join_indexed_dataframes(a, b, how='inner').dask) ==
            sorted(join_indexed_dataframes(a, b, how='inner').dask))
    assert (sorted(join_indexed_dataframes(a, b, how='inner').dask) !=
            sorted(join_indexed_dataframes(a, b, how='outer').dask))
def test_groupby_on_index():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    d = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    full = d.compute()

    e = d.set_index('a')
    efull = full.set_index('a')
    assert eq(d.groupby('a').b.mean(), e.groupby(e.index).b.mean())

    def func(df):
        df.loc[:, 'b'] = df.b - df.b.mean()
        return df

    assert eq(d.groupby('a').apply(func).set_index('a'),
              e.groupby(e.index).apply(func))
    assert eq(d.groupby('a').apply(func), full.groupby('a').apply(func))
    assert eq(d.groupby('a').apply(func).set_index('a'),
              full.groupby('a').apply(func).set_index('a'))
    assert eq(efull.groupby(efull.index).apply(func),
              e.groupby(e.index).apply(func))
def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8,
                               9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # add artificial delays so that the last tasks finish first; this
    # simulates tasks completing out of order
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple hdf files, with the same artificial delays
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df16, out)
def test_unique():
    pdf = pd.DataFrame({"x": [1, 2, 1, 3, 3, 1, 4, 2, 3, 1],
                        "y": ["a", "c", "b", np.nan, "c",
                              "b", "a", "d", np.nan, "a"]})
    ddf = dd.from_pandas(pdf, npartitions=3)
    assert eq(ddf.x.unique(), pd.Series(pdf.x.unique(), name="x"))
    assert eq(ddf.y.unique(), pd.Series(pdf.y.unique(), name="y"))
def test_read_csv_with_datetime_index_partitions_n():
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn, index_col=0, header=0, usecols=[0, 4],
                         parse_dates=['Date'])
        # fn is small, so set chunkbytes low to force multiple partitions
        ddf = dd.read_csv(fn, index='Date', header=0, usecols=[0, 4],
                          parse_dates=['Date'], chunkbytes=400)
        eq(df, ddf)
def test_merge_index_without_divisions(shuffle):
    a = pd.DataFrame({"x": [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5])
    b = pd.DataFrame({"y": [1, 2, 3, 4, 5]}, index=[5, 4, 3, 2, 1])

    aa = dd.from_pandas(a, npartitions=3, sort=False)
    bb = dd.from_pandas(b, npartitions=2)

    eq(aa.join(bb, how="inner", shuffle=shuffle), a.join(b, how="inner"))
def test_from_pandas_with_datetime_index():
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn, index_col=0, header=0, usecols=[0, 4],
                         parse_dates=['Date'])
        ddf = dd.from_pandas(df, 2)
        eq(df, ddf)
        ddf = dd.from_pandas(df, chunksize=2)
        eq(df, ddf)
def test_read_csv_header_issue_823():
    text = '''a b c-d\n1 2 3\n4 5 6'''.replace(' ', '\t')
    with filetext(text) as fn:
        df = dd.read_csv(fn, sep='\t')
        eq(df, pd.read_csv(fn, sep='\t'))

        df = dd.read_csv(fn, delimiter='\t')
        eq(df, pd.read_csv(fn, delimiter='\t'))
def test_rolling_partition_size():
    df = pd.DataFrame(np.random.randn(50, 2))
    ddf = dd.from_pandas(df, npartitions=5)

    for obj, dobj in [(df, ddf), (df[0], ddf[0])]:
        eq(obj.rolling(10).mean(), dobj.rolling(10).mean())
        eq(obj.rolling(11).mean(), dobj.rolling(11).mean())
        raises(NotImplementedError, lambda: dobj.rolling(12).mean())
def test_set_partition_tasks_3(shuffle):
    df = pd.DataFrame(np.random.random((10, 2)), columns=['x', 'y'])
    ddf = dd.from_pandas(df, npartitions=5)

    ddf2 = ddf.set_index('x', shuffle=shuffle, max_branch=2)
    df2 = df.set_index('x')
    eq(df2, ddf2)
    assert ddf2.npartitions == ddf.npartitions
def test_shuffle_sort(shuffle):
    df = pd.DataFrame({'x': [1, 2, 3, 2, 1], 'y': [9, 8, 7, 1, 5]})
    ddf = dd.from_pandas(df, npartitions=3)

    df2 = df.set_index('x').sort_index()
    ddf2 = ddf.set_index('x', shuffle=shuffle)

    eq(ddf2.loc[2:3], df2.loc[2:3])
def test_numeric_column_names():
    # df.groupby(0)[df.columns] fails if all columns are numbers (pandas bug)
    # This ensures that we cast all column iterables to list beforehand.
    df = pd.DataFrame({0: [0, 1, 0, 1], 1: [1, 2, 3, 4]})
    ddf = dd.from_pandas(df, npartitions=2)
    eq(ddf.groupby(0).sum(), df.groupby(0).sum())
    eq(ddf.groupby(0).apply(lambda x: x), df.groupby(0).apply(lambda x: x))
def test_from_dask_array_struct_dtype():
    x = np.array([(1, 'a'), (2, 'b')], dtype=[('a', 'i4'), ('b', 'object')])
    y = da.from_array(x, chunks=(1,))
    df = dd.from_dask_array(y)
    assert tuple(df.columns) == y.dtype.names
    tm.assert_index_equal(df.columns, pd.Index(['a', 'b']))
    assert eq(df, pd.DataFrame(x))

    assert eq(dd.from_dask_array(y, columns=['b', 'a']),
              pd.DataFrame(x, columns=['b', 'a']))
def test_read_csv_files():
    with filetexts(files, mode='b'):
        df = read_csv('2014-01-*.csv')
        eq(df, expected, check_dtype=False)

        fn = '2014-01-01.csv'
        df = read_csv(fn)
        expected2 = pd.read_csv(BytesIO(files[fn]))
        eq(df, expected2, check_dtype=False)
def test_map():
    assert eq(d.a.map(lambda x: x + 1), full.a.map(lambda x: x + 1))
    lk = dict((v, v + 1) for v in full.a.values)
    assert eq(d.a.map(lk), full.a.map(lk))
    assert eq(d.b.map(lk), full.b.map(lk))
    lk = pd.Series(lk)
    assert eq(d.a.map(lk), full.a.map(lk))
    assert eq(d.b.map(lk), full.b.map(lk))
    assert raises(TypeError, lambda: d.a.map(d.b))
def test_from_pandas_non_sorted():
    df = pd.DataFrame({'x': [1, 2, 3]}, index=[3, 1, 2])

    ddf = dd.from_pandas(df, npartitions=2, sort=False)
    assert not ddf.known_divisions
    eq(df, ddf)

    ddf = dd.from_pandas(df, chunksize=2, sort=False)
    assert not ddf.known_divisions
    eq(df, ddf)
def test_getitem_slice():
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                       'B': [9, 8, 7, 6, 5, 4, 3, 2, 1],
                       'C': [True, False, True] * 3},
                      index=list('abcdefghi'))
    ddf = dd.from_pandas(df, 3)
    assert eq(ddf['a':'e'], df['a':'e'])
    assert eq(ddf['a':'b'], df['a':'b'])
    assert eq(ddf['f':], df['f':])
def test_nlargest():
    from string import ascii_lowercase
    df = pd.DataFrame({"a": np.random.permutation(10),
                       "b": list(ascii_lowercase[:10])})
    ddf = dd.from_pandas(df, npartitions=2)

    res = ddf.nlargest(5, "a")
    exp = df.nlargest(5, "a")
    eq(res, exp)
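# A hedged companion sketch, not from the original suite: it assumes
# dd.Series exposes an `nlargest(n)` mirroring the DataFrame method above.
def test_nlargest_series_sketch():
    s = pd.Series(np.random.permutation(10))
    ds = dd.from_pandas(s, npartitions=2)
    eq(ds.nlargest(5), s.nlargest(5))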
def test_set_index_self_index(shuffle):
    df = pd.DataFrame({'x': np.random.random(100),
                       'y': np.random.random(100) // 0.2},
                      index=np.random.random(100))

    a = dd.from_pandas(df, npartitions=4)
    b = a.set_index(a.index, shuffle=shuffle)
    assert a is b

    eq(b, df.set_index(df.index))
def _check(a, b, aa, bb):
    """Check split results.

    Nested helper: ``divisions`` and ``L`` come from the enclosing
    test's scope.
    """
    assert isinstance(a, dd.DataFrame)
    assert isinstance(b, dd.DataFrame)
    assert isinstance(aa, dd.DataFrame)
    assert isinstance(bb, dd.DataFrame)
    assert eq(a, aa)
    assert eq(b, bb)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(L, list)
    assert len(divisions) == 1 + len(L)
def test_from_pandas_small():
    for sort in [True, False]:
        for i in [0, 2]:
            df = pd.DataFrame({'x': [0] * i})
            ddf = dd.from_pandas(df, npartitions=5, sort=sort)
            eq(df, ddf)

            s = pd.Series([0] * i, name='x')
            ds = dd.from_pandas(s, npartitions=5, sort=sort)
            eq(s, ds)
def test_apply_shuffle():
    pdf = pd.DataFrame({'A': [1, 2, 3, 4] * 5,
                        'B': np.random.randn(20),
                        'C': np.random.randn(20),
                        'D': np.random.randn(20)})
    ddf = dd.from_pandas(pdf, 3)

    assert eq(ddf.groupby('A').apply(lambda x: x.sum()),
              pdf.groupby('A').apply(lambda x: x.sum()))
    assert eq(ddf.groupby(ddf['A']).apply(lambda x: x.sum()),
              pdf.groupby(pdf['A']).apply(lambda x: x.sum()))
    assert eq(ddf.groupby(ddf['A'] + 1).apply(lambda x: x.sum()),
              pdf.groupby(pdf['A'] + 1).apply(lambda x: x.sum()))

    # SeriesGroupBy
    assert eq(ddf.groupby('A')['B'].apply(lambda x: x.sum()),
              pdf.groupby('A')['B'].apply(lambda x: x.sum()))
    assert eq(ddf.groupby(ddf['A'])['B'].apply(lambda x: x.sum()),
              pdf.groupby(pdf['A'])['B'].apply(lambda x: x.sum()))
    assert eq(ddf.groupby(ddf['A'] + 1)['B'].apply(lambda x: x.sum()),
              pdf.groupby(pdf['A'] + 1)['B'].apply(lambda x: x.sum()))

    # DataFrameGroupBy with column slice
    assert eq(ddf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()),
              pdf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()))
    assert eq(ddf.groupby(ddf['A'])[['B', 'C']].apply(lambda x: x.sum()),
              pdf.groupby(pdf['A'])[['B', 'C']].apply(lambda x: x.sum()))
    assert eq(ddf.groupby(ddf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()),
              pdf.groupby(pdf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()))
def test_read_csv_with_datetime_index_partitions_one():
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn, index_col=0, header=0, usecols=[0, 4],
                         parse_dates=['Date'])
        # chunkbytes is set large enough to produce a single chunk
        ddf = dd.read_csv(fn, index='Date', header=0, usecols=[0, 4],
                          parse_dates=['Date'], chunkbytes=10000000)
        eq(df, ddf)

        # because fn is so small, this defaults to a single chunk
        ddf = dd.read_csv(fn, index='Date', header=0, usecols=[0, 4],
                          parse_dates=['Date'])
        eq(df, ddf)
def test_read_csv(open_comp_pair, infer):
    myopen, compression = open_comp_pair
    text_ = text if compression is None else text.encode()
    ext = dict((v, k) for (k, v) in compressions.items()).get(compression, '')
    with filetext(text_, open=myopen, extension=ext) as fn:
        compression = 'infer' if infer else compression
        f = dd.read_csv(fn, chunkbytes=30, compression=compression,
                        lineterminator='\n')
        assert list(f.columns) == ['name', 'amount']
        assert f.npartitions > 1
        result = f.compute(get=dask.get)
        # index may be different
        assert eq(result.reset_index(drop=True),
                  pd.read_csv(fn, compression=compression,
                              lineterminator='\n'))
def test_reductions_frame_dtypes():
    df = pd.DataFrame({'int': [1, 2, 3, 4, 5, 6, 7, 8],
                       'float': [1., 2., 3., 4., np.nan, 6., 7., 8.],
                       'dt': [pd.NaT] + [datetime(2011, i, 1)
                                         for i in range(1, 8)],
                       'str': list('abcdefgh')})
    ddf = dd.from_pandas(df, 3)
    assert eq(df.sum(), ddf.sum())
    assert eq(df.min(), ddf.min())
    assert eq(df.max(), ddf.max())
    assert eq(df.count(), ddf.count())
    assert eq(df.std(), ddf.std())
    assert eq(df.var(), ddf.var())
    assert eq(df.std(ddof=0), ddf.std(ddof=0))
    assert eq(df.var(ddof=0), ddf.var(ddof=0))
    assert eq(df.mean(), ddf.mean())

    assert eq(df._get_numeric_data(), ddf._get_numeric_data())

    numerics = ddf[['int', 'float']]
    assert numerics._get_numeric_data().dask == numerics.dask
def test_map_partitions_column_info():
    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [5, 6, 7, 8]})
    a = dd.from_pandas(df, npartitions=2)

    b = dd.map_partitions(lambda x: x, a.columns, a)
    assert b.columns == a.columns
    assert eq(df, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda df: df.x + df.y, None, a)
    assert b.name is None
    assert isinstance(b, dd.Series)

    b = dd.map_partitions(lambda df: df.x + 1, 'x', a)
    assert isinstance(b, dd.Series)
    assert b.name == 'x'
def test_Series_from_dask_array():
    x = da.ones(10, chunks=4)

    ser = from_dask_array(x, 'a')
    assert ser.name == 'a'
    assert list(ser.divisions) == [0, 4, 8, 9]
    assert (ser.compute(get=get_sync).values == x.compute(get=get_sync)).all()

    ser = from_dask_array(x)
    assert ser.name is None

    # dd.from_array should re-route to from_dask_array
    ser2 = dd.from_array(x)
    assert eq(ser, ser2)
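# A hedged 2-d sketch, not from the original suite: it assumes
# from_dask_array also accepts a 2-d array plus a `columns=` list, as the
# struct-dtype test above does, producing a dask DataFrame.
def test_DataFrame_from_dask_array_sketch():
    x = da.ones((8, 3), chunks=(4, 3))
    df = dd.from_dask_array(x, columns=['a', 'b', 'c'])
    assert isinstance(df, dd.DataFrame)
    assert list(df.columns) == ['a', 'b', 'c']
    eq(df, pd.DataFrame(x.compute(), columns=['a', 'b', 'c']))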
def test_scalar_arithmetics_with_dask_instances():
    s = dd.core.Scalar({('s', 0): 10}, 's')
    e = 10

    pds = pd.Series([1, 2, 3, 4, 5, 6, 7])
    dds = dd.from_pandas(pds, 2)

    pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]})
    ddf = dd.from_pandas(pdf, 2)

    # pandas Series
    result = pds + s   # returns a pd.Series (automatically computed)
    assert isinstance(result, pd.Series)
    assert eq(result, pds + e)

    result = s + pds   # returns a dd.Series
    assert isinstance(result, dd.Series)
    assert eq(result, pds + e)

    # dask Series
    result = dds + s   # returns a dd.Series
    assert isinstance(result, dd.Series)
    assert eq(result, pds + e)

    result = s + dds   # returns a dd.Series
    assert isinstance(result, dd.Series)
    assert eq(result, pds + e)

    # pandas DataFrame
    result = pdf + s   # returns a pd.DataFrame (automatically computed)
    assert isinstance(result, pd.DataFrame)
    assert eq(result, pdf + e)

    result = s + pdf   # returns a dd.DataFrame
    assert isinstance(result, dd.DataFrame)
    assert eq(result, pdf + e)

    # dask DataFrame
    result = ddf + s   # returns a dd.DataFrame
    assert isinstance(result, dd.DataFrame)
    assert eq(result, pdf + e)

    result = s + ddf   # returns a dd.DataFrame
    assert isinstance(result, dd.DataFrame)
    assert eq(result, pdf + e)
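# A hedged follow-on sketch, not from the original suite: it assumes two
# lazy Scalars combine into another lazy Scalar, mirroring the mixed
# Series/DataFrame cases above.
def test_scalar_with_scalar_sketch():
    s = dd.core.Scalar({('s', 0): 10}, 's')
    t = dd.core.Scalar({('t', 0): 4}, 't')
    result = s + t
    assert isinstance(result, dd.core.Scalar)
    assert result.compute() == 14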
def test_read_csv_index():
    with filetext(text) as fn:
        f = dd.read_csv(fn, blocksize=20).set_index('amount')
        result = f.compute(get=get_sync)
        assert result.index.name == 'amount'

        blocks = dd.DataFrame._get(f.dask, f._keys(), get=get_sync)
        for i, block in enumerate(blocks):
            if i < len(f.divisions) - 2:
                assert (block.index < f.divisions[i + 1]).all()
            if i > 0:
                assert (block.index >= f.divisions[i]).all()

        expected = pd.read_csv(fn).set_index('amount')
        assert eq(result, expected)
def test_datetime_accessor():
    df = pd.DataFrame({'x': [1, 2, 3, 4]})
    df['x'] = df.x.astype('M8[us]')

    a = dd.from_pandas(df, 2)

    assert 'date' in dir(a.x.dt)

    # pandas loses Series.name via the datetime accessor
    # see https://github.com/pydata/pandas/issues/10712
    assert eq(a.x.dt.date, df.x.dt.date, check_names=False)
    assert (a.x.dt.to_pydatetime().compute() == df.x.dt.to_pydatetime()).all()

    assert a.x.dt.date.dask == a.x.dt.date.dask
    assert a.x.dt.to_pydatetime().dask == a.x.dt.to_pydatetime().dask
def test_quantile():
    # series / multiple
    result = d.b.quantile([.3, .7])
    exp = full.b.quantile([.3, .7])  # result may differ
    assert len(result) == 2
    assert result.divisions == (.3, .7)
    assert eq(result.index, exp.index)
    assert isinstance(result, dd.Series)

    result = result.compute()
    assert isinstance(result, pd.Series)
    assert result.iloc[0] == 0
    assert 5 < result.iloc[1] < 6

    # index
    s = pd.Series(np.arange(10), index=np.arange(10))
    ds = dd.from_pandas(s, 2)

    result = ds.index.quantile([.3, .7])
    exp = s.quantile([.3, .7])
    assert len(result) == 2
    assert result.divisions == (.3, .7)
    assert eq(result.index, exp.index)
    assert isinstance(result, dd.Series)

    result = result.compute()
    assert isinstance(result, pd.Series)
    assert 1 < result.iloc[0] < 2
    assert 7 < result.iloc[1] < 8

    # series / single
    result = d.b.quantile(.5)
    exp = full.b.quantile(.5)  # result may differ
    assert isinstance(result, dd.core.Scalar)
    result = result.compute()
    assert 4 < result < 6
def test_get_dummies_kwargs():
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category')
    exp = pd.get_dummies(s, prefix='X', prefix_sep='-')

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, prefix='X', prefix_sep='-')
    assert eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index(['X-1', 'X-2', 'X-3', 'X-4']))

    exp = pd.get_dummies(s, drop_first=True)
    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, drop_first=True)
    assert eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # nan
    s = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype='category')
    exp = pd.get_dummies(s)
    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds)
    assert eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # dummy_na
    exp = pd.get_dummies(s, dummy_na=True)
    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, dummy_na=True)
    assert eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index([1, 2, 3, 5, np.nan]))

    msg = 'sparse=True is not supported'
    with tm.assertRaisesRegexp(NotImplementedError, msg):
        dd.get_dummies(ds, sparse=True)
def test_encoding_gh601(encoding):
    ar = pd.Series(range(0, 100))
    br = ar % 7
    cr = br * 3.3
    dr = br / 1.9836
    test_df = pd.DataFrame({'a': ar, 'b': br, 'c': cr, 'd': dr})

    with tmpfile('.csv') as fn:
        test_df.to_csv(fn, encoding=encoding, index=False)

        a = pd.read_csv(fn, encoding=encoding)
        d = dd.read_csv(fn, encoding=encoding, chunkbytes=1000)
        d = d.compute()
        d.index = range(len(d.index))
        assert eq(d, a)
def test_groupby_multilevel_getitem():
    df = pd.DataFrame({'a': [1, 2, 3, 1, 2, 3],
                       'b': [1, 2, 1, 4, 2, 1],
                       'c': [1, 3, 2, 1, 1, 2],
                       'd': [1, 2, 1, 1, 2, 2]})
    ddf = dd.from_pandas(df, 2)

    cases = [(ddf.groupby('a')['b'], df.groupby('a')['b']),
             (ddf.groupby(['a', 'b']), df.groupby(['a', 'b'])),
             (ddf.groupby(['a', 'b'])['c'], df.groupby(['a', 'b'])['c']),
             (ddf.groupby('a')[['b', 'c']], df.groupby('a')[['b', 'c']]),
             (ddf.groupby('a')[['b']], df.groupby('a')[['b']]),
             (ddf.groupby(['a', 'b', 'c']), df.groupby(['a', 'b', 'c']))]

    for d, p in cases:
        assert isinstance(d, dd.groupby._GroupBy)
        assert isinstance(p, pd.core.groupby.GroupBy)
        assert eq(d.sum(), p.sum())
        assert eq(d.min(), p.min())
        assert eq(d.max(), p.max())
        assert eq(d.count(), p.count())
        assert eq(d.mean(), p.mean().astype(float))
def test_to_hdf_multiple_files():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8,
                               9, 10, 11, 12, 13, 14, 15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8.,
                               9., 10., 11., 12., 13., 14., 15., 16.])
    b = dd.from_pandas(df16, 16)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # saving to multiple files making sure order is kept
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        b.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df16, out)

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # test hdf object
    with tmpfile('h5') as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)
def test_indexed_concat(join):
    A = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7], 'y': list('abcdef')},
                     index=[1, 2, 3, 4, 6, 7])
    a = dd.repartition(A, [1, 4, 7])

    B = pd.DataFrame({'x': [10, 20, 40, 50, 60, 80]},
                     index=[1, 2, 4, 5, 6, 8])
    b = dd.repartition(B, [1, 2, 5, 8])

    result = concat_indexed_dataframes([a, b], join=join)
    expected = pd.concat([A, B], axis=0, join=join)
    assert eq(result, expected)

    assert (sorted(concat_indexed_dataframes([a, b], join=join).dask) ==
            sorted(concat_indexed_dataframes([a, b], join=join).dask))
    assert (sorted(concat_indexed_dataframes([a, b], join='inner').dask) !=
            sorted(concat_indexed_dataframes([a, b], join='outer').dask))
def test_to_hdf_modes_multiple_files():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    # appending a single partition to existing data
    a = dd.from_pandas(df, 1)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data2'), '/data')
        a.to_hdf(fn, '/data', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # appending two partitions to existing data
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data2'), '/data')
        a.to_hdf(fn, '/data', mode='a')
        out = dd.read_hdf(fn, '/data')
        eq(df.append(df), out)

    # overwriting a file with two partitions
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data1'), '/data')
        a.to_hdf(fn, '/data', mode='w')
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # overwriting a single partition, keeping other partitions
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data1'), '/data')
        a.to_hdf(fn, '/data', mode='a', append=False)
        out = dd.read_hdf(fn, '/data')
        eq(df.append(df), out)
def test_full_groupby():
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                       'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]},
                      index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(df, npartitions=3)

    assert raises(Exception, lambda: ddf.groupby('does_not_exist'))
    assert raises(Exception, lambda: ddf.groupby('a').does_not_exist)
    assert 'b' in dir(ddf.groupby('a'))

    def func(df):
        df['b'] = df.b - df.b.mean()
        return df

    assert eq(df.groupby('a').apply(func), ddf.groupby('a').apply(func))
def test_dropna():
    df = pd.DataFrame({'x': [np.nan, 2, 3, 4, np.nan, 6],
                       'y': [1, 2, np.nan, 4, np.nan, np.nan],
                       'z': [1, 2, 3, 4, np.nan, np.nan]},
                      index=[10, 20, 30, 40, 50, 60])
    ddf = dd.from_pandas(df, 3)

    assert eq(ddf.x.dropna(), df.x.dropna())
    assert eq(ddf.y.dropna(), df.y.dropna())
    assert eq(ddf.z.dropna(), df.z.dropna())

    assert eq(ddf.dropna(), df.dropna())
    assert eq(ddf.dropna(how='all'), df.dropna(how='all'))
    assert eq(ddf.dropna(subset=['x']), df.dropna(subset=['x']))
    assert eq(ddf.dropna(subset=['y', 'z']), df.dropna(subset=['y', 'z']))
    assert eq(ddf.dropna(subset=['y', 'z'], how='all'),
              df.dropna(subset=['y', 'z'], how='all'))
def test_dataframe_quantile():
    # column X is used to check column order and the resulting divisions
    df = pd.DataFrame({'A': np.arange(20),
                       'X': np.arange(20, 40),
                       'B': np.arange(10, 30),
                       'C': ['a', 'b', 'c', 'd'] * 5},
                      columns=['A', 'X', 'B', 'C'])
    ddf = dd.from_pandas(df, 3)

    result = ddf.quantile()
    assert result.npartitions == 1
    assert result.divisions == ('A', 'X')

    result = result.compute()
    assert isinstance(result, pd.Series)
    tm.assert_index_equal(result.index, pd.Index(['A', 'X', 'B']))
    assert (result > pd.Series([16, 36, 26], index=['A', 'X', 'B'])).all()
    assert (result < pd.Series([17, 37, 27], index=['A', 'X', 'B'])).all()

    result = ddf.quantile([0.25, 0.75])
    assert result.npartitions == 1
    assert result.divisions == (0.25, 0.75)

    result = result.compute()
    assert isinstance(result, pd.DataFrame)
    tm.assert_index_equal(result.index, pd.Index([0.25, 0.75]))
    tm.assert_index_equal(result.columns, pd.Index(['A', 'X', 'B']))
    minexp = pd.DataFrame([[1, 21, 11], [17, 37, 27]],
                          index=[0.25, 0.75], columns=['A', 'X', 'B'])
    assert (result > minexp).all().all()
    maxexp = pd.DataFrame([[2, 22, 12], [18, 38, 28]],
                          index=[0.25, 0.75], columns=['A', 'X', 'B'])
    assert (result < maxexp).all().all()

    assert eq(ddf.quantile(axis=1), df.quantile(axis=1))
    assert raises(ValueError, lambda: ddf.quantile([0.25, 0.75], axis=1))
def test_from_castra_with_selection():
    """Optimizations fuse getitems with load_partitions

    We used to use getitem for both column access and selections
    """
    castra = pytest.importorskip('castra')
    blosc = pytest.importorskip('blosc')
    if (LooseVersion(blosc.__version__) == '1.3.0' or
            LooseVersion(castra.__version__) < '0.1.8'):
        pytest.skip()
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [2, 3, 4, 5]},
                      index=pd.Index([1., 2., 3., 4.], name='ind'))
    a = dd.from_pandas(df, 2)
    b = dd.from_castra(a.to_castra())

    assert eq(b[b.y > 3].x, df[df.y > 3].x)
def test_apply():
    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [10, 20, 30, 40]})
    a = dd.from_pandas(df, npartitions=2)

    func = lambda row: row['x'] + row['y']
    eq(a.x.apply(lambda x: x + 1), df.x.apply(lambda x: x + 1))
    eq(a.apply(lambda xy: xy[0] + xy[1], axis=1, columns=None),
       df.apply(lambda xy: xy[0] + xy[1], axis=1))

    assert raises(NotImplementedError, lambda: a.apply(lambda xy: xy, axis=0))
    assert raises(ValueError, lambda: a.apply(lambda xy: xy, axis=1))

    func = lambda x: pd.Series([x, x])
    eq(a.x.apply(func, name=[0, 1]), df.x.apply(func))
def test_to_csv_multiple_files_cornercases():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, "data_*_*.csv")
            a.to_csv(fn)

    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8,
                               9, 10, 11, 12, 13, 14, 15, 16]})
    a = dd.from_pandas(df16, 16)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df16)

    # test handling existing files when links are optimized out
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_1.csv')
        a.to_csv(fn, index=False)
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, mode='w', index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df)

    # test handling existing files when links are optimized out
    a = dd.from_pandas(df16, 16)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_01.csv')
        a.to_csv(fn, index=False)
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, mode='w', index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df16)

    # test handling existing files when mode isn't 'w'
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.csv')
        with pytest.raises(ValueError):
            a.to_csv(fn, mode='a')
def test_set_partition_compute():
    d2 = d.set_partition('b', [0, 2, 9])
    d3 = d.set_partition('b', [0, 2, 9], compute=True)

    assert eq(d2, d3)
    assert eq(d2, full.set_index('b'))
    assert eq(d3, full.set_index('b'))
    assert len(d2.dask) > len(d3.dask)

    d4 = d.set_partition(d.b, [0, 2, 9])
    d5 = d.set_partition(d.b, [0, 2, 9], compute=True)
    exp = full.copy()
    exp.index = exp.b
    assert eq(d4, d5)
    assert eq(d4, exp)
    assert eq(d5, exp)
    assert len(d4.dask) > len(d5.dask)
def test_embarrassingly_parallel_operations():
    df = pd.DataFrame({'x': [1, 2, 3, 4, None, 6],
                       'y': list('abdabd')},
                      index=[10, 20, 30, 40, 50, 60])
    a = dd.from_pandas(df, 2)

    assert eq(a.x.astype('float32'), df.x.astype('float32'))
    assert a.x.astype('float32').compute().dtype == 'float32'

    assert eq(a.x.dropna(), df.x.dropna())

    assert eq(a.x.fillna(100), df.x.fillna(100))
    assert eq(a.fillna(100), df.fillna(100))

    assert eq(a.x.between(2, 4), df.x.between(2, 4))

    assert eq(a.x.clip(2, 4), df.x.clip(2, 4))

    assert eq(a.x.notnull(), df.x.notnull())

    assert len(a.sample(0.5).compute()) < len(df)
def test_to_csv():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpfile('csv') as fn:
            a.to_csv(fn, index=False)
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)

        with tmpfile('csv') as fn:
            r = a.to_csv(fn, index=False, compute=False)
            r.compute()
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)

        with tmpdir() as dn:
            fn = os.path.join(dn, 'data_*.csv')
            a.to_csv(fn, index=False)
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)
def test_getitem_timestamp_str():
    df = pd.DataFrame({'A': np.random.randn(100),
                       'B': np.random.randn(100)},
                      index=pd.date_range('2011-01-01',
                                          freq='H', periods=100))
    ddf = dd.from_pandas(df, 10)

    # partial string slice
    assert eq(df['2011-01-02'], ddf['2011-01-02'])
    assert eq(df['2011-01-02':'2011-01-10'], ddf['2011-01-02':'2011-01-10'])

    df = pd.DataFrame({'A': np.random.randn(100),
                       'B': np.random.randn(100)},
                      index=pd.date_range('2011-01-01',
                                          freq='D', periods=100))
    ddf = dd.from_pandas(df, 50)
    assert eq(df['2011-01'], ddf['2011-01'])
    assert eq(df['2011'], ddf['2011'])

    assert eq(df['2011-01':'2012-05'], ddf['2011-01':'2012-05'])
    assert eq(df['2011':'2015'], ddf['2011':'2015'])
def test_read_csv_with_nrows():
    with filetext(text) as fn:
        f = dd.read_csv(fn, nrows=3)
        assert list(f.columns) == ['name', 'amount']
        assert f.npartitions == 1
        assert eq(dd.read_csv(fn, nrows=3), pd.read_csv(fn, nrows=3))
def test_from_pandas_single_row():
    df = pd.DataFrame({'x': [1]}, index=[1])
    ddf = dd.from_pandas(df, npartitions=1)
    assert ddf.divisions == (1, 1)
    assert eq(ddf, df)