def test__futures_to_collection(s, a, b):
    """_futures_to_collection should dispatch to the matching collection
    builder (dataframe / array / bag) and produce identical graphs."""
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    # DataFrame futures -> dask DataFrame
    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.dask == ddf2.dask

    # ndarray futures -> dask Array
    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)
    assert type(x) == type(y)
    assert x.dask == y.dask

    # list futures -> dask Bag.  Locals renamed so they no longer shadow
    # the worker fixture ``b``.
    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    bag = yield _futures_to_collection(remote_lists)
    bag2 = yield _futures_to_dask_bag(remote_lists)
    assert type(bag) == type(bag2)
    # BUG FIX: original asserted ``b.dask == b.dask`` (a tautology, always
    # true); compare the two bags' graphs against each other instead.
    assert bag.dask == bag2.dask

    yield e._shutdown()
def _read_csv(fn, executor=None, hdfs=None, lazy=False, lineterminator='\n',
              header=True, names=None, **kwargs):
    """Read CSV data from HDFS into a dask DataFrame (coroutine).

    Parameters
    ----------
    fn: str
        Glob pattern of file(s) on HDFS to read.
    executor: Executor, optional
        Executor to run tasks on; falls back to ``default_executor``.
    hdfs: HDFileSystem, optional
        Existing HDFS connection; a new one is created when not given.
    lazy: bool
        When True, return a lazy dask DataFrame built with
        ``from_imperative``; otherwise eagerly compute the pieces on the
        executor and gather them into a dask DataFrame of futures.
    lineterminator: str
        Record delimiter; also used to split files into byte blocks.
    header: bool
        Whether the first line of the first file holds column names.
    names: sequence, optional
        Explicit column names; inferred from the file head when omitted
        and ``header`` is true.
    **kwargs:
        Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns (via ``gen.Return``) a dask DataFrame.
    """
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator
    filenames = hdfs.glob(fn)
    # One list of lazy byte-blocks per file, split on record boundaries so
    # no CSV row straddles two blocks.
    blockss = [read_binary(fn, executor, hdfs, lazy=True,
                           delimiter=lineterminator)
               for fn in filenames]
    if names is None and header:
        # Peek at the head of the first file to infer the column names.
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
            names = head.columns
    # First block of each file skips one row.
    # NOTE(review): skiprows=1 is applied unconditionally, which assumes the
    # first block of every file begins with a header line — verify behavior
    # when ``header=False`` and explicit ``names`` are supplied.
    dfs1 = [[do(buffer_to_csv)(blocks[0], names=names, skiprows=1, **kwargs)] +
            [do(buffer_to_csv)(b, names=names, **kwargs)
             for b in blocks[1:]]
            for blocks in blockss]
    dfs2 = sum(dfs1, [])  # flatten the per-file lists into one list
    if lazy:
        from dask.dataframe import from_imperative
        raise gen.Return(from_imperative(dfs2, columns=names))
    else:
        futures = executor.compute(*dfs2)
        from distributed.collections import _futures_to_dask_dataframe
        df = yield _futures_to_dask_dataframe(futures)
        raise gen.Return(df)
def test_dataframes(e, s, a, b):
    """Round-trip a partitioned DataFrame through the scheduler and check a
    battery of expressions against a purely local dask.get computation."""
    frames = [pd.DataFrame({'x': np.random.random(100),
                            'y': np.random.random(100)},
                           index=list(range(start, start + 100)))
              for start in range(0, 1000, 100)]
    remote_futures = e.map(lambda x: x, frames)
    rdf = yield _futures_to_dask_dataframe(remote_futures, divisions=True)

    name = 'foo'
    graph = {(name, i): frame for i, frame in enumerate(frames)}
    divisions = list(range(0, 1000, 100)) + [999]
    ldf = dd.DataFrame(graph, name, frames[0].columns, divisions)
    assert rdf.divisions == ldf.divisions

    future = e.compute(rdf)
    result = yield future._result()
    tm.assert_frame_equal(result, ldf.compute(get=dask.get))

    exprs = [lambda df: df.x.mean(),
             lambda df: df.y.std(),
             lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
             lambda df: df.index,
             lambda df: df.x,
             lambda df: df.x.cumsum(),
             lambda df: df.groupby(['x', 'y']).count(),
             lambda df: df.loc[50:75]]
    for expr in exprs:
        expected = expr(ldf).compute(get=dask.get)
        future = e.compute(expr(rdf))
        actual = yield gen.with_timeout(timedelta(seconds=5),
                                       future._result())
        assert_equal(expected, actual)
def test_dataframes(e, s, a, b):
    """Assemble a dask DataFrame from remote futures and verify it matches
    an equivalent locally-constructed dask DataFrame."""
    local_frames = []
    for start in range(0, 1000, 100):
        idx = list(range(start, start + 100))
        local_frames.append(
            pd.DataFrame({'x': np.random.random(100),
                          'y': np.random.random(100)}, index=idx))

    futures = e.map(lambda x: x, local_frames)
    rdf = yield _futures_to_dask_dataframe(futures, divisions=True)

    name = 'foo'
    ldf = dd.DataFrame({(name, i): part
                        for i, part in enumerate(local_frames)},
                       name, local_frames[0].columns,
                       list(range(0, 1000, 100)) + [999])
    assert rdf.divisions == ldf.divisions

    fut = e.compute(rdf)
    got = yield fut._result()
    tm.assert_frame_equal(got, ldf.compute(get=dask.get))

    checks = (lambda df: df.x.mean(),
              lambda df: df.y.std(),
              lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
              lambda df: df.index,
              lambda df: df.x,
              lambda df: df.x.cumsum(),
              lambda df: df.groupby(['x', 'y']).count(),
              lambda df: df.loc[50:75])
    for check in checks:
        want = check(ldf).compute(get=dask.get)
        fut = e.compute(check(rdf))
        got = yield gen.with_timeout(timedelta(seconds=5), fut._result())
        assert_equal(want, got)
def test_no_divisions(s, a, b):
    """Frames gathered without divisions report unknown divisions while
    still carrying the right columns."""
    e = Executor((s.ip, s.port), start=False)
    yield e._start()
    futures = e.map(tm.makeTimeDataFrame, range(5, 10))
    ddf = yield _futures_to_dask_dataframe(futures)
    assert not ddf.known_divisions
    expected_columns = list(tm.makeTimeDataFrame(5).columns)
    assert list(ddf.columns) == expected_columns
def test__futures_to_dask_dataframe(c, s, a, b):
    """A list of DataFrame futures becomes a dask DataFrame whose
    computations run on the cluster."""
    futures = c.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(futures, divisions=True, client=c)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    total = ddf.x.sum()
    result = yield c._get(total.dask, total._keys())
    assert result == [sum(df.x.sum() for df in dfs)]
def test__futures_to_dask_dataframe(e, s, a, b):
    """DataFrame futures gather into a typed dask DataFrame with the
    expected divisions, computable via the executor."""
    futures = e.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(futures, divisions=True,
                                           executor=e)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)
    assert ddf._known_dtype

    total = ddf.x.sum()
    result = yield e._get(total.dask, total._keys())
    assert result == [sum(df.x.sum() for df in dfs)]
def f(c, a, b):
    """Start an executor on the current IOLoop and gather DataFrame futures
    into a dask DataFrame."""
    e = Executor((c.ip, c.port), start=False)
    IOLoop.current().spawn_callback(e._go)

    futures = e.map(lambda x: x, dfs)
    ddf = yield _futures_to_dask_dataframe(e, futures, divisions=True)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    total = ddf.x.sum()
    result = yield e._get(total.dask, total._keys())
    assert result == [sum(df.x.sum() for df in dfs)]

    yield e._shutdown()
def test__futures_to_dask_dataframe(s, a, b):
    """Manually start an executor, gather DataFrame futures into a dask
    DataFrame, and compute a reduction on the cluster."""
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    futures = e.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(futures, divisions=True,
                                           executor=e)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    total = ddf.x.sum()
    result = yield e._get(total.dask, total._keys())
    assert result == [sum(df.x.sum() for df in dfs)]

    yield e._shutdown()
def test__futures_to_collection(c, s, a, b):
    """_futures_to_collection should dispatch to the matching collection
    builder (dataframe / array / bag) and produce identical graphs."""
    # DataFrame futures -> dask DataFrame
    remote_dfs = c.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.dask == ddf2.dask

    # ndarray futures -> dask Array
    remote_arrays = c.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)
    assert type(x) == type(y)
    assert x.dask == y.dask

    # list futures -> dask Bag.  Locals renamed: the originals rebound the
    # client fixture ``c`` and shadowed the worker fixture ``b``.
    remote_lists = yield c._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    bag = yield _futures_to_collection(remote_lists)
    bag2 = yield _futures_to_dask_bag(remote_lists)
    assert type(bag) == type(bag2)
    # BUG FIX: original asserted ``b.dask == b.dask`` (a tautology, always
    # true); compare the two bags' graphs against each other instead.
    assert bag.dask == bag2.dask
def test_dataframes(s, a, b):
    """Compare remote dask DataFrame computations against local dask.get
    results for a battery of expressions."""
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    parts = [pd.DataFrame({'x': np.random.random(100),
                           'y': np.random.random(100)},
                          index=list(range(start, start + 100)))
             for start in range(0, 1000, 100)]
    remote_parts = e.map(lambda x: x, parts)
    rdf = yield _futures_to_dask_dataframe(remote_parts, divisions=True)

    name = 'foo'
    ldf = dd.DataFrame({(name, i): part for i, part in enumerate(parts)},
                       name, parts[0].columns,
                       list(range(0, 1000, 100)) + [999])
    assert rdf.divisions == ldf.divisions

    # e.compute returns a sequence here; unpack its single future
    future, = e.compute(rdf)
    result = yield future._result()
    tm.assert_frame_equal(result, ldf.compute(get=dask.get))

    exprs = [lambda df: df.x.mean(),
             lambda df: df.y.std(),
             lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
             lambda df: df.index,
             lambda df: df.x,
             lambda df: df.x.cumsum(),
             lambda df: df.loc[50:75]]
    for expr in exprs:
        expected = expr(ldf).compute(get=dask.get)
        future, = e.compute(expr(rdf))
        actual = yield future._result()
        assert_equal(expected, actual)
def test_dataframes(s, a, b):
    """Verify a remotely-assembled dask DataFrame agrees with a local one
    on whole-frame equality and several derived expressions."""
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    pieces = []
    for i in range(0, 1000, 100):
        piece = pd.DataFrame({'x': np.random.random(100),
                              'y': np.random.random(100)},
                             index=list(range(i, i + 100)))
        pieces.append(piece)

    futures = e.map(lambda x: x, pieces)
    ddf_remote = yield _futures_to_dask_dataframe(futures, divisions=True)

    name = 'foo'
    dsk = {(name, i): piece for i, piece in enumerate(pieces)}
    ddf_local = dd.DataFrame(dsk, name, pieces[0].columns,
                             list(range(0, 1000, 100)) + [999])
    assert ddf_remote.divisions == ddf_local.divisions

    # compute yields a sequence of futures; there is exactly one
    fut, = e.compute(ddf_remote)
    whole = yield fut._result()
    tm.assert_frame_equal(whole, ddf_local.compute(get=dask.get))

    for make in (lambda df: df.x.mean(),
                 lambda df: df.y.std(),
                 lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
                 lambda df: df.index,
                 lambda df: df.x,
                 lambda df: df.x.cumsum(),
                 lambda df: df.loc[50:75]):
        reference = make(ddf_local).compute(get=dask.get)
        fut, = e.compute(make(ddf_remote))
        outcome = yield fut._result()
        assert_equal(reference, outcome)
def test_no_divisions(c, s, a, b):
    """Frames gathered without divisions report unknown divisions but keep
    the correct column labels."""
    futures = c.map(tm.makeTimeDataFrame, range(5, 10))
    ddf = yield _futures_to_dask_dataframe(futures)
    assert not ddf.known_divisions
    expected_columns = list(tm.makeTimeDataFrame(5).columns)
    assert list(ddf.columns) == expected_columns
def test_no_divisions(e, s, a, b):
    """Without explicit divisions the gathered frame has unknown divisions
    while preserving the column set."""
    futures = e.map(tm.makeTimeDataFrame, range(5, 10))
    ddf = yield _futures_to_dask_dataframe(futures)
    assert not ddf.known_divisions
    assert list(ddf.columns) == list(tm.makeTimeDataFrame(5).columns)