def test_set_index_divisions_sorted():
    p1 = pd.DataFrame({'x': [10, 11, 12], 'y': ['a', 'a', 'a']})
    p2 = pd.DataFrame({'x': [13, 14, 15], 'y': ['b', 'b', 'c']})
    p3 = pd.DataFrame({'x': [16, 17, 18], 'y': ['d', 'e', 'e']})
    ddf = dd.DataFrame({('x', 0): p1, ('x', 1): p2, ('x', 2): p3},
                       'x', p1, [None, None, None, None])
    df = ddf.compute()

    def throw(*args, **kwargs):
        raise Exception("Shouldn't have computed")

    with dask.set_options(get=throw):
        res = ddf.set_index('x', divisions=[10, 13, 16, 18], sorted=True)
    assert_eq(res, df.set_index('x'))

    with dask.set_options(get=throw):
        res = ddf.set_index('y', divisions=['a', 'b', 'd', 'e'], sorted=True)
    assert_eq(res, df.set_index('y'))

    # with sorted=True, divisions must be same length as df.divisions
    with pytest.raises(ValueError):
        ddf.set_index('y', divisions=['a', 'b', 'c', 'd', 'e'], sorted=True)

    # Divisions must be sorted
    with pytest.raises(ValueError):
        ddf.set_index('y', divisions=['a', 'b', 'd', 'c'], sorted=True)

def test_maybe_compress():
    try_converters = [bytes, memoryview]
    try_compressions = ['zlib', 'lz4']

    payload = b'123'
    with dask.set_options(compression=None):
        for f in try_converters:
            assert maybe_compress(f(payload)) == (None, payload)

    for compression in try_compressions:
        try:
            __import__(compression)
        except ImportError:
            continue

        with dask.set_options(compression=compression):
            for f in try_converters:
                payload = b'123'
                assert maybe_compress(f(payload)) == (None, payload)

                payload = b'0' * 10000
                rc, rd = maybe_compress(f(payload))
                # For some reason compressing memoryviews can force blosc...
                assert rc in (compression, 'blosc')
                assert compressions[rc]['decompress'](rd) == payload

def _futures_to_dask_array(futures, executor=None):
    executor = default_executor(executor)
    futures = np.array(futures, dtype=object)
    slices = [((0,) * i + (slice(None, None),) + (0,) * (futures.ndim - i - 1))
              for i in range(futures.ndim)]
    chunks = [[executor.submit(get_dim, x, i) for x in futures[slc]]
              for i, slc in enumerate(slices)]
    dtype = executor.submit(get_dtype, futures.flat[0])

    chunks, dtype = yield executor._gather([chunks, dtype])
    chunks = tuple(map(tuple, chunks))

    name = 'array-from-futures-' + tokenize(*futures.flat)
    keys = list(product([name], *map(range, futures.shape)))
    values = list(futures.flat)
    dsk = dict(zip(keys, values))

    if _globals['get'] != executor.get:
        print("Setting global dask scheduler to use distributed")
        dask.set_options(get=executor.get)

    raise gen.Return(da.Array(dsk, name, chunks, dtype))

def test_read_text(hdfs):
    db = pytest.importorskip('dask.bag')
    import multiprocessing as mp
    pool = mp.get_context('spawn').Pool(2)

    with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
        f.write('Alice 100\nBob 200\nCharlie 300'.encode())

    with hdfs.open('%s/text.2.txt' % basedir, 'wb') as f:
        f.write('Dan 400\nEdith 500\nFrank 600'.encode())

    with hdfs.open('%s/other.txt' % basedir, 'wb') as f:
        f.write('a b\nc d'.encode())

    b = db.read_text('hdfs://%s/text.*.txt' % basedir)
    with dask.set_options(pool=pool):
        result = b.str.strip().str.split().map(len).compute()

    assert result == [2, 2, 2, 2, 2, 2]

    b = db.read_text('hdfs://%s/other.txt' % basedir)
    with dask.set_options(pool=pool):
        result = b.str.split().flatten().compute()

    assert result == ['a', 'b', 'c', 'd']

def get_x_hourlywrf_PNNL2018(homedir,
                             spatialbounds,
                             subdir='PNNL2018/Hourly_WRF_1981_2015/SaukSpatialBounds',
                             nworkers=4,
                             start_date='2005-01-01',
                             end_date='2007-12-31',
                             time_resolution='H',
                             time_steps=24,
                             file_prefix='sp_',
                             rename_timelatlong_names={'south_north': 'SN',
                                                       'west_east': 'WE',
                                                       'time': 'TIME'},
                             replace_file=True):
    """
    Get hourly WRF data from a 2018 PNNL WRF run using xarray on netCDF files.
    """
    # check and generate data directory
    filedir = os.path.join(homedir, subdir)
    ogh.ensure_dir(filedir)

    # convert each day between start_date and end_date to a year-month-day string
    dates = [x.strftime('%Y%m%d')
             for x in pd.date_range(start=start_date, end=end_date, freq='D')]

    # initialize parallel workers
    da.set_options(pool=ThreadPool(nworkers))
    ProgressBar().register()

    # generate the list of files to download
    filelist = compile_x_wrfpnnl2018_raw_locations(dates)

    # download files of interest
    NetCDFs = []
    for url, date in zip(filelist, dates):
        NetCDFs.append(da.delayed(wget_x_download_spSubset_PNNL)(
            fileurl=url,
            filedate=date,
            time_resolution=time_resolution,
            time_steps=time_steps,
            spatialbounds=spatialbounds,
            file_prefix=file_prefix,
            rename_timelatlong_names=rename_timelatlong_names,
            replace_file=replace_file))

    # run operations
    outputfiles = da.compute(NetCDFs)[0]

    # reset working directory
    os.chdir(homedir)
    return outputfiles

def test_big(fit_intercept):
    import dask
    dask.set_options(get=dask.get)
    X, y = make_classification()
    lr = LogisticRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
    if fit_intercept:
        assert lr.intercept_ is not None

def _futures_to_dask_bag(futures, executor=None):
    executor = default_executor(executor)

    name = "bag-from-futures-" + tokenize(*futures)
    dsk = {(name, i): future for i, future in enumerate(futures)}

    if _globals["get"] != executor.get:
        print("Setting global dask scheduler to use distributed")
        dask.set_options(get=executor.get)

    raise gen.Return(db.Bag(dsk, name, len(futures)))

def test_maybe_compress():
    import zlib
    payload = b'123'
    with dask.set_options(compression=None):
        assert maybe_compress(payload) == (None, payload)

    with dask.set_options(compression='zlib'):
        assert maybe_compress(payload) == (None, payload)

        assert maybe_compress(b'111') == (None, b'111')

        payload = b'0' * 10000
        assert maybe_compress(payload) == ('zlib', zlib.compress(payload))

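# Both maybe_compress tests above exercise the same contract: small payloads
# pass through untouched, large payloads are compressed under the active
# `compression` option. A minimal sketch of that behavior, assuming a simple
# size threshold and a zlib-only registry (the real implementation in
# distributed also checks the achieved ratio and supports lz4/blosc):
import zlib

compression_registry = {'zlib': {'compress': zlib.compress,
                                 'decompress': zlib.decompress}}


def maybe_compress_sketch(payload, compression='zlib', min_size=1000):
    """Return (compression_name, data); skip tiny or incompressible payloads."""
    if compression is None or len(payload) < min_size:
        return None, payload
    compressed = compression_registry[compression]['compress'](payload)
    if len(compressed) >= len(payload):
        return None, payload  # compression did not help
    return compression, compressed
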
def get_x_dailymet_Livneh2013_raw(homedir,
                                  spatialbounds,
                                  subdir='livneh2013/Daily_MET_1915_2011/raw_netcdf',
                                  nworkers=4,
                                  start_date='1915-01-01',
                                  end_date='2011-12-31',
                                  rename_timelatlong_names={'lat': 'LAT',
                                                            'lon': 'LON',
                                                            'time': 'TIME'},
                                  file_prefix='sp_',
                                  replace_file=True):
    """
    Get Daily MET data from Livneh et al. (2013) using xarray on netCDF files.
    """
    # check and generate DailyMET livneh 2013 data directory
    filedir = os.path.join(homedir, subdir)
    ogh.ensure_dir(filedir)

    # modify each month between start_date and end_date to year-month
    dates = [x.strftime('%Y%m')
             for x in pd.date_range(start=start_date, end=end_date, freq='M')]

    # initialize parallel workers
    da.set_options(pool=ThreadPool(nworkers))
    ProgressBar().register()

    # generate the list of files to download
    filelist = compile_x_dailymet_Livneh2013_raw_locations(dates)

    # download files of interest
    NetCDFs = []
    for url in filelist:
        NetCDFs.append(da.delayed(ftp_x_download_spSubset)(
            fileurl=url,
            spatialbounds=spatialbounds,
            file_prefix=file_prefix,
            rename_timelatlong_names=rename_timelatlong_names,
            replace_file=replace_file))

    # run operations
    outputfiles = da.compute(NetCDFs)[0]

    # reset working directory
    os.chdir(homedir)
    return outputfiles

def getStore(cfg, mode='r', cell=-1):
    data = None
    dataPoca = None

    if cfg['data']['storeType'] == 'hdf':
        Store = HDFDataStore
    elif cfg['data']['storeType'] == 'dask':
        Store = DaskDataStore
        if mode == 'w':
            dask.set_options(get=dask.multiprocessing.get)
            dask.set_options(temporary_directory=cfg['data']['daskTmp'])
    elif cfg['data']['storeType'] == 'multihdf':
        Store = MultiHDFDataStore
    else:
        raise ValueError('Unknown store type %s' % cfg['data']['storeType'])

    if mode == 'r':
        if cfg['data']['input'] != 'none':
            data = Store(cfg['data']['store'], mode='r')
        if cfg['data']['inputPoca'] != 'none':
            dataPoca = Store(cfg['data']['storePoca'], mode='r')
    elif mode == 'w':
        allCols = False
        if 'allColumns' in cfg['data']:
            allCols = cfg['data']['allColumns']
        if cfg['data']['input'] != 'none':
            if allCols:
                data = Store(cfg['data']['store'], reader=L2SwathReaderAll,
                             mode='w', cell=cell)
            else:
                data = Store(cfg['data']['store'], reader=L2SwathReader,
                             mode='w', cell=cell)
        if cfg['data']['inputPoca'] != 'none':
            if allCols:
                dataPoca = Store(cfg['data']['storePoca'], reader=L2SwathReaderAll,
                                 mode='w', cell=cell)
            else:
                dataPoca = Store(cfg['data']['storePoca'], reader=L2SwathReader,
                                 mode='w', cell=cell)
    else:
        raise ValueError('mode must be either r or w')

    return data, dataPoca

def load_datasets(cache_dir, dbc, dsets, parse_all=False, resume=True):
    import cytoolz.dicttoolz as dz

    cache = dask.cache.Cache(8e9)
    cache.register()
    dask.set_options(get=dask.threaded.get, pool=ThreadPool())

    configs = list(map(lambda x: os.path.join('config/data', x),
                       os.listdir('config/data')))
    cmap = {k: DatasetConfig.from_yaml(k) for k in configs}
    dsids = dz.valmap(lambda ds: ds.id, cmap)
    cmap = dz.merge(cmap, {d.id: d for d in cmap.values()})

    if parse_all:
        dsets = dsids

    for d in dsets:
        process_dataset(cmap[d], dbc, cache_dir)

def test_optimize():
    x = dask.delayed(inc)(1)
    y = dask.delayed(inc)(x)
    z = x + y

    x2, y2, z2, constant = optimize(x, y, z, 1)
    assert constant == 1

    # Same graphs for each
    dsk = dict(x2.dask)
    assert dict(y2.dask) == dsk
    assert dict(z2.dask) == dsk

    # Computationally equivalent
    assert dask.compute(x2, y2, z2) == dask.compute(x, y, z)

    # Applying optimizations before compute and during compute gives
    # same results. Shows optimizations are occurring.
    sols = dask.compute(x, y, z, optimizations=[inc_to_dec])
    x3, y3, z3 = optimize(x, y, z, optimizations=[inc_to_dec])
    assert dask.compute(x3, y3, z3) == sols

    # Optimize respects global optimizations as well
    with dask.set_options(optimizations=[inc_to_dec]):
        x4, y4, z4 = optimize(x, y, z)
    for a, b in zip([x3, y3, z3], [x4, y4, z4]):
        assert dict(a.dask) == dict(b.dask)

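# The optimize test above leans on helpers defined elsewhere in dask's test
# module. A plausible sketch of them (hypothetical, mirroring the fixtures'
# apparent behavior): `inc`/`dec` are trivial task functions and `inc_to_dec`
# is a graph optimization that rewrites every inc task into a dec task.
def inc(x):
    return x + 1


def dec(x):
    return x - 1


def inc_to_dec(dsk, keys):
    dsk = dict(dsk)
    for key, task in dsk.items():
        if type(task) is tuple and task and task[0] is inc:
            dsk[key] = (dec,) + task[1:]
    return dsk
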
def test_persistent_dask(capsys):
    global IS_COMPUTED
    IS_COMPUTED = dict()
    g = setup_graph()

    # the first time the graph is created it has functions
    assert not all(g.is_computed().values())
    assert g.persistent_dask == g.dask

    # run the graph
    with dask.set_options(get=dask.async.get_sync):
        futures = g.run()

    assert IS_COMPUTED
    assert all(g.is_computed().values())
    assert g.persistent_dask != g.dask

    # then the graph is replaced by cached data
    values = g.results(futures).values()
    assert values == ['cleaned_data', 'analyzed_cleaned_data', 'data',
                      'cleaned_data', 'data', 'analyzed_cleaned_data']

    # We recreate a new graph => the cache is deleted
    g = setup_graph()
    assert all(g.is_computed().values())

    # the graph contains the load methods
    assert g.persistent_dask != g.dask
    assert all(map(lambda f: f[0].func_name == 'load',
                   g.persistent_dask.values()))

    # get multiple results
    data = g.get([('analyzed_data', 'pool1'), ('analyzed_data', 'pool2')])

def test_temporary_directory(tmpdir):
    b = db.range(10, npartitions=4)
    with dask.set_options(temporary_directory=str(tmpdir)):
        b2 = b.groupby(lambda x: x % 2)
        b2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(str(tmpdir)))

def test_hdf_globbing():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]},
                      index=[1., 2., 3., 4.])

    with tmpdir() as tdir:
        df.to_hdf(os.path.join(tdir, 'one.h5'), '/foo/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/bar/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/foo/data', format='table')

        with dask.set_options(get=dask.get):
            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2
            tm.assert_frame_equal(res.compute(), df)

            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2, start=1, stop=3)
            expected = pd.read_hdf(os.path.join(tdir, 'one.h5'), '/foo/data',
                                   start=1, stop=3)
            tm.assert_frame_equal(res.compute(), expected)

            res = dd.read_hdf(os.path.join(tdir, 'two.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/foo/data',
                              chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2 + 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))

def test_writing_parquet_with_kwargs(tmpdir, engine):
    fn = str(tmpdir)
    path1 = os.path.join(fn, 'normal')
    path2 = os.path.join(fn, 'partitioned')

    df = pd.DataFrame({'a': np.random.choice(['A', 'B', 'C'], size=100),
                       'b': np.random.random(size=100),
                       'c': np.random.randint(1, 5, size=100)})
    ddf = dd.from_pandas(df, npartitions=3)

    engine_kwargs = {
        'pyarrow': {
            'compression': 'snappy',
            'coerce_timestamps': None,
            'use_dictionary': True
        },
        'fastparquet': {
            'compression': 'snappy',
            'times': 'int64',
            'fixed_text': None
        }
    }

    ddf.to_parquet(path1, engine=engine, **engine_kwargs[engine])
    out = dd.read_parquet(path1, engine=engine)
    assert_eq(out, df, check_index=(engine != 'fastparquet'))

    # Avoid race condition in pyarrow 0.8.0 on writing partitioned datasets
    with dask.set_options(get=dask.get):
        ddf.to_parquet(path2, engine=engine, partition_on=['a'],
                       **engine_kwargs[engine])
    out = dd.read_parquet(path2, engine=engine).compute()
    for val in df.a.unique():
        assert set(df.b[df.a == val]) == set(out.b[out.a == val])

def test_groupby_on_index(get):
    pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                        'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]},
                       index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf, npartitions=3)

    ddf2 = ddf.set_index('a')
    pdf2 = pdf.set_index('a')
    assert_eq(ddf.groupby('a').b.mean(), ddf2.groupby(ddf2.index).b.mean())

    def func(df):
        return df.assign(b=df.b - df.b.mean())

    def func2(df):
        return df[['b']] - df[['b']].mean()

    with dask.set_options(get=get):
        with pytest.warns(None):
            assert_eq(ddf.groupby('a').apply(func),
                      pdf.groupby('a').apply(func))

            assert_eq(ddf.groupby('a').apply(func).set_index('a'),
                      pdf.groupby('a').apply(func).set_index('a'))

            assert_eq(pdf2.groupby(pdf2.index).apply(func2),
                      ddf2.groupby(ddf2.index).apply(func2))

def cal(x, client):
    st = time.time()

    # Distributed scheduler
    with dask.set_options(get=client.get):
        A = da.transpose(x)
        B = da.dot(x, A)
        C = da.dot(B, B)
        print(C.compute())

    # Default (threaded) scheduler
    # with dask.set_options(get=dask.threaded.get):
    #     A = da.transpose(x)
    #     B = da.dot(x, A)
    #     C = da.dot(B, B)
    #     print(C.compute())

    # Manually set a global thread pool
    # from multiprocessing.pool import ThreadPool
    # with dask.set_options(pool=ThreadPool(4)):
    #     A = da.transpose(x)
    #     B = da.dot(x, A)
    #     C = da.dot(B, B)
    #     print(C.compute(num_workers=4))

    print('time: ', time.time() - st)
    return 0

def get(self):
    import dask
    if os.environ.get('DASK_DEBUG'):
        with dask.set_options(get=dask.threaded.get):
            return self.future.result()
    return self.future.result()

def test_coerce():
    d = da.from_array(np.array([1]), chunks=(1,))
    with dask.set_options(get=dask.get):
        assert bool(d)
        assert int(d)
        assert float(d)
        assert complex(d)

def test_get_scheduler():
    assert get_scheduler() is None
    assert get_scheduler(scheduler='threads') is dask.threaded.get
    assert get_scheduler(scheduler='sync') is dask.local.get_sync
    with dask.set_options(scheduler='threads'):
        assert get_scheduler(scheduler='threads') is dask.threaded.get
    assert get_scheduler() is None

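# Note on API drift: `dask.set_options` is the pre-0.18 spelling used
# throughout these snippets. In current Dask releases the equivalent is
# `dask.config.set`; a sketch of the same scheduler override as the test above:
import dask

with dask.config.set(scheduler='threads'):
    pass  # computations in this block use the threaded scheduler
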
def test_groupby_on_index(get):
    full = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]},
                        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    d = dd.from_pandas(full, npartitions=3)

    e = d.set_index('a')
    efull = full.set_index('a')
    assert_eq(d.groupby('a').b.mean(), e.groupby(e.index).b.mean())

    def func(df):
        return df.assign(b=df.b - df.b.mean())

    with dask.set_options(get=get):
        assert_eq(d.groupby('a').apply(func),
                  full.groupby('a').apply(func))

        assert_eq(d.groupby('a').apply(func).set_index('a'),
                  full.groupby('a').apply(func).set_index('a'))

        assert_eq(efull.groupby(efull.index).apply(func),
                  e.groupby(e.index).apply(func))

def assertLazyAnd(self, expected, actual, test):
    expected_copy = expected.copy(deep=False)
    actual_copy = actual.copy(deep=False)
    with dask.set_options(get=dask.get):
        test(actual_copy, expected_copy)
    var = getattr(actual, "variable", actual)
    self.assertIsInstance(var.data, da.Array)

def test_fold():
    c = b.fold(add)
    assert c.compute() == sum(L)
    assert c.key == b.fold(add).key

    c2 = b.fold(add, initial=10)
    assert c2.key != c.key
    assert c2.compute() == sum(L) + 10 * b.npartitions
    assert c2.key == b.fold(add, initial=10).key

    c = db.from_sequence(range(5), npartitions=3)

    def binop(acc, x):
        acc = acc.copy()
        acc.add(x)
        return acc

    d = c.fold(binop, set.union, initial=set())
    assert d.compute() == set(c)
    assert d.key == c.fold(binop, set.union, initial=set()).key

    d = db.from_sequence('hello')
    assert set(d.fold(lambda a, b: ''.join([a, b]),
                      initial='').compute()) == set('hello')

    e = db.from_sequence([[1], [2], [3]], npartitions=2)
    with dask.set_options(get=get_sync):
        assert set(e.fold(add, initial=[]).compute()) == set([1, 2, 3])

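# test_fold above assumes module-level fixtures from dask's bag test suite.
# A plausible reconstruction (hypothetical values, not necessarily the exact
# originals):
from operator import add

import dask.bag as db

L = list(range(5)) * 3
b = db.from_sequence(L, npartitions=3)
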
def check_identical(data1, data2, output_filename):
    with dask.set_options(get=dask.local.get_sync):
        if not all((data1 == data2).all().values()):
            _LOG.error("Mismatch found for %s, not indexing", output_filename)
            raise ValueError("Mismatch found for %s, not indexing" % output_filename)
    return True

def test_groupby_on_index(get):
    pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                        'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]},
                       index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf, npartitions=3)

    ddf2 = ddf.set_index('a')
    pdf2 = pdf.set_index('a')
    assert_eq(ddf.groupby('a').b.mean(), ddf2.groupby(ddf2.index).b.mean())

    def func(df):
        return df.assign(b=df.b - df.b.mean())

    with dask.set_options(get=get):
        assert_eq(ddf.groupby('a').apply(func),
                  pdf.groupby('a').apply(func))

        assert_eq(ddf.groupby('a').apply(func).set_index('a'),
                  pdf.groupby('a').apply(func).set_index('a'))

        assert_eq(pdf2.groupby(pdf2.index).apply(func),
                  ddf2.groupby(ddf2.index).apply(func))

def main(kind):
    input_array = np.random.random(5000)
    getter = {'processes': dask.multiprocessing.get,
              'threads': dask.threaded.get}[kind]

    # sets the scheduler
    with dask.set_options(get=getter):
        # set ``partition_size`` to ensure each partition has enough work
        bag = db.from_sequence(input_array, partition_size=1000)

        # compute elemwise cosine on the gpu within each partition
        bag_cos = bag.map_partitions(
            lambda x: gpu_cos(np.asarray(x, dtype=np.float32)))

        # apply partial sum-reduce on each partition
        # then, finish it on the host
        got = bag_cos.reduction(sum_parts, sum).compute()

    # cross validate with numpy
    expected = np.sum(np.cos(input_array))
    print('Got: ', got)
    print('Expected:', expected)
    correct = np.allclose(got, expected)
    print('Correct: ', correct)
    sys.exit(0 if correct else 1)

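# `gpu_cos` and `sum_parts` come from elsewhere in the original script; in the
# source example `gpu_cos` is a CUDA kernel launcher. A CPU stand-in with the
# same interface, purely for illustration:
import numpy as np


def gpu_cos(arr):
    # the real version runs on the GPU; numpy.cos yields the same values
    return np.cos(arr)


def sum_parts(values):
    # partial reduction applied per partition before the final host-side sum
    return sum(values)
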
def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df['A'] = df.A // 0.1
    df['B'] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)

    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            with dask.set_options(get=e.get):
                for ind in [lambda x: 'A', lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[['A', 'B']]).apply(len)

                a = df.groupby(['A', 'B']).apply(len)
                b = ddf.groupby(['A', 'B']).apply(len)
                assert_equal(a, b.compute(get=dask.get).sort_index())

def assertLazyAnd(self, expected, actual, test):
    expected_copy = _copy_at_variable_level(expected)
    actual_copy = _copy_at_variable_level(actual)
    with dask.set_options(get=dask.get):
        test(actual_copy, expected_copy)
    var = getattr(actual, 'variable', actual)
    self.assertIsInstance(var.data, da.Array)

def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df['A'] = df.A // 0.1
    df['B'] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)

    with cluster() as (c, [a, b]):
        with Client(('127.0.0.1', c['port']), loop=loop) as c:
            with dask.set_options(get=c.get):
                for ind in [lambda x: 'A', lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[['A', 'B']]).apply(len)

                a = df.groupby(['A', 'B']).apply(len)
                b = ddf.groupby(['A', 'B']).apply(len)
                assert_equal(a, b.compute(get=dask.get).sort_index())

def test_fs_driver_backends():
    from dask.bytes.hdfs3 import HDFS3HadoopFileSystem
    from dask.bytes.pyarrow import PyArrowHadoopFileSystem

    fs1, token1 = get_fs('hdfs')
    assert isinstance(fs1, HDFS3HadoopFileSystem)

    with dask.set_options(hdfs_driver='pyarrow'):
        fs2, token2 = get_fs('hdfs')
        assert isinstance(fs2, PyArrowHadoopFileSystem)
        assert token1 != token2

    with pytest.raises(ValueError):
        with dask.set_options(hdfs_driver='not-a-valid-driver'):
            get_fs('hdfs')

def test_temporary_directory():
    b = db.range(10, npartitions=4)
    with dask.set_options(temporary_directory=os.getcwd()):
        b2 = b.groupby(lambda x: x % 2)
        b2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(os.getcwd()))

def test_groupby_multiprocessing():
    from dask.multiprocessing import get
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5],
                       'B': ['1', '1', 'a', 'a', 'a']})
    ddf = dd.from_pandas(df, npartitions=3)
    with dask.set_options(get=get):
        assert_eq(ddf.groupby('B').apply(lambda x: x),
                  df.groupby('B').apply(lambda x: x))

def test_groupby_multiprocessing():
    from dask.multiprocessing import get
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5],
                       'B': ['1', '1', 'a', 'a', 'a']})
    ddf = dd.from_pandas(df, npartitions=3)
    with dask.set_options(get=get):
        assert eq(ddf.groupby('B').apply(lambda x: x),
                  df.groupby('B').apply(lambda x: x))

def test_determinism(func, kwargs, get):
    X, y = make_intercept_data(1000, 10)

    with dask.set_options(get=get):
        a = func(X, y, **kwargs)
        b = func(X, y, **kwargs)

    assert (a == b).all()

def test_optimizations_keyword():
    x = dask.delayed(inc)(1)
    assert x.compute() == 2

    with dask.set_options(optimizations=[inc_to_dec]):
        assert x.compute() == 0

    assert x.compute() == 2

def write_data_variables(data_vars, nco):
    for name, variable in data_vars.items():
        try:
            with dask.set_options(get=dask.async.get_sync):
                da.store(variable.data, nco[name], lock=True)
        except ValueError:
            nco[name][:] = netcdf_writer.netcdfy_data(variable.values)
        nco.sync()

def test_globalmethod():
    x = Foo()

    assert x.f() == 1

    with dask.set_options(f=lambda: 2):
        assert x.f() == 2

    with dask.set_options(f=foo):
        assert x.f is foo
        assert x.f() == 'foo'

    assert x.g is foo
    assert x.g() == 'foo'

    with dask.set_options(g=False):
        assert x.g is bar
        assert x.g() == 'bar'

def _iris_dask_defaults():
    """
    Set dask defaults for Iris. The current default dask operation mode for
    Iris is running single-threaded using `dask.local.get_sync`. This default
    ensures that running Iris under "normal" conditions will not use up all
    available computational resource.

    Otherwise, by default, `dask` will use a multi-threaded scheduler that
    uses all available CPUs.

    .. note::
        We only want Iris to set dask options in the case where doing so
        will not change user-specified options that have already been set.

    """
    if ('pool' not in dask.context._globals and
            'get' not in dask.context._globals):
        dask.set_options(get=dget_sync)

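# A minimal illustration of the guard above (hypothetical usage): once the
# user has set a scheduler option themselves, the Iris default is a no-op.
from multiprocessing.pool import ThreadPool

import dask

dask.set_options(pool=ThreadPool(2))  # user choice lands in dask.context._globals
_iris_dask_defaults()                 # no-op: 'pool' is already set
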
def test_local_get_with_distributed_active(c, s, a, b):
    with dask.set_options(get=dask.get):
        x = delayed(inc)(1).persist()

    yield gen.sleep(0.01)
    assert not s.task_state  # scheduler hasn't done anything

    y = delayed(inc)(2).persist(get=dask.get)
    yield gen.sleep(0.01)
    assert not s.task_state  # scheduler hasn't done anything

def test_turn_off_fusion():
    x = da.ones(10, chunks=(5,))
    y = da.sum(x + 1 + 2 + 3)

    a = y.__dask_optimize__(y.dask, y.__dask_keys__())

    with dask.set_options(fuse_ave_width=0):
        b = y.__dask_optimize__(y.dask, y.__dask_keys__())

    assert dask.get(a, y.__dask_keys__()) == dask.get(b, y.__dask_keys__())
    assert len(a) < len(b)

def test_temporary_directory(tmpdir):
    df = pd.DataFrame({'x': np.random.random(100),
                       'y': np.random.random(100),
                       'z': np.random.random(100)})
    ddf = dd.from_pandas(df, npartitions=10, name='x', sort=False)

    with dask.set_options(temporary_directory=str(tmpdir),
                          get=dask.multiprocessing.get):
        ddf2 = ddf.set_index('x', shuffle='disk')
        ddf2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(str(tmpdir)))

def test_simultaneous_compute(self):
    ds = Dataset({'foo': ('x', range(5)), 'bar': ('x', range(5))}).reblock()

    count = np.array(0)

    def counting_get(*args, **kwargs):
        count[...] += 1
        return dask.get(*args, **kwargs)

    with dask.set_options(get=counting_get):
        ds.load_data()

    self.assertEqual(count, 1)

def test_optimize_None():
    da = pytest.importorskip('dask.array')

    x = da.ones(10, chunks=(5,))
    y = x[:9][1:8][::2] + 1  # normally these slices would be fused

    def my_get(dsk, keys):
        assert dsk == dict(y.dask)  # but they aren't
        return dask.get(dsk, keys)

    with dask.set_options(array_optimize=None, get=my_get):
        y.compute()

def test_cache_options():
    try:
        from chest import Chest
    except ImportError:
        return
    cache = Chest()

    def inc2(x):
        assert 'y' in cache
        return x + 1

    with dask.set_options(cache=cache):
        get_sync({'x': (inc2, 'y'), 'y': 1}, 'x')

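# The test above suggests the `cache` option accepts any MutableMapping, with
# the local scheduler recording intermediate results into it as tasks finish.
# A sketch under that assumption, with a plain dict standing in for Chest:
import dask
from dask.local import get_sync

seen = {}
with dask.set_options(cache=seen):
    get_sync({'y': 1, 'x': (lambda y: y + 1, 'y')}, 'x')
# `seen` now holds intermediates computed while the graph ran
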
def test_simultaneous_compute(self):
    ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk()

    count = [0]

    def counting_get(*args, **kwargs):
        count[0] += 1
        return dask.get(*args, **kwargs)

    with dask.set_options(get=counting_get):
        ds.load()

    self.assertEqual(count[0], 1)

def test_grid_search():
    pipeline = dl.Pipeline([("pca", PCA()),
                            ("select_k", SelectKBest()),
                            ("svm", LinearSVC())])
    param_grid = {'select_k__k': [1, 2, 3, 4],
                  'svm__C': np.logspace(-3, 2, 3)}
    grid = dl.GridSearchCV(pipeline, param_grid)
    with dask.set_options(get=dask.get):
        result = grid.fit(X_train, y_train).score(X_test, y_test)
    assert isinstance(result, float)

def test_categorical_set_index():
    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': ['a', 'b', 'b', 'c']})
    df['y'] = df.y.astype('category')
    a = dd.from_pandas(df, npartitions=2)

    with dask.set_options(get=get_sync):
        b = a.set_index('y')
        df2 = df.set_index('y')
        assert list(b.index.compute()) == list(df2.index)

        b = a.set_index(a.y)
        df2 = df.set_index(df.y)
        assert list(b.index.compute()) == list(df2.index)