Example #1
def test_set_index_divisions_sorted():
    p1 = pd.DataFrame({'x': [10, 11, 12], 'y': ['a', 'a', 'a']})
    p2 = pd.DataFrame({'x': [13, 14, 15], 'y': ['b', 'b', 'c']})
    p3 = pd.DataFrame({'x': [16, 17, 18], 'y': ['d', 'e', 'e']})

    ddf = dd.DataFrame({
        ('x', 0): p1,
        ('x', 1): p2,
        ('x', 2): p3
    }, 'x', p1, [None, None, None, None])
    df = ddf.compute()

    def throw(*args, **kwargs):
        raise Exception("Shouldn't have computed")

    with dask.set_options(get=throw):
        res = ddf.set_index('x', divisions=[10, 13, 16, 18], sorted=True)
    assert_eq(res, df.set_index('x'))

    with dask.set_options(get=throw):
        res = ddf.set_index('y', divisions=['a', 'b', 'd', 'e'], sorted=True)
    assert_eq(res, df.set_index('y'))

    # with sorted=True, divisions must be same length as df.divisions
    with pytest.raises(ValueError):
        ddf.set_index('y', divisions=['a', 'b', 'c', 'd', 'e'], sorted=True)

    # Divisions must be sorted
    with pytest.raises(ValueError):
        ddf.set_index('y', divisions=['a', 'b', 'd', 'c'], sorted=True)
Example #2
def test_set_index_divisions_sorted():
    p1 = pd.DataFrame({'x': [10, 11, 12], 'y': ['a', 'a', 'a']})
    p2 = pd.DataFrame({'x': [13, 14, 15], 'y': ['b', 'b', 'c']})
    p3 = pd.DataFrame({'x': [16, 17, 18], 'y': ['d', 'e', 'e']})

    ddf = dd.DataFrame({('x', 0): p1, ('x', 1): p2, ('x', 2): p3},
                       'x', p1, [None, None, None, None])
    df = ddf.compute()

    def throw(*args, **kwargs):
        raise Exception("Shouldn't have computed")

    with dask.set_options(get=throw):
        res = ddf.set_index('x', divisions=[10, 13, 16, 18], sorted=True)
    assert_eq(res, df.set_index('x'))

    with dask.set_options(get=throw):
        res = ddf.set_index('y', divisions=['a', 'b', 'd', 'e'], sorted=True)
    assert_eq(res, df.set_index('y'))

    # with sorted=True, divisions must be same length as df.divisions
    with pytest.raises(ValueError):
        ddf.set_index('y', divisions=['a', 'b', 'c', 'd', 'e'], sorted=True)

    # Divisions must be sorted
    with pytest.raises(ValueError):
        ddf.set_index('y', divisions=['a', 'b', 'd', 'c'], sorted=True)
Example #3
def test_maybe_compress():

    try_converters = [bytes, memoryview]
    try_compressions = ['zlib', 'lz4']

    payload = b'123'

    with dask.set_options(compression=None):
        for f in try_converters:
            assert maybe_compress(f(payload)) == (None, payload)

    for compression in try_compressions:
        try:
            __import__(compression)
        except ImportError:
            continue

        with dask.set_options(compression=compression):
            for f in try_converters:
                payload = b'123'
                assert maybe_compress(f(payload)) == (None, payload)

                payload = b'0' * 10000
                rc, rd = maybe_compress(f(payload))
                # For some reason compressing memoryviews can force blosc...
                assert rc in (compression, 'blosc')
                assert compressions[rc]['decompress'](rd) == payload
Example #4
def _futures_to_dask_array(futures, executor=None):
    executor = default_executor(executor)
    futures = np.array(futures, dtype=object)

    slices = [
        ((0, ) * i + (slice(None, None), ) + (0, ) * (futures.ndim - i - 1))
        for i in range(futures.ndim)
    ]
    chunks = [[executor.submit(get_dim, x, i) for x in futures[slc]]
              for i, slc in enumerate(slices)]
    dtype = executor.submit(get_dtype, futures.flat[0])

    chunks, dtype = yield executor._gather([chunks, dtype])
    chunks = tuple(map(tuple, chunks))

    name = 'array-from-futures-' + tokenize(*futures.flat)
    keys = list(product([name], *map(range, futures.shape)))
    values = list(futures.flat)
    dsk = dict(zip(keys, values))

    if _globals['get'] != executor.get:
        print("Setting global dask scheduler to use distributed")
        dask.set_options(get=executor.get)

    raise gen.Return(da.Array(dsk, name, chunks, dtype))
Example #5
def test_read_text(hdfs):
    db = pytest.importorskip('dask.bag')
    import multiprocessing as mp
    pool = mp.get_context('spawn').Pool(2)

    with hdfs.open('%s/text.1.txt' % basedir, 'wb') as f:
        f.write('Alice 100\nBob 200\nCharlie 300'.encode())

    with hdfs.open('%s/text.2.txt' % basedir, 'wb') as f:
        f.write('Dan 400\nEdith 500\nFrank 600'.encode())

    with hdfs.open('%s/other.txt' % basedir, 'wb') as f:
        f.write('a b\nc d'.encode())

    b = db.read_text('hdfs://%s/text.*.txt' % basedir)
    with dask.set_options(pool=pool):
        result = b.str.strip().str.split().map(len).compute()

    assert result == [2, 2, 2, 2, 2, 2]

    b = db.read_text('hdfs://%s/other.txt' % basedir)
    with dask.set_options(pool=pool):
        result = b.str.split().flatten().compute()

    assert result == ['a', 'b', 'c', 'd']
Example #6
def get_x_hourlywrf_PNNL2018(
        homedir,
        spatialbounds,
        subdir='PNNL2018/Hourly_WRF_1981_2015/SaukSpatialBounds',
        nworkers=4,
        start_date='2005-01-01',
        end_date='2007-12-31',
        time_resolution='H',
        time_steps=24,
        file_prefix='sp_',
        rename_timelatlong_names={
            'south_north': 'SN',
            'west_east': 'WE',
            'time': 'TIME'
        },
        replace_file=True):
    """
    get hourly WRF data from a 2018 PNNL WRF run using xarray on netcdf files
    """
    # check and generate data directory
    filedir = os.path.join(homedir, subdir)
    ogh.ensure_dir(filedir)

    # modify each month between start_date and end_date to year-month
    dates = [
        x.strftime('%Y%m%d')
        for x in pd.date_range(start=start_date, end=end_date, freq='D')
    ]

    # initialize parallel workers
    da.set_options(pool=ThreadPool(nworkers))
    ProgressBar().register()

    # generate the list of files to download
    filelist = compile_x_wrfpnnl2018_raw_locations(dates)

    # download files of interest
    NetCDFs = []
    for url, date in zip(filelist, dates):
        NetCDFs.append(
            da.delayed(wget_x_download_spSubset_PNNL)(
                fileurl=url,
                filedate=date,
                time_resolution=time_resolution,
                time_steps=time_steps,
                spatialbounds=spatialbounds,
                file_prefix=file_prefix,
                rename_timelatlong_names=rename_timelatlong_names,
                replace_file=replace_file))

    # run operations
    outputfiles = da.compute(NetCDFs)[0]

    # reset working directory
    os.chdir(homedir)
    return (outputfiles)
Example #7
def test_big(fit_intercept):
    import dask
    dask.set_options(get=dask.get)
    X, y = make_classification()
    lr = LogisticRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
    if fit_intercept:
        assert lr.intercept_ is not None
Example #8
def _futures_to_dask_bag(futures, executor=None):
    executor = default_executor(executor)

    name = "bag-from-futures-" + tokenize(*futures)
    dsk = {(name, i): future for i, future in enumerate(futures)}

    if _globals["get"] != executor.get:
        print("Setting global dask scheduler to use distributed")
        dask.set_options(get=executor.get)

    raise gen.Return(db.Bag(dsk, name, len(futures)))
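Note: the explicit dask.set_options(get=executor.get) call above reflects early dask/distributed releases. On current dask.distributed, creating a Client registers it as the default scheduler, so no global option needs to be set by hand. A minimal sketch, assuming dask.distributed is installed and a local cluster is acceptable:

from dask.distributed import Client
import dask.bag as db

client = Client()  # starts a local cluster and becomes the default scheduler
result = db.from_sequence(range(4)).sum().compute()  # runs on that client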
Example #9
def test_maybe_compress():
    import zlib
    payload = b'123'
    with dask.set_options(compression=None):
        assert maybe_compress(payload) == (None, payload)

    with dask.set_options(compression='zlib'):
        assert maybe_compress(payload) == (None, payload)
        assert maybe_compress(b'111') == (None, b'111')

        payload = b'0' * 10000
        assert maybe_compress(payload) == ('zlib', zlib.compress(payload))
Example #10
def get_x_dailymet_Livneh2013_raw(
        homedir,
        spatialbounds,
        subdir='livneh2013/Daily_MET_1915_2011/raw_netcdf',
        nworkers=4,
        start_date='1915-01-01',
        end_date='2011-12-31',
        rename_timelatlong_names={
            'lat': 'LAT',
            'lon': 'LON',
            'time': 'TIME'
        },
        file_prefix='sp_',
        replace_file=True):
    """
    get Daily MET data from Livneh et al. (2013) using xarray on netcdf files
    """
    # check and generate DailyMET livneh 2013 data directory
    filedir = os.path.join(homedir, subdir)
    ogh.ensure_dir(filedir)

    # modify each month between start_date and end_date to year-month
    dates = [
        x.strftime('%Y%m')
        for x in pd.date_range(start=start_date, end=end_date, freq='M')
    ]

    # initialize parallel workers
    da.set_options(pool=ThreadPool(nworkers))
    ProgressBar().register()

    # generate the list of files to download
    filelist = compile_x_dailymet_Livneh2013_raw_locations(dates)

    # download files of interest
    NetCDFs = []
    for url in filelist:
        NetCDFs.append(
            da.delayed(ftp_x_download_spSubset)(
                fileurl=url,
                spatialbounds=spatialbounds,
                file_prefix=file_prefix,
                rename_timelatlong_names=rename_timelatlong_names,
                replace_file=replace_file))

    # run operations
    outputfiles = da.compute(NetCDFs)[0]

    # reset working directory
    os.chdir(homedir)
    return (outputfiles)
Example #11
def getStore(cfg, mode='r', cell=-1):
    data = None
    dataPoca = None

    if cfg['data']['storeType'] == 'hdf':
        Store = HDFDataStore
    elif cfg['data']['storeType'] == 'dask':
        Store = DaskDataStore
        if mode == 'w':
            dask.set_options(get=dask.multiprocessing.get)
        dask.set_options(temporary_directory=cfg['data']['daskTmp'])
    elif cfg['data']['storeType'] == 'multihdf':
        Store = MultiHDFDataStore
    else:
        raise ValueError('Unknown store type %s' % cfg['data']['storeType'])

    if mode == 'r':
        if cfg['data']['input'] != 'none':
            data = Store(cfg['data']['store'], mode='r')
        if cfg['data']['inputPoca'] != 'none':
            dataPoca = Store(cfg['data']['storePoca'], mode='r')
    elif mode == 'w':
        allCols = False
        if 'allColumns' in cfg['data']:
            allCols = cfg['data']['allColumns']
        if cfg['data']['input'] != 'none':
            if allCols:
                data = Store(cfg['data']['store'],
                             reader=L2SwathReaderAll,
                             mode='w',
                             cell=cell)
            else:
                data = Store(cfg['data']['store'],
                             reader=L2SwathReader,
                             mode='w',
                             cell=cell)
        if cfg['data']['inputPoca'] != 'none':
            if allCols:
                dataPoca = Store(cfg['data']['storePoca'],
                                 reader=L2SwathReaderAll,
                                 mode='w',
                                 cell=cell)
            else:
                dataPoca = Store(cfg['data']['storePoca'],
                                 reader=L2SwathReader,
                                 mode='w',
                                 cell=cell)
    else:
        raise ValueError('mode must be either r or w')

    return data, dataPoca
Example #12
def load_datasets(cache_dir, dbc, dsets, parse_all=False, resume=True):
    import cytoolz.dicttoolz as dz
    cache = dask.cache.Cache(8e9)
    cache.register()
    dask.set_options(get=dask.threaded.get, pool=ThreadPool())
    configs = list(
        map(lambda x: os.path.join('config/data', x),
            os.listdir('config/data')))
    cmap = {k: DatasetConfig.from_yaml(k) for k in configs}
    dsids = dz.valmap(lambda ds: ds.id, cmap)
    cmap = dz.merge(cmap, {d.id: d for d in cmap.values()})
    if parse_all:
        dsets = dsids
    for d in dsets:
        process_dataset(cmap[d], dbc, cache_dir)
Example #13
def test_optimize():
    x = dask.delayed(inc)(1)
    y = dask.delayed(inc)(x)
    z = x + y

    x2, y2, z2, constant = optimize(x, y, z, 1)
    assert constant == 1

    # Same graphs for each
    dsk = dict(x2.dask)
    assert dict(y2.dask) == dsk
    assert dict(z2.dask) == dsk

    # Computationally equivalent
    assert dask.compute(x2, y2, z2) == dask.compute(x, y, z)

    # Applying optimizations before compute and during compute gives
    # same results. Shows optimizations are occurring.
    sols = dask.compute(x, y, z, optimizations=[inc_to_dec])
    x3, y3, z3 = optimize(x, y, z, optimizations=[inc_to_dec])
    assert dask.compute(x3, y3, z3) == sols

    # Optimize respects global optimizations as well
    with dask.set_options(optimizations=[inc_to_dec]):
        x4, y4, z4 = optimize(x, y, z)
    for a, b in zip([x3, y3, z3], [x4, y4, z4]):
        assert dict(a.dask) == dict(b.dask)
Example #14
def test_persistent_dask(capsys):
    global IS_COMPUTED
    IS_COMPUTED = dict()
    g = setup_graph()
    # the first time the graph is created it has functions
    assert not all(g.is_computed().values())
    assert g.persistent_dask == g.dask

    # run the graph
    with dask.set_options(get=dask.async.get_sync):
        futures = g.run()
        assert IS_COMPUTED
        assert all(g.is_computed().values())
        assert g.persistent_dask != g.dask
        # then the graph is replaced by cached data
        values = g.results(futures).values()

        assert values == [
            'cleaned_data', 'analyzed_cleaned_data', 'data', 'cleaned_data',
            'data', 'analyzed_cleaned_data'
        ]

        # We recreate a new graph => the cache is deleted
        g = setup_graph()
        assert all(g.is_computed().values())
        # the graph contains the load methods
        assert g.persistent_dask != g.dask
        assert all(
            map(lambda f: f[0].func_name == 'load',
                g.persistent_dask.values()))
        # get multiple results
        data = g.get([('analyzed_data', 'pool1'), ('analyzed_data', 'pool2')])
Example #15
def test_temporary_directory(tmpdir):
    b = db.range(10, npartitions=4)

    with dask.set_options(temporary_directory=str(tmpdir)):
        b2 = b.groupby(lambda x: x % 2)
        b2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(str(tmpdir)))
Example #16
def test_hdf_globbing():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    with tmpdir() as tdir:
        df.to_hdf(os.path.join(tdir, 'one.h5'), '/foo/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/bar/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/foo/data', format='table')

        with dask.set_options(get=dask.get):
            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2
            tm.assert_frame_equal(res.compute(), df)

            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2, start=1, stop=3)
            expected = pd.read_hdf(os.path.join(tdir, 'one.h5'), '/foo/data',
                                   start=1, stop=3)
            tm.assert_frame_equal(res.compute(), expected)

            res = dd.read_hdf(os.path.join(tdir, 'two.h5'), '/*/data', chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/foo/data', chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/*/data', chunksize=2)
            assert res.npartitions == 2 + 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))
Example #17
def test_writing_parquet_with_kwargs(tmpdir, engine):
    fn = str(tmpdir)
    path1 = os.path.join(fn, 'normal')
    path2 = os.path.join(fn, 'partitioned')

    df = pd.DataFrame({'a': np.random.choice(['A', 'B', 'C'], size=100),
                       'b': np.random.random(size=100),
                       'c': np.random.randint(1, 5, size=100)})
    ddf = dd.from_pandas(df, npartitions=3)

    engine_kwargs = {
        'pyarrow': {
            'compression': 'snappy',
            'coerce_timestamps': None,
            'use_dictionary': True
        },
        'fastparquet': {
            'compression': 'snappy',
            'times': 'int64',
            'fixed_text': None
        }
    }

    ddf.to_parquet(path1,  engine=engine, **engine_kwargs[engine])
    out = dd.read_parquet(path1, engine=engine)
    assert_eq(out, df, check_index=(engine != 'fastparquet'))

    # Avoid race condition in pyarrow 0.8.0 on writing partitioned datasets
    with dask.set_options(get=dask.get):
        ddf.to_parquet(path2, engine=engine, partition_on=['a'],
                       **engine_kwargs[engine])
    out = dd.read_parquet(path2, engine=engine).compute()
    for val in df.a.unique():
        assert set(df.b[df.a == val]) == set(out.b[out.a == val])
Example #18
def test_groupby_on_index(get):
    pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                        'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]},
                       index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf, npartitions=3)

    ddf2 = ddf.set_index('a')
    pdf2 = pdf.set_index('a')
    assert_eq(ddf.groupby('a').b.mean(), ddf2.groupby(ddf2.index).b.mean())

    def func(df):
        return df.assign(b=df.b - df.b.mean())

    def func2(df):
        return df[['b']] - df[['b']].mean()

    with dask.set_options(get=get):
        with pytest.warns(None):
            assert_eq(ddf.groupby('a').apply(func),
                      pdf.groupby('a').apply(func))

            assert_eq(ddf.groupby('a').apply(func).set_index('a'),
                      pdf.groupby('a').apply(func).set_index('a'))

            assert_eq(pdf2.groupby(pdf2.index).apply(func2),
                      ddf2.groupby(ddf2.index).apply(func2))
Example #19
def cal(x, client):
    st = time.time()

    # Distributed scheduler
    # with dask.set_options(get=dask.threaded.get):
    with dask.set_options(get=client.get):
        A = da.transpose(x)
        B = da.dot(x, A)
        C = da.dot(B, B)

        print(C.compute())

    # Default (threaded) scheduler
    # with dask.set_options(get=dask.threaded.get):
    #     A = da.transpose(x)
    #     B = da.dot(x, A)
    #     C = da.dot(B, B)
    #
    #     print(C.compute())

    # Manually set a global thread pool.
    # from multiprocessing.pool import ThreadPool
    # with dask.set_options(pool=ThreadPool(4)):
    #     A = da.transpose(x)
    #     B = da.dot(x, A)
    #     C = da.dot(B, B)
    #
    #     print(C.compute(num_workers=4))

    print('time:', time.time() - st)
    return 0
Example #20
    def get(self):
        import dask

        if os.environ.get('DASK_DEBUG'):
            with dask.set_options(get=dask.threaded.get):
                return self.future.result()
        return self.future.result()
Example #21
def test_coerce():
    d = da.from_array(np.array([1]), chunks=(1,))
    with dask.set_options(get=dask.get):
        assert bool(d)
        assert int(d)
        assert float(d)
        assert complex(d)
Example #22
File: test_base.py Project: semyont/dask
def test_get_scheduler():
    assert get_scheduler() is None
    assert get_scheduler(scheduler='threads') is dask.threaded.get
    assert get_scheduler(scheduler='sync') is dask.local.get_sync
    with dask.set_options(scheduler='threads'):
        assert get_scheduler(scheduler='threads') is dask.threaded.get
    assert get_scheduler() is None
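This test exercises the scheduler= naming that eventually replaced the get= keyword. On dask 0.18 and later, dask.set_options itself was renamed to dask.config.set; a minimal sketch of the equivalent configuration on a newer release:

import dask
import dask.bag as db

b = db.from_sequence([1, 2, 3])

# roughly equivalent to "with dask.set_options(get=dask.get):" on older dask
with dask.config.set(scheduler='synchronous'):
    assert b.sum().compute() == 6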
Example #23
def test_groupby_on_index(get):
    full = pd.DataFrame(
        {
            'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
            'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]
        },
        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    d = dd.from_pandas(full, npartitions=3)

    e = d.set_index('a')
    efull = full.set_index('a')
    assert_eq(d.groupby('a').b.mean(), e.groupby(e.index).b.mean())

    def func(df):
        return df.assign(b=df.b - df.b.mean())

    with dask.set_options(get=get):
        assert_eq(d.groupby('a').apply(func), full.groupby('a').apply(func))

        assert_eq(
            d.groupby('a').apply(func).set_index('a'),
            full.groupby('a').apply(func).set_index('a'))

        assert_eq(
            efull.groupby(efull.index).apply(func),
            e.groupby(e.index).apply(func))
Example #24
    def assertLazyAnd(self, expected, actual, test):
        expected_copy = expected.copy(deep=False)
        actual_copy = actual.copy(deep=False)
        with dask.set_options(get=dask.get):
            test(actual_copy, expected_copy)
        var = getattr(actual, "variable", actual)
        self.assertIsInstance(var.data, da.Array)
Example #25
File: test_bag.py Project: serazing/dask
def test_fold():
    c = b.fold(add)
    assert c.compute() == sum(L)
    assert c.key == b.fold(add).key

    c2 = b.fold(add, initial=10)
    assert c2.key != c.key
    assert c2.compute() == sum(L) + 10 * b.npartitions
    assert c2.key == b.fold(add, initial=10).key

    c = db.from_sequence(range(5), npartitions=3)

    def binop(acc, x):
        acc = acc.copy()
        acc.add(x)
        return acc

    d = c.fold(binop, set.union, initial=set())
    assert d.compute() == set(c)
    assert d.key == c.fold(binop, set.union, initial=set()).key

    d = db.from_sequence('hello')
    assert set(d.fold(lambda a, b: ''.join([a, b]),
                      initial='').compute()) == set('hello')

    e = db.from_sequence([[1], [2], [3]], npartitions=2)
    with dask.set_options(get=get_sync):
        assert set(e.fold(add, initial=[]).compute()) == set([1, 2, 3])
Example #26
def check_identical(data1, data2, output_filename):
    with dask.set_options(get=dask.local.get_sync):
        if not all((data1 == data2).all().values()):
            _LOG.error("Mismatch found for %s, not indexing", output_filename)
            raise ValueError("Mismatch found for %s, not indexing" %
                             output_filename)
    return True
Example #27
def test_coerce():
    d = da.from_array(np.array([1]), chunks=(1, ))
    with dask.set_options(get=dask.get):
        assert bool(d)
        assert int(d)
        assert float(d)
        assert complex(d)
Example #28
def test_groupby_on_index(get):
    pdf = pd.DataFrame(
        {
            'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
            'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]
        },
        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(pdf, npartitions=3)

    ddf2 = ddf.set_index('a')
    pdf2 = pdf.set_index('a')
    assert_eq(ddf.groupby('a').b.mean(), ddf2.groupby(ddf2.index).b.mean())

    def func(df):
        return df.assign(b=df.b - df.b.mean())

    with dask.set_options(get=get):
        assert_eq(ddf.groupby('a').apply(func), pdf.groupby('a').apply(func))

        assert_eq(
            ddf.groupby('a').apply(func).set_index('a'),
            pdf.groupby('a').apply(func).set_index('a'))

        assert_eq(
            pdf2.groupby(pdf2.index).apply(func),
            ddf2.groupby(ddf2.index).apply(func))
Example #29
def main(kind):
    input_array = np.random.random(5000)

    getter = {'processes': dask.multiprocessing.get,
              'threads': dask.threaded.get}[kind]

    # sets the scheduler
    with dask.set_options(get=getter):

        # set ``partition_size`` to ensure each partition has enough work
        bag = db.from_sequence(input_array, partition_size=1000)

        # compute elemwise cosine on the gpu within each partition
        bag_cos = bag.map_partitions(
            lambda x: gpu_cos(np.asarray(x, dtype=np.float32)))

        # apply partial sum-reduce on each partition
        # then, finish it on the host
        got = bag_cos.reduction(sum_parts, sum).compute()

        # cross validate with numpy
        expected = np.sum(np.cos(input_array))

        print('Got:     ', got)
        print('Expected:', expected)
        correct = np.allclose(got, expected)
        print('Correct: ', correct)
        sys.exit(0 if correct else 1)
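The getter dictionary above maps names to scheduler get functions. On newer dask releases (0.18+) the same choice is normally expressed by passing a scheduler name to compute; a minimal sketch:

import dask.bag as db

def double(x):
    return x * 2

bag = db.from_sequence(range(10), partition_size=5)
# 'threads' and 'processes' select the threaded and multiprocessing schedulers
assert bag.map(double).sum().compute(scheduler='threads') == 90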
Example #30
def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df['A'] = df.A // 0.1
    df['B'] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            with dask.set_options(get=e.get):
                for ind in [lambda x: 'A', lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[['A', 'B']]).apply(len)

                a = df.groupby(['A', 'B']).apply(len)
                b = ddf.groupby(['A', 'B']).apply(len)

                assert_equal(a, b.compute(get=dask.get).sort_index())
Example #31
    def assertLazyAnd(self, expected, actual, test):
        expected_copy = expected.copy(deep=False)
        actual_copy = actual.copy(deep=False)
        with dask.set_options(get=dask.get):
            test(actual_copy, expected_copy)
        var = getattr(actual, 'variable', actual)
        self.assertIsInstance(var.data, da.Array)
Example #33
    def assertLazyAnd(self, expected, actual, test):
        expected_copy = _copy_at_variable_level(expected)
        actual_copy = _copy_at_variable_level(actual)
        with dask.set_options(get=dask.get):
            test(actual_copy, expected_copy)
        var = getattr(actual, 'variable', actual)
        self.assertIsInstance(var.data, da.Array)
Example #34
def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df['A'] = df.A // 0.1
    df['B'] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)
    with cluster() as (c, [a, b]):
        with Client(('127.0.0.1', c['port']), loop=loop) as c:
            with dask.set_options(get=c.get):
                for ind in [lambda x: 'A', lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[['A', 'B']]).apply(len)

                a = df.groupby(['A', 'B']).apply(len)
                b = ddf.groupby(['A', 'B']).apply(len)

                assert_equal(a, b.compute(get=dask.get).sort_index())
Example #35
def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df["A"] = df.A // 0.1
    df["B"] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)
    with cluster() as (c, [a, b]):
        with Client(("127.0.0.1", c["port"]), loop=loop) as c:
            with dask.set_options(get=c.get):
                for ind in [lambda x: "A", lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any("partd" in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any("partd" in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[["A", "B"]]).apply(len)

                a = df.groupby(["A", "B"]).apply(len)
                b = ddf.groupby(["A", "B"]).apply(len)

                assert_equal(a, b.compute(get=dask.get).sort_index())
Example #36
File: test_io.py Project: dukebody/dask
def test_hdf_globbing():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    with tmpdir() as tdir:
        df.to_hdf(os.path.join(tdir, 'one.h5'), '/foo/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/bar/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/foo/data', format='table')

        with dask.set_options(get=dask.get):
            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2
            tm.assert_frame_equal(res.compute(), df)

            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2, start=1, stop=3)
            expected = pd.read_hdf(os.path.join(tdir, 'one.h5'), '/foo/data',
                                   start=1, stop=3)
            tm.assert_frame_equal(res.compute(), expected)

            res = dd.read_hdf(os.path.join(tdir, 'two.h5'), '/*/data', chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/foo/data', chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/*/data', chunksize=2)
            assert res.npartitions == 2 + 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))
Example #38
File: test_bag.py Project: rla3rd/dask
def test_fold():
    c = b.fold(add)
    assert c.compute() == sum(L)
    assert c.key == b.fold(add).key

    c2 = b.fold(add, initial=10)
    assert c2.key != c.key
    assert c2.compute() == sum(L) + 10 * b.npartitions
    assert c2.key == b.fold(add, initial=10).key

    c = db.from_sequence(range(5), npartitions=3)
    def binop(acc, x):
        acc = acc.copy()
        acc.add(x)
        return acc

    d = c.fold(binop, set.union, initial=set())
    assert d.compute() == set(c)
    assert d.key == c.fold(binop, set.union, initial=set()).key

    d = db.from_sequence('hello')
    assert set(d.fold(lambda a, b: ''.join([a, b]), initial='').compute()) == set('hello')

    e = db.from_sequence([[1], [2], [3]], npartitions=2)
    with dask.set_options(get=get_sync):
        assert set(e.fold(add, initial=[]).compute()) == set([1, 2, 3])
Example #39
def test_fs_driver_backends():
    from dask.bytes.hdfs3 import HDFS3HadoopFileSystem
    from dask.bytes.pyarrow import PyArrowHadoopFileSystem

    fs1, token1 = get_fs('hdfs')
    assert isinstance(fs1, HDFS3HadoopFileSystem)

    with dask.set_options(hdfs_driver='pyarrow'):
        fs2, token2 = get_fs('hdfs')
    assert isinstance(fs2, PyArrowHadoopFileSystem)

    assert token1 != token2

    with pytest.raises(ValueError):
        with dask.set_options(hdfs_driver='not-a-valid-driver'):
            get_fs('hdfs')
Example #40
    def assertLazyAnd(self, expected, actual, test):
        expected_copy = _copy_at_variable_level(expected)
        actual_copy = _copy_at_variable_level(actual)
        with dask.set_options(get=dask.get):
            test(actual_copy, expected_copy)
        var = getattr(actual, "variable", actual)
        self.assertIsInstance(var.data, da.Array)
Example #41
def test_temporary_directory():
    b = db.range(10, npartitions=4)

    with dask.set_options(temporary_directory=os.getcwd()):
        b2 = b.groupby(lambda x: x % 2)
        b2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(os.getcwd()))
Example #42
def main(kind):
    input_array = np.random.random(5000)

    getter = {
        'processes': dask.multiprocessing.get,
        'threads': dask.threaded.get
    }[kind]

    # sets the scheduler
    with dask.set_options(get=getter):

        # set ``partition_size`` to ensure each partition has enough work
        bag = db.from_sequence(input_array, partition_size=1000)

        # compute elemwise cosine on the gpu within each partition
        bag_cos = bag.map_partitions(
            lambda x: gpu_cos(np.asarray(x, dtype=np.float32)))

        # apply partial sum-reduce on each partition
        # then, finish it on the host
        got = bag_cos.reduction(sum_parts, sum).compute()

        # cross validate with numpy
        expected = np.sum(np.cos(input_array))

        print('Got:     ', got)
        print('Expected:', expected)
        correct = np.allclose(got, expected)
        print('Correct: ', correct)
        sys.exit(0 if correct else 1)
Example #43
def test_groupby_multiprocessing():
    from dask.multiprocessing import get
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5],
                       'B': ['1','1','a','a','a']})
    ddf = dd.from_pandas(df, npartitions=3)
    with dask.set_options(get=get):
        assert_eq(ddf.groupby('B').apply(lambda x: x),
                  df.groupby('B').apply(lambda x: x))
Example #44
def test_groupby_multiprocessing():
    from dask.multiprocessing import get
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5], 'B': ['1', '1', 'a', 'a', 'a']})
    ddf = dd.from_pandas(df, npartitions=3)
    with dask.set_options(get=get):
        assert eq(
            ddf.groupby('B').apply(lambda x: x),
            df.groupby('B').apply(lambda x: x))
Example #45
def test_determinism(func, kwargs, get):
    X, y = make_intercept_data(1000, 10)

    with dask.set_options(get=get):
        a = func(X, y, **kwargs)
        b = func(X, y, **kwargs)

    assert (a == b).all()
Example #46
def test_optimizations_keyword():
    x = dask.delayed(inc)(1)
    assert x.compute() == 2

    with dask.set_options(optimizations=[inc_to_dec]):
        assert x.compute() == 0

    assert x.compute() == 2
Example #47
def write_data_variables(data_vars, nco):
    for name, variable in data_vars.items():
        try:
            with dask.set_options(get=dask.async.get_sync):
                da.store(variable.data, nco[name], lock=True)
        except ValueError:
            nco[name][:] = netcdf_writer.netcdfy_data(variable.values)
        nco.sync()
Example #49
def test_globalmethod():
    x = Foo()

    assert x.f() == 1

    with dask.set_options(f=lambda: 2):
        assert x.f() == 2

    with dask.set_options(f=foo):
        assert x.f is foo
        assert x.f() == 'foo'

    assert x.g is foo
    assert x.g() == 'foo'

    with dask.set_options(g=False):
        assert x.g is bar
        assert x.g() == 'bar'
Example #50
def _iris_dask_defaults():
    """
    Set dask defaults for Iris. The current default dask operation mode for
    Iris is running single-threaded using `dask.local.get_sync`. This default
    ensures that running Iris under "normal" conditions will not use up all
    available computational resource.

    Otherwise, by default, `dask` will use a multi-threaded scheduler that uses
    all available CPUs.

    .. note::
        We only want Iris to set dask options in the case where doing so will
        not change user-specified options that have already been set.

    """
    if 'pool' not in dask.context._globals and \
            'get' not in dask.context._globals:
        dask.set_options(get=dget_sync)
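The guard above inspects dask.context._globals, which later dask releases replaced with the dask.config system. A hypothetical port of the same check to dask.config (the function name is illustrative, not part of Iris):

import dask

def _iris_dask_defaults_config():
    # only install a default when the user has not already configured one
    if dask.config.get('pool', None) is None and \
            dask.config.get('scheduler', None) is None:
        dask.config.set(scheduler='synchronous')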
Example #51
def test_local_get_with_distributed_active(c, s, a, b):
    with dask.set_options(get=dask.get):
        x = delayed(inc)(1).persist()
    yield gen.sleep(0.01)
    assert not s.task_state # scheduler hasn't done anything

    y = delayed(inc)(2).persist(get=dask.get)
    yield gen.sleep(0.01)
    assert not s.task_state # scheduler hasn't done anything
Example #52
def test_turn_off_fusion():
    x = da.ones(10, chunks=(5,))
    y = da.sum(x + 1 + 2 + 3)

    a = y.__dask_optimize__(y.dask, y.__dask_keys__())

    with dask.set_options(fuse_ave_width=0):
        b = y.__dask_optimize__(y.dask, y.__dask_keys__())

    assert dask.get(a, y.__dask_keys__()) == dask.get(b, y.__dask_keys__())
    assert len(a) < len(b)
Example #53
def test_temporary_directory(tmpdir):
    df = pd.DataFrame({'x': np.random.random(100),
                       'y': np.random.random(100),
                       'z': np.random.random(100)})
    ddf = dd.from_pandas(df, npartitions=10, name='x', sort=False)

    with dask.set_options(temporary_directory=str(tmpdir),
                          get=dask.multiprocessing.get):
        ddf2 = ddf.set_index('x', shuffle='disk')
        ddf2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(str(tmpdir)))
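On dask 0.18 and later the same options are spelled through dask.config; a minimal sketch, assuming the temporary-directory config key and the 'processes' scheduler name:

import tempfile

import dask
import dask.bag as db

b = db.range(10, npartitions=4)

with tempfile.TemporaryDirectory() as scratch:
    # disk-backed shuffle scratch files go under our own directory
    with dask.config.set(temporary_directory=scratch, scheduler='processes'):
        b.groupby(lambda x: x % 2).compute()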
Example #54
    def test_simultaneous_compute(self):
        ds = Dataset({'foo': ('x', range(5)),
                      'bar': ('x', range(5))}).reblock()

        count = np.array(0)
        def counting_get(*args, **kwargs):
            count[...] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load_data()
        self.assertEqual(count, 1)
Example #55
def _futures_to_dask_array(futures, executor=None):
    executor = default_executor(executor)
    futures = np.array(futures, dtype=object)

    slices = [((0,) * i + (slice(None, None),) + (0,) * (futures.ndim - i - 1)) for i in range(futures.ndim)]
    chunks = [[executor.submit(get_dim, x, i) for x in futures[slc]] for i, slc in enumerate(slices)]
    dtype = executor.submit(get_dtype, futures.flat[0])

    chunks, dtype = yield executor._gather([chunks, dtype])
    chunks = tuple(map(tuple, chunks))

    name = "array-from-futures-" + tokenize(*futures.flat)
    keys = list(product([name], *map(range, futures.shape)))
    values = list(futures.flat)
    dsk = dict(zip(keys, values))

    if _globals["get"] != executor.get:
        print("Setting global dask scheduler to use distributed")
        dask.set_options(get=executor.get)

    raise gen.Return(da.Array(dsk, name, chunks, dtype))
Example #56
File: test_base.py Project: fortizc/dask
def test_optimize_None():
    da = pytest.importorskip('dask.array')

    x = da.ones(10, chunks=(5,))
    y = x[:9][1:8][::2] + 1  # normally these slices would be fused

    def my_get(dsk, keys):
        assert dsk == dict(y.dask)  # but they aren't
        return dask.get(dsk, keys)

    with dask.set_options(array_optimize=None, get=my_get):
        y.compute()
Example #57
def test_cache_options():
    try:
        from chest import Chest
    except ImportError:
        return
    cache = Chest()
    def inc2(x):
        assert 'y' in cache
        return x + 1

    with dask.set_options(cache=cache):
        get_sync({'x': (inc2, 'y'), 'y': 1}, 'x')
Example #58
    def test_simultaneous_compute(self):
        ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk()

        count = [0]

        def counting_get(*args, **kwargs):
            count[0] += 1
            return dask.get(*args, **kwargs)

        with dask.set_options(get=counting_get):
            ds.load()
        self.assertEqual(count[0], 1)
Example #59
def test_grid_search():
    pipeline = dl.Pipeline([("pca", PCA()),
                            ("select_k", SelectKBest()),
                            ("svm", LinearSVC())])
    param_grid = {'select_k__k': [1, 2, 3, 4],
                  'svm__C': np.logspace(-3, 2, 3)}
    grid = dl.GridSearchCV(pipeline, param_grid)

    with dask.set_options(get=dask.get):
        result = grid.fit(X_train, y_train).score(X_test, y_test)

    assert isinstance(result, float)
Example #60
def test_categorical_set_index():
    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': ['a', 'b', 'b', 'c']})
    df['y'] = df.y.astype('category')
    a = dd.from_pandas(df, npartitions=2)

    with dask.set_options(get=get_sync):
        b = a.set_index('y')
        df2 = df.set_index('y')
        assert list(b.index.compute()) == list(df2.index)

        b = a.set_index(a.y)
        df2 = df.set_index(df.y)
        assert list(b.index.compute()) == list(df2.index)