def _futures_to_dask_bag(futures, client=None):
    """Wrap a list of futures as a dask Bag, one future per partition.

    Tornado-coroutine style: the Bag is delivered via ``gen.Return``.
    """
    client = default_client(client)
    name = 'bag-from-futures-' + tokenize(*futures)
    # Each future becomes one partition keyed (name, partition-index).
    graph = {}
    for idx, fut in enumerate(futures):
        graph[(name, idx)] = fut
    ensure_default_get(client)
    raise gen.Return(db.Bag(graph, name, len(futures)))
def _futures_to_dask_bag(futures, executor=None):
    """Wrap a list of futures as a dask Bag, one future per partition.

    Tornado-coroutine style: the Bag is delivered via ``gen.Return``.
    """
    executor = default_executor(executor)
    token = tokenize(*futures)
    name = 'bag-from-futures-' + token
    # Partition keys (name, 0), (name, 1), ... mapped onto the futures.
    keys = [(name, i) for i in range(len(futures))]
    dsk = dict(zip(keys, futures))
    ensure_default_get(executor)
    raise gen.Return(db.Bag(dsk, name, len(futures)))
def test_accumulate():
    """Bag.accumulate: deterministic naming, initial values, and chaining."""
    parts = [[1, 2, 3], [4, 5], [], [6, 7]]
    dsk = {('test', i): part for i, part in enumerate(parts)}
    b = db.Bag(dsk, 'test', len(parts))
    r = b.accumulate(add)
    # Same arguments produce the same key; a different initial value does not.
    assert r.name == b.accumulate(add).name
    assert r.name != b.accumulate(add, -1).name
    assert r.compute() == [1, 3, 6, 10, 15, 21, 28]
    assert b.accumulate(add, -1).compute() == [-1, 0, 2, 5, 9, 14, 20, 27]
    # accumulate composes with downstream operations.
    assert b.accumulate(add).map(inc).compute() == [2, 4, 7, 11, 16, 22, 29]
    b = db.from_sequence([1, 2, 3], npartitions=1)
    assert b.accumulate(add).compute() == [1, 3, 6]
def _part_split(x, parts, prefix):
    """Build a new collection from the partitions of *x* listed in *parts*.

    Output partition ``i`` aliases input partition ``parts[i]``.
    """
    name = '{0}-{1}'.format(prefix, tokenize(x, parts))
    dsk = {(name, out_idx): (x.name, src_idx)
           for out_idx, src_idx in enumerate(parts)}
    if isinstance(x, db.Bag):
        return db.Bag(merge(dsk, x.dask), name, len(parts))
    # Matrix path: propagate dtype and (partially unknown) shape metadata.
    if x.ndim is None:
        shape = None
    elif x.ndim == 1:
        shape = (None, )
    else:
        shape = (None, x.shape[1])
    return dm.Matrix(merge(dsk, x.dask), name, len(parts),
                     dtype=x.dtype, shape=shape)
def _make_mimesis(field, schema, npartitions, records_per_partition, seed=None):
    """Make a Dask Bag filled with data randomly generated by the mimesis project.

    Parameters
    ----------
    field : dict
        Keyword arguments to pass to ``mimesis.Field``.
    schema : Callable[Field] -> dict
        The schema to use to generate the data.
    npartitions : int
    records_per_partition : int
    seed : int, None
        Seed for random data.

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    if not field:
        field = {}
    if seed is None:
        seed = random.random()

    # Derive one independent seed per partition from the top-level seed.
    seeds = db.core.random_state_data_python(npartitions, seed)

    name = "mimesis-" + tokenize(field, schema, npartitions,
                                 records_per_partition, seed)
    dsk = {}
    for i, part_seed in enumerate(seeds):
        dsk[(name, i)] = (_generate_mimesis, field, schema,
                          records_per_partition, part_seed)
    return db.Bag(dsk, name, npartitions)
def random_split(x, p_test=0.1, random_state=None):
    """Approximately split a dask collection into train/test data.

    Parameters
    ----------
    X : da.Array, db.Bag, or dm.Matrix
        The dask collection to split
    p_test : float, optional
        The fraction of samples to use in the test set. Default is 0.1.
    random_state : int or RandomState, optional
        The ``RandomState`` or seed to use when performing the random split.

    Returns
    -------
    train, test : same type as *x*

    Raises
    ------
    ValueError
        If ``p_test`` is not strictly between 0 and 1.
    TypeError
        If *x* is not a ``da.Array``, ``db.Bag``, or ``dm.Matrix``.
    """
    if not 0 < p_test < 1:
        raise ValueError("p_test must be in (0, 1)")

    random_state = check_random_state(random_state)
    # Token incorporates the RNG state so distinct states get distinct keys.
    token = tokenize(x, p_test, random_state.get_state())
    names = ['random-split-test-' + token, 'random-split-train-' + token]
    if isinstance(x, da.Array):
        # NOTE(review): presumably rechunks so each block spans all columns,
        # returning the block keys along axis 0 — confirm in helper.
        x, x_keys = _as_tall_skinny_and_keys(x)
        chunks = np.array(x.chunks[0])
        # One extra seed: seeds[0] draws per-block test sizes, the rest
        # seed the per-block splits.
        seeds = different_seeds(len(chunks) + 1, random_state)
        # Number of rows per block routed to the test set (binomial draw).
        n_test = np.random.RandomState(seeds[0]).binomial(chunks, p_test)
        n_train = chunks - n_test
        # Two graphs over the same blocks: b=True selects test rows,
        # b=False selects train rows.
        dsks = [dict(((name, ) + k[1:], (arr_split, k, n, b, s))
                     for k, n, s in zip(x_keys, n_test, seeds[1:]))
                for name, b in zip(names, [True, False])]
        test = da.Array(merge(dsks[0], x.dask), names[0],
                        (tuple(n_test), ) + x.chunks[1:], x.dtype)
        train = da.Array(merge(dsks[1], x.dask), names[1],
                         (tuple(n_train), ) + x.chunks[1:], x.dtype)
    elif isinstance(x, (db.Bag, dm.Matrix)):
        # One seed per partition; split function depends on collection type.
        seeds = different_seeds(x.npartitions, random_state)
        split = bag_split if isinstance(x, db.Bag) else mat_split
        dsks = [dict(((name, k[1]), (split, k, p_test, b, s))
                     for k, s in zip(x._keys(), seeds))
                for name, b in zip(names, [True, False])]
        if isinstance(x, dm.Matrix):
            # Row count is unknown after splitting, so the first axis is None.
            if x.ndim is not None:
                shape = (None, ) if x.ndim == 1 else (None, x.shape[1])
            else:
                shape = None
            test = dm.Matrix(merge(dsks[0], x.dask), names[0],
                             x.npartitions, dtype=x.dtype, shape=shape)
            train = dm.Matrix(merge(dsks[1], x.dask), names[1],
                              x.npartitions, dtype=x.dtype, shape=shape)
        else:
            test = db.Bag(merge(dsks[0], x.dask), names[0], x.npartitions)
            train = db.Bag(merge(dsks[1], x.dask), names[1], x.npartitions)
    else:
        raise TypeError("Expected an instance of ``da.Array``, ``db.Bag``, or "
                        "``dm.Matrix`` - got {0}".format(type(x).__name__))
    return train, test
def test_from_bag_multiple_in_partitions(mats, sol):
    """Partitions containing more than one matrix still convert correctly."""
    graph = {('b', 0): mats[:2], ('b', 1): [mats[2]]}
    bag = db.Bag(graph, 'b', 2)
    result = dm.from_bag(bag)
    assert eq(result.compute(), sol)