Example #1
0
def _futures_to_dask_bag(futures, client=None):
    """Assemble a dask Bag whose partitions are the given futures.

    Partition ``i`` of the resulting Bag maps directly onto ``futures[i]``.
    Written in tornado-coroutine style, so the result is delivered with
    ``raise gen.Return(...)`` rather than a plain ``return``.
    """
    client = default_client(client)

    # Deterministic graph name derived from the futures themselves.
    name = 'bag-from-futures-' + tokenize(*futures)
    dsk = dict(((name, idx), fut) for idx, fut in enumerate(futures))

    ensure_default_get(client)

    raise gen.Return(db.Bag(dsk, name, len(futures)))
Example #2
0
def _futures_to_dask_bag(futures, executor=None):
    """Build a dask Bag from a list of futures (one partition per future).

    Tornado-coroutine convention: the Bag is handed back through
    ``gen.Return`` instead of a normal ``return`` statement.
    """
    executor = default_executor(executor)

    name = 'bag-from-futures-' + tokenize(*futures)

    # Explicit loop instead of a comprehension; keys are (name, index).
    dsk = {}
    for idx, fut in enumerate(futures):
        dsk[(name, idx)] = fut

    ensure_default_get(executor)

    raise gen.Return(db.Bag(dsk, name, len(futures)))
Example #3
0
File: test_bag.py  Project: serazing/dask
def test_accumulate():
    """Bag.accumulate: deterministic naming, initial value, and chaining."""
    partitions = [[1, 2, 3], [4, 5], [], [6, 7]]
    graph = {('test', i): part for i, part in enumerate(partitions)}
    bag = db.Bag(graph, 'test', len(partitions))

    acc = bag.accumulate(add)
    # Same arguments -> same tokenized name...
    assert acc.name == bag.accumulate(add).name
    # ...but supplying an initial value must change it.
    assert acc.name != bag.accumulate(add, -1).name

    assert acc.compute() == [1, 3, 6, 10, 15, 21, 28]
    assert bag.accumulate(add, -1).compute() == [-1, 0, 2, 5, 9, 14, 20, 27]
    assert acc.map(inc).compute() == [2, 4, 7, 11, 16, 22, 29]

    # Single-partition case still accumulates correctly.
    single = db.from_sequence([1, 2, 3], npartitions=1)
    assert single.accumulate(add).compute() == [1, 3, 6]
Example #4
0
def _part_split(x, parts, prefix):
    """Select partitions of ``x`` by index, producing a new collection.

    ``parts`` lists source-partition indices; output partition ``i``
    aliases input partition ``parts[i]``.  Returns a ``db.Bag`` for Bag
    input, otherwise a ``dm.Matrix`` (with shape carried over when the
    input's ``ndim`` is known).
    """
    name = '{0}-{1}'.format(prefix, tokenize(x, parts))
    dsk = {(name, i): (x.name, j) for i, j in enumerate(parts)}

    if isinstance(x, db.Bag):
        return db.Bag(merge(dsk, x.dask), name, len(parts))

    # Matrix path: row count becomes unknown (None) after the split.
    if x.ndim is None:
        shape = None
    elif x.ndim == 1:
        shape = (None, )
    else:
        shape = (None, x.shape[1])
    return dm.Matrix(merge(dsk, x.dask), name, len(parts),
                     dtype=x.dtype, shape=shape)
Example #5
0
def _make_mimesis(field,
                  schema,
                  npartitions,
                  records_per_partition,
                  seed=None):
    """
    Make a Dask Bag filled with data randomly generated by the mimesis project

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.Field``
    schema: Callable[Field] -> dict
        The schema to use to generate the data
    npartitions: int
    records_per_partition: int
    seed: int, None
        Seed for random data

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    field = field or {}

    if seed is None:
        seed = random.random()

    # Derive one independent per-partition seed from the master seed.
    seeds = db.core.random_state_data_python(npartitions, seed)

    name = "mimesis-" + tokenize(field, schema, npartitions,
                                 records_per_partition, seed)
    # Use ``part_seed`` rather than re-using the name ``seed``: the master
    # ``seed`` above feeds the tokenized name, and shadowing it in the
    # comprehension was confusing (even though Py3 comprehension scoping
    # made it harmless).
    dsk = {(name, i):
           (_generate_mimesis, field, schema, records_per_partition, part_seed)
           for i, part_seed in enumerate(seeds)}

    return db.Bag(dsk, name, npartitions)
Example #6
0
def random_split(x, p_test=0.1, random_state=None):
    """Approximately split a dask collection into train/test data.

    Parameters
    ----------
    X : da.Array, db.Bag, or dm.Matrix
        The dask collection to split
    p_test : float, optional
        The fraction of samples to use in the test set. Default is 0.1.
    random_state : int or RandomState, optional
        The ``RandomState`` or seed to use when performing the random split.
    """
    if not 0 < p_test < 1:
        raise ValueError("p_test must be in (0, 1)")

    random_state = check_random_state(random_state)
    # Token covers the inputs and the RNG state so repeated calls with the
    # same state produce identical graph names.
    token = tokenize(x, p_test, random_state.get_state())
    names = ['random-split-test-' + token, 'random-split-train-' + token]

    if isinstance(x, da.Array):
        # Reshape to a tall-skinny layout so each chunk is a row block;
        # x_keys are the per-block graph keys.
        x, x_keys = _as_tall_skinny_and_keys(x)
        chunks = np.array(x.chunks[0])
        # One seed per block plus one extra (seeds[0]) used below to draw
        # the per-block test-row counts.
        seeds = different_seeds(len(chunks) + 1, random_state)
        n_test = np.random.RandomState(seeds[0]).binomial(chunks, p_test)
        n_train = chunks - n_test
        # Two graphs built in lockstep: b=True selects test rows,
        # b=False the complementary train rows, per block.
        dsks = [
            dict(((name, ) + k[1:], (arr_split, k, n, b, s))
                 for k, n, s in zip(x_keys, n_test, seeds[1:]))
            for name, b in zip(names, [True, False])
        ]

        test = da.Array(merge(dsks[0], x.dask), names[0],
                        (tuple(n_test), ) + x.chunks[1:], x.dtype)
        train = da.Array(merge(dsks[1], x.dask), names[1],
                         (tuple(n_train), ) + x.chunks[1:], x.dtype)

    elif isinstance(x, (db.Bag, dm.Matrix)):
        # One seed per partition; the same seed is reused for the test and
        # train graphs of a partition so the two selections are exact
        # complements of each other.
        seeds = different_seeds(x.npartitions, random_state)
        split = bag_split if isinstance(x, db.Bag) else mat_split
        dsks = [
            dict(((name, k[1]), (split, k, p_test, b, s))
                 for k, s in zip(x._keys(), seeds))
            for name, b in zip(names, [True, False])
        ]

        if isinstance(x, dm.Matrix):
            # Row counts become unknown after splitting; keep the column
            # count when ndim is known.
            if x.ndim is not None:
                shape = (None, ) if x.ndim == 1 else (None, x.shape[1])
            else:
                shape = None
            test = dm.Matrix(merge(dsks[0], x.dask),
                             names[0],
                             x.npartitions,
                             dtype=x.dtype,
                             shape=shape)
            train = dm.Matrix(merge(dsks[1], x.dask),
                              names[1],
                              x.npartitions,
                              dtype=x.dtype,
                              shape=shape)

        else:
            test = db.Bag(merge(dsks[0], x.dask), names[0], x.npartitions)
            train = db.Bag(merge(dsks[1], x.dask), names[1], x.npartitions)
    else:
        raise TypeError("Expected an instance of ``da.Array``, ``db.Bag``, or "
                        "``dm.Matrix`` - got {0}".format(type(x).__name__))

    return train, test
Example #7
0
def test_from_bag_multiple_in_partitions(mats, sol):
    """Matrices spread unevenly across Bag partitions still round-trip."""
    graph = {('b', 0): mats[:2], ('b', 1): [mats[2]]}
    bag = db.Bag(graph, 'b', 2)
    result = dm.from_bag(bag)
    assert eq(result.compute(), sol)