Example #1
    def random_frame(self, seed: int, dc: DataContainer,
                     **kwargs) -> dd.Series:
        """This function - in contrast to others in this module - will only ever be called on data frames"""

        random_state = np.random.RandomState(seed=seed)

        # Idea taken from dask.DataFrame.sample:
        # initialize a random state for each of the partitions
        # separately and then create a random series
        # for each partition
        df = dc.df
        name = "sample-" + tokenize(df, random_state)

        state_data = random_state_data(df.npartitions, random_state)
        dsk = {(name, i): (
            self.random_function,
            (df._name, i),
            np.random.RandomState(state),
            kwargs,
        )
               for i, state in enumerate(state_data)}

        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
        random_series = Series(graph, name, ("random", "float64"),
                               df.divisions)

        # This part seems to be stupid, but helps us do a very simple
        # task without going into the (private) internals of Dask:
        # copy all meta information from the original input dataframe
        # This is important so that the returned series looks
        # exactly like coming from the input dataframe
        return_df = df.assign(random=random_series)["random"]
        return return_df
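
The hand-built graph above mainly avoids touching Dask's private internals. A minimal sketch of the same per-partition seeding idea, built from delayed partitions instead of a raw task graph (the function and column names are illustrative; `random_state_data` is assumed importable from dask.utils, as in Dask itself):

import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.utils import random_state_data


def add_random_column(ddf, seed=0):
    # One independent RandomState seed per partition, so the result is
    # reproducible no matter how the partitions are scheduled.
    states = random_state_data(ddf.npartitions, seed)

    def _with_random(part, state):
        rng = np.random.RandomState(state)
        return part.assign(random=rng.random_sample(len(part)))

    parts = [
        dask.delayed(_with_random)(p, s)
        for p, s in zip(ddf.to_delayed(), states)
    ]
    return dd.from_delayed(parts)


ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)
print(add_random_column(ddf, seed=42)["random"].compute())
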
Example #2
    def split(self, X, y=None):
        """Iterate tuples of data split into training and test sets.

        Parameters
        ----------
        X : dask object
            Training data. May be a ``da.Array``, ``db.Bag``, or
            ``dklearn.Matrix``.

        y : dask object, optional
            The target variable for supervised learning problems.

        Yields
        -------
        X_train, y_train, X_test, y_test : dask objects
            The split training and testing data, returned as the same type as
            the input. If y is not provided, ``y_train`` and ``y_test`` will be
            ``None``.
        """
        X, y = check_X_y(X, y)
        seeds = random_state_data(self.n_iter, random_state=self.random_state)
        for seed in seeds:
            X_train, X_test = random_split(X, self.test_size, seed)
            if y is None:
                y_train = y_test = None
            else:
                y_train, y_test = random_split(y, self.test_size, seed)
            yield X_train, y_train, X_test, y_test
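
A hedged usage sketch for the iterator above. The constructor arguments (`n_iter`, `test_size`, `random_state`) are inferred from the attributes the method reads; they are an assumption, not taken from the source:

import dask.array as da

cv = RandomSplit(n_iter=3, test_size=0.2, random_state=0)  # hypothetical ctor
X = da.random.random((100, 4), chunks=(25, 4))
y = da.random.randint(0, 2, size=100, chunks=25)

for X_train, y_train, X_test, y_test in cv.split(X, y):
    # each iteration yields lazy dask collections of the same type as X and y
    print(X_train.compute().shape, X_test.compute().shape)
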
Example #3
    def sample(self,
               n=None,
               frac=None,
               replace=False,
               weights=None,
               random_state=None,
               axis=None):
        axis = axis or 0
        if axis not in [0, 1]:
            raise ValueError("Axis must be either 0 or 1.")
        if axis == 0 and n is not None:
            raise NotImplementedError("Only `frac` can be used to sample rows"
                                      " from Dask SparseFrame, not `n`.")
        if (n is None) == (frac is None):
            raise ValueError("Please specify either `n` or `frac`.")
        if weights is not None:
            raise NotImplementedError("`weights` argument is not supported.")

        if random_state is None:
            random_state = np.random.RandomState()
        state_data = random_state_data(self.npartitions, random_state)
        state_data = (x for x in state_data)

        if axis == 0:
            return self.map_partitions(sp.SparseFrame.sample,
                                       self._meta,
                                       frac=frac,
                                       replace=replace,
                                       axis=0,
                                       random_state=state_data)
        if axis == 1:
            cols = self._meta\
                .sample(n=n, frac=frac, replace=replace, axis=1)\
                .columns.tolist()
            return self[cols]
Example #4
def test_random_state_data():
    seed = 37
    state = np.random.RandomState(seed)
    n = 100000

    # Use an integer
    states = random_state_data(n, seed)
    assert len(states) == n

    # Use RandomState object
    states2 = random_state_data(n, state)
    for s1, s2 in zip(states, states2):
        assert (s1 == s2).all()

    # Consistent ordering
    states = random_state_data(10, 1234)
    states2 = random_state_data(20, 1234)[:10]

    for s1, s2 in zip(states, states2):
        assert (s1 == s2).all()
Example #5
def partition_quantiles(df, npartitions, upsample=1.0, random_state=None):
    """Approximate quantiles of Series used for repartitioning"""
    assert isinstance(df, Series)
    # currently, only Series has quantile method
    # Index.quantile(list-like) must be pd.Series, not pd.Index
    return_type = Series

    qs = np.linspace(0, 1, npartitions + 1)
    token = tokenize(df, qs, upsample)
    if random_state is None:
        random_state = int(token, 16) % np.iinfo(np.int32).max
    state_data = random_state_data(df.npartitions, random_state)

    df_keys = df.__dask_keys__()

    name0 = "re-quantiles-0-" + token
    dtype_dsk = {(name0, 0): (dtype_info, df_keys[0])}

    name1 = "re-quantiles-1-" + token
    val_dsk = {
        (name1, i): (
            percentiles_summary,
            key,
            df.npartitions,
            npartitions,
            upsample,
            state,
        )
        for i, (state, key) in enumerate(zip(state_data, df_keys))
    }

    name2 = "re-quantiles-2-" + token
    merge_dsk = create_merge_tree(merge_and_compress_summaries, sorted(val_dsk), name2)
    if not merge_dsk:
        # Compress the data even if we only have one partition
        merge_dsk = {(name2, 0, 0): (merge_and_compress_summaries, [list(val_dsk)[0]])}

    merged_key = max(merge_dsk)

    name3 = "re-quantiles-3-" + token
    last_dsk = {
        (name3, 0): (
            pd.Series,  # TODO: Use `type(df._meta)` when cudf adds `tolist()`
            (process_val_weights, merged_key, npartitions, (name0, 0)),
            qs,
            None,
            df.name,
        )
    }

    dsk = merge(df.dask, dtype_dsk, val_dsk, merge_dsk, last_dsk)
    new_divisions = [0.0, 1.0]
    return return_type(dsk, name3, df._meta, new_divisions)
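
A hedged usage sketch: partition_quantiles is an internal Dask helper, but its call pattern follows directly from the signature above (the input Series here is made up for illustration):

import numpy as np
import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series(np.random.default_rng(0).random(1000)),
                   npartitions=4)
q = partition_quantiles(s, npartitions=8)  # lazy Series of 9 quantile values
print(q.compute())  # candidate division boundaries for repartitioning
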
Example #6
def test_random_state_data():
    np = pytest.importorskip("numpy")
    seed = 37
    state = np.random.RandomState(seed)
    n = 10000

    # Use an integer
    states = random_state_data(n, seed)
    assert len(states) == n

    # Use RandomState object
    states2 = random_state_data(n, state)
    for s1, s2 in zip(states, states2):
        assert s1.shape == (624, )
        assert (s1 == s2).all()

    # Consistent ordering
    states = random_state_data(10, 1234)
    states2 = random_state_data(20, 1234)[:10]

    for s1, s2 in zip(states, states2):
        assert (s1 == s2).all()
Example #7
def train_test_split(*arrays, **options):
    """Split dask collections into random train and test subsets.

    Quick utility that wraps input validation and calls to train/test splitting
    with ``RandomSplit`` into a single call for splitting data in a oneliner.

    Parameters
    ----------
    *arrays : sequence of dask collections with same length and partitions

        Allowed inputs are ``db.Bag``, ``da.Array``, or ``dm.Matrix``. All
        inputs must share the same length and partitions.

    test_size : float, optional
        Should be between 0.0 and 1.0 and represent the proportion of the
        dataset to include in the test split. Default is 0.25.

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.

    Returns
    -------
    splitting : list, length = 2 * len(arrays)
        List containing train-test split of inputs.

    Examples
    --------
    >>> X_train, X_test, y_train, y_test = train_test_split(  # doctest: +SKIP
    ...     X, y, test_size=0.20, random_state=42)
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    check_aligned_partitions(*arrays)

    test_size = options.pop('test_size', 0.25)
    random_state = options.pop('random_state', None)

    if options:
        raise ValueError("Invalid parameters passed: %s" % str(options))

    seed = random_state_data(1, random_state=random_state)[0]
    return list(concat(random_split(a, test_size, seed) for a in arrays))
Example #8
    def random_split(self, frac, random_state=None):
        if not np.allclose(sum(frac), 1):
            raise ValueError("frac should sum to 1")
        state_data = random_state_data(self.npartitions, random_state)

        partitions = self.to_delayed()
        partitions = [
            delayed(pd_split)(sf, frac, state)
            for sf, state in zip(partitions, state_data)
        ]

        splits = []
        for i in range(len(frac)):
            split_delayed = [delayed(itemgetter(i))(sf) for sf in partitions]
            split = from_delayed(split_delayed,
                                 prefix='random-split',
                                 meta=self._meta)
            splits.append(split)

        return splits
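
Hypothetical usage of the method above: `frac` is a list of fractions summing to 1, and one collection is returned per fraction (`dsf` stands in for a dask-backed SparseFrame and is not part of the snippet):

train, valid, test = dsf.random_split([0.7, 0.15, 0.15], random_state=42)
# each split keeps the original partitioning, since every input partition
# is split independently and reassembled with from_delayed
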
Example #9
    def _wrap(self,
              funcname,
              *args,
              size=None,
              chunks="auto",
              extra_chunks=(),
              **kwargs):
        """Wrap numpy random function to produce dask.array random function

        extra_chunks should be a chunks tuple to append to the end of chunks
        """
        if size is not None and not isinstance(size, (tuple, list)):
            size = (size, )

        shapes = list({
            ar.shape
            for ar in chain(args, kwargs.values())
            if isinstance(ar, (Array, np.ndarray))
        })
        if size is not None:
            shapes.append(size)
        # broadcast to the final size(shape)
        size = broadcast_shapes(*shapes)
        chunks = normalize_chunks(
            chunks,
            size,  # ideally would use dtype here
            dtype=kwargs.get("dtype", np.float64),
        )
        slices = slices_from_chunks(chunks)

        def _broadcast_any(ar, shape, chunks):
            if isinstance(ar, Array):
                return broadcast_to(ar, shape).rechunk(chunks)
            if isinstance(ar, np.ndarray):
                return np.ascontiguousarray(np.broadcast_to(ar, shape))

        # Broadcast all arguments, get tiny versions as well
        # Start adding the relevant bits to the graph
        dsk = {}
        lookup = {}
        small_args = []
        dependencies = []
        for i, ar in enumerate(args):
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dependencies.append(res)
                    lookup[i] = res.name
                elif isinstance(res, np.ndarray):
                    name = f"array-{tokenize(res)}"
                    lookup[i] = name
                    dsk[name] = res
                small_args.append(ar[tuple(0 for _ in ar.shape)])
            else:
                small_args.append(ar)

        small_kwargs = {}
        for key, ar in kwargs.items():
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dependencies.append(res)
                    lookup[key] = res.name
                elif isinstance(res, np.ndarray):
                    name = f"array-{tokenize(res)}"
                    lookup[key] = name
                    dsk[name] = res
                small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
            else:
                small_kwargs[key] = ar

        sizes = list(product(*chunks))
        seeds = random_state_data(len(sizes), self._numpy_state)
        token = tokenize(seeds, size, chunks, args, kwargs)
        name = f"{funcname}-{token}"

        keys = product([name],
                       *([range(len(bd))
                          for bd in chunks] + [[0]] * len(extra_chunks)))
        blocks = product(*[range(len(bd)) for bd in chunks])

        vals = []
        for seed, size, slc, block in zip(seeds, sizes, slices, blocks):
            arg = []
            for i, ar in enumerate(args):
                if i not in lookup:
                    arg.append(ar)
                else:
                    if isinstance(ar, Array):
                        arg.append((lookup[i], ) + block)
                    else:  # np.ndarray
                        arg.append((getitem, lookup[i], slc))
            kwrg = {}
            for k, ar in kwargs.items():
                if k not in lookup:
                    kwrg[k] = ar
                else:
                    if isinstance(ar, Array):
                        kwrg[k] = (lookup[k], ) + block
                    else:  # np.ndarray
                        kwrg[k] = (getitem, lookup[k], slc)
            vals.append((_apply_random, self._RandomState, funcname, seed,
                         size, arg, kwrg))

        meta = _apply_random(
            self._RandomState,
            funcname,
            seed,
            (0, ) * len(size),
            small_args,
            small_kwargs,
        )

        dsk.update(dict(zip(keys, vals)))

        graph = HighLevelGraph.from_collections(name,
                                                dsk,
                                                dependencies=dependencies)
        return Array(graph, name, chunks + extra_chunks, meta=meta)
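
The essential trick in `_wrap` is drawing one entry of `random_state_data` per output chunk. A minimal sketch of that per-chunk seeding, using `map_blocks` with `block_id` instead of a hand-built graph (function names here are illustrative, not Dask API):

import numpy as np
import dask.array as da
from dask.utils import random_state_data


def uniform_per_chunk(shape, chunks, seed=0):
    template = da.zeros(shape, chunks=chunks, dtype="f8")  # chunk layout only
    states = random_state_data(int(np.prod(template.numblocks)), seed)

    def _fill(block, block_id=None):
        # map the block coordinates to a flat index into the seed list
        flat = int(np.ravel_multi_index(block_id, template.numblocks))
        rng = np.random.RandomState(states[flat])
        return rng.random_sample(block.shape)

    return template.map_blocks(_fill, dtype="f8")


x = uniform_per_chunk((6, 6), chunks=(3, 3), seed=1)
assert (x.compute() == uniform_per_chunk((6, 6), (3, 3), 1).compute()).all()
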
Example #10
        def choice(self, a, size=None, replace=True, p=None, chunks="auto"):
            dependencies = []
            # Normalize and validate `a`
            if isinstance(a, Integral):
                # On windows the output dtype differs if p is provided or
                # absent, see https://github.com/numpy/numpy/issues/9867
                dummy_p = np.array([1]) if p is not None else p
                dtype = np.random.choice(1, size=(), p=dummy_p).dtype
                len_a = a
                if a < 0:
                    raise ValueError("a must be greater than 0")
            else:
                a = asarray(a)
                a = a.rechunk(a.shape)
                dtype = a.dtype
                if a.ndim != 1:
                    raise ValueError("a must be one dimensional")
                len_a = len(a)
                dependencies.append(a)
                a = a.__dask_keys__()[0]

            # Normalize and validate `p`
            if p is not None:
                if not isinstance(p, Array):
                    # If p is not a dask array, first check the sum is close
                    # to 1 before converting.
                    p = np.asarray(p)
                    if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                        raise ValueError("probabilities do not sum to 1")
                    p = asarray(p)
                else:
                    p = p.rechunk(p.shape)

                if p.ndim != 1:
                    raise ValueError("p must be one dimensional")
                if len(p) != len_a:
                    raise ValueError("a and p must have the same size")

                dependencies.append(p)
                p = p.__dask_keys__()[0]

            if size is None:
                size = ()
            elif not isinstance(size, (tuple, list)):
                size = (size, )

            chunks = normalize_chunks(chunks, size, dtype=np.float64)
            if not replace and len(chunks[0]) > 1:
                err_msg = ("replace=False is not currently supported for "
                           "dask.array.choice with multi-chunk output "
                           "arrays")
                raise NotImplementedError(err_msg)
            sizes = list(product(*chunks))
            state_data = random_state_data(len(sizes), self._numpy_state)

            name = "da.random.choice-%s" % tokenize(state_data, size, chunks,
                                                    a, replace, p)
            keys = product([name], *(range(len(bd)) for bd in chunks))
            dsk = {
                k: (_choice, state, a, size, replace, p)
                for k, state, size in zip(keys, state_data, sizes)
            }

            graph = HighLevelGraph.from_collections(name,
                                                    dsk,
                                                    dependencies=dependencies)
            return Array(graph, name, chunks, dtype=dtype)
Example #11
    def __call__(self, part):
        divisions, state_data = part
        if isinstance(state_data, int):
            state_data = random_state_data(1, state_data)
        return make_timeseries_part(divisions[0], divisions[1], self.dtypes,
                                    self.freq, state_data, self.kwargs)
Example #12
def make_timeseries(
    start="2000-01-01",
    end="2000-12-31",
    dtypes={
        "name": str,
        "id": int,
        "x": float,
        "y": float
    },
    freq="10s",
    partition_freq="1M",
    seed=None,
    **kwargs,
):
    """Create timeseries dataframe with random data

    Parameters
    ----------
    start: datetime (or datetime-like string)
        Start of time series
    end: datetime (or datetime-like string)
        End of time series
    dtypes: dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}
    freq: string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq: string
        String like '1M' or '2Y' to divide the dataframe into partitions
    seed: int (optional)
        Randomstate seed
    kwargs:
        Keywords to pass down to individual column creation functions.
        Keywords should be prefixed by the column name and then an underscore.

    Examples
    --------
    >>> import dask.dataframe as dd
    >>> df = dd.demo.make_timeseries('2000', '2010',
    ...                              {'value': float, 'name': str, 'id': int},
    ...                              freq='2H', partition_freq='1D', seed=1)
    >>> df.head()  # doctest: +SKIP
                           id      name     value
    2000-01-01 00:00:00   969     Jerry -0.309014
    2000-01-01 02:00:00  1010       Ray -0.760675
    2000-01-01 04:00:00  1016  Patricia -0.063261
    2000-01-01 06:00:00   960   Charlie  0.788245
    2000-01-01 08:00:00  1031     Kevin  0.466002
    """
    divisions = list(pd.date_range(start=start, end=end, freq=partition_freq))
    npartitions = len(divisions) - 1
    if seed is None:
        # Get random integer seed for each partition. We can
        # call `random_state_data` in `MakeTimeseriesPart`
        state_data = np.random.randint(2e9, size=npartitions)
    else:
        state_data = random_state_data(npartitions, seed)

    # Build parts
    parts = []
    for i in range(len(divisions) - 1):
        parts.append((divisions[i:i + 2], state_data[i]))

    # Construct the output collection with from_map
    return from_map(
        MakeTimeseriesPart(dtypes, freq, kwargs),
        parts,
        meta=make_timeseries_part("2000", "2000", dtypes, "1H", state_data[0],
                                  kwargs),
        divisions=divisions,
        label="make-timeseries",
        token=tokenize(start, end, dtypes, freq, partition_freq, state_data),
        enforce_metadata=False,
    )
Example #13
def random_split(x, p_test=0.1, random_state=None):
    """Approximately split a dask collection into train/test data.

    Parameters
    ----------
    x : da.Array, db.Bag, or dm.Matrix
        The dask collection to split
    p_test : float, optional
        The fraction of samples to use in the test set. Default is 0.1.
    random_state : int or RandomState, optional
        The ``RandomState`` or seed to use when performing the random split.
    """
    if not 0 < p_test < 1:
        raise ValueError("p_test must be in (0, 1)")

    random_state = _check_random_state(random_state)
    token = tokenize(x, p_test, random_state.get_state())
    names = ['random-split-test-' + token, 'random-split-train-' + token]

    if isinstance(x, da.Array):
        x, x_keys = _as_tall_skinny_and_keys(x)
        chunks = np.array(x.chunks[0])
        seeds = random_state_data(len(chunks) + 1, random_state)
        n_test = np.random.RandomState(seeds[0]).binomial(chunks, p_test)
        n_train = chunks - n_test
        dsks = [
            dict(((name, ) + k[1:], (arr_split, k, n, b, s))
                 for k, n, s in zip(x_keys, n_test, seeds[1:]))
            for name, b in zip(names, [True, False])
        ]

        test = da.Array(merge(dsks[0], x.dask), names[0],
                        (tuple(n_test), ) + x.chunks[1:], x.dtype)
        train = da.Array(merge(dsks[1], x.dask), names[1],
                         (tuple(n_train), ) + x.chunks[1:], x.dtype)

    elif isinstance(x, (db.Bag, dm.Matrix)):
        seeds = random_state_data(x.npartitions, random_state)
        split = bag_split if isinstance(x, db.Bag) else mat_split
        dsks = [
            dict(((name, k[1]), (split, k, p_test, b, s))
                 for k, s in zip(x._keys(), seeds))
            for name, b in zip(names, [True, False])
        ]

        if isinstance(x, dm.Matrix):
            if x.ndim is not None:
                shape = (None, ) if x.ndim == 1 else (None, x.shape[1])
            else:
                shape = None
            test = dm.Matrix(merge(dsks[0], x.dask),
                             names[0],
                             x.npartitions,
                             dtype=x.dtype,
                             shape=shape)
            train = dm.Matrix(merge(dsks[1], x.dask),
                              names[1],
                              x.npartitions,
                              dtype=x.dtype,
                              shape=shape)

        else:
            test = db.Bag(merge(dsks[0], x.dask), names[0], x.npartitions)
            train = db.Bag(merge(dsks[1], x.dask), names[1], x.npartitions)
    else:
        raise TypeError("Expected an instance of ``da.Array``, ``db.Bag``, or "
                        "``dm.Matrix`` - got {0}".format(type(x).__name__))

    return train, test
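
A hedged usage sketch for the da.Array branch of the function above, using only the documented signature:

import dask.array as da

x = da.random.random((1000, 5), chunks=(100, 5))
train, test = random_split(x, p_test=0.2, random_state=0)
# split sizes are drawn per chunk from a binomial, so the test set holds
# approximately (not exactly) 20% of the rows
print(train.shape, test.shape)
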