def random_frame(self, seed: int, dc: DataContainer, **kwargs) -> dd.Series:
    """This function - in contrast to others in this module - will only ever be called on data frames"""
    random_state = np.random.RandomState(seed=seed)

    # Idea taken from dask.DataFrame.sample:
    # initialize a random state for each of the partitions
    # separately and then create a random series
    # for each partition
    df = dc.df
    name = "sample-" + tokenize(df, random_state)

    state_data = random_state_data(df.npartitions, random_state)
    dsk = {
        (name, i): (
            self.random_function,
            (df._name, i),
            np.random.RandomState(state),
            kwargs,
        )
        for i, state in enumerate(state_data)
    }

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
    random_series = Series(graph, name, ("random", "float64"), df.divisions)

    # This part seems to be stupid, but helps us do a very simple
    # task without going into the (private) internals of Dask:
    # copy all meta information from the original input dataframe
    # This is important so that the returned series looks
    # exactly like coming from the input dataframe
    return_df = df.assign(random=random_series)["random"]
    return return_df
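# A minimal, hedged sketch of the same per-partition idea outside of dask-sql:
# attach a reproducible random column to a dask DataFrame by seeding one
# ``np.random.RandomState`` per partition from ``random_state_data``. The helper
# ``_add_random_column`` is hypothetical and only illustrates the pattern; it is
# not part of the code above. Assumes ``random_state_data`` is importable from
# ``dask.utils`` as in recent dask releases.
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask import delayed
from dask.utils import random_state_data


def _add_random_column(partition: pd.DataFrame, state) -> pd.DataFrame:
    # one independent RandomState per partition keeps results deterministic
    rng = np.random.RandomState(state)
    return partition.assign(random=rng.random_sample(len(partition)))


pdf = pd.DataFrame({"x": range(10)})
ddf = dd.from_pandas(pdf, npartitions=2)
states = random_state_data(ddf.npartitions, np.random.RandomState(42))

parts = [
    delayed(_add_random_column)(part, state)
    for part, state in zip(ddf.to_delayed(), states)
]
result = dd.from_delayed(parts, meta=pdf.head(0).assign(random=0.0))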
def split(self, X, y=None):
    """Iterate tuples of data split into training and test sets.

    Parameters
    ----------
    X : dask object
        Training data. May be a ``da.Array``, ``db.Bag``, or
        ``dklearn.Matrix``.
    y : dask object, optional
        The target variable for supervised learning problems.

    Yields
    ------
    X_train, y_train, X_test, y_test : dask objects
        The split training and testing data, returned as the same type as
        the input. If y is not provided, ``y_train`` and ``y_test`` will be
        ``None``.
    """
    X, y = check_X_y(X, y)
    seeds = random_state_data(self.n_iter, random_state=self.random_state)
    for seed in seeds:
        X_train, X_test = random_split(X, self.test_size, seed)
        if y is None:
            y_train = y_test = None
        else:
            y_train, y_test = random_split(y, self.test_size, seed)
        yield X_train, y_train, X_test, y_test
def sample(self, n=None, frac=None, replace=False,
           weights=None, random_state=None, axis=None):
    axis = axis or 0
    if axis not in [0, 1]:
        raise ValueError("Axis must be either 0 or 1.")
    if axis == 0 and n is not None:
        raise NotImplementedError("Only `frac` can be used to sample rows"
                                  " from Dask SparseFrame, not `n`.")
    if (n is None) == (frac is None):
        raise ValueError("Please specify either `n` or `frac`.")
    if weights is not None:
        raise NotImplementedError("`weights` argument is not supported.")
    if random_state is None:
        random_state = np.random.RandomState()
    state_data = random_state_data(self.npartitions, random_state)
    state_data = (x for x in state_data)

    if axis == 0:
        return self.map_partitions(sp.SparseFrame.sample,
                                   self._meta,
                                   frac=frac, replace=replace,
                                   axis=0,
                                   random_state=state_data)
    if axis == 1:
        cols = (
            self._meta
            .sample(n=n, frac=frac, replace=replace, axis=1)
            .columns.tolist()
        )
        return self[cols]
def test_random_state_data():
    seed = 37
    state = np.random.RandomState(seed)
    n = 100000

    # Use an integer
    states = random_state_data(n, seed)
    assert len(states) == n

    # Use RandomState object
    states2 = random_state_data(n, state)
    for s1, s2 in zip(states, states2):
        assert (s1 == s2).all()

    # Consistent ordering
    states = random_state_data(10, 1234)
    states2 = random_state_data(20, 1234)[:10]
    for s1, s2 in zip(states, states2):
        assert (s1 == s2).all()
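# Hedged usage sketch for the ``random_state_data`` helper exercised by the
# test above; it assumes the function is importable from ``dask.utils`` as in
# recent dask releases. Each returned entry is a 624-word uint32 Mersenne
# Twister state vector, so it can seed an independent ``np.random.RandomState``
# per partition, and the same seed always reproduces the same list of states.
import numpy as np
from dask.utils import random_state_data

states = random_state_data(4, random_state=37)
streams = [np.random.RandomState(s).uniform(size=3) for s in states]

states_again = random_state_data(4, random_state=37)
assert all((a == b).all() for a, b in zip(states, states_again))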
def partition_quantiles(df, npartitions, upsample=1.0, random_state=None):
    """Approximate quantiles of Series used for repartitioning"""
    assert isinstance(df, Series)
    # currently, only Series has quantile method
    # Index.quantile(list-like) must be pd.Series, not pd.Index
    return_type = Series

    qs = np.linspace(0, 1, npartitions + 1)
    token = tokenize(df, qs, upsample)
    if random_state is None:
        random_state = int(token, 16) % np.iinfo(np.int32).max
    state_data = random_state_data(df.npartitions, random_state)

    df_keys = df.__dask_keys__()

    name0 = "re-quantiles-0-" + token
    dtype_dsk = {(name0, 0): (dtype_info, df_keys[0])}

    name1 = "re-quantiles-1-" + token
    val_dsk = {
        (name1, i): (
            percentiles_summary,
            key,
            df.npartitions,
            npartitions,
            upsample,
            state,
        )
        for i, (state, key) in enumerate(zip(state_data, df_keys))
    }

    name2 = "re-quantiles-2-" + token
    merge_dsk = create_merge_tree(merge_and_compress_summaries, sorted(val_dsk), name2)
    if not merge_dsk:
        # Compress the data even if we only have one partition
        merge_dsk = {(name2, 0, 0): (merge_and_compress_summaries, [list(val_dsk)[0]])}

    merged_key = max(merge_dsk)

    name3 = "re-quantiles-3-" + token
    last_dsk = {
        (name3, 0): (
            pd.Series,  # TODO: Use `type(df._meta)` when cudf adds `tolist()`
            (process_val_weights, merged_key, npartitions, (name0, 0)),
            qs,
            None,
            df.name,
        )
    }

    dsk = merge(df.dask, dtype_dsk, val_dsk, merge_dsk, last_dsk)
    new_divisions = [0.0, 1.0]
    return return_type(dsk, name3, df._meta, new_divisions)
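# Hedged usage sketch: ``partition_quantiles`` is an internal dask helper, so
# this only illustrates what it computes. Given a dask Series and a target
# partition count, it returns a single-partition dask Series whose computed
# values are npartitions + 1 approximate quantiles, indexed by the quantile
# fractions 0.0 .. 1.0, which dask later uses as new divisions.
import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series(range(1000), name="x"), npartitions=8)
divisions = partition_quantiles(s, npartitions=4, random_state=0).compute()
# ``divisions`` holds 5 values running from roughly the minimum to the maximum of ``s``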
def test_random_state_data():
    np = pytest.importorskip("numpy")
    seed = 37
    state = np.random.RandomState(seed)
    n = 10000

    # Use an integer
    states = random_state_data(n, seed)
    assert len(states) == n

    # Use RandomState object
    states2 = random_state_data(n, state)
    for s1, s2 in zip(states, states2):
        assert s1.shape == (624,)
        assert (s1 == s2).all()

    # Consistent ordering
    states = random_state_data(10, 1234)
    states2 = random_state_data(20, 1234)[:10]
    for s1, s2 in zip(states, states2):
        assert (s1 == s2).all()
def train_test_split(*arrays, **options):
    """Split dask collections into random train and test subsets.

    Quick utility that wraps input validation and calls to train/test
    splitting with ``RandomSplit`` into a single call for splitting data in a
    one-liner.

    Parameters
    ----------
    *arrays : sequence of dask collections with same length and partitions
        Allowed inputs are ``db.Bag``, ``da.Array``, or ``dm.Matrix``. All
        inputs must share the same length and partitions.
    test_size : float, optional
        Should be between 0.0 and 1.0 and represent the proportion of the
        dataset to include in the test split. Default is 0.25.
    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.

    Returns
    -------
    splitting : list, length = 2 * len(arrays)
        List containing train-test split of inputs.

    Examples
    --------
    >>> X_train, X_test, y_train, y_test = train_test_split(  # doctest: +SKIP
    ...     X, y, test_size=0.20, random_state=42)
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    check_aligned_partitions(*arrays)
    test_size = options.pop('test_size', 0.25)
    random_state = options.pop('random_state', None)
    if options:
        raise ValueError("Invalid parameters passed: %s" % str(options))
    seed = random_state_data(1, random_state=random_state)[0]
    return list(concat(random_split(a, test_size, seed) for a in arrays))
def random_split(self, frac, random_state=None):
    if not np.allclose(sum(frac), 1):
        raise ValueError("frac should sum to 1")
    state_data = random_state_data(self.npartitions, random_state)
    partitions = self.to_delayed()
    partitions = [
        delayed(pd_split)(sf, frac, state)
        for sf, state in zip(partitions, state_data)
    ]

    splits = []
    for i in range(len(frac)):
        split_delayed = [delayed(itemgetter(i))(sf) for sf in partitions]
        split = from_delayed(split_delayed, prefix='random-split',
                             meta=self._meta)
        splits.append(split)
    return splits
def _wrap(self, funcname, *args, size=None, chunks="auto", extra_chunks=(), **kwargs):
    """Wrap numpy random function to produce dask.array random function

    extra_chunks should be a chunks tuple to append to the end of chunks
    """
    if size is not None and not isinstance(size, (tuple, list)):
        size = (size,)

    shapes = list(
        {
            ar.shape
            for ar in chain(args, kwargs.values())
            if isinstance(ar, (Array, np.ndarray))
        }
    )
    if size is not None:
        shapes.append(size)
    # broadcast to the final size(shape)
    size = broadcast_shapes(*shapes)
    chunks = normalize_chunks(
        chunks,
        size,  # ideally would use dtype here
        dtype=kwargs.get("dtype", np.float64),
    )
    slices = slices_from_chunks(chunks)

    def _broadcast_any(ar, shape, chunks):
        if isinstance(ar, Array):
            return broadcast_to(ar, shape).rechunk(chunks)
        if isinstance(ar, np.ndarray):
            return np.ascontiguousarray(np.broadcast_to(ar, shape))

    # Broadcast all arguments, get tiny versions as well
    # Start adding the relevant bits to the graph
    dsk = {}
    lookup = {}
    small_args = []
    dependencies = []
    for i, ar in enumerate(args):
        if isinstance(ar, (np.ndarray, Array)):
            res = _broadcast_any(ar, size, chunks)
            if isinstance(res, Array):
                dependencies.append(res)
                lookup[i] = res.name
            elif isinstance(res, np.ndarray):
                name = f"array-{tokenize(res)}"
                lookup[i] = name
                dsk[name] = res
            small_args.append(ar[tuple(0 for _ in ar.shape)])
        else:
            small_args.append(ar)

    small_kwargs = {}
    for key, ar in kwargs.items():
        if isinstance(ar, (np.ndarray, Array)):
            res = _broadcast_any(ar, size, chunks)
            if isinstance(res, Array):
                dependencies.append(res)
                lookup[key] = res.name
            elif isinstance(res, np.ndarray):
                name = f"array-{tokenize(res)}"
                lookup[key] = name
                dsk[name] = res
            small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
        else:
            small_kwargs[key] = ar

    sizes = list(product(*chunks))
    seeds = random_state_data(len(sizes), self._numpy_state)
    token = tokenize(seeds, size, chunks, args, kwargs)
    name = f"{funcname}-{token}"

    keys = product(
        [name], *([range(len(bd)) for bd in chunks] + [[0]] * len(extra_chunks))
    )
    blocks = product(*[range(len(bd)) for bd in chunks])

    vals = []
    for seed, size, slc, block in zip(seeds, sizes, slices, blocks):
        arg = []
        for i, ar in enumerate(args):
            if i not in lookup:
                arg.append(ar)
            else:
                if isinstance(ar, Array):
                    arg.append((lookup[i],) + block)
                else:  # np.ndarray
                    arg.append((getitem, lookup[i], slc))
        kwrg = {}
        for k, ar in kwargs.items():
            if k not in lookup:
                kwrg[k] = ar
            else:
                if isinstance(ar, Array):
                    kwrg[k] = (lookup[k],) + block
                else:  # np.ndarray
                    kwrg[k] = (getitem, lookup[k], slc)
        vals.append(
            (_apply_random, self._RandomState, funcname, seed, size, arg, kwrg)
        )

    meta = _apply_random(
        self._RandomState,
        funcname,
        seed,
        (0,) * len(size),
        small_args,
        small_kwargs,
    )

    dsk.update(dict(zip(keys, vals)))

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=dependencies)
    return Array(graph, name, chunks + extra_chunks, meta=meta)
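# Hedged illustration of what ``_wrap`` provides at the public dask.array
# level: every output chunk is generated from its own seed produced by
# ``random_state_data``, so a given seed, shape, and chunking always rebuild
# the same array. (``da.random.RandomState`` is the legacy seeded interface.)
import dask.array as da

x = da.random.RandomState(5).normal(10, 1, size=(20, 20), chunks=(5, 20))
y = da.random.RandomState(5).normal(10, 1, size=(20, 20), chunks=(5, 20))
assert (x.compute() == y.compute()).all()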
def choice(self, a, size=None, replace=True, p=None, chunks="auto"):
    dependencies = []
    # Normalize and validate `a`
    if isinstance(a, Integral):
        # On windows the output dtype differs if p is provided or
        # absent, see https://github.com/numpy/numpy/issues/9867
        dummy_p = np.array([1]) if p is not None else p
        dtype = np.random.choice(1, size=(), p=dummy_p).dtype
        len_a = a
        if a < 0:
            raise ValueError("a must be greater than 0")
    else:
        a = asarray(a)
        a = a.rechunk(a.shape)
        dtype = a.dtype
        if a.ndim != 1:
            raise ValueError("a must be one dimensional")
        len_a = len(a)
        dependencies.append(a)
        a = a.__dask_keys__()[0]

    # Normalize and validate `p`
    if p is not None:
        if not isinstance(p, Array):
            # If p is not a dask array, first check the sum is close
            # to 1 before converting.
            p = np.asarray(p)
            if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                raise ValueError("probabilities do not sum to 1")
            p = asarray(p)
        else:
            p = p.rechunk(p.shape)

        if p.ndim != 1:
            raise ValueError("p must be one dimensional")
        if len(p) != len_a:
            raise ValueError("a and p must have the same size")

        dependencies.append(p)
        p = p.__dask_keys__()[0]

    if size is None:
        size = ()
    elif not isinstance(size, (tuple, list)):
        size = (size,)

    chunks = normalize_chunks(chunks, size, dtype=np.float64)

    if not replace and len(chunks[0]) > 1:
        err_msg = ("replace=False is not currently supported for "
                   "dask.array.choice with multi-chunk output "
                   "arrays")
        raise NotImplementedError(err_msg)

    sizes = list(product(*chunks))
    state_data = random_state_data(len(sizes), self._numpy_state)

    name = "da.random.choice-%s" % tokenize(state_data, size, chunks, a, replace, p)
    keys = product([name], *(range(len(bd)) for bd in chunks))
    dsk = {
        k: (_choice, state, a, size, replace, p)
        for k, state, size in zip(keys, state_data, sizes)
    }

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=dependencies)
    return Array(graph, name, chunks, dtype=dtype)
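# Hedged usage sketch for the public counterpart of ``choice`` above:
# module-level ``da.random.choice`` draws each output chunk with its own
# per-chunk state, and ``p`` may be a NumPy or dask array of probabilities.
import numpy as np
import dask.array as da

p = np.array([0.1, 0.1, 0.1, 0.1, 0.6])
x = da.random.choice(5, size=(10, 10), chunks=(5, 5), p=p)
sample = x.compute()  # values in {0, ..., 4}, biased toward 4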
def __call__(self, part):
    divisions, state_data = part
    if isinstance(state_data, int):
        state_data = random_state_data(1, state_data)
    return make_timeseries_part(
        divisions[0], divisions[1], self.dtypes, self.freq, state_data, self.kwargs
    )
def make_timeseries(
    start="2000-01-01",
    end="2000-12-31",
    dtypes={"name": str, "id": int, "x": float, "y": float},
    freq="10s",
    partition_freq="1M",
    seed=None,
    **kwargs,
):
    """Create timeseries dataframe with random data

    Parameters
    ----------
    start: datetime (or datetime-like string)
        Start of time series
    end: datetime (or datetime-like string)
        End of time series
    dtypes: dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}
    freq: string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq: string
        String like '1M' or '2Y' to divide the dataframe into partitions
    seed: int (optional)
        Randomstate seed
    kwargs:
        Keywords to pass down to individual column creation functions.
        Keywords should be prefixed by the column name and then an underscore.

    Examples
    --------
    >>> import dask.dataframe as dd
    >>> df = dd.demo.make_timeseries('2000', '2010',
    ...                              {'value': float, 'name': str, 'id': int},
    ...                              freq='2H', partition_freq='1D', seed=1)
    >>> df.head()  # doctest: +SKIP
                           id      name     value
    2000-01-01 00:00:00   969     Jerry -0.309014
    2000-01-01 02:00:00  1010       Ray -0.760675
    2000-01-01 04:00:00  1016  Patricia -0.063261
    2000-01-01 06:00:00   960   Charlie  0.788245
    2000-01-01 08:00:00  1031     Kevin  0.466002
    """
    divisions = list(pd.date_range(start=start, end=end, freq=partition_freq))
    npartitions = len(divisions) - 1
    if seed is None:
        # Get random integer seed for each partition. We can
        # call `random_state_data` in `MakeTimeseriesPart`
        state_data = np.random.randint(2e9, size=npartitions)
    else:
        state_data = random_state_data(npartitions, seed)

    # Build parts
    parts = []
    for i in range(len(divisions) - 1):
        parts.append((divisions[i:i + 2], state_data[i]))

    # Construct the output collection with from_map
    return from_map(
        MakeTimeseriesPart(dtypes, freq, kwargs),
        parts,
        meta=make_timeseries_part("2000", "2000", dtypes, "1H", state_data[0], kwargs),
        divisions=divisions,
        label="make-timeseries",
        token=tokenize(start, end, dtypes, freq, partition_freq, state_data),
        enforce_metadata=False,
    )
def random_split(x, p_test=0.1, random_state=None):
    """Approximately split a dask collection into train/test data.

    Parameters
    ----------
    x : da.Array, db.Bag, or dm.Matrix
        The dask collection to split
    p_test : float, optional
        The fraction of samples to use in the test set. Default is 0.1.
    random_state : int or RandomState, optional
        The ``RandomState`` or seed to use when performing the random split.
    """
    if not 0 < p_test < 1:
        raise ValueError("p_test must be in (0, 1)")

    random_state = _check_random_state(random_state)
    token = tokenize(x, p_test, random_state.get_state())
    names = ['random-split-test-' + token,
             'random-split-train-' + token]

    if isinstance(x, da.Array):
        x, x_keys = _as_tall_skinny_and_keys(x)
        chunks = np.array(x.chunks[0])
        seeds = random_state_data(len(chunks) + 1, random_state)
        n_test = np.random.RandomState(seeds[0]).binomial(chunks, p_test)
        n_train = chunks - n_test
        dsks = [dict(((name,) + k[1:], (arr_split, k, n, b, s))
                     for k, n, s in zip(x_keys, n_test, seeds[1:]))
                for name, b in zip(names, [True, False])]
        test = da.Array(merge(dsks[0], x.dask), names[0],
                        (tuple(n_test),) + x.chunks[1:], x.dtype)
        train = da.Array(merge(dsks[1], x.dask), names[1],
                         (tuple(n_train),) + x.chunks[1:], x.dtype)
    elif isinstance(x, (db.Bag, dm.Matrix)):
        seeds = random_state_data(x.npartitions, random_state)
        split = bag_split if isinstance(x, db.Bag) else mat_split
        dsks = [dict(((name, k[1]), (split, k, p_test, b, s))
                     for k, s in zip(x._keys(), seeds))
                for name, b in zip(names, [True, False])]
        if isinstance(x, dm.Matrix):
            if x.ndim is not None:
                shape = (None,) if x.ndim == 1 else (None, x.shape[1])
            else:
                shape = None
            test = dm.Matrix(merge(dsks[0], x.dask), names[0],
                             x.npartitions, dtype=x.dtype, shape=shape)
            train = dm.Matrix(merge(dsks[1], x.dask), names[1],
                              x.npartitions, dtype=x.dtype, shape=shape)
        else:
            test = db.Bag(merge(dsks[0], x.dask), names[0], x.npartitions)
            train = db.Bag(merge(dsks[1], x.dask), names[1], x.npartitions)
    else:
        raise TypeError("Expected an instance of ``da.Array``, ``db.Bag``, or "
                        "``dm.Matrix`` - got {0}".format(type(x).__name__))
    return train, test
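# Hedged usage sketch, assuming the old dask-learn (``dklearn``) module that
# defines ``random_split`` above is importable: split a chunked dask array into
# roughly 90/10 train/test parts. The split is approximate -- each chunk's test
# count is drawn from a binomial distribution, so exact sizes vary with the
# seed, but every element ends up in exactly one of the two outputs.
import dask.array as da

x = da.arange(1000, chunks=100)
train, test = random_split(x, p_test=0.1, random_state=0)
assert train.shape[0] + test.shape[0] == 1000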