def test_matching_blocks_raises(arrays):
    """Check that ``check_matching_blocks`` rejects ``arrays`` with ValueError.

    ``arrays`` is supplied by the test harness (fixture or parametrization
    defined outside this view) and is unpacked into positional arguments.
    """
    expected_error = ValueError
    with pytest.raises(expected_error):
        check_matching_blocks(*arrays)
def test_matching_blocks_ok(arrays):
    """Check that ``check_matching_blocks`` accepts ``arrays`` without raising.

    ``arrays`` is supplied by the test harness (fixture or parametrization
    defined outside this view) and is unpacked into positional arguments.
    The test passes as long as the call completes.
    """
    check_matching_blocks(*arrays)
def train_test_split(*arrays, **options):
    """Split arrays into random train and test matrices.

    Parameters
    ----------
    *arrays : Sequence of Dask Arrays, DataFrames, or Series
        If the inputs are neither all dask DataFrames/Series nor all dask
        Arrays, a warning is logged and the call falls back to
        scikit-learn's ``train_test_split``.
    test_size : float or int, default 0.1
        The default of 0.1 is only applied when both ``test_size`` and
        ``train_size`` are omitted.
    train_size : float or int, optional
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    shuffle : bool, default True
        Whether to shuffle the data before splitting. ``shuffle=False`` is
        not supported and raises ``NotImplementedError``.
    blockwise : bool, optional.
        Whether to shuffle data only within blocks (True), or allow data to
        be shuffled between blocks (False). Shuffling between blocks can be
        much more expensive, especially in distributed environments.

        For Dask Arrays the default is True (data are not shuffled between
        blocks). For Dask DataFrames and Series this option is accepted but
        has no effect: the split is delegated to ``random_split``.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs

    Raises
    ------
    TypeError
        If unexpected keyword arguments are passed.
    NotImplementedError
        If ``shuffle`` is falsy.
    ValueError
        If the inputs are dask DataFrames/Series and ``train_size`` or
        ``test_size`` is an integer (an absolute sample count).

    Examples
    --------
    >>> import dask.array as da
    >>> from dask_ml.datasets import make_regression

    >>> X, y = make_regression(n_samples=125, n_features=4, chunks=50,
    ...                        random_state=0)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=0)
    >>> X_train
    dask.array<concatenate, shape=(113, 4), dtype=float64, chunksize=(45, 4)>
    >>> X_train.compute()[:2]
    array([[ 0.12372191,  0.58222459,  0.92950511, -2.09460307],
           [ 0.99439439, -0.70972797, -0.27567053,  1.73887268]])
    """
    # Imported locally so the function does not rely on the module's import
    # block, which is outside the visible part of the file.
    import numbers

    test_size = options.pop("test_size", None)
    train_size = options.pop("train_size", None)
    random_state = options.pop("random_state", None)
    shuffle = options.pop("shuffle", True)
    blockwise = options.pop("blockwise", None)

    if train_size is None and test_size is None:
        # All other validation is done elsewhere.
        test_size = 0.1

    if options:
        raise TypeError("Unexpected options {}".format(options))
    if not shuffle:
        raise NotImplementedError(
            "'shuffle=False' is not currently supported.")

    if all(isinstance(arr, (dd.Series, dd.DataFrame)) for arr in arrays):
        check_matching_blocks(*arrays)

        # ``random_split`` is given one fraction per output, so absolute
        # (integer) sizes cannot be honoured on this path.
        if any(isinstance(size, numbers.Integral)
               for size in (train_size, test_size)):
            raise ValueError(
                "Dask DataFrames and Series only support fractional "
                "'train_size' and 'test_size'. Got train_size={!r}, "
                "test_size={!r}.".format(train_size, test_size))

        # Derive the missing fraction; previously ``None`` was forwarded to
        # ``random_split`` whenever only one of the two sizes was known.
        if train_size is None:
            train_size = 1 - test_size
        elif test_size is None:
            test_size = 1 - train_size

        rng = check_random_state(random_state)
        # A single integer seed is drawn and reused for every input.
        rng = draw_seed(rng, 0, 2**32 - 1, dtype="uint")
        return list(
            itertools.chain.from_iterable(
                arr.random_split([train_size, test_size], random_state=rng)
                for arr in arrays))

    elif all(isinstance(arr, da.Array) for arr in arrays):
        if blockwise is None:
            blockwise = True

        splitter = ShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=train_size,
            blockwise=blockwise,
            random_state=random_state,
        )
        # One split is computed and applied to every input array.
        train_idx, test_idx = next(splitter.split(*arrays))

        train_test_pairs = ((_blockwise_slice(arr, train_idx),
                             _blockwise_slice(arr, test_idx))
                            for arr in arrays)

        return list(itertools.chain.from_iterable(train_test_pairs))

    else:
        logger.warning(
            "Mixture of types in 'arrays'. Falling back to scikit-learn.")
        return ms.train_test_split(*arrays,
                                   test_size=test_size,
                                   train_size=train_size,
                                   random_state=random_state,
                                   shuffle=shuffle)
def train_test_split(
    *arrays,
    test_size=None,
    train_size=None,
    random_state=None,
    shuffle=None,
    blockwise=None,
    convert_mixed_types=False,
    **options,
):
    """Split arrays into random train and test matrices.

    Parameters
    ----------
    *arrays : Sequence of Dask Arrays, DataFrames, or Series
        Non-dask objects will be passed through to
        :func:`sklearn.model_selection.train_test_split`.
    test_size : float or int, default 0.1
        The default of 0.1 is only applied when both ``test_size`` and
        ``train_size`` are omitted. When exactly one fractional size is
        given, the other is set to its complement. Integer sizes are
        absolute sample counts and are forwarded unchanged.
    train_size : float or int, optional
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    shuffle : bool, default None
        Whether to shuffle the data before splitting. When None, dask Arrays
        use True (``shuffle=False`` is not supported for them). For dask
        DataFrames the default depends on the dask version: False with a
        ``FutureWarning`` when ``DASK_2130`` is set, True otherwise.
    blockwise : bool, optional.
        Whether to shuffle data only within blocks (True), or allow data to
        be shuffled between blocks (False). Shuffling between blocks can be
        much more expensive, especially in distributed environments.

        For Dask Arrays the default is True (data are not shuffled between
        blocks). For Dask DataFrames ``blockwise=False`` is not supported
        and raises ``NotImplementedError``.
    convert_mixed_types : bool, default False
        Whether to convert dask DataFrames and Series to dask Arrays when
        arrays contains a mixture of types. This results in some computation
        to determine the length of each block.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs

    Raises
    ------
    TypeError
        If unexpected keyword arguments are passed, or dask Arrays are mixed
        with dask DataFrames/Series without ``convert_mixed_types=True``.
    NotImplementedError
        For unsupported ``shuffle`` / ``blockwise`` combinations.
    ValueError
        If the inputs are dask DataFrames/Series and ``train_size`` or
        ``test_size`` is an integer (an absolute sample count).

    Examples
    --------
    >>> import dask.array as da
    >>> from dask_ml.datasets import make_regression

    >>> X, y = make_regression(n_samples=125, n_features=4, chunks=50,
    ...                        random_state=0)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=0)
    >>> X_train
    dask.array<concatenate, shape=(113, 4), dtype=float64, chunksize=(45, 4)>
    >>> X_train.compute()[:2]
    array([[ 0.12372191,  0.58222459,  0.92950511, -2.09460307],
           [ 0.99439439, -0.70972797, -0.27567053,  1.73887268]])
    """
    # Imported locally so the function does not rely on the module's import
    # block, which is outside the visible part of the file.
    import numbers

    if train_size is None and test_size is None:
        # All other validation is done elsewhere.
        test_size = 0.1

    # Fill in the complementary fraction. Integer sizes are absolute sample
    # counts, for which ``1 - size`` would produce a meaningless negative
    # value, so they are left for the downstream splitter to interpret.
    if train_size is None and not isinstance(test_size, numbers.Integral):
        train_size = 1 - test_size
    elif test_size is None and not isinstance(train_size, numbers.Integral):
        test_size = 1 - train_size

    if options:
        raise TypeError("Unexpected options {}".format(options))

    types = {type(arr) for arr in arrays}

    if da.Array in types and types & {dd.Series, dd.DataFrame}:
        if convert_mixed_types:
            arrays = tuple(
                x.to_dask_array(lengths=True)
                if isinstance(x, (dd.Series, dd.DataFrame)) else x
                for x in arrays)
        else:
            raise TypeError(
                "Got mixture of dask DataFrames and Arrays. Specify "
                "'convert_mixed_types=True'")

    if all(isinstance(arr, (dd.Series, dd.DataFrame)) for arr in arrays):
        check_matching_blocks(*arrays)
        if blockwise is False:
            raise NotImplementedError(
                "'blockwise=False' is not currently supported for dask DataFrames."
            )

        # ``random_split`` is given one fraction per output, so absolute
        # (integer) sizes cannot be honoured on this path.
        if any(isinstance(size, numbers.Integral)
               for size in (train_size, test_size)):
            raise ValueError(
                "Dask DataFrames and Series only support fractional "
                f"'train_size' and 'test_size'. Got train_size={train_size!r}, "
                f"test_size={test_size!r}.")

        rng = check_random_state(random_state)
        # A single integer seed is drawn and reused for every input.
        rng = draw_seed(rng, 0, _I4MAX, dtype="uint")
        if DASK_2130:
            if shuffle is None:
                shuffle = False
                warnings.warn(
                    message="The default value for 'shuffle' must be specified"
                    " when splitting DataFrames. In the future"
                    " DataFrames will automatically be shuffled within"
                    " blocks prior to splitting. Specify 'shuffle=True'"
                    " to adopt the future behavior now, or 'shuffle=False'"
                    " to retain the previous behavior.",
                    category=FutureWarning,
                )
            kwargs = {"shuffle": shuffle}
        else:
            if shuffle is None:
                shuffle = True
            if not shuffle:
                raise NotImplementedError(
                    f"'shuffle=False' is not supported for DataFrames in"
                    f" dask versions<2.13.0. Current version is {DASK_VERSION}."
                )
            # On this path ``shuffle`` is not forwarded to ``random_split``.
            kwargs = {}
        return list(
            itertools.chain.from_iterable(
                arr.random_split(
                    [train_size, test_size], random_state=rng, **kwargs)
                for arr in arrays))

    elif all(isinstance(arr, da.Array) for arr in arrays):
        if shuffle is None:
            shuffle = True
        if not shuffle:
            raise NotImplementedError(
                "'shuffle=False' is not currently supported for dask Arrays.")
        if blockwise is None:
            blockwise = True

        splitter = ShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=train_size,
            blockwise=blockwise,
            random_state=random_state,
        )
        # One split is computed and applied to every input array.
        train_idx, test_idx = next(splitter.split(*arrays))

        train_test_pairs = ((_blockwise_slice(arr, train_idx),
                             _blockwise_slice(arr, test_idx))
                            for arr in arrays)

        return list(itertools.chain.from_iterable(train_test_pairs))

    else:
        return ms.train_test_split(
            *arrays,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state,
            shuffle=shuffle,
        )
def train_test_split(
    *arrays,
    test_size=None,
    train_size=None,
    stratify=None,
    classes=None,
    random_state=None,
    shuffle=None,
    blockwise=None,
    convert_mixed_types=False,
    **options,
):
    """Split arrays into random train and test matrices.

    Parameters
    ----------
    *arrays : Sequence of Dask Arrays, DataFrames, or Series
        Non-dask objects will be passed through to
        :func:`sklearn.model_selection.train_test_split`.
    test_size : float or int, default 0.1
        The default of 0.1 is only applied when both ``test_size`` and
        ``train_size`` are omitted. When exactly one fractional size is
        given, the other is set to its complement. Integer sizes are
        absolute sample counts and are forwarded unchanged.
    train_size : float or int, optional
    stratify : Dask Array or Series, optional (default=None)
        If all *arrays are non-dask objects, stratify will be passed through
        to :func:`sklearn.model_selection.train_test_split`.
        If not None, data is split in a stratified fashion, using this as
        the class labels.
    classes : non-dask array-like object, optional (default=None)
        If stratify is not None and any of *arrays is a dask object, this is
        required. This contains the unique class labels in `stratify`.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    shuffle : bool, default None
        Whether to shuffle the data before splitting. When None, dask Arrays
        use True (``shuffle=False`` is not supported for them). For dask
        DataFrames the default depends on the dask version: False with a
        ``FutureWarning`` when ``DASK_2130`` is set, True otherwise.
    blockwise : bool, optional.
        Whether to shuffle data only within blocks (True), or allow data to
        be shuffled between blocks (False). Shuffling between blocks can be
        much more expensive, especially in distributed environments.

        For Dask Arrays the default is True (data are not shuffled between
        blocks). For Dask DataFrames ``blockwise=False`` is not supported
        and raises ``NotImplementedError``.
    convert_mixed_types : bool, default False
        Whether to convert dask DataFrames and Series to dask Arrays when
        arrays contains a mixture of types. This results in some computation
        to determine the length of each block.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs

    Raises
    ------
    TypeError
        If unexpected keyword arguments are passed, if dask Arrays are mixed
        with dask DataFrames/Series without ``convert_mixed_types=True``, or
        if ``stratify`` is not a dask object while ``arrays`` contains one.
    ValueError
        If ``stratify`` is used with dask inputs but ``classes`` is missing,
        or if the inputs are dask DataFrames/Series and ``train_size`` or
        ``test_size`` is an integer (an absolute sample count).
    NotImplementedError
        For unsupported ``shuffle`` / ``blockwise`` combinations.

    Examples
    --------
    >>> import dask.array as da
    >>> from dask_ml.datasets import make_regression

    >>> X, y = make_regression(n_samples=125, n_features=4, chunks=50,
    ...                        random_state=0)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ...                                                     random_state=0)
    >>> X_train
    dask.array<concatenate, shape=(113, 4), dtype=float64, chunksize=(45, 4)>
    >>> X_train.compute()[:2]
    array([[ 0.12372191,  0.58222459,  0.92950511, -2.09460307],
           [ 0.99439439, -0.70972797, -0.27567053,  1.73887268]])
    """
    # Imported locally so the function does not rely on the module's import
    # block, which is outside the visible part of the file.
    import numbers

    if train_size is None and test_size is None:
        # All other validation is done elsewhere.
        test_size = 0.1

    # Fill in the complementary fraction. Integer sizes are absolute sample
    # counts, for which ``1 - size`` would produce a meaningless negative
    # value, so they are left for the downstream splitter to interpret.
    if train_size is None and not isinstance(test_size, numbers.Integral):
        train_size = 1 - test_size
    elif test_size is None and not isinstance(train_size, numbers.Integral):
        test_size = 1 - train_size

    if options:
        raise TypeError("Unexpected options {}".format(options))

    types = {type(arr) for arr in arrays}

    if stratify is not None:
        dask_types = (da.Array, dd.Series, dd.DataFrame)
        # The dask-specific requirements only apply when at least one input
        # is a dask object; otherwise everything is handed to scikit-learn.
        if any(isinstance(arr, dask_types) for arr in arrays):
            if not isinstance(stratify, dask_types):
                raise TypeError(
                    "If 'stratify' is not None, it must be an instance of either"
                    " one of dask Array, Series, or DataFrame. "
                    "Got type {} instead".format(type(stratify)))
            if classes is None:
                raise ValueError(
                    "If 'stratify' is not None, 'classes' must be specified")

    if da.Array in types and types & {dd.Series, dd.DataFrame}:
        if convert_mixed_types:
            arrays = tuple(
                x.to_dask_array(lengths=True)
                if isinstance(x, (dd.Series, dd.DataFrame)) else x
                for x in arrays)
        else:
            raise TypeError(
                "Got mixture of dask DataFrames and Arrays. Specify "
                "'convert_mixed_types=True'")

    if all(isinstance(arr, (dd.Series, dd.DataFrame)) for arr in arrays):
        if stratify is not None:
            # Work with a dask Series so it can be used as a boolean mask.
            if isinstance(stratify, da.Array):
                stratify = dd.from_dask_array(stratify)
            # Unpack so that every input AND the labels are compared; a
            # single list argument would not be checked element-wise.
            check_matching_blocks(*arrays, stratify)
        else:
            check_matching_blocks(*arrays)

        if blockwise is False:
            raise NotImplementedError(
                "'blockwise=False' is not currently supported for dask DataFrames."
            )

        # ``random_split`` is given one fraction per output, so absolute
        # (integer) sizes cannot be honoured on this path.
        if any(isinstance(size, numbers.Integral)
               for size in (train_size, test_size)):
            raise ValueError(
                "Dask DataFrames and Series only support fractional "
                f"'train_size' and 'test_size'. Got train_size={train_size!r}, "
                f"test_size={test_size!r}.")

        rng = check_random_state(random_state)
        # A single integer seed is drawn and reused for every input.
        rng = draw_seed(rng, 0, _I4MAX, dtype="uint")
        if DASK_2130:
            if shuffle is None:
                shuffle = False
                warnings.warn(
                    message="The default value for 'shuffle' must be specified"
                    " when splitting DataFrames. In the future"
                    " DataFrames will automatically be shuffled within"
                    " blocks prior to splitting. Specify 'shuffle=True'"
                    " to adopt the future behavior now, or 'shuffle=False'"
                    " to retain the previous behavior.",
                    category=FutureWarning,
                )
            kwargs = {"shuffle": shuffle}
        else:
            if shuffle is None:
                shuffle = True
            if not shuffle:
                raise NotImplementedError(
                    f"'shuffle=False' is not supported for DataFrames in"
                    f" dask versions<2.13.0. Current version is {DASK_VERSION}."
                )
            # On this path ``shuffle`` is not forwarded to ``random_split``.
            kwargs = {}

        if stratify is not None:
            train_test_pairs = []
            for arr in arrays:
                # Class-wise train/test pieces of this input.
                arr_train_slices = []
                arr_test_slices = []
                for ci in classes:
                    # Rows of this input that belong to class ``ci``.
                    ci_arr = arr[stratify == ci]
                    arr_train, arr_test = ci_arr.random_split(
                        [train_size, test_size],
                        random_state=rng,
                        **kwargs,
                    )
                    arr_train_slices.append(arr_train)
                    arr_test_slices.append(arr_test)
                # Stitch the per-class pieces back into one train and one
                # test collection for this input.
                train_test_pairs.append(
                    [dd.concat(arr_train_slices),
                     dd.concat(arr_test_slices)])
        else:
            train_test_pairs = [
                arr.random_split([train_size, test_size],
                                 random_state=rng,
                                 **kwargs) for arr in arrays
            ]

        return list(itertools.chain.from_iterable(train_test_pairs))

    elif all(isinstance(arr, da.Array) for arr in arrays):
        if shuffle is None:
            shuffle = True
        if not shuffle:
            raise NotImplementedError(
                "'shuffle=False' is not currently supported for dask Arrays.")
        if blockwise is None:
            blockwise = True

        splitter = ShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=train_size,
            blockwise=blockwise,
            random_state=random_state,
        )

        if stratify is not None:
            # The mask must be a dask Array here.
            if not isinstance(stratify, da.Array):
                stratify = stratify.to_dask_array(lengths=True)
            # Must be 1-d for indexing.
            stratify = stratify.ravel()

            # NOTE(review): a fresh split is drawn for every (array, class)
            # pair. The inputs presumably only stay row-aligned when
            # ``random_state`` is a fixed seed -- confirm against
            # ``ShuffleSplit.split``.
            train_test_pairs = []
            for arr in arrays:
                # Class-wise train/test pieces of this input.
                arr_train_slices = []
                arr_test_slices = []
                for ci in classes:
                    # Rows of this input that belong to class ``ci``.
                    ci_arr = arr[stratify == ci]
                    # FIXME: can chunks be determined lazily?
                    ci_arr.compute_chunk_sizes()
                    train_idx, test_idx = next(splitter.split(ci_arr))
                    arr_train_slices.append(
                        _blockwise_slice(ci_arr, train_idx))
                    arr_test_slices.append(
                        _blockwise_slice(ci_arr, test_idx))
                # Stitch the per-class pieces back into one train and one
                # test array for this input.
                train_test_pairs.append([
                    da.concatenate(arr_train_slices),
                    da.concatenate(arr_test_slices),
                ])
        else:
            # One split is computed and applied to every input array.
            train_idx, test_idx = next(splitter.split(*arrays))
            train_test_pairs = ((_blockwise_slice(arr, train_idx),
                                 _blockwise_slice(arr, test_idx))
                                for arr in arrays)

        return list(itertools.chain.from_iterable(train_test_pairs))

    else:
        return ms.train_test_split(
            *arrays,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state,
            stratify=stratify,
            shuffle=shuffle,
        )