Пример #1
0
def test_array_init_bad(input_type, dtype, shape, order):
    """
    Check that CumlArray raises an AssertionError when `dtype`, `shape` or
    `order` is passed on its own together with an array-like input.
    """
    from_series = input_type == 'series'
    if from_series and dtype == np.float16:
        pytest.skip("Skipping due to cuDF issue #9065")

    # Series inputs are always generated with 'C' order; every other input
    # type honours the requested order.
    inp = create_input(input_type, dtype, shape, 'C' if from_series else order)

    # Ensure the array is creatable
    reference = CumlArray(inp)

    # Each callable passes exactly one metadata argument. They are evaluated
    # lazily so that everything runs inside the `pytest.raises` context.
    rejected_constructions = (
        lambda: CumlArray(inp, dtype=reference.dtype),
        lambda: CumlArray(inp, shape=reference.shape),
        lambda: CumlArray(inp,
                          order=_strides_to_order(reference.strides,
                                                  reference.dtype)),
    )
    for construct in rejected_constructions:
        with pytest.raises(AssertionError):
            construct()

    assert cp.all(cp.asarray(inp) == cp.asarray(reference))
Пример #2
0
    def __init__(self,
                 data=None,
                 owner=None,
                 dtype=None,
                 shape=None,
                 order=None):
        """
        Build the array from explicit metadata or from an array-like object.

        When `dtype`, `shape` and `order` are all given they become the
        array metadata. Otherwise the metadata is read from the
        `__array_interface__` or `__cuda_array_interface__` of `data`.

        Parameters
        ----------
        data : object
            Source of the data. A `memoryview` is first converted with
            `np.asarray`. `None` is rejected.
        owner : object, optional
            Forwarded unchanged to the base class constructor.
        dtype, shape, order : optional
            Explicit metadata. All three are required when
            `_check_low_level_type(data)` is true.

        Raises
        ------
        TypeError
            If `data` is None, if a low-level `data` is given without all of
            `dtype`, `shape` and `order`, or if the metadata is incomplete
            and `data` exposes neither array interface.
        """
        # Checks of parameters
        if data is None:
            raise TypeError("To create an empty Array, use the class method" +
                            " Array.empty().")
        if isinstance(data, memoryview):
            data = np.asarray(data)

        if dtype is not None:
            dtype = np.dtype(dtype)

        metadata_complete = not (dtype is None or shape is None
                                 or order is None)
        if _check_low_level_type(data) and not metadata_complete:
            raise TypeError(
                "Need to specify dtype, shape and order when" +
                " creating an Array from {}.".format(type(data)))
        detailed_construction = metadata_complete

        # Base class (Buffer) constructor call
        size, shape = _get_size_from_shape(shape, dtype)
        super(CumlArray, self).__init__(data=data, owner=owner, size=size)

        # Explicit metadata wins over anything the object advertises
        if detailed_construction:
            self.shape = shape
            self.dtype = dtype
            self.order = order
            self.strides = _order_to_strides(order, shape, dtype)
            return

        if hasattr(data, "__array_interface__"):
            interface = data.__array_interface__
        elif hasattr(data, "__cuda_array_interface__"):
            interface = data.__cuda_array_interface__
        else:
            raise TypeError("Unrecognized data type: %s" % str(type(data)))

        if not interface:
            return

        self.shape = interface['shape']
        self.dtype = np.dtype(interface['typestr'])
        advertised_strides = interface.get('strides', None)
        if advertised_strides is None:
            # No strides in the interface: treat the data as 'C' ordered
            self.order = 'C'
            self.strides = _order_to_strides(self.order, self.shape,
                                             self.dtype)
        else:
            self.strides = advertised_strides
            self.order = _strides_to_order(self.strides, self.dtype)
Пример #3
0
def _stratify_split(X, stratify, labels, n_train, n_test, x_numba, y_numba,
                    random_state):
    """
    Perform a stratified split of X and labels based on the stratify column.
    Based on scikit-learn stratified split implementation.

    Parameters
    ----------
    X: Shuffled input data (cudf.DataFrame or cuda_array_interface
        compliant array)
    stratify: column to be stratified on (cudf.Series, single-column
        cudf.DataFrame or cuda_array_interface compliant array)
    labels: Shuffled labels (cudf.Series, single-column cudf.DataFrame or
        cuda_array_interface compliant array)
    n_train: Number of samples in train set
    n_test: number of samples in test set
    x_numba: Determines whether the data should be converted to numba
    y_numba: Determines whether the labels should be converted to numba
    random_state: CuPy or NumPy RandomState. A NumPy RandomState is replaced
        by a CuPy RandomState seeded from its state.

    Returns
    -------
    X_train, X_test: Data X divided into train and test sets
    y_train, y_test: Labels divided into train and test sets

    Raises
    ------
    ValueError
        If labels or stratify is a DataFrame with more than one column, if
        any class has fewer than 2 members, or if n_train is smaller than
        the number of classes.
    """
    x_cudf = False
    labels_cudf = False

    if isinstance(X, cudf.DataFrame):
        x_cudf = True
    elif hasattr(X, "__cuda_array_interface__"):
        X = cp.asarray(X)
        x_order = _strides_to_order(X.__cuda_array_interface__['strides'],
                                    cp.dtype(X.dtype))

    # labels and stratify will be only cp arrays
    if isinstance(labels, cudf.Series):
        labels_cudf = True
        labels = labels.values
    elif hasattr(labels, "__cuda_array_interface__"):
        labels = cp.asarray(labels)
    elif isinstance(labels, cudf.DataFrame):
        # ensuring it has just one column
        if labels.shape[1] != 1:
            raise ValueError('Expected one column for labels, but found df '
                             'with shape = %s' % (labels.shape,))
        labels_cudf = True
        # Select the single column by position so that any column name works
        labels = labels.iloc[:, 0].values

    labels_order = _strides_to_order(
                        labels.__cuda_array_interface__['strides'],
                        cp.dtype(labels.dtype))

    # Converting to cupy array removes the need to add an if-else block
    # for stratify column
    if isinstance(stratify, cudf.Series):
        stratify = stratify.values
    elif hasattr(stratify, "__cuda_array_interface__"):
        stratify = cp.asarray(stratify)
    elif isinstance(stratify, cudf.DataFrame):
        # ensuring it has just one column
        if stratify.shape[1] != 1:
            raise ValueError('Expected one column, but found column '
                             'with shape = %s' % (stratify.shape,))
        # Select the single column by position so that any column name works
        stratify = stratify.iloc[:, 0].values

    classes, stratify_indices = cp.unique(stratify, return_inverse=True)

    n_classes = classes.shape[0]
    class_counts = cp.bincount(stratify_indices)
    if cp.min(class_counts) < 2:
        raise ValueError("The least populated class in y has only 1"
                         " member, which is too few. The minimum"
                         " number of groups for any class cannot"
                         " be less than 2.")

    if n_train < n_classes:
        raise ValueError('The train_size = %d should be greater or '
                         'equal to the number of classes = %d' % (n_train,
                                                                  n_classes))

    # Sorted sample positions, cut at the cumulative class counts so that
    # class_indices[i] holds exactly the positions belonging to class i
    class_indices = cp.split(cp.argsort(stratify_indices),
                             cp.cumsum(class_counts)[:-1].tolist())

    X_train = None

    # random_state won't be None or int, that's handled earlier
    if isinstance(random_state, np.random.RandomState):
        random_state = cp.random.RandomState(seed=random_state.get_state()[1])

    # Break ties
    n_i = _approximate_mode(class_counts, n_train, random_state)
    class_counts_remaining = class_counts - n_i
    t_i = _approximate_mode(class_counts_remaining, n_test, random_state)

    for i in range(n_classes):
        permutation = random_state.permutation(class_counts[i].item())
        perm_indices_class_i = class_indices[i].take(permutation)

        # First n_i[i] shuffled members go to train, the next t_i[i] to test
        train_idx = perm_indices_class_i[:n_i[i]]
        test_idx = perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]

        y_train_i = cp.array(labels[train_idx], order=labels_order)
        y_test_i = cp.array(labels[test_idx], order=labels_order)
        if hasattr(X, "__cuda_array_interface__") or \
           isinstance(X, cupyx.scipy.sparse.csr_matrix):

            X_train_i = cp.array(X[train_idx], order=x_order)
            X_test_i = cp.array(X[test_idx], order=x_order)

            if X_train is None:
                X_train = cp.array(X_train_i, order=x_order)
                y_train = cp.array(y_train_i, order=labels_order)
                X_test = cp.array(X_test_i, order=x_order)
                y_test = cp.array(y_test_i, order=labels_order)
            else:
                X_train = cp.concatenate([X_train, X_train_i], axis=0)
                X_test = cp.concatenate([X_test, X_test_i], axis=0)
                y_train = cp.concatenate([y_train, y_train_i], axis=0)
                y_test = cp.concatenate([y_test, y_test_i], axis=0)

        elif x_cudf:
            X_train_i = X.iloc[train_idx]
            X_test_i = X.iloc[test_idx]

            if X_train is None:
                X_train = X_train_i
                y_train = y_train_i
                X_test = X_test_i
                y_test = y_test_i
            else:
                X_train = cudf.concat([X_train, X_train_i], ignore_index=False)
                X_test = cudf.concat([X_test, X_test_i], ignore_index=False)
                y_train = cp.concatenate([y_train, y_train_i], axis=0)
                y_test = cp.concatenate([y_test, y_test_i], axis=0)

    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)
    elif x_cudf:
        X_train = cudf.DataFrame(X_train)
        X_test = cudf.DataFrame(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)
    elif labels_cudf:
        y_train = cudf.Series(y_train)
        y_test = cudf.Series(y_test)

    return X_train, X_test, y_train, y_test
Пример #4
0
def train_test_split(X,
                     y=None,
                     test_size: Union[float,
                                      int] = None,
                     train_size: Union[float,
                                       int] = None,
                     shuffle: bool = True,
                     random_state: Union[int,
                                         cp.random.RandomState,
                                         np.random.RandomState] = None,
                     stratify=None):
    """
    Partitions device data into four collated objects, mimicking
    Scikit-learn's `train_test_split
    <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html>`_.

    Parameters
    ----------
    X : cudf.DataFrame or cuda_array_interface compliant device array
        Data to split, has shape (n_samples, n_features)
    y : str, cudf.Series or cuda_array_interface compliant device array,
        optional
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X (if it is a cuDF DataFrame)
        containing the labels. When omitted only X is split.
    test_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the test set. If an int, represents the number
        of instances to be assigned to the test set. When omitted it is the
        complement of `train_size`.
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the training set. If an int, represents the number
        of instances to be assigned to the training set. When omitted it is
        the complement of `test_size`; when both are omitted it defaults
        to 0.75
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    random_state : int, CuPy RandomState or NumPy RandomState optional
        If shuffle is true, seeds the generator. Unseeded by default

    stratify: cudf.Series or cuda_array_interface compliant device array,
            optional parameter. When passed, the input is split using this
            as column to stratify on. Only applied when `shuffle` is true.
            Default=None

    Examples
    --------
    .. code-block:: python

        >>> import cudf
        >>> from cuml.model_selection import train_test_split
        >>> # Generate some sample data
        >>> df = cudf.DataFrame({'x': range(10),
        ...                      'y': [0, 1] * 5})
        >>> print(f'Original data: {df.shape[0]} elements')
        Original data: 10 elements
        >>> # Suppose we want an 80/20 split
        >>> X_train, X_test, y_train, y_test = train_test_split(df, 'y',
        ...                                                     train_size=0.8)
        >>> print(f'X_train: {X_train.shape[0]} elements')
        X_train: 8 elements
        >>> print(f'X_test: {X_test.shape[0]} elements')
        X_test: 2 elements
        >>> print(f'y_train: {y_train.shape[0]} elements')
        y_train: 8 elements
        >>> print(f'y_test: {y_test.shape[0]} elements')
        y_test: 2 elements

        >>> # Alternatively, if our labels are stored separately
        >>> labels = df['y']
        >>> df = df.drop(['y'], axis=1)
        >>> # we can also do
        >>> X_train, X_test, y_train, y_test = train_test_split(df, labels,
        ...                                                     train_size=0.8)

    Returns
    -------

    X_train, X_test, y_train, y_test : cudf.DataFrame or array-like objects
        Partitioned dataframes if X and y were cuDF objects. If `y` was
        provided as a column name, the column was dropped from `X`.
        Partitioned numba device arrays if X and y were Numba device arrays.
        Partitioned CuPy arrays for any other input.
        Only `X_train, X_test` are returned when `y` is None.

    Raises
    ------
    TypeError
        If X or y has an unsupported type, or `random_state` is not an int
        or a NumPy / CuPy RandomState.
    ValueError
        If X and y differ in their first dimension, or `train_size` /
        `test_size` is out of range.

    """
    if isinstance(y, str):
        # Use the column with name `str` as y
        if isinstance(X, cudf.DataFrame):
            name = y
            y = X[name]
            X = X.drop(name, axis=1)
        else:
            raise TypeError("X needs to be a cuDF Dataframe when y is a "
                            "string")

    # todo: this check will be replaced with upcoming improvements
    # to input_utils
    #
    if y is not None:
        if not hasattr(X, "__cuda_array_interface__") and not \
                isinstance(X, cudf.DataFrame):
            raise TypeError("X needs to be either a cuDF DataFrame, Series or "
                            "a cuda_array_interface compliant array.")

        if not hasattr(y, "__cuda_array_interface__") and not \
                isinstance(y, cudf.DataFrame):
            raise TypeError("y needs to be either a cuDF DataFrame, Series or "
                            "a cuda_array_interface compliant array.")

        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same first dimension "
                             "(found {} and {})".format(
                                 X.shape[0],
                                 y.shape[0]))
    else:
        if not hasattr(X, "__cuda_array_interface__") and not \
                isinstance(X, cudf.DataFrame):
            raise TypeError("X needs to be either a cuDF DataFrame, Series or "
                            "a cuda_array_interface compliant object.")

    if isinstance(train_size, float):
        if not 0 <= train_size <= 1:
            raise ValueError("proportion train_size should be between "
                             "0 and 1 (found {})".format(train_size))

    if isinstance(train_size, int):
        if not 0 <= train_size <= X.shape[0]:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size))

    if isinstance(test_size, float):
        if not 0 <= test_size <= 1:
            # Report the offending test_size (not train_size)
            raise ValueError("proportion test_size should be between "
                             "0 and 1 (found {})".format(test_size))

    if isinstance(test_size, int):
        if not 0 <= test_size <= X.shape[0]:
            raise ValueError(
                "Number of instances test_size should be between 0 and the "
                "first dimension of X (found {})".format(test_size))

    x_numba = cuda.devicearray.is_cuda_ndarray(X)
    y_numba = cuda.devicearray.is_cuda_ndarray(y)

    # Determining sizes of splits
    if isinstance(train_size, float):
        train_size = int(X.shape[0] * train_size)

    if test_size is None:
        if train_size is None:
            train_size = int(X.shape[0] * 0.75)

        test_size = X.shape[0] - train_size

    if isinstance(test_size, float):
        test_size = int(X.shape[0] * test_size)
        if train_size is None:
            train_size = X.shape[0] - test_size

    elif isinstance(test_size, int):
        if train_size is None:
            train_size = X.shape[0] - test_size

    if shuffle:
        # Shuffle the data
        if random_state is None or isinstance(random_state, int):
            idxs = cp.arange(X.shape[0])
            random_state = cp.random.RandomState(seed=random_state)

        elif isinstance(random_state, cp.random.RandomState):
            idxs = cp.arange(X.shape[0])

        elif isinstance(random_state, np.random.RandomState):
            idxs = np.arange(X.shape[0])

        else:
            raise TypeError("`random_state` must be an int, NumPy RandomState "
                            "or CuPy RandomState.")

        random_state.shuffle(idxs)

        if isinstance(X, cudf.DataFrame) or isinstance(X, cudf.Series):
            X = X.iloc[idxs]

        elif hasattr(X, "__cuda_array_interface__"):
            # numba (and therefore rmm device_array) does not support
            # fancy indexing
            X = cp.asarray(X)[idxs]

        if isinstance(y, cudf.DataFrame) or isinstance(y, cudf.Series):
            y = y.iloc[idxs]

        elif hasattr(y, "__cuda_array_interface__"):
            y = cp.asarray(y)[idxs]

        if stratify is not None:
            if isinstance(stratify, cudf.DataFrame) or \
                    isinstance(stratify, cudf.Series):
                stratify = stratify.iloc[idxs]

            elif hasattr(stratify, "__cuda_array_interface__"):
                stratify = cp.asarray(stratify)[idxs]

            split_return = _stratify_split(X,
                                           stratify,
                                           y,
                                           train_size,
                                           test_size,
                                           x_numba,
                                           y_numba,
                                           random_state)
            return split_return

    # If not stratified, perform train_test_split splicing
    if hasattr(X, "__cuda_array_interface__"):
        x_order = _strides_to_order(X.__cuda_array_interface__['strides'],
                                    cp.dtype(X.dtype))

    if hasattr(y, "__cuda_array_interface__"):
        y_order = _strides_to_order(y.__cuda_array_interface__['strides'],
                                    cp.dtype(y.dtype))

    # Use an explicit start offset for the test slice: a negative slice
    # `[-test_size:]` degenerates to `[0:]` (the whole data) when
    # test_size is 0.
    test_start = X.shape[0] - test_size

    if hasattr(X, "__cuda_array_interface__") or \
            isinstance(X, cupyx.scipy.sparse.csr_matrix):
        X_train = cp.array(X[0:train_size], order=x_order)
        X_test = cp.array(X[test_start:], order=x_order)
        if y is not None:
            y_train = cp.array(y[0:train_size], order=y_order)
            y_test = cp.array(y[test_start:], order=y_order)
    elif isinstance(X, cudf.DataFrame):
        X_train = X.iloc[0:train_size]
        X_test = X.iloc[test_start:]
        if y is not None:
            if isinstance(y, cudf.Series):
                y_train = y.iloc[0:train_size]
                y_test = y.iloc[test_start:]
            elif hasattr(y, "__cuda_array_interface__") or \
                    isinstance(y, cupyx.scipy.sparse.csr_matrix):
                y_train = cp.array(y[0:train_size], order=y_order)
                y_test = cp.array(y[test_start:], order=y_order)

    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)

    if y is not None:
        return X_train, X_test, y_train, y_test
    else:
        return X_train, X_test
Пример #5
0
def _stratify_split(X, y, n_train, n_test, x_numba, y_numba, random_state):
    """
    Function to perform a stratified split based on y labels.
    Based on scikit-learn stratified split implementation.

    Parameters
    ----------
    X, y: Shuffled input data and labels
    n_train: Number of samples in train set
    n_test: number of samples in test set
    x_numba: Determines whether the data should be converted to numba
    y_numba: Determines whether the labels should be converted to numba
    random_state: CuPy or NumPy RandomState. A NumPy RandomState is replaced
        by a CuPy RandomState seeded from its state.

    Returns
    -------
    X_train, X_test: Data X divided into train and test sets
    y_train, y_test: Labels divided into train and test sets

    Raises
    ------
    ValueError
        If y is a DataFrame with more than one column, or if n_train or
        n_test is smaller than the number of classes.
    """
    x_cudf = False
    y_cudf = False

    if isinstance(X, cudf.DataFrame):
        x_cudf = True
    elif hasattr(X, "__cuda_array_interface__"):
        X = cp.asarray(X)
        x_order = _strides_to_order(X.__cuda_array_interface__['strides'],
                                    cp.dtype(X.dtype))

    if isinstance(y, cudf.Series):
        y_cudf = True
    elif hasattr(y, "__cuda_array_interface__"):
        y = cp.asarray(y)
        y_order = _strides_to_order(y.__cuda_array_interface__['strides'],
                                    cp.dtype(y.dtype))
    elif isinstance(y, cudf.DataFrame):
        y_cudf = True
        # ensuring it has just one column
        if y.shape[1] != 1:
            # y.shape is a tuple: format it as a whole with %s
            raise ValueError('Expected one label, but found y '
                             'with shape = %s' % (y.shape,))

    classes, y_indices = cp.unique(y.values if y_cudf else y,
                                   return_inverse=True)

    n_classes = classes.shape[0]
    class_counts = cp.bincount(y_indices)
    if n_train < n_classes:
        raise ValueError('The train_size = %d should be greater or '
                         'equal to the number of classes = %d' %
                         (n_train, n_classes))
    if n_test < n_classes:
        raise ValueError('The test_size = %d should be greater or '
                         'equal to the number of classes = %d' %
                         (n_test, n_classes))

    # Cut the sorted sample positions at the cumulative class counts so that
    # class_indices[i] holds exactly the positions of class i. An equal-size
    # split would mix classes whenever the classes are imbalanced.
    class_indices = cp.split(cp.argsort(y_indices),
                             cp.cumsum(class_counts)[:-1].tolist())

    X_train = None

    # random_state won't be None or int, that's handled earlier
    if isinstance(random_state, np.random.RandomState):
        random_state = cp.random.RandomState(seed=random_state.get_state()[1])

    # Break ties
    n_i = _approximate_mode(class_counts, n_train, random_state)
    class_counts_remaining = class_counts - n_i
    t_i = _approximate_mode(class_counts_remaining, n_test, random_state)

    for i in range(n_classes):
        permutation = random_state.permutation(class_counts[i].item())
        perm_indices_class_i = class_indices[i].take(permutation)

        # First n_i[i] shuffled members go to train, the next t_i[i] to test
        train_idx = perm_indices_class_i[:n_i[i]]
        test_idx = perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]

        if hasattr(X, "__cuda_array_interface__") or \
           isinstance(X, cupyx.scipy.sparse.csr_matrix):

            X_train_i = cp.array(X[train_idx], order=x_order)
            X_test_i = cp.array(X[test_idx], order=x_order)

            y_train_i = cp.array(y[train_idx], order=y_order)
            y_test_i = cp.array(y[test_idx], order=y_order)

            if X_train is None:
                X_train = cp.array(X_train_i, order=x_order)
                y_train = cp.array(y_train_i, order=y_order)
                X_test = cp.array(X_test_i, order=x_order)
                y_test = cp.array(y_test_i, order=y_order)
            else:
                X_train = cp.concatenate([X_train, X_train_i], axis=0)
                X_test = cp.concatenate([X_test, X_test_i], axis=0)
                y_train = cp.concatenate([y_train, y_train_i], axis=0)
                y_test = cp.concatenate([y_test, y_test_i], axis=0)

        elif x_cudf:
            X_train_i = X.iloc[train_idx]
            X_test_i = X.iloc[test_idx]

            y_train_i = y.iloc[train_idx]
            y_test_i = y.iloc[test_idx]

            if X_train is None:
                X_train = X_train_i
                y_train = y_train_i
                X_test = X_test_i
                y_test = y_test_i
            else:
                X_train = cudf.concat([X_train, X_train_i], ignore_index=False)
                X_test = cudf.concat([X_test, X_test_i], ignore_index=False)
                y_train = cudf.concat([y_train, y_train_i], ignore_index=False)
                y_test = cudf.concat([y_test, y_test_i], ignore_index=False)

    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)
    elif x_cudf:
        X_train = cudf.DataFrame(X_train)
        X_test = cudf.DataFrame(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)
    elif y_cudf:
        y_train = cudf.DataFrame(y_train)
        y_test = cudf.DataFrame(y_test)

    return X_train, X_test, y_train, y_test
Пример #6
0
    def __init__(self,
                 data=None,
                 owner=None,
                 dtype=None,
                 shape=None,
                 order=None):
        """
        Build the array from explicit metadata or from an array-like object.

        When `dtype`, `shape` and `order` are all given they become the
        array metadata. Otherwise all three must be None and the metadata is
        read from the `__array_interface__` or `__cuda_array_interface__`
        of `data`.

        Parameters
        ----------
        data : object
            Source of the data. A `memoryview` is first converted with
            `np.asarray`. `None` is rejected.
        owner : object, optional
            Forwarded to the base class constructor. It is replaced when
            `data` is an array-like that is converted through CuPy.
        dtype, shape, order : optional
            Explicit metadata. All three are required when
            `_check_low_level_type(data)` is true.

        Raises
        ------
        TypeError
            If `data` is None, if a low-level `data` is given without all of
            `dtype`, `shape` and `order`, or if `data` exposes neither array
            interface.
        AssertionError
            If only some of `dtype`, `shape` and `order` are given for an
            array-like `data`.
        """
        # Checks of parameters
        if data is None:
            raise TypeError("To create an empty Array, use the class method" +
                            " Array.empty().")

        memview_construction = isinstance(data, memoryview)
        if memview_construction:
            data = np.asarray(data)

        if dtype is not None:
            dtype = np.dtype(dtype)

        metadata_complete = not (dtype is None or shape is None
                                 or order is None)
        if _check_low_level_type(data):
            if not metadata_complete:
                raise TypeError(
                    "Need to specify dtype, shape and order when" +
                    " creating an Array from {}.".format(type(data)))
        elif not metadata_complete:
            # Catch a likely developer error if CumlArray is created
            # incorrectly
            assert dtype is None and shape is None and order is None, \
                ("Creating array from array-like object. The arguments "
                 "`dtype`, `shape` and `order` should be `None`.")
        detailed_construction = metadata_complete

        # Base class (Buffer) constructor call
        size, shape = _get_size_from_shape(shape, dtype)

        if memview_construction or detailed_construction:
            flattened_data = data
        else:
            # Convert to cupy array and manually specify the ptr, size and
            # owner. This is to avoid the restriction on Buffer that requires
            # all data be u8
            cupy_data = cp.asarray(data)
            flattened_data = cupy_data.data.ptr

            # Size for Buffer is not the same as for cupy. Use nbytes
            size = cupy_data.nbytes
            if cupy_data.flags.owndata:
                owner = cupy_data
            else:
                owner = data

        super(CumlArray, self).__init__(data=flattened_data,
                                        owner=owner,
                                        size=size)

        # Explicit metadata wins over anything the object advertises
        if detailed_construction:
            self.shape = shape
            self.dtype = dtype
            self.order = order
            self.strides = _order_to_strides(order, shape, dtype)
            return

        if hasattr(data, "__array_interface__"):
            interface = data.__array_interface__
        elif hasattr(data, "__cuda_array_interface__"):
            interface = data.__cuda_array_interface__
        else:
            raise TypeError("Unrecognized data type: %s" % str(type(data)))

        if not interface:
            return

        self.shape = interface['shape']
        self.dtype = np.dtype(interface['typestr'])
        advertised_strides = interface.get('strides', None)
        if advertised_strides is None:
            # No strides in the interface: treat the data as 'C' ordered
            self.order = 'C'
            self.strides = _order_to_strides(self.order, self.shape,
                                             self.dtype)
        else:
            self.strides = advertised_strides
            self.order = _strides_to_order(self.strides, self.dtype)
Пример #7
0
def train_test_split(X,
                     y,
                     test_size: Union[float, int] = None,
                     train_size: Union[float, int] = None,
                     shuffle: bool = True,
                     random_state: Union[int, cp.random.RandomState,
                                         np.random.RandomState] = None,
                     seed: Union[int, cp.random.RandomState,
                                 np.random.RandomState] = None):
    """
    Partitions device data into four collated objects, mimicking
    Scikit-learn's `train_test_split`

    Parameters
    ----------
    X : cudf.DataFrame or cuda_array_interface compliant device array
        Data to split, has shape (n_samples, n_features)
    y : str, cudf.Series or cuda_array_interface compliant device array
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X (if it is a cuDF DataFrame)
        containing the labels
    test_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the test set. If an int, represents the number
        of instances to be assigned to the test set. When omitted, the
        test set receives every row not assigned to the training set.
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the training set. If an int, represents the number
        of instances to be assigned to the training set. When omitted, it
        is the complement of `test_size`; when both are omitted it
        defaults to 0.75
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    random_state : int, CuPy RandomState or NumPy RandomState optional
        If shuffle is true, seeds the generator. Unseeded by default
    seed: random_state : int, CuPy RandomState or NumPy RandomState optional
        Deprecated in favor of `random_state`.
        If shuffle is true, seeds the generator. Unseeded by default

    Examples
    --------
    .. code-block:: python

        import cudf
        from cuml.preprocessing.model_selection import train_test_split

        # Generate some sample data
        df = cudf.DataFrame({'x': range(10),
                             'y': [0, 1] * 5})
        print(f'Original data: {df.shape[0]} elements')

        # Suppose we want an 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(df, 'y',
                                                            train_size=0.8)
        print(f'X_train: {X_train.shape[0]} elements')
        print(f'X_test: {X_test.shape[0]} elements')
        print(f'y_train: {y_train.shape[0]} elements')
        print(f'y_test: {y_test.shape[0]} elements')

        # Alternatively, if our labels are stored separately
        labels = df['y']
        df = df.drop(['y'])

        # we can also do
        X_train, X_test, y_train, y_test = train_test_split(df, labels,
                                                            train_size=0.8)

    Output:

    .. code-block:: python

        Original data: 10 elements
        X_train: 8 elements
        X_test: 2 elements
        y_train: 8 elements
        y_test: 2 elements

    Returns
    -------
    X_train, X_test, y_train, y_test : cudf.DataFrame or array-like objects
        The training rows are the first `train_size` rows and the test rows
        are the last `test_size` rows of the (optionally shuffled) data.
        If X exposes `__cuda_array_interface__`, CuPy arrays are returned
        (converted back to Numba device arrays for inputs that were Numba
        device arrays). Otherwise the cuDF inputs are sliced with `iloc`.
        If `y` was provided as a column name, the column was dropped from
        the `X`s.

    Raises
    ------
    TypeError
        If `y` is a string and X is not a cuDF DataFrame, if X or y is
        neither a cuDF DataFrame/Series nor a cuda_array_interface
        compliant array, or if `random_state` has an unsupported type.
    ValueError
        If X and y differ in their first dimension, or if `train_size` /
        `test_size` fall outside their valid range.
    """
    if isinstance(y, str):
        # `y` names the label column: extract it and remove it from X
        if not isinstance(X, cudf.DataFrame):
            raise TypeError("X needs to be a cuDF Dataframe when y is a "
                            "string")
        name = y
        y = X[name]
        X = X.drop(name)

    # todo: this check will be replaced with upcoming improvements
    # to input_utils
    cudf_types = (cudf.DataFrame, cudf.Series)

    if not hasattr(X, "__cuda_array_interface__") and \
            not isinstance(X, cudf_types):
        raise TypeError("X needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    if not hasattr(y, "__cuda_array_interface__") and \
            not isinstance(y, cudf_types):
        raise TypeError("y needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    n_rows = X.shape[0]

    if n_rows != y.shape[0]:
        raise ValueError("X and y must have the same first dimension "
                         "(found {} and {})".format(n_rows, y.shape[0]))

    if isinstance(train_size, float):
        if not 0 <= train_size <= 1:
            raise ValueError("proportion train_size should be between "
                             "0 and 1 (found {})".format(train_size))

    if isinstance(train_size, int):
        if not 0 <= train_size <= n_rows:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size))

    if isinstance(test_size, float):
        if not 0 <= test_size <= 1:
            raise ValueError("proportion test_size should be between "
                             "0 and 1 (found {})".format(test_size))

    if isinstance(test_size, int):
        if not 0 <= test_size <= n_rows:
            raise ValueError(
                "Number of instances test_size should be between 0 and the "
                "first dimension of X (found {})".format(test_size))

    # Record the input kind before shuffling: shuffling replaces Numba
    # device arrays with CuPy arrays, and the outputs are converted back at
    # the end.
    x_numba = cuda.devicearray.is_cuda_ndarray(X)
    y_numba = cuda.devicearray.is_cuda_ndarray(y)

    if seed is not None:
        if random_state is None:
            warnings.warn("Parameter 'seed' is deprecated, please use "
                          "'random_state' instead.")
            random_state = seed
        else:
            warnings.warn("Both 'seed' and 'random_state' parameters were "
                          "set, using 'random_state' since 'seed' is "
                          "deprecated.")

    if shuffle:
        # The index array lives on the same side (device/host) as the
        # generator that shuffles it.
        if random_state is None or isinstance(random_state, int):
            idxs = rmm_cupy_ary(cp.arange, n_rows)
            random_state = cp.random.RandomState(seed=random_state)

        elif isinstance(random_state, cp.random.RandomState):
            idxs = rmm_cupy_ary(cp.arange, n_rows)

        elif isinstance(random_state, np.random.RandomState):
            idxs = np.arange(n_rows)

        else:
            raise TypeError("`random_state` must be an int, NumPy "
                            "RandomState or CuPy RandomState.")

        random_state.shuffle(idxs)

        if isinstance(X, cudf_types):
            X = X.iloc[idxs].reset_index(drop=True)

        elif hasattr(X, "__cuda_array_interface__"):
            # numba (and therefore rmm device_array) does not support
            # fancy indexing
            X = cp.asarray(X)[idxs]

        if isinstance(y, cudf_types):
            # NOTE(review): unlike X, y keeps its shuffled index here
            # (no reset_index) - confirm whether that is intended.
            y = y.iloc[idxs]

        elif hasattr(y, "__cuda_array_interface__"):
            y = cp.asarray(y)[idxs]

    # Determining sizes of splits
    if isinstance(train_size, float):
        train_size = int(n_rows * train_size)

    if test_size is None:
        if train_size is None:
            train_size = int(n_rows * 0.75)

        test_size = n_rows - train_size

    if isinstance(test_size, float):
        test_size = int(n_rows * test_size)

    if train_size is None:
        train_size = n_rows - test_size

    # Use an explicit start offset for the test rows: a negative-slice
    # `[-test_size:]` would return *every* row when test_size is 0.
    test_start = max(n_rows - test_size, 0)

    if hasattr(X, "__cuda_array_interface__"):
        # Preserve the memory layout of each input in the copies
        x_order = _strides_to_order(X.__cuda_array_interface__['strides'],
                                    cp.dtype(X.dtype))
        y_order = _strides_to_order(y.__cuda_array_interface__['strides'],
                                    cp.dtype(y.dtype))

        X_train = cp.array(X[0:train_size], order=x_order)
        y_train = cp.array(y[0:train_size], order=y_order)
        X_test = cp.array(X[test_start:], order=x_order)
        y_test = cp.array(y[test_start:], order=y_order)
    else:
        # Validation above guarantees X is a cuDF object on this path
        X_train = X.iloc[0:train_size]
        y_train = y.iloc[0:train_size]
        X_test = X.iloc[test_start:]
        y_test = y.iloc[test_start:]

    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)

    return X_train, X_test, y_train, y_test