Example 1
def make_monotonic(labels, classes=None, copy=False):
    """
    Takes a set of labels that might not be drawn from the
    set [0, n-1] and renumbers them so that they are drawn
    from that interval.

    Replaces labels not present in classes by len(classes)+1.

    Parameters
    ----------

    labels : array-like of size (n,) labels to convert
    classes : array-like of size (n_classes,) the unique
              set of classes in the set of labels
    copy : boolean if true, a copy will be returned and the
           operation will not be done in place.

    Returns
    -------

    mapped_labels : array-like of size (n,)
    classes : array-like of size (n_classes,)
    """

    labels = rmm_cupy_ary(cp.asarray, labels, dtype=labels.dtype)

    if copy:
        labels = labels.copy()

    if labels.ndim != 1:
        raise ValueError("Labels array must be 1D")

    if classes is None:
        classes = rmm_cupy_ary(cp.unique, labels)

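    # Shared memory is sized to hold one copy of the classes array per block;
    # the kernel below is launched with one thread per label in blocks of 32.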
    smem = labels.dtype.itemsize * int(classes.shape[0])

    map_labels = _map_kernel(labels.dtype)
    map_labels((math.ceil(labels.shape[0] / 32), ), (32, ),
               (labels, labels.shape[0], classes, classes.shape[0]),
               shared_mem=smem)

    return labels, classes
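A minimal usage sketch of make_monotonic (an assumption for illustration: cupy is imported as cp and the function plus its RMM helpers are importable in the current module; the expected values follow the docstring, with classes taken in sorted order):

labels_in = cp.asarray([10, 50, 30, 10], dtype=cp.int32)
mapped, classes = make_monotonic(labels_in, copy=True)
# classes should be array([10, 30, 50]) and mapped array([0, 2, 1, 0]),
# i.e. each label is replaced by its index in the sorted class set.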
Example 2
def invert_labels(labels, classes, copy=False):
    """
    Takes a set of labels that have been mapped to be drawn
    from a monotonically increasing set and inverts them
    back to the original set of classes.

    Parameters
    ----------

    labels : array-like of size (n,) labels to invert
    classes : array-like of size (n_classes,) the unique set
              of classes for inversion. It is assumed that
              the classes are ordered by their corresponding
              monotonically increasing label.
    copy : boolean if true, a copy will be returned and the
           operation will not be done in place.

    Returns
    -------

    inverted labels : array-like of size (n,)

    """

    if labels.dtype != classes.dtype:
        raise ValueError("Labels and classes must have same dtype (%s != %s" %
                         (labels.dtype, classes.dtype))
    labels = rmm_cupy_ary(cp.asarray, labels, dtype=labels.dtype)
    classes = rmm_cupy_ary(cp.asarray, classes, dtype=classes.dtype)

    if copy:
        labels = labels.copy()

    smem = labels.dtype.itemsize * len(classes)
    inverse_map = _inverse_map_kernel(labels.dtype)
    inverse_map((math.ceil(len(labels) / 32), ), (32, ),
                (classes, len(classes), labels, len(labels)),
                shared_mem=smem)

    return labels
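A companion sketch for invert_labels under the same assumptions; it undoes the mapping above by looking each label up in the ordered class array:

mapped = cp.asarray([0, 2, 1, 0], dtype=cp.int32)
classes = cp.asarray([10, 30, 50], dtype=cp.int32)
restored = invert_labels(mapped, classes, copy=True)
# restored should be array([10, 50, 30, 10]), matching the original labels.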
Example 3
def check_labels(labels, classes):
    """
    Validates that a set of labels is drawn from the unique
    set of given classes.

    Parameters
    ----------

    labels : array-like of size (n,) labels to validate
    classes : array-like of size (n_classes,) the unique
              set of classes to verify

    Returns
    -------

    result : boolean
    """

    if labels.dtype != classes.dtype:
        raise ValueError("Labels and classes must have same dtype (%s != %s" %
                         (labels.dtype, classes.dtype))

    labels = rmm_cupy_ary(cp.asarray, labels, dtype=labels.dtype)
    classes = rmm_cupy_ary(cp.asarray, classes, dtype=classes.dtype)

    if labels.ndim != 1:
        raise ValueError("Labels array must be 1D")

    valid = cp.array([1])

    smem = labels.dtype.itemsize * int(classes.shape[0])
    validate = _validate_kernel(labels.dtype)
    validate((math.ceil(labels.shape[0] / 32), ), (32, ),
             (labels, labels.shape[0], classes, classes.shape[0], valid),
             shared_mem=smem)

    return valid[0] == 1
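A short sketch for check_labels under the same assumptions; a label outside the class set should make the check return False:

classes = cp.asarray([0, 1, 2], dtype=cp.int32)
check_labels(cp.asarray([0, 1, 2, 1], dtype=cp.int32), classes)  # expected True
check_labels(cp.asarray([0, 1, 5], dtype=cp.int32), classes)     # expected False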
Example 4
def train_test_split(X,
                     y=None,
                     test_size: Union[float, int] = None,
                     train_size: Union[float, int] = None,
                     shuffle: bool = True,
                     random_state: Union[int, cp.random.RandomState,
                                         np.random.RandomState] = None,
                     seed: Union[int, cp.random.RandomState,
                                 np.random.RandomState] = None,
                     stratify=None):
    """
    Partitions device data into four collated objects, mimicking
    Scikit-learn's `train_test_split
    <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html>`_.

    Parameters
    ----------
    X : cudf.DataFrame or cuda_array_interface compliant device array
        Data to split, has shape (n_samples, n_features)
    y : str, cudf.Series or cuda_array_interface compliant device array
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X (if it is a cuDF DataFrame)
        containing the labels
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the training set. If an int, represents the number
        of instances to be assigned to the training set. Defaults to 0.8
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    random_state : int, CuPy RandomState or NumPy RandomState, optional
        If shuffle is true, seeds the generator. Unseeded by default
    seed : int, CuPy RandomState or NumPy RandomState, optional
        Deprecated in favor of `random_state`.
        If shuffle is true, seeds the generator. Unseeded by default
    stratify : bool, optional
        Whether to stratify the input data based on class labels.
        None by default

    Examples
    --------

    .. code-block:: python

        import cudf
        from cuml.preprocessing.model_selection import train_test_split

        # Generate some sample data
        df = cudf.DataFrame({'x': range(10),
                             'y': [0, 1] * 5})
        print(f'Original data: {df.shape[0]} elements')

        # Suppose we want an 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(df, 'y',
                                                            train_size=0.8)
        print(f'X_train: {X_train.shape[0]} elements')
        print(f'X_test: {X_test.shape[0]} elements')
        print(f'y_train: {y_train.shape[0]} elements')
        print(f'y_test: {y_test.shape[0]} elements')

        # Alternatively, if our labels are stored separately
        labels = df['y']
        df = df.drop(['y'], axis=1)

        # we can also do
        X_train, X_test, y_train, y_test = train_test_split(df, labels,
                                                            train_size=0.8)

    Output:

    .. code-block:: python

        Original data: 10 elements
        X_train: 8 elements
        X_test: 2 elements
        y_train: 8 elements
        y_test: 2 elements

    Returns
    -------

    X_train, X_test, y_train, y_test : cudf.DataFrame or array-like objects
        Partitioned dataframes if X and y were cuDF objects. If `y` was
        provided as a column name, that column is dropped from `X`.
        Partitioned numba device arrays if X and y were Numba device arrays.
        Partitioned CuPy arrays for any other input.

    """
    if isinstance(y, str):
        # Use the column with name `str` as y
        if isinstance(X, cudf.DataFrame):
            name = y
            y = X[name]
            X = X.drop(name, axis=1)
        else:
            raise TypeError("X needs to be a cuDF Dataframe when y is a \
                             string")

    # todo: this check will be replaced with upcoming improvements
    # to input_utils
    #
    if y is not None:
        if not hasattr(X, "__cuda_array_interface__") and not \
                isinstance(X, cudf.DataFrame):
            raise TypeError("X needs to be either a cuDF DataFrame, Series or \
                            a cuda_array_interface compliant array.")

        if not hasattr(y, "__cuda_array_interface__") and not \
                isinstance(y, cudf.DataFrame):
            raise TypeError("y needs to be either a cuDF DataFrame, Series or \
                            a cuda_array_interface compliant array.")

        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same first dimension"
                             "(found {} and {})".format(
                                 X.shape[0], y.shape[0]))
    else:
        if not hasattr(X, "__cuda_array_interface__") and not \
                isinstance(X, cudf.DataFrame):
            raise TypeError("X needs to be either a cuDF DataFrame, Series or \
                            a cuda_array_interface compliant object.")

    if isinstance(train_size, float):
        if not 0 <= train_size <= 1:
            raise ValueError("proportion train_size should be between"
                             "0 and 1 (found {})".format(train_size))

    if isinstance(train_size, int):
        if not 0 <= train_size <= X.shape[0]:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size))

    if isinstance(test_size, float):
        if not 0 <= test_size <= 1:
            raise ValueError("proportion test_size should be between"
                             "0 and 1 (found {})".format(train_size))

    if isinstance(test_size, int):
        if not 0 <= test_size <= X.shape[0]:
            raise ValueError(
                "Number of instances test_size should be between 0 and the "
                "first dimension of X (found {})".format(test_size))

    x_numba = cuda.devicearray.is_cuda_ndarray(X)
    y_numba = cuda.devicearray.is_cuda_ndarray(y)

    if seed is not None:
        if random_state is None:
            warnings.warn(
                "Parameter 'seed' is deprecated and will be"
                " removed in 0.17. Please use 'random_state'"
                " instead. Setting 'random_state' as the"
                " curent 'seed' value", DeprecationWarning)
            random_state = seed
        else:
            warnings.warn(
                "Both 'seed' and 'random_state' parameters were"
                " set. Using 'random_state' since 'seed' is"
                " deprecated and will be removed in 0.17.", DeprecationWarning)

    # Determining sizes of splits
    if isinstance(train_size, float):
        train_size = int(X.shape[0] * train_size)

    if test_size is None:
        if train_size is None:
            train_size = int(X.shape[0] * 0.75)

        test_size = X.shape[0] - train_size

    if isinstance(test_size, float):
        test_size = int(X.shape[0] * test_size)
        if train_size is None:
            train_size = X.shape[0] - test_size

    elif isinstance(test_size, int):
        if train_size is None:
            train_size = X.shape[0] - test_size

    if shuffle:
        # Shuffle the data
        if random_state is None or isinstance(random_state, int):
            idxs = rmm_cupy_ary(cp.arange, X.shape[0])
            random_state = cp.random.RandomState(seed=random_state)

        elif isinstance(random_state, cp.random.RandomState):
            idxs = rmm_cupy_ary(cp.arange, X.shape[0])

        elif isinstance(random_state, np.random.RandomState):
            idxs = np.arange(X.shape[0])

        else:
            raise TypeError("`random_state` must be an int, NumPy RandomState \
                             or CuPy RandomState.")

        random_state.shuffle(idxs)

        if isinstance(X, cudf.DataFrame) or isinstance(X, cudf.Series):
            X = X.iloc[idxs]

        elif hasattr(X, "__cuda_array_interface__"):
            # numba (and therefore rmm device_array) does not support
            # fancy indexing
            X = cp.asarray(X)[idxs]

        if isinstance(y, cudf.DataFrame) or isinstance(y, cudf.Series):
            y = y.iloc[idxs]

        elif hasattr(y, "__cuda_array_interface__"):
            y = cp.asarray(y)[idxs]

        if stratify is not None:
            split_return = _stratify_split(X, y, train_size, test_size,
                                           x_numba, y_numba, random_state)
            return split_return

    # If not stratified, perform train_test_split splicing
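    # For device arrays, read the strides to preserve the input's memory
    # layout (C or F order) in the train/test copies made below.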
    if hasattr(X, "__cuda_array_interface__"):
        x_order = _strides_to_order(X.__cuda_array_interface__['strides'],
                                    cp.dtype(X.dtype))

    if hasattr(y, "__cuda_array_interface__"):
        y_order = _strides_to_order(y.__cuda_array_interface__['strides'],
                                    cp.dtype(y.dtype))

    if hasattr(X, "__cuda_array_interface__") or \
            isinstance(X, cupyx.scipy.sparse.csr_matrix):
        X_train = cp.array(X[0:train_size], order=x_order)
        if y is not None:
            y_train = cp.array(y[0:train_size], order=y_order)
    elif isinstance(X, cudf.DataFrame):
        X_train = X.iloc[0:train_size]
        if y is not None:
            y_train = y.iloc[0:train_size]

    if hasattr(X, "__cuda_array_interface__") or \
            isinstance(X, cupyx.scipy.sparse.csr_matrix):
        X_test = cp.array(X[-1 * test_size:], order=x_order)
        if y is not None:
            y_test = cp.array(y[-1 * test_size:], order=y_order)
    elif isinstance(X, cudf.DataFrame):
        X_test = X.iloc[-1 * test_size:]
        if y is not None:
            y_test = y.iloc[-1 * test_size:]
    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)

    if y is not None:
        return X_train, X_test, y_train, y_test
    else:
        return X_train, X_test
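Beyond the cuDF example in the docstring, a minimal sketch with device arrays (assuming cupy is imported as cp; the sizes should follow the train_size/test_size logic above):

X = cp.random.rand(100, 4)
y = cp.random.randint(0, 2, size=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                     random_state=42)
# X_train.shape should be (80, 4) and X_test.shape (20, 4); y splits match.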