Example #1
    def fit(self, X, y=None, sample_weight=None):
        """Fit the Kernel Density model on the data.

        Parameters
        ----------

        X : array-like of shape (n_samples, n_features)
            List of n_features-dimensional data points.  Each row
            corresponds to a single data point.
        y : None
            Ignored.
        sample_weight : array-like of shape (n_samples,), default=None
            List of sample weights attached to the data X.

        Returns
        -------

        self : object
            Returns the instance itself.
        """
        if sample_weight is not None:
            self.sample_weight_ = input_to_cupy_array(
                sample_weight, check_dtype=[cp.float32, cp.float64]).array
            if self.sample_weight_.min() <= 0:
                raise ValueError("sample_weight must have positive values")
        else:
            self.sample_weight_ = None

        self.X_ = input_to_cupy_array(X,
                                      order="C",
                                      check_dtype=[cp.float32,
                                                   cp.float64]).array

        return self
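
A minimal usage sketch of the method above. The class and import path
(cuml.neighbors.KernelDensity) are assumptions about where cuML exposes this
estimator; note that sample weights must be strictly positive.

import cupy as cp
from cuml.neighbors import KernelDensity

X = cp.random.standard_normal((100, 2), dtype=cp.float32)
weights = cp.random.uniform(0.5, 1.5, size=100).astype(cp.float32)

kde = KernelDensity(kernel="gaussian", bandwidth=0.5)
kde.fit(X, sample_weight=weights)   # zero/negative weights raise ValueError
log_density = kde.score_samples(X[:5])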
Example #2
    def _partial_fit(self,
                     X,
                     y,
                     sample_weight=None,
                     _classes=None) -> "MultinomialNB":

        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = X.tocoo()
            rows = cp.asarray(X.row, dtype=X.row.dtype)
            cols = cp.asarray(X.col, dtype=X.col.dtype)
            data = cp.asarray(X.data, dtype=X.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((data, (rows, cols)),
                                              shape=X.shape)
        else:
            X = input_to_cupy_array(X, order='K').array

        y = input_to_cupy_array(y).array

        Y, label_classes = make_monotonic(y, copy=True)

        if not self.fit_called_:
            self.fit_called_ = True
            if _classes is not None:
                _classes, *_ = input_to_cuml_array(_classes, order='K')
                check_labels(Y, _classes)
                self.classes_ = _classes
            else:
                self.classes_ = label_classes

            self._n_classes_ = self.classes_.shape[0]
            self._n_features_ = X.shape[1]
            self._init_counters(self._n_classes_, self._n_features_, X.dtype)
        else:
            check_labels(Y, self.classes_)

        self._count(X, Y)

        self._update_feature_log_prob(self.alpha)
        self._update_class_log_prior(class_prior=self._class_prior_)

        return self
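
The sparse branch above copies a host (SciPy) matrix to the device one
component array at a time. A standalone sketch of the same conversion:

import cupy as cp
import cupyx.scipy.sparse
import scipy.sparse

X_host = scipy.sparse.random(5, 3, density=0.4, format="csr")

# Move row/col/data arrays to the GPU, then rebuild the matrix as a
# cupyx COO with the original shape.
X_coo = X_host.tocoo()
X_gpu = cupyx.scipy.sparse.coo_matrix(
    (cp.asarray(X_coo.data), (cp.asarray(X_coo.row), cp.asarray(X_coo.col))),
    shape=X_coo.shape)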
Example #3
def model_func_call(X, model_func, gpu_model=False):
    """
    Function to call `model_func(X)` using either `NumPy` arrays if
    gpu_model is False or X directly if model_gpu based is True.
    Returns the results as CuPy arrays.
    """
    if gpu_model:
        y = input_to_cupy_array(X=model_func(X), order='K').array
    else:
        try:
            y = input_to_cupy_array(model_func(cp.asnumpy(X))).array
        except TypeError:
            raise TypeError('Explainer can only explain models that can '
                            'take GPU data or NumPy arrays as input.')

    return y
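
A hypothetical illustration of the dispatch above with a CPU-only model: the
input lives on the GPU, so model_func_call round-trips it through NumPy
(cpu_model here is a stand-in, not part of cuML).

import cupy as cp

def cpu_model(X_np):              # accepts NumPy arrays only
    return X_np.sum(axis=1)

X_gpu = cp.arange(6, dtype=cp.float32).reshape(3, 2)
y = model_func_call(X_gpu, cpu_model, gpu_model=False)
# y is a CuPy array even though the model ran on host data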
Example #4
    def predict(self, X) -> CumlArray:
        """
        Perform classification on an array of test vectors X.

        """
        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = _convert_x_sparse(X)
        else:
            X = input_to_cupy_array(
                X, order='K', check_dtype=[cp.float32, cp.float64,
                                           cp.int32]).array

        jll = self._joint_log_likelihood(X)
        indices = cp.argmax(jll, axis=1).astype(self.classes_.dtype)

        y_hat = invert_labels(indices, classes=self.classes_)
        return y_hat
Example #5
    def predict(self, X) -> CumlArray:
        """
        Perform classification on an array of test vectors X.

        """
        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = X.tocoo()
            rows = cp.asarray(X.row, dtype=X.row.dtype)
            cols = cp.asarray(X.col, dtype=X.col.dtype)
            data = cp.asarray(X.data, dtype=X.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((data, (rows, cols)),
                                              shape=X.shape)
        else:
            X = input_to_cupy_array(X, order='K').array

        jll = self._joint_log_likelihood(X)
        indices = cp.argmax(jll, axis=1).astype(self.classes_.dtype)

        y_hat = invert_labels(indices, classes=self.classes_)
        return y_hat
Example #6
def make_monotonic(labels,
                   classes=None,
                   copy=False) -> typing.Tuple[CumlArray, CumlArray]:
    """
    Takes a set of labels that might not be drawn from the
    set [0, n-1] and renumbers them to be drawn from that
    interval.

    Labels not present in classes are replaced by len(classes) + 1.

    Parameters
    ----------

    labels : array-like of size (n,) labels to convert
    classes : array-like of size (n_classes,) the unique
              set of classes in the set of labels
    copy : boolean if true, a copy will be returned and the
           operation will not be done in place.

    Returns
    -------

    mapped_labels : array-like of size (n,)
    classes : array-like of size (n_classes,)
    """
    labels = input_to_cupy_array(labels, deepcopy=copy).array

    if labels.ndim != 1:
        raise ValueError("Labels array must be 1D")

    if classes is None:
        classes = cp.unique(labels)
    else:
        classes = input_to_cupy_array(classes).array

    smem = labels.dtype.itemsize * int(classes.shape[0])

    map_labels = _map_kernel(labels.dtype)
    map_labels((math.ceil(labels.shape[0] / 32), ), (32, ),
               (labels, labels.shape[0], classes, classes.shape[0]),
               shared_mem=smem)

    return labels, classes
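
A small worked example (assuming make_monotonic is in scope): the labels
{3, 7, 10} are renumbered onto [0, 2].

import cupy as cp

labels = cp.array([7, 3, 10, 3, 7])
mapped, classes = make_monotonic(labels, copy=True)
# mapped  -> [1, 0, 2, 0, 1]
# classes -> [3, 7, 10]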
Example #7
def roc_auc_score(y_true, y_score):
    """
    Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    from prediction scores.

    .. note:: this implementation can only be used with binary classification.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels. The binary case
        expects labels with shape (n_samples,).
    y_score : array-like of shape (n_samples,)
        Target scores. In the binary case, these can be either
        probability estimates or non-thresholded decision values (as
        returned by `decision_function` on some classifiers). The scores
        must be the scores of the class with the greater label.

    Returns
    -------
    auc : float

    Examples
    --------
    >>> import numpy as np
    >>> from cuml.metrics import roc_auc_score
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> print(roc_auc_score(y_true, y_scores))
    0.75

    """
    y_true, n_rows, n_cols, ytype = \
        input_to_cupy_array(y_true, check_dtype=[np.int32, np.int64,
                                                 np.float32, np.float64])

    y_score, _, _, _ = \
        input_to_cupy_array(y_score, check_dtype=[np.int32, np.int64,
                                                  np.float32, np.float64],
                            check_rows=n_rows, check_cols=n_cols)
    return _binary_roc_auc_score(y_true, y_score)
Example #8
def test_tocupy_missing_values_handling():
    df = cudf.DataFrame(data=[[7, 2, 3], [4, 5, 6], [10, 5, 9]])
    array, n_rows, n_cols, dtype = input_to_cupy_array(df, fail_on_null=False)
    assert isinstance(array, cp.ndarray)
    assert str(array.dtype) == 'int64'

    df = cudf.DataFrame(data=[[7, 2, 3], [4, None, 6], [10, 5, 9]])
    array, n_rows, n_cols, dtype = input_to_cupy_array(df, fail_on_null=False)
    assert isinstance(array, cp.ndarray)
    assert str(array.dtype) == 'float64'
    assert cp.isnan(array[1, 1])

    df = cudf.Series(data=[7, None, 3])
    array, n_rows, n_cols, dtype = input_to_cupy_array(df, fail_on_null=False)
    assert str(array.dtype) == 'float64'
    assert cp.isnan(array[1])

    with pytest.raises(ValueError):
        df = cudf.Series(data=[7, None, 3])
        array, n_rows, n_cols, dtype = input_to_cupy_array(df,
                                                           fail_on_null=True)
Example #9
    def fit(self, X, y=None) -> "IncrementalPCA":
        """
        Fit the model with X, using minibatches of size batch_size.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        y : Ignored

        Returns
        -------

        self : object
            Returns the instance itself.

        """
        self.n_samples_seen_ = 0
        self.mean_ = .0
        self.var_ = .0

        if scipy.sparse.issparse(X) or cupyx.scipy.sparse.issparse(X):
            X = _validate_sparse_input(X)
        else:
            # NOTE: While we cast the input to a cupy array here, we still
            # respect the `output_type` parameter in the constructor. This
            # is done by PCA, which IncrementalPCA inherits from. PCA's
            # transform and inverse transform convert the output to the
            # required type.
            X, n_samples, n_features, self.dtype = \
                input_to_cupy_array(X, order='K',
                                    check_dtype=[cp.float32, cp.float64])

        n_samples, n_features = X.shape

        if self.batch_size is None:
            self.batch_size_ = 5 * n_features
        else:
            self.batch_size_ = self.batch_size

        for batch in _gen_batches(n_samples,
                                  self.batch_size_,
                                  min_batch_size=self.n_components or 0):
            X_batch = X[batch]
            if cupyx.scipy.sparse.issparse(X_batch):
                X_batch = X_batch.toarray()

            self.partial_fit(X_batch, check_input=False)

        return self
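
A usage sketch for the batched fit above. The import path is an assumption
(IncrementalPCA has moved between cuML releases):

import cupy as cp
from cuml.decomposition import IncrementalPCA

X = cp.random.standard_normal((1000, 20), dtype=cp.float32)
ipca = IncrementalPCA(n_components=5, batch_size=200)
ipca.fit(X)                     # processed internally as 5 minibatches
X_reduced = ipca.transform(X)   # shape (1000, 5)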
Example #10
def check_labels(labels, classes) -> bool:
    """
    Validates that a set of labels is drawn from the unique
    set of given classes.

    Parameters
    ----------

    labels : array-like of size (n,) labels to validate
    classes : array-like of size (n_classes,) the unique
              set of classes to verify

    Returns
    -------

    result : boolean
    """

    labels = input_to_cupy_array(labels, order="K").array
    classes = input_to_cupy_array(classes, order="K").array

    if labels.dtype != classes.dtype:
        raise ValueError("Labels and classes must have same dtype (%s != %s" %
                         (labels.dtype, classes.dtype))

    if labels.ndim != 1:
        raise ValueError("Labels array must be 1D")

    valid = cp.array([1])

    smem = labels.dtype.itemsize * int(classes.shape[0])
    validate = _validate_kernel(labels.dtype)
    validate((math.ceil(labels.shape[0] / 32), ), (32, ),
             (labels, labels.shape[0], classes, classes.shape[0], valid),
             shared_mem=smem)

    return valid[0] == 1
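
A quick check of the validation above (assuming check_labels is in scope):

import cupy as cp

classes = cp.array([0, 1, 2], dtype=cp.int32)
check_labels(cp.array([0, 1, 2, 1], dtype=cp.int32), classes)   # True
check_labels(cp.array([0, 3], dtype=cp.int32), classes)         # False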
Example #11
def invert_labels(labels, classes, copy=False) -> CumlArray:
    """
    Takes a set of labels that have been mapped to be drawn
    from a monotonically increasing set and inverts them
    back to the original set of classes.

    Parameters
    ----------

    labels : array-like of size (n,) labels to invert
    classes : array-like of size (n_classes,) the unique set
              of classes for inversion. It is assumed that
              the classes are ordered by their corresponding
              monotonically increasing label.
    copy : boolean if true, a copy will be returned and the
           operation will not be done in place.

    Returns
    -------

    inverted labels : array-like of size (n,)

    """
    labels = input_to_cupy_array(labels, deepcopy=copy).array
    classes = input_to_cupy_array(classes).array

    if labels.dtype != classes.dtype:
        raise ValueError("Labels and classes must have same dtype (%s != %s" %
                         (labels.dtype, classes.dtype))
    smem = labels.dtype.itemsize * len(classes)
    inverse_map = _inverse_map_kernel(labels.dtype)
    inverse_map((math.ceil(len(labels) / 32), ), (32, ),
                (classes, len(classes), labels, len(labels)),
                shared_mem=smem)

    return labels
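
Round-tripping with make_monotonic (both helpers assumed in scope): mapping
labels onto [0, n) and inverting recovers the original values.

import cupy as cp

y = cp.array([10, 40, 20, 40], dtype=cp.int32)
mono, classes = make_monotonic(y, copy=True)   # [0, 2, 1, 2], [10, 20, 40]
restored = invert_labels(mono, classes, copy=True)
# restored -> [10, 40, 20, 40]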
Example #12
    def predict_log_proba(self, X) -> CumlArray:
        """
        Return log-probability estimates for the test vector X.

        """
        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = X.tocoo()
            rows = cp.asarray(X.row, dtype=X.row.dtype)
            cols = cp.asarray(X.col, dtype=X.col.dtype)
            data = cp.asarray(X.data, dtype=X.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((data, (rows, cols)),
                                              shape=X.shape)
        else:
            X = input_to_cupy_array(X, order='K').array

        jll = self._joint_log_likelihood(X)

        # normalize by P(X) = P(f_1, ..., f_n)

        # Compute log(sum(exp()))

        # Subtract max in exp to prevent inf
        a_max = cp.amax(jll, axis=1, keepdims=True)

        exp = cp.exp(jll - a_max)
        logsumexp = cp.log(cp.sum(exp, axis=1))

        a_max = cp.squeeze(a_max, axis=1)

        log_prob_x = a_max + logsumexp

        if log_prob_x.ndim < 2:
            log_prob_x = log_prob_x.reshape((1, log_prob_x.shape[0]))
        result = jll - log_prob_x.T
        return result
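
The max-subtraction in isolation: a naive logsumexp overflows for large
log-likelihoods, while the shifted form stays finite.

import cupy as cp

jll = cp.array([[1000.0, 1001.0]])
naive = cp.log(cp.sum(cp.exp(jll), axis=1))     # inf: exp(1000) overflows
a_max = cp.amax(jll, axis=1, keepdims=True)
stable = (cp.squeeze(a_max, axis=1)
          + cp.log(cp.sum(cp.exp(jll - a_max), axis=1)))
# stable -> [1001.3133...], i.e. 1001 + log(1 + e**-1)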
Example #13
    def _partial_fit(self,
                     X,
                     y,
                     sample_weight=None,
                     _classes=None,
                     convert_dtype=True) -> "MultinomialNB":

        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                as scipy_sparse_isspmatrix

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if scipy_sparse_isspmatrix(X) or cupyx.scipy.sparse.isspmatrix(X):
            X = _convert_x_sparse(X)
            # TODO: Expand this, since the sparse kernel doesn't
            # actually require the scipy sparse container format.
        else:
            X = input_to_cupy_array(
                X, order='K', check_dtype=[cp.float32, cp.float64,
                                           cp.int32]).array

        expected_y_dtype = (cp.int32 if X.dtype in [cp.float32, cp.int32]
                            else cp.int64)
        y = input_to_cupy_array(
            y,
            convert_to_dtype=(expected_y_dtype if convert_dtype else False),
            check_dtype=expected_y_dtype).array

        Y, label_classes = make_monotonic(y, copy=True)

        if not self.fit_called_:
            self.fit_called_ = True
            if _classes is not None:
                _classes, *_ = input_to_cuml_array(
                    _classes,
                    order='K',
                    convert_to_dtype=(expected_y_dtype
                                      if convert_dtype else False))
                check_labels(Y, _classes)
                self.classes_ = _classes
            else:
                self.classes_ = label_classes

            self._n_classes_ = self.classes_.shape[0]
            self._n_features_ = X.shape[1]
            self._init_counters(self._n_classes_, self._n_features_, X.dtype)
        else:
            check_labels(Y, self.classes_)

        if cupyx.scipy.sparse.isspmatrix(X):
            self._count_sparse(X.row, X.col, X.data, X.shape, Y)
        else:
            self._count(X, Y)

        self._update_feature_log_prob(self.alpha)
        self._update_class_log_prior(class_prior=self._class_prior_)

        return self
Example #14
def pairwise_kernels(X,
                     Y=None,
                     metric="linear",
                     *,
                     filter_params=False,
                     convert_dtype=True,
                     **kwds):
    """
    Compute the kernel between arrays X and optional array Y.
    This method takes either a vector array or a kernel matrix, and returns
    a kernel matrix. If the input is a vector array, the kernels are
    computed. If the input is a kernel matrix, it is returned instead.
    This method provides a safe way to take a kernel matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.
    If Y is given (default is None), then the returned matrix is the pairwise
    kernel between the arrays from both X and Y.
    Valid values for metric are: ['additive_chi2', 'chi2', 'linear', 'poly',
    'polynomial', 'rbf', 'laplacian', 'sigmoid', 'cosine']

    Parameters
    ----------
    X : Dense matrix (device or host) of shape (n_samples_X, n_samples_X) or \
            (n_samples_X, n_features)
        Array of pairwise kernels between samples, or a feature array.
        The shape of the array should be (n_samples_X, n_samples_X) if
        metric == "precomputed" and (n_samples_X, n_features) otherwise.
        Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
        ndarray, cuda array interface compliant array like CuPy
    Y : Dense matrix (device or host) of shape (n_samples_Y, n_features), \
        default=None
        A second feature array only if X has shape (n_samples_X, n_features).
        Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
        ndarray, cuda array interface compliant array like CuPy
    metric : str or callable (numba device function), default="linear"
        The metric to use when calculating kernel between instances in a
        feature array.
        If metric is "precomputed", X is assumed to be a kernel matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two rows from X as input and return the corresponding
        kernel value as a single number.
    filter_params : bool, default=False
        Whether to filter invalid parameters or not.
    convert_dtype : bool, optional (default = True)
        When set to True, the method will, when necessary, convert
        Y to be the same data type as X if they differ. This
        will increase memory used for the method.
    **kwds : optional keyword parameters
        Any further parameters are passed directly to the kernel function.

    Returns
    -------
    K : ndarray of shape (n_samples_X, n_samples_X) or \
            (n_samples_X, n_samples_Y)
        A kernel matrix K such that K_{i, j} is the kernel between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then K_{i, j} is the kernel between the ith array
        from X and the jth array from Y.

    Notes
    -----
    If metric is 'precomputed', Y is ignored and X is returned.

    Examples
    --------

    .. code-block:: python

        >>> import cupy as cp
        >>> from cuml.metrics import pairwise_kernels
        >>> from numba import cuda
        >>> import math

        >>> X = cp.array([[2, 3], [3, 5], [5, 8]])
        >>> Y = cp.array([[1, 0], [2, 1]])

        >>> pairwise_kernels(X, Y, metric='linear')
        array([[ 2,  7],
               [ 3, 11],
               [ 5, 18]])
        >>> @cuda.jit(device=True)
        ... def custom_rbf_kernel(x, y, gamma=None):
        ...     if gamma is None:
        ...         gamma = 1.0 / len(x)
        ...     sum = 0.0
        ...     for i in range(len(x)):
        ...         sum += (x[i] - y[i]) ** 2
        ...     return math.exp(-gamma * sum)

        >>> pairwise_kernels(X, Y, metric=custom_rbf_kernel) # doctest: +SKIP
        array([[6.73794700e-03, 1.35335283e-01],
               [5.04347663e-07, 2.03468369e-04],
               [4.24835426e-18, 2.54366565e-13]])
    """
    X = input_to_cupy_array(X).array
    if Y is None:
        Y = X
    else:
        Y = input_to_cupy_array(Y).array
    if X.shape[1] != Y.shape[1]:
        raise ValueError("X and Y have different dimensions.")

    if metric == "precomputed":
        return X

    if metric in PAIRWISE_KERNEL_FUNCTIONS:
        kwds = _filter_params(PAIRWISE_KERNEL_FUNCTIONS[metric], filter_params,
                              **kwds)
        return PAIRWISE_KERNEL_FUNCTIONS[metric](X, Y, **kwds)
    elif isinstance(metric, str):
        raise ValueError("Unknown kernel %r" % metric)
    else:
        kwds = _filter_params(metric, filter_params, **kwds)

        return custom_kernel(X, Y, metric, **kwds)
Example #15
    def partial_fit(self, X, y=None, check_input=True) -> "IncrementalPCA":
        """
        Incremental fit with X. All of X is processed as a single batch.

        Parameters
        ----------

        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        check_input : bool
            Run check_array on X.
        y : Ignored

        Returns
        -------

        self : object
            Returns the instance itself.

        """
        if check_input:
            if scipy.sparse.issparse(X) or cupyx.scipy.sparse.issparse(X):
                raise TypeError(
                    "IncrementalPCA.partial_fit does not support "
                    "sparse input. Either convert data to dense "
                    "or use IncrementalPCA.fit to do so in batches.")

            self._set_output_type(X)

            X, n_samples, n_features, self.dtype = \
                input_to_cupy_array(X, order='K',
                                    check_dtype=[cp.float32, cp.float64])
        else:
            n_samples, n_features = X.shape

        if not hasattr(self, 'components_'):
            self.components_ = None

        if self.n_components is None:
            if self.components_ is None:
                self.n_components_ = min(n_samples, n_features)
            else:
                self.n_components_ = self.components_.shape[0]
        elif not 1 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d, need "
                             "more rows than columns for IncrementalPCA "
                             "processing" % (self.n_components, n_features))
        elif not self.n_components <= n_samples:
            raise ValueError("n_components=%r must be less or equal to "
                             "the batch number of samples "
                             "%d." % (self.n_components, n_samples))
        else:
            self.n_components_ = self.n_components

        if (self.components_ is not None) and (self.components_.shape[0] !=
                                               self.n_components_):
            raise ValueError("Number of input features has changed from %i "
                             "to %i between calls to partial_fit! Try "
                             "setting n_components to a fixed value." %
                             (self.components_.shape[0], self.n_components_))
        # This is the first partial_fit
        if not hasattr(self, 'n_samples_seen_'):
            self.n_samples_seen_ = 0
            self.mean_ = .0
            self.var_ = .0

        # Update stats - they are 0 if this is the first step
        col_mean, col_var, n_total_samples = \
            _incremental_mean_and_var(
                X, last_mean=self.mean_, last_variance=self.var_,
                last_sample_count=cp.repeat(cp.asarray([self.n_samples_seen_]),
                                            X.shape[1]))
        n_total_samples = n_total_samples[0]

        # Whitening
        if self.n_samples_seen_ == 0:
            # If it is the first step, simply whiten X
            X = X - col_mean
        else:
            col_batch_mean = cp.mean(X, axis=0)
            X = X - col_batch_mean
            # Build matrix of combined previous basis and new data
            mean_correction = \
                cp.sqrt((self.n_samples_seen_ * n_samples) /
                        n_total_samples) * (self.mean_ - col_batch_mean)
            X = cp.vstack((self.singular_values_.reshape(
                (-1, 1)) * self.components_, X, mean_correction))

        U, S, V = cp.linalg.svd(X, full_matrices=False)
        U, V = _svd_flip(U, V, u_based_decision=False)
        explained_variance = S**2 / (n_total_samples - 1)
        explained_variance_ratio = S**2 / cp.sum(col_var * n_total_samples)

        self.n_rows = n_total_samples
        self.n_samples_seen_ = n_total_samples
        self.components_ = V[:self.n_components_]
        self.singular_values_ = S[:self.n_components_]
        self.mean_ = col_mean
        self.var_ = col_var
        self.explained_variance_ = explained_variance[:self.n_components_]
        self.explained_variance_ratio_ = \
            explained_variance_ratio[:self.n_components_]
        if self.n_components_ < n_features:
            self.noise_variance_ = \
                explained_variance[self.n_components_:].mean()
        else:
            self.noise_variance_ = 0.

        return self
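
A sketch of what _svd_flip with u_based_decision=False does (hedged: the real
helper may differ in detail): pick signs from the largest-magnitude entry of
each row of V so the SVD output is deterministic across runs.

import cupy as cp

def svd_flip_sketch(U, V):
    # One sign per component, taken from V's dominant entry in that row.
    max_abs_cols = cp.argmax(cp.abs(V), axis=1)
    signs = cp.sign(V[cp.arange(V.shape[0]), max_abs_cols])
    return U * signs, V * signs[:, None]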
Example #16
def precision_recall_curve(
        y_true, probs_pred) -> typing.Tuple[CumlArray, CumlArray, CumlArray]:
    """
    Compute precision-recall pairs for different probability thresholds

    .. note:: this implementation is restricted to the binary classification
        task. The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the
        number of true positives and ``fp`` the number of false positives. The
        precision is intuitively the ability of the classifier not to label as
        positive a sample that is negative.

        The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number
        of true positives and ``fn`` the number of false negatives. The recall
        is intuitively the ability of the classifier to find all the positive
        samples. The last precision and recall values are 1. and 0.
        respectively and do not have a corresponding threshold. This ensures
        that the graph starts on the y axis.

        Read more in the scikit-learn's `User Guide
        <https://scikit-learn.org/stable/modules/model_evaluation.html#precision-recall-f-measure-metrics>`_.


    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True binary labels, {0, 1}.
    probs_pred : array, shape = [n_samples]
        Estimated probabilities or decision function.

    Returns
    -------
    precision : array, shape = [n_thresholds + 1]
        Precision values such that element i is the precision of
        predictions with score >= thresholds[i] and the last element is 1.
    recall : array, shape = [n_thresholds + 1]
        Decreasing recall values such that element i is the recall of
        predictions with score >= thresholds[i] and the last element is 0.
    thresholds : array, shape = [n_thresholds <= len(np.unique(probs_pred))]
        Increasing thresholds on the decision function used to compute
        precision and recall.

    Examples
    --------

    .. code-block:: python

        >>> import cupy as cp
        >>> from cuml.metrics import precision_recall_curve
        >>> y_true = cp.array([0, 0, 1, 1])
        >>> y_scores = cp.array([0.1, 0.4, 0.35, 0.8])
        >>> precision, recall, thresholds = precision_recall_curve(
        ...     y_true, y_scores)
        >>> print(precision)
        [0.666... 0.5  1.  1. ]
        >>> print(recall)
        [1. 0.5 0.5 0. ]
        >>> print(thresholds)
        [0.35 0.4 0.8 ]

    """
    y_true, n_rows, n_cols, ytype = \
        input_to_cupy_array(y_true, check_dtype=[np.int32, np.int64,
                                                 np.float32, np.float64])

    y_score, _, _, _ = \
        input_to_cupy_array(probs_pred, check_dtype=[np.int32, np.int64,
                                                     np.float32, np.float64],
                            check_rows=n_rows, check_cols=n_cols)

    if not cp.any(y_true):
        raise ValueError("precision_recall_curve cannot be used when "
                         "y_true is all zero.")

    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
    precision = cp.flip(tps / (tps + fps), axis=0)
    recall = cp.flip(tps / tps[-1], axis=0)
    n = (recall == 1).sum()

    if n > 1:
        precision = precision[n - 1:]
        recall = recall[n - 1:]
        thresholds = thresholds[n - 1:]
    precision = cp.concatenate([precision, cp.ones(1)])
    recall = cp.concatenate([recall, cp.zeros(1)])

    return precision, recall, thresholds
Example #17
def check_array(array,
                accept_sparse=False,
                accept_large_sparse=True,
                dtype='numeric',
                order=None,
                copy=False,
                force_all_finite=True,
                ensure_2d=True,
                allow_nd=False,
                ensure_min_samples=1,
                ensure_min_features=1,
                warn_on_dtype=None,
                estimator=None):
    """Input validation on an array, list, sparse matrix or similar.
    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.
    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.
    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.
    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.
    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.
    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.
    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:
        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.
    ensure_2d : boolean (default=True)
        Whether to raise a value error if array is not 2D.
    allow_nd : boolean (default=False)
        Whether to allow array.ndim > 2.
    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.
    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.
    estimator : unused parameter

    Returns
    -------
    array_converted : object
        The converted and validated array.
    """

    if dtype == 'numeric':
        dtype = numeric_types

    correct_dtype = check_dtype(array, dtype)

    if copy and not order and hasattr(array, 'flags'):
        if array.flags['F_CONTIGUOUS']:
            order = 'F'
        elif array.flags['C_CONTIGUOUS']:
            order = 'C'

    if not order:
        order = 'F'

    hasshape = hasattr(array, 'shape')
    if ensure_2d and hasshape:
        if len(array.shape) != 2:
            raise ValueError("Not 2D")

    if not allow_nd and hasshape:
        if len(array.shape) > 2:
            raise ValueError("More than 2 dimensions detected")

    if ensure_min_samples > 0 and hasshape:
        if array.shape[0] < ensure_min_samples:
            raise ValueError("Not enough samples")

    if ensure_min_features > 0 and hasshape and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required." %
                             (n_features, array.shape, ensure_min_features))

    is_sparse = cpu_sparse.issparse(array) or gpu_sparse.issparse(array)
    if is_sparse:
        check_sparse(array, accept_sparse, accept_large_sparse)
        if array.format == 'csr':
            new_array = gpu_csr_matrix(array, copy=copy)
        elif array.format == 'csc':
            new_array = gpu_csc_matrix(array, copy=copy)
        elif array.format == 'coo':
            new_array = gpu_coo_matrix(array, copy=copy)
        else:
            raise ValueError('Sparse matrix format not supported')
        check_finite(new_array.data, force_all_finite)
        if correct_dtype != new_array.dtype:
            new_array = new_array.astype(correct_dtype)
        return new_array
    else:
        X, n_rows, n_cols, dtype = input_to_cupy_array(array,
                                                       order=order,
                                                       deepcopy=copy,
                                                       fail_on_null=False)
        if correct_dtype != dtype:
            X = X.astype(correct_dtype)
        check_finite(X, force_all_finite)
        return X
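
Usage sketch (check_array assumed in scope): dense input comes back as a
validated CuPy array; shape violations raise ValueError.

import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0]])
X_checked = check_array(X)          # 2D, finite: returned as a CuPy array

try:
    check_array(np.array([1.0, 2.0]))   # 1D input
except ValueError as err:
    print(err)                          # "Not 2D"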
Example #18
File: base.py, Project: jpurviance/cuml
    def __init__(self,
                 *,
                 model,
                 background,
                 order=None,
                 order_default='C',
                 link='identity',
                 verbose=False,
                 random_state=None,
                 is_gpu_model=None,
                 handle=None,
                 dtype=None,
                 output_type=None):

        if verbose is True:
            self.verbose = logger.level_debug
        elif verbose is False:
            self.verbose = logger.level_error
        else:
            self.verbose = verbose

        if handle is None:
            self.handle = get_handle_from_cuml_model_func(model,
                                                          create_new=True)
        else:
            self.handle = handle

        if order is None:
            self.order = get_tag_from_model_func(func=model,
                                                 tag='preferred_input_order',
                                                 default=order_default)
        else:
            self.order = order

        self.link = link
        self.link_fn = get_link_fn_from_str_or_fn(link)
        self.model = model
        if is_gpu_model is None:
            # todo (dgd): when sparse support is added, use this tag to see if
            # model can accept sparse data
            self.is_gpu_model = \
                get_tag_from_model_func(func=model,
                                        tag='X_types_gpu',
                                        default=None) is not None
        else:
            self.is_gpu_model = is_gpu_model

        # we are defaulting to numpy for now for compatibility
        if output_type is None:
            # self.output_type = 'cupy' if self.is_gpu_model else 'numpy'
            self.output_type = 'numpy'
        else:
            self.output_type = output_type

        # if dtype is not specified, we try to get it from the model
        if dtype is None:
            self.dtype = get_dtype_from_model_func(func=model,
                                                   default=np.float32)
        else:
            if dtype in [np.float32, np.float64]:
                self.dtype = np.dtype(dtype)
            else:
                raise ValueError(
                    "dtype must be either np.float32 or np.float64")

        self.background, self.N, self.M, _ = \
            input_to_cupy_array(background, order=self.order,
                                convert_to_dtype=self.dtype)

        self.random_state = random_state

        if isinstance(background, (pandas.DataFrame, cudf.DataFrame)):
            self.feature_names = background.columns.to_list()
        else:
            self.feature_names = [None for _ in range(len(background))]

        # evaluate the model in background to get the expected_value
        self.expected_value = self.link_fn(
            cp.mean(
                model_func_call(X=self.background,
                                model_func=self.model,
                                gpu_model=self.is_gpu_model),
                axis=0
            )
        )

        # D tells us the dimension of the model. For example, `predict_proba`
        # functions typically return n values for n classes, as opposed to
        # a single value for a typical `predict`.
        if len(self.expected_value.shape) == 0:
            self.D = 1
        else:
            self.D = self.expected_value.shape[0]
Example #19
def log_loss(y_true,
             y_pred,
             eps=1e-15,
             normalize=True,
             sample_weight=None) -> float:
    """ Log loss, aka logistic loss or cross-entropy loss.
    This is the loss function used in (multinomial) logistic regression
    and extensions of it such as neural networks, defined as the negative
    log-likelihood of a logistic model that returns ``y_pred`` probabilities
    for its training data ``y_true``.
    The log loss is only defined for two or more labels.

    Parameters
    ----------
    y_true : array-like, shape = (n_samples,)
    y_pred : array-like of float,
        shape = (n_samples, n_classes) or (n_samples,)
    eps : float (default=1e-15)
        Log loss is undefined for p=0 or p=1, so probabilities are
        clipped to max(eps, min(1 - eps, p)).
    normalize : bool, optional (default=True)
        If true, return the mean loss per sample.
        Otherwise, return the sum of the per-sample losses.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    loss : float

    Examples
    --------
    .. code-block:: python

        >>> from cuml.metrics import log_loss
        >>> import cupy as cp
        >>> log_loss(cp.array([1, 0, 0, 1]),
        ...          cp.array([[.1, .9], [.9, .1], [.8, .2], [.35, .65]]))
        0.21616...

    References
    ----------
    C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
    p. 209.

    Notes
    -----
    The logarithm used is the natural logarithm (base-e).

    """
    y_true, n_rows, n_cols, ytype = \
        input_to_cupy_array(y_true, check_dtype=[np.int32, np.int64,
                                                 np.float32, np.float64])

    if y_true.dtype.kind == 'f' and np.any(y_true != y_true.astype(int)):
        raise ValueError("'y_true' can only have integer values")
    if y_true.min() < 0:
        raise ValueError("'y_true' cannot have negative values")

    y_pred, _, _, _ = \
        input_to_cupy_array(y_pred, check_dtype=[np.int32, np.int64,
                                                 np.float32, np.float64],
                            check_rows=n_rows)

    y_true_max = y_true.max()
    if (y_pred.ndim == 1 and y_true_max > 1) \
       or (y_pred.ndim > 1 and y_pred.shape[1] <= y_true_max):
        raise ValueError("The shape of y_pred doesn't "
                         "match the number of classes")

    y_true = y_true.astype('int32')
    y_pred = cp.clip(y_pred, eps, 1 - eps)
    if y_pred.ndim == 1:
        y_pred = cp.expand_dims(y_pred, axis=1)
    if y_pred.shape[1] == 1:
        y_pred = cp.hstack([1 - y_pred, y_pred])

    y_pred /= cp.sum(y_pred, axis=1, keepdims=True)
    loss = -cp.log(y_pred)[cp.arange(y_pred.shape[0]), y_true]
    return _weighted_sum(loss, sample_weight, normalize).item()
Example #20
def confusion_matrix(y_true,
                     y_pred,
                     labels=None,
                     sample_weight=None,
                     normalize=None) -> CumlArray:
    """Compute confusion matrix to evaluate the accuracy of a classification.

    Parameters
    ----------
    y_true : array-like (device or host) shape = (n_samples,)
        or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like (device or host) shape = (n_samples,)
        or (n_samples, n_outputs)
        Estimated target values.
    labels : array-like (device or host) shape = (n_classes,), optional
        List of labels to index the matrix. This may be used to reorder or
        select a subset of labels. If None is given, those that appear at least
        once in y_true or y_pred are used in sorted order.
    sample_weight : array-like (device or host) shape = (n_samples,), optional
        Sample weights.
    normalize : string in ['true', 'pred', 'all'] or None (default=None)
        Normalizes confusion matrix over the true (rows), predicted (columns)
        conditions or all the population. If None, confusion matrix will not
        be normalized.

    Returns
    -------
    C : array-like (device or host) shape = (n_classes, n_classes)
        Confusion matrix.
    """
    y_true, n_rows, n_cols, dtype = \
        input_to_cuml_array(y_true, check_dtype=[cp.int32, cp.int64])

    y_pred, _, _, _ = \
        input_to_cuml_array(y_pred, check_dtype=dtype,
                            check_rows=n_rows, check_cols=n_cols)

    if labels is None:
        labels = sorted_unique_labels(y_true, y_pred)
        n_labels = len(labels)
    else:
        labels, n_labels, _, _ = \
            input_to_cupy_array(labels, check_dtype=dtype, check_cols=1)
    if sample_weight is None:
        sample_weight = cp.ones(n_rows, dtype=dtype)
    else:
        sample_weight, _, _, _ = \
            input_to_cupy_array(sample_weight,
                                check_dtype=[cp.float32, cp.float64,
                                             cp.int32, cp.int64],
                                check_rows=n_rows, check_cols=n_cols)

    if normalize not in ['true', 'pred', 'all', None]:
        msg = "normalize must be one of " \
              f"{{'true', 'pred', 'all', None}}, got {normalize}."
        raise ValueError(msg)

    with using_output_type("cupy"):
        y_true, _ = make_monotonic(y_true, labels, copy=True)
        y_pred, _ = make_monotonic(y_pred, labels, copy=True)

    # intersect y_pred, y_true with labels, eliminate items not in labels
    ind = cp.logical_and(y_pred < n_labels, y_true < n_labels)
    y_pred = y_pred[ind]
    y_true = y_true[ind]
    sample_weight = sample_weight[ind]

    cm = cupyx.scipy.sparse.coo_matrix((sample_weight, (y_true, y_pred)),
                                       shape=(n_labels, n_labels),
                                       dtype=np.float64).toarray()

    # Choose the accumulator dtype to always have high precision
    if sample_weight.dtype.kind in {'i', 'u', 'b'}:
        cm = cm.astype(np.int64)

    with np.errstate(all='ignore'):
        if normalize == 'true':
            cm = cp.divide(cm, cm.sum(axis=1, keepdims=True))
        elif normalize == 'pred':
            cm = cp.divide(cm, cm.sum(axis=0, keepdims=True))
        elif normalize == 'all':
            cm = cp.divide(cm, cm.sum())
        cm = cp.nan_to_num(cm)

    return cm
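
Usage sketch, assuming the public entry point cuml.metrics.confusion_matrix:

import cupy as cp
from cuml.metrics import confusion_matrix

y_true = cp.array([0, 1, 1, 2], dtype=cp.int32)
y_pred = cp.array([0, 2, 1, 2], dtype=cp.int32)
confusion_matrix(y_true, y_pred)
# array([[1, 0, 0],
#        [0, 1, 1],
#        [0, 0, 1]])
confusion_matrix(y_true, y_pred, normalize='true')   # rows sum to 1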