Example #1
def test_dtype_check(dtype, check_dtype, input_type, order):

    if (dtype == np.float16 or check_dtype == np.float16)\
            and input_type != 'numpy':
        pytest.xfail("float16 not yet supported by numba/cuDF")

    if dtype in [np.uint8, np.uint16, np.uint32, np.uint64]:
        if input_type in ['cudf', 'pandas']:
            pytest.xfail("unsigned int types not yet supported")

    input_data, real_data = get_input(input_type, 10, 10, dtype, order=order)

    if input_type == 'cupy' and input_data is None:
        pytest.skip('cupy not installed')

    if dtype == check_dtype:
        _, _, _, got_dtype = \
            input_to_cuml_array(input_data, check_dtype=check_dtype,
                                order=order)
        assert got_dtype == check_dtype
    else:
        with pytest.raises(TypeError):
            _, _, _, got_dtype = \
                input_to_cuml_array(input_data, check_dtype=check_dtype,
                                    order=order)
Example #2
def test_svm_skl_cmp_predict_proba(in_type, n_rows=10000, n_cols=20):
    params = {
        'kernel': 'rbf',
        'C': 1,
        'tol': 1e-3,
        'gamma': 'scale',
        'probability': True
    }
    X, y = make_classification(n_samples=n_rows,
                               n_features=n_cols,
                               n_informative=2,
                               n_redundant=10,
                               random_state=137)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.8,
                                                        random_state=42)

    X_m = input_to_cuml_array(X_train).array
    y_m = input_to_cuml_array(y_train).array

    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X_m.to_output(in_type), y_m.to_output(in_type))
    sklSVC = svm.SVC(**params)
    sklSVC.fit(X_train, y_train)
    compare_probabilistic_svm(cuSVC, sklSVC, X_test, y_test, 1e-3, 1e-2)
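
Note that this example consumes the result of input_to_cuml_array by attribute (.array), while Example #9 unpacks it as a 4-tuple; together they imply a namedtuple-like return with fields (array, n_rows, n_cols, dtype). A minimal sketch, assuming the import path below:

import numpy as np
from cuml.common import input_to_cuml_array  # import path may vary by release

res = input_to_cuml_array(np.zeros((4, 2), dtype=np.float32))
X_m = res.array                    # the CumlArray itself, as used above
arr, n_rows, n_cols, dtype = res   # equivalent positional unpacking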
Example #3
def test_convert_matrix_order_cuml_array(dtype, input_type, from_order,
                                         to_order):
    input_data, real_data = get_input(input_type,
                                      10,
                                      10,
                                      dtype,
                                      order=from_order)

    # conv_data = np.array(real_data, order=to_order, copy=True)
    if from_order == to_order or to_order == 'K':
        conv_data, *_ = input_to_cuml_array(input_data,
                                            fail_on_order=False,
                                            order=to_order)
    else:
        # A UserWarning is raised for numpy, cupy and numba inputs;
        # cudf and pandas inputs are reordered by their own libraries,
        # so no warning is raised for them
        if input_type in ['numpy', 'cupy', 'numba']:
            with pytest.warns(UserWarning):
                conv_data, *_ = input_to_cuml_array(input_data,
                                                    fail_on_order=False,
                                                    order=to_order)
        else:
            conv_data, *_ = input_to_cuml_array(input_data,
                                                fail_on_order=False,
                                                order=to_order)

    if to_order == 'K':
        if input_type in ['cudf', 'pandas']:
            assert conv_data.order == 'F'
        else:
            assert conv_data.order == from_order
    else:
        assert conv_data.order == to_order
    np.testing.assert_equal(real_data, conv_data.to_output('numpy'))
Example #4
def test_fail_on_order(dtype, input_type, order, order_check):
    # This is only tested for inputs that cuML reorders itself;
    # cudf and pandas inputs are reordered by their own libraries
    input_data, real_data = get_input(input_type, 10, 10, dtype, order=order)

    if input_type == 'cupy' and input_data is None:
        pytest.skip('cupy not installed')

    if order == order_check:
        input_to_cuml_array(input_data, fail_on_order=False, order=order)
    else:
        with pytest.raises(ValueError):
            input_to_cuml_array(input_data, fail_on_order=True,
                                order=order_check)
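
Combined with Example #3, the two order-handling modes can be reproduced directly; a minimal sketch, assuming cupy is installed and the import path below:

import cupy as cp
import pytest
from cuml.common import input_to_cuml_array  # import path may vary by release

data = cp.zeros((10, 10), dtype=cp.float32, order='C')

# fail_on_order=True raises instead of converting ...
with pytest.raises(ValueError):
    input_to_cuml_array(data, fail_on_order=True, order='F')

# ... while fail_on_order=False converts, warning for numpy/cupy/numba input
with pytest.warns(UserWarning):
    converted, *_ = input_to_cuml_array(data, fail_on_order=False, order='F')
assert converted.order == 'F'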
Example #5
def to_output_type(array, output_type, order='F'):
    """Used to convert arrays while creating datasets
    for testing.

    Parameters
    ----------
    array : array
        Input array to convert
    output_type : string
        Type to convert to
    order : string (default 'F')
        Memory order used for the dense conversion path

    Returns
    -------
    Converted array
    """
    if output_type == 'scipy_csr':
        return cpu_sparse.csr_matrix(array.get())
    if output_type == 'scipy_csc':
        return cpu_sparse.csc_matrix(array.get())
    if output_type == 'scipy_coo':
        return cpu_sparse.coo_matrix(array.get())
    if output_type == 'cupy_csr':
        if array.format in ['csc', 'coo']:
            return array.tocsr()
        else:
            return array
    if output_type == 'cupy_csc':
        if array.format in ['csr', 'coo']:
            return array.tocsc()
        else:
            return array
    if output_type == 'cupy_coo':
        if array.format in ['csr', 'csc']:
            return array.tocoo()
        else:
            return array

    if cpu_sparse.issparse(array):
        if output_type == 'numpy':
            return array.todense()
        elif output_type == 'cupy':
            return cp.array(array.todense())
        else:
            array = array.todense()
    elif gpu_sparse.issparse(array):
        if output_type == 'numpy':
            return cp.asnumpy(array.todense())
        elif output_type == 'cupy':
            return array.todense()
        else:
            array = array.todense()

    cuml_array = input_to_cuml_array(array, order=order)[0]
    if output_type == 'series' and len(array.shape) > 1:
        output_type = 'cudf'

    return cuml_array.to_output(output_type)
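
A short usage sketch for this helper, assuming a GPU CSR matrix as input; the conversions follow the branches above:

import cupy as cp
import cupyx.scipy.sparse as gpu_sparse

gpu_csr = gpu_sparse.csr_matrix(cp.eye(3, dtype=cp.float32))

host_csc = to_output_type(gpu_csr, 'scipy_csc')  # GPU CSR -> host CSC
dense_np = to_output_type(gpu_csr, 'numpy')      # densified host copy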
Example #6
    def fit(self, X, y=None):
        """
        Fit the model with X, using minibatches of size batch_size.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        y : Ignored

        Returns
        -------

        self : object
            Returns the instance itself.

        """

        self._set_base_attributes(output_type=X)

        self.n_samples_seen_ = 0
        self._mean_ = .0
        self.var_ = .0

        if scipy.sparse.issparse(X) or cupyx.scipy.sparse.issparse(X):
            X = _validate_sparse_input(X)
        else:
            X, n_samples, n_features, self.dtype = \
                input_to_cuml_array(X, order='K',
                                    check_dtype=[cp.float32, cp.float64])

            # NOTE: While we cast the input to a cupy array here, we still
            # respect the `output_type` parameter in the constructor. This
            # is done by PCA, which IncrementalPCA inherits from. PCA's
            # transform and inverse transform convert the output to the
            # required type.
            X = X.to_output(output_type='cupy')

        n_samples, n_features = X.shape

        if self.batch_size is None:
            self.batch_size_ = 5 * n_features
        else:
            self.batch_size_ = self.batch_size

        for batch in _gen_batches(n_samples,
                                  self.batch_size_,
                                  min_batch_size=self.n_components or 0):
            X_batch = X[batch]
            if cupyx.scipy.sparse.issparse(X_batch):
                X_batch = X_batch.toarray()

            self.partial_fit(X_batch, check_input=False)

        return self
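
A minimal usage sketch for this fit method; the import path is an assumption (IncrementalPCA has moved between modules across cuML releases):

import cupy as cp
from cuml.decomposition import IncrementalPCA  # import path may vary

X = cp.random.rand(1000, 20, dtype=cp.float64)
ipca = IncrementalPCA(n_components=5, batch_size=200)
ipca.fit(X)                  # processed in minibatches of batch_size
X_red = ipca.transform(X)    # shape (1000, 5)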
Example #7
def test_convert_vector_order_cuml_array(dtype, input_type, shape, from_order,
                                         to_order):
    input_data, real_data = get_input(input_type, shape[0], shape[1], dtype,
                                      order=from_order)

    # conv_data = np.array(real_data, order=to_order, copy=True)
    conv_data, *_ = input_to_cuml_array(input_data, fail_on_order=False,
                                        order=to_order)

    np.testing.assert_equal(real_data, conv_data.to_output('numpy'))
Example #8
def roc_auc_score(y_true, y_score):
    """
    Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    from prediction scores.

    .. note:: this implementation can only be used with binary classification.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels. The binary case
        expects labels with shape (n_samples,)
    y_score : array-like of shape (n_samples,)
        Target scores. In the binary case, these can be either
        probability estimates or non-thresholded decision values (as returned
        by `decision_function` on some classifiers). The binary
        case expects a shape (n_samples,), and the scores must be the scores
        of the class with the greater label.

    Returns
    -------
    auc : float

    Examples
    --------
    >>> import numpy as np
    >>> from cuml.metrics import roc_auc_score
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> print(roc_auc_score(y_true, y_scores))
    0.75

    """
    y_true, n_rows, n_cols, ytype = \
        input_to_cuml_array(y_true, check_dtype=[np.int32, np.int64,
                                                 np.float32, np.float64])

    y_score, _, _, _ = \
        input_to_cuml_array(y_score, check_dtype=[np.int32, np.int64,
                                                  np.float32, np.float64],
                            check_rows=n_rows, check_cols=n_cols)
    return _binary_roc_auc_score(y_true, y_score)
Example #9
def test_input_to_cuml_array(dtype, input_type, num_rows, num_cols, order):
    input_data, real_data = get_input(input_type, num_rows, num_cols,
                                      dtype, order=order)

    if input_type == 'cupy' and input_data is None:
        pytest.skip('cupy not installed')

    X, n_rows, n_cols, res_dtype = input_to_cuml_array(input_data,
                                                       order=order)

    np.testing.assert_equal(X.to_output('numpy'), real_data)

    assert n_rows == num_rows == X.shape[0] == len(X)
    assert n_cols == num_cols == X.shape[1]
    assert dtype == res_dtype == X.dtype

    del input_data
    del real_data
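
The invariants this test asserts hold for any accepted input type; a minimal sketch with a plain numpy array, assuming the import path below:

import numpy as np
from cuml.common import input_to_cuml_array  # import path may vary by release

data = np.random.rand(10, 4).astype(np.float32)  # C-order after astype
X, n_rows, n_cols, dtype = input_to_cuml_array(data, order='C')

assert (n_rows, n_cols) == X.shape == (10, 4)
assert len(X) == n_rows and dtype == np.float32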
Example #10
    def transform(self, X, convert_dtype=False):
        """
        Apply dimensionality reduction to X.

        X is projected on the first principal components previously extracted
        from a training set, using minibatches of size batch_size if X is
        sparse.

        Parameters
        ----------

        X : array-like or sparse matrix, shape (n_samples, n_features)
            New data, where n_samples is the number of samples
            and n_features is the number of features.

        convert_dtype : bool, optional (default = False)
            When set to True, the transform method will automatically
            convert the input to the data type which was used to train the
            model. This will increase memory used for the method.

        Returns
        -------

        X_new : array-like, shape (n_samples, n_components)

        """

        if scipy.sparse.issparse(X) or cupyx.scipy.sparse.issparse(X):
            out_type = self._get_output_type(X)

            X = _validate_sparse_input(X)

            n_samples = X.shape[0]
            output = []
            for batch in _gen_batches(n_samples,
                                      self.batch_size_,
                                      min_batch_size=self.n_components or 0):
                output.append(super().transform(X[batch]))
            output, _, _, _ = \
                input_to_cuml_array(cp.vstack(output), order='K')

            return output.to_output(out_type)
        else:
            return super().transform(X)
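
A sketch of the sparse path above, which transforms in minibatches; the import path and the use of cupyx's random sparse generator are assumptions:

import cupy as cp
import cupyx.scipy.sparse as gpu_sparse
from cuml.decomposition import IncrementalPCA  # import path may vary

X_sp = gpu_sparse.random(1000, 20, density=0.1, format='csr',
                         dtype=cp.float64)
ipca = IncrementalPCA(n_components=5, batch_size=200)
ipca.fit(X_sp)                # fit also accepts sparse input (Example #6)
X_new = ipca.transform(X_sp)  # batched transform, dense (1000, 5) output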
Example #11
File: adapters.py Project: shchojj/cuml
def to_output_type(array, output_type, order='F'):
    if output_type == 'scipy_csr':
        return cpu_sparse.csr_matrix(array.get())
    if output_type == 'scipy_csc':
        return cpu_sparse.csc_matrix(array.get())
    if output_type == 'scipy_coo':
        return cpu_sparse.coo_matrix(array.get())
    if output_type == 'cupy_csr':
        if array.format in ['csc', 'coo']:
            return array.tocsr()
        else:
            return array
    if output_type == 'cupy_csc':
        if array.format in ['csr', 'coo']:
            return array.tocsc()
        else:
            return array
    if output_type == 'cupy_coo':
        if array.format in ['csr', 'csc']:
            return array.tocoo()
        else:
            return array

    if cpu_sparse.issparse(array):
        if output_type == 'numpy':
            return array.todense()
        elif output_type == 'cupy':
            return cp.array(array.todense())
        else:
            array = array.todense()
    elif gpu_sparse.issparse(array):
        if output_type == 'numpy':
            return cp.asnumpy(array.todense())
        elif output_type == 'cupy':
            return array.todense()
        else:
            array = array.todense()

    cuml_array = input_to_cuml_array(array, order=order)[0]
    if output_type == 'series' and len(array.shape) > 1:
        output_type = 'cudf'

    return cuml_array.to_output(output_type)
Example #12
def test_non_contiguous_to_contiguous_input(dtype, input_type, order,
                                            contiguous, force_contiguous):
    input_data, real_data = get_input(input_type, 10, 8, dtype, order=order)

    if not contiguous:
        if order == 'F':
            data_view = input_data[:-3]
            real_data = real_data[:-3]
        else:
            data_view = input_data[:, :-3]
            real_data = real_data[:, :-3]

    else:
        data_view = input_data

    cumlary, *_ = input_to_cuml_array(data_view,
                                      force_contiguous=force_contiguous)

    if force_contiguous:
        assert (_check_array_contiguity(cumlary))

    np.testing.assert_equal(real_data, cumlary.to_output('numpy'))
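
The non-contiguous case in this test can be reproduced with a strided numpy view; a minimal sketch, assuming the import path below:

import numpy as np
from cuml.common import input_to_cuml_array  # import path may vary by release

base = np.zeros((10, 8), dtype=np.float32, order='C')
view = base[:, :-3]   # column slice of a C-order array: non-contiguous

arr, *_ = input_to_cuml_array(view, force_contiguous=True)
# force_contiguous=True copies the view into contiguous memory
np.testing.assert_equal(view, arr.to_output('numpy'))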
Example #13
def test_indexed_inputs(input_type, num_rows, num_cols, order):
    if num_cols == 1:
        input_type += '-series'

    index = np.arange(num_rows, 2 * num_rows)

    input_data, real_data = get_input(input_type,
                                      num_rows,
                                      num_cols,
                                      np.float32,
                                      index=index)

    X, n_rows, n_cols, res_dtype = input_to_cuml_array(input_data, order=order)

    # testing the index in the cuml array
    np.testing.assert_equal(X.index.to_numpy(), index)

    # testing the index in the converted outputs
    cudf_output = X.to_output('cudf')
    np.testing.assert_equal(cudf_output.index.to_numpy(), index)

    pandas_output = X.to_output('pandas')
    np.testing.assert_equal(pandas_output.index.to_numpy(), index)
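
The index round-trip under test can be seen directly with a pandas input; a minimal sketch, assuming pandas inputs carry their index through as the parametrization above suggests, and assuming the import path below:

import numpy as np
import pandas as pd
from cuml.common import input_to_cuml_array  # import path may vary by release

df = pd.DataFrame(np.zeros((4, 2), dtype=np.float32),
                  index=[10, 11, 12, 13])

X, *_ = input_to_cuml_array(df, order='F')
np.testing.assert_equal(X.index.to_numpy(), df.index.to_numpy())
np.testing.assert_equal(X.to_output('pandas').index.to_numpy(),
                        df.index.to_numpy())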
Example #14
def check_array(array,
                accept_sparse=False,
                accept_large_sparse=True,
                dtype='numeric',
                order=None,
                copy=False,
                force_all_finite=True,
                ensure_2d=True,
                allow_nd=False,
                ensure_min_samples=1,
                ensure_min_features=1,
                warn_on_dtype=None,
                estimator=None):
    """Input validation on an array, list, sparse matrix or similar.
    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.
    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.
    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.
    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion to the first type is
        performed only if the dtype of the input is not in the list.
    order : 'F', 'C' or None (default=None)
        Whether the array will be forced to be Fortran- or C-ordered.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.
    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.
    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:
        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.
    ensure_2d : boolean (default=True)
        Whether to raise a value error if array is not 2D.
    allow_nd : boolean (default=False)
        Whether to allow array.ndim > 2.
    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.
    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.
    warn_on_dtype : unused parameter
    estimator : unused parameter

    Returns
    -------
    array_converted : object
        The converted and validated array.
    """

    if dtype == 'numeric':
        dtype = numeric_types

    correct_dtype = check_dtype(array, dtype)

    if copy and not order and hasattr(array, 'flags'):
        if array.flags['F_CONTIGUOUS']:
            order = 'F'
        elif array.flags['C_CONTIGUOUS']:
            order = 'C'

    if not order:
        order = 'F'

    hasshape = hasattr(array, 'shape')
    if ensure_2d and hasshape:
        if len(array.shape) != 2:
            raise ValueError("Not 2D")

    if not allow_nd and hasshape:
        if len(array.shape) > 2:
            raise ValueError("More than 2 dimensions detected")

    if ensure_min_samples > 0 and hasshape:
        if array.shape[0] < ensure_min_samples:
            raise ValueError("Not enough samples")

    if ensure_min_features > 0 and hasshape and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required." %
                             (n_features, array.shape, ensure_min_features))

    is_sparse = cpu_sparse.issparse(array) or gpu_sparse.issparse(array)
    if is_sparse:
        check_sparse(array, accept_sparse, accept_large_sparse)
        if array.format == 'csr':
            new_array = gpu_csr_matrix(array, copy=copy)
        elif array.format == 'csc':
            new_array = gpu_csc_matrix(array, copy=copy)
        elif array.format == 'coo':
            new_array = gpu_coo_matrix(array, copy=copy)
        else:
            raise ValueError('Sparse matrix format not supported')
        check_finite(new_array.data, force_all_finite)
        if correct_dtype != new_array.dtype:
            new_array = new_array.astype(correct_dtype)
        return new_array
    else:
        X, n_rows, n_cols, dtype = input_to_cuml_array(array,
                                                       order=order,
                                                       deepcopy=copy)
        X = X.to_output('cupy')
        if correct_dtype != dtype:
            X = X.astype(correct_dtype)
        check_finite(X, force_all_finite)
        return X
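
A short usage sketch for this validator; the failure cases mirror the checks in the body above:

import numpy as np
import pytest

X = check_array(np.ones((5, 3)), dtype='numeric', order='F')  # cupy output

with pytest.raises(ValueError):
    check_array(np.ones(5))          # 1D input rejected (ensure_2d=True)
with pytest.raises(ValueError):
    check_array(np.ones((5, 3)), ensure_min_samples=10)  # too few rows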
Example #15
    def partial_fit(self, X, y=None, check_input=True):
        """
        Incremental fit with X. All of X is processed as a single batch.

        Parameters
        ----------

        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        check_input : bool
            Run check_array on X.
        y : Ignored

        Returns
        -------

        self : object
            Returns the instance itself.

        """
        if check_input:
            if scipy.sparse.issparse(X) or cupyx.scipy.sparse.issparse(X):
                raise TypeError(
                    "IncrementalPCA.partial_fit does not support "
                    "sparse input. Either convert data to dense "
                    "or use IncrementalPCA.fit to do so in batches.")

            self._set_output_type(X)

            X, n_samples, n_features, self.dtype = \
                input_to_cuml_array(X, order='K',
                                    check_dtype=[cp.float32, cp.float64])
            X = X.to_output(output_type='cupy')
        else:
            n_samples, n_features = X.shape

        if not hasattr(self, '_components_'):
            self._components_ = None

        if self.n_components is None:
            if self._components_ is None:
                self.n_components_ = min(n_samples, n_features)
            else:
                self.n_components_ = self._components_.shape[0]
        elif not 1 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d, need "
                             "more rows than columns for IncrementalPCA "
                             "processing" % (self.n_components, n_features))
        elif not self.n_components <= n_samples:
            raise ValueError("n_components=%r must be less or equal to "
                             "the batch number of samples "
                             "%d." % (self.n_components, n_samples))
        else:
            self.n_components_ = self.n_components

        if (self._components_ is not None) and (self._components_.shape[0] !=
                                                self.n_components_):
            raise ValueError("Number of input features has changed from %i "
                             "to %i between calls to partial_fit! Try "
                             "setting n_components to a fixed value." %
                             (self._components_.shape[0], self.n_components_))

        if not self._cupy_attributes:
            self._cumlarray_to_cupy_attrs()
            self._cupy_attributes = True

        # This is the first partial_fit
        if not hasattr(self, 'n_samples_seen_'):
            self.n_samples_seen_ = 0
            self._mean_ = .0
            self.var_ = .0

        # Update stats - they are 0 if this is the first step
        col_mean, col_var, n_total_samples = \
            _incremental_mean_and_var(
                X, last_mean=self._mean_, last_variance=self.var_,
                last_sample_count=cp.repeat(cp.asarray([self.n_samples_seen_]),
                                            X.shape[1]))
        n_total_samples = n_total_samples[0]

        # Whitening
        if self.n_samples_seen_ == 0:
            # If it is the first step, simply whiten X
            X = X - col_mean
        else:
            col_batch_mean = cp.mean(X, axis=0)
            X = X - col_batch_mean
            # Build matrix of combined previous basis and new data
            mean_correction = \
                cp.sqrt((self.n_samples_seen_ * n_samples) /
                        n_total_samples) * (self._mean_ - col_batch_mean)
            X = cp.vstack((self._singular_values_.reshape(
                (-1, 1)) * self._components_, X, mean_correction))

        U, S, V = cp.linalg.svd(X, full_matrices=False)
        U, V = _svd_flip(U, V, u_based_decision=False)
        explained_variance = S**2 / (n_total_samples - 1)
        explained_variance_ratio = S**2 / cp.sum(col_var * n_total_samples)

        self.n_samples_seen_ = n_total_samples
        self._components_ = V[:self.n_components_]
        self._singular_values_ = S[:self.n_components_]
        self._mean_ = col_mean
        self.var_ = col_var
        self._explained_variance_ = explained_variance[:self.n_components_]
        self._explained_variance_ratio_ = \
            explained_variance_ratio[:self.n_components_]
        if self.n_components_ < n_features:
            self._noise_variance_ = \
                explained_variance[self.n_components_:].mean()
        else:
            self._noise_variance_ = 0.

        if self._cupy_attributes:
            self._cupy_to_cumlarray_attrs()
            self._cupy_attributes = False

        return self
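
A minimal sketch of driving partial_fit manually, one batch at a time (fit, in Example #6, automates exactly this loop); the import path is an assumption:

import cupy as cp
from cuml.decomposition import IncrementalPCA  # import path may vary

X = cp.random.rand(1000, 20, dtype=cp.float64)
ipca = IncrementalPCA(n_components=5)
for start in range(0, X.shape[0], 200):
    ipca.partial_fit(X[start:start + 200])  # each slice is one batch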
Example #16
def confusion_matrix(y_true,
                     y_pred,
                     labels=None,
                     sample_weight=None,
                     normalize=None) -> CumlArray:
    """Compute confusion matrix to evaluate the accuracy of a classification.

    Parameters
    ----------
    y_true : array-like (device or host) shape = (n_samples,)
        or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like (device or host) shape = (n_samples,)
        or (n_samples, n_outputs)
        Estimated target values.
    labels : array-like (device or host) shape = (n_classes,), optional
        List of labels to index the matrix. This may be used to reorder or
        select a subset of labels. If None is given, those that appear at least
        once in y_true or y_pred are used in sorted order.
    sample_weight : array-like (device or host) shape = (n_samples,), optional
        Sample weights.
    normalize : string in ['true', 'pred', 'all'] or None (default=None)
        Normalizes the confusion matrix over the true (rows), predicted
        (columns) conditions or over the whole population. If None, the
        confusion matrix will not be normalized.

    Returns
    -------
    C : array-like (device or host) shape = (n_classes, n_classes)
        Confusion matrix.
    """
    y_true, n_rows, n_cols, dtype = \
        input_to_cuml_array(y_true, check_dtype=[cp.int32, cp.int64])

    y_pred, _, _, _ = \
        input_to_cuml_array(y_pred, check_dtype=dtype,
                            check_rows=n_rows, check_cols=n_cols)

    if labels is None:
        labels = sorted_unique_labels(y_true, y_pred)
        n_labels = len(labels)
    else:
        labels, n_labels, _, _ = \
            input_to_cupy_array(labels, check_dtype=dtype, check_cols=1)
    if sample_weight is None:
        sample_weight = cp.ones(n_rows, dtype=dtype)
    else:
        sample_weight, _, _, _ = \
            input_to_cupy_array(sample_weight,
                                check_dtype=[cp.float32, cp.float64,
                                             cp.int32, cp.int64],
                                check_rows=n_rows, check_cols=n_cols)

    if normalize not in ['true', 'pred', 'all', None]:
        msg = "normalize must be one of " \
              f"{{'true', 'pred', 'all', None}}, got {normalize}."
        raise ValueError(msg)

    with using_output_type("cupy"):
        y_true, _ = make_monotonic(y_true, labels, copy=True)
        y_pred, _ = make_monotonic(y_pred, labels, copy=True)

    # intersect y_pred, y_true with labels, eliminate items not in labels
    ind = cp.logical_and(y_pred < n_labels, y_true < n_labels)
    y_pred = y_pred[ind]
    y_true = y_true[ind]
    sample_weight = sample_weight[ind]

    cm = cupyx.scipy.sparse.coo_matrix((sample_weight, (y_true, y_pred)),
                                       shape=(n_labels, n_labels),
                                       dtype=np.float64).toarray()

    # Choose the accumulator dtype to always have high precision
    if sample_weight.dtype.kind in {'i', 'u', 'b'}:
        cm = cm.astype(np.int64)

    with np.errstate(all='ignore'):
        if normalize == 'true':
            cm = cp.divide(cm, cm.sum(axis=1, keepdims=True))
        elif normalize == 'pred':
            cm = cp.divide(cm, cm.sum(axis=0, keepdims=True))
        elif normalize == 'all':
            cm = cp.divide(cm, cm.sum())
        cm = cp.nan_to_num(cm)

    return cm
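
A minimal usage sketch; rows are true labels and columns are predicted labels, following the scikit-learn convention:

import cupy as cp
from cuml.metrics import confusion_matrix  # import path may vary by release

y_true = cp.array([0, 1, 1, 1], dtype=cp.int32)
y_pred = cp.array([0, 0, 1, 1], dtype=cp.int32)

cm = confusion_matrix(y_true, y_pred)
# [[1 0]
#  [1 2]]  -> one true 0 predicted as 0; of three true 1s, one missed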
Example #17
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    """ Log loss, aka logistic loss or cross-entropy loss.
    This is the loss function used in (multinomial) logistic regression
    and extensions of it such as neural networks, defined as the negative
    log-likelihood of a logistic model that returns ``y_pred`` probabilities
    for its training data ``y_true``.
    The log loss is only defined for two or more labels.

    Parameters
    ----------
    y_true : array-like, shape = (n_samples,)
    y_pred : array-like of float,
        shape = (n_samples, n_classes) or (n_samples,)
    eps : float
        Log loss is undefined for p=0 or p=1, so probabilities are
        clipped to max(eps, min(1 - eps, p)).
    normalize : bool, optional (default=True)
        If true, return the mean loss per sample.
        Otherwise, return the sum of the per-sample losses.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    loss : float

    Examples
    --------
    >>> from cuml.metrics import log_loss
    >>> import numpy as np
    >>> log_loss(np.array([1, 0, 0, 1]),
    ...          np.array([[.1, .9], [.9, .1], [.8, .2], [.35, .65]]))
    0.21616...

    References
    ----------
    C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
    p. 209.

    Notes
    -----
    The logarithm used is the natural logarithm (base-e).

    """
    y_true, n_rows, n_cols, ytype = \
        input_to_cuml_array(y_true, check_dtype=[np.int32, np.int64,
                                                 np.float32, np.float64])

    y_true = y_true.to_output('cupy')
    if y_true.dtype.kind == 'f' and np.any(y_true != y_true.astype(int)):
        raise ValueError("'y_true' can only have integer values")
    if y_true.min() < 0:
        raise ValueError("'y_true' cannot have negative values")

    y_pred, _, _, _ = \
        input_to_cuml_array(y_pred, check_dtype=[np.int32, np.int64,
                                                 np.float32, np.float64],
                            check_rows=n_rows)

    y_pred = y_pred.to_output('cupy')
    y_true_max = y_true.max()
    if (y_pred.ndim == 1 and y_true_max > 1) \
       or (y_pred.ndim > 1 and y_pred.shape[1] <= y_true_max):
        raise ValueError("The shape of y_pred doesn't "
                         "match the number of classes")

    y_true = y_true.astype('int32')
    y_pred = cp.clip(y_pred, eps, 1 - eps)
    if y_pred.ndim == 1:
        y_pred = cp.expand_dims(y_pred, axis=1)
    if y_pred.shape[1] == 1:
        y_pred = cp.hstack([1 - y_pred, y_pred])

    y_pred /= cp.sum(y_pred, axis=1, keepdims=True)
    loss = -cp.log(y_pred)[cp.arange(y_pred.shape[0]), y_true]
    return _weighted_sum(loss, sample_weight, normalize).item()
Example #18
def precision_recall_curve(y_true, probs_pred):
    """
    Compute precision-recall pairs for different probability thresholds

    .. note:: this implementation is restricted to the binary classification
        task. The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the
        number of true positives and ``fp`` the number of false positives. The
        precision is intuitively the ability of the classifier not to label as
        positive a sample that is negative.

        The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number
        of true positives and ``fn`` the number of false negatives. The recall
        is intuitively the ability of the classifier to find all the positive
        samples. The last precision and recall values are 1. and 0.
        respectively and do not have a corresponding threshold. This ensures
        that the graph starts on the y axis.

        Read more in the
        :ref:`User Guide <precision_recall_f_measure_metrics>`.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True binary labels, {0, 1}.
    probs_pred : array, shape = [n_samples]
        Estimated probabilities or decision function.

    Returns
    -------
    precision : array, shape = [n_thresholds + 1]
        Precision values such that element i is the precision of
        predictions with score >= thresholds[i] and the last element is 1.
    recall : array, shape = [n_thresholds + 1]
        Decreasing recall values such that element i is the recall of
        predictions with score >= thresholds[i] and the last element is 0.
    thresholds : array, shape = [n_thresholds <= len(np.unique(probs_pred))]
        Increasing thresholds on the decision function used to compute
        precision and recall.

    Examples
    --------
    .. code-block:: python

            import numpy as np
            from cuml.metrics import precision_recall_curve
            y_true = np.array([0, 0, 1, 1])
            y_scores = np.array([0.1, 0.4, 0.35, 0.8])
            precision, recall, thresholds = precision_recall_curve(
                y_true, y_scores)
            print(precision)
            print(recall)
            print(thresholds)

    Output:

    .. code-block:: python

            array([0.66666667, 0.5       , 1.        , 1.        ])
            array([1. , 0.5, 0.5, 0. ])
            array([0.35, 0.4 , 0.8 ])

    """
    y_true, n_rows, n_cols, ytype = \
        input_to_cuml_array(y_true, check_dtype=[np.int32, np.int64,
                                                 np.float32, np.float64])

    y_score, _, _, _ = \
        input_to_cuml_array(probs_pred, check_dtype=[np.int32, np.int64,
                            np.float32, np.float64],
                            check_rows=n_rows, check_cols=n_cols)

    y_true = y_true.to_output('cupy')
    y_score = y_score.to_output('cupy')

    if not cp.any(y_true):
        raise ValueError("precision_recall_curve cannot be used when "
                         "y_true is all zero.")

    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
    precision = cp.flip(tps / (tps + fps), axis=0)
    recall = cp.flip(tps / tps[-1], axis=0)
    n = (recall == 1).sum()

    if n > 1:
        precision = precision[n - 1:]
        recall = recall[n - 1:]
        thresholds = thresholds[n - 1:]
    precision = cp.concatenate([precision, cp.ones(1)])
    recall = cp.concatenate([recall, cp.zeros(1)])

    return precision, recall, thresholds