def test_dtype_check(dtype, check_dtype, input_type, order):
    """Verify that ``input_to_cuml_array`` honours ``check_dtype``.

    When the input dtype equals ``check_dtype`` the reported dtype must match
    it; any other combination is expected to raise ``TypeError``.
    """
    uses_float16 = dtype == np.float16 or check_dtype == np.float16
    if uses_float16 and input_type != 'numpy':
        pytest.xfail("float16 not yet supported by numba/cuDF")

    is_unsigned = dtype in [np.uint8, np.uint16, np.uint32, np.uint64]
    if is_unsigned and input_type in ['cudf', 'pandas']:
        pytest.xfail("unsigned int types not yet supported")

    input_data, _ = get_input(input_type, 10, 10, dtype, order=order)

    # get_input hands back None for cupy inputs when cupy is unavailable.
    if input_type == 'cupy' and input_data is None:
        pytest.skip('cupy not installed')

    if dtype != check_dtype:
        with pytest.raises(TypeError):
            _, _, _, _ = input_to_cuml_array(input_data,
                                             check_dtype=check_dtype,
                                             order=order)
        return

    _, _, _, got_dtype = input_to_cuml_array(input_data,
                                             check_dtype=check_dtype,
                                             order=order)
    assert got_dtype == check_dtype
def test_svm_skl_cmp_predict_proba(in_type, n_rows=10000, n_cols=20):
    """Compare probabilistic SVC predictions of cuML against scikit-learn.

    Both estimators are fitted with identical hyper-parameters on the same
    training split; the cuML model receives the data converted to
    ``in_type``.
    """
    params = dict(kernel='rbf', C=1, tol=1e-3, gamma='scale',
                  probability=True)

    X, y = make_classification(n_samples=n_rows,
                               n_features=n_cols,
                               n_informative=2,
                               n_redundant=10,
                               random_state=137)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=42)

    # Feed the cuML estimator with the requested container type.
    X_in = input_to_cuml_array(X_train).array.to_output(in_type)
    y_in = input_to_cuml_array(y_train).array.to_output(in_type)

    cuSVC = cu_svm.SVC(**params)
    cuSVC.fit(X_in, y_in)

    sklSVC = svm.SVC(**params)
    sklSVC.fit(X_train, y_train)

    compare_probabilistic_svm(cuSVC, sklSVC, X_test, y_test, 1e-3, 1e-2)
def test_convert_matrix_order_cuml_array(dtype, input_type, from_order,
                                         to_order):
    """Check layout conversion of 2D inputs by ``input_to_cuml_array``.

    The converted array must report the expected memory order and hold the
    same values as the reference data.
    """
    input_data, real_data = get_input(input_type, 10, 10, dtype,
                                      order=from_order)

    def convert():
        converted, *_ = input_to_cuml_array(input_data,
                                            fail_on_order=False,
                                            order=to_order)
        return converted

    layout_preserved = from_order == to_order or to_order == 'K'

    # A UserWarning is expected only for numpy, cupy and numba inputs, and
    # only when the layout actually has to change.
    if not layout_preserved and input_type in ['numpy', 'cupy', 'numba']:
        with pytest.warns(UserWarning):
            conv_data = convert()
    else:
        conv_data = convert()

    if to_order != 'K':
        expected_order = to_order
    elif input_type in ['cudf', 'pandas']:
        # Dataframe inputs are expected to come out as 'F' under order='K'.
        expected_order = 'F'
    else:
        expected_order = from_order

    assert conv_data.order == expected_order
    np.testing.assert_equal(real_data, conv_data.to_output('numpy'))
def test_fail_on_order(dtype, input_type, order, order_check):
    """Check the ``fail_on_order`` switch of ``input_to_cuml_array``.

    Requesting a layout different from the input's with
    ``fail_on_order=True`` must raise ``ValueError``; a matching layout must
    convert without error.
    """
    input_data, _ = get_input(input_type, 10, 10, dtype, order=order)

    if input_type == 'cupy' and input_data is None:
        pytest.skip('cupy not installed')

    if order != order_check:
        with pytest.raises(ValueError):
            input_to_cuml_array(input_data, fail_on_order=True,
                                order=order_check)
        return

    input_to_cuml_array(input_data, fail_on_order=False, order=order)
def to_output_type(array, output_type, order='F'):
    """Convert an array to another container while building test datasets.

    Parameters
    ----------
    array : array
        Input array to convert
    output_type : string
        Name of the container type to convert to. ``'scipy_csr'``,
        ``'scipy_csc'``, ``'scipy_coo'``, ``'cupy_csr'``, ``'cupy_csc'`` and
        ``'cupy_coo'`` are handled here directly; any other value is handed
        to ``to_output`` of the intermediate cuML array.
    order : string (default = 'F')
        Memory layout forwarded to ``input_to_cuml_array``

    Returns
    -------
    Converted array
    """
    host_sparse_builders = {
        'scipy_csr': cpu_sparse.csr_matrix,
        'scipy_csc': cpu_sparse.csc_matrix,
        'scipy_coo': cpu_sparse.coo_matrix,
    }
    if output_type in host_sparse_builders:
        return host_sparse_builders[output_type](array.get())

    device_sparse_formats = {
        'cupy_csr': 'csr',
        'cupy_csc': 'csc',
        'cupy_coo': 'coo',
    }
    if output_type in device_sparse_formats:
        wanted = device_sparse_formats[output_type]
        convertible = {'csr', 'csc', 'coo'} - {wanted}
        if array.format in convertible:
            return getattr(array, 'to' + wanted)()
        # Already in the wanted format (or in one not handled here).
        return array

    on_host = cpu_sparse.issparse(array)
    if on_host or gpu_sparse.issparse(array):
        dense = array.todense()
        if output_type == 'numpy':
            return dense if on_host else cp.asnumpy(dense)
        if output_type == 'cupy':
            return cp.array(dense) if on_host else dense
        array = dense

    cuml_array = input_to_cuml_array(array, order=order)[0]

    # A 'series' output is only requested as such for one-dimensional data.
    if output_type == 'series' and len(array.shape) > 1:
        output_type = 'cudf'

    return cuml_array.to_output(output_type)
def fit(self, X, y=None):
    """
    Fit the model with X, using minibatches of size batch_size.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    y : Ignored

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    self._set_base_attributes(output_type=X)

    # Start from a clean slate: every call to fit restarts the statistics.
    self.n_samples_seen_ = 0
    self._mean_ = .0
    self.var_ = .0

    is_sparse = scipy.sparse.issparse(X) or cupyx.scipy.sparse.issparse(X)
    if is_sparse:
        X = _validate_sparse_input(X)
    else:
        X, _, _, self.dtype = input_to_cuml_array(
            X, order='K', check_dtype=[cp.float32, cp.float64])

        # NOTE: While we cast the input to a cupy array here, we still
        # respect the `output_type` parameter in the constructor. This
        # is done by PCA, which IncrementalPCA inherits from. PCA's
        # transform and inverse transform convert the output to the
        # required type.
        X = X.to_output(output_type='cupy')

    n_samples, n_features = X.shape

    self.batch_size_ = \
        5 * n_features if self.batch_size is None else self.batch_size

    batches = _gen_batches(n_samples, self.batch_size_,
                           min_batch_size=self.n_components or 0)
    for batch in batches:
        chunk = X[batch]
        # Sparse slices are densified one batch at a time.
        if cupyx.scipy.sparse.issparse(chunk):
            chunk = chunk.toarray()
        self.partial_fit(chunk, check_input=False)

    return self
def test_convert_vector_order_cuml_array(dtype, input_type, shape,
                                         from_order, to_order):
    """Check that converting the layout of an input keeps its values."""
    n_rows = shape[0]
    n_cols = shape[1]
    input_data, real_data = get_input(input_type, n_rows, n_cols, dtype,
                                      order=from_order)

    converted = input_to_cuml_array(input_data,
                                    fail_on_order=False,
                                    order=to_order)
    conv_data, *_ = converted

    np.testing.assert_equal(real_data, conv_data.to_output('numpy'))
def roc_auc_score(y_true, y_score):
    """
    Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    from prediction scores.

    .. note:: this implementation can only be used with binary
        classification.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels. The binary cases expect labels with shape (n_samples,)

    y_score : array-like of shape (n_samples,)
        Target scores. In the binary cases, these can be either probability
        estimates or non-thresholded decision values (as returned by
        `decision_function` on some classifiers). The binary case expects a
        shape (n_samples,), and the scores must be the scores of the class
        with the greater label.

    Returns
    -------
    auc : float

    Examples
    --------
    >>> import numpy as np
    >>> from cuml.metrics import roc_auc_score
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> print(roc_auc_score(y_true, y_scores))
    0.75
    """
    accepted_dtypes = (np.int32, np.int64, np.float32, np.float64)

    y_true, n_rows, n_cols, _ = input_to_cuml_array(
        y_true, check_dtype=list(accepted_dtypes))

    # The scores must have the same number of rows and columns as the labels.
    y_score, _, _, _ = input_to_cuml_array(
        y_score, check_dtype=list(accepted_dtypes),
        check_rows=n_rows, check_cols=n_cols)

    return _binary_roc_auc_score(y_true, y_score)
def test_input_to_cuml_array(dtype, input_type, num_rows, num_cols, order):
    """Check values, shape and dtype reported by ``input_to_cuml_array``."""
    input_data, real_data = get_input(input_type, num_rows, num_cols, dtype,
                                      order=order)

    if input_type == 'cupy' and input_data is None:
        pytest.skip('cupy not installed')

    converted = input_to_cuml_array(input_data, order=order)
    X, n_rows, n_cols, res_dtype = converted

    np.testing.assert_equal(X.to_output('numpy'), real_data)

    # Row count must agree across every way of asking for it.
    assert n_rows == num_rows
    assert num_rows == X.shape[0]
    assert X.shape[0] == len(X)

    assert n_cols == num_cols
    assert num_cols == X.shape[1]

    assert dtype == res_dtype
    assert res_dtype == X.dtype

    del input_data, real_data
def transform(self, X, convert_dtype=False):
    """
    Apply dimensionality reduction to X.

    X is projected on the first principal components previously extracted
    from a training set, using minibatches of size batch_size if X is
    sparse.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        New data, where n_samples is the number of samples
        and n_features is the number of features.
    convert_dtype : bool, optional (default = False)
        When set to True, the transform method will automatically
        convert the input to the data type which was used to train the
        model. This will increase memory used for the method.
        The flag is forwarded to the parent class' ``transform``.

    Returns
    -------
    X_new : array-like, shape (n_samples, n_components)
    """
    is_sparse = scipy.sparse.issparse(X) or cupyx.scipy.sparse.issparse(X)
    if not is_sparse:
        # Previously `convert_dtype` was accepted but silently dropped.
        return super().transform(X, convert_dtype=convert_dtype)

    # Remember the caller's container type before X is re-validated.
    out_type = self._get_output_type(X)

    X = _validate_sparse_input(X)
    n_samples = X.shape[0]

    # Zero-argument super() cannot be used inside a comprehension on every
    # supported Python version, so the batches are handled in a plain loop.
    output = []
    for batch in _gen_batches(n_samples, self.batch_size_,
                              min_batch_size=self.n_components or 0):
        output.append(
            super().transform(X[batch], convert_dtype=convert_dtype))

    output, _, _, _ = input_to_cuml_array(cp.vstack(output), order='K')

    return output.to_output(out_type)
def to_output_type(array, output_type, order='F'):
    """Convert an array to the container type named by ``output_type``.

    Parameters
    ----------
    array : array
        Input array to convert
    output_type : string
        Name of the container type to convert to. ``'scipy_csr'``,
        ``'scipy_csc'``, ``'scipy_coo'``, ``'cupy_csr'``, ``'cupy_csc'`` and
        ``'cupy_coo'`` are handled here directly; any other value is handed
        to ``to_output`` of the intermediate cuML array.
    order : string (default = 'F')
        Memory layout forwarded to ``input_to_cuml_array``

    Returns
    -------
    Converted array
    """
    host_sparse_builders = {
        'scipy_csr': cpu_sparse.csr_matrix,
        'scipy_csc': cpu_sparse.csc_matrix,
        'scipy_coo': cpu_sparse.coo_matrix,
    }
    if output_type in host_sparse_builders:
        return host_sparse_builders[output_type](array.get())

    device_sparse_formats = {
        'cupy_csr': 'csr',
        'cupy_csc': 'csc',
        'cupy_coo': 'coo',
    }
    if output_type in device_sparse_formats:
        wanted = device_sparse_formats[output_type]
        convertible = {'csr', 'csc', 'coo'} - {wanted}
        if array.format in convertible:
            return getattr(array, 'to' + wanted)()
        # Already in the wanted format (or in one not handled here).
        return array

    on_host = cpu_sparse.issparse(array)
    if on_host or gpu_sparse.issparse(array):
        dense = array.todense()
        if output_type == 'numpy':
            return dense if on_host else cp.asnumpy(dense)
        if output_type == 'cupy':
            return cp.array(dense) if on_host else dense
        array = dense

    cuml_array = input_to_cuml_array(array, order=order)[0]

    # A 'series' output is only requested as such for one-dimensional data.
    if output_type == 'series' and len(array.shape) > 1:
        output_type = 'cudf'

    return cuml_array.to_output(output_type)
def test_non_contiguous_to_contiguous_input(dtype, input_type, order,
                                            contiguous, force_contiguous):
    """Check ``force_contiguous`` handling of sliced (non-contiguous) inputs.

    When ``contiguous`` is false the last three rows ('F' order) or columns
    (otherwise) are sliced off both the input and the reference data before
    conversion.
    """
    input_data, real_data = get_input(input_type, 10, 8, dtype, order=order)

    if contiguous:
        data_view = input_data
    elif order == 'F':
        data_view = input_data[:-3]
        real_data = real_data[:-3]
    else:
        data_view = input_data[:, :-3]
        real_data = real_data[:, :-3]

    result = input_to_cuml_array(data_view,
                                 force_contiguous=force_contiguous)
    cumlary, *_ = result

    if force_contiguous:
        assert _check_array_contiguity(cumlary)

    np.testing.assert_equal(real_data, cumlary.to_output('numpy'))
def test_indexed_inputs(input_type, num_rows, num_cols, order):
    """Check that a custom index survives conversion to and from cuML."""
    # Single-column data is requested from get_input as a series.
    if num_cols == 1:
        input_type = input_type + '-series'

    index = np.arange(num_rows, 2 * num_rows)

    input_data, _ = get_input(input_type, num_rows, num_cols, np.float32,
                              index=index)
    X, _, _, _ = input_to_cuml_array(input_data, order=order)

    # The index stored on the cuML array itself.
    np.testing.assert_equal(X.index.to_numpy(), index)

    # The index carried over to each dataframe-like output.
    for target in ('cudf', 'pandas'):
        converted = X.to_output(target)
        np.testing.assert_equal(converted.index.to_numpy(), index)
def check_array(array, accept_sparse=False, accept_large_sparse=True,
                dtype='numeric', order=None, copy=False,
                force_all_finite=True, ensure_2d=True, allow_nd=False,
                ensure_min_samples=1, ensure_min_features=1,
                warn_on_dtype=None, estimator=None):
    """Input validation on an array, list, sparse matrix or similar.
    By default, the input is checked to be a non-empty 2D array containing
    only finite values.

    Parameters
    ----------
    array : object
        Input object to check / convert.
    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the
        input to be any format. False means that a sparse matrix input will
        raise an error.
    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.
    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", the module-level ``numeric_types`` are used.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.
    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.
        When order is None and copy=True, the layout is taken from the
        input's contiguity flags when it exposes them; in every remaining
        case 'F' is used.
    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.
    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.

    ensure_2d : boolean (default=True)
        Whether to raise a value error if array is not 2D.
    allow_nd : boolean (default=False)
        Whether to allow array.ndim > 2.
    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.
    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has 2 dimensions.
        Setting to 0 disables this check.
    warn_on_dtype : unused parameter
    estimator : unused parameter

    Returns
    -------
    array_converted : object
        The converted and validated array.

    Raises
    ------
    ValueError
        If one of the shape checks fails or the sparse format is not one of
        'csr', 'csc' or 'coo'.
    """
    if dtype == 'numeric':
        dtype = numeric_types

    correct_dtype = check_dtype(array, dtype)

    # Resolve the memory layout: follow the input when copying, else 'F'.
    if not order:
        if copy and hasattr(array, 'flags'):
            if array.flags['F_CONTIGUOUS']:
                order = 'F'
            elif array.flags['C_CONTIGUOUS']:
                order = 'C'
        order = order or 'F'

    # Shape checks only apply to inputs that expose a shape.
    if hasattr(array, 'shape'):
        if ensure_2d and len(array.shape) != 2:
            raise ValueError("Not 2D")

        if not allow_nd and len(array.shape) > 2:
            raise ValueError("More than 2 dimensions detected")

        if ensure_min_samples > 0 and array.shape[0] < ensure_min_samples:
            raise ValueError("Not enough samples")

        if ensure_min_features > 0 and array.ndim == 2:
            n_features = array.shape[1]
            if n_features < ensure_min_features:
                raise ValueError(
                    "Found array with %d feature(s) (shape=%s) while"
                    " a minimum of %d is required."
                    % (n_features, array.shape, ensure_min_features))

    if cpu_sparse.issparse(array) or gpu_sparse.issparse(array):
        check_sparse(array, accept_sparse, accept_large_sparse)

        sparse_builders = {
            'csr': gpu_csr_matrix,
            'csc': gpu_csc_matrix,
            'coo': gpu_coo_matrix,
        }
        build = sparse_builders.get(array.format)
        if build is None:
            raise ValueError('Sparse matrix format not supported')
        new_array = build(array, copy=copy)

        check_finite(new_array.data, force_all_finite)
        if correct_dtype != new_array.dtype:
            new_array = new_array.astype(correct_dtype)
        return new_array

    X, _, _, input_dtype = input_to_cuml_array(array, order=order,
                                               deepcopy=copy)
    X = X.to_output('cupy')
    if correct_dtype != input_dtype:
        X = X.astype(correct_dtype)
    check_finite(X, force_all_finite)
    return X
def partial_fit(self, X, y=None, check_input=True):
    """
    Incremental fit with X. All of X is processed as a single batch.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    check_input : bool
        When True, X is validated and converted to a cupy array; sparse
        input is rejected. When False, X is used as passed.

    y : Ignored

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    if not check_input:
        n_samples, n_features = X.shape
    else:
        if scipy.sparse.issparse(X) or cupyx.scipy.sparse.issparse(X):
            raise TypeError(
                "IncrementalPCA.partial_fit does not support "
                "sparse input. Either convert data to dense "
                "or use IncrementalPCA.fit to do so in batches.")

        self._set_output_type(X)

        X, n_samples, n_features, self.dtype = input_to_cuml_array(
            X, order='K', check_dtype=[cp.float32, cp.float64])
        X = X.to_output(output_type='cupy')

    if not hasattr(self, '_components_'):
        self._components_ = None

    requested = self.n_components
    if requested is None:
        if self._components_ is None:
            self.n_components_ = min(n_samples, n_features)
        else:
            self.n_components_ = self._components_.shape[0]
    elif not 1 <= requested <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d, need "
                         "more rows than columns for IncrementalPCA "
                         "processing" % (requested, n_features))
    elif not requested <= n_samples:
        raise ValueError("n_components=%r must be less or equal to "
                         "the batch number of samples "
                         "%d." % (requested, n_samples))
    else:
        self.n_components_ = requested

    previous = self._components_
    if (previous is not None) and \
            (previous.shape[0] != self.n_components_):
        raise ValueError("Number of input features has changed from %i "
                         "to %i between calls to partial_fit! Try "
                         "setting n_components to a fixed value." %
                         (previous.shape[0], self.n_components_))

    if not self._cupy_attributes:
        self._cumlarray_to_cupy_attrs()
        self._cupy_attributes = True

    # This is the first partial_fit
    if not hasattr(self, 'n_samples_seen_'):
        self.n_samples_seen_ = 0
        self._mean_ = .0
        self.var_ = .0

    # Update stats - they are 0 if this is the first step
    seen_per_column = cp.repeat(cp.asarray([self.n_samples_seen_]),
                                X.shape[1])
    col_mean, col_var, n_total_samples = _incremental_mean_and_var(
        X,
        last_mean=self._mean_,
        last_variance=self.var_,
        last_sample_count=seen_per_column)
    n_total_samples = n_total_samples[0]

    # Whitening
    if self.n_samples_seen_ == 0:
        # If it is the first step, simply whiten X
        X = X - col_mean
    else:
        col_batch_mean = cp.mean(X, axis=0)
        X = X - col_batch_mean

        # Build matrix of combined previous basis and new data
        scale = cp.sqrt((self.n_samples_seen_ * n_samples) /
                        n_total_samples)
        mean_correction = scale * (self._mean_ - col_batch_mean)
        weighted_basis = \
            self._singular_values_.reshape((-1, 1)) * self._components_
        X = cp.vstack((weighted_basis, X, mean_correction))

    U, S, V = cp.linalg.svd(X, full_matrices=False)
    U, V = _svd_flip(U, V, u_based_decision=False)

    squared_singular = S**2
    explained_variance = squared_singular / (n_total_samples - 1)
    explained_variance_ratio = \
        squared_singular / cp.sum(col_var * n_total_samples)

    keep = self.n_components_
    self.n_samples_seen_ = n_total_samples
    self._components_ = V[:keep]
    self._singular_values_ = S[:keep]
    self._mean_ = col_mean
    self.var_ = col_var
    self._explained_variance_ = explained_variance[:keep]
    self._explained_variance_ratio_ = explained_variance_ratio[:keep]

    # Noise variance is the mean of the variances that were not kept.
    if keep < n_features:
        self._noise_variance_ = explained_variance[keep:].mean()
    else:
        self._noise_variance_ = 0.

    if self._cupy_attributes:
        self._cupy_to_cumlarray_attrs()
        self._cupy_attributes = False

    return self
def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None,
                     normalize=None) -> CumlArray:
    """Compute confusion matrix to evaluate the accuracy of a classification.

    Parameters
    ----------
    y_true : array-like (device or host) shape = (n_samples,)
        or (n_samples, n_outputs)
        Ground truth (correct) target values.
    y_pred : array-like (device or host) shape = (n_samples,)
        or (n_samples, n_outputs)
        Estimated target values.
    labels : array-like (device or host) shape = (n_classes,), optional
        List of labels to index the matrix. This may be used to reorder or
        select a subset of labels. If None is given, those that appear at
        least once in y_true or y_pred are used in sorted order.
    sample_weight : array-like (device or host) shape = (n_samples,), optional
        Sample weights.
    normalize : string in ['true', 'pred', 'all']
        Normalizes confusion matrix over the true (rows), predicted (columns)
        conditions or all the population. If None, confusion matrix will not
        be normalized.

    Returns
    -------
    C : array-like (device or host) shape = (n_classes, n_classes)
        Confusion matrix.

    Raises
    ------
    ValueError
        If ``normalize`` is not one of 'true', 'pred', 'all' or None.
    """
    y_true, n_rows, n_cols, dtype = input_to_cuml_array(
        y_true, check_dtype=[cp.int32, cp.int64])

    # Predictions must match the labels in dtype and shape.
    y_pred, _, _, _ = input_to_cuml_array(
        y_pred, check_dtype=dtype, check_rows=n_rows, check_cols=n_cols)

    if labels is not None:
        labels, n_labels, _, _ = input_to_cupy_array(
            labels, check_dtype=dtype, check_cols=1)
    else:
        labels = sorted_unique_labels(y_true, y_pred)
        n_labels = len(labels)

    if sample_weight is not None:
        sample_weight, _, _, _ = input_to_cupy_array(
            sample_weight,
            check_dtype=[cp.float32, cp.float64, cp.int32, cp.int64],
            check_rows=n_rows, check_cols=n_cols)
    else:
        sample_weight = cp.ones(n_rows, dtype=dtype)

    if normalize not in ('true', 'pred', 'all', None):
        raise ValueError(
            "normalize must be one of "
            f"{{'true', 'pred', 'all', None}}, got {normalize}.")

    with using_output_type("cupy"):
        y_true, _ = make_monotonic(y_true, labels, copy=True)
        y_pred, _ = make_monotonic(y_pred, labels, copy=True)

    # intersect y_pred, y_true with labels, eliminate items not in labels
    in_labels = cp.logical_and(y_pred < n_labels, y_true < n_labels)
    y_pred = y_pred[in_labels]
    y_true = y_true[in_labels]
    sample_weight = sample_weight[in_labels]

    counts = cupyx.scipy.sparse.coo_matrix(
        (sample_weight, (y_true, y_pred)),
        shape=(n_labels, n_labels),
        dtype=np.float64)
    cm = counts.toarray()

    # Choose the accumulator dtype to always have high precision
    if sample_weight.dtype.kind in {'i', 'u', 'b'}:
        cm = cm.astype(np.int64)

    with np.errstate(all='ignore'):
        if normalize == 'true':
            totals = cm.sum(axis=1, keepdims=True)
        elif normalize == 'pred':
            totals = cm.sum(axis=0, keepdims=True)
        elif normalize == 'all':
            totals = cm.sum()
        else:
            totals = None

        if totals is not None:
            cm = cp.divide(cm, totals)
        cm = cp.nan_to_num(cm)

    return cm
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
    """ Log loss, aka logistic loss or cross-entropy loss.
    This is the loss function used in (multinomial) logistic regression
    and extensions of it such as neural networks, defined as the negative
    log-likelihood of a logistic model that returns ``y_pred`` probabilities
    for its training data ``y_true``.
    The log loss is only defined for two or more labels.

    Parameters
    ----------
    y_true : array-like, shape = (n_samples,)
    y_pred : array-like of float,
        shape = (n_samples, n_classes) or (n_samples,)
    eps : float
        Log loss is undefined for p=0 or p=1, so probabilities are
        clipped to max(eps, min(1 - eps, p)).
    normalize : bool, optional (default=True)
        If true, return the mean loss per sample.
        Otherwise, return the sum of the per-sample losses.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    loss : float

    Raises
    ------
    ValueError
        If ``y_true`` holds non-integer or negative values, or if the shape
        of ``y_pred`` does not match the number of classes.

    Examples
    --------
    >>> from cuml.metrics import log_loss
    >>> import numpy as np
    >>> log_loss(np.array([1, 0, 0, 1]),
    ...          np.array([[.1, .9], [.9, .1], [.8, .2], [.35, .65]]))
    0.21616...

    References
    ----------
    C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
    p. 209.

    Notes
    -----
    The logarithm used is the natural logarithm (base-e).
    """
    labels, n_rows, _, _ = input_to_cuml_array(
        y_true,
        check_dtype=[np.int32, np.int64, np.float32, np.float64])
    labels = labels.to_output('cupy')

    # Floating point labels are tolerated only if they hold whole numbers.
    if labels.dtype.kind == 'f' and np.any(labels != labels.astype(int)):
        raise ValueError("'y_true' can only have integer values")
    if labels.min() < 0:
        raise ValueError("'y_true' cannot have negative values")

    probs, _, _, _ = input_to_cuml_array(
        y_pred,
        check_dtype=[np.int32, np.int64, np.float32, np.float64],
        check_rows=n_rows)
    probs = probs.to_output('cupy')

    largest_label = labels.max()
    if probs.ndim == 1:
        # A single column of scores can only describe labels 0 and 1.
        shape_mismatch = largest_label > 1
    elif probs.ndim > 1:
        shape_mismatch = probs.shape[1] <= largest_label
    else:
        shape_mismatch = False
    if shape_mismatch:
        raise ValueError("The shape of y_pred doesn't "
                         "match the number of classes")

    labels = labels.astype('int32')
    probs = cp.clip(probs, eps, 1 - eps)

    if probs.ndim == 1:
        probs = cp.expand_dims(probs, axis=1)
    if probs.shape[1] == 1:
        # Expand a single positive-class column into two columns.
        probs = cp.hstack([1 - probs, probs])

    # Renormalise each row so it sums to one.
    probs /= cp.sum(probs, axis=1, keepdims=True)

    row_ids = cp.arange(probs.shape[0])
    loss = -cp.log(probs)[row_ids, labels]
    return _weighted_sum(loss, sample_weight, normalize).item()
def precision_recall_curve(y_true, probs_pred):
    """
    Compute precision-recall pairs for different probability thresholds

    .. note:: this implementation is restricted to the binary classification
        task. The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is
        the number of true positives and ``fp`` the number of false
        positives. The precision is intuitively the ability of the
        classifier not to label as positive a sample that is negative.

        The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number
        of true positives and ``fn`` the number of false negatives. The
        recall is intuitively the ability of the classifier to find all the
        positive samples. The last precision and recall values are 1. and 0.
        respectively and do not have a corresponding threshold. This ensures
        that the graph starts on the y axis.

        Read more in the scikit-learn's `User Guide
        <https://scikit-learn.org/stable/modules/model_evaluation.html#precision-recall-f-measure-metrics>`_.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True binary labels, {0, 1}.
    probs_pred : array, shape = [n_samples]
        Estimated probabilities or decision function.

    Returns
    -------
    precision : array, shape = [n_thresholds + 1]
        Precision values such that element i is the precision of
        predictions with score >= thresholds[i] and the last element is 1.
    recall : array, shape = [n_thresholds + 1]
        Decreasing recall values such that element i is the recall of
        predictions with score >= thresholds[i] and the last element is 0.
    thresholds : array, shape = [n_thresholds <= len(np.unique(probs_pred))]
        Increasing thresholds on the decision function used to compute
        precision and recall.

    Raises
    ------
    ValueError
        If ``y_true`` contains no non-zero entry.

    Examples
    --------
    .. code-block:: python

        import numpy as np
        from cuml.metrics import precision_recall_curve
        y_true = np.array([0, 0, 1, 1])
        y_scores = np.array([0.1, 0.4, 0.35, 0.8])
        precision, recall, thresholds = precision_recall_curve(
            y_true, y_scores)
        print(precision)
        print(recall)
        print(thresholds)

    Output:

    .. code-block:: python

        array([0.66666667, 0.5       , 1.        , 1.        ])
        array([1. , 0.5, 0.5, 0. ])
        array([0.35, 0.4 , 0.8 ])

    """
    labels, n_rows, n_cols, _ = input_to_cuml_array(
        y_true,
        check_dtype=[np.int32, np.int64, np.float32, np.float64])

    # Scores must have the same number of rows and columns as the labels.
    scores, _, _, _ = input_to_cuml_array(
        probs_pred,
        check_dtype=[np.int32, np.int64, np.float32, np.float64],
        check_rows=n_rows, check_cols=n_cols)

    labels = labels.to_output('cupy')
    scores = scores.to_output('cupy')

    if not cp.any(labels):
        raise ValueError("precision_recall_curve cannot be used when "
                         "y_true is all zero.")

    fps, tps, thresholds = _binary_clf_curve(labels, scores)

    precision = cp.flip(tps / (tps + fps), axis=0)
    recall = cp.flip(tps / tps[-1], axis=0)

    # Keep only the last of the leading entries whose recall equals 1.
    n = (recall == 1).sum()
    if n > 1:
        start = n - 1
        precision = precision[start:]
        recall = recall[start:]
        thresholds = thresholds[start:]

    # Append the final (precision=1, recall=0) point, which has no threshold.
    precision = cp.concatenate([precision, cp.ones(1)])
    recall = cp.concatenate([recall, cp.zeros(1)])

    return precision, recall, thresholds