def fit(self, y):
    """
    Fit label binarizer

    Parameters
    ----------
    y : array of shape [n_samples,] or [n_samples, n_classes]
        Target values. The 2-d matrix should only contain 0 and 1,
        represents multilabel classification.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If `y` has more than 2 dimensions, or if `y` is 2-d and holds any
        value other than 0 or 1.
    """
    self._set_output_type(y)

    if y.ndim > 2:
        raise ValueError("labels cannot be greater than 2 dimensions")

    if y.ndim == 2:
        # Comparing an array against a Python list does not yield a single
        # truth value, so test every element and reduce to one bool.
        is_binary = bool(((y == 0) | (y == 1)).all())
        if not is_binary:
            raise ValueError("2-d array can must be binary")

        # One class per column of the indicator matrix.
        self._classes_ = CumlArray(cp.arange(0, y.shape[1]))
    else:
        self._classes_ = CumlArray(cp.unique(y).astype(y.dtype))

    cp.cuda.Stream.null.synchronize()

    return self
def _count(self, X, Y):
    """
    Run the feature-count and class-count kernels on one batch and add the
    results to the running totals stored on the model.

    Parameters
    ----------
    X : cupy.ndarray or cupyx.scipy.sparse matrix of size
        (n_rows, n_features)
    Y : cupy.array of monotonic class labels

    Raises
    ------
    ValueError
        If `X` is not 2-dimensional.
    """
    if X.ndim != 2:
        raise ValueError("Input samples should be a 2D array")

    labels_dtype = self.classes_.dtype
    if Y.dtype != labels_dtype:
        warnings.warn("Y dtype does not match classes_ dtype. Y will be "
                      "converted, which will increase memory consumption")

    n_rows, n_cols = X.shape[0], X.shape[1]

    # Per-batch accumulators that the kernels write into.
    batch_feature_counts = cp.zeros((self._n_classes_, self._n_features_),
                                    order="F", dtype=X.dtype)
    batch_class_counts = cp.zeros(self._n_classes_, order="F",
                                  dtype=X.dtype)

    if cupyx.scipy.sparse.isspmatrix(X):
        X = X.tocoo()
        sparse_kernel = count_features_coo_kernel(X.dtype, labels_dtype)
        # 1-D launch over the stored entries, 32 threads per block.
        sparse_kernel(
            (math.ceil(X.nnz / 32), ),
            (32, ),
            (batch_feature_counts, X.row, X.col, X.data, X.nnz,
             n_rows, n_cols, Y, self._n_classes_, False))
    else:
        dense_kernel = count_features_dense_kernel(X.dtype, labels_dtype)
        # 2-D launch over rows x columns in 32x32 thread blocks.
        dense_kernel(
            (math.ceil(n_rows / 32), math.ceil(n_cols / 32), 1),
            (32, 32, 1),
            (batch_feature_counts, X, n_rows, n_cols, Y,
             self._n_classes_, False, X.flags["C_CONTIGUOUS"]))

    class_kernel = count_classes_kernel(X.dtype, labels_dtype)
    class_kernel((math.ceil(n_rows / 32), ), (32, ),
                 (batch_class_counts, n_rows, Y))

    self._feature_count_ = CumlArray(
        self._feature_count_ + batch_feature_counts)
    self._class_count_ = CumlArray(self._class_count_ + batch_class_counts)
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `to_dtype`.

    Overflow checking is only performed for NumPy arrays converted to a
    floating point dtype.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : numpy.ndarray, cudf/pandas Series or DataFrame, or any object for
        which ``cuda.is_cuda_array`` is true
    to_dtype : dtype (default: np.float32)
        Target dtype.
    legacy : bool (default: True)
        Only used for cuda arrays: when True the result is returned through
        ``cuda.as_cuda_array``, otherwise it is wrapped in a CumlArray.

    Returns
    -------
    The converted data. A NumPy array that already has dtype `to_dtype` is
    returned unchanged.

    Raises
    ------
    TypeError
        If the conversion of a NumPy array overflowed (a finite value became
        infinite), or if the type of `X` is not supported.
    """
    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            X_m = X.astype(to_dtype)
            # Data loss shows up in the *converted* array: a value that was
            # finite in X and is infinite in X_m overflowed the target
            # dtype. Infinities already present in X are not a loss.
            if (np.issubdtype(X_m.dtype, np.floating)
                    and np.issubdtype(X.dtype, np.number)
                    and np.any(np.isinf(X_m) & np.isfinite(X))):
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame,
                        pd.Series, pd.DataFrame)):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        X_m = rmm_cupy_ary(cp.asarray, X)
        X_m = X_m.astype(to_dtype)
        if legacy:
            return cuda.as_cuda_array(X_m)
        else:
            return CumlArray(data=X_m)

    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    # Only reached for a NumPy array that needed no conversion.
    return X
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Return X cast to `to_dtype`.

    A TypeError is raised up front when
    ``_typecast_will_lose_information`` reports that the cast would lose
    information, and also when the type of X is not one of the supported
    containers (NumPy array, cudf/pandas Series or DataFrame, cuda array).

    `legacy` only affects cuda arrays: True returns the result through
    ``cuda.as_cuda_array``, False wraps it in a CumlArray.
    """
    if _typecast_will_lose_information(X, to_dtype):
        raise TypeError("Data type conversion would lose information.")

    if isinstance(X, np.ndarray):
        # An array that already has the requested dtype is handed back as is.
        return X.astype(to_dtype) if X.dtype != to_dtype else X

    if isinstance(X, (cudf.Series, cudf.DataFrame,
                      pd.Series, pd.DataFrame)):
        return X.astype(to_dtype, copy=False)

    if not cuda.is_cuda_array(X):
        raise TypeError("Received unsupported input type: %s" % type(X))

    converted = cp.asarray(X).astype(to_dtype, copy=False)
    if legacy:
        return cuda.as_cuda_array(converted)
    return CumlArray(data=converted)
def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
    """
    Build a random (nrows, ncols) matrix in the requested container type,
    together with a NumPy copy of the same values to compare against.

    Parameters
    ----------
    type : str
        One of 'numpy', 'cupy', 'numba', 'cudf', 'pandas' or 'cuml'.
    nrows, ncols : int
        Shape of the generated matrix.
    dtype : dtype
        dtype of the generated matrix.
    order : str (default: 'C')
        Memory order passed to the array constructors.
    out_dtype : dtype or False (default: False)
        When truthy, the NumPy reference copy is cast to this dtype.

    Returns
    -------
    result, expected : the generated container and a NumPy array holding
        the same values.

    Raises
    ------
    ValueError
        If `type` is not one of the supported names.
    """
    supported = ('numpy', 'cupy', 'numba', 'cudf', 'pandas', 'cuml')
    if type not in supported:
        # Fail with a clear message instead of an unbound `result` at return.
        raise ValueError("Unsupported input type %r, expected one of %s"
                         % (type, ", ".join(supported)))

    rand_mat = (cp.random.rand(nrows, ncols) * 10)
    rand_mat = cp.array(rand_mat, dtype=dtype, order=order)

    # Single device-to-host copy shared by the branches below; every use
    # goes through np.array / astype, which produce independent arrays.
    host_mat = cp.asnumpy(rand_mat)

    if type == 'numpy':
        result = np.array(host_mat, order=order)
    elif type == 'cupy':
        result = rand_mat
    elif type == 'numba':
        result = nbcuda.as_cuda_array(rand_mat)
    elif type == 'cudf':
        result = cudf.DataFrame(rand_mat)
    elif type == 'pandas':
        result = pdDF(host_mat)
    else:
        result = CumlArray(data=rand_mat)

    if out_dtype:
        return result, np.array(host_mat.astype(out_dtype), order=order)
    else:
        return result, np.array(host_mat, order=order)
def predict(self, X):
    """
    Predict a class label for every row of X.

    Sparse input (scipy or cupyx) is rebuilt as a cupyx COO matrix; any
    other input is converted to a cupy array. The label with the highest
    joint log likelihood is returned in the output type resolved for X.
    """
    out_type = self._get_output_type(X)

    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    is_sparse = scipy_sparse_isspmatrix(X) or \
        cupyx.scipy.sparse.isspmatrix(X)

    if not is_sparse:
        X = input_to_cuml_array(X, order='K').array.to_output('cupy')
    else:
        coo = X.tocoo()
        row_idx = cp.asarray(coo.row, dtype=coo.row.dtype)
        col_idx = cp.asarray(coo.col, dtype=coo.col.dtype)
        values = cp.asarray(coo.data, dtype=coo.data.dtype)
        X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                          shape=coo.shape)

    class_dtype = self.classes_.dtype
    best = cp.argmax(self._joint_log_likelihood(X), axis=1)
    labels = invert_labels(best.astype(class_dtype), classes=self.classes_)

    return CumlArray(data=labels).to_output(out_type)
def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
    """
    Build a random (nrows, ncols) matrix in the requested container type,
    together with a NumPy copy of the same values to compare against.

    Parameters
    ----------
    type : str
        One of 'numpy', 'cupy', 'numba', 'cudf', 'pandas' or 'cuml'.
    nrows, ncols : int
        Shape of the generated matrix.
    dtype : dtype
        dtype of the generated matrix.
    order : str (default: 'C')
        Memory order passed to the array constructors.
    out_dtype : dtype or False (default: False)
        When truthy, the NumPy reference copy is cast to this dtype.

    Returns
    -------
    result, expected : the generated container and a NumPy array holding
        the same values.

    Raises
    ------
    ValueError
        If `type` is not one of the supported names.
    """
    supported = ('numpy', 'cupy', 'numba', 'cudf', 'pandas', 'cuml')
    if type not in supported:
        # Fail with a clear message instead of an unbound `result` at return.
        raise ValueError("Unsupported input type %r, expected one of %s"
                         % (type, ", ".join(supported)))

    rand_mat = (cp.random.rand(nrows, ncols) * 10)
    rand_mat = cp.array(rand_mat, order=order).astype(dtype)

    # Single device-to-host copy; every use below goes through np.array /
    # astype, which produce independent arrays.
    host_mat = cp.asnumpy(rand_mat)

    if type == 'numpy':
        result = np.array(host_mat, order=order)
    elif type == 'cupy':
        result = rand_mat
    elif type == 'numba':
        result = nbcuda.as_cuda_array(rand_mat)
    elif type == 'cudf':
        result = cudf.DataFrame()
        result = result.from_gpu_matrix(nbcuda.as_cuda_array(rand_mat))
    elif type == 'pandas':
        # Built on the device through cudf, then moved to the host.
        result = cudf.DataFrame()
        result = result.from_gpu_matrix(nbcuda.as_cuda_array(rand_mat))
        result = result.to_pandas()
    else:
        result = CumlArray(data=rand_mat, dtype=dtype,
                           shape=rand_mat.shape,
                           order=order if order != 'K' else None)

    if out_dtype:
        return result, np.array(host_mat.astype(out_dtype), order=order)
    else:
        return result, np.array(host_mat, order=order)
def predict_proba(self, X):
    """
    Return probability estimates for the test vector X, obtained by
    exponentiating the output of ``predict_log_proba``.
    """
    out_type = self._get_output_type(X)
    log_proba = self.predict_log_proba(X)
    return CumlArray(cp.exp(log_proba)).to_output(out_type)
def _update_class_log_prior(self, class_prior=None):
    """
    Recompute ``self._class_log_prior_``.

    * `class_prior` given: its element-wise log is stored, after checking
      that its length equals the number of classes.
    * otherwise, with ``fit_prior`` set: log(class count) minus
      log(sum of class counts).
    * otherwise: every class gets -log(n_classes).

    Raises
    ------
    ValueError
        If `class_prior` does not have one entry per class.
    """
    if class_prior is not None:
        if class_prior.shape[0] != self._n_classes_:
            raise ValueError("Number of classes must match "
                             "number of priors")
        self._class_log_prior_ = cp.log(class_prior)
        return

    if not self.fit_prior:
        uniform_log_prior = -1 * math.log(self._n_classes_)
        self._class_log_prior_ = CumlArray(
            cp.full(self._n_classes_, uniform_log_prior))
        return

    log_counts = cp.log(self._class_count_)
    log_total = cp.log(cp.asarray(self._class_count_).sum())
    self._class_log_prior_ = CumlArray(log_counts - log_total)
def _update_feature_log_prob(self, alpha):
    """
    Add `alpha` to the raw feature counts and store the resulting log
    probabilities in ``self._feature_log_prob_``.

    Parameters
    ----------
    alpha : float
        amount of smoothing to apply (0. means no smoothing)
    """
    smoothed_counts = cp.asarray(self._feature_count_) + alpha

    # Row totals kept as a column vector so they broadcast across features.
    row_totals = smoothed_counts.sum(axis=1).reshape(-1, 1)

    log_prob = cp.log(smoothed_counts) - cp.log(row_totals)
    self._feature_log_prob_ = CumlArray(log_prob)
def _partial_fit(self, X, y, sample_weight=None, _classes=None):
    """
    Update the model with one batch of samples.

    Sparse input (scipy or cupyx) is rebuilt as a cupyx COO matrix; any
    other input is converted to a cupy array. Labels are passed through
    ``make_monotonic``. On the first call the class list, the class and
    feature counts and the counters are initialised; on later calls the
    labels are only checked against the stored classes. Counts and both
    log-probability tables are then refreshed.

    Parameters
    ----------
    X : dense or sparse matrix of samples
    y : labels for the rows of X
    sample_weight : not referenced by this implementation
    _classes : optional list of classes; only read on the first call

    Returns
    -------
    self
    """
    self._set_output_type(X)

    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    is_sparse = scipy_sparse_isspmatrix(X) or \
        cupyx.scipy.sparse.isspmatrix(X)

    if not is_sparse:
        X = input_to_cuml_array(X, order='K').array.to_output('cupy')
    else:
        coo = X.tocoo()
        row_idx = cp.asarray(coo.row, dtype=coo.row.dtype)
        col_idx = cp.asarray(coo.col, dtype=coo.col.dtype)
        values = cp.asarray(coo.data, dtype=coo.data.dtype)
        X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                          shape=coo.shape)

    y = input_to_cuml_array(y).array.to_output('cupy')

    Y, label_classes = make_monotonic(y, copy=True)

    if self.fit_called_:
        check_labels(Y, self._classes_)
    else:
        # Flag is set before validation, matching first-call semantics.
        self.fit_called_ = True

        if _classes is None:
            self._classes_ = CumlArray(data=label_classes)
        else:
            _classes, *_ = input_to_cuml_array(_classes, order='K')
            check_labels(Y, _classes.to_output('cupy'))
            self._classes_ = _classes

        self._n_classes_ = self.classes_.shape[0]
        self._n_features_ = X.shape[1]
        self._init_counters(self._n_classes_, self._n_features_, X.dtype)

    self._count(X, Y)

    self._update_feature_log_prob(self.alpha)
    self._update_class_log_prior(class_prior=self._class_prior_)

    return self
def predict_log_proba(self, X):
    """
    Return log-probability estimates for the test vector X.

    The joint log likelihood of each row is normalised by the log of the
    row's summed likelihood, computed as a log-sum-exp with the row
    maximum factored out.
    """
    out_type = self._get_output_type(X)

    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix

    # todo: use a sparse CumlArray style approach when ready
    # https://github.com/rapidsai/cuml/issues/2216
    is_sparse = scipy_sparse_isspmatrix(X) or \
        cupyx.scipy.sparse.isspmatrix(X)

    if not is_sparse:
        X = input_to_cuml_array(X, order='K').array.to_output('cupy')
    else:
        coo = X.tocoo()
        row_idx = cp.asarray(coo.row, dtype=coo.row.dtype)
        col_idx = cp.asarray(coo.col, dtype=coo.col.dtype)
        values = cp.asarray(coo.data, dtype=coo.data.dtype)
        X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                          shape=coo.shape)

    jll = self._joint_log_likelihood(X)

    # normalize by P(X) = P(f_1, ..., f_n)

    # Subtracting the row maximum before exp() keeps the terms from
    # overflowing to inf.
    row_max = cp.amax(jll, axis=1, keepdims=True)
    shifted_sum = cp.sum(cp.exp(jll - row_max), axis=1)
    log_prob_x = cp.squeeze(row_max, axis=1) + cp.log(shifted_sum)

    # Make it a row vector so its transpose broadcasts over the classes.
    if log_prob_x.ndim < 2:
        log_prob_x = log_prob_x.reshape((1, log_prob_x.shape[0]))

    return CumlArray(jll - log_prob_x.T).to_output(out_type)
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `to_dtype`.

    Overflow checking is only performed for NumPy arrays converted to a
    floating point dtype.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : numpy.ndarray, cudf Series or DataFrame, or any object for which
        ``cuda.is_cuda_array`` is true
    to_dtype : dtype (default: np.float32)
        Target dtype.
    legacy : bool (default: True)
        Only used for cuda arrays: when True the result is returned through
        ``cuda.as_cuda_array``, otherwise it is wrapped in a CumlArray.

    Returns
    -------
    The converted data. A NumPy array that already has dtype `to_dtype` is
    returned unchanged.

    Raises
    ------
    TypeError
        If the conversion of a NumPy array overflowed (a finite value became
        infinite), or if the type of `X` is not supported.
    """
    # temporarily importing here, until github issue #1681 reorganizing utils
    # is dealt with. Otherwise circular import causes issues
    from cuml.common import CumlArray

    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            X_m = X.astype(to_dtype)
            # Data loss shows up in the *converted* array: a value that was
            # finite in X and is infinite in X_m overflowed the target
            # dtype. Infinities already present in X are not a loss.
            if (np.issubdtype(X_m.dtype, np.floating)
                    and np.issubdtype(X.dtype, np.number)
                    and np.any(np.isinf(X_m) & np.isfinite(X))):
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame)):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        X_m = rmm_cupy_ary(cp.asarray, X)
        X_m = X_m.astype(to_dtype)
        if legacy:
            return cuda.as_cuda_array(X_m)
        else:
            return CumlArray(data=X_m)

    else:
        # The placeholder is required: without it the % operator itself
        # fails and the offending type never reaches the message.
        raise TypeError("Received unsupported input type %s" % type(X))

    # Only reached for a NumPy array that needed no conversion.
    return X
def input_to_cuml_array(X, order='F', deepcopy=False,
                        check_dtype=False, convert_to_dtype=False,
                        check_cols=False, check_rows=False,
                        fail_on_order=False):
    """
    Convert input X to CumlArray.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True
    * CumlArray - returns the same object unless deepcopy=True

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: 'F', 'C' or 'K' (default: 'F')
        Whether to return a F-major ('F'),  C-major ('C') array or Keep ('K')
        the order of X. Used to check the order of the input. If
        fail_on_order=True, the method will raise ValueError,
        otherwise it will convert X to be of order `order` if needed.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype to throw an error if X is not of dtype
        `check_dtype`. A list of dtypes is also accepted.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already. Disables the `check_dtype` check.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `cuml_array`: namedtuple('cuml_array', 'array n_rows n_cols dtype')

        A new CumlArray and associated data.

    Raises
    ------
    ValueError
        If a cuDF Series contains nulls, if the row/column count checks
        fail, or if the order differs and `fail_on_order` is True.
    TypeError
        If the format of X is not supported or its dtype is not in
        `check_dtype`.
    """

    # dtype conversion
    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        check_dtype = False

    # format conversion
    if (isinstance(X, cudf.Series)):
        if X.null_count != 0:
            raise ValueError("Error: cuDF Series has missing/null values, " +
                             " which are not supported by cuML.")

    # converting pandas to numpy before sending it to CumlArray
    if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
        # pandas doesn't support custom order in to_numpy
        X = cp.asarray(X.to_numpy(copy=False), order=order)

    if isinstance(X, cudf.DataFrame):
        if order == 'K':
            X_m = CumlArray(data=X.as_gpu_matrix(order='F'))
        else:
            X_m = CumlArray(data=X.as_gpu_matrix(order=order))

    elif isinstance(X, CumlArray):
        # Honour `deepcopy` here too, so the caller never gets back an alias
        # of its own array after explicitly asking for a copy.
        X_m = copy.deepcopy(X) if deepcopy else X

    elif hasattr(X, "__array_interface__") or \
            hasattr(X, "__cuda_array_interface__"):
        X_m = CumlArray(data=X)

        if deepcopy:
            X_m = copy.deepcopy(X_m)

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    if check_dtype:
        if not isinstance(check_dtype, list):
            check_dtype = [check_dtype]

        check_dtype = [np.dtype(dtype) for dtype in check_dtype]

        if X_m.dtype not in check_dtype:
            type_str = X_m.dtype
            # Drop the reference to the converted array before raising.
            del X_m
            raise TypeError("Expected input to be of type in " +
                            str(check_dtype) + " but got " + str(type_str))

    # Checks based on parameters

    n_rows = X_m.shape[0]

    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    # A single row or column is treated as order 'K', which skips the
    # order check below.
    if n_cols == 1 or n_rows == 1:
        order = 'K'

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) +
                             " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) +
                             " rows but got " + str(n_rows) +
                             " rows.")

    if order != 'K' and X_m.order != order:
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            X_m = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
            X_m = CumlArray(data=X_m)

    return cuml_array(array=X_m, n_rows=n_rows, n_cols=n_cols,
                      dtype=X_m.dtype)
def _init_counters(self, n_effective_classes, n_features, dtype):
    """
    Allocate zero-filled, F-ordered count arrays of the given dtype:
    ``_class_count_`` with one entry per class and ``_feature_count_``
    with shape (n_effective_classes, n_features).
    """
    layout = dict(order="F", dtype=dtype)

    self._class_count_ = CumlArray.zeros(n_effective_classes, **layout)
    self._feature_count_ = CumlArray.zeros(
        (n_effective_classes, n_features), **layout)