def fit(self, y):
    """
    Fit label binarizer

    Parameters
    ----------

    y : Dask.Array of shape [n_samples,] or [n_samples, n_classes]
        chunked by row.
        Target values. The 2-d matrix should only contain 0 and 1,
        represents multilabel classification.

    Returns
    -------

    self : returns an instance of self.
    """

    # Take the unique classes of every partition and gather them locally.
    futures = self.client_.sync(_extract_partitions, y)

    unique = [
        self.client_.submit(LabelBinarizer._func_unique_classes, part)
        for _, part in futures
    ]

    # Second positional argument requests a synchronous (blocking) compute.
    classes = self.client_.compute(unique, True)

    # Each partition yields a 1-d array of its own distinct labels and those
    # arrays may differ in length, so they are joined end to end (stacking
    # would require equally sized arrays) before the final de-duplication.
    self.classes_ = rmm_cupy_ary(
        cp.unique, rmm_cupy_ary(cp.concatenate, classes, axis=0))

    self.model = LB(**self.kwargs).fit(self.classes_)

    return self
def fit(self, y):
    """
    Fit label binarizer

    Parameters
    ----------

    y : array of shape [n_samples,] or [n_samples, n_classes]
        Target values. The 2-d matrix should only contain 0 and 1,
        represents multilabel classification.

    Returns
    -------

    self : returns an instance of self.

    Raises
    ------

    ValueError
        If `y` has more than 2 dimensions, or if a 2-d `y` contains
        values other than 0 and 1.
    """

    if y.ndim > 2:
        raise ValueError("labels cannot be greater than 2 dimensions")

    if y.ndim == 2:
        unique_classes = rmm_cupy_ary(cp.unique, y)

        # The set of distinct values is tiny, so it is validated on the
        # host. Comparing the device array against a Python list inside
        # an `if` does not produce a usable scalar truth value.
        if not set(unique_classes.tolist()) <= {0, 1}:
            raise ValueError("2-d array must be binary")

        # One class per column of the indicator matrix.
        self.classes_ = rmm_cupy_ary(cp.arange, 0, y.shape[1])
    else:
        self.classes_ = rmm_cupy_ary(cp.unique, y).astype(y.dtype)

    cp.cuda.Stream.null.synchronize()

    return self
def inverse_transform(self, y, threshold=None):
    """
    Transform binary labels back to original multi-class labels

    Parameters
    ----------

    y : array of shape [n_samples, n_classes]
        Dense array, CuPy sparse matrix or SciPy sparse matrix.
    threshold : float
        this value is currently ignored

    Returns
    -------

    arr : array with original labels
    """

    # invert_labels() requires the mapped labels and the classes to share
    # a dtype, so every branch casts to the dtype of the fitted classes.
    label_dtype = self.classes_.dtype

    if cp.sparse.isspmatrix(y):
        # Column indices of the stored entries are the monotonic labels.
        y_mapped = y.tocsr().indices.astype(label_dtype)
    elif scipy.sparse.isspmatrix(y):
        y = y.tocsr()
        y_mapped = rmm_cupy_ary(cp.array, y.indices, dtype=label_dtype)
    else:
        # Dense input: the position of the row maximum is the label.
        y_mapped = rmm_cupy_ary(cp.argmax,
                                rmm_cupy_ary(cp.asarray, y, dtype=y.dtype),
                                axis=1).astype(label_dtype)

    return invert_labels(y_mapped, self.classes_)
def create_local_data(m, n, centers, cluster_std, random_state,
                      dtype, type, order='F'):
    """
    Generate a blobs dataset on the host and move it to the device.

    Parameters
    ----------
    m, n : forwarded positionally to `skl_make_blobs`
    centers, cluster_std, random_state : forwarded to `skl_make_blobs`
    dtype : dtype the generated data is cast to
    type : str
        'array' to get CuPy arrays (y reshaped to (m, 1)), 'dataframe'
        to get cuDF DataFrames.
    order : str
        Memory layout used for the 'array' output.

    Returns
    -------
    X, y : pair of CuPy arrays or cuDF DataFrames

    Raises
    ------
    ValueError
        If `type` is neither 'array' nor 'dataframe'.
    """
    host_X, host_y = skl_make_blobs(m, n,
                                    centers=centers,
                                    cluster_std=cluster_std,
                                    random_state=random_state)

    if type == 'dataframe':
        # Note: only X is cast to `dtype` here; y keeps its generated dtype.
        X = cudf.DataFrame.from_pandas(pd.DataFrame(host_X.astype(dtype)))
        y = cudf.DataFrame.from_pandas(pd.DataFrame(host_y))
        return X, y

    if type == 'array':
        X = rmm_cupy_ary(cp.asarray, host_X.astype(dtype), order=order)
        y = rmm_cupy_ary(cp.asarray, host_y.astype(dtype), order=order)
        return X, y.reshape(m, 1)

    raise ValueError('type must be array or dataframe')
def transform(self, y):
    """
    Transform and return encoded labels

    Parameters
    ----------
    y : Dask.Array of shape [n_samples,] or [n_samples, n_classes]

    Returns
    -------
    arr : Dask.Array backed by CuPy arrays containing encoded labels
    """
    partitions = self.client_.sync(_extract_partitions, y)
    delayed_xform = dask.delayed(LabelBinarizer._func_xform)

    # The meta object tells Dask which array type backs each block.
    meta = rmm_cupy_ary(cp.zeros, 1)
    if self.model.sparse_output:
        meta = cp.sparse.csr_matrix(meta)

    blocks = []
    for _, part in partitions:
        # NOTE(review): every block is declared with the full
        # (len(y), n_classes) shape; confirm this is intended when y has
        # more than one partition.
        block = dask.array.from_delayed(
            delayed_xform(self.model, part),
            meta=meta,
            dtype=cp.float32,
            shape=(len(y), len(self.classes_)))
        blocks.append(block)

    stacked = dask.array.asarray(blocks)

    # Drop the leading axis introduced by stacking the block list.
    return stacked.reshape(stacked.shape[1:])
def inverse_transform(self, y, threshold=None):
    """
    Invert a set of encoded labels back to original labels

    Parameters
    ----------
    y : Dask.Array of shape [n_samples, n_classes] containing encoded
        labels
    threshold : float
        This value is currently ignored

    Returns
    -------
    arr : Dask.Array backed by CuPy arrays containing original labels
    """
    partitions = self.client_.sync(_extract_partitions, y)
    delayed_inverse = dask.delayed(LabelBinarizer._func_inv_xform)

    # Output labels carry the dtype of the fitted classes.
    out_dtype = self.classes_.dtype
    meta = rmm_cupy_ary(cp.zeros, 1, dtype=out_dtype)

    blocks = []
    for _, part in partitions:
        block = dask.array.from_delayed(
            delayed_inverse(self.model, part, threshold),
            dtype=out_dtype,
            shape=(y.shape[0], ),
            meta=meta)
        blocks.append(block)

    stacked = dask.array.stack(blocks, axis=0)

    # Drop the leading axis introduced by the stack.
    return stacked.reshape(stacked.shape[1:])
def make_monotonic(labels, classes=None, copy=False):
    """
    Takes a set of labels that might not be drawn from the
    set [0, n-1] and renumbers them to be drawn from that
    interval.

    Parameters
    ----------
    labels : array-like of size (n,)
        labels to convert
    classes : array-like of size (n_classes,)
        the unique set of classes in the set of labels; computed from
        `labels` when omitted
    copy : boolean
        if true, a copy will be returned and the operation will not be
        done in place.

    Returns
    -------
    mapped_labels : array-like of size (n,)
    classes : array-like of size (n_classes,)

    Raises
    ------
    ValueError
        If `labels` is not one-dimensional.
    """
    labels = rmm_cupy_ary(cp.asarray, labels, dtype=labels.dtype)
    if copy:
        labels = labels.copy()

    if labels.ndim != 1:
        raise ValueError("Labels array must be 1D")

    if classes is None:
        classes = rmm_cupy_ary(cp.unique, labels)

    n_labels = labels.shape[0]
    n_classes = classes.shape[0]

    # Shared memory request: one label-sized slot per class.
    smem = labels.dtype.itemsize * int(n_classes)

    kernel = _map_kernel(labels.dtype)
    grid = (math.ceil(n_labels / 32), )
    block = (32, )
    kernel(grid, block,
           (labels, n_labels, classes, n_classes),
           shared_mem=smem)

    return labels, classes
def invert_labels(labels, classes, copy=False):
    """
    Takes a set of labels that have been mapped to be drawn
    from a monotonically increasing set and inverts them
    back to the original set of classes.

    Parameters
    ----------
    labels : array-like of size (n,)
        labels to invert
    classes : array-like of size (n_classes,)
        the unique set of classes for inversion. It is assumed
        that the classes are ordered by their corresponding
        monotonically increasing label.
    copy : boolean
        if true, a copy will be returned and the operation will not be
        done in place.

    Returns
    -------
    inverted labels : array-like of size (n,)

    Raises
    ------
    ValueError
        If `labels` and `classes` do not share a dtype.
    """
    if labels.dtype != classes.dtype:
        raise ValueError("Labels and classes must have same dtype (%s != %s)"
                         % (labels.dtype, classes.dtype))

    labels = rmm_cupy_ary(cp.asarray, labels, dtype=labels.dtype)
    classes = rmm_cupy_ary(cp.asarray, classes, dtype=classes.dtype)

    if copy:
        labels = labels.copy()

    n_labels = len(labels)
    n_classes = len(classes)

    # Shared memory request: one label-sized slot per class.
    smem = labels.dtype.itemsize * n_classes

    inverse_map = _inverse_map_kernel(labels.dtype)
    inverse_map((math.ceil(n_labels / 32), ), (32, ),
                (classes, n_classes, labels, n_labels),
                shared_mem=smem)

    return labels
def label_binarize(y, classes, neg_label=0, pos_label=1,
                   sparse_output=False):
    """
    A stateless helper function to dummy encode multi-class labels.

    Parameters
    ----------
    y : array-like of size [n_samples,] or [n_samples, n_classes]
    classes : the set of unique classes in the input
    neg_label : integer the negative value for transformed output
    pos_label : integer the positive value for transformed output
    sparse_output : bool whether to return sparse array

    Returns
    -------
    CuPy CSR matrix when `sparse_output` is true, otherwise a dense array
    of the same dtype as `y` in which zero entries are replaced by
    `neg_label`.

    Raises
    ------
    ValueError
        If `y` contains a label that is not in `classes`.
    """
    classes = rmm_cupy_ary(cp.asarray, classes, dtype=classes.dtype)
    labels = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)

    if not check_labels(labels, classes):
        raise ValueError("Unseen classes encountered in input")

    # COO triplets: one `pos_label` entry per sample, placed in the column
    # of that sample's (monotonic) class index.
    row_ind = rmm_cupy_ary(cp.arange, 0, labels.shape[0], 1,
                           dtype=y.dtype)
    col_ind, _ = make_monotonic(labels, classes, copy=True)
    val = rmm_cupy_ary(cp.full, row_ind.shape[0], pos_label,
                       dtype=y.dtype)

    encoded = cp.sparse.coo_matrix(
        (val, (row_ind, col_ind)),
        shape=(col_ind.shape[0], classes.shape[0]),
        dtype=cp.float32)

    cp.cuda.Stream.null.synchronize()

    if sparse_output:
        return encoded.tocsr()

    dense = encoded.toarray().astype(y.dtype)
    dense[dense == 0] = neg_label
    return dense
def row_matrix(df):
    """Compute the C (row major) version gpu matrix of df

    :param df: object exposing `as_gpu_matrix(order=...)` (a cuDF
        DataFrame in the callers visible in this file).
    :return: row-major copy of the data, wrapped with
        `cuda.as_cuda_array`.
    """
    # Materialise the column-major matrix first, then let CuPy produce the
    # C-ordered array from it.
    column_major = df.as_gpu_matrix(order='F')
    c_ordered = rmm_cupy_ary(cp.array, column_major, order='C')

    return cuda.as_cuda_array(c_ordered)
def check_labels(labels, classes):
    """
    Validates that a set of labels is drawn from the unique
    set of given classes.

    Parameters
    ----------
    labels : array-like of size (n,)
        labels to validate
    classes : array-like of size (n_classes,)
        the unique set of classes to verify

    Returns
    -------
    result : boolean
        Truthy device scalar; it is the comparison of the validation flag
        with 1, so callers should use it in a boolean context.

    Raises
    ------
    ValueError
        If the dtypes differ or `labels` is not one-dimensional.
    """
    if labels.dtype != classes.dtype:
        raise ValueError("Labels and classes must have same dtype (%s != %s)"
                         % (labels.dtype, classes.dtype))

    labels = rmm_cupy_ary(cp.asarray, labels, dtype=labels.dtype)
    classes = rmm_cupy_ary(cp.asarray, classes, dtype=classes.dtype)

    if labels.ndim != 1:
        raise ValueError("Labels array must be 1D")

    # Single-element flag handed to the kernel; it starts at 1 and is
    # compared against 1 afterwards. Allocated through rmm_cupy_ary like
    # every other device allocation in this file.
    valid = rmm_cupy_ary(cp.array, [1])

    n_labels = labels.shape[0]
    n_classes = classes.shape[0]

    # Shared memory request: one label-sized slot per class.
    smem = labels.dtype.itemsize * int(n_classes)

    validate = _validate_kernel(labels.dtype)
    validate((math.ceil(n_labels / 32), ), (32, ),
             (labels, n_labels, classes, n_classes, valid),
             shared_mem=smem)

    return valid[0] == 1
def _conv_array_to_sparse(arr):
    """
    Converts an array (or cudf.DataFrame) to a sparse array

    :param arr: scipy or cupy sparse matrix, cudf DataFrame,
                dense numpy or cupy array
    :return: cupy sparse CSR matrix (a cupy sparse input is returned
             unchanged)
    :raises ValueError: for any other input type
    """
    if scipy.sparse.isspmatrix(arr):
        return cupyx.scipy.sparse.csr_matrix(arr.tocsr())

    if cupyx.scipy.sparse.isspmatrix(arr):
        # Already a device sparse matrix: hand it back untouched.
        return arr

    if isinstance(arr, cudf.DataFrame):
        return _conv_df_to_sparse(arr)

    if isinstance(arr, np.ndarray):
        # Host array: copy to the device before building the CSR matrix.
        device_ary = rmm_cupy_ary(cp.asarray, arr, dtype=arr.dtype)
        return cupyx.scipy.sparse.csr_matrix(device_ary)

    if isinstance(arr, cp.core.core.ndarray):
        return cupyx.scipy.sparse.csr_matrix(arr)

    raise ValueError("Unexpected input type %s" % type(arr))
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `to_dtype`

    Supported float dtypes for overflow checking.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : NumPy array, cuDF Series/DataFrame or cuda array interface
        compliant array
    to_dtype : dtype to convert to (default: np.float32)
    legacy : bool
        Only used for cuda array inputs: when true the result is wrapped
        with `cuda.as_cuda_array`, otherwise it is returned as a
        CumlArray.

    Returns
    -------
    The converted data. A NumPy array that already has dtype `to_dtype`
    is returned unchanged.

    Raises
    ------
    TypeError
        If converting a NumPy array turns finite values into infinities
        (overflow), or if the type of X is not supported.
    """
    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            # Overflow is detected explicitly below, so NumPy's own
            # overflow warning would only be noise.
            with np.errstate(over="ignore"):
                X_m = X.astype(to_dtype)

            # Data loss means an infinity that the conversion introduced;
            # infinities already present in the input are legitimate.
            introduced_inf = np.isinf(X_m) & ~np.isinf(X)
            if introduced_inf.any():
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame)):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        X_m = rmm_cupy_ary(cp.asarray, X)
        X_m = X_m.astype(to_dtype)
        if legacy:
            return cuda.as_cuda_array(X_m)

        # Imported here until github issue #1681 (reorganizing utils) is
        # dealt with; a module-level import would be circular.
        from cuml.common import CumlArray
        return CumlArray(data=X_m)

    else:
        raise TypeError("Received unsupported input type %s" % type(X))

    # NumPy input that already has the requested dtype.
    return X
def to_sp_dask_array(cudf_or_array, client=None):
    """
    Converts an array or cuDF to a sparse Dask array backed by sparse CuPy.
    CSR matrices. Unfortunately, due to current limitations in Dask, there is
    no direct path to convert a cupy.sparse.spmatrix into a CuPy backed
    dask.Array without copying to host.

    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, compute()
    will not be able to be called on a Dask.array that is backed with
    sparse CuPy arrays because they lack the necessary functionality
    to be stacked into a single array. The array returned from this
    utility will, however, still be able to be passed into functions
    that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
    Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
    client : dask.distributed.Client (optional) Dask client

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix

    Raises
    ------
    ValueError
        If a DataFrame input holds more than one dtype, or if the input
        type is not supported.
    """
    if client is None:
        client = default_client()

    # Makes sure the MatDescriptor workaround for CuPy sparse arrays
    # is loaded (since Dask lazy-loaded serialization in cuML is only
    # executed when object from the cuML package needs serialization.
    # This can go away once the MatDescriptor pickling bug is fixed
    # in CuPy.
    # Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    # Captured up front, before the input is converted below.
    shape = cudf_or_array.shape

    if isinstance(cudf_or_array, (dask.dataframe.DataFrame,
                                  cudf.DataFrame)):
        # DataFrames carry one dtype per column; only a homogeneous frame
        # can be converted.
        dtypes = np.unique(cudf_or_array.dtypes)
        if len(dtypes) > 1:
            raise ValueError("DataFrame should contain only a single dtype")
        dtype = dtypes[0]
    else:
        dtype = cudf_or_array.dtype

    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    if isinstance(cudf_or_array, dask.array.Array):
        # At the time of developing this, using map_blocks will not work
        # to convert a Dask.Array to CuPy sparse arrays underneath.
        parts = client.sync(_extract_partitions, cudf_or_array)
        frames = [
            client.submit(_conv_np_to_df, part, workers=[w])
            for w, part in parts
        ]
        cudf_or_array = to_dask_cudf(frames)

    if isinstance(cudf_or_array, dask.dataframe.DataFrame):
        # Dask.Dataframe needs special attention since it has multiple
        # dtypes. Just use the first (and assume all the rest are the
        # same). This also handles the input of dask.array.Array, which
        # was turned into a Dask DataFrame above.
        return cudf_or_array.map_partitions(
            _conv_df_to_sp, meta=dask.array.from_array(meta))

    if scipy.sparse.isspmatrix(cudf_or_array):
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(
            cudf_or_array.tocsr())
    elif cupyx.scipy.sparse.isspmatrix(cudf_or_array):
        pass
    elif isinstance(cudf_or_array, cudf.DataFrame):
        cupy_ary = cp.asarray(cudf_or_array.as_gpu_matrix(), dtype)
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    elif isinstance(cudf_or_array, np.ndarray):
        cupy_ary = rmm_cupy_ary(cp.asarray, cudf_or_array,
                                dtype=cudf_or_array.dtype)
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    elif isinstance(cudf_or_array, cp.core.core.ndarray):
        cudf_or_array = cupyx.scipy.sparse.csr_matrix(cudf_or_array)
    else:
        raise ValueError("Unexpected input type %s" % type(cudf_or_array))

    # Push to worker
    cudf_or_array = client.scatter(cudf_or_array)

    return dask.array.from_delayed(cudf_or_array, shape=shape, meta=meta)
def _conv_np_to_df(x):
    """Copy array `x` to the device and wrap it in a cuDF DataFrame."""
    device_matrix = rmm_cupy_ary(cp.asarray, x, dtype=x.dtype)
    frame = cudf.DataFrame.from_gpu_matrix(device_matrix)
    return frame
def _conv_df_to_sp(x):
    """Convert DataFrame `x` to a CuPy CSR matrix.

    The dtype of the first column is used for the whole matrix.
    """
    first_dtype = x.dtypes[0]
    dense = rmm_cupy_ary(cp.asarray, x.as_gpu_matrix(), dtype=first_dtype)
    return cp.sparse.csr_matrix(dense)
def make_low_rank_matrix(n_samples=100, n_features=100,
                         effective_rank=10, tail_strength=0.5,
                         random_state=None, n_parts=1,
                         n_samples_per_part=None):
    """ Generate a mostly low rank matrix with bell-shaped singular values

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.
    n_features : int, optional (default=100)
        The number of features.
    effective_rank : int, optional (default=10)
        The approximate number of singular vectors required to explain most
        of the data by linear combinations.
    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular values
        profile.
    random_state : int, CuPy RandomState instance, Dask RandomState instance
                   or None (default)
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
    n_parts : int, optional (default=1)
        The number of parts of work.
    n_samples_per_part : int, optional (default=None)
        Chunk size used for the final multiplication. Defaults to
        max(1, int(n_samples / n_parts)).

    Returns
    -------
    X : Dask-CuPy array of shape [n_samples, n_features]
        The matrix.
    """
    rs = create_rs_generator(random_state)
    n = min(n_samples, n_features)

    def generate_chunks_for_qr(total_size, min_size, n_parts):
        # Chunk length is the per-part size, but never below min_size.
        chunk = max(min_size, max(1, int(total_size / n_parts)))
        n_partitions = int(max(1, total_size / chunk))

        # Whatever does not divide evenly goes into the last chunk.
        rest = total_size % (n_partitions * chunk)
        return (chunk, ) * (n_partitions - 1) + (chunk + rest, )

    # Random (ortho normal) vectors
    row_chunks = generate_chunks_for_qr(n_samples, n, n_parts)
    m1 = rs.standard_normal((n_samples, n), chunks=(row_chunks, -1))
    u, _ = da.linalg.qr(m1)

    col_chunks = generate_chunks_for_qr(n_features, n, n_parts)
    m2 = rs.standard_normal((n, n_features), chunks=(-1, col_chunks))
    v, _ = da.linalg.qr(m2)

    # For final multiplication
    if n_samples_per_part is None:
        n_samples_per_part = max(1, int(n_samples / n_parts))
    rechunk_spec = {0: n_samples_per_part, 1: -1}
    u = u.rechunk(rechunk_spec)
    v = v.rechunk(rechunk_spec)

    # Index of the singular values
    sing_ind = rmm_cupy_ary(cp.arange, n, dtype=cp.float64)

    # Build the singular profile by assembling signal and noise components
    scaled = sing_ind / effective_rank
    signal = (1 - tail_strength) * rmm_cupy_ary(cp.exp, -1.0 * scaled**2)
    noise = tail_strength * rmm_cupy_ary(cp.exp, -0.1 * scaled)
    local_s = signal + noise
    s = da.from_array(local_s, chunks=(int(n_samples_per_part), ))

    # Scale the columns of u by the singular values, then recombine.
    u *= s
    return da.dot(u, v)
def input_to_cuml_array(X, order='F', deepcopy=False,
                        check_dtype=False, convert_to_dtype=False,
                        check_cols=False, check_rows=False,
                        fail_on_order=False):
    """
    Convert input X to CumlArray.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: string (default: 'F')
        Whether to return a F-major or C-major array. Used to check the order
        of the input. If fail_on_order=True method will raise ValueError,
        otherwise it will convert X to be of order `order`.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype (or a list of them) to throw an error if X is
        not of dtype `check_dtype`.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already. Disables the `check_dtype` check.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `cuml_array`: namedtuple('cuml_array', 'array n_rows n_cols dtype')

        A new CumlArray and associated data.

    """
    # temporarily importing here, until github issue #1681 reorganizing utils
    # is dealt with. Otherwise circular import causes issues
    from cuml.common import CumlArray

    # dtype conversion
    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        # The data now has the requested dtype, so checking is pointless.
        check_dtype = False

    # format conversion
    if isinstance(X, cudf.Series) and X.null_count != 0:
        raise ValueError("Error: cuDF Series has missing/null values, " +
                         " which are not supported by cuML.")

    if isinstance(X, cudf.DataFrame):
        if order == 'F':
            X_m = CumlArray(data=X.as_gpu_matrix(order='F'))
        elif order == 'C':
            X_m = CumlArray(data=cuml.utils.numba_utils.row_matrix(X))

    elif cuda.is_cuda_array(X) or isinstance(X, np.ndarray):
        X_m = CumlArray(data=X)
        if deepcopy:
            X_m = copy.deepcopy(X_m)

    else:
        raise TypeError(
            "X matrix format " + str(X.__class__) + " not supported")

    if check_dtype:
        # Normalise a single dtype to a one-element list of np.dtype.
        if not isinstance(check_dtype, list):
            check_dtype = [check_dtype]
        check_dtype = [np.dtype(candidate) for candidate in check_dtype]

        if X_m.dtype not in check_dtype:
            type_str = X_m.dtype
            del X_m
            raise TypeError("Expected input to be of type in " +
                            str(check_dtype) + " but got " + str(type_str))

    # Checks based on parameters
    n_rows = X_m.shape[0]
    n_cols = X_m.shape[1] if len(X_m.shape) > 1 else 1

    if check_cols and n_cols != check_cols:
        raise ValueError("Expected " + str(check_cols) +
                         " columns but got " + str(n_cols) +
                         " columns.")

    if check_rows and n_rows != check_rows:
        raise ValueError("Expected " + str(check_rows) +
                         " rows but got " + str(n_rows) +
                         " rows.")

    if X_m.order != order:
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")

        warnings.warn("Expected " + order_to_str(order) + " major order, "
                      "but got the opposite. Converting data, this will "
                      "result in additional memory utilization.")
        reordered = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
        X_m = CumlArray(data=reordered)

    return cuml_array(array=X_m, n_rows=n_rows, n_cols=n_cols,
                      dtype=X_m.dtype)
def _unique(x):
    """Return the result of `cp.unique` on `x`, run through rmm_cupy_ary."""
    distinct = rmm_cupy_ary(cp.unique, x)
    return distinct
def _func_inv_xform(model, y, threshold):
    """Run `model.inverse_transform` on `y` after converting it to a
    CuPy array of the same dtype."""
    encoded = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)
    return model.inverse_transform(encoded, threshold)
def _func_xform(model, y):
    """Run `model.transform` on `y` after converting it to a CuPy array
    of the same dtype."""
    labels = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)
    encoded = model.transform(labels)
    return encoded
def to_sparse_dask_array(cudf_or_array, client=None):
    """
    Converts an array or cuDF to a sparse Dask array backed by sparse CuPy.
    CSR matrices. Unfortunately, due to current limitations in Dask, there is
    no direct path to convert a cupy.sparse.spmatrix into a CuPy backed
    dask.Array without copying to host.

    NOTE: Until https://github.com/cupy/cupy/issues/2655 and
    https://github.com/dask/dask/issues/5604 are implemented, compute()
    will not be able to be called on a Dask.array that is backed with
    sparse CuPy arrays because they lack the necessary functionality
    to be stacked into a single array. The array returned from this
    utility will, however, still be able to be passed into functions
    that can make use of sparse CuPy-backed Dask.Array (eg. Distributed
    Naive Bayes).

    Relevant cuML issue: https://github.com/rapidsai/cuml/issues/1387

    Parameters
    ----------
    cudf_or_array : cuDF Dataframe, array-like sparse / dense array, or
                    Dask DataFrame/Array
    client : dask.distributed.Client (optional) Dask client

    Returns
    -------
    dask_array : dask.Array backed by cupy.sparse.csr_matrix
    """
    if client is None:
        client = default_client()

    # Makes sure the MatDescriptor workaround for CuPy sparse arrays
    # is loaded (since Dask lazy-loaded serialization in cuML is only
    # executed when object from the cuML package needs serialization.
    # This can go away once the MatDescriptor pickling bug is fixed
    # in CuPy.
    # Ref: https://github.com/cupy/cupy/issues/3061
    from cuml.comm import serialize  # NOQA

    # Captured from the original input, before any conversion.
    shape = cudf_or_array.shape

    meta = cupyx.scipy.sparse.csr_matrix(rmm_cupy_ary(cp.zeros, 1))

    ret = cudf_or_array

    # If we have a Dask array, convert it to a Dask DataFrame
    if isinstance(ret, dask.array.Array):
        # At the time of developing this, using map_blocks will not work
        # to convert a Dask.Array to CuPy sparse arrays underneath.

        def _conv_np_to_df(x):
            device_matrix = rmm_cupy_ary(cp.asarray, x, dtype=x.dtype)
            return cudf.DataFrame.from_gpu_matrix(device_matrix)

        parts = client.sync(_extract_partitions, ret)
        futures = []
        for worker, part in parts:
            futures.append(client.submit(_conv_np_to_df, part,
                                         workers=[worker], pure=False))

        ret = to_dask_cudf(futures)

    # If we have a Dask Dataframe, use `map_partitions` to convert it
    # to a Sparse Cupy-backed Dask Array. This will also convert the dense
    # Dask array above to a Sparse Cupy-backed Dask Array, since we cannot
    # use map_blocks on the array, but we can use `map_partitions` on the
    # Dataframe.
    if isinstance(ret, dask.dataframe.DataFrame):
        # This will also handle the input of dask.array.Array
        return ret.map_partitions(_conv_df_to_sparse,
                                  meta=dask.array.from_array(meta))

    local_sparse = _conv_array_to_sparse(ret)

    # Push to worker
    final_result = client.scatter(local_sparse)

    return dask.array.from_delayed(final_result, shape=shape, meta=meta)
def train_test_split(X, y,
                     test_size: Union[float, int] = None,
                     train_size: Union[float, int] = None,
                     shuffle: bool = True,
                     random_state: Union[int, cp.random.RandomState,
                                         np.random.RandomState] = None,
                     seed: Union[int, cp.random.RandomState,
                                 np.random.RandomState] = None):
    """
    Partitions device data into four collated objects, mimicking
    Scikit-learn's `train_test_split`

    Parameters
    ----------
    X : cudf.DataFrame or cuda_array_interface compliant device array
        Data to split, has shape (n_samples, n_features)
    y : str, cudf.Series or cuda_array_interface compliant device array
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X (if it is a cuDF DataFrame)
        containing the labels
    test_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the test set. If an int, represents the number
        of instances to be assigned to the test set. Defaults to the
        complement of `train_size`. The test set is taken from the end
        of the (optionally shuffled) data.
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the training set. If an int, represents the number
        of instances to be assigned to the training set. Defaults to the
        complement of `test_size`, or to 0.75 when neither is given.
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    random_state : int, CuPy RandomState or NumPy RandomState optional
        If shuffle is true, seeds the generator. Unseeded by default
    seed: random_state : int, CuPy RandomState or NumPy RandomState optional
        Deprecated in favor of `random_state`.
        If shuffle is true, seeds the generator. Unseeded by default

    Examples
    --------
    .. code-block:: python

        import cudf
        from cuml.preprocessing.model_selection import train_test_split

        # Generate some sample data
        df = cudf.DataFrame({'x': range(10),
                             'y': [0, 1] * 5})
        print(f'Original data: {df.shape[0]} elements')

        # Suppose we want an 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(df, 'y',
                                                            train_size=0.8)
        print(f'X_train: {X_train.shape[0]} elements')
        print(f'X_test: {X_test.shape[0]} elements')
        print(f'y_train: {y_train.shape[0]} elements')
        print(f'y_test: {y_test.shape[0]} elements')

        # Alternatively, if our labels are stored separately
        labels = df['y']
        df = df.drop(['y'])

        # we can also do
        X_train, X_test, y_train, y_test = train_test_split(df, labels,
                                                            train_size=0.8)

    Output:

    .. code-block:: python

        Original data: 10 elements
        X_train: 8 elements
        X_test: 2 elements
        y_train: 8 elements
        y_test: 2 elements

    Returns
    -------
    X_train, X_test, y_train, y_test : cudf.DataFrame
        Partitioned dataframes. If `y` was provided as a column name, the
        column was dropped from the `X`s

    Raises
    ------
    TypeError
        If X or y has an unsupported type, if y is a column name but X is
        not a cuDF DataFrame, or if `random_state` has an unsupported type.
    ValueError
        If X and y differ in their first dimension or a size argument is
        out of range.
    """

    def _take_rows(data, start, stop):
        # cuDF objects are sliced positionally through .iloc; everything
        # else (device arrays, sparse matrices) supports plain slicing.
        if isinstance(data, (cudf.DataFrame, cudf.Series)):
            return data.iloc[start:stop]
        return data[start:stop]

    if isinstance(y, str):
        # Use the column with name `str` as y
        if not isinstance(X, cudf.DataFrame):
            raise TypeError("X needs to be a cuDF Dataframe when y is a "
                            "string")
        name = y
        y = X[name]
        X = X.drop(name)

    # todo: this check will be replaced with upcoming improvements
    # to input_utils with PR #1379
    # CSR matrices are accepted for X because the split below handles them.
    if not (cuda.is_cuda_array(X)
            or isinstance(X, (cudf.DataFrame, cudf.Series,
                              cp.sparse.csr_matrix))):
        raise TypeError("X needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    if not (cuda.is_cuda_array(y)
            or isinstance(y, (cudf.DataFrame, cudf.Series))):
        raise TypeError("y needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    n_samples = X.shape[0]

    if n_samples != y.shape[0]:
        raise ValueError("X and y must have the same first dimension "
                         "(found {} and {})".format(n_samples, y.shape[0]))

    if isinstance(train_size, float):
        if not 0 <= train_size <= 1:
            raise ValueError("proportion train_size should be between "
                             "0 and 1 (found {})".format(train_size))

    if isinstance(train_size, int):
        if not 0 <= train_size <= n_samples:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size))

    if isinstance(test_size, float):
        if not 0 <= test_size <= 1:
            raise ValueError("proportion test_size should be between "
                             "0 and 1 (found {})".format(test_size))

    if isinstance(test_size, int):
        if not 0 <= test_size <= n_samples:
            raise ValueError(
                "Number of instances test_size should be between 0 and the "
                "first dimension of X (found {})".format(test_size))

    x_numba = False
    y_numba = False

    if seed is not None:
        if random_state is None:
            warnings.warn("Parameter 'seed' is deprecated, please use "
                          "'random_state' instead.")
            random_state = seed
        else:
            warnings.warn("Both 'seed' and 'random_state' parameters were "
                          "set, using 'random_state' since 'seed' is "
                          "deprecated.")

    if shuffle:
        # The permutation lives where the generator lives: on the device
        # for CuPy generators, on the host for NumPy generators.
        if random_state is None or isinstance(random_state, int):
            idxs = rmm_cupy_ary(cp.arange, n_samples)
            random_state = cp.random.RandomState(seed=random_state)

        elif isinstance(random_state, cp.random.RandomState):
            idxs = rmm_cupy_ary(cp.arange, n_samples)

        elif isinstance(random_state, np.random.RandomState):
            idxs = np.arange(n_samples)

        else:
            raise TypeError("`random_state` must be an int, NumPy "
                            "RandomState or CuPy RandomState.")

        random_state.shuffle(idxs)

        # NOTE(review): a cp.sparse.csr_matrix X matches neither branch
        # below and is therefore left unshuffled while y is shuffled;
        # confirm whether sparse input with shuffle=True is expected.
        if isinstance(X, cudf.DataFrame) or isinstance(X, cudf.Series):
            X = X.iloc[idxs].reset_index(drop=True)

        elif cuda.is_cuda_array(X):
            # numba (and therefore rmm device_array) does not support
            # fancy indexing
            if cuda.devicearray.is_cuda_ndarray(X):
                x_numba = True
            X = cp.asarray(X)[idxs]

        if isinstance(y, cudf.DataFrame) or isinstance(y, cudf.Series):
            y = y.iloc[idxs]

        elif cuda.is_cuda_array(y):
            if cuda.devicearray.is_cuda_ndarray(y):
                y_numba = True
            y = cp.asarray(y)[idxs]

    # Determining sizes of splits
    if isinstance(train_size, float):
        train_size = int(n_samples * train_size)

    if test_size is None:
        if train_size is None:
            train_size = int(n_samples * 0.75)

        test_size = n_samples - train_size

    if isinstance(test_size, float):
        test_size = int(n_samples * test_size)
        if train_size is None:
            train_size = n_samples - test_size

    elif isinstance(test_size, int):
        if train_size is None:
            train_size = n_samples - test_size

    # X and y are sliced independently, each according to its own type.
    X_train = _take_rows(X, 0, train_size)
    y_train = _take_rows(y, 0, train_size)

    # An explicit start index keeps test_size == 0 an empty split; a
    # negative-zero slice (`[-0:]`) would select every row instead.
    test_start = n_samples - test_size
    X_test = _take_rows(X, test_start, n_samples)
    y_test = _take_rows(y, test_start, n_samples)

    # Inputs that arrived as numba device arrays are handed back as such.
    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)

    return X_train, X_test, y_train, y_test
def _count_accurate_predictions(y_hat, y):
    """Count the positions at which `y_hat` equals `y`.

    Computed as the number of elements along the first axis of `y` minus
    the number of non-zero entries of `y - y_hat`.
    """
    predicted = rmm_cupy_ary(cp.asarray, y_hat, dtype=y_hat.dtype)
    expected = rmm_cupy_ary(cp.asarray, y, dtype=y.dtype)

    n_wrong = cp.count_nonzero(expected - predicted)
    return expected.shape[0] - n_wrong
def input_to_dev_array(X, order='F', deepcopy=False,
                       check_dtype=False, convert_to_dtype=False,
                       check_cols=False, check_rows=False,
                       fail_on_order=False):
    """
    Convert input X to device array suitable for C++ methods.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: string (default: 'F')
        Whether to return a F-major or C-major array. Used to check the order
        of the input. If fail_on_order=True method will raise ValueError,
        otherwise it will convert X to be of order `order`.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype, or a collection of them, to throw an error if
        X is not of dtype `check_dtype`.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already. Disables the `check_dtype` check.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `inp_array`: namedtuple('inp_array', 'array pointer n_rows n_cols dtype')

        A new device array if the input was not a numba device array.
        It is a reference to the input X if it was a numba device array or
        cuda array interface compliant (like cupy)

    Raises
    ------
    TypeError
        If the type of X is unsupported or its dtype fails `check_dtype`.
    ValueError
        If a cuDF Series contains nulls, `check_dtype` is not a type or a
        collection of types, a row/column count check fails, or the order
        differs while `fail_on_order` is set.
    """

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        # The data now has the requested dtype, so checking is pointless.
        check_dtype = False

    if isinstance(X, cudf.DataFrame):
        if order == 'F':
            X_m = X.as_gpu_matrix(order='F')
        elif order == 'C':
            X_m = cuml.utils.numba_utils.row_matrix(X)

    elif isinstance(X, cudf.Series):
        if deepcopy:
            X_m = X.to_gpu_array()
        elif X.null_count == 0:
            # using __cuda_array_interface__ support of cudf.Series for
            # this temporarily while switching from rmm device_array to
            # rmm deviceBuffer https://github.com/rapidsai/cuml/issues/1379
            X_m = cuda.as_cuda_array(X._column)
        else:
            raise ValueError("Error: cuDF Series has missing/null values")

    elif isinstance(X, np.ndarray):
        X_m = rmm.to_device(np.array(X, order=order, copy=False))

    elif cuda.is_cuda_array(X):
        # Use cuda array interface to create a device array by reference
        X_m = cuda.as_cuda_array(X)

        if deepcopy:
            out_dev_array = rmm.device_array_like(X_m)
            out_dev_array.copy_to_device(X_m)
            X_m = out_dev_array

    elif cuda.devicearray.is_cuda_ndarray(X):
        # NOTE(review): probably already covered by the is_cuda_array
        # branch above; kept unchanged pending confirmation.
        if deepcopy:
            out_dev_array = rmm.device_array_like(X)
            out_dev_array.copy_to_device(X)
            X_m = out_dev_array
        else:
            X_m = X

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    # The dtype reported to the caller is always that of the device array.
    dtype = X_m.dtype

    if check_dtype:
        if isinstance(check_dtype, (type, np.dtype)):
            if dtype != check_dtype:
                del X_m
                raise TypeError("Expected " + str(check_dtype) +
                                " input but got " + str(dtype) +
                                " instead.")
        elif isinstance(check_dtype, Collection) and \
                not isinstance(check_dtype, str):
            # The 'not isinstance(check_dtype, string)' condition is needed,
            # because the 'float32' string is a Collection, but in this
            # branch we only want to process collections like
            # [np.float32, np.float64].
            if dtype not in check_dtype:
                del X_m
                raise TypeError("Expected input to be of type in " +
                                str(check_dtype) + " but got " + str(dtype))
        else:
            raise ValueError("Expected a type as check_dtype arg, but got " +
                             str(check_dtype))

    n_rows = X_m.shape[0]
    n_cols = X_m.shape[1] if len(X_m.shape) > 1 else 1

    if check_cols and n_cols != check_cols:
        raise ValueError("Expected " + str(check_cols) +
                         " columns but got " + str(n_cols) +
                         " columns.")

    if check_rows and n_rows != check_rows:
        raise ValueError("Expected " + str(check_rows) +
                         " rows but got " + str(n_rows) +
                         " rows.")

    if not check_numba_order(X_m, order):
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")

        warnings.warn("Expected " + order_to_str(order) + " major order, "
                      "but got the opposite. Converting data, this will "
                      "result in additional memory utilization.")
        X_m = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
        X_m = cuda.as_cuda_array(X_m)

    X_ptr = get_dev_array_ptr(X_m)

    return inp_array(array=X_m, pointer=X_ptr, n_rows=n_rows, n_cols=n_cols,
                     dtype=dtype)
def _func_unique_classes(y):
    """Return the result of `cp.unique` on `y`, run through rmm_cupy_ary."""
    distinct = rmm_cupy_ary(cp.unique, y)
    return distinct