def test_data_generator_types(input_type):
    X, *_ = datagen.gen_data('blobs', input_type, n_samples=100,
                             n_features=10)
    if input_type == 'numpy':
        assert isinstance(X, np.ndarray)
    elif input_type == 'cudf':
        assert isinstance(X, cudf.DataFrame)
    elif input_type == 'pandas':
        assert isinstance(X, pd.DataFrame)
    elif input_type == 'gpuarray':
        assert cuda.is_cuda_array(X)
    elif input_type == 'gpuarray-c':
        assert cuda.is_cuda_array(X)
    else:
        assert False
def test_as_cuda_array(self):
    h_arr = np.arange(10)
    self.assertFalse(cuda.is_cuda_array(h_arr))
    d_arr = cuda.to_device(h_arr)
    self.assertTrue(cuda.is_cuda_array(d_arr))
    my_arr = ForeignArray(d_arr)
    self.assertTrue(cuda.is_cuda_array(my_arr))
    wrapped = cuda.as_cuda_array(my_arr)
    self.assertTrue(cuda.is_cuda_array(wrapped))

    # Their values must equal the original array
    np.testing.assert_array_equal(wrapped.copy_to_host(), h_arr)
    np.testing.assert_array_equal(d_arr.copy_to_host(), h_arr)

    # d_arr and wrapped must be the same buffer
    self.assertPointersEqual(wrapped, d_arr)
def test_as_cuda_array(self):
    h_arr = np.arange(10)
    self.assertFalse(cuda.is_cuda_array(h_arr))
    d_arr = cuda.to_device(h_arr)
    self.assertTrue(cuda.is_cuda_array(d_arr))
    my_arr = MyArray(d_arr)
    self.assertTrue(cuda.is_cuda_array(my_arr))
    wrapped = cuda.as_cuda_array(my_arr)
    self.assertTrue(cuda.is_cuda_array(wrapped))

    # Their values must equal the original array
    np.testing.assert_array_equal(wrapped.copy_to_host(), h_arr)
    np.testing.assert_array_equal(d_arr.copy_to_host(), h_arr)

    # d_arr and wrapped must be the same buffer
    self.assertEqual(wrapped.device_ctypes_pointer.value,
                     d_arr.device_ctypes_pointer.value)
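# Both tests above wrap a device array in a foreign object that is not itself
# a Numba device array but still passes cuda.is_cuda_array(). The
# ForeignArray/MyArray definitions are not shown in these snippets; a minimal
# sketch of such a wrapper (an assumption based on is_cuda_array() detecting
# the __cuda_array_interface__ attribute) would be:
class ForeignArray:
    """Wraps a device array and forwards its CUDA Array Interface."""

    def __init__(self, arr):
        self._arr = arr

    @property
    def __cuda_array_interface__(self):
        return self._arr.__cuda_array_interface__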
def test_output_type_context_mgr(global_output_type, context_type):
    dataset = get_small_dataset('numba')
    test_type = 'cupy' if global_output_type != 'cupy' else 'numpy'
    cuml.set_global_output_type(test_type)

    # use cuml context manager
    with cuml.using_output_type(context_type):
        dbscan_float = cuml.DBSCAN(eps=1.0, min_samples=1)
        dbscan_float.fit(dataset)
        res = dbscan_float.labels_

        if context_type == 'numba':
            assert is_cuda_array(res)
        else:
            assert isinstance(res, test_output_types[context_type])

    # use cuml again outside the context manager
    dbscan_float = cuml.DBSCAN(eps=1.0, min_samples=1)
    dbscan_float.fit(dataset)
    res = dbscan_float.labels_
    assert isinstance(res, test_output_types[test_type])
def _typecast_will_lose_information(X, target_dtype):
    """
    Returns True if typecast will cause information loss, else False.
    Handles float/float, float/int, and int/int typecasts.
    """
    target_dtype = np.dtype(target_dtype).type

    if target_dtype in (np.int8, np.int16, np.int32, np.int64):
        target_dtype_range = np.iinfo(target_dtype)
    else:
        target_dtype_range = np.finfo(target_dtype)

    if isinstance(X, (np.ndarray, cp.ndarray, pd.Series, cudf.Series)):
        if X.dtype.type == target_dtype:
            return False
        return ((X < target_dtype_range.min) |
                (X > target_dtype_range.max)).any()
    elif isinstance(X, (pd.DataFrame, cudf.DataFrame)):
        X_m = X.values
        return _typecast_will_lose_information(X_m, target_dtype)
    elif cuda.is_cuda_array(X):
        X_m = cp.asarray(X)
        return _typecast_will_lose_information(X_m, target_dtype)
    else:
        raise TypeError("Received unsupported input type: %s" % type(X))
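# A quick illustration of the range check above (a sketch; assumes the numpy
# import used throughout these snippets and the function defined just above):
X_demo = np.array([1.0, 300.0], dtype=np.float32)
assert _typecast_will_lose_information(X_demo, np.int8)  # 300 > iinfo(int8).max == 127
assert not _typecast_will_lose_information(X_demo, np.float64)  # both fit in float64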
def convert_dtype(X, to_dtype=np.float32):
    """
    Convert X to be of dtype `dtype`

    Supported float dtypes for overflow checking.
    Todo: support other dtypes if needed.
    """
    if isinstance(X, np.ndarray):
        dtype = X.dtype
        if dtype != to_dtype:
            X_m = X.astype(to_dtype)
            if len(X[X == np.inf]) > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m
    elif isinstance(X, (cudf.Series, cudf.DataFrame)):
        return X.astype(to_dtype)
    elif cuda.is_cuda_array(X):
        X_m = cp.asarray(X)
        X_m = X_m.astype(to_dtype)
        return cuda.as_cuda_array(X_m)
    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    return X
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `dtype`, raising a TypeError
    if the conversion would lose information.
    """
    would_lose_info = _typecast_will_lose_information(X, to_dtype)
    if would_lose_info:
        raise TypeError("Data type conversion would lose information.")

    if isinstance(X, np.ndarray):
        dtype = X.dtype
        if dtype != to_dtype:
            X_m = X.astype(to_dtype)
            return X_m
    elif isinstance(X, (cudf.Series, cudf.DataFrame,
                        pd.Series, pd.DataFrame)):
        return X.astype(to_dtype, copy=False)
    elif cuda.is_cuda_array(X):
        X_m = cp.asarray(X)
        X_m = X_m.astype(to_dtype, copy=False)
        if legacy:
            return cuda.as_cuda_array(X_m)
        else:
            return CumlArray(data=X_m)
    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    return X
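# Usage sketch for this variant (hypothetical values; assumes numpy and the
# helpers defined above are importable):
X_demo = convert_dtype(np.array([1.5, 2.5], dtype=np.float64), np.float32)
assert X_demo.dtype == np.float32

try:
    # np.finfo(np.float32).max is ~3.4e38, so this conversion would lose
    # information and _typecast_will_lose_information makes it raise.
    convert_dtype(np.array([1e39]), np.float32)
except TypeError as e:
    print(e)  # Data type conversion would lose information.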
def convert_dtype(X, to_dtype=np.float32):
    """
    Convert X to be of dtype `dtype`

    Supported float dtypes for overflow checking.
    Todo: support other dtypes if needed.
    """

    # Using cuDF for converting numba and device array interface inputs
    # if CuPy not installed, temporary while CuPy conda package
    # causes nccl conflicts
    if isinstance(X, np.ndarray):
        dtype = X.dtype
        if dtype != to_dtype:
            X_m = X.astype(to_dtype)
            if len(X[X == np.inf]) > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m
    elif isinstance(X, cudf.Series):
        return X.astype(to_dtype)
    elif cuda.is_cuda_array(X):
        if has_cupy():
            import cupy as cp
            X_m = cp.asarray(X)
            X_m = X_m.astype(to_dtype)
            return cuda.as_cuda_array(X_m)
        else:
            warnings.warn("Using cuDF for dtype conversion, install "
                          "CuPy for faster data conversion.")
            if len(X.shape) == 1:
                return cudf.Series(X).astype(to_dtype).to_gpu_array()
            else:
                X_df = cudf.DataFrame()
                X = X_df.from_gpu_matrix(X)
                X = convert_dtype(X, to_dtype=to_dtype)
                return X.as_gpu_matrix()
    elif isinstance(X, cudf.DataFrame):
        dtype = np.dtype(X[X.columns[0]]._column.dtype)
        if dtype != to_dtype:
            new_cols = [(col, X._cols[col].astype(to_dtype))
                        for col in X._cols]
            overflowed = sum([len(colval[colval >= np.inf])
                              for colname, colval in new_cols])
            if overflowed > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return cudf.DataFrame(new_cols)
    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    return X
def test_default_global_output_type(input_type):
    dataset = get_small_dataset(input_type)
    dbscan_float = cuml.DBSCAN(eps=1.0, min_samples=1)
    dbscan_float.fit(dataset)
    res = dbscan_float.labels_

    if input_type == 'numba':
        assert is_cuda_array(res)
    else:
        assert isinstance(res, test_output_types[input_type])
def typeof_pyval(self, val):
    # Based on _DispatcherBase.typeof_pyval, but differs from it to support
    # the CUDA Array Interface.
    try:
        return typeof(val, Purpose.argument)
    except ValueError:
        if cuda.is_cuda_array(val):
            # When typing, we don't need to synchronize on the array's
            # stream - this is done when the kernel is launched.
            return typeof(cuda.as_cuda_array(val, sync=False),
                          Purpose.argument)
        else:
            raise
def get_dtype(X):
    """
    Returns dtype of obj as a Numpy style dtype (like np.float32)
    """
    if isinstance(X, cudf.DataFrame):
        dtype = np.dtype(X[X.columns[0]]._column.dtype)
    elif isinstance(X, cudf.Series):
        dtype = np.dtype(X._column.dtype)
    elif isinstance(X, np.ndarray):
        dtype = X.dtype
    elif cuda.is_cuda_array(X):
        dtype = X.dtype
    elif cuda.devicearray.is_cuda_ndarray(X):
        dtype = X.dtype
    else:
        raise TypeError("Input object not understood for dtype detection.")
    return dtype
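# Usage sketch (hypothetical inputs; assumes the numpy/cudf imports these
# snippets rely on):
assert get_dtype(np.zeros(3, dtype=np.float64)) == np.float64
# For cuDF inputs the dtype of the first column is reported, e.g.
# get_dtype(cudf.DataFrame({'a': [1.0, 2.0]})) gives dtype('float64').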
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `dtype`

    Supported float dtypes for overflow checking.
    Todo: support other dtypes if needed.
    """

    # temporarily importing here, until github issue #1681 reorganizing utils
    # is dealt with. Otherwise circular import causes issues
    from cuml.common import CumlArray

    if isinstance(X, np.ndarray):
        dtype = X.dtype
        if dtype != to_dtype:
            X_m = X.astype(to_dtype)
            if len(X[X == np.inf]) > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m
    elif isinstance(X, (cudf.Series, cudf.DataFrame)):
        return X.astype(to_dtype)
    elif cuda.is_cuda_array(X):
        X_m = rmm_cupy_ary(cp.asarray, X)
        X_m = X_m.astype(to_dtype)
        if legacy:
            return cuda.as_cuda_array(X_m)
        else:
            return CumlArray(data=X_m)
    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    return X
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(arbitrary.data,
                            arbitrary.dtype,
                            mask=arbitrary.mask,
                            categories=categories)

    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary,
                                         dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use the cuda array interface to create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate new output array using rmm and copy the numba device
        # array to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary),
                             nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf, obuf, count, nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = dtype
            if (type(dtype) == str and dtype == 'empty') or dtype is None:
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn("Date32 values are not yet supported so this will "
                          "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [as_column(chunk, dtype=dtype)
                    for chunk in arbitrary.chunks]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary), dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(
                                np.dtype(dtype).type)
                data = as_column(
                    pa.array(arbitrary, type=pa_type,
                             from_pandas=nan_as_null),
                    nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(pd.Series(arbitrary, dtype='category'),
                                     nan_as_null=nan_as_null)
                else:
                    if dtype is None:
                        np_type = None
                    else:
                        np_type = np.dtype(dtype)
                    data = as_column(np.array(arbitrary, dtype=np_type),
                                     nan_as_null=nan_as_null)

    return data
def input_to_dev_array(X, order='F', deepcopy=False,
                       check_dtype=False, convert_to_dtype=False,
                       check_cols=False, check_rows=False,
                       fail_on_order=False):
    """
    Convert input X to device array suitable for C++ methods.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: string (default: 'F')
        Whether to return a F-major or C-major array. Used to check the order
        of the input. If fail_on_order=True, the method will raise ValueError;
        otherwise it will convert X to be of order `order`.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype to throw an error if X is not of dtype
        `check_dtype`.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `inp_array`: namedtuple('inp_array', 'array pointer n_rows n_cols dtype')
        A new device array if the input was not a numba device array. It is a
        reference to the input X if it was a numba device array or cuda array
        interface compliant (like cupy).
    """

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        check_dtype = False

    if isinstance(X, cudf.DataFrame):
        dtype = np.dtype(X[X.columns[0]]._column.dtype)
        if order == 'F':
            X_m = X.as_gpu_matrix(order='F')
        elif order == 'C':
            X_m = cuml.utils.numba_utils.row_matrix(X)

    elif isinstance(X, cudf.Series):
        if deepcopy:
            X_m = X.to_gpu_array()
        else:
            if X.null_count == 0:
                # using __cuda_array_interface__ support of cudf.Series for
                # this temporarily while switching from rmm device_array to
                # rmm deviceBuffer
                # https://github.com/rapidsai/cuml/issues/1379
                X_m = cuda.as_cuda_array(X._column)
            else:
                raise ValueError("Error: cuDF Series has missing/null values")

    elif isinstance(X, np.ndarray):
        dtype = X.dtype
        X_m = rmm.to_device(np.array(X, order=order, copy=False))

    elif cuda.is_cuda_array(X):
        # Use the cuda array interface to create a device array by reference
        X_m = cuda.as_cuda_array(X)

        if deepcopy:
            out_dev_array = rmm.device_array_like(X_m)
            out_dev_array.copy_to_device(X_m)
            X_m = out_dev_array

    elif cuda.devicearray.is_cuda_ndarray(X):
        if deepcopy:
            out_dev_array = rmm.device_array_like(X)
            out_dev_array.copy_to_device(X)
            X_m = out_dev_array
        else:
            X_m = X

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    dtype = X_m.dtype

    if check_dtype:
        if isinstance(check_dtype, type) or isinstance(check_dtype, np.dtype):
            if dtype != check_dtype:
                del X_m
                raise TypeError("Expected " + str(check_dtype) +
                                " input but got " + str(dtype) + " instead.")
        elif isinstance(check_dtype, Collection) and \
                not isinstance(check_dtype, str):
            # The 'not isinstance(check_dtype, str)' condition is needed,
            # because the 'float32' string is a Collection, but in this
            # branch we only want to process collections like
            # [np.float32, np.float64].
            if dtype not in check_dtype:
                del X_m
                raise TypeError("Expected input to be of type in " +
                                str(check_dtype) + " but got " + str(dtype))
        else:
            raise ValueError("Expected a type as check_dtype arg, but got " +
                             str(check_dtype))

    n_rows = X_m.shape[0]
    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) +
                             " rows but got " + str(n_rows) + " rows.")

    if not check_numba_order(X_m, order):
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            X_m = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
            X_m = cuda.as_cuda_array(X_m)

    X_ptr = get_dev_array_ptr(X_m)

    return inp_array(array=X_m, pointer=X_ptr, n_rows=n_rows, n_cols=n_cols,
                     dtype=dtype)
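# A hypothetical call pattern for the helper above (illustrative values only;
# assumes a working GPU, rmm, and the imports used throughout these snippets):
X_demo = np.ones((4, 3), dtype=np.float32)
res = input_to_dev_array(X_demo, order='F', check_dtype=np.float32,
                         check_cols=3)
print(res.n_rows, res.n_cols, res.dtype)  # 4 3 float32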
def input_to_host_array(X, order='F', deepcopy=False,
                        check_dtype=False, convert_to_dtype=False,
                        check_cols=False, check_rows=False,
                        fail_on_order=False):
    """
    Convert input X to host array (NumPy) suitable for C++ methods that accept
    host arrays.

    Acceptable input formats:

    * Numpy array - returns a pointer to the original input

    * cuDF Dataframe - returns a deep copy always

    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`

    * cuda array interface compliant array (like Cupy) - returns a
        reference unless deepcopy=True

    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------

    X: cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: string (default: 'F')
        Whether to return a F-major or C-major array. Used to check the order
        of the input. If fail_on_order=True, the method will raise ValueError;
        otherwise it will convert X to be of order `order`.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype to throw an error if X is not of dtype
        `check_dtype`.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `inp_array`: namedtuple('inp_array', 'array pointer n_rows n_cols dtype')
        `inp_array` is a new host array if the input was not a NumPy array.
        It is a reference to the input X if it was a NumPy host array.
    """

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        check_dtype = False

    if isinstance(X, cudf.DataFrame):
        dtype = np.dtype(X[X.columns[0]]._column.dtype)
        if order == 'F':
            X_m = X.as_gpu_matrix(order='F')
        elif order == 'C':
            X_m = cuml.utils.numba_utils.row_matrix(X)
        X_m = X_m.copy_to_host()

    elif isinstance(X, cudf.Series):
        if X.null_count == 0:
            X_m = X.to_array()
        else:
            raise ValueError('cuDF Series has missing (null) values.')

    elif isinstance(X, np.ndarray):
        X_m = np.array(X, order=order, copy=deepcopy)

    elif cuda.is_cuda_array(X):
        # Use the cuda array interface to create a device array by reference,
        # then copy it to the host
        X_m = cuda.as_cuda_array(X)
        X_m = np.array(X_m.copy_to_host(), order=order)

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    dtype = X_m.dtype

    if check_dtype:
        if isinstance(check_dtype, type):
            if dtype != check_dtype:
                del X_m
                raise TypeError("Expected " + str(check_dtype) +
                                " input but got " + str(dtype) + " instead.")
        elif isinstance(check_dtype, Collection):
            if dtype not in check_dtype:
                del X_m
                raise TypeError("Expected input to be of type in " +
                                str(check_dtype) + " but got " + str(dtype))

    n_rows = X_m.shape[0]
    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) +
                             " rows but got " + str(n_rows) + " rows.")

    X_ptr = X_m.ctypes.data

    return inp_array(array=X_m, pointer=X_ptr, n_rows=n_rows, n_cols=n_cols,
                     dtype=dtype)
def train_test_split(X,
                     y,
                     test_size: Union[float, int] = None,
                     train_size: Union[float, int] = None,
                     shuffle: bool = True,
                     random_state: Union[int, cp.random.RandomState,
                                         np.random.RandomState] = None,
                     seed: Union[int, cp.random.RandomState,
                                 np.random.RandomState] = None):
    """
    Partitions device data into four collated objects, mimicking
    Scikit-learn's `train_test_split`

    Parameters
    ----------
    X : cudf.DataFrame or cuda_array_interface compliant device array
        Data to split, has shape (n_samples, n_features)
    y : str, cudf.Series or cuda_array_interface compliant device array
        Set of labels for the data, either a series of shape (n_samples)
        or the string label of a column in X (if it is a cuDF DataFrame)
        containing the labels
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data to be assigned
        to the training set. If an int, represents the number of instances to
        be assigned to the training set. Defaults to 0.75 when neither
        `train_size` nor `test_size` is specified.
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting
    random_state : int, CuPy RandomState or NumPy RandomState, optional
        If shuffle is true, seeds the generator. Unseeded by default
    seed : int, CuPy RandomState or NumPy RandomState, optional
        Deprecated in favor of `random_state`. If shuffle is true, seeds the
        generator. Unseeded by default

    Examples
    --------

    .. code-block:: python

        import cudf
        from cuml.preprocessing.model_selection import train_test_split

        # Generate some sample data
        df = cudf.DataFrame({'x': range(10),
                             'y': [0, 1] * 5})
        print(f'Original data: {df.shape[0]} elements')

        # Suppose we want an 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(df, 'y',
                                                            train_size=0.8)
        print(f'X_train: {X_train.shape[0]} elements')
        print(f'X_test: {X_test.shape[0]} elements')
        print(f'y_train: {y_train.shape[0]} elements')
        print(f'y_test: {y_test.shape[0]} elements')

        # Alternatively, if our labels are stored separately
        labels = df['y']
        df = df.drop(['y'])

        # we can also do
        X_train, X_test, y_train, y_test = train_test_split(df, labels,
                                                            train_size=0.8)

    Output:

    .. code-block:: python

        Original data: 10 elements
        X_train: 8 elements
        X_test: 2 elements
        y_train: 8 elements
        y_test: 2 elements

    Returns
    -------
    X_train, X_test, y_train, y_test : cudf.DataFrame
        Partitioned dataframes. If `y` was provided as a column name, the
        column was dropped from the `X`s
    """
    if isinstance(y, str):
        # Use the column with name `str` as y
        if isinstance(X, cudf.DataFrame):
            name = y
            y = X[name]
            X = X.drop(name)
        else:
            raise TypeError("X needs to be a cuDF Dataframe when y is a "
                            "string")

    # todo: this check will be replaced with upcoming improvements
    # to input_utils with PR #1379
    if not cuda.is_cuda_array(X) and \
            not isinstance(X, (cudf.DataFrame, cudf.Series)):
        raise TypeError("X needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    if not cuda.is_cuda_array(y) and \
            not isinstance(y, (cudf.DataFrame, cudf.Series)):
        raise TypeError("y needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    if X.shape[0] != y.shape[0]:
        raise ValueError("X and y must have the same first dimension "
                         "(found {} and {})".format(X.shape[0], y.shape[0]))

    if isinstance(train_size, float):
        if not 0 <= train_size <= 1:
            raise ValueError("proportion train_size should be between "
                             "0 and 1 (found {})".format(train_size))

    if isinstance(train_size, int):
        if not 0 <= train_size <= X.shape[0]:
            raise ValueError(
                "Number of instances train_size should be between 0 and the "
                "first dimension of X (found {})".format(train_size))

    if isinstance(test_size, float):
        if not 0 <= test_size <= 1:
            raise ValueError("proportion test_size should be between "
                             "0 and 1 (found {})".format(test_size))

    if isinstance(test_size, int):
        if not 0 <= test_size <= X.shape[0]:
            raise ValueError(
                "Number of instances test_size should be between 0 and the "
                "first dimension of X (found {})".format(test_size))

    x_numba = False
    y_numba = False

    if seed is not None:
        if random_state is None:
            warnings.warn("Parameter 'seed' is deprecated, please use "
                          "'random_state' instead.")
            random_state = seed
        else:
            warnings.warn("Both 'seed' and 'random_state' parameters were "
                          "set, using 'random_state' since 'seed' is "
                          "deprecated.")

    if shuffle:
        if random_state is None or isinstance(random_state, int):
            idxs = rmm_cupy_ary(cp.arange, X.shape[0])
            random_state = cp.random.RandomState(seed=random_state)
        elif isinstance(random_state, cp.random.RandomState):
            idxs = rmm_cupy_ary(cp.arange, X.shape[0])
        elif isinstance(random_state, np.random.RandomState):
            idxs = np.arange(X.shape[0])
        else:
            raise TypeError("`random_state` must be an int, NumPy "
                            "RandomState or CuPy RandomState.")

        random_state.shuffle(idxs)

        if isinstance(X, (cudf.DataFrame, cudf.Series)):
            X = X.iloc[idxs].reset_index(drop=True)
        elif cuda.is_cuda_array(X):
            # numba (and therefore rmm device_array) does not support
            # fancy indexing
            if cuda.devicearray.is_cuda_ndarray(X):
                x_numba = True
            X = cp.asarray(X)[idxs]

        if isinstance(y, (cudf.DataFrame, cudf.Series)):
            y = y.iloc[idxs]
        elif cuda.is_cuda_array(y):
            if cuda.devicearray.is_cuda_ndarray(y):
                y_numba = True
            y = cp.asarray(y)[idxs]

    # Determining sizes of splits
    if isinstance(train_size, float):
        train_size = int(X.shape[0] * train_size)

    if test_size is None:
        if train_size is None:
            train_size = int(X.shape[0] * 0.75)
        test_size = X.shape[0] - train_size

    if isinstance(test_size, float):
        test_size = int(X.shape[0] * test_size)
        if train_size is None:
            train_size = X.shape[0] - test_size
    elif isinstance(test_size, int):
        if train_size is None:
            train_size = X.shape[0] - test_size

    if cuda.is_cuda_array(X) or isinstance(X, cp.sparse.csr_matrix):
        X_train = X[0:train_size]
        y_train = y[0:train_size]
    elif isinstance(X, cudf.DataFrame):
        X_train = X.iloc[0:train_size]
        y_train = y.iloc[0:train_size]

    if cuda.is_cuda_array(X) or isinstance(X, cp.sparse.csr_matrix):
        X_test = X[-1 * test_size:]
        y_test = y[-1 * test_size:]
    elif isinstance(X, cudf.DataFrame):
        X_test = X.iloc[-1 * test_size:]
        y_test = y.iloc[-1 * test_size:]

    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)

    return X_train, X_test, y_train, y_test
def is_device_array(self, obj):
    return cuda.is_cuda_array(obj)
def input_to_cuml_array(X, order='F', deepcopy=False,
                        check_dtype=False, convert_to_dtype=False,
                        check_cols=False, check_rows=False,
                        fail_on_order=False):
    """
    Convert input X to CumlArray.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: string (default: 'F')
        Whether to return a F-major or C-major array. Used to check the order
        of the input. If fail_on_order=True, the method will raise ValueError;
        otherwise it will convert X to be of order `order`.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype to throw an error if X is not of dtype
        `check_dtype`.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `cuml_array`: namedtuple('cuml_array', 'array n_rows n_cols dtype')
        A new CumlArray and associated data.
    """

    # temporarily importing here, until github issue #1681 reorganizing utils
    # is dealt with. Otherwise circular import causes issues
    from cuml.common import CumlArray

    # dtype conversion
    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        check_dtype = False

    # format conversion
    if isinstance(X, cudf.Series):
        if X.null_count != 0:
            raise ValueError("Error: cuDF Series has missing/null values, "
                             "which are not supported by cuML.")

    if isinstance(X, cudf.DataFrame):
        if order == 'F':
            X_m = CumlArray(data=X.as_gpu_matrix(order='F'))
        elif order == 'C':
            X_m = CumlArray(data=cuml.utils.numba_utils.row_matrix(X))

    elif cuda.is_cuda_array(X) or isinstance(X, np.ndarray):
        X_m = CumlArray(data=X)
        if deepcopy:
            X_m = copy.deepcopy(X_m)

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    if check_dtype:
        if not isinstance(check_dtype, list):
            check_dtype = [check_dtype]
        check_dtype = [np.dtype(dtype) for dtype in check_dtype]
        if X_m.dtype not in check_dtype:
            type_str = X_m.dtype
            del X_m
            raise TypeError("Expected input to be of type in " +
                            str(check_dtype) + " but got " + str(type_str))

    # Checks based on parameters
    n_rows = X_m.shape[0]
    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) +
                             " rows but got " + str(n_rows) + " rows.")

    if X_m.order != order:
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            X_m = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
            X_m = CumlArray(data=X_m)

    return cuml_array(array=X_m, n_rows=n_rows, n_cols=n_cols,
                      dtype=X_m.dtype)
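# A hypothetical usage of the CumlArray-based converter above (illustrative
# values only; assumes a working GPU and the imports used in these snippets):
X_demo = np.asfortranarray(np.ones((4, 3), dtype=np.float32))
res = input_to_cuml_array(X_demo, order='F', check_cols=3, check_rows=4)
print(res.n_rows, res.n_cols, res.dtype)  # 4 3 float32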
def input_to_dev_array(X, order='F', deepcopy=False,
                       check_dtype=False, convert_to_dtype=False,
                       check_cols=False, check_rows=False,
                       fail_on_order=False):
    """
    Convert input X to device array suitable for C++ methods

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless deepcopy=True
    * numba device array - returns a reference unless deepcopy=True

    Returns: namedtuple('dev_array', 'array pointer n_rows n_cols dtype')
        `dev_array` is a new device array if the input was not a numba
        device array. It is a reference to the input X if it was a numba
        device array or cuda array interface compliant (like cupy)
    """

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        check_dtype = False

    if isinstance(X, cudf.DataFrame):
        dtype = np.dtype(X[X.columns[0]]._column.dtype)
        if order == 'F':
            X_m = X.as_gpu_matrix(order='F')
        elif order == 'C':
            X_m = cuml.utils.numba_utils.row_matrix(X)

    elif isinstance(X, cudf.Series):
        if deepcopy:
            X_m = X.to_gpu_array()
        else:
            if X.null_count == 0:
                X_m = X._column._data.mem
            else:
                raise ValueError("Error: cuDF Series has missing/null values")

    elif isinstance(X, np.ndarray):
        dtype = X.dtype
        X_m = rmm.to_device(np.array(X, order=order, copy=False))

    elif cuda.is_cuda_array(X):
        # Use the cuda array interface to create a device array by reference
        X_m = cuda.as_cuda_array(X)

        if deepcopy:
            out_dev_array = rmm.device_array_like(X_m)
            out_dev_array.copy_to_device(X_m)
            X_m = out_dev_array

    elif cuda.devicearray.is_cuda_ndarray(X):
        if deepcopy:
            out_dev_array = rmm.device_array_like(X)
            out_dev_array.copy_to_device(X)
            X_m = out_dev_array
        else:
            X_m = X

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    dtype = X_m.dtype

    if check_dtype:
        if dtype != check_dtype:
            del X_m
            raise TypeError("Expected " + str(check_dtype) +
                            " input but got " + str(dtype) + " instead.")

    n_rows = X_m.shape[0]
    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) +
                             " rows but got " + str(n_rows) + " rows.")

    if not check_numba_order(X_m, order):
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            cuml.utils.numba_utils.gpu_major_converter(X_m, n_rows, n_cols,
                                                       dtype, to_order=order)

    X_ptr = get_dev_array_ptr(X_m)

    result = namedtuple('dev_array', 'array pointer n_rows n_cols dtype')
    return result(array=X_m, pointer=X_ptr, n_rows=n_rows, n_cols=n_cols,
                  dtype=dtype)