def test_time_series_label_output_type(output_str, output_types):
    """Check the type of ``make_arima`` output under a chosen output type.

    The data is generated while ``output_str`` is the active cuML output
    type; element 0 of the result must be an instance of
    ``output_types[1]``.
    """
    # Generate the series while the requested output type is active
    with cuml.using_output_type(output_str):
        generated = make_arima(n_obs=10, random_state=0)
        first_item = generated[0]

    expected_type = output_types[1]
    assert isinstance(first_item, expected_type)
def test_output_type_context_mgr(global_output_type, context_type):
    """Check that ``using_output_type`` overrides the global type only
    inside the ``with`` block.

    A global output type different from ``global_output_type`` is set
    first. DBSCAN labels read inside the context manager must follow
    ``context_type``; labels read afterwards must follow the global type
    that was set here.
    """
    dataset = get_small_dataset('numba')

    # Pick a global type that is guaranteed to differ from 'cupy'/'numpy'
    # whichever one the fixture supplied
    test_type = 'numpy' if global_output_type == 'cupy' else 'cupy'
    cuml.set_global_output_type(test_type)

    # Inside the context manager the context type wins
    with cuml.using_output_type(context_type):
        model_in_ctx = cuml.DBSCAN(eps=1.0, min_samples=1)
        model_in_ctx.fit(dataset)
        labels_in_ctx = model_in_ctx.labels_

        if context_type != 'numba':
            assert isinstance(labels_in_ctx, test_output_types[context_type])
        else:
            assert is_cuda_array(labels_in_ctx)

    # Outside the context manager the global type applies again
    model_outside = cuml.DBSCAN(eps=1.0, min_samples=1)
    model_outside.fit(dataset)
    labels_outside = model_outside.labels_
    assert isinstance(labels_outside, test_output_types[test_type])
def generate_classification_data(classes=2, rows=1000, cols=32, cat_cols=0):
    """Generate classification training set

    Parameters
    ----------
    classes : int
        Passed to ``make_classification`` as ``n_classes``.
    rows : int
        Passed as ``n_samples``.
    cols : int
        Passed as ``n_features``; ``cols // 3`` of them are informative.
    cat_cols : int
        When positive, up to ``cat_cols`` sampled columns are replaced by
        'negative'/'positive' categorical values and the result is
        converted with ``to_pandas()``.

    Returns
    -------
    (data, labels)
    """
    wants_categorical = cat_cols > 0

    # Categorical post-processing below relies on dataframe operations,
    # so the data is generated as cudf in that case
    with cuml.using_output_type('cudf' if wants_categorical else 'numpy'):
        data, labels = cuml.datasets.make_classification(
            n_samples=rows,
            n_features=cols,
            n_informative=cols // 3,
            n_classes=classes,
            random_state=0,
        )

    if not wants_categorical:
        return data, labels

    n_picked = min(cat_cols, cols)
    selected_cols = data.sample(n=n_picked, axis='columns')

    # Masks are computed before the dtype change so they reflect the
    # numeric values
    negatives = (selected_cols < 0)
    positives = (selected_cols >= 0)

    selected_cols = selected_cols.astype('object')
    selected_cols[negatives] = 'negative'
    selected_cols[positives] = 'positive'

    data[selected_cols.columns] = selected_cols.astype('category')
    data = data.to_pandas()
    labels = labels.to_pandas()
    return data, labels
def test_predict_non_gaussian(n_samples, n_features, n_neighbors, n_query):
    """Compare cuML and scikit-learn KNN predictions on uniform random data.

    Both models are fitted on the same seeded data (host pandas frames for
    scikit-learn, cudf copies for cuML) and their predictions on the query
    set must be equal element-wise.
    """
    np.random.seed(123)

    # Host-side data; the order of the random draws is part of the fixture
    train_features = np.random.uniform(0, 1, (n_samples, n_features))
    X_host_train = pd.DataFrame(train_features)
    train_labels = np.random.randint(0, 5, (n_samples, 1))
    y_host_train = pd.DataFrame(train_labels)
    query_features = np.random.uniform(0, 1, (n_query, n_features))
    X_host_test = pd.DataFrame(query_features)

    # Device-side copies of the same frames
    X_device_train = cudf.DataFrame.from_pandas(X_host_train)
    y_device_train = cudf.DataFrame.from_pandas(y_host_train)
    X_device_test = cudf.DataFrame.from_pandas(X_host_test)

    # Reference result from scikit-learn
    knn_sk = skKNN(algorithm="brute", n_neighbors=n_neighbors, n_jobs=1)
    knn_sk.fit(X_host_train, y_host_train)
    sk_result = knn_sk.predict(X_host_test)

    # cuML result, requested as numpy so it can be compared directly
    knn_cuml = cuKNN(n_neighbors=n_neighbors)
    knn_cuml.fit(X_device_train, y_device_train)
    with cuml.using_output_type("numpy"):
        cuml_result = knn_cuml.predict(X_device_test)

    assert np.array_equal(cuml_result, sk_result)
def test_svm_skl_cmp_multiclass(params, dataset='classification2',
                                n_rows=100, n_cols=6):
    """Fit cuML and scikit-learn SVC on a 3-class dataset and compare them.

    Both classifiers are built from the same ``params`` and handed to
    ``compare_svm`` together with the held-out split.
    """
    splits = make_dataset(dataset, n_rows, n_cols,
                          n_classes=3, n_informative=6)
    X_train, X_test, y_train, y_test = splits

    # Default to numpy for testing
    with cuml.using_output_type("numpy"):
        cu_model = cu_svm.SVC(**params)
        cu_model.fit(X_train, y_train)

        sk_model = svm.SVC(**params)
        sk_model.fit(X_train, y_train)

        compare_svm(cu_model, sk_model, X_test, y_test,
                    coef_tol=1e-5, report_summary=True)
def test_pairwise_distances_output_types(input_type, output_type, use_global):
    """Check the container type returned by ``pairwise_distances``.

    Parameters
    ----------
    input_type : str
        "cudf" or "cupy" convert the inputs accordingly; any other value
        leaves them as the numpy arrays produced by the RNG.
    output_type : str
        Requested output type ("input", "cudf", "numpy" or "cupy").
    use_global : bool
        When true, the type is requested only through the
        ``using_output_type`` context manager and the function's own
        ``output_type`` argument is ``None``.
    """
    # Test larger sizes to sklearn
    rng = np.random.RandomState(5)
    X = rng.random_sample((100, 100))
    Y = rng.random_sample((100, 100))

    if input_type == "cudf":
        X = cudf.DataFrame(X)
        Y = cudf.DataFrame(Y)
    elif input_type == "cupy":
        X = cp.asarray(X)
        Y = cp.asarray(Y)

    # Set to None if we are using the global object
    output_type_param = None if use_global else output_type

    # Use the global manager object. Should do nothing unless use_global is
    # set
    with cuml.using_output_type(output_type):
        # Compare to sklearn, fp64
        S = pairwise_distances(X, Y, metric="euclidean",
                               output_type=output_type_param)

        if output_type == "input":
            assert isinstance(S, type(X))
        elif output_type == "cudf":
            assert isinstance(S, cudf.DataFrame)
        elif output_type == "numpy":
            assert isinstance(S, np.ndarray)
        elif output_type == "cupy":
            # Use the public alias: the private ``cp.core.core`` module path
            # is not available in newer CuPy releases
            assert isinstance(S, cp.ndarray)
def test_dec_input_output(input_type, input_dtype, input_shape, output_type):
    """Check the stored-array attribute of ``DummyTestEstimator`` against
    the estimator-level and context-level output types.

    The test is skipped when cudf is involved on either side and the dtype
    is listed in ``unsupported_cudf_dtypes``.
    """
    cudf_involved = input_type == "cudf" or output_type == "cudf"
    if cudf_involved and input_dtype in unsupported_cudf_dtypes:
        pytest.skip("Unsupported cudf combination")

    X_in = create_input(input_type, input_dtype, input_shape, "C")
    X_out = create_output(X_in, output_type)

    # ---- Estimator configured with output_type="input" ----
    mirror_est = DummyTestEstimator(output_type="input")
    mirror_est.store_input(X_in)

    # The very same object must have been stored internally
    assert X_in is mirror_est.get_input()
    assert mirror_est.__dict__["input_any_"].input_type == input_type

    # Reading the attribute gives back the input type
    assert determine_array_type(mirror_est.input_any_) == input_type
    assert_array_identical(mirror_est.input_any_, X_in)

    # The context manager switches the type that is handed out
    with cuml.using_output_type(output_type):
        assert determine_array_type(mirror_est.input_any_) == output_type
        assert_array_identical(mirror_est.input_any_, X_out)

    # ---- Estimator configured with output_type=output_type ----
    typed_est = DummyTestEstimator(output_type=output_type)
    typed_est.store_input(X_in)

    # Reading the attribute gives the estimator's output type
    assert determine_array_type(typed_est.input_any_) == output_type
    assert_array_identical(typed_est.input_any_, X_out)

    # "input" in the context manager goes back to the input type
    with cuml.using_output_type("input"):
        assert determine_array_type(typed_est.input_any_) == input_type
        assert_array_identical(typed_est.input_any_, X_in)
def check_correct_type(index):
    """Return whether the global output type matches the one set here.

    The output type is looked up in ``test_output_types_str`` by
    ``index``, activated with ``using_output_type``, and compared with
    ``cuml.global_settings.output_type`` after a 0.5 s sleep while the
    context is still active.
    """
    expected = test_output_types_str[index]

    # Force a race condition
    if index == 0:
        sleep(0.1)

    with using_output_type(expected):
        sleep(0.5)
        still_matches = cuml.global_settings.output_type == expected
        return still_matches
def test_xy_output_type(generator, output_str, output_types):
    """Check the type of each item returned by a dataset generator.

    ``generator`` is called while ``output_str`` is the active output type;
    its returned items are paired with ``output_types`` and each must be an
    instance of its paired type.
    """
    # Generate while the requested output type is active
    with cuml.using_output_type(output_str):
        generated = generator(n_samples=10, random_state=0)

    for item, expected_type in zip(generated, output_types):
        assert isinstance(item, expected_type)
def generate_regression_data(rows=1000, cols=32):
    """Generate a regression training set with numpy output.

    Parameters
    ----------
    rows : int
        Passed to ``make_regression`` as ``n_samples``.
    cols : int
        Passed as ``n_features``; ``cols // 3`` of them are informative.

    Returns
    -------
    (data, labels)
    """
    with cuml.using_output_type('numpy'):
        features, targets = cuml.datasets.make_regression(
            n_samples=rows,
            n_features=cols,
            n_informative=cols // 3,
            random_state=0,
        )
    return features, targets
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    """Exercise GPU (FIL) prediction of the cuML random forest regressor
    for different ``fil_sparse_format`` / ``algo`` combinations.

    Combinations where ``fil_sparse_format`` is falsy or 'not_supported',
    or where ``algo`` is 'tree_reorg' / 'batch_tree_reorg', must raise
    ``ValueError``. For the others the R2 score is compared with a
    converted FIL model, the treelite conversion is checked, and (for
    fewer than 1000 rows) the score is compared with scikit-learn.
    """
    use_handle = True
    n_trees = 50

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Create a handle for the cuml model
    handle, _stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(n_bins=16, split_criterion=2,
                       min_samples_leaf=2, random_state=123, n_streams=1,
                       n_estimators=n_trees, handle=handle, max_leaves=-1,
                       max_depth=40, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)

    fil_kwargs = dict(predict_model="GPU",
                      fil_sparse_format=fil_sparse_format,
                      algo=algo)

    expect_error = (not fil_sparse_format
                    or algo in ('tree_reorg', 'batch_tree_reorg')
                    or fil_sparse_format == 'not_supported')

    # predict using FIL
    if expect_error:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, **fil_kwargs)
        return

    fil_preds = cuml_model.predict(X_test, **fil_kwargs)
    fil_preds = np.reshape(fil_preds, np.shape(y_test))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # The stand-alone FIL model must score exactly the same
    fil_model = cuml_model.convert_to_fil_model()
    with cuml.using_output_type("numpy"):
        fil_model_preds = fil_model.predict(X_test)
        fil_model_preds = np.reshape(fil_model_preds, np.shape(y_test))
        fil_model_r2 = r2_score(y_test, fil_model_preds,
                                convert_dtype=datatype)
        assert fil_r2 == fil_model_r2

    tl_model = cuml_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert X.shape[1] == tl_model.num_features

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if X.shape[0] < 1000:  # mode != "stress":
        sk_model = skrfr(n_estimators=50, max_depth=40,
                         min_samples_split=2, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
def _fit_indicator(self, X):
    """Fit a MissingIndicator.

    Sets ``self.indicator_`` to a ``MissingIndicator`` fitted on ``X`` when
    ``self.add_indicator`` is truthy, and to ``None`` otherwise.
    """
    if not self.add_indicator:
        self.indicator_ = None
        return

    # Construction and fitting both happen with cupy as the output type
    with cuml.using_output_type("cupy"):
        self.indicator_ = MissingIndicator(
            missing_values=self.missing_values,
            error_on_new=False,
        )
        self.indicator_.fit(X)
def _func_predict_proba_partial(model, input_data, **kwargs):
    """
    Whole dataset inference with part of the model (trees at disposal
    locally). Transfer dataset instead of model. Interesting when model is
    larger than dataset.

    ``input_data`` is concatenated, passed to ``model.predict_proba`` with
    cupy as the output type, and the prediction is returned with an extra
    axis inserted at position 1.
    """
    full_input = concatenate(input_data)

    with using_output_type('cupy'):
        probabilities = model.predict_proba(full_input, **kwargs)

    return cp.expand_dims(probabilities, axis=1)
def _check_inverse_transform(self, X):
    """Check that func and inverse_func are the inverse.

    Roughly every hundredth row of ``X`` (at least a stride of 1) is sent
    through ``transform`` followed by ``inverse_transform``; a
    ``UserWarning`` is emitted when the round trip does not reproduce the
    selected rows according to ``_allclose_dense_sparse``.
    """
    n_rows = X.shape[0]
    interval = max(1, n_rows // 100)

    # Row indices 0, interval, 2 * interval, ...
    n_selected = n_rows // interval
    selection = list(range(0, n_selected * interval, interval))

    with cuml.using_output_type("cupy"):
        transformed = self.transform(X[selection])
        X_round_trip = self.inverse_transform(transformed)

    if _allclose_dense_sparse(X[selection], X_round_trip):
        return

    warnings.warn("The provided functions are not strictly"
                  " inverse of each other. If you are sure you"
                  " want to proceed regardless, set"
                  " 'check_inverse=False'.", UserWarning)
def test_make_arima(dtype, output_type, batch_size, n_obs, random_state,
                    order):
    """Check the shape of the data generated by ``cuml.make_arima``.

    ``order`` is an 8-tuple ``(p, d, q, P, D, Q, s, k)``; the generated
    array must have shape ``(n_obs, batch_size)``.
    """
    p, d, q, P, D, Q, s, k = order
    non_seasonal = (p, d, q)
    seasonal = (P, D, Q, s)

    with cuml.using_output_type(output_type):
        out = cuml.make_arima(batch_size, n_obs, non_seasonal, seasonal, k,
                              random_state=random_state, dtype=dtype)

    expected_shape = (n_obs, batch_size)
    assert out.shape == expected_shape, "out shape mismatch"
def check_correct_type(index):
    """Return whether the global output type matches the one set here.

    Even indices activate ``_using_mirror_output_type`` and expect
    'mirror'; odd indices activate ``using_output_type`` with the entry of
    ``test_output_types_str`` at ``index``. The comparison with
    ``cuml.global_settings.output_type`` happens after a 0.5 s sleep while
    the context is still active.
    """
    # Force a race condition
    if index == 0:
        sleep(0.1)

    if index % 2 == 0:
        expected = 'mirror'
        context = _using_mirror_output_type()
    else:
        expected = test_output_types_str[index]
        context = using_output_type(expected)

    with context:
        sleep(0.5)
        return cuml.global_settings.output_type == expected
def test_pickle(input_type):
    """Check that ``DummyTestEstimator`` survives a pickle round trip.

    After every output type has been read once on the original estimator,
    the estimator is pickled and restored. The restored copy must hold a
    single stored value of the original input type, and must still hand
    out every output type, both through the context manager and through
    its own ``output_type`` attribute.
    """
    if (input_type == "numba"):
        pytest.skip("numba arrays cant be picked at this time")

    est = DummyTestEstimator()

    X_in = create_input(input_type, np.float32, (10, 5), "C")

    est.store_input(X_in)

    # Loop and verify we have filled the cache
    for out_type in test_output_types_str:
        with cuml.using_output_type(out_type):
            assert_array_identical(est.input_any_,
                                   create_output(X_in, out_type))

    est_pickled_bytes = pickle.dumps(est)
    est_unpickled: DummyTestEstimator = pickle.loads(est_pickled_bytes)

    # Assert that we only restore the input
    assert est_unpickled.__dict__["input_any_"].input_type == input_type
    assert len(est_unpickled.__dict__["input_any_"].values) == 1

    assert_array_identical(est.get_input(), est_unpickled.get_input())
    assert_array_identical(est.input_any_, est_unpickled.input_any_)

    # Loop one more time with the pickled one to make sure it works right
    for out_type in test_output_types_str:
        # Check the restored estimator here; the original one was already
        # covered by the first loop
        with cuml.using_output_type(out_type):
            assert_array_identical(est_unpickled.input_any_,
                                   create_output(X_in, out_type))

        est_unpickled.output_type = out_type

        assert_array_identical(est_unpickled.input_any_,
                               create_output(X_in, out_type))
def test_output_type(input_type: str):
    """Check that ``cuml.make_blobs`` respects the active output type.

    The expected type is looked up in ``test_output_types``; when that
    entry is a tuple, its first element applies to ``X`` and its second to
    ``y``, otherwise the single type applies to both.
    """
    # Set the output type and ensure its respected by the function
    with cuml.using_output_type(input_type):
        X, y = cuml.make_blobs(n_samples=10, centers=3,
                               n_features=2, random_state=0)

    expected = test_output_types[input_type]
    if isinstance(expected, tuple):
        expected_X, expected_y = expected[0], expected[1]
    else:
        expected_X = expected_y = expected

    assert isinstance(X, expected_X)
    assert isinstance(y, expected_y)
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    """Compare cuML and scikit-learn KNN when the indexed rows and the
    query rows are different slices of the same blob dataset.

    The first 100 rows are indexed and rows from 101 onwards are used as
    queries. Distances are compared with a tolerance (a looser,
    count-based check for 'braycurtis').
    """
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    X_index = X[:100]
    X_search = X[101:]

    p = 5  # Testing 5-norm of the minkowski metric only

    # Reference neighbours from scikit-learn, computed on host copies
    knn_sk = skKNN(metric=metric, p=p)
    knn_sk.fit(X_index.get())
    D_sk, I_sk = knn_sk.kneighbors(X_search.get(), k)

    # Keep the array form of the index data for the final comparison
    X_orig = X_index
    use_dataframe = input_type == "dataframe"

    if use_dataframe:
        X_index = cudf.DataFrame(X_index)
        X_search = cudf.DataFrame(X_search)

    knn_cu = cuKNN(metric=metric, p=p)
    knn_cu.fit(X_index)
    D_cuml, I_cuml = knn_cu.kneighbors(X_search, k)

    # The output container must mirror the input container
    if use_dataframe:
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_np = D_cuml.to_numpy()
        I_cuml_np = I_cuml.to_numpy()
    else:
        assert isinstance(D_cuml, cp.ndarray)
        assert isinstance(I_cuml, cp.ndarray)
        D_cuml_np = D_cuml.get()
        I_cuml_np = I_cuml.get()

    with cuml.using_output_type("numpy"):
        # Assert the cuml model was properly reverted
        np.testing.assert_allclose(knn_cu.X_m, X_orig.get(),
                                   atol=1e-3, rtol=1e-3)

    if metric == 'braycurtis':
        diff = D_cuml_np - D_sk
        # Braycurtis has a few differences, but this is computed by FAISS.
        # So long as the indices all match below, the small discrepancy
        # should be okay.
        n_large_diffs = len(diff[diff > 1e-2])
        assert n_large_diffs / X_search.shape[0] < 0.06
    else:
        np.testing.assert_allclose(D_cuml_np, D_sk, atol=1e-3, rtol=1e-3)

    # NOTE(review): this compares two scalar ``.all()`` results rather than
    # the index arrays element-wise, so it is a very weak check -- confirm
    # whether an element-wise comparison was intended.
    assert I_cuml_np.all() == I_sk.all()
def exit_internal_api():
    """Temporarily behave as if no internal API call were in progress.

    This is a generator that yields exactly once; presumably it is wrapped
    by a context-manager decorator outside this view (TODO confirm).

    While suspended at the ``yield``:

    * ``global_output_type_data.root_cm`` is ``None``;
    * the output type is set to the ``prev_output_type`` recorded on the
      previous root context manager.

    On exit, normal or via an exception, the previous ``root_cm`` is put
    back.
    """
    assert (global_output_type_data.root_cm is not None)

    # Capture the current root before entering ``try`` so the ``finally``
    # clause can never hit an unbound local and mask the original error
    old_root_cm = global_output_type_data.root_cm

    try:
        global_output_type_data.root_cm = None

        # Set the global output type to the previous value to pretend we
        # never entered the API
        with cuml.using_output_type(old_root_cm.prev_output_type):
            yield
    finally:
        global_output_type_data.root_cm = old_root_cm
def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols):
    """Fit cuML and scikit-learn SVC on a dataset and compare them.

    The large linear-kernel case on the 'gaussian' / 'classification2'
    datasets returns immediately without running.
    """
    is_slow_linear_case = (
        params['kernel'] == 'linear'
        and dataset in ['gaussian', 'classification2']
        and n_rows > 1000
        and n_cols >= 1000
    )
    if is_slow_linear_case:
        # linear kernel will not fit the gaussian dataset, but takes very
        # long
        return

    splits = make_dataset(dataset, n_rows, n_cols)
    X_train, X_test, y_train, y_test = splits

    # Default to numpy for testing
    with cuml.using_output_type("numpy"):
        cu_model = cu_svm.SVC(**params)
        cu_model.fit(X_train, y_train)

        sk_model = svm.SVC(**params)
        sk_model.fit(X_train, y_train)

        compare_svm(cu_model, sk_model, X_test, y_test,
                    coef_tol=1e-5, report_summary=True)
def test_auto_predict(input_type, base_output_type, global_output_type):
    """
    Test autowrapping on predict that will set target_type

    Three cases are checked: a default estimator (output follows the
    input), an estimator built with ``base_output_type``, and that same
    estimator under a ``using_output_type`` context.
    """
    X_in = create_input(input_type, np.float32, (10, 10), "F")

    # Default estimator: with no global output type the prediction should
    # come back in the input type
    default_est = DummyTestEstimator()
    X_out = default_est.predict(X_in)

    assert determine_array_type(X_out) == input_type
    assert_array_identical(X_in, X_out)

    # Estimator with an explicit output type: with no global output type
    # the prediction should come back in base_output_type
    est = DummyTestEstimator(output_type=base_output_type)
    X_out = est.predict(X_in)

    assert determine_array_type(X_out) == base_output_type
    assert_array_identical(X_in, X_out)

    # Test with global_output_type, should return global_output_type
    with cuml.using_output_type(global_output_type):
        X_out = est.predict(X_in)

        # Resolve the expected type: global -> estimator -> input
        target_output_type = global_output_type
        if target_output_type in (None, "input"):
            target_output_type = base_output_type
        if target_output_type == "input":
            target_output_type = input_type

        assert determine_array_type(X_out) == target_output_type
        assert_array_identical(X_in, X_out)
def test_knn_graph(input_type, mode, output_type, as_instance, nrows,
                   n_feats, p, k, metric):
    """Compare the k-neighbors graph from cuML with scikit-learn's.

    ``as_instance`` selects which API is used on both sides: truthy uses
    the module-level ``kneighbors_graph`` function, falsy uses the method
    of a fitted KNN object. Only the shapes of the sparse components and
    the sparse container type are compared.
    """
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)
    X_host = X.get()

    # Reference graph from scikit-learn
    if as_instance:
        sparse_sk = sklearn.neighbors.kneighbors_graph(
            X_host, k, mode=mode, metric=metric, p=p, include_self='auto')
    else:
        knn_sk = skKNN(metric=metric, p=p)
        knn_sk.fit(X_host)
        sparse_sk = knn_sk.kneighbors_graph(X.get(), k, mode=mode)

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    # Graph from cuML, built under the requested output type
    with cuml.using_output_type(output_type):
        if as_instance:
            sparse_cu = cuml.neighbors.kneighbors_graph(
                X, k, mode=mode, metric=metric, p=p, include_self='auto')
        else:
            knn_cu = cuKNN(metric=metric, p=p)
            knn_cu.fit(X)
            sparse_cu = knn_cu.kneighbors_graph(X, k, mode=mode)

    assert np.array_equal(sparse_sk.data.shape, sparse_cu.data.shape)
    assert np.array_equal(sparse_sk.indices.shape, sparse_cu.indices.shape)
    assert np.array_equal(sparse_sk.indptr.shape, sparse_cu.indptr.shape)
    assert np.array_equal(sparse_sk.toarray().shape,
                          sparse_cu.toarray().shape)

    # 'cupy' or no output type is checked with the cupyx CSR predicate,
    # everything else with ``isspmatrix_csr``
    if output_type is None or output_type == 'cupy':
        assert cupyx.scipy.sparse.isspmatrix_csr(sparse_cu)
    else:
        assert isspmatrix_csr(sparse_cu)
def transform(self, X) -> SparseCumlArray:
    """Generate missing values indicator for X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data to complete.

    Returns
    -------
    Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
    or (n_samples, n_features_with_missing)
        The missing indicator for input data. The data type of ``Xt``
        will be boolean.

    Raises
    ------
    ValueError
        If ``X`` has a different number of columns than seen during
        fitting, or if ``error_on_new`` is set and features have missing
        values here that had none during fitting.
    """
    check_is_fitted(self)
    X = self._validate_input(X, in_fit=False)

    n_features_in = X.shape[1]
    if n_features_in != self._n_features:
        raise ValueError("X has a different number of features "
                         "than during fitting.")

    imputer_mask, features = self._get_missing_features_info(X)

    if self.features != "missing-only":
        return imputer_mask

    with cuml.using_output_type("numpy"):
        # NOTE(review): ``np`` must provide ``asnumpy`` here, so it is
        # presumably bound to cupy in this module -- confirm against the
        # file's imports.
        np_features = np.asnumpy(features)
        features_diff_fit_trans = numpy.setdiff1d(np_features,
                                                  self.features_)

        has_new_features = features_diff_fit_trans.size > 0
        if self.error_on_new and has_new_features:
            raise ValueError("The features {} have missing values "
                             "in transform but have no missing values "
                             "in fit.".format(features_diff_fit_trans))

        # Keep only the columns recorded in ``features_`` when that is a
        # strict subset of all features
        if self.features_.size < self._n_features:
            imputer_mask = imputer_mask[:, self.features_]

    return imputer_mask
def _fit_transform_one(transformer, X, y, weight, message_clsname='',
                       message=None, **fit_params):
    """
    Fits ``transformer`` to ``X`` and ``y``. The transformed result is
    returned with the fitted transformer. If ``weight`` is not ``None``,
    the result will be multiplied by ``weight``.

    ``transformer.accept_sparse`` is set to ``True`` before fitting, and
    ``fit_transform`` is preferred over ``fit(...).transform(...)`` when
    the transformer provides it.
    """
    with _print_elapsed_time(message_clsname, message), \
            cuml.using_output_type("cupy"):
        transformer.accept_sparse = True
        if not hasattr(transformer, 'fit_transform'):
            res = transformer.fit(X, y, **fit_params).transform(X)
        else:
            res = transformer.fit_transform(X, y, **fit_params)

    if weight is not None:
        return res * weight, transformer
    return res, transformer
def _iter(self, fitted=False, replace_strings=False): """ Generate (name, trans, column, weight) tuples. If fitted=True, use the fitted transformers, else use the user specified transformers updated with converted column names and potentially appended with transformer for remainder. """ if fitted: transformers = self.transformers_ else: # interleave the validated column specifiers transformers = [ (name, trans, column) for (name, trans, _), column in zip(self.transformers, self._columns) ] # add transformer tuple for remainder if self._remainder[2] is not None: transformers = chain(transformers, [self._remainder]) get_weight = (self.transformer_weights or {}).get for name, trans, column in transformers: if replace_strings: # replace 'passthrough' with identity transformer and # skip in case of 'drop' if trans == 'passthrough': with cuml.using_output_type("cupy"): trans = FunctionTransformer(accept_sparse=True, check_inverse=False) elif trans == 'drop': continue elif _is_empty_column_selection(column): continue yield (name, trans, column, get_weight(name))
def test_rf_classification_sparse(small_clf, datatype,
                                  fil_sparse_format, algo):
    """Exercise GPU (FIL) prediction of the cuML random forest classifier
    for different ``fil_sparse_format`` / ``algo`` combinations.

    Combinations where ``fil_sparse_format`` is falsy or 'not_supported',
    or where ``algo`` is 'tree_reorg' / 'batch_tree_reorg', must raise
    ``ValueError``. For the others the accuracy is compared with a
    converted FIL model, the treelite conversion is checked, and (for
    fewer than 500000 rows) the accuracy is compared with scikit-learn.
    """
    use_handle = True
    n_trees = 50

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Create a handle for the cuml model
    handle, _stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(n_bins=16, split_criterion=0,
                       min_samples_leaf=2, random_state=123, n_streams=1,
                       n_estimators=n_trees, handle=handle, max_leaves=-1,
                       max_depth=40)
    cuml_model.fit(X_train, y_train)

    fil_kwargs = dict(predict_model="GPU",
                      output_class=True,
                      threshold=0.5,
                      fil_sparse_format=fil_sparse_format,
                      algo=algo)

    expect_error = (not fil_sparse_format
                    or algo in ('tree_reorg', 'batch_tree_reorg')
                    or fil_sparse_format == 'not_supported')

    if expect_error:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, **fil_kwargs)
        return

    fil_preds = cuml_model.predict(X_test, **fil_kwargs)
    fil_preds = np.reshape(fil_preds, np.shape(y_test))
    fil_acc = accuracy_score(y_test, fil_preds)

    # The stand-alone FIL model must score exactly the same
    fil_model = cuml_model.convert_to_fil_model()
    with cuml.using_output_type("numpy"):
        fil_model_preds = fil_model.predict(X_test)
        fil_model_acc = accuracy_score(y_test, fil_model_preds)
        assert fil_acc == fil_model_acc

    tl_model = cuml_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert X.shape[1] == tl_model.num_features

    if X.shape[0] < 500000:
        sk_model = skrfc(n_estimators=50, max_depth=40,
                         min_samples_split=2, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        assert fil_acc >= (sk_acc - 0.07)