def test_time_series_label_output_type(output_str, output_types):
    """Check that ``make_arima`` output follows the active output type.

    Parameters
    ----------
    output_str
        Value handed to ``cuml.using_output_type``.
    output_types
        Indexable collection of expected types; entry ``1`` is the type
        the generated data is checked against.
    """
    # Generate the data while the requested output type is in effect
    with cuml.using_output_type(output_str):
        generated = make_arima(n_obs=10, random_state=0)
        series = generated[0]

    expected_type = output_types[1]
    assert isinstance(series, expected_type)
示例#2
0
def test_output_type_context_mgr(global_output_type, context_type):
    """Check that ``using_output_type`` overrides the global type only
    while the context is active.

    A DBSCAN model is fitted once inside the context manager and once
    after it; the type of ``labels_`` is checked in both places.
    """
    dataset = get_small_dataset('numba')

    # Choose the global type: 'cupy', unless the parametrized global type
    # is already 'cupy', in which case 'numpy' is used.
    if global_output_type != 'cupy':
        test_type = 'cupy'
    else:
        test_type = 'numpy'
    cuml.set_global_output_type(test_type)

    # Inside the context manager the context type is expected
    with cuml.using_output_type(context_type):
        inner_model = cuml.DBSCAN(eps=1.0, min_samples=1)
        inner_model.fit(dataset)
        inner_labels = inner_model.labels_

        if context_type != 'numba':
            assert isinstance(inner_labels, test_output_types[context_type])
        else:
            # 'numba' is validated with is_cuda_array instead of isinstance
            assert is_cuda_array(inner_labels)

    # Outside the context manager the global type is expected again
    outer_model = cuml.DBSCAN(eps=1.0, min_samples=1)
    outer_model.fit(dataset)
    outer_labels = outer_model.labels_

    assert isinstance(outer_labels, test_output_types[test_type])
示例#3
0
def generate_classification_data(classes=2, rows=1000, cols=32, cat_cols=0):
    """Build a classification training set with optional categorical columns.

    Parameters
    ----------
    classes : number of classes passed as ``n_classes``.
    rows : number of samples passed as ``n_samples``.
    cols : number of features; ``cols // 3`` of them are informative.
    cat_cols : how many feature columns to turn into categoricals. At most
        ``cols`` columns are converted.

    Returns
    -------
    (data, labels) as produced by ``cuml.datasets.make_classification``.
    When ``cat_cols > 0`` the data is generated with the 'cudf' output
    type and both results are converted with ``to_pandas()``; otherwise
    the 'numpy' output type is used and the results are returned as-is.
    """
    wants_categorical = cat_cols > 0
    output_type = 'cudf' if wants_categorical else 'numpy'

    with cuml.using_output_type(output_type):
        data, labels = cuml.datasets.make_classification(
            n_samples=rows,
            n_features=cols,
            n_informative=cols // 3,
            n_classes=classes,
            random_state=0
        )

    if not wants_categorical:
        return data, labels

    # Pick the columns to convert; no seed is passed to sample() here
    chosen = data.sample(n=min(cat_cols, cols), axis='columns')

    # Compute both masks before the dtype changes to 'object'
    is_negative = (chosen < 0)
    is_non_negative = (chosen >= 0)

    chosen = chosen.astype('object')
    chosen[is_negative] = 'negative'
    chosen[is_non_negative] = 'positive'

    # Write the converted columns back as categoricals
    data[chosen.columns] = chosen.astype('category')

    data = data.to_pandas()
    labels = labels.to_pandas()
    return data, labels
def test_predict_non_gaussian(n_samples, n_features, n_neighbors, n_query):
    """Compare cuML and scikit-learn KNN predictions on uniform random data.

    Features are drawn uniformly from [0, 1) and labels are random integers
    from 0 to 4. The cuML prediction, requested as numpy, must be equal to
    the scikit-learn brute-force prediction.
    """
    np.random.seed(123)

    # Host-side data; the draw order is kept so the seeded values match
    train_features = pd.DataFrame(
        np.random.uniform(0, 1, (n_samples, n_features)))
    train_labels = pd.DataFrame(np.random.randint(0, 5, (n_samples, 1)))
    query_features = pd.DataFrame(
        np.random.uniform(0, 1, (n_query, n_features)))

    # Device-side copies of the same frames
    train_features_gpu = cudf.DataFrame.from_pandas(train_features)
    train_labels_gpu = cudf.DataFrame.from_pandas(train_labels)
    query_features_gpu = cudf.DataFrame.from_pandas(query_features)

    # Reference result from scikit-learn
    reference_model = skKNN(algorithm="brute",
                            n_neighbors=n_neighbors,
                            n_jobs=1)
    reference_model.fit(train_features, train_labels)
    expected = reference_model.predict(query_features)

    gpu_model = cuKNN(n_neighbors=n_neighbors)
    gpu_model.fit(train_features_gpu, train_labels_gpu)

    with cuml.using_output_type("numpy"):
        actual = gpu_model.predict(query_features_gpu)

        assert np.array_equal(actual, expected)
示例#5
0
def test_svm_skl_cmp_multiclass(params,
                                dataset='classification2',
                                n_rows=100,
                                n_cols=6):
    """Fit cuML and scikit-learn SVC on a 3-class dataset and compare them.

    Both classifiers are built from the same ``params`` and trained on the
    same split; the comparison itself is delegated to ``compare_svm``.
    """
    splits = make_dataset(dataset,
                          n_rows,
                          n_cols,
                          n_classes=3,
                          n_informative=6)
    X_train, X_test, y_train, y_test = splits

    # Run everything with numpy as the cuML output type
    with cuml.using_output_type("numpy"):
        gpu_classifier = cu_svm.SVC(**params)
        gpu_classifier.fit(X_train, y_train)

        reference_classifier = svm.SVC(**params)
        reference_classifier.fit(X_train, y_train)

        compare_svm(gpu_classifier,
                    reference_classifier,
                    X_test,
                    y_test,
                    coef_tol=1e-5,
                    report_summary=True)
示例#6
0
def test_pairwise_distances_output_types(input_type, output_type, use_global):
    """Check the container type returned by ``pairwise_distances``.

    Parameters
    ----------
    input_type
        "cudf" and "cupy" convert the random inputs to that container; any
        other value leaves them as the numpy arrays they were created as.
    output_type
        Output type under test. It is always installed through
        ``cuml.using_output_type`` and, unless ``use_global`` is set, also
        passed explicitly as the ``output_type`` argument.
    use_global
        When truthy, ``output_type=None`` is passed to the function so that
        only the context manager determines the result type.
    """
    rng = np.random.RandomState(5)

    X = rng.random_sample((100, 100))
    Y = rng.random_sample((100, 100))

    if input_type == "cudf":
        X = cudf.DataFrame(X)
        Y = cudf.DataFrame(Y)
    elif input_type == "cupy":
        X = cp.asarray(X)
        Y = cp.asarray(Y)

    # Leave the explicit argument unset when relying on the global setting
    output_type_param = None if use_global else output_type

    with cuml.using_output_type(output_type):

        S = pairwise_distances(X, Y, metric="euclidean",
                               output_type=output_type_param)

        if output_type == "input":
            assert isinstance(S, type(X))
        elif output_type == "cudf":
            assert isinstance(S, cudf.DataFrame)
        elif output_type == "numpy":
            assert isinstance(S, np.ndarray)
        elif output_type == "cupy":
            # Use the public class name; the ``cp.core.core`` module path is
            # private and is not available in newer CuPy releases.
            assert isinstance(S, cp.ndarray)
示例#7
0
def test_dec_input_output(input_type, input_dtype, input_shape, output_type):
    """Check how ``input_any_`` is converted by estimator and global
    output types.

    Two estimators are exercised: one created with ``output_type="input"``
    and one with ``output_type=output_type``. For each, the attribute is
    read with and without a ``cuml.using_output_type`` override, and both
    its array type and its contents are verified.
    """
    involves_cudf = (input_type == "cudf" or output_type == "cudf")
    if involves_cudf and input_dtype in unsupported_cudf_dtypes:
        pytest.skip("Unsupported cudf combination")

    X_in = create_input(input_type, input_dtype, input_shape, "C")
    X_out = create_output(X_in, output_type)

    # --- Estimator that mirrors its input type ---
    mirror_est = DummyTestEstimator(output_type="input")
    mirror_est.store_input(X_in)

    # The very same object must be held internally
    assert X_in is mirror_est.get_input()
    assert mirror_est.__dict__["input_any_"].input_type == input_type

    # Without an override, the attribute comes back as the input type
    assert determine_array_type(mirror_est.input_any_) == input_type
    assert_array_identical(mirror_est.input_any_, X_in)

    # With an override, the attribute comes back as the override type
    with cuml.using_output_type(output_type):
        assert determine_array_type(mirror_est.input_any_) == output_type
        assert_array_identical(mirror_est.input_any_, X_out)

    # --- Estimator with a fixed output type ---
    fixed_est = DummyTestEstimator(output_type=output_type)
    fixed_est.store_input(X_in)

    # Without an override, the estimator's own output type applies
    assert determine_array_type(fixed_est.input_any_) == output_type
    assert_array_identical(fixed_est.input_any_, X_out)

    # Overriding with "input" brings back the original input type
    with cuml.using_output_type("input"):
        assert determine_array_type(fixed_est.input_any_) == input_type
        assert_array_identical(fixed_est.input_any_, X_in)
示例#8
0
 def check_correct_type(index):
     """Return whether the global output type matches the one set here.

     The output type at position ``index`` of ``test_output_types_str`` is
     activated with ``using_output_type``; after a 0.5 s pause inside the
     context, ``cuml.global_settings.output_type`` is compared against it.
     """
     expected = test_output_types_str[index]
     # Index 0 starts 0.1 s late so its context is entered after the other
     # callers (described in the original as forcing a race condition)
     if index == 0:
         sleep(0.1)
     with using_output_type(expected):
         sleep(0.5)
         observed = cuml.global_settings.output_type
         return observed == expected
def test_xy_output_type(generator, output_str, output_types):
    """Check that every item returned by ``generator`` has the expected type.

    Parameters
    ----------
    generator
        Callable invoked with ``n_samples=10, random_state=0``; its result
        is iterated item by item.
    output_str
        Value handed to ``cuml.using_output_type``.
    output_types
        Expected types, paired positionally with the generated items.
    """
    # Generate the data while the requested output type is in effect
    with cuml.using_output_type(output_str):
        generated = generator(n_samples=10, random_state=0)

    # zip() stops at the shorter of the two sequences
    for item, expected_type in zip(generated, output_types):
        assert isinstance(item, expected_type)
示例#10
0
def generate_regression_data(rows=1000, cols=32):
    """Build a regression data set with the 'numpy' output type active.

    Parameters
    ----------
    rows : number of samples passed as ``n_samples``.
    cols : number of features; ``cols // 3`` of them are informative.

    Returns
    -------
    The ``(data, labels)`` pair produced by
    ``cuml.datasets.make_regression`` with ``random_state=0``.
    """
    with cuml.using_output_type('numpy'):
        features, targets = cuml.datasets.make_regression(
            n_samples=rows,
            n_features=cols,
            n_informative=cols // 3,
            random_state=0,
        )
        return features, targets
示例#11
0
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    """Exercise GPU prediction of the random forest regressor for the
    given ``fil_sparse_format`` / ``algo`` combination.

    Combinations listed in the ``expect_error`` condition must raise
    ``ValueError`` from ``predict``. For the remaining ones the test checks
    that:

    * the model converted with ``convert_to_fil_model`` yields the same
      r2 score as the direct prediction,
    * the treelite model reports the expected number of trees and features,
    * for inputs with fewer than 1000 rows, the r2 score is no more than
      0.07 below that of a scikit-learn random forest.
    """
    n_trees = 50

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Handle for the cuML model; the returned stream is not used here
    handle, _stream = get_handle(True, n_streams=1)

    # Train cuML's random forest regressor
    cuml_model = curfr(n_bins=16,
                       split_criterion=2,
                       min_samples_leaf=2,
                       random_state=123,
                       n_streams=1,
                       n_estimators=n_trees,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=40,
                       accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)

    predict_kwargs = dict(predict_model="GPU",
                          fil_sparse_format=fil_sparse_format,
                          algo=algo)

    expect_error = (not fil_sparse_format
                    or algo in ('tree_reorg', 'batch_tree_reorg')
                    or fil_sparse_format == 'not_supported')
    if expect_error:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, **predict_kwargs)
        return

    # Predict through the model itself
    fil_preds = cuml_model.predict(X_test, **predict_kwargs)
    fil_preds = np.reshape(fil_preds, np.shape(y_test))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # Predict through the converted model and expect the same score
    fil_model = cuml_model.convert_to_fil_model()
    with cuml.using_output_type("numpy"):
        converted_preds = fil_model.predict(X_test)
        converted_preds = np.reshape(converted_preds, np.shape(y_test))
        converted_r2 = r2_score(y_test, converted_preds,
                                convert_dtype=datatype)
        assert fil_r2 == converted_r2

    tl_model = cuml_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert X.shape[1] == tl_model.num_features

    # The scikit-learn comparison only runs for small inputs
    if X.shape[0] < 1000:  # mode != "stress":
        sk_model = skrfr(n_estimators=50,
                         max_depth=40,
                         min_samples_split=2,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
示例#12
0
 def _fit_indicator(self, X):
     """Fit a MissingIndicator."""
     if self.add_indicator:
         with cuml.using_output_type("cupy"):
             self.indicator_ = MissingIndicator(
                 missing_values=self.missing_values, error_on_new=False)
             self.indicator_.fit(X)
     else:
         self.indicator_ = None
示例#13
0
def _func_predict_proba_partial(model, input_data, **kwargs):
    """
    Run ``predict_proba`` of a partial model over the whole dataset.

    The parts in ``input_data`` are concatenated and passed to ``model``
    (which holds only the locally available trees), so the dataset is
    transferred rather than the model. This is worthwhile when the model
    is larger than the dataset.

    Returns the prediction with an extra axis inserted at position 1.
    """
    full_input = concatenate(input_data)
    with using_output_type('cupy'):
        probabilities = model.predict_proba(full_input, **kwargs)
        expanded = cp.expand_dims(probabilities, axis=1)
    return expanded
示例#14
0
 def _check_inverse_transform(self, X):
     """Warn when ``inverse_transform(transform(X))`` does not match ``X``.

     Only a subsample of rows is checked: starting at row 0, one row every
     ``max(1, X.shape[0] // 100)`` rows. A ``UserWarning`` is issued when
     ``_allclose_dense_sparse`` reports a mismatch; nothing is returned.
     """
     n_rows = X.shape[0]
     stride = max(1, n_rows // 100)
     # Same indices as i * stride for i in range(n_rows // stride)
     sampled = list(range(0, (n_rows // stride) * stride, stride))

     with cuml.using_output_type("cupy"):
         round_trip = self.inverse_transform(self.transform(X[sampled]))
         matches = _allclose_dense_sparse(X[sampled], round_trip)
         if not matches:
             warnings.warn("The provided functions are not strictly"
                           " inverse of each other. If you are sure you"
                           " want to proceed regardless, set"
                           " 'check_inverse=False'.", UserWarning)
示例#15
0
def test_make_arima(dtype, output_type, batch_size, n_obs, random_state,
                    order):
    """Check the shape of the data returned by ``cuml.make_arima``.

    ``order`` must unpack into eight values ``(p, d, q, P, D, Q, s, k)``.
    The result is expected to have shape ``(n_obs, batch_size)``.
    """
    p, d, q, P, D, Q, s, k = order
    non_seasonal = (p, d, q)
    seasonal = (P, D, Q, s)

    with cuml.using_output_type(output_type):
        out = cuml.make_arima(batch_size,
                              n_obs,
                              non_seasonal,
                              seasonal,
                              k,
                              random_state=random_state,
                              dtype=dtype)

    expected_shape = (n_obs, batch_size)
    assert out.shape == expected_shape, "out shape mismatch"
示例#16
0
 def check_correct_type(index):
     """Return whether the global output type matches the one set here.

     Even indices enter ``_using_mirror_output_type`` and expect 'mirror';
     odd indices enter ``using_output_type`` with the entry at ``index`` of
     ``test_output_types_str`` and expect that value. Each waits 0.5 s
     inside the context before reading
     ``cuml.global_settings.output_type``.
     """
     # Index 0 starts 0.1 s late so its context is entered after the other
     # callers (described in the original as forcing a race condition)
     if index == 0:
         sleep(0.1)

     if index % 2 != 0:
         expected = test_output_types_str[index]
         with using_output_type(expected):
             sleep(0.5)
             return cuml.global_settings.output_type == expected

     with _using_mirror_output_type():
         sleep(0.5)
         return cuml.global_settings.output_type == 'mirror'
示例#17
0
def test_pickle(input_type):
    """Check that an estimator's stored input survives a pickle round trip.

    The estimator's ``input_any_`` attribute is read under every output
    type before pickling. After unpickling, the test asserts that only one
    value is stored, that it has the original input type, and that the
    unpickled estimator still converts to every output type, both through
    the global context manager and through its own ``output_type``.
    """
    if (input_type == "numba"):
        pytest.skip("numba arrays cant be picked at this time")

    est = DummyTestEstimator()

    X_in = create_input(input_type, np.float32, (10, 5), "C")

    est.store_input(X_in)

    # Read the attribute under every output type so that each converted
    # form has been produced before pickling
    for out_type in test_output_types_str:
        with cuml.using_output_type(out_type):
            assert_array_identical(est.input_any_,
                                   create_output(X_in, out_type))

    est_pickled_bytes = pickle.dumps(est)
    est_unpickled: DummyTestEstimator = pickle.loads(est_pickled_bytes)

    # Assert that only the original input is restored
    assert est_unpickled.__dict__["input_any_"].input_type == input_type
    assert len(est_unpickled.__dict__["input_any_"].values) == 1

    assert_array_identical(est.get_input(), est_unpickled.get_input())
    assert_array_identical(est.input_any_, est_unpickled.input_any_)

    # Loop once more to make sure the pickled estimator converts correctly
    for out_type in test_output_types_str:
        with cuml.using_output_type(out_type):
            assert_array_identical(est.input_any_,
                                   create_output(X_in, out_type))

            # Previously only the original estimator was checked here, so
            # the unpickled one was never exercised under a global type
            assert_array_identical(est_unpickled.input_any_,
                                   create_output(X_in, out_type))

        # Also drive the conversion through the estimator's own setting
        est_unpickled.output_type = out_type

        assert_array_identical(est_unpickled.input_any_,
                               create_output(X_in, out_type))
示例#18
0
def test_output_type(input_type: str):
    """Check that ``cuml.make_blobs`` respects the active output type.

    The expected type is looked up in ``test_output_types``. A tuple entry
    gives separate types for ``X`` (element 0) and ``y`` (element 1); any
    other entry is applied to both.
    """
    with cuml.using_output_type(input_type):
        X, y = cuml.make_blobs(n_samples=10,
                               centers=3,
                               n_features=2,
                               random_state=0)

        expected = test_output_types[input_type]

        if not isinstance(expected, tuple):
            assert isinstance(X, expected)
            assert isinstance(y, expected)
        else:
            assert isinstance(X, expected[0])
            assert isinstance(y, expected[1])
示例#19
0
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    """Compare cuML and scikit-learn KNN when the indexed rows and the
    query rows are different slices of the same blobs.

    The test checks the container type of the cuML results, that the
    model's ``X_m`` still matches the indexed rows, and that distances
    agree with scikit-learn (with a looser rule for 'braycurtis').
    """
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    # NOTE(review): row 100 ends up in neither slice - confirm intended
    index_rows = X[:100]
    query_rows = X[101:]

    p = 5  # Only the 5-norm of the minkowski metric is tested

    # Reference result from scikit-learn on host copies
    knn_sk = skKNN(metric=metric, p=p)
    knn_sk.fit(index_rows.get())
    D_sk, I_sk = knn_sk.kneighbors(query_rows.get(), k)

    use_dataframe = input_type == "dataframe"
    if use_dataframe:
        index_input = cudf.DataFrame(index_rows)
        query_input = cudf.DataFrame(query_rows)
    else:
        index_input = index_rows
        query_input = query_rows

    knn_cu = cuKNN(metric=metric, p=p)
    knn_cu.fit(index_input)
    D_cuml, I_cuml = knn_cu.kneighbors(query_input, k)

    # Results must come back in the same kind of container as the input
    if use_dataframe:
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_np = D_cuml.to_numpy()
        I_cuml_np = I_cuml.to_numpy()
    else:
        assert isinstance(D_cuml, cp.ndarray)
        assert isinstance(I_cuml, cp.ndarray)
        D_cuml_np = D_cuml.get()
        I_cuml_np = I_cuml.get()

    with cuml.using_output_type("numpy"):
        # The data held by the model must still match the indexed rows
        np.testing.assert_allclose(knn_cu.X_m,
                                   index_rows.get(),
                                   atol=1e-3,
                                   rtol=1e-3)

    if metric != 'braycurtis':
        np.testing.assert_allclose(D_cuml_np, D_sk, atol=1e-3, rtol=1e-3)
    else:
        # Braycurtis is computed by FAISS and shows a few differences, so
        # only the share of large positive deviations is bounded here.
        diff = D_cuml_np - D_sk
        assert len(diff[diff > 1e-2]) / query_input.shape[0] < 0.06

    # NOTE(review): this compares two booleans (whether every index is
    # non-zero), not the index arrays element by element - confirm whether
    # an element-wise comparison was intended.
    assert I_cuml_np.all() == I_sk.all()
示例#20
0
def exit_internal_api():
    """Temporarily behave as if no internal API call were in progress.

    Generator with a single ``yield``; presumably wrapped as a context
    manager by a decorator outside this view - TODO confirm.

    While suspended at the ``yield``, ``global_output_type_data.root_cm``
    is ``None`` and the output type is the ``prev_output_type`` recorded
    on the saved root context manager. On exit the saved root context
    manager is put back, even if the body raised.
    """
    assert global_output_type_data.root_cm is not None

    try:
        saved_root_cm = global_output_type_data.root_cm

        global_output_type_data.root_cm = None

        # Restore the output type that was active before the API was
        # entered, so the wrapped code sees the caller's setting
        previous_type = saved_root_cm.prev_output_type
        with cuml.using_output_type(previous_type):
            yield

    finally:
        global_output_type_data.root_cm = saved_root_cm
示例#21
0
def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols):
    """Fit cuML and scikit-learn SVC on the given dataset and compare them.

    The linear kernel is skipped on the large 'gaussian' and
    'classification2' configurations (more than 1000 rows and at least
    1000 columns). The comparison is delegated to ``compare_svm``.
    """
    # The linear kernel will not fit these datasets and takes very long,
    # so the test returns early without checking anything.
    too_slow = (params['kernel'] == 'linear'
                and dataset in ['gaussian', 'classification2']
                and n_rows > 1000
                and n_cols >= 1000)
    if too_slow:
        return

    splits = make_dataset(dataset, n_rows, n_cols)
    X_train, X_test, y_train, y_test = splits

    # Run everything with numpy as the cuML output type
    with cuml.using_output_type("numpy"):
        gpu_classifier = cu_svm.SVC(**params)
        gpu_classifier.fit(X_train, y_train)

        reference_classifier = svm.SVC(**params)
        reference_classifier.fit(X_train, y_train)

        compare_svm(gpu_classifier,
                    reference_classifier,
                    X_test,
                    y_test,
                    coef_tol=1e-5,
                    report_summary=True)
示例#22
0
def test_auto_predict(input_type, base_output_type, global_output_type):
    """
    Check which array type ``predict`` returns under each precedence level.

    Three cases are covered: an estimator with default settings, an
    estimator created with ``output_type=base_output_type``, and that same
    estimator called inside ``cuml.using_output_type(global_output_type)``.
    In every case the returned data must be identical to the input.
    """
    X_in = create_input(input_type, np.float32, (10, 10), "F")

    # Case 1: default estimator, no global override -> input type
    default_est = DummyTestEstimator()
    default_result = default_est.predict(X_in)

    assert determine_array_type(default_result) == input_type
    assert_array_identical(X_in, default_result)

    # Case 2: estimator-level output type, no global override
    typed_est = DummyTestEstimator(output_type=base_output_type)
    typed_result = typed_est.predict(X_in)

    assert determine_array_type(typed_result) == base_output_type
    assert_array_identical(X_in, typed_result)

    # Case 3: a global override takes precedence over the estimator
    with cuml.using_output_type(global_output_type):
        global_result = typed_est.predict(X_in)

        # Resolve the expected type: fall back from the global setting to
        # the estimator setting, and from there to the input type
        expected_type = global_output_type
        if expected_type is None or expected_type == "input":
            expected_type = base_output_type
        if expected_type == "input":
            expected_type = input_type

        assert determine_array_type(global_result) == expected_type
        assert_array_identical(X_in, global_result)
示例#23
0
def test_knn_graph(input_type, mode, output_type, as_instance, nrows, n_feats,
                   p, k, metric):
    """Compare the k-neighbors graph from cuML with scikit-learn's.

    Only the shapes of ``data``, ``indices``, ``indptr`` and the dense form
    are compared. The cuML result must also be a CSR matrix: a
    ``cupyx.scipy.sparse`` one when ``output_type`` is 'cupy' or ``None``,
    otherwise one accepted by ``isspmatrix_csr``.

    When ``as_instance`` is truthy the module-level ``kneighbors_graph``
    functions are used; otherwise the graph comes from a fitted estimator.
    """
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    # Reference graph from scikit-learn on a host copy
    if not as_instance:
        knn_sk = skKNN(metric=metric, p=p)
        knn_sk.fit(X.get())
        sparse_sk = knn_sk.kneighbors_graph(X.get(), k, mode=mode)
    else:
        sparse_sk = sklearn.neighbors.kneighbors_graph(X.get(),
                                                       k,
                                                       mode=mode,
                                                       metric=metric,
                                                       p=p,
                                                       include_self='auto')

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    with cuml.using_output_type(output_type):
        if not as_instance:
            knn_cu = cuKNN(metric=metric, p=p)
            knn_cu.fit(X)
            sparse_cu = knn_cu.kneighbors_graph(X, k, mode=mode)
        else:
            sparse_cu = cuml.neighbors.kneighbors_graph(X,
                                                        k,
                                                        mode=mode,
                                                        metric=metric,
                                                        p=p,
                                                        include_self='auto')

    # The sparse components must have matching shapes
    for component in ("data", "indices", "indptr"):
        assert np.array_equal(getattr(sparse_sk, component).shape,
                              getattr(sparse_cu, component).shape)
    assert np.array_equal(sparse_sk.toarray().shape, sparse_cu.toarray().shape)

    expects_cupy_sparse = output_type == 'cupy' or output_type is None
    if expects_cupy_sparse:
        assert cupyx.scipy.sparse.isspmatrix_csr(sparse_cu)
    else:
        assert isspmatrix_csr(sparse_cu)
示例#24
0
    def transform(self, X) -> SparseCumlArray:
        """Generate the missing-values indicator for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray or sparse matrix}
            The missing indicator for the input data, with shape
            (n_samples, n_features) or
            (n_samples, n_features_with_missing). The data type of ``Xt``
            will be boolean.

        Raises
        ------
        ValueError
            If X does not have the number of features seen during fitting,
            or if ``error_on_new`` is set and features that had no missing
            values in fit have missing values here.
        """
        check_is_fitted(self)
        X = self._validate_input(X, in_fit=False)

        if X.shape[1] != self._n_features:
            raise ValueError("X has a different number of features "
                             "than during fitting.")

        imputer_mask, features = self._get_missing_features_info(X)

        # Nothing else to do unless only features with missing values
        # are reported
        if self.features != "missing-only":
            return imputer_mask

        with cuml.using_output_type("numpy"):
            # NOTE(review): ``asnumpy`` is not a NumPy function, so ``np``
            # is presumably an alias for CuPy in this file while ``numpy``
            # is the real NumPy - confirm against the imports.
            host_features = np.asnumpy(features)
            unseen_features = numpy.setdiff1d(host_features, self.features_)
            if (self.error_on_new and unseen_features.size > 0):
                raise ValueError("The features {} have missing values "
                                 "in transform but have no missing values "
                                 "in fit.".format(unseen_features))

        # Keep only the columns recorded at fit time
        if self.features_.size < self._n_features:
            imputer_mask = imputer_mask[:, self.features_]

        return imputer_mask
示例#25
0
def _fit_transform_one(transformer,
                       X,
                       y,
                       weight,
                       message_clsname='',
                       message=None,
                       **fit_params):
    """
    Fit ``transformer`` on ``X`` and ``y`` and transform ``X``.

    ``fit_transform`` is used when the transformer provides it; otherwise
    ``fit`` followed by ``transform``. The work runs with the 'cupy' output
    type active, and ``transformer.accept_sparse`` is set to ``True``
    beforehand.

    Returns a ``(result, transformer)`` pair. When ``weight`` is not
    ``None`` the result is multiplied by it.
    """
    with _print_elapsed_time(message_clsname, message), \
            cuml.using_output_type("cupy"):
        transformer.accept_sparse = True
        if not hasattr(transformer, 'fit_transform'):
            fitted = transformer.fit(X, y, **fit_params)
            res = fitted.transform(X)
        else:
            res = transformer.fit_transform(X, y, **fit_params)

    if weight is not None:
        res = res * weight
    return res, transformer
示例#26
0
    def _iter(self, fitted=False, replace_strings=False):
        """
        Generate (name, trans, column, weight) tuples.

        If fitted=True, use the fitted transformers, else use the
        user specified transformers updated with converted column names
        and potentially appended with transformer for remainder.

        """
        if fitted:
            transformers = self.transformers_
        else:
            # interleave the validated column specifiers
            transformers = [
                (name, trans, column)
                for (name, trans,
                     _), column in zip(self.transformers, self._columns)
            ]
            # add transformer tuple for remainder
            if self._remainder[2] is not None:
                transformers = chain(transformers, [self._remainder])
        get_weight = (self.transformer_weights or {}).get

        for name, trans, column in transformers:
            if replace_strings:
                # replace 'passthrough' with identity transformer and
                # skip in case of 'drop'
                if trans == 'passthrough':
                    with cuml.using_output_type("cupy"):
                        trans = FunctionTransformer(accept_sparse=True,
                                                    check_inverse=False)
                elif trans == 'drop':
                    continue
                elif _is_empty_column_selection(column):
                    continue

            yield (name, trans, column, get_weight(name))
示例#27
0
def test_rf_classification_sparse(small_clf, datatype,
                                  fil_sparse_format, algo):
    """Exercise GPU prediction of the random forest classifier for the
    given ``fil_sparse_format`` / ``algo`` combination.

    Combinations listed in the ``expect_error`` condition must raise
    ``ValueError`` from ``predict``. For the remaining ones the test checks
    that:

    * the model converted with ``convert_to_fil_model`` yields the same
      accuracy as the direct prediction,
    * the treelite model reports the expected number of trees and features,
    * for inputs with fewer than 500000 rows, the accuracy is no more than
      0.07 below that of a scikit-learn random forest.
    """
    n_trees = 50

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Handle for the cuML model; the returned stream is not used here
    handle, _stream = get_handle(True, n_streams=1)

    # Train cuML's random forest classifier
    cuml_model = curfc(n_bins=16,
                       split_criterion=0,
                       min_samples_leaf=2,
                       random_state=123,
                       n_streams=1,
                       n_estimators=n_trees,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=40)
    cuml_model.fit(X_train, y_train)

    predict_kwargs = dict(predict_model="GPU",
                          output_class=True,
                          threshold=0.5,
                          fil_sparse_format=fil_sparse_format,
                          algo=algo)

    expect_error = (not fil_sparse_format
                    or algo in ('tree_reorg', 'batch_tree_reorg')
                    or fil_sparse_format == 'not_supported')
    if expect_error:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, **predict_kwargs)
        return

    # Predict through the model itself
    fil_preds = cuml_model.predict(X_test, **predict_kwargs)
    fil_preds = np.reshape(fil_preds, np.shape(y_test))
    fil_acc = accuracy_score(y_test, fil_preds)

    # Predict through the converted model and expect the same accuracy
    fil_model = cuml_model.convert_to_fil_model()
    with cuml.using_output_type("numpy"):
        converted_preds = fil_model.predict(X_test)
        converted_acc = accuracy_score(y_test, converted_preds)
        assert fil_acc == converted_acc

    tl_model = cuml_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert X.shape[1] == tl_model.num_features

    # The scikit-learn comparison only runs below the size limit
    if X.shape[0] < 500000:
        sk_model = skrfc(n_estimators=50,
                         max_depth=40,
                         min_samples_split=2,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        assert fil_acc >= (sk_acc - 0.07)