Пример #1
0
def test_basic_functions(labels, dtype, sparse_output):
    """Compare cuML LabelBinarizer to scikit-learn's for dense/sparse output."""
    fit_labels, xform_labels = labels

    skl_bin = skLB(sparse_output=sparse_output)
    skl_bin.fit(fit_labels)

    fit_labels = cp.asarray(fit_labels, dtype=dtype)
    xform_labels = cp.asarray(xform_labels, dtype=dtype)

    binarizer = LabelBinarizer(sparse_output=sparse_output)
    binarizer.fit(fit_labels)

    assert array_equal(binarizer.classes_.get(),
                       np.unique(fit_labels.get()))

    xformed = binarizer.transform(xform_labels)

    if sparse_output:
        # Skip before doing any scipy-dependent work.
        if has_scipy():
            import scipy.sparse
        else:
            pytest.skip('Skipping test_basic_functions(sparse_output=True) ' +
                        'because Scipy is missing')

        skl_bin_xformed = skl_bin.transform(xform_labels.get())

        skl_csr = scipy.sparse.coo_matrix(skl_bin_xformed).tocsr()
        cuml_csr = xformed

        # BUG FIX: the comparison result was previously discarded; assert it
        # so a mismatch actually fails the test.
        assert array_equal(skl_csr.data, cuml_csr.data.get())

        # #todo: Support sparse inputs
        # xformed = xformed.todense().astype(dtype)

    assert xformed.shape[1] == binarizer.classes_.shape[0]

    original = binarizer.inverse_transform(xformed)

    assert array_equal(original.get(),
                       xform_labels.get())
Пример #2
0
def test_lars_attributes(datatype, params):
    """Check cuML Lars fit attributes against scikit-learn's Lars."""
    X, y = load_boston(return_X_y=True)
    X = X.astype(datatype)
    y = y.astype(datatype)

    culars = cuLars(**params)
    culars.fit(X, y)

    sklars = skLars(**params)
    sklars.fit(X, y)

    assert culars.score(X, y) >= sklars.score(X, y) - 0.01

    # With n_nonzero_coefs the number of LARS steps is pinned; otherwise
    # allow a small difference in iteration counts.
    limit_max_iter = "n_nonzero_coefs" in params
    if limit_max_iter:
        n_iter_tol = 0
    else:
        n_iter_tol = 2

    assert abs(culars.n_iter_ - sklars.n_iter_) <= n_iter_tol

    # BUG FIX: use get() instead of pop() so this test does not mutate the
    # params dict supplied by the fixture (pop removed "fit_intercept" as a
    # side effect, affecting any later use of the same dict).
    tol = 1e-4 if params.get("fit_intercept", True) else 1e-1
    n = min(culars.n_iter_, sklars.n_iter_)
    assert array_equal(culars.alphas_[:n],
                       sklars.alphas_[:n],
                       unit_tol=tol,
                       total_tol=1e-4)
    assert array_equal(culars.active_[:n], sklars.active_[:n])

    if limit_max_iter:
        assert array_equal(culars.coef_, sklars.coef_)

        if hasattr(sklars, 'coef_path_'):
            assert array_equal(culars.coef_path_,
                               sklars.coef_path_[sklars.active_],
                               unit_tol=1e-3)

        intercept_diff = abs(culars.intercept_ - sklars.intercept_)
        if abs(sklars.intercept_) > 1e-6:
            # BUG FIX: normalize by |intercept|; dividing by a negative
            # reference intercept made the relative error negative and the
            # assertion pass trivially.
            intercept_diff /= abs(sklars.intercept_)
            assert intercept_diff <= 1e-3
Пример #3
0
def test_pca_fit(nrows, ncols, n_parts, input_type, cluster):
    """Fit dask cuML PCA on distributed blobs and compare to sklearn PCA."""
    client = Client(cluster)

    try:
        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA

        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=0.5,
                          random_state=10,
                          dtype=np.float32)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())

        # BUG FIX: do not swallow fit errors. The old try/except printed the
        # exception and continued, so a failed fit surfaced later as a
        # confusing AttributeError on the attribute checks.
        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_train)

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X_cpu)

        from cuml.test.utils import array_equal

        all_attr = [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_'
        ]

        for attr in all_attr:
            # The sign of principal components is arbitrary, so ignore it.
            with_sign = attr != 'components_'
            # BUG FIX: removed the broken `type(...) == np.ndarray` ->
            # `.as_matrix()` conversion; numpy arrays have no as_matrix(),
            # so that branch could only raise.
            cuml_res = getattr(cupca, attr)
            skl_res = getattr(skpca, attr)
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
    finally:
        client.close()
Пример #4
0
def test_lasso(input_type, selection):
    """cuML Lasso vs scikit-learn Lasso: compare held-out MSE."""
    n_samples = 20
    n_feats = 5
    dtype = np.float64
    train_rows = np.int32(n_samples * 0.8)
    X, y = make_regression(n_samples=n_samples,
                           n_features=n_feats,
                           n_informative=n_feats,
                           random_state=0)
    X_test = np.array(X[train_rows:, 0:]).astype(dtype)
    y_train = np.array(y[0:train_rows, ]).astype(dtype)
    y_test = np.array(y[train_rows:, ]).astype(dtype)
    X_train = np.array(X[0:train_rows, :]).astype(dtype)

    sklas = Lasso(alpha=np.array([0.01]),
                  fit_intercept=True,
                  normalize=False,
                  max_iter=1000,
                  selection=selection,
                  tol=1e-10)
    sklas.fit(X_train, y_train)
    sk_predict = sklas.predict(X_test)

    cu_lasso = cuLasso(alpha=np.array([0.01]),
                       fit_intercept=True,
                       normalize=False,
                       max_iter=1000,
                       selection=selection,
                       tol=1e-10)

    if input_type == 'dataframe':
        # Repack the numpy splits as pandas, then convert to cudf.
        X_train = pd.DataFrame(
            {'fea%d' % i: X_train[0:, i]
             for i in range(X_train.shape[1])})
        y_train = pd.DataFrame({'fea0': y[0:train_rows, ]})
        X_test = pd.DataFrame(
            {'fea%d' % i: X_test[0:, i]
             for i in range(X_test.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_train)
        y_cudf = y_train.values
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)
        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        cu_lasso.fit(X_cudf, y_cudf)
        cu_predict = cu_lasso.predict(X_cudf_test).to_array()

    else:
        # BUG FIX: fit on the training split. Previously this branch fit on
        # the full X, y, leaking the held-out test rows into training while
        # the sklearn model used only the split.
        cu_lasso.fit(X_train, y_train)
        cu_predict = cu_lasso.predict(X_test).to_array()

    error_sk = mean_squared_error(y_test, sk_predict)
    error_cu = mean_squared_error(y_test, cu_predict)
    assert array_equal(error_sk, error_cu, 1e-2, with_sign=True)
Пример #5
0
def test_nonmonotonic_labels():
    """KNN must round-trip class labels that are not monotonic (15, 5)."""
    features = np.array([[0, 0, 1], [1, 0, 1]], dtype=np.float32)
    targets = np.array([15, 5], dtype=np.int32)

    model = cuKNN(n_neighbors=1)
    model.fit(features, targets)

    predictions = model.predict(features)

    assert array_equal(predictions.astype(np.int32), targets)
Пример #6
0
def test_output_args(small_classifier_and_preds):
    """FIL raw predictions for a loaded model must match XGBoost's output."""
    model_path, model_type, X, xgb_preds = small_classifier_and_preds
    forest = ForestInference.load(model_path,
                                  model_type=model_type,
                                  algo='TREE_REORG',
                                  output_class=False,
                                  threshold=0.50)
    preds = forest.predict(np.asarray(X))
    # Align shapes before comparing against the reference predictions.
    preds = np.reshape(preds, np.shape(xgb_preds))

    assert array_equal(preds, xgb_preds, 1e-3)
Пример #7
0
def test_fil_skl_regression(n_rows, n_columns, n_estimators, max_depth,
                            storage_type, model_class):
    """FIL on sklearn regressors: predictions and MSE must match the source."""
    # BUG FIX: report the unsupported combination as a skip instead of a
    # bare `return`, which silently counted as a pass.
    if max_depth == 20 and storage_type == 'DENSE':
        pytest.skip('max_depth=20 is skipped for DENSE storage')

    # settings
    n_categories = 1
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows,
                         n_columns,
                         n_categories,
                         random_state=random_state,
                         classification=False)
    # identify shape and indices
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestRegressor:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingRegressor
        init_kwargs['init'] = 'zero'

    skl_model = model_class(**init_kwargs)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)

    skl_mse = mean_squared_error(y_validation, skl_preds)

    # Sparse storage requires the NAIVE algorithm.
    algo = 'NAIVE' if storage_type == 'SPARSE' else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=False,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds))

    fil_mse = mean_squared_error(y_validation, fil_preds)

    assert fil_mse == pytest.approx(skl_mse, 1e-4)
    assert array_equal(fil_preds, skl_preds)
Пример #8
0
def test_logistic_regression_predict_proba(dtype, nrows, column_info,
                                           num_classes, fit_intercept,
                                           sparse_input):
    """Transplant cuML coefficients into sklearn and compare probabilities."""
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype,
        nrows=nrows,
        ncols=ncols,
        n_info=n_info,
        num_classes=num_classes)
    X_train = csr_matrix(X_train) if sparse_input else X_train
    X_test = csr_matrix(X_test) if sparse_input else X_test

    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept, output_type="numpy")
    culog.fit(X_train, y_train)

    if num_classes > 2:
        sklog = skLog(fit_intercept=fit_intercept,
                      solver="lbfgs",
                      multi_class="multinomial")
    else:
        sklog = skLog(fit_intercept=fit_intercept)
    # Copy the learned parameters so sklearn evaluates cuML's model.
    sklog.coef_ = culog.coef_.T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_
    else:
        # BUG FIX: was `skLog.intercept_ = 0`, which set a CLASS attribute on
        # the sklearn estimator class (leaking into every other instance in
        # the process) instead of the instance attribute.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_proba = culog.predict_proba(X_test)
    sk_proba = sklog.predict_proba(X_test)

    cu_log_proba = culog.predict_log_proba(X_test)
    sk_log_proba = sklog.predict_log_proba(X_test)

    assert array_equal(cu_proba, sk_proba)
    assert array_equal(cu_log_proba, sk_log_proba)
Пример #9
0
def test_sparse_pca_inputs(nrows, ncols, whiten, return_sparse, cupy_input):
    """PCA fit/transform/inverse_transform on sparse (cupy or host) input.

    NOTE(review): return_sparse=True is skipped unconditionally below, so
    the `if return_sparse:` assertion branch at the bottom is currently
    dead code, kept for when the csr conversion issue is resolved.
    """
    # Shrink (or skip) the widest configuration on GPUs with less memory.
    if ncols == 20000 and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            ncols = int(ncols * pytest.max_gpu_memory / 48)
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    if return_sparse:
        pytest.skip("Loss of information in converting to cupy sparse csr")

    X = cupyx.scipy.sparse.random(nrows,
                                  ncols,
                                  density=0.07,
                                  dtype=cp.float32,
                                  random_state=10)
    # cupy_input=False exercises the host (scipy sparse) input path.
    if not (cupy_input):
        X = X.get()

    p_sparse = cuPCA(n_components=ncols, whiten=whiten)

    p_sparse.fit(X)
    t_sparse = p_sparse.transform(X)
    i_sparse = p_sparse.inverse_transform(t_sparse,
                                          return_sparse=return_sparse)

    if return_sparse:

        assert isinstance(i_sparse, cupyx.scipy.sparse.csr_matrix)

        assert array_equal(i_sparse.todense(),
                           X.todense(),
                           1e-1,
                           with_sign=True)
    else:
        if cupy_input:
            assert isinstance(i_sparse, cp.ndarray)

        assert array_equal(i_sparse, X.todense(), 1e-1, with_sign=True)
Пример #10
0
def test_fil_classification(n_rows, n_columns, num_rounds, n_classes,
                            tmp_path):
    """Train an XGBoost classifier, reload it through FIL, compare outputs."""
    # settings
    classification = True  # change this to false to use regression
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows,
                         n_columns,
                         n_classes,
                         random_state=random_state,
                         classification=classification)
    # identify shape and indices
    n_rows, n_columns = X.shape

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.80, random_state=0)

    model_path = os.path.join(tmp_path, 'xgb_class.model')

    bst = _build_and_save_xgboost(model_path,
                                  X_train,
                                  y_train,
                                  num_rounds=num_rounds,
                                  classification=classification,
                                  n_classes=n_classes)

    dvalidation = xgb.DMatrix(X_validation, label=y_validation)

    if n_classes == 2:
        # Binary: booster emits P(class 1); derive labels and both columns.
        raw_scores = bst.predict(dvalidation)
        xgb_preds_int = np.around(raw_scores)
        xgb_proba = np.stack([1 - raw_scores, raw_scores], axis=1)
    else:
        # Multiclass: booster emits per-class scores directly.
        xgb_proba = bst.predict(dvalidation)
        xgb_preds_int = xgb_proba.argmax(axis=1)
    xgb_acc = accuracy_score(y_validation, xgb_preds_int)

    fm = ForestInference.load(model_path,
                              algo='auto',
                              output_class=True,
                              threshold=0.50)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_proba = np.asarray(fm.predict_proba(X_validation))
    fil_acc = accuracy_score(y_validation, fil_preds)

    assert fil_acc == pytest.approx(xgb_acc, abs=0.01)
    assert array_equal(fil_preds, xgb_preds_int)
    np.testing.assert_allclose(fil_proba,
                               xgb_proba,
                               atol=proba_atol[n_classes > 2])
def test_elastic_net(datatype, X_type, alpha, algorithm, nrows, ncols, n_info):
    """cuML ElasticNet predictions must track scikit-learn's."""
    train_rows = np.int32(nrows * 0.8)
    X, y = make_regression(n_samples=nrows,
                           n_features=ncols,
                           n_informative=n_info,
                           random_state=0)

    X_train = np.asarray(X[0:train_rows, :], dtype=datatype)
    X_test = np.asarray(X[train_rows:, 0:], dtype=datatype)
    y_train = np.asarray(y[0:train_rows, ], dtype=datatype)

    elastic_cu = cuElasticNet(alpha=np.array([alpha]),
                              fit_intercept=True,
                              normalize=False,
                              max_iter=1000,
                              selection=algorithm,
                              tol=1e-10)

    if X_type == 'dataframe':
        # Repack the numpy splits as pandas, then hand cudf copies to cuML.
        y_train = pd.DataFrame({'fea0': y_train[0:, ]})
        X_train = pd.DataFrame(
            {'fea%d' % i: X_train[0:, i]
             for i in range(X_train.shape[1])})
        X_test = pd.DataFrame(
            {'fea%d' % i: X_test[0:, i]
             for i in range(X_test.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_train)
        X_cudf_test = cudf.DataFrame.from_pandas(X_test)
        y_cudf = cudf.Series(y_train.values[:, 0])
        elastic_cu.fit(X_cudf, y_cudf)
        cu_predict = elastic_cu.predict(X_cudf_test)
    elif X_type == 'ndarray':
        elastic_cu.fit(X_train, y_train)
        cu_predict = elastic_cu.predict(X_test)

    # The sklearn reference is only feasible for smaller problem sizes.
    if nrows < 500000:
        elastic_sk = ElasticNet(alpha=np.array([alpha]),
                                fit_intercept=True,
                                normalize=False,
                                max_iter=1000,
                                selection=algorithm,
                                tol=1e-10)
        elastic_sk.fit(X_train, y_train)
        sk_predict = elastic_sk.predict(X_test)
        assert array_equal(sk_predict, cu_predict, 1e-1, with_sign=True)
Пример #12
0
def test_pca_fit(nrows, ncols, n_parts, client=None):
    """Distributed PCA vs sklearn; spins up a local cluster when needed."""
    owns_cluster = False
    if client is None:
        owns_cluster = True
        cluster = LocalCUDACluster(threads_per_worker=1)
        client = Client(cluster)

    try:
        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA

        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows,
                               ncols,
                               1,
                               n_parts,
                               cluster_std=0.5,
                               verbose=False,
                               random_state=10,
                               dtype=np.float32)

        wait(X_cudf)

        X = X_cudf.compute().to_pandas().values

        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_cudf)

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X)

        from cuml.test.utils import array_equal

        all_attr = [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_'
        ]

        for attr in all_attr:
            # Sign of principal components is arbitrary, so ignore it there.
            with_sign = attr != 'components_'
            # BUG FIX: dropped the broken `type(...) == np.ndarray` ->
            # `.as_matrix()` conversion (numpy arrays have no as_matrix()).
            cuml_res = getattr(cupca, attr)
            skl_res = getattr(skpca, attr)
            assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
    finally:
        # BUG FIX: always release the cluster we created, even when the fit
        # or an assertion fails (previously a failure leaked the cluster).
        if owns_cluster:
            client.close()
            cluster.close()
Пример #13
0
def test_basic_functions(labels, cluster):
    """Distributed LabelBinarizer round-trip over dask arrays."""
    client = None

    try:
        client = Client(cluster)

        fit_labels, xform_labels = labels

        fit_gpu = cp.asarray(fit_labels, dtype=np.int32)
        fit_dask = dask.array.from_array(fit_gpu)

        xform_gpu = cp.asarray(xform_labels, dtype=np.int32)
        xform_dask = dask.array.from_array(xform_gpu)

        binarizer = LabelBinarizer(client=client, sparse_output=False)
        binarizer.fit(fit_dask)

        assert array_equal(cp.asnumpy(binarizer.classes_),
                           np.unique(cp.asnumpy(fit_gpu)))

        xformed = binarizer.transform(xform_dask)

        # Pull each block to host memory so the shape/inverse checks below
        # operate on numpy data.
        xformed = xformed.map_blocks(lambda x: x.get(), dtype=cp.float32)
        xformed.compute_chunk_sizes()

        assert xformed.compute().shape[1] == binarizer.classes_.shape[0]

        recovered = binarizer.inverse_transform(xformed).compute()

        assert array_equal(cp.asnumpy(recovered), xform_labels)
    finally:
        if client is not None:
            print("Closing client")
            client.close()
Пример #14
0
def test_linear_models_set_params(algo):
    """set_params(**kwargs) must be equivalent to passing kwargs at init."""
    x = np.linspace(0, 1, 50)
    y = 2 * x

    baseline = algo()
    baseline.fit(x, y)
    coef_before = baseline.coef_

    # Build the same non-default configuration two ways.
    if algo == cuLog:
        params = {'penalty': "none", 'C': 1, 'max_iter': 30}
        tuned = algo(penalty='none', C=1, max_iter=30)
    else:
        params = {'solver': "svd", 'alpha': 0.1}
        tuned = algo(solver='svd', alpha=0.1)
    tuned.fit(x, y)
    coef_after = tuned.coef_

    configured = algo()
    configured.set_params(**params)
    configured.fit(x, y)
    coef_test = configured.coef_

    # The non-default settings must change the fit, and set_params must
    # reproduce the constructor-configured fit exactly.
    assert not array_equal(coef_before, coef_after)
    assert array_equal(coef_after, coef_test)
Пример #15
0
def test_targetencoder_cupy():
    """
    Note that there are newly-encountered values in x_test,
    namely, 3 and 4.
    """
    x_train = cp.array([1, 2, 2, 1])
    y_train = cp.array([1, 0, 1, 1])
    x_test = cp.array([1, 2, 3, 4])
    encoder = TargetEncoder()
    encoder.fit_transform(x_train, y_train)
    test_encoded = encoder.transform(x_test)
    # Unseen categories (3, 4) fall back to the global target mean, 0.75.
    answer = np.array([1., 0.5, 0.75, 0.75])
    assert array_equal(test_encoded, answer)
    # Removed leftover debug print; keep the output-type contract check.
    assert isinstance(test_encoded, cp.ndarray)
Пример #16
0
def test_tsvd_inverse_transform(datatype, input_type):
    """TSVD inverse_transform should approximately reconstruct its input."""
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    cutsvd = cuTSVD(n_components=1)

    if input_type == 'dataframe':
        transformed = cutsvd.fit_transform(gdf)
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)
        transformed = cutsvd.fit_transform(X)

    # Reconstruction from one component is lossy, hence the loose tolerance.
    reconstructed = cutsvd.inverse_transform(transformed)
    assert array_equal(reconstructed, gdf, 0.4, with_sign=True)
Пример #17
0
def test_ridge_regression_model_default(datatype):
    """Default-parameter cuML Ridge must closely match sklearn Ridge."""
    X_train, X_test, y_train, y_test = small_regression_dataset(datatype)

    # cuML model: fit and predict.
    cu_model = cuRidge()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test)

    # scikit-learn reference: fit and predict.
    sk_model = skRidge()
    sk_model.fit(X_train, y_train)
    sk_pred = sk_model.predict(X_test)

    assert array_equal(sk_pred, cu_pred, 1e-1, with_sign=True)
Пример #18
0
def test_lightgbm(tmp_path, num_classes):
    """Round-trip a LightGBM booster through FIL and compare predictions.

    Trains a binary or one-vs-all multiclass model on simulated data, saves
    it to disk, reloads it via ForestInference and checks class predictions;
    for the binary case it also compares predict_proba via the sklearn
    wrapper.
    """
    import lightgbm as lgb
    X, y = simulate_data(500,
                         10 if num_classes == 2 else 50,
                         num_classes,
                         random_state=43210,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)

    if num_classes == 2:
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_class': 1
        }
    else:
        param = {
            'objective': 'ova',  # 'multiclass', would use softmax
            'metric': 'multi_logloss',
            'num_class': num_classes
        }
    num_round = 5
    bst = lgb.train(param, train_data, num_round)
    gbm_preds = bst.predict(X)
    # Multiclass predict returns per-class scores; reduce to class labels.
    if num_classes > 2:
        gbm_preds = gbm_preds.argmax(axis=1)
    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    bst.save_model(model_path)
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=True,
                              model_type="lightgbm")
    fil_preds = fm.predict(X)
    assert array_equal(np.round(gbm_preds), fil_preds)

    if num_classes == 2:
        # Retrain through the sklearn wrapper to get predict_proba output.
        lcls = lgb.LGBMClassifier().set_params(**param)
        lcls.fit(X, y)
        gbm_proba = lcls.predict_proba(X)

        lcls.booster_.save_model(model_path)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        fil_proba = fm.predict_proba(X)
        assert np.allclose(gbm_proba, fil_proba, 1e-2)
Пример #19
0
def test_monotonic_validate_invert_labels(arr_type, dtype, copy):
    """Round-trip make_monotonic / check_labels / invert_labels, covering
    both the copy and in-place semantics for device (cupy) arrays."""
    arr = np.array([0, 15, 10, 50, 20, 50], dtype=dtype)

    # Host-side reference for the final invert_labels round-trip check.
    original = arr.copy()

    if arr_type == "cp":
        arr = cp.asarray(arr, dtype=dtype)
        # Snapshot of the device input to verify copy vs in-place behavior.
        arr_orig = arr.copy()

    monotonic, mapped_classes = make_monotonic(arr, copy=copy)

    cp.cuda.Stream.null.synchronize()

    # Labels should be remapped to a dense 0..k-1 encoding by sorted order.
    assert array_equal(monotonic.get(), np.array([0, 2, 1, 4, 3, 4]))

    # We only care about in-place updating if data is on device
    if arr_type == "cp":
        if copy:
            # copy=True must leave the input array untouched.
            assert array_equal(arr_orig.get(), arr.get())
        else:
            # copy=False must have updated the input in place.
            assert array_equal(arr.get(), monotonic.get())

    # check_labels must reject a class set that does not cover the labels...
    wrong_classes = cp.asarray([0, 1, 2], dtype=dtype)
    val_labels = check_labels(monotonic.get(), classes=wrong_classes)

    cp.cuda.Stream.null.synchronize()

    assert not val_labels

    # ...and accept the complete class set.
    correct_classes = cp.asarray([0, 1, 2, 3, 4], dtype=dtype)
    val_labels = check_labels(monotonic.get(), classes=correct_classes)

    cp.cuda.Stream.null.synchronize()

    assert val_labels

    if arr_type == "cp":
        monotonic_copy = monotonic.copy()

    inverted = invert_labels(monotonic,
                             classes=cp.asarray([0, 10, 15, 20, 50],
                                                dtype=dtype),
                             copy=copy)

    cp.cuda.Stream.null.synchronize()

    if arr_type == "cp":
        if copy:
            # copy=True leaves the monotonic input untouched.
            assert array_equal(monotonic_copy.get(), monotonic.get())
        else:
            # copy=False inverts in place, restoring the original labels.
            assert array_equal(monotonic.get(), arr_orig.get())

    assert array_equal(inverted.get(), original)
Пример #20
0
def test_targetencoder_pandas():
    """
    Note that there are newly-encountered values in test,
    namely, 'c' and 'd'.
    """
    train = pandas.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    test = pandas.DataFrame({'category': ['c', 'b', 'a', 'd']})
    encoder = TargetEncoder()
    encoder.fit_transform(train.category, train.label)
    test_encoded = encoder.transform(test.category)
    # Unseen categories ('c', 'd') fall back to the global mean, 0.75.
    answer = np.array([0.75, 0.5, 1., 0.75])
    assert array_equal(test_encoded, answer)
    # Removed leftover debug print; keep the output-type contract check.
    assert isinstance(test_encoded, np.ndarray)
Пример #21
0
def test_pca_inverse_transform(datatype, input_type):
    """Full-rank PCA must reconstruct its input via inverse_transform."""
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
    cupca = cuPCA(n_components=2)

    if input_type == 'dataframe':
        projected = cupca.fit_transform(gdf)
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)
        projected = cupca.fit_transform(X)

    # With n_components equal to the input rank the reconstruction is exact
    # up to numerical tolerance.
    reconstructed = cupca.inverse_transform(projected)

    assert array_equal(reconstructed, gdf, 1e-3, with_sign=True)
Пример #22
0
def test_pca_fit(datatype, input_type, name, use_handle):
    """Compare cuML PCA fit attributes against scikit-learn PCA."""
    if name == 'blobs':
        # BUG FIX: removed the unreachable make_blobs call that followed
        # pytest.skip (skip raises, so that line could never execute).
        pytest.skip('fails when using blobs dataset')

    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data

    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        X = pd.DataFrame({'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cupca.fit(X_cudf)

    else:
        cupca.fit(X)

    # Ensure all queued GPU work is finished before reading attributes.
    cupca.handle.sync()

    for attr in [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_', 'noise_variance_'
    ]:
        # The sign of principal components is arbitrary, so ignore it there.
        with_sign = False if attr in ['components_'] else True
        # Removed leftover per-attribute debug prints.
        cuml_res = (getattr(cupca, attr))
        if isinstance(cuml_res, cudf.Series):
            cuml_res = cuml_res.to_array()
        else:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
Пример #23
0
def test_pca_fit(nrows, ncols, n_parts, cluster):
    """Distributed PCA on dask-cudf blobs vs scikit-learn PCA."""
    client = Client(cluster)

    try:
        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA

        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=0.5, verbose=False,
                               random_state=10, dtype=np.float32)

        wait(X_cudf)

        # BUG FIX: do not swallow fit errors. The old try/except printed the
        # exception and continued, surfacing later as a confusing
        # AttributeError on the attribute checks. Also removed the debug
        # head() print.
        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_cudf)

        X = X_cudf.compute().to_pandas().values

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X)

        from cuml.test.utils import array_equal

        all_attr = ['singular_values_', 'components_',
                    'explained_variance_', 'explained_variance_ratio_']

        for attr in all_attr:
            # Sign of principal components is arbitrary, so ignore it there.
            with_sign = attr != 'components_'
            # BUG FIX: dropped the broken `type(...) == np.ndarray` ->
            # `.as_matrix()` conversion (numpy arrays have no as_matrix()).
            cuml_res = getattr(cupca, attr)
            skl_res = getattr(skpca, attr)
            assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
    finally:
        client.close()
Пример #24
0
def test_tsvd_fit_transform(datatype, input_type):
    """cuML TSVD fit_transform must match scikit-learn's TruncatedSVD."""
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                 dtype=datatype)
    reference = skTSVD(n_components=1).fit_transform(X)

    cutsvd = cuTSVD(n_components=1)

    if input_type == 'dataframe':
        # Same data expressed column-wise as a cudf DataFrame.
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
        gdf['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)
        result = cutsvd.fit_transform(gdf)
    else:
        result = cutsvd.fit_transform(X)

    assert array_equal(result, reference, 1e-3, with_sign=True)
Пример #25
0
def test_linear_regression_model_default(datatype):
    """Default-parameter cuML LinearRegression must match sklearn's."""
    X_train, X_test, y_train, y_test = small_regression_dataset(datatype)

    # cuML model: fit and predict.
    cu_model = cuLinearRegression()
    cu_model.fit(X_train, y_train)
    cu_pred = cu_model.predict(X_test)

    # scikit-learn reference: fit and predict.
    sk_model = skLinearRegression()
    sk_model.fit(X_train, y_train)
    sk_pred = sk_model.predict(X_test)

    assert array_equal(sk_pred, cu_pred, 1e-1, with_sign=True)
Пример #26
0
def test_targetencoder_random(n_samples, dtype):
    """Cross-check TargetEncoder.transform against a groupby-mean baseline."""
    x = cp.random.randint(0, 1000, n_samples).astype(dtype)
    y = cp.random.randint(0, 2, n_samples).astype(dtype)
    xt = cp.random.randint(0, 1000, n_samples).astype(dtype)

    encoder = TargetEncoder()
    encoder.fit_transform(x, y)
    test_encoded = encoder.transform(xt)

    # Reference: per-category target mean; unseen categories fall back to
    # the global mean. row_id preserves the original ordering across merge.
    means = cudf.DataFrame({'x': x, 'y': y}).groupby(
        'x', as_index=False).agg({'y': 'mean'})
    reference = cudf.DataFrame({'x': xt})
    reference['row_id'] = cp.arange(len(reference))
    reference = reference.merge(means, on='x', how='left')
    reference = reference.sort_values('row_id')
    answer = reference['y'].fillna(cp.mean(y).item()).values
    assert array_equal(test_encoded, answer)
Пример #27
0
def test_ann_distances_metrics(algo, metric):
    """ANN kneighbors distances must match brute-force sklearn distances."""
    X, y = make_blobs(n_samples=500, centers=2, n_features=128, random_state=0)

    cu_knn = cuKNN(algorithm=algo, metric=metric)
    cu_knn.fit(X)
    cu_dist, cu_ind = cu_knn.kneighbors(X,
                                        n_neighbors=10,
                                        return_distance=True)
    # Free the GPU index before running the CPU reference.
    del cu_knn
    gc.collect()

    X = X.get()
    sk_knn = skKNN(metric=metric)
    sk_knn.fit(X)
    sk_dist, sk_ind = sk_knn.kneighbors(X,
                                        n_neighbors=10,
                                        return_distance=True)

    # BUG FIX: the comparison was `return`ed; pytest ignores return values,
    # so the test could never fail. Assert the comparison instead.
    assert array_equal(sk_dist, cu_dist)
Пример #28
0
def test_ivfflat_pred(nrows, ncols, n_neighbors, nlist):
    """IVF-Flat neighbor indices should recover blob labels via kNN vote."""
    algo_params = {'nlist': nlist, 'nprobe': nlist * 0.25}

    X, y = make_blobs(n_samples=nrows,
                      centers=5,
                      n_features=ncols,
                      random_state=0)

    index = cuKNN(algorithm="ivfflat", algo_params=algo_params)
    index.fit(X)
    neigh_ind = index.kneighbors(X,
                                 n_neighbors=n_neighbors,
                                 return_distance=False)
    # Release the GPU index before computing the label vote.
    del index
    gc.collect()

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
Пример #29
0
def test_cov(nrows, ncols, sparse, dtype):
    """cuML cov must agree with cupy's cov for dense and sparse inputs."""
    if sparse:
        x = cupyx.scipy.sparse.random(nrows,
                                      ncols,
                                      density=0.07,
                                      format='csr',
                                      dtype=dtype)
    else:
        x = cp.random.random((nrows, ncols), dtype=dtype)

    result = cov(x, x)

    assert result.shape == (ncols, ncols)

    # Densify (if needed) and compare against cupy's reference covariance.
    dense = x.todense() if sparse else x
    expected = cp.cov(dense, rowvar=False, ddof=0)

    assert array_equal(result, expected, 1e-6, with_sign=True)
Пример #30
0
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    """Compare cuML DBSCAN to scikit-learn DBSCAN on well-separated blobs
    and verify the requested output label dtype is honored."""
    # Shrink (or skip) the stress-sized configuration on smaller GPUs.
    if nrows == 500000 and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    n_samples = nrows
    n_feats = ncols
    # A tiny cluster_std keeps clusters well-separated so clusterings match.
    X, y = make_blobs(n_samples=n_samples,
                      cluster_std=0.01,
                      n_features=n_feats,
                      random_state=0)

    handle, stream = get_handle(use_handle)

    eps = 1
    cuml_dbscan = cuDBSCAN(handle=handle,
                           eps=eps,
                           min_samples=2,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')

    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    # The brute-force sklearn reference is only feasible below stress size.
    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    # out_dtype may be given as a string or a numpy dtype; accept both.
    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64