Пример #1
0
def test_make_column_selector():
    """Check column selectors in a column transformer against the sk one.

    Three selectors are exercised: dtype exclusion, dtype inclusion and a
    column-name pattern. The ``cu*`` transformer runs on the ``cudf`` copy
    of the frame, the ``sk*`` transformer on the pandas original.
    """
    frame_data = {
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'rating': [5, 3, 4, 5],
        'temperature': [21., 21., 24., 28.],
    }
    host_df = pdDataFrame(frame_data)
    X = cudf.from_pandas(host_df)

    cu_steps = [
        ("ohe",
         cuOneHotEncoder(),
         cu_make_column_selector(dtype_exclude=np.number)),
        ("scaler",
         cuStandardScaler(),
         cu_make_column_selector(dtype_include=np.integer)),
        ("normalizer",
         cuNormalizer(),
         cu_make_column_selector(pattern="temp")),
    ]
    cu_pipeline = cuColumnTransformer(cu_steps, remainder='drop')
    cu_out = cu_pipeline.fit_transform(X)

    sk_steps = [
        ("ohe",
         skOneHotEncoder(),
         sk_make_column_selector(dtype_exclude=np.number)),
        ("scaler",
         skStandardScaler(),
         sk_make_column_selector(dtype_include=np.integer)),
        ("normalizer",
         skNormalizer(),
         sk_make_column_selector(pattern="temp")),
    ]
    sk_pipeline = skColumnTransformer(sk_steps, remainder='drop')
    sk_out = sk_pipeline.fit_transform(host_df)

    # Values must match the reference and the output container type must
    # be the same as the input's.
    assert_allclose(cu_out, sk_out)
    assert type(cu_out) == type(X)
Пример #2
0
def test_maxabs_scaler_sparse(failure_logger,
                              sparse_clf_dataset):  # noqa: F811
    """Compare ``cuMaxAbsScaler`` with ``skMaxAbsScaler`` on sparse data.

    Both the forward transform and its inverse are compared with the
    reference, and each output must stay sparse in the same library
    (``cpx.scipy.sparse`` or ``scipy.sparse``) as its input.
    """
    X_np, X = sparse_clf_dataset

    cu_scaler = cuMaxAbsScaler(copy=True)
    t_X = cu_scaler.fit_transform(X)
    # The result of this second call is discarded.
    # NOTE(review): presumably kept to exercise refitting - confirm.
    cu_scaler.fit_transform(X)
    r_X = cu_scaler.inverse_transform(t_X)

    # Exact type equality is not asserted; only "sparse in, sparse out"
    # is required, first for the transform and then for its inverse.
    for source, result in ((X, t_X), (t_X, r_X)):
        if cpx.scipy.sparse.issparse(source):
            assert cpx.scipy.sparse.issparse(result)
        if scipy.sparse.issparse(source):
            assert scipy.sparse.issparse(result)

    sk_scaler = skMaxAbsScaler(copy=True)
    sk_t_X = sk_scaler.fit_transform(X_np)
    sk_r_X = sk_scaler.inverse_transform(sk_t_X)

    assert_allclose(t_X, sk_t_X)
    assert_allclose(r_X, sk_r_X)
Пример #3
0
def test_imputer_sparse(
        sparse_int_dataset,
        strategy,  # noqa: F811
        missing_values):
    """Compare ``cuSimpleImputer`` with ``skSimpleImputer`` on sparse data.

    CSR inputs are skipped. When ``missing_values`` is NaN, the same
    randomly chosen 10% of stored entries are overwritten with NaN in both
    the reference matrix and a copy of the tested matrix.
    """
    X_np, X = sparse_int_dataset

    if X.format == 'csr':
        pytest.skip("Skipping CSR matrices")

    X_sp = X_np.tocsc()

    if np.isnan(missing_values):
        # Inject NaN at identical positions of the stored-value arrays.
        n_nan = int(X.nnz * 0.1)
        nan_positions = np.random.choice(X.nnz, n_nan, replace=False)
        X_sp.data[nan_positions] = np.nan
        X = X.copy()
        X.data[nan_positions] = np.nan

    fill_value = np.random.randint(10, size=1)[0]

    # Both imputers are configured identically.
    imputer_kwargs = dict(copy=True,
                          missing_values=missing_values,
                          strategy=strategy,
                          fill_value=fill_value)

    cu_imputer = cuSimpleImputer(**imputer_kwargs)
    t_X = cu_imputer.fit_transform(X)
    assert type(t_X) == type(X)

    sk_imputer = skSimpleImputer(**imputer_kwargs)
    sk_t_X = sk_imputer.fit_transform(X_sp)
    assert_allclose(t_X, sk_t_X)
Пример #4
0
def test_make_column_transformer(clf_dataset, remainder):  # noqa: F811
    """Compare ``cu_make_column_transformer`` with the sk version.

    Columns 0 and 2 go through a standard scaler and columns 1 and 3
    through a normalizer. For dataframe inputs the ``cu`` side selects the
    columns by the names ``'c<i>'`` instead of by position.
    """
    X_np, X = clf_dataset

    sk_selec1 = [0, 2]
    sk_selec2 = [1, 3]
    if isinstance(X, (pdDataFrame, cuDataFrame)):
        cu_selec1 = [f'c{idx}' for idx in sk_selec1]
        cu_selec2 = [f'c{idx}' for idx in sk_selec2]
    else:
        cu_selec1, cu_selec2 = sk_selec1, sk_selec2

    cu_transformer = cu_make_column_transformer(
        (cuStandardScaler(), cu_selec1),
        (cuNormalizer(), cu_selec2),
        remainder=remainder)

    # ``fit_transform`` and a following plain ``transform`` are both
    # compared with the reference below.
    ft_X = cu_transformer.fit_transform(X)
    t_X = cu_transformer.transform(X)
    assert type(t_X) == type(X)

    sk_transformer = sk_make_column_transformer(
        (skStandardScaler(), sk_selec1),
        (skNormalizer(), sk_selec2),
        remainder=remainder)
    sk_t_X = sk_transformer.fit_transform(X_np)

    assert_allclose(ft_X, sk_t_X)
    assert_allclose(t_X, sk_t_X)
Пример #5
0
def test_missing_indicator(failure_logger, int_dataset,  # noqa: F811
                           missing_values, features):
    """Compare ``cuMissingIndicator`` with ``skMissingIndicator``.

    The dataset variant is picked to match ``missing_values`` (0, 1 or
    anything else). Both ``fit_transform`` and ``fit`` + ``transform`` are
    compared with the reference.
    """
    zero_filled, one_filled, nan_filled = int_dataset
    if missing_values == 0:
        selected = zero_filled
    elif missing_values == 1:
        selected = one_filled
    else:
        selected = nan_filled
    X_np, X = selected

    cu_indicator = cuMissingIndicator(missing_values=missing_values,
                                      features=features)
    ft_X = cu_indicator.fit_transform(X)
    assert type(ft_X) == type(X)
    cu_indicator.fit(X)
    t_X = cu_indicator.transform(X)
    assert type(t_X) == type(X)

    sk_indicator = skMissingIndicator(missing_values=missing_values,
                                      features=features)
    sk_ft_X = sk_indicator.fit_transform(X_np)
    sk_indicator.fit(X_np)
    sk_t_X = sk_indicator.transform(X_np)

    assert_allclose(ft_X, sk_ft_X)
    assert_allclose(t_X, sk_t_X)
Пример #6
0
def test_make_column_transformer_sparse(sparse_clf_dataset,  # noqa: F811
                                        remainder, sparse_threshold):
    """Check ``cu_make_column_transformer`` on sparse input against sk.

    CSC inputs are marked as expected failures. Columns 0 and 2 go through
    a standard scaler without centering and columns 1 and 3 through a
    normalizer. Both ``fit_transform`` and a following ``transform`` are
    compared with the reference output.
    """
    X_np, X = sparse_clf_dataset

    if X.format == 'csc':
        pytest.xfail()

    # Density is the share of stored entries among all rows * cols cells.
    # ``X.size`` must not be used as the cell count: SciPy documents it as
    # the number of stored values for sparse matrices, which would make
    # the ratio meaningless.
    n_rows, n_cols = X.shape
    dataset_density = X.nnz / (n_rows * n_cols)

    transformer = cu_make_column_transformer(
        (cuStandardScaler(with_mean=False), [0, 2]),
        (cuNormalizer(), [1, 3]),
        remainder=remainder,
        sparse_threshold=sparse_threshold)

    ft_X = transformer.fit_transform(X)
    t_X = transformer.transform(X)
    if dataset_density < sparse_threshold:
        # Sparse input -> sparse output if density < sparse_threshold,
        # else sparse input -> dense output.
        # NOTE(review): the input density is used as a stand-in for the
        # density of the stacked transformer outputs - confirm this is
        # close enough for the fixture data.
        assert type(t_X) == type(X)

    transformer = sk_make_column_transformer(
        (skStandardScaler(with_mean=False), [0, 2]),
        (skNormalizer(), [1, 3]),
        remainder=remainder,
        sparse_threshold=sparse_threshold)

    sk_t_X = transformer.fit_transform(X_np)

    assert_allclose(ft_X, sk_t_X)
    assert_allclose(t_X, sk_t_X)
Пример #7
0
def test_robust_scaler_sparse(
        sparse_clf_dataset,  # noqa: F811
        with_scaling,
        quantile_range):
    """Compare ``cuRobustScaler`` with ``skRobustScaler`` on sparse data.

    The tested matrix is converted to CSC when it is in another format.
    Centering is disabled; the transform and its inverse are both
    compared with the reference.
    """
    X_np, X = sparse_clf_dataset

    if X.format != 'csc':
        X = X.tocsc()

    # Both scalers are configured identically.
    scaler_kwargs = dict(with_centering=False,
                         with_scaling=with_scaling,
                         quantile_range=quantile_range,
                         copy=True)

    cu_scaler = cuRobustScaler(**scaler_kwargs)
    t_X = cu_scaler.fit_transform(X)
    r_X = cu_scaler.inverse_transform(t_X)
    assert type(t_X) == type(X)
    assert type(r_X) == type(t_X)

    sk_scaler = skRobustScaler(**scaler_kwargs)
    sk_t_X = sk_scaler.fit_transform(X_np)
    sk_r_X = sk_scaler.inverse_transform(sk_t_X)

    assert_allclose(t_X, sk_t_X)
    assert_allclose(r_X, sk_r_X)
Пример #8
0
def test_robust_scaler(
        clf_dataset,
        with_centering,  # noqa: F811
        with_scaling,
        quantile_range):
    """Compare ``cuRobustScaler`` with ``skRobustScaler``.

    The transform and its inverse are compared with the reference, and
    both outputs must have the same type as their respective input.
    """
    X_np, X = clf_dataset

    # Both scalers are configured identically.
    scaler_kwargs = dict(with_centering=with_centering,
                         with_scaling=with_scaling,
                         quantile_range=quantile_range,
                         copy=True)

    cu_scaler = cuRobustScaler(**scaler_kwargs)
    t_X = cu_scaler.fit_transform(X)
    r_X = cu_scaler.inverse_transform(t_X)
    assert type(t_X) == type(X)
    assert type(r_X) == type(t_X)

    sk_scaler = skRobustScaler(**scaler_kwargs)
    sk_t_X = sk_scaler.fit_transform(X_np)
    sk_r_X = sk_scaler.inverse_transform(sk_t_X)

    assert_allclose(t_X, sk_t_X)
    assert_allclose(r_X, sk_r_X)
Пример #9
0
def test_poly_features(
        clf_dataset,
        degree,  # noqa: F811
        interaction_only,
        include_bias,
        order):
    """Compare ``cuPolynomialFeatures`` with ``skPolynomialFeatures``.

    For NumPy outputs the memory layout requested through ``order`` is
    also verified. Values are compared with loose tolerances
    (``rtol=0.1``, ``atol=0.1``).
    """
    X_np, X = clf_dataset

    # Both feature generators are configured identically.
    poly_kwargs = dict(degree=degree,
                       order=order,
                       interaction_only=interaction_only,
                       include_bias=include_bias)

    cu_poly = cuPolynomialFeatures(**poly_kwargs)
    t_X = cu_poly.fit_transform(X)
    assert type(X) == type(t_X)

    if isinstance(t_X, np.ndarray):
        # Only 'C' and 'F' map to a contiguity flag; any other value of
        # ``order`` is not checked.
        if order == 'C':
            layout_flag = 'C_CONTIGUOUS'
        elif order == 'F':
            layout_flag = 'F_CONTIGUOUS'
        else:
            layout_flag = None
        if layout_flag is not None:
            assert t_X.flags[layout_flag]

    sk_poly = skPolynomialFeatures(**poly_kwargs)
    sk_t_X = sk_poly.fit_transform(X_np)

    assert_allclose(t_X, sk_t_X, rtol=0.1, atol=0.1)
Пример #10
0
def test_standard_scaler_sparse(failure_logger,
                                sparse_clf_dataset,  # noqa: F811
                                with_std):
    """Compare ``cuStandardScaler`` with ``skStandardScaler`` on sparse data.

    Centering is disabled. The transform and its inverse are compared with
    the reference, and each output must stay sparse in the same library
    (``cpx.scipy.sparse`` or ``scipy.sparse``) as its input.
    """
    X_np, X = sparse_clf_dataset

    cu_scaler = cuStandardScaler(with_mean=False,
                                 with_std=with_std,
                                 copy=True)
    t_X = cu_scaler.fit_transform(X)
    # The result of this second call is discarded.
    # NOTE(review): presumably kept to exercise refitting - confirm.
    cu_scaler.fit_transform(X)
    r_X = cu_scaler.inverse_transform(t_X)

    # Exact type equality is not asserted; only "sparse in, sparse out"
    # is required, first for the transform and then for its inverse.
    for source, result in ((X, t_X), (t_X, r_X)):
        if cpx.scipy.sparse.issparse(source):
            assert cpx.scipy.sparse.issparse(result)
        if scipy.sparse.issparse(source):
            assert scipy.sparse.issparse(result)

    sk_scaler = skStandardScaler(copy=True,
                                 with_mean=False,
                                 with_std=with_std)
    sk_t_X = sk_scaler.fit_transform(X_np)
    sk_r_X = sk_scaler.inverse_transform(sk_t_X)

    assert_allclose(t_X, sk_t_X)
    assert_allclose(r_X, sk_r_X)
Пример #11
0
def test_robust_scale_sparse(
        sparse_clf_dataset,  # noqa: F811
        axis,
        with_scaling,
        quantile_range):
    """Compare ``cu_robust_scale`` with ``sk_robust_scale`` on sparse data.

    The tested matrix is converted to CSC for ``axis == 0`` and to CSR for
    ``axis == 1`` when it is not already in that format. Centering is
    disabled.
    """
    X_np, X = sparse_clf_dataset

    current_format = X.format
    if axis == 0:
        if current_format != 'csc':
            X = X.tocsc()
    elif axis == 1:
        if current_format != 'csr':
            X = X.tocsr()

    # Both functions receive the same keyword arguments.
    scale_kwargs = dict(axis=axis,
                        with_centering=False,
                        with_scaling=with_scaling,
                        quantile_range=quantile_range,
                        copy=True)

    cu_out = cu_robust_scale(X, **scale_kwargs)
    assert type(cu_out) == type(X)

    sk_out = sk_robust_scale(X_np, **scale_kwargs)

    assert_allclose(cu_out, sk_out)
Пример #12
0
def test_add_dummy_feature_sparse(sparse_clf_dataset, value):  # noqa: F811
    """Compare ``cu_add_dummy_feature`` with ``sk_add_dummy_feature``.

    The output must have the same type as the input and match the
    reference values.
    """
    X_np, X = sparse_clf_dataset

    cu_out = cu_add_dummy_feature(X, value=value)
    assert type(cu_out) == type(X)

    sk_out = sk_add_dummy_feature(X_np, value=value)
    assert_allclose(cu_out, sk_out)
Пример #13
0
def test_inplace_csr_row_normalize_l2(failure_logger, sparse_random_dataset):
    """Check ``inplace_csr_row_normalize_l2`` against ``sk_normalize``.

    Only CSR inputs are tested. The sparse matrix is modified in place
    and compared with the row-wise L2 normalization of the first fixture
    element.
    """
    dense_host, _, _, X_sparse = sparse_random_dataset
    if X_sparse.format != 'csr':
        pytest.skip('Skip non CSR matrices')

    inplace_csr_row_normalize_l2(X_sparse)
    expected = sk_normalize(dense_host, norm='l2', axis=1)
    assert_allclose(X_sparse, expected)
Пример #14
0
def test_binarize_sparse(sparse_clf_dataset, threshold):  # noqa: F811
    """Compare ``cu_binarize`` with ``sk_binarize`` on sparse data.

    The output must have the same type as the input and match the
    reference values.
    """
    X_np, X = sparse_clf_dataset

    cu_out = cu_binarize(X, threshold=threshold, copy=True)
    assert type(cu_out) == type(X)

    sk_out = sk_binarize(X_np, threshold=threshold, copy=True)

    assert_allclose(cu_out, sk_out)
Пример #15
0
def test_scale_sparse(sparse_clf_dataset, with_std):  # noqa: F811
    """Compare ``cu_scale`` with ``sk_scale`` on sparse data.

    Centering is disabled (``with_mean=False``); the output must have the
    same type as the input and match the reference values.
    """
    X_np, X = sparse_clf_dataset

    # Both functions receive the same keyword arguments.
    scale_kwargs = dict(copy=True, with_mean=False, with_std=with_std)

    cu_out = cu_scale(X, **scale_kwargs)
    assert type(cu_out) == type(X)

    sk_out = sk_scale(X_np, **scale_kwargs)

    assert_allclose(cu_out, sk_out)
Пример #16
0
def test_binarize(failure_logger, clf_dataset, threshold):  # noqa: F811
    """Compare ``cu_binarize`` with ``sk_binarize``.

    The output must have the same type as the input and match the
    reference values.
    """
    X_np, X = clf_dataset

    cu_out = cu_binarize(X, threshold=threshold, copy=True)
    assert type(cu_out) == type(X)

    sk_out = sk_binarize(X_np, threshold=threshold, copy=True)

    assert_allclose(cu_out, sk_out)
Пример #17
0
def test_maxabs_scale(failure_logger, clf_dataset, axis):  # noqa: F811
    """Compare ``cu_maxabs_scale`` with ``sk_maxabs_scale`` along ``axis``.

    The output must have the same type as the input and match the
    reference values.
    """
    X_np, X = clf_dataset

    cu_out = cu_maxabs_scale(X, axis=axis)
    assert type(cu_out) == type(X)

    sk_out = sk_maxabs_scale(X_np, axis=axis)

    assert_allclose(cu_out, sk_out)
Пример #18
0
def test_minmax_scale(clf_dataset):  # noqa: F811
    """Compare ``cu_minmax_scale`` with ``sk_minmax_scale`` using defaults.

    The output must have the same type as the input and match the
    reference values.
    """
    X_np, X = clf_dataset

    cu_out = cu_minmax_scale(X)
    assert type(cu_out) == type(X)

    sk_out = sk_minmax_scale(X_np)

    assert_allclose(cu_out, sk_out)
Пример #19
0
def test_minmax_scale(failure_logger, clf_dataset,  # noqa: F811
                      axis, feature_range):
    """Compare ``cu_minmax_scale`` with ``sk_minmax_scale``.

    ``feature_range`` and ``axis`` are forwarded unchanged to both
    functions; the output must have the same type as the input and match
    the reference values.
    """
    X_np, X = clf_dataset

    # Both functions receive the same keyword arguments.
    scale_kwargs = dict(feature_range=feature_range, axis=axis)

    cu_out = cu_minmax_scale(X, **scale_kwargs)
    assert type(cu_out) == type(X)

    sk_out = sk_minmax_scale(X_np, **scale_kwargs)

    assert_allclose(cu_out, sk_out)
Пример #20
0
def test_row_norms(failure_logger, sparse_random_dataset, square):
    """Compare ``cu_row_norms`` with ``sk_row_norms``.

    The fixture is unpacked as ``(X_np, X, X_sparse_np, X_sparse)``. In
    both cases the ``cu`` routine is run on the non-``_np`` element and
    the ``sk`` reference on the matching ``_np`` element; ``square`` is
    forwarded as the ``squared`` argument.
    """
    X_np, X, X_sparse_np, X_sparse = sparse_random_dataset

    # First pair: ``X`` goes to the tested routine and ``X_np`` to the
    # reference, mirroring the sparse pair below.
    cu_norms = cu_row_norms(X, squared=square)
    sk_norms = sk_row_norms(X_np, squared=square)
    assert_allclose(cu_norms, sk_norms)

    # Second pair: the sparse elements of the fixture.
    cu_norms = cu_row_norms(X_sparse, squared=square)
    sk_norms = sk_row_norms(X_sparse_np, squared=square)
    assert_allclose(cu_norms, sk_norms)
Пример #21
0
def test_inplace_csr_row_scale(failure_logger, random_seed,
                               sparse_random_dataset):
    """Check ``cu_inplace_csr_row_scale`` against the sk version.

    Only CSR inputs are tested. The same 100 random factors, drawn with
    ``cp.random`` after seeding, are applied in place to both matrices.
    """
    _, _, X_sparse_np, X_sparse = sparse_random_dataset
    if X_sparse.format != 'csr':
        pytest.skip()

    cp.random.seed(random_seed)
    row_factors = cp.random.rand(100)

    cu_inplace_csr_row_scale(X_sparse, row_factors)
    # The reference is given the factors as returned by ``.get()``.
    sk_inplace_csr_row_scale(X_sparse_np, row_factors.get())

    assert_allclose(X_sparse, X_sparse_np)
Пример #22
0
def test_inplace_column_scale(failure_logger, random_seed,
                              sparse_random_dataset):
    """Check ``cu_inplace_column_scale`` against the sk version.

    The same 10 random factors, drawn with ``cp.random`` after seeding,
    are applied in place to both sparse matrices. Calling the tested
    routine on the second fixture element must raise.
    """
    _, X, X_sparse_np, X_sparse = sparse_random_dataset

    cp.random.seed(random_seed)
    column_factors = cp.random.rand(10)

    cu_inplace_column_scale(X_sparse, column_factors)
    # The reference is given the factors as returned by ``.get()``.
    sk_inplace_column_scale(X_sparse_np, column_factors.get())
    assert_allclose(X_sparse, X_sparse_np)

    # NOTE(review): ``X`` is presumably the non-sparse counterpart, which
    # the routine is expected to reject - confirm against the fixture.
    with pytest.raises(Exception):
        cu_inplace_column_scale(X, column_factors)
Пример #23
0
def test_normalizer(clf_dataset, norm):  # noqa: F811
    """Compare ``cuNormalizer`` with ``skNormalizer`` for the given norm.

    The output must have the same type as the input and match the
    reference values.
    """
    X_np, X = clf_dataset

    cu_normalizer = cuNormalizer(norm=norm, copy=True)
    cu_out = cu_normalizer.fit_transform(X)
    assert type(cu_out) == type(X)

    sk_normalizer = skNormalizer(norm=norm, copy=True)
    sk_out = sk_normalizer.fit_transform(X_np)

    assert_allclose(cu_out, sk_out)
Пример #24
0
def test_binarizer_sparse(sparse_clf_dataset, threshold):  # noqa: F811
    """Compare ``cuBinarizer`` with ``skBinarizer`` on sparse data.

    The output must have the same type as the input and match the
    reference values.
    """
    X_np, X = sparse_clf_dataset

    cu_binarizer = cuBinarizer(threshold=threshold, copy=True)
    cu_out = cu_binarizer.fit_transform(X)
    assert type(cu_out) == type(X)

    sk_binarizer = skBinarizer(threshold=threshold, copy=True)
    sk_out = sk_binarizer.fit_transform(X_np)

    assert_allclose(cu_out, sk_out)
Пример #25
0
def test_normalize_sparse(sparse_clf_dataset, norm):  # noqa: F811
    """Compare ``cu_normalize`` with ``sk_normalize`` on sparse data.

    The axis follows the storage format of the tested matrix: 0 for CSC,
    1 for anything else.
    """
    X_np, X = sparse_clf_dataset

    if X.format == 'csc':
        axis = 0
    else:
        axis = 1

    cu_out = cu_normalize(X, axis=axis, norm=norm)
    assert type(cu_out) == type(X)

    sk_out = sk_normalize(X_np, axis=axis, norm=norm)

    assert_allclose(cu_out, sk_out)
Пример #26
0
def test_csc_mean_variance_axis0(failure_logger, sparse_random_dataset):
    """Check ``csc_mean_variance_axis0`` against NumPy references.

    Only CSC inputs are tested. The expected values are ``np.nanmean``
    and ``np.nanvar`` along axis 0 of the first fixture element.
    """
    dense_host, _, _, X_sparse = sparse_random_dataset
    if X_sparse.format != 'csc':
        pytest.skip('Skip non CSC matrices')

    means, variances = csc_mean_variance_axis0(X_sparse)

    expected_means = np.nanmean(dense_host, axis=0)
    expected_variances = np.nanvar(dense_host, axis=0)

    assert_allclose(means, expected_means)
    assert_allclose(variances, expected_variances)
Пример #27
0
def test_scale(failure_logger, clf_dataset, axis,  # noqa: F811
               with_mean, with_std):
    """Compare ``cu_scale`` with ``sk_scale``.

    ``axis``, ``with_mean`` and ``with_std`` are forwarded unchanged to
    both functions; the output must have the same type as the input and
    match the reference values.
    """
    X_np, X = clf_dataset

    # Both functions receive the same keyword arguments.
    scale_kwargs = dict(axis=axis,
                        with_mean=with_mean,
                        with_std=with_std,
                        copy=True)

    cu_out = cu_scale(X, **scale_kwargs)
    assert type(cu_out) == type(X)

    sk_out = sk_scale(X_np, **scale_kwargs)

    assert_allclose(cu_out, sk_out)
Пример #28
0
def test_add_dummy_feature_sparse(sparse_dataset_with_coo,  # noqa: F811
                                  value):
    """Compare ``cu_add_dummy_feature`` with ``sk_add_dummy_feature``.

    The output must stay sparse in the same library as the input and
    match the reference values.
    """
    X_np, X = sparse_dataset_with_coo

    cu_out = cu_add_dummy_feature(X, value=value)

    # Exact type equality is not asserted; only "sparse in, sparse out".
    # NOTE(review): ``cp.sparse`` is used here rather than
    # ``cpx.scipy.sparse`` - confirm the alias exists in the targeted
    # CuPy version.
    input_is_cupy_sparse = cp.sparse.issparse(X)
    if input_is_cupy_sparse:
        assert cp.sparse.issparse(cu_out)
    input_is_scipy_sparse = scipy.sparse.issparse(X)
    if input_is_scipy_sparse:
        assert scipy.sparse.issparse(cu_out)

    sk_out = sk_add_dummy_feature(X_np, value=value)
    assert_allclose(cu_out, sk_out)
Пример #29
0
def test_binarize_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
                         threshold):
    """Compare ``cu_binarize`` with ``sk_binarize`` on sparse data.

    The output must stay sparse in the same library
    (``cpx.scipy.sparse`` or ``scipy.sparse``) as the input and match the
    reference values.
    """
    X_np, X = sparse_clf_dataset

    cu_out = cu_binarize(X, threshold=threshold, copy=True)

    # Exact type equality is not asserted; only "sparse in, sparse out".
    input_is_cupy_sparse = cpx.scipy.sparse.issparse(X)
    if input_is_cupy_sparse:
        assert cpx.scipy.sparse.issparse(cu_out)
    input_is_scipy_sparse = scipy.sparse.issparse(X)
    if input_is_scipy_sparse:
        assert scipy.sparse.issparse(cu_out)

    sk_out = sk_binarize(X_np, threshold=threshold, copy=True)

    assert_allclose(cu_out, sk_out)
Пример #30
0
def test_normalizer_sparse(sparse_clf_dataset, norm):  # noqa: F811
    """Compare ``cuNormalizer`` with ``skNormalizer`` on sparse data.

    CSC inputs are skipped. The output must have the same type as the
    input and match the reference values.
    """
    X_np, X = sparse_clf_dataset

    if X.format == 'csc':
        pytest.skip("Skipping CSC matrices")

    cu_normalizer = cuNormalizer(norm=norm, copy=True)
    cu_out = cu_normalizer.fit_transform(X)
    assert type(cu_out) == type(X)

    sk_normalizer = skNormalizer(norm=norm, copy=True)
    sk_out = sk_normalizer.fit_transform(X_np)

    assert_allclose(cu_out, sk_out)