def test_standard_scaler_sparse(failure_logger,
                                sparse_clf_dataset,  # noqa: F811
                                with_std):
    """Compare cuStandardScaler with skStandardScaler on sparse input.

    The sparse dataset is scaled and inverse-transformed with both
    scalers (``with_mean=False``). The results must agree, and each
    output must stay sparse in the same sparse library as its input.

    Parameters
    ----------
    failure_logger
        Fixture requested for the test; not referenced in the body.
    sparse_clf_dataset
        Fixture yielding ``(X_np, X)``: the reference data passed to
        skStandardScaler and the data passed to cuStandardScaler.
    with_std
        Forwarded to both scalers as ``with_std``.
    """
    X_np, X = sparse_clf_dataset

    scaler = cuStandardScaler(with_mean=False, with_std=with_std, copy=True)
    # One fit_transform is sufficient: a second call whose result is
    # discarded only refits on the same data and checks nothing.
    t_X = scaler.fit_transform(X)
    r_X = scaler.inverse_transform(t_X)

    # Exact type equality is deliberately not asserted here; only the
    # "sparse in, sparse out" property is checked, per sparse library.
    if cpx.scipy.sparse.issparse(X):
        assert cpx.scipy.sparse.issparse(t_X)
    if scipy.sparse.issparse(X):
        assert scipy.sparse.issparse(t_X)
    if cpx.scipy.sparse.issparse(t_X):
        assert cpx.scipy.sparse.issparse(r_X)
    if scipy.sparse.issparse(t_X):
        assert scipy.sparse.issparse(r_X)

    # Reference results computed from the X_np half of the fixture.
    scaler = skStandardScaler(copy=True, with_mean=False, with_std=with_std)
    sk_t_X = scaler.fit_transform(X_np)
    sk_r_X = scaler.inverse_transform(sk_t_X)

    assert_allclose(t_X, sk_t_X)
    assert_allclose(r_X, sk_r_X)
def test_column_transformer(clf_dataset, remainder,  # noqa: F811
                            transformer_weights):
    """Check cuColumnTransformer output against skColumnTransformer.

    A StandardScaler is applied to columns 0 and 2 and a Normalizer to
    columns 1 and 3. When ``X`` is a DataFrame the cu-side selectors
    use the column labels ``'c<i>'`` instead of integer positions.
    """
    X_np, X = clf_dataset

    sk_scaler_cols, sk_normalizer_cols = [0, 2], [1, 3]

    if isinstance(X, (pdDataFrame, cuDataFrame)):
        # DataFrame input: select by label rather than by position.
        cu_scaler_cols = ['c' + str(idx) for idx in sk_scaler_cols]
        cu_normalizer_cols = ['c' + str(idx) for idx in sk_normalizer_cols]
    else:
        cu_scaler_cols = sk_scaler_cols
        cu_normalizer_cols = sk_normalizer_cols

    cu_steps = [
        ("scaler", cuStandardScaler(), cu_scaler_cols),
        ("normalizer", cuNormalizer(), cu_normalizer_cols),
    ]
    cu_ct = cuColumnTransformer(
        cu_steps,
        remainder=remainder,
        transformer_weights=transformer_weights,
    )
    cu_fit_out = cu_ct.fit_transform(X)
    cu_out = cu_ct.transform(X)
    # The transformed output must come back in the input's own type.
    assert type(cu_out) == type(X)

    sk_steps = [
        ("scaler", skStandardScaler(), sk_scaler_cols),
        ("normalizer", skNormalizer(), sk_normalizer_cols),
    ]
    sk_ct = skColumnTransformer(
        sk_steps,
        remainder=remainder,
        transformer_weights=transformer_weights,
    )
    sk_out = sk_ct.fit_transform(X_np)

    # Both fit_transform and a separate transform must match the
    # reference result.
    assert_allclose(cu_fit_out, sk_out)
    assert_allclose(cu_out, sk_out)
def test_make_column_selector():
    """Check cu_make_column_selector against sk_make_column_selector.

    A small mixed-dtype frame is routed through a ColumnTransformer
    with three selectors (dtype exclusion, dtype inclusion and a name
    pattern); the cu result must match the sk result and have the same
    type as the cu-side input.
    """
    pandas_frame = pdDataFrame({
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'rating': [5, 3, 4, 5],
        'temperature': [21., 21., 24., 28.]
    })
    cu_frame = cudf.from_pandas(pandas_frame)

    cu_steps = [
        ("ohe", cuOneHotEncoder(),
         cu_make_column_selector(dtype_exclude=np.number)),
        ("scaler", cuStandardScaler(),
         cu_make_column_selector(dtype_include=np.integer)),
        ("normalizer", cuNormalizer(),
         cu_make_column_selector(pattern="temp")),
    ]
    cu_ct = cuColumnTransformer(cu_steps, remainder='drop')
    cu_out = cu_ct.fit_transform(cu_frame)

    sk_steps = [
        ("ohe", skOneHotEncoder(),
         sk_make_column_selector(dtype_exclude=np.number)),
        ("scaler", skStandardScaler(),
         sk_make_column_selector(dtype_include=np.integer)),
        ("normalizer", skNormalizer(),
         sk_make_column_selector(pattern="temp")),
    ]
    sk_ct = skColumnTransformer(sk_steps, remainder='drop')
    sk_out = sk_ct.fit_transform(pandas_frame)

    assert_allclose(cu_out, sk_out)
    assert type(cu_out) == type(cu_frame)
def test_make_column_transformer_sparse(sparse_clf_dataset,  # noqa: F811
                                        remainder, sparse_threshold):
    """Check cu_make_column_transformer on sparse input against sklearn.

    A StandardScaler (``with_mean=False``) is applied to columns 0 and 2
    and a Normalizer to columns 1 and 3. The cu results of
    ``fit_transform`` and ``transform`` must match the sk result, and
    when the data is sparse enough for ``sparse_threshold`` the output
    must have the same type as the input.

    Parameters
    ----------
    sparse_clf_dataset
        Fixture yielding ``(X_np, X)``: the reference data for the sk
        transformer and the sparse data for the cu transformer.
    remainder
        Forwarded to both transformers as ``remainder``.
    sparse_threshold
        Forwarded to both transformers as ``sparse_threshold``.
    """
    X_np, X = sparse_clf_dataset

    if X.format == 'csc':
        pytest.xfail()

    # Density = stored values / total number of cells. ``X.size`` must
    # not be the denominator: on a sparse matrix it is the number of
    # stored values (the same as ``nnz``), so the ratio would always be
    # 1 and the type check below would never run.
    n_rows, n_cols = X.shape
    dataset_density = X.nnz / (n_rows * n_cols)

    transformer = cu_make_column_transformer(
        (cuStandardScaler(with_mean=False), [0, 2]),
        (cuNormalizer(), [1, 3]),
        remainder=remainder,
        sparse_threshold=sparse_threshold)
    ft_X = transformer.fit_transform(X)
    t_X = transformer.transform(X)

    # Sparse input -> sparse output when the density is below
    # sparse_threshold, otherwise sparse input -> dense output.
    # NOTE(review): the input density is used as a proxy for the density
    # of the stacked transformer outputs - confirm this is close enough
    # for every ``remainder`` setting.
    if dataset_density < sparse_threshold:
        assert type(t_X) == type(X)

    transformer = sk_make_column_transformer(
        (skStandardScaler(with_mean=False), [0, 2]),
        (skNormalizer(), [1, 3]),
        remainder=remainder,
        sparse_threshold=sparse_threshold)
    sk_t_X = transformer.fit_transform(X_np)

    assert_allclose(ft_X, sk_t_X)
    assert_allclose(t_X, sk_t_X)
def test_make_column_transformer(clf_dataset, remainder):  # noqa: F811
    """Check cu_make_column_transformer against sk_make_column_transformer.

    A StandardScaler is applied to columns 0 and 2 and a Normalizer to
    columns 1 and 3. When ``X`` is a DataFrame the cu-side selectors
    use the column labels ``'c<i>'`` instead of integer positions.
    """
    X_np, X = clf_dataset

    sk_scaler_cols, sk_normalizer_cols = [0, 2], [1, 3]

    if isinstance(X, (pdDataFrame, cuDataFrame)):
        # DataFrame input: select by label rather than by position.
        cu_scaler_cols = ['c' + str(idx) for idx in sk_scaler_cols]
        cu_normalizer_cols = ['c' + str(idx) for idx in sk_normalizer_cols]
    else:
        cu_scaler_cols = sk_scaler_cols
        cu_normalizer_cols = sk_normalizer_cols

    cu_ct = cu_make_column_transformer(
        (cuStandardScaler(), cu_scaler_cols),
        (cuNormalizer(), cu_normalizer_cols),
        remainder=remainder,
    )
    cu_fit_out = cu_ct.fit_transform(X)
    cu_out = cu_ct.transform(X)
    # The transformed output must come back in the input's own type.
    assert type(cu_out) == type(X)

    sk_ct = sk_make_column_transformer(
        (skStandardScaler(), sk_scaler_cols),
        (skNormalizer(), sk_normalizer_cols),
        remainder=remainder,
    )
    sk_out = sk_ct.fit_transform(X_np)

    # Both fit_transform and a separate transform must match the
    # reference result.
    assert_allclose(cu_fit_out, sk_out)
    assert_allclose(cu_out, sk_out)
def test_standard_scaler_sparse(sparse_clf_dataset, with_std):  # noqa: F811
    """Compare cuStandardScaler with skStandardScaler on sparse input.

    Both scalers run with ``with_mean=False``. The forward and inverse
    transforms must match the sk results, and each cu output must have
    exactly the type of the object it was computed from.
    """
    X_np, X = sparse_clf_dataset

    cu_scaler = cuStandardScaler(
        copy=True, with_mean=False, with_std=with_std)
    cu_scaled = cu_scaler.fit_transform(X)
    cu_restored = cu_scaler.inverse_transform(cu_scaled)
    # Type must be preserved through both directions of the transform.
    assert type(cu_scaled) == type(X)
    assert type(cu_restored) == type(cu_scaled)

    sk_scaler = skStandardScaler(
        copy=True, with_mean=False, with_std=with_std)
    sk_scaled = sk_scaler.fit_transform(X_np)
    sk_restored = sk_scaler.inverse_transform(sk_scaled)

    assert_allclose(cu_scaled, sk_scaled)
    assert_allclose(cu_restored, sk_restored)
def test_standard_scaler(failure_logger, clf_dataset,  # noqa: F811
                         with_mean, with_std):
    """Compare cuStandardScaler with skStandardScaler.

    The forward and inverse transforms must match the sk results, and
    each cu output must have exactly the type of the object it was
    computed from. ``failure_logger`` is requested as a fixture but is
    not referenced in the body.
    """
    X_np, X = clf_dataset
    scaler_kwargs = dict(with_mean=with_mean, with_std=with_std, copy=True)

    cu_scaler = cuStandardScaler(**scaler_kwargs)
    cu_scaled = cu_scaler.fit_transform(X)
    cu_restored = cu_scaler.inverse_transform(cu_scaled)
    # Type must be preserved through both directions of the transform.
    assert type(cu_scaled) == type(X)
    assert type(cu_restored) == type(cu_scaled)

    sk_scaler = skStandardScaler(**scaler_kwargs)
    sk_scaled = sk_scaler.fit_transform(X_np)
    sk_restored = sk_scaler.inverse_transform(sk_scaled)

    assert_allclose(cu_scaled, sk_scaled)
    assert_allclose(cu_restored, sk_restored)