def test_column_transformer(clf_dataset, remainder,  # noqa: F811
                            transformer_weights):
    """Check that `cuColumnTransformer` reproduces `skColumnTransformer`.

    A standard scaler is applied to columns 0 and 2 and a normalizer to
    columns 1 and 3. The outputs of both `fit_transform` and `transform`
    must be close to the `skColumnTransformer` result, and `transform` must
    hand back the same container type it was given.
    """
    sk_input, cu_input = clf_dataset

    scaler_cols = [0, 2]
    normalizer_cols = [1, 3]

    # DataFrame inputs are addressed by label ('c<index>') rather than by
    # position; every other input type reuses the positional indices.
    by_label = isinstance(cu_input, (pdDataFrame, cuDataFrame))
    cu_scaler_cols = (
        [f'c{idx}' for idx in scaler_cols] if by_label else scaler_cols
    )
    cu_normalizer_cols = (
        [f'c{idx}' for idx in normalizer_cols] if by_label
        else normalizer_cols
    )

    cu_column_transformer = cuColumnTransformer(
        [
            ("scaler", cuStandardScaler(), cu_scaler_cols),
            ("normalizer", cuNormalizer(), cu_normalizer_cols),
        ],
        remainder=remainder,
        transformer_weights=transformer_weights,
    )
    cu_fit_out = cu_column_transformer.fit_transform(cu_input)
    cu_out = cu_column_transformer.transform(cu_input)
    # The output container must mirror the input container.
    assert type(cu_out) == type(cu_input)

    sk_column_transformer = skColumnTransformer(
        [
            ("scaler", skStandardScaler(), scaler_cols),
            ("normalizer", skNormalizer(), normalizer_cols),
        ],
        remainder=remainder,
        transformer_weights=transformer_weights,
    )
    expected = sk_column_transformer.fit_transform(sk_input)

    assert_allclose(cu_fit_out, expected)
    assert_allclose(cu_out, expected)
def test_make_column_selector():
    """Check `cu_make_column_selector` against `sk_make_column_selector`.

    Columns of a small mixed-type frame are routed to a one-hot encoder, a
    standard scaler and a normalizer through dtype- and pattern-based
    selectors; the `cu*` result must be close to the `sk*` result and keep
    the type of its input.
    """
    pandas_frame = pdDataFrame({
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'rating': [5, 3, 4, 5],
        'temperature': [21., 21., 24., 28.]
    })
    cudf_frame = cudf.from_pandas(pandas_frame)

    def _build_steps(one_hot, scaler, normalizer, selector):
        # Same three (name, estimator, selector) steps for either library.
        return [
            ("ohe", one_hot(), selector(dtype_exclude=np.number)),
            ("scaler", scaler(), selector(dtype_include=np.integer)),
            ("normalizer", normalizer(), selector(pattern="temp")),
        ]

    cu_steps = _build_steps(cuOneHotEncoder, cuStandardScaler,
                            cuNormalizer, cu_make_column_selector)
    cu_column_transformer = cuColumnTransformer(cu_steps, remainder='drop')
    cu_out = cu_column_transformer.fit_transform(cudf_frame)

    sk_steps = _build_steps(skOneHotEncoder, skStandardScaler,
                            skNormalizer, sk_make_column_selector)
    sk_column_transformer = skColumnTransformer(sk_steps, remainder='drop')
    expected = sk_column_transformer.fit_transform(pandas_frame)

    assert_allclose(cu_out, expected)
    assert type(cu_out) == type(cudf_frame)
def test_column_transformer_get_feature_names(clf_dataset):  # noqa: F811
    """Check that both ColumnTransformers report identical feature names.

    A polynomial-features step is fitted on columns 0 and 2 with each
    implementation, and the `get_feature_names()` results are compared for
    equality.
    """
    sk_input, cu_input = clf_dataset
    selected_cols = [0, 2]

    cu_column_transformer = cuColumnTransformer(
        [("PolynomialFeatures", cuPolynomialFeatures(), selected_cols)]
    )
    # Fitting is required before feature names can be queried; the
    # transformed data itself is not needed here.
    cu_column_transformer.fit_transform(cu_input)
    cu_names = cu_column_transformer.get_feature_names()

    sk_column_transformer = skColumnTransformer(
        [("PolynomialFeatures", skPolynomialFeatures(), selected_cols)]
    )
    sk_column_transformer.fit_transform(sk_input)
    sk_names = sk_column_transformer.get_feature_names()

    assert cu_names == sk_names
def test_column_transformer_named_transformers_(clf_dataset):  # noqa: F811
    """Check that both ColumnTransformers expose the same transformer names.

    After fitting a polynomial-features step on columns 0 and 2 with each
    implementation, the keys of `named_transformers_` must be equal.
    """
    sk_input, cu_input = clf_dataset
    selected_cols = [0, 2]

    cu_column_transformer = cuColumnTransformer(
        [("PolynomialFeatures", cuPolynomialFeatures(), selected_cols)]
    )
    # `named_transformers_` is read only after fitting.
    cu_column_transformer.fit_transform(cu_input)
    cu_fitted_steps = cu_column_transformer.named_transformers_

    sk_column_transformer = skColumnTransformer(
        [("PolynomialFeatures", skPolynomialFeatures(), selected_cols)]
    )
    sk_column_transformer.fit_transform(sk_input)
    sk_fitted_steps = sk_column_transformer.named_transformers_

    # Only the step names are compared, not the fitted estimators.
    assert cu_fitted_steps.keys() == sk_fitted_steps.keys()
def test_column_transformer_sparse(sparse_clf_dataset, remainder,  # noqa: F811
                                   transformer_weights, sparse_threshold):
    """Check `cuColumnTransformer` against `skColumnTransformer` on sparse input.

    A scaler (without centering) is applied to columns 0 and 2 and a
    normalizer to columns 1 and 3. The outputs of `fit_transform` and
    `transform` must be close to the `skColumnTransformer` result. When the
    input density is below `sparse_threshold`, the output is additionally
    required to have the same type as the input.

    Inputs in 'csc' format are marked as expected failures.
    """
    X_np, X = sparse_clf_dataset

    if X.format == 'csc':
        pytest.xfail()

    # Density = stored entries / total number of cells. `X.size` must not be
    # used as the denominator: on SciPy-style sparse matrices it is the
    # number of stored values, which makes the ratio always 1 and leaves the
    # type assertion below unreachable.
    # NOTE(review): assumes X follows the SciPy sparse API (`nnz`, `shape`,
    # `format`) - confirm for every container the fixture can yield.
    n_rows, n_cols = X.shape
    dataset_density = X.nnz / (n_rows * n_cols)

    cu_transformers = [
        ("scaler", cuStandardScaler(with_mean=False), [0, 2]),
        ("normalizer", cuNormalizer(), [1, 3])
    ]
    transformer = cuColumnTransformer(cu_transformers,
                                      remainder=remainder,
                                      transformer_weights=transformer_weights,
                                      sparse_threshold=sparse_threshold)
    ft_X = transformer.fit_transform(X)
    t_X = transformer.transform(X)
    if dataset_density < sparse_threshold:
        # Sparse input -> sparse output if dataset_density < sparse_threshold,
        # otherwise sparse input -> dense output.
        # NOTE(review): the threshold is applied to the stacked output, so
        # this relies on the selected columns having roughly the same
        # density as the whole input - confirm against the fixture.
        assert type(t_X) == type(X)

    sk_transformers = [
        ("scaler", skStandardScaler(with_mean=False), [0, 2]),
        ("normalizer", skNormalizer(), [1, 3])
    ]
    transformer = skColumnTransformer(sk_transformers,
                                      remainder=remainder,
                                      transformer_weights=transformer_weights,
                                      sparse_threshold=sparse_threshold)
    sk_t_X = transformer.fit_transform(X_np)

    assert_allclose(ft_X, sk_t_X)
    assert_allclose(t_X, sk_t_X)