def test_robust_imputer_categorical_custom_function():
    """Exercise ``RobustImputer`` on object data with a caller-supplied mask.

    The imputer is configured with an object dtype, the ``constant`` strategy,
    the fill value ``"not hot dog"`` and a mask function that compares each
    entry against ``"hot dog"``. After fitting on ``X_impute_categorical`` the
    transformed output must equal ``X_imputed_categorical``.
    """

    def matches_hot_dog(values):
        # Element-wise comparison; used by the imputer to build its mask.
        return values == "hot dog"

    imputer = RobustImputer(
        dtype=np.dtype("O"),
        strategy="constant",
        fill_values="not hot dog",
        mask_function=matches_hot_dog,
    )
    imputer.fit(X_impute_categorical)

    assert_array_equal(imputer.transform(X_impute_categorical), X_imputed_categorical)
def test_robust_imputer_transform_dim_error():
    """Check that ``transform`` raises on a ``(3, 4)`` input after fitting on ``X_impute``.

    A default ``RobustImputer`` is fitted on ``X_impute`` and then asked to
    transform a ``(3, 4)`` array of zeros; that call must raise a
    ``ValueError`` whose message matches ``transform_error_msg``.
    """
    robust_imputer = RobustImputer()
    robust_imputer.fit(X_impute)

    # Keep construction and fitting outside the ``raises`` context: only the
    # ``transform`` call is expected to fail, and a matching ValueError raised
    # during setup must not let the test pass by accident.
    with pytest.raises(ValueError, match=transform_error_msg):
        robust_imputer.transform(np.zeros((3, 4)))
def test_robust_imputer(X, X_expected, strategy, fill_values):
    """Fit and transform ``X`` with ``RobustImputer`` and compare to ``X_expected``.

    Parameters
    ----------
    X : input passed to both ``fit`` and ``transform``.
    X_expected : expected result of transforming ``X``.
    strategy : forwarded to ``RobustImputer(strategy=...)``.
    fill_values : forwarded to ``RobustImputer(fill_values=...)``.
    """
    imputer = RobustImputer(strategy=strategy, fill_values=fill_values)
    imputer.fit(X)

    assert_array_equal(imputer.transform(X), X_expected)
class NALabelEncoder(BaseEstimator, TransformerMixin):
    """Encoder for transforming labels to NA values.

    Uses `RobustImputer` on 1D inputs of labels

    - Uses `is_finite_numeric` mask for encoding by default
    - Only uses the `RobustImputer` strategy `constant` and fills using `np.nan`
    - Default behavior encodes non-float and non-finite values as nan values in
      the target column of a given regression dataset

    Parameters
    ----------
    mask_function : callable -> np.array, dtype('bool') (default=None)
        A vectorized python function, accepts np.array, returns np.array
        with dtype('bool').

        For each value, if mask_function(val) == False, that value will
        be imputed. mask_function is used to create a boolean mask that determines
        which values in the input to impute.

        Use np.vectorize to vectorize singular python functions.
    """

    def __init__(self, mask_function=None):
        self.mask_function = mask_function

    @staticmethod
    def _as_column(y):
        """Return ``y`` reshaped to a single column of shape (n_samples, 1)."""
        # Inputs that already expose ``reshape`` (e.g. ndarrays) are used
        # unchanged so existing behaviour is preserved; other array-likes such
        # as lists or tuples are converted to an ndarray first, since they have
        # no ``reshape`` method of their own.
        if not hasattr(y, "reshape"):
            y = np.asarray(y)
        return y.reshape(-1, 1)

    def fit(self, y):
        """Fit the encoder on y.

        Parameters
        ----------
        y : {array-like}, shape (n_samples,)
            Input column, where `n_samples` is the number of samples.

        Returns
        -------
        self : NALabelEncoder
        """
        # A fresh imputer is built on every fit; it always uses the constant
        # strategy with np.nan as the fill value.
        self.model_ = RobustImputer(strategy="constant", fill_values=np.nan, mask_function=self.mask_function)
        # The imputer is fitted on a single column, so reshape the 1D labels.
        self.model_.fit(X=self._as_column(y))
        return self

    def transform(self, y):
        """Encode all non-float and non-finite values in y as NA values.

        Parameters
        ----------
        y : {array-like}, shape (n_samples)
            The input column to encode.

        Returns
        -------
        yt : {ndarray}, shape (n_samples,)
            The encoded input column.
        """
        check_is_fitted(self, "model_")
        # Transform as a single column, then flatten back to one dimension.
        return self.model_.transform(self._as_column(y)).flatten()

    def inverse_transform(self, y):
        """Return the input column unchanged."""
        return y

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}