def test_robust_imputer_categorical_custom_function():
    """Constant imputation on object-dtype data driven by a custom mask.

    The mask function flags entries equal to "hot dog"; after fitting and
    transforming `X_impute_categorical` with the fill value "not hot dog",
    the result must equal `X_imputed_categorical`.
    """

    def is_hot_dog(x):
        return x == "hot dog"

    imputer = RobustImputer(
        mask_function=is_hot_dog,
        fill_values="not hot dog",
        strategy="constant",
        dtype=np.dtype("O"),
    )
    imputer.fit(X_impute_categorical)

    assert_array_equal(imputer.transform(X_impute_categorical), X_imputed_categorical)
def test_robust_imputer_transform_dim_error():
    """Check that `transform` rejects a (3, 4) array after fitting on `X_impute`.

    The call must raise a ValueError whose message matches
    `transform_error_msg`.
    """
    # Construct and fit outside the `raises` block: only `transform` is allowed
    # to produce the expected error. If a matching ValueError came from
    # construction or fitting, the test would otherwise pass for the wrong reason.
    robust_imputer = RobustImputer()
    robust_imputer.fit(X_impute)

    with pytest.raises(ValueError, match=transform_error_msg):
        robust_imputer.transform(np.zeros((3, 4)))
def test_robust_imputer(X, X_expected, strategy, fill_values):
    """Fit a `RobustImputer` on `X` and compare its transform with `X_expected`.

    `strategy` and `fill_values` are forwarded to the imputer unchanged.
    """
    imputer = RobustImputer(fill_values=fill_values, strategy=strategy)
    imputer.fit(X)

    assert_array_equal(imputer.transform(X), X_expected)
# Example #4
# 0
class NALabelEncoder(BaseEstimator, TransformerMixin):
    """Encoder for transforming labels to NA values.

    Uses `RobustImputer` on 1D inputs of labels
    - Uses `is_finite_numeric` mask for encoding by default
    - Only uses the `RobustImputer` strategy `constant` and fills using `np.nan`
    - Default behavior encodes non-float and non-finite values as nan values in
      the target column of a given regression dataset

    Parameters
    ----------
    mask_function : callable -> np.array, dtype('bool') (default=None)
        A vectorized python function, accepts np.array, returns np.array
        with dtype('bool')

        For each value, if mask_function(val) == False, that value will
        be imputed. mask_function is used to create a boolean mask that
        determines which values in the input to impute.

        Use np.vectorize to vectorize singular python functions.

    Attributes
    ----------
    model_ : RobustImputer
        The underlying imputer, created and fitted by `fit`.
    """

    def __init__(self, mask_function=None):
        self.mask_function = mask_function

    @staticmethod
    def _as_column(y):
        """Return `y` as a 2D array with a single column.

        `np.asarray` is applied first so that array-likes without a
        `reshape` method (e.g. Python lists) are accepted; ndarray inputs
        pass through without being copied.
        """
        return np.asarray(y).reshape(-1, 1)

    def fit(self, y):
        """Fit the encoder on y.

        Parameters
        ----------
        y : {array-like}, shape (n_samples,)
            Input column, where `n_samples` is the number of samples.

        Returns
        -------
        self : NALabelEncoder
        """
        self.model_ = RobustImputer(
            strategy="constant",
            fill_values=np.nan,
            mask_function=self.mask_function,
        )
        # The imputer is fed a single-column 2D array built from the labels.
        self.model_.fit(X=self._as_column(y))
        return self

    def transform(self, y):
        """Encode all non-float and non-finite values in y as NA values.

        Parameters
        ----------
        y : {array-like}, shape (n_samples)
            The input column to encode.

        Returns
        -------
        yt : {ndarray}, shape (n_samples,)
            The encoded input column.
        """
        check_is_fitted(self, "model_")
        # Reshape to a column for the imputer, then flatten back to 1D.
        return self.model_.transform(self._as_column(y)).flatten()

    def inverse_transform(self, y):
        """Return the input column unchanged; the encoding is not reversed."""
        return y

    def _more_tags(self):
        # Declares that this estimator operates on 1D label inputs
        # (presumably read by scikit-learn's estimator-tag machinery).
        return {"X_types": ["1dlabels"]}