示例#1
0
def test_variables_cast_as_category(df_enc_category_dtypes):
    """Encode ``var_A`` from the category-dtype fixture and check the result.

    Only ``var_A`` is passed to the encoder; the transformed frame is compared
    against the expected values (ignoring dtypes) and ``var_A`` must come out
    as float.
    """
    data = df_enc_category_dtypes.copy()
    features = ["var_A", "var_B"]

    encoder = MeanEncoder(variables=["var_A"])
    encoder.fit(data[features], data["target"])
    X = encoder.transform(data[features])

    # expected output: 6 rows at 1/3, then 10 rows at 0.2, then 4 rows at 0.5
    expected = data.copy()
    expected["var_A"] = [0.3333333333333333] * 6 + [0.2] * 10 + [0.5] * 4

    pd.testing.assert_frame_equal(X, expected[features], check_dtype=False)
    assert X["var_A"].dtypes == float
示例#2
0
    def _make_combined_pipeline(self):
        """
        Chain the discretiser from ``_make_discretiser`` with two mean
        encoders: one over ``variables_numerical_`` and one over
        ``variables_categorical_``. Both encoders use ``errors="raise"``.
        """
        numerical_encoder = MeanEncoder(
            variables=self.variables_numerical_, errors="raise"
        )
        categorical_encoder = MeanEncoder(
            variables=self.variables_categorical_, errors="raise"
        )

        steps = [
            ("discretiser", self._make_discretiser()),
            ("encoder_num", numerical_encoder),
            ("encoder_cat", categorical_encoder),
        ]
        return Pipeline(steps)
示例#3
0
def encode_mean_labels(var_list, X_train, y_train, X_test, X_val=None, 
                       file_path='../models/transformers/mean_enc/', 
                       file_name='mean_enc', file_suffix=''):
    
    """
    Encode labels of categorical features using each label's mean target value 
    in the training set, test set, and optionally the validation set. This 
    function uses feature_engine's MeanEncoder to encode the labels. The 
    encoder is fitted on the training set only and is saved to the specified 
    path; the output directory is created if it does not exist yet.
    
    Parameters
    ----------
    var_list : list[str]
        Categorical features to encode
    X_train : pandas.core.frame.DataFrame
        Training set feature variables
    y_train : pandas.core.series.Series
        Training set target variables
    X_test : pandas.core.frame.DataFrame
        Test set feature variables
    X_val : pandas.core.frame.DataFrame, optional
        Validation set feature variables, by default None
    file_path : str, optional
        Output directory path, by default "../models/transformers/mean_enc/"
    file_name : str, optional
        Output file name, by default "mean_enc"
    file_suffix : str, optional
        File name suffix that goes before the file extension, by default an 
        empty string
    
    Returns
    -------
    pandas.core.frame.DataFrame
        Transformed train set
    pandas.core.frame.DataFrame or None
        Transformed validation set, or None when ``X_val`` was not supplied
    pandas.core.frame.DataFrame
        Transformed test set
    dict
        Mapping of original to encoded values
    """
    
    enc = MeanEncoder(variables=var_list).fit(X_train, y_train)

    # Make sure the target directory exists so the dump below does not fail
    # on a fresh checkout. An empty path means "current directory" and needs
    # no creation (os.makedirs('') would raise).
    if file_path:
        os.makedirs(file_path, exist_ok=True)
    joblib.dump(enc, os.path.join(file_path, file_name + file_suffix + '.pkl'))

    X_train = enc.transform(X_train)
    X_test = enc.transform(X_test)
    if X_val is not None:
        X_val = enc.transform(X_val)
    return X_train, X_val, X_test, enc.encoder_dict_
示例#4
0
    def _make_categorical_pipeline(self):
        """
        Build the target mean encoder over ``variables_categorical_``. Used
        when all variables are categorical.
        """
        return MeanEncoder(variables=self.variables_categorical_, errors="raise")
示例#5
0
def test_user_enters_1_variable(df_enc):
    """Fit and transform with a single user-selected variable (``var_A``)."""
    # test case 1: 1 variable
    features = ["var_A", "var_B"]
    encoder = MeanEncoder(variables=["var_A"])
    encoder.fit(df_enc[features], df_enc["target"])
    X = encoder.transform(df_enc[features])

    # expected output: 6 rows at 1/3, then 10 rows at 0.2, then 4 rows at 0.5
    expected = df_enc.copy()
    expected["var_A"] = [0.3333333333333333] * 6 + [0.2] * 10 + [0.5] * 4

    expected_mapping = {
        "var_A": {"A": 0.3333333333333333, "B": 0.2, "C": 0.5},
    }

    # test init params
    assert encoder.variables == ["var_A"]
    # test fit attr
    assert encoder.variables_ == ["var_A"]
    assert encoder.encoder_dict_ == expected_mapping
    assert encoder.n_features_in_ == 2
    # test transform output
    pd.testing.assert_frame_equal(X, expected[features])
def test_warning_if_transform_df_contains_categories_not_present_in_fit_df(
        df_enc, df_enc_rare):
    """Transforming data with categories unseen during fit must warn."""
    # test case 4: when dataset to be transformed contains categories not present
    # in training dataset
    encoder = MeanEncoder()
    encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"])

    # Only the transform call sits inside pytest.warns, so a warning emitted
    # while constructing or fitting the encoder cannot satisfy the test.
    with pytest.warns(UserWarning):
        encoder.transform(df_enc_rare[["var_A", "var_B"]])
    def _make_combined_pipeline(self):
        """
        Chain a discretiser with two mean encoders.

        ``self.strategy == "equal_width"`` selects EqualWidthDiscretiser
        (``bins=self.bins``); any other value selects
        EqualFrequencyDiscretiser (``q=self.bins``). The encoders cover
        ``variables_numerical_`` and ``variables_categorical_`` respectively.
        """
        use_equal_width = self.strategy == "equal_width"
        if use_equal_width:
            binner = EqualWidthDiscretiser(
                bins=self.bins,
                variables=self.variables_numerical_,
                return_object=True,
            )
        else:
            binner = EqualFrequencyDiscretiser(
                q=self.bins,
                variables=self.variables_numerical_,
                return_object=True,
            )

        numerical_encoder = MeanEncoder(variables=self.variables_numerical_)
        categorical_encoder = MeanEncoder(variables=self.variables_categorical_)

        steps = [
            ("discretization", binner),
            ("encoder_num", numerical_encoder),
            ("encoder_cat", categorical_encoder),
        ]
        return Pipeline(steps)
示例#8
0
    def _make_numerical_pipeline(self):
        """
        Build the pipeline for a dataframe made up only of numerical
        variables: the discretiser from ``_make_discretiser`` followed by a
        mean encoder over ``variables_numerical_``.
        """
        mean_encoder = MeanEncoder(
            variables=self.variables_numerical_, errors="raise"
        )

        steps = [
            ("discretiser", self._make_discretiser()),
            ("encoder", mean_encoder),
        ]
        return Pipeline(steps)
    def _make_categorical_pipeline(self):
        """Return a MeanEncoder over ``self.variables_categorical_``."""
        categorical_encoder = MeanEncoder(variables=self.variables_categorical_)
        return categorical_encoder
示例#10
0
def test_automatically_find_variables(df_enc):
    """Fit and transform with ``variables=None`` so both columns are encoded."""
    # test case 2: automatically select variables
    features = ["var_A", "var_B"]
    encoder = MeanEncoder(variables=None)
    encoder.fit(df_enc[features], df_enc["target"])
    X = encoder.transform(df_enc[features])

    third = 0.3333333333333333

    # expected output
    expected = df_enc.copy()
    # var_A: 6 rows at 1/3, then 10 rows at 0.2, then 4 rows at 0.5
    expected["var_A"] = [third] * 6 + [0.2] * 10 + [0.5] * 4
    # var_B: 10 rows at 0.2, then 6 rows at 1/3, then 4 rows at 0.5
    expected["var_B"] = [0.2] * 10 + [third] * 6 + [0.5] * 4

    expected_mapping = {
        "var_A": {"A": third, "B": 0.2, "C": 0.5},
        "var_B": {"A": 0.2, "B": third, "C": 0.5},
    }

    # test init params
    assert encoder.variables is None
    # test fit attr
    assert encoder.variables_ == ["var_A", "var_B"]
    assert encoder.encoder_dict_ == expected_mapping
    assert encoder.n_features_in_ == 2
    # test transform output
    pd.testing.assert_frame_equal(X, expected[features])
示例#11
0
def test_error_if_rare_labels_not_permitted_value():
    """Constructing the encoder with an unsupported ``errors`` value raises."""
    invalid_errors_value = "empanada"
    with pytest.raises(ValueError):
        MeanEncoder(errors=invalid_errors_value)
示例#12
0
def test_transform_raises_error_if_df_contains_na(df_enc, df_enc_na):
    """Transforming a dataframe that contains NA must raise ValueError."""
    # test case 4: when dataset contains na, transform method
    encoder = MeanEncoder()
    encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"])

    # Only transform sits inside pytest.raises: a ValueError raised while
    # constructing or fitting the encoder must fail the test, not pass it.
    with pytest.raises(ValueError):
        encoder.transform(df_enc_na)
示例#13
0
def test_warning_if_transform_df_contains_categories_not_present_in_fit_df(
        df_enc, df_enc_rare):
    """Unseen categories at transform time warn or raise depending on ``errors``.

    With ``errors="ignore"`` exactly one UserWarning carrying the expected
    message must be emitted by ``transform``; with ``errors="raise"`` a
    ValueError with the same message must be raised by ``transform``.
    """
    # test case 4: when dataset to be transformed contains categories not present
    # in training dataset

    msg = "During the encoding, NaN values were introduced in the feature(s) var_A."

    # check for warning when errors equals 'ignore'
    encoder = MeanEncoder(errors="ignore")
    encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"])
    # Only transform is recorded, so warnings from construction or fit can
    # neither satisfy the check nor inflate the warning count.
    with pytest.warns(UserWarning) as record:
        encoder.transform(df_enc_rare[["var_A", "var_B"]])

    # check that only one warning was raised
    assert len(record) == 1
    # check that the message matches
    assert record[0].message.args[0] == msg

    # check for error when errors equals 'raise'
    encoder = MeanEncoder(errors="raise")
    encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"])
    # Only transform may raise here; a ValueError from fit must fail the test.
    with pytest.raises(ValueError) as exc_info:
        encoder.transform(df_enc_rare[["var_A", "var_B"]])

    # check that the error message matches
    assert str(exc_info.value) == msg
    ArbitraryNumberImputer(),
    CategoricalImputer(fill_value=0, ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    check(estimator)


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    RareLabelEncoder(
        tol=1e-11,
        n_categories=100000000000,
        replace_with=10,
        ignore_format=True,
    ),
    WoEEncoder(ignore_format=True),
    PRatioEncoder(ignore_format=True),
])
def test_sklearn_compatible_encoder(estimator, check):
    """Run each parametrized check against each encoder instance."""
    check(estimator)

def test_error_if_y_not_passed_to_fit(df_enc):
    """Calling ``fit`` without a target must raise TypeError."""
    # test case 3: raises error if target is not passed
    encoder = MeanEncoder()

    # Only fit sits inside pytest.raises, so a TypeError raised by the
    # constructor cannot make the test pass for the wrong reason.
    with pytest.raises(TypeError):
        encoder.fit(df_enc)