def test_arbitrary_encoding_automatically_find_variables(df_enc):
    """Check arbitrary encoding when no variables are passed to the encoder.

    The encoder is created with ``variables=None`` and fit-transformed on the
    full ``df_enc`` fixture. The test then verifies the init parameters, the
    fitted attributes and the transformed frame.
    """
    # test case 2: automatically select variables, unordered encoding
    enc = OrdinalEncoder(encoding_method="arbitrary", variables=None)
    result = enc.fit_transform(df_enc)

    # expected output: 20 rows, written as runs of repeated codes
    expected = df_enc.copy()
    expected["var_A"] = [0] * 6 + [1] * 10 + [2] * 4
    expected["var_B"] = [0] * 10 + [1] * 6 + [2] * 4

    # both variables are expected to share the same label -> code mapping
    label_codes = {"A": 0, "B": 1, "C": 2}

    # test init params
    assert enc.encoding_method == "arbitrary"
    assert enc.variables is None
    # test fit attr
    assert enc.variables_ == ["var_A", "var_B"]
    assert enc.encoder_dict_ == {
        "var_A": dict(label_codes),
        "var_B": dict(label_codes),
    }
    assert enc.n_features_in_ == 3
    # test transform output
    pd.testing.assert_frame_equal(result, expected)
def test_variables_cast_as_category(df_enc_category_dtypes):
    """Check ordered encoding of ``var_A`` on the category-dtype fixture.

    Only ``var_A`` is encoded. The transformed frame is compared with the
    expected values while ignoring dtypes, and the dtype of the encoded
    column is then checked separately.
    """
    data = df_enc_category_dtypes.copy()
    cols = ["var_A", "var_B"]

    enc = OrdinalEncoder(encoding_method="ordered", variables=["var_A"])
    enc.fit(data[cols], data["target"])
    result = enc.transform(data[cols])

    # expected output: 20 rows, written as runs of repeated codes
    expected = data.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # test transform output
    pd.testing.assert_frame_equal(
        result,
        expected[cols],
        check_dtype=False,
    )
    assert result["var_A"].dtypes == int
예제 #3
0
def test_ordered_encoding_1_variable(df_enc):
    """Check ordered encoding when a single variable is requested.

    The encoder is fitted on ``var_A``/``var_B`` with the ``target`` column
    as ``y``. Only ``var_A`` is listed in ``variables``, so only that column
    is changed in the expected frame.
    """
    # test case 1: 1 variable, ordered encoding
    cols = ["var_A", "var_B"]
    enc = OrdinalEncoder(encoding_method="ordered", variables=["var_A"])
    enc.fit(df_enc[cols], df_enc["target"])
    result = enc.transform(df_enc[cols])

    # expected output: 20 rows, written as runs of repeated codes
    expected = df_enc.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # test init params
    assert enc.encoding_method == "ordered"
    assert enc.variables == ["var_A"]
    # test fit attr
    assert enc.encoder_dict_ == {"var_A": {"A": 1, "B": 0, "C": 2}}
    assert enc.input_shape_ == (20, 2)
    # test transform output
    pd.testing.assert_frame_equal(result, expected[cols])
예제 #4
0
def test_error_if_input_df_contains_categories_not_present_in_training_df(
        df_enc, df_enc_rare):
    """Check that transforming unseen categories emits a ``UserWarning``.

    The encoder is fitted on ``df_enc`` and then used to transform
    ``df_enc_rare``.
    """
    # test case 4: when dataset to be transformed contains categories not present
    # in training dataset
    encoder = OrdinalEncoder(encoding_method="arbitrary")
    encoder.fit(df_enc)

    # Only transform() is inside the context manager, so a warning emitted
    # while constructing or fitting the encoder cannot satisfy the check.
    with pytest.warns(UserWarning):
        encoder.transform(df_enc_rare)
def test_ordered_encoding_1_variable_ignore_format(df_enc_numeric):
    """Check ordered encoding of one variable with ``ignore_format=True``.

    Uses the ``df_enc_numeric`` fixture. The expected mapping for ``var_A``
    has integer keys (1, 2, 3).
    """
    cols = ["var_A", "var_B"]
    enc = OrdinalEncoder(
        encoding_method="ordered",
        variables=["var_A"],
        ignore_format=True,
    )
    enc.fit(df_enc_numeric[cols], df_enc_numeric["target"])
    result = enc.transform(df_enc_numeric[cols])

    # expected output: 20 rows, written as runs of repeated codes
    expected = df_enc_numeric.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # test init params
    assert enc.encoding_method == "ordered"
    assert enc.variables == ["var_A"]
    # test fit attr
    assert enc.variables_ == ["var_A"]
    assert enc.encoder_dict_ == {"var_A": {1: 1, 2: 0, 3: 2}}
    assert enc.n_features_in_ == 2
    # test transform output
    pd.testing.assert_frame_equal(result, expected[cols])
def test_arbitrary_encoding_automatically_find_variables_ignore_format(
        df_enc_numeric):
    """Check arbitrary encoding with ``variables=None`` and ``ignore_format=True``.

    Only the ``var_A`` and ``var_B`` columns of ``df_enc_numeric`` are passed
    to the encoder. Both are expected to be encoded with the same
    integer-keyed mapping.
    """
    cols = ["var_A", "var_B"]
    enc = OrdinalEncoder(
        encoding_method="arbitrary",
        variables=None,
        ignore_format=True,
    )
    result = enc.fit_transform(df_enc_numeric[cols])

    # expected output: 20 rows, written as runs of repeated codes
    expected = df_enc_numeric[cols].copy()
    expected["var_A"] = [0] * 6 + [1] * 10 + [2] * 4
    expected["var_B"] = [0] * 10 + [1] * 6 + [2] * 4

    # both variables are expected to share the same value -> code mapping
    value_codes = {1: 0, 2: 1, 3: 2}

    # test init params
    assert enc.encoding_method == "arbitrary"
    assert enc.variables is None
    # test fit attr
    assert enc.variables_ == ["var_A", "var_B"]
    assert enc.encoder_dict_ == {
        "var_A": dict(value_codes),
        "var_B": dict(value_codes),
    }
    assert enc.n_features_in_ == 2
    # test transform output
    pd.testing.assert_frame_equal(result, expected)
def feature_engineering_ordinal_encoding(X_train, y_train, X_test):
    """Encode a fixed list of columns with an ordered ``OrdinalEncoder``.

    The encoder is fitted on ``X_train`` and ``y_train`` only and is then
    applied to both ``X_train`` and ``X_test``.

    Returns
    -------
    tuple
        ``(train_t, test_t)``: the results of ``encoder.transform`` for
        ``X_train`` and ``X_test``, in that order.
    """
    # 'hypertension' and 'heart_disease' are not part of this list
    # (they were commented out of the original list).
    columns_to_encode = [
        'gender',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
    ]

    ordinal = OrdinalEncoder(
        encoding_method='ordered',
        variables=columns_to_encode,
    )
    ordinal.fit(X_train, y_train)

    # The tuple is evaluated left to right: train first, then test.
    return ordinal.transform(X_train), ordinal.transform(X_test)
def test_error_if_input_df_contains_categories_not_present_in_training_df(
        df_enc, df_enc_rare):
    """Check warning/error behaviour for categories unseen during fit.

    With ``errors="ignore"`` the transform must emit exactly one
    ``UserWarning`` carrying the expected message. With ``errors="raise"``
    it must raise a ``ValueError`` with the same message.
    """
    # test case 4: when dataset to be transformed contains categories not present
    # in training dataset
    msg = "During the encoding, NaN values were introduced in the feature(s) var_A."
    cols = ["var_A", "var_B"]

    # check for warning when errors equals 'ignore'
    encoder = OrdinalEncoder(errors="ignore")
    encoder.fit(df_enc[cols], df_enc["target"])
    # Only transform() is inside the context manager so that setup warnings
    # can neither satisfy the check nor inflate the warning count.
    with pytest.warns(UserWarning) as record:
        encoder.transform(df_enc_rare[cols])

    # check that only one warning was raised
    assert len(record) == 1
    # check that the message matches
    assert record[0].message.args[0] == msg

    # check for error when errors equals 'raise'
    encoder = OrdinalEncoder(errors="raise")
    encoder.fit(df_enc[cols], df_enc["target"])
    # Only transform() is inside the context manager so that a ValueError
    # raised during setup cannot make the test pass.
    with pytest.raises(ValueError) as excinfo:
        encoder.transform(df_enc_rare[cols])

    # check that the error message matches
    assert str(excinfo.value) == msg
예제 #9
0
    # Load the data set from a CSV file given by a relative path.
    df = pd.read_csv("test_data.csv")
    # NOTE(review): the result of head() is discarded here; the call has no
    # effect outside an interactive session.
    df.head()

    # Rescale three columns by fixed constants
    df.price_per_size = df.price_per_size / 10000
    df.price = df.price / 1000000
    df.rent = df.rent / 1000

    # Train/test split: 'price' is the target, every other column is a feature.
    # No random_state is passed to train_test_split.
    X = df.drop(columns=['price'], axis=1)
    Y = df['price']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

    # Encoding the regions: the encoder is fitted on the training split only
    # and then applied to both splits.
    # NOTE(review): regions_df is not referenced again in the visible code.
    regions_df = np.asarray(X['region']).reshape(1, -1)
    enc = OrdinalEncoder(encoding_method='ordered', variables=['region'])
    enc.fit(X_train, y_train)
    X_train_enc = enc.transform(X_train)
    X_test_enc = enc.transform(X_test)

    # fit model on training data
    regressor = xgboost.XGBRegressor(n_estimators=100,
                                     reg_lambda=1,
                                     gamma=0,
                                     max_depth=3)
    regressor.fit(X_train_enc, y_train)

    # make predictions for test data
    y_pred = regressor.predict(X_test_enc)

    # apply the built-in round() to every predicted value
    predictions = [round(value) for value in y_pred]
def test_error_if_encoding_method_not_allowed():
    """Constructing the encoder with an unknown method must raise ``ValueError``."""
    unsupported_method = "other"
    with pytest.raises(ValueError):
        OrdinalEncoder(encoding_method=unsupported_method)
def test_error_if_ordinal_encoding_and_no_y_passed(df_enc):
    """Fitting an ordered encoder without a target must raise ``ValueError``."""
    # test case 3: raises error if target is not passed
    encoder = OrdinalEncoder(encoding_method="ordered")

    # Only fit() is inside the context manager, so a ValueError raised by the
    # constructor cannot make this test pass.
    with pytest.raises(ValueError):
        encoder.fit(df_enc)
def test_error_if_rare_labels_not_permitted_value():
    """An unsupported value for the ``errors`` argument must raise ``ValueError``."""
    invalid_errors_value = "empanada"
    with pytest.raises(ValueError):
        OrdinalEncoder(errors=invalid_errors_value)
예제 #13
0
        "mapper_garage",
        pp.Mapper(
            variables=config.model_config.garage_vars,
            mappings=config.model_config.garage_mappings,
        ),
    ),
    # == CATEGORICAL ENCODING
    (
        "rare_label_encoder",
        RareLabelEncoder(tol=0.01,
                         n_categories=1,
                         variables=config.model_config.categorical_vars),
    ),
    # encode categorical variables as integers ordered by the target mean
    (
        "categorical_encoder",
        OrdinalEncoder(
            encoding_method="ordered",
            variables=config.model_config.categorical_vars,
        ),
    ),
    ("scaler", MinMaxScaler()),
    (
        "Lasso",
        Lasso(
            alpha=config.model_config.alpha,
            random_state=config.model_config.random_state,
        ),
    ),
])
    DecisionTreeEncoder,
    MeanEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    PRatioEncoder,
    RareLabelEncoder,
    WoEEncoder,
)
from tests.estimator_checks.estimator_checks import check_feature_engine_estimator

# Encoder instances fed to test_check_estimator_from_sklearn; every one is
# created with ignore_format=True.
_estimators = [
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    # tol is 0.00000000001 and n_categories is 100000000000, written compactly
    RareLabelEncoder(
        tol=1e-11,
        n_categories=10 ** 11,
        replace_with=10,
        ignore_format=True,
    ),
    WoEEncoder(ignore_format=True),
    PRatioEncoder(ignore_format=True),
]


@pytest.mark.parametrize("estimator", _estimators)
def test_check_estimator_from_sklearn(estimator):
    """Run ``check_estimator`` on each encoder listed in ``_estimators``.

    The result of ``check_estimator`` is deliberately not returned: pytest
    expects test functions to return ``None`` and warns otherwise.
    """
    check_estimator(estimator)
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    """Call ``check`` with ``estimator`` as its only argument.

    NOTE(review): both arguments are presumably injected by the decorator
    whose opening lies above this excerpt; confirm against the full file.
    """
    check(estimator)


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    # tol is 0.00000000001 and n_categories is 100000000000, written compactly
    RareLabelEncoder(
        tol=1e-11,
        n_categories=10 ** 11,
        replace_with=10,
        ignore_format=True,
    ),
    WoEEncoder(ignore_format=True),
    PRatioEncoder(ignore_format=True),
])
def test_sklearn_compatible_encoder(estimator, check):
    """Call ``check`` with ``estimator`` for every pair the decorator provides."""
    check(estimator)


# outliers
@parametrize_with_checks([
예제 #16
0
# Random-forest pipeline: imputation -> rare-label grouping -> categorical
# encodings -> discretisation and encoding of continuous features ->
# scaling -> classifier. Step names and arguments are listed in run order.
rf_pipe = Pipeline([
    # --- imputation ---
    (
        'numeric_impute',
        MeanMedianImputer(
            imputation_method='median',
            variables=config.CONTINUOUS_FEATURES,
        ),
    ),
    (
        'categorical_impute',
        CategoricalImputer(
            imputation_method='missing',
            variables=(
                config.CATEGORICAL_FEATURES
                + config.DISCRETE_SET1_FEATURES
                + config.DISCRETE_SET2_FEATURES
                + config.DISCRETE_SET3_FEATURES
            ),
        ),
    ),
    # --- rare label grouping (same variable list as the categorical imputer) ---
    (
        'rare_label_encode',
        RareLabelEncoder(
            tol=0.02,
            n_categories=10,
            variables=(
                config.CATEGORICAL_FEATURES
                + config.DISCRETE_SET1_FEATURES
                + config.DISCRETE_SET2_FEATURES
                + config.DISCRETE_SET3_FEATURES
            ),
            replace_with='Rare',
        ),
    ),
    # --- categorical encoding: a different encoder per feature group ---
    (
        'categorical_encode1',
        OrdinalEncoder(
            encoding_method='arbitrary',
            variables=(
                config.CATEGORICAL_FEATURES
                + config.DISCRETE_SET2_FEATURES
            ),
        ),
    ),
    (
        'categorical_encode2',
        OrdinalEncoder(
            encoding_method='ordered',
            variables=config.DISCRETE_SET1_FEATURES,
        ),
    ),
    (
        'categorical_encode3',
        CountFrequencyEncoder(
            encoding_method='count',
            variables=config.DISCRETE_SET3_FEATURES,
        ),
    ),
    # --- continuous features: discretise, then encode the resulting bins ---
    (
        'continuous_discretization',
        EqualFrequencyDiscretiser(
            q=20,
            variables=config.CONTINUOUS_FEATURES,
            return_object=True,
        ),
    ),
    (
        'continuous_encoding',
        OrdinalEncoder(
            encoding_method='ordered',
            variables=config.CONTINUOUS_FEATURES,
        ),
    ),
    # --- scaling and final estimator ---
    ('scaling', StandardScaler()),
    (
        'clf',
        RandomForestClassifier(
            criterion='gini',
            max_depth=10,
            min_samples_split=10,
            random_state=0,
        ),
    ),
])
def test_transform_raises_error_if_df_contains_na(df_enc, df_enc_na):
    """``transform`` must raise ``ValueError`` when given ``df_enc_na``.

    The encoder is fitted on ``df_enc`` first.
    """
    # test case 4: when dataset contains na, transform method
    encoder = OrdinalEncoder(encoding_method="arbitrary")
    encoder.fit(df_enc)

    # Only transform() is inside the context manager, so a ValueError raised
    # while constructing or fitting the encoder cannot make this test pass.
    with pytest.raises(ValueError):
        encoder.transform(df_enc_na)
예제 #18
0
def test_non_fitted_error(df_enc):
    """Calling ``transform`` before ``fit`` must raise ``NotFittedError``."""
    encoder = OrdinalEncoder()

    # Only transform() is inside the context manager, so the exception can
    # only come from using the unfitted encoder.
    with pytest.raises(NotFittedError):
        encoder.transform(df_enc)