def test_encode_numerical_variables(df_enc_numeric):
    """With ignore_format=True the encoder must also dummify numeric columns."""
    encoder = OneHotEncoder(
        top_categories=None,
        variables=None,
        drop_last=False,
        ignore_format=True,
    )

    X = encoder.fit_transform(df_enc_numeric[["var_A", "var_B"]])

    # expected dummy columns, written as run lengths of each category
    expected = pd.DataFrame(
        {
            "var_A_1": [1] * 6 + [0] * 14,
            "var_A_2": [0] * 6 + [1] * 10 + [0] * 4,
            "var_A_3": [0] * 16 + [1] * 4,
            "var_B_1": [1] * 10 + [0] * 10,
            "var_B_2": [0] * 10 + [1] * 6 + [0] * 4,
            "var_B_3": [0] * 16 + [1] * 4,
        }
    ).astype("int32")

    # fitted attributes
    assert encoder.variables_ == ["var_A", "var_B"]
    assert encoder.n_features_in_ == 2
    # transform output
    pd.testing.assert_frame_equal(pd.DataFrame(X).astype("int32"), expected)
def test_encode_into_kminus1_binary_plus_drop_binary(df_enc_binary):
    """drop_last + drop_last_binary: k-1 dummies, a single dummy for binaries."""
    encoder = OneHotEncoder(
        top_categories=None,
        variables=None,
        drop_last=True,
        drop_last_binary=True,
    )
    X = encoder.fit_transform(df_enc_binary)

    # expected output, written as run lengths of each dummy column
    expected = pd.DataFrame(
        {
            "target": [1] * 2 + [0] * 8 + [1] * 2 + [0] * 4 + [1] * 2 + [0] * 2,
            "var_A_A": [1] * 6 + [0] * 14,
            "var_A_B": [0] * 6 + [1] * 10 + [0] * 4,
            "var_B_A": [1] * 10 + [0] * 10,
            "var_B_B": [0] * 10 + [1] * 6 + [0] * 4,
            "var_C_A": [1] * 10 + [0] * 10,
        }
    ).astype("int32")

    # fitted attributes
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.n_features_in_ == 4
    # transform output: second binary dummy must have been dropped
    pd.testing.assert_frame_equal(pd.DataFrame(X).astype("int32"), expected)
    assert "var_C_B" not in X.columns
def test_encode_top_categories(df_enc_big):
    """Only the 4 most frequent categories of each variable are encoded."""
    encoder = OneHotEncoder(top_categories=4, variables=None, drop_last=False)
    X = encoder.fit_transform(df_enc_big)

    # init params
    assert encoder.top_categories == 4
    # fitted attributes
    assert encoder.n_features_in_ == 3

    # expected number of observations per retained dummy column
    expected_counts = {
        "var_A_D": 10, "var_A_B": 10, "var_A_A": 6, "var_A_G": 6,
        "var_B_A": 10, "var_B_D": 10, "var_B_G": 6, "var_B_B": 6,
        "var_C_D": 10, "var_C_C": 10, "var_C_G": 6, "var_C_B": 6,
    }
    for col, total in expected_counts.items():
        assert X[col].sum() == total

    # neither the raw column nor dummies for infrequent categories remain
    assert "var_B" not in X.columns
    assert "var_B_F" not in X.columns
def test_encode_categories_in_k_minus_1_binary_plus_list_of_variables(df_enc_big):
    """k-1 dummies per variable when drop_last=True and variables are listed."""
    encoder = OneHotEncoder(
        top_categories=None, variables=["var_A", "var_B"], drop_last=True
    )
    X = encoder.fit_transform(df_enc_big)

    # init params
    assert encoder.top_categories is None
    assert encoder.variables == ["var_A", "var_B"]
    assert encoder.drop_last is True

    # fitted attributes
    assert encoder.input_shape_ == (40, 3)

    # expected number of observations per dummy column
    expected_counts = {
        "var_A_A": 6, "var_A_B": 10, "var_A_C": 4,
        "var_A_D": 10, "var_A_E": 2, "var_A_F": 2,
        "var_B_A": 10, "var_B_B": 6, "var_B_C": 4,
        "var_B_D": 10, "var_B_E": 2, "var_B_F": 2,
    }
    for col, total in expected_counts.items():
        assert X[col].sum() == total

    # last category is dropped; the unlisted variable passes through untouched
    assert "var_B" not in X.columns
    assert "var_B_G" not in X.columns
    assert "var_C" in X.columns
# Example 5
def test_raises_error_if_df_contains_na(df_enc_big, df_enc_big_na):
    """NaN in the input must raise ValueError in both fit and transform."""
    # case 1: fitting on data that contains missing values
    encoder = OneHotEncoder()
    with pytest.raises(ValueError):
        encoder.fit(df_enc_big_na)

    # case 2: transforming data with missing values after fitting on clean data
    encoder = OneHotEncoder()
    encoder.fit(df_enc_big)
    with pytest.raises(ValueError):
        encoder.transform(df_enc_big_na)
# Example 6
def test_encode_categories_in_k_binary_plus_select_vars_automatically(
        df_enc_big):
    """k dummies per category; categorical variables detected automatically."""
    encoder = OneHotEncoder(top_categories=None,
                            variables=None,
                            drop_last=False)
    X = encoder.fit_transform(df_enc_big)

    # init params
    assert encoder.top_categories is None
    assert encoder.variables is None
    assert encoder.drop_last is False

    # expected observations per dummy, expanded from per-variable count tuples
    expected = {}
    for var, counts in {
        "var_A": (6, 10, 4, 10, 2, 2, 6),
        "var_B": (10, 6, 4, 10, 2, 2, 6),
        "var_C": (4, 6, 10, 10, 2, 2, 6),
    }.items():
        for letter, n in zip("ABCDEFG", counts):
            expected[f"{var}_{letter}"] = n

    # fitted attributes
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.variables_binary_ == []
    assert encoder.n_features_in_ == 3
    assert encoder.encoder_dict_ == {
        "var_A": ["A", "B", "C", "D", "E", "F", "G"],
        "var_B": ["A", "B", "C", "D", "E", "F", "G"],
        "var_C": ["A", "B", "C", "D", "E", "F", "G"],
    }
    # transform output: original columns replaced by their dummies
    assert X.sum().to_dict() == expected
    assert "var_A" not in X.columns
# Example 7
def test_get_feature_names_out_from_pipeline(df_enc_binary):
    """get_feature_names_out works when the encoder sits inside a Pipeline."""
    pipe = Pipeline([("transformer", OneHotEncoder())])
    pipe.fit(df_enc_binary)

    input_features = ["var_A", "var_B", "var_C", "var_D"]
    dummies = [
        "var_A_A",
        "var_A_B",
        "var_A_C",
        "var_B_A",
        "var_B_B",
        "var_B_C",
        "var_C_AHA",
        "var_C_UHU",
        "var_D_OHO",
        "var_D_EHE",
    ]

    # None -> untouched numeric column first, then every dummy
    assert pipe.get_feature_names_out(
        input_features=None) == ["var_num"] + dummies
    # explicit feature lists -> dummies for those features only, in order
    assert pipe.get_feature_names_out(input_features=input_features) == dummies
    assert pipe.get_feature_names_out(
        input_features=input_features[:2]) == dummies[:6]
    assert pipe.get_feature_names_out(
        input_features=[input_features[0]]) == dummies[:3]
# Example 8
def test_encode_top_categories():
    """With top_categories=4 only the 4 most frequent labels get dummies."""

    def _expand(spec):
        # expand [(label, count), ...] into a flat list of repeated labels
        column = []
        for label, count in spec:
            column.extend([label] * count)
        return column

    df = pd.DataFrame({
        "var_A": _expand([("A", 5), ("B", 11), ("C", 4), ("D", 9),
                          ("E", 2), ("F", 2), ("G", 7)]),
        "var_B": _expand([("A", 11), ("B", 7), ("C", 4), ("D", 9),
                          ("E", 2), ("F", 2), ("G", 5)]),
        "var_C": _expand([("A", 4), ("B", 5), ("C", 11), ("D", 9),
                          ("E", 2), ("F", 2), ("G", 7)]),
    })

    encoder = OneHotEncoder(top_categories=4, variables=None, drop_last=False)
    X = encoder.fit_transform(df)

    # init params
    assert encoder.top_categories == 4

    # fitted attributes: categories stored most-frequent first
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.variables_binary_ == []
    assert encoder.n_features_in_ == 3
    assert encoder.encoder_dict_ == {
        "var_A": ["B", "D", "G", "A"],
        "var_B": ["A", "D", "B", "G"],
        "var_C": ["C", "D", "G", "B"],
    }

    # transform output: observations per retained dummy column
    expected_counts = {
        "var_A_D": 9, "var_A_B": 11, "var_A_A": 5, "var_A_G": 7,
        "var_B_A": 11, "var_B_D": 9, "var_B_G": 5, "var_B_B": 7,
        "var_C_D": 9, "var_C_C": 11, "var_C_G": 7, "var_C_B": 5,
    }
    for col, total in expected_counts.items():
        assert X[col].sum() == total
    assert "var_B" not in X.columns
    assert "var_B_F" not in X.columns
# Example 9
def test_encode_into_k_dummy_plus_drop_binary(df_enc_binary):
    """k dummies per variable, except binary variables which keep only one."""
    encoder = OneHotEncoder(top_categories=None,
                            variables=None,
                            drop_last=False,
                            drop_last_binary=True)
    X = encoder.fit_transform(df_enc_binary).astype("int32")

    # expected output, written as run lengths of each column
    expected = pd.DataFrame(
        {
            "var_num": [1] * 2 + [0] * 8 + [1] * 2 + [0] * 4 + [1] * 2 + [0] * 2,
            "var_A_A": [1] * 6 + [0] * 14,
            "var_A_B": [0] * 6 + [1] * 10 + [0] * 4,
            "var_A_C": [0] * 16 + [1] * 4,
            "var_B_A": [1] * 10 + [0] * 10,
            "var_B_B": [0] * 10 + [1] * 6 + [0] * 4,
            "var_B_C": [0] * 16 + [1] * 4,
            "var_C_AHA": [1] * 12 + [0] * 8,
            "var_D_OHO": [1] * 5 + [0] * 15,
        }
    ).astype("int32")

    # fitted attributes: binary variables keep a single category
    assert encoder.variables_ == ["var_A", "var_B", "var_C", "var_D"]
    assert encoder.variables_binary_ == ["var_C", "var_D"]
    assert encoder.n_features_in_ == 5
    assert encoder.encoder_dict_ == {
        "var_A": ["A", "B", "C"],
        "var_B": ["A", "B", "C"],
        "var_C": ["AHA"],
        "var_D": ["OHO"],
    }
    # transform output
    pd.testing.assert_frame_equal(X, expected)
    assert "var_C_B" not in X.columns
# Example 10
def create_pipeline(params: "dict | None" = None):
    """
    Create an sklearn Pipeline that preprocesses numeric and categorical
    columns separately and ends in an XGBClassifier.

    Parameters
    ----------
    params : dict, optional
        Parameters to set on the assembled pipeline via ``set_params``.
        When None or empty (default), the pipeline is returned unmodified.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # numeric branch: flag missing values, impute with mean/median, then
    # drop quasi-constant features (same value in >= 97% of rows)
    p_num = Pipeline([("num_nan_ind", AddMissingIndicator(missing_only=True)),
                      ("rmmean", MeanMedianImputer()),
                      ("drop_quasi_constant", DropConstantFeatures(tol=0.97))])

    # categorical branch: fill NaNs with a sentinel label, group rare
    # labels, then one-hot encode
    p_cat = Pipeline([("fill_cat_nas",
                       CategoricalImputer(fill_value='MISSING')),
                      ("rlc", RareLabelEncoder()),
                      ("one_hot_encoder", OneHotEncoder())])

    # route columns to the right branch by dtype
    transformers = [("num", p_num,
                     make_column_selector(dtype_include=np.number)),
                    ("cat", p_cat, make_column_selector(dtype_include=object))]

    # combine the preprocessing branches and append the classifier.
    # NOTE(review): gpu_id/tree_method pin training to a GPU — confirm one
    # is available wherever this pipeline runs.
    col_transforms = ColumnTransformer(transformers)
    p = Pipeline([("col_transformers", col_transforms),
                  ("xgb",
                   XGBClassifier(min_child_weight=1,
                                 gamma=0,
                                 objective='binary:logistic',
                                 nthread=4,
                                 scale_pos_weight=1,
                                 seed=1,
                                 gpu_id=0,
                                 tree_method='gpu_hist'))])

    if params:
        p.set_params(**params)
    return p
# Example 11
    # split column names into numeric vs. categorical lists.
    # NOTE(review): _get_numeric_data is a private pandas API — prefer
    # select_dtypes(include=np.number); confirm before changing.
    numerical_labels = list(X_train._get_numeric_data().columns)
    categorical_labels = X_train.select_dtypes(
        include=['object']).columns.tolist()

    # MSSubClass is a numeric-coded category, so move it from the numeric
    # list to the categorical one
    numerical_labels.remove('MSSubClass')
    categorical_labels.append('MSSubClass')

    print(f'Numerical labels are (contains ordinal cat):{numerical_labels}')
    print(f'Categorical labels are:{categorical_labels}')
    #print(X_train.head())

    # numeric branch: median imputation (scaling deliberately left disabled)
    num_pipeline = Pipeline([
        ('imputer', MeanMedianImputer(imputation_method='median'))  #,
        #('std_scaler',StandardScaler())
    ])
    # categorical branch: fill missing values with the 'Missing' label,
    # then one-hot encode every category of every variable
    cat_pipeline = Pipeline([('imputer',
                              CategoricalImputer(imputation_method='missing',
                                                 fill_value='Missing')),
                             ('one_hot',
                              OneHotEncoder(top_categories=None,
                                            drop_last=False))])

    # NOTE(review): full_pipeline is assembled but never used below — only
    # cat_pipeline is fitted; confirm this is intentional.
    full_pipeline = ColumnTransformer([('num', num_pipeline, numerical_labels),
                                       ('cat', cat_pipeline,
                                        categorical_labels)])

    X_converted = cat_pipeline.fit_transform(X_train)
    print(X_converted.head())
# Example 12
# Streamlit inputs: each selectbox yields 'Yes'/'No', converted to a
# numeric flag by word_convert (defined elsewhere in this script).
security = word_convert(
    st.selectbox('Do you need security guards ?', ('Yes', 'No')))
furnished = word_convert(
    st.selectbox('Do you want the house to be furnished ?', ('Yes', 'No')))
security_doors = word_convert(
    st.selectbox('Do you want security doors ?', ('Yes', 'No')))
cctv = word_convert(
    st.selectbox('Do you want CCTV surveillance ?', ('Yes', 'No')))
bq = word_convert(st.selectbox('Do you want Boys Quarters ?', ('Yes', 'No')))
gym = word_convert(st.selectbox('Do you need gym facilities ?', ('Yes', 'No')))
pool = word_convert(st.selectbox('Do you need swimming pool ?', ('Yes', 'No')))

# Modeling step

# Encoding Step
encode = OneHotEncoder()
target = data['Price']
# FIX: pass `axis` by keyword — the positional `axis` argument of
# DataFrame.drop was deprecated in pandas 1.0 and removed in pandas 2.0.
features = data.drop('Price', axis=1)
encode.fit(features)
features = encode.transform(features)

# Getting the target and features variables

# print(data.head())

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0)
# Creating the algorithm class
model = RandomForestRegressor()
 def encorder(self, y):
     """Fit a fresh OneHotEncoder on *y* and return the transformed frame."""
     encode = OneHotEncoder()
     encode.fit(y)
     return encode.transform(y)
def test_error_if_top_categories_not_integer():
    """top_categories must be an integer (or None); a float raises ValueError."""
    with pytest.raises(ValueError):
        OneHotEncoder(top_categories=0.5)
def test_error_if_drop_last_not_bool():
    """drop_last accepts only booleans; a float raises ValueError."""
    with pytest.raises(ValueError):
        OneHotEncoder(drop_last=0.5)
    CategoricalImputer(fill_value=0, ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    # run the parametrized sklearn estimator-API compatibility check
    check(estimator)


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    # extreme tol/n_categories so no label is grouped during the checks
    RareLabelEncoder(
        tol=0.00000000001,
        n_categories=100000000000,
        replace_with=10,
        ignore_format=True,
    ),
    WoEEncoder(ignore_format=True),
    PRatioEncoder(ignore_format=True),
])
def test_sklearn_compatible_encoder(estimator, check):
    # run the parametrized sklearn estimator-API compatibility check
    check(estimator)


# outliers
# Example 17
from flask import Flask, render_template, request
import pickle
from pandas import DataFrame
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import MinMaxScaler

###########################################################################################
# preprocessing pipeline: one-hot encode the categorical columns, then
# scale Kms_Driven into [0, 100].
# NOTE(review): `pipe` is assembled but the code below loads `preprocessor`
# from pipe.pkl instead — confirm which one is meant to be used.
cat_features = ['Fuel_Type', 'Seller_Type', 'Transmission']

pipe = Pipeline(steps=[('one hot encoder',
                        OneHotEncoder(variables=cat_features)),
                       ('min max scaler',
                        SklearnTransformerWrapper(variables=['Kms_Driven'],
                                                  transformer=MinMaxScaler(
                                                      feature_range=(0,
                                                                     100))))])
###########################################################################################
# NOTE(review): pickle.load executes arbitrary code — only load these files
# from a trusted source.
with open("pipe.pkl", "rb") as f:
    preprocessor = pickle.load(f)

with open("best_model.pkl", "rb") as f:
    model = pickle.load(f)

def predict_price(ex):
    columns = [
        'Year', 'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type',
# Example 18
def test_get_feature_names_out(df_enc_binary):
    """get_feature_names_out for each drop/top_categories configuration."""
    passthrough = ["var_num"]
    input_features = ["var_A", "var_B", "var_C", "var_D"]

    # default config: k dummies per variable
    enc = OneHotEncoder()
    enc.fit(df_enc_binary)
    dummies = [
        "var_A_A",
        "var_A_B",
        "var_A_C",
        "var_B_A",
        "var_B_B",
        "var_B_C",
        "var_C_AHA",
        "var_C_UHU",
        "var_D_OHO",
        "var_D_EHE",
    ]
    assert enc.get_feature_names_out(
        input_features=None) == passthrough + dummies
    assert enc.get_feature_names_out(input_features=input_features) == dummies
    assert enc.get_feature_names_out(
        input_features=input_features[:2]) == dummies[:6]
    assert enc.get_feature_names_out(
        input_features=[input_features[0]]) == dummies[:3]

    # drop_last=True: k-1 dummies per variable
    enc = OneHotEncoder(drop_last=True)
    enc.fit(df_enc_binary)
    dummies = [
        "var_A_A",
        "var_A_B",
        "var_B_A",
        "var_B_B",
        "var_C_AHA",
        "var_D_OHO",
    ]
    assert enc.get_feature_names_out(
        input_features=None) == passthrough + dummies
    assert enc.get_feature_names_out(input_features=input_features) == dummies
    assert enc.get_feature_names_out(
        input_features=input_features[:2]) == dummies[:4]
    assert enc.get_feature_names_out(
        input_features=[input_features[0]]) == dummies[:2]

    # drop_last_binary=True: binary variables keep a single dummy
    enc = OneHotEncoder(drop_last_binary=True)
    enc.fit(df_enc_binary)
    dummies = [
        "var_A_A",
        "var_A_B",
        "var_A_C",
        "var_B_A",
        "var_B_B",
        "var_B_C",
        "var_C_AHA",
        "var_D_OHO",
    ]
    assert enc.get_feature_names_out(
        input_features=None) == passthrough + dummies
    assert enc.get_feature_names_out(input_features=input_features) == dummies
    assert enc.get_feature_names_out(
        input_features=[input_features[0]]) == dummies[:3]
    assert enc.get_feature_names_out(
        input_features=[input_features[3]]) == [dummies[-1]]

    # top_categories=1: one dummy per variable
    enc = OneHotEncoder(top_categories=1)
    enc.fit(df_enc_binary)
    dummies = ["var_A_B", "var_B_A", "var_C_AHA", "var_D_EHE"]
    assert enc.get_feature_names_out(
        input_features=None) == passthrough + dummies
    assert enc.get_feature_names_out(input_features=input_features) == dummies
    assert enc.get_feature_names_out(
        input_features=input_features[:2]) == dummies[:2]
    assert enc.get_feature_names_out(
        input_features=[input_features[3]]) == [dummies[3]]

    # invalid input_features values are rejected
    with pytest.raises(ValueError):
        enc.get_feature_names_out("var_A")

    with pytest.raises(ValueError):
        enc.get_feature_names_out(["var_A", "hola"])
# Example 19
# Stop training when validation loss has not improved for 20 epochs.
early_stop = EarlyStopping(monitor='val_loss',
                           mode='min',
                           min_delta=0,
                           verbose=1,
                           patience=20)

# End-to-end pipeline: column selection -> imputation -> power transform ->
# equal-width binning -> category grouping -> one-hot encoding -> scaling ->
# Keras classifier.
# NOTE(review): pp.* and config.* come from project modules not visible
# here; reduce_lr is referenced but not defined in this chunk — confirm.
pump_pipeline = Pipeline(
    steps=[("feature_to_keeper",
            pp.FeatureKeeper(variables_to_keep=config.VARIABLES_TO_KEEP)),
           ("missing_imputer",
            pp.MissingImputer(numerical_variables=config.NUMERICAL_VARIABLES)),
           ("yeoJohnson",
            YeoJohnsonTransformer(variables=config.YEO_JHONSON_VARIABLES)),
           ("discretization",
            EqualWidthDiscretiser(bins=5, variables=config.NUMERICAL_VARIABLES)
            ),
           ("categorical_grouper",
            pp.CategoricalGrouping(config_dict=config.VARIABLES_TO_GROUP)),
           ("rareCategories_grouper",
            pp.RareCategoriesGrouping(threshold=config.VARIABLES_THRESHOLD)),
           ("one_hot_encoder",
            OneHotEncoder(variables=config.REAL_CATEGORICAL_VARIABLES,
                          drop_last=False)), ("scaler", MinMaxScaler()),
           ("model",
            KerasClassifier(build_fn=create_model,
                            epochs=1,
                            validation_split=0.2,
                            batch_size=256,
                            verbose=1,
                            callbacks=[early_stop, reduce_lr],
                            shuffle=True))])
# Example 20
# %%
# Cross-tabulate predictions against ground truth for the train split.
titanic_train_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
titanic_train_crosstab
# %%
# Building a model with all the features, using a pipeline.
num_features = df.select_dtypes(include=['int64', 'float64']).drop(
    'Survived', axis=1).columns
num_features
# %%
cat_features = df.select_dtypes(include=['category', 'object']).columns
cat_features
#%%
features = df.drop('Survived', axis=1).columns.to_list()
features
# %%
# One dummy per category (drop_last=False) for the listed variables only;
# other columns pass through unchanged.
onehot = OneHotEncoder(variables=['Pclass', 'Sex', 'Embarked'],
                       drop_last=False)

# %%
onehot.fit(df[features])
onehot.transform(df[features]).head()
# %%
X = onehot.transform(df[features])
y = df['Survived']
print(X.shape)
# %%
# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)