def test_impute_numerical_variables_with_mode(df_na):
    """Mode-impute a mix of categorical and numerical columns with ignore_format=True."""
    imputer = CategoricalImputer(
        imputation_method="frequent",
        variables=["City", "Studies", "Marks"],
        ignore_format=True,
    )
    transformed = imputer.fit_transform(df_na)

    # Frame expected after filling each column with its mode.
    expected = df_na.copy()
    expected["City"] = expected["City"].fillna("London")
    expected["Studies"] = expected["Studies"].fillna("Bachelor")
    expected["Marks"] = expected["Marks"].fillna(0.8)

    # init params
    assert imputer.variables == ["City", "Studies", "Marks"]

    # fit attributes
    assert imputer.variables_ == ["City", "Studies", "Marks"]
    assert imputer.n_features_in_ == 6
    assert imputer.imputer_dict_ == {
        "City": "London",
        "Studies": "Bachelor",
        "Marks": 0.8,
    }

    # transform output
    pd.testing.assert_frame_equal(transformed, expected)
def clean_data(X):
    """Clean and encode the raw frame, undersample, and return a model-ready frame.

    Fixes: mojibake '¶' in a comment, typo 'correlared' in a printed message,
    and the inconsistent capital ``Y`` local (renamed to rebind ``y``).

    Parameters
    ----------
    X : pandas.DataFrame
        Raw data containing a 'target' column and an 'ID' column.
        NOTE: mutated in place by the dropna/pop/drop calls below.

    Returns
    -------
    pandas.DataFrame
        Cleaned, encoded, undersampled frame with 'target' re-attached.
    """
    # Rows without a label are unusable; drop them, then split the label off.
    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)

    # 'v22' holds letter codes; convert to integers with the helper.
    X['v22'] = X['v22'].apply(az_to_int)

    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Impute, collapse rare labels, then frequency-encode categoricals.
    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")
    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])

    # Cap extreme values at the 0.5% tails.
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)

    # Rebalance classes; y is rebound to the resampled labels.
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y = undersampler.fit_resample(X, y)

    # Drop quasi-constant features.
    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")

    # Remove duplicated features.
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")

    # Drop one feature of each highly correlated pair.
    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlated features {drop_corr.features_to_drop_}")

    # Re-attach the (resampled) label for downstream use.
    X['target'] = y
    return X
def test_variables_cast_as_category_missing(df_na):
    """'missing' imputation also handles columns with pandas 'category' dtype."""
    # string missing
    df_na = df_na.copy()
    df_na["City"] = df_na["City"].astype("category")

    imputer = CategoricalImputer(imputation_method="missing", variables=None)
    X_transformed = imputer.fit_transform(df_na)

    # set up expected output
    X_reference = df_na.copy()
    X_reference["Name"] = X_reference["Name"].fillna("Missing")
    X_reference["Studies"] = X_reference["Studies"].fillna("Missing")
    # FIX: `inplace=True` was deprecated for Series.cat.add_categories in
    # pandas 1.3 and removed in pandas 2.0 — assign the returned Series.
    X_reference["City"] = X_reference["City"].cat.add_categories("Missing")
    X_reference["City"] = X_reference["City"].fillna("Missing")

    # test fit attributes
    assert imputer.variables_ == ["Name", "City", "Studies"]
    assert imputer.imputer_dict_ == {
        "Name": "Missing",
        "City": "Missing",
        "Studies": "Missing",
    }

    # test transform output
    # selected columns should have no NA
    # non selected columns should still have NA
    assert X_transformed[["Name", "City", "Studies"]].isnull().sum().sum() == 0
    assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(X_transformed, X_reference)
def test_variables_cast_as_category_frequent(df_na):
    """'frequent' imputation also handles columns with pandas 'category' dtype."""
    data = df_na.copy()
    data["City"] = data["City"].astype("category")
    # "Name" has no single mode, so remove it from the fixture
    data.drop(labels=["Name"], axis=1, inplace=True)

    imputer = CategoricalImputer(imputation_method="frequent", variables=None)
    result = imputer.fit_transform(data)

    # frame expected after imputing each column with its mode
    expected = data.copy()
    expected["Studies"] = expected["Studies"].fillna("Bachelor")
    expected["City"] = expected["City"].fillna("London")

    # fit attributes
    assert imputer.variables_ == ["City", "Studies"]
    assert imputer.imputer_dict_ == {"City": "London", "Studies": "Bachelor"}

    # imputed columns carry no NA; untouched numeric columns still do
    assert result[["City", "Studies"]].isnull().sum().sum() == 0
    assert result[["Age", "Marks"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(result, expected)
def test_impute_with_string_missing_and_automatically_find_variables(df_na):
    """With variables=None, all categorical columns are found and filled with 'Missing'."""
    imputer = CategoricalImputer(imputation_method="missing", variables=None)
    result = imputer.fit_transform(df_na)

    # frame expected after filling every categorical column
    expected = df_na.copy()
    for col in ("Name", "City", "Studies"):
        expected[col] = expected[col].fillna("Missing")

    # init params
    assert imputer.imputation_method == "missing"
    assert imputer.variables is None

    # fit attributes
    assert imputer.variables_ == ["Name", "City", "Studies"]
    assert imputer.n_features_in_ == 6
    assert imputer.imputer_dict_ == {
        "Name": "Missing",
        "City": "Missing",
        "Studies": "Missing",
    }

    # imputed columns have no NA left; non-selected numeric columns still do
    assert result[["Name", "City", "Studies"]].isnull().sum().sum() == 0
    assert result[["Age", "Marks"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(result, expected)
def test_user_defined_string_and_automatically_find_variables(df_na):
    """A user-supplied fill value is applied to every auto-detected categorical column."""
    imputer = CategoricalImputer(
        imputation_method="missing", fill_value="Unknown", variables=None
    )
    result = imputer.fit_transform(df_na)

    # frame expected after filling with the custom string
    expected = df_na.copy()
    for col in ("Name", "City", "Studies"):
        expected[col] = expected[col].fillna("Unknown")

    # init params
    assert imputer.imputation_method == "missing"
    assert imputer.fill_value == "Unknown"
    assert imputer.variables is None

    # fit attributes
    assert imputer.variables_ == ["Name", "City", "Studies"]
    assert imputer.n_features_in_ == 6
    assert imputer.imputer_dict_ == {
        "Name": "Unknown",
        "City": "Unknown",
        "Studies": "Unknown",
    }

    # transform output
    assert result[["Name", "City", "Studies"]].isnull().sum().sum() == 0
    assert result[["Age", "Marks"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(result, expected)
def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_object(df_na):
    """A numerical column cast to object stays object-typed after imputation
    when return_object=True."""
    data = df_na.copy()
    data["Marks"] = data["Marks"].astype("O")

    imputer = CategoricalImputer(
        imputation_method="frequent",
        variables=["City", "Studies", "Marks"],
        return_object=True,
    )
    result = imputer.fit_transform(data)

    assert result["Marks"].dtype == "O"
def test_mode_imputation_with_multiple_variables(df_na):
    """Mode imputation is applied to exactly the two requested variables."""
    imputer = CategoricalImputer(
        imputation_method="frequent", variables=["Studies", "City"]
    )
    result = imputer.fit_transform(df_na)

    # frame expected after filling each selected column with its mode
    expected = df_na.copy()
    expected["City"] = expected["City"].fillna("London")
    expected["Studies"] = expected["Studies"].fillna("Bachelor")

    # fit attribute and transform output
    assert imputer.imputer_dict_ == {"Studies": "Bachelor", "City": "London"}
    pd.testing.assert_frame_equal(result, expected)
def test_mode_imputation_and_single_variable(df_na):
    """A single variable passed as a string is treated like a one-item list."""
    imputer = CategoricalImputer(imputation_method="frequent", variables="City")
    result = imputer.fit_transform(df_na)

    # frame expected after imputing 'City' with its mode
    expected = df_na.copy()
    expected["City"] = expected["City"].fillna("London")

    # init and fit attributes
    assert imputer.imputation_method == "frequent"
    assert imputer.variables == ["City"]
    assert imputer.input_shape_ == (8, 6)
    assert imputer.imputer_dict_ == {"City": "London"}

    # transform output: only 'City' is imputed, numeric NAs untouched
    assert result["City"].isnull().sum() == 0
    assert result[["Age", "Marks"]].isnull().sum().sum() > 0
    pd.testing.assert_frame_equal(result, expected)
def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_numerical(
        df_na):
    """Numerical variable cast as object is imputed and returned as numeric.

    Fix: copy the fixture before mutating it so other tests sharing ``df_na``
    are unaffected (consistent with the sibling object-dtype tests).
    """
    df_na = df_na.copy()
    df_na["Marks"] = df_na["Marks"].astype("O")

    imputer = CategoricalImputer(
        imputation_method="frequent", variables=["City", "Studies", "Marks"]
    )
    X_transformed = imputer.fit_transform(df_na)

    # expected output: each selected column filled with its mode
    X_reference = df_na.copy()
    X_reference["Marks"] = X_reference["Marks"].fillna(0.8)
    X_reference["City"] = X_reference["City"].fillna("London")
    X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor")

    assert imputer.variables == ["City", "Studies", "Marks"]
    assert imputer.imputer_dict_ == {
        "Studies": "Bachelor",
        "City": "London",
        "Marks": 0.8,
    }
    # the object-cast numeric column is returned as float by default
    assert X_transformed["Marks"].dtype == "float"
    pd.testing.assert_frame_equal(X_transformed, X_reference)
def test_impute_numerical_variables(df_na):
    """With ignore_format=True, numeric columns can be filled with a constant."""
    columns = ["Name", "City", "Studies", "Age", "Marks"]
    imputer = CategoricalImputer(
        imputation_method="missing",
        fill_value=0,
        variables=columns,
        ignore_format=True,
    )
    result = imputer.fit_transform(df_na)

    # every selected column is expected to be filled with 0
    expected = df_na.copy().fillna(0)

    # init params
    assert imputer.imputation_method == "missing"
    assert imputer.variables == ["Name", "City", "Studies", "Age", "Marks"]

    # fit attributes
    assert imputer.variables_ == ["Name", "City", "Studies", "Age", "Marks"]
    assert imputer.n_features_in_ == 6

    # transform output
    pd.testing.assert_frame_equal(result, expected)
def create_pipeline(params=None):
    """Create an sklearn Pipeline that preprocesses features and fits XGBoost.

    Fix: the old annotation ``params: dict = None`` claimed a dict default
    while actually defaulting to ``None``; the parameter is documented as
    optional instead.

    Parameters
    ----------
    params : dict, optional
        Pipeline parameters applied via ``set_params``. ``None`` (default)
        leaves the pipeline with its built-in defaults.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # numeric branch: flag missing values, median-impute, drop quasi-constants
    p_num = Pipeline([
        ("num_nan_ind", AddMissingIndicator(missing_only=True)),
        ("rmmean", MeanMedianImputer()),
        ("drop_quasi_constant", DropConstantFeatures(tol=0.97)),
    ])

    # categorical branch: impute NAs, group rare labels, one-hot encode
    p_cat = Pipeline([
        ("fill_cat_nas", CategoricalImputer(fill_value='MISSING')),
        ("rlc", RareLabelEncoder()),
        ("one_hot_encoder", OneHotEncoder()),
    ])

    # route columns to the matching branch by dtype
    transformers = [
        ("num", p_num, make_column_selector(dtype_include=np.number)),
        ("cat", p_cat, make_column_selector(dtype_include=object)),
    ]

    # combine the branches and append the XGBoost classifier
    col_transforms = ColumnTransformer(transformers)
    p = Pipeline([
        ("col_transformers", col_transforms),
        ("xgb", XGBClassifier(
            min_child_weight=1,
            gamma=0,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=1,
            gpu_id=0,
            tree_method='gpu_hist',
        )),
    ])

    if params:
        p.set_params(**params)
    return p
def test_non_fitted_error(df_na):
    """Calling transform() before fit() must raise NotFittedError."""
    imputer = CategoricalImputer()
    with pytest.raises(NotFittedError):
        imputer.transform(df_na)
) from feature_engine.transformation import ( BoxCoxTransformer, LogTransformer, PowerTransformer, ReciprocalTransformer, YeoJohnsonTransformer, ) from feature_engine.wrappers import SklearnTransformerWrapper # imputation @parametrize_with_checks([ MeanMedianImputer(), ArbitraryNumberImputer(), CategoricalImputer(ignore_format=True), EndTailImputer(), AddMissingIndicator(), RandomSampleImputer(), DropMissingData(), ]) def test_sklearn_compatible_imputer(estimator, check): check(estimator) @parametrize_with_checks([ CountFrequencyEncoder(ignore_format=True), DecisionTreeEncoder(ignore_format=True), MeanEncoder(ignore_format=True), OneHotEncoder(ignore_format=True), OrdinalEncoder(ignore_format=True),
def test_error_when_variable_contains_multiple_modes(df_na):
    """fit() raises ValueError when a variable has more than one mode."""
    # single explicit variable: singular message
    expected_msg = "The variable Name contains multiple frequent categories."
    imputer = CategoricalImputer(imputation_method="frequent", variables="Name")
    with pytest.raises(ValueError) as record:
        imputer.fit(df_na)
    assert str(record.value) == expected_msg

    # auto-detected variables: pluralised message
    expected_msg = "The variable(s) Name contain(s) multiple frequent categories."
    imputer = CategoricalImputer(imputation_method="frequent")
    with pytest.raises(ValueError) as record:
        imputer.fit(df_na)
    assert str(record.value) == expected_msg

    # several offending variables are all listed in the message
    frame = df_na.copy()
    frame["Name_dup"] = frame["Name"]
    expected_msg = (
        "The variable(s) Name, Name_dup contain(s) multiple frequent categories."
    )
    imputer = CategoricalImputer(imputation_method="frequent")
    with pytest.raises(ValueError) as record:
        imputer.fit(frame)
    assert str(record.value) == expected_msg
def test_error_when_imputation_method_not_frequent_or_missing():
    """Only 'frequent' and 'missing' are accepted imputation methods."""
    with pytest.raises(ValueError):
        CategoricalImputer(imputation_method="arbitrary")
from feature_engine.timeseries.forecasting import LagFeatures from feature_engine.transformation import ( BoxCoxTransformer, LogTransformer, PowerTransformer, ReciprocalTransformer, YeoJohnsonTransformer, ) from feature_engine.wrappers import SklearnTransformerWrapper # imputation @parametrize_with_checks([ MeanMedianImputer(), ArbitraryNumberImputer(), CategoricalImputer(fill_value=0, ignore_format=True), EndTailImputer(), AddMissingIndicator(), RandomSampleImputer(), DropMissingData(), ]) def test_sklearn_compatible_imputer(estimator, check): check(estimator) # encoding @parametrize_with_checks([ CountFrequencyEncoder(ignore_format=True), DecisionTreeEncoder(regression=False, ignore_format=True), MeanEncoder(ignore_format=True), OneHotEncoder(ignore_format=True),
from feature_engine.transformation import LogTransformer from feature_engine.wrappers import SklearnTransformerWrapper from sklearn.linear_model import Lasso from sklearn.pipeline import Pipeline from sklearn.preprocessing import Binarizer, MinMaxScaler from regression_model.config.core import config from regression_model.processing import features as pp price_pipe = Pipeline([ # ===== IMPUTATION ===== # impute categorical variables with string missing ( "missing_imputation", CategoricalImputer( imputation_method="missing", variables=config.model_config.categorical_vars_with_na_missing, ), ), ( "frequent_imputation", CategoricalImputer( imputation_method="frequent", variables=config.model_config.categorical_vars_with_na_frequent, ), ), # add missing indicator ( "missing_indicator", AddMissingIndicator( variables=config.model_config.numerical_vars_with_na), ),
# %% Categorical. X_train['MSSubClass'] = X_train['MSSubClass'].astype('O') X_test['MSSubClass'] = X_test['MSSubClass'].astype('O') cat_vars = [var for var in data.columns if data[var].dtype == 'O'] cat_vars_with_na = [var for var in cat_vars if X_train[var].isnull().sum() > 0] # variables to impute with the string missing. () with_string_missing = [var for var in cat_vars_with_na if X_train[var].isnull().mean() > 0.1] with_frequent_category = [var for var in cat_vars_with_na if X_train[var].isnull().mean() < 0.1] # variables to impute with the most frequent category # %% Missing values -- Categorical -- Missing. cat_imputer_missing = CategoricalImputer( imputation_method='missing', variables=with_string_missing ) cat_imputer_missing.fit(X_train) print(cat_imputer_missing.imputer_dict_) X_train = cat_imputer_missing.transform(X_train) X_test = cat_imputer_missing.transform(X_test) # %% Missing values -- Categorical -- Frequency. cat_imputer_frequent = CategoricalImputer( imputation_method='frequent', variables=with_frequent_category ) cat_imputer_frequent.fit(X_train) print(cat_imputer_frequent.imputer_dict_)
def test_error_when_variable_contains_multiple_modes(df_na):
    """fit() must raise when a variable has several equally frequent categories."""
    imputer = CategoricalImputer(imputation_method="frequent")
    with pytest.raises(ValueError):
        imputer.fit(df_na)
#get numerical labels numerical_labels = list(X_train._get_numeric_data().columns) categorical_labels = X_train.select_dtypes( include=['object']).columns.tolist() #moving 'MSSubClass' feature from numerical to categorical numerical_labels.remove('MSSubClass') categorical_labels.append('MSSubClass') print(f'Numerical labels are (contains ordinal cat):{numerical_labels}') print(f'Categorical labels are:{categorical_labels}') #print(X_train.head()) num_pipeline = Pipeline([ ('imputer', MeanMedianImputer(imputation_method='median')) #, #('std_scaler',StandardScaler()) ]) cat_pipeline = Pipeline([('imputer', CategoricalImputer(imputation_method='missing', fill_value='Missing')), ('one_hot', OneHotEncoder(top_categories=None, drop_last=False))]) full_pipeline = ColumnTransformer([('num', num_pipeline, numerical_labels), ('cat', cat_pipeline, categorical_labels)]) X_converted = cat_pipeline.fit_transform(X_train) print(X_converted.head())
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingClassifier from feature_engine.imputation import MeanMedianImputer, CategoricalImputer from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder, CountFrequencyEncoder from feature_engine.discretisation import EqualFrequencyDiscretiser import logging _logger = logging.getLogger(__name__) rf_pipe = Pipeline( [ ('numeric_impute', MeanMedianImputer(imputation_method='median', variables=config.CONTINUOUS_FEATURES)), ('categorical_impute', CategoricalImputer(imputation_method='missing', variables=config.CATEGORICAL_FEATURES+ config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+ config.DISCRETE_SET3_FEATURES)), ('rare_label_encode', RareLabelEncoder(tol=0.02, n_categories=10, variables=config.CATEGORICAL_FEATURES+ config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+ config.DISCRETE_SET3_FEATURES, replace_with='Rare')), ('categorical_encode1', OrdinalEncoder(encoding_method='arbitrary', variables=config.CATEGORICAL_FEATURES+config.DISCRETE_SET2_FEATURES)), ('categorical_encode2', OrdinalEncoder(encoding_method='ordered', variables=config.DISCRETE_SET1_FEATURES)), ('categorical_encode3', CountFrequencyEncoder(encoding_method='count',