def clean_data(X):
    """Clean, encode, rebalance and prune a raw feature table.

    The input must contain the columns ``target``, ``ID`` and ``v22``.
    Steps, in order:

    1. drop rows whose ``target`` is missing, split off ``target``, drop ``ID``;
    2. convert ``v22`` with ``az_to_int``;
    3. median-impute numeric columns; fill categorical columns with
       ``'__MISS__'``, group rare labels into ``'__OTHER__'`` and replace
       labels by their frequency;
    4. winsorize both tails at the 0.5% quantiles;
    5. randomly under-sample to a 0.7 sampling strategy (fixed seed);
    6. drop quasi-constant, duplicated and highly correlated features,
       printing what each step removes.

    Unlike the previous implementation, the caller's DataFrame is left
    untouched: all work happens on new objects.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw data including the ``target``, ``ID`` and ``v22`` columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned, resampled features with ``target`` re-attached as the
        last column.
    """
    # dropna/drop without inplace return new frames, so the caller's data
    # is never mutated.
    X = X.dropna(subset=['target'])
    y = X.pop('target')
    X = X.drop(columns='ID')

    X['v22'] = X['v22'].apply(az_to_int)

    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()

    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")

    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])

    # more cleaning: cap extreme values on both tails
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)

    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y_resampled = undersampler.fit_resample(X, y)

    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")

    # Remove duplicated features
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")

    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlated features {drop_corr.features_to_drop_}")

    # Only columns were removed after resampling, so the resampled target
    # still lines up row-for-row with X.
    X['target'] = y_resampled
    return X
def test_user_provides_grouping_label_name_and_variable_list(df_enc_big):
    """Custom replacement label combined with an explicit variable list.

    Only ``var_A`` and ``var_B`` are passed to the encoder; the expected
    frame contains no replacement label in ``var_C``.
    """

    def runs(*pairs):
        # Expand (label, count) pairs into a flat list of repeated labels.
        return [label for label, count in pairs for _ in range(count)]

    rare_encoder = RareLabelEncoder(
        tol=0.15,
        n_categories=5,
        variables=["var_A", "var_B"],
        replace_with="Other",
    )
    transformed = rare_encoder.fit_transform(df_enc_big)

    expected = pd.DataFrame(
        {
            "var_A": runs(("A", 6), ("B", 10), ("Other", 4), ("D", 10), ("Other", 4), ("G", 6)),
            "var_B": runs(("A", 10), ("B", 6), ("Other", 4), ("D", 10), ("Other", 4), ("G", 6)),
            "var_C": runs(("A", 4), ("B", 6), ("C", 10), ("D", 10), ("E", 2), ("F", 2), ("G", 6)),
        }
    )

    # constructor arguments are stored as given
    assert rare_encoder.tol == 0.15
    assert rare_encoder.n_categories == 5
    assert rare_encoder.replace_with == "Other"
    assert rare_encoder.variables == ["var_A", "var_B"]

    # attributes learned during fit
    assert rare_encoder.variables_ == ["var_A", "var_B"]
    assert rare_encoder.n_features_in_ == 3

    # transformed data
    pd.testing.assert_frame_equal(transformed, expected)
def test_defo_params_plus_automatically_find_variables(df_enc_big):
    """With ``variables=None`` the encoder selects the variables itself.

    All three columns are expected to end up in ``variables_`` and to have
    their rare labels replaced by ``"Rare"``.
    """
    labels = ("A", "B", "C", "D", "Rare", "G")

    def column(counts):
        # Repeat each label by its matching count, preserving label order.
        return [label for label, count in zip(labels, counts) for _ in range(count)]

    rare_encoder = RareLabelEncoder(
        tol=0.06, n_categories=5, variables=None, replace_with="Rare"
    )
    transformed = rare_encoder.fit_transform(df_enc_big)

    expected = pd.DataFrame(
        {
            "var_A": column((6, 10, 4, 10, 4, 6)),
            "var_B": column((10, 6, 4, 10, 4, 6)),
            "var_C": column((4, 6, 10, 10, 4, 6)),
        }
    )

    # constructor arguments are stored as given
    assert rare_encoder.tol == 0.06
    assert rare_encoder.n_categories == 5
    assert rare_encoder.replace_with == "Rare"
    assert rare_encoder.variables is None

    # attributes learned during fit
    assert rare_encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert rare_encoder.n_features_in_ == 3

    # transformed data
    pd.testing.assert_frame_equal(transformed, expected)
def test_variables_cast_as_category(df_enc_big):
    """A column cast to ``category`` dtype is selected and encoded too.

    ``var_B`` is converted to ``category`` before fitting; the expected
    output is the same as when every column is left as it comes from the
    fixture.
    """
    labels = ("A", "B", "C", "D", "Rare", "G")
    counts_per_column = {
        "var_A": (6, 10, 4, 10, 4, 6),
        "var_B": (10, 6, 4, 10, 4, 6),
        "var_C": (4, 6, 10, 10, 4, 6),
    }

    # assign() returns a new frame, so the fixture itself is not modified
    frame = df_enc_big.assign(var_B=lambda d: d["var_B"].astype("category"))

    rare_encoder = RareLabelEncoder(
        tol=0.06, n_categories=5, variables=None, replace_with="Rare"
    )
    transformed = rare_encoder.fit_transform(frame)

    expected = pd.DataFrame(
        {
            name: [label for label, count in zip(labels, counts) for _ in range(count)]
            for name, counts in counts_per_column.items()
        }
    )

    # attributes learned during fit
    assert rare_encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert rare_encoder.n_features_in_ == 3

    # transformed data
    pd.testing.assert_frame_equal(transformed, expected)
def encode_rare_labels(var_list, train, test, val=None, tol=0.05,
                       file_path='../models/transformers/rare_enc/',
                       file_name='rare_enc', file_suffix=''):
    """
    Encode rare labels of categorical features in the training set, test set,
    and optionally the validation set.

    A feature_engine ``RareLabelEncoder`` is fitted on the training set only
    and then applied to every provided set. Labels whose proportion in the
    training set is below `tol` are replaced by the encoder's replacement
    label; ``replace_with`` is left at the library default (documented as
    ``"Rare"``). ``n_categories`` is also left at the library default, so
    features with fewer distinct labels than that default may be left
    unencoded -- see the feature_engine documentation.

    The fitted encoder is saved with joblib to
    ``<file_path>/<file_name><file_suffix>.pkl``. The output directory is
    created if it does not exist yet.

    Parameters
    ----------
    var_list : list[str]
        Categorical features to encode
    train : pandas.core.frame.DataFrame
        Training data
    test : pandas.core.frame.DataFrame
        Test data
    val : pandas.core.frame.DataFrame, optional
        Validation data, by default None
    tol : float, optional
        Frequency threshold at which to consider a label rare, by default 0.05
    file_path : str, optional
        Output directory path, by default "../models/transformers/rare_enc/"
    file_name : str, optional
        Output file name, by default "rare_enc"
    file_suffix : str, optional
        File name suffix that goes before the file extension, by default an
        empty string

    Returns
    -------
    pandas.core.frame.DataFrame
        Transformed train set
    pandas.core.frame.DataFrame or None
        Transformed validation set, or None when `val` was not given
    pandas.core.frame.DataFrame
        Transformed test set
    dict
        The fitted encoder's ``encoder_dict_``
    """
    enc = RareLabelEncoder(tol=tol, variables=var_list).fit(train)

    # joblib.dump does not create missing directories; an empty file_path
    # means "current directory" and needs no creation.
    if file_path:
        os.makedirs(file_path, exist_ok=True)
    joblib.dump(enc, os.path.join(file_path, file_name + file_suffix + '.pkl'))

    train = enc.transform(train)
    test = enc.transform(test)
    if val is not None:
        val = enc.transform(val)

    return train, val, test, enc.encoder_dict_
def test_max_n_categories(df_enc_big):
    """``max_n_categories`` caps how many labels per variable stay unreplaced.

    With ``max_n_categories=4`` each expected column keeps four distinct
    original labels; everything else becomes ``"Rare"``.
    """
    layout = {
        "var_A": [("A", 6), ("B", 10), ("Rare", 4), ("D", 10), ("Rare", 4), ("G", 6)],
        "var_B": [("A", 10), ("B", 6), ("Rare", 4), ("D", 10), ("Rare", 4), ("G", 6)],
        "var_C": [("Rare", 4), ("B", 6), ("C", 10), ("D", 10), ("Rare", 4), ("G", 6)],
    }
    expected = pd.DataFrame(
        {
            name: [label for label, count in segments for _ in range(count)]
            for name, segments in layout.items()
        }
    )

    encoder = RareLabelEncoder(tol=0.10, max_n_categories=4, n_categories=5)
    transformed = encoder.fit_transform(df_enc_big)

    pd.testing.assert_frame_equal(transformed, expected)
def test_max_n_categories_with_numeric_var(df_enc_numeric):
    """``max_n_categories`` also applies to numeric variables when
    ``ignore_format=True``.

    The expected frame is the input with every ``3`` replaced by ``"Rare"``.
    """
    columns = ["var_A", "var_B"]

    rare_encoder = RareLabelEncoder(
        tol=0.10, max_n_categories=2, n_categories=1, ignore_format=True
    )
    X = rare_encoder.fit_transform(df_enc_numeric[columns])

    df = df_enc_numeric[columns].copy()
    df.replace({3: "Rare"}, inplace=True)

    # Values are compared through their string form because a plain frame
    # comparison reported identical-looking columns as different
    # (presumably a dtype mismatch between the numeric values -- unverified).
    # Comparing whole lists at once also checks that both frames have the
    # same number of rows, and avoids rebuilding list(column) per row.
    for column in columns:
        assert [str(value) for value in X[column]] == [
            str(value) for value in df[column]
        ]
def transform(self, X, y=None):
    """Group rare labels of every column in ``self.categories``, in place.

    For each column, missing values are temporarily filled with ``'MISS'``,
    a fresh ``RareLabelEncoder`` (configured from ``self.tol``,
    ``self.n_categories``, ``self.max_n_categories`` and
    ``self.replace_with``) is fitted on that column and applied to it, and
    the result is written back into ``X``. When
    ``self.impute_missing_label`` is falsy, the rows that were originally
    missing are reset to NaN afterwards.

    NOTE(review): the encoder is re-fitted on whatever data is passed to
    each call, so label frequencies always come from the frame being
    transformed, not from a previously fitted training set.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to transform; it is modified in place and also returned.
    y : ignored
        Accepted but never used.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object with the encoded columns.
    """
    # Silence chained-assignment warnings only for the duration of this
    # call; option_context restores the caller's previous setting (whatever
    # it was) even if an exception is raised.
    with pd.option_context('mode.chained_assignment', None):
        for category in self.categories:
            x = X[category].copy()
            nan_mask = x.isnull()  # remember which rows were missing

            # the encoder needs a placeholder instead of missing values
            x.loc[nan_mask] = 'MISS'

            encoder = RareLabelEncoder(tol=self.tol,
                                       n_categories=self.n_categories,
                                       max_n_categories=self.max_n_categories,
                                       replace_with=self.replace_with)
            encoded = encoder.fit_transform(x.to_frame(name=category))
            X[category] = encoded[category]

            if not self.impute_missing_label:
                # single .loc call on X itself: a chained
                # X[category].loc[...] assignment may write to a temporary
                X.loc[nan_mask, category] = np.nan
    return X
def create_pipeline(params: dict = None):
    """
    Build the preprocessing + XGBoost classification pipeline.

    Numeric columns (selected by ``np.number`` dtype) get a missing-value
    indicator, a ``MeanMedianImputer`` and quasi-constant feature removal
    (``tol=0.97``). Object-dtype columns are filled with ``'MISSING'``,
    rare-label encoded and one-hot encoded. Both branches are combined in a
    ``ColumnTransformer`` that feeds an ``XGBClassifier``; the classifier is
    hard-coded to ``gpu_id=0`` with ``tree_method='gpu_hist'``.

    Parameters
    ----------
    params : dict
        Optional parameters forwarded to ``Pipeline.set_params``; skipped
        when None or empty.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    numeric_steps = [
        ("num_nan_ind", AddMissingIndicator(missing_only=True)),
        ("rmmean", MeanMedianImputer()),
        ("drop_quasi_constant", DropConstantFeatures(tol=0.97)),
    ]
    categorical_steps = [
        ("fill_cat_nas", CategoricalImputer(fill_value='MISSING')),
        ("rlc", RareLabelEncoder()),
        ("one_hot_encoder", OneHotEncoder()),
    ]

    # route each dtype family to its own sub-pipeline
    preprocessing = ColumnTransformer(
        [
            ("num", Pipeline(numeric_steps), make_column_selector(dtype_include=np.number)),
            ("cat", Pipeline(categorical_steps), make_column_selector(dtype_include=object)),
        ]
    )

    classifier = XGBClassifier(
        min_child_weight=1,
        gamma=0,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=1,
        gpu_id=0,
        tree_method='gpu_hist',
    )

    pipeline = Pipeline([("col_transformers", preprocessing), ("xgb", classifier)])

    if not params:
        return pipeline

    pipeline.set_params(**params)
    return pipeline
X_sm_c1 #2. rare label + weight of evidence (WOE) encoding: #as the categorical features are hashed, we do not know if they are ordinal data #so to avoid ranking these features, we apply rare label + weight of evidence encoding for C5 #this is for the Logistic Regression model only, as ordinality isn't a problem for tree-based models # rare label encoding: # we set the threshold to 0.1 # categories with proportion lower than 0.1 may not have any class label 1 due to the label imbalance # and this will impede the application of WOE encoding (log 0 is undefined) encoder = RareLabelEncoder(tol=0.1, n_categories=2, variables=[ 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12' ], replace_with='Rare') train_enc = encoder.fit_transform(X_sm_c) #WOE encoding: woe_encoder = WoEEncoder(variables=[ 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12' ]) train_enc1 = woe_encoder.fit_transform(train_enc, X_sm['newlabel']) train_enc1 """# 3. Model Building # Logistic Regression
DropMissingData(), ]) def test_sklearn_compatible_imputer(estimator, check): check(estimator) # encoding @parametrize_with_checks([ CountFrequencyEncoder(ignore_format=True), DecisionTreeEncoder(regression=False, ignore_format=True), MeanEncoder(ignore_format=True), OneHotEncoder(ignore_format=True), OrdinalEncoder(ignore_format=True), RareLabelEncoder( tol=0.00000000001, n_categories=100000000000, replace_with=10, ignore_format=True, ), WoEEncoder(ignore_format=True), PRatioEncoder(ignore_format=True), ]) def test_sklearn_compatible_encoder(estimator, check): check(estimator) # outliers @parametrize_with_checks([ ArbitraryOutlierCapper(max_capping_dict={"0": 10}), OutlierTrimmer(), Winsorizer(), ])
def test_transform_raises_error_if_df_contains_na(df_enc_big, df_enc_big_na):
    """``transform`` must raise ``ValueError`` when the data contains NA.

    The encoder is built and fitted on the NA-free fixture outside the
    ``pytest.raises`` block, so only an error raised by ``transform`` itself
    can satisfy the test.
    """
    # test case 5: when dataset contains na, transform method
    encoder = RareLabelEncoder(n_categories=4)
    encoder.fit(df_enc_big)

    with pytest.raises(ValueError):
        encoder.transform(df_enc_big_na)
def test_fit_raises_error_if_df_contains_na(df_enc_big_na):
    """``fit`` must raise ``ValueError`` when the data contains NA.

    The constructor call is kept outside the ``pytest.raises`` block so a
    constructor error cannot make the test pass by accident.
    """
    # test case 4: when dataset contains na, fit method
    encoder = RareLabelEncoder(n_categories=4)

    with pytest.raises(ValueError):
        encoder.fit(df_enc_big_na)
def test_warning_if_variable_cardinality_less_than_n_categories(df_enc_big):
    """``fit`` must emit a ``UserWarning`` for low-cardinality variables.

    Only the ``fit`` call sits inside ``pytest.warns``, so the warning has
    to come from fitting rather than from constructing the encoder.
    """
    # test case 3: when the variable has low cardinality
    encoder = RareLabelEncoder(n_categories=10)

    with pytest.warns(UserWarning):
        encoder.fit(df_enc_big)
def test_error_if_n_categories_not_int():
    """Constructing the encoder with a float ``n_categories`` raises ``ValueError``."""
    invalid_n_categories = 0.5
    with pytest.raises(ValueError):
        RareLabelEncoder(n_categories=invalid_n_categories)
def test_error_if_tol_not_between_0_and_1():
    """Constructing the encoder with ``tol=5`` raises ``ValueError``."""
    out_of_range_tol = 5
    with pytest.raises(ValueError):
        RareLabelEncoder(tol=out_of_range_tol)
import logging _logger = logging.getLogger(__name__) rf_pipe = Pipeline( [ ('numeric_impute', MeanMedianImputer(imputation_method='median', variables=config.CONTINUOUS_FEATURES)), ('categorical_impute', CategoricalImputer(imputation_method='missing', variables=config.CATEGORICAL_FEATURES+ config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+ config.DISCRETE_SET3_FEATURES)), ('rare_label_encode', RareLabelEncoder(tol=0.02, n_categories=10, variables=config.CATEGORICAL_FEATURES+ config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+ config.DISCRETE_SET3_FEATURES, replace_with='Rare')), ('categorical_encode1', OrdinalEncoder(encoding_method='arbitrary', variables=config.CATEGORICAL_FEATURES+config.DISCRETE_SET2_FEATURES)), ('categorical_encode2', OrdinalEncoder(encoding_method='ordered', variables=config.DISCRETE_SET1_FEATURES)), ('categorical_encode3', CountFrequencyEncoder(encoding_method='count', variables=config.DISCRETE_SET3_FEATURES)), ('continuous_discretization', EqualFrequencyDiscretiser(q=20, variables=config.CONTINUOUS_FEATURES, return_object=True)), ('continuous_encoding', OrdinalEncoder(encoding_method='ordered', variables=config.CONTINUOUS_FEATURES)),
variables=config.model_config.finish_vars, mappings=config.model_config.finish_mappings, ), ), ( "mapper_garage", pp.Mapper( variables=config.model_config.garage_vars, mappings=config.model_config.garage_mappings, ), ), # == CATEGORICAL ENCODING ( "rare_label_encoder", RareLabelEncoder(tol=0.01, n_categories=1, variables=config.model_config.categorical_vars), ), # encode categorical variables using the target mean ( "categorical_encoder", OrdinalEncoder( encoding_method="ordered", variables=config.model_config.categorical_vars, ), ), ("scaler", MinMaxScaler()), ( "Lasso", Lasso( alpha=config.model_config.alpha,
def test_error_if_replace_with_not_string():
    """Constructing the encoder with ``replace_with=0.5`` raises ``ValueError``."""
    invalid_replacement = 0.5
    with pytest.raises(ValueError):
        RareLabelEncoder(replace_with=invalid_replacement)