Example #1
def clean_data(X):
    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)
    X['v22'] = X['v22'].apply(az_to_int)
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()
    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")
    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])
    # more cleaning
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, Y = undersampler.fit_resample(X, y)
    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")
    # Remove duplicated features
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")
    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlared features {drop_corr.features_to_drop_}")
    X['target'] = Y
    return X
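The function above relies on transformers from several libraries that are not imported in the snippet, and on az_to_int, which is a project-specific helper rather than a library function. A plausible set of imports for it (an assumption, not part of the original example) would be:

# Likely imports for the clean_data snippet above (assumed; az_to_int is a
# user-defined helper and therefore not importable from any library).
from feature_engine.encoding import CountFrequencyEncoder, RareLabelEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.outliers import Winsorizer
from feature_engine.selection import (
    DropConstantFeatures,
    DropCorrelatedFeatures,
    DropDuplicateFeatures,
)
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer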
Example #2
def test_user_provides_grouping_label_name_and_variable_list(df_enc_big):
    # test case 2: user provides alternative grouping value and variable list
    encoder = RareLabelEncoder(tol=0.15,
                               n_categories=5,
                               variables=["var_A", "var_B"],
                               replace_with="Other")
    X = encoder.fit_transform(df_enc_big)

    # expected output
    df = {
        "var_A": ["A"] * 6 + ["B"] * 10 + ["Other"] * 4 + ["D"] * 10 +
        ["Other"] * 4 + ["G"] * 6,
        "var_B": ["A"] * 10 + ["B"] * 6 + ["Other"] * 4 + ["D"] * 10 +
        ["Other"] * 4 + ["G"] * 6,
        "var_C": ["A"] * 4 + ["B"] * 6 + ["C"] * 10 + ["D"] * 10 + ["E"] * 2 +
        ["F"] * 2 + ["G"] * 6,
    }
    df = pd.DataFrame(df)

    # test init params
    assert encoder.tol == 0.15
    assert encoder.n_categories == 5
    assert encoder.replace_with == "Other"
    assert encoder.variables == ["var_A", "var_B"]
    # test fit attr
    assert encoder.variables_ == ["var_A", "var_B"]
    assert encoder.n_features_in_ == 3
    # test transform output
    pd.testing.assert_frame_equal(X, df)
Example #3
def test_defo_params_plus_automatically_find_variables(df_enc_big):
    # test case 1: default params, automatically select variables
    encoder = RareLabelEncoder(tol=0.06,
                               n_categories=5,
                               variables=None,
                               replace_with="Rare")
    X = encoder.fit_transform(df_enc_big)

    # expected output
    df = {
        "var_A": ["A"] * 6 + ["B"] * 10 + ["C"] * 4 + ["D"] * 10 +
        ["Rare"] * 4 + ["G"] * 6,
        "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4 + ["D"] * 10 +
        ["Rare"] * 4 + ["G"] * 6,
        "var_C": ["A"] * 4 + ["B"] * 6 + ["C"] * 10 + ["D"] * 10 +
        ["Rare"] * 4 + ["G"] * 6,
    }
    df = pd.DataFrame(df)

    # test init params
    assert encoder.tol == 0.06
    assert encoder.n_categories == 5
    assert encoder.replace_with == "Rare"
    assert encoder.variables is None
    # test fit attr
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.n_features_in_ == 3
    # test transform output
    pd.testing.assert_frame_equal(X, df)
Example #4
def test_variables_cast_as_category(df_enc_big):
    # test case 1: default params, automatically select variables (var_B cast as category dtype)
    encoder = RareLabelEncoder(tol=0.06,
                               n_categories=5,
                               variables=None,
                               replace_with="Rare")

    df_enc_big = df_enc_big.copy()
    df_enc_big["var_B"] = df_enc_big["var_B"].astype("category")

    X = encoder.fit_transform(df_enc_big)

    # expected output
    df = {
        "var_A": ["A"] * 6 + ["B"] * 10 + ["C"] * 4 + ["D"] * 10 +
        ["Rare"] * 4 + ["G"] * 6,
        "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4 + ["D"] * 10 +
        ["Rare"] * 4 + ["G"] * 6,
        "var_C": ["A"] * 4 + ["B"] * 6 + ["C"] * 10 + ["D"] * 10 +
        ["Rare"] * 4 + ["G"] * 6,
    }
    df = pd.DataFrame(df)

    # test fit attr
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.n_features_in_ == 3
    # test transform output
    pd.testing.assert_frame_equal(X, df)
Example #5
def encode_rare_labels(var_list, train, test, val=None, tol=0.05, 
                       file_path='../models/transformers/rare_enc/', 
                       file_name='rare_enc', file_suffix=''):
    
    """
    Encode rare labels of categorical features in the training set, test set, 
    and optionally the validation set. In the specified features, if the 
    proportion of any label in all observations is less than the `tol` 
    threshold, then it is replaced with the label "Rare" (the encoder's default). This function uses 
    feature_engine's RareLabelEncoder to encode the rare labels. The encoder 
    will be saved to the specified path.
    
    Parameters
    ----------
    var_list : list[str]
        Categorical features to encode
    train : pandas.core.frame.DataFrame
        Training data
    test : pandas.core.frame.DataFrame
        Test data
    val : pandas.core.frame.DataFrame, optional
        Validation data, by default None
    tol : float, optional
        Frequency threshold at which to consider a label rare, by default 0.05
    file_path : str, optional
        Output directory path, by default "../models/transformers/rare_enc/"
    file_name : str, optional
        Output file name, by default "rare_enc"
    file_suffix : str, optional
        File name suffix that goes before the file extension, by default an 
        empty string
    
    Returns
    -------
    pandas.core.frame.DataFrame
        Transformed train set
    pandas.core.frame.DataFrame
        Transformed validation set
    pandas.core.frame.DataFrame
        Transformed test set
    dict
        The fitted encoder's `encoder_dict_`: the frequent (non-grouped) categories per variable
    """
    
    enc = RareLabelEncoder(tol=tol, variables=var_list).fit(train)
    joblib.dump(enc, os.path.join(file_path, file_name + file_suffix + '.pkl'))
    train = enc.transform(train)
    test = enc.transform(test)
    if val is not None:
        val = enc.transform(val)
    return train, val, test, enc.encoder_dict_
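A minimal usage sketch for this helper, assuming its own dependencies (RareLabelEncoder, joblib, os) are importable in its module; the colour column and the ./tmp_enc/ output directory are made up for illustration:

import os

import pandas as pd

os.makedirs('./tmp_enc/', exist_ok=True)  # hypothetical output directory

# ten frequent labels plus two labels whose proportion falls below tol=0.05
train = pd.DataFrame({'colour': [f'c{i}' for i in range(10)] * 10 + ['x', 'y']})
test = pd.DataFrame({'colour': ['c0', 'c1', 'x']})

train_enc, val_enc, test_enc, mapping = encode_rare_labels(
    ['colour'], train, test, tol=0.05, file_path='./tmp_enc/'
)
print(mapping)   # per-variable list of the frequent categories that were kept
print(test_enc)  # 'x' is grouped into the encoder's default "Rare" label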
Example #6
def test_max_n_categories(df_enc_big):
    # test case 6: user provides the maximum number of categories they want
    rare_encoder = RareLabelEncoder(tol=0.10,
                                    max_n_categories=4,
                                    n_categories=5)
    X = rare_encoder.fit_transform(df_enc_big)
    df = {
        "var_A": ["A"] * 6 + ["B"] * 10 + ["Rare"] * 4 + ["D"] * 10 +
        ["Rare"] * 4 + ["G"] * 6,
        "var_B": ["A"] * 10 + ["B"] * 6 + ["Rare"] * 4 + ["D"] * 10 +
        ["Rare"] * 4 + ["G"] * 6,
        "var_C": ["Rare"] * 4 + ["B"] * 6 + ["C"] * 10 + ["D"] * 10 +
        ["Rare"] * 4 + ["G"] * 6,
    }
    df = pd.DataFrame(df)
    pd.testing.assert_frame_equal(X, df)
Example #7
def test_max_n_categories_with_numeric_var(df_enc_numeric):
    # ignore_format=True
    rare_encoder = RareLabelEncoder(tol=0.10,
                                    max_n_categories=2,
                                    n_categories=1,
                                    ignore_format=True)

    X = rare_encoder.fit_transform(df_enc_numeric[["var_A", "var_B"]])

    df = df_enc_numeric[["var_A", "var_B"]].copy()
    df.replace({3: "Rare"}, inplace=True)

    # workaround: a plain pd.testing.assert_frame_equal reported that two
    # visually identical columns were not equal, most likely because the
    # element types of the numbers differed (e.g. numpy vs. built-in integers).
    for i in range(len(df)):
        assert str(list(X["var_A"])[i]) == str(list(df["var_A"])[i])
        assert str(list(X["var_B"])[i]) == str(list(df["var_B"])[i])
Example #8
    def transform(self, X, y=None):
        pd.options.mode.chained_assignment = None  # default='warn' - silence SettingWithCopyWarning while columns are overwritten
        for category in self.categories:
            x = X[category].copy()  # work on a copy of the analyzed feature column
            idx_nan = x.loc[pd.isnull(x)].index  # index of the NaN values in the analyzed feature column

            # replace missing values
            x[idx_nan] = 'MISS'
            encoder = RareLabelEncoder(tol=self.tol, n_categories=self.n_categories,
                                       max_n_categories=self.max_n_categories,
                                       replace_with=self.replace_with)

            x = x.to_frame(name=category)  # convert the pd.Series to a one-column DataFrame
            x = encoder.fit_transform(x)
            X[category] = x
            if not self.impute_missing_label:
                # restore the original missing values instead of keeping the 'MISS' placeholder
                X.loc[idx_nan, category] = np.nan
        pd.options.mode.chained_assignment = 'warn'  # restore the default SettingWithCopyWarning behaviour
        return X
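The same pattern can be sketched as a standalone helper outside the class, for readers who only need the missing-value handling around RareLabelEncoder; the function name and defaults below are made up for illustration:

import numpy as np
import pandas as pd
from feature_engine.encoding import RareLabelEncoder


def rare_encode_with_missing(X: pd.DataFrame, column: str, tol: float = 0.05,
                             n_categories: int = 10, keep_missing: bool = True) -> pd.DataFrame:
    """Hypothetical standalone version of the transform above: temporarily fill
    NaN with a 'MISS' placeholder so RareLabelEncoder can fit on the object
    column, then optionally restore the original missing values afterwards."""
    X = X.copy()
    idx_nan = X.index[X[column].isna()]
    X[column] = X[column].fillna('MISS')
    encoder = RareLabelEncoder(tol=tol, n_categories=n_categories)
    X[[column]] = encoder.fit_transform(X[[column]])
    if keep_missing:
        X.loc[idx_nan, column] = np.nan
    return X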
Example #9
def create_pipeline(params: dict = None):
    """
    Create sklearn.pipeline.Pipeline

    Parameters
    ----------
    params : dict
        dictionary of parameters for the pipeline

    Returns
    -------
    sklearn.pipeline.Pipeline
    """

    # pipeline for numeric variables
    p_num = Pipeline([("num_nan_ind", AddMissingIndicator(missing_only=True)),
                      ("rmmean", MeanMedianImputer()),
                      ("drop_quasi_constant", DropConstantFeatures(tol=0.97))])

    # pipeline for categorical variables
    p_cat = Pipeline([("fill_cat_nas",
                       CategoricalImputer(fill_value='MISSING')),
                      ("rlc", RareLabelEncoder()),
                      ("one_hot_encoder", OneHotEncoder())])

    # list of pipelines to combine
    transformers = [("num", p_num,
                     make_column_selector(dtype_include=np.number)),
                    ("cat", p_cat, make_column_selector(dtype_include=object))]

    # combine pipelines and add XGBClassifier
    col_transforms = ColumnTransformer(transformers)
    p = Pipeline([("col_transformers", col_transforms),
                  ("xgb",
                   XGBClassifier(min_child_weight=1,
                                 gamma=0,
                                 objective='binary:logistic',
                                 nthread=4,
                                 scale_pos_weight=1,
                                 seed=1,
                                 gpu_id=0,
                                 tree_method='gpu_hist'))])

    if params:
        p.set_params(**params)
    return p
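Because every step in the pipeline is named, the params dict can override hyperparameters with sklearn's step__parameter syntax, including steps nested inside the ColumnTransformer; the values below are purely illustrative:

# Illustrative only: override XGBoost settings and the rare-label threshold.
pipeline = create_pipeline({
    "xgb__n_estimators": 300,
    "xgb__max_depth": 4,
    "col_transformers__cat__rlc__tol": 0.02,
})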
Example #10
X_sm_c1

# 2. rare label + weight of evidence (WOE) encoding:
# as the categorical features are hashed, we do not know whether they are ordinal,
# so to avoid imposing a ranking on these features we apply rare label + WOE encoding to the hashed C features
# (this is for the Logistic Regression model only, as ordinality is not a problem for tree-based models)

# rare label encoding:
# we set the threshold to 0.1:
# categories with a proportion lower than 0.1 may not contain any observations with class label 1
# due to the label imbalance, and that would impede the WOE encoding (log 0 is undefined)

encoder = RareLabelEncoder(tol=0.1,
                           n_categories=2,
                           variables=[
                               'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8',
                               'C9', 'C10', 'C11', 'C12'
                           ],
                           replace_with='Rare')
train_enc = encoder.fit_transform(X_sm_c)

#WOE encoding:
woe_encoder = WoEEncoder(variables=[
    'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12'
])
train_enc1 = woe_encoder.fit_transform(train_enc, X_sm['newlabel'])

train_enc1
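To make the "log 0" remark concrete: the WOE of a category is ln(P(category|y=1) / P(category|y=0)), so a level that never co-occurs with the positive class produces log(0); grouping infrequent levels into 'Rare' first is what prevents that. A toy sketch with a made-up cat column:

import numpy as np
import pandas as pd

# Toy data (illustrative only): the 'rarecat' level never occurs with class 1.
X_toy = pd.DataFrame({"cat": ["a"] * 40 + ["b"] * 40 + ["rarecat"] * 2})
y_toy = pd.Series([1, 0] * 40 + [0, 0])

# WOE per category: ln( P(category | y=1) / P(category | y=0) )
neg = X_toy["cat"][y_toy == 0].value_counts(normalize=True)
pos = X_toy["cat"][y_toy == 1].value_counts(normalize=True).reindex(neg.index, fill_value=0)
print(np.log(pos / neg))  # 'rarecat' has zero positives -> log(0) = -inf

# Grouping infrequent levels into a shared 'Rare' bucket first (the
# RareLabelEncoder step above) makes a zero numerator far less likely.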
"""# 3. Model Building

# Logistic Regression
Example #11
# imputation
@parametrize_with_checks([
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    check(estimator)


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    RareLabelEncoder(
        tol=0.00000000001,
        n_categories=100000000000,
        replace_with=10,
        ignore_format=True,
    ),
    WoEEncoder(ignore_format=True),
    PRatioEncoder(ignore_format=True),
])
def test_sklearn_compatible_encoder(estimator, check):
    check(estimator)


# outliers
@parametrize_with_checks([
    ArbitraryOutlierCapper(max_capping_dict={"0": 10}),
    OutlierTrimmer(),
    Winsorizer(),
])
Example #12
def test_transform_raises_error_if_df_contains_na(df_enc_big, df_enc_big_na):
    # test case 5: when dataset contains na, transform method
    with pytest.raises(ValueError):
        encoder = RareLabelEncoder(n_categories=4)
        encoder.fit(df_enc_big)
        encoder.transform(df_enc_big_na)
Example #13
def test_fit_raises_error_if_df_contains_na(df_enc_big_na):
    # test case 4: when dataset contains na, fit method
    with pytest.raises(ValueError):
        encoder = RareLabelEncoder(n_categories=4)
        encoder.fit(df_enc_big_na)
Example #14
def test_warning_if_variable_cardinality_less_than_n_categories(df_enc_big):
    # test case 3: when the variable has low cardinality
    with pytest.warns(UserWarning):
        encoder = RareLabelEncoder(n_categories=10)
        encoder.fit(df_enc_big)
Example #15
def test_error_if_n_categories_not_int():
    with pytest.raises(ValueError):
        RareLabelEncoder(n_categories=0.5)
Example #16
def test_error_if_tol_not_between_0_and_1():
    with pytest.raises(ValueError):
        RareLabelEncoder(tol=5)
Example #17
import logging

_logger = logging.getLogger(__name__)

rf_pipe = Pipeline(
[
    ('numeric_impute', MeanMedianImputer(imputation_method='median', variables=config.CONTINUOUS_FEATURES)),
    
    ('categorical_impute', CategoricalImputer(imputation_method='missing', 
                                              variables=config.CATEGORICAL_FEATURES+
                                              config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+
                                              config.DISCRETE_SET3_FEATURES)),
    
    ('rare_label_encode', RareLabelEncoder(tol=0.02, n_categories=10,
                                           variables=config.CATEGORICAL_FEATURES+
                                              config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+
                                              config.DISCRETE_SET3_FEATURES,
                                            replace_with='Rare')),
    
    ('categorical_encode1', OrdinalEncoder(encoding_method='arbitrary', 
                                          variables=config.CATEGORICAL_FEATURES+config.DISCRETE_SET2_FEATURES)),
    
    ('categorical_encode2', OrdinalEncoder(encoding_method='ordered', 
                                          variables=config.DISCRETE_SET1_FEATURES)),
    
    ('categorical_encode3', CountFrequencyEncoder(encoding_method='count',
                                          variables=config.DISCRETE_SET3_FEATURES)),
    
    ('continuous_discretization', EqualFrequencyDiscretiser(q=20, variables=config.CONTINUOUS_FEATURES, return_object=True)),
    
    ('continuous_encoding', OrdinalEncoder(encoding_method='ordered', variables=config.CONTINUOUS_FEATURES)),
Example #18
         variables=config.model_config.finish_vars,
         mappings=config.model_config.finish_mappings,
     ),
 ),
 (
     "mapper_garage",
     pp.Mapper(
         variables=config.model_config.garage_vars,
         mappings=config.model_config.garage_mappings,
     ),
 ),
 # == CATEGORICAL ENCODING
 (
     "rare_label_encoder",
     RareLabelEncoder(tol=0.01,
                      n_categories=1,
                      variables=config.model_config.categorical_vars),
 ),
 # encode categorical variables as integers, ordered by the target mean of each category
 (
     "categorical_encoder",
     OrdinalEncoder(
         encoding_method="ordered",
         variables=config.model_config.categorical_vars,
     ),
 ),
 ("scaler", MinMaxScaler()),
 (
     "Lasso",
     Lasso(
         alpha=config.model_config.alpha,
Example #19
def test_error_if_replace_with_not_string():
    with pytest.raises(ValueError):
        RareLabelEncoder(replace_with=0.5)