def test_transform_only_selected(self):
        """Only the columns listed in ``cols`` should be target-encoded;
        the remaining features must pass through untouched."""
        x = pd.DataFrame([
            ['a', 'b', 'c'],
            ['a', 'a', 'c'],
            ['b', 'a', 'c'],
            ['b', 'c', 'b'],
            ['b', 'b', 'b'],
            ['a', 'b', 'a'],
        ],
                         columns=['f1', 'f2', 'f3'])
        y = ['bee', 'cat', 'dog', 'dog', 'dog', 'dog']
        wrapper = MultiClassWrapper(encoders.LeaveOneOutEncoder(cols=['f2']))

        # combination fit() + transform()
        wrapper.fit(x, y)
        result = wrapper.transform(x)
        self.assertEqual(
            len(result.columns), 4,
            'We expect 2 untouched features + f2 target encoded into 2 features'
        )

        # directly fit_transform()
        wrapper = MultiClassWrapper(encoders.LeaveOneOutEncoder(cols=['f2']))
        result2 = wrapper.fit_transform(x, y)
        self.assertEqual(
            len(result2.columns), 4,
            'We expect 2 untouched features + f2 target encoded into 2 features'
        )

        # In the case of leave-one-out, we expect different results, because the
        # leave-one-out principle is applied only on the training data (to
        # decrease overfitting) while the testing data use the whole statistics
        # (to be as accurate as possible).  Use assertNotEqual instead of the
        # less-diagnostic assertFalse(a == b); debug prints removed.
        self.assertNotEqual(result.iloc[0, 3], result2.iloc[0, 3])
Exemplo n.º 2
0
def leave_one_out_encoding(df, cols, handle_nan=True, target=False):
    """Leave-one-out encode the columns *cols* of *df*.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing both the features and the target column.
    cols : list
        Column names to encode.
    handle_nan : bool
        When True, unknown/missing categories are imputed with the target
        mean ('value'); when False they become NaN ('return_nan').
    target : str or False
        Name of the target column; when falsy, the last column of *df* is
        used as the target (as in the original implementation).

    Returns
    -------
    pd.DataFrame
        The encoded frame.
    """
    # Both handle_unknown and handle_missing always share the same strategy,
    # so build the encoder once instead of duplicating the constructor call.
    strategy = 'value' if handle_nan else 'return_nan'
    encoder = ce.LeaveOneOutEncoder(cols=cols,
                                    handle_unknown=strategy,
                                    handle_missing=strategy)

    # Preserve the original behavior: a named target is passed as a one-column
    # DataFrame, the fallback as a Series taken from the last column.
    y = df[[target]] if target else df[df.columns[-1]]
    return encoder.fit_transform(df, y=y)
Exemplo n.º 3
0
def fit(X, y, output_dir, **kwargs):
    """This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data.
    DataRobot runs this hook when the task is being trained inside a blueprint.
    As an output, this hook is expected to create an artifact containing a trained object [in this example - a fitted LeaveOneOutEncoder], that is then used to transform new data.
    The input parameters are passed by DataRobot based on project and blueprint configuration.

    Parameters
    -------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
    y: pd.Series
        Project's target column (None is passed for unsupervised projects).
    output_dir: str
        A path to the output folder; the artifact [in this example - the pickled encoder] must be saved into this folder to be re-used in transform().

    Returns
    -------
    None
        fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir
        so that the trained object can be used during scoring inside transform()
    """

    # Encode every column with leave-one-out target statistics.
    # (The previous comment mentioned Weight of Evidence, which was inaccurate:
    # this task fits a LeaveOneOutEncoder.)
    encoder_loo = ce.LeaveOneOutEncoder(cols=X.columns)
    encoder_loo.fit(X, y)

    # Persist the fitted encoder as loo.pkl inside output_dir so it can be
    # reloaded later to transform new data.  As in the original, this is
    # best-effort: nothing is written when output_dir is missing or not a
    # directory.  Use pathlib joining instead of string formatting.
    output_dir_path = Path(output_dir)
    if output_dir_path.exists() and output_dir_path.is_dir():
        with open(output_dir_path / "loo.pkl", "wb") as fp:
            pickle.dump(encoder_loo, fp)
Exemplo n.º 4
0
    def _fit_leave_one_out(self, df, y, target, parameter):
        """Fit a leave-one-out encoder on column *target* of *df* against the
        label column *y*, and register it in ``self.trans_ls``.

        NOTE(review): the parameter names look swapped relative to convention —
        ``target`` is the feature column being encoded while ``y`` names the
        label column; confirm against the callers.  ``parameter`` is unused.
        """
        loo_encoder = ce.LeaveOneOutEncoder()

        # to_str is applied so the encoder sees string categories.
        loo_encoder.fit(df[target].map(to_str), df[y])
        # Build output feature names like 'continuous_<col>_leave_one_out'.
        name = ['continuous_' + remove_continuous_discrete_prefix(x) + '_leave_one_out' for x in
                loo_encoder.get_feature_names()]
        self.trans_ls.append(('leave_one_out', name, target, loo_encoder))
Exemplo n.º 5
0
 def leave_one_out_encode(self, feature, verbose, drop_invariant, return_df,
                          handle_unknown, handle_missing, sigma):
     """Leave-one-out encode one feature (or all categorical features when
     ``feature == 'all'``) of ``self.data`` in place.

     The target column is first label-encoded over its non-null rows so the
     encoder receives numeric targets.  NOTE(review): fit_transform is called
     per column, so each column is a fresh fit of the same encoder object.
     """
     le = preprocessing.LabelEncoder()
     # Label-encode the non-null target values in place.
     self.data.loc[self.data[self.target_name].notnull(),
                   self.target_name] = le.fit_transform(
                       self.data.loc[self.data[self.target_name].notnull(),
                                     self.target_name])
     loue = ce.LeaveOneOutEncoder(verbose=verbose,
                                  drop_invariant=drop_invariant,
                                  return_df=return_df,
                                  handle_unknown=handle_unknown,
                                  handle_missing=handle_missing,
                                  sigma=sigma)
     if feature == 'all':
         # Encode every categorical column except the target itself,
         # restricted to each column's non-null rows.
         self.getFeatureType()
         for i in self.category_list:
             if i != self.target_name:
                 temp = self.data.loc[self.data[i].notnull(), i].index
                 self.data.loc[self.data[i].notnull(), i] = \
                     loue.fit_transform(
                     self.data.loc[self.data[i].notnull(), i],
                     self.data.loc[temp, self.target_name]
                 )
     else:
         # Single-column case: encode only the requested feature.
         # NOTE(review): the target rows here are selected by the target's
         # own notnull mask, not the feature's — verify the masks align.
         self.data.loc[
             self.data[feature].notnull(), feature] = loue.fit_transform(
                 self.data.loc[self.data[feature].notnull(), feature],
                 self.data.loc[self.data[self.target_name].notnull(),
                               self.target_name])
Exemplo n.º 6
0
 def test_leave_one_out(self):
     """Randomized leave-one-out encoding should still yield purely numeric
     output, with or without a target supplied at transform time."""
     encoder = encoders.LeaveOneOutEncoder(verbose=1, sigma=0.1, randomized=True)
     encoder.fit(X, y)
     tu.verify_numeric(encoder.transform(X_t))
     tu.verify_numeric(encoder.transform(X_t, y_t))
Exemplo n.º 7
0
def leaveoneout():
    """Fit a LeaveOneOutEncoder on the mushroom data (no target) and print
    frame summaries before and after the transformation."""
    frame, _, _ = get_mushroom_data()
    print(frame.info())
    encoder = ce.LeaveOneOutEncoder()
    encoder.fit(frame, None)
    transformed = encoder.transform(frame)
    print(transformed.info())
    # Explicitly drop the large locals, mirroring the original cleanup.
    del encoder, _, frame, transformed
Exemplo n.º 8
0
def target_encoder_loo_include_self(train_df, test_df, cols, target):
    """Encode *cols* with the per-category target mean INCLUDING the row's own
    target (transform is called without y, so nothing is left out)."""
    encoder = ce.LeaveOneOutEncoder(cols=cols)
    encoder.fit(X=train_df[cols], y=train_df[target])
    encoded_train = encoder.transform(train_df[cols])
    if test_df is None:
        return encoded_train, None
    return encoded_train, encoder.transform(test_df[cols])
Exemplo n.º 9
0
 def create_features(self, df_train, df_test):
     """Leave-one-out encode ``self.columns`` and append the encoded columns
     (suffixed ``_LeaveOneOutEncoder``) to ``self.train`` / ``self.test``."""
     enc = ce.LeaveOneOutEncoder(cols=self.columns)
     target_values = df_train[self.target_column].values.tolist()
     enc.fit(df_train[self.columns], target_values)
     train_enc = enc.transform(df_train[self.columns])
     test_enc = enc.transform(df_test[self.columns])
     suffix = '_LeaveOneOutEncoder'
     for col in train_enc.columns:
         self.train[col + suffix] = train_enc[col]
         self.test[col + suffix] = test_enc[col]
Exemplo n.º 10
0
def target_encoder_loo(df, train_df, cols, target):
    """Genuine leave-one-out encoding: fit on *train_df*, transform *df*, and
    append the encoded columns (suffixed ``_targetenc_ce_loo``) to *df*."""
    encoder = ce.LeaveOneOutEncoder(cols=cols)
    encoder.fit(X=train_df[cols], y=train_df[target])
    encoded = encoder.transform(df[cols])
    # Rename all encoded columns in a single pass so they don't clash with
    # the original columns when concatenated.
    encoded = encoded.rename(columns={col: f'{col}_targetenc_ce_loo' for col in cols})
    return pd.concat([df, encoded], axis=1)
Exemplo n.º 11
0
def leave_one_out_encoding(trainData, predictionData):
    """Leave-one-out encode country/profession/degree against total_income,
    returning the transformed train and prediction frames."""
    income_target = trainData['total_income']
    encoder = ce.LeaveOneOutEncoder(cols=['country', 'profession', 'degree'])
    encoder.fit(trainData, income_target)

    encoded_train = encoder.transform(trainData)
    encoded_prediction = encoder.transform(predictionData)
    return encoded_train, encoded_prediction
Exemplo n.º 12
0
    def test_leave_one_out_unique(self):
        """Categories seen only once must fall back to the global target mean
        rather than producing missing values."""
        X = pd.DataFrame({'col': ['1', '2', '2', '2', '3']})
        y = np.array([1, 0, 1, 0, 1])

        result = encoders.LeaveOneOutEncoder(handle_unknown='value').fit(X, y).transform(X, y)

        self.assertFalse(result.isnull().any().any(), 'There should not be any missing value')
        expected = pd.DataFrame({'col': [y.mean(), 0.5, 0, 0.5, y.mean()]})
        pd.testing.assert_frame_equal(expected, result)
Exemplo n.º 13
0
def apply_leave_one_out_encoding(df, categorical_columns, label='y'):
    """Leave-one-out encode *categorical_columns* of *df* against the *label*
    column, returning the encoded features with the label re-attached."""
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    features = df.drop([label], axis=1)
    encoder = ce.LeaveOneOutEncoder(return_df=True, cols=categorical_columns)
    encoder = encoder.fit(features, df[label])
    transformed = encoder.transform(features)
    transformed[label] = df[label]
    return transformed
Exemplo n.º 14
0
    def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self):
        """Categories unseen during fit ('c') must map to the global target mean
        when handle_unknown='value'."""
        train = pd.Series(["a", "a", "a", "b", "b", "b"], name='color')
        target = pd.Series([1.6, 0, 0, 1, 0, 1], name='target')
        test = pd.Series(['b', 'c'], name='color')
        test_target = pd.Series([0, 0])

        encoder = encoders.LeaveOneOutEncoder(cols=['color'], handle_unknown='value')
        obtained = encoder.fit(train, target).transform(test, test_target)

        self.assertEqual([1.0, .6], list(obtained['color']))
Exemplo n.º 15
0
def encode_df(X, y, cat_features, cat_encoding):
    """Target-encode *cat_features* of *X* against *y* with the strategy named
    by *cat_encoding* ('leave_one_out', 'james_stein' or 'target').

    Raises KeyError for an unknown strategy name, as before.
    """
    # Map name -> encoder class; only the requested encoder is instantiated
    # (the original eagerly constructed all three on every call).
    encoder_classes = {
        'leave_one_out': ce.LeaveOneOutEncoder,
        'james_stein': ce.JamesSteinEncoder,
        'target': ce.TargetEncoder,
    }
    encoder = encoder_classes[cat_encoding](cols=cat_features,
                                            handle_missing='return_nan')
    return encoder.fit_transform(X, y)
Exemplo n.º 16
0
def cal_loe(df_tr, col):
    """Compute the per-category mean leave-one-out encoding of ``col``.

    Returns a tuple of (Series of mean LOO values indexed by category,
    the new column name ``'loe_<col>'``).

    NOTE(review): ``feature_col`` is a module-level global (the feature list
    used to fit the encoder) and ``'isDefault'`` is assumed to be the target
    column — confirm both against the surrounding module.
    """
    enc = ce.LeaveOneOutEncoder(cols=[col]).fit(df_tr.loc[::, feature_col],
                                                df_tr.loc[::, 'isDefault'])
    # Transform with the target supplied so each row's own label is left out,
    # then pair every original category with its encoded value.
    tmp = pd.DataFrame({
        f'{col}':
        df_tr.loc[::, col],
        f'loe_{col}':
        enc.transform(df_tr.loc[::, feature_col], df_tr.loc[::,
                                                            'isDefault'])[col]
    })
    return tmp.groupby([col])[f'loe_{col}'].mean(), f'loe_{col}'
Exemplo n.º 17
0
 def test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self):
     """Refitting on new data must replace the old mapping entirely."""
     first = pd.DataFrame(data=['1', '2', '2', '2', '2', '2'], columns=['col_a'])
     second = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_b'])  # different values and name
     y_dummy = [True, False, True, False, True, False]

     encoder = encoders.LeaveOneOutEncoder()
     encoder.fit(first, y_dummy)
     encoder.fit(second, y_dummy)

     mapping = encoder.mapping
     self.assertEqual(1, len(mapping))
     self.assertIn('col_b', mapping)  # only the latest fit should survive
     expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2'],  columns=['sum', 'count'])
     np.testing.assert_equal(expected.values, mapping['col_b'].values)
Exemplo n.º 18
0
    def test_leave_one_out_values(self):
        """Verify the exact leave-one-out encoded values for a tiny frame."""
        df = pd.DataFrame({
            'color': ["a", "a", "a", "b", "b", "b"],
            'outcome': [1, 0, 0, 1, 0, 1]})

        X = df.drop('outcome', axis=1)
        y = df.drop('color', axis=1)

        ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], randomized=False)
        obtained = ce_leave.fit_transform(X, y['outcome'])

        # assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual([0.0, 0.5, 0.5, 0.5, 1.0, 0.5], list(obtained['color']))
Exemplo n.º 19
0
    def test_HandleMissingIsValueAndNanInTrain_ExpectAtValueSet(self):
        """NaN categories seen during training get their own leave-one-out
        values when handle_missing='value'."""
        df = pd.DataFrame({
            'color': [np.nan, np.nan, np.nan, "b", "b", "b"],
            'outcome': [2, 2, 0, 1, 0, 1]})
        features = df.drop('outcome', axis=1)
        target = df['outcome']

        encoder = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value')
        obtained = encoder.fit_transform(features, target)

        self.assertEqual([1, 1, 2, 0.5, 1.0, 0.5], list(obtained['color']))
Exemplo n.º 20
0
    def __init__(self, encoder_type, columns_name=None):
        """
        :param encoder_type: name of the encoding strategy, e.g. "OneHot", "LOO".
        :param columns_name: list of feature (column) names to encode.
        """
        # Dispatch table replaces the long if/elif chain; the keys (including
        # the original misspelling "OridinalEncoder") are kept for backward
        # compatibility with existing callers.
        encoder_classes = {
            "BackwardDe": ce.BackwardDifferenceEncoder,   # backward difference encoding
            "BaseN": ce.BaseNEncoder,                     # BaseN encoding
            "Binary": ce.BinaryEncoder,                   # binary encoding
            "Catboost": ce.CatBoostEncoder,
            "Hash": ce.HashingEncoder,
            "Helmert": ce.HelmertEncoder,
            "JamesStein": ce.JamesSteinEncoder,
            "LOO": ce.LeaveOneOutEncoder,                 # leave-one-out encoding
            "ME": ce.MEstimateEncoder,                    # M-estimate encoder
            "OneHot": ce.OneHotEncoder,
            "OridinalEncoder": ce.OrdinalEncoder,         # ordinal encoding
            "Sum": ce.SumEncoder,                         # sum encoding
            "Polynomial": ce.PolynomialEncoder,           # polynomial encoding
            "Target": ce.TargetEncoder,                   # target encoding
            "WOE": ce.WOEEncoder,                         # weight-of-evidence encoder
        }
        try:
            encoder_cls = encoder_classes[encoder_type]
        except KeyError:
            # Message preserved verbatim ("please choose a valid encoding type").
            raise ValueError("请选择正确的编码方式") from None
        self.encoder = encoder_cls(cols=columns_name)
Exemplo n.º 21
0
 def test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self):
     """Refitting on new data must discard the previous mapping."""
     x_a = pd.DataFrame(data=['1', '2', '2', '2', '2', '2'], columns=['col_a'])
     x_b = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_b'])  # different values and name
     y_dummy = [True, False, True, False, True, False]
     encoder = encoders.LeaveOneOutEncoder()
     encoder.fit(x_a, y_dummy)
     encoder.fit(x_b, y_dummy)
     mapping = encoder.mapping
     self.assertEqual(1, len(mapping))
     col_b_mapping = mapping[0]
     self.assertEqual('col_b', col_b_mapping['col'])  # the model must get updated
     self.assertEqual({'sum': 2.0, 'count': 3, 'mean': 2.0/3.0}, col_b_mapping['mapping']['1'])
     # Fixed the confusing leading-zero float literal (01.0 -> 1.0); same value.
     self.assertEqual({'sum': 1.0, 'count': 3, 'mean': 1.0/3.0}, col_b_mapping['mapping']['2'])
Exemplo n.º 22
0
    def test_leave_one_out(self):
        """Exercise LeaveOneOutEncoder under several configurations and check
        that the transformed output is numeric (or a raw ndarray when
        return_df=False)."""

        cols = ['C1', 'D', 'E', 'F']
        X = self.create_dataset(n_rows=1000)
        X_t = self.create_dataset(n_rows=100)
        y = np.random.randn(X.shape[0])
        y_t = np.random.randn(X_t.shape[0])

        # Configurations whose output should simply be numeric, run in the
        # same order as the original explicit sequence.
        numeric_configs = (
            dict(verbose=1, cols=cols),
            dict(verbose=1),
            dict(verbose=1, drop_invariant=True),
        )
        for kwargs in numeric_configs:
            enc = encoders.LeaveOneOutEncoder(**kwargs)
            enc.fit(X, y)
            self.verify_numeric(enc.transform(X_t))
            self.verify_numeric(enc.transform(X_t, y_t))

        # return_df=False must yield raw numpy arrays instead of DataFrames.
        enc = encoders.LeaveOneOutEncoder(verbose=1, return_df=False)
        enc.fit(X, y)
        self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))
        self.assertTrue(isinstance(enc.transform(X_t, y_t), np.ndarray))

        # The randomized variant should still be numeric.
        enc = encoders.LeaveOneOutEncoder(verbose=1, randomized=True, sigma=0.1)
        enc.fit(X, y)
        self.verify_numeric(enc.transform(X_t))
        self.verify_numeric(enc.transform(X_t, y_t))
Exemplo n.º 23
0
    def test_HandleMissingIsValueAndNanInTestAndNoTestTarget_ExpectMean(self):
        """Without a test target, NaN in the test data maps to the NaN
        category's mean while known categories use their full mean."""
        df = pd.DataFrame({
            'color': ["a", "a", "a", "b", "b", "b"],
            'outcome': [1, 0, 0, 1, 0, 1]})
        train = df.drop('outcome', axis=1)
        target = df.drop('color', axis=1)
        test = pd.Series([np.nan, 'b'], name='color')

        encoder = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value')
        encoder.fit(train, target['outcome'])
        obtained = encoder.transform(test)

        self.assertEqual([.5, 2/3.0], list(obtained['color']))
Exemplo n.º 24
0
def encode_category_variables(df, model_params):
    """Encode each categorical column of *df* with the encoder named for it in
    ``model_params['CATEGORICAL_FEATURES_DICT']``.

    NOTE(review): the same dict appears to hold both column->encoder entries
    and a 'target_y' entry naming the target column; confirm that 'target_y'
    never matches an actual df column, or it would be encoded as well.
    """
    for key, value in model_params['CATEGORICAL_FEATURES_DICT'].items():
        # Skip configured columns that are absent from this frame.
        if key not in df.columns:
            continue
        if value == 'OrdinalEncoder':
            # handle_unknown='impute' is a legacy category_encoders option
            # (newer versions expect 'value') — TODO confirm installed version.
            ce_oe = ce.OrdinalEncoder(cols=key, handle_unknown='impute')
            df = ce_oe.fit_transform(df)
        elif value == 'OneHotEncoder':
            ce_ohe = ce.OneHotEncoder(cols=key, handle_unknown='impute')
            df = ce_ohe.fit_transform(df)
        elif value == 'LeaveOneOutEncoder':
            # Supervised encoder: uses the configured target column as y.
            ce_looe = ce.LeaveOneOutEncoder(cols=key, handle_unknown='impute')
            df = ce_looe.fit_transform(df, y=df[model_params['CATEGORICAL_FEATURES_DICT']['target_y']])
    return df
Exemplo n.º 25
0
    def test_leave_one_out_np(self):
        """The encoder must accept plain numpy arrays as inputs and still
        produce purely numeric output."""

        X = self.create_array(n_rows=1000)
        X_t = self.create_array(n_rows=100)
        y = np.random.randn(X.shape[0])
        y_t = np.random.randn(X_t.shape[0])

        enc = encoders.LeaveOneOutEncoder(verbose=1)
        enc.fit(X, y)
        for transform_args in ((X_t,), (X_t, y_t)):
            self.verify_numeric(enc.transform(*transform_args))
Exemplo n.º 26
0
 def LeaveOneOutEncoderMethod(self, configFile, data):
     """Leave-one-out encode every feature column of the train/test (and
     optional test_out) splits.

     ``configFile`` is accepted for interface parity with sibling methods but
     is not used here.
     """
     import category_encoders as ce
     encoder = ce.LeaveOneOutEncoder(cols=data['train']['x'].columns)
     encoder.fit(data['train']['x'], data['train']['y'])

     splits = ['train', 'test']
     if 'test_out' in data:
         splits.append('test_out')

     data_dict = {}
     for split in splits:
         data_dict[split] = {
             'x': pd.DataFrame(encoder.transform(data[split]['x'])),
             'y': data[split]['y'],
         }
     return data_dict
    'nursery.arff', 'postoperative.patient.data.arff', 'primary.tumor.arff',
    'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff',
    'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff',
    'vote.arff', 'vowel.arff'
]

# We ignore encoders {BackwardDifferenceEncoder, HelmertEncoder, PolynomialEncoder and SumEncoder} because of:
#   https://github.com/scikit-learn-contrib/categorical-encoding/issues/91
# Encoders under benchmark, all with default settings.
encoders = [
    category_encoders.BaseNEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    category_encoders.OrdinalEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.WOEEncoder()
]

# Initialization: remove any previous run's results so the benchmark starts clean.
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Silence library warnings to keep the benchmark output readable.
warnings.filterwarnings('ignore')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
    non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values)
    for encoder in encoders:
Exemplo n.º 28
0
def pipeline(df, target, cat_columns, models):
    """Benchmark several categorical encoders on *df* for each model in
    *models*, recording rmse/mae/fit-time (and their change vs. a baseline
    run without encoding) per (model, column, encoder) combination.

    Returns a pd.DataFrame with one row per recorded run.

    NOTE(review): ``model``, ``_append_metric``, ``SumEncoder``,
    ``BayesianTargetEncoder`` and the ``*Encoding*`` callables are defined
    elsewhere in this module — their exact contracts are assumed here.
    """
    n_rows, n_cols = df.shape
    # Column-oriented accumulator; one entry appended to every list per run.
    metrics = {
        "n_rows": [],
        "n_cols": [],
        "cardinality": [],
        "model": [],
        "column": [],
        "encoder": [],
        "rmse": [],
        "mae": [],
        "fit_time": [],
        "rmse_change": [],
        "mae_change": [],
        "fit_time_change": [],
    }
    columns = cat_columns

    for model_name in models:

        # Baseline run: no encoder, no column — later rows report change
        # relative to these numbers.
        base_rmse, base_mae, base_fit_time = model(
            df=df,
            target=target,
            encoder=np.nan,
            col=np.nan,
            model_name=model_name,
            encoder_type="basic",
            encoder_name=[],
        )

        _append_metric(
            row_list=metrics,
            n_rows=n_rows,
            n_cols=n_cols,
            cardinality=np.nan,
            model_name=model_name,
            column=np.nan,
            name="basic",
            rmse=base_rmse,
            mae=base_mae,
            fit_time=base_fit_time,
            base_rmse=base_rmse,
            base_mae=base_mae,
            base_fit_time=base_fit_time,
        )

        for column in columns:
            print()
            print(column)
            cardinality = df[column].nunique()

            # Pandas get_dummies one-hot baseline for this column.
            print("ohe")
            rmse, mae, fit_time = model(
                df=df,
                target=target,
                encoder=np.nan,
                col=column,
                model_name=model_name,
                encoder_type="basic",
                encoder_name="One Hot Encoder (pd.dummies)",
            )
            _append_metric(
                row_list=metrics,
                n_rows=n_rows,
                n_cols=n_cols,
                cardinality=cardinality,
                model_name=model_name,
                column=column,
                name="One Hot Encoder (pd.dummies)",
                rmse=rmse,
                mae=mae,
                fit_time=fit_time,
                base_rmse=base_rmse,
                base_mae=base_mae,
                base_fit_time=base_fit_time,
            )

            # sklearn-style encoders, each scoped to the current column.
            encoders = [
                ("Sum Encoder(sleepmind)", SumEncoder()),
                ("BinaryEncoder", ce.BinaryEncoder(cols=[column])),
                ("HashingEncoder", ce.HashingEncoder(cols=[column])),
                ("OneHotEncoder", ce.OneHotEncoder(cols=[column])),
                ("OrdinalEncoder", ce.OrdinalEncoder(cols=[column])),
                ("BaseNEncoder", ce.BaseNEncoder(cols=[column])),
                (
                    "BackwardDifferenceEncoder",
                    ce.BackwardDifferenceEncoder(cols=[column]),
                ),
                ("HelmertEncoder", ce.HelmertEncoder(cols=[column])),
                ("SumEncoder", ce.SumEncoder(cols=[column])),
                ("PolynomialEncoder", ce.PolynomialEncoder(cols=[column])),
                ("TargetEncoder", ce.TargetEncoder(cols=[column])),
                ("LeaveOneOutEncoder", ce.LeaveOneOutEncoder(cols=[column])),
                (
                    "XAM_bayesian_targetEncoder",
                    BayesianTargetEncoder(columns=[column],
                                          prior_weight=3,
                                          suffix=""),
                ),
            ]

            for name, encoder in encoders:
                print(name)
                rmse, mae, fit_time = model(
                    df=df,
                    target=target,
                    encoder=encoder,
                    col=column,
                    model_name=model_name,
                    encoder_type="sklearn_encoding",
                    encoder_name=name,
                )
                _append_metric(
                    row_list=metrics,
                    n_rows=n_rows,
                    n_cols=n_cols,
                    cardinality=cardinality,
                    model_name=model_name,
                    column=column,
                    name=name,
                    rmse=rmse,
                    mae=mae,
                    fit_time=fit_time,
                    base_rmse=base_rmse,
                    base_mae=base_mae,
                    base_fit_time=base_fit_time,
                )

            # Hand-rolled bayesian / leave-one-out encoding callables, passed
            # through the same model() harness with hcc_ind=1.
            bayes_encoders = [
                ("hcc_BayesEncoding", BayesEncoding),
                ("hcc_BayesEncodingKfold", BayesEncodingKfold),
                ("LOOEncoding", LOOEncoding),
                ("LOOEncodingKfold", LOOEncodingKfold),
            ]
            for name, bayes_encoder in bayes_encoders:
                print(name)
                rmse, mae, fit_time = model(
                    df=df,
                    target=target,
                    encoder=bayes_encoder,
                    col=column,
                    model_name=model_name,
                    encoder_name=name,
                    encoder_type="basic",
                    hcc_ind=1,
                )
                _append_metric(
                    row_list=metrics,
                    n_rows=n_rows,
                    n_cols=n_cols,
                    cardinality=cardinality,
                    model_name=model_name,
                    column=column,
                    name=name,
                    rmse=rmse,
                    mae=mae,
                    fit_time=fit_time,
                    base_rmse=base_rmse,
                    base_mae=base_mae,
                    base_fit_time=base_fit_time,
                )
    results = pd.DataFrame(metrics)
    return results
Exemplo n.º 29
0
            'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff',
            'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff',
            'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff']

# datasets = ['carvana.csv', 'erasmus.csv', 'internetusage.csv', 'ipumsla97small.csv', 'kobe.csv', 'pbcseq.csv', 'phpvcoG8S.csv', 'westnile.csv'] # amazon is too large...


# We painstakingly initialize each encoder here because that gives us the freedom to initialize the
# encoders with any setting we want.
# Contrast-coding encoders (BackwardDifference, Helmert, Polynomial, Sum) stay commented out.
encoders = [ #category_encoders.BackwardDifferenceEncoder(),
             category_encoders.BaseNEncoder(),
             category_encoders.BinaryEncoder(),
             category_encoders.HashingEncoder(),
             # category_encoders.HelmertEncoder(),
             category_encoders.JamesSteinEncoder(),
             category_encoders.LeaveOneOutEncoder(),
             category_encoders.MEstimateEncoder(),
             category_encoders.OneHotEncoder(),
             category_encoders.OrdinalEncoder(),
             # category_encoders.PolynomialEncoder(),
             # category_encoders.SumEncoder(),
             category_encoders.TargetEncoder(),
             category_encoders.WOEEncoder()]

encoders = [ #category_encoders.BackwardDifferenceEncoder(),
             category_encoders.BaseNEncoder(handle_missing='value'),
             category_encoders.BaseNEncoder(handle_missing='indicator'),
             category_encoders.BinaryEncoder(handle_missing='value'),
category_encoders.BinaryEncoder(handle_missing='indicator'),
#              category_encoders.HashingEncoder(handle_missing='value'),
# category_encoders.HashingEncoder(handle_missing='indicator'),
Exemplo n.º 30
0
    def preprocess_data(self,
                        data: pd.DataFrame,
                        stage: str = "inference") -> Tuple[pd.DataFrame, list]:
        """The preprocessing, like Categorical Encoding, Normalization, etc. which any dataframe should undergo before feeding into the dataloader

        Args:
            data (pd.DataFrame): A dataframe with the features and target
            stage (str, optional): Internal parameter. Used to distinguish between fit and inference. Defaults to "inference".

        Returns:
            tuple[pd.DataFrame, list]: Returns the processed dataframe and the added features(list) as a tuple
        """
        logger.info(f"Preprocessing data: Stage: {stage}...")
        added_features = None
        # Optionally expand configured date columns into date-part features.
        if self.config.encode_date_columns:
            for field_name, freq in self.config.date_columns:
                data = self.make_date(data, field_name)
                data, added_features = self.add_datepart(data,
                                                         field_name,
                                                         frequency=freq,
                                                         prefix=None,
                                                         drop=True)
        # The only features that are added are the date features extracted
        # from the date which are categorical in nature
        if (added_features is not None) and (stage == "fit"):
            logger.debug(
                f"Added {added_features} features after encoding the date_columns"
            )
            self.config.categorical_cols += added_features
            self.config.categorical_dim = (len(self.config.categorical_cols)
                                           if self.config.categorical_cols
                                           is not None else 0)
        # Encoding Categorical Columns: fit the encoder during "fit",
        # reuse the already-fitted one otherwise.
        if len(self.config.categorical_cols) > 0:
            if stage == "fit":
                if self.do_leave_one_out_encoder():
                    logger.debug(
                        "Encoding Categorical Columns using LeavOneOutEncoder")
                    self.categorical_encoder = ce.LeaveOneOutEncoder(
                        cols=self.config.categorical_cols, random_state=42)
                    # Multi-Target Regression uses the first target to encode the categorical columns
                    if len(self.config.target) > 1:
                        logger.warning(
                            f"Multi-Target Regression: using the first target({self.config.target[0]}) to encode the categorical columns"
                        )
                    data = self.categorical_encoder.fit_transform(
                        data, data[self.config.target[0]])
                else:
                    logger.debug(
                        "Encoding Categorical Columns using OrdinalEncoder")
                    self.categorical_encoder = OrdinalEncoder(
                        cols=self.config.categorical_cols)
                    data = self.categorical_encoder.fit_transform(data)
            else:
                data = self.categorical_encoder.transform(data)

        # Transforming Continuous Columns with the configured transform
        # (looked up in CONTINUOUS_TRANSFORMS, fitted only at "fit" time).
        if (self.config.continuous_feature_transform
                is not None) and (len(self.config.continuous_cols) > 0):
            if stage == "fit":
                transform = self.CONTINUOUS_TRANSFORMS[
                    self.config.continuous_feature_transform]
                self.continuous_transform = transform["callable"](
                    **transform["params"])
                # TODO implement quantile noise
                data.loc[:, self.config.
                         continuous_cols] = self.continuous_transform.fit_transform(
                             data.loc[:, self.config.continuous_cols])
            else:
                data.loc[:, self.config.
                         continuous_cols] = self.continuous_transform.transform(
                             data.loc[:, self.config.continuous_cols])

        # Normalizing Continuous Columns with a StandardScaler fitted at "fit".
        if (self.config.normalize_continuous_features) and (len(
                self.config.continuous_cols) > 0):
            if stage == "fit":
                self.scaler = StandardScaler()
                data.loc[:, self.config.
                         continuous_cols] = self.scaler.fit_transform(
                             data.loc[:, self.config.continuous_cols])
            else:
                data.loc[:,
                         self.config.continuous_cols] = self.scaler.transform(
                             data.loc[:, self.config.continuous_cols])

        # Converting target labels to a 0 indexed label (classification only);
        # at inference the target column may legitimately be absent.
        if self.config.task == "classification":
            if stage == "fit":
                self.label_encoder = LabelEncoder()
                data[self.config.target[0]] = self.label_encoder.fit_transform(
                    data[self.config.target[0]])
            else:
                if self.config.target[0] in data.columns:
                    data[self.config.target[0]] = self.label_encoder.transform(
                        data[self.config.target[0]])
        # Target Transforms: applied only when every target column is present,
        # one deep-copied transform instance fitted per target column.
        if all([col in data.columns for col in self.config.target]):
            if self.do_target_transform:
                target_transforms = []
                for col in self.config.target:
                    _target_transform = copy.deepcopy(
                        self.target_transform_template)
                    data[col] = _target_transform.fit_transform(
                        data[col].values.reshape(-1, 1))
                    target_transforms.append(_target_transform)
                self.target_transforms = target_transforms
        return data, added_features