def count_encoder(X_train, Y_train, X_val, Y_val, target_col: str, cat_features=None, features=None):
    """
    Count_Encoding: カテゴリ列をカウント値に変換する特徴量エンジニアリング(要はgroupby().size()の集計列追加のこと)
    ※カウント数が同じカテゴリは同じようなデータ傾向になる可能性がある
    https://www.kaggle.com/matleonard/categorical-encodings
    """
    X_train = pd.DataFrame(X_train, columns=features)
    Y_train = pd.DataFrame(Y_train, columns=[target_col])
    X_val = pd.DataFrame(X_val, columns=features)
    Y_val = pd.DataFrame(Y_val, columns=[target_col])
    
    train_df = X_train.join(Y_train)
    valid_df = X_val.join(Y_val)
    
    count_enc = ce.CountEncoder(cols=cat_features)

    # Fit on the training set only (fitting on validation or test data leaks information)
    count_enc.fit(train_df[cat_features])
    train_encoded = train_df.join(
        count_enc.transform(train_df[cat_features]).add_suffix("_count")
    )
    valid_encoded = valid_df.join(
        count_enc.transform(valid_df[cat_features]).add_suffix("_count")
    )

    features = train_encoded.drop(target_col, axis=1).columns.to_list()
    
    #return train_encoded, valid_encoded
    return train_encoded.drop(target_col, axis=1), valid_encoded.drop(target_col, axis=1), features
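For reference, a minimal sketch of the raw transformation this function wraps (toy data of my own; assumes pandas and category_encoders are installed):

import pandas as pd
import category_encoders as ce

toy = pd.DataFrame({"color": ["red", "red", "blue", "green", "red"]})
enc = ce.CountEncoder(cols=["color"]).fit(toy)
# Each category value is replaced by its frequency in the fitted data
print(enc.transform(toy)["color"].tolist())  # [3, 3, 1, 1, 3]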
Example #2
    def count_encoder(train_df, valid_df, cat_features=None):
        """
        Count_Encoding: feature engineering that converts categorical columns to their count values
        (essentially the same as adding a groupby().size() aggregate column).
        Note: categories with the same count may end up showing similar data tendencies.
        https://www.kaggle.com/matleonard/categorical-encodings
        """
        # conda install -c conda-forge category_encoders
        import category_encoders as ce

        if cat_features is None:
            cat_features = train_df.select_dtypes(
                include=["object", "category", "bool"]
            ).columns.to_list()

        count_enc = ce.CountEncoder(cols=cat_features)

        # Fit on the training set only (fitting on validation or test data leaks information)
        count_enc.fit(train_df[cat_features])
        train_encoded = train_df.join(
            count_enc.transform(train_df[cat_features]).add_suffix("_count")
        )
        valid_encoded = valid_df.join(
            count_enc.transform(valid_df[cat_features]).add_suffix("_count")
        )

        return train_encoded, valid_encoded
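Example #3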
 def CountEncoding(df, _col):
     _df = df[[_col]]
     import pandas as pd
     import category_encoders as ce
     # count-encode the selected column
     enc = ce.CountEncoder(cols=[_col]).fit(_df)
     # transform the dataset
     numeric_dataset = enc.transform(_df)
     return numeric_dataset
Example #4
def get_encoder(encoder_name):
    """
    Returns an encoder object given the name of the encoder
    """
    if encoder_name == 'LabelEncoder':
        return LabelEncoder()
    elif encoder_name == 'CountEncoder':
        return ce.CountEncoder()
    else:
        return None
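A hypothetical call, assuming LabelEncoder comes from sklearn.preprocessing and ce is category_encoders:

enc = get_encoder('CountEncoder')    # -> ce.CountEncoder()
enc = get_encoder('OrdinalEncoder')  # unrecognized name -> None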
Example #5
    def test_count_min_group_name_string(self):
        """Test the min_group_name string on 'none' and 'na_categorical'."""
        enc = encoders.CountEncoder(min_group_size=6, min_group_name='dave')

        enc.fit(X)

        self.assertIn('dave', enc.mapping['none'])
        self.assertEqual(enc.mapping['none']['dave'], 8)
        self.assertIn('dave', enc.mapping['na_categorical'])
        self.assertEqual(enc.mapping['na_categorical']['dave'], 7)
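A sketch of the pooling behaviour this test exercises, on toy data of my own; the mapping layout is inferred from the assertions above:

import pandas as pd
import category_encoders as ce

df = pd.DataFrame({"cat": ["a"] * 5 + ["b"] * 2 + ["c"]})
enc = ce.CountEncoder(cols=["cat"], min_group_size=3, min_group_name="rare").fit(df)
# Categories rarer than min_group_size are pooled into one group that carries
# the combined count and the name given by min_group_name
print(enc.mapping["cat"])  # expect 'a' -> 5 and 'rare' -> 3 ('b' and 'c' pooled)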
Example #6
 def fit(self, input_df):
     self.encoder = ce.CountEncoder(cols=self.column,
                                    handle_unknown=self.handle_unknown,
                                    handle_missing=self.handle_missing,
                                    min_group_size=self.min_group_size)
     if self.whole_df is None:
         self.encoder.fit(input_df[self.column])
     else:
         self.encoder.fit(self.whole_df[self.column])
     return self.transform(input_df)
Example #7
def count_enc(df_norm, mode, count_enc=None):
    if mode == 'train':
        count_enc = ce.CountEncoder()
        count_encoded = count_enc.fit_transform(df_norm[cat_features])
        data = df_norm.join(count_encoded.add_suffix("_count"))
        data = data.drop(['tipodepropiedad', 'provincia', 'ciudad'], axis=1)
    if mode == 'test':
        # Reuse the encoder fitted on train; keep the same "_count" suffix and
        # bind the result to `data` so the shared return works for both modes
        new_cats = count_enc.transform(df_norm[cat_features]).add_suffix("_count")
        data = df_norm.drop(
            ['tipodepropiedad', 'ciudad', 'provincia'], axis=1).join(new_cats)
    return data, count_enc
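Intended usage, sketched (train_df, test_df, and the global cat_features are assumptions):

data_train, enc = count_enc(train_df, 'train')
data_test, _ = count_enc(test_df, 'test', count_enc=enc)  # reuse the fitted encoder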
Example #8
    def test_count_defaults(self):
        """Test the defaults are working as expected on 'none' and 'categorical' 
        which are the most extreme edge cases for the count encoder."""
        enc = encoders.CountEncoder(verbose=1)
        enc.fit(X)
        out = enc.transform(X_t)

        self.assertTrue(pd.Series([5, 3, 6]).isin(out['none'].unique()).all())
        self.assertTrue(out['none'].unique().shape == (3, ))
        self.assertTrue(out['none'].isnull().sum() == 0)
        self.assertTrue(pd.Series([6, 3]).isin(out['na_categorical']).all())
        self.assertTrue(out['na_categorical'].unique().shape == (4, ))
        self.assertTrue(enc.mapping is not None)
Example #9
def countEncode(data):
    cat_features = ['CHAS', 'RAD']

    # Count Encoding
    count_enc = ce.CountEncoder()

    # Transform the features, rename the columns with the _count suffix, and join to dataframe
    count_encoded = count_enc.fit_transform(data[cat_features])
    data = data.join(count_encoded.add_suffix("_count"))

    # Train a model
    train, valid, test = get_data_splits(data)
    train_model(train, valid)
Example #10
    def test_count_min_group_size_int(self):
        """Test the min_group_size int  on 'none' and 'na_categorical'."""
        enc = encoders.CountEncoder(min_group_size=7)

        enc.fit(X)
        out = enc.transform(X_t)
        self.assertTrue(pd.Series([6, 5, 3]).isin(out['none']).all())
        self.assertTrue(out['none'].unique().shape == (3, ))
        self.assertTrue(out['none'].isnull().sum() == 0)
        self.assertIn(np.nan, enc.mapping['none'])
        self.assertTrue(pd.Series([13, 7]).isin(out['na_categorical']).all())
        self.assertTrue(out['na_categorical'].unique().shape == (2, ))
        self.assertIn('B_C_nan', enc.mapping['na_categorical'])
        self.assertFalse(np.nan in enc.mapping['na_categorical'])
Example #11
def count_encodings_solution():
    cat_features = ['ip', 'app', 'device', 'os', 'channel']
    count_enc = ce.CountEncoder(cols=cat_features)

    train, valid, _ = get_data_splits()

    # Learn encoding from the training set
    count_enc.fit(train[cat_features])

    # Apply encoding to the train and validation sets
    train_encoded = train.join(count_enc.transform(train[cat_features]).add_suffix('_count'))
    valid_encoded = valid.join(count_enc.transform(valid[cat_features]).add_suffix('_count'))

    return train_encoded, valid_encoded
Example #12
    def test_count_handle_missing_string(self):
        """Test the handle_missing string on 'none' and 'na_categorical'."""
        enc = encoders.CountEncoder(handle_missing='return_nan')

        enc.fit(X)
        out = enc.transform(X_t)

        self.assertIn('none', enc._handle_missing)
        self.assertTrue(pd.Series([6, 5, 3]).isin(out['none']).all())
        self.assertTrue(out['none'].unique().shape == (4, ))
        self.assertTrue(out['none'].isnull().sum() == 3)
        self.assertTrue(pd.Series([6, 7, 3]).isin(out['na_categorical']).all())
        self.assertFalse(pd.Series([4]).isin(out['na_categorical']).all())
        self.assertTrue(out['na_categorical'].unique().shape == (4, ))
        self.assertTrue(out['na_categorical'].isnull().sum() == 3)
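A sketch of handle_missing='return_nan' versus the default counting behaviour (toy data of my own):

import numpy as np
import pandas as pd
import category_encoders as ce

df = pd.DataFrame({"cat": ["a", "a", np.nan]})
# Default: missing values are counted like any other category
print(ce.CountEncoder(cols=["cat"]).fit_transform(df)["cat"].tolist())  # [2, 2, 1]
# return_nan: missing values stay NaN after the transform
enc = ce.CountEncoder(cols=["cat"], handle_missing="return_nan").fit(df)
print(enc.transform(df)["cat"].tolist())  # expect [2.0, 2.0, nan]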
Example #13
    def test_count_normalize_bool(self):
        """Test the normalize bool on 'none' and 'na_categorical'."""
        enc = encoders.CountEncoder(min_group_size=6, normalize=True)

        enc.fit(X)
        out = enc.transform(X_t)

        self.assertIn('none', enc._normalize)
        self.assertTrue(out['none'].round(5).isin([0.3, 0.4]).all())
        self.assertEqual(out['none'].unique().shape[0], 2)
        self.assertEqual(out['none'].isnull().sum(), 0)
        self.assertTrue(
            pd.Series([0.3, 0.35]).isin(out['na_categorical']).all())
        self.assertEqual(out['na_categorical'].unique().shape[0], 2)
        self.assertTrue(enc.mapping is not None)
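A sketch of normalize=True on toy data of my own: counts become relative frequencies.

import pandas as pd
import category_encoders as ce

df = pd.DataFrame({"cat": ["a", "a", "a", "b"]})
enc = ce.CountEncoder(cols=["cat"], normalize=True).fit(df)
print(enc.transform(df)["cat"].tolist())  # [0.75, 0.75, 0.75, 0.25]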
Example #14
    def test_count_combine_min_nan_groups_bool(self):
        """Test the min_nan_groups_bool on 'none' and 'na_categorical'."""
        enc = encoders.CountEncoder(min_group_size=7,
                                    combine_min_nan_groups=False)

        enc.fit(X)
        out = enc.transform(X_t)

        self.assertTrue(pd.Series([6, 5, 3]).isin(out['none']).all())
        self.assertEqual(out['none'].unique().shape[0], 3)
        self.assertEqual(out['none'].isnull().sum(), 0)
        self.assertTrue(pd.Series([9, 7, 4]).isin(out['na_categorical']).all())
        self.assertEqual(out['na_categorical'].unique().shape[0], 3)
        self.assertTrue(enc.mapping is not None)
        self.assertIn(np.nan, enc.mapping['na_categorical'])
Example #15
    def test_count_handle_missing_dict(self):
        """Test the handle_missing dict on 'none' and 'na_categorical'. 
        We want to see differing behavour between 'none' and 'na_cat' cols."""
        enc = encoders.CountEncoder(
            handle_missing={'na_categorical': 'return_nan'})

        enc.fit(X)
        out = enc.transform(X_t)

        self.assertIn('none', enc._handle_missing)
        self.assertTrue(pd.Series([5, 3, 6]).isin(out['none']).all())
        self.assertTrue(out['none'].unique().shape == (3, ))
        self.assertTrue(out['none'].isnull().sum() == 0)
        self.assertTrue(pd.Series([6, 7, 3]).isin(out['na_categorical']).all())
        self.assertFalse(pd.Series([4]).isin(out['na_categorical']).all())
        self.assertTrue(out['na_categorical'].unique().shape == (4, ))
        self.assertTrue(out['na_categorical'].isnull().sum() == 3)
Example #16
    def test_count_min_group_name_dict(self):
        """Test the min_group_name dict on 'none' and 'na_categorical'."""
        enc = encoders.CountEncoder(min_group_size={
            'none': 6,
            'na_categorical': 6
        },
                                    min_group_name={
                                        'none': 'dave',
                                        'na_categorical': None
                                    })

        enc.fit(X)

        self.assertIn('none', enc._min_group_name)
        self.assertIn('dave', enc.mapping['none'])
        self.assertEqual(enc.mapping['none']['dave'], 8)
        self.assertIn('B_nan', enc.mapping['na_categorical'])
        self.assertEqual(enc.mapping['na_categorical']['B_nan'], 7)
Example #17
    def test_count_normalize_dict(self):
        """Test the normalize dict on 'none' and 'na_categorical'."""
        enc = encoders.CountEncoder(min_group_size=7,
                                    normalize={
                                        'none': True,
                                        'na_categorical': False
                                    })

        enc.fit(X)
        out = enc.transform(X_t)

        self.assertIn('none', enc._normalize)
        self.assertTrue(out['none'].round(5).isin([0.3, 0.15, 0.25]).all())
        self.assertEqual(out['none'].unique().shape[0], 3)
        self.assertEqual(out['none'].isnull().sum(), 0)
        self.assertTrue(pd.Series([13, 7]).isin(out['na_categorical']).all())
        self.assertEqual(out['na_categorical'].unique().shape[0], 2)
        self.assertTrue(enc.mapping is not None)
Example #18
    def test_count_handle_unknown_string(self):
        """Test the handle_unknown string  on 'none' and 'na_categorical'.
        The 'handle_missing' must be set to 'return_nan' in order to test
        'handle_unkown' correctly."""
        enc = encoders.CountEncoder(
            handle_missing='return_nan',
            handle_unknown='return_nan',
        )

        enc.fit(X)
        out = enc.transform(X_t)

        self.assertIn('none', enc._handle_unknown)
        self.assertTrue(pd.Series([6, 5, 3]).isin(out['none']).all())
        self.assertTrue(out['none'].unique().shape == (4, ))
        self.assertTrue(out['none'].isnull().sum() == 3)
        self.assertTrue(pd.Series([3, 6, 7]).isin(out['na_categorical']).all())
        self.assertTrue(out['na_categorical'].unique().shape == (4, ))
        self.assertTrue(out['na_categorical'].isnull().sum() == 3)
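A sketch of the two handle_unknown options used across these tests, on toy data of my own (handle_missing is set to 'return_nan' as the docstring above requires):

import pandas as pd
import category_encoders as ce

fit_df = pd.DataFrame({"cat": ["a", "a", "b"]})
new_df = pd.DataFrame({"cat": ["a", "z"]})  # 'z' is unseen at fit time
enc = ce.CountEncoder(cols=["cat"], handle_missing="return_nan",
                      handle_unknown="return_nan").fit(fit_df)
print(enc.transform(new_df)["cat"].tolist())  # expect [2.0, nan]
enc = ce.CountEncoder(cols=["cat"], handle_missing="return_nan",
                      handle_unknown=-1).fit(fit_df)
print(enc.transform(new_df)["cat"].tolist())  # expect [2, -1]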
Example #19
    def test_count_handle_unknown_dict(self):
        """Test the 'handle_unkown' dict with all non-default options."""
        enc = encoders.CountEncoder(
            handle_missing='return_nan',
            handle_unknown={
                'none': -1,
                'na_categorical': 'return_nan'
            },
        )

        enc.fit(X)
        out = enc.transform(X_t)

        self.assertIn('none', enc._handle_unknown)
        self.assertTrue(pd.Series([6, 5, 3, -1]).isin(out['none']).all())
        self.assertTrue(out['none'].unique().shape == (4, ))
        self.assertTrue(out['none'].isnull().sum() == 0)
        self.assertTrue(pd.Series([3, 6, 7]).isin(out['na_categorical']).all())
        self.assertTrue(out['na_categorical'].unique().shape == (4, ))
        self.assertTrue(out['na_categorical'].isnull().sum() == 3)
Example #20
    def test_count_combine_min_nan_groups_dict(self):
        """Test the combine_min_nan_groups dict  on 'none' and 'na_categorical'."""
        enc = encoders.CountEncoder(min_group_size={
            'none': 6,
            'na_categorical': 7
        },
                                    combine_min_nan_groups={
                                        'none': 'force',
                                        'na_categorical': False
                                    })

        enc.fit(X)
        out = enc.transform(X_t)

        self.assertIn('none', enc._combine_min_nan_groups)
        self.assertTrue(pd.Series([14, 6]).isin(out['none']).all())
        self.assertEqual(out['none'].unique().shape[0], 2)
        self.assertEqual(out['none'].isnull().sum(), 0)
        self.assertTrue(pd.Series([9, 7, 4]).isin(out['na_categorical']).all())
        self.assertEqual(out['na_categorical'].unique().shape[0], 3)
        self.assertTrue(enc.mapping is not None)
        self.assertIn(np.nan, enc.mapping['na_categorical'])
Example #21
    return data


# Train data processing
train = data_parser(train)
# Define target and predictors
target = np.log1p(train['meter_reading'])
features = train.drop(['meter_reading'], axis=1)

del train
gc.collect()
# Process categorical features
categorical_features = ['building_id', 'site_id', 'meter', 'primary_use']

encoder = category_encoders.CountEncoder(cols=categorical_features)
encoder.fit(features)
features = encoder.transform(features)

features_size = features.shape[0]
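# Dividing each raw count by the number of rows converts it into a relative
# frequency, mirroring what CountEncoder(normalize=True) would produce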
for feature in categorical_features:
    features[feature] = features[feature] / features_size
# Missing data imputation
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(features)
features = imputer.transform(features)
# Regressors
lightgbm = LGBMRegressor(objective='regression',
                         learning_rate=0.05,
                         num_leaves=1024,
                         feature_fraction=0.8,
Example #22
    valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
    print(f"Validation AUC score: {valid_score:.4f}")
    return bst


# Training a model on the baseline data
train, valid, _ = get_data_splits(baseline_data)
bst = train_model(train, valid)

print(
    "Count Encoding -------------------------------------------------------------------------------"
)

import category_encoders as ce
cat_features = ['category', 'currency', 'country']
count_enc = ce.CountEncoder()
count_encoded = count_enc.fit_transform(ks[cat_features])

data = baseline_data.join(count_encoded.add_suffix("_count"))

# Training a model on the count-encoded data
train, valid, test = get_data_splits(data)
bst = train_model(train, valid)

print(
    "Target Encoding ---------------------------------------------------------------------------------------------"
)
import category_encoders as ce
cat_features = ['category', 'currency', 'country']

# Create the encoder itself
Example #23
 def fit(self, input_df: pd.DataFrame, y=None):
     self.encoder = ce.CountEncoder(handle_unknown=-1, handle_missing="count")
     self.encoder.fit(input_df[self.cat_cols])
     return self.transform(input_df)
Example #24
def get_model(PARAMS):
    """return model for provided params

    :param PARAMS: dictionary with model params
    :type PARAMS: dict
    :return: model pipeline
    :rtype: sklearn pipeline
    """

    try:
        te_dict = {
            'CatBoostEncoder': ce.CatBoostEncoder(),
            'HashingEncoder': ce.HashingEncoder(),
            'HelmertEncoder': ce.HelmertEncoder(),
            'LeaveOneOutEncoder': ce.LeaveOneOutEncoder(),
            'OneHotEncoder': ce.OneHotEncoder(),
            'TargetEncoder': ce.TargetEncoder(),
            'WOEEncoder': ce.WOEEncoder(),
            'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder(),
            'BaseNEncoder': ce.BaseNEncoder(),
            'BinaryEncoder': ce.BinaryEncoder(),
            'CountEncoder': ce.CountEncoder(),
            'JamesSteinEncoder': ce.JamesSteinEncoder(),
            'MEstimateEncoder': ce.MEstimateEncoder(),
            'PolynomialEncoder': ce.PolynomialEncoder(),
            'SumEncoder': ce.SumEncoder()
        }

        pipe = make_pipeline(
            helpers.PrepareData(extraxt_year=True, unicode_text=True),
            ColumnTransformer([
                ('num', helpers.PassThroughOrReplace(), [
                    'flat_size', 'rooms', 'floor', 'number_of_floors',
                    'year_of_building', 'GC_latitude', 'GC_longitude'
                ]),
                ('te_producer', te_dict.get(PARAMS['te_producer']),
                 'producer_name'),
                ('te_road', te_dict.get(PARAMS['te_road']), 'GC_addr_road'),
                ('te_neighbourhood', te_dict.get(PARAMS['te_neighbourhood']),
                 'GC_addr_neighbourhood'),
                ('te_suburb', te_dict.get(PARAMS['te_suburb']),
                 'GC_addr_suburb'),
                ('te_postcode', te_dict.get(PARAMS['te_postcode']),
                 'GC_addr_postcode'),
                ('txt_name',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1,
                                              PARAMS['txt_name__ngram_range']),
                                 max_features=PARAMS['txt_name__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_name__binary'],
                                 use_idf=PARAMS['txt_name__use_idf']), 'name'),
                ('txt_dscr',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1,
                                              PARAMS['txt_dscr__ngram_range']),
                                 max_features=PARAMS['txt_dscr__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_dscr__binary'],
                                 use_idf=PARAMS['txt_dscr__use_idf']),
                 'description'),
            ]),
            TransformedTargetRegressor(
                regressor=lgb.LGBMRegressor(**PARAMS, random_state=seed),
                func=np.log1p,
                inverse_func=np.expm1))

        return pipe

    except BaseException as e:
        LOG.error(e)
        return None
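A hypothetical invocation; the keys below are illustrative and assume PARAMS also carries the LGBMRegressor settings consumed via **PARAMS:

PARAMS = {
    'te_producer': 'CountEncoder',
    'te_road': 'TargetEncoder',
    'te_neighbourhood': 'CatBoostEncoder',
    'te_suburb': 'CountEncoder',
    'te_postcode': 'CountEncoder',
    'txt_name__ngram_range': 2, 'txt_name__max_features': 500,
    'txt_name__binary': False, 'txt_name__use_idf': True,
    'txt_dscr__ngram_range': 2, 'txt_dscr__max_features': 1000,
    'txt_dscr__binary': False, 'txt_dscr__use_idf': True,
}
pipe = get_model(PARAMS)  # an sklearn pipeline, or None if construction fails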
Example #25
 def _count_encoding(self):
     count_enc = ce.CountEncoder()
     return count_enc.fit_transform(self.df[self.cat_feats].values)
Example #26
def lesson_2():
    print_("Lesson 2: Categorical Encodings", 0, 1)
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    data = ks[data_cols].join(encoded)

    # Defining  functions that will help us test our encodings
    def get_data_splits(dataframe, valid_fraction=0.1):
        valid_size = int(len(dataframe) * valid_fraction)

        train = dataframe[:-valid_size * 2]
        # valid size == test size, last two sections of the data
        valid = dataframe[-valid_size * 2:-valid_size]
        test = dataframe[-valid_size:]

        return train, valid, test

    def train_model(train, valid):
        feature_cols = train.columns.drop('outcome')

        dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

        param = {
            'num_leaves': 64,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7,
            'verbose': -1
        }
        bst = lgb.train(param,
                        dtrain,
                        num_boost_round=1000,
                        valid_sets=[dvalid],
                        early_stopping_rounds=10,
                        verbose_eval=False)

        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
        print(f"Validation AUC score: {valid_score:.4f}")

    # Train a model (on the baseline data)
    train, valid, test = get_data_splits(data)
    print_("Baseline (LightGBM with no categorical encoding)", 0)
    train_model(train, valid)
    print()

    # --------------
    # Count Encoding
    # --------------
    cat_features = ['category', 'currency', 'country']

    # Create the encoder
    count_enc = ce.CountEncoder()

    # Transform the features, rename the columns with the _count suffix, and join to dataframe
    # TODO: calculating the counts on the whole dataset? Should it be on the train only to avoid data leakage?
    # This is what was done in the Exercise 2
    count_encoded = count_enc.fit_transform(ks[cat_features])
    data = data.join(count_encoded.add_suffix("_count"))

    # Train a model
    train, valid, test = get_data_splits(data)
    print_("LightGBM with COUNT encoding", 0)
    train_model(train, valid)
    print()

    # ---------------
    # Target Encoding
    # ---------------
    # Create the encoder
    target_enc = ce.TargetEncoder(cols=cat_features)
    target_enc.fit(train[cat_features], train['outcome'])

    # Transform the features, rename the columns with _target suffix, and join to dataframe
    train_TE = train.join(
        target_enc.transform(train[cat_features]).add_suffix('_target'))
    valid_TE = valid.join(
        target_enc.transform(valid[cat_features]).add_suffix('_target'))

    # Train a model
    print_("LightGBM with TARGET encoding", 0)
    train_model(train_TE, valid_TE)
    print()

    # -----------------
    # CatBoost Encoding
    # -----------------
    # Create the encoder
    cb_enc = ce.CatBoostEncoder(cols=cat_features)
    cb_enc.fit(train[cat_features], train['outcome'])

    # Transform the features, rename the columns with _cb suffix, and join to dataframe
    train_CBE = train.join(
        cb_enc.transform(train[cat_features]).add_suffix('_cb'))
    valid_CBE = valid.join(
        cb_enc.transform(valid[cat_features]).add_suffix('_cb'))

    # Train a model
    print_("LightGBM with CatBoost encoding", 0)
    train_model(train_CBE, valid_CBE)
    print()
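Example #27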
# 1) Categorical encodings and leakage
# =============================================================================

# Use only the training data to fit the encoding; otherwise you introduce data leakage

# =============================================================================
# 2) Count encodings
# =============================================================================

import category_encoders as ce

cat_features = ['ip', 'app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)

# Create the count encoder
count_enc = ce.CountEncoder(cols=cat_features)

# Learn encoding from the training set
count_enc.fit(train[cat_features])

# Apply encoding to the train and validation sets
train_encoded = train.join(
    count_enc.transform(train[cat_features]).add_suffix('_count'))
valid_encoded = valid.join(
    count_enc.transform(valid[cat_features]).add_suffix('_count'))

model2 = train_model(train_encoded, valid_encoded)

# =============================================================================
# 4) Target encoding
# =============================================================================
Example #28
def preprocess_table(input_file_path, output_file_path):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    encoders = {}
    logger = logging.getLogger(__name__)

    df_full_train, output_filepath_df_train, output_filepath_misc_train = read_table(
        input_file_path, logger, output_file_path, suffix="Train")

    df_full_test, output_filepath_df_test, output_filepath_misc_test = read_table(
        input_file_path, logger, output_file_path, suffix="Test")

    df_full_val, output_filepath_df_val, output_filepath_misc_val = read_table(
        input_file_path, logger, output_file_path, suffix="Validation")

    # Cast categorical columns to string
    for cat in CAT_COLUMNS:
        logger.info(f"to category: {cat}")
        df_full_train[cat] = df_full_train[cat].astype(str)
        df_full_test[cat] = df_full_test[cat].astype(str)
        df_full_val[cat] = df_full_val[cat].astype(str)

    CALC_COUNT_COLUMNS = []
    df_to_fit_le = pd.concat([df_full_train, df_full_val],
                             axis=0)[df_full_test.columns]

    # Frequency-encode categoricals (named label_encoder here, but it is a normalized CountEncoder)
    label_encoder = ce.CountEncoder(return_df=True,
                                    cols=CAT_COLUMNS,
                                    verbose=1,
                                    normalize=True)
    count_encoder = ce.CountEncoder(return_df=True,
                                    cols=COUNT_COLUMNS + CALC_COUNT_COLUMNS,
                                    verbose=1,
                                    normalize=True)

    # Encode train, test, and validation with the CAT_COLUMNS encoder
    label_encoder.fit(df_to_fit_le)
    df_full_train[df_full_test.columns] = label_encoder.transform(
        df_full_train[df_full_test.columns])
    df_full_test = label_encoder.transform(df_full_test)
    df_full_val[df_full_test.columns] = label_encoder.transform(
        df_full_val[df_full_test.columns])
    # Encode train, test, and validation with the COUNT_COLUMNS encoder
    count_encoder.fit(df_to_fit_le)
    df_full_train[df_full_test.columns] = count_encoder.transform(
        df_full_train[df_full_test.columns])
    df_full_test = count_encoder.transform(df_full_test)
    df_full_val[df_full_test.columns] = count_encoder.transform(
        df_full_val[df_full_test.columns])
    # Encode aggregate statistics using BallTree:
    X = pd.concat(
        [df_full_train[['lat', 'long']], df_full_val[['lat', 'long']]],
        axis=0).values
    # Build a tree:
    tree = BallTree(X)
    # Calculate aggregate statistics using tree:
    X_to_get_data = pd.concat([df_full_train, df_full_val], axis=0)
    #
    # df_full_train = calculate_agg_statistics(tree,X_to_get_data,df_full_train)
    # df_full_val = calculate_agg_statistics(tree, X_to_get_data, df_full_val)
    # df_full_test = calculate_agg_statistics(tree, X_to_get_data, df_full_test)

    #
    print(df_full_train.shape)
    print(df_full_test.shape)
    print(df_full_val.shape)
    # Encode test:

    misc = {}
    misc["encoder_dict"] = encoders
    # profile = feature_df.profile_report(title=f'Pandas Profiling Report for {suffix}')
    # profile.to_file(output_file=os.path.join(project_dir, f"output_{suffix}.html"))

    df_full_train.to_pickle(output_filepath_df_train)
    df_full_test.to_pickle(output_filepath_df_test)
    df_full_val.to_pickle(output_filepath_df_val)

    with open(output_filepath_misc_train, "wb") as f:
        pickle.dump(misc, f)

    return 0
Example #29
    lst_combination = (list(combinations(auto_columns_2, 2)) +
                       list(combinations(auto_columns_3, 2)) +
                       list(combinations(auto_columns_4, 2)))
    for l, r in lst_combination:
        for func in 'add subtract divide multiply'.split():
            df[f'auto_{func}_{l}_{r}'] = getattr(np, func)(df[l], df[r])

    return df


def transform(df):
    df = process_datetime_cols(df)
    df = process_categorical_cols(df)
    df = process_others(df)
    return df.drop(ignore_columns, axis=1)


train = transform(train)
test = transform(test)

# Create the encoder
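# Note: counts here are learned on train+test combined; unlike the
# fit-on-train-only examples above, this leaks test-set statistics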
t = pd.concat([train, test]).reset_index(drop=True)
count_enc = ce.CountEncoder().fit_transform(t[cat_features])
tt = t.join(count_enc.add_suffix("_count"))

f2_train = tt.loc[tt.index < train.shape[0]]
f2_test = tt.loc[tt.index >= train.shape[0]]

columns = sorted(set(f2_train.columns).intersection(f2_test.columns))
print(len(columns))
Example #30
def ex_2():
    print_("Exercise 2: Categorical Encodings", 0, 1)
    clicks = load_data_for_ex_2()

    def get_data_splits(dataframe, valid_fraction=0.1):
        """Splits a dataframe into train, validation, and test sets.

        First, orders by the column 'click_time'. Set the size of the
        validation and test sets with the valid_fraction keyword argument.
        """

        dataframe = dataframe.sort_values('click_time')
        valid_rows = int(len(dataframe) * valid_fraction)
        train = dataframe[:-valid_rows * 2]
        # valid size == test size, last two sections of the data
        valid = dataframe[-valid_rows * 2:-valid_rows]
        test = dataframe[-valid_rows:]

        return train, valid, test

    def train_model(train, valid, test=None, feature_cols=None):
        if feature_cols is None:
            feature_cols = train.columns.drop(
                ['click_time', 'attributed_time', 'is_attributed'])
        dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])

        param = {
            'num_leaves': 256,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7,
            'verbose': -1
        }
        num_round = 1000
        bst = lgb.train(param,
                        dtrain,
                        num_round,
                        valid_sets=[dvalid],
                        early_stopping_rounds=20,
                        verbose_eval=False)

        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['is_attributed'], valid_pred)
        print(f"Validation AUC score: {valid_score}")

        if test is not None:
            test_pred = bst.predict(test[feature_cols])
            test_score = metrics.roc_auc_score(test['is_attributed'],
                                               test_pred)
            return bst, valid_score, test_score
        else:
            return bst, valid_score

    print_("Baseline model", 0)
    train, valid, test = get_data_splits(clicks)
    _ = train_model(train, valid)
    print()

    # ------------------------------------
    # 1. Categorical encodings and leakage
    # ------------------------------------

    # ------------------
    # 2. Count encodings
    # ------------------
    cat_features = ['ip', 'app', 'device', 'os', 'channel']
    train, valid, test = get_data_splits(clicks)

    # Create the count encoder
    count_enc = ce.CountEncoder(cols=cat_features)

    # Learn encoding from the training set
    # TODO: Why not train['is_attributed']?
    count_enc.fit(train[cat_features])
    # count_enc.fit(train[cat_features], train['is_attributed'])

    # Apply encoding to the train and validation sets as new columns
    # Make sure to add `_count` as a suffix to the new columns
    train_encoded = train.join(
        count_enc.transform(train[cat_features]).add_suffix('_count'))
    valid_encoded = valid.join(
        count_enc.transform(valid[cat_features]).add_suffix('_count'))

    # Train the model on the encoded datasets
    print_("LightGBM with COUNT encoding", 0)
    _ = train_model(train_encoded, valid_encoded)
    print()

    # ------------------
    # 4. Target encoding
    # ------------------
    # Create the target encoder.
    target_enc = ce.TargetEncoder(cols=cat_features)

    # Learn encoding from the training set. Use the 'is_attributed' column as the target.
    target_enc.fit(train[cat_features], train['is_attributed'])

    # Apply encoding to the train and validation sets as new columns
    # Make sure to add `_target` as a suffix to the new columns
    train_encoded = train.join(
        target_enc.transform(train[cat_features]).add_suffix('_target'))
    valid_encoded = valid.join(
        target_enc.transform(valid[cat_features]).add_suffix('_target'))

    # Train a model
    print_("LightGBM with TARGET encoding", 0)
    _ = train_model(train_encoded, valid_encoded)
    print()

    # --------------------
    # 6. CatBoost Encoding
    # --------------------
    # Remove IP from the encoded features
    cat_features = ['app', 'device', 'os', 'channel']

    # Create the CatBoost encoder
    cb_enc = ce.CatBoostEncoder(cols=cat_features, random_state=7)

    # Learn encoding from the training set
    cb_enc.fit(train[cat_features], train['is_attributed'])

    # Apply encoding to the train and validation sets as new columns
    # Make sure to add `_cb` as a suffix to the new columns
    train_encoded = train.join(
        cb_enc.transform(train[cat_features]).add_suffix('_cb'))
    valid_encoded = valid.join(
        cb_enc.transform(valid[cat_features]).add_suffix('_cb'))

    # Train a model
    print_("LightGBM with CatBoost encoding", 0)
    _ = train_model(train_encoded, valid_encoded)
    print()