def count_encoder(X_train, Y_train, X_val, Y_val, target_col: str, cat_features=None, features=None):
    """Count encoding: feature engineering that replaces categorical columns
    with their frequency counts (essentially joining a groupby().size()
    aggregate column). Note that categories sharing the same count may end up
    behaving similarly in the model.
    https://www.kaggle.com/matleonard/categorical-encodings
    """
    X_train = pd.DataFrame(X_train, columns=features)
    Y_train = pd.DataFrame(Y_train, columns=[target_col])
    X_val = pd.DataFrame(X_val, columns=features)
    Y_val = pd.DataFrame(Y_val, columns=[target_col])

    train_df = X_train.join(Y_train)
    valid_df = X_val.join(Y_val)

    count_enc = ce.CountEncoder(cols=cat_features)

    # Fit on the training set only (fitting on validation/test data would leak)
    count_enc.fit(train_df[cat_features])
    train_encoded = train_df.join(
        count_enc.transform(train_df[cat_features]).add_suffix("_count")
    )
    valid_encoded = valid_df.join(
        count_enc.transform(valid_df[cat_features]).add_suffix("_count")
    )

    features = train_encoded.drop(target_col, axis=1).columns.to_list()
    return train_encoded.drop(target_col, axis=1), valid_encoded.drop(target_col, axis=1), features
def count_encoder(train_df, valid_df, cat_features=None):
    """Count encoding: feature engineering that replaces categorical columns
    with their frequency counts (essentially joining a groupby().size()
    aggregate column). Note that categories sharing the same count may end up
    behaving similarly in the model.
    https://www.kaggle.com/matleonard/categorical-encodings
    """
    # conda install -c conda-forge category_encoders
    import category_encoders as ce

    if cat_features is None:
        cat_features = train_df.select_dtypes(
            include=["object", "category", "bool"]
        ).columns.to_list()

    count_enc = ce.CountEncoder(cols=cat_features)

    # Fit on the training set only (fitting on validation/test data would leak)
    count_enc.fit(train_df[cat_features])
    train_encoded = train_df.join(
        count_enc.transform(train_df[cat_features]).add_suffix("_count")
    )
    valid_encoded = valid_df.join(
        count_enc.transform(valid_df[cat_features]).add_suffix("_count")
    )
    return train_encoded, valid_encoded
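# Usage sketch for count_encoder above -- a minimal example on assumed toy
# data (the column names "city" and "target" are illustrative, not from the
# original code). It shows the leak-free pattern: counts learned on train
# are reused, not refit, on validation.
import pandas as pd

def _demo_count_encoder():
    train_df = pd.DataFrame({
        "city": ["tokyo", "tokyo", "osaka", "kyoto"],
        "target": [1, 0, 1, 0],
    })
    valid_df = pd.DataFrame({
        "city": ["tokyo", "nagoya"],  # "nagoya" is unseen at fit time
        "target": [1, 0],
    })
    train_enc, valid_enc = count_encoder(train_df, valid_df, cat_features=["city"])
    # train_enc gains "city_count": tokyo -> 2, osaka -> 1, kyoto -> 1.
    # How unseen "nagoya" is encoded depends on CountEncoder's handle_unknown
    # default in the installed category_encoders version.
    print(train_enc)
    print(valid_enc)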
def CountEncoding(df, _col):
    _df = df[[_col]]

    import pandas as pd
    import category_encoders as ce

    # Use count encoding to encode the selected categorical feature
    enc = ce.CountEncoder(cols=[_col]).fit(_df)

    # Transform the dataset
    numeric_dataset = enc.transform(_df)

    return numeric_dataset
def get_encoder(encoder_name):
    """Returns an Encoder object given the name of the encoder."""
    if encoder_name == 'LabelEncoder':
        return LabelEncoder()
    elif encoder_name == 'CountEncoder':
        return ce.CountEncoder()
    else:
        return None
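# Hedged usage sketch for get_encoder (assumes LabelEncoder and ce are
# imported as in the surrounding snippets): the factory returns a fresh,
# unfitted encoder for a known name and None otherwise, so callers fit it
# themselves.
enc = get_encoder('CountEncoder')       # ce.CountEncoder instance, not yet fit
unknown = get_encoder('SomethingElse')  # None: name not registered above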
def test_count_min_group_name_string(self):
    """Test the min_group_name string on 'none' and 'na_categorical'."""
    enc = encoders.CountEncoder(min_group_size=6, min_group_name='dave')
    enc.fit(X)
    self.assertIn('dave', enc.mapping['none'])
    self.assertEqual(enc.mapping['none']['dave'], 8)
    self.assertIn('dave', enc.mapping['na_categorical'])
    self.assertEqual(enc.mapping['na_categorical']['dave'], 7)
def fit(self, input_df):
    self.encoder = ce.CountEncoder(
        cols=self.column,
        handle_unknown=self.handle_unknown,
        handle_missing=self.handle_missing,
        min_group_size=self.min_group_size,
    )
    # Fit on the full dataset when one is provided, otherwise on the input
    if self.whole_df is None:
        self.encoder.fit(input_df[self.column])
    else:
        self.encoder.fit(self.whole_df[self.column])
    return self.transform(input_df)
def count_enc(df_norm, mode, count_enc=None):
    if mode == 'train':
        count_enc = ce.CountEncoder()
        count_encoded = count_enc.fit_transform(df_norm[cat_features])
        data = df_norm.join(count_encoded.add_suffix("_count"))
        data = data.drop(['tipodepropiedad', 'provincia', 'ciudad'], axis=1)
    if mode == 'test':
        # Reuse the encoder fitted on train; fitting here would leak.
        # (The original assigned `baseline_data` but returned `data`, which
        # raised a NameError in test mode; fixed to build `data` directly.)
        new_cats = count_enc.transform(df_norm[cat_features])
        data = df_norm.drop(
            ['tipodepropiedad', 'ciudad', 'provincia'], axis=1).join(new_cats)
    return data, count_enc
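# Usage sketch for count_enc above (hedged; the toy frame and the `precio`
# column are illustrative assumptions, and pd/ce are assumed imported as in
# the surrounding snippets). The pattern: fit on the training frame, keep
# the returned encoder, and pass it back in for the test frame so test rows
# are mapped with training counts only.
def _demo_count_enc():
    global cat_features
    cat_features = ['tipodepropiedad', 'provincia', 'ciudad']
    df_train = pd.DataFrame({
        'tipodepropiedad': ['casa', 'casa', 'depto'],
        'provincia': ['df', 'df', 'jalisco'],
        'ciudad': ['cdmx', 'cdmx', 'gdl'],
        'precio': [100, 120, 90],
    })
    train_data, fitted_enc = count_enc(df_train, mode='train')
    test_data, _ = count_enc(df_train.copy(), mode='test', count_enc=fitted_enc)
    print(train_data.columns.to_list())  # raw categoricals dropped, *_count kept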
def test_count_defaults(self):
    """Test that the defaults work as expected on 'none' and 'na_categorical',
    which are the most extreme edge cases for the count encoder."""
    enc = encoders.CountEncoder(verbose=1)
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertTrue(pd.Series([5, 3, 6]).isin(out['none'].unique()).all())
    self.assertTrue(out['none'].unique().shape == (3, ))
    self.assertTrue(out['none'].isnull().sum() == 0)
    self.assertTrue(pd.Series([6, 3]).isin(out['na_categorical']).all())
    self.assertTrue(out['na_categorical'].unique().shape == (4, ))
    self.assertTrue(enc.mapping is not None)
def countEncode(data):
    cat_features = ['CHAS', 'RAD']

    # Count encoding
    count_enc = ce.CountEncoder()

    # Transform the features, rename the columns with the _count suffix,
    # and join to the dataframe. (The original called fit_transform on `ks`,
    # a frame from a different snippet; fixed to use the `data` parameter.)
    count_encoded = count_enc.fit_transform(data[cat_features])
    data = data.join(count_encoded.add_suffix("_count"))

    # Train a model
    train, valid, test = get_data_splits(data)
    train_model(train, valid)
def test_count_min_group_size_int(self):
    """Test the min_group_size int on 'none' and 'na_categorical'."""
    enc = encoders.CountEncoder(min_group_size=7)
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertTrue(pd.Series([6, 5, 3]).isin(out['none']).all())
    self.assertTrue(out['none'].unique().shape == (3, ))
    self.assertTrue(out['none'].isnull().sum() == 0)
    self.assertIn(np.nan, enc.mapping['none'])
    self.assertTrue(pd.Series([13, 7]).isin(out['na_categorical']).all())
    self.assertTrue(out['na_categorical'].unique().shape == (2, ))
    self.assertIn('B_C_nan', enc.mapping['na_categorical'])
    self.assertFalse(np.nan in enc.mapping['na_categorical'])
def count_encodings_solution():
    cat_features = ['ip', 'app', 'device', 'os', 'channel']
    count_enc = ce.CountEncoder(cols=cat_features)

    train, valid, _ = get_data_splits()

    # Learn the encoding from the training set
    count_enc.fit(train[cat_features])

    # Apply the encoding to the train and validation sets
    train_encoded = train.join(
        count_enc.transform(train[cat_features]).add_suffix('_count'))
    valid_encoded = valid.join(
        count_enc.transform(valid[cat_features]).add_suffix('_count'))

    return train_encoded, valid_encoded
def test_count_handle_missing_string(self):
    """Test the handle_missing string on 'none' and 'na_categorical'."""
    enc = encoders.CountEncoder(handle_missing='return_nan')
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('none', enc._handle_missing)
    self.assertTrue(pd.Series([6, 5, 3]).isin(out['none']).all())
    self.assertTrue(out['none'].unique().shape == (4, ))
    self.assertTrue(out['none'].isnull().sum() == 3)
    self.assertTrue(pd.Series([6, 7, 3]).isin(out['na_categorical']).all())
    self.assertFalse(pd.Series([4]).isin(out['na_categorical']).all())
    self.assertTrue(out['na_categorical'].unique().shape == (4, ))
    self.assertTrue(out['na_categorical'].isnull().sum() == 3)
def test_count_normalize_bool(self):
    """Test the normalize bool on 'none' and 'na_categorical'."""
    enc = encoders.CountEncoder(min_group_size=6, normalize=True)
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('none', enc._normalize)
    self.assertTrue(out['none'].round(5).isin([0.3, 0.4]).all())
    self.assertEqual(out['none'].unique().shape[0], 2)
    self.assertEqual(out['none'].isnull().sum(), 0)
    self.assertTrue(
        pd.Series([0.3, 0.35]).isin(out['na_categorical']).all())
    self.assertEqual(out['na_categorical'].unique().shape[0], 2)
    self.assertTrue(enc.mapping is not None)
def test_count_combine_min_nan_groups_bool(self):
    """Test the combine_min_nan_groups bool on 'none' and 'na_categorical'."""
    enc = encoders.CountEncoder(min_group_size=7,
                                combine_min_nan_groups=False)
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertTrue(pd.Series([6, 5, 3]).isin(out['none']).all())
    self.assertEqual(out['none'].unique().shape[0], 3)
    self.assertEqual(out['none'].isnull().sum(), 0)
    self.assertTrue(pd.Series([9, 7, 4]).isin(out['na_categorical']).all())
    self.assertEqual(out['na_categorical'].unique().shape[0], 3)
    self.assertTrue(enc.mapping is not None)
    self.assertIn(np.nan, enc.mapping['na_categorical'])
def test_count_handle_missing_dict(self):
    """Test the handle_missing dict on 'none' and 'na_categorical'.

    We want to see differing behaviour between the 'none' and
    'na_categorical' columns."""
    enc = encoders.CountEncoder(
        handle_missing={'na_categorical': 'return_nan'})
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('none', enc._handle_missing)
    self.assertTrue(pd.Series([5, 3, 6]).isin(out['none']).all())
    self.assertTrue(out['none'].unique().shape == (3, ))
    self.assertTrue(out['none'].isnull().sum() == 0)
    self.assertTrue(pd.Series([6, 7, 3]).isin(out['na_categorical']).all())
    self.assertFalse(pd.Series([4]).isin(out['na_categorical']).all())
    self.assertTrue(out['na_categorical'].unique().shape == (4, ))
    self.assertTrue(out['na_categorical'].isnull().sum() == 3)
def test_count_min_group_name_dict(self):
    """Test the min_group_name dict on 'none' and 'na_categorical'."""
    enc = encoders.CountEncoder(
        min_group_size={'none': 6, 'na_categorical': 6},
        min_group_name={'none': 'dave', 'na_categorical': None},
    )
    enc.fit(X)
    self.assertIn('none', enc._min_group_name)
    self.assertIn('dave', enc.mapping['none'])
    self.assertEqual(enc.mapping['none']['dave'], 8)
    self.assertIn('B_nan', enc.mapping['na_categorical'])
    self.assertEqual(enc.mapping['na_categorical']['B_nan'], 7)
def test_count_normalize_dict(self):
    """Test the normalize dict on 'none' and 'na_categorical'."""
    enc = encoders.CountEncoder(
        min_group_size=7,
        normalize={'none': True, 'na_categorical': False},
    )
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('none', enc._normalize)
    self.assertTrue(out['none'].round(5).isin([0.3, 0.15, 0.25]).all())
    self.assertEqual(out['none'].unique().shape[0], 3)
    self.assertEqual(out['none'].isnull().sum(), 0)
    self.assertTrue(pd.Series([13, 7]).isin(out['na_categorical']).all())
    self.assertEqual(out['na_categorical'].unique().shape[0], 2)
    self.assertTrue(enc.mapping is not None)
def test_count_handle_unknown_string(self):
    """Test the handle_unknown string on 'none' and 'na_categorical'.

    'handle_missing' must be set to 'return_nan' in order to test
    'handle_unknown' correctly."""
    enc = encoders.CountEncoder(
        handle_missing='return_nan',
        handle_unknown='return_nan',
    )
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('none', enc._handle_unknown)
    self.assertTrue(pd.Series([6, 5, 3]).isin(out['none']).all())
    self.assertTrue(out['none'].unique().shape == (4, ))
    self.assertTrue(out['none'].isnull().sum() == 3)
    self.assertTrue(pd.Series([3, 6, 7]).isin(out['na_categorical']).all())
    self.assertTrue(out['na_categorical'].unique().shape == (4, ))
    self.assertTrue(out['na_categorical'].isnull().sum() == 3)
def test_count_handle_unknown_dict(self):
    """Test the 'handle_unknown' dict with all non-default options."""
    enc = encoders.CountEncoder(
        handle_missing='return_nan',
        handle_unknown={'none': -1, 'na_categorical': 'return_nan'},
    )
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('none', enc._handle_unknown)
    self.assertTrue(pd.Series([6, 5, 3, -1]).isin(out['none']).all())
    self.assertTrue(out['none'].unique().shape == (4, ))
    self.assertTrue(out['none'].isnull().sum() == 0)
    self.assertTrue(pd.Series([3, 6, 7]).isin(out['na_categorical']).all())
    self.assertTrue(out['na_categorical'].unique().shape == (4, ))
    self.assertTrue(out['na_categorical'].isnull().sum() == 3)
def test_count_combine_min_nan_groups_dict(self):
    """Test the combine_min_nan_groups dict on 'none' and 'na_categorical'."""
    enc = encoders.CountEncoder(
        min_group_size={'none': 6, 'na_categorical': 7},
        combine_min_nan_groups={'none': 'force', 'na_categorical': False},
    )
    enc.fit(X)
    out = enc.transform(X_t)
    self.assertIn('none', enc._combine_min_nan_groups)
    self.assertTrue(pd.Series([14, 6]).isin(out['none']).all())
    self.assertEqual(out['none'].unique().shape[0], 2)
    self.assertEqual(out['none'].isnull().sum(), 0)
    self.assertTrue(pd.Series([9, 7, 4]).isin(out['na_categorical']).all())
    self.assertEqual(out['na_categorical'].unique().shape[0], 3)
    self.assertTrue(enc.mapping is not None)
    self.assertIn(np.nan, enc.mapping['na_categorical'])
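# A compact, hedged sketch of the CountEncoder options the tests above
# exercise, on an assumed toy column. Exact pooled names/values for the NaN
# edge cases vary with combine_min_nan_groups and the installed
# category_encoders version, so treat the comments as expectations rather
# than guarantees.
import numpy as np
import pandas as pd
import category_encoders as ce

def _demo_count_encoder_params():
    X_fit = pd.DataFrame({'col': ['a', 'a', 'a', 'b', 'b', 'c', np.nan]})
    # Raw counts: a -> 3, b -> 2, c -> 1 (NaN handled per handle_missing).
    print(ce.CountEncoder(cols=['col']).fit_transform(X_fit))
    # normalize=True returns relative frequencies instead of raw counts.
    print(ce.CountEncoder(cols=['col'], normalize=True).fit_transform(X_fit))
    # Categories rarer than min_group_size are pooled into one named group.
    print(ce.CountEncoder(cols=['col'], min_group_size=2,
                          min_group_name='rare').fit_transform(X_fit))
    # Unseen categories at transform time can be mapped to NaN explicitly.
    enc = ce.CountEncoder(cols=['col'], handle_unknown='return_nan').fit(X_fit)
    print(enc.transform(pd.DataFrame({'col': ['zzz']})))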
    return data


# Train data processing
train = data_parser(train)

# Define target and predictors
target = np.log1p(train['meter_reading'])
features = train.drop(['meter_reading'], axis=1)
del train
gc.collect()

# Process categorical features
categorical_features = ['building_id', 'site_id', 'meter', 'primary_use']
encoder = category_encoders.CountEncoder(cols=categorical_features)
encoder.fit(features)
features = encoder.transform(features)

# Normalize the counts to frequencies by dividing by the number of rows
features_size = features.shape[0]
for feature in categorical_features:
    features[feature] = features[feature] / features_size

# Missing data imputation
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(features)
features = imputer.transform(features)

# Regressors
lightgbm = LGBMRegressor(objective='regression',
                         learning_rate=0.05,
                         num_leaves=1024,
                         feature_fraction=0.8,
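# Hedged note on the normalization loop above: CountEncoder can emit
# count / n_rows directly via normalize=True, which makes the manual
# division unnecessary. A minimal sketch with an assumed toy frame:
import pandas as pd
import category_encoders

_df = pd.DataFrame({'building_id': [1, 1, 2], 'site_id': [0, 0, 0]})
_enc = category_encoders.CountEncoder(cols=['building_id', 'site_id'],
                                      normalize=True)
print(_enc.fit_transform(_df))  # frequencies in (0, 1] instead of raw counts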
    valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
    print(f"Validation AUC score: {valid_score:.4f}")
    return bst


# Training a model on the baseline data
train, valid, _ = get_data_splits(baseline_data)
bst = train_model(train, valid)

print(
    "Count Encoding-------------------------------------------------------------------------------"
)
import category_encoders as ce

cat_features = ['category', 'currency', 'country']
count_enc = ce.CountEncoder()
count_encoded = count_enc.fit_transform(ks[cat_features])
data = baseline_data.join(count_encoded.add_suffix("_count"))

# Training a model on the count-encoded data
train, valid, test = get_data_splits(data)
bst = train_model(train, valid)

print(
    "Target Encoding---------------------------------------------------------------------------------------------"
)
import category_encoders as ce

cat_features = ['category', 'currency', 'country']
# Create the encoder itself
def fit(self, input_df: pd.DataFrame, y=None):
    self.encoder = ce.CountEncoder(handle_unknown=-1, handle_missing="count")
    self.encoder.fit(input_df[self.cat_cols])
    return self.transform(input_df)
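# Hedged sketch of the fit() options used above: handle_unknown=-1 maps
# categories unseen during fit to -1 at transform time, and
# handle_missing="count" counts NaN as its own category. The column name
# "cat" is an illustrative assumption.
import numpy as np
import pandas as pd
import category_encoders as ce

_enc = ce.CountEncoder(handle_unknown=-1, handle_missing="count")
_enc.fit(pd.DataFrame({"cat": ["a", "a", np.nan]}))
print(_enc.transform(pd.DataFrame({"cat": ["a", "b", np.nan]})))
# expected roughly: a -> 2, unseen "b" -> -1, NaN -> 1 (library-version dependent)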
def get_model(PARAMS):
    """Return a model pipeline for the provided params.

    :param PARAMS: dictionary with model params
    :type PARAMS: dict
    :return: model pipeline
    :rtype: sklearn pipeline
    """
    try:
        te_dict = {
            'CatBoostEncoder': ce.CatBoostEncoder(),
            'HashingEncoder': ce.HashingEncoder(),
            'HelmertEncoder': ce.HelmertEncoder(),
            'LeaveOneOutEncoder': ce.LeaveOneOutEncoder(),
            'OneHotEncoder': ce.OneHotEncoder(),
            'TargetEncoder': ce.TargetEncoder(),
            'WOEEncoder': ce.WOEEncoder(),
            'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder(),
            'BaseNEncoder': ce.BaseNEncoder(),
            'BinaryEncoder': ce.BinaryEncoder(),
            'CountEncoder': ce.CountEncoder(),
            'JamesSteinEncoder': ce.JamesSteinEncoder(),
            'MEstimateEncoder': ce.MEstimateEncoder(),
            'PolynomialEncoder': ce.PolynomialEncoder(),
            'SumEncoder': ce.SumEncoder(),
        }

        pipe = make_pipeline(
            helpers.PrepareData(extraxt_year=True, unicode_text=True),
            ColumnTransformer([
                ('num', helpers.PassThroughOrReplace(), [
                    'flat_size', 'rooms', 'floor', 'number_of_floors',
                    'year_of_building', 'GC_latitude', 'GC_longitude'
                ]),
                ('te_producer', te_dict.get(PARAMS['te_producer']),
                 'producer_name'),
                ('te_road', te_dict.get(PARAMS['te_road']),
                 'GC_addr_road'),
                ('te_neighbourhood', te_dict.get(PARAMS['te_neighbourhood']),
                 'GC_addr_neighbourhood'),
                ('te_suburb', te_dict.get(PARAMS['te_suburb']),
                 'GC_addr_suburb'),
                ('te_postcode', te_dict.get(PARAMS['te_postcode']),
                 'GC_addr_postcode'),
                ('txt_name',
                 TfidfVectorizer(
                     lowercase=True,
                     ngram_range=(1, PARAMS['txt_name__ngram_range']),
                     max_features=PARAMS['txt_name__max_features'],
                     dtype=np.float32,
                     binary=PARAMS['txt_name__binary'],
                     use_idf=PARAMS['txt_name__use_idf']),
                 'name'),
                ('txt_dscr',
                 TfidfVectorizer(
                     lowercase=True,
                     ngram_range=(1, PARAMS['txt_dscr__ngram_range']),
                     max_features=PARAMS['txt_dscr__max_features'],
                     dtype=np.float32,
                     binary=PARAMS['txt_dscr__binary'],
                     use_idf=PARAMS['txt_dscr__use_idf']),
                 'description'),
            ]),
            TransformedTargetRegressor(
                regressor=lgb.LGBMRegressor(**PARAMS, random_state=seed),
                func=np.log1p,
                inverse_func=np.expm1))

        return pipe

    except BaseException as e:
        LOG.error(e)
        return None
def _count_encoding(self):
    count_enc = ce.CountEncoder()
    return count_enc.fit_transform(self.df[self.cat_feats].values)
def lesson_2():
    print_("Lesson 2: Categorical Encodings", 0, 1)

    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    data = ks[data_cols].join(encoded)

    # Defining functions that will help us test our encodings
    def get_data_splits(dataframe, valid_fraction=0.1):
        valid_size = int(len(dataframe) * valid_fraction)
        train = dataframe[:-valid_size * 2]
        # valid size == test size, last two sections of the data
        valid = dataframe[-valid_size * 2:-valid_size]
        test = dataframe[-valid_size:]
        return train, valid, test

    def train_model(train, valid):
        feature_cols = train.columns.drop('outcome')
        dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])
        param = {
            'num_leaves': 64,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7,
            'verbose': -1
        }
        bst = lgb.train(param,
                        dtrain,
                        num_boost_round=1000,
                        valid_sets=[dvalid],
                        early_stopping_rounds=10,
                        verbose_eval=False)
        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
        print(f"Validation AUC score: {valid_score:.4f}")

    # Train a model (on the baseline data)
    train, valid, test = get_data_splits(data)
    print_("Baseline (LightGBM with no categorical encoding)", 0)
    train_model(train, valid)
    print()

    # --------------
    # Count Encoding
    # --------------
    cat_features = ['category', 'currency', 'country']

    # Create the encoder
    count_enc = ce.CountEncoder()

    # Transform the features, rename the columns with the _count suffix,
    # and join to the dataframe
    # TODO: calculating the counts on the whole dataset? Should it be on the
    # train only to avoid data leakage? That is what was done in Exercise 2.
    count_encoded = count_enc.fit_transform(ks[cat_features])
    data = data.join(count_encoded.add_suffix("_count"))

    # Train a model
    train, valid, test = get_data_splits(data)
    print_("LightGBM with COUNT encoding", 0)
    train_model(train, valid)
    print()

    # ---------------
    # Target Encoding
    # ---------------
    # Create the encoder
    target_enc = ce.TargetEncoder(cols=cat_features)
    target_enc.fit(train[cat_features], train['outcome'])

    # Transform the features, rename the columns with the _target suffix,
    # and join to the dataframe
    train_TE = train.join(
        target_enc.transform(train[cat_features]).add_suffix('_target'))
    valid_TE = valid.join(
        target_enc.transform(valid[cat_features]).add_suffix('_target'))

    # Train a model
    print_("LightGBM with TARGET encoding", 0)
    train_model(train_TE, valid_TE)
    print()

    # -----------------
    # CatBoost Encoding
    # -----------------
    # Create the encoder (the original instantiated ce.TargetEncoder here,
    # a copy-paste slip; CatBoostEncoder matches the section's intent)
    cb_enc = ce.CatBoostEncoder(cols=cat_features)
    cb_enc.fit(train[cat_features], train['outcome'])

    # Transform the features, rename the columns with the _cb suffix,
    # and join to the dataframe
    train_CBE = train.join(
        cb_enc.transform(train[cat_features]).add_suffix('_cb'))
    valid_CBE = valid.join(
        cb_enc.transform(valid[cat_features]).add_suffix('_cb'))

    # Train a model
    print_("LightGBM with CatBoost encoding", 0)
    train_model(train_CBE, valid_CBE)
    print()
# 1) Categorical encodings and leakage
# =============================================================================
# Use only training data to encode. If not -> data leakage
# =============================================================================

# 2) Count encodings
# =============================================================================
import category_encoders as ce

cat_features = ['ip', 'app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)

# Create the count encoder
count_enc = ce.CountEncoder(cols=cat_features)

# Learn encoding from the training set
count_enc.fit(train[cat_features])

# Apply encoding to the train and validation sets
train_encoded = train.join(
    count_enc.transform(train[cat_features]).add_suffix('_count'))
valid_encoded = valid.join(
    count_enc.transform(valid[cat_features]).add_suffix('_count'))

model2 = train_model(train_encoded, valid_encoded)

# =============================================================================
# 4) Target encoding
# =============================================================================
def preprocess_table(input_file_path, output_file_path):
    """
    Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """
    encoders = {}
    logger = logging.getLogger(__name__)

    df_full_train, output_filepath_df_train, output_filepath_misc_train = read_table(
        input_file_path, logger, output_file_path, suffix="Train")
    df_full_test, output_filepath_df_test, output_filepath_misc_test = read_table(
        input_file_path, logger, output_file_path, suffix="Test")
    df_full_val, output_filepath_df_val, output_filepath_misc_val = read_table(
        input_file_path, logger, output_file_path, suffix="Validation")

    # Cast categoricals to string before encoding
    for cat in CAT_COLUMNS:
        logger.info(f"to category: {cat}")
        df_full_train[cat] = df_full_train[cat].astype(str)
        df_full_test[cat] = df_full_test[cat].astype(str)
        df_full_val[cat] = df_full_val[cat].astype(str)

    CALC_COUNT_COLUMNS = []
    df_to_fit_le = pd.concat([df_full_train, df_full_val],
                             axis=0)[df_full_test.columns]

    # Count-encode categoricals as normalized frequencies
    # (note: despite its name, `label_encoder` is a CountEncoder)
    label_encoder = ce.CountEncoder(return_df=True,
                                    cols=CAT_COLUMNS,
                                    verbose=1,
                                    normalize=True)
    count_encoder = ce.CountEncoder(return_df=True,
                                    cols=COUNT_COLUMNS + CALC_COUNT_COLUMNS,
                                    verbose=1,
                                    normalize=True)

    # Encode train, validation, and test with the first encoder
    label_encoder.fit(df_to_fit_le)
    df_full_train[df_full_test.columns] = label_encoder.transform(
        df_full_train[df_full_test.columns])
    df_full_test = label_encoder.transform(df_full_test)
    df_full_val[df_full_test.columns] = label_encoder.transform(
        df_full_val[df_full_test.columns])

    # Encode train, validation, and test with the second encoder
    count_encoder.fit(df_to_fit_le)
    df_full_train[df_full_test.columns] = count_encoder.transform(
        df_full_train[df_full_test.columns])
    df_full_test = count_encoder.transform(df_full_test)
    df_full_val[df_full_test.columns] = count_encoder.transform(
        df_full_val[df_full_test.columns])

    # Encode aggregate statistics using BallTree:
    X = pd.concat(
        [df_full_train[['lat', 'long']], df_full_val[['lat', 'long']]],
        axis=0).values

    # Build a tree:
    tree = BallTree(X)

    # Calculate aggregate statistics using tree:
    X_to_get_data = pd.concat([df_full_train, df_full_val], axis=0)
    # df_full_train = calculate_agg_statistics(tree, X_to_get_data, df_full_train)
    # df_full_val = calculate_agg_statistics(tree, X_to_get_data, df_full_val)
    # df_full_test = calculate_agg_statistics(tree, X_to_get_data, df_full_test)
    # print(df_full_train.shape)
    print(df_full_test.shape)
    print(df_full_val.shape)

    # Encode test:
    misc = {}
    misc["encoder_dict"] = encoders

    # profile = feature_df.profile_report(title=f'Pandas Profiling Report for {suffix}')
    # profile.to_file(output_file=os.path.join(project_dir, f"output_{suffix}.html"))

    df_full_train.to_pickle(output_filepath_df_train)
    df_full_test.to_pickle(output_filepath_df_test)
    df_full_val.to_pickle(output_filepath_df_val)

    with open(output_filepath_misc_train, "wb") as f:
        pickle.dump(misc, f)

    return 0
    lst_combination = (list(combinations(auto_columns_2, 2)) +
                       list(combinations(auto_columns_3, 2)) +
                       list(combinations(auto_columns_4, 2)))
    for l, r in lst_combination:
        for func in 'add subtract divide multiply'.split():
            df[f'auto_{func}_{l}_{r}'] = getattr(np, func)(df[l], df[r])
    return df


def transform(df):
    df = process_datetime_cols(df)
    df = process_categorical_cols(df)
    df = process_others(df)
    return df.drop(ignore_columns, axis=1)


train = transform(train)
test = transform(test)

# Create the encoder on train and test combined
t = pd.concat([train, test]).reset_index(drop=True)
count_enc = ce.CountEncoder().fit_transform(t[cat_features])
tt = t.join(count_enc.add_suffix("_count"))

f2_train = tt.loc[tt.index < train.shape[0]]
f2_test = tt.loc[tt.index >= train.shape[0]]

columns = sorted(set(f2_train.columns).intersection(f2_test.columns))
print(len(columns))
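# Hedged aside on the concat-then-encode step above: pooling counts over
# train and test is a deliberate Kaggle-style choice, but it leaks test-set
# frequencies into the features. A train-only alternative, sketched under
# the same assumed names (`train`, `test`, `cat_features`):
def _count_encode_train_only(train, test, cat_features):
    enc = ce.CountEncoder(cols=cat_features).fit(train[cat_features])
    train_out = train.join(
        enc.transform(train[cat_features]).add_suffix('_count'))
    test_out = test.join(
        enc.transform(test[cat_features]).add_suffix('_count'))
    return train_out, test_out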
def ex_2():
    print_("Exercise 2: Categorical Encodings", 0, 1)

    clicks = load_data_for_ex_2()

    def get_data_splits(dataframe, valid_fraction=0.1):
        """Splits a dataframe into train, validation, and test sets.

        First, orders by the column 'click_time'. Set the size of the
        validation and test sets with the valid_fraction keyword argument.
        """
        dataframe = dataframe.sort_values('click_time')
        valid_rows = int(len(dataframe) * valid_fraction)
        train = dataframe[:-valid_rows * 2]
        # valid size == test size, last two sections of the data
        valid = dataframe[-valid_rows * 2:-valid_rows]
        test = dataframe[-valid_rows:]
        return train, valid, test

    def train_model(train, valid, test=None, feature_cols=None):
        if feature_cols is None:
            feature_cols = train.columns.drop(
                ['click_time', 'attributed_time', 'is_attributed'])
        dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])
        param = {
            'num_leaves': 256,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7,
            'verbose': -1
        }
        num_round = 1000
        bst = lgb.train(param,
                        dtrain,
                        num_round,
                        valid_sets=[dvalid],
                        early_stopping_rounds=20,
                        verbose_eval=False)
        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['is_attributed'], valid_pred)
        print(f"Validation AUC score: {valid_score}")
        if test is not None:
            test_pred = bst.predict(test[feature_cols])
            test_score = metrics.roc_auc_score(test['is_attributed'], test_pred)
            return bst, valid_score, test_score
        else:
            return bst, valid_score

    print_("Baseline model", 0)
    train, valid, test = get_data_splits(clicks)
    _ = train_model(train, valid)
    print()

    # ------------------------------------
    # 1. Categorical encodings and leakage
    # ------------------------------------

    # ------------------
    # 2. Count encodings
    # ------------------
    cat_features = ['ip', 'app', 'device', 'os', 'channel']
    train, valid, test = get_data_splits(clicks)

    # Create the count encoder
    count_enc = ce.CountEncoder(cols=cat_features)

    # Learn encoding from the training set
    # TODO: Why not train['is_attributed']? Count encoding is unsupervised,
    # so the target is not needed at fit time.
    count_enc.fit(train[cat_features])
    # count_enc.fit(train[cat_features], train['is_attributed'])

    # Apply encoding to the train and validation sets as new columns
    # Make sure to add `_count` as a suffix to the new columns
    train_encoded = train.join(
        count_enc.transform(train[cat_features]).add_suffix('_count'))
    valid_encoded = valid.join(
        count_enc.transform(valid[cat_features]).add_suffix('_count'))

    # Train the model on the encoded datasets
    print_("LightGBM with COUNT encoding", 0)
    _ = train_model(train_encoded, valid_encoded)
    print()

    # ------------------
    # 4. Target encoding
    # ------------------
    # Create the target encoder
    target_enc = ce.TargetEncoder(cols=cat_features)

    # Learn encoding from the training set. Use the 'is_attributed' column as the target.
    target_enc.fit(train[cat_features], train['is_attributed'])

    # Apply encoding to the train and validation sets as new columns
    # Make sure to add `_target` as a suffix to the new columns
    train_encoded = train.join(
        target_enc.transform(train[cat_features]).add_suffix('_target'))
    valid_encoded = valid.join(
        target_enc.transform(valid[cat_features]).add_suffix('_target'))

    # Train a model
    print_("LightGBM with TARGET encoding", 0)
    _ = train_model(train_encoded, valid_encoded)
    print()

    # --------------------
    # 6. CatBoost Encoding
    # --------------------
    # Remove IP from the encoded features
    cat_features = ['app', 'device', 'os', 'channel']

    # Create the CatBoost encoder
    cb_enc = ce.CatBoostEncoder(cols=cat_features, random_state=7)

    # Learn encoding from the training set
    cb_enc.fit(train[cat_features], train['is_attributed'])

    # Apply encoding to the train and validation sets as new columns
    # Make sure to add `_cb` as a suffix to the new columns
    train_encoded = train.join(
        cb_enc.transform(train[cat_features]).add_suffix('_cb'))
    valid_encoded = valid.join(
        cb_enc.transform(valid[cat_features]).add_suffix('_cb'))

    # Train a model
    print_("LightGBM with CatBoost encoding", 0)
    _ = train_model(train_encoded, valid_encoded)
    print()