def apply_weight_of_evidence_encoding(df, categorical_columns, label='y'):
    if not isinstance(df, pd.DataFrame):
        raise DataFrameTypeError('df', df)
    import category_encoders as ce
    # fit and transform on the same feature frame so the encoder sees a consistent set of columns
    features = df.drop([label], axis=1)
    encoder = ce.WOEEncoder(cols=categorical_columns).fit(features, df[label])
    X_transformed = encoder.transform(features)
    return X_transformed
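# A minimal usage sketch for the helper above (an illustration, not from the original source):
# the toy frame, the 'colour' column and the 'y' label are hypothetical.
import pandas as pd

toy = pd.DataFrame({
    'colour': ['red', 'red', 'blue', 'blue', 'green', 'green'],
    'amount': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    'y': [1, 0, 0, 0, 1, 1],
})
encoded = apply_weight_of_evidence_encoding(toy, categorical_columns=['colour'], label='y')
# 'colour' is replaced by its WoE value; 'amount' passes through unchanged
print(encoded.head())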
def cal_woe(df_tr, col):
    # fit the WoE encoder on the single column being encoded and the binary default label
    enc = ce.WOEEncoder(cols=[col]).fit(df_tr[[col]], df_tr['isDefault'])
    tmp = pd.DataFrame({
        f'{col}': df_tr[col],
        f'woe_{col}': enc.transform(df_tr[[col]], df_tr['isDefault'])[col]
    })
    return tmp.groupby([col])[f'woe_{col}'].mean(), f'woe_{col}'
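# A hedged usage sketch (the 'grade' column and the df_tr frame are hypothetical; the frame
# is assumed to carry the binary 'isDefault' label used above).
woe_map, woe_col_name = cal_woe(df_tr, 'grade')
print(woe_col_name)    # 'woe_grade'
print(woe_map.head())  # mean WoE value per 'grade' category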
def test_HandleUnknownValue_HaveUnknown_ExpectEncodedWithZero(self):
    X = ['a', 'a', 'b', 'b']
    y = [1, 0, 0, 0]
    test = ['a', 'c']
    enc = encoders.WOEEncoder(handle_unknown='value')
    enc.fit(X, y)
    result = enc.transform(test)
    expected = pd.Series([0.5108256237659906, 0], name=0)
    pd.testing.assert_series_equal(expected, result[0])
def __init__(self, encoder_type, columns_name=None):
    """
    :param encoder_type: name of the category_encoders encoder to use
    :param columns_name: list of feature column names to encode
    """
    if encoder_type == "BackwardDe":  # backward difference encoding
        self.encoder = ce.BackwardDifferenceEncoder(cols=columns_name)
    elif encoder_type == "BaseN":  # BaseN encoding
        self.encoder = ce.BaseNEncoder(cols=columns_name)
    elif encoder_type == "Binary":  # binary encoding
        self.encoder = ce.BinaryEncoder(cols=columns_name)
    elif encoder_type == "Catboost":
        self.encoder = ce.CatBoostEncoder(cols=columns_name)
    elif encoder_type == "Hash":
        self.encoder = ce.HashingEncoder(cols=columns_name)
    elif encoder_type == "Helmert":
        self.encoder = ce.HelmertEncoder(cols=columns_name)
    elif encoder_type == "JamesStein":
        self.encoder = ce.JamesSteinEncoder(cols=columns_name)
    elif encoder_type == "LOO":  # leave-one-out encoding
        self.encoder = ce.LeaveOneOutEncoder(cols=columns_name)
    elif encoder_type == "ME":
        self.encoder = ce.MEstimateEncoder(cols=columns_name)  # M-estimate encoder
    elif encoder_type == "OneHot":
        self.encoder = ce.OneHotEncoder(cols=columns_name)
    elif encoder_type == "OridinalEncoder":  # ordinal encoding
        self.encoder = ce.OrdinalEncoder(cols=columns_name)
    elif encoder_type == "Sum":  # sum encoding
        self.encoder = ce.SumEncoder(cols=columns_name)
    elif encoder_type == "Polynomial":  # polynomial encoding
        self.encoder = ce.PolynomialEncoder(cols=columns_name)
    elif encoder_type == "Target":  # target encoding
        self.encoder = ce.TargetEncoder(cols=columns_name)
    elif encoder_type == "WOE":  # weight-of-evidence encoder
        self.encoder = ce.WOEEncoder(cols=columns_name)
    else:
        raise ValueError("Please choose a valid encoder type")
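# A hedged usage sketch, assuming the __init__ above belongs to a thin wrapper class
# (called CategoricalEncoderWrapper here purely for illustration) that exposes self.encoder;
# X_train/y_train/X_test and the column names are hypothetical.
wrapper = CategoricalEncoderWrapper("WOE", columns_name=["city", "device_type"])
X_train_encoded = wrapper.encoder.fit_transform(X_train, y_train)
X_test_encoded = wrapper.encoder.transform(X_test)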
def test_HaveArrays_ExpectCalculatedProperly(self):
    X = ['a', 'a', 'b', 'b']
    y = [1, 0, 0, 0]
    enc = encoders.WOEEncoder()
    result = enc.fit_transform(X, y)
    expected = pd.Series([
        0.5108256237659906, 0.5108256237659906,
        -0.587786664902119, -0.587786664902119
    ], name=0)
    pd.testing.assert_series_equal(expected, result[0])
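# Where 0.5108... and -0.5878... come from: a by-hand check of the expected values used in
# these WOEEncoder tests, assuming the encoder's default regularization of 1.0. With
# X = ['a', 'a', 'b', 'b'] and y = [1, 0, 0, 0] there is 1 event and 3 non-events overall;
# category 'a' holds 1 event / 1 non-event and category 'b' holds 0 events / 2 non-events.
import math

def woe_by_hand(events, non_events, total_events, total_non_events, reg=1.0):
    p_event = (events + reg) / (total_events + 2 * reg)
    p_non_event = (non_events + reg) / (total_non_events + 2 * reg)
    return math.log(p_event / p_non_event)

print(woe_by_hand(1, 1, 1, 3))  # 0.5108256237659906  -> value for 'a'
print(woe_by_hand(0, 2, 1, 3))  # -0.5877866649021191 -> value for 'b'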
def test_HandleMissingValue_HaveMissingInTrain_ExpectEncoded(self):
    X = ['a', 'a', np.nan, np.nan]
    y = [1, 0, 0, 0]
    enc = encoders.WOEEncoder(handle_missing='value')
    result = enc.fit_transform(X, y)
    expected = pd.Series([
        0.5108256237659906, 0.5108256237659906,
        -0.587786664902119, -0.587786664902119
    ], name=0)
    pd.testing.assert_series_equal(expected, result[0])
def woe_encoding(X_fit, y_fit, cols, X_test=None, sigma=0):
    """
    Weight-of-evidence encoding; only valid for a binomial target.

    X_fit: DataFrame used to fit the encoding; must contain `cols`
    y_fit: target used to fit the encoding
    X_test: data to transform (defaults to X_fit)
    cols: columns to encode
    sigma: standard deviation of the noise added to reduce overfitting
    """
    if X_test is None:
        X_test = X_fit
    encoder = ce.WOEEncoder(cols=cols, sigma=sigma)
    encoder.fit(X_fit, y_fit)
    result = encoder.transform(X_test)
    return result
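# A hedged usage sketch of the helper above; the frames, the 'city' column and the binary
# target below are hypothetical.
train_encoded = woe_encoding(X_train, y_train, cols=['city'])                # encode the training frame itself
test_encoded = woe_encoding(X_train, y_train, cols=['city'], X_test=X_test)  # apply the same fit to held-out data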
def woe_discrete(df, discrete_variable_name, target):
    """Generates the WoE transformation of discrete variables.

    Args:
        df (pd.DataFrame): dataframe containing the discrete variables to be transformed
        discrete_variable_name (list): list of discrete variables to be transformed
        target (str): target variable name

    Returns:
        pd.DataFrame: the original dataframe with the WoE-transformed discrete variables
        added as extra columns
    """
    woe_encoder = ce.WOEEncoder(cols=discrete_variable_name)
    woe_of_discrete_variables = woe_encoder.fit_transform(
        df[discrete_variable_name], df[target]).add_suffix('_woe')
    df = df.join(woe_of_discrete_variables)
    return df
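# A hedged usage sketch; the frame and column names below are hypothetical. The original
# columns are kept and each encoded column comes back with a '_woe' suffix.
df_with_woe = woe_discrete(df, ['home_ownership', 'purpose'], target='default_flag')
print(df_with_woe[['home_ownership', 'home_ownership_woe']].head())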
def woe_encode(self, train, test, feature):
    '''
    Weight of Evidence Encoding (WOE)
    FYI: the warning can be ignored; it refers to this issue:
    https://github.com/scikit-learn-contrib/category_encoders/issues/281
    '''
    train = train.copy()
    test = test.copy()
    encoder = category_encoders.WOEEncoder()
    train[f'{feature}_WOE'] = encoder.fit_transform(
        train[feature].astype("category"), train["purchased"]
    )[feature].values
    test[f'{feature}_WOE'] = encoder.transform(
        test[feature].astype("category")
    )[feature].values
    return train, test
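# A hedged usage sketch, assuming the method above lives on a feature-engineering object
# (called fe here for illustration) and that both frames carry the binary 'purchased' target.
train_enc, test_enc = fe.woe_encode(train_df, test_df, feature='product_category')
print(train_enc['product_category_WOE'].describe())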
def define_lr_pipeline(df: pd.DataFrame, target_col: str, n_jobs: int,
                       random_state: int) -> Pipeline:
    woe = ce.WOEEncoder()
    sc = StandardScaler()
    lr = LogisticRegression(n_jobs=n_jobs, random_state=random_state)
    # from sklearn.tree import DecisionTreeClassifier
    # dt = DecisionTreeClassifier(max_depth=5, random_state=random_state)
    cat_features = (df.drop(columns=target_col).select_dtypes('object').columns)
    num_features = (df.drop(columns=target_col).select_dtypes('number').columns)
    transformer = ColumnTransformer([('woe', woe, cat_features),
                                     ('sc', sc, num_features)])
    pipeline = Pipeline([('transformer', transformer), ('clf', lr)])
    return pipeline
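# A hedged usage sketch of the pipeline factory above; df and the 'churn' target are hypothetical.
pipeline = define_lr_pipeline(df, target_col='churn', n_jobs=-1, random_state=42)
pipeline.fit(df.drop(columns='churn'), df['churn'])
probas = pipeline.predict_proba(df.drop(columns='churn'))[:, 1]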
def fit(X, y, output_dir, **kwargs):
    """
    This hook defines how DataRobot will train this task. Even transform tasks need to be
    trained to learn/store information from training data.
    DataRobot runs this hook when the task is being trained inside a blueprint.
    As an output, this hook is expected to create an artifact containing a trained object
    [in this example - the fitted WOE encoder], that is then used to transform new data.
    The input parameters are passed by DataRobot based on project and blueprint configuration.

    Parameters
    ----------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
    y: pd.Series
        Project's target column (None is passed for unsupervised projects).
    output_dir: str
        A path to the output folder; the artifact [in this example - the pickled WOE encoder]
        must be saved into this folder to be re-used in transform().

    Returns
    -------
    None
        fit() doesn't return anything, but must output an artifact (typically containing a
        trained object) into output_dir so that the trained object can be used during
        scoring inside transform().
    """
    # Transform categorical columns into a numeric representation using Weight of Evidence
    encoder_woe = ce.WOEEncoder(cols=X.columns, randomized=True,
                                handle_missing='value', handle_unknown='value')
    encoder_woe.fit(X, y)

    # dump the trained object into an artifact [in this example - woe.pkl]
    # and save it into output_dir so that it can be used later to transform new data
    output_dir_path = Path(output_dir)
    if output_dir_path.exists() and output_dir_path.is_dir():
        with open("{}/woe.pkl".format(output_dir), "wb") as fp:
            pickle.dump(encoder_woe, fp)
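# A minimal sketch of the scoring-side counterpart (not part of the original snippet).
# The hook name and the idea that DataRobot hands back the deserialized artifact as the
# second argument are assumptions based on the docstring above; only transform() on the
# fitted WOEEncoder is taken from category_encoders.
def transform(X, transformer):
    """Apply the WOE encoder that fit() pickled into output_dir to new data."""
    return transformer.transform(X)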
    'victimIsDucked', 'victimIsDucking', 'victimIsDefusing', 'victimIsScoped',
    'victimHasHelmet', 'hitgroup',
]

features = [
    "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12",
    "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22",
    "x23", "x24", "x25", "x26", "x27"
]

X0 = df.drop(['ct_wins', 't_wins'], axis=1)
encoder = ce.WOEEncoder(cols=nominal_columns).fit(X0, y)
X = encoder.transform(X0)
X = X.rename(index=str, columns={
    "attackerHealth": "x1",
    "attackerXPosition": "x2",
    "attackerYPosition": "x3",
    "attackerZPosition": "x4",
    "weapon": "x5",
    "attackerSpotted": "x6",
    "attackerSide": "x7",
    "attackerIsScoped": "x8",
    "attackerIsDucked": "x9",
    "attackerIsDucking": "x10",
    "attackerHasHelmet": "x11",
    'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff',
    'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff',
    'vote.arff', 'vowel.arff'
]

# We ignore encoders {BackwardDifferenceEncoder, HelmertEncoder, PolynomialEncoder and SumEncoder} because of:
# https://github.com/scikit-learn-contrib/categorical-encoding/issues/91
encoders = [
    category_encoders.BaseNEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    category_encoders.OrdinalEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.WOEEncoder()
]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Ok...
warnings.filterwarnings('ignore')

# Loop over datasets, then over encoders, and finally, over the models
for dataset_name in datasets:
    X, y, fold_count = arff_loader.load(dataset_name)
    non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values)
    for encoder in encoders:
        print("Encoding:", dataset_name, y.name, encoder.__class__.__name__)
# We painstakingly initialize each encoder here because that gives us the freedom to
# initialize the encoders with any setting we want.
encoders = [
    # category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    # category_encoders.HelmertEncoder(),
    category_encoders.JamesSteinEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.MEstimateEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.OrdinalEncoder(),
    # category_encoders.PolynomialEncoder(),
    # category_encoders.SumEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.WOEEncoder()]

encoders = [
    # category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(handle_missing='value'),
    category_encoders.BaseNEncoder(handle_missing='indicator'),
    category_encoders.BinaryEncoder(handle_missing='value'),
    category_encoders.BinaryEncoder(handle_missing='indicator'),
    # category_encoders.HashingEncoder(handle_missing='value'),
    # category_encoders.HashingEncoder(handle_missing='indicator'),
    # category_encoders.HelmertEncoder(),
    category_encoders.JamesSteinEncoder(handle_missing='value'),
    category_encoders.JamesSteinEncoder(handle_missing='indicator'),
    category_encoders.LeaveOneOutEncoder(handle_missing='value'),
    category_encoders.LeaveOneOutEncoder(handle_missing='indicator'),
    category_encoders.MEstimateEncoder(handle_missing='value'),
    category_encoders.MEstimateEncoder(handle_missing='indicator'),
def get_encoder(self) -> BaseEstimator:
    return ce.WOEEncoder(cols=self.target_columns)
# target encoding
start_time = time.time()
target_encoder = ce.TargetEncoder(cols=cat_cols_bank, smoothing=1)
mean_target_transformed = target_encoder.fit_transform(df_bank[cat_cols_bank], df_bank['y'])
print('computation time of target:', time.time() - start_time)
print('Memory usage after encoding: ',
      round(mean_target_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

# WoE
start_time = time.time()
woe_encoder = ce.WOEEncoder(cols=cat_cols_bank)
woe_encoder_transformed = woe_encoder.fit_transform(df_bank[cat_cols_bank], df_bank['y'])
print('computation time of WOE :', time.time() - start_time)
print('Memory usage after encoding: ',
      round(woe_encoder_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3))

# embeddings = [('one hot encoding', df_bank_one_hot_transformed), ('label encoding', df_bank_label_transformed),
#               ('hash encoding', hash_transformed), ('target encoding', mean_target_transformed),
#               ('WOE encoding', woe_encoder_transformed)]

# %% Train-Test split
num_fold = 5
X = label_transformed.drop(['y'],
                      min_split_gain=0.0, missing=-999, n_estimators=500, n_jobs=1,
                      num_leaves=31, objective=None, random_state=64, reg_alpha=0.0,
                      reg_lambda=0.0, silent=1, subsample=0.8,
                      subsample_for_bin=200000, subsample_freq=0)

pipe = Pipeline([('transformer', FeatureSelector()),
                 ('encoder', ce.WOEEncoder()),
                 ('scaler', MinMaxScaler()),
                 ('classifier', lgbm)])
pipe.fit(train, y)

cloudpickle.dump(pipe, open('titanicModel.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
model = pickle.load(open('titanicModel.pkl', 'rb'))

col_dict = {i: [] for i in train.columns}
col_dict['Pclass'].append(3)
col_dict['Name'].append('asda, Mr. Ram')
col_dict['Sex'].append('male')
def get_model(PARAMS):
    """return model for provided params

    :param PARAMS: dictionary with model params
    :type PARAMS: dict
    :return: model pipeline
    :rtype: sklearn pipeline
    """
    try:
        te_dict = {
            'CatBoostEncoder': ce.CatBoostEncoder(),
            'HashingEncoder': ce.HashingEncoder(),
            'HelmertEncoder': ce.HelmertEncoder(),
            'LeaveOneOutEncoder': ce.LeaveOneOutEncoder(),
            'OneHotEncoder': ce.OneHotEncoder(),
            'TargetEncoder': ce.TargetEncoder(),
            'WOEEncoder': ce.WOEEncoder(),
            'BackwardDifferenceEncoder': ce.BackwardDifferenceEncoder(),
            'BaseNEncoder': ce.BaseNEncoder(),
            'BinaryEncoder': ce.BinaryEncoder(),
            'CountEncoder': ce.CountEncoder(),
            'JamesSteinEncoder': ce.JamesSteinEncoder(),
            'MEstimateEncoder': ce.MEstimateEncoder(),
            'PolynomialEncoder': ce.PolynomialEncoder(),
            'SumEncoder': ce.SumEncoder()
        }

        pipe = make_pipeline(
            helpers.PrepareData(extraxt_year=True, unicode_text=True),
            ColumnTransformer([
                ('num', helpers.PassThroughOrReplace(), [
                    'flat_size', 'rooms', 'floor', 'number_of_floors',
                    'year_of_building', 'GC_latitude', 'GC_longitude'
                ]),
                ('te_producer', te_dict.get(PARAMS['te_producer']), 'producer_name'),
                ('te_road', te_dict.get(PARAMS['te_road']), 'GC_addr_road'),
                ('te_neighbourhood', te_dict.get(PARAMS['te_neighbourhood']), 'GC_addr_neighbourhood'),
                ('te_suburb', te_dict.get(PARAMS['te_suburb']), 'GC_addr_suburb'),
                ('te_postcode', te_dict.get(PARAMS['te_postcode']), 'GC_addr_postcode'),
                ('txt_name',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1, PARAMS['txt_name__ngram_range']),
                                 max_features=PARAMS['txt_name__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_name__binary'],
                                 use_idf=PARAMS['txt_name__use_idf']),
                 'name'),
                ('txt_dscr',
                 TfidfVectorizer(lowercase=True,
                                 ngram_range=(1, PARAMS['txt_dscr__ngram_range']),
                                 max_features=PARAMS['txt_dscr__max_features'],
                                 dtype=np.float32,
                                 binary=PARAMS['txt_dscr__binary'],
                                 use_idf=PARAMS['txt_dscr__use_idf']),
                 'description'),
            ]),
            TransformedTargetRegressor(
                regressor=lgb.LGBMRegressor(**PARAMS, random_state=seed),
                func=np.log1p,
                inverse_func=np.expm1))

        return pipe

    except BaseException as e:
        LOG.error(e)
        return None
def test_woe(self):
    cols = [
        'unique_str', 'underscore', 'extra', 'none', 'invariant', 321,
        'categorical', 'na_categorical', 'categorical_int'
    ]

    # balanced label with balanced features
    X_balanced = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col1'])
    y_balanced = [True, False, True, False, True, False]
    enc = encoders.WOEEncoder()
    enc.fit(X_balanced, y_balanced)
    X1 = enc.transform(X_balanced)
    self.assertTrue(
        all(X1.sum() < 0.001),
        "When the class label is balanced, WoE should sum to 0 in each transformed column")

    enc = encoders.WOEEncoder(cols=cols)
    enc.fit(X, np_y)
    X1 = enc.transform(X_t)
    th.verify_numeric(X1[cols])
    self.assertTrue(
        np.isfinite(X1[cols].values).all(),
        'There must not be any NaN, inf or -inf in the transformed columns')
    self.assertEqual(len(list(X_t)), len(list(X1)), 'The count of attributes must not change')
    self.assertEqual(len(X_t), len(X1), 'The count of rows must not change')

    X2 = enc.transform(X_t, np_y_t)
    th.verify_numeric(X2)
    self.assertTrue(
        np.isfinite(X2[cols].values).all(),
        'There must not be any NaN, inf or -inf in the transformed columns')
    self.assertEqual(len(list(X_t)), len(list(X2)), 'The count of attributes must not change')
    self.assertEqual(len(X_t), len(X2), 'The count of rows must not change')

    X3 = enc.transform(X, np_y)
    th.verify_numeric(X3)
    self.assertTrue(
        np.isfinite(X3[cols].values).all(),
        'There must not be any NaN, inf or -inf in the transformed columns')
    self.assertEqual(len(list(X)), len(list(X3)), 'The count of attributes must not change')
    self.assertEqual(len(X), len(X3), 'The count of rows must not change')
    self.assertTrue(
        X3['unique_str'].var() < 0.001,
        'The unique string column must not be predictive of the label')

    X4 = enc.fit_transform(X, np_y)
    th.verify_numeric(X4)
    self.assertTrue(
        np.isfinite(X4[cols].values).all(),
        'There must not be any NaN, inf or -inf in the transformed columns')
    self.assertEqual(len(list(X)), len(list(X4)), 'The count of attributes must not change')
    self.assertEqual(len(X), len(X4), 'The count of rows must not change')
    self.assertTrue(
        X4['unique_str'].var() < 0.001,
        'The unique string column must not be predictive of the label')

    enc = encoders.WOEEncoder()
    enc.fit(X, np_y)
    X1 = enc.transform(X_t)
    self.assertEqual(len(list(X_t)), len(list(X1)), 'The count of attributes must not change')
    self.assertEqual(len(X_t), len(X1), 'The count of rows must not change')
    th.verify_numeric(X1)
    X2 = enc.transform(X_t, np_y_t)
    th.verify_numeric(X2)
    self.assertEqual(len(list(X_t)), len(list(X2)), 'The count of attributes must not change')
    self.assertEqual(len(X_t), len(X2), 'The count of rows must not change')

    # seed
    enc = encoders.WOEEncoder(cols=cols, random_state=2001, randomized=True)
    enc.fit(X, np_y)
    X1 = enc.transform(X_t, np_y_t)
    X2 = enc.transform(X_t, np_y_t)
    self.assertTrue(X1.equals(X2), "When the seed is given, the results must be identical")
    th.verify_numeric(X1)
    th.verify_numeric(X2)

    # invariant target
    y_invariant = [True, True, True, True, True, True]
    enc = encoders.WOEEncoder()
    with self.assertRaises(ValueError):
        enc.fit(X_balanced, y_invariant)

    # branch coverage unit tests - no cols
    enc = encoders.WOEEncoder(cols=[])
    enc.fit(X, np_y)
    self.assertTrue(enc.transform(X_t).equals(X_t))

    # missing values in the target
    y_missing = [True, True, None, True, True, True]
    enc = encoders.WOEEncoder()
    with self.assertRaises(ValueError):
        enc.fit(X_balanced, y_missing)

    # impute missing
    enc = encoders.WOEEncoder(handle_missing='return_nan')
    enc.fit(X, np_y)
    X1 = enc.transform(X_t)
    th.verify_numeric(X1)
    self.assertTrue(X1.isnull().values.any())
    self.assertEqual(len(list(X_t)), len(list(X1)), 'The count of attributes must not change')
    self.assertEqual(len(X_t), len(X1), 'The count of rows must not change')

    X2 = enc.transform(X_t, np_y_t)
    th.verify_numeric(X2)
    self.assertTrue(X1.isnull().values.any())
    self.assertEqual(len(list(X_t)), len(list(X2)), 'The count of attributes must not change')
    self.assertEqual(len(X_t), len(X2), 'The count of rows must not change')
train, test = train_test_split(data, test_size=0.15, random_state=42)

# %%
x = train.drop(columns=['y']).reset_index(drop=True)
y = train['y'].reset_index(drop=True)
x_test = test.drop(columns=['y']).reset_index(drop=True)
y_test = test['y'].reset_index(drop=True)

# %%
encoders = {
    "one-hot": ce.OneHotEncoder(drop_invariant=True, return_df=True, use_cat_names=True),
    "woe": ce.WOEEncoder(drop_invariant=True, return_df=True),
    "binary": ce.BinaryEncoder(drop_invariant=True, return_df=True),
}


def objective(trial: opt.Trial):
    # only test dropping socio-economic factors
    drop_sozioeco = trial.suggest_categorical("drop_eco", [True, False])
    # the rest of the preprocessing keeps default values
    # categorical encoding: try identical encoders for all columns (for now)
    enc_name = trial.suggest_categorical("encoder", ["one-hot", "woe", "binary"])
    enc = encoders[enc_name]
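# A hedged sketch of running the (truncated) objective above, assuming `opt` is the
# `optuna` module imported under that alias and that the objective returns a score to maximize.
study = opt.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print(study.best_params)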
# We painstakingly initialize each encoder here because that gives us the freedom to
# initialize the encoders with any setting we want.
encoders = [
    # category_encoders.BackwardDifferenceEncoder(),
    category_encoders.BaseNEncoder(),
    category_encoders.BinaryEncoder(),
    category_encoders.HashingEncoder(),
    # category_encoders.HelmertEncoder(),
    category_encoders.JamesSteinEncoder(),
    category_encoders.LeaveOneOutEncoder(),
    category_encoders.MEstimateEncoder(),
    category_encoders.OneHotEncoder(),
    category_encoders.OrdinalEncoder(),
    # category_encoders.PolynomialEncoder(),
    # category_encoders.SumEncoder(),
    category_encoders.TargetEncoder(),
    category_encoders.WOEEncoder()]
encoders = [category_encoders.TargetEncoder(),
            category_encoders.JamesSteinEncoder(),
            category_encoders.WOEEncoder()]

# Initialization
if os.path.isfile('./output/result.csv'):
    os.remove('./output/result.csv')

# Loop over datasets, then over encoders
for dataset_name in datasets:
    # X, y, fold_count = arff_loader.load(dataset_name)
    X, y, fold_count, nominal_columns = csv_loader.load(dataset_name)

    # Get indexes (not names) of categorical features
    categorical_indexes = []
    for col in X.select_dtypes(exclude=[np.number]).columns.values:
df = job.sample(frac=1, random_state=12)

# %% different embeddings
# one-hot encoding
one_hot_encoder = ce.OneHotEncoder(cols=['Job'])
df_one_hot_transformed = one_hot_encoder.fit_transform(df)
print(df_one_hot_transformed.iloc[0:7, ])

# label encoding
label_encoder = ce.OrdinalEncoder(cols=['Job'])
df_label_transformed = label_encoder.fit_transform(df)
print(df_label_transformed.iloc[0:7, ])

# hash encoding with md5 hash function
hash_encoder = ce.HashingEncoder(cols=['Job'], n_components=7)
hash_transformed = hash_encoder.fit_transform(df)
print(hash_transformed.iloc[0:7, ])

# target encoding
target_encoder = ce.TargetEncoder(cols='Job', smoothing=1)
mean_target_transformed = target_encoder.fit_transform(df['Job'], df['Target'])
print(mean_target_transformed.iloc[0:7, ])

# WoE
woe_encoder = ce.WOEEncoder(cols='Job')
woe_encoder_transformed = woe_encoder.fit_transform(df['Job'], df['Target'])
print(woe_encoder_transformed.iloc[0:7, ])

y = df[df['Job'] == 'student']
print('X train shape: {}'.format(X_train.shape))
print('y train shape: {}'.format(y_train.shape))
print('X test shape: {}'.format(X_test.shape))
print('y test shape: {}'.format(y_test.shape))

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
print('reindexed X train and y train for WOE embeddings')
print('reindexed X test and y test for WOE embeddings')

# In[480]:

encoding = ce.WOEEncoder(cols=['category_embed'], impute_missing=True)
encoding.fit(X_train[['category_embed']], X_train['paren_match'])
X_train['category_embed'] = encoding.transform(X_train[['category_embed']])
print('Category embeddings created for training data')

# Create a new column to embed categories (testing data)
X_test['category_embed'] = encoding.transform(X_test[['category_embed']])
print('Category embeddings created for testing data')

# In[481]:

"""
Add new feature: topics (topic modeling)
We have 4 categories in the dataset, so let's use 4 topics
"""

# Preprocessing the text first
clean_questions = [
def blight_model(): import pandas as pd import numpy as np import matplotlib.pyplot as plt import re import traceback import string from sklearn.base import BaseEstimator from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util from sklearn.utils.random import check_random_state from feature_engine import categorical_encoders as ce import xgboost as xgb from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error from xgboost import plot_importance import xgboost from matplotlib import pyplot import category_encoders as ces import seaborn as sns from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.naive_bayes import GaussianNB from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.kernel_approximation import RBFSampler from xgboost import XGBClassifier from category_encoders.cat_boost import CatBoostEncoder from sklearn.metrics import confusion_matrix, roc_curve, auc, plot_roc_curve, accuracy_score from sklearn.model_selection import cross_val_score, GridSearchCV from sklearn.linear_model import Ridge from sklearn.metrics import roc_auc_score train = pd.read_csv('train.csv', encoding = 'ISO-8859-1') test = pd.read_csv('test.csv') #train_no_null['compliance_detail'].unique() #a =train_no_null[train_no_null['compliance_detail'] == 'non-compliant by no payment'] #a['payment_status'].unique() #a['compliance'].unique() ############################################################ ########## DATA CLEARNING & DATA LEAKAGE PREVENT ########### ############################################################ train_no_null = train.loc[train.compliance.notnull()] ## ifdentifying indecies which are not satisfy conditions badValuesTrain = [] for index, row in train_no_null.iterrows(): if (train_no_null['payment_status'].loc[index] == 'PAID IN FULL') and (train_no_null['compliance'].loc[index] == 0) and (train_no_null['compliance_detail'].loc[index] == 'non-compliant by late payment more than 1 month') and (train_no_null['compliance'].loc[index] == 1)\ or (train_no_null['payment_status'].loc[index] == 'NO PAYMENT APPLIED') and (train_no_null['compliance'].loc[index] == 1)\ or (train_no_null['payment_status'].loc[index] == 'PARTIAL PAYMENT APPLIED') and (train_no_null['compliance'].loc[index] == 1)\ or (train_no_null['payment_status'].loc[index] == 'NO PAYMENT APPLIED') and (train_no_null['compliance_detail'].loc[index] == 'compliant by no fine') and (train_no_null['compliance'].loc[index] == 1): badValuesTrain.append(index) # remove obtained indexes from the initial DF using QUERY a = train_no_null.query('index not in @badValuesTrain') # how many NaNs per column in TRAIN DATA train_no_null = train_no_null.query("state == state") train_no_null = train_no_null.query("zip_code == zip_code") train_no_null = train_no_null.query("mailing_address_str_number == mailing_address_str_number") train_no_null = train_no_null.query("mailing_address_str_name == mailing_address_str_name") #test = test.query("state == state") #test = test.query("zip_code == zip_code") #test = test.query("city == city") #test = test.query("violator_name == violator_name") #test = test.query("mailing_address_str_number == mailing_address_str_number") #test = test.query("mailing_address_str_name 
== mailing_address_str_name") #train_no_null.isnull().sum(axis = 0) #test.isnull().sum(axis = 0) train_no_null['hearing_date'].fillna(train_no_null['hearing_date'].value_counts().index[0], inplace=True) test['hearing_date'].fillna(test['hearing_date'].value_counts().index[0], inplace=True) test['state'].fillna(test['state'].value_counts().index[0], inplace=True) test['zip_code'].fillna(test['zip_code'].value_counts().index[0], inplace=True) test['mailing_address_str_number'].fillna(test['mailing_address_str_number'].value_counts().index[0], inplace=True) test['mailing_address_str_name'].fillna(test['mailing_address_str_name'].value_counts().index[0], inplace=True) # remove the colums from TRAINING data which are not corresponds to TEST data # getting a list of common columns betwee TRAIN and TEST common_cols = list(set(train_no_null.columns).intersection(test.columns)) train_upd = train_no_null[common_cols] removedColumnsTrain = train_no_null.drop([col for col in train_no_null.columns if col in train_no_null.columns and col in test.columns], axis=1) y_train = removedColumnsTrain['compliance'] # remove colums with lots of NaNs for both TRAIN and TEST DS train_upd = train_upd.drop(['non_us_str_code'], axis=1) test = test.drop(['non_us_str_code'], axis=1) train_upd = train_upd.drop(['violation_zip_code'], axis=1) test = test.drop(['violation_zip_code'], axis=1) train_upd = train_upd.drop(['grafitti_status'], axis=1) test = test.drop(['grafitti_status'], axis=1) ##################################################################### ##################### PLOTTING/CLEANING ############################# ##################################################################### #train_upd.plot(subplots=True, layout=(4,3)) #test.plot(subplots=True, layout=(4,3)) #plt.close('figure') # since "state_fee", "clean_up_cost", "admin_fee" have no impact factor, they are constant, we remove them train_upd = train_upd.drop(['state_fee'], axis=1) test = test.drop(['state_fee'], axis=1) train_upd = train_upd.drop(['clean_up_cost'], axis=1) test = test.drop(['clean_up_cost'], axis=1) train_upd = train_upd.drop(['admin_fee'], axis=1) test = test.drop(['admin_fee'], axis=1) ################# EXTRA PLOTING FEATURES ############################### def plot_Comp_train_test(train, test, plotVar, titleName, plotShowNumsorted=30, plotkind='bar', figsize=(18, 3.2)): plt.subplots(1, 2, figsize=(18, 5)) plt.subplot(1, 2, 1) yvalue = train[plotVar].value_counts() (yvalue[:plotShowNumsorted] / train.shape[0]).plot(kind="bar", alpha=0.6, color='slateblue') plt.title(titleName + ' (training set)') plt.subplot(1, 2, 2) yvalue = test[plotVar].value_counts() (yvalue[:plotShowNumsorted] / test.shape[0]).plot(kind="bar", alpha=0.6, color='teal') plt.title(titleName + ' (test set)') return plt # plot_Comp_train_test(train_upd, test, 'zip_code', 'zip_code', plotShowNumsorted=55, figsize=(20,3.2)); # plot_Comp_train_test(train_upd, test, 'violation_code', 'violation_code', plotShowNumsorted=55, figsize=(20,3.2)); ################################################################## ############# FEATURES PREPROCESSING REGEX ##################### ################################################################## ################################################################## ################# CREATING DATE & TIME FEATURES ################## ################################################################## train_upd['ticket_issued_date'] = pd.to_datetime(train_upd.ticket_issued_date, format='%Y-%m-%d %H:%M:%S') train_upd['hearing_date'] = 
pd.to_datetime(train_upd.hearing_date, format='%Y-%m-%d %H:%M:%S') test['ticket_issued_date'] = pd.to_datetime(test.ticket_issued_date, format='%Y-%m-%d %H:%M:%S') test['hearing_date'] = pd.to_datetime(test.hearing_date, format='%Y-%m-%d %H:%M:%S') datetime = ['day', 'month', 'year', 'hour', 'minute', 'weekday', 'week'] for period in datetime: if datetime != 'week': train_upd['Issued_' + period] = getattr(train_upd.ticket_issued_date.dt, period) test['Issued_' + period] = getattr(test.ticket_issued_date.dt, period) train_upd['Hearing_' + period] = getattr(train_upd.hearing_date.dt, period) test['Hearing_' + period] = getattr(test.hearing_date.dt, period) else: train_upd['Issued_' + period] = getattr(train_upd.ticket_issued_date.dt.isocalendar(), period) test['Issued_' + period] = getattr(test.ticket_issued_date.dt.isocalendar(), period) train_upd['Hearing_' + period] = getattr(train_upd.hearing_date.dt.isocalendar(), period) test['Hearing_' + period] = getattr(test.hearing_date.dt.isocalendar(), period) # removing columns with DataTime train_upd = train_upd.drop(['ticket_issued_date'], axis=1) train_upd = train_upd.drop(['hearing_date'], axis=1) test = test.drop(['ticket_issued_date'], axis=1) test = test.drop(['hearing_date'], axis=1) #train_upd.isnull().sum(axis=0) ### cleaning mailing_address_str_number column #### for i, row in list(test.iterrows()): if type(row['mailing_address_str_number']) != 'int': c = str(row['mailing_address_str_number']) if ('p' in row['mailing_address_str_number'].lower()) or ('*' in row['mailing_address_str_number']) \ or ('.' in row['mailing_address_str_number']) or ('O' in row['mailing_address_str_number']) \ or ('o' in row['mailing_address_str_number']) or ('G' in row['mailing_address_str_number']) \ or ('# 143' in row['mailing_address_str_number'] ) or ('XX' in row['mailing_address_str_number']) \ or ('22A' in row['mailing_address_str_number']) or ('NE' in row['mailing_address_str_number'])\ or ('12 1ST' in row['mailing_address_str_number']) or ('11111A' in row['mailing_address_str_number']) : test.at[i,'mailing_address_str_number'] = 11111 #print(i, test.at[i,'mailing_address_str_number']) test.mailing_address_str_number = test.mailing_address_str_number.replace(' ','',regex=True).replace(',','',regex=True) test.mailing_address_str_number = test.mailing_address_str_number.replace(to_replace='[A-Z-a-z][0-9]*', value = '11111', regex=True).replace('-','',regex=True).replace('`','',regex=True).replace('#','11111',regex=True) ### converting mailing adrees for both TRAIN and TEXT into numbers instread of stirngs for i, row in list(train_upd.iterrows()): if isinstance(train_upd.at[i,'mailing_address_str_number'], float) == False: train_upd.at[i,'mailing_address_str_number'] = float(train_upd.at[i,'mailing_address_str_number']) for i, row in list(test.iterrows()): if isinstance(test.at[i,'mailing_address_str_number'], float) == False: test.at[i,'mailing_address_str_number'] = float(test.at[i,'mailing_address_str_number']) ######### categorical encoding Weight of Evidence ######### ########################################################### ####### Weight of Evidence transformation of text values into categories ############## cat_columns = ['country', 'city', 'state', 'agency_name', 'disposition', 'zip_code', 'mailing_address_str_name', 'violator_name', 'violation_street_name', 'violation_code', 'violation_description', 'inspector_name'] woe_encoder = ces.WOEEncoder(cols=cat_columns) #fit the encoder woe_encoded_train = 
woe_encoder.fit_transform(train_upd.iloc[:], y_train) woe_encoded_train = woe_encoder.fit_transform(train_upd, y_train) # transform XTrain_transformed = woe_encoder.transform(train_upd) XTest_transformed = woe_encoder.transform(test) # CBE_encoder = CatBoostEncoder() # train_encoded = CBE_encoder.fit_transform(train_upd[cat_columns], y_train) # test_encoded = CBE_encoder.transform(test[cat_columns]) # t = train_upd # t = t.drop(['country', 'city', 'state', 'agency_name', 'disposition', 'zip_code', 'mailing_address_str_name', 'violator_name', 'violation_street_name', 'violation_code', 'violation_description', 'inspector_name'], axis=1, inplace=True) # tt = test # tt = tt.drop(['country', 'city', 'state', 'agency_name', 'disposition', 'zip_code', 'mailing_address_str_name', 'violator_name', 'violation_street_name', 'violation_code', 'violation_description', 'inspector_name'], axis=1, inplace=True) # XTrain_transformed = pd.concat([train_upd, train_encoded], axis=1, sort=False) # XTest_transformed = pd.concat([test, test_encoded], axis=1, sort=False) ########################################################## ############# Correlation map for features ############### ########################################################## correlation = XTrain_transformed.corr().round(1) fig, ax = plt.subplots(1, 1, figsize=(8, 6.5)) sns.heatmap(data=correlation, annot=True, cmap="YlGn") ax.set_title("Correlation matrix for taken variables"); #plt.savefig('plots/correlationMap.pdf') ########################################################## ################## saving new data ####################### ########################################################## #XTrain_transformed.to_csv(r'/Users/kreozotica/PycharmProjects/current/ML_Coursera/processed_train.csv', index=False) #XTest_transformed.to_csv(r'/Users/kreozotica/PycharmProjects/current/ML_Coursera/XTest_transformed.csv', index=False) ############################################################################################################################# # Further, since we don't have prediction data, we keep TEST data as prediction and SPLIT TRAIN data into new TRAIN and TEST ############################################################################################################################# X_train, X_test, y_train, y_test = train_test_split(XTrain_transformed, y_train, random_state=0, test_size=0.75) #### scalling scaler = MinMaxScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.fit_transform(X_test) XTest_transformed_scaled = scaler.fit_transform(XTest_transformed) XTest_transformed_scaled = pd.DataFrame(XTest_transformed_scaled, columns = XTest_transformed.columns) #plot_importance(model_XGB) #pyplot.show() # featureImportance = pd.DataFrame(regressor.feature_importances_.reshape(1, -1), columns=TrainTest_noLabel.columns) ########################################################## ################## MODELLING APPROACH #################### ########################################################## ##### universal model's function def modelFit(X_train, X_test, y_train, y_test, clf, cv=5): clf = clf.fit(X_train, y_train) cv = cross_val_score(clf, X_test, y_test, cv=cv, scoring = 'roc_auc') cv_mean = round(cv.mean(), 3) cv_std = round(cv.std(), 3) print('Cross-validation (AUC)', cv, ', mean =', cv_mean, ', std =', cv_std) #y_pred =clf.predict(X_test) #confusion = confusion_matrix(y_test, y_pred) #print(confusion) return cv_mean, cv_std ##### XGBoost clf_XGB = XGBClassifier() auc_mean_XGB, auc_std_XGB = 
modelFit(X_train_scaled, X_test_scaled, y_train, y_test, clf_XGB, cv=20) ##### Gradient-boosted Decision Trees¶ clf_GBC = GradientBoostingClassifier(learning_rate=0.05) # scaling doesn't really need it, advantage auc_mean_GBC, auc_std_GBC = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, clf_GBC, cv=20) ##### SVM clf_SVM = SVC(kernel='rbf', C=1, random_state=0) auc_mean_SVM, auc_std_SVM = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, clf_SVM, cv=20) #### LogReg grid_values = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']} LogReg = LogisticRegression() grid_rbf_recall = GridSearchCV(LogReg, param_grid = grid_values, scoring='recall') auc_mean_LR, auc_std_LR = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, grid_rbf_recall, cv=20) #### RidgeReg #RdgReg_clf = Ridge() #auc_mean_RG, auc_std_RG = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, RdgReg_clf, cv=20) ### NaiveBayes NB_clf = GaussianNB() auc_mean_NB, auc_std_NB = modelFit(X_train_scaled, X_test_scaled, y_train, y_test, NB_clf, cv=20) ################## ROC vis ################## def roCurves(clfList, X_test, y_test): roCurveList = [] plt.subplots(1, 1, figsize=(5, 5)) styleList = ['solid', 'solid', 'dashed', 'dashed', 'dotted', 'dashed'] for clf, sty in zip(clfList, styleList): ax = plt.gca() roc = plot_roc_curve(clf, X_test, y_test, ax=ax, alpha=0.85, lw=2, linestyle=sty) roCurveList.append(roc) plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='dotted') plt.title('ROC') plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') return roCurveList exps = [clf_XGB, clf_GBC, clf_SVM, grid_rbf_recall, NB_clf] roCurves(exps, X_test_scaled, y_test) # Save the figure and show #plt.tight_layout() #plt.savefig('plots/ROCs.png') #plt.show() ##### Pedict probabilities for the best model - XGBoost y_proba = clf_XGB.predict_proba(XTest_transformed_scaled)[:,1] # Integrate with reloaded test data test['compliance'] = y_proba return test.compliance
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs): orig_cols = list(X.names) if self.num_classes >= 2: lb = LabelEncoder() lb.fit(self.labels) y = lb.transform(y) min_count = np.min(np.unique(y, return_counts=True)[1]) if min_count < 9: self.params['cv_search'] = False if min_count < 3: self.params['grid_search_iterations'] = False self.params['cv_search'] = False # save pre-datatable-imputed X X_dt = X # Apply OOB imputation self.oob_imputer = OOBImpute(self._impute_num_type, self._impute_int_type, self._impute_bool_type, self._impute_cat_type, self._oob_bool, self._oob_cat) X = self.oob_imputer.fit_transform(X) # convert to pandas for sklearn X = X.to_pandas() X_orig_cols_names = list(X.columns) if self._kaggle_features: self.features = make_features() X = self.features.fit_transform(X) else: self.features = None # print("LR: pandas dtypes: %s" % (str(list(X.dtypes)))) # FEATURE GROUPS # Choose which features are numeric or categorical cat_features = [ x for x in X_orig_cols_names if CatOriginalTransformer.is_me_transformed(x) ] catlabel_features = [ x for x in X_orig_cols_names if CatTransformer.is_me_transformed(x) ] # can add explicit column name list to below force_cats force_cats = cat_features + catlabel_features # choose if numeric is treated as categorical if not self._num_as_cat: numerical_features = (X.dtypes == 'float') | ( X.dtypes == 'float32') | (X.dtypes == 'float64') else: numerical_features = X.dtypes == 'invalid' # force oob imputation for numerics self.oob_imputer = OOBImpute('oob', 'oob', 'oob', self._impute_cat_type, self._oob_bool, self._oob_cat) X = self.oob_imputer.fit_transform(X_dt) X = X.to_pandas() X = self.features.fit_transform(X) if self._kaggle_features: numerical_features = self.features.update_numerical_features( numerical_features) categorical_features = ~numerical_features # below can lead to overlap between what is numeric and what is categorical more_cats = (pd.Series([ True if x in force_cats else False for x in list(categorical_features.index) ], index=categorical_features.index)) categorical_features = (categorical_features) | (more_cats) if self._kaggle_features: categorical_features = self.features.update_categorical_features( categorical_features) if self._debug: import uuid struuid = str(uuid.uuid4()) Xy = X.copy() Xy.loc[:, 'target'] = y Xy.to_csv("munged_%s.csv" % struuid) cat_X = X.loc[:, categorical_features] num_X = X.loc[:, numerical_features] if self._debug: print("LR: Cat names: %s" % str(list(cat_X.columns))) print("LR: Num names: %s" % str(list(num_X.columns))) # TRANSFORMERS lr_params = copy.deepcopy(self.params) lr_params.pop('grid_search_by_iterations', None) lr_params.pop('cv_search', None) grid_search = False # WIP full_features_list = [] transformers = [] if self._use_numerics and any(numerical_features.values): impute_params = {} impute_params['strategy'] = lr_params.pop('strategy', 'mean') full_features_list.extend(list(num_X.columns)) transformers.append( (make_pipeline(SimpleImputer(**impute_params), StandardScaler()), numerical_features)) # http://contrib.scikit-learn.org/categorical-encoding/ if self._use_ordinal_encoding and any(categorical_features.values): ord_params = dict(handle_missing='value', handle_unknown='value') full_features_list.extend(list(cat_X.columns)) # Note: OrdinalEncoder doesn't handle unseen features, while CategoricalEncoder used too import category_encoders as ce transformers.append( (ce.OrdinalEncoder(**ord_params), categorical_features)) if 
self._use_catboost_encoding and any(categorical_features.values): cb_params = dict(handle_missing='value', handle_unknown='value') cb_params['sigma'] = lr_params.pop('sigma') full_features_list.extend(list(cat_X.columns)) import category_encoders as ce transformers.append( (ce.CatBoostEncoder(**cb_params), categorical_features)) if self._use_woe_encoding and any(categorical_features.values): woe_params = dict(handle_missing='value', handle_unknown='value') woe_params['randomized'] = lr_params.pop('randomized') woe_params['sigma'] = lr_params.pop('sigma_woe') woe_params['regularization'] = lr_params.pop('regularization') full_features_list.extend(list(cat_X.columns)) import category_encoders as ce transformers.append( (ce.WOEEncoder(**woe_params), categorical_features)) if self._use_target_encoding and any(categorical_features.values): te_params = dict(handle_missing='value', handle_unknown='value') te_params['min_samples_leaf'] = lr_params.pop('min_samples_leaf') te_params['smoothing'] = lr_params.pop('smoothing') full_features_list.extend(list(cat_X.columns)) import category_encoders as ce transformers.append( (ce.TargetEncoder(**te_params), categorical_features)) if self._use_target_encoding_other and any( categorical_features.values): full_features_list.extend(list(cat_X.columns)) len_uniques = [] cat_X_copy = cat_X.copy() for c in cat_X.columns: le = LabelEncoder() le.fit(cat_X[c]) cat_X_copy[c] = le.transform(cat_X_copy[c]) len_uniques.append(len(le.classes_)) if self._debug: uniques_series = pd.Series(len_uniques, index=list(cat_X.columns)) print("uniques_series: %s" % uniques_series) ALPHA = 75 MAX_UNIQUE = max(len_uniques) # FEATURES_COUNT = cat_X.shape[1] cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.params['random_state']) split_cv = [cv] # split_cv = [3, 3] from target_encoding import TargetEncoder transformers.append( (TargetEncoder(alpha=ALPHA, max_unique=MAX_UNIQUE, split_in=split_cv), categorical_features)) if self._use_ohe_encoding and any(categorical_features.values): transformers.append( (OneHotEncoder(handle_unknown='ignore', sparse=True), categorical_features)) assert len(transformers) > 0, "should have some features" preprocess = make_column_transformer(*transformers) # ESTIMATOR lr_defaults = dict(penalty='l2', dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None) allowed_lr_kwargs_keys = lr_defaults.keys() lr_params_copy = copy.deepcopy(lr_params) for k, v in lr_params_copy.items(): if k not in allowed_lr_kwargs_keys: lr_params.pop(k, None) del lr_params_copy can_score = self.num_classes == 2 and 'AUC' in self.params_base[ 'score_f_name'].upper() # print("LR: can_score: %s" % str(can_score)) if can_score: scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True) else: scorer = None if not ('C' in lr_params or 'l1_ratios' in lr_params): # override self.params['cv_search'] = False if not self.params['cv_search']: estimator = LogisticRegression(**lr_params) estimator_name = 'logisticregression' else: lr_params_cv = copy.deepcopy(lr_params) if 'C' in lr_params: lr_params_cv['Cs'] = self.get_param_range( self.params['C'], self.params['fit_count'], func_type='log') # print("LR: CV: Cs: %s" % str(lr_params_cv['Cs'])) if 'l1_ratios' in lr_params: lr_params_cv['l1_ratios'] = self.get_param_range( self.params['l1_ratio'], self.params['fit_count'], func_type='linear') # print("LR: 
CV: l1_ratios: %s" % str(lr_params_cv['l1_ratios'])) lr_params_cv.pop('n_jobs', None) lr_params_cv.pop('C', None) lr_params_cv.pop('l1_ratio', None) if lr_params_cv['penalty'] == 'none': lr_params_cv['penalty'] = 'l2' estimator = LogisticRegressionCV(n_jobs=self.params['n_jobs'], cv=3, refit=True, scoring=scorer, **lr_params_cv) estimator_name = 'logisticregressioncv' # PIPELINE model = make_pipeline(preprocess, estimator) # FIT if self.params['grid_search_iterations'] and can_score: # WIP FIXME for multiclass and other scorers from sklearn.model_selection import GridSearchCV max_iter_range = self.get_param_range( self.params['max_iter'], self.params['fit_count'], range_limit=self._overfit_limit_iteration_step, func_type='log') # print("LR: max_iter_range: %s" % str(max_iter_range)) param_grid = { '%s__max_iter' % estimator_name: max_iter_range, } grid_clf = GridSearchCV(model, param_grid, n_jobs=self.params['n_jobs'], cv=3, iid=True, refit=True, scoring=scorer) grid_clf.fit(X, y) model = grid_clf.best_estimator_ # print("LR: best_index=%d best_score: %g best_params: %s" % ( # grid_clf.best_index_, grid_clf.best_score_, str(grid_clf.best_params_))) elif grid_search: # WIP from sklearn.model_selection import GridSearchCV param_grid = { 'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'], '%s__C' % estimator_name: [0.1, 0.5, 1.0], } grid_clf = GridSearchCV(model, param_grid, cv=10, iid=False) grid_clf.fit(X, y) model = grid_clf.best_estimator_ # self.best_params = grid_clf.best_params_ else: model.fit(X, y) # get actual LR model lr_model = model.named_steps[estimator_name] if self._debug and False: import uuid struuid = str(uuid.uuid4()) save_obj( model.named_steps['columntransformer'].fit_transform(X, y), "columns_csr_%s.pkl" % struuid) # average importances over classes importances = np.average(np.array(lr_model.coef_), axis=0) # average iterations over classes (can't take max_iter per class) iterations = np.average(lr_model.n_iter_) # print("LR: iterations: %d" % iterations) # reduce OHE features to original names ohe_features_short = [] if self._use_ohe_encoding and any(categorical_features.values): if self._use_ohe_encoding: input_features = [x + self._ohe_postfix for x in cat_X.columns] ohe_features = pd.Series( model.named_steps['columntransformer']. 
named_transformers_['onehotencoder'].get_feature_names( input_features=input_features)) def f(x): return '_'.join(x.split(self._ohe_postfix + '_')[:-1]) # identify OHE features ohe_features_short = ohe_features.apply(lambda x: f(x)) full_features_list.extend(list(ohe_features_short)) # aggregate our own features if self._kaggle_features: self.features.aggregate(full_features_list, importances) msg = "LR: num=%d cat=%d : ohe=%d : imp=%d full=%d" % ( len(num_X.columns), len(cat_X.columns), len(ohe_features_short), len(importances), len(full_features_list)) if self._debug: print(msg) assert len(importances) == len(full_features_list), msg # aggregate importances by dai feature name importances = pd.Series( np.abs(importances), index=full_features_list).groupby(level=0).mean() assert len(importances) == len( X_orig_cols_names), "%d %d %s : %s %s" % ( len(importances), len(X_orig_cols_names), msg, str(list(X.columns)), str(list(X.dtypes))) # save hyper parameter searched results for next search self.params['max_iter'] = iterations if self.params['cv_search']: self.params['C'] = np.average(lr_model.C_, axis=0) if 'l1_ratios' in lr_params and self.params['cv_search']: self.params['l1_ratio'] = np.average(lr_model.l1_ratio_, axis=0) if 'fit_count' in self.params: self.params['fit_count'] += 1 else: self.params['fit_count'] = 0 self.set_model_properties(model=(model, self.features), features=orig_cols, importances=importances.tolist(), iterations=iterations) self.features = None
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'WOE': ce.WOEEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'AgingPP': AgingPPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'SimplePP': SimplePPEncoder(estimator_name='LinearRegression', num_predictors=2),
    'CESAMOEncoder': CESAMOEncoder()
}

if target_flag == 0:
    del Encoders['EntityEmbedding']
    del Encoders['TargetEnc']
def fit(self, X, y, column):
    self.col_name = column
    self.real_encoder = ce.WOEEncoder()
    self.real_encoder.fit(X[column], y)
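# A hedged sketch of the matching transform side (not shown in the original snippet):
# reuse the fitted single-column encoder stored by fit() above.
def transform(self, X):
    return self.real_encoder.transform(X[self.col_name])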
import sys
sys.path.append('../encoders/')
from ceng import CENGEncoder
from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder
from entity_embedding import EntityEmbeddingEncoder
from cesamo import CESAMOEncoder

Encoders = {
    'Ordinal': ce.OrdinalEncoder(),
    'Polynomial': ce.PolynomialEncoder(),
    'OneHot': ce.OneHotEncoder(),
    'BackwardDifference': ce.BackwardDifferenceEncoder(),
    'Helmert': ce.HelmertEncoder(),
    'EntityEmbedding': EntityEmbeddingEncoder(),
    'TargetEnc': ce.TargetEncoder(),
    'WOE': ce.WOEEncoder(),
    'CENG': CENGEncoder(verbose=0),
    'GeneticPP': GeneticPPEncoder(num_predictors=2),
    'AgingPP': AgingPPEncoder(num_predictors=2),
    'SimplePP': SimplePPEncoder(num_predictors=2),
    'CESAMOEncoder': CESAMOEncoder()
}

if target_flag == 0:
    del Encoders['EntityEmbedding']
    del Encoders['TargetEnc']
    del Encoders['WOE']
"""END: Import encoders"""

import time
def encode_all(df, dfv, dfk, encoder_to_use, handle_missing='return_nan'):
    encoders_used = {}
    for col in encoder_to_use:
        if encoder_to_use[col] == 'ColumnDropper':
            df = df.drop(columns=col)
            dfv = dfv.drop(columns=col)
            dfk = dfk.drop(columns=col)
            encoders_used[col] = 'ColumnDropper'

        if encoder_to_use[col] == 'BackwardDifferenceEncoder':
            encoder = ce.BackwardDifferenceEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'BaseNEncoder':
            encoder = ce.BaseNEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, base=3)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'BinaryEncoder':
            encoder = ce.BinaryEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'CatBoostEncoder':
            encoder = ce.CatBoostEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, sigma=None, a=2)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        # if encoder_to_use[col] == 'HashingEncoder':
        #     encoder = ce.HashingEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing)
        #     encoder.fit(X=df, y=df['set_clicked'])
        #     df = encoder.transform(df)
        #     encoders_used[col] = encoder

        if encoder_to_use[col] == 'HelmertEncoder':
            encoder = ce.HelmertEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'JamesSteinEncoder':
            encoder = ce.JamesSteinEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, model='binary')
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'LeaveOneOutEncoder':
            encoder = ce.LeaveOneOutEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, sigma=None)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'MEstimateEncoder':
            encoder = ce.MEstimateEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, randomized=True, sigma=None, m=2)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'OneHotEncoder':
            encoder = ce.OneHotEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, use_cat_names=True)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'OrdinalEncoder':
            encoder = ce.OrdinalEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'SumEncoder':
            encoder = ce.SumEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'PolynomialEncoder':
            encoder = ce.PolynomialEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'TargetEncoder':
            encoder = ce.TargetEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, min_samples_leaf=10, smoothing=5)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        if encoder_to_use[col] == 'WOEEncoder':
            encoder = ce.WOEEncoder(cols=[col], return_df=1, drop_invariant=1, handle_missing=handle_missing, randomized=True, sigma=None)
            encoder.fit(X=df, y=df['set_clicked'])
            df = encoder.transform(df)
            dfv = encoder.transform(dfv)
            dfk = encoder.transform(dfk)
            encoders_used[col] = encoder

        # print("Encoding done for - ", col)

    print("Completed encoder - ", datetime.datetime.now())
    return df, dfv, dfk, encoders_used
test['Score difference'] = diff

################################################################
# Target feature paren_match (training data). It is 1 if answer and wiki page match, 0 otherwise
train['paren_match'] = 0
for i, row in train.iterrows():
    if row['Answer'] == row['Wiki page']:
        train.loc[i, 'paren_match'] = 1

#################################################################
# WOE encoding
encoding = ce.WOEEncoder(cols=['category', 'Wiki page'])
encoding.fit(train, train['paren_match'])
train_df = encoding.transform(train)

features = [
    'Wiki page', 'Quest len', 'Page score', 'category', 'Score difference'
]
target = ['paren_match']

scaler = StandardScaler()
scaler.fit(train_df[features].values)
train_df[features] = scaler.transform(train_df[features].values)
train_df.head()
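# A hedged follow-up sketch (not in the original snippet): apply the already-fitted encoder
# and scaler to the held-out frame the same way; assumes `test` carries the same columns as
# `train`, including a 'paren_match' column built as above.
test_df = encoding.transform(test)
test_df[features] = scaler.transform(test_df[features].values)
test_df.head()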