def train_models(ModelClass, invoices, observation_end_dates, rfe=False, **kwargs):
    """Fit one ``ModelClass`` instance per observation end date.

    For every date, the train/test split returned by
    ``get_train_test_data`` is collected and the model is trained on the
    concatenation of the most recent ``lag`` training splits.  ``lag`` is
    not a parameter of this function; it is looked up in the enclosing
    scope.  The ``MostBoughtItem`` column is rare-label encoded and then
    mean encoded, with both encoders fitted on the training window only.

    Args:
        ModelClass: Estimator class, instantiated as ``ModelClass(**kwargs)``.
        invoices: Passed through unchanged to ``get_train_test_data``.
        observation_end_dates: Iterable of dates; one model is trained per
            entry, in iteration order.
        rfe: When true, only the 8 features selected by ``RFE`` are used to
            fit the final model and are stored in the results.
        **kwargs: Constructor arguments forwarded to ``ModelClass``.

    Returns:
        dict with keys ``models``, ``observation_end_dates``, ``X_train``,
        ``y_train``, ``X_test`` and ``y_test``.  ``observation_end_dates``
        is the input object itself; every other value is a list holding one
        entry per observation end date.
    """
    train_results = {
        'models': [],
        'observation_end_dates': observation_end_dates,
        'X_train': [],
        'y_train': [],
        'X_test': [],
        'y_test': [],
    }
    feature_history = []
    target_history = []

    for observation_end_date in observation_end_dates:
        X_period, y_period, X_test, y_test = get_train_test_data(
            invoices, observation_end_date)
        feature_history.append(X_period)
        target_history.append(y_period)

        # Last "lag" months used as training data
        X_train = pd.concat(feature_history[-lag:]).reset_index(drop=True)
        y_train = pd.concat(target_history[-lag:]).reset_index(drop=True).astype(int)
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True).astype(int)

        # Encode "MostBoughtItem" feature: group rare labels first (looser
        # tolerance for training windows under 100 rows), then mean-encode.
        rare_tol = 0.02 if len(X_train) < 100 else 0.01
        rare_encoder = RareLabelCategoricalEncoder(
            tol=rare_tol, variables=['MostBoughtItem']).fit(X_train)
        X_train = rare_encoder.transform(X_train)
        X_test = rare_encoder.transform(X_test)

        mean_enc = MeanCategoricalEncoder(
            variables=['MostBoughtItem']).fit(X_train, y_train)
        X_train = mean_enc.transform(X_train)
        X_test = mean_enc.transform(X_test)

        if rfe:
            # Recursive feature elimination down to 8 columns, then a fresh
            # model fitted on just those columns.
            selector = RFE(ModelClass(**kwargs), n_features_to_select=8)
            selector.fit(X_train, y_train)
            selected_feats = X_train.columns[selector.get_support()]
            model = ModelClass(**kwargs).fit(X_train[selected_feats], y_train)
            stored_train = X_train[selected_feats]
            stored_test = X_test[selected_feats]
        else:
            model = ModelClass(**kwargs)
            model.fit(X_train, y_train)
            stored_train = X_train
            stored_test = X_test

        train_results['X_train'].append(stored_train)
        train_results['X_test'].append(stored_test)
        train_results['models'].append(model)
        train_results['y_train'].append(y_train)
        train_results['y_test'].append(y_test)

    return train_results
def test_RareLabelEncoder():
    """Check rare-label grouping of ``category`` at two tolerance levels.

    The input frame has 63 rows: labels A, B, C, D and I with 10 rows each,
    K with 5 rows, and E, F, G, H with 2 rows each.  With ``tol=0.05`` the
    four 2-row labels are expected to be replaced by ``'Rare'``; with
    ``tol=0.01`` the frame is expected to come back unchanged.
    """
    leading = ['A'] * 10 + ['B'] * 10 + ['C'] * 10 + ['D'] * 10
    infrequent = ['E'] * 2 + ['F'] * 2 + ['G'] * 2 + ['H'] * 2
    trailing = ['I'] * 10 + ['K'] * 5

    def build_frame(middle):
        # A fresh frame per call, so the scenarios never share objects.
        return pd.DataFrame({
            'category': leading + middle + trailing,
            'target': [1] * 63,
        })

    # (tolerance, expected labels for the rows that start out as E/F/G/H)
    scenarios = [
        (0.05, ['Rare'] * 8),
        (0.01, infrequent),
    ]

    for tol, expected_middle in scenarios:
        df = build_frame(infrequent)
        transf_df = build_frame(expected_middle)

        encoder = RareLabelCategoricalEncoder(
            tol=tol, n_categories=9, variables=['category'])
        encoder.fit(df)
        X = encoder.transform(df)

        pd.testing.assert_frame_equal(X, transf_df)
        assert encoder.variables == ['category']
        assert encoder.input_shape_ == (63, 2)
def test_RareLabelEncoder(dataframe_enc_big, dataframe_enc_big_na):
    """Exercise ``RareLabelCategoricalEncoder`` on the shared fixtures.

    Args:
        dataframe_enc_big: Fixture frame; the assertions below expect 40
            rows and the columns ``var_A``, ``var_B`` and ``var_C``.
        dataframe_enc_big_na: Fixture frame that is expected to make ``fit``
            and ``transform`` raise ``ValueError`` (per the original notes,
            it contains missing values).

    In cases 3-5 the encoder is built (and, for case 5, fitted) outside the
    ``pytest.warns`` / ``pytest.raises`` context, so only the call under
    test can satisfy the expectation.  An error raised by the constructor or
    by an earlier ``fit`` now fails the test instead of being mistaken for
    the expected one.
    """
    # test case 1: default-style params, automatically select variables
    encoder = RareLabelCategoricalEncoder(
        tol=0.06, n_categories=5, variables=None, replace_with='Rare')
    X = encoder.fit_transform(dataframe_enc_big)

    df = {
        'var_A': ['A'] * 6 + ['B'] * 10 + ['C'] * 4 + ['D'] * 10 + ['Rare'] * 4 + ['G'] * 6,
        'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4 + ['D'] * 10 + ['Rare'] * 4 + ['G'] * 6,
        'var_C': ['A'] * 4 + ['B'] * 6 + ['C'] * 10 + ['D'] * 10 + ['Rare'] * 4 + ['G'] * 6,
    }
    df = pd.DataFrame(df)

    # init params
    assert encoder.tol == 0.06
    assert encoder.n_categories == 5
    assert encoder.replace_with == 'Rare'
    assert encoder.variables == ['var_A', 'var_B', 'var_C']
    # fit params
    assert encoder.input_shape_ == (40, 3)
    # transform params
    pd.testing.assert_frame_equal(X, df)

    # test case 2: user provides alternative grouping value and variable list
    encoder = RareLabelCategoricalEncoder(
        tol=0.15, n_categories=5, variables=['var_A', 'var_B'],
        replace_with='Other')
    X = encoder.fit_transform(dataframe_enc_big)

    df = {
        'var_A': ['A'] * 6 + ['B'] * 10 + ['Other'] * 4 + ['D'] * 10 + ['Other'] * 4 + ['G'] * 6,
        'var_B': ['A'] * 10 + ['B'] * 6 + ['Other'] * 4 + ['D'] * 10 + ['Other'] * 4 + ['G'] * 6,
        'var_C': ['A'] * 4 + ['B'] * 6 + ['C'] * 10 + ['D'] * 10 + ['E'] * 2 + ['F'] * 2 + ['G'] * 6,
    }
    df = pd.DataFrame(df)

    # init params
    assert encoder.tol == 0.15
    assert encoder.n_categories == 5
    assert encoder.replace_with == 'Other'
    assert encoder.variables == ['var_A', 'var_B']
    # fit params
    assert encoder.input_shape_ == (40, 3)
    # transform params
    pd.testing.assert_frame_equal(X, df)

    # invalid constructor arguments must be rejected at construction time
    with pytest.raises(ValueError):
        RareLabelCategoricalEncoder(tol=5)

    with pytest.raises(ValueError):
        RareLabelCategoricalEncoder(n_categories=0.5)

    with pytest.raises(ValueError):
        RareLabelCategoricalEncoder(replace_with=0.5)

    # test case 3: when the variable has low cardinality
    encoder = RareLabelCategoricalEncoder(n_categories=10)
    with pytest.warns(UserWarning):
        encoder.fit(dataframe_enc_big)

    # test case 4: when dataset contains na, fit method
    encoder = RareLabelCategoricalEncoder(n_categories=4)
    with pytest.raises(ValueError):
        encoder.fit(dataframe_enc_big_na)

    # test case 5: when dataset contains na, transform method
    encoder = RareLabelCategoricalEncoder(n_categories=4)
    encoder.fit(dataframe_enc_big)
    with pytest.raises(ValueError):
        encoder.transform(dataframe_enc_big_na)
# Manual approach: call the rare_encoding helper once per variable, passing
# 0.05 as its last argument (presumably the rarity tolerance; confirm against
# the helper's definition).
for variable in ('Neighborhood', 'Exterior1st', 'Exterior2nd'):
    X_train, X_test = rare_encoding(X_train, X_test, variable, 0.05)

# --- With Feature-Engine
from feature_engine.categorical_encoders import RareLabelCategoricalEncoder

# Rare value encoder
rare_encoder = RareLabelCategoricalEncoder(
    # minimal percentage to be considered non-rare
    tol=0.05,
    # minimal number of categories the variable should have to re-group
    # rare categories
    n_categories=4,
    # variables to re-group
    variables=[
        'Neighborhood',
        'Exterior1st',
        'Exterior2nd',
        'MasVnrType',
        'ExterQual',
        'BsmtCond',
    ],
)

# Fit on the training set only, then apply the same grouping to both sets.
rare_encoder.fit(X_train)
X_train, X_test = rare_encoder.transform(X_train), rare_encoder.transform(X_test)

rare_encoder.variables

# the encoder_dict_ is a dictionary of variable: frequent labels pair
rare_encoder.encoder_dict_

# ---------- DISCRETISATION ----------

# ---- Equal width discretisation

# with Scikit Learn
from sklearn.preprocessing import KBinsDiscretizer

disc = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
disc.fit(X_train[['age', 'fare']])
train_t = disc.transform(X_train[['age', 'fare']])
## remove category 'l' from variable A4 and category 'gg' from variable A5
## in a single filter
read_data = read_data[(read_data.A4 != 'l') & (read_data.A5 != 'gg')]

## encode rare categories in dataset
rare_encoder = RareLabelCategoricalEncoder(
    # minimal percentage to be considered non-rare
    tol=0.05,
    # minimal number of categories the variable should have to re-group
    # rare categories
    n_categories=2,
    # variables to re-group
    variables=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12'],
)
rare_encoder.fit(read_data)
rare_encoder.variables
read_data = rare_encoder.transform(read_data)

## A13 gets its own encoder because it uses a different tolerance (0.1)
rare_encoder_A13 = RareLabelCategoricalEncoder(
    # minimal percentage to be considered non-rare
    tol=0.1,
    # minimal number of categories the variable should have to re-group
    # rare categories
    n_categories=2,
    # variables to re-group
    variables=['A13'],
)
rare_encoder_A13.fit(read_data)
rare_encoder_A13.variables
read_data = rare_encoder_A13.transform(read_data)

## standardize numerical variables
read_data[['A2', 'A3', 'A8', 'A11', 'A14', 'A15']] = StandardScaler().fit_transform(
    read_data[['A2', 'A3', 'A8', 'A11', 'A14', 'A15']]
)

## check if there is relation between independent categorical and dependent
## categorical variable using chi-square test