def train_models(ModelClass, invoices, observation_end_dates, rfe=False, **kwargs):
    """Fit one model per observation end date and collect the data used.

    For each date in ``observation_end_dates`` the function:

    1. obtains a train/test split from ``get_train_test_data``;
    2. concatenates the training splits gathered so far, sliced with
       ``[-lag:]`` (``lag`` is not a parameter; it is resolved from the
       enclosing scope);
    3. rare-label encodes and then mean encodes the ``'MostBoughtItem'``
       column, fitting both encoders on the training data only;
    4. optionally runs ``RFE`` to keep 8 features before fitting the model.

    Args:
        ModelClass: Callable that builds an estimator from ``**kwargs``.
        invoices: Data forwarded unchanged to ``get_train_test_data``.
        observation_end_dates: Iterable of dates; one model is fitted per date.
        rfe: When true, select features with ``RFE`` before fitting.
        **kwargs: Forwarded to ``ModelClass`` on every instantiation.

    Returns:
        dict with keys ``models``, ``observation_end_dates``, ``X_train``,
        ``y_train``, ``X_test`` and ``y_test``. Every value except
        ``observation_end_dates`` is a list with one entry per date.
    """
    results = {
        'models': [],
        'observation_end_dates': observation_end_dates,
        'X_train': [],
        'y_train': [],
        'X_test': [],
        'y_test': [],
    }
    feature_history = []
    target_history = []

    for end_date in observation_end_dates:
        X_train, y_train, X_test, y_test = get_train_test_data(invoices, end_date)
        feature_history.append(X_train)
        target_history.append(y_train)

        # Training data is the window of the most recently collected splits.
        X_train = pd.concat(feature_history[-lag:]).reset_index(drop=True)
        y_train = pd.concat(target_history[-lag:]).reset_index(drop=True).astype(int)
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True).astype(int)

        # Encode the "MostBoughtItem" feature: group rare labels first, using a
        # larger tolerance when there are fewer than 100 training rows.
        if len(X_train) < 100:
            rare_tol = 0.02
        else:
            rare_tol = 0.01
        rare_label_encoder = RareLabelCategoricalEncoder(
            tol=rare_tol,
            variables=['MostBoughtItem'],
        ).fit(X_train)
        X_train = rare_label_encoder.transform(X_train)
        X_test = rare_label_encoder.transform(X_test)

        # ...then replace each label using an encoder fitted with the target.
        target_mean_encoder = MeanCategoricalEncoder(
            variables=['MostBoughtItem'],
        ).fit(X_train, y_train)
        X_train = target_mean_encoder.transform(X_train)
        X_test = target_mean_encoder.transform(X_test)

        if not rfe:
            model = ModelClass(**kwargs)
            model.fit(X_train, y_train)
            stored_train, stored_test = X_train, X_test
        else:
            selector = RFE(ModelClass(**kwargs), n_features_to_select=8)
            selector.fit(X_train, y_train)
            kept_columns = X_train.columns[selector.get_support()]
            # The stored model is whatever ``fit`` returns.
            model = ModelClass(**kwargs).fit(X_train[kept_columns], y_train)
            stored_train = X_train[kept_columns]
            stored_test = X_test[kept_columns]

        results['X_train'].append(stored_train)
        results['X_test'].append(stored_test)
        results['models'].append(model)
        results['y_train'].append(y_train)
        results['y_test'].append(y_test)

    return results
def test_RareLabelEncoder():
    """Check rare-label grouping of one column at two tolerance levels.

    The input column holds 63 values. With ``tol=0.05`` the four labels that
    occur twice each are expected to become ``'Rare'``; with ``tol=0.01`` the
    frame is expected to come back unchanged.
    """
    leading = ['A'] * 10 + ['B'] * 10 + ['C'] * 10 + ['D'] * 10
    infrequent = ['E'] * 2 + ['F'] * 2 + ['G'] * 2 + ['H'] * 2
    trailing = ['I'] * 10 + ['K'] * 5

    def build_frame(middle):
        # A fresh frame per call so no scenario shares data with another.
        return pd.DataFrame({
            'category': leading + middle + trailing,
            'target': [1] * 63,
        })

    scenarios = (
        (0.05, ['Rare'] * 8),  # infrequent labels collapse into 'Rare'
        (0.01, infrequent),    # nothing falls below the tolerance
    )

    for tolerance, expected_middle in scenarios:
        raw = build_frame(infrequent)
        expected = build_frame(expected_middle)

        encoder = RareLabelCategoricalEncoder(
            tol=tolerance,
            n_categories=9,
            variables=['category'],
        )
        encoder.fit(raw)
        result = encoder.transform(raw)

        pd.testing.assert_frame_equal(result, expected)
        assert encoder.variables == ['category']
        assert encoder.input_shape_ == (63, 2)
def test_RareLabelEncoder(dataframe_enc_big, dataframe_enc_big_na):
    """Exercise RareLabelCategoricalEncoder on the two fixture frames.

    Covers: grouping with ``variables=None``, a custom replacement label with
    an explicit variable list, rejected constructor arguments, the
    low-cardinality warning, and NA handling in ``fit`` and ``transform``.
    """
    # test case 1: variables=None, the encoder picks the variables itself
    encoder = RareLabelCategoricalEncoder(
        tol=0.06,
        n_categories=5,
        variables=None,
        replace_with='Rare',
    )
    transformed = encoder.fit_transform(dataframe_enc_big)

    rare_tail = ['D'] * 10 + ['Rare'] * 4 + ['G'] * 6
    expected = pd.DataFrame({
        'var_A': ['A'] * 6 + ['B'] * 10 + ['C'] * 4 + rare_tail,
        'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4 + rare_tail,
        'var_C': ['A'] * 4 + ['B'] * 6 + ['C'] * 10 + rare_tail,
    })

    # init params
    assert encoder.tol == 0.06
    assert encoder.n_categories == 5
    assert encoder.replace_with == 'Rare'
    assert encoder.variables == ['var_A', 'var_B', 'var_C']
    # fit params
    assert encoder.input_shape_ == (40, 3)
    # transform params
    pd.testing.assert_frame_equal(transformed, expected)

    # test case 2: user provides alternative grouping value and variable list
    encoder = RareLabelCategoricalEncoder(
        tol=0.15,
        n_categories=5,
        variables=['var_A', 'var_B'],
        replace_with='Other',
    )
    transformed = encoder.fit_transform(dataframe_enc_big)

    other_tail = ['Other'] * 4 + ['D'] * 10 + ['Other'] * 4 + ['G'] * 6
    expected = pd.DataFrame({
        'var_A': ['A'] * 6 + ['B'] * 10 + other_tail,
        'var_B': ['A'] * 10 + ['B'] * 6 + other_tail,
        # var_C is not in the variable list, so it is expected untouched.
        'var_C': (['A'] * 4 + ['B'] * 6 + ['C'] * 10 + ['D'] * 10
                  + ['E'] * 2 + ['F'] * 2 + ['G'] * 6),
    })

    # init params
    assert encoder.tol == 0.15
    assert encoder.n_categories == 5
    assert encoder.replace_with == 'Other'
    assert encoder.variables == ['var_A', 'var_B']
    # fit params
    assert encoder.input_shape_ == (40, 3)
    # transform params
    pd.testing.assert_frame_equal(transformed, expected)

    # invalid constructor arguments must be rejected
    for bad_kwargs in ({'tol': 5}, {'n_categories': 0.5}, {'replace_with': 0.5}):
        with pytest.raises(ValueError):
            RareLabelCategoricalEncoder(**bad_kwargs)

    # test case 3: when the variable has low cardinality
    with pytest.warns(UserWarning):
        encoder = RareLabelCategoricalEncoder(n_categories=10)
        encoder.fit(dataframe_enc_big)

    # test case 4: when dataset contains na, fit method
    with pytest.raises(ValueError):
        encoder = RareLabelCategoricalEncoder(n_categories=4)
        encoder.fit(dataframe_enc_big_na)

    # test case 5: when dataset contains na, transform method
    with pytest.raises(ValueError):
        encoder = RareLabelCategoricalEncoder(n_categories=4)
        encoder.fit(dataframe_enc_big)
        encoder.transform(dataframe_enc_big_na)
used_car_price_pipeline = Pipeline([ ("preprocessing", preprocessors.Preprocessing()), ("random_num_impute", ArbitraryNumberImputer(arbitrary_number=-9, variables=['year', 'odometer'])), ("cylinder_impute", CategoricalVariableImputer(imputation_method='missing', fill_value='-1', variables=['cylinders'])), ("categorical_impute", CategoricalVariableImputer(imputation_method='missing', fill_value='missing')), ("rare_label_manufacturer", RareLabelCategoricalEncoder(tol=0.01, variables='manufacturer', n_categories=5, replace_with='rare')), ("rare_label_cylinder", RareLabelCategoricalEncoder(tol=0.01, variables='cylinders', n_categories=5, replace_with='-1')), ("rare_label_condition", RareLabelCategoricalEncoder(tol=0.07, variables=['condition'], n_categories=3, replace_with='rare')), ('rare_label_type', RareLabelCategoricalEncoder(tol=0.04, variables=['type'], n_categories=3,
variables=config.model_config.categorical_vars, transformer=SimpleImputer(strategy="constant", fill_value="missing"), ), ), ( "temporal_variable", pp.TemporalVariableEstimator( variables=config.model_config.temporal_vars, reference_variable=config.model_config.drop_features, ), ), ( "rare_label_encoder", RareLabelCategoricalEncoder( tol=config.model_config.rare_label_tol, n_categories=config.model_config.rare_label_n_categories, variables=config.model_config.categorical_vars, ), ), ( "categorical_encoder", pp.SklearnTransformerWrapper( variables=config.model_config.categorical_vars, transformer=OrdinalEncoder(), ), ), ( "drop_features", pp.DropUnecessaryFeatures( variables_to_drop=config.model_config.drop_features, ),
X_test[variable] = np.where(X_test[variable].isin( frequent_cat), X_test[variable], 'Rare') return X_train, X_test # Transforming for variable in ['Neighborhood', 'Exterior1st', 'Exterior2nd']: X_train, X_test = rare_encoding(X_train, X_test, variable, 0.05) -# With Feature-Engine # Rare value encoder rare_encoder = RareLabelCategoricalEncoder( tol=0.05, # minimal percentage to be considered non-rare n_categories=4, # minimal number of categories the variable should have to re-cgroup rare categories variables=['Neighborhood', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'BsmtCond'] # variables to re-group ) ----OUTLIERS # let's make boxplots to visualise outliers in the continuous variables # and histograms to get an idea of the distribution for var in numerical: plt.figure(figsize=(6,4)) plt.subplot(1, 2, 1) fig = data.boxplot(column=var) fig.set_title('')
# Remove the rows whose A4 is 'l' or whose A5 is 'gg'. One combined mask
# replaces the original pair of filters, which was applied twice in a row;
# the second pass could not remove anything further.
read_data = read_data[(read_data.A4 != 'l') & (read_data.A5 != 'gg')]

# Encode rare categories in the dataset.
rare_encoder = RareLabelCategoricalEncoder(
    tol=0.05,  # minimal percentage to be considered non-rare
    n_categories=2,  # minimal number of categories the variable should have to re-group rare categories
    variables=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12'],  # variables to re-group
)
rare_encoder.fit(read_data)
read_data = rare_encoder.transform(read_data)

# A13 gets its own encoder because it uses a different tolerance (0.1).
rare_encoder_A13 = RareLabelCategoricalEncoder(
    tol=0.1,  # minimal percentage to be considered non-rare
    n_categories=2,  # minimal number of categories the variable should have to re-group rare categories
    variables=['A13'],  # variables to re-group
)
rare_encoder_A13.fit(read_data)
read_data = rare_encoder_A13.transform(read_data)