def rare_encoding(data, variables, rare_threshold=0.05, n_rare_categories=4):
    """Collapse infrequent labels of the given categorical variables into 'Rare'.

    Fits a RareLabelCategoricalEncoder on `data` and returns the transformed
    frame: any category whose frequency falls below `rare_threshold` (provided
    the variable has at least `n_rare_categories` categories) is replaced by
    the single label 'Rare'.
    """
    grouper = ce.RareLabelCategoricalEncoder(
        tol=rare_threshold,
        n_categories=n_rare_categories,
        variables=variables,
        replace_with='Rare',
    )
    # Learn the frequent categories from the data itself, then apply.
    grouper.fit(data)
    return grouper.transform(data)
def rare_encoding(X_train, X_test, variable, tolerance, n_cat):
    """Group rare labels of `variable`, fitting on train and applying to both sets.

    The encoder learns the frequent categories from `X_train` only; labels
    below the `tolerance` frequency (for variables with at least `n_cat`
    categories) become 'Rare'.  Returns the transformed (train, test) pair.
    """
    rare_grouper = ce.RareLabelCategoricalEncoder(
        tol=tolerance,
        n_categories=n_cat,
        variables=variable,
        replace_with='Rare',
        return_object=True,
    )
    rare_grouper.fit(X_train)
    train_out = rare_grouper.transform(X_train)
    test_out = rare_grouper.transform(X_test)
    return train_out, test_out
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)

# End-to-end preprocessing pipeline for the loan dataset.
# NOTE(review): the Pipeline definition continues beyond this chunk —
# the 'yeo_johnson_transformer' step is cut off at the end.
loan_pipe = Pipeline([
    # categorical missing value imputer
    ('categorical_imputer', pp.CategoricalImputer(variables=config.VARS_WITH_NA)),
    # frequent label categorical encoder: labels under the tol=0.02 frequency
    # threshold are replaced with 'Other'; return_object=True keeps the
    # columns as object dtype for the encoders that follow
    ('rare_encoder', ce.RareLabelCategoricalEncoder(tol=0.02, n_categories=4, variables=config.VARS_WITH_RARE, replace_with='Other', return_object=True)),
    # target guided ordinal categorical variable encoder
    ('ordinal_encoder', ce.OrdinalCategoricalEncoder(encoding_method='ordered', variables=config.ORDINAL_VARS)),
    # nominal categorical variable encoder (one hot); drop_last=True drops one
    # dummy per variable to avoid redundant columns
    ('nominal_encoder', ce.OneHotCategoricalEncoder(variables=config.NOMINAL_VARS, drop_last=True)),
    # Yeo-Johnson numerical variable transformer
    ('yeo_johnson_transformer',
trainset = all_dataset[all_dataset.HasDetections != -1] # seperate original train and test sets using -1 sentinel value del all_dataset gc.collect() cat_imputer = missing_data_imputers.FrequentCategoryImputer(variables=dropped_machine_id_cat_names) # missing value imputer for categorical values (the most frequent value) cont_imputer = missing_data_imputers.MeanMedianImputer(variables=dropped_targets_cont_names) # missing value imputer for numeric values (median value) cat_imputer.fit(trainset) trainset = cat_imputer.transform(trainset) # cat_imputer fit and transform for train set print(trainset.head()) testset = cat_imputer.transform(testset) # cat_imputer (only) transform for test set print(testset.head()) rare_cat_enc = categorical_encoders.RareLabelCategoricalEncoder(variables=dropped_machine_id_cat_names) rare_cat_enc.fit(trainset) trainset = rare_cat_enc.transform(trainset) # find rare (using a threshold value) categorical values from train set and replace these values with 'rare', this technique is useful to prevent overfitting because of CountFrequencyCategoricalEncoder testset = rare_cat_enc.transform(testset) # rare_cat_enc (only) transform for test set cf_cat_enc = categorical_encoders.CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=dropped_machine_id_cat_names) cf_cat_enc.fit(trainset) trainset = cf_cat_enc.transform(trainset) # Convert all categorical features to normalized frequency values testset = cf_cat_enc.transform(testset) # transform for test set cont_imputer.fit(trainset) trainset = cont_imputer.transform(trainset) # ift and transform for cont_imputer on train set print(trainset.head()) testset = cont_imputer.transform(testset) # (only) transform for cont_imputer on test set print(testset.head())
# Supervised target; exclude it from the categorical feature list.
target = 'loan_status'
object_cols_x_target = [c for c in object_cols if c != target]
# Date columns and the free-text address column are dropped at the end.
cols_to_drop = datetime_cols + ['address']

# Preprocessing pipeline (pipe/mdi/ce/fs are imported elsewhere in the file).
p = pipe([
    # Add missing indicators for numeric features
    ("num_nan_ind",mdi.AddMissingIndicator(variables=numeric_cols)),
    # Add missing level for categorical features
    ("fill_cat_nas",mdi.CategoricalVariableImputer( fill_value = "_MISSING_", variables=object_cols_x_target)),
    # Bin rare levels (below the tol=0.02 frequency threshold) of categorical
    # variables into the single label "_RARE_"
    ("rare_cats",ce.RareLabelCategoricalEncoder( tol = 0.02, replace_with = "_RARE_", variables=object_cols_x_target)),
    # Impute missing numeric variables with the median
    ("rmmean",mdi.MeanMedianImputer(imputation_method = 'median',variables=numeric_cols)),
    # Drop dates and address columns
    ("drop_date",fs.DropFeatures(features_to_drop=cols_to_drop))])

# In[13]:

# Fit all steps on the training frame and transform it in one pass.
df_train_prepped = p.fit_transform(df_train)

# In[14]:
# Derive a Title feature from the passenger name (calcTitle defined earlier),
# then drop the columns that are no longer needed.
t_train['Title'] = t_train['Name'].apply(calcTitle)
t_test['Title'] = t_test['Name'].apply(calcTitle)

# delete unused variables
del t_train['Name'], t_test['Name']
del t_train['Cabin'], t_test['Cabin']

# Fill the few missing values with the most frequent value (mode).
t_train['Embarked'] = t_train['Embarked'].fillna(t_train['Embarked'].mode()[0])
t_test['Fare'] = t_test['Fare'].fillna(t_test['Fare'].mode()[0])

# Collapse rare titles into a single 'Rare' label.
from feature_engine import categorical_encoders as ce
rare_encoder = ce.RareLabelCategoricalEncoder(tol=0.04489, n_categories=3)
t_train['Title'] = rare_encoder.fit_transform(pd.DataFrame(t_train['Title']))
# BUG FIX: the encoder must be fitted on the training data only.  The original
# code called fit_transform on the test set as well, which leaked test
# information and could yield an inconsistent title grouping between the two
# sets.  Use the already-fitted encoder to transform the test titles.
t_test['Title'] = rare_encoder.transform(pd.DataFrame(t_test['Title']))
del rare_encoder

# Family-based features: household size (self + siblings/spouses + parents/children)
# and a flag for traveling alone.
t_train['Familysize'] = t_train['SibSp'] + t_train['Parch'] + 1
t_test['Familysize'] = t_test['SibSp'] + t_test['Parch'] + 1
# Familysize is always >= 1, so this yields 1 when alone and 0 otherwise.
t_train['isAlone'] = np.where((t_train['Familysize'] > 1), 0, t_train['Familysize'])
t_test['isAlone'] = np.where((t_test['Familysize'] > 1), 0, t_test['Familysize'])

# Binary-encode sex.
t_train['Sex'] = t_train['Sex'].replace({'male': 0, 'female': 1})
('missing_ind', mdi.AddNaNBinaryImputer( variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])), ('imputer_num', mdi.MeanMedianImputer( imputation_method='median', variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])), ('imputer_cat', mdi.CategoricalVariableImputer(variables=categorical)), # categorical encoding - section 6 ('rare_label_enc', ce.RareLabelCategoricalEncoder(tol=0.05, n_categories=6, variables=categorical + discrete)), ('categorical_enc', ce.OrdinalCategoricalEncoder(encoding_method='ordered', variables=categorical + discrete)), # discretisation + encoding - section 8 ('discretisation', dsc.EqualFrequencyDiscretiser(q=5, return_object=True, variables=numerical)), ('encoding', ce.OrdinalCategoricalEncoder(encoding_method='ordered', variables=numerical)), # #Lets transform the monotonic relationship into a gaussian distribution