示例#1
0
def rare_encoding(data, variables, rare_threshold=0.05, n_rare_categories=4):
    """Replace infrequent labels of the selected variables with 'Rare'.

    A ``ce.RareLabelCategoricalEncoder`` is fitted on ``data`` and then
    applied to that same ``data``.

    Args:
        data: Dataset the encoder is fitted on and applied to.
        variables: Variables to encode; forwarded as ``variables``.
        rare_threshold: Forwarded to the encoder as ``tol``.
        n_rare_categories: Forwarded to the encoder as ``n_categories``.

    Returns:
        The result of the encoder's ``transform`` on ``data``.
    """
    rare_label_encoder = ce.RareLabelCategoricalEncoder(
        tol=rare_threshold,
        n_categories=n_rare_categories,
        variables=variables,
        replace_with='Rare',
    )
    # Learn the label grouping from the data before applying it.
    rare_label_encoder.fit(data)
    return rare_label_encoder.transform(data)
示例#2
0
def rare_encoding(X_train, X_test, variable, tolerance, n_cat):
    """Encode rare labels in the train and test sets with one shared encoder.

    The encoder is fitted on ``X_train`` only and then applied to both
    sets, so both are transformed with the same fitted state.

    Args:
        X_train: Training data used to fit the encoder.
        X_test: Test data; only transformed, never used for fitting.
        variable: Variables to encode; forwarded as ``variables``.
        tolerance: Forwarded to the encoder as ``tol``.
        n_cat: Forwarded to the encoder as ``n_categories``.

    Returns:
        A ``(transformed_train, transformed_test)`` tuple.
    """
    rare_label_encoder = ce.RareLabelCategoricalEncoder(
        tol=tolerance,
        n_categories=n_cat,
        variables=variable,
        replace_with='Rare',
        return_object=True,
    )
    # Fit on the training split only.
    rare_label_encoder.fit(X_train)
    train_encoded = rare_label_encoder.transform(X_train)
    test_encoded = rare_label_encoder.transform(X_test)
    return train_encoded, test_encoded

_logger = logging.getLogger(__name__)


loan_pipe = Pipeline([
    
    # categorical missing value imputer
    ('categorical_imputer', 
        pp.CategoricalImputer(variables=config.VARS_WITH_NA)),
    
    # frequent label categorical encoder
    ('rare_encoder', 
        ce.RareLabelCategoricalEncoder(tol=0.02, 
                                        n_categories=4,
                                        variables=config.VARS_WITH_RARE,
                                        replace_with='Other', 
                                        return_object=True)),

    # target guided ordinal categorical variable encoder
    ('ordinal_encoder', 
        ce.OrdinalCategoricalEncoder(encoding_method='ordered',
                                     variables=config.ORDINAL_VARS)),
    
    # nominal categorical variable encoder (one hot)
    ('nominal_encoder', 
        ce.OneHotCategoricalEncoder(variables=config.NOMINAL_VARS,
                                    drop_last=True)),
        
    # Yeo-Johnson numerical variable transformer
    ('yeo_johnson_transformer', 
示例#4
0
# Separate the original train rows from the combined frame: rows whose
# HasDetections equals the -1 sentinel are excluded here (per the original
# note, the sentinel marks the test rows).
# NOTE(review): testset is not created in this section; presumably it was
# extracted from all_dataset earlier - confirm.
trainset = all_dataset[all_dataset.HasDetections != -1]

# Drop the combined frame and ask the garbage collector to run.
del all_dataset
gc.collect()

# Missing-value imputer for the categorical columns (per the original note:
# fills with the most frequent value).
cat_imputer = missing_data_imputers.FrequentCategoryImputer(variables=dropped_machine_id_cat_names)
# Missing-value imputer for the numeric columns (per the original note:
# fills with the median value).
cont_imputer = missing_data_imputers.MeanMedianImputer(variables=dropped_targets_cont_names)

# Categorical imputer: fit on the train set, then transform the train set.
cat_imputer.fit(trainset)
trainset = cat_imputer.transform(trainset)
print(trainset.head())

# Categorical imputer: transform only (no fit) for the test set.
testset = cat_imputer.transform(testset)
print(testset.head())

# Rare-label encoder: fitted on the train set and applied to both sets.
# Per the original note, it finds rare categorical values (using a threshold)
# and replaces them with a 'rare' label, which helps prevent overfitting
# caused by the CountFrequencyCategoricalEncoder applied next.
rare_cat_enc = categorical_encoders.RareLabelCategoricalEncoder(variables=dropped_machine_id_cat_names)
rare_cat_enc.fit(trainset)
trainset = rare_cat_enc.transform(trainset)
# Transform only (no fit) for the test set.
testset = rare_cat_enc.transform(testset)

# Frequency encoder: per the original note, converts all categorical features
# to normalized frequency values. Fitted on the train set only.
cf_cat_enc = categorical_encoders.CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=dropped_machine_id_cat_names)
cf_cat_enc.fit(trainset)
trainset = cf_cat_enc.transform(trainset)
# Transform only (no fit) for the test set.
testset = cf_cat_enc.transform(testset)

# Numeric imputer: fit and transform on the train set.
cont_imputer.fit(trainset)
trainset = cont_imputer.transform(trainset)
print(trainset.head())
# Numeric imputer: transform only (no fit) for the test set.
testset = cont_imputer.transform(testset)
print(testset.head())
示例#5
0

# Name of the target column; it is excluded from the categorical steps below.
target = 'loan_status'

# Every column listed in object_cols except the target.
object_cols_x_target = [col for col in object_cols if col != target]

# Columns removed by the final pipeline step.
cols_to_drop = datetime_cols + ['address']

p = pipe([
    # Add missing indicators for the numeric features
    (
        "num_nan_ind",
        mdi.AddMissingIndicator(variables=numeric_cols),
    ),
    # Add an explicit "missing" level to the categorical features
    (
        "fill_cat_nas",
        mdi.CategoricalVariableImputer(
            fill_value="_MISSING_",
            variables=object_cols_x_target,
        ),
    ),
    # Bin rare levels of the categorical variables
    (
        "rare_cats",
        ce.RareLabelCategoricalEncoder(
            tol=0.02,
            replace_with="_RARE_",
            variables=object_cols_x_target,
        ),
    ),
    # Impute missing numeric values with the median
    (
        "rmmean",
        mdi.MeanMedianImputer(
            imputation_method='median',
            variables=numeric_cols,
        ),
    ),
    # Drop the date and address columns
    (
        "drop_date",
        fs.DropFeatures(features_to_drop=cols_to_drop),
    ),
])


# In[13]:


# Fit every step on the training frame and return the transformed result.
df_train_prepped = p.fit_transform(df_train)


# In[14]:

示例#6
0
# Apply calcTitle to every Name value to build the Title column (train and test)
t_train['Title'] = t_train['Name'].apply(calcTitle)

t_test['Title'] = t_test['Name'].apply(calcTitle)

# Delete the columns that are no longer used
del t_train['Name'], t_test['Name']
del t_train['Cabin'], t_test['Cabin']

# Fill missing values with the most frequent value of the same column
t_train['Embarked'] = t_train['Embarked'].fillna(t_train['Embarked'].mode()[0])
t_test['Fare'] = t_test['Fare'].fillna(t_test['Fare'].mode()[0])

# Transform the categorical values of Title
from feature_engine import categorical_encoders as ce

rare_encoder = ce.RareLabelCategoricalEncoder(tol=0.04489, n_categories=3)

# Fit the encoder on the training titles only, then reuse that fitted encoder
# for the test titles. Calling fit_transform on the test set would re-learn
# the frequent labels from test data, so the two sets could end up with
# different Title groupings.
t_train['Title'] = rare_encoder.fit_transform(pd.DataFrame(t_train['Title']))
t_test['Title'] = rare_encoder.transform(pd.DataFrame(t_test['Title']))

del rare_encoder

# Family size = siblings/spouses + parents/children + 1
t_train['Familysize'] = t_train['SibSp'] + t_train['Parch'] + 1
t_test['Familysize'] = t_test['SibSp'] + t_test['Parch'] + 1

# isAlone is 0 when Familysize > 1; otherwise it keeps the Familysize value
# itself (which is 1 assuming SibSp and Parch are never negative).
t_train['isAlone'] = np.where((t_train['Familysize'] > 1), 0,
                              t_train['Familysize'])
t_test['isAlone'] = np.where((t_test['Familysize'] > 1), 0,
                             t_test['Familysize'])

# Encode Sex numerically: male -> 0, female -> 1
t_train['Sex'] = t_train['Sex'].replace({'male': 0, 'female': 1})
    ('missing_ind',
     mdi.AddNaNBinaryImputer(
         variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])),
    
    ('imputer_num',
     mdi.MeanMedianImputer(
         imputation_method='median',
         variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])),
    
    ('imputer_cat', mdi.CategoricalVariableImputer(variables=categorical)),

    
    # categorical encoding - section 6
    ('rare_label_enc',
     ce.RareLabelCategoricalEncoder(tol=0.05,
                                    n_categories=6,
                                    variables=categorical + discrete)),
    ('categorical_enc',
     ce.OrdinalCategoricalEncoder(encoding_method='ordered',
                                  variables=categorical + discrete)),

    # discretisation + encoding - section 8
    ('discretisation',
     dsc.EqualFrequencyDiscretiser(q=5,
                                   return_object=True,
                                   variables=numerical)),
    ('encoding',
     ce.OrdinalCategoricalEncoder(encoding_method='ordered',
                                  variables=numerical)),
    
#     #Lets transform the monotonic relationship into a gaussian distribution