def _num_feature_extraction(self, num_feats_b, b_df, b_agg_df, p_id):
    """Aggregate each numeric feature of ``b_df`` at the ``p_id`` level.

    For every feature, ``f.get_aggregate_features_num`` adds aggregate
    columns (at least ``<feature>_mean``, ``<feature>_median`` and
    ``<feature>_std``) to ``b_agg_df``, indexed by ``p_id``.

    Parameters
    ----------
    num_feats_b : iterable of str
        Numeric column names of ``b_df`` to aggregate.
    b_df : pd.DataFrame
        Raw records, potentially multiple rows per ``p_id``.
    b_agg_df : pd.DataFrame
        Accumulator frame for the aggregated features (may start empty).
    p_id : str
        Grouping-key column name (e.g. ``'SK_ID_CURR'``).

    Returns
    -------
    pd.DataFrame
        ``b_agg_df`` with ``p_id`` inserted as the first column and a
        fresh 0..n-1 RangeIndex.
    """
    for feature in num_feats_b:
        b_agg_df = f.get_aggregate_features_num(b_df, b_agg_df, feature, p_id)
        # Single-record groups get NaN std from pandas; in that case
        # mean == median, so pin the std to 0 instead of leaving it missing.
        std_col = feature + '_std'
        b_agg_df[std_col] = np.where(
            b_agg_df[std_col].isna()
            & (b_agg_df[feature + '_mean'] == b_agg_df[feature + '_median']),
            0,
            b_agg_df[std_col])
    # Promote the group index to a leading column, then renumber the rows.
    b_agg_df.insert(0, p_id, b_agg_df.index)
    b_agg_df.reset_index(drop=True, inplace=True)
    return b_agg_df
# Features are aggregated at the SK_ID_CURR level so the bureau data lines
# up with the loan application at client level. Correcting individual
# columns before aggregation could lose information, so we aggregate first
# and apply data correction (missing values, outliers, etc.) per feature
# afterwards, qualitatively.
b_agg_df = pd.DataFrame()

# Aggregate every numeric bureau feature at the client (SK_ID_CURR) level.
for feature in num_feats_b:
    b_agg_df = f.get_aggregate_features_num(b_df, b_agg_df, feature, 'SK_ID_CURR')
    # Single-record groups get NaN std (and mean == median there); use 0.
    b_agg_df[feature + '_std'] = np.where(
        b_agg_df[feature + '_std'].isna()
        & (b_agg_df[feature + '_mean'] == b_agg_df[feature + '_median']),
        0,
        b_agg_df[feature + '_std'])

# Per-category counts for each categorical bureau feature: one
# '<feature>_<value>_count' column per distinct category value, aligned on
# the SK_ID_CURR index via .xs (so the index must NOT be reset yet).
for feature in cat_feats_b:
    b_agg_cat = b_df.groupby('SK_ID_CURR')[feature].value_counts()
    for i in b_df[feature].unique():
        # NOTE(review): this loop body was truncated in the original
        # source; reconstructed from the identical previous-application
        # block elsewhere in this file — confirm against it.
        # Assumes category values `i` are strings — TODO confirm.
        col = feature + '_' + i + '_count'
        b_agg_df[col] = b_agg_cat.xs(key=i, level=1)
        # Clients with no bureau rows for this category genuinely have a
        # count of zero, hence fill NA with 0. Assign rather than chained
        # inplace fillna, which may silently operate on a copy.
        b_agg_df[col] = b_agg_df[col].fillna(0)

# Promote SK_ID_CURR from the index to the first column, then renumber.
# BUG FIX: the original called reset_index(drop=True) without inplace or
# assignment, discarding the result; done effectively here, after the
# index-aligned categorical counts above.
b_agg_df.insert(0, 'SK_ID_CURR', b_agg_df.index)
b_agg_df.reset_index(drop=True, inplace=True)
# Features are aggregated at the SK_ID_CURR level so the previous-
# application data lines up with the loan application at client level.
# Correcting individual columns before aggregation could lose information,
# so we aggregate first and apply data correction (missing values,
# outliers, etc.) per feature afterwards, qualitatively.
p_agg_df = pd.DataFrame()

# Aggregate every numeric previous-application feature at the client level.
for feature in num_feats_p:
    p_agg_df = f.get_aggregate_features_num(p_df, p_agg_df, feature, 'SK_ID_CURR')
    # Single-record groups get NaN std (and mean == median there); use 0.
    p_agg_df[feature + '_std'] = np.where(
        p_agg_df[feature + '_std'].isna()
        & (p_agg_df[feature + '_mean'] == p_agg_df[feature + '_median']),
        0,
        p_agg_df[feature + '_std'])

# Per-category counts for each categorical feature: one
# '<feature>_<value>_count' column per distinct category value, aligned on
# the SK_ID_CURR index via .xs (so the index must NOT be reset yet).
for feature in cat_feats_p:
    p_agg_cat = p_df.groupby('SK_ID_CURR')[feature].value_counts()
    # Assumes category values `i` are strings — TODO confirm.
    for i in p_df[feature].unique():
        col = feature + '_' + i + '_count'
        p_agg_df[col] = p_agg_cat.xs(key=i, level=1)
        # NA means the client has no rows for this category, i.e. a true
        # count of zero. Assign rather than chained inplace fillna, which
        # may silently operate on a copy.
        p_agg_df[col] = p_agg_df[col].fillna(0)

# Promote SK_ID_CURR from the index to the first column, then renumber.
# BUG FIX: the original called reset_index(drop=True) without inplace or
# assignment, discarding the result; done effectively here, after the
# index-aligned categorical counts above.
p_agg_df.insert(0, 'SK_ID_CURR', p_agg_df.index)
p_agg_df.reset_index(drop=True, inplace=True)