def define_dataset(self):
    """Profile ``self.ds1_df`` and return its starting parameter summaries.

    Reports the features holding missing values, normalises column dtypes
    (low-cardinality numeric columns are re-typed as categorical by
    ``f.change_type``), then re-derives the numerical/categorical split and
    computes per-feature summary statistics.

    Returns:
        tuple: ``(par_num_df_start, par_cat_df_start)`` -- summary frames for
        the numerical and categorical features as produced by ``f.get_params``.
    """
    # Observe the features with missing values.
    # NOTE(review): the return value is discarded here -- presumably called
    # for its side-effect report; confirm against f.get_missing_value_feats.
    f.get_missing_value_feats(self.ds1_df)
    # Separate the categorical and numerical features.
    num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)
    # Change the datatype of categorical and numerical values.
    f.change_type(self.ds1_df, num_feats, count_threshold=5)
    # Re-derive the split: change_type may have re-classified columns.
    # (The original also contained a bare no-op `self.ds1_df.shape`
    # expression here; removed as dead code.)
    num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)
    par_num_df_start, par_cat_df_start = f.get_params(
        self.ds1_df, num_feats, cat_feats)
    return par_num_df_start, par_cat_df_start
def missing_value_imputations(self):
    """Impute all missing values in ``self.ds1_df`` in place.

    Numerical columns are filled with their column mean (their univariate
    distributions are assumed symmetrical at this stage, so mean == median).
    Categorical columns are filled with the mode when the mode covers more
    than ``mode_threshold`` percent of the column, otherwise with a KNN
    classifier prediction via ``f.impute_knn_classifier``.

    Returns:
        tuple: ``(par_num_df, par_cat_df)`` -- parameter summaries computed
        after the numerical imputation but before the categorical one.
    """
    # Record missing values for further validations:
    #indicator = MissingIndicator(missing_values=np.nan)
    #mask_missing_values_only = indicator.fit_transform(self.ds1_df)
    #mask_missing_values_only.shape
    num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
        self.ds1_df)

    # Numerical missing-value imputation with the column mean.
    self.ds1_df[num_feats_imp_df] = self.ds1_df[num_feats_imp_df].fillna(
        value=self.ds1_df[num_feats_imp_df].mean())

    # The remaining missing values are categorical.
    missing_feats_cat = f.get_missing_value_feats(self.ds1_df)
    par_num_df, par_cat_df = f.get_params(self.ds1_df, num_feats_imp_df,
                                          cat_feats_imp_df)

    # Categorical values: where mode frequency exceeds 80%, impute NA with
    # the mode; otherwise use a KNN model to impute the values.
    mode_threshold = 80
    for feature in missing_feats_cat:
        if par_cat_df.loc[feature]['MODE_PERCENTAGE'] > mode_threshold:
            # BUG FIX: the original used chained
            # `self.ds1_df[feature].fillna(..., inplace=True)`, which acts on
            # a temporary selection and may silently not update the frame
            # under pandas copy-on-write; assign the result back instead.
            self.ds1_df[feature] = self.ds1_df[feature].fillna(
                value=par_cat_df.loc[feature]['MODE'])
            print("Method : MODE , Feature : {} , Mode_Percentage : {}".
                  format(feature, par_cat_df.loc[feature]['MODE_PERCENTAGE']))
        else:
            imp_list, score = f.impute_knn_classifier(
                self.ds1_df, feature, 5)
            # Same chained-inplace fix as above.
            self.ds1_df[feature] = self.ds1_df[feature].fillna(value=imp_list)
            print(
                "Method : KNN , Feature : {} , Imputation Accuracy Score : {}"
                .format(feature, score))
    return par_num_df, par_cat_df
def define_params(self, df):
    """Return ``(par_num_df_start, par_cat_df_start)`` summaries for *df*.

    Splits *df* into numerical and categorical features and delegates the
    per-feature statistics to ``f.get_params``.
    """
    numeric_cols, categorical_cols = self.seperate_cat_num_var(df)
    return f.get_params(df, numeric_cols, categorical_cols)
print(len(num_feats_b),len(cat_feats_b)) #num_feats_bal,cat_feats_bal = f.distinct_feats(bal_df) #print(len(num_feats_bal),len(cat_feats_bal)) # Change the datatype of categorical and numerical values f.change_type(b_df,num_feats_b,count_threshold=5) # Seperate the categorical and numerical features # Create dataframe with Skew kurt, Missing val and Outliers for num_feats_imp_df num_feats_b,cat_feats_b = f.distinct_feats(b_df) for i in ['SK_ID_BUREAU','SK_ID_CURR']: num_feats_b.remove(i) print(len(num_feats_b),len(cat_feats_b)) par_num_df_start, par_cat_df_start = f.get_params(b_df, num_feats_b, cat_feats_b) ############################# FEATURE TREATMENT AND EXTRACTION ######################### # As the features are expected to be extracted and grouped at SK_ID_CURR level # to synchronise at the Loan Application Client level.Hence we need to extract # aggregated information at Client level out of the dataset # This means that treatment to individual columns would not be generallised as # we might loose information. Hence we will extract aggregated features first and then # apply data correction (MISSING VALUES and OUTLIERS etc) at aggregate level based on the # features qualitatively b_agg_df = pd.DataFrame() # Create a object for aggregation at SK_ID_CURR level #b_agg = b_df.groupby('SK_ID_CURR') #Aggregating bureau data at Customer Id level
# Separate the categorical and numerical features.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
# TARGET is the label and SK_ID_CURR an identifier -- neither is a feature.
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Change the datatype of categorical and numerical values.
f.change_type(x_df, num_feats, count_threshold=5)

# Re-derive the split after the type change and drop the non-feature columns
# again, then compute the per-feature parameter summaries.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')
par_num_df_start, par_cat_df_start = f.get_params(x_df, num_feats, cat_feats)

############################# IDENTIFYING MISSING FEATS #########################
# Identify the columns that contain NA values.
missing_value_feats = f.get_missing_value_feats(x_df)
# BUG FIX: the original had a bare `missing_value_feats` expression here,
# which is a no-op outside a notebook; report the list explicitly instead.
print(missing_value_feats)

# Calculate the missing-value percentage and visualise it.
missing_values_perc_df = f.missing_val_perc(missing_value_feats, x_df)
val = missing_values_perc_df[0].sort_values(ascending=False)
f.plot_bar(val.index, (50, 10), val)

#################### REMOVING THE VALUES DIRECTLY ##########################
# Direct imputations: remove the records for attributes containing fewer than
# 5% null values, and remove attributes containing more than 65% null values.
imp_df = f.impute_values(x_df, missing_value_feats, 65, action=True)
################################ IMPORT LATEST DATASET ################################ train_df = pd.read_csv(wd + "\\Output\\application_train_bureau_clean.csv") train_df.drop(train_df.filter(like='Unnamed').columns, axis=1, inplace=True) # Change the datatype of categorical and numerical values (NOT REQUIRED) #f.change_type(train_df,num_feats,count_threshold=5) # Seperate the categorical and numerical features num_feats, cat_feats = f.distinct_feats(train_df) print(len(num_feats), len(cat_feats)) num_feats.remove('TARGET') num_feats.remove('SK_ID_CURR') # Get the list of attributes and their properties to start par_num_df_start, par_cat_df_start = f.get_params(train_df, num_feats, cat_feats) ############# FEATURE CORRELATIONS ########## # Code Block to find the correlated features for various features including featues including each category correlations # This can be used to derive/impute na values when the correlations are strong with other features using sklearn.Impute Iterativeimputer # Not using this approach for now as there are no strong correlations with missing value columns x_df_dum = pd.get_dummies(train_df) x_df_Default_dum = x_df_dum[x_df_dum['TARGET'] == 1] x_df_dum.columns = x_df_dum.columns.map(f.remove_space) x_df_Default_dum.columns = x_df_Default_dum.columns.map(f.remove_space) # General correlations wrt Correlations in case of default. x_corr_default = x_df_Default_dum.corr() x_corr = x_df_dum.corr()
# Seperate the categorical and numerical features num_feats_p, cat_feats_p = f.distinct_feats(p_df) print(len(num_feats_p), len(cat_feats_p)) # Change the datatype of categorical and numerical values f.change_type(p_df, num_feats_p, count_threshold=5) # Seperate the categorical and numerical features # Create dataframe with Skew kurt, Missing val and Outliers for num_feats_imp_df num_feats_p, cat_feats_p = f.distinct_feats(p_df) for i in ['SK_ID_PREV', 'SK_ID_CURR']: num_feats_p.remove(i) print(len(num_feats_p), len(cat_feats_p)) par_num_df_start, par_cat_df_start = f.get_params(p_df, num_feats_p, cat_feats_p) ############################# FEATURE TREATMENT AND EXTRACTION ######################### # As the features are expected to be extracted and grouped at SK_ID_CURR level # to synchronise at the Loan Application Client level.Hence we need to extract # aggregated information at Client level out of the dataset # This means that treatment to individual columns would not be generallised as # we might loose information. Hence we will extract aggregated features first and then # apply data correction (MISSING VALUES and OUTLIERS etc) at aggregate level based on the # features qualitatively p_agg_df = pd.DataFrame() # Create a object for aggregation at SK_ID_CURR level #b_agg = b_df.groupby('SK_ID_CURR') #Aggregating bureau data at Customer Id level
# Seperate the categorical and numerical features num_feats_c, cat_feats_c = f.distinct_feats(c_df) print(len(num_feats_c), len(cat_feats_c)) # Change the datatype of categorical and numerical values f.change_type(c_df, num_feats_c, count_threshold=5) # Seperate the categorical and numerical features # Create dataframe with Skew kurt, Missing val and Outliers for num_feats_imp_df num_feats_c, cat_feats_c = f.distinct_feats(c_df) for i in ['SK_ID_CURR', 'SK_ID_PREV']: num_feats_c.remove(i) print(len(num_feats_c), len(cat_feats_c)) par_num_df_start, par_cat_df_start = f.get_params(c_df, num_feats_c, cat_feats_c) ############################# FEATURE TREATMENT AND EXTRACTION ######################### # As the features are expected to be extracted and grouped at SK_ID_CURR level # to synchronise at the Loan Application Client level.Hence we need to extract # aggregated information at Client level out of the dataset # This means that treatment to individual columns would not be generallised as # we might loose information. Hence we will extract aggregated features first and then # apply data correction (MISSING VALUES and OUTLIERS etc) at aggregate level based on the # features qualitatively c_agg_df = pd.DataFrame() # Create a object for aggregation at SK_ID_CURR level #b_agg = b_df.groupby('SK_ID_CURR') #Aggregating bureau data at Customer Id level