def visualize_target(self, df):
    """Draw and save a pie chart of the target class distribution.

    ``df['target']`` is decoded with the encoder stored under ``'target'``
    in ``EncoderStore``; one wedge is drawn per encoder class and the figure
    is written to ``VISUALIZATION_SAVE_DIRECTORY/pie_visualization_target``.

    Any exception is passed to ``self.errObj.handleErr`` and logged instead
    of being raised.

    :param df: frame holding the encoded ``'target'`` column.
    """
    logger.info("In DataVisualisation | visualize_target started")
    try:
        target_encoder = EncoderStore.get('target')
        labels = target_encoder.classes_
        logger.debug("Target Labels : " + str(labels))
        inverse_target = target_encoder.inverse_transform(df['target'])
        # Count once per encoder class so that wedge i always corresponds to
        # labels[i], instead of assuming the classes are exactly 'no'/'yes'.
        sizes = [(inverse_target == label).sum() for label in labels]
        logger.debug("Target counts : " + str(sizes))
        colors = ['lightcoral', 'yellowgreen']
        try:
            patches, _texts, _percent = plt.pie(sizes,
                                                colors=colors,
                                                autopct='%1.1f%%',
                                                labels=labels,
                                                startangle=90,
                                                wedgeprops={'edgecolor': 'w'})
            plt.legend(patches, labels, loc="best")
            plt.axis('equal')
            plt.tight_layout()
            # Interactive mode keeps show() non-blocking so the figure can
            # still be saved afterwards.
            plt.ion()
            plt.show()
            plt.savefig(
                os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                             'pie_visualization_target'))
            plt.pause(1)
        finally:
            # Always release the figure; otherwise a failure above would
            # leave it current and later plots would draw on top of it.
            plt.close()
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataVisualisation | visualize_target finished")
def get_binned_data(self, df=None, bins_per_col=4):
    """Return a copy of the frame with numerical columns turned into bin indicators.

    Each column in ``self.numerical_cols`` is cut into ``bins_per_col``
    equal-width intervals spanning its min..max, the intervals are dummy
    encoded (prefixed with the column name) and the source column is dropped.

    :param df: frame to bin; ``self.data_frame_original`` when ``None``.
    :param bins_per_col: number of intervals per numerical column.
    :return: the binned copy. Exceptions are logged through ``self.errObj``
        and whatever was built up to that point is returned.
    """
    logger.info("In DataFrameHandler | get_binned_data started")
    source = self.data_frame_original if df is None else df
    try:
        binned_dataframe = source.copy()
        for col in self.numerical_cols:
            lowest = binned_dataframe[col].min()
            highest = binned_dataframe[col].max()
            edges = np.linspace(lowest, highest, bins_per_col + 1)
            binned_dataframe[col] = pd.cut(binned_dataframe[col],
                                           edges,
                                           precision=1,
                                           include_lowest=True,
                                           right=True)
            indicators = pd.get_dummies(binned_dataframe[col], prefix=col)
            # Append the indicator columns, then discard the interval column.
            binned_dataframe = binned_dataframe.join(indicators).drop(col, axis=1)
        logger.debug('Columns after Dummy Encoding : ' +
                     str(binned_dataframe.columns.values))
    except Exception as exp:
        logger.error(str(self.errObj.handleErr(str(exp))))
    logger.info("In DataFrameHandler | get_binned_data finished")
    return binned_dataframe
def convert_to_categorical_values(self, df, cat_cols, use_label_encoder=False):
    """Encode the categorical columns of ``df``.

    With ``use_label_encoder=True`` every column of ``cat_cols`` that is
    listed in ``COLUMNS_CATEGORIZATION_APPLICABLE`` is label encoded in place
    and its fitted encoder is saved in ``EncoderStore`` under the column name.
    Otherwise every column of ``cat_cols`` is one-hot encoded: the indicator
    columns are joined to the frame and the source column is dropped.

    :param df: frame to encode (mutated in place by the label-encoding path).
    :param cat_cols: iterable of categorical column names.
    :param use_label_encoder: choose label encoding over one-hot encoding.
    :return: the encoded frame. Exceptions are logged through ``self.errObj``
        and the frame is returned in whatever state it reached.
    """
    logger.info("In PreProcessor | convert_to_categorical_values started")
    try:
        if use_label_encoder:
            for col in cat_cols:
                if col not in COLUMNS_CATEGORIZATION_APPLICABLE:
                    continue
                logger.debug('Categorizing Column : ' + str(col))
                encoder = LabelEncoder()
                logger.debug('Column unique value : ' + str(df[col].unique()))
                # fit_transform fits on the column itself; a separate fit on
                # the unique values beforehand is redundant.
                df[col] = encoder.fit_transform(df[col])
                EncoderStore.save(col, encoder)
        else:
            one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
            for col in cat_cols:
                encoded = one_hot_encoder.fit_transform(df[[col]])
                # Reuse df's index: join() aligns on index labels, so a
                # default RangeIndex would misalign (and NaN-fill) rows
                # whenever df is not indexed 0..n-1.
                enc_df = pd.DataFrame(
                    encoded,
                    index=df.index,
                    columns=one_hot_encoder.get_feature_names([col]))
                df = df.join(enc_df).drop(col, axis=1)
            logger.info('Columns in dataframe after one hot encoding: ' + str(df.columns))
            logger.info('Shape of dataframe after one hot encoding: ' + str(df.shape))
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In PreProcessor | convert_to_categorical_values finished")
    return df
def impute_missing_values(self, df, missing_val_info, method='strategic'):
    """Impute missing values of ``df`` with the chosen method.

    * ``'strategic'``: every column whose ``missing_val_info[col]['percentage']``
      is above zero is filled in place with the technique named for it in
      ``COLUMN_WISE_IMPUTE_TECHNIQUE_MAP`` (``'mode'``, ``'mean'``,
      ``'median'`` or ``'value'``, the last one filling with ``0``). Columns
      without a recognised technique are left untouched.
    * ``'knn'`` / ``'mice'``: the whole frame is handed to the matching helper.

    :param df: frame to impute.
    :param missing_val_info: per-column dict with a ``'percentage'`` entry.
    :param method: ``'strategic'``, ``'knn'`` or ``'mice'``; anything else is
        logged as an error and the frame is left unchanged.
    :return: the imputed frame. For ``'knn'``/``'mice'`` this is the object
        returned by the helper, which is not necessarily ``df`` itself, so
        callers should use the return value. Exceptions are logged through
        ``self.errObj`` rather than raised.
    """
    logger.info("In MissingValue | impute_missing_value started")
    try:
        if method == 'strategic':
            for col in df.columns:
                if missing_val_info[col]['percentage'] > 0:
                    logger.debug('Strategically imputing column : ' + str(col))
                    column_imputation_method = COLUMN_WISE_IMPUTE_TECHNIQUE_MAP.get(col)
                    if column_imputation_method == 'mode':
                        self.__impute_by_mode(df, col)
                    elif column_imputation_method == 'mean':
                        self.__impute_by_mean(df, col)
                    elif column_imputation_method == 'median':
                        self.__impute_by_median(df, col)
                    elif column_imputation_method == 'value':
                        self.__impute_by_value(df, col, 0)
        elif method == 'knn':
            # The helper returns the imputed data instead of mutating df;
            # dropping its result would make this branch a no-op.
            imputed = self.__impute_by_knn(df)
            if imputed is not None:
                df = imputed
        elif method == 'mice':
            # NOTE(review): the mice helper is not visible here; its result is
            # used only when it returns one - confirm its contract.
            imputed = self.__impute_by_mice(df)
            if imputed is not None:
                df = imputed
        else:
            logger.error("Incorrect Imputation Method !!! Possible values : strategic, knn, mice")
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In MissingValue | impute_missing_value finished")
    return df
def detect_categorical_columns(self, df):
    """Return the labels of all ``object``-dtype columns of ``df``.

    :param df: frame to inspect.
    :return: the subset of ``df.columns`` whose dtype equals ``object``.
    """
    logger.info("In PreProcessor | detect_categorical_columns started")
    try:
        logger.debug("In detect_categorical_columns | " + str(df.dtypes))
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In PreProcessor | detect_categorical_columns finished")
    # ``np.object`` was only an alias of the builtin and has been removed
    # from NumPy; comparing against ``object`` is equivalent and works on
    # every NumPy version.
    return df.columns[df.dtypes == object]
def __impute_by_value(self, df, col, value):
    """Fill the missing entries of ``df[col]`` with ``value``, in place.

    :param df: frame whose column is overwritten with the filled series.
    :param col: label of the column to fill.
    :param value: replacement for missing entries.

    Exceptions are logged through ``self.errObj`` rather than raised.
    """
    logger.info("In MissingValue | __impute_by_value started")
    try:
        message = "Value to replace NAN for column " + str(col) + " : " + str(value)
        logger.debug(message)
        filled_column = df[col].fillna(value)
        df[col] = filled_column
    except Exception as exp:
        logger.error(str(self.errObj.handleErr(str(exp))))
    logger.info("In MissingValue | __impute_by_value finished")
def __impute_by_mode(self, df, col):
    """Fill the missing entries of ``df[col]`` with the column's mode, in place.

    When several values tie for the mode, the first one returned by
    ``Series.mode()`` is used. A column with no mode at all (for example,
    one that holds only missing values) is left unchanged.

    :param df: frame whose column is overwritten with the filled series.
    :param col: label of the column to fill.

    Exceptions are logged through ``self.errObj`` rather than raised.
    """
    logger.info("In MissingValue | __impute_by_mode started")
    try:
        modes = df[col].mode()
        if modes.empty:
            logger.debug("No mode available for column " + str(col) + " ; column left unchanged")
        else:
            # mode() returns a Series. Passing that Series to fillna() aligns
            # it by index label, so only rows labelled 0..k-1 would be
            # filled; a scalar fills every missing entry.
            column_mode = modes.iloc[0]
            logger.debug("Mode obtained for column " + str(col) + " : " + str(column_mode))
            df[col] = df[col].fillna(column_mode)
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In MissingValue | __impute_by_mode finished")
def __impute_by_knn(self, df):
    """Impute ``df`` with ``fast_knn`` using a single neighbour.

    :param df: data containing missing values.
    :return: the result of ``fast_knn``; if that call fails the error is
        logged through ``self.errObj`` and ``df`` is returned as received.
    """
    logger.info("In MissingValue | __impute_by_knn started")
    result = df
    try:
        logger.debug("Applying KNN for imputation with k=1")
        result = fast_knn(data=df, k=1)
    except Exception as exp:
        logger.error(str(self.errObj.handleErr(str(exp))))
    logger.info("In MissingValue | __impute_by_knn finished")
    return result
def load_data(self, default_directory=DEFAULT_DIRECTORY):
    """Read ``DATA_CSV_FILENAME`` from ``default_directory`` into a DataFrame.

    :param default_directory: directory that contains the CSV file.
    :return: the loaded frame, or ``None`` when reading fails. The failure
        is logged through ``self.errObj``.
    """
    logger.info("In PreProcessor | load_data started")
    # Bind the result up front: without this a failed read would reach the
    # return statement with ``df`` undefined and raise UnboundLocalError,
    # hiding the error that was just logged.
    df = None
    try:
        data_file = os.path.join(default_directory, DATA_CSV_FILENAME)
        logger.debug("In load_data | Reading Data File : " + data_file)
        df = pd.read_csv(data_file)
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In PreProcessor | load_data finished")
    return df
def get_missing_values_info(self, df):
    """Summarise the missing values of every column of ``df``.

    :param df: frame to inspect.
    :return: dict mapping each column label to
        ``{'count': <missing entries>, 'percentage': <missing share * 100>}``.
        If an exception occurs it is logged through ``self.errObj`` and the
        entries gathered so far are returned.
    """
    logger.info("In MissingValue | get_missing_values_info started")
    info = {}
    try:
        for col in df.columns:
            column = df[col]
            missing_val_count = column.isnull().sum()
            total_row_count = column.shape[0]
            logger.debug("Missing values in Column " + col + " : " + str(missing_val_count))
            logger.debug("Total Entries in Column " + col + " : " + str(total_row_count))
            missing_share = (missing_val_count / total_row_count) * 100
            info[col] = dict(count=missing_val_count, percentage=missing_share)
    except Exception as exp:
        logger.error(str(self.errObj.handleErr(str(exp))))
    logger.info("In MissingValue | get_missing_values_info finished")
    return info
def get_dummies_data(self, df=None):
    """Return a copy of the frame with categorical columns dummy encoded.

    For every column in ``self.categorical_cols`` the indicator columns
    (prefixed with the column name) are joined to the copy; afterwards all
    columns named in ``self.categorical_cols`` are removed.

    :param df: frame to encode; ``self.data_frame_original`` when ``None``.
    :return: the encoded copy. Exceptions are logged through ``self.errObj``
        and whatever was built up to that point is returned.
    """
    logger.info("In DataFrameHandler | get_dummies_data started")
    source = self.data_frame_original if df is None else df
    try:
        dummies_dataframe = source.copy()
        for col in self.categorical_cols:
            indicators = pd.get_dummies(dummies_dataframe[col], prefix=col)
            dummies_dataframe = dummies_dataframe.join(indicators)
        # Keep every column that is not one of the original categorical ones.
        retained = []
        for name in dummies_dataframe.columns.values.tolist():
            if name not in self.categorical_cols:
                retained.append(name)
        dummies_dataframe = dummies_dataframe[retained]
        logger.debug('Columns after Dummy Encoding : ' +
                     str(dummies_dataframe.columns.values))
    except Exception as exp:
        logger.error(str(self.errObj.handleErr(str(exp))))
    logger.info("In DataFrameHandler | get_dummies_data finished")
    return dummies_dataframe
def main():
    """Train and evaluate the network on twelve preprocessing variants.

    The variants combine dummy or label encoding of the categorical
    variables with optional MinMax scaling or binning of the numerical
    variables, each with and without SMOTE on the training set. Every
    variant is handed to ``run_model`` under its own model name.
    """
    pre_process = PreProcessor()
    df = pre_process.load_data()
    df_handler = DataFrameHandler(df)

    def run_variants(frame, variants):
        """Split ``frame`` once and run one model per ``(name, perform_smote)`` pair."""
        split_result = df_handler.split_attribute_and_target(df=frame)
        X = split_result['attributes']
        y = split_result['target']
        for model_name, perform_smote in variants:
            print(X.head())
            # run_model requires the handler (it reads target_col from it);
            # omitting the argument raises TypeError.
            run_model(X, y, model_name, df_handler, perform_smote=perform_smote)

    # Models 1-2: dummy encoding -> MinMax scaling (without / with SMOTE).
    dummies_df = df_handler.get_dummies_data()
    logger.debug(dummies_df.head())
    scaled_df = df_handler.get_scaled_data(df=dummies_df)
    run_variants(scaled_df, [('DummyEncoded_MinMaxScaling', False),
                             ('DummyEncoded_MinMaxScaling_SMOTE', True)])

    # Model 3: dummy encoding -> SMOTE on the training set.
    run_variants(dummies_df, [('DummyEncoded_SMOTE', True)])

    # Model 4: dummy encoding only.
    run_variants(dummies_df, [('DummyEncoded', False)])

    # Models 5-6: dummy encoding -> binning (without / with SMOTE).
    dummies_df = df_handler.get_dummies_data()
    logger.debug(dummies_df.head())
    binned_df = df_handler.get_binned_data(df=dummies_df)
    run_variants(binned_df, [('DummyEncoded_Binning', False),
                             ('DummyEncoded_Binning_SMOTE', True)])

    # Models 7-8: label encoding -> MinMax scaling (without / with SMOTE).
    label_df = df_handler.get_label_encoded_data()
    scaled_df = df_handler.get_scaled_data(df=label_df)
    run_variants(scaled_df, [('LabelEncoded_MinMaxScaling', False),
                             ('LabelEncoded_MinMaxScaling_SMOTE', True)])

    # Model 9: label encoding -> SMOTE on the training set.
    run_variants(label_df, [('LabelEncoded_SMOTE', True)])

    # Model 10: label encoding only.
    run_variants(label_df, [('LabelEncoded', False)])

    # Models 11-12: label encoding -> binning (without / with SMOTE).
    label_df = df_handler.get_label_encoded_data()
    binned_df = df_handler.get_binned_data(df=label_df)
    run_variants(binned_df, [('LabelEncoded_Binning', False),
                             ('LabelEncoded_Binning_SMOTE', True)])
def _show_and_save_figure(file_stem):
    """Briefly display the current pyplot figure, save it under
    VISUALIZATION_SAVE_DIRECTORY as ``file_stem``, then close it."""
    try:
        # Interactive mode keeps show() non-blocking so the figure can still
        # be saved afterwards.
        plt.ion()
        plt.show()
        plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY, file_stem))
        plt.pause(1)
    finally:
        # Close even on failure so the next plot starts from a clean figure.
        plt.close()


def run_model(X, y, model_name, df_handler, perform_smote=False):
    """Train, evaluate and plot one model variant.

    The data is split 80/20 (``random_state=42``), the training part is
    optionally oversampled with SMOTE, and the targets are one-hot encoded
    with ``df_handler.target_col`` as prefix. A model from ``create_model``
    is trained for 45 epochs; loss, accuracy and ROC plots are saved under
    ``VISUALIZATION_SAVE_DIRECTORY`` and the test score, confusion matrix,
    classification report and AUC are printed.

    :param X: attribute frame.
    :param y: target values aligned with ``X``.
    :param model_name: label used in the printed output and as the prefix of
        the saved plot files.
    :param df_handler: object exposing ``target_col``, the prefix used for
        the one-hot target columns.
    :param perform_smote: oversample the training split with SMOTE when true.
    """
    logger.info('In NeuralNetworkModel | run_model Started for ' + model_name + ' model.')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    if perform_smote:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    y_train = pd.get_dummies(y_train, prefix=df_handler.target_col)
    y_test = pd.get_dummies(y_test, prefix=df_handler.target_col)

    model = create_model(X_train.shape[1])
    model.compile(optimizer='adamax', loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=45)
    score = model.evaluate(X_test, y_test, verbose=0)
    print(model_name + ' Model Testing Score : ' + str(score))

    for metric, title, suffix in (('loss', "Loss", '_losses'),
                                  ('accuracy', "Accuracy", '_accuracy')):
        values = history.history[metric]
        plt.plot(np.arange(0, len(values)), values)
        plt.title(title)
        plt.grid()
        _show_and_save_figure(model_name + suffix)

    # One predict() pass yields everything needed. predict_classes() and
    # predict_proba() no longer exist on current Keras models; for a
    # per-class probability output they were argmax / the raw prediction.
    # NOTE(review): assumes create_model emits one probability per class
    # (two columns) - confirm against create_model.
    y_pred = np.asarray(model.predict(X_test))
    logger.debug(y_pred)
    rounded_predictions = np.argmax(y_pred, axis=1)
    logger.debug(rounded_predictions)

    # Second one-hot column = positive class, matching column 1 of y_pred.
    # Positional access works for any target_col prefix and label spelling;
    # for a 'target' column with 'no'/'yes' labels it is 'target_yes'.
    y_true = y_test.iloc[:, 1]
    probs = y_pred[:, 1]

    print('Confusion Matrix: ')
    print(confusion_matrix(y_true, rounded_predictions))
    print('Classification Report: ')
    print(classification_report(y_true, rounded_predictions))
    auc = roc_auc_score(y_true, probs)
    print('AUC: %.2f' % auc)

    fpr, tpr, _thresholds = roc_curve(y_true, probs)
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.grid()
    _show_and_save_figure(model_name + '_roc_auc')
    logger.info('In NeuralNetworkModel | run_model finished')