예제 #1
0
 def visualize_target(self, df):
     """Show and save a pie chart of the target class distribution.

     The encoder stored under ``'target'`` in ``EncoderStore`` is used to
     map the encoded values in ``df['target']`` back to their labels. One
     wedge is drawn per encoder class, so wedge sizes and wedge labels
     always come from the same sequence. The figure is saved as
     ``'pie_visualization_target'`` in ``VISUALIZATION_SAVE_DIRECTORY``.

     Any exception is passed to ``self.errObj.handleErr`` and logged; it is
     not re-raised.

     Args:
         df: Frame holding the encoded ``'target'`` column.
     """
     logger.info("In DataVisualisation | visualize_target started")
     try:
         target_encoder = EncoderStore.get('target')
         labels = target_encoder.classes_
         logger.debug("Target Labels : " + str(labels))
         inverse_target = target_encoder.inverse_transform(df['target'])
         # Count once per encoder class rather than hard-coding 'no'/'yes':
         # the sizes then match ``labels`` in both order and length.
         sizes = [(inverse_target == label).sum() for label in labels]
         logger.debug("Target counts : " + str(sizes))
         colors = ['lightcoral', 'yellowgreen']
         patches, texts, percent = plt.pie(sizes,
                                           colors=colors,
                                           autopct='%1.1f%%',
                                           labels=labels,
                                           startangle=90,
                                           wedgeprops={'edgecolor': 'w'})
         plt.legend(patches, labels, loc="best")
         plt.axis('equal')
         plt.tight_layout()
         # Interactive mode keeps show() non-blocking so the figure is
         # still alive when it is saved on the next line.
         plt.ion()
         plt.show()
         plt.savefig(
             os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                          'pie_visualization_target'))
         plt.pause(1)
         plt.close()
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In DataVisualisation | visualize_target finished")
 def get_binned_data(self, df=None, bins_per_col=4):
     """Return a copy of ``df`` with numerical columns binned and dummy-encoded.

     Each column in ``self.numerical_cols`` is cut into ``bins_per_col``
     equal-width intervals spanning its min..max, the intervals are
     expanded with ``pd.get_dummies`` (prefixed with the column name) and
     the original column is dropped.

     Any exception is passed to ``self.errObj.handleErr`` and logged; it is
     not re-raised, so the returned frame may be only partially binned.

     Args:
         df: Frame to bin; defaults to ``self.data_frame_original``.
         bins_per_col: Number of equal-width bins per numerical column.

     Returns:
         The binned frame, or ``None`` when the input could not even be
         copied.
     """
     logger.info("In DataFrameHandler | get_binned_data started")
     if df is None:
         df = self.data_frame_original
     # Pre-bind the result so a failure in df.copy() is reported through the
     # logged error instead of a confusing UnboundLocalError at return.
     binned_dataframe = None
     try:
         binned_dataframe = df.copy()
         for col in self.numerical_cols:
             # bins_per_col intervals need bins_per_col + 1 edges.
             bins = np.linspace(binned_dataframe[col].min(),
                                binned_dataframe[col].max(),
                                bins_per_col + 1)
             binned_dataframe[col] = pd.cut(binned_dataframe[col],
                                            bins,
                                            precision=1,
                                            include_lowest=True,
                                            right=True)
             cat_list = pd.get_dummies(binned_dataframe[col], prefix=col)
             binned_dataframe = binned_dataframe.join(cat_list)
             binned_dataframe = binned_dataframe.drop(col, axis=1)
         logger.debug('Columns after Dummy Encoding : ' +
                      str(binned_dataframe.columns.values))
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In DataFrameHandler | get_binned_data finished")
     return binned_dataframe
예제 #3
0
 def convert_to_categorical_values(self,
                                   df,
                                   cat_cols,
                                   use_label_encoder=False):
     """Encode the categorical columns of ``df``.

     With ``use_label_encoder=True`` every column of ``cat_cols`` that is
     listed in ``COLUMNS_CATEGORIZATION_APPLICABLE`` is label-encoded in
     place and its fitted encoder is saved in ``EncoderStore`` under the
     column name. Otherwise every column of ``cat_cols`` is replaced by its
     one-hot encoded columns.

     Any exception is passed to ``self.errObj.handleErr`` and logged; it is
     not re-raised.

     Args:
         df: Frame to encode.
         cat_cols: Names of the categorical columns.
         use_label_encoder: Label-encode when True, one-hot encode when
             False.

     Returns:
         The encoded frame (``df`` as it stood when an error occurred, if
         one did).
     """
     logger.info("In PreProcessor | convert_to_categorical_values started")
     try:
         if use_label_encoder:
             for col in cat_cols:
                 if col not in COLUMNS_CATEGORIZATION_APPLICABLE:
                     continue
                 logger.debug('Categorizing Column : ' + str(col))
                 logger.debug('Column unique value : ' +
                              str(df[col].unique()))
                 encoder = LabelEncoder()
                 # fit_transform fits from scratch, so no separate fit()
                 # on the unique values is needed beforehand.
                 df[col] = encoder.fit_transform(df[col])
                 EncoderStore.save(col, encoder)
         else:
             # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output``;
             # fall back to the old keyword on earlier releases.
             try:
                 one_hot_encoder = OneHotEncoder(sparse_output=False,
                                                 handle_unknown='ignore')
             except TypeError:
                 one_hot_encoder = OneHotEncoder(sparse=False,
                                                 handle_unknown='ignore')
             for col in cat_cols:
                 encoded = one_hot_encoder.fit_transform(df[[col]])
                 # get_feature_names was replaced by get_feature_names_out;
                 # prefer the new name and fall back to the old one.
                 feature_names = getattr(one_hot_encoder,
                                         'get_feature_names_out', None)
                 if feature_names is None:
                     feature_names = one_hot_encoder.get_feature_names
                 # Reuse df's index: join() aligns on index labels, so a
                 # fresh RangeIndex would misalign rows whenever df does
                 # not have the default 0..n-1 index.
                 enc_df = pd.DataFrame(encoded,
                                       columns=feature_names([col]),
                                       index=df.index)
                 df = df.join(enc_df)
                 df = df.drop(col, axis=1)
             logger.info('Columns in dataframe after one hot encoding: ' +
                         str(df.columns))
             logger.info('Shape of dataframe after one hot encoding: ' +
                         str(df.shape))
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In PreProcessor | convert_to_categorical_values finished")
     return df
예제 #4
0
 def impute_missing_values(self, df, missing_val_info, method='strategic'):
     """Impute the missing values of ``df`` with the requested method.

     * ``'strategic'``: every column whose ``'percentage'`` entry in
       ``missing_val_info`` is above zero is imputed with the technique
       configured for it in ``COLUMN_WISE_IMPUTE_TECHNIQUE_MAP``
       (``'mode'``, ``'mean'``, ``'median'`` or ``'value'``, the latter
       filling with 0). Columns without a configured technique are left
       untouched.
     * ``'knn'`` / ``'mice'``: delegated to the matching private helper.

     Any exception is passed to ``self.errObj.handleErr`` and logged; it is
     not re-raised. An unknown ``method`` is logged as an error.

     Args:
         df: Frame to impute.
         missing_val_info: Mapping of column name to a dict with at least
             a ``'percentage'`` key.
         method: One of ``'strategic'``, ``'knn'`` or ``'mice'``.

     Returns:
         The imputed frame. For ``'knn'``/``'mice'`` this is whatever the
         helper returned (when it returned something); otherwise it is
         ``df`` itself.
     """
     logger.info("In MissingValue | impute_missing_value started")
     try:
         if method == 'strategic':
             for col in df.columns:
                 if missing_val_info[col]['percentage'] > 0:
                     logger.debug('Strategically imputing column : ' + str(col))
                     column_imputation_method = COLUMN_WISE_IMPUTE_TECHNIQUE_MAP.get(col)
                     if column_imputation_method == 'mode':
                         self.__impute_by_mode(df, col)
                     elif column_imputation_method == 'mean':
                         self.__impute_by_mean(df, col)
                     elif column_imputation_method == 'median':
                         self.__impute_by_median(df, col)
                     elif column_imputation_method == 'value':
                         self.__impute_by_value(df, col, 0)
         elif method == 'knn':
             # The KNN helper returns the imputed data rather than
             # modifying df, so its result must be kept.
             imputed = self.__impute_by_knn(df)
             if imputed is not None:
                 df = imputed
         elif method == 'mice':
             # NOTE(review): __impute_by_mice is not visible here; its
             # result is used only when it returns one - confirm.
             imputed = self.__impute_by_mice(df)
             if imputed is not None:
                 df = imputed
         else:
             logger.error("Incorrect Imputation Method !!! Possible values : strategic, knn, mice")
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In MissingValue | impute_missing_value finished")
     return df
예제 #5
0
 def detect_categorical_columns(self, df):
     """Return the labels of the object-dtype columns of ``df``.

     The dtypes of ``df`` are logged at debug level; a failure while
     logging is passed to ``self.errObj.handleErr`` and logged.

     Args:
         df: Frame to inspect.

     Returns:
         The subset of ``df.columns`` whose dtype is ``object``.
     """
     logger.info("In PreProcessor | detect_categorical_columns started")
     try:
         logger.debug("In detect_categorical_columns | " + str(df.dtypes))
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In PreProcessor | detect_categorical_columns finished")
     # The ``np.object`` alias was removed from NumPy; the builtin ``object``
     # is the same dtype and works on every NumPy version.
     return df.columns[df.dtypes == object]
예제 #6
0
 def __impute_by_value(self, df, col, value):
     """Fill the missing entries of ``df[col]`` with ``value``.

     The column is reassigned on ``df`` itself. Any exception is passed to
     ``self.errObj.handleErr`` and logged instead of being raised.

     Args:
         df: Frame whose column is filled.
         col: Label of the column to fill.
         value: Replacement for the missing entries.
     """
     logger.info("In MissingValue | __impute_by_value started")
     try:
         column_label = str(col)
         replacement_text = str(value)
         logger.debug("Value to replace NAN for column " + column_label + " : " + replacement_text)
         filled_column = df[col].fillna(value)
         df[col] = filled_column
     except Exception as failure:
         logger.error(str(self.errObj.handleErr(str(failure))))
     logger.info("In MissingValue | __impute_by_value finished")
예제 #7
0
 def __impute_by_mode(self, df, col):
     """Fill the missing entries of ``df[col]`` with the column's mode.

     When the column has several modes the first one returned by
     ``Series.mode()`` is used. A column with no mode at all (no
     non-missing values) is left unchanged. The column is reassigned on
     ``df`` itself.

     Any exception is passed to ``self.errObj.handleErr`` and logged; it is
     not re-raised.

     Args:
         df: Frame whose column is filled.
         col: Label of the column to fill.
     """
     logger.info("In MissingValue | __impute_by_mode started")
     try:
         column_mode = df[col].mode()
         logger.debug("Mode obtained for column " + str(col) + " : " + str(column_mode))
         if column_mode.empty:
             logger.warning("No mode available for column " + str(col) +
                            "; column left unchanged")
         else:
             # mode() returns a Series; passing it to fillna() would align
             # on index and fill only the rows whose label matches the
             # mode Series' index. Fill with the scalar value instead.
             df[col] = df[col].fillna(column_mode.iloc[0])
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In MissingValue | __impute_by_mode finished")
예제 #8
0
 def __impute_by_knn(self, df):
     """Impute ``df`` with ``fast_knn`` using a single neighbour (k=1).

     Any exception is passed to ``self.errObj.handleErr`` and logged
     instead of being raised.

     Args:
         df: Data to impute.

     Returns:
         The result of ``fast_knn``; the unmodified ``df`` when the call
         failed.
     """
     logger.info("In MissingValue | __impute_by_knn started")
     result = df
     try:
         logger.debug("Applying KNN for imputation with k=1")
         result = fast_knn(data=df, k=1)
     except Exception as failure:
         logger.error(str(self.errObj.handleErr(str(failure))))
     logger.info("In MissingValue | __impute_by_knn finished")
     return result
예제 #9
0
 def load_data(self, default_directory=DEFAULT_DIRECTORY):
     """Read ``DATA_CSV_FILENAME`` from ``default_directory`` into a frame.

     Any exception (e.g. a missing file) is passed to
     ``self.errObj.handleErr`` and logged; it is not re-raised.

     Args:
         default_directory: Directory that contains the CSV file.

     Returns:
         The frame produced by ``pd.read_csv``, or ``None`` when the file
         could not be read.
     """
     logger.info("In PreProcessor | load_data started")
     # Pre-bind the result: without it a failed read would surface as an
     # UnboundLocalError at ``return`` right after the real error is logged.
     df = None
     try:
         data_file = os.path.join(default_directory, DATA_CSV_FILENAME)
         logger.debug("In load_data | Reading Data File : " + data_file)
         df = pd.read_csv(data_file)
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In PreProcessor | load_data finished")
     return df
예제 #10
0
 def get_missing_values_info(self, df):
     """Collect the missing-value count and percentage of every column.

     Any exception is passed to ``self.errObj.handleErr`` and logged; it is
     not re-raised, so the result then covers only the columns processed
     before the failure.

     Args:
         df: Frame to inspect.

     Returns:
         Dict mapping each column label to
         ``{'count': <missing entries>, 'percentage': <missing share * 100>}``.
     """
     logger.info("In MissingValue | get_missing_values_info started")
     info = {}
     try:
         for col in df.columns:
             missing_val_count = df[col].isnull().sum()
             total_row_count = df[col].shape[0]
             # str(col): column labels need not be strings, and a TypeError
             # in the concatenation would silently end the loop early.
             logger.debug("Missing values in Column " + str(col) + " : " + str(missing_val_count))
             logger.debug("Total Entries in Column " + str(col) + " : " + str(total_row_count))
             info[col] = {
                 'count': missing_val_count,
                 'percentage': (missing_val_count / total_row_count) * 100
             }
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In MissingValue | get_missing_values_info finished")
     return info
예제 #11
0
 def get_dummies_data(self, df=None):
     """Return a copy of ``df`` with categorical columns dummy-encoded.

     Every column in ``self.categorical_cols`` is expanded with
     ``pd.get_dummies`` (prefixed with the column name); the original
     categorical columns are then removed.

     Any exception is passed to ``self.errObj.handleErr`` and logged; it is
     not re-raised, so the returned frame may be only partially encoded.

     Args:
         df: Frame to encode; defaults to ``self.data_frame_original``.

     Returns:
         The encoded frame, or ``None`` when the input could not even be
         copied.
     """
     logger.info("In DataFrameHandler | get_dummies_data started")
     if df is None:
         df = self.data_frame_original
     # Pre-bind the result so a failure in df.copy() is reported through the
     # logged error instead of a confusing UnboundLocalError at return.
     dummies_dataframe = None
     try:
         dummies_dataframe = df.copy()
         for col in self.categorical_cols:
             cat_list = pd.get_dummies(dummies_dataframe[col], prefix=col)
             dummies_dataframe = dummies_dataframe.join(cat_list)
         # Keep everything except the original (now encoded) columns.
         cols_to_keep = [
             col for col in dummies_dataframe.columns.values.tolist()
             if col not in self.categorical_cols
         ]
         dummies_dataframe = dummies_dataframe[cols_to_keep]
         logger.debug('Columns after Dummy Encoding : ' +
                      str(dummies_dataframe.columns.values))
     except Exception as exp:
         err = self.errObj.handleErr(str(exp))
         logger.error(str(err))
     logger.info("In DataFrameHandler | get_dummies_data finished")
     return dummies_dataframe
예제 #12
0
def _run_experiments(df_handler, frame, experiments):
    """Split ``frame`` once and train one model per experiment.

    Args:
        df_handler: Handler used to split the frame; it is also forwarded
            to ``run_model``, which requires it.
        frame: Pre-processed frame to split into attributes and target.
        experiments: Iterable of ``(model_name, perform_smote)`` pairs.
    """
    split_result = df_handler.split_attribute_and_target(df=frame)
    X = split_result['attributes']
    y = split_result['target']
    for model_name, perform_smote in experiments:
        print(X.head())
        run_model(X, y, model_name, df_handler, perform_smote=perform_smote)


def main():
    """Load the data and train twelve model variants.

    The variants combine dummy or label encoding of the categorical
    columns with min-max scaling, binning or no treatment of the numerical
    columns, each trained without and with SMOTE on the training set. The
    pre-processing and split calls are issued in the same order as the
    hand-written sequence this replaces.
    """
    pre_process = PreProcessor()
    df = pre_process.load_data()
    df_handler = DataFrameHandler(df)

    # Models 1-2: dummy encoding -> min-max scaling (plain, then SMOTE).
    dummies_df = df_handler.get_dummies_data()
    logger.debug(dummies_df.head())
    scaled_df = df_handler.get_scaled_data(df=dummies_df)
    _run_experiments(df_handler, scaled_df, [
        ('DummyEncoded_MinMaxScaling', False),
        ('DummyEncoded_MinMaxScaling_SMOTE', True),
    ])

    # Models 3-4: dummy encoding only. The frame is split afresh for each
    # model, exactly as before.
    _run_experiments(df_handler, dummies_df, [('DummyEncoded_SMOTE', True)])
    _run_experiments(df_handler, dummies_df, [('DummyEncoded', False)])

    # Models 5-6: dummy encoding -> binning (plain, then SMOTE). The dummy
    # frame is rebuilt first, as in the original sequence.
    dummies_df = df_handler.get_dummies_data()
    logger.debug(dummies_df.head())
    binned_df = df_handler.get_binned_data(df=dummies_df)
    _run_experiments(df_handler, binned_df, [
        ('DummyEncoded_Binning', False),
        ('DummyEncoded_Binning_SMOTE', True),
    ])

    # Models 7-8: label encoding -> min-max scaling (plain, then SMOTE).
    label_df = df_handler.get_label_encoded_data()
    scaled_df = df_handler.get_scaled_data(df=label_df)
    _run_experiments(df_handler, scaled_df, [
        ('LabelEncoded_MinMaxScaling', False),
        ('LabelEncoded_MinMaxScaling_SMOTE', True),
    ])

    # Models 9-10: label encoding only, split afresh for each model.
    _run_experiments(df_handler, label_df, [('LabelEncoded_SMOTE', True)])
    _run_experiments(df_handler, label_df, [('LabelEncoded', False)])

    # Models 11-12: label encoding -> binning (plain, then SMOTE). The
    # label-encoded frame is rebuilt first, as in the original sequence.
    label_df = df_handler.get_label_encoded_data()
    binned_df = df_handler.get_binned_data(df=label_df)
    _run_experiments(df_handler, binned_df, [
        ('LabelEncoded_Binning', False),
        ('LabelEncoded_Binning_SMOTE', True),
    ])
예제 #13
0
def _show_save_close(figure_name):
    """Show the current figure, save it as ``figure_name`` and close it."""
    # Interactive mode keeps show() non-blocking so the figure is still
    # alive when it is saved.
    plt.ion()
    plt.show()
    plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY, figure_name))
    plt.pause(1)
    plt.close()


def run_model(X, y, model_name, df_handler, perform_smote=False):
    """Train, evaluate and plot one neural-network model.

    The data is split 80/20 (``random_state=42``). The model returned by
    ``create_model`` is compiled with the ``adamax`` optimizer and
    categorical cross-entropy, then fitted for 45 epochs. Loss, accuracy
    and ROC plots are saved in ``VISUALIZATION_SAVE_DIRECTORY`` with
    ``model_name`` as file-name prefix; the test score, confusion matrix,
    classification report and AUC are printed.

    Args:
        X: Attribute frame.
        y: Target data.
        model_name: Label used in messages and plot file names.
        df_handler: Handler whose ``target_col`` is used as the prefix when
            one-hot encoding the target.
        perform_smote: When True, oversample the training set with SMOTE
            and one-hot encode the train and test targets.
    """
    logger.info('In NeuralNetworkModel | run_model Started for ' + model_name +
                ' model.')
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    if perform_smote:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
        # NOTE(review): the target is one-hot encoded only on this path,
        # while the evaluation below always reads y_test['target_yes'];
        # confirm y already carries that column when perform_smote is False.
        y_train = pd.get_dummies(y_train, prefix=df_handler.target_col)
        y_test = pd.get_dummies(y_test, prefix=df_handler.target_col)
    colnum = X_train.shape[1]
    model = create_model(colnum)
    model.compile(optimizer='adamax',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=45)
    score = model.evaluate(X_test, y_test, verbose=0)
    print(model_name + ' Model Testing Score : ' + str(score))

    # One curve per training metric: (history key, plot title, file suffix).
    for metric, title, suffix in (('loss', "Loss", '_losses'),
                                  ('accuracy', "Accuracy", '_accuracy')):
        values = history.history[metric]
        plt.plot(np.arange(0, len(values)), values)
        plt.title(title)
        plt.grid()
        _show_save_close(model_name + suffix)

    y_pred = model.predict(X_test)
    logger.debug(y_pred)
    # predict_classes()/predict_proba() are gone from current Keras. For a
    # multi-column output they were the argmax over, and the raw values
    # of, predict() - so both are derived from the single call above.
    rounded_predictions = np.argmax(y_pred, axis=-1)
    logger.debug(rounded_predictions)
    print('Confusion Matrix: ')
    print(confusion_matrix(y_test['target_yes'], rounded_predictions))
    print('Classification Report: ')
    print(classification_report(y_test['target_yes'], rounded_predictions))
    # Column 1 is used as the score of the positive ('target_yes') class.
    probs = y_pred[:, 1]
    auc = roc_auc_score(y_test['target_yes'], probs)
    print('AUC: %.2f' % auc)
    fpr, tpr, thresholds = roc_curve(y_test['target_yes'], probs)
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.grid()
    _show_save_close(model_name + '_roc_auc')
    logger.info('In NeuralNetworkModel | run_model finished')