def visualize_target(self, df):
    logger.info("In DataVisualisation | visualize_target started")
    try:
        target_encoder = EncoderStore.get('target')
        labels = target_encoder.classes_
        logger.debug("Target Labels : " + str(labels))
        inverse_target = target_encoder.inverse_transform(df['target'])
        target_as_no = (inverse_target == 'no').sum()
        target_as_yes = (inverse_target == 'yes').sum()
        sizes = [target_as_no, target_as_yes]
        logger.debug("Target counts : " + str(sizes))
        colors = ['lightcoral', 'yellowgreen']
        patches, texts, percent = plt.pie(sizes, colors=colors, autopct='%1.1f%%',
                                          labels=labels, startangle=90,
                                          wedgeprops={'edgecolor': 'w'})
        plt.legend(patches, labels, loc="best")
        plt.axis('equal')
        plt.tight_layout()
        plt.ion()
        plt.show()
        plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY, 'pie_visualization_target'))
        plt.pause(1)
        plt.close()
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataVisualisation | visualize_target finished")
def get_binned_data(self, df=None, bins_per_col=4):
    logger.info("In DataFrameHandler | get_binned_data started")
    if df is None:
        df = self.data_frame_original
    try:
        binned_dataframe = df.copy()
        for col in self.numerical_cols:
            bins = np.linspace(binned_dataframe[col].min(),
                               binned_dataframe[col].max(), bins_per_col + 1)
            binned_dataframe[col] = pd.cut(binned_dataframe[col], bins, precision=1,
                                           include_lowest=True, right=True)
            cat_list = pd.get_dummies(binned_dataframe[col], prefix=col)
            binned_dataframe = binned_dataframe.join(cat_list)
            binned_dataframe = binned_dataframe.drop(col, axis=1)
        logger.debug('Columns after Dummy Encoding : ' + str(binned_dataframe.columns.values))
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataFrameHandler | get_binned_data finished")
    return binned_dataframe
def create_model():
    logger.info('In RandomForestModel | create_model started')
    n_estimators = [int(x) for x in np.linspace(start=10, stop=1000, num=100)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }
    random_forest = RandomForestClassifier(random_state=42)
    rand = RandomizedSearchCV(
        random_forest, random_grid, random_state=42, n_iter=10, cv=10, n_jobs=-1,
        scoring=['recall', 'accuracy', 'neg_log_loss', 'f1', 'roc_auc'],
        refit='accuracy')
    logger.info('In RandomForestModel | create_model finished')
    return random_forest, rand
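# Usage sketch (assumption, not part of the original module): create_model() returns the
# untuned estimator together with its RandomizedSearchCV wrapper, so tuning is a single
# fit on the wrapper. The toy data below stands in for the project's real train split and
# the helper name _example_random_forest_search is hypothetical.
def _example_random_forest_search():
    from sklearn.datasets import make_classification
    X_train, y_train = make_classification(n_samples=200, n_features=10, random_state=42)
    base_model, search = create_model()   # as defined above
    search.fit(X_train, y_train)          # runs the randomized hyperparameter search
    return search.best_estimator_, search.best_params_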
def create_model():
    logger.info('In LogisticRegression | create_model started')
    hyperparameters = [{
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': np.logspace(0, 10, 100)
    }, {
        'solver': ['newton-cg', 'sag', 'lbfgs'],
        'penalty': ['l2'],
        'C': np.logspace(0, 10, 100)
    }, {
        'solver': ['saga'],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'C': np.logspace(0, 10, 100)
    }]
    logreg = LogisticRegression(solver='saga', random_state=42)
    rand = RandomizedSearchCV(
        logreg, hyperparameters, random_state=42, n_iter=10, cv=10, n_jobs=-1,
        scoring=['recall', 'accuracy', 'neg_log_loss', 'f1', 'roc_auc'],
        refit='accuracy')
    logger.info('In LogisticRegression | create_model finished')
    return logreg, rand
def impute_missing_values(self, df, missing_val_info, method='strategic'):
    logger.info("In MissingValue | impute_missing_value started")
    try:
        possible_methods = ['strategic', 'knn', 'mice']
        if method in possible_methods:
            if method == 'strategic':
                for col in df.columns:
                    if missing_val_info[col]['percentage'] > 0:
                        logger.debug('Strategically imputing column : ' + str(col))
                        column_imputation_method = COLUMN_WISE_IMPUTE_TECHNIQUE_MAP.get(col)
                        if column_imputation_method == 'mode':
                            self.__impute_by_mode(df, col)
                        elif column_imputation_method == 'mean':
                            self.__impute_by_mean(df, col)
                        elif column_imputation_method == 'median':
                            self.__impute_by_median(df, col)
                        elif column_imputation_method == 'value':
                            self.__impute_by_value(df, col, 0)
            elif method == 'knn':
                self.__impute_by_knn(df)
            elif method == 'mice':
                self.__impute_by_mice(df)
        else:
            logger.error("Incorrect Imputation Method !!! Possible values : strategic, knn, mice")
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In MissingValue | impute_missing_value finished")
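# Illustrative sketch (assumption, not the project's real configuration): the 'strategic'
# branch above expects COLUMN_WISE_IMPUTE_TECHNIQUE_MAP to map column names to one of
# 'mode', 'mean', 'median' or 'value'. The column names below are made up; missing_val_info
# would normally come from get_missing_values_info() defined later in this module set.
EXAMPLE_COLUMN_WISE_IMPUTE_TECHNIQUE_MAP = {
    'job': 'mode',        # categorical -> most frequent value
    'balance': 'median',  # skewed numeric -> median
    'duration': 'mean',   # roughly symmetric numeric -> mean
    'pdays': 'value',     # constant fill (0)
}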
def outlier_DBSCAN(self, df, numerical_cols):
    logger.info("In OutlierDetection | outlier_DBSCAN started")
    try:
        for col in numerical_cols:
            outlier_detector = DBSCAN(eps=.5, metric='euclidean', min_samples=5, n_jobs=-1)
            clusters = outlier_detector.fit_predict(df[[col]])
            cmap = cm.get_cmap('Set1')
            df.plot.scatter(x=col, y='target', c=clusters, cmap=cmap, colorbar=False)
            plt.ion()
            plt.show()
            plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                                     'outlier_visualization_dbscan' + str(col)))
            plt.pause(1)
            plt.close()
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In OutlierDetection | outlier_DBSCAN finished")
def visualize_feature_correlation_heat_map(self, df):
    logger.info("In DataVisualisation | visualize_feature_correlation_heat_map started")
    try:
        fig, ax = plt.subplots(figsize=(20, 20))
        chart = sns.heatmap(df.corr(), ax=ax, annot=True, vmin=-1, vmax=1,
                            center=0, cmap='coolwarm')
        chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
        chart.set_yticklabels(chart.get_yticklabels(), rotation=0)
        plt.ion()
        plt.show()
        plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY, 'sns_correlation_heatmap'))
        plt.pause(1)
        plt.close()
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataVisualisation | visualize_feature_correlation_heat_map finished")
def __impute_by_mice(self, df):
    logger.info("In MissingValue | __impute_by_mice started")
    try:
        df = mice(data=df)
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In MissingValue | __impute_by_mice finished")
    return df
def __impute_by_value(self, df, col, value):
    logger.info("In MissingValue | __impute_by_value started")
    try:
        logger.debug("Value to replace NAN for column " + str(col) + " : " + str(value))
        df[col] = df[col].fillna(value)
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In MissingValue | __impute_by_value finished")
def detect_categorical_columns(self, df):
    logger.info("In PreProcessor | detect_categorical_columns started")
    try:
        logger.debug("In detect_categorical_columns | " + str(df.dtypes))
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In PreProcessor | detect_categorical_columns finished")
    # np.object was removed in recent NumPy releases; the builtin object dtype is equivalent
    return df.columns[df.dtypes == object]
def __impute_by_mode(self, df, col):
    logger.info("In MissingValue | __impute_by_mode started")
    try:
        # mode() returns a Series; take the first value so fillna receives a scalar
        # instead of aligning the Series by index
        column_mode = df[col].mode()[0]
        logger.debug("Mode obtained for column " + str(col) + " : " + str(column_mode))
        df[col] = df[col].fillna(column_mode)
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In MissingValue | __impute_by_mode finished")
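# Minimal demonstration (illustration only, not project code) of why the scalar mode is
# needed above: filling with the full mode() Series aligns on row index and leaves the
# NaNs at other positions untouched, while mode()[0] fills every missing entry.
import pandas as pd

_s = pd.Series(['yes', None, 'no', None, 'yes'])
_filled = _s.fillna(_s.mode()[0])   # every NaN becomes 'yes', the most frequent value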
def __impute_by_knn(self, df):
    logger.info("In MissingValue | __impute_by_knn started")
    try:
        logger.debug("Applying KNN for imputation with k=1")
        df = fast_knn(k=1, data=df)
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In MissingValue | __impute_by_knn finished")
    return df
def create_model(colnum):
    logger.info('In NeuralNetworkModel | create_model Started')
    model = Sequential()
    # input_shape is only needed on the first layer; subsequent layers infer their input size
    model.add(Dense(150, activation='relu', input_shape=(colnum, )))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    print(model.summary())
    logger.info('In NeuralNetworkModel | create_model finished')
    return model
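# Usage sketch (assumption, not from the original project): the 2-unit softmax output
# implies one-hot encoded targets and a categorical cross-entropy loss. The random data
# below is a stand-in; colnum matches the number of input features.
def _example_train_neural_network():
    import numpy as np
    from keras.utils import to_categorical
    colnum = 20
    X_train = np.random.rand(100, colnum)
    y_train = to_categorical(np.random.randint(0, 2, size=100), num_classes=2)
    model = create_model(colnum)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=16, verbose=0)
    return model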
def create_model():
    logger.info('In SVMModel | create_model started')
    C = np.logspace(0, 10, 100)
    kernel = ['linear', 'poly', 'rbf', 'sigmoid']
    gamma = ['scale', 'auto']
    hyperparameters = dict(C=C, kernel=kernel, gamma=gamma)
    svc_classifier = SVC(random_state=42, probability=True, cache_size=2000, tol=0.1)
    rand = RandomizedSearchCV(svc_classifier, hyperparameters, random_state=42,
                              n_iter=100, cv=10, n_jobs=-1,
                              scoring=['recall', 'accuracy'], refit='accuracy')
    logger.info('In SVMModel | create_model finished')
    return svc_classifier, rand
def apply_min_max_scaling(self, df):
    logger.info("In PreProcessor | apply_min_max_scaling started")
    try:
        scaler = MinMaxScaler()
        for col in self.get_numeric_cols(df):
            df[col] = scaler.fit_transform(df[[col]])
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In PreProcessor | apply_min_max_scaling finished")
    return df
def load_data(self, default_directory=DEFAULT_DIRECTORY):
    logger.info("In PreProcessor | load_data started")
    try:
        data_file = os.path.join(default_directory, DATA_CSV_FILENAME)
        logger.debug("In load_data | Reading Data File : " + data_file)
        df = pd.read_csv(data_file)
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In PreProcessor | load_data finished")
    return df
def convert_numerical_to_Categorical(self, df, eligible_cols):
    try:
        logger.info("In FeatureEngineering | convert_numerical_to_Categorical started")
        d1 = df.copy()
        # Minimal conversion: cast each eligible numerical column to a pandas categorical dtype
        for col in eligible_cols:
            d1[col] = d1[col].astype('category')
        logger.info("In FeatureEngineering | convert_numerical_to_Categorical finished")
        return d1
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
def visualize_missing_value_heatmap(self, df):
    logger.info("In MissingValue | visualize_missing_value_heatmap started")
    try:
        msno.heatmap(df)
        plt.ion()
        plt.show()
        plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY, 'correlation_heatmap.png'))
        plt.pause(1)
        plt.close()
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In MissingValue | visualize_missing_value_heatmap finished")
def split_attribute_and_target(self, df=None):
    logger.info("In DataFrameHandler | split_attribute_and_target started")
    if df is None:
        df = self.data_frame_original
    try:
        target = df[self.target_col]
        attribute_set = df.drop(self.target_col, axis=1)
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataFrameHandler | split_attribute_and_target finished")
    return {'attributes': attribute_set, 'target': target}
def get_scaled_data(self, df=None):
    logger.info("In DataFrameHandler | get_scaled_data started")
    if df is None:
        df = self.data_frame_original
    try:
        scaled_dataframe = df.copy()
        scaled_dataframe[self.numerical_cols] = self.scaler.fit_transform(
            scaled_dataframe[self.numerical_cols])
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataFrameHandler | get_scaled_data finished")
    return scaled_dataframe
def get_label_encoded_data(self, df=None):
    logger.info("In DataFrameHandler | get_label_encoded_data started")
    if df is None:
        df = self.data_frame_original
    try:
        label_encoded_dataframe = df.copy()
        for col in self.categorical_cols:
            label_encoded_dataframe[col] = self.labelEncoder.fit_transform(
                label_encoded_dataframe[col])
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataFrameHandler | get_label_encoded_data finished")
    return label_encoded_dataframe
def visualize_job_vs_target(self, df):
    logger.info("In DataVisualisation | visualize_job_vs_target started")
    try:
        target_encoder = EncoderStore.get('target')
        inverse_target = target_encoder.inverse_transform(df['target'])
        job_encoder = EncoderStore.get('job')
        job_labels = job_encoder.classes_
        inverse_job = job_encoder.inverse_transform(df['job'])
        sizes_not_paid = []
        sizes_paid = []
        for label in job_labels:
            job_label_and_target_no = ((inverse_job == label) & (inverse_target == 'no')).sum()
            job_label_and_target_yes = ((inverse_job == label) & (inverse_target == 'yes')).sum()
            sizes_not_paid.append(job_label_and_target_no)
            sizes_paid.append(job_label_and_target_yes)
        colors = [
            "aqua", "azure", "brown", "chartreuse", "coral", "crimson",
            "cyan", "fuchsia", "goldenrod", "lavender", "purple", "teal"
        ]
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 20))
        ax1.pie(sizes_not_paid, autopct='%1.1f%%', labels=job_labels, startangle=90,
                colors=colors, wedgeprops={'edgecolor': 'w'})
        ax1.axis('equal')
        ax1.set_title('Target: no')
        ax2.pie(sizes_paid, autopct='%1.1f%%', labels=job_labels, startangle=90,
                colors=colors, wedgeprops={'edgecolor': 'w'})
        ax2.set_title('Target: yes')
        ax2.axis('equal')
        plt.tight_layout()
        plt.ion()
        plt.show()
        plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY, 'pie_visualization_job_vs_target'))
        plt.pause(1)
        plt.close()
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataVisualisation | visualize_job_vs_target finished")
def convert_to_categorical_values(self, df, cat_cols, use_label_encoder=False):
    logger.info("In PreProcessor | convert_to_categorical_values started")
    try:
        if use_label_encoder:
            for col in cat_cols:
                if col in COLUMNS_CATEGORIZATION_APPLICABLE:
                    logger.debug('Categorizing Column : ' + str(col))
                    encoder = LabelEncoder()
                    logger.debug('Column unique value : ' + str(df[col].unique()))
                    df[col] = encoder.fit_transform(df[col])
                    EncoderStore.save(col, encoder)
        else:
            one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
            for col in cat_cols:
                enc_df = pd.DataFrame(one_hot_encoder.fit_transform(df[[col]]))
                enc_df.columns = one_hot_encoder.get_feature_names([col])
                df = df.join(enc_df)
                df = df.drop(col, axis=1)
            logger.info('Columns in dataframe after one hot encoding: ' + str(df.columns))
            logger.info('Shape of dataframe after one hot encoding: ' + str(df.shape))
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In PreProcessor | convert_to_categorical_values finished")
    return df
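# Minimal illustration (toy data, not project code) of the one-hot branch above: each
# categorical column is expanded into indicator columns, joined back, and the original
# column dropped. Note that on scikit-learn >= 1.2 the equivalent calls are
# OneHotEncoder(sparse_output=False) and get_feature_names_out(), which this sketch uses.
def _example_one_hot_pattern():
    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder
    toy = pd.DataFrame({'job': ['admin', 'technician', 'admin'], 'age': [30, 41, 35]})
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    enc = pd.DataFrame(ohe.fit_transform(toy[['job']]),
                       columns=ohe.get_feature_names_out(['job']))
    toy = toy.join(enc).drop('job', axis=1)
    return toy   # columns: age, job_admin, job_technician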
def create_category_percent(self, data, categorical_cols):
    try:
        logger.info("In FeatureEngineering | create_category_percent started")
        d1 = data.copy()
        length = len(d1)
        for col in categorical_cols:
            d1[col + 'Pct'] = (d1[col].groupby(d1[col]).transform('count')) * 100 / length
        logger.info("In FeatureEngineering | create_category_percent finished")
        return d1
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
def create_bin(self, data, numerical_cols, number_of_bins=4):
    try:
        logger.info("In FeatureEngineering | create_bin started")
        d1 = data.copy()
        for col in numerical_cols:
            bins = np.linspace(d1[col].min(), d1[col].max(), number_of_bins)
            d1[col + '_bin'] = pd.cut(d1[col], bins, precision=1,
                                      include_lowest=True, right=True)
        logger.info("In FeatureEngineering | create_bin finished")
        return d1
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
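# Tiny illustration (toy data, not project code) of the binning above. Note that
# np.linspace(min, max, number_of_bins) yields number_of_bins edges, i.e.
# number_of_bins - 1 intervals, whereas get_binned_data() uses bins_per_col + 1 edges
# to obtain bins_per_col intervals.
import numpy as np
import pandas as pd

_ages = pd.Series([18, 25, 33, 47, 60])
_edges = np.linspace(_ages.min(), _ages.max(), 4)   # 4 edges -> 3 intervals
_binned = pd.cut(_ages, _edges, precision=1, include_lowest=True, right=True)
# _binned holds 3 ordered interval categories covering the 18-60 range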
def visualize_duration_vs_target(self, df):
    logger.info("In DataVisualisation | visualize_duration_vs_target started")
    try:
        target_encoder = EncoderStore.get('target')
        bar_labels = target_encoder.classes_
        inverse_target = target_encoder.inverse_transform(df['target'])
        target_as_no = (inverse_target == 'no').sum()
        target_as_yes = (inverse_target == 'yes').sum()
        duration_gt_180_and_target_no = ((df['duration'] > 180) & (inverse_target == 'no')).sum()
        duration_lte_180_and_target_no = ((df['duration'] <= 180) & (inverse_target == 'no')).sum()
        duration_gt_180_and_target_yes = ((df['duration'] > 180) & (inverse_target == 'yes')).sum()
        duration_lte_180_and_target_yes = ((df['duration'] <= 180) & (inverse_target == 'yes')).sum()
        x_labels = ['duration>180', 'duration<=180']
        x = np.arange(2)
        ax = plt.subplot(1, 1, 1)
        w = 0.3
        not_paid = [duration_gt_180_and_target_no / target_as_no,
                    duration_lte_180_and_target_no / target_as_no]
        paid = [duration_gt_180_and_target_yes / target_as_yes,
                duration_lte_180_and_target_yes / target_as_yes]
        plt.xticks(x + w / 2, x_labels)
        not_paid_bar = ax.bar(x, not_paid, color="lightcoral", width=w)
        paid_bar = ax.bar(x + w, paid, color="yellowgreen", width=w)
        plt.legend([not_paid_bar, paid_bar], bar_labels)
        plt.ion()
        plt.show()
        plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                                 'bar_visualization_duration_vs_target'))
        plt.pause(1)
        plt.close()
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataVisualisation | visualize_duration_vs_target finished")
def visualize_marital_status_vs_target(self, df):
    logger.info("In DataVisualisation | visualize_marital_status_vs_target started")
    try:
        target_encoder = EncoderStore.get('target')
        bar_labels = target_encoder.classes_
        inverse_target = target_encoder.inverse_transform(df['target'])
        target_as_no = (inverse_target == 'no').sum()
        target_as_yes = (inverse_target == 'yes').sum()
        marital_status_encoder = EncoderStore.get('marital')
        inverse_marital_status = marital_status_encoder.inverse_transform(df['marital'])
        x_labels = marital_status_encoder.classes_
        not_paid = []
        paid = []
        for stat in x_labels:
            marital_stat_and_target_no = ((inverse_marital_status == stat) &
                                          (inverse_target == 'no')).sum()
            marital_stat_and_target_yes = ((inverse_marital_status == stat) &
                                           (inverse_target == 'yes')).sum()
            not_paid.append(marital_stat_and_target_no / target_as_no)
            paid.append(marital_stat_and_target_yes / target_as_yes)
        x = np.arange(3)
        ax = plt.subplot(1, 1, 1)
        w = 0.3
        plt.xticks(x + w / 2, x_labels)
        not_paid_bar = ax.bar(x, not_paid, color="lightcoral", width=w)
        paid_bar = ax.bar(x + w, paid, color="yellowgreen", width=w)
        plt.legend([not_paid_bar, paid_bar], bar_labels)
        plt.ion()
        plt.show()
        plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY,
                                 'bar_visualization_marital_status_vs_target'))
        plt.pause(1)
        plt.close()
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataVisualisation | visualize_marital_status_vs_target finished")
def get_missing_values_info(self, df):
    logger.info("In MissingValue | get_missing_values_info started")
    info = {}
    try:
        for col in df.columns:
            missing_val_count = df[col].isnull().sum()
            total_row_count = df[col].shape[0]
            logger.debug("Missing values in Column " + col + " : " + str(missing_val_count))
            logger.debug("Total Entries in Column " + col + " : " + str(total_row_count))
            info[col] = {
                'count': missing_val_count,
                'percentage': (missing_val_count / total_row_count) * 100
            }
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In MissingValue | get_missing_values_info finished")
    return info
def visualize_outlier(self, df):
    logger.info("In OutlierDetection | visualize_outlier started")
    try:
        chart = boxplot(x='variable', y='value', data=pd.melt(df),
                        width=0.5, palette="colorblind")
        chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
        plt.ion()
        plt.show()
        plt.savefig(os.path.join(VISUALIZATION_SAVE_DIRECTORY, 'outlier_visualization'))
        plt.pause(1)
        plt.close()
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In OutlierDetection | visualize_outlier finished")
def get_dummies_data(self, df=None):
    logger.info("In DataFrameHandler | get_dummies_data started")
    if df is None:
        df = self.data_frame_original
    try:
        dummies_dataframe = df.copy()
        for col in self.categorical_cols:
            cat_list = pd.get_dummies(dummies_dataframe[col], prefix=col)
            dummies_dataframe = dummies_dataframe.join(cat_list)
        all_dummies_cols = dummies_dataframe.columns.values.tolist()
        cols_to_keep = [col for col in all_dummies_cols if col not in self.categorical_cols]
        dummies_dataframe = dummies_dataframe[cols_to_keep]
        logger.debug('Columns after Dummy Encoding : ' + str(dummies_dataframe.columns.values))
    except Exception as exp:
        err = self.errObj.handleErr(str(exp))
        logger.error(str(err))
    logger.info("In DataFrameHandler | get_dummies_data finished")
    return dummies_dataframe