def eda_missing_summary(self, data=None, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Draws a vertical bar chart of the percent of values missing for each feature,
            based on the summary returned by self.missing_summary. Can also display that
            summary DataFrame. When the summary is empty, prints "No nulls" and draws
            nothing.

        ---
        Parameters:
            data : Pandas DataFrame, default=None
                Pandas DataFrame containing independent variables. When None, self.data
                is used.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether the summary DataFrame is displayed in addition to the chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # fall back to the dataset stored on the instance
    dataset = self.data if data is None else data

    # per-feature missingness summary
    missing = self.missing_summary(dataset)

    # nothing to chart when no feature has missing values
    if missing.empty:
        print("No nulls")
        return

    # optionally show the summary table
    if display_df:
        display(missing)

    # slanted tick labels for a handful of features, vertical labels otherwise
    if len(missing.index) <= 5:
        rotation = 45
    else:
        rotation = 90

    # build the plotting object and its canvas
    plotter = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")
    canvas = plotter.make_canvas(
        title="Percent missing by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # draw the bars
    plotter.bar_v(
        x=missing.index,
        counts=missing["Percent missing"],
        label_rotate=rotation,
        color=color,
        y_units="p",
        x_tick_wrap=False,
        ax=canvas,
    )
def eda_missing_summary(self, training_data=True, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Draws a vertical bar chart of the percent of values missing for each feature,
            based on the summary returned by self.missing_summary, with the y-axis fixed
            to the range 0 to 100. Can also display that summary DataFrame. When the
            summary is empty, prints "No nulls" and draws nothing.

        ---
        Parameters:
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether the summary DataFrame is displayed in addition to the chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # resolve the training or validation objects; none of the three returned
    # objects is used further in this method
    _, _, _ = self.training_or_validation_dataset(training_data)

    # NOTE(review): the boolean flag, not the resolved dataset, is handed to
    # missing_summary - confirm this matches missing_summary's signature
    missing = self.missing_summary(training_data)

    # nothing to chart when no feature has missing values
    if missing.empty:
        print("No nulls")
        return

    # optionally show the summary table
    if display_df:
        display(missing)

    # slanted tick labels for a handful of features, vertical labels otherwise
    if len(missing.index) <= 5:
        rotation = 45
    else:
        rotation = 90

    # build the plotting object and its canvas
    plotter = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")
    canvas = plotter.make_canvas(
        title="Percent missing by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # draw the bars
    plotter.bar_v(
        x=missing.index,
        counts=missing["Percent missing"],
        label_rotate=rotation,
        color=color,
        y_units="p",
        x_tick_wrap=False,
        ax=canvas,
    )

    # pin the percent axis to its full range
    canvas.set_ylim([0, 100])
def eda_skew_summary(self, data=None, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Draws a vertical bar chart of the skew of each feature, based on the summary
            returned by self.skew_summary. Can also display that summary DataFrame.

        ---
        Parameters:
            data : Pandas DataFrame, default=None
                Pandas DataFrame containing independent variables. When None, self.data
                is used.
            color : str, color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether the summary DataFrame is displayed along with the chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # fall back to the dataset stored on the instance
    dataset = self.data if data is None else data

    # per-feature skew summary
    skew_table = self.skew_summary(dataset)

    # optionally show the summary table
    if display_df:
        display(skew_table)

    # slanted tick labels for a handful of features, vertical labels otherwise
    if len(skew_table.index) <= 5:
        rotation = 45
    else:
        rotation = 90

    # build the plotting object and its canvas
    plotter = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")
    canvas = plotter.make_canvas(
        title="Skew by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # draw the bars
    plotter.bar_v(
        x=skew_table.index,
        counts=skew_table["Skew"],
        label_rotate=rotation,
        color=color,
        y_units="fff",
        x_tick_wrap=False,
        ax=canvas,
    )
def eda_skew_summary(self, training_data=True, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Draws a vertical bar chart of the skew of each feature, based on the summary
            returned by self.skew_summary. Can also display that summary DataFrame.

        ---
        Parameters:
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color : str, color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether the summary DataFrame is displayed along with the chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # resolve the training or validation objects; only the dataset is needed here
    dataset, _, _ = self.training_or_validation_dataset(training_data)

    # per-feature skew summary
    skew_table = self.skew_summary(dataset)

    # optionally show the summary table
    if display_df:
        display(skew_table)

    # slanted tick labels for a handful of features, vertical labels otherwise
    if len(skew_table.index) <= 5:
        rotation = 45
    else:
        rotation = 90

    # build the plotting object and its canvas
    plotter = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")
    canvas = plotter.make_canvas(
        title="Skew by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # draw the bars
    plotter.bar_v(
        x=skew_table.index,
        counts=skew_table["Skew"],
        label_rotate=rotation,
        color=color,
        y_units="fff",
        x_tick_wrap=False,
        ax=canvas,
    )
def eda_num_target_cat_feat(self, feature, level_count_cap=50, color_map="viridis", chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a
            category feature in the context of a numeric target. Displays a feature summary
            table and a feature vs. target summary table, then draws a tree map of level
            counts, a bar chart of the level distribution and a box plot of the target by
            level. Rows where the feature is null are ignored. Nothing is produced when the
            number of unique non-null levels is not below level_count_cap.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            level_count_cap : int, default=50
                Maximum number of unique levels in feature. If the number of levels is not
                below the cap then the feature is skipped.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # unique non-null levels and their frequencies, computed once and reused below
    observed = self.data.loc[self.data[feature].notnull(), feature]
    unique_vals, unique_counts = np.unique(observed, return_counts=True)

    # skip features whose number of levels reaches the cap
    if len(unique_vals) >= level_count_cap:
        return

    ### data summaries
    ## feature summary
    # build the level / count / proportion table in one step rather than appending
    # row by row (DataFrame.append is quadratic and no longer exists in pandas 2.x)
    uni_summ_df = pd.DataFrame(
        {
            feature: unique_vals,
            "Count": unique_counts,
            "Proportion": unique_counts / np.sum(unique_counts) * 100,
        },
        columns=[feature, "Count", "Proportion"],
    )

    # sort DataFrame by "Proportion", descending
    uni_summ_df = uni_summ_df.sort_values(by=["Proportion"], ascending=False)

    # set values to int dtype where applicable
    if is_numeric_dtype(uni_summ_df[feature]):
        uni_summ_df[feature] = uni_summ_df[feature].astype("int64")
    uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")

    ## feature vs. target summary
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[self.target.name] = bi_df[self.target.name].astype(float)

    # create pivot table of target summary statistics, grouping by category feature
    bi_summ_piv_df = pd.pivot_table(
        bi_df,
        index=feature,
        aggfunc={
            self.target.name: [np.nanmin, np.nanmax, np.nanmean, np.nanmedian, np.nanstd]
        },
    )

    # flatten the (target, statistic) column MultiIndex down to the statistic name
    multi_index = bi_summ_piv_df.columns
    single_index = pd.Index([i[1] for i in multi_index.tolist()])
    bi_summ_piv_df.columns = single_index
    bi_summ_piv_df.reset_index(inplace=True)
    bi_summ_piv_df = bi_summ_piv_df.rename(
        columns={
            "nanmin": "Min",
            "nanmax": "Max",
            "nanmean": "Mean",
            "nanmedian": "Median",
            "nanstd": "StdDev",
        }
    )

    # fill nan's with zero
    fill_columns = bi_summ_piv_df.iloc[:, 1:].columns
    bi_summ_piv_df[fill_columns] = bi_summ_piv_df[fill_columns].fillna(0)

    # reorder columns
    bi_summ_piv_df = bi_summ_piv_df[[feature, "Mean", "Median", "StdDev", "Min", "Max"]]

    # convert to int
    if is_numeric_dtype(bi_summ_piv_df[feature]):
        bi_summ_piv_df[feature] = bi_summ_piv_df[feature].astype("int64")

    # display summary tables
    self.df_side_by_side(
        dfs=(uni_summ_df, bi_summ_piv_df),
        names=["Feature summary", "Feature vs. target summary"],
    )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Category counts\n* {}".format(feature), position=131, title_scale=1.0)

    # add treemap to canvas
    p.tree_map(
        counts=uni_summ_df["Count"].values,
        labels=uni_summ_df[feature].values,
        colors=style.color_gen(name=color_map, num=len(uni_summ_df[feature].values)),
        alpha=0.8,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Feature distribution\n* {}".format(feature), position=132)

    # when every level can be read as an integer, order levels (and their counts)
    # numerically; the ordering is computed once instead of re-sorting per element
    try:
        order = sorted(range(len(unique_vals)), key=lambda ix: int(unique_vals[ix]))
    except ValueError:
        # levels are not integer-like; keep the ordering produced by np.unique
        pass
    else:
        unique_vals = unique_vals[order]
        unique_counts = unique_counts[order]

        # cast the feature so the box plot sorts levels numerically as well
        bi_df[feature] = bi_df[feature].astype(int)

    # represent x-axis tick labels as integers rather than floats
    x_values = list(map(str, unique_vals.tolist()))
    try:
        x_values = [int(float(x)) for x in x_values]
    except ValueError:
        pass

    # dynamically set rotation angle based on number of unique values and the average
    # length of the tick labels. the length is measured per label; measuring the
    # string form of the whole array would also count brackets, quotes and spaces
    len_unique_val = len(unique_vals)
    avg_len_unique_val = sum(len(str(label)) for label in x_values) / len_unique_val

    if len_unique_val <= 4 and avg_len_unique_val <= 12:
        rotation = 0
    elif 5 <= len_unique_val <= 8 and avg_len_unique_val <= 7.0:
        rotation = 0
    elif 9 <= len_unique_val <= 14 and avg_len_unique_val <= 6:
        rotation = 0
    else:
        rotation = 30

    # add bar chart to canvas
    p.bar_v(
        x=x_values,
        counts=unique_counts,
        label_rotate=rotation,
        color=style.style_grey,
        y_units="f",
        x_tick_wrap=True,
        ax=ax,
    )

    # hide every other label if total number of levels is greater than 40
    if len_unique_val > 40:
        for i, tick_label in enumerate(ax.xaxis.get_ticklabels()):
            if i % 2 != 0:
                tick_label.set_visible(False)

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Boxplot by category\n* {}".format(feature), position=133)

    ## dynamically determine precision of y-units
    # capture min and max target values
    dist_min = bi_df[self.target.name].values.min()
    dist_max = bi_df[self.target.name].values.max()

    # determine y-units precision based on min and max values of the target
    if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_max / dist_min < 10:
        y_units = "fff"
    elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_max / dist_min < 3:
        y_units = "fff"
    elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_max / dist_min < 10:
        y_units = "ff"
    elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_max / dist_min < 5:
        y_units = "ff"
    else:
        y_units = "f"

    # add vertical box plot to canvas
    p.box_plot_v(
        x=feature,
        y=self.target.name,
        data=bi_df.sort_values([feature]),
        color=matplotlib.cm.get_cmap(name=color_map),
        label_rotate=rotation,
        y_units=y_units,
        ax=ax,
    )

    # hide every other label if total number of levels is greater than 40
    if len_unique_val > 40:
        for i, tick_label in enumerate(ax.xaxis.get_ticklabels()):
            if i % 2 != 0:
                tick_label.set_visible(False)

    plt.show()
from prettierplot.plotter import PrettierPlot
from prettierplot import data
import numpy as np

# load the attrition sample dataset
df = data.attrition()

# capture unique EducationField values and frequency counts, ignoring nulls
education_field = df[df["EducationField"].notnull()]["EducationField"]
unique_vals, unique_counts = np.unique(education_field, return_counts=True)

# create plotting instance
p = PrettierPlot(chart_scale=10)

# create Axes object and decorate
ax = p.make_canvas(
    title="Educational field category counts",
    y_label="Category counts",
    y_shift=0.47,
)

# add vertical bar chart of the category counts
p.bar_v(
    x=unique_vals,
    counts=unique_counts,
    label_rotate=45,
    x_tick_wrap=True,
)