def eda_cat_target_cat_feat(self, feature, level_count_cap=50, color_map="viridis", legend_labels=None,
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a category
            feature in the context of a categorical target. Displays summary tables (feature
            level counts, level-by-class counts, class proportions within each level and, when
            the feature has exactly two levels, a two-proportion z-test) and then renders a
            three-panel figure: tree map, faceted count plot and stacked proportion bars.

        ---
        Parameters:
            feature : str
                Feature to visualize. Must be a column of self.data.
            level_count_cap : int, default=50
                Cap on the number of unique non-null levels in feature. Output is only produced
                when the number of levels is strictly less than this value.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            legend_labels : list, default=None
                Class labels displayed in the summary tables and plot legends. If None, the raw
                class values found in the target are used.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.

        ---
        Returns:
            None. Tables are displayed through self.df_side_by_side and the figure through
            plt.show().
    """
    # unique non-null levels and their frequencies drive every summary below
    feature_values = self.data[self.data[feature].notnull()][feature]
    unique_vals, unique_counts = np.unique(feature_values.values, return_counts=True)

    # nothing to summarize when every value is null; skip features at or above the level cap
    if len(unique_vals) == 0 or len(unique_vals) >= level_count_cap:
        return

    ### data summaries
    ## feature summary
    # build the table in one step rather than appending row by row
    uni_summ_df = pd.DataFrame(
        {
            feature: unique_vals,
            "Count": unique_counts,
            "Proportion": unique_counts / np.sum(unique_counts) * 100,
        },
        columns=[feature, "Count", "Proportion"],
    )

    # sort DataFrame by "Proportion", descending
    uni_summ_df = uni_summ_df.sort_values(by=["Proportion"], ascending=False)

    # set values to int dtype where applicable to optimize displayed DataFrame
    uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")
    if is_numeric_dtype(uni_summ_df[feature]):
        uni_summ_df[feature] = uni_summ_df[feature].astype("int64")

    ## feature vs. target summary
    # combine feature column and target, and remove any rows where the feature is null
    bi_df = pd.concat([self.data[feature], self.target], axis=1)
    bi_df = bi_df[bi_df[feature].notnull()]

    # count the occurrences of each target class within each feature level
    bi_summ_df = (
        bi_df.groupby([feature] + [self.target.name])
        .size()
        .reset_index()
        .pivot(columns=self.target.name, index=feature, values=0)
    )

    # overwrite column labels with actual class labels if provided
    bi_summ_df.columns = (
        pd.Index(legend_labels)
        if legend_labels is not None
        else pd.Index(bi_summ_df.columns.tolist())
    )
    bi_summ_df.reset_index(inplace=True)

    # fill nan's with zero in every class column; column 0 holds the feature levels
    class_columns = bi_summ_df.columns[1:]
    bi_summ_df[class_columns] = bi_summ_df[class_columns].fillna(0)

    # set values to int dtype where applicable to optimize displayed DataFrame
    for column in bi_summ_df.columns:
        try:
            bi_summ_df[column] = bi_summ_df[column].astype(int)
        except ValueError:
            # non-numeric column (e.g. string levels); leave untouched
            pass

    ## proportion by category summary
    # NOTE(review): the aggregation below is kept exactly as written because its result depends
    # on pandas groupby/agg/apply semantics - confirm against the pandas version in use
    # calculate percent of 100 by class label within each feature level
    prop_df = bi_df.groupby([feature, self.target.name]).agg({self.target.name: {"count"}})
    prop_df = prop_df.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
    prop_df = prop_df.reset_index()

    # flatten the column MultiIndex and name the value column "Count"
    flat_columns = [i[0] for i in prop_df.columns.tolist()]
    flat_columns[-1] = "Count"
    prop_df.columns = flat_columns
    prop_df = prop_df.reset_index(drop=True)

    # reshape so that rows are target classes and columns are feature levels
    prop_df = pd.pivot_table(
        prop_df,
        values=["Count"],
        columns=[feature],
        index=[self.target.name],
        aggfunc={"Count": np.mean},
    )
    prop_df = prop_df.reset_index(drop=True)

    # flatten the column MultiIndex again, using integer level labels where possible
    level_columns = []
    for column in prop_df.columns.tolist():
        try:
            level_columns.append(int(column[1]))
        except ValueError:
            level_columns.append(column[1])
    prop_df.columns = level_columns
    prop_df = prop_df.reset_index(drop=True)

    # insert column with actual class labels if provided, otherwise use raw class labels in target
    prop_df.insert(
        loc=0,
        column="Class",
        value=legend_labels if legend_labels is not None else np.unique(self.target),
    )

    # fill nan's with zero
    prop_df = prop_df.fillna(0)

    ## statistical test and summary table display
    summary_tables = [uni_summ_df, bi_summ_df, prop_df]
    summary_names = ["Feature summary", "Feature vs. target summary", "Target proportion"]

    # if the feature has exactly two levels, compare the share of positive (target == 1)
    # observations between the two levels with a z-test
    levels = np.unique(bi_df[feature])
    if len(levels) == 2:
        is_positive = bi_df[self.target.name] == 1
        level_masks = [bi_df[feature] == level for level in levels]

        # total observations and total positive observations per level
        total_obs1, total_obs2 = (int(mask.sum()) for mask in level_masks)
        pos_obs1, pos_obs2 = (int((mask & is_positive).sum()) for mask in level_masks)

        # perform z-test, return z-statistic and p-value
        z, p_val = proportions_ztest(count=(pos_obs1, pos_obs2), nobs=(total_obs1, total_obs2))

        # add z-statistic and p-value to DataFrame
        stat_test_df = pd.DataFrame(
            data=[{"z-test statistic": z, "p-value": p_val}],
            columns=["z-test statistic", "p-value"],
            index=[feature],
        ).round(4)

        summary_tables.append(stat_test_df)
        summary_names.append("Statistical test")

    # display summary tables
    self.df_side_by_side(dfs=tuple(summary_tables), names=summary_names)

    if "percent_positive" in bi_summ_df:
        bi_summ_df = bi_summ_df.drop(["percent_positive"], axis=1)

    ### visualizations
    # set label rotation angle based on the number of levels and their average label length
    len_unique_val = len(unique_vals)
    avg_len_unique_val = sum(len(str(val)) for val in unique_vals) / len_unique_val
    if len_unique_val <= 4 and avg_len_unique_val <= 12:
        rotation = 0
    elif 5 <= len_unique_val <= 8 and avg_len_unique_val <= 8:
        rotation = 0
    elif 9 <= len_unique_val <= 14 and avg_len_unique_val <= 4:
        rotation = 0
    else:
        rotation = 90

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Category counts\n* {}".format(feature), position=131, title_scale=0.82)

    # add treemap to canvas
    p.tree_map(
        counts=uni_summ_df["Count"].values,
        labels=uni_summ_df[feature].values,
        colors=style.color_gen(name=color_map, num=len(uni_summ_df[feature].values)),
        alpha=0.8,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Category counts by target\n* {}".format(feature), position=132)

    # add faceted categorical plot to canvas
    p.facet_cat(
        df=bi_summ_df,
        feature=feature,
        label_rotate=rotation,
        color_map=color_map,
        bbox=(1.0, 1.15),
        alpha=0.8,
        legend_labels=legend_labels,
        x_units=None,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Target proportion by category\n* {}".format(feature), position=133)

    # add stacked bar chart to canvas
    p.stacked_bar_h(
        df=prop_df.drop("Class", axis=1),
        bbox=(1.0, 1.15),
        legend_labels=legend_labels,
        color_map=color_map,
        alpha=0.8,
        ax=ax,
    )
    plt.show()
def pair_plot(self, df, columns=None, target=None, diag_kind="auto", legend_labels=None, drop_na=True,
              bbox=(2.0, 1.0), alpha=0.7, color_map="viridis"):
    """
    Documentation:

        ---
        Description:
            Create pair plot that produces a grid of scatter plots for all unique pairs of
            number features and a series of KDE or histogram plots along the diagonal.

        ---
        Parameters:
            df : Pandas DataFrame
                Pandas DataFrame containing data of interest.
            columns : list, default=None
                List of strings describing columns in Pandas DataFrame to be visualized. If None,
                all columns are visualized.
            target : Pandas Series, default=None
                Introduce third dimension to scatter plots through a color hue that differentiates
                dots based on the category value. The Series is merged onto df by index, and its
                name is used as the hue column.
            diag_kind : str, default="auto"
                Type of plot created along diagonal. Passed through to seaborn's pairplot.
            legend_labels : list, default=None
                List containing strings of custom labels to display in legend. If None, the unique
                non-null target values are used. Only used when target is provided.
            drop_na : boolean, default=True
                Controls whether rows with null values are dropped.
            bbox : tuple of floats, default=(2.0, 1.0)
                Coordinates for determining legend position.
            alpha : float, default=0.7
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots and to the custom legend.
    """
    # custom plot formatting settings for this particular chart.
    rc_overrides = {
        "axes.titlesize": 3.5 * self.chart_scale,
        "axes.labelsize": 1.5 * self.chart_scale,  # axis title font size
        "xtick.labelsize": 1.2 * self.chart_scale,
        "xtick.major.size": 0.5 * self.chart_scale,
        "xtick.major.width": 0.05 * self.chart_scale,
        "xtick.color": style.style_grey,
        "ytick.labelsize": 1.2 * self.chart_scale,
        "ytick.major.size": 0.5 * self.chart_scale,
        "ytick.major.width": 0.05 * self.chart_scale,
        "ytick.color": style.style_grey,
        "figure.facecolor": style.style_white,
        "axes.facecolor": style.style_white,
        "axes.spines.left": False,
        "axes.spines.bottom": False,
        "axes.edgecolor": style.style_grey,
        "axes.grid": False,
    }

    with plt.rc_context(rc_overrides):

        # optionally drop rows with nulls
        if drop_na:
            df = df.dropna()

        # optionally limit to a subset of columns
        if columns is not None:
            df = df[columns]

        # settings that depend on whether a hue is requested
        if target is None:
            plot_data = df
            plot_vars = df.columns
            hue = None
            palette = None
        else:
            # merge df with target on the index
            df = df.merge(target, left_index=True, right_index=True)

            # nulls in the target column are dropped regardless of drop_na
            plot_data = df.dropna()

            # compare by value; identity comparison of labels is not reliable
            plot_vars = [col for col in df.columns if col != target.name]
            hue = target.name
            palette = sns.color_palette(
                style.color_gen(color_map, num=len(np.unique(target)))
            )

        # create pair plot
        g = sns.pairplot(
            data=plot_data,
            vars=plot_vars,
            hue=hue,
            diag_kind=diag_kind,
            height=0.2 * self.chart_scale,
            plot_kws={
                "s": 2.0 * self.chart_scale,
                "edgecolor": None,
                "linewidth": 1,
                "alpha": alpha,
                "marker": "o",
                "facecolor": style.style_grey if target is None else None,
            },
            diag_kws={
                "facecolor": style.style_grey if target is None else style.style_white,
                "linewidth": 2,
            },
            palette=palette,
        )

        # plot formatting: wrap, rotate, pad and recolor each axis label
        for ax in g.axes.flat:
            ax.set_xlabel(
                "\n".join(textwrap.wrap(str(ax.get_xlabel()).replace("_", " "), 12)),
                rotation=40,
                ha="right",
            )
            ax.set_ylabel(
                "\n".join(textwrap.wrap(str(ax.get_ylabel()).replace("_", " "), 12)),
                rotation=40,
                ha="right",
            )
            ax.xaxis.labelpad = 20
            ax.yaxis.labelpad = 40
            ax.xaxis.label.set_color(style.style_grey)
            ax.yaxis.label.set_color(style.style_grey)

        # adjust subplot relative positioning
        plt.subplots_adjust(hspace=0.0, wspace=0.0)

        # add custom legend describing hue labels
        if target is not None:

            # replace the legend seaborn generated, when there is one
            if getattr(g, "_legend", None) is not None:
                g._legend.remove()

            ## create custom legend
            # create labels
            if legend_labels is None:
                legend_labels = np.unique(df[df[target.name].notnull()][target.name])
            else:
                legend_labels = np.array(legend_labels)

            # generate colors from the same color map used for the dots
            color_list = style.color_gen(color_map, num=len(legend_labels))
            label_color = dict(zip(legend_labels, color_list))

            # create legend Patches
            patches = [Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()]

            # draw legend
            leg = plt.legend(
                handles=patches,
                fontsize=0.6 * self.chart_scale * np.log1p(len(g.axes.flat)),
                loc="upper right",
                markerscale=0.15 * self.chart_scale * np.log1p(len(g.axes.flat)),
                ncol=1,
                bbox_to_anchor=bbox,
            )

            # label font color
            for text in leg.get_texts():
                plt.setp(text, color="grey")
def binary_classification_panel(self, model, X_train, y_train, X_valid=None, y_valid=None, labels=None,
                                n_folds=None, title_scale=1.0, color_map="viridis", random_state=1,
                                chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Generate a panel of reports and visualizations summarizing the performance of a
            classification model. A classification report, confusion matrix and ROC curve are
            always produced for the training data. The same panel is then produced either for
            the validation data, if provided, or for each cross-validation fold, if n_folds is
            an integer. The model is refit in place for every panel.

        ---
        Parameters:
            model : model object
                Instantiated model object. Must expose fit, predict, estimator_name and
                model_iter.
            X_train : Pandas DataFrame
                Training data observations.
            y_train : Pandas Series
                Training target data.
            X_valid : Pandas DataFrame, default=None
                Validation data observations.
            y_valid : Pandas Series, default=None
                Validation target data. Required when X_valid is provided.
            labels : list, default=None
                Custom labels for the classification report and confusion matrix axes. If left
                as None, the unique values of y_train are used.
            n_folds : int, default=None
                Number of cross-validation folds to use. If validation data is provided through
                X_valid/y_valid, n_folds is ignored.
            title_scale : float, default=1.0
                Controls the scaling up (higher value) and scaling down (lower value) of the size
                of the main chart title, the x_axis title and the y_axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            random_state : int, default=1
                Random number seed used when generating cross-validation folds.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.

        ---
        Raises:
            ValueError
                If X_valid is provided without y_valid.
    """
    if X_valid is not None and y_valid is None:
        raise ValueError("y_valid must be provided when X_valid is provided.")

    # raw class labels for the confusion matrix axes
    class_labels = labels if labels is not None else np.unique(y_train.values)

    # classification_report measures the length of each target name, so names must be strings
    report_names = [str(class_label) for class_label in class_labels]

    def _render_panel(section, X_eval, y_eval, y_pred, roc_kwargs):
        # print the report, then draw the confusion matrix and ROC curve for one data set
        print(classification_report(y_eval, y_pred, target_names=report_names))

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Confusion matrix - {}\nModel: {}\nParameter set: {}".format(
                section, model.estimator_name, model.model_iter
            ),
            y_shift=0.4,
            x_shift=0.25,
            position=121,
            title_scale=title_scale,
        )

        # add confusion matrix to canvas
        # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 - confirm the
        # pinned scikit-learn version before upgrading
        plot_confusion_matrix(
            estimator=model,
            X=X_eval,
            y_true=y_eval,
            display_labels=class_labels,
            cmap=color_map,
            values_format=".0f",
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="ROC curve - {}\nModel: {}\nParameter set: {}".format(
                section, model.estimator_name, model.model_iter
            ),
            x_label="False positive rate",
            y_label="True positive rate",
            y_shift=0.35,
            position=122,
            title_scale=title_scale,
        )

        # add ROC curve to canvas
        p.roc_curve_plot(model=model, linecolor=style.style_grey, ax=ax, **roc_kwargs)

        plt.subplots_adjust(wspace=0.3)
        plt.show()

    print("*" * 55)
    print("* Estimator: {}".format(model.estimator_name))
    print("* Parameter set: {}".format(model.model_iter))
    print("*" * 55)

    print("\n" + "*" * 55)
    print("Training data evaluation\n")

    ## training panel
    # fit model on training data and generate predictions using training data
    y_pred = model.fit(X_train, y_train).predict(X_train)
    _render_panel(
        "training data",
        X_train,
        y_train,
        y_pred,
        {"X_train": X_train, "y_train": y_train},
    )

    ## validation panel
    # if validation data is provided
    if X_valid is not None:
        print("\n" + "*" * 55)
        print("Validation data evaluation\n")

        # fit model on training data and generate predictions using validation data
        y_pred = model.fit(X_train, y_train).predict(X_valid)
        _render_panel(
            "validation data",
            X_valid,
            y_valid,
            y_pred,
            {"X_train": X_train, "y_train": y_train, "X_valid": X_valid, "y_valid": y_valid},
        )

    ## cross-validation panels
    # if n_folds are provided, indicating cross-validation
    elif isinstance(n_folds, int):
        print("\n" + "*" * 55)
        print("Cross validation evaluation\n")

        # generate cross-validation indices
        cv = list(
            StratifiedKFold(
                n_splits=n_folds, shuffle=True, random_state=random_state
            ).split(X_train, y_train)
        )

        # iterate through cross-validation indices
        for i, (train_ix, valid_ix) in enumerate(cv):
            print("\n" + "*" * 55)
            print("CV Fold {}\n".format(i + 1))

            X_train_cv = X_train.iloc[train_ix]
            y_train_cv = y_train.iloc[train_ix]
            X_valid_cv = X_train.iloc[valid_ix]
            y_valid_cv = y_train.iloc[valid_ix]

            # fit model on fold training data and generate predictions using holdout observations
            y_pred = model.fit(X_train_cv, y_train_cv).predict(X_valid_cv)
            _render_panel(
                "CV Fold {}".format(i + 1),
                X_valid_cv,
                y_valid_cv,
                y_pred,
                {
                    "X_train": X_train_cv,
                    "y_train": y_train_cv,
                    "X_valid": X_valid_cv,
                    "y_valid": y_valid_cv,
                },
            )
def scatter_2d_hue(self, x, y, target, label, df=None, x_units="f", x_ticks=None, y_units="f", y_ticks=None,
                   plot_buffer=True, size=10, axis_limits=True, color=style.style_grey, facecolor="w",
                   bbox=(1.2, 0.9), color_map="viridis", alpha=0.8, x_rotate=None, ax=None):
    """
    Documentation:

        ---
        Description:
            Create 2-dimensional scatter plot with a third dimension represented as a color hue
            in the scatter dots.

        ---
        Parameters:
            x : array or string
                Either 1-dimensional array of values or a column name in a Pandas DataFrame.
            y : array or string
                Either 1-dimensional array of values or a column name in a Pandas DataFrame.
            target : array or string
                Either 1-dimensional array of values or a column name in a Pandas DataFrame.
            label : list
                Labels corresponding to color hue, matched in order against the sorted unique
                target values. If None, the raw target values are used and no legend is drawn.
            df : Pandas DataFrame, default=None
                Pandas DataFrame containing data to plot. Can be any size - plotted columns will be
                chosen by columns names specified in x, y and target parameters.
            x_units : str, default='f'
                Determines unit of measurement for x-axis tick labels. 'f' displays float. 'p'
                displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places.
            x_ticks : array, default=None
                Custom x-tick labels.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 'f' displays float. 'p'
                displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places.
            y_ticks : array, default=None
                Custom y-tick labels.
            plot_buffer : bool, default=True
                Controls whether dynamic plot buffer function is executed.
            size : int or float, default=10
                Size of scattered dots.
            axis_limits : bool, default=True
                Controls whether dynamic axis limit setting function is executed.
            color : str (color code of some sort), default=style.style_grey
                Not used by this function; dot colors are generated from color_map. Retained so
                existing callers keep working.
            facecolor : str (color code of some sort), default='w'
                Face color of scattered dots.
            bbox : tuple of floats, default=(1.2, 0.9)
                Coordinates for determining legend position.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            x_rotate : int, default=None
                Rotates x-axis tick mark labels x degrees.
            ax : axes object, default=None
                Axis object for the visualization. If None, self.ax is used.
    """
    if ax is None:
        ax = self.ax

    # if a Pandas DataFrame is passed to function, pull the x, y and target arrays from the
    # columns named in the arguments
    if df is not None:
        x = df[x].values
        y = df[y].values
        target = df[target].values

    # flat arrays kept separate so that each keeps its own dtype
    x_vals = np.ravel(np.asarray(x))
    y_vals = np.ravel(np.asarray(y))
    target_vals = np.ravel(np.asarray(target))

    # unique target values
    target_ids = np.unique(target_vals)

    # generate color list
    color_list = style.color_gen(name=color_map, num=len(target_ids))

    # fall back to the raw target values when no labels are supplied
    hue_labels = target_ids if label is None else label

    # loop through sets of target values, labels and colors to create 2_d scatter with hue
    for target_id, target_name, hue_color in zip(target_ids, hue_labels, color_list):
        is_class = target_vals == target_id
        ax.scatter(
            x=x_vals[is_class],
            y=y_vals[is_class],
            color=hue_color,
            label=target_name,
            s=size * self.chart_scale,
            alpha=alpha,
            facecolor=facecolor,
            linewidth=0.234 * self.chart_scale,
        )

    # add legend to figure
    if label is not None:
        ax.legend(
            loc="upper right",
            bbox_to_anchor=bbox,
            ncol=1,
            frameon=True,
            fontsize=1.1 * self.chart_scale,
        )

    # optionally set axis lower / upper limits
    if axis_limits:
        x_min, x_max, y_min, y_max = util.util_set_axes(x=x, y=y)
        ax.axis([x_min, x_max, y_min, y_max])

    # optionally create smaller buffer around plot area to prevent cutting off elements
    if plot_buffer:
        util.util_plot_buffer(ax=ax, x=0.02, y=0.02)

    # optionally creates custom x-tick labels
    if x_ticks is not None:
        ax.set_xticks(x_ticks)

    # optionally creates custom y-tick labels
    if y_ticks is not None:
        ax.set_yticks(y_ticks)

    # restyle x and y ticklabels; unit formatting is handled by the label formatter below
    ax.set_yticklabels(
        ax.get_yticklabels(),
        rotation=0,
        fontsize=1.0 * self.chart_scale,
        color=style.style_grey,
    )
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=0,
        fontsize=1.0 * self.chart_scale,
        color=style.style_grey,
    )

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, x_units=x_units, y_units=y_units, x_rotate=x_rotate)
def dist_plot(self, x, color, x_units="f", y_units="f", fit=None, kde=False, x_rotate=None, alpha=0.8,
              bbox=(1.2, 0.9), legend_labels=None, color_map="viridis", ax=None):
    """
    Documentation:

        ---
        Description:
            Creates distribution plot for numeric variable. Optionally overlays a kernel density
            estimation curve.

        ---
        Parameters:
            x : array
                Data for plotting.
            color : str (some sort of color code)
                Color of bars and KDE lines.
            x_units : str, default='f'
                Determines unit of measurement for x-axis tick labels. 'f' displays float. 'p'
                displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 'f' displays float. 'p'
                displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places.
            fit : random variable object, default=None
                Allows for the addition of another curve. Utilizing 'norm' overlays a normal
                distribution over the distribution bar chart. Useful for seeing how well, or not,
                the distribution tracks with a specified distribution.
            kde : boolean, default=False
                Controls whether kernel density is plotted over distribution.
            x_rotate : int, default=None
                Rotates x_axis tick mark labels x degrees.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            bbox : tuple of floats, default=(1.2, 0.9)
                Coordinates for determining legend position.
            legend_labels : list, default=None
                Custom legend labels. If None, no legend is drawn.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to the legend patches.
            ax : axes object, default=None
                Axis object for the visualization. If None, self.ax is used.
    """
    if ax is None:
        ax = self.ax

    # create distribution plot with an optional fit curve
    sns.distplot(
        a=x,
        kde=kde,
        color=color,
        axlabel=False,
        fit=fit,
        kde_kws={"lw": 0.2 * self.chart_scale},
        hist_kws={"alpha": alpha},
        ax=ax,
    )

    # tick label font size
    ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * self.chart_scale)

    # restyle x and y ticklabels; unit formatting is handled by the label formatter below
    ax.set_yticklabels(
        ax.get_yticklabels(),
        rotation=0,
        fontsize=1.1 * self.chart_scale,
        color=style.style_grey,
    )
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=0,
        fontsize=1.1 * self.chart_scale,
        color=style.style_grey,
    )

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, x_units=x_units, y_units=y_units, x_rotate=x_rotate)

    ## create custom legend
    # a legend can only be built when labels are supplied
    if legend_labels is not None:
        legend_labels = np.array(legend_labels)

        # generate colors
        color_list = style.color_gen(color_map, num=len(legend_labels))
        label_color = dict(zip(legend_labels, color_list))

        # create legend Patches
        patches = [Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()]

        # draw legend
        leg = ax.legend(
            handles=patches,
            fontsize=1.0 * self.chart_scale,
            loc="upper right",
            markerscale=0.5 * self.chart_scale,
            ncol=1,
            bbox_to_anchor=bbox,
        )

        # label font color
        for text in leg.get_texts():
            plt.setp(text, color="grey")
def stacked_bar_h(self, df, label_rotate=0, x_units="p", alpha=0.8, color_map="viridis", bbox=(1.2, 0.9),
                  legend_labels=None, ax=None):
    """
    Documentation:

        ---
        Description:
            Create horizontal stacked bar plot. Each column of df becomes one horizontal bar, and
            each row of df becomes one colored segment stacked along that bar.

        ---
        Parameters:
            df : Pandas DataFrame
                Values to plot. Columns are the categories shown on the y-axis. Rows are the
                stacked classes and are looked up by the integer index labels 0, 1, 2...
            label_rotate : float or int, default=0
                Number of degrees to rotate the x-tick labels.
            x_units : str, default='p'
                Determines unit of measurement for x-axis tick labels. 's' displays string. 'f'
                displays float. 'p' displays percentages, 'd' displays dollars. Repeat character
                (e.g 'ff' or 'ddd') for additional decimal places.
            alpha : float, default=0.8
                Controls transparency of bars. Accepts value between 0.0 and 1.0.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            bbox : tuple of floats, default=(1.2, 0.9)
                Coordinates for determining legend position.
            legend_labels : list, default=None
                Custom legend labels. If None, the class positions 0, 1, 2... are used.
            ax : axes object, default=None
                Axis object for the visualization. If None, self.ax is used.
    """
    if ax is None:
        ax = self.ax

    # define class label count and bar color list
    y = np.arange(len(df.index))
    color_list = style.color_gen(color_map, num=len(y))

    # define category labels
    category_levels = np.arange(len(df.columns))

    # plot stacked bars; each class starts where the classes before it end
    for class_label, color in zip(y, color_list):
        ax.barh(
            y=category_levels,
            width=df.loc[class_label],
            left=df[df.index < class_label].sum(axis=0),
            color=color,
            alpha=alpha,
        )

    # restyle x-axis tick labels; unit formatting is handled by the label formatter below
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=label_rotate,
        color=style.style_grey,
    )

    ## create custom legend
    if legend_labels is None:
        legend_labels = np.arange(len(color_list))
    else:
        legend_labels = np.array(legend_labels)

    # define colors
    label_color = dict(zip(legend_labels, color_list))

    # create legend Patches
    patches = [Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()]

    # draw legend
    leg = ax.legend(
        handles=patches,
        fontsize=0.95 * self.chart_scale,
        loc="upper right",
        markerscale=0.3 * self.chart_scale,
        ncol=1,
        bbox_to_anchor=bbox,
    )

    # label font color
    for text in leg.get_texts():
        plt.setp(text, color="grey")

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, x_units=x_units)

    # display category labels as integers when every label converts cleanly
    try:
        columns = df.columns.map(int)
    except (ValueError, TypeError):
        columns = df.columns

    # dynamically size y-labels
    if 7 < len(category_levels) <= 10:
        ax.tick_params(axis="y", colors=style.style_grey, labelsize=0.9 * self.chart_scale)
    elif 10 < len(category_levels) <= 20:
        ax.tick_params(axis="y", colors=style.style_grey, labelsize=0.75 * self.chart_scale)
    elif len(category_levels) > 20:
        ax.tick_params(axis="y", colors=style.style_grey, labelsize=0.6 * self.chart_scale)

    ax.tick_params(axis="x", colors=style.style_grey, labelsize=1.2 * self.chart_scale)

    # overwrite y-axis labels with category labels, wrapping long labels
    ax.set_yticks(category_levels)
    ax.set_yticklabels(
        ["\n".join(textwrap.wrap(str(i).replace("_", " "), 12)) for i in columns]
    )
def facet_two_cat_point(self, df, x, y, split, cat_col=None, cat_row=None, bbox=None, aspect=1, alpha=0.8,
                        height=4, legend_labels=None, color_map="viridis"):
    """
    Documentation:

        ---
        Description:
            Draws point plots of one categorical variable split by a second categorical variable.
            The figure can optionally be faceted by two further categories, one along the column
            axis and one along the row axis.

        ---
        Parameters:
            df : Pandas DataFrame
                Pandas DataFrame containing data for plotting.
            x : str
                Categorical variable to plot along x_axis.
            y : str
                Variable plotted along y_axis.
            split : str
                Categorical variable used as the hue within each facet.
            cat_col : str, default=None
                Categorical variable faceted along the column axis.
            cat_row : str, default=None
                Categorical variable faceted along the row axis.
            bbox : tuple of floats, default=None
                Coordinates for determining legend position.
            aspect : float, default=1
                Higher values create wider plot, lower values create narrow plot, while
                keeping height constant.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            height : float, default=4
                Height in inches of each facet.
            legend_labels : list, default=None
                Custom legend labels. If None, the unique non-null values of split are used.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
    """
    # build the facet grid
    grid = sns.FacetGrid(
        df,
        row=cat_row,
        col=cat_col,
        aspect=aspect,
        height=height,
        margin_titles=True,
    )

    # sorted, de-duplicated level orderings and one palette color per split level
    x_order = df[x].sort_values().drop_duplicates().values.tolist()
    split_order = df[split].sort_values().drop_duplicates().values.tolist()
    split_palette = sns.color_palette(
        style.color_gen(color_map, num=len(np.unique(df[split].values)))
    )

    # draw a point plot in every facet
    grid.map(
        sns.pointplot,
        x,
        y,
        split,
        order=x_order,
        hue_order=split_order,
        palette=split_palette,
        alpha=alpha,
        ci=None,
    )

    # font sizes shared by every facet
    title_size = 1.05 * self.chart_scale
    tick_size = 0.8 * self.chart_scale

    # restyle axis labels, titles and tick labels facet by facet
    for facet_ax in grid.axes.flat:
        facet_ax.set_ylabel(
            facet_ax.get_ylabel(), rotation=90, fontsize=title_size, color=style.style_grey
        )
        facet_ax.set_xlabel(
            facet_ax.get_xlabel(), rotation=0, fontsize=title_size, color=style.style_grey
        )
        facet_ax.set_title(
            facet_ax.get_title(), rotation=0, fontsize=title_size, color=style.style_grey
        )

        # resize y tick labels, when the facet has any
        if len(facet_ax.get_yticklabels()) > 0:
            facet_ax.set_yticklabels(
                facet_ax.get_yticklabels(),
                rotation=0,
                fontsize=tick_size,
                color=style.style_grey,
            )

        # resize x tick labels, when the facet has any
        if len(facet_ax.get_xticklabels()) > 0:
            facet_ax.set_xticklabels(
                facet_ax.get_xticklabels(),
                rotation=0,
                fontsize=tick_size,
                color=style.style_grey,
            )

        # the first text artist holds the right-hand margin title; redraw it restyled
        if facet_ax.texts:
            margin_title = facet_ax.texts[0]
            title_x, title_y = margin_title.get_unitless_position()
            facet_ax.text(
                title_x,
                title_y,
                margin_title.get_text(),
                transform=facet_ax.transAxes,
                va="center",
                fontsize=title_size,
                color=style.style_grey,
                rotation=-90,
            )

            # drop the original margin title now that the copy is in place
            margin_title.remove()

    ## custom legend
    # resolve the legend labels
    if legend_labels is not None:
        legend_labels = np.array(legend_labels)
    else:
        legend_labels = np.unique(df[df[split].notnull()][split])

    # pair each label with its color
    legend_colors = style.color_gen(color_map, num=len(legend_labels))
    color_by_label = {
        legend_label: legend_colors[position]
        for position, legend_label in enumerate(legend_labels)
    }

    # one Patch per label
    handles = [
        Patch(color=patch_color, label=patch_label, alpha=alpha)
        for patch_label, patch_color in color_by_label.items()
    ]

    # draw legend
    legend = plt.legend(
        handles=handles,
        fontsize=1.0 * self.chart_scale,
        loc="upper right",
        markerscale=0.5 * self.chart_scale,
        ncol=1,
        bbox_to_anchor=bbox,
    )

    # label font color
    for legend_text in legend.get_texts():
        plt.setp(legend_text, color="grey")
def facet_cat(self, df, feature, label_rotate=0, x_units="s", y_units="f", bbox=(1.2, 0.9), alpha=0.8,
              legend_labels=None, color_map="viridis", ax=None):
    """
    Documentation:

        ---
        Description:
            Creates a grouped bar chart. The first column of 'df' supplies the x-axis
            categories, and every remaining column is drawn as its own series of bars,
            placed side by side within each category.

        ---
        Parameters:
            df : Pandas DataFrame
                Pandas DataFrame containing data for plotting. The first column holds the
                category labels, and each subsequent column holds one series of bar heights.
            feature : str
                Name of column that contains the category values to be used for faceting.
                Retained for interface compatibility; the implementation does not read it.
            label_rotate : float or int, default=0
                Number of degrees to rotate the x-tick labels.
            x_units : str, default='s'
                Determines unit of measurement for x-axis tick labels. NOTE: the value passed
                in is always overridden - 'f' is used when the first column of 'df' has a
                float dtype, otherwise 's' is used.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 's' displays string. 'f'
                displays float. 'p' displays percentages, 'd' displays dollars. Repeat character
                (e.g 'ff' or 'ddd') for additional decimal places.
            bbox : tuple of floats, default=(1.2, 0.9)
                Coordinates for determining legend position.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            legend_labels : list, default=None
                Custom legend labels. If None, labels are the integers 0..(number of colors - 1).
            color_map : str specifying built-in matplotlib colormap, or sequence of colors,
                    default="viridis"
                Color map applied to plots. A sequence is used as-is, one color per series.
            ax : axes object, default=None
                Axis object for the visualization. Falls back to self.ax when None.
    """
    if ax is None:
        ax = self.ax

    n_levels = df.shape[0]
    ixs = np.arange(n_levels)
    bar_width = 0.35

    # every column after the first is one series of bars. a dedicated loop name is used so the
    # 'feature' parameter is not shadowed
    series = {column: df[column].values.tolist() for column in df.columns[1:]}

    # generate color list. any non-string value is treated as a ready-made sequence of colors
    if isinstance(color_map, str):
        color_list = style.color_gen(name=color_map, num=len(series))
    else:
        color_list = list(color_map)

    # draw one set of bars per series, offset horizontally by the bar width
    for series_ix, (series_name, series_values) in enumerate(series.items()):
        plt.bar(
            ixs + (bar_width * series_ix),
            series_values,
            bar_width,
            alpha=alpha,
            color=color_list[series_ix],
            label=str(series_name),
        )

    # wrap long x-tick labels
    plt.xticks(
        ixs + bar_width / 2,
        [
            "\n".join(textwrap.wrap(str(i).replace("_", " "), 12))
            for i in df.iloc[:, 0].values
        ],
    )
    plt.xticks(rotation=label_rotate)

    ## create custom legend
    # create labels
    if legend_labels is None:
        legend_labels = np.arange(len(color_list))
    else:
        legend_labels = np.array(legend_labels)

    # pair each label with a color
    label_color = dict(zip(legend_labels, color_list))

    # create legend Patches
    patches = [Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()]

    # draw legend
    leg = plt.legend(
        handles=patches,
        fontsize=0.95 * self.chart_scale,
        loc="upper right",
        markerscale=0.3 * self.chart_scale,
        ncol=1,
        bbox_to_anchor=bbox,
    )

    # label font color
    for text in leg.get_texts():
        plt.setp(text, color="grey")

    ### general formatting
    # if category data is float dtype, then format as a number, otherwise as a string.
    # np.issubdtype replaces the removed np.float alias
    if np.issubdtype(df.iloc[:, 0].values.dtype, np.floating):
        x_units = "f"
    else:
        x_units = "s"

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, x_units=x_units, y_units=y_units)

    # tick label font size
    ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * self.chart_scale)

    # dynamically shrink x-axis label size as the number of categories grows
    if 7 < n_levels <= 10:
        ax.tick_params(axis="x", colors=style.style_grey, labelsize=0.9 * self.chart_scale)
    elif 10 < n_levels <= 20:
        ax.tick_params(axis="x", colors=style.style_grey, labelsize=0.75 * self.chart_scale)
    elif n_levels > 20:
        ax.tick_params(axis="x", colors=style.style_grey, labelsize=0.6 * self.chart_scale)
def facet_two_cat_bar(self, df, x, y, split, x_units=None, y_units=None, bbox=None, alpha=0.8,
                      legend_labels=None, filter_nan=True, color_map="viridis", ax=None):
    """
    Documentation:

        ---
        Description:
            Creates a bar plot of a variable along the y-axis, with bars grouped by one
            category variable along the x-axis and split by a second category variable.

        ---
        Parameters:
            df : Pandas DataFrame
                Pandas DataFrame containing data for plotting.
            x : str
                Categorical variable to plot along x-axis.
            y : str
                Variable to be plotted along y-axis.
            split : str
                Categorical variable used to split each x-axis group into separate bars.
                If None, no split is applied and no legend is drawn.
            x_units : str, default=None
                Determines unit of measurement for x-axis tick labels. 's' displays string. 'f'
                displays float. 'p' displays percentages, 'd' displays dollars. Repeat character
                (e.g 'ff' or 'ddd') for additional decimal places.
            y_units : str, default=None
                Determines unit of measurement for y-axis tick labels. 's' displays string. 'f'
                displays float. 'p' displays percentages, 'd' displays dollars. Repeat character
                (e.g 'ff' or 'ddd') for additional decimal places.
            bbox : tuple of floats, default=None
                Coordinates for determining legend position.
            alpha : float, default=0.8
                Controls transparency of legend patches. Accepts value between 0.0 and 1.0.
            legend_labels : list, default=None
                Custom legend labels. If None, the sorted unique non-null values of the
                'split' column are used.
            filter_nan : bool, default=True
                Remove records that have a null value in the column specified by the 'x' parameter.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            ax : axes object, default=None
                Axis object for the visualization. Falls back to self.ax when None.
    """
    if ax is None:
        ax = self.ax

    # remove nans from x column
    if filter_nan:
        df = df.dropna(subset=[x])

    # the hue ordering and palette only exist when a split column is provided
    if split is not None:
        hue_order = df[split].sort_values().drop_duplicates().values.tolist()
        palette = sns.color_palette(
            style.color_gen(color_map, num=len(np.unique(df[split].values)))
        )
    else:
        hue_order = None
        palette = None

    # create bar plot
    g = sns.barplot(
        x=x,
        y=y,
        hue=split,
        data=df,
        palette=palette,
        order=df[x].sort_values().drop_duplicates().values.tolist(),
        hue_order=hue_order,
        ax=ax,
        ci=None,
    )

    # format x-tick labels
    g.set_xticklabels(
        g.get_xticklabels(),
        rotation=0,
        fontsize=1.05 * self.chart_scale,
        color=style.style_grey,
    )

    # format y-tick labels. unit formatting (including percentages) is delegated to
    # util_label_formatter below, so the labels are only restyled here
    g.set_yticklabels(
        g.get_yticklabels(),
        rotation=0,
        fontsize=1.05 * self.chart_scale,
        color=style.style_grey,
    )

    # format x-axis label
    g.set_xlabel(
        g.get_xlabel(),
        rotation=0,
        fontsize=1.35 * self.chart_scale,
        color=style.style_grey,
    )

    # format y-axis label
    g.set_ylabel(
        g.get_ylabel(),
        rotation=90,
        fontsize=1.35 * self.chart_scale,
        color=style.style_grey,
    )

    # format title
    g.set_title(
        g.get_title(),
        rotation=0,
        fontsize=1.5 * self.chart_scale,
        color=style.style_grey,
    )

    ## create custom legend
    if split is not None:

        # create labels
        if legend_labels is None:
            legend_labels = (
                df[df[split].notnull()][split]
                .sort_values()
                .drop_duplicates()
                .values.tolist()
            )
        else:
            legend_labels = np.array(legend_labels)

        # generate colors and pair each label with a color
        color_list = style.color_gen(color_map, num=len(legend_labels))
        label_color = dict(zip(legend_labels, color_list))

        # create legend Patches
        patches = [Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()]

        # draw legend
        leg = plt.legend(
            handles=patches,
            fontsize=1.25 * self.chart_scale,
            loc="upper right",
            markerscale=0.5 * self.chart_scale,
            ncol=1,
            bbox_to_anchor=bbox,
        )

        # label font color
        for text in leg.get_texts():
            plt.setp(text, color="grey")

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, x_units=x_units, y_units=y_units)
def model_param_plot(self, bayes_optim_summary, estimator_class, estimator_parameter_space, n_iter, chart_scale=15,
                     color_map="viridis", title_scale=1.2, show_single_str_params=False):
    """
    Documentation:

        ---
        Definition:
            Visualize hyperparameter optimization over all iterations. Compares theoretical
            distribution to the distribution of values that were actually chosen, and
            visualizes how parameter value selections changes over time.

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize.
            estimator_parameter_space : dictionary of dictionaries
                Dictionary of nested dictionaries. Outer key is an estimator, and the corresponding
                value is a dictionary. Each nested dictionary contains 'parameter: value
                distribution' key/value pairs. The inner dictionary key specifies the parameter
                of the model to be tuned, and the value is a distribution of values from which
                trial values are drawn.
            n_iter : int
                Number of iterations to draw from theoretical distribution in order to visualize
                the theoretical distribution. Higher number leads to more robust distribution but
                can take considerably longer to create.
            chart_scale : float, default=15
                Controls proportions of visualizations. Larger values scale visual up in size,
                smaller values scale visual down in size.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            title_scale : float, default=1.2
                Controls the scaling up (higher value) and scaling down (lower value) of the size
                of the main chart title, the x_axis title and the y_axis title.
            show_single_str_params : boolean, default=False
                Controls whether to display visuals for string attributes where there is only one
                unique value, i.e. there was only one choice for the optimization procedure to
                choose from during each iteration.
    """

    def _precision_units(max_value):
        # tick label precision based on the largest value. anything outside the two small
        # ranges (including values below -1.0 and NaN) falls back to a single decimal place
        if -1.0 <= max_value <= 1.0:
            return "fff"
        if 1.0 < max_value <= 5.0:
            return "ff"
        return "f"

    def _bool_label(value):
        # upper-case string representation of a boolean-like value. strings are passed
        # through, except for the "True"/"False" spellings numpy produces when it coerces
        # booleans to str
        if isinstance(value, str):
            return {"True": "TRUE", "False": "FALSE"}.get(value, value)
        return "TRUE" if value else "FALSE"

    # unpack bayes_optim_summary parameters for an estimator_class
    estimator_summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # override None with string representation
    estimator_summary = estimator_summary.replace([None], "None")

    # subset estimator_parameter_space to space for the specified estimator_class
    estimator_space = estimator_parameter_space[estimator_class]

    print("*" * 100)
    print("* {}".format(estimator_class))
    print("*" * 100)

    # iterate through each parameter
    for param in estimator_space.keys():

        # sample from theoretical distribution for n_iters
        theoretical_dist = [sample(estimator_space)[param] for _ in range(n_iter)]

        ## override None with string representation
        # the same "None" token is used for the theoretical and actual distributions (and
        # matches the replacement applied to estimator_summary above) so that the two
        # distributions share level names
        theoretical_dist = np.array(["None" if v is None else v for v in theoretical_dist])
        actual_dist = np.array(["None" if v is None else v for v in estimator_summary[param].tolist()])

        # limit estimator_summary to "iteration" and current "param" columns
        actual_iter_df = estimator_summary[["iteration", param]]

        # identify how many values in param column are zero or one
        zeros_and_ones = (actual_iter_df[param].eq(True) | actual_iter_df[param].eq(False)).sum()

        # param column only contains zeros and ones, store string representations of "TRUE" and "FALSE"
        if zeros_and_ones == actual_iter_df.shape[0]:
            actual_iter_df = actual_iter_df.replace({True: "TRUE", False: "FALSE"})

        # if theoretical distribution has dtype np.bool_, store string representations of
        # "TRUE" and "FALSE" for both distributions so their levels can be matched up
        if isinstance(theoretical_dist[0], np.bool_):
            theoretical_dist = np.array([_bool_label(v) for v in theoretical_dist.tolist()])
            actual_dist = np.array([_bool_label(v) for v in actual_dist.tolist()])

            estimator_summary = estimator_summary.replace([True], "TRUE")
            estimator_summary = estimator_summary.replace([False], "FALSE")

        # if theoretical distribution contains str data, then treat this as an object/category parameter
        if any(isinstance(d, str) for d in theoretical_dist):

            # generate color list for stripplot
            stripplot_color_list = style.color_gen(name=color_map, num=len(actual_iter_df[param].unique()) + 1)

            # generate color list for bar chart
            bar_color_list = style.color_gen(name=color_map, num=3)

            # identify unique values and associated count in theoretical distribution
            unique_vals_theo, unique_counts_theo = np.unique(theoretical_dist, return_counts=True)

            # only plot if the theoretical distribution has more than one unique value, or if
            # show_single_str_params is set to True
            if len(unique_vals_theo) > 1 or show_single_str_params:

                # identify unique values and associated count in actual distribution
                unique_vals_actual, unique_counts_actual = np.unique(actual_dist, return_counts=True)

                # align counts by level name rather than by position. a level the optimizer
                # never selected gets an actual count of zero instead of shifting the counts
                # of other levels or raising on mismatched lengths
                theo_counts = dict(zip(unique_vals_theo.tolist(), unique_counts_theo.tolist()))
                actual_counts = dict(zip(unique_vals_actual.tolist(), unique_counts_actual.tolist()))
                levels = list(theo_counts) + [lvl for lvl in actual_counts if lvl not in theo_counts]

                # store data in DataFrame
                df = pd.DataFrame({
                    "param": levels,
                    "Theoretical": [theo_counts.get(lvl, 0) for lvl in levels],
                    "Actual": [actual_counts.get(lvl, 0) for lvl in levels],
                })

                # create prettierplot object
                p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.8,
                    position=121,
                    title_scale=title_scale,
                )

                # add faceted bar chart to canvas
                p.facet_cat(
                    df=df,
                    feature="param",
                    color_map=bar_color_list[:-1],
                    bbox=(1.0, 1.15),
                    alpha=1.0,
                    legend_labels=df.columns[1:].values,
                    x_units=None,
                    ax=ax,
                )

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.5,
                    position=122,
                    title_scale=title_scale,
                )

                # add stripplot to canvas
                sns.stripplot(
                    x="iteration",
                    y=param,
                    data=estimator_summary,
                    jitter=0.3,
                    alpha=1.0,
                    size=0.7 * chart_scale,
                    palette=sns.color_palette(stripplot_color_list[:-1]),
                    ax=ax,
                ).set(xlabel=None, ylabel=None)

                # set tick label font size
                ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * chart_scale)

                plt.show()

        # otherwise treat it as a numeric parameter
        else:
            # cast "iteration" as an int and the param values as float
            convert_dict = {"iteration": int, param: float}
            actual_iter_df = actual_iter_df.astype(convert_dict)

            # create color map
            color_list = style.color_gen(name=color_map, num=3)

            # create prettierplot object
            p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=121,
                title_scale=title_scale,
            )

            # dynamically set x-unit precision based on max value
            x_units = _precision_units(np.nanmax(theoretical_dist))

            # add kernel density plot for theoretical distribution to canvas
            p.kde_plot(
                theoretical_dist,
                color=color_list[0],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            # add kernel density plot for actual distribution to canvas
            p.kde_plot(
                actual_dist,
                color=color_list[1],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            ## create custom legend
            # create labels
            legend_labels = ["Theoretical", "Actual"]
            label_color = dict(zip(legend_labels, color_list))

            # create legend Patches
            patches = [Patch(color=v, label=k, alpha=1.0) for k, v in label_color.items()]

            # draw legend
            leg = plt.legend(
                handles=patches,
                fontsize=1.1 * chart_scale,
                loc="upper right",
                markerscale=0.6 * chart_scale,
                ncol=1,
                bbox_to_anchor=(.95, 1.1),
            )

            # label font color
            for text in leg.get_texts():
                plt.setp(text, color="grey")

            # dynamically set y-unit precision based on max value
            y_units = _precision_units(np.nanmax(actual_iter_df[param]))

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=122,
                title_scale=title_scale,
            )

            # add regression plot to canvas
            p.reg_plot(
                x="iteration",
                y=param,
                data=actual_iter_df,
                y_units=y_units,
                x_units="f",
                line_color=color_list[0],
                line_width=0.4,
                dot_color=color_list[1],
                dot_size=10.0,
                alpha=0.6,
                ax=ax,
            )
            plt.show()
def decision_region(self, x, y, estimator, test_idx=None, resolution=0.1, bbox=(1.2, 0.9), color_map="viridis",
                    ax=None):
    """
    Documentation:

        ---
        Description:
            Create 2-dimensional chart with shading used to highlight decision regions.

        ---
        Parameters:
            x : array
                m x 2 array containing 2 features.
            y : array
                m x 1 array containing labels for observations.
            estimator : sklearn model
                Estimator used to create decision regions. Only its predict method is called.
            test_idx : tuple, default=None
                Optional parameter for specifying observations to be highlighted as test
                examples. Used to index the rows of 'x'.
            resolution : float, default=0.1
                Controls clarity of the graph by setting interval of the arrays passed into
                np.meshgrid. Smaller intervals take longer to generate because predictions
                have to be generated for each point on the grid.
            bbox : tuple of floats, default=(1.2, 0.9)
                Coordinates for determining legend position.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            ax : axes object, default=None
                Axis object for the visualization. If None, the current matplotlib axes is used.
    """
    # draw on the provided axes, otherwise on the current axes
    if ax is None:
        ax = plt.gca()

    # generate color list, one color per class label
    class_labels = np.unique(y)
    color_list = style.color_gen(name=color_map, num=len(class_labels))

    # color map used to shade the decision surface
    cmap = ListedColormap(color_list)

    # plot boundaries, padded by one unit beyond the data on each side
    x1_min, x1_max = x[:, 0].min() - 1, x[:, 0].max() + 1
    x2_min, x2_max = x[:, 1].min() - 1, x[:, 1].max() + 1

    # generate meshgrid indices
    xx1, xx2 = np.meshgrid(
        np.arange(x1_min, x1_max, resolution),
        np.arange(x2_min, x2_max, resolution),
    )

    # generate predictions using estimator for all points on grid
    z = estimator.predict(np.array([xx1.ravel(), xx2.ravel()]).T)

    # reshape the predictions and apply coloration
    z = z.reshape(xx1.shape)
    ax.contourf(xx1, xx2, z, alpha=0.3, cmap=cmap)
    ax.set_xlim(xx1.min(), xx1.max())
    ax.set_ylim(xx2.min(), xx2.max())

    # plot samples, one scatter series per class label
    for idx, cl in enumerate(class_labels):
        ax.scatter(
            x=x[y == cl, 0],
            y=x[y == cl, 1],
            alpha=1.0,
            c=color_list[idx],
            marker=style.style_markers[1],
            label=cl,
            s=12.5 * self.chart_scale,
        )

    # highlight test samples. np.size is used rather than truthiness so that a numpy array
    # of indices does not raise an ambiguous truth value error
    if test_idx is not None and np.size(test_idx) > 0:
        x_test = x[test_idx, :]
        ax.scatter(
            x_test[:, 0],
            x_test[:, 1],
            facecolor="none",
            edgecolor="white",
            alpha=1.0,
            linewidth=1.4,
            marker="o",
            s=12.75 * self.chart_scale,
            label="test set",
        )

    # add legend to figure
    ax.legend(
        loc="upper right",
        bbox_to_anchor=bbox,
        ncol=1,
        frameon=True,
        fontsize=1.1 * self.chart_scale,
    )
    plt.tight_layout()
def model_loss_plot(self, bayes_optim_summary, estimator_class, chart_scale=15, trim_outliers=True,
                    outlier_control=1.5, title_scale=0.7, color_map="viridis"):
    """
    Documentation:

        ---
        Definition:
            Visualize how the bayesian optimization loss changes over time across all
            iterations. Extremely poor results are removed from visualized dataset by two
            filters.
                1) Loss values worse than [loss mean + (2 x loss standard deviation)]
                2) Loss values worse than a cap derived from the median loss and
                   'outlier_control'. For a non-negative median the cap is
                   [median * outlier_control]. For a negative median the cap is
                   [median + |median| * (outlier_control - 1)], so that values of
                   'outlier_control' above 1 always place the cap above the median.
            If the filters would remove every record, no trimming is applied.

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize.
            chart_scale : float, default=15
                Control chart proportions. Higher values scale up size of chart objects, lower
                values scale down size of chart objects.
            trim_outliers : boolean, default=True
                Remove extremely high (poor) results by trimming values where the loss is greater
                than 2 standard deviations away from the mean.
            outlier_control : float, default=1.5
                Controls enforcement of outlier trimming. Determines the cap placed on loss values
                relative to the median. Values higher than this cap will be excluded. Lower values
                of outlier_control apply more extreme filtering to loss values.
            title_scale : float, default=0.7
                Controls the scaling up (higher value) and scaling down (lower value) of the size
                of the main chart title, the x_axis title and the y_axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
    """
    # unpack bayes_optim_summary parameters for an estimator_class
    estimator_summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # apply outlier trimming
    if trim_outliers:
        losses = estimator_summary["iter_loss"]
        median = losses.median()

        # cap based on spread of the loss values
        std_cap = losses.mean() + (2.0 * losses.std())

        # cap based on the median. multiplying a negative median by outlier_control would
        # push the cap below the median, so the offset is applied to its magnitude instead
        if median >= 0:
            median_cap = outlier_control * median
        else:
            median_cap = median + abs(median) * (outlier_control - 1.0)

        # inclusive comparison keeps records sitting exactly on a cap, e.g. when every
        # iteration produced the same loss and the standard deviation is zero
        keep = (losses <= std_cap) & (losses <= median_cap)

        # never trim down to an empty dataset (e.g. a single record, where std is NaN)
        if keep.any():
            estimator_summary = estimator_summary[keep]

    # create color list based on color_map
    color_list = style.color_gen(name=color_map, num=3)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Loss by iteration - {}".format(estimator_class),
        y_shift=0.8,
        position=111,
        title_scale=title_scale,
    )

    # add regression plot to canvas
    p.reg_plot(
        x="iteration",
        y="iter_loss",
        data=estimator_summary,
        y_units="ffff",
        line_color=color_list[0],
        dot_color=color_list[1],
        alpha=0.6,
        line_width=0.4,
        dot_size=10.0,
        ax=ax,
    )
    plt.show()
def box_plot_h(self, x, y, data, color=style.style_grey, x_units="f", bbox=(1.05, 1), color_map="viridis",
               suppress_outliers=False, alpha=0.8, legend_labels=None, ax=None):
    """
    Documentation:

        ---
        Description:
            Create horizontal box plots. Useful for evaluating a category variable on the
            y-axis vs. a numeric variable on the x-axis. One box is drawn, and one legend
            entry is created, for each unique value in the 'y' column.

        ---
        Parameters:
            x : str
                Name of numeric variable, plotted along the x-axis.
            y : str
                Name of categorical variable. Used for both the y-axis grouping and the
                coloring of the boxes.
            data : Pandas DataFrame
                Pandas DataFrame including both x and y data.
            color : str (some sort of color code), default=style.style_grey
                Retained for interface compatibility; box colors are determined by 'color_map'.
            x_units : str, default='f'
                Determines unit of measurement for x-axis tick labels. 's' displays string. 'f'
                displays float. 'p' displays percentages, 'd' displays dollars. Repeat character
                (e.g 'ff' or 'ddd') for additional decimal places.
            bbox : tuple of floats, default=(1.05, 1.0)
                Coordinates for determining legend position.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            suppress_outliers : boolean, default=False
                Controls removal of outliers from box/whisker plots. When True, outlier
                points (fliers) are not drawn.
            alpha : float, default=0.8
                Controls transparency of bars. Accepts value between 0.0 and 1.0.
            legend_labels : list, default=None
                Custom legend labels. If None, the unique values in the 'y' column are used.
            ax : axes object, default=None
                Axis object for the visualization. Falls back to self.ax when None.
    """
    if ax is None:
        ax = self.ax

    # create horizontal box plot. fliers are shown unless outliers are suppressed
    sns.boxplot(
        x=x,
        y=y,
        hue=y,
        data=data,
        orient="h",
        palette=sns.color_palette(
            style.color_gen(color_map, num=len(np.unique(data[y].values)))
        ),
        showfliers=not suppress_outliers,
        ax=ax,
    ).set(xlabel=None, ylabel=None)

    # fade box plot figures by reducing alpha
    plt.setp(ax.artists, alpha=alpha)

    # the y-axis is hidden because the legend identifies each box
    ax.yaxis.set_visible(False)

    # tick label font size
    ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * self.chart_scale)

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, x_units=x_units)

    ## custom legend
    # use legend labels if provided, otherwise use unique values in y column
    if legend_labels is None:
        legend_labels = np.unique(data[y].values)
    else:
        legend_labels = np.array(legend_labels)

    # generate colors and pair each label with a color
    color_list = style.color_gen(color_map, num=len(legend_labels))
    label_color = dict(zip(legend_labels, color_list))

    # create legend Patches
    patches = [Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()]

    # draw legend
    leg = plt.legend(
        handles=patches,
        fontsize=1.0 * self.chart_scale,
        loc="upper right",
        markerscale=0.5 * self.chart_scale,
        ncol=1,
        bbox_to_anchor=bbox,
    )

    # label font color
    for text in leg.get_texts():
        plt.setp(text, color="grey")
def box_plot_v(self, x, y, data, color, label_rotate=0, y_units="f", color_map="viridis", alpha=0.8,
               suppress_outliers=False, ax=None):
    """
    Documentation:

        ---
        Description:
            Create vertical box plots. Useful for evaluating a numeric variable on the y-axis
            versus several different category segments on the x-axis.

        ---
        Parameters:
            x : str
                Name of categorical variable.
            y : str
                Name of numeric variable.
            data : Pandas DataFrame
                Pandas DataFrame including both x and y data.
            color : str
                Retained for interface compatibility; box colors are determined by 'color_map'.
            label_rotate : float or int, default=0
                Number of degrees to rotate the x-tick labels.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 's' displays string. 'f'
                displays float. 'p' displays percentages, 'd' displays dollars. Repeat character
                (e.g 'ff' or 'ddd') for additional decimal places.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            suppress_outliers : boolean, default=False
                Controls removal of outliers from box/whisker plots. When True, outlier
                points (fliers) are not drawn.
            ax : axes object, default=None
                Axis object for the visualization. Falls back to self.ax when None.
    """
    if ax is None:
        ax = self.ax

    # unique category levels determine both the palette size and the x-tick label size
    levels = np.unique(data[x].values)

    # create vertical box plot. fliers are shown unless outliers are suppressed
    sns.boxplot(
        x=x,
        y=y,
        data=data,
        orient="v",
        palette=sns.color_palette(style.color_gen(color_map, num=len(levels))),
        showfliers=not suppress_outliers,
        ax=ax,
    ).set(xlabel=None, ylabel=None)

    # tick label font size
    ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * self.chart_scale)

    # shrink x-axis labels as the number of category levels grows
    if 10 < len(levels) <= 20:
        x_label_scale = 1.0
    elif len(levels) > 20:
        x_label_scale = 0.9
    else:
        x_label_scale = 1.2
    ax.tick_params(axis="x", colors=style.style_grey, labelsize=x_label_scale * self.chart_scale)

    # resize y-axis
    ax.tick_params(axis="y", labelsize=1.2 * self.chart_scale)

    # fade box plot figures by reducing alpha
    plt.setp(ax.artists, alpha=alpha)

    # rotate x-tick labels
    plt.xticks(rotation=label_rotate)
    ax.yaxis.set_visible(True)

    # use label formatter utility function to customize chart labels
    util.util_label_formatter(ax=ax, y_units=y_units)
def eda_cat_target_num_feat(self, feature, color_map="viridis", outliers_out_of_scope=None, legend_labels=None,
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a number
            feature in the context of a categorical target. Displays summary tables (feature
            summary, feature vs. target summary and, for a two-class target, a z-test or
            t-test) followed by a four-panel figure: feature distribution, probability plot,
            distribution by class and box plot by class.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            outliers_out_of_scope : boolean, float or int, default=None
                Truncates the x-axis limits so that outliers are out of scope of the
                visualization. The limits are reset to the minimum and maximum non-outlier
                values, where outliers are identified by self.outlier_IQR. If True is passed,
                an IQR step of 5 is used. If a float or int is passed, that value is used as
                the IQR step. Higher values increase how extreme values need to be to be
                identified as outliers. If None or False is passed, the limits are left
                unchanged.
            legend_labels : list, default=None
                Class labels displayed in plot legend.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """

    def _x_units_precision(dist_min, dist_max):
        # determine x-units precision based on min and max values in feature. a minimum of
        # zero is treated as an infinite ratio, which selects the coarsest precision without
        # triggering a divide-by-zero warning
        ratio = dist_max / dist_min if dist_min != 0 else np.inf
        if -3 < dist_min < 3 and -3 < dist_max < 3 and ratio < 10:
            return "fff"
        if -30 < dist_min < 30 and -30 < dist_max < 30 and ratio < 3:
            return "fff"
        if -5 < dist_min < 5 and -5 < dist_max < 5 and ratio < 10:
            return "ff"
        if -90 < dist_min < 90 and -90 < dist_max < 90 and ratio < 5:
            return "ff"
        return "f"

    ### data summaries
    ## bivariate roll_up table
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # unique class labels in the full target
    target_classes = np.unique(self.target)

    # bivariate summary statistics, one record per unique class label
    summary_records = []
    for labl in target_classes:

        # get feature values associated with single class label
        feature_slice = bi_df[bi_df[self.target.name] == labl][feature]

        # collect summary statistics for feature values associated with class label
        summary_records.append(
            {
                "Class": labl,
                "Count": len(feature_slice),
                "Proportion": len(feature_slice) / len(bi_df[feature]) * 100,
                "Mean": np.mean(feature_slice),
                "StdDev": np.std(feature_slice),
            }
        )
    bi_summ_stats_df = pd.DataFrame(
        summary_records, columns=["Class", "Count", "Proportion", "Mean", "StdDev"]
    )

    # apply custom legend labels, or set dtype to int if column values are numeric
    if legend_labels is not None:
        bi_summ_stats_df["Class"] = legend_labels
    elif is_numeric_dtype(bi_summ_stats_df["Class"]):
        bi_summ_stats_df["Class"] = bi_summ_stats_df["Class"].astype(int)

    ## feature summary
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add count of missing values, skew and kurtosis as additional rows
    extra_rows = pd.DataFrame(
        [
            {
                "index": "missing",
                feature: np.round(self.data.shape[0] - bi_df[feature].shape[0], 5),
            },
            {
                "index": "skew",
                feature: float(np.round(stats.skew(bi_df[feature].values, nan_policy="omit"), 5)),
            },
            {
                "index": "kurtosis",
                feature: float(stats.kurtosis(bi_df[feature].values, nan_policy="omit")),
            },
        ]
    )
    describe_df = pd.concat([describe_df, extra_rows], ignore_index=True)
    describe_df = describe_df.rename(columns={"index": ""})

    # execute z-test or t-test when the target has two classes. both classes must still be
    # present after rows with a null feature value were removed
    present_classes = bi_df[self.target.name].unique()
    if len(target_classes) == 2 and len(present_classes) == 2:
        s1 = bi_df[bi_df[self.target.name] == present_classes[0]][feature]
        s2 = bi_df[bi_df[self.target.name] == present_classes[1]][feature]

        if len(s1) > 30 and len(s2) > 30:

            # perform z-test, return z-statistic and p-value
            z, p_val = ztest(s1, s2)

            # add z-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{"z-test statistic": z, "p-value": p_val}],
                columns=["z-test statistic", "p-value"],
                index=[feature],
            ).round(4)
        else:
            # perform t-test, return t-score and p-value
            t, p_val = stats.ttest_ind(s1, s2)

            # add t-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{"t-test statistic": t, "p-value": p_val}],
                columns=["t-test statistic", "p-value"],
                index=[feature],
            ).round(4)

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df, stat_test_df),
            names=["Feature summary", "Feature vs. target summary", "Statistical test"],
        )
    else:
        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df),
            names=["Feature summary", "Feature vs. target summary"],
        )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # translate outliers_out_of_scope into an IQR step. bool is checked first because bool
    # is a subclass of int. True uses a step of 5, False and None disable truncation
    if isinstance(outliers_out_of_scope, bool):
        iqr_step = 5 if outliers_out_of_scope else None
    elif isinstance(outliers_out_of_scope, (float, int)):
        iqr_step = outliers_out_of_scope
    else:
        iqr_step = None

    # x-axis limits remain None unless outlier truncation was requested
    x_axis_limits = None
    if iqr_step is not None:

        # identify outliers using IQR method
        outliers = self.outlier_IQR(self.data[feature], iqr_step=iqr_step)

        # reset x-axis minimum and maximum to the range of the non-outlier values
        in_scope = self.data[feature].drop(index=outliers)
        x_axis_limits = (in_scope.min(), in_scope.max())

    ## dynamically determine precision of x-units, shared by all panels
    x_units = _x_units_precision(
        bi_df[feature].values.min(), bi_df[feature].values.max()
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Feature distribution\n* {}".format(feature),
        title_scale=0.85,
        position=221,
    )

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units="f",
        x_units=x_units,
        ax=ax,
    )

    # optionally reset x-axis limits
    if x_axis_limits is not None:
        plt.xlim(x_axis_limits[0], x_axis_limits[1])

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Probability plot\n* {}".format(feature),
        title_scale=0.85,
        position=222,
    )

    # add QQ / probability plot to canvas
    p.prob_plot(
        x=bi_df[feature].values,
        plot=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Distribution by class\n* {}".format(feature),
        title_scale=0.85,
        position=223,
    )

    # generate color list
    color_list = style.color_gen(name=color_map, num=len(target_classes))

    # add one distribution plot to canvas for each category class
    for ix, labl in enumerate(np.unique(bi_df[self.target.name].values)):
        p.dist_plot(
            bi_df[bi_df[self.target.name] == labl][feature].values,
            color=color_list[ix],
            y_units="f",
            x_units=x_units,
            legend_labels=legend_labels if legend_labels is not None else np.arange(len(target_classes)),
            alpha=0.4,
            bbox=(1.0, 1.0),
            ax=ax,
        )

    # optionally reset x-axis limits
    if x_axis_limits is not None:
        plt.xlim(x_axis_limits[0], x_axis_limits[1])

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Boxplot by class\n* {}".format(feature),
        title_scale=0.85,
        position=224,
    )

    # add horizontal box plot to canvas
    p.box_plot_h(
        x=feature,
        y=self.target.name,
        data=bi_df,
        alpha=0.7,
        x_units=x_units,
        legend_labels=legend_labels,
        bbox=(1.2, 1.0),
        suppress_outliers=True,
        ax=ax,
    )

    # optionally reset x-axis limits, shifting the lower limit by 10% of the minimum
    if x_axis_limits is not None:
        plt.xlim(x_axis_limits[0] - (x_axis_limits[0] * 0.1), x_axis_limits[1])

    # apply position adjustment to subplots
    plt.subplots_adjust(bottom=-0.1)
    plt.show()
def facet_cat_num_hist(self, df, cat_row, cat_col, num_col, split, bbox=None, aspect=1, height=4, alpha=0.8,
                       legend_labels=None, x_units="f", y_units="f", color_map="viridis"):
    """
    Documentation:

        ---
        Description:
            Creates histograms of one numeric variable, and each can optionally be split by a
            category to show two or more distributions. Allows for faceting by up to two category
            variables along the column and/or row axes of the figure.

        ---
        Parameters:
            df : Pandas DataFrame
                Pandas DataFrame containing data for plotting.
            cat_row : str
                Categorical variable faceted along the row axis.
            cat_col : str
                Categorical variable faceted along the column axis.
            num_col : str
                Numeric variable to plot along the x-axis.
            split : str or None
                Categorical variable on which to differentiate the num_col variable. When None,
                no hue split is applied, seaborn's default palette is used and no custom legend
                is drawn.
            bbox : tuple of floats, default=None
                Coordinates for determining legend position.
            aspect : float, default=1
                Higher values create wider plot, lower values create narrow plot, while
                keeping height constant.
            height : float, default=4
                Height in inches of each facet.
            alpha : float, default=0.8
                Controls transparency of objects. Accepts value between 0.0 and 1.0.
            legend_labels : list, default=None
                Custom legend labels. When None, the sorted non-null levels of split are used.
            x_units : str, default='f'
                Determines unit of measurement for x-axis tick labels. 'f' displays float. 'p'
                displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places. Accepted for interface consistency; not referenced
                in this method's body.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 'f' displays float. 'p'
                displays percentages, 'd' displays dollars. Repeat character (e.g 'ff' or 'ddd')
                for additional decimal places. Accepted for interface consistency; not referenced
                in this method's body.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
    """
    # derive hue ordering and palette from the levels of the split variable. without a split
    # variable there are no levels to look up, so defer to seaborn's defaults rather than
    # indexing the DataFrame with None
    if split is not None:
        hue_order = df[split].sort_values().drop_duplicates().values.tolist()
        palette = sns.color_palette(
            style.color_gen(color_map, num=len(np.unique(df[split].values)))
        )
    else:
        hue_order = None
        palette = None

    # create FacetGrid object
    g = sns.FacetGrid(
        df,
        row=cat_row,
        col=cat_col,
        hue=split,
        hue_order=hue_order,
        palette=palette,
        despine=True,
        height=height,
        aspect=aspect,
        margin_titles=True,
    )

    # map histogram to FacetGrid object
    g.map(
        plt.hist,
        num_col,
        alpha=alpha,
    )

    # font sizes shared by every facet
    label_size = 1.05 * self.chart_scale
    tick_size = 0.8 * self.chart_scale

    # format x and y ticklabels, x and y labels, and main title
    for ax in g.axes.flat:
        ax.set_ylabel(
            ax.get_ylabel(),
            rotation=90,
            fontsize=label_size,
            color=style.style_grey,
        )
        ax.set_xlabel(
            ax.get_xlabel(),
            rotation=0,
            fontsize=label_size,
            color=style.style_grey,
        )
        ax.set_title(
            ax.get_title(),
            rotation=0,
            fontsize=label_size,
            color=style.style_grey,
        )

        # resize y tick labels
        y_tick_labels = ax.get_yticklabels()
        if len(y_tick_labels) > 0:
            ax.set_yticklabels(
                y_tick_labels,
                rotation=0,
                fontsize=tick_size,
                color=style.style_grey,
            )

        # resize x tick labels
        x_tick_labels = ax.get_xticklabels()
        if len(x_tick_labels) > 0:
            ax.set_xticklabels(
                x_tick_labels,
                rotation=0,
                fontsize=tick_size,
                color=style.style_grey,
            )

        if ax.texts:
            # the first text artist holds the right-hand margin title; redraw it with the
            # project styling and then drop the original
            txt = ax.texts[0]
            ax.text(
                txt.get_unitless_position()[0],
                txt.get_unitless_position()[1],
                txt.get_text(),
                transform=ax.transAxes,
                va="center",
                fontsize=label_size,
                color=style.style_grey,
                rotation=-90,
            )
            txt.remove()

    ## create custom legend
    # a legend only makes sense when the histograms are split by a category
    if split is None:
        return

    # create labels
    if legend_labels is None:
        legend_labels = (
            df[df[split].notnull()][split].sort_values().drop_duplicates().values.tolist()
        )
    else:
        legend_labels = np.array(legend_labels)

    # generate colors and pair each label with its color
    color_list = style.color_gen(color_map, num=len(legend_labels))
    label_color = {lbl: color_list[ix] for ix, lbl in enumerate(legend_labels)}

    # create legend Patches
    patches = [Patch(color=v, label=k, alpha=alpha) for k, v in label_color.items()]

    # draw legend
    leg = plt.legend(
        handles=patches,
        fontsize=1.0 * self.chart_scale,
        loc="upper right",
        markerscale=0.5 * self.chart_scale,
        ncol=1,
        bbox_to_anchor=bbox,
    )

    # label font color
    for text in leg.get_texts():
        plt.setp(text, color="grey")
def eda_num_target_cat_feat(self, feature, level_count_cap=50, color_map="viridis", chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a category
            feature in the context of a numeric target. Displays a feature summary table and a
            feature vs. target summary table, followed by a tree map of level counts, a bar
            chart of the level distribution and a box plot of the target by level.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            level_count_cap : int, default=50
                Maximum number of unique levels in feature. If the number of levels reaches the
                cap then the feature is skipped. A feature with no non-null values is skipped
                as well.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """

    def _int_if_lossless(values):
        # cast a numeric Series to int64 only when no level would be truncated by the cast
        if is_numeric_dtype(values) and (values.astype(float) % 1 == 0).all():
            return values.astype("int64")
        return values

    def _hide_alternate_tick_labels(axis):
        # hide every other x-axis tick label to reduce crowding
        for ix, tick_label in enumerate(axis.xaxis.get_ticklabels()):
            if ix % 2 != 0:
                tick_label.set_visible(False)

    # capture unique non-null levels and the count of each level
    non_null = self.data[self.data[feature].notnull()][feature]
    unique_vals, unique_counts = np.unique(non_null, return_counts=True)

    # skip the feature if it has no usable values or too many levels
    if len(unique_vals) == 0 or len(unique_vals) >= level_count_cap:
        return

    ### data summaries
    ## feature summary
    # build the summary in one step rather than appending row by row
    uni_summ_df = pd.DataFrame({
        feature: unique_vals,
        "Count": unique_counts,
        "Proportion": unique_counts / unique_counts.sum() * 100,
    })

    # sort DataFrame by "Proportion", descending
    uni_summ_df = uni_summ_df.sort_values(by=["Proportion"], ascending=False)

    # set values to int dtype where applicable to optimize
    uni_summ_df[feature] = _int_if_lossless(uni_summ_df[feature])
    uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")

    ## feature vs. target summary
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls; copy so later column assignments act on an independent frame
    bi_df = bi_df[bi_df[feature].notnull()].copy()

    # cast target as float
    bi_df[self.target.name] = bi_df[self.target.name].astype(float)

    # create pivot table of target summary statistics, grouping by category feature
    bi_summ_piv_df = pd.pivot_table(
        bi_df,
        index=feature,
        aggfunc={
            self.target.name: [np.nanmin, np.nanmax, np.nanmean, np.nanmedian, np.nanstd]
        },
    )

    # flatten the (target, statistic) column pairs down to the statistic name
    bi_summ_piv_df.columns = pd.Index([col[1] for col in bi_summ_piv_df.columns.tolist()])
    bi_summ_piv_df.reset_index(inplace=True)
    bi_summ_piv_df = bi_summ_piv_df.rename(
        columns={
            "nanmin": "Min",
            "nanmax": "Max",
            "nanmean": "Mean",
            "nanmedian": "Median",
            "nanstd": "StdDev",
        })

    # fill nan's with zero
    fill_columns = bi_summ_piv_df.iloc[:, 1:].columns
    bi_summ_piv_df[fill_columns] = bi_summ_piv_df[fill_columns].fillna(0)

    # reorder columns
    bi_summ_piv_df = bi_summ_piv_df[[feature, "Mean", "Median", "StdDev", "Min", "Max"]].copy()

    # convert levels to int where that does not alter them
    bi_summ_piv_df[feature] = _int_if_lossless(bi_summ_piv_df[feature])

    # display summary tables
    self.df_side_by_side(
        dfs=(uni_summ_df, bi_summ_piv_df),
        names=["Feature summary", "Feature vs. target summary"],
    )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Category counts\n* {}".format(feature), position=131, title_scale=1.0)

    # add treemap to canvas
    p.tree_map(
        counts=uni_summ_df["Count"].values,
        labels=uni_summ_df[feature].values,
        colors=style.color_gen(name=color_map, num=len(uni_summ_df[feature].values)),
        alpha=0.8,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Feature distribution\n* {}".format(feature), position=132)

    # resort levels numerically when every level can be read as an integer
    try:
        int_keys = [int(val) for val in unique_vals]
    except (ValueError, TypeError):
        # levels are not integer-like; keep the order produced by np.unique
        pass
    else:
        # a single stable argsort reorders levels and counts together
        order = sorted(range(len(int_keys)), key=int_keys.__getitem__)
        unique_vals = unique_vals[order]
        unique_counts = unique_counts[order]

        # cast the feature to int so the box plot sorts levels numerically
        bi_df[feature] = bi_df[feature].astype(int)

    # represent x-axis tick labels as integers rather than floats
    x_values = list(map(str, unique_vals.tolist()))
    try:
        x_values = [int(float(x)) for x in x_values]
    except (ValueError, OverflowError):
        pass

    # dynamically set rotation angle based on number unique values and the average length
    # of the category labels as they will be displayed
    len_unique_val = len(unique_vals)
    avg_len_unique_val = sum(len(str(x)) for x in x_values) / len_unique_val
    if len_unique_val <= 4 and avg_len_unique_val <= 12:
        rotation = 0
    elif len_unique_val >= 5 and len_unique_val <= 8 and avg_len_unique_val <= 7.0:
        rotation = 0
    elif len_unique_val >= 9 and len_unique_val <= 14 and avg_len_unique_val <= 6:
        rotation = 0
    else:
        rotation = 30

    # add bar chart to canvas
    p.bar_v(
        x=x_values,
        counts=unique_counts,
        label_rotate=rotation,
        color=style.style_grey,
        y_units="f",
        x_tick_wrap=True,
        ax=ax,
    )

    # hide every other label if total number of levels is greater than 40
    if len_unique_val > 40:
        _hide_alternate_tick_labels(ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Boxplot by category\n* {}".format(feature), position=133)

    ## dynamically determine precision of y-units
    # capture min and max target values
    dist_min = bi_df[self.target.name].values.min()
    dist_max = bi_df[self.target.name].values.max()

    # a zero minimum would otherwise trigger a divide-by-zero warning; treating the ratio as
    # infinite selects the same coarse precision the unguarded division resolved to
    range_ratio = dist_max / dist_min if dist_min != 0 else float("inf")

    # determine y-units precision based on min and max values in target
    if -3 < dist_min < 3 and -3 < dist_max < 3 and range_ratio < 10:
        y_units = "fff"
    elif -30 < dist_min < 30 and -30 < dist_max < 30 and range_ratio < 3:
        y_units = "fff"
    elif -5 < dist_min < 5 and -5 < dist_max < 5 and range_ratio < 10:
        y_units = "ff"
    elif -90 < dist_min < 90 and -90 < dist_max < 90 and range_ratio < 5:
        y_units = "ff"
    else:
        y_units = "f"

    # add vertical box plot to canvas
    p.box_plot_v(
        x=feature,
        y=self.target.name,
        data=bi_df.sort_values([feature]),
        color=plt.get_cmap(name=color_map),
        label_rotate=rotation,
        y_units=y_units,
        ax=ax,
    )

    # hide every other label if total number of levels is greater than 40
    if len_unique_val > 40:
        _hide_alternate_tick_labels(ax)

    plt.show()
def multi_line(self, x, y, label=None, df=None, linecolor=None, linestyle=None, bbox=(1.2, 0.9), x_units="f",
               x_ticks=None, y_units="f", y_ticks=None, marker_on=False, plot_buffer=False, axis_limits=False,
               color_map="viridis", ax=None):
    """
    Documentation:

        ---
        Description:
            Create single plot with multiple lines. Capable of adjusting which axis will have
            the same data for each line and which will have different data for each line.

        ---
        Parameters:
            x : array or string
                Either 1-dimensional array of values, a multidimensional array of values, a list
                of columns in a Pandas DataFrame, or a column name in a Pandas DataFrame. When df
                is provided and x does not name any column of df (e.g. x=None), the index of df
                is used for the x-axis.
            y : array or string
                Either 1-dimensional array of values, a multidimensional array of values, a list
                of columns in a Pandas DataFrame, or a column name in a Pandas DataFrame.
            label : list of strings : default=None
                Custom legend label for each line.
            df : Pandas DataFrame, default=None
                Pandas DataFrame containing data to plot. Can be any size, as plotted columns
                will be chosen by columns names specified in x and y parameters.
            linecolor : str, default=None
                Line colors. If None, utilizes color_map
            linestyle : str, default=None
                Line style.
            bbox : tuple, default=(1.2, 0.9)
                Coordinates for determining legend position.
            x_units : str, default='f'
                Determines unit of measurement for x-axis tick labels. 's' displays string. 'f'
                displays float. 'p' displays percentages, 'd' displays dollars. Repeat character
                (e.g 'ff' or 'ddd') for additional decimal places.
            x_ticks : array, default=None
                Custom x-tick labels.
            y_units : str, default='f'
                Determines unit of measurement for y-axis tick labels. 's' displays string. 'f'
                displays float. 'p' displays percentages, 'd' displays dollars. Repeat character
                (e.g 'ff' or 'ddd') for additional decimal places. When 'p' is present the y
                values are multiplied by 100 before plotting.
            y_ticks : array, default=None
                Custom y-tick labels.
            marker_on : bool, default=False
                Controls whether to show line with markers for each data element.
            plot_buffer : bool, default=False
                Controls whether dynamic plot buffer function is executed to ensure visual
                elements are not cut-off at the figure borders.
            axis_limits : bool, default=False
                Controls whether dynamic axis limit setting function is executed.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            ax : axes object, default=None
                Axis object for the visualization.
    """
    if ax is None:
        ax = self.ax

    if df is not None:
        # use the requested column(s) for the x-axis. a DataFrame index is always an Index
        # instance, so testing for that can never select the column; instead fall back to
        # the index only when x does not name a column
        try:
            x = df[x].values
        except (KeyError, TypeError, ValueError):
            x = df.index.values
        y = df[y].values
    else:
        # convert array-like input (list, tuple, Series) to arrays
        x = np.asarray(x)
        y = np.asarray(y)

    # treat 1-dimensional input as a single column so each line is one column of y
    x = x.reshape(-1, 1) if len(x.shape) == 1 else x
    y = y.reshape(-1, 1) if len(y.shape) == 1 else y

    # generate color list
    color_list = style.color_gen(name=color_map, num=y.shape[1])

    # percent units are plotted on a 0-100 scale
    as_percent = "p" in y_units

    # add multiple lines to plot (drawn on the current pyplot axes)
    for ix in range(y.shape[1]):
        y_col = y[:, ix]
        plt.plot(
            x,
            y_col * 100 if as_percent else y_col,
            color=linecolor if linecolor is not None else color_list[ix],
            linestyle=linestyle if linestyle is not None else style.style_line_style[0],
            linewidth=0.247 * self.chart_scale,
            label=label[ix] if label is not None else None,
            marker="." if marker_on else None,
            markersize=17 if marker_on else None,
            markerfacecolor="w" if marker_on else None,
            markeredgewidth=2.2 if marker_on else None,
        )

    # add legend to figure
    if label is not None:
        plt.legend(
            loc="upper right",
            bbox_to_anchor=bbox,
            ncol=1,
            frameon=True,
            fontsize=1.1 * self.chart_scale,
        )

    # optionally set axis lower / upper limits
    # NOTE(review): limits are derived from the unscaled y even when percent scaling is
    # applied to the plotted values - confirm this is intended
    if axis_limits:
        x_min, x_max, y_min, y_max = util.util_set_axes(x=x, y=y)
        plt.axis([x_min, x_max, y_min, y_max])

    # optionally create smaller buffer around plot area to prevent cutting off elements
    if plot_buffer:
        util.util_plot_buffer(ax=ax, x=0.02, y=0.02)

    # optionally creates custom x-tick labels
    if x_ticks is not None:
        ax.set_xticks(x_ticks)

    # optionally creates custom y-tick labels
    if y_ticks is not None:
        ax.set_yticks(y_ticks)

    # restyle the existing tick labels. the label lists are passed through unchanged:
    # multiplying a list of labels repeats it rather than scaling any values, and the
    # percent scaling has already been applied to the plotted data
    ax.set_yticklabels(
        ax.get_yticklabels(),
        rotation=0,
        fontsize=1.1 * self.chart_scale,
        color=style.style_grey,
    )
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=0,
        fontsize=1.1 * self.chart_scale,
        color=style.style_grey,
    )

    # axis tick label formatting
    util.util_label_formatter(ax=ax, x_units=x_units, y_units=y_units)