def eda_transform_log1(self, data, name, chart_scale=15):
    """
    Documentation:

    ---
    Description:
        Creates a two_panel visualization. The left plot is the log + 1 transformed
        distribution overlayed on a normal distribution. The right plot is a log + 1
        adjusted qqplot overlayed across a straight line.

    ---
    Parameters:
        data : Pandas Series
            Target variable data object.
        name : str
            Name of target variable.
        chart_scale : int or float, default=15
            Controls size and proportions of chart and chart elements. Higher value
            creates larger plots and increases visual elements proportionally.
    """
    # apply the log + 1 transformation once and reuse it in both panels
    log_adjusted = np.log1p(data)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale)

    # left panel: distribution / kernel density estimate of the transformed data,
    # overlayed with a fitted normal distribution
    ax = p.make_canvas(
        title="dist/kde - {} (log+1)".format(name),
        x_label="",
        y_label="",
        y_shift=0.8,
        position=223,
    )
    p.dist_plot(
        log_adjusted,
        color=style.style_grey,
        fit=stats.norm,
        x_rotate=True,
        ax=ax,
    )

    # suppress tick labels on the current axes
    plt.xticks([])
    plt.yticks([])

    # right panel: QQ / probability plot of the transformed data
    ax = p.make_canvas(
        title="probability plot - {} (log+1)".format(name),
        x_label="",
        y_label="",
        y_shift=0.8,
        position=224,
    )
    p.prob_plot(log_adjusted, plot=ax)

    # suppress tick labels on the current axes
    plt.xticks([])
    plt.yticks([])
def eda_num_target_num_feat(self, feature, color_map="viridis", chart_scale=15):
    """
    Documentation:

    ---
    Description:
        Produces exploratory data visualizations and statistical summaries for a numeric
        feature in the context of a numeric target.

    ---
    Parameters:
        feature : str
            Feature to visualize.
        color_map : str specifying built-in matplotlib colormap, default="viridis"
            Color map applied to plots.
            NOTE(review): currently accepted for interface consistency but not used
            anywhere in this method.
        chart_scale : int or float, default=15
            Controls size and proportions of chart and chart elements. Higher value
            creates larger plots and increases visual elements proportionally.
    """
    ### data summaries
    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[self.target.name] = bi_df[self.target.name].astype(float)

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df
    describe_df = describe_df.append(
        {
            "index": "skew",
            feature: stats.skew(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Feature distribution\n* {}".format(feature),
        position=131,
        title_scale=1.2,
    )

    # the max feature value drives every precision/rotation decision below;
    # compute it once instead of re-evaluating np.nanmax for each comparison
    feature_max = np.nanmax(bi_df[feature].values)

    # determine tick label precision from the magnitude of the max value; the
    # x and y thresholds are identical, so one calculation serves both axes
    if -1 <= feature_max <= 1:
        x_units = y_units = "fff"
    elif -10 <= feature_max <= 10:
        x_units = y_units = "ff"
    else:
        x_units = y_units = "f"

    # rotate x-axis tick labels only for large-magnitude values
    x_rotate = 0 if -10000 < feature_max < 10000 else 45

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Probability plot\n* {}".format(feature), position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=bi_df[feature].values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Regression plot - feature vs. target\n* {}".format(feature),
        position=133,
        title_scale=1.5,
    )

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=self.target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )
    plt.show()
def eda_cat_target_num_feat(self, feature, color_map="viridis", outliers_out_of_scope=None, legend_labels=None, chart_scale=15):
    """
    Documentation:

    ---
    Description:
        Creates exploratory data visualizations and statistical summaries for a numeric
        feature in the context of a categorical target.

    ---
    Parameters:
        feature : str
            Feature to visualize.
        color_map : str specifying built-in matplotlib colormap, default="viridis"
            Color map applied to plots.
        outliers_out_of_scope : boolean, float or int, default=None
            Truncates the x-axis upper limit so that outliers are out of scope of the
            visualization. The x-axis upper limit is reset to the maximum non-outlier
            value. To identify outliers, the IQR is calculated, and values that are
            below the first quartile minus the IQR, or above the third quartile plus
            the IQR, are designated as outliers. If True is passed as a value, the IQR
            that is subtracted/added is multiplied by 5. If a float or int is passed,
            the IQR is multiplied by that value. Higher values increase how extreme
            values need to be to be identified as outliers. False or None disables
            truncation.
        legend_labels : list, default=None
            Class labels displayed in plot legend.
        chart_scale : int or float, default=15
            Controls size and proportions of chart and chart elements. Higher value
            creates larger plots and increases visual elements proportionally.
    """
    def x_units_precision(values):
        # choose x-axis tick label precision based on the magnitude and spread
        # of the feature values (previously duplicated verbatim three times)
        dist_min = values.min()
        dist_max = values.max()

        # np.inf fails every "< threshold" comparison below, which matches the
        # original inf result of dividing by zero, without the runtime warning
        ratio = dist_max / dist_min if dist_min != 0 else np.inf

        if -3 < dist_min < 3 and -3 < dist_max < 3 and ratio < 10:
            return "fff"
        elif -30 < dist_min < 30 and -30 < dist_max < 30 and ratio < 3:
            return "fff"
        elif -5 < dist_min < 5 and -5 < dist_max < 5 and ratio < 10:
            return "ff"
        elif -90 < dist_min < 90 and -90 < dist_max < 90 and ratio < 5:
            return "ff"
        else:
            return "f"

    ### data summaries
    ## bivariate roll_up table
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # bivariate summary statistics
    bi_summ_stats_df = pd.DataFrame(
        columns=["Class", "Count", "Proportion", "Mean", "StdDev"]
    )

    # for each unique class label
    for labl in np.unique(self.target):
        # get feature values associated with single class label
        feature_slice = bi_df[bi_df[self.target.name] == labl][feature]

        # append summary statistics for feature values associated with class label
        bi_summ_stats_df = bi_summ_stats_df.append(
            {
                "Class": labl,
                "Count": len(feature_slice),
                "Proportion": len(feature_slice) / len(bi_df[feature]) * 100,
                "Mean": np.mean(feature_slice),
                "StdDev": np.std(feature_slice),
            },
            ignore_index=True,
        )

    # apply custom legend labels, or set dtype to int if column values are numeric
    if legend_labels is not None:
        bi_summ_stats_df["Class"] = legend_labels
    elif is_numeric_dtype(bi_summ_stats_df["Class"]):
        # np.int was removed in NumPy 1.24; the builtin int is the documented replacement
        bi_summ_stats_df["Class"] = bi_summ_stats_df["Class"].astype(int)

    ## feature summary
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add missing count
    describe_df = describe_df.append(
        {
            "index": "missing",
            feature: np.round(self.data.shape[0] - bi_df[feature].shape[0], 5),
        },
        ignore_index=True,
    )

    # add skew
    describe_df = describe_df.append(
        {
            "index": "skew",
            feature: np.round(stats.skew(bi_df[feature].values, nan_policy="omit"), 5),
        },
        ignore_index=True,
    )

    # add kurtosis
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # execute z-test or t-test when the target is binary
    if len(np.unique(self.target)) == 2:
        s1 = bi_df[
            (bi_df[self.target.name] == bi_df[self.target.name].unique()[0])
        ][feature]
        s2 = bi_df[
            (bi_df[self.target.name] == bi_df[self.target.name].unique()[1])
        ][feature]

        if len(s1) > 30 and len(s2) > 30:
            # perform z-test, return z-statistic and p-value
            z, p_val = ztest(s1, s2)

            # add z-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{"z-test statistic": z, "p-value": p_val}],
                columns=["z-test statistic", "p-value"],
                index=[feature],
            ).round(4)
        else:
            # perform t-test, return t-score and p-value
            t, p_val = stats.ttest_ind(s1, s2)

            # add t-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{"t-test statistic": t, "p-value": p_val}],
                columns=["t-test statistic", "p-value"],
                index=[feature],
            ).round(4)

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df, stat_test_df),
            names=["Feature summary", "Feature vs. target summary", "Statistical test"],
        )
    else:
        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df),
            names=["Feature summary", "Feature vs. target summary"],
        )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # default to no x-axis truncation; limits are only set when outliers are identified
    x_axis_min = x_axis_max = None

    # if a boolean is passed to outliers_out_of_scope
    if isinstance(outliers_out_of_scope, bool):
        if outliers_out_of_scope:
            # identify outliers using IQR method and an IQR step of 5
            outliers = self.outlier_IQR(self.data[feature], iqr_step=5)

            # reset x-axis minimum and maximum
            x_axis_min = self.data[feature].drop(index=outliers).min()
            x_axis_max = self.data[feature].drop(index=outliers).max()

    # if outliers_out_of_scope is a float or int
    elif isinstance(outliers_out_of_scope, (float, int)):
        # identify outliers using IQR method and an IQR step equal to the float/int passed
        outliers = self.outlier_IQR(self.data[feature], iqr_step=outliers_out_of_scope)

        # reset x-axis minimum and maximum
        x_axis_min = self.data[feature].drop(index=outliers).min()
        x_axis_max = self.data[feature].drop(index=outliers).max()

    # determine x-units precision once; every panel below plots the same feature values
    x_units = x_units_precision(bi_df[feature].values)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Feature distribution\n* {}".format(feature),
        title_scale=0.85,
        position=221,
    )

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units="f",
        x_units=x_units,
        ax=ax,
    )

    # optionally reset x-axis limits
    # (guard on the computed limits rather than the raw argument: previously,
    # passing outliers_out_of_scope=False raised a NameError here)
    if x_axis_min is not None:
        plt.xlim(x_axis_min, x_axis_max)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Probability plot\n* {}".format(feature),
        title_scale=0.85,
        position=222,
    )

    # add QQ / probability plot to canvas
    p.prob_plot(
        x=bi_df[feature].values,
        plot=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Distribution by class\n* {}".format(feature),
        title_scale=0.85,
        position=223,
    )

    # generate color list
    color_list = style.color_gen(name=color_map, num=len(np.unique(self.target)))

    # add one distribution plot to canvas for each target class
    for ix, labl in enumerate(np.unique(bi_df[self.target.name].values)):
        p.dist_plot(
            bi_df[bi_df[self.target.name] == labl][feature].values,
            color=color_list[ix],
            y_units="f",
            x_units=x_units,
            legend_labels=legend_labels
            if legend_labels is not None
            else np.arange(len(np.unique(self.target))),
            alpha=0.4,
            bbox=(1.0, 1.0),
            ax=ax,
        )

    # optionally reset x-axis limits
    if x_axis_min is not None:
        plt.xlim(x_axis_min, x_axis_max)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Boxplot by class\n* {}".format(feature),
        title_scale=0.85,
        position=224,
    )

    # add horizontal box plot to canvas
    p.box_plot_h(
        x=feature,
        y=self.target.name,
        data=bi_df,
        alpha=0.7,
        x_units=x_units,
        legend_labels=legend_labels,
        bbox=(1.2, 1.0),
        suppress_outliers=True,
        ax=ax,
    )

    # optionally reset x-axis limits, padding the lower bound by 10%
    if x_axis_min is not None:
        plt.xlim(x_axis_min - (x_axis_min * 0.1), x_axis_max)

    # apply position adjustment to subplots
    plt.subplots_adjust(bottom=-0.1)
    plt.show()
def regression_panel(self, model, X_train, y_train, X_valid=None, y_valid=None, n_folds=None, title_scale=1.0, color_map="viridis", random_state=1, chart_scale=15):
    """
    Documentation:

    ---
    Description:
        Creates a set of residual plots and pandas DataFrames, where each row captures
        various summary statistics pertaining to a model's performance. Generates
        residual plots and captures performance data for training and validation
        datasets. If no validation set is provided, then cross-validation is performed
        on the training dataset.

    ---
    Parameters:
        model : model object
            Instantiated model object.
        X_train : Pandas DataFrame
            Training data observations.
        y_train : Pandas Series
            Training target data.
        X_valid : Pandas DataFrame, default=None
            Validation data observations.
        y_valid : Pandas Series, default=None
            Validation target data.
        n_folds : int, default=None
            Number of cross-validation folds to use. If validation data is provided
            through X_valid/y_valid, n_folds is ignored.
        title_scale : float, default=1.0
            Controls the scaling up (higher value) and scaling down (lower value) of
            the size of the main chart title, the x_axis title and the y_axis title.
        color_map : str specifying built-in matplotlib colormap, default="viridis"
            Color map applied to plots.
        random_state : int, default=1
            Random number seed.
        chart_scale : int or float, default=15
            Controls size and proportions of chart and chart elements. Higher value
            creates larger plots and increases visual elements proportionally.
    """
    print("*" * 55)
    print("* Estimator: {}".format(model.estimator_name))
    print("* Parameter set: {}".format(model.model_iter))
    print("*" * 55)

    print("\n" + "*" * 55)
    print("Training data evaluation")

    # fit model on training data
    model.fit(X_train.values, y_train.values)

    ## training dataset
    # generate predictions using training data and calculate residuals
    y_pred = model.predict(X_train.values)
    residuals = y_pred - y_train.values

    # create prettierplot object
    p = PrettierPlot(plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Residual plot - training data\nModel: {}\nParameter set: {}".format(
            model.estimator_name,
            model.model_iter,
        ),
        x_label="Predicted values",
        y_label="Residuals",
        y_shift=0.55,
        title_scale=title_scale,
        position=121,
    )

    # dynamically size precision of x-units based on magnitude of maximum
    # predicted value
    if -1 <= np.nanmax(y_pred) <= 1:
        x_units = "fff"
    elif -100 <= np.nanmax(y_pred) <= 100:
        x_units = "ff"
    else:
        x_units = "f"

    # dynamically size precision of y-units based on magnitude of maximum
    # residual value
    if -0.1 <= np.nanmax(residuals) <= 0.1:
        y_units = "ffff"
    elif -1 <= np.nanmax(residuals) <= 1:
        y_units = "fff"
    elif -10 <= np.nanmax(residuals) <= 10:
        y_units = "ff"
    else:
        y_units = "f"

    # add 2-dimensional scatter plot to canvas
    p.scatter_2d(
        x=y_pred,
        y=residuals,
        size=7,
        color=style.style_grey,
        y_units=y_units,
        x_units=x_units,
        ax=ax,
    )

    # plot horizontal line at y=0
    plt.hlines(
        y=0, xmin=np.min(y_pred), xmax=np.max(y_pred), color=style.style_grey, lw=2
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Residual distribution - training data\nModel: {}\nParameter set: {}".format(
            model.estimator_name,
            model.model_iter,
        ),
        title_scale=title_scale,
        position=122,
    )

    # add distribution plot to canvas
    p.dist_plot(
        residuals,
        fit=stats.norm,
        color=style.style_grey,
        y_units="ff",
        x_units="fff",
        ax=ax,
    )
    plt.show()

    # generate regression_stats using training data and predictions
    results = self.regression_stats(
        model=model,
        y_true=y_train.values,
        y_pred=y_pred,
        feature_count=X_train.shape[1],
    )

    # create shell results DataFrame and append
    regression_results_summary = pd.DataFrame(columns=list(results.keys()))
    regression_results_summary = regression_results_summary.append(
        results, ignore_index=True
    )

    ## validation dataset
    # if validation data is provided...
    if X_valid is not None:
        print("\n" + "*" * 55)
        print("Validation data evaluation")

        # generate predictions with validation data and calculate residuals
        # (bug fix: this branch previously re-used the TRAINING data and labels
        # despite tagging the resulting summary row as data_type="validation")
        y_pred = model.predict(X_valid.values)
        residuals = y_pred - y_valid.values

        # create prettierplot object
        p = PrettierPlot(plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Residual plot - validation data\nModel: {}\nParameter set: {}".format(
                model.estimator_name,
                model.model_iter,
            ),
            x_label="Predicted values",
            y_label="Residuals",
            y_shift=0.55,
            title_scale=title_scale,
            position=121,
        )

        # add 2-dimensional scatter plot to canvas
        # (x/y unit precision carries over from the training panel)
        p.scatter_2d(
            x=y_pred,
            y=residuals,
            size=7,
            color=style.style_grey,
            y_units=y_units,
            x_units=x_units,
            ax=ax,
        )

        # plot horizontal line at y=0
        plt.hlines(
            y=0, xmin=np.min(y_pred), xmax=np.max(y_pred), color=style.style_grey, lw=2
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Residual distribution - validation data\nModel: {}\nParameter set: {}".format(
                model.estimator_name,
                model.model_iter,
            ),
            title_scale=title_scale,
            position=122,
        )

        # add distribution plot to canvas
        p.dist_plot(
            residuals,
            fit=stats.norm,
            color=style.style_grey,
            y_units="ff",
            x_units="fff",
            ax=ax,
        )
        plt.show()

        # generate regression_stats using validation data and predictions
        results = self.regression_stats(
            model=model,
            y_true=y_valid.values,
            y_pred=y_pred,
            feature_count=X_valid.shape[1],
            data_type="validation",
        )

        # append results to regression_results_summary
        regression_results_summary = regression_results_summary.append(
            results, ignore_index=True
        )
        display(regression_results_summary)

    # if n_folds is provided, perform cross-validation on the training data
    elif isinstance(n_folds, int):
        # generate cross-validation indices
        cv = list(
            KFold(
                n_splits=n_folds, shuffle=True, random_state=random_state
            ).split(X_train, y_train)
        )

        print("\n" + "*" * 55)
        print("Cross validation evaluation")

        # iterate through cross-validation indices
        for i, (train_ix, valid_ix) in enumerate(cv):
            X_train_cv = X_train.iloc[train_ix]
            y_train_cv = y_train.iloc[train_ix]
            X_valid_cv = X_train.iloc[valid_ix]
            y_valid_cv = y_train.iloc[valid_ix]

            # fit model on training fold and generate predictions on holdout fold
            y_pred = model.fit(X_train_cv.values, y_train_cv.values).predict(
                X_valid_cv.values
            )

            # calculate residuals
            residuals = y_pred - y_valid_cv.values

            # create prettierplot object
            p = PrettierPlot(plot_orientation="wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Residual plot - CV fold {}\nModel: {}\nParameter set: {}".format(
                    i + 1,
                    model.estimator_name,
                    model.model_iter,
                ),
                x_label="Predicted values",
                y_label="Residuals",
                y_shift=0.55,
                position=121,
                title_scale=title_scale,
            )

            # add 2-dimensional scatter plot to canvas
            p.scatter_2d(
                x=y_pred,
                y=residuals,
                size=7,
                color=style.style_grey,
                y_units=y_units,
                x_units=x_units,
                ax=ax,
            )

            # plot horizontal line at y=0
            plt.hlines(
                y=0,
                xmin=np.min(y_pred),
                xmax=np.max(y_pred),
                color=style.style_grey,
                lw=2,
            )

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Residual distribution - CV fold {}\nModel: {}\nParameter set: {}".format(
                    i + 1,
                    model.estimator_name,
                    model.model_iter,
                ),
                title_scale=title_scale,
                position=122,
            )

            # add distribution plot to canvas
            p.dist_plot(
                residuals,
                fit=stats.norm,
                color=style.style_grey,
                y_units="ff",
                x_units="fff",
                ax=ax,
            )
            plt.show()

            # generate regression_stats using holdout observations and predictions
            results = self.regression_stats(
                model=model,
                y_true=y_valid_cv,
                y_pred=y_pred,
                feature_count=X_valid_cv.shape[1],
                data_type="validation",
                fold=i + 1,
            )

            # append results to regression_results_summary
            regression_results_summary = regression_results_summary.append(
                results, ignore_index=True
            )

        print("\n" + "*" * 55)
        print("Summary")
        display(regression_results_summary)
    else:
        display(regression_results_summary)
def eda_num_target_num_feat(self, feature, training_data=True, color_map="viridis", chart_scale=15, save_plots=False):
    """
    Documentation:

    ---
    Description:
        Produces exploratory data visualizations and statistical summaries for a numeric
        feature in the context of a numeric target.

    ---
    Parameters:
        feature : str
            Feature to visualize.
        training_data : boolean, default=True
            Controls which dataset (training or validation) is used for visualization.
        color_map : str specifying built-in matplotlib colormap, default="viridis"
            Color map applied to plots.
            NOTE(review): currently accepted for interface consistency but not used
            anywhere in this method.
        chart_scale : int or float, default=15
            Controls size and proportions of chart and chart elements. Higher value
            creates larger plots and increases visual elements proportionally.
        save_plots : boolean, default=False
            Controls whether model loss plot images are saved to the experiment directory.
    """
    # dynamically choose training data objects or validation data objects
    data, target, mlm_dtypes = self.training_or_validation_dataset(training_data)

    ### data summaries
    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([data[feature], target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[target.name] = bi_df[target.name].astype(float)

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df
    describe_df = describe_df.append(
        {
            "index": "skew",
            feature: stats.skew(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"Feature distribution\n* {feature}",
        position=131,
        title_scale=1.2,
    )

    # the max feature value drives every precision/rotation decision below;
    # compute it once instead of re-evaluating np.nanmax for each comparison
    feature_max = np.nanmax(bi_df[feature].values)

    # determine tick label precision from the magnitude of the max value; the
    # x and y thresholds are identical, so one calculation serves both axes
    if -1 <= feature_max <= 1:
        x_units = y_units = "fff"
    elif -10 <= feature_max <= 10:
        x_units = y_units = "ff"
    else:
        x_units = y_units = "f"

    # rotate x-axis tick labels only for large-magnitude values
    x_rotate = 0 if -10000 < feature_max < 10000 else 45

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Probability plot\n* {feature}", position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=bi_df[feature].values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"Regression plot - feature vs. target\n* {feature}",
        position=133,
        title_scale=1.5,
    )

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )

    # save plots or show
    if save_plots:
        plot_path = os.path.join(
            self.eda_object_dir,
            f"{feature}.jpg".replace("/", ""),
        )
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()
    else:
        plt.show()