def feature_correlation_plot(df, y_column, model, feature_column, columns_to_exclude=()):
    """Plot the correlation between a feature and the actual label.

    This function detects the feature type and plots the correlation information
    between feature and actual label. The correlation is defined as the fraction
    of data that has a positive true label (a.k.a. average of true label for
    binary label). It also plots the predicted positive class probability.

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Name of the class column

    model : Scikitlearn-like-model
        The model object to be evaluated

    feature_column : str, or 1d array-like
        Name of the feature column to plot correlation on. If passed in as
        1d array-like, the features will be treated as one-hot encoded.

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    Returns
    -------
    plot_wrapper : pytalite.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot
    """
    # Get X, y array representation and feature indices of data
    X, y, name_to_idx = df_to_arrays(df, y_column, columns_to_exclude, return_index=True)

    # isinstance (not an exact type comparison) so that str subclasses such as
    # numpy.str_ are handled as one column name instead of falling through to
    # the one-hot branch, where they would be iterated character by character.
    if isinstance(feature_column, str):
        feature_idx = name_to_idx[feature_column]
        feature_values = X[:, feature_idx]

        # Determine the feature type and plot accordingly
        if _feature_type(feature_values) == "categorical":
            return _categorical_fc_plot(X, y, model, feature_idx, feature_column)
        return _numerical_fc_plot(X, y, model, feature_idx, feature_column)

    # One-hot features: resolve every category column to its index
    feature_idx = np.array([name_to_idx[cat] for cat in feature_column])
    return _categorical_fc_plot(X, y, model, feature_idx, feature_column, one_hot=True)
def _feature_importance(df, y_column, model, n_jobs, columns_to_exclude=(), n_samples=100):
    """Compute all feature importances, optionally using multiprocessing.

    ``_sample_feature_importance`` is evaluated once per sample index in
    ``range(n_samples)``; each call receives ``sample_size = n_rows // n_samples``.
    The collected statistics are min-max normalized over the whole result
    array and summarized per feature with ``_comp_mean_ci``.

    Parameters
    ----------
    df : DataFrame
        Data to compute the importances on

    y_column : str
        Name of the class column

    model : Scikitlearn-like-model
        The model object to be evaluated

    n_jobs : int or None
        Number of worker processes. 1 runs everything in the current process;
        a negative value (or None) is handed to ``Pool`` as None so that the
        pool picks its own default size.

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    n_samples : int, optional (default=100)
        Number of samples to evaluate

    Returns
    -------
    importance : list of (str, object)
        ``(feature name, _comp_mean_ci result)`` pairs, sorted in descending
        order of the first element of the ``_comp_mean_ci`` result
        (presumably the mean -- confirm against ``_comp_mean_ci``).
    """
    X, y, name_to_idx = df_to_arrays(df, y_column, columns_to_exclude, return_index=True)

    # Negative means "no explicit limit"; guard against None so the comparison
    # itself cannot raise TypeError.
    if n_jobs is not None and n_jobs < 0:
        n_jobs = None

    sample_importance_func = partial(_sample_feature_importance, X=X, y_true=y, model=model,
                                     sample_size=X.shape[0] // n_samples)

    if n_jobs == 1:
        sys.stderr.write("Going single process")
        stats = [sample_importance_func(sample_idx) for sample_idx in range(n_samples)]
    else:
        with Pool(n_jobs) as executor_instance:
            # NOTE: relies on the pool's private ``_pool`` attribute to learn
            # how many workers were actually started.
            n_processes = len(executor_instance._pool)

            # Ceiling division: every worker receives at most one chunk
            chunksize, extra = divmod(n_samples, n_processes)
            if extra:
                chunksize += 1

            sys.stderr.write("Start Multiprocessing, num_processes=%d" % n_processes)
            stats = list(executor_instance.map(sample_importance_func, range(n_samples), chunksize))

    # Sanity check against the requested sample count (was hard-coded to 100,
    # which broke every call with a non-default n_samples).
    assert len(stats) == n_samples

    stats = np.array(stats)
    # Min-max normalization over all samples and features at once
    normalized = (stats - np.min(stats)) / np.ptp(stats)
    importance = [(name, _comp_mean_ci(normalized[:, idx])) for name, idx in name_to_idx.items()]
    return sorted(importance, key=lambda x: -x[1][0])
def feature_ale_plot(df, y_column, model, feature_column, predictor=None, columns_to_exclude=(), bins=100):
    """Create the Accumulated Local Effect (ALE) plot of the target feature.

    Visit https://christophm.github.io/interpretable-ml-book/ale.html for a more
    detailed explanation of ALE.

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Name of the class column

    model : Scikitlearn-like-model
        The model object to be evaluated

    feature_column : str
        Name of the feature column to plot ALE on

    predictor : function, optional (default=None)
        The prediction function, which should take in the feature matrix and
        return an array of predictions. It should output positive class
        probabilities for a classification task, and actual predicted values
        for a regression task. If not specified, defaults to a function
        equivalent to ``lambda X: model.predict_proba(X)[:, 1]``, which is for
        classification.

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    bins : int, optional (default=100)
        The number of intervals for the ALE plot

    Returns
    -------
    plot_wrapper : pytalite.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot
    """
    # Array representation of the data plus the column-name -> index mapping
    data, _, column_index = df_to_arrays(df, y_column, columns_to_exclude, return_index=True)
    target_idx = column_index[feature_column]

    if predictor is None:
        def predictor(features):
            return model.predict_proba(features)[:, 1]

    column = data[:, target_idx]

    # Interval edges: percentiles of the distinct feature values; the value -1
    # is dropped first (presumably a missing-value marker -- confirm).
    distinct = np.unique(column)
    distinct = distinct[distinct != -1]
    percents = [step * 100 / bins for step in range(bins + 1)]
    quantiles = np.percentile(distinct, percents)

    ale, counts = _ale_num(target_idx, data, predictor, quantiles)

    # The ALE curve is drawn at the midpoint of every interval
    midpoints = (quantiles[:-1] + quantiles[1:]) / 2

    with plt.style.context(style_path):
        fig = plt.figure(figsize=(12, 9))
        layout = GridSpec(2, 1, height_ratios=[10, 1], hspace=0)

        # Upper panel: the ALE curve
        ax_ale = plt.subplot(layout[0])
        fig.add_subplot(ax_ale)
        line_plot(ax_ale, midpoints, ale, line_label=False, xticks=[], ylabel="ALE of %s" % y_column)

        # Lower panel: event plot of the observed feature values (without -1)
        ax_events = plt.subplot(layout[1])
        fig.add_subplot(ax_events, sharex=ax_ale)
        observed = column[column != -1]
        event_plot(ax_events, observed, 0.5, 1, xlabel=feature_column, yticks=[], ylim=(-0.2, 1.2))

    plt.show()

    plot_data = {
        "quantiles": quantiles,
        "ale": ale,
        "quantile_distribution": counts
    }
    return PlotWrapper(fig, (ax_ale, ax_events), plot_data)
def density_plot(df, y_column, models, model_names=(), columns_to_exclude=()):
    """Plot the density of predicted positive class probability per model.

    This function creates the density plot of predicted positive class
    probability on actual positive and negative data by each model in models in
    the same plot. It also computes the difference between the distributions on
    positive and negative data using Bhattacharyya distance, KL distance, and
    cross entropy (a.k.a. log-loss).

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Label of the class column

    models : array-like
        The model objects to be evaluated

    model_names : array-like
        The name of the models to be shown in the legends. If empty, the names
        "model 1", "model 2", ... are generated.

    columns_to_exclude : tuple, optional (default=())
        Labels of unwanted columns

    Returns
    -------
    plot_wrapper : pytalite.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot

    Raises
    ------
    ValueError
        If models is empty or models and model_names does not have the same length
    """
    # Validate the arguments first so that bad input fails fast, before any
    # data conversion or model inference is performed.
    n_models = len(models)
    if n_models == 0:
        raise ValueError("no models to evaluate")

    if len(model_names) == 0:
        model_names = ["model %d" % (i + 1) for i in range(n_models)]

    if len(model_names) != n_models:
        raise ValueError("models and model_names must have the same length")

    # Get X, y array representation of data and the masks of both classes
    X, y = df_to_arrays(df, y_column, columns_to_exclude)
    pos_idx = y == 1
    neg_idx = y == 0

    # Evaluation grid shared by every model: 0.000, 0.001, ..., 0.999
    xs = np.arange(1000) / 1000

    # Per-model results, stacked into arrays once after the loop
    pos_curves = []
    neg_curves = []
    bds = []
    kls = []
    ces = []
    scores = []

    with plt.style.context(style_path):
        fig = plt.figure(figsize=(12, 9))
        grid = GridSpec(2, 1, height_ratios=[3.5, 3.5], hspace=0)
        ax1 = fig.add_subplot(grid[0])
        ax2 = fig.add_subplot(grid[1])

        # Compute density curve for all models
        for model, model_name in zip(models, model_names):
            y_prob = model.predict_proba(X)[:, 1]

            # Fit gaussian kernels on the data
            kernel_pos = st.gaussian_kde(y_prob[pos_idx])
            kernel_neg = st.gaussian_kde(y_prob[neg_idx])
            pos_y = kernel_pos(xs)
            neg_y = kernel_neg(xs)

            # Normalize the curves so that each sums to 1 over the grid
            pos_norm = (pos_y / pos_y.sum())[np.newaxis, :]
            neg_norm = (neg_y / neg_y.sum())[np.newaxis, :]

            # Compute all three scores
            bd = _bhattacharyya_distance(pos_norm, neg_norm, normalize=True)
            kl = st.entropy(pos_norm[0], neg_norm[0])
            ce = _cross_entropy(pos_norm, neg_norm, normalize=True)

            # Plot using the kernels
            line_plot(ax1, xs, pos_y, legend=model_name, line_color=None, line_label=False)
            line_plot(ax2, xs, neg_y, line_color=None, line_label=False)
            scores.append(
                "%s: Bhattacharyya Distance: %.4f, KL Distance: %.4f, Cross-Entropy: %.4f"
                % (model_name, bd, kl, ce))

            # Add data
            pos_curves.append(pos_y)
            neg_curves.append(neg_y)
            bds.append(bd)
            kls.append(kl)
            ces.append(ce)

        # One row per model, one column per grid point
        pos_data = np.vstack(pos_curves)
        neg_data = np.vstack(neg_curves)

        # Both panels share one y-range: 10% headroom above the tallest curve
        # and a small negative margin below zero
        ylim_max = max(pos_data.max(), neg_data.max()) * 1.1
        ylim_min = round(-ylim_max * 0.05, 1)

        # The scores are shown as extra lines of the x-axis label
        config_axes(ax1, xticks=[], ylabel="Positive Density", ylim=(ylim_min, ylim_max))
        config_axes(ax2, y_invert=True, xlabel="Probability\n" + "\n".join(scores),
                    ylabel="Negative Density", ylim=(ylim_min, ylim_max))

    plt.show()

    return PlotWrapper(
        fig, (ax1, ax2), {
            "probability": xs,
            "pos_density": pos_data,
            "neg_density": neg_data,
            "Bhattacharyya": np.array(bds),
            "KL": np.array(kls),
            "cross_entropy": np.array(ces)
        })
def decile_plot(df, y_column, model, columns_to_exclude=(), num_deciles=10):
    """Plot cumulative precision and recall over probability-sorted bins.

    The function sorts the data points by the predicted positive class
    probability and divide them into bins. It plots bins based on the cumulative
    precision and recall in two plots.

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Name of the class column

    model : Scikitlearn-like-model
        The model object to be evaluated

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    num_deciles : int, optional (default=10)
        Number of bars to be plotted, each bar represents about 1/num_deciles
        of the data

    Returns
    -------
    plot_wrapper : pytalite.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot

    Raises
    ------
    ValueError
        If the number of deciles exceeds 50 or is smaller than 1
    """
    # Validation check
    if num_deciles > 50:
        raise ValueError("The number of deciles cannot exceed 50")
    if num_deciles < 1:
        # Without this guard 0 fails with ZeroDivisionError and negative
        # values with an opaque reshape error further down.
        raise ValueError("The number of deciles must be at least 1")

    # Get X, y array representation of data
    X, y = df_to_arrays(df, y_column, columns_to_exclude)

    # Sort the rows by descending predicted positive probability
    y_prob = model.predict_proba(X)
    indices = np.argsort(y_prob[:, 1])[::-1]

    # Split into num_deciles equally sized bins; the rows that do not divide
    # evenly are appended to the last bin.
    bin_size, remainder = divmod(indices.shape[0], num_deciles)
    n_even = indices.shape[0] - remainder
    deciles = list(indices[:n_even].reshape((num_deciles, bin_size)))
    deciles[-1] = np.concatenate((deciles[-1], indices[n_even:]))

    # Number of positive labels and number of rows per bin
    true_counts = np.array(
        [np.bincount(y[decile], minlength=2)[1] for decile in deciles])
    decile_size = np.array([decile.shape[0] for decile in deciles])

    # Cumulative recall: share of all positives captured so far.
    # Cumulative precision: share of positives among all rows seen so far.
    cum_recall_score = np.cumsum(true_counts) / true_counts.sum()
    cum_precision_score = np.cumsum(true_counts) / np.cumsum(decile_size)

    # Create decile plot
    with plt.style.context(style_path):
        fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 9))
        xticks = np.arange(0, 11) / 10
        xtick_labels = ["%d%%" % x for x in xticks * 100]
        xs = np.arange(num_deciles) / num_deciles

        # Draw bar plot
        bar_plot(ax1, xs, cum_precision_score, width=1 / num_deciles, align='edge',
                 ylim=(0, np.max(cum_precision_score) * 1.2), ylabel="Cumulative precision",
                 edge_color='w', bar_label=False)

        # Create cumulative decile plot
        ax2 = plt.subplot(2, 1, 2, sharex=ax1)

        # Draw bar plot
        bar_plot(ax2, xs, cum_recall_score, width=1 / num_deciles, align='edge',
                 xticks=xticks, xticklabels=xtick_labels, xlim=(0, 1), xlabel="Deciles",
                 ylim=(0, np.max(cum_recall_score) * 1.2), ylabel="Cumulative recall",
                 bar_color=clr.main[0], edge_color='w', bar_label=False)

    plt.show()

    return PlotWrapper(
        fig, (ax1, ax2), {
            "shared_x": xs,
            "cum_recall_score": cum_recall_score,
            "cum_precision_score": cum_precision_score
        })
def partial_dependence_plot(df, y_column, model, feature_column, predictor=None, columns_to_exclude=(), n_jobs=-1):
    """Create the Partial Dependence plot (PDP) of the target feature.

    Visit https://christophm.github.io/interpretable-ml-book/pdp.html for a more
    detailed explanation of PDP.

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Name of the class column

    model : Scikitlearn-like-model
        The model object to be evaluated

    feature_column : str, or 1d array-like
        Name of the feature column to plot PDP on. If passed in as 1d
        array-like, the features will be treated as one-hot encoded.

    predictor : function, optional (default=None)
        The prediction function, which should take in the feature matrix and
        return an array of predictions. It should output positive class
        probabilities for a classification task, and actual predicted values
        for a regression task. If not specified, defaults to a function
        equivalent to ``lambda X: model.predict_proba(X)[:, 1]``, which is for
        classification.

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    n_jobs : int, optional (default=-1)
        Level of multiprocessing, 1 means single-process, -1 means unlimited
        (actually number of processes depends on the machine)

    Returns
    -------
    plot_wrapper : pytalite.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot
    """
    X, _, name_to_idx = df_to_arrays(df, y_column, columns_to_exclude, return_index=True)

    if predictor is None:
        def predictor(X):
            return model.predict_proba(X)[:, 1]

    # isinstance (not an exact type comparison) so that str subclasses such as
    # numpy.str_ are handled as one column name instead of falling through to
    # the one-hot branch, where they would be iterated character by character.
    if isinstance(feature_column, str):
        feature_idx = name_to_idx[feature_column]
        feature_values = X[:, feature_idx]

        # Determine the feature type and plot accordingly
        if _feature_type(feature_values) == "categorical":
            return _partial_dependence_plot_cat(X, predictor, feature_idx, y_column, feature_column, n_jobs)
        return _partial_dependence_plot_num(X, predictor, feature_idx, y_column, feature_column, n_jobs)

    # One-hot features: resolve every category column to its index
    feature_idx = np.array([name_to_idx[cat] for cat in feature_column])
    return _partial_dependence_plot_cat(X, predictor, feature_idx, y_column, feature_column, n_jobs, one_hot=True)