from functools import partial
import itertools
import numbers
from itertools import count

import numpy as np
from scipy.special import psi
from scipy.stats.mstats import mquantiles

# The scikit-learn-internal names used below (check_array, DTYPE, six,
# Parallel, delayed, is_regressor, partial_dependence) are imported from
# the scikit-learn version each excerpt was written against.


def _get_jensen_shannon_core(Ks, dim, X_ns, Y_ns):
    # precompute the max/min possible digamma(i) values: the floors/ceils of
    #
    #   M/(n+m-1) / (1 / (2 n - 1))
    #   M/(n+m-1) / (n / (m (2 n - 1)))
    #
    # for any valid value of n, m.
    min_X_n = np.min(X_ns)
    max_X_n = np.max(X_ns)
    if Y_ns is None:
        min_Y_n = min_X_n
        max_Y_n = max_X_n
    else:
        min_Y_n = np.min(Y_ns)
        max_Y_n = np.max(Y_ns)

    min_K = np.min(Ks)
    max_K = np.max(Ks)

    # figure out the smallest/largest i values we might need
    # (# of neighbors in ball)
    wt_bounds = [np.inf, -np.inf]
    min_wt_n = None
    min_wt_m = None
    # max_wt_n = None; max_wt_m = None
    n_ms = list(itertools.product([min_X_n, max_X_n], [min_Y_n, max_Y_n]))
    for n, m in itertools.chain(n_ms, map(reversed, n_ms)):
        base = (2 * n - 1) / (n + m - 1)
        for wt in (base, base * m / n):
            if wt < wt_bounds[0]:
                wt_bounds[0] = wt
                min_wt_n = n
                min_wt_m = m
            if wt > wt_bounds[1]:
                wt_bounds[1] = wt
                # max_wt_n = n
                # max_wt_m = m

    if wt_bounds[0] * min_K < 1:
        msg = "K={} is too small for Jensen-Shannon estimator with n={}, m={}"
        raise ValueError((msg + "; must be at least {}").format(
            min_K, min_wt_n, min_wt_m, int(np.ceil(1 / wt_bounds[0]))))

    min_i = int(np.floor(wt_bounds[0] * min_K))
    max_i = int(np.ceil(wt_bounds[1] * max_K))
    digamma_vals = psi(np.arange(min_i, max_i + 1))

    # TODO: If we don't actually hit the worst case, might be nice to still
    #       run and just nan those elements that we can't compute. This is
    #       over-conservative.
    # `_jensen_shannon_core` (not shown here) does the per-pair work.
    return partial(_jensen_shannon_core, Ks, dim, min_i, digamma_vals), max_i
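
# A small numeric sketch (not part of the library) of the bound computation
# above: with X sets of size 100, Y sets of size 50, and K = 5, the candidate
# weights over both (n, m) orderings are
#     (2n - 1)/(n + m - 1)  and  (2n - 1)/(n + m - 1) * m/n
# giving bounds [99/149, 199/149] ~= [0.664, 1.336], so digamma values are
# precomputed for i = floor(0.664 * 5) .. ceil(1.336 * 5) = 3 .. 7.
if __name__ == '__main__':
    K, n, m = 5, 100, 50
    wts = []
    for a, b in [(n, m), (m, n)]:
        base = (2 * a - 1) / (a + b - 1)
        wts += [base, base * b / a]
    min_i = int(np.floor(min(wts) * K))   # -> 3
    max_i = int(np.ceil(max(wts) * K))    # -> 7
    digamma_vals = psi(np.arange(min_i, max_i + 1))  # psi(3) .. psi(7)
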
def plot_partial_dependence(gbrt, X, features, feature_names=None,
                            label=None, n_cols=3, grid_resolution=100,
                            percentiles=(0.05, 0.95), n_jobs=1,
                            verbose=0, ax=None, line_kw=None,
                            contour_kw=None, **fig_kw):
    """Partial dependence plots for ``features``.

    The ``len(features)`` plots are arranged in a grid with ``n_cols``
    columns. Two-way partial dependence plots are plotted as contour plots.

    Read more in the :ref:`User Guide <partial_dependence>`.

    Parameters
    ----------
    gbrt : BaseGradientBoosting
        A fitted gradient boosting model.

    X : array-like, shape=(n_samples, n_features)
        The data on which ``gbrt`` was trained.

    features : seq of ints or tuples of ints
        If features[i] is an int or a tuple with one int value, a one-way
        PDP is created; if features[i] is a tuple of two ints, a two-way
        PDP is created.

    feature_names : seq of str
        Name of each feature; feature_names[i] holds the name of the
        feature with index i.

    label : object
        The class label for which the PDPs should be computed. Only used
        if ``gbrt`` is a multi-class model. Must be in ``gbrt.classes_``.

    n_cols : int
        The number of columns in the grid plot (default: 3).

    grid_resolution : int, default=100
        The number of equally spaced points on the axes.

    percentiles : (low, high), default=(0.05, 0.95)
        The lower and upper percentile used to create the extreme values
        for the PDP axes.

    n_jobs : int
        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
        Defaults to 1.

    verbose : int
        Verbose output during PD computations. Defaults to 0.

    ax : Matplotlib axis object, default None
        An axis object onto which the plots will be drawn.

    line_kw : dict
        Dict with keywords passed to the ``pylab.plot`` call.
        For one-way partial dependence plots.

    contour_kw : dict
        Dict with keywords passed to the ``pylab.contourf`` call.
        For two-way partial dependence plots.

    **fig_kw : dict
        Dict with keywords passed to the figure() call.
        Note that all keywords not recognized above will be automatically
        included here.

    Returns
    -------
    fig : figure
        The Matplotlib Figure object.

    axs : seq of Axis objects
        A seq of Axis objects, one for each subplot.

    Examples
    --------
    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.ensemble import GradientBoostingRegressor
    >>> X, y = make_friedman1()
    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
    ...
""" import matplotlib.pyplot as plt from matplotlib import transforms from matplotlib.ticker import MaxNLocator from matplotlib.ticker import ScalarFormatter # if not isinstance(gbrt, BaseGradientBoosting): # raise ValueError('gbrt has to be an instance of BaseGradientBoosting') if gbrt.estimators_.shape[0] == 0: raise ValueError('Call %s.fit before partial_dependence' % gbrt.__class__.__name__) # set label_idx for multi-class GBRT if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: if label is None: raise ValueError('label is not given for multi-class PDP') label_idx = np.searchsorted(gbrt.classes_, label) if gbrt.classes_[label_idx] != label: raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) else: # regression and binary classification label_idx = 0 X = check_array(X, dtype=DTYPE, order='C') if gbrt.n_features != X.shape[1]: raise ValueError('X.shape[1] does not match gbrt.n_features') if line_kw is None: line_kw = {'color': 'green'} if contour_kw is None: contour_kw = {} # convert feature_names to list if feature_names is None: # if not feature_names use fx indices as name feature_names = [str(i) for i in range(gbrt.n_features)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() def convert_feature(fx): if isinstance(fx, six.string_types): try: fx = feature_names.index(fx) except ValueError: raise ValueError('Feature %s not in feature_names' % fx) return fx # convert features into a seq of int tuples tmp_features = [] for fxs in features: if isinstance(fxs, (numbers.Integral, ) + six.string_types): fxs = (fxs, ) try: fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) except TypeError: raise ValueError('features must be either int, str, or tuple ' 'of int/str') if not (1 <= np.size(fxs) <= 2): raise ValueError('target features must be either one or two') tmp_features.append(fxs) features = tmp_features names = [] try: for fxs in features: l = [] # explicit loop so "i" is bound for exception below for i in fxs: l.append(feature_names[i]) names.append(l) except IndexError: raise ValueError('features[i] must be in [0, n_features) ' 'but was %d' % i) # compute PD functions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)(delayed( partial_dependence )(gbrt, fxs, X=X, grid_resolution=grid_resolution, percentiles=percentiles) for fxs in features) # get global min and max values of PD grouped by plot type pdp_lim = {} for pdp, axes in pd_result: min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() n_fx = len(axes) old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) min_pd = min(min_pd, old_min_pd) max_pd = max(max_pd, old_max_pd) pdp_lim[n_fx] = (min_pd, max_pd) # create contour levels for two-way plots if 2 in pdp_lim: Z_level = np.linspace(*pdp_lim[2], num=8) if ax is None: fig = plt.figure(**fig_kw) else: fig = ax.get_figure() fig.clear() n_cols = min(n_cols, len(features)) n_rows = int(np.ceil(len(features) / float(n_cols))) axs = [] for i, fx, name, (pdp, axes) in zip(count(), features, names, pd_result): ax = fig.add_subplot(n_rows, n_cols, i + 1) if len(axes) == 1: ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) else: # make contour plot assert len(axes) == 2 XX, YY = np.meshgrid(axes[0], axes[1]) Z = pdp[label_idx].reshape(list(map(np.size, axes))).T CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, colors='k') ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], vmin=Z_level[0], alpha=0.75, **contour_kw) ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) # plot 
        deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1))
        trans = transforms.blended_transform_factory(ax.transData,
                                                     ax.transAxes)
        ylim = ax.get_ylim()
        ax.vlines(deciles, [0], 0.05, transform=trans, color='k')
        ax.set_xlabel(name[0])
        ax.set_ylim(ylim)

        # prevent x-axis ticks from overlapping
        ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower'))
        tick_formatter = ScalarFormatter()
        tick_formatter.set_powerlimits((-3, 4))
        ax.xaxis.set_major_formatter(tick_formatter)

        if len(axes) > 1:
            # two-way PDP - y-axis deciles + labels
            deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1))
            trans = transforms.blended_transform_factory(ax.transAxes,
                                                         ax.transData)
            xlim = ax.get_xlim()
            ax.hlines(deciles, [0], 0.05, transform=trans, color='k')
            ax.set_ylabel(name[1])
            # hline erases xlim
            ax.set_xlim(xlim)
        else:
            ax.set_ylabel('Partial dependence')

        if len(axes) == 1:
            ax.set_ylim(pdp_lim[1])
        axs.append(ax)

    fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95,
                        wspace=0.4, hspace=0.3)
    return fig, axs
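
# A hedged usage sketch for the function above, expanding on its docstring
# example (data sizes and n_estimators are illustrative):
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=200, random_state=0)
est = GradientBoostingRegressor(n_estimators=10).fit(X, y)
# one one-way PDP for feature 0 and one two-way contour PDP for (0, 1)
fig, axs = plot_partial_dependence(est, X, [0, (0, 1)],
                                   feature_names=['f%d' % i
                                                  for i in range(X.shape[1])],
                                   grid_resolution=20)
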
                    '_final_predictions_' + str(k) + '.npy')

for x in [9]:
    for y in [25]:
        target_feature = (x, y)
        fig = plt.figure()

        names = [priors[target_feature[0]], priors[target_feature[1]]]
        print('Convenience plot with ``partial_dependence_plots`` '
              'for %s and %s' % (names[0], names[1]))

        pdp, axes = partial_dependence(stacked_clf, target_feature,
                                       X=X_train, grid_resolution=50)
        XX, YY = np.meshgrid(axes[0], axes[1])
        Z = pdp[0].reshape(list(map(np.size, axes))).T
        ax = Axes3D(fig)
        surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                               cmap=plt.cm.BuPu)
        ax.set_xlabel(names[0], fontsize=12)
        ax.set_ylabel(names[1], fontsize=12)
        ax.set_zlabel('Partial dependence', fontsize=12)
        ax.view_init(elev=12, azim=-142)
        plt.xticks([0, 0.5, 1])
        plt.yticks([0, 0.5, 1])
        ax.set_zticks([-0.2, -0.1, 0, 0.1, 0.2])
        ax.set_zlim(-0.2, 0.2)
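
# For comparison, a sketch of the same interaction drawn as a 2-D contour
# via the `plot_partial_dependence` wrapper defined above. This assumes
# `stacked_clf` is a fitted gradient boosting model and `priors` is a list
# of feature names, as the surrounding code suggests:
fig2, axs2 = plot_partial_dependence(stacked_clf, X_train, [target_feature],
                                     feature_names=priors,
                                     grid_resolution=50)
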
def plot_partial_dependence(est, X, features, feature_names=None,
                            target=None, n_cols=3, grid_resolution=100,
                            percentiles=(0.05, 0.95), method='auto',
                            n_jobs=1, verbose=0, ax=None, line_kw=None,
                            contour_kw=None, **fig_kw):
    """Partial dependence plots.

    The ``len(features)`` plots are arranged in a grid with ``n_cols``
    columns. Two-way partial dependence plots are plotted as contour plots.

    Read more in the :ref:`User Guide <partial_dependence>`.

    Parameters
    ----------
    est : BaseEstimator
        A fitted classification or regression model. Classifiers must have
        a ``predict_proba()`` method. Multioutput-multiclass estimators
        aren't supported.

    X : array-like, shape=(n_samples, n_features)
        The data to use to build the grid of values on which the dependence
        will be evaluated. This is usually the training data.

    features : list of ints or strings, or tuples of ints or strings
        The target features for which to create the PDPs.
        If features[i] is an int or a string, a one-way PDP is created; if
        features[i] is a tuple, a two-way PDP is created. Each tuple must
        be of size 2. If any entry is a string, then it must be in
        ``feature_names``.

    feature_names : seq of str, shape=(n_features,)
        Name of each feature; feature_names[i] holds the name of the
        feature with index i.

    target : int, optional (default=None)
        - In a multiclass setting, specifies the class for which the PDPs
          should be computed. Note that for binary classification, the
          positive class (index 1) is always used.
        - In a multioutput setting, specifies the task for which the PDPs
          should be computed.

        Ignored in binary classification or classical regression settings.

    n_cols : int, optional (default=3)
        The number of columns in the grid plot.

    grid_resolution : int, optional (default=100)
        The number of equally spaced points on the axes of the plots, for
        each target feature.

    percentiles : tuple of float, optional (default=(0.05, 0.95))
        The lower and upper percentile used to create the extreme values
        for the PDP axes.

    method : str, optional (default='auto')
        The method to use to calculate the partial dependence predictions:

        - 'recursion' is only supported for objects inheriting from
          `BaseGradientBoosting`, but is more efficient in terms of speed.
        - 'brute' is supported for any estimator, but is more
          computationally intensive.
        - If 'auto', then 'recursion' will be used for
          ``BaseGradientBoosting`` estimators, and 'brute' used for other
          estimators.

        Unlike the 'brute' method, 'recursion' does not account for the
        ``init`` predictor of the boosting process. In practice this still
        produces the same plots, up to a constant offset in the target
        response.

    n_jobs : int, optional (default=1)
        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
        See :term:`Glossary <n_jobs>` for more details.

    verbose : int, optional (default=0)
        Verbose output during PD computations.

    ax : Matplotlib axis object, optional (default=None)
        An axis object onto which the plots will be drawn.

    line_kw : dict, optional
        Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
        For one-way partial dependence plots.

    contour_kw : dict, optional
        Dict with keywords passed to the ``matplotlib.pyplot.contourf``
        call. For two-way partial dependence plots.

    **fig_kw : dict, optional
        Dict with keywords passed to the figure() call.
        Note that all keywords not recognized above will be automatically
        included here.

    Returns
    -------
    fig : figure
        The Matplotlib Figure object.

    axs : seq of Axis objects
        A seq of Axis objects, one for each subplot.
    Examples
    --------
    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.ensemble import GradientBoostingRegressor
    >>> X, y = make_friedman1()
    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
    ...
    """
    import matplotlib.pyplot as plt
    from matplotlib import transforms
    from matplotlib.ticker import MaxNLocator
    from matplotlib.ticker import ScalarFormatter

    # set target_idx for multi-class estimators
    if hasattr(est, 'classes_') and np.size(est.classes_) > 2:
        if target is None:
            raise ValueError('target must be specified for multi-class')
        target_idx = np.searchsorted(est.classes_, target)
        if (not (0 <= target_idx < len(est.classes_)) or
                est.classes_[target_idx] != target):
            raise ValueError('target not in est.classes_, got {}'.format(
                target))
    else:
        # regression and binary classification
        target_idx = 0

    X = check_array(X)
    n_features = X.shape[1]

    # convert feature_names to list
    if feature_names is None:
        # if feature_names is None, use feature indices as names
        feature_names = [str(i) for i in range(n_features)]
    elif isinstance(feature_names, np.ndarray):
        feature_names = feature_names.tolist()

    def convert_feature(fx):
        if isinstance(fx, six.string_types):
            try:
                fx = feature_names.index(fx)
            except ValueError:
                raise ValueError('Feature %s not in feature_names' % fx)
        return int(fx)

    # convert features into a seq of int tuples
    tmp_features = []
    for fxs in features:
        if isinstance(fxs, (numbers.Integral, six.string_types)):
            fxs = (fxs,)
        try:
            fxs = [convert_feature(fx) for fx in fxs]
        except TypeError:
            raise ValueError('Each entry in features must be either an int, '
                             'a string, or an iterable of size at most 2.')
        if not (1 <= np.size(fxs) <= 2):
            raise ValueError('Each entry in features must be either an int, '
                             'a string, or an iterable of size at most 2.')

        tmp_features.append(fxs)

    features = tmp_features

    names = []
    try:
        for fxs in features:
            names_ = []
            # explicit loop so "i" is bound for exception below
            for i in fxs:
                names_.append(feature_names[i])
            names.append(names_)
    except IndexError:
        raise ValueError('All entries of features must be less than '
                         'len(feature_names) = {0}, got {1}.'
                         .format(len(feature_names), i))

    # compute averaged predictions
    pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(partial_dependence)(est, fxs, X=X, method=method,
                                    grid_resolution=grid_resolution,
                                    percentiles=percentiles)
        for fxs in features)

    # For multioutput regression, we can only check the validity of target
    # now that we have the predictions.
    # Also note: as multiclass-multioutput classifiers are not supported,
    # multiclass and multioutput scenarios are mutually exclusive. So there
    # is no risk of overwriting target_idx here.
    pd, _ = pd_result[0]  # checking the first result is enough
    if is_regressor(est) and pd.shape[0] > 1:
        if target is None:
            raise ValueError(
                'target must be specified for multi-output regressors')
        if not 0 <= target < pd.shape[0]:
            raise ValueError(
                'target must be in [0, n_tasks), got {}.'.format(target))
        target_idx = target

    # get global min and max values of PD grouped by plot type
    pdp_lim = {}
    for pd, values in pd_result:
        min_pd, max_pd = pd[target_idx].min(), pd[target_idx].max()
        n_fx = len(values)
        old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd))
        min_pd = min(min_pd, old_min_pd)
        max_pd = max(max_pd, old_max_pd)
        pdp_lim[n_fx] = (min_pd, max_pd)

    # create contour levels for two-way plots
    if 2 in pdp_lim:
        Z_level = np.linspace(*pdp_lim[2], num=8)

    if ax is None:
        fig = plt.figure(**fig_kw)
    else:
        fig = ax.get_figure()
        fig.clear()

    if line_kw is None:
        line_kw = {'color': 'green'}
    if contour_kw is None:
        contour_kw = {}

    n_cols = min(n_cols, len(features))
    n_rows = int(np.ceil(len(features) / float(n_cols)))
    axs = []
    for i, fx, name, (pd, values) in zip(count(), features, names,
                                         pd_result):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)

        if len(values) == 1:
            ax.plot(values[0], pd[target_idx].ravel(), **line_kw)
        else:
            # make contour plot
            assert len(values) == 2
            XX, YY = np.meshgrid(values[0], values[1])
            Z = pd[target_idx].reshape(list(map(np.size, values))).T
            CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5,
                            colors='k')
            ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1],
                        vmin=Z_level[0], alpha=0.75, **contour_kw)
            ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)

        # plot data deciles + axes labels
        deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1))
        trans = transforms.blended_transform_factory(ax.transData,
                                                     ax.transAxes)
        ylim = ax.get_ylim()
        ax.vlines(deciles, [0], 0.05, transform=trans, color='k')
        ax.set_xlabel(name[0])
        ax.set_ylim(ylim)

        # prevent x-axis ticks from overlapping
        ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower'))
        tick_formatter = ScalarFormatter()
        tick_formatter.set_powerlimits((-3, 4))
        ax.xaxis.set_major_formatter(tick_formatter)

        if len(values) > 1:
            # two-way PDP - y-axis deciles + labels
            deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1))
            trans = transforms.blended_transform_factory(ax.transAxes,
                                                         ax.transData)
            xlim = ax.get_xlim()
            ax.hlines(deciles, [0], 0.05, transform=trans, color='k')
            ax.set_ylabel(name[1])
            # hline erases xlim
            ax.set_xlim(xlim)
        else:
            ax.set_ylabel('Partial dependence')

        if len(values) == 1:
            ax.set_ylim(pdp_lim[1])
        axs.append(ax)

    fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95,
                        wspace=0.4, hspace=0.3)
    return fig, axs
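
# A hedged usage sketch of the estimator-based version above: multiclass
# models require an explicit `target` class (dataset and model choice are
# illustrative).
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier

iris = load_iris()
clf = GradientBoostingClassifier(n_estimators=10).fit(iris.data, iris.target)
# `target=0` selects the class whose dependence the PDPs describe
fig, axs = plot_partial_dependence(clf, iris.data, [0, (0, 1)],
                                   feature_names=iris.feature_names,
                                   target=0, grid_resolution=20)
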