Exemplo n.º 1
0
    def __init__(self,
                 tree_model,
                 x_data: (pd.DataFrame, np.ndarray),
                 y_data: (pd.Series, np.ndarray),
                 feature_names: List[str] = None,
                 target_name: str = None,
                 class_names: (List[str], Mapping[int, str]) = None):
        """
        Store the model and its data, then build the shadow tree node structure.

        Parameters
        ----------
        :param tree_model: sklearn.tree.DecisionTreeRegressor, sklearn.tree.DecisionTreeClassifier, xgboost.core.Booster
            The decision tree to be interpreted
        :param x_data: pd.DataFrame, np.ndarray
            Feature values on which the shadow tree will be built.
        :param y_data: pd.Series, np.ndarray
            Target values on which the shadow tree will be built.
        :param feature_names: List[str]
            Features' names
        :param target_name: str
            Target's name
        :param class_names: List[str], Mapping[int, str]
            Class names (in case of a classifier). Ignored when the model is
            not a classifier; ``self.class_names`` is then ``None``.

        Raises
        ------
        Exception
            If ``self.is_fit()`` reports that ``tree_model`` is not fit.
        """

        # is_fit() is called right after tree_model is stored and before any
        # other attribute exists, so an unfit model is rejected up front.
        self.tree_model = tree_model
        if not self.is_fit():
            raise Exception(f"Model {tree_model} is not fit.")

        self.feature_names = feature_names
        self.target_name = target_name
        self.x_data = ShadowDecTree._get_x_data(x_data)
        self.y_data = ShadowDecTree._get_y_data(y_data)
        self.root, self.leaves, self.internal = self._get_tree_nodes()

        # Always define the attribute so that reading it on a non-classifier
        # yields None instead of raising AttributeError.
        self.class_names = None
        if self.is_classifier():
            self.class_names = utils._normalize_class_names(
                class_names, self.nclasses())
Exemplo n.º 2
0
def clfviz_univar(model,
                  x: np.ndarray,
                  y: np.ndarray,
                  ntiles=100,
                  binary_threshold=0.5,
                  show=[
                      'instances', 'boundaries', 'probabilities',
                      'misclassified', 'legend'
                  ],
                  feature_name=None,
                  target_name=None,
                  class_names=None,
                  markers=None,
                  fontsize=9,
                  fontname="Arial",
                  dot_w=25,
                  yshift=.09,
                  sigma=.09,
                  colors: dict = None,
                  ax=None) -> None:
    """
    Draw a classifier's behavior over a single feature onto a matplotlib axes.

    See comment and parameter descriptions for clfviz() above.

    :param model: Classifier handed to ``_predict_proba`` to obtain one
        probability column per class.
    :param x: Feature values; 1D, or 2D with a single column. A ``pd.Series``
        is converted to its underlying array.
    :param y: Class label of each value in ``x``. A ``pd.Series`` is converted
        to its underlying array.
    :param ntiles: Number of evenly spaced grid points between ``min(x)`` and
        ``max(x)`` at which the model is evaluated.
    :param binary_threshold: With exactly two distinct labels in ``y``, the
        second class is predicted when its probability is ``>=`` this value.
    :param show: Which elements to draw; any of ``'instances'``,
        ``'boundaries'``, ``'probabilities'``, ``'misclassified'``,
        ``'legend'``. The default list is only read, never mutated.
    :param feature_name: X-axis label; no label is set when ``None``.
    :param target_name: Forwarded to ``add_classifier_legend``.
    :param class_names: Normalized with ``utils._normalize_class_names`` and
        forwarded to ``add_classifier_legend``.
    :param markers: One matplotlib marker per class, in sorted-label order;
        defaults to ``'o'`` for every class.
    :param fontsize: Font size for tick labels, axis label and legend.
    :param fontname: Font name for tick labels, axis label and legend.
    :param dot_w: Marker size passed to ``ax.scatter`` as ``s``.
    :param yshift: Vertical offset between the marker rows of successive
        classes.
    :param sigma: Standard deviation of the vertical jitter applied to
        markers; also used when computing the final y-limit.
    :param colors: Color overrides, completed by ``adjust_colors``.
    :param ax: Axes to draw on; a new ``(5, 1.2)`` inch figure is created
        when ``None``.
    :raises ValueError: If ``x`` has more than one column or more than two
        dimensions.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(5, 1.2))

    if isinstance(x, pd.Series):
        x = x.values
    if isinstance(y, pd.Series):
        y = y.values

    if (len(x.shape) == 2 and x.shape[1] != 1) or len(x.shape) > 2:
        raise ValueError(f"Expecting 1D data not {x.shape}")

    colors = adjust_colors(colors)

    mu = 0.08
    class_values = np.unique(y)  # sorted distinct labels
    nclasses = len(class_values)
    class_colors = np.array(colors['classes'][nclasses])
    # Keyed by class *label* (not by position); used for markers and legend.
    color_map = {v: class_colors[i] for i, v in enumerate(class_values)}

    def _predicted_class_indexes(proba):
        """Return, per row, the index of the predicted probability column."""
        if nclasses == 2:  # is k=2 binary?
            return np.where(proba[:, 1] >= binary_threshold, 1, 0)
        return np.argmax(proba, axis=1)

    x_min, x_max = np.min(x), np.max(x)
    x1r = x_max - x_min
    grid_points, w = np.linspace(x_min,
                                 x_max,
                                 num=ntiles,
                                 endpoint=True,
                                 retstep=True)
    grid_proba = _predict_proba(model, grid_points)
    # Column indexes, not labels. _get_grid_colors receives these as-is.
    grid_pred = _predicted_class_indexes(grid_proba)
    ymax = ax.get_ylim()[1]

    # compute the stripes on the bottom showing probabilities
    if 'probabilities' in show:
        # Use the caller's (adjusted) colors rather than the defaults, and
        # keep the label-keyed color_map built above so marker and legend
        # colors do not depend on whether the stripes are drawn.
        _, _, grid_proba_colors = \
            _get_grid_colors(grid_proba, grid_pred, class_values, colors=colors)

        pred_box_height = .08 * ymax
        boxes = [
            patches.Rectangle((gx, 0),
                              w,
                              pred_box_height,
                              edgecolor='none',
                              facecolor=grid_proba_colors[i],
                              alpha=colors['tile_alpha'])
            for i, gx in enumerate(grid_points)
        ]
        ax.add_collection(PatchCollection(boxes, match_original=True))
        # draw box around the gradation
        rect = patches.Rectangle((grid_points[0], 0),
                                 x1r + w,
                                 pred_box_height,
                                 linewidth=.3,
                                 edgecolor=colors['rect_edge'],
                                 facecolor='none')
        ax.add_patch(rect)

    if 'boundaries' in show:
        # A nonzero difference between neighboring grid predictions marks a
        # class transition; the leading 0 realigns diff with grid_points.
        dx = np.abs(np.diff(grid_pred))
        dx = np.hstack([0, dx])
        dx_edge_idx = np.where(dx)
        for lx in grid_points[dx_edge_idx]:
            ax.plot([lx, lx], [*ax.get_ylim()],
                    '--',
                    lw=.3,
                    c=colors['split_line'],
                    alpha=1.0)

    if 'instances' in show:
        # user should pass in short and wide fig
        x_pred = _predicted_class_indexes(_predict_proba(model, x))
        class_x = [x[y == cl] for cl in class_values]
        class_x_pred = [x_pred[y == cl] for cl in class_values]

        if markers is None:
            markers = ['o'] * len(class_x)
        for i, (cl, x_, pred_) in enumerate(
                zip(class_values, class_x, class_x_pred)):
            row_y = mu + i * yshift
            if 'misclassified' in show:
                # Predictions are probability-column indexes, so compare with
                # the class position i, not the label.
                # NOTE(review): assumes column j belongs to the j-th sorted
                # label - confirm for the models _predict_proba supports.
                is_correct = pred_ == i
                # Show correctly classified markers
                good_x = x_[is_correct]
                noise = np.random.normal(mu, sigma, size=len(good_x))
                ax.scatter(good_x, row_y + noise,
                           s=dot_w,
                           c=color_map[cl],
                           marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'],
                           lw=.5)
                # Show misclassified markers (can't have alpha per marker so do in 2 calls)
                bad_x = x_[~is_correct]
                noise = np.random.normal(mu, sigma, size=len(bad_x))
                ax.scatter(bad_x, row_y + noise,
                           s=dot_w,
                           c=color_map[cl],
                           marker=markers[i],
                           alpha=1.0,
                           edgecolors=colors['warning'],
                           lw=.5)
            else:
                noise = np.random.normal(mu, sigma, size=len(x_))
                ax.scatter(x_, row_y + noise,
                           s=dot_w,
                           c=color_map[cl],
                           marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'],
                           lw=.5)

    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_linewidth(0.1)
    ax.set_yticks([])
    ax.tick_params(axis='both',
                   which='major',
                   width=.3,
                   labelcolor=colors['tick_label'],
                   labelsize=fontsize)
    for tick in ax.get_xticklabels():
        tick.set_fontname(fontname)
    for tick in ax.get_yticklabels():
        tick.set_fontname(fontname)
    ax.set_ylim(0, mu + nclasses * yshift + 6 * sigma)

    if feature_name is not None:
        ax.set_xlabel(f"{feature_name}",
                      fontsize=fontsize,
                      fontname=fontname,
                      color=colors['axis_label'])

    if 'legend' in show:
        class_names = utils._normalize_class_names(class_names, nclasses)
        add_classifier_legend(ax,
                              class_names,
                              class_values,
                              color_map,
                              target_name,
                              colors,
                              fontsize=fontsize,
                              fontname=fontname)
Exemplo n.º 3
0
def clfviz_bivar(model,
                 X: np.ndarray,
                 y: np.ndarray,
                 ntiles=50,
                 tile_fraction=.9,
                 binary_threshold=0.5,
                 show=[
                     'instances', 'boundaries', 'probabilities',
                     'misclassified', 'legend'
                 ],
                 feature_names=None,
                 target_name=None,
                 class_names=None,
                 markers=None,
                 boundary_marker='o',
                 boundary_markersize=.8,
                 fontsize=9,
                 fontname="Arial",
                 dot_w=25,
                 colors: dict = None,
                 ax=None) -> None:
    """
    Draw a classifier's behavior over two features onto a matplotlib axes.

    See comment and parameter descriptions for clfviz() above.

    :param model: Classifier handed to ``_compute_tiling`` and
        ``_predict_proba`` to obtain one probability column per class.
    :param X: Feature matrix with exactly two columns. A ``pd.DataFrame`` is
        converted to its underlying array.
    :param y: Class label of each row of ``X``. A ``pd.Series`` is converted
        to its underlying array.
    :param ntiles: Tiles per axis; the grid predictions are reshaped to
        ``(ntiles, ntiles)``.
    :param tile_fraction: Forwarded to ``_compute_tiling``.
    :param binary_threshold: With exactly two distinct labels in ``y``, the
        second class is predicted when its probability is ``>=`` this value.
    :param show: Which elements to draw; any of ``'instances'``,
        ``'boundaries'``, ``'probabilities'``, ``'misclassified'``,
        ``'legend'``. Without ``'probabilities'`` the tiles are colored by
        predicted class instead. The default list is only read, never mutated.
    :param feature_names: Sequence whose first two items label the x and y
        axes; no labels are set when ``None``.
    :param target_name: Forwarded to ``add_classifier_legend``.
    :param class_names: Normalized with ``utils._normalize_class_names`` and
        forwarded to ``add_classifier_legend``.
    :param markers: One matplotlib marker per class; defaults to ``'o'`` for
        every class.
    :param boundary_marker: Forwarded to ``_draw_boundary_edges``.
    :param boundary_markersize: Forwarded to ``_draw_boundary_edges``.
    :param fontsize: Font size for tick labels, axis labels and legend.
    :param fontname: Font name for tick labels, axis labels and legend.
    :param dot_w: Marker size passed to ``ax.scatter`` as ``s``.
    :param colors: Color overrides, completed by ``adjust_colors``.
    :param ax: Axes to draw on; a new ``(5, 3.5)`` inch figure is created
        when ``None``.
    :raises ValueError: If ``X`` is not two-dimensional with two columns.
    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values

    if len(X.shape) == 1 or (len(X.shape) == 2
                             and X.shape[1] != 2) or len(X.shape) > 2:
        raise ValueError(f"Expecting 2D data not {X.shape}")

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(5, 3.5))

    # Created grid over the range of x1 and x2 variables, get probabilities, predictions
    grid_points, grid_proba, grid_pred_as_matrix, w, x_, class_X, class_values = \
        _compute_tiling(model, X, y, binary_threshold, ntiles, tile_fraction)

    # Predicted probability-column index for every training instance.
    x_proba = _predict_proba(model, X)
    if len(np.unique(y)) == 2:  # is k=2 binary?
        X_pred = np.where(x_proba[:, 1] >= binary_threshold, 1, 0)
    else:
        X_pred = np.argmax(x_proba, axis=1)
    # Split with the class values returned by the tiling so the pieces line
    # up with class_X.
    # NOTE(review): presumably identical to np.unique(y) - confirm.
    class_X_pred = [X_pred[y == cl] for cl in class_values]

    if markers is None:
        markers = ['o'] * len(class_X)

    colors = adjust_colors(colors)

    class_values = np.unique(y)  # returns sorted

    # Get class to color map for probabilities and predictions
    color_map, grid_pred_colors, grid_proba_colors = \
        _get_grid_colors(grid_proba, grid_pred_as_matrix, class_values, colors)

    # Draw probabilities or class prediction grid
    facecolors = grid_proba_colors if 'probabilities' in show else grid_pred_colors
    _draw_tiles(ax, grid_points, facecolors, colors['tile_alpha'], x_, w)

    # Get grid with class predictions with coordinates (x,y)
    # e.g., y_pred[0,0] is lower left pixel and y_pred[5,5] is top-right pixel
    # for npoints=5
    grid_pred_as_matrix = grid_pred_as_matrix.reshape(ntiles, ntiles)

    if 'boundaries' in show:
        _draw_boundary_edges(ax, grid_points, grid_pred_as_matrix,
                             boundary_marker, boundary_markersize, colors, w,
                             x_)

    # Draw the X instances circles
    if 'instances' in show:
        # The loop variable is deliberately not named x_, so the tiling value
        # of that name is not shadowed.
        for i, X_cl in enumerate(class_X):
            # color_map is looked up by class label, not by position.
            # NOTE(review): assumes _get_grid_colors keys its map by label,
            # as add_classifier_legend is given class_values - confirm.
            cl_color = color_map[class_values[i]]
            if 'misclassified' in show:
                # Predictions are probability-column indexes, so compare with
                # the class position i, not the label.
                # NOTE(review): assumes column j belongs to the j-th sorted
                # label - confirm for the models _predict_proba supports.
                is_correct = class_X_pred[i] == i
                # Show correctly classified markers
                good_x = X_cl[is_correct, :]
                ax.scatter(good_x[:, 0],
                           good_x[:, 1],
                           s=dot_w,
                           c=cl_color,
                           marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'],
                           lw=.5)
                # Show misclassified markers (can't have alpha per marker so do in 2 calls)
                bad_x = X_cl[~is_correct, :]
                ax.scatter(bad_x[:, 0],
                           bad_x[:, 1],
                           s=dot_w,
                           c=cl_color,
                           marker=markers[i],
                           alpha=1.0,
                           edgecolors=colors['warning'],
                           lw=.5)
            else:
                ax.scatter(X_cl[:, 0],
                           X_cl[:, 1],
                           s=dot_w,
                           c=cl_color,
                           marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'],
                           lw=.5)

    if feature_names is not None:
        ax.set_xlabel(f"{feature_names[0]}",
                      fontsize=fontsize,
                      fontname=fontname,
                      color=colors['axis_label'])
        ax.set_ylabel(f"{feature_names[1]}",
                      fontsize=fontsize,
                      fontname=fontname,
                      color=colors['axis_label'])

    if 'legend' in show:
        class_names = utils._normalize_class_names(class_names,
                                                   nclasses=len(class_values))
        add_classifier_legend(ax,
                              class_names,
                              class_values,
                              color_map,
                              target_name,
                              colors,
                              fontsize=fontsize,
                              fontname=fontname)

    ax.tick_params(axis='both',
                   which='major',
                   width=.3,
                   labelcolor=colors['tick_label'],
                   labelsize=fontsize)
    for tick in ax.get_xticklabels():
        tick.set_fontname(fontname)
    for tick in ax.get_yticklabels():
        tick.set_fontname(fontname)
    ax.spines['top'].set_visible(False)  # turns off the top "spine" completely
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(.5)
    ax.spines['bottom'].set_linewidth(.5)