def _get_feature_path_importance_sklearn_plot(features, feature_path_importance, figsize, colors, fontsize, fontname,
                                              grid):
    """
    Draw a bar chart with one bar per feature showing its importance.

    :param features: Sequence of feature labels; used as x tick labels and to position the bars.
    :param feature_path_importance: Bar heights, one per entry of ``features``.
    :param figsize: Figure size forwarded to ``plt.subplots``.
    :param colors: Color overrides, normalized through ``adjust_colors``.
    :param fontsize: Font size of the two axis labels.
    :param fontname: Font family of the two axis labels.
    :param grid: Grid visibility flag forwarded to ``ax.grid``.
    :return: The matplotlib axes the chart was drawn on.
    """
    colors = adjust_colors(colors)
    _, ax = plt.subplots(figsize=figsize)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(.3)
    ax.spines['bottom'].set_linewidth(.3)

    bar_positions = range(len(features))
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(features)

    barcontainers = ax.bar(bar_positions, feature_path_importance, color=colors["hist_bar"], lw=.3,
                           align='center', width=1)
    for rect in barcontainers.patches:
        rect.set_linewidth(.5)
        rect.set_edgecolor(colors['rect_edge'])

    ax.set_xlabel("features", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])
    ax.set_ylabel("feature importance", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])

    # Pass the flag positionally: the keyword was called ``b`` in old matplotlib and
    # ``visible`` in new releases (``b`` was removed), so a keyword breaks on one side or the other.
    ax.grid(grid)

    return ax
def draw_piechart(counts, size, colors, filename, label=None, fontname="Arial", graph_colors=None):
    """
    Render ``counts`` as a tightly cropped pie chart and save it to ``filename``.

    :param counts: Per-class counts; must contain at least one nonzero entry.
    :param size: Side length of the square figure (also drives the pie radius).
    :param colors: Wedge colors, parallel to ``counts``.
    :param filename: Destination passed to ``plt.savefig``.
    :param label: Optional text centered below the pie.
    :param fontname: Font family for ``label``.
    :param graph_colors: Color overrides, normalized through ``adjust_colors``.
    """
    graph_colors = adjust_colors(graph_colors)

    nonzero_total = np.count_nonzero(counts)
    first_nonzero = np.nonzero(counts)[0][0]
    if nonzero_total == 1:
        # Only one class is present: keep just that entry so a single full wedge is drawn.
        counts, colors = [counts[first_nonzero]], [colors[first_nonzero]]

    tweak = size * .01
    axis_upper = size - 10 * tweak
    pie_center = size / 2 - 6 * tweak

    fig, ax = plt.subplots(1, 1, figsize=(size, size))
    ax.axis('equal')
    for set_limits in (ax.set_xlim, ax.set_ylim):
        set_limits(0, axis_upper)

    # frame=True is needed for the pie to fit properly; the offsets above were
    # hand-tuned to get a tight bounding box around the chart.
    wedges, _ = ax.pie(counts,
                       center=(pie_center, pie_center),
                       radius=size / 2,
                       colors=colors,
                       shadow=False,
                       frame=True)
    for wedge in wedges:
        wedge.set_linewidth(.5)
        wedge.set_edgecolor(graph_colors['pie'])

    ax.axis('off')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)

    if label is not None:
        ax.text(pie_center, -10 * tweak, label,
                horizontalalignment='center',
                verticalalignment='top',
                fontsize=9, color=graph_colors['text'], fontname=fontname)

    plt.savefig(filename, bbox_inches='tight', pad_inches=0)
    plt.close()
def rtreeviz_bivar_heatmap(ax, X_train, y_train, max_depth, feature_names,
                           fontsize=14, ticks_fontsize=12, fontname="Arial",
                           show={'title'},
                           n_colors_in_map=100,
                           colors=None
                           ) -> None:
    """
    Show the tesselated 2D feature space of a regression tree as a heat map on ``ax``.

    A ``DecisionTreeRegressor`` of depth ``max_depth`` is fit on ``X_train``/``y_train``.
    Each region of the tesselation is filled with a color that encodes the region's
    prediction, and the training points are scattered on top using columns 0 and 1
    of ``X_train``, colored by their target value.

    :param ax: Matplotlib axes to draw on.
    :param X_train: Feature matrix (DataFrame or ndarray); columns 0 and 1 are plotted.
    :param y_train: Target values (Series or ndarray).
    :param max_depth: Maximum depth of the fitted regression tree.
    :param feature_names: Feature names; entries 0 and 1 label the x and y axes.
    :param fontsize: Font size for axis labels and title.
    :param ticks_fontsize: Font size for tick labels.
    :param fontname: Font family for the axis labels.
    :param show: Collection of element names to draw; only ``'title'`` is consulted. Never mutated.
    :param n_colors_in_map: Number of discrete steps in the target-value color ramp.
    :param colors: Color overrides, normalized through ``adjust_colors``.
    :return: None
    """
    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.values
    if isinstance(y_train, pd.Series):
        y_train = y_train.values

    colors = adjust_colors(colors)

    rt = tree.DecisionTreeRegressor(max_depth=max_depth)
    rt.fit(X_train, y_train)

    y_min, y_max = np.min(y_train), np.max(y_train)
    y_range = y_max - y_min

    palette = [rgb2hex(c.rgb, force_long=True)
               for c in Color(colors['color_map_min']).range_to(Color(colors['color_map_max']), n_colors_in_map)]

    def color_for(value):
        """Map a target value onto the color ramp."""
        if y_range == 0:
            # Constant target: avoid 0/0 -> NaN -> int() failure, use the first ramp color.
            return palette[0]
        return palette[int(((value - y_min) / y_range) * (n_colors_in_map - 1))]

    shadow_tree = ShadowDecTree(rt, X_train, y_train, feature_names=feature_names)

    for node, bbox in shadow_tree.tesselation():
        # bbox is used as (x_min, y_min, x_max, y_max)
        x0, y0 = bbox[0], bbox[1]
        width = bbox[2] - bbox[0]
        height = bbox[3] - bbox[1]
        # angle must be a keyword: it is keyword-only in current matplotlib releases.
        rect = patches.Rectangle((x0, y0), width, height, angle=0, linewidth=.3, alpha=.5,
                                 edgecolor=colors['edge'], facecolor=color_for(node.prediction()))
        ax.add_patch(rect)

    point_colors = [color_for(target) for target in y_train]
    ax.scatter(X_train[:, 0], X_train[:, 1], marker='o', alpha=.95, c=point_colors,
               edgecolor=colors['scatter_edge'], lw=.3)

    ax.set_xlabel(f"{feature_names[0]}", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])
    ax.set_ylabel(f"{feature_names[1]}", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])

    ax.tick_params(axis='both', which='major', width=.3, labelcolor=colors['tick_label'], labelsize=ticks_fontsize)

    if 'title' in show:
        accur = rt.score(X_train, y_train)
        title = f"Regression tree depth {max_depth}, training $R^2$={accur:.3f}"
        # Title the axes we were handed, not whatever pyplot considers current.
        ax.set_title(title, fontsize=fontsize, color=colors['title'])

    return None
def regr_leaf_viz(node : ShadowDecTreeNode,
                  y : (pd.Series,np.ndarray),
                  target_name,
                  filename:str=None,
                  y_range=None,
                  precision=1,
                  label_fontsize: int = 9,
                  ticks_fontsize: int = 8,
                  fontname:str="Arial",
                  colors=None):
    """
    Draw a small strip plot of the target values that fall into a regression leaf.

    The leaf's samples are scattered with random horizontal jitter, a dashed line
    marks their mean, and a caption below the axes shows the mean and sample count.
    The figure is saved and closed only when ``filename`` is given.

    :param node: Leaf whose ``samples()`` index into ``y``.
    :param y: Target values for the whole training set.
    :param target_name: Name shown in the caption.
    :param filename: Destination for ``plt.savefig``; nothing is saved when None.
    :param y_range: Limits handed to ``ax.set_ylim``.
    :param precision: Precision handed to ``myround`` for the caption.
    :param label_fontsize: Caption font size.
    :param ticks_fontsize: Tick label font size.
    :param fontname: Caption font family.
    :param colors: Color overrides, normalized through ``adjust_colors``.
    """
    colors = adjust_colors(colors)

    leaf_y = y[node.samples()]
    y = leaf_y

    fig, ax = plt.subplots(1, 1, figsize=(.75, .8))
    ax.tick_params(colors=colors['tick_label'])

    leaf_mean = np.mean(leaf_y)

    ax.set_ylim(y_range)
    for side in ('top', 'right', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.spines['left'].set_linewidth(.3)
    ax.set_xticks([])

    # Caption sits just below the axes, offset by half the x tick padding.
    pad = plt.rcParams['xtick.major.pad']
    caption = f"{target_name}={myround(leaf_mean,precision)}\nn={len(leaf_y)}"
    ax.annotate(caption,
                xy=(.5, 0), xytext=(.5, -.5*pad), ha='center', va='top',
                xycoords='axes fraction', textcoords='offset points',
                fontsize=label_fontsize, fontname=fontname, color=colors['axis_label'])

    ax.tick_params(axis='y', which='major', width=.3, labelcolor=colors['tick_label'], labelsize=ticks_fontsize)

    # Horizontal jitter so overlapping target values stay visible.
    jitter = np.random.normal(.5, .08, size=len(leaf_y))
    ax.set_xlim(0, 1)
    ax.scatter(jitter, leaf_y, s=5, c=colors['scatter_marker'], alpha=.25, lw=.3)

    mean_line_x = [0, len(node.samples())]
    ax.plot(mean_line_x, [leaf_mean, leaf_mean], '--', color=colors['split_line'], linewidth=1)

    if filename is None:
        return
    plt.savefig(filename, bbox_inches='tight', pad_inches=0)
    plt.close()
def draw_legend(shadow_tree, target_name, filename, colors=None):
    """
    Render a standalone class legend for ``shadow_tree``.

    One colored box per target value is placed in a legend titled ``target_name``.
    The figure is saved and closed only when ``filename`` is not None.

    :param shadow_tree: Tree providing ``nclasses()``, ``unique_target_values`` and ``class_names``.
    :param target_name: Legend title.
    :param filename: Destination for ``plt.savefig``; nothing is saved when None.
    :param colors: Color overrides, normalized through ``adjust_colors``.
    """
    colors = adjust_colors(colors)

    n_classes = shadow_tree.nclasses()
    class_values = shadow_tree.unique_target_values
    class_names = shadow_tree.class_names

    palette = colors['classes'][n_classes]
    color_map = {}
    for position, value in enumerate(class_values):
        color_map[value] = palette[position]

    boxes = [
        patches.Rectangle((0, 0), 20, 10, linewidth=.4, edgecolor=colors['rect_edge'],
                          facecolor=color_map[value], label=class_names[value])
        for value in class_values
    ]

    fig, ax = plt.subplots(1, 1, figsize=(1,1))
    leg = ax.legend(handles=boxes,
                    frameon=True,
                    shadow=False,
                    fancybox=True,
                    loc='center',
                    title=target_name,
                    handletextpad=.35,
                    borderpad=.8,
                    edgecolor=colors['legend_edge'])

    leg.get_frame().set_linewidth(.5)

    legend_title = leg.get_title()
    legend_title.set_color(colors['legend_title'])
    legend_title.set_fontsize(10)
    legend_title.set_fontweight('bold')

    for entry in leg.get_texts():
        entry.set_color(colors['text'])
        entry.set_fontsize(10)

    ax.set_xlim(0, 20)
    ax.set_ylim(0, 10)
    ax.axis('off')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)

    if filename is None:
        return
    plt.savefig(filename, bbox_inches='tight', pad_inches=0)
    plt.close()
def class_leaf_viz(node : ShadowDecTreeNode,
                   colors : List[str],
                   filename: str,
                   graph_colors=None):
    """
    Draw a classifier leaf as a pie chart of its class counts.

    The pie grows linearly with the leaf's sample count and is capped at a
    maximum, so small leaves look different from each other while very large
    ones do not dominate the picture.

    :param node: Leaf providing ``nsamples()`` and ``class_counts()``.
    :param colors: Wedge colors, one per class.
    :param filename: Destination file handed to ``draw_piechart``.
    :param graph_colors: Color overrides, normalized through ``adjust_colors``.
    """
    graph_colors = adjust_colors(graph_colors)

    nsamples = node.nsamples()
    # size = nsamples * slope + minsize, clamped to maxsize
    size = min(nsamples * 0.02 + .15, 1.3)

    draw_piechart(node.class_counts(),
                  size=size,
                  colors=colors,
                  filename=filename,
                  label=f"n={nsamples}",
                  graph_colors=graph_colors)
def clfviz_univar(model, x: np.ndarray, y: np.ndarray,
                  ntiles=100,
                  binary_threshold=0.5,
                  show=['instances', 'boundaries', 'probabilities', 'misclassified', 'legend'],
                  feature_name=None, target_name=None, class_names=None,
                  markers=None,
                  fontsize=9, fontname="Arial",
                  dot_w=25,
                  yshift=.09,
                  sigma=.09,
                  colors: dict = None,
                  ax=None) -> None:
    """
    Visualize a classifier's behavior over a single feature on one short, wide axes.

    See comment and parameter descriptions for clfviz() above.

    Depending on ``show`` this draws a strip of probability-colored tiles along the
    bottom ('probabilities'), dashed vertical lines where the predicted class changes
    between neighboring grid points ('boundaries'), jittered markers for the instances
    grouped by true class ('instances', with 'misclassified' outlining wrong
    predictions in the warning color), and a legend ('legend').

    :param model: Classifier; only used through ``_predict_proba(model, ...)``.
    :param x: Feature values; a Series, a 1-D array, or a 2-D array with one column.
    :param y: Class labels, parallel to ``x``.
    :param ntiles: Number of grid points sampled between min(x) and max(x).
    :param binary_threshold: For two classes, probability of column 1 at or above
        which the prediction is 1.
    :param show: Names of the elements to draw (not mutated here).
    :param feature_name: x-axis label; omitted when None.
    :param markers: One marker per class; defaults to 'o' for every class.
    :param dot_w: Marker size handed to ``ax.scatter``.
    :param yshift: Vertical distance between the marker rows of consecutive classes.
    :param sigma: Standard deviation of the vertical jitter.
    :param colors: Color overrides, normalized through ``adjust_colors``.
    :param ax: Axes to draw on; a 5 x 1.2 figure is created when None.
    :raises ValueError: If ``x`` has more than one column or more than two dimensions.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(5, 1.2))

    if isinstance(x, pd.Series):
        x = x.values
    if isinstance(y, pd.Series):
        y = y.values

    if (len(x.shape) == 2 and x.shape[1] != 1) or len(x.shape) > 2:
        raise ValueError(f"Expecting 1D data not {x.shape}")

    colors = adjust_colors(colors)

    mu = 0.08
    class_values = np.unique(y)
    nclasses = len(class_values)
    class_colors = np.array(colors['classes'][nclasses])
    # NOTE(review): this map is keyed by class *value*, but the scatter calls below
    # index color_map with the positional loop index i; the two agree only when the
    # classes are 0..k-1 — confirm.
    color_map = {v: class_colors[i] for i, v in enumerate(class_values)}

    x1r = np.max(x) - np.min(x)
    x1range = (np.min(x), np.max(x))
    # w is the spacing between consecutive grid points (retstep=True)
    grid_points, w = np.linspace(*x1range, num=ntiles, endpoint=True, retstep=True)
    grid_proba = _predict_proba(model, grid_points)
    if len(np.unique(y)) == 2:  # is k=2 binary?
        grid_pred = np.where(grid_proba[:, 1] >= binary_threshold, 1, 0)
    else:
        grid_pred = np.argmax(grid_proba, axis=1)  # TODO: assumes classes are 0..k-1
    # Upper y limit as it is *before* anything is drawn; used to size the probability strip
    ymax = ax.get_ylim()[1]

    # compute the stripes on the bottom showing probabilities
    if 'probabilities' in show:
        class_values = np.unique(y)
        # NOTE(review): color_map is replaced here by the one _get_grid_colors returns,
        # and that call receives the default palette (adjust_colors(None)) rather than
        # the caller's colors — confirm this is intended.
        color_map, grid_pred_colors, grid_proba_colors = \
            _get_grid_colors(grid_proba, grid_pred, class_values, colors=adjust_colors(None))

        pred_box_height = .08 * ymax
        boxes = []
        for i, gx in enumerate(grid_points):
            rect = patches.Rectangle((gx, 0), w, pred_box_height,
                                     edgecolor='none', facecolor=grid_proba_colors[i],
                                     alpha=colors['tile_alpha'])
            boxes.append(rect)
        # drop box around the gradation
        ax.add_collection(PatchCollection(boxes, match_original=True))
        rect = patches.Rectangle((grid_points[0], 0), x1r + w, pred_box_height, linewidth=.3,
                                 edgecolor=colors['rect_edge'], facecolor='none')
        ax.add_patch(rect)

    if 'boundaries' in show:
        # Nonzero difference between neighboring grid predictions marks a class change;
        # the leading 0 keeps dx the same length as grid_points.
        dx = np.abs(np.diff(grid_pred))
        dx = np.hstack([0, dx])
        dx_edge_idx = np.where(dx)  # indexes of dx class transitions?
        for lx in grid_points[dx_edge_idx]:
            ax.plot([lx, lx], [*ax.get_ylim()], '--', lw=.3,
                    c=colors['split_line'], alpha=1.0)

    if 'instances' in show:
        # user should pass in short and wide fig
        x_proba = _predict_proba(model, x)
        if len(np.unique(y)) == 2:  # is k=2 binary?
            x_pred = np.where(x_proba[:, 1] >= binary_threshold, 1, 0)
        else:
            x_pred = np.argmax(x_proba, axis=1)  # TODO: assumes classes are 0..k-1
        # Group instances, and their predictions, by true class
        class_x = [x[y == cl] for cl in class_values]
        class_x_pred = [x_pred[y == cl] for cl in class_values]

        if markers is None:
            markers = ['o'] * len(class_x)
        for i, x_, in enumerate(class_x):
            if 'misclassified' in show:
                # Show correctly classified markers
                good_x = x_[class_x_pred[i] == class_values[i]]
                # Jitter is drawn with mean mu, so each row is centered at 2*mu + i*yshift
                noise = np.random.normal(mu, sigma, size=len(good_x))
                ax.scatter(good_x, [mu + i * yshift] * len(good_x) + noise,
                           s=dot_w, c=color_map[i],
                           marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'],
                           lw=.5)
                # Show misclassified markers (can't have alpha per marker so do in 2 calls)
                bad_x = x_[class_x_pred[i] != class_values[i]]
                noise = np.random.normal(mu, sigma, size=len(bad_x))
                ax.scatter(bad_x, [mu + i * yshift] * len(bad_x) + noise,
                           s=dot_w, c=color_map[i],
                           marker=markers[i],
                           alpha=1.0,
                           edgecolors=colors['warning'],
                           lw=.5)
            else:
                noise = np.random.normal(mu, sigma, size=len(x_))
                ax.scatter(x_, [mu + i * yshift] * len(x_) + noise,
                           s=dot_w, c=color_map[i],
                           marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'],
                           lw=.5)

    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_linewidth(0.1)
    ax.set_yticks([])
    ax.tick_params(axis='both', which='major', width=.3, labelcolor=colors['tick_label'],
                   labelsize=fontsize)
    for tick in ax.get_xticklabels():
        tick.set_fontname(fontname)
    for tick in ax.get_yticklabels():
        tick.set_fontname(fontname)
    # Final y extent: one row per class plus headroom for the jitter
    ax.set_ylim(0, mu + nclasses * yshift + 6 * sigma)

    if feature_name is not None:
        ax.set_xlabel(f"{feature_name}", fontsize=fontsize, fontname=fontname,
                      color=colors['axis_label'])

    if 'legend' in show:
        class_names = utils._normalize_class_names(class_names, nclasses)
        add_classifier_legend(ax, class_names, class_values, color_map, target_name, colors,
                              fontsize=fontsize, fontname=fontname)
def clfviz_bivar(model, X: np.ndarray, y: np.ndarray,
                 ntiles=50, tile_fraction=.9,
                 binary_threshold=0.5,
                 show=['instances', 'boundaries', 'probabilities', 'misclassified', 'legend'],
                 feature_names=None, target_name=None, class_names=None,
                 markers=None,
                 boundary_marker='o', boundary_markersize=.8,
                 fontsize=9, fontname="Arial",
                 dot_w=25,
                 colors: dict = None,
                 ax=None) -> None:
    """
    Visualize a classifier's behavior over two features on a tiled 2D grid.

    See comment and parameter descriptions for clfviz() above.

    The grid tiles are colored by probability when 'probabilities' is in ``show``
    and by predicted class otherwise. 'boundaries' draws the decision boundary
    markers, 'instances' scatters the data grouped by true class (with
    'misclassified' outlining wrong predictions in the warning color), and
    'legend' adds a class legend.

    :param model: Classifier; used through ``_compute_tiling`` and ``_predict_proba``.
    :param X: Feature matrix (DataFrame or ndarray) with exactly two columns.
    :param y: Class labels, parallel to the rows of ``X``.
    :param ntiles: Grid resolution; predictions are reshaped to ``ntiles x ntiles``.
    :param tile_fraction: Forwarded to ``_compute_tiling``.
    :param binary_threshold: For two classes, probability of column 1 at or above
        which the prediction is 1.
    :param show: Names of the elements to draw (not mutated here).
    :param feature_names: Entries 0 and 1 label the x and y axes; omitted when None.
    :param markers: One marker per class; defaults to 'o' for every class.
    :param boundary_marker: Forwarded to ``_draw_boundary_edges``.
    :param boundary_markersize: Forwarded to ``_draw_boundary_edges``.
    :param dot_w: Marker size handed to ``ax.scatter``.
    :param colors: Color overrides, normalized through ``adjust_colors``.
    :param ax: Axes to draw on; a 5 x 3.5 figure is created when None.
    :raises ValueError: If ``X`` is not two-dimensional with exactly two columns.
    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values

    if len(X.shape) == 1 or (len(X.shape) == 2 and X.shape[1] != 2) or len(X.shape) > 2:
        raise ValueError(f"Expecting 2D data not {X.shape}")

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(5, 3.5))

    # Created grid over the range of x1 and x2 variables, get probabilities, predictions
    grid_points, grid_proba, grid_pred_as_matrix, w, x_, class_X, class_values = \
        _compute_tiling(model, X, y, binary_threshold, ntiles, tile_fraction)

    x_proba = _predict_proba(model, X)
    if len(np.unique(y)) == 2:  # is k=2 binary?
        X_pred = np.where(x_proba[:, 1] >= binary_threshold, 1, 0)
    else:
        X_pred = np.argmax(x_proba, axis=1)  # TODO: assumes classes are 0..k-1
    # Predictions grouped by true class, parallel to class_X
    class_X_pred = [X_pred[y == cl] for cl in class_values]

    if markers is None:
        markers = ['o'] * len(class_X)

    colors = adjust_colors(colors)

    # Overwrites the class_values returned by _compute_tiling above
    class_values = np.unique(y)  # returns sorted

    # Get class to color map for probabilities and predictions
    color_map, grid_pred_colors, grid_proba_colors = \
        _get_grid_colors(grid_proba, grid_pred_as_matrix, class_values, colors)

    # Draw probabilities or class prediction grid
    facecolors = grid_proba_colors if 'probabilities' in show else grid_pred_colors
    _draw_tiles(ax, grid_points, facecolors, colors['tile_alpha'], x_, w)

    # Get grid with class predictions with coordinates (x,y)
    # e.g., y_pred[0,0] is lower left pixel and y_pred[5,5] is top-right pixel
    # for npoints=5
    grid_pred_as_matrix = grid_pred_as_matrix.reshape(ntiles, ntiles)

    if 'boundaries' in show:
        _draw_boundary_edges(ax, grid_points, grid_pred_as_matrix,
                             boundary_marker, boundary_markersize,
                             colors, w, x_)

    # Draw the X instances circles
    if 'instances' in show:
        # NOTE: the loop variable reuses the name x_ that came from _compute_tiling,
        # so that value must not be read after this point.
        for i, x_ in enumerate(class_X):
            if 'misclassified' in show:
                # Show correctly classified markers
                good_x = x_[class_X_pred[i] == class_values[i], :]
                ax.scatter(good_x[:, 0], good_x[:, 1],
                           s=dot_w, c=color_map[i],
                           marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'],
                           lw=.5)
                # Show misclassified markers (can't have alpha per marker so do in 2 calls)
                bad_x = x_[class_X_pred[i] != class_values[i], :]
                ax.scatter(bad_x[:, 0], bad_x[:, 1],
                           s=dot_w, c=color_map[i],
                           marker=markers[i],
                           alpha=1.0,
                           edgecolors=colors['warning'],
                           lw=.5)
            else:
                ax.scatter(x_[:, 0], x_[:, 1],
                           s=dot_w, c=color_map[i],
                           marker=markers[i],
                           alpha=colors['scatter_marker_alpha'],
                           edgecolors=colors['scatter_edge'],
                           lw=.5)

    if feature_names is not None:
        ax.set_xlabel(f"{feature_names[0]}", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])
        ax.set_ylabel(f"{feature_names[1]}", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])

    if 'legend' in show:
        class_names = utils._normalize_class_names(class_names, nclasses=len(class_values))
        add_classifier_legend(ax, class_names, class_values, color_map, target_name, colors,
                              fontsize=fontsize, fontname=fontname)

    ax.tick_params(axis='both', which='major', width=.3, labelcolor=colors['tick_label'],
                   labelsize=fontsize)
    for tick in ax.get_xticklabels():
        tick.set_fontname(fontname)
    for tick in ax.get_yticklabels():
        tick.set_fontname(fontname)
    ax.spines['top'].set_visible(False)  # turns off the top "spine" completely
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(.5)
    ax.spines['bottom'].set_linewidth(.5)
def regr_split_viz(node: ShadowDecTreeNode,
                   X_train: np.ndarray,
                   y_train: np.ndarray,
                   target_name: str,
                   filename: str = None,
                   y_range=None,
                   ticks_fontsize: int = 8,
                   label_fontsize: int = 9,
                   fontname: str = "Arial",
                   precision=1,
                   X : np.array = None,
                   highlight_node : bool = False,
                   colors: dict=None):
    """
    Draw the scatter plot for one internal node of a regression tree.

    The node's samples are plotted (split feature vs. target) together with a
    dashed vertical line at the split value, dashed horizontal lines at the mean
    target of each side, and a small wedge under the x axis marking the split.
    The figure is saved and closed only when ``filename`` is given.

    :param node: Internal node to visualize.
    :param X_train: Training feature matrix (2-D ndarray).
    :param y_train: Training targets, parallel to the rows of ``X_train``.
    :param target_name: y-axis label; only drawn for the root node.
    :param filename: Destination for ``plt.savefig``; nothing is saved when None.
    :param y_range: (low, high) y limits shared across nodes. When None the axes'
        autoscaled limits are used for the split line.
    :param ticks_fontsize: Tick label font size.
    :param label_fontsize: Axis label font size.
    :param fontname: Axis label font family.
    :param precision: Unused here; kept for interface compatibility.
    :param X: Optional single feature vector whose value for the split feature is
        marked with a highlight wedge.
    :param highlight_node: Whether to draw the highlight wedge; ignored if ``X`` is None.
    :param colors: Color overrides, normalized through ``adjust_colors``.
    """
    colors = adjust_colors(colors)

    figsize = (2.5, 1.1)
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    ax.tick_params(colors=colors['tick_label'])

    feature_name = node.feature_name()
    feature_idx = node.feature()
    split = node.split()
    sample_idx = node.samples()

    ax.set_xlabel(f"{feature_name}", fontsize=label_fontsize, fontname=fontname, color=colors['axis_label'])

    ax.set_ylim(y_range)
    if node == node.shadow_tree.root:
        ax.set_ylabel(target_name, fontsize=label_fontsize, fontname=fontname, color=colors['axis_label'])

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(.3)
    ax.spines['bottom'].set_linewidth(.3)
    ax.tick_params(axis='both', which='major', width=.3, labelcolor=colors['tick_label'], labelsize=ticks_fontsize)

    # Get X, y data for all samples associated with this node.
    feature_column = X_train[:, feature_idx]
    X_feature, y_train = feature_column[sample_idx], y_train[sample_idx]

    # x axis always spans the feature's range over the WHOLE training set so that
    # plots of different nodes are comparable.
    overall_feature_range = (np.min(feature_column), np.max(feature_column))
    ax.set_xlim(*overall_feature_range)
    xmin, xmax = overall_feature_range
    xr = xmax - xmin

    xticks = list(overall_feature_range)
    if xmin + .10 * xr < split < xmax - .1 * xr:  # don't show split if too close to axis ends
        xticks += [split]
    ax.set_xticks(xticks)

    ax.scatter(X_feature, y_train, s=5, c=colors['scatter_marker'], alpha=.4, lw=.3)

    # Without an explicit y_range (the default) fall back to the current axes limits
    # instead of failing on unpacking None.
    split_line_y = list(ax.get_ylim()) if y_range is None else [*y_range]

    left, right = node.split_samples()
    left_mean = np.mean(y_train[left])
    right_mean = np.mean(y_train[right])

    ax.plot([xmin, split], [left_mean, left_mean], '--', color=colors['split_line'], linewidth=1)
    ax.plot([split, split], split_line_y, '--', color=colors['split_line'], linewidth=1)
    ax.plot([split, xmax], [right_mean, right_mean], '--', color=colors['split_line'], linewidth=1)

    def wedge(ax, x, color):
        """Draw a small triangle just below the x axis whose tip points at ``x``."""
        ymin, ymax = ax.get_ylim()
        yr = ymax - ymin
        th = yr * .1
        tw = xr * .018
        tria = np.array([[x, ymin], [x - tw, ymin - th], [x + tw, ymin - th]])
        t = patches.Polygon(tria, facecolor=color)
        t.set_clip_on(False)  # the triangle lives outside the axes area
        ax.add_patch(t)

    wedge(ax, split, color=colors['wedge'])

    # A node can be highlighted via an explicit path without an instance X;
    # there is then no feature value to mark.
    if highlight_node and X is not None:
        wedge(ax, X[feature_idx], color=colors['highlight'])

    if filename is not None:
        plt.savefig(filename, bbox_inches='tight', pad_inches=0)
        plt.close()
def rtreeviz_univar(ax,
                    x_train: (pd.Series, np.ndarray),  # 1 vector of X data
                    y_train: (pd.Series, np.ndarray),
                    max_depth = 10,
                    feature_name: str = None,
                    target_name: str = None,
                    min_samples_leaf = 1,
                    fontsize: int = 14,
                    show={'title','splits'},
                    split_linewidth=.5,
                    mean_linewidth = 2,
                    markersize=None,
                    colors=None):
    """
    Plot a univariate regression tree over a scatter of the training data on ``ax``.

    A ``DecisionTreeRegressor`` is fit on the single feature ``x_train``. With
    'splits' in ``show``, each split is drawn as a dashed vertical line and the
    mean target between consecutive splits as a solid horizontal line.

    :param ax: Matplotlib axes to draw on.
    :param x_train: One vector of feature values (Series or 1-D ndarray).
    :param y_train: Target values (Series or ndarray), parallel to ``x_train``.
    :param max_depth: Maximum depth of the fitted tree.
    :param feature_name: x-axis label.
    :param target_name: y-axis label.
    :param min_samples_leaf: Minimum samples per leaf of the fitted tree.
    :param fontsize: Font size for labels, ticks and title.
    :param show: Collection of element names to draw: 'title' and/or 'splits'. Never mutated.
    :param split_linewidth: Line width of the split lines.
    :param mean_linewidth: Line width of the mean lines.
    :param markersize: Scatter marker size (``s``); matplotlib default when None.
    :param colors: Color overrides, normalized through ``adjust_colors``.
    """
    if isinstance(x_train, pd.Series):
        x_train = x_train.values
    if isinstance(y_train, pd.Series):
        y_train = y_train.values

    colors = adjust_colors(colors)

    y_range = (min(y_train), max(y_train))  # same y axis for all
    overall_feature_range = (np.min(x_train), np.max(x_train))

    x_column = x_train.reshape(-1, 1)
    t = tree.DecisionTreeRegressor(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    t.fit(x_column, y_train)

    shadow_tree = ShadowDecTree(t, x_column, y_train, feature_names=[feature_name])
    splits = sorted(node.split() for node in shadow_tree.internal)

    # Mean target inside every interval delimited by consecutive split points
    bins = [overall_feature_range[0]] + splits + [overall_feature_range[1]]
    means = []
    for left, right in zip(bins[:-1], bins[1:]):
        inrange = y_train[(x_train >= left) & (x_train <= right)]
        means.append(np.mean(inrange))

    ax.scatter(x_train, y_train, marker='o', alpha=.4, c=colors['scatter_marker'], s=markersize,
               edgecolor=colors['scatter_edge'], lw=.3)

    if 'splits' in show:
        for split in splits:
            ax.plot([split, split], [*y_range], '--', color=colors['split_line'], linewidth=split_linewidth)

        prevX = overall_feature_range[0]
        for i, m in enumerate(means):
            # The last interval ends at the right edge of the feature range
            split = splits[i] if i < len(splits) else overall_feature_range[1]
            ax.plot([prevX, split], [m, m], '-', color=colors['mean_line'], linewidth=mean_linewidth)
            prevX = split

    ax.tick_params(axis='both', which='major', width=.3, labelcolor=colors['tick_label'], labelsize=fontsize)

    # Use the axes we were handed; the pyplot functions act on the *current* axes,
    # which is not necessarily ``ax`` (e.g. in a multi-panel figure).
    if 'title' in show:
        title = f"Regression tree depth {max_depth}, samples per leaf {min_samples_leaf},\nTraining $R^2$={t.score(x_column, y_train):.3f}"
        ax.set_title(title, fontsize=fontsize, color=colors['title'])

    ax.set_xlabel(feature_name, fontsize=fontsize, color=colors['axis_label'])
    ax.set_ylabel(target_name, fontsize=fontsize, color=colors['axis_label'])
def dtreeviz(tree_model: (tree.DecisionTreeRegressor, tree.DecisionTreeClassifier),
             X_train: (pd.DataFrame, np.ndarray),
             y_train: (pd.Series, np.ndarray),
             feature_names: List[str],
             target_name: str,
             class_names: (Mapping[Number, str], List[str]) = None,  # required if classifier
             precision: int = 2,
             orientation: ('TD', 'LR') = "TD",
             show_root_edge_labels: bool = True,
             show_node_labels: bool = False,
             fancy: bool = True,
             histtype: ('bar', 'barstacked', 'strip') = 'barstacked',
             highlight_path: List[int] = [],
             X: np.ndarray = None,
             max_X_features_LR: int = 10,
             max_X_features_TD: int = 20,
             label_fontsize: int = 12,
             ticks_fontsize: int = 8,
             fontname: str = "Arial",
             colors: dict = None
             ) \
        -> DTreeViz:
    """
    Given a decision tree regressor or classifier, create and return a tree visualization
    using the graphviz (DOT) language.

    Per-node plots are written as SVG files into the system temp directory; the file
    names carry the node id and the current process id.

    :param tree_model: A DecisionTreeRegressor or DecisionTreeClassifier that has been
                       fit to X_train, y_train.
    :param X_train: A data frame or 2-D matrix of feature vectors used to train the model.
    :param y_train: A pandas Series or 1-D vector with target values or classes.
    :param feature_names: A list of the feature names.
    :param target_name: The name of the target variable.
    :param class_names: [For classifiers] A dictionary or list of strings mapping class
                        value to class name.
    :param precision: When displaying floating-point numbers, how many digits to display
                      after the decimal point. Default is 2.
    :param orientation: Is the tree top down, "TD", or left to right, "LR"?
    :param show_root_edge_labels: Include < and >= on the edges emanating from the root?
    :param show_node_labels: Add "Node id" to top of each node in graph for educational purposes
    :param fancy: If True, internal nodes are rendered as plots (SVG images); otherwise
                  they are plain ``feature@split`` text labels.
    :param histtype: [For classifiers] Either 'bar', 'barstacked' or 'strip' to indicate
                     histogram type. We find that 'barstacked' looks great up to about
                     four classes.
    :param highlight_path: A list of node IDs to highlight, default is [].
                           Useful for emphasizing node(s) in tree for discussion.
                           If X argument given then this is ignored. Never mutated.
    :type highlight_path: List[int]
    :param X: Instance to run down the tree; derived path to highlight from this vector.
              Show feature vector with labels underneath leaf reached. highlight_path
              is ignored if X is not None.
    :type X: np.ndarray
    :param label_fontsize: Size of the label font
    :param ticks_fontsize: Size of the tick font
    :param fontname: Font which is used for labels and text
    :param max_X_features_LR: If len(X) exceeds this limit for LR layout,
                              display only those features
                              used to guide X vector down tree. Helps when len(X) is large.
                              Default is 10.
    :param max_X_features_TD: If len(X) exceeds this limit for TD layout,
                              display only those features
                              used to guide X vector down tree. Helps when len(X) is large.
                              Default is 20.

    :return: A DTreeViz object wrapping the graphviz DOT description of the decision tree.
    :raises ValueError: If an object-dtype ``y_train`` cannot be converted to float or
                        is not one-dimensional.
    """

    def node_name(node: ShadowDecTreeNode) -> str:
        return f"node{node.id}"

    def split_node(node, name, gr_name, split):
        """DOT statement for an internal node (image when fancy, text otherwise)."""
        if fancy:
            labelgraph = node_label(node) if show_node_labels else ''
            html = f"""<table border="0">
            {labelgraph}
            <tr>
                    <td><img src="{tmp}/node{node.id}_{os.getpid()}.svg"/></td>
            </tr>
            </table>"""
        else:
            html = f"""<font face="Helvetica" color="#444443" point-size="12">{name}@{split}</font>"""
        if node.id in highlight_path:
            gr_node = f'{gr_name} [margin="0" shape=box penwidth=".5" color="{colors["highlight"]}" style="dashed" label=<{html}>]'
        else:
            gr_node = f'{gr_name} [margin="0" shape=none label=<{html}>]'
        return gr_node

    def regr_leaf_node(node, label_fontsize: int = 12):
        # always generate fancy regr leaves for now but shrink a bit for nonfancy.
        labelgraph = node_label(node) if show_node_labels else ''
        html = f"""<table border="0">
        {labelgraph}
        <tr>
                <td><img src="{tmp}/leaf{node.id}_{os.getpid()}.svg"/></td>
        </tr>
        </table>"""
        if node.id in highlight_path:
            return f'leaf{node.id} [margin="0" shape=box penwidth=".5" color="{colors["highlight"]}" style="dashed" label=<{html}>]'
        else:
            return f'leaf{node.id} [margin="0" shape=box penwidth="0" color="{colors["text"]}" label=<{html}>]'

    def class_leaf_node(node, label_fontsize: int = 12):
        labelgraph = node_label(node) if show_node_labels else ''
        html = f"""<table border="0" CELLBORDER="0">
        {labelgraph}
        <tr>
                <td><img src="{tmp}/leaf{node.id}_{os.getpid()}.svg"/></td>
        </tr>
        </table>"""
        if node.id in highlight_path:
            return f'leaf{node.id} [margin="0" shape=box penwidth=".5" color="{colors["highlight"]}" style="dashed" label=<{html}>]'
        else:
            return f'leaf{node.id} [margin="0" shape=box penwidth="0" color="{colors["text"]}" label=<{html}>]'

    def node_label(node):
        return f'<tr><td CELLPADDING="0" CELLSPACING="0"><font face="Helvetica" color="{colors["node_label"]}" point-size="14"><i>Node {node.id}</i></font></td></tr>'

    def class_legend_html():
        return f"""
        <table border="0" cellspacing="0" cellpadding="0">
            <tr>
                <td border="0" cellspacing="0" cellpadding="0"><img src="{tmp}/legend_{os.getpid()}.svg"/></td>
            </tr>
        </table>
        """

    def class_legend_gr():
        if not shadow_tree.isclassifier():
            return ""
        return f"""
            subgraph cluster_legend {{
                style=invis;
                legend [penwidth="0" margin="0" shape=box margin="0.03" width=.1, height=.1 label=<
                {class_legend_html()}
                >]
            }}
            """

    def instance_html(path, instance_fontsize: int = 11):
        """HTML table showing the instance X: feature names on top, values below."""
        headers = []
        features_used = [node.feature() for node in path[:-1]]  # don't include leaf
        display_X = X
        display_feature_names = feature_names
        highlight_feature_indexes = features_used
        if (orientation == 'TD' and len(X) > max_X_features_TD) or \
           (orientation == 'LR' and len(X) > max_X_features_LR):
            # squash all features down to just those used
            display_X = [X[i] for i in features_used] + ['...']
            display_feature_names = [node.feature_name() for node in path[:-1]] + ['...']
            highlight_feature_indexes = range(0, len(features_used))

        for i, name in enumerate(display_feature_names):
            if i in highlight_feature_indexes:
                color = colors['highlight']
            else:
                color = colors['text']
            headers.append(f'<td cellpadding="1" align="right" bgcolor="white">'
                           f'<font face="Helvetica" color="{color}" point-size="{instance_fontsize}">'
                           f'{name}'
                           '</font>'
                           '</td>')

        values = []
        for i, v in enumerate(display_X):
            if i in highlight_feature_indexes:
                color = colors['highlight']
            else:
                color = colors['text']
            if isinstance(v, int) or isinstance(v, str):
                disp_v = v
            else:
                disp_v = myround(v, precision)
            values.append(f'<td cellpadding="1" align="right" bgcolor="white">'
                          f'<font face="Helvetica" color="{color}" point-size="{instance_fontsize}">{disp_v}</font>'
                          '</td>')

        return f"""
        <table border="0" cellspacing="0" cellpadding="0">
        <tr>
        {''.join(headers)}
        </tr>
        <tr>
        {''.join(values)}
        </tr>
        </table>
        """

    def instance_gr():
        """DOT fragment attaching the instance table to the leaf that X reaches."""
        if X is None:
            return ""
        pred, path = shadow_tree.predict(X)
        leaf = f"leaf{path[-1].id}"
        # &#160; is a non-breaking space: keeps the label off the edge line
        if shadow_tree.isclassifier():
            edge_label = f" &#160;Prediction<br/> {path[-1].prediction_name()}"
        else:
            edge_label = f" &#160;Prediction<br/> {myround(path[-1].prediction(), precision)}"
        return f"""
            subgraph cluster_instance {{
                style=invis;
                X_y [penwidth="0.3" margin="0" shape=box margin="0.03" width=.1, height=.1 label=<
                {instance_html(path)}
                >]
            }}
            {leaf} -> X_y [dir=back; penwidth="1.2" color="{colors['highlight']}" label=<<font face="Helvetica" color="{colors['leaf_label']}" point-size="{11}">{edge_label}</font>>]
            """

    colors = adjust_colors(colors)

    if orientation == "TD":
        ranksep = ".2"
        nodesep = "0.1"
    else:
        if fancy:
            ranksep = ".22"
            nodesep = "0.1"
        else:
            ranksep = ".05"
            nodesep = "0.09"

    tmp = tempfile.gettempdir()
    # tmp = "/tmp"

    shadow_tree = ShadowDecTree(tree_model, X_train, y_train,
                                feature_names=feature_names, class_names=class_names)

    if X is not None:
        # The path taken by X overrides any caller-supplied highlight_path
        _, path = shadow_tree.predict(X)
        highlight_path = [n.id for n in path]

    n_classes = shadow_tree.nclasses()
    color_values = colors['classes'][n_classes]

    # Fix the mapping from target value to color for entire tree
    if shadow_tree.isclassifier():
        class_values = shadow_tree.unique_target_values
        color_map = {v: color_values[i] for i, v in enumerate(class_values)}
        draw_legend(shadow_tree, target_name, f"{tmp}/legend_{os.getpid()}.svg", colors=colors)

    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.values
    if isinstance(y_train, pd.Series):
        y_train = y_train.values
    if y_train.dtype == np.dtype(object):
        try:
            y_train = y_train.astype('float')
        except ValueError as e:
            raise ValueError('y_train needs to consist only of numerical values. {}'.format(e)) from e
        if len(y_train.shape) != 1:
            raise ValueError('y_train must a one-dimensional list or Pandas Series, got: {}'.format(y_train.shape))

    y_range = (min(y_train) * 1.03, max(y_train) * 1.03)  # same y axis for all

    # Find max height (count) for any bar in any node
    if shadow_tree.isclassifier():
        nbins = get_num_bins(histtype, n_classes)
        node_heights = shadow_tree.get_split_node_heights(X_train, y_train, nbins=nbins)

    internal = []
    for node in shadow_tree.internal:
        if fancy:
            if shadow_tree.isclassifier():
                class_split_viz(node, X_train, y_train,
                                filename=f"{tmp}/node{node.id}_{os.getpid()}.svg",
                                precision=precision,
                                colors={**color_map, **colors},
                                histtype=histtype,
                                node_heights=node_heights,
                                X=X,
                                ticks_fontsize=ticks_fontsize,
                                label_fontsize=label_fontsize,
                                fontname=fontname,
                                highlight_node=node.id in highlight_path)
            else:
                regr_split_viz(node, X_train, y_train,
                               filename=f"{tmp}/node{node.id}_{os.getpid()}.svg",
                               target_name=target_name,
                               y_range=y_range,
                               precision=precision,
                               X=X,
                               ticks_fontsize=ticks_fontsize,
                               label_fontsize=label_fontsize,
                               fontname=fontname,
                               highlight_node=node.id in highlight_path,
                               colors=colors)

        nname = node_name(node)
        gr_node = split_node(node, node.feature_name(), nname, split=myround(node.split(), precision))
        internal.append(gr_node)

    leaves = []
    for node in shadow_tree.leaves:
        if shadow_tree.isclassifier():
            class_leaf_viz(node, colors=color_values,
                           filename=f"{tmp}/leaf{node.id}_{os.getpid()}.svg",
                           graph_colors=colors)
            leaves.append(class_leaf_node(node))
        else:
            # for now, always gen leaf
            regr_leaf_viz(node,
                          y_train,
                          target_name=target_name,
                          filename=f"{tmp}/leaf{node.id}_{os.getpid()}.svg",
                          y_range=y_range,
                          precision=precision,
                          ticks_fontsize=ticks_fontsize,
                          label_fontsize=label_fontsize,
                          fontname=fontname,
                          colors=colors)
            leaves.append(regr_leaf_node(node))

    # Edge labels live inside graphviz HTML-like labels (label=<...>), so the
    # comparison signs must be character entities; a raw "<" makes the DOT invalid.
    show_edge_labels = False
    all_llabel = '&lt;' if show_edge_labels else ''
    all_rlabel = '&ge;' if show_edge_labels else ''
    root_llabel = '&lt;' if show_root_edge_labels else ''
    root_rlabel = '&ge;' if show_root_edge_labels else ''

    edges = []
    # non leaf edges with > and <=
    for node in shadow_tree.internal:
        nname = node_name(node)
        if node.left.isleaf():
            left_node_name = 'leaf%d' % node.left.id
        else:
            left_node_name = node_name(node.left)
        if node.right.isleaf():
            right_node_name = 'leaf%d' % node.right.id
        else:
            right_node_name = node_name(node.right)

        if node == shadow_tree.root:
            llabel = root_llabel
            rlabel = root_rlabel
        else:
            llabel = all_llabel
            rlabel = all_rlabel

        lcolor = rcolor = colors['arrow']
        lpw = rpw = "0.3"
        if node.left.id in highlight_path:
            lcolor = colors['highlight']
            lpw = "1.2"
        if node.right.id in highlight_path:
            # Color the RIGHT edge (previously this overwrote lcolor, so a highlighted
            # right branch got a thick but uncolored edge and the left edge was recolored)
            rcolor = colors['highlight']
            rpw = "1.2"
        edges.append(f'{nname} -> {left_node_name} [penwidth={lpw} color="{lcolor}" label=<{llabel}>]')
        edges.append(f'{nname} -> {right_node_name} [penwidth={rpw} color="{rcolor}" label=<{rlabel}>]')
        # Invisible edge keeps the left child to the left of the right child
        edges.append(f"""
        {{
            rank=same;
            {left_node_name} -> {right_node_name} [style=invis]
        }}
        """)

    # Backslashes are not allowed inside f-string expressions, hence the variable
    newline = "\n\t"
    dot = f"""
digraph G {{
    splines=line;
    nodesep={nodesep};
    ranksep={ranksep};
    rankdir={orientation};
    margin=0.0;
    node [margin="0.03" penwidth="0.5" width=.1, height=.1];
    edge [arrowsize=.4 penwidth="0.3"]

    {newline.join(internal)}
    {newline.join(edges)}
    {newline.join(leaves)}

    {class_legend_gr()}

    {instance_gr()}
}}
    """

    return DTreeViz(dot)
def ctreeviz_bivar(ax, X_train, y_train, feature_names, class_names, target_name,
                   max_depth=None, min_samples_leaf=None,
                   fontsize=14, fontname="Arial",
                   show={'title', 'legend', 'splits'},
                   colors=None):
    """
    Draw the tessellated 2D feature space of a classification tree onto ``ax``.

    A ``tree.DecisionTreeClassifier`` is fit on ``X_train``/``y_train`` and
    wrapped in a ``ShadowDecTree``.  When ``'splits'`` is in ``show``, one
    translucent rectangle is drawn per ``(node, bbox)`` pair returned by
    ``shadow_tree.tesselation()``, filled with the colour of the node's
    prediction.  The training points are then scattered on top using columns
    0 and 1 of ``X_train``, one colour per class.

    NOTE(review): the tree is fit on *all* columns of ``X_train`` while only
    the first two are plotted, so callers presumably need to pass exactly two
    feature columns -- confirm against callers.

    :param ax: matplotlib axes to draw on.
    :param X_train: feature matrix (``pd.DataFrame`` or array); a DataFrame is
                    converted with ``.values``.
    :param y_train: class labels (``pd.Series`` or array).
    :param feature_names: feature names; items 0 and 1 label the x and y axes.
    :param class_names: passed to ``ShadowDecTree`` and to the legend.
    :param target_name: passed to the legend.
    :param max_depth: forwarded to ``DecisionTreeClassifier``.
    :param min_samples_leaf: forwarded to ``DecisionTreeClassifier``; set to 1
                             when only ``max_depth`` is given.
    :param fontsize: font size for the axis labels and the title.
    :param fontname: font name for the axis labels.
    :param show: collection of element names to draw; recognised members are
                 ``'title'``, ``'legend'`` and ``'splits'``.  Only membership
                 tests are performed on it; it is never mutated.
    :param colors: colour overrides, passed through ``adjust_colors``.
    :raises ValueError: if both ``max_depth`` and ``min_samples_leaf`` are None.
    :return: None
    """
    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.values
    if isinstance(y_train, pd.Series):
        y_train = y_train.values

    if max_depth is None and min_samples_leaf is None:
        raise ValueError("Either max_depth or min_samples_leaf must be set")
    if max_depth is not None and min_samples_leaf is None:
        min_samples_leaf = 1

    colors = adjust_colors(colors)

    ct = tree.DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    ct.fit(X_train, y_train)

    shadow_tree = ShadowDecTree(ct, X_train, y_train,
                                feature_names=feature_names, class_names=class_names)

    tesselation = shadow_tree.tesselation()
    n_classes = shadow_tree.nclasses()
    class_values = shadow_tree.unique_target_values

    # color_map is keyed by the class VALUE, not by its position.
    color_values = colors['classes'][n_classes]
    color_map = {v: color_values[i] for i, v in enumerate(class_values)}

    if 'splits' in show:
        for node, bbox in tesselation:
            # bbox is used as (x0, y0, x1, y1).
            x = bbox[0]
            y = bbox[1]
            w = bbox[2] - bbox[0]
            h = bbox[3] - bbox[1]
            # angle is passed by keyword: recent matplotlib versions no longer
            # accept it positionally.
            rect = patches.Rectangle((x, y), w, h, angle=0,
                                     linewidth=.3, alpha=.4,
                                     edgecolor=colors['rect_edge'],
                                     facecolor=color_map[node.prediction()])
            ax.add_patch(rect)

    dot_w = 25
    for cl in class_values:
        members = X_train[y_train == cl]
        # Look the colour up by class value; indexing by loop position breaks
        # whenever the labels are not exactly 0..n_classes-1.
        ax.scatter(members[:, 0], members[:, 1], alpha=1, marker='o', s=dot_w,
                   c=color_map[cl],
                   edgecolors=colors['scatter_edge'], lw=.3)

    ax.set_xlabel(f"{feature_names[0]}", fontsize=fontsize, fontname=fontname,
                  color=colors['axis_label'])
    ax.set_ylabel(f"{feature_names[1]}", fontsize=fontsize, fontname=fontname,
                  color=colors['axis_label'])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_linewidth(.3)

    if 'legend' in show:
        add_classifier_legend(ax, class_names, class_values, color_map, target_name, colors)

    if 'title' in show:
        accur = ct.score(X_train, y_train)
        title = f"Classifier tree depth {max_depth}, training accuracy={accur*100:.2f}%"
        # Title the axes we were given rather than pyplot's "current" axes.
        ax.set_title(title, fontsize=fontsize, color=colors['title'])

    return None
def ctreeviz_univar(ax, x_train, y_train, feature_name, class_names, target_name,
                    max_depth=None, min_samples_leaf=None,
                    fontsize=14, fontname="Arial", nbins=25, gtype='strip',
                    show={'title', 'legend', 'splits'},
                    colors=None):
    """
    Draw a single-feature classification tree's partition of the x axis onto ``ax``.

    A ``tree.DecisionTreeClassifier`` is fit on the one feature ``x_train``.
    The samples are drawn either as a stacked histogram
    (``gtype='barstacked'``) or as one jittered strip of dots per class
    (``gtype='strip'``).  Along the bottom of the plot, a coloured box is drawn
    for every interval between consecutive split points, filled with the
    colour of the most frequent class among the training samples falling in
    that interval.

    :param ax: matplotlib axes to draw on.
    :param x_train: 1D feature values (``pd.Series`` or array).
    :param y_train: class labels (``pd.Series`` or array).
    :param feature_name: name of the feature; used as the x-axis label.
    :param class_names: passed to ``ShadowDecTree``, the histogram labels and
                        the legend.
    :param target_name: passed to the legend.
    :param max_depth: forwarded to ``DecisionTreeClassifier``.
    :param min_samples_leaf: forwarded to ``DecisionTreeClassifier``; set to 1
                             when only ``max_depth`` is given.
    :param fontsize: font size for the axis label, tick labels and the title.
    :param fontname: font name for the axis label.
    :param nbins: number of bin edges used for the ``'barstacked'`` histogram.
    :param gtype: ``'strip'`` or ``'barstacked'``.
    :param show: collection of element names to draw; recognised members are
                 ``'title'``, ``'legend'`` and ``'splits'``.  Only membership
                 tests are performed on it; it is never mutated.
    :param colors: colour overrides, passed through ``adjust_colors``.
    :raises ValueError: if both ``max_depth`` and ``min_samples_leaf`` are None.
    :return: None
    """
    if isinstance(x_train, pd.Series):
        x_train = x_train.values
    if isinstance(y_train, pd.Series):
        y_train = y_train.values

    if max_depth is None and min_samples_leaf is None:
        raise ValueError("Either max_depth or min_samples_leaf must be set")
    if max_depth is not None and min_samples_leaf is None:
        min_samples_leaf = 1

    colors = adjust_colors(colors)

    # ax.set_facecolor('#F9F9F9')
    # sklearn wants a 2D feature matrix; build the column view once.
    x_column = x_train.reshape(-1, 1)
    ct = tree.DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    ct.fit(x_column, y_train)

    shadow_tree = ShadowDecTree(ct, x_column, y_train,
                                feature_names=[feature_name], class_names=class_names)

    n_classes = shadow_tree.nclasses()
    overall_feature_range = (np.min(x_train), np.max(x_train))
    class_values = shadow_tree.unique_target_values

    # color_map is keyed by the class VALUE, not by its position.
    color_values = colors['classes'][n_classes]
    color_map = {v: color_values[i] for i, v in enumerate(class_values)}
    X_colors = [color_map[cl] for cl in class_values]

    ax.set_xlabel(f"{feature_name}", fontsize=fontsize, fontname=fontname,
                  color=colors['axis_label'])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.yaxis.set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_linewidth(.3)

    X_hist = [x_train[y_train == cl] for cl in class_values]

    if gtype == 'barstacked':
        bins = np.linspace(start=overall_feature_range[0], stop=overall_feature_range[1],
                           num=nbins, endpoint=True)
        hist, bins, barcontainers = ax.hist(X_hist,
                                            color=X_colors,
                                            align='mid',
                                            histtype='barstacked',
                                            bins=bins,
                                            label=class_names)

        for patch in barcontainers:
            for rect in patch.patches:
                rect.set_linewidth(.5)
                rect.set_edgecolor(colors['edge'])
        ax.set_xlim(*overall_feature_range)
        ax.set_xticks(overall_feature_range)
        ax.set_yticks([0, max([max(h) for h in hist])])
    elif gtype == 'strip':
        # user should pass in short and wide fig
        sigma = .013
        mu = .08
        class_step = .08
        dot_w = 20
        ax.set_ylim(0, mu + n_classes * class_step)
        for i, (cl, bucket) in enumerate(zip(class_values, X_hist)):
            # Each class gets its own horizontal band; gaussian noise spreads
            # the dots vertically within the band.
            y_noise = np.random.normal(mu + i * class_step, sigma, size=len(bucket))
            # Colour is looked up by class value; indexing by loop position
            # breaks whenever the labels are not exactly 0..n_classes-1.
            ax.scatter(bucket, y_noise, alpha=.7, marker='o', s=dot_w,
                       c=color_map[cl],
                       edgecolors=colors['scatter_edge'], lw=.3)

    ax.tick_params(axis='both', which='major', width=.3,
                   labelcolor=colors['tick_label'], labelsize=fontsize)

    splits = sorted(node.split() for node in shadow_tree.internal)
    bins = [ax.get_xlim()[0]] + splits + [ax.get_xlim()[1]]

    pred_box_height = .07 * ax.get_ylim()[1]
    for left, right in zip(bins[:-1], bins[1:]):
        inrange = y_train[(x_train >= left) & (x_train <= right)]
        if len(inrange) == 0:
            # No training samples in this interval: there is no majority class
            # to colour the box with (np.argmax would raise on empty input).
            continue
        values, counts = np.unique(inrange, return_counts=True)
        pred = values[np.argmax(counts)]
        rect = patches.Rectangle((left, 0), (right - left), pred_box_height,
                                 linewidth=.3,
                                 edgecolor=colors['edge'],
                                 facecolor=color_map[pred])
        ax.add_patch(rect)

    if 'legend' in show:
        add_classifier_legend(ax, class_names, class_values, color_map, target_name, colors)

    if 'title' in show:
        accur = ct.score(x_column, y_train)
        title = f"Classifier tree depth {max_depth}, training accuracy={accur*100:.2f}%"
        # Title the axes we were given rather than pyplot's "current" axes.
        ax.set_title(title, fontsize=fontsize, color=colors['title'])

    if 'splits' in show:
        for split in splits:
            # Draw on the passed axes, not on pyplot's "current" axes.
            ax.plot([split, split], [*ax.get_ylim()], '--',
                    color=colors['split_line'], linewidth=1)
def rtreeviz_bivar_3D(ax, X_train, y_train, max_depth, feature_names, target_name,
                      fontsize=14, ticks_fontsize=10, fontname="Arial",
                      azim=0, elev=0, dist=7,
                      show={'title'},
                      colors=None,
                      n_colors_in_map=100) -> None:
    """
    Draw the 3D prediction surface of a two-feature regression tree onto ``ax``.

    A ``tree.DecisionTreeRegressor`` is fit on ``X_train``/``y_train`` and
    wrapped in a ``ShadowDecTree``.  For every ``(node, bbox)`` pair returned
    by ``shadow_tree.tesselation()`` a flat plane is drawn over the bbox at the
    height of the node's prediction.  The training points are scattered on top
    using columns 0 and 1 of ``X_train`` for x/y and ``y_train`` for z.
    Planes and points are coloured from a gradient of ``n_colors_in_map``
    colours according to where their z value falls between min and max of
    ``y_train``.

    NOTE(review): the tree is fit on *all* columns of ``X_train`` while only
    the first two are plotted, so callers presumably need to pass exactly two
    feature columns -- confirm against callers.

    :param ax: 3D matplotlib axes to draw on (``view_init``, ``plot_surface``
               and ``set_zlabel`` are called on it).
    :param X_train: feature matrix (``pd.DataFrame`` or array).
    :param y_train: regression target (``pd.Series`` or array).
    :param max_depth: forwarded to ``DecisionTreeRegressor``.
    :param feature_names: feature names; items 0 and 1 label the x and y axes.
    :param target_name: label for the z axis.
    :param fontsize: font size for the axis labels and the title.
    :param ticks_fontsize: font size for the tick labels.
    :param fontname: font name for the axis labels.
    :param azim: azimuth passed to ``ax.view_init``.
    :param elev: elevation passed to ``ax.view_init``.
    :param dist: assigned to ``ax.dist``.
    :param show: collection of element names to draw; only ``'title'`` is
                 checked.  It is never mutated.
    :param colors: colour overrides, passed through ``adjust_colors``.
    :param n_colors_in_map: number of colours in the gradient.
    :return: None
    """
    if isinstance(X_train, pd.DataFrame):
        X_train = X_train.values
    if isinstance(y_train, pd.Series):
        y_train = y_train.values

    colors = adjust_colors(colors)
    ax.view_init(elev=elev, azim=azim)
    ax.dist = dist

    rt = tree.DecisionTreeRegressor(max_depth=max_depth)
    rt.fit(X_train, y_train)

    y_lim = np.min(y_train), np.max(y_train)
    y_range = y_lim[1] - y_lim[0]

    # The gradient keeps its own name.  Rebinding the same variable to the
    # per-sample colour list made the plane colouring index the wrong list.
    gradient = [rgb2hex(c.rgb, force_long=True)
                for c in Color(colors['color_map_min']).range_to(Color(colors['color_map_max']),
                                                                 n_colors_in_map)]

    def gradient_color(value):
        """Return the gradient colour for a target value between y_lim[0] and y_lim[1]."""
        if y_range == 0:
            # Constant target: every value maps to the same (first) colour
            # instead of dividing by zero.
            return gradient[0]
        idx = int(((value - y_lim[0]) / y_range) * (n_colors_in_map - 1))
        # Clamp so that float round-off can never index outside the gradient.
        return gradient[min(max(idx, 0), len(gradient) - 1)]

    def plane(node, bbox):
        """Draw one flat surface over bbox (x0, y0, x1, y1) at the node's prediction."""
        x = np.linspace(bbox[0], bbox[2], 2)
        y = np.linspace(bbox[1], bbox[3], 2)
        xx, yy = np.meshgrid(x, y)
        z = np.full(xx.shape, node.prediction())
        ax.plot_surface(xx, yy, z, alpha=.85, shade=False,
                        color=gradient_color(node.prediction()),
                        edgecolor=colors['edge'], lw=.3)

    sample_colors = [gradient_color(y) for y in y_train]

    shadow_tree = ShadowDecTree(rt, X_train, y_train, feature_names=feature_names)
    tesselation = shadow_tree.tesselation()

    for node, bbox in tesselation:
        plane(node, bbox)

    x, y, z = X_train[:, 0], X_train[:, 1], y_train
    ax.scatter(x, y, z, marker='o', alpha=.7,
               edgecolor=colors['scatter_edge'], lw=.3, c=sample_colors)

    ax.set_xlabel(f"{feature_names[0]}", fontsize=fontsize, fontname=fontname,
                  color=colors['axis_label'])
    ax.set_ylabel(f"{feature_names[1]}", fontsize=fontsize, fontname=fontname,
                  color=colors['axis_label'])
    ax.set_zlabel(f"{target_name}", fontsize=fontsize, fontname=fontname,
                  color=colors['axis_label'])

    ax.tick_params(axis='both', which='major', width=.3,
                   labelcolor=colors['tick_label'], labelsize=ticks_fontsize)

    if 'title' in show:
        accur = rt.score(X_train, y_train)
        title = f"Regression tree depth {max_depth}, training $R^2$={accur:.3f}"
        # Title the axes we were given rather than pyplot's "current" axes.
        ax.set_title(title, fontsize=fontsize, color=colors['title'])

    return None