def visualize(shap_values, feature_names=None, data=None, out_names=None):
    """ Visualize the given SHAP values with an additive force layout. """

    if type(shap_values) != np.ndarray:
        return iml.visualize(shap_values)

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]

    if shap_values.shape[0] == 1:
        if feature_names is None:
            feature_names = ["" for _ in range(shap_values.shape[1] - 1)]
        if data is None:
            data = ["" for _ in range(len(feature_names))]
        if type(data) == np.ndarray:
            data = data.flatten()

        instance = Instance(np.zeros((1, len(feature_names))), data)
        e = AdditiveExplanation(
            shap_values[0, -1],
            np.sum(shap_values[0, :]),
            shap_values[0, :-1],
            None,
            instance,
            IdentityLink(),
            Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names))
        )
        return e
    else:
        exps = []
        for i in range(shap_values.shape[0]):
            if feature_names is None:
                feature_names = ["" for _ in range(shap_values.shape[1] - 1)]
            if data is None:
                display_data = ["" for _ in range(len(feature_names))]
            else:
                display_data = data[i, :]

            instance = Instance(np.ones((1, len(feature_names))), display_data)
            e = AdditiveExplanation(
                shap_values[i, -1],
                np.sum(shap_values[i, :]),
                shap_values[i, :-1],
                None,
                instance,
                IdentityLink(),
                Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))), list(feature_names))
            )
            exps.append(e)
        return exps
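def _demo_visualize_legacy():
    # Hedged usage sketch (not part of the original module): in this legacy
    # format the base value rides along as the LAST column of shap_values, so
    # explaining 3 features takes a length-4 vector. All names and values here
    # are illustrative assumptions, not the library's own example.
    import numpy as np
    shap_values = np.array([0.2, -0.1, 0.05, 0.5])  # 3 per-feature values + base value
    return visualize(shap_values, feature_names=["f0", "f1", "f2"],
                     data=np.array([1.0, 2.0, 3.0]))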
def explain_instances(model, data, feature_names, out_names):
    if out_names is None:
        out_names = ["model output"]
    if feature_names is None:
        feature_names = [str(i + 1) for i in range(data.shape[1])]

    if type(model) == xgboost.core.Booster:
        exps = []
        contribs = model.predict(xgboost.DMatrix(data), pred_contribs=True)
        for i in range(data.shape[0]):
            instance = Instance(data[i:i + 1, :], data[i, :])
            e = AdditiveExplanation(
                contribs[i, -1],
                np.sum(contribs[i, :]),
                contribs[i, :-1],
                None,
                instance,
                IdentityLink(),
                Model(None, out_names),
                DenseData(np.zeros((1, data.shape[1])), list(feature_names))
            )
            exps.append(e)
        return exps
def explain_instance(model, data, feature_names, out_names):
    if out_names is None:
        out_names = ["model output"]
    if feature_names is None:
        feature_names = [str(i + 1) for i in range(data.shape[1])]

    if type(model) == xgboost.core.Booster:
        contribs = model.predict(xgboost.DMatrix(data), pred_contribs=True)
    elif type(model) == lightgbm.basic.Booster:
        contribs = model.predict(data, pred_contrib=True)
    else:
        return None

    instance = Instance(data[0:1, :], data[0, :])
    e = AdditiveExplanation(
        contribs[0, -1],
        np.sum(contribs[0, :]),
        contribs[0, :-1],
        None,
        instance,
        IdentityLink(),
        Model(None, out_names),
        DenseData(np.zeros((1, data.shape[1])), list(feature_names))
    )
    return e
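def _demo_explain_instance_xgboost():
    # Hedged usage sketch (illustrative only): fits a tiny xgboost model on
    # random data and explains its first row via the pred_contribs fast path
    # used above. The names X, y, and bst are assumptions, not part of the module.
    import numpy as np
    import xgboost
    X = np.random.rand(100, 4)
    y = X[:, 0] + 2.0 * X[:, 1]
    bst = xgboost.train({"max_depth": 3}, xgboost.DMatrix(X, label=y), num_boost_round=10)
    return explain_instance(bst, X, feature_names=None, out_names=None)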
def force_plot(shap_values, features=None, feature_names=None, out_names=None, link="identity",
               plot_cmap="RdBu"):
    """ Visualize the given SHAP values with an additive force layout. """

    assert not type(shap_values) == list, "The shap_values arg looks multi-output, try shap_values[i]."

    link = iml.links.convert_to_link(link)

    if type(shap_values) != np.ndarray:
        return iml.visualize(shap_values)

    # convert from a DataFrame or other types
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif str(type(features)) == "list":
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(features.shape) == 1 and feature_names is None:
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]

    if shap_values.shape[0] == 1:
        if feature_names is None:
            feature_names = ["Feature " + str(i) for i in range(shap_values.shape[1] - 1)]
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        instance = Instance(np.zeros((1, len(feature_names))), features)
        e = AdditiveExplanation(
            shap_values[0, -1],
            np.sum(shap_values[0, :]),
            shap_values[0, :-1],
            None,
            instance,
            link,
            Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names))
        )
        return iml.visualize(e, plot_cmap)
    else:
        if shap_values.shape[0] > 3000:
            warnings.warn("shap.force_plot is slow for many thousands of rows, try subsampling your data.")

        exps = []
        for i in range(shap_values.shape[0]):
            if feature_names is None:
                feature_names = ["Feature " + str(i) for i in range(shap_values.shape[1] - 1)]
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[i, :]

            instance = Instance(np.ones((1, len(feature_names))), display_features)
            e = AdditiveExplanation(
                shap_values[i, -1],
                np.sum(shap_values[i, :]),
                shap_values[i, :-1],
                None,
                instance,
                link,
                Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))), list(feature_names))
            )
            exps.append(e)
        return iml.visualize(exps, plot_cmap=plot_cmap)
def explain(self, incoming_instance, **kwargs):
    # convert incoming input to a standardized iml object
    instance = convert_to_instance(incoming_instance)
    match_instance_to_data(instance, self.data)

    # find the feature groups we will test. If a feature does not change from its
    # current value then we know it doesn't impact the model
    self.varyingInds = self.varying_groups(instance.x)
    self.varyingFeatureGroups = [self.data.groups[i] for i in self.varyingInds]
    self.M = len(self.varyingFeatureGroups)

    # find f(x)
    if self.keep_index:
        model_out = self.model.f(instance.convert_to_df())
    else:
        model_out = self.model.f(instance.x)
    if isinstance(model_out, (pd.DataFrame, pd.Series)):
        model_out = model_out.values
    self.fx = model_out[0]
    if not self.vector_out:
        self.fx = np.array([self.fx])

    # if no features vary then no feature has an effect
    if self.M == 0:
        phi = np.zeros((len(self.data.groups), self.D))
        phi_var = np.zeros((len(self.data.groups), self.D))

    # if only one feature varies then it has all the effect
    elif self.M == 1:
        phi = np.zeros((len(self.data.groups), self.D))
        phi_var = np.zeros((len(self.data.groups), self.D))
        diff = self.link.f(self.fx) - self.link.f(self.fnull)
        for d in range(self.D):
            phi[self.varyingInds[0], d] = diff[d]

    # if more than one feature varies then we have to do real work
    else:
        self.l1_reg = kwargs.get("l1_reg", "auto")

        # pick a reasonable number of samples if the user didn't specify how many they wanted
        self.nsamples = kwargs.get("nsamples", "auto")
        if self.nsamples == "auto":
            self.nsamples = 2 * self.M + 2**11

        # if we have enough samples to enumerate all subsets then ignore the unneeded samples
        self.max_samples = 2 ** 30
        if self.M <= 30:
            self.max_samples = 2 ** self.M - 2
            if self.nsamples > self.max_samples:
                self.nsamples = self.max_samples

        # reserve space for some of our computations
        self.allocate()

        # weight the different subset sizes
        num_subset_sizes = int(np.ceil((self.M - 1) / 2.0))
        num_paired_subset_sizes = int(np.floor((self.M - 1) / 2.0))
        weight_vector = np.array([(self.M - 1.0) / (i * (self.M - i)) for i in range(1, num_subset_sizes + 1)])
        weight_vector[:num_paired_subset_sizes] *= 2
        weight_vector /= np.sum(weight_vector)
        log.debug("weight_vector = {0}".format(weight_vector))
        log.debug("num_subset_sizes = {0}".format(num_subset_sizes))
        log.debug("num_paired_subset_sizes = {0}".format(num_paired_subset_sizes))
        log.debug("M = {0}".format(self.M))

        # fill out all the subset sizes we can completely enumerate
        # given nsamples*remaining_weight_vector[subset_size]
        num_full_subsets = 0
        num_samples_left = self.nsamples
        group_inds = np.arange(self.M, dtype='int64')
        mask = np.zeros(self.M)
        remaining_weight_vector = copy.copy(weight_vector)
        for subset_size in range(1, num_subset_sizes + 1):

            # determine how many subsets (and their complements) are of the current size
            nsubsets = binom(self.M, subset_size)
            if subset_size <= num_paired_subset_sizes:
                nsubsets *= 2
            log.debug("subset_size = {0}".format(subset_size))
            log.debug("nsubsets = {0}".format(nsubsets))
            log.debug("self.nsamples*weight_vector[subset_size-1] = {0}".format(
                num_samples_left * remaining_weight_vector[subset_size - 1]))
            log.debug("self.nsamples*weight_vector[subset_size-1]/nsubsets = {0}".format(
                num_samples_left * remaining_weight_vector[subset_size - 1] / nsubsets))

            # see if we have enough samples to enumerate all subsets of this size
            if num_samples_left * remaining_weight_vector[subset_size - 1] / nsubsets >= 1.0 - 1e-8:
                num_full_subsets += 1
                num_samples_left -= nsubsets

                # rescale what's left of the remaining weight vector to sum to 1
                if remaining_weight_vector[subset_size - 1] < 1.0:
                    remaining_weight_vector /= (1 - remaining_weight_vector[subset_size - 1])

                # add all the samples of the current subset size
                w = weight_vector[subset_size - 1] / binom(self.M, subset_size)
                if subset_size <= num_paired_subset_sizes:
                    w /= 2.0
                for inds in itertools.combinations(group_inds, subset_size):
                    mask[:] = 0.0
                    mask[np.array(inds, dtype='int64')] = 1.0
                    self.addsample(instance.x, mask, w)
                    if subset_size <= num_paired_subset_sizes:
                        mask[:] = np.abs(mask - 1)
                        self.addsample(instance.x, mask, w)
            else:
                break
        log.info("num_full_subsets = {0}".format(num_full_subsets))

        # add random samples from what is left of the subset space
        samples_left = self.nsamples - self.nsamplesAdded
        log.debug("samples_left = {0}".format(samples_left))
        if num_full_subsets != num_subset_sizes:
            weight_left = np.sum(weight_vector[num_full_subsets:])
            rand_sample_weight = weight_left / samples_left
            log.info("weight_left = {0}".format(weight_left))
            log.info("rand_sample_weight = {0}".format(rand_sample_weight))
            remaining_weight_vector = weight_vector[num_full_subsets:]
            remaining_weight_vector /= np.sum(remaining_weight_vector)
            log.info("remaining_weight_vector = {0}".format(remaining_weight_vector))
            log.info("num_paired_subset_sizes = {0}".format(num_paired_subset_sizes))
            ind_set = np.arange(len(remaining_weight_vector))
            while samples_left > 0:
                mask[:] = 0.0
                np.random.shuffle(group_inds)
                ind = np.random.choice(ind_set, 1, p=remaining_weight_vector)[0]
                mask[group_inds[:ind + num_full_subsets + 1]] = 1.0
                samples_left -= 1
                self.addsample(instance.x, mask, rand_sample_weight)

                # add the complement sample
                if samples_left > 0:
                    mask -= 1.0
                    mask[:] = np.abs(mask)
                    self.addsample(instance.x, mask, rand_sample_weight)
                    samples_left -= 1

        # execute the model on the synthetic samples we have created
        self.run()

        # solve then expand the feature importance (Shapley value) vector to contain the non-varying features
        phi = np.zeros((len(self.data.groups), self.D))
        phi_var = np.zeros((len(self.data.groups), self.D))
        for d in range(self.D):
            vphi, vphi_var = self.solve(self.nsamples / self.max_samples, d)
            phi[self.varyingInds, d] = vphi
            phi_var[self.varyingInds, d] = vphi_var

    if not self.vector_out:
        phi = np.squeeze(phi, axis=1)
        phi_var = np.squeeze(phi_var, axis=1)

    # return the Shapley values along with variances of the estimates
    # note that if features were eliminated by l1 regression their
    # variance will be 0, even though they are not perfectly known
    return AdditiveExplanation(
        self.link.f(self.fnull if self.vector_out else self.fnull[0]),
        self.link.f(self.fx if self.vector_out else self.fx[0]),
        phi, phi_var, instance, self.link, self.model, self.data
    )
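def _demo_kernel_weights(M=8):
    # Hedged sketch (illustrative, not part of the explainer): recomputes the
    # subset-size weighting used in explain() above. Kernel SHAP gives a
    # coalition of size s weight proportional to (M - 1) / (s * (M - s)), and
    # the sizes that come in complement pairs are doubled because each sampled
    # mask is paired with its complement.
    import numpy as np
    num_subset_sizes = int(np.ceil((M - 1) / 2.0))
    num_paired_subset_sizes = int(np.floor((M - 1) / 2.0))
    weights = np.array([(M - 1.0) / (s * (M - s)) for s in range(1, num_subset_sizes + 1)])
    weights[:num_paired_subset_sizes] *= 2
    weights /= np.sum(weights)
    return weights  # for M=8 the mass is heaviest at size 1 (and its complement, size 7)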
def force_plot(base_value, shap_values, features=None, feature_names=None, out_names=None,
               link="identity", plot_cmap="RdBu"):
    """ Visualize the given SHAP values with an additive force layout. """

    # auto unwrap the base_value
    if type(base_value) == np.ndarray and len(base_value) == 1:
        base_value = base_value[0]

    if (type(base_value) == np.ndarray or type(base_value) == list):
        if type(shap_values) != list or len(shap_values) != len(base_value):
            raise Exception("In v0.20 force_plot now requires the base value as the first parameter! "
                            "Try shap.force_plot(explainer.expected_value, shap_values) or "
                            "for multi-output models try "
                            "shap.force_plot(explainer.expected_value[0], shap_values[0]).")

    assert not type(shap_values) == list, "The shap_values arg looks multi-output, try shap_values[i]."

    link = iml.links.convert_to_link(link)

    if type(shap_values) != np.ndarray:
        return iml.visualize(shap_values)

    # convert from a DataFrame or other types
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif isinstance(features, list):
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(features.shape) == 1 and feature_names is None:
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]

    if shap_values.shape[0] == 1:
        if feature_names is None:
            feature_names = [labels['FEATURE'] % str(i) for i in range(shap_values.shape[1])]
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        # check that the shape of the shap_values and features match
        if len(features) != shap_values.shape[1]:
            msg = "Length of features is not equal to the length of shap_values!"
            if len(features) == shap_values.shape[1] - 1:
                msg += " You might be using an old format shap_values array with the base value " \
                       "as the last column. In this case just pass the array without the last column."
            raise Exception(msg)

        instance = Instance(np.zeros((1, len(feature_names))), features)
        e = AdditiveExplanation(
            base_value,
            np.sum(shap_values[0, :]) + base_value,
            shap_values[0, :],
            None,
            instance,
            link,
            Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names))
        )
        return iml.visualize(e, plot_cmap)
    else:
        if shap_values.shape[0] > 3000:
            warnings.warn("shap.force_plot is slow for many thousands of rows, try subsampling your data.")

        exps = []
        for i in range(shap_values.shape[0]):
            if feature_names is None:
                feature_names = [labels['FEATURE'] % str(i) for i in range(shap_values.shape[1])]
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[i, :]

            instance = Instance(np.ones((1, len(feature_names))), display_features)
            e = AdditiveExplanation(
                base_value,
                np.sum(shap_values[i, :]) + base_value,
                shap_values[i, :],
                None,
                instance,
                link,
                Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))), list(feature_names))
            )
            exps.append(e)
        return iml.visualize(exps, plot_cmap=plot_cmap)
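def _demo_force_plot_v020():
    # Hedged usage sketch (illustrative names and values): since v0.20 the base
    # value is the first argument and shap_values no longer carries it as a
    # trailing column, so a 3-feature explanation is a length-3 vector.
    import numpy as np
    base_value = 0.5
    shap_values = np.array([0.2, -0.1, 0.05])  # one SHAP value per feature
    features = np.array([1.0, 2.0, 3.0])       # the row being explained
    return force_plot(base_value, shap_values, features,
                      feature_names=["f0", "f1", "f2"])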
def explain(self, incoming_instance, **kwargs):
    # convert incoming input to a standardized iml object
    instance = convert_to_instance(incoming_instance)
    match_instance_to_data(instance, self.data)
    assert len(self.data.groups) == self.P, "SamplingExplainer does not support feature groups!"

    # find the feature groups we will test. If a feature does not change from its
    # current value then we know it doesn't impact the model
    self.varyingInds = self.varying_groups(instance.x)
    #self.varyingFeatureGroups = [self.data.groups[i] for i in self.varyingInds]
    self.M = len(self.varyingInds)

    # find f(x)
    if self.keep_index:
        model_out = self.model.f(instance.convert_to_df())
    else:
        model_out = self.model.f(instance.x)
    if isinstance(model_out, (pd.DataFrame, pd.Series)):
        model_out = model_out.values[0]
    self.fx = model_out[0]
    if not self.vector_out:
        self.fx = np.array([self.fx])

    # if no features vary then no feature has an effect
    if self.M == 0:
        phi = np.zeros((len(self.data.groups), self.D))
        phi_var = np.zeros((len(self.data.groups), self.D))

    # if only one feature varies then it has all the effect
    elif self.M == 1:
        phi = np.zeros((len(self.data.groups), self.D))
        phi_var = np.zeros((len(self.data.groups), self.D))
        diff = self.fx - self.fnull
        for d in range(self.D):
            phi[self.varyingInds[0], d] = diff[d]

    # if more than one feature varies then we have to do real work
    else:

        # pick a reasonable number of samples if the user didn't specify how many they wanted
        self.nsamples = kwargs.get("nsamples", 0)
        assert self.nsamples % 2 == 0, "nsamples must be divisible by 2!"
        if self.nsamples == 0:
            self.nsamples = 1000 * self.M

        min_samples_per_feature = kwargs.get("min_samples_per_feature", 100)
        round1_samples = self.nsamples
        round2_samples = 0
        if round1_samples > self.M * min_samples_per_feature:
            round2_samples = round1_samples - self.M * min_samples_per_feature
            round1_samples -= round2_samples

        # divide up the samples among the features for round 1
        nsamples_each1 = np.ones(self.M, dtype=np.int64) * 2 * (round1_samples // (self.M * 2))
        for i in range((round1_samples % (self.M * 2)) // 2):
            nsamples_each1[i] += 2

        # explain every feature in round 1
        phi = np.zeros((self.P, self.D))
        phi_var = np.zeros((self.P, self.D))
        self.X_masked = np.zeros((nsamples_each1.max(), self.data.data.shape[1]))
        for i, ind in enumerate(self.varyingInds):
            phi[ind, :], phi_var[ind, :] = self.sampling_estimate(ind, self.model.f, instance.x, self.data.data, nsamples=nsamples_each1[i])

        # optimally allocate samples according to the variance
        phi_var /= phi_var.sum()
        nsamples_each2 = (phi_var[self.varyingInds, :].mean(1) * round2_samples).astype(np.int64)
        for i in range(len(nsamples_each2)):
            if nsamples_each2[i] % 2 == 1:
                nsamples_each2[i] += 1
        for i in range(len(nsamples_each2)):
            if nsamples_each2.sum() > round2_samples:
                nsamples_each2[i] -= 2
            elif nsamples_each2.sum() < round2_samples:
                nsamples_each2[i] += 2
            else:
                break

        self.X_masked = np.zeros((nsamples_each2.max(), self.data.data.shape[1]))
        for i, ind in enumerate(self.varyingInds):
            if nsamples_each2[i] > 0:
                val, var = self.sampling_estimate(ind, self.model.f, instance.x, self.data.data, nsamples=nsamples_each2[i])
                total_samples = nsamples_each1[i] + nsamples_each2[i]
                phi[ind, :] = (phi[ind, :] * nsamples_each1[i] + val * nsamples_each2[i]) / total_samples
                phi_var[ind, :] = (phi_var[ind, :] * nsamples_each1[i] + var * nsamples_each2[i]) / total_samples

        # convert from the variance of the differences to the variance of the mean (phi)
        for i, ind in enumerate(self.varyingInds):
            phi_var[ind, :] /= np.sqrt(nsamples_each1[i] + nsamples_each2[i])

        # correct the sum of the SHAP values to equal the output of the model using a linear
        # regression model with priors of the coefficients equal to the estimated variances for each
        # SHAP value (note that 1e-6 is designed to increase the weight of the sample and so closely
        # match the correct sum)
        sum_error = self.fx - phi.sum(0) - self.fnull
        for i in range(self.D):
            inds = np.where(phi_var[:, i] > 0)[0]
            normed_vars = 1 / phi_var[inds, i]
            normed_vars /= normed_vars.sum()
            adj = np.linalg.inv(np.ones((len(inds), len(inds))) + 1e-6 * np.diag(normed_vars)).sum(1) * sum_error[i]
            adj += (sum_error[i] - adj.sum()) / len(adj)
            phi[inds, i] += adj

    if phi.shape[1] == 1:
        phi = phi[:, 0]

    return AdditiveExplanation(self.fnull, model_out, phi, np.zeros(len(phi)), instance, self.link, self.model, self.data)
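def _demo_round2_allocation(round2_samples=1000):
    # Hedged sketch of the round-2 budgeting above (illustrative numbers): after
    # a uniform first pass, the remaining sample budget is split across features
    # in proportion to the estimated variance of each SHAP estimate, then odd
    # counts are rounded up to even ones because samples are drawn in
    # complementary pairs.
    import numpy as np
    phi_var = np.array([0.5, 0.3, 0.15, 0.05])           # normalized variance estimates
    alloc = (phi_var * round2_samples).astype(np.int64)
    alloc += alloc % 2                                   # force every count to be even
    return alloc                                         # here: [500, 300, 150, 50]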
def visualize(shap_values, features=None, feature_names=None, out_names=None, data=None):
    """ Visualize the given SHAP values with an additive force layout. """

    # backwards compatibility
    if data is not None:
        warnings.warn("the 'data' parameter has been renamed to 'features' for consistency")
        if features is None:
            features = data

    if type(shap_values) != np.ndarray:
        return iml.visualize(shap_values)

    # convert from a DataFrame or other types
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif str(type(features)) == "list":
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(features.shape) == 1 and feature_names is None:
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]

    if shap_values.shape[0] == 1:
        if feature_names is None:
            feature_names = ["" for _ in range(shap_values.shape[1] - 1)]
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        instance = Instance(np.zeros((1, len(feature_names))), features)
        e = AdditiveExplanation(
            shap_values[0, -1],
            np.sum(shap_values[0, :]),
            shap_values[0, :-1],
            None,
            instance,
            IdentityLink(),
            Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names))
        )
        return e
    else:
        exps = []
        for i in range(shap_values.shape[0]):
            if feature_names is None:
                feature_names = ["" for _ in range(shap_values.shape[1] - 1)]
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[i, :]

            instance = Instance(np.ones((1, len(feature_names))), display_features)
            e = AdditiveExplanation(
                shap_values[i, -1],
                np.sum(shap_values[i, :]),
                shap_values[i, :-1],
                None,
                instance,
                IdentityLink(),
                Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))), list(feature_names))
            )
            exps.append(e)
        return exps
def force_plot(shap_values, features=None, feature_names=None, out_names=None, link="identity"):
    """ Visualize the given SHAP values with an additive force layout. """

    link = iml.links.convert_to_link(link)

    if type(shap_values) != np.ndarray:
        return iml.visualize(shap_values)

    # convert from a DataFrame or other types
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif str(type(features)) == "list":
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(features.shape) == 1 and feature_names is None:
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]

    if shap_values.shape[0] == 1:
        if feature_names is None:
            feature_names = ["" for _ in range(shap_values.shape[1] - 1)]
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        instance = Instance(np.zeros((1, len(feature_names))), features)
        e = AdditiveExplanation(
            shap_values[0, -1],
            np.sum(shap_values[0, :]),
            shap_values[0, :-1],
            None,
            instance,
            link,
            Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names))
        )
        return e
    else:
        exps = []
        for i in range(shap_values.shape[0]):
            if feature_names is None:
                feature_names = ["" for _ in range(shap_values.shape[1] - 1)]
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[i, :]

            instance = Instance(np.ones((1, len(feature_names))), display_features)
            e = AdditiveExplanation(
                shap_values[i, -1],
                np.sum(shap_values[i, :]),
                shap_values[i, :-1],
                None,
                instance,
                link,
                Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))), list(feature_names))
            )
            exps.append(e)
        return exps
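def _demo_force_plot_logit():
    # Hedged usage sketch (illustrative values): this version accepts a link
    # argument, so margin-space SHAP values from a binary classifier can be
    # shown on the probability scale with link="logit". The legacy format still
    # applies here: the base value is the last column of shap_values.
    import numpy as np
    shap_values = np.array([0.8, -0.3, 0.1, -0.2])  # 3 log-odds contributions + base value
    features = np.array([4.2, 0.0, 1.5])
    return force_plot(shap_values, features,
                      feature_names=["f0", "f1", "f2"], link="logit")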