예제 #1
0
    def explain(self, incoming_instance, **kwargs):
        """Estimate additive feature attributions (Shapley values) for one instance.

        Feature groups that do not vary are given zero effect. For the remaining
        ``M`` groups, subset sizes are enumerated exhaustively while the sample
        budget allows it, the rest of the budget is spent on random (paired)
        subsets, and the attributions are obtained from ``self.run()`` followed
        by ``self.solve()``.

        Args:
            incoming_instance: The input to explain; it is normalised through
                ``convert_to_instance``.
            **kwargs: Optional settings. ``nsamples`` is the number of synthetic
                samples to use (missing or 0 selects ``2 * M + 1000``);
                ``l1_reg`` is stored on ``self.l1_reg`` (default ``"auto"``).

        Returns:
            An ``AdditiveExplanation`` built from the expected model output, the
            model output for the instance, the attributions and their variance
            estimates.
        """
        # convert incoming input to a standardized iml object
        instance = convert_to_instance(incoming_instance)
        match_instance_to_data(instance, self.data)

        # find the feature groups we will test. If a feature does not change from its
        # current value then we know it doesn't impact the model
        self.varyingInds = self.varying_groups(instance.x)
        self.varyingFeatureGroups = [self.data.groups[i] for i in self.varyingInds]
        self.M = len(self.varyingFeatureGroups)

        # find f(x) and E_x[f(x)]
        model_out = self.model.f(instance.x)
        self.fx = model_out[0]
        self.fnull = np.mean(self.model.f(self.data.data), 0)
        self.vector_out = True
        if len(model_out.shape) == 1:
            # scalar model output: wrap in length-1 arrays so the code below can
            # treat both cases uniformly (unwrapped again before returning)
            self.vector_out = False
            self.D = 1
            self.fx = np.array([self.fx])
            self.fnull = np.array([self.fnull])
        else:
            self.D = model_out.shape[1]

        # if no features vary then no feature has an effect
        if self.M == 0:
            phi = np.zeros(len(self.data.groups))
            phi_var = np.zeros(len(self.data.groups))
            return AdditiveExplanation(self.fnull, self.fx, phi, phi_var, instance, self.link, self.model, self.data)

        # if only one feature varies then it has all the effect
        elif self.M == 1:
            phi = np.zeros(len(self.data.groups))
            phi[self.varyingInds[0]] = self.link.f(self.fx) - self.link.f(self.fnull)
            phi_var = np.zeros(len(self.data.groups))
            return AdditiveExplanation(self.fnull, self.fx, phi, phi_var, instance, self.link, self.model, self.data)

        self.l1_reg = kwargs.get("l1_reg", "auto")

        # pick a reasonable number of samples if the user didn't specify how many they wanted
        self.nsamples = kwargs.get("nsamples", 0)
        if self.nsamples == 0:
            self.nsamples = 2 * self.M + 1000

        # if we have enough samples to enumerate all subsets then ignore the unneeded samples
        self.max_samples = 2 ** 30
        if self.M <= 30 and self.nsamples > 2 ** self.M - 2:
            self.nsamples = 2 ** self.M - 2
            self.max_samples = self.nsamples

        # reserve space for some of our computations
        self.allocate()

        # weight the different subset sizes
        # (builtin int: the np.int alias was removed from NumPy in 1.24)
        num_subset_sizes = int(np.ceil((self.M - 1) / 2.0))
        num_paired_subset_sizes = int(np.floor((self.M - 1) / 2.0))
        weight_vector = np.array([(self.M - 1.0) / (i * (self.M - i)) for i in range(1, num_subset_sizes + 1)])
        # sizes that have a distinct complement size count twice
        weight_vector[:num_paired_subset_sizes] *= 2
        weight_vector /= np.sum(weight_vector)
        log.debug("weight_vector = {0}".format(weight_vector))
        log.debug("num_subset_sizes = {0}".format(num_subset_sizes))
        log.debug("num_paired_subset_sizes = {0}".format(num_paired_subset_sizes))

        # fill out all the subset sizes we can completely enumerate
        # given nsamples*remaining_weight_vector[subset_size]
        num_full_subsets = 0
        num_samples_left = self.nsamples
        group_inds = np.arange(self.M, dtype='int64')
        mask = np.zeros(self.M)
        remaining_weight_vector = copy.copy(weight_vector)
        for subset_size in range(1, num_subset_sizes + 1):

            # determine how many subsets (and their complements) are of the current size
            nsubsets = binom(self.M, subset_size)
            if subset_size <= num_paired_subset_sizes: nsubsets *= 2
            log.debug("subset_size = {0}".format(subset_size))
            log.debug("nsubsets = {0}".format(nsubsets))
            log.debug("self.nsamples*weight_vector[subset_size-1] = {0}".format(
                num_samples_left * remaining_weight_vector[subset_size - 1]))
            log.debug("self.nsamples*weight_vector[subset_size-1/nsubsets = {0}".format(
                num_samples_left * remaining_weight_vector[subset_size - 1] / nsubsets))

            # see if we have enough samples to enumerate all subsets of this size
            if num_samples_left * remaining_weight_vector[subset_size - 1] / nsubsets >= 1.0 - 1e-8:
                num_full_subsets += 1
                num_samples_left -= nsubsets

                # rescale what's left of the remaining weight vector to sum to 1
                if remaining_weight_vector[subset_size - 1] < 1.0:
                    remaining_weight_vector /= (1 - remaining_weight_vector[subset_size - 1])

                # add all the samples of the current subset size
                w = weight_vector[subset_size - 1] / binom(self.M, subset_size)
                if subset_size <= num_paired_subset_sizes: w /= 2.0
                for inds in itertools.combinations(group_inds, subset_size):
                    mask[:] = 0.0
                    mask[np.array(inds, dtype='int64')] = 1.0
                    self.addsample(instance.x, mask, w)
                    if subset_size <= num_paired_subset_sizes:
                        # flip the mask to get the complement subset
                        mask[:] = np.abs(mask - 1)
                        self.addsample(instance.x, mask, w)
            else:
                break
        log.info("num_full_subsets = {0}".format(num_full_subsets))

        # add random samples from what is left of the subset space
        samples_left = self.nsamples - self.nsamplesAdded
        log.debug("samples_left = {0}".format(samples_left))
        if num_full_subsets != num_subset_sizes:
            weight_left = np.sum(weight_vector[num_full_subsets:])
            rand_sample_weight = weight_left / samples_left
            log.info("weight_left = {0}".format(weight_left))
            log.info("rand_sample_weight = {0}".format(rand_sample_weight))
            # NOTE: this is a view, so the in-place normalisation below also
            # rescales the tail of weight_vector (not read again in this method)
            remaining_weight_vector = weight_vector[num_full_subsets:]
            remaining_weight_vector /= np.sum(remaining_weight_vector)
            log.info("remaining_weight_vector = {0}".format(remaining_weight_vector))
            log.info("num_paired_subset_sizes = {0}".format(num_paired_subset_sizes))
            ind_set = np.arange(len(remaining_weight_vector))
            while samples_left > 0:
                mask[:] = 0.0
                np.random.shuffle(group_inds)
                # draw a subset size (offset by the sizes already enumerated)
                ind = np.random.choice(ind_set, 1, p=remaining_weight_vector)[0]
                mask[group_inds[:ind + num_full_subsets + 1]] = 1.0
                samples_left -= 1
                self.addsample(instance.x, mask, rand_sample_weight)

                # add the complement sample
                if samples_left > 0:
                    mask -= 1.0
                    mask[:] = np.abs(mask)
                    self.addsample(instance.x, mask, rand_sample_weight)
                    samples_left -= 1

        # execute the model on the synthetic samples we have created
        self.run()

        # solve then expand the feature importance (Shapley value) vector to contain the non-varying features
        phi = np.zeros((len(self.data.groups), self.D))
        phi_var = np.zeros((len(self.data.groups), self.D))
        for d in range(self.D):
            vphi, vphi_var = self.solve(self.nsamples / self.max_samples, d)
            phi[self.varyingInds, d] = vphi
            phi_var[self.varyingInds, d] = vphi_var

        if not self.vector_out:
            # undo the length-1 wrapping applied for scalar model outputs
            phi = np.squeeze(phi, axis=1)
            phi_var = np.squeeze(phi_var, axis=1)
            self.fx = self.fx[0]
            self.fnull = self.fnull[0]

        # return the Shapley values along with variances of the estimates
        # note that if features were eliminated by l1 regression their
        # variance will be 0, even though they are not perfectly known
        return AdditiveExplanation(self.link.f(self.fnull), self.link.f(self.fx), phi, phi_var, instance, self.link,
                                   self.model, self.data)
예제 #2
0
파일: plots.py 프로젝트: gustavocarita/shap
def visualize(shap_values,
              features=None,
              feature_names=None,
              out_names=None,
              data=None):
    """ Visualize the given SHAP values with an additive force layout.

    Args:
        shap_values: A NumPy array of SHAP values whose last column is the base
            value (1-D for a single row, 2-D for several rows). Any other type
            is handed straight to ``iml.visualize``.
        features: Optional feature values (DataFrame, Series, array or list).
            A plain list, or a 1-D array when ``feature_names`` is not given,
            is interpreted as the feature names instead.
        feature_names: Optional names for the features.
        out_names: Optional output names; defaults to ``["output value"]``.
        data: Deprecated alias of ``features``.

    Returns:
        A single ``AdditiveExplanation`` when there is one row of SHAP values,
        otherwise a list with one ``AdditiveExplanation`` per row.
    """

    # backwards compatibility
    if data is not None:
        warnings.warn(
            "the 'data' parameter has been renamed to 'features' for consistency"
        )
        if features is None:
            features = data

    if type(shap_values) != np.ndarray:
        return iml.visualize(shap_values)

    # convert from a DataFrame or other types
    # (pandas types are matched by name so pandas need not be imported here)
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        # .values replaces as_matrix(), which was removed in pandas 1.0
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif isinstance(features, list):
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(features.shape) == 1 and feature_names is None:
        # a bare 1-D array without names is taken to be the names themselves
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]

    # the last column holds the base value, so there is one feature fewer
    if feature_names is None:
        feature_names = ["" for _ in range(shap_values.shape[1] - 1)]

    if shap_values.shape[0] == 1:
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        instance = Instance(np.zeros((1, len(feature_names))), features)
        e = AdditiveExplanation(
            shap_values[0, -1], np.sum(shap_values[0, :]), shap_values[0, :-1],
            None, instance, IdentityLink(), Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names)))
        return e

    else:
        exps = []
        for i in range(shap_values.shape[0]):
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[i, :]

            instance = Instance(np.ones((1, len(feature_names))),
                                display_features)
            e = AdditiveExplanation(
                shap_values[i, -1], np.sum(shap_values[i, :]),
                shap_values[i, :-1], None, instance, IdentityLink(),
                Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))),
                          list(feature_names)))
            exps.append(e)
        return exps
예제 #3
0
def force_plot(shap_values, features=None, feature_names=None, out_names=None, link="identity",
               plot_cmap="RdBu"):
    """ Visualize the given SHAP values with an additive force layout.

    Args:
        shap_values: A NumPy array of SHAP values whose last column is the base
            value (1-D for a single row, 2-D for several rows). A list is
            rejected; any other non-array type is handed to ``iml.visualize``.
        features: Optional feature values (DataFrame, Series, array or list).
            A plain list, or a 1-D array when ``feature_names`` is not given,
            is interpreted as the feature names instead.
        feature_names: Optional names; defaults to ``"Feature <i>"``.
        out_names: Optional output names; defaults to ``["output value"]``.
        link: Link specification passed to ``iml.links.convert_to_link``.
        plot_cmap: Colour map name wrapped in ``Plot_CMAP``.

    Returns:
        A single ``AdditiveExplanation`` when there is one row of SHAP values,
        otherwise a list with one ``AdditiveExplanation`` per row.

    Raises:
        AssertionError: If ``shap_values`` is a list (multi-output result).
    """

    # Validate before doing any other work. An explicit raise (rather than
    # `assert False`) keeps the check alive when Python runs with -O.
    if isinstance(shap_values, list):
        raise AssertionError("The shap_values arg looks looks multi output, try shap_values[i].")

    link = iml.links.convert_to_link(link)

    if type(shap_values) != np.ndarray:
        return iml.visualize(shap_values)

    # convert from a DataFrame or other types
    # (pandas types are matched by name so pandas need not be imported here)
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        # .values replaces as_matrix(), which was removed in pandas 1.0
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif isinstance(features, list):
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(features.shape) == 1 and feature_names is None:
        # a bare 1-D array without names is taken to be the names themselves
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1,len(shap_values)))

    if out_names is None:
        out_names = ["output value"]

    # the last column holds the base value, so there is one feature fewer
    if feature_names is None:
        feature_names = ["Feature "+str(j) for j in range(shap_values.shape[1]-1)]

    if shap_values.shape[0] == 1:
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        instance = Instance(np.zeros((1,len(feature_names))), features)
        e = AdditiveExplanation(
            shap_values[0,-1],
            np.sum(shap_values[0,:]),
            shap_values[0,:-1],
            None,
            instance,
            link,
            Model(None, out_names),
            DenseData(np.zeros((1,len(feature_names))), list(feature_names)),
            Plot_CMAP(plot_cmap)
        )
        return e

    else:
        exps = []
        for i in range(shap_values.shape[0]):
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[i,:]

            instance = Instance(np.ones((1,len(feature_names))), display_features)
            e = AdditiveExplanation(
                shap_values[i,-1],
                np.sum(shap_values[i,:]),
                shap_values[i,:-1],
                None,
                instance,
                link,
                Model(None, out_names),
                DenseData(np.ones((1,len(feature_names))), list(feature_names)),
                Plot_CMAP(plot_cmap)
            )
            exps.append(e)
        return exps
예제 #4
0
def force_plot(base_value,
               shap_values,
               features=None,
               feature_names=None,
               out_names=None,
               link="identity",
               plot_cmap="RdBu"):
    """ Visualize the given SHAP values with an additive force layout.

    Args:
        base_value: The reference value the SHAP values are added to. A
            length-1 array is unwrapped to its single element.
        shap_values: A NumPy array of SHAP values (1-D for a single row, 2-D
            for several rows), without a base-value column. A list is
            rejected; any other non-array type is handed to ``iml.visualize``.
        features: Optional feature values (DataFrame, Series, array or list).
            A plain list, or a 1-D array when ``feature_names`` is not given,
            is interpreted as the feature names instead.
        feature_names: Optional names; defaults are built from
            ``labels['FEATURE']``.
        out_names: Optional output names; defaults to ``["output value"]``.
        link: Link specification passed to ``iml.links.convert_to_link``.
        plot_cmap: Colour map passed on to ``iml.visualize``.

    Returns:
        Whatever ``iml.visualize`` returns for the explanation(s) built here.

    Raises:
        Exception: If ``base_value`` is multi-valued but ``shap_values`` is not
            a list of matching length, or if (single-row case) the number of
            features differs from the number of SHAP values.
        AssertionError: If ``shap_values`` is a list (multi-output result).
    """

    # auto unwrap the base_value
    if isinstance(base_value, np.ndarray) and len(base_value) == 1:
        base_value = base_value[0]

    # a multi-valued base value only makes sense together with a matching list
    # of per-output SHAP arrays; anything else suggests the pre-v0.20 call order
    if isinstance(base_value, (np.ndarray, list)):
        if not isinstance(shap_values, list) or len(shap_values) != len(base_value):
            raise Exception("In v0.20 force_plot now requires the base value as the first parameter! " \
                            "Try shap.force_plot(explainer.expected_value, shap_values) or " \
                            "for multi-output models try " \
                            "shap.force_plot(explainer.expected_value[0], shap_values[0]).")

    # Explicit raise instead of `assert` so the check survives `python -O`;
    # the exception type is unchanged for callers.
    if isinstance(shap_values, list):
        raise AssertionError("The shap_values arg looks looks multi output, try shap_values[i].")

    link = iml.links.convert_to_link(link)

    # exact-type dispatch is kept on purpose: only plain ndarrays are plotted
    # here, everything else is delegated
    if type(shap_values) != np.ndarray:
        return iml.visualize(shap_values)

    # convert from a DataFrame or other types
    # (pandas types are matched by name so pandas need not be imported here)
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif isinstance(features, list):
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(
            features.shape) == 1 and feature_names is None:
        # a bare 1-D array without names is taken to be the names themselves
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]

    if shap_values.shape[0] == 1:
        if feature_names is None:
            feature_names = [
                labels['FEATURE'] % str(i) for i in range(shap_values.shape[1])
            ]
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        # check that the shape of the shap_values and features match
        if len(features) != shap_values.shape[1]:
            msg = "Length of features is not equal to the length of shap_values!"
            if len(features) == shap_values.shape[1] - 1:
                msg += " You might be using an old format shap_values array with the base value " \
                       "as the last column. In this case just pass the array without the last column."
            raise Exception(msg)

        instance = Instance(np.zeros((1, len(feature_names))), features)
        e = AdditiveExplanation(
            base_value,
            np.sum(shap_values[0, :]) + base_value, shap_values[0, :], None,
            instance, link, Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names)))
        return iml.visualize(e, plot_cmap)

    else:
        if shap_values.shape[0] > 3000:
            warnings.warn(
                "shap.force_plot is slow many thousands of rows, try subsampling your data."
            )

        # the default names do not depend on the row, so build them once
        if feature_names is None:
            feature_names = [
                labels['FEATURE'] % str(j)
                for j in range(shap_values.shape[1])
            ]

        exps = []
        for i in range(shap_values.shape[0]):
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[i, :]

            instance = Instance(np.ones((1, len(feature_names))),
                                display_features)
            e = AdditiveExplanation(
                base_value,
                np.sum(shap_values[i, :]) + base_value, shap_values[i, :],
                None, instance, link, Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))),
                          list(feature_names)))
            exps.append(e)
        return iml.visualize(exps, plot_cmap=plot_cmap)
예제 #5
0
파일: plots.py 프로젝트: yinglarp/shap
def force_plot(shap_values,
               features=None,
               feature_names=None,
               out_names=None,
               link="identity",
               plot_cmap="RdBu"):
    """ Visualize the given SHAP values with an additive force layout.

    Args:
        shap_values: A NumPy array of SHAP values whose last column is the base
            value (1-D for a single row, 2-D for several rows). A list is
            rejected; any other non-array type is handed to ``iml.visualize``.
        features: Optional feature values (DataFrame, Series, array or list).
            A plain list, or a 1-D array when ``feature_names`` is not given,
            is interpreted as the feature names instead.
        feature_names: Optional names; defaults are built from
            ``labels['FEATURE']``.
        out_names: Optional output names; defaults to ``["output value"]``.
        link: Link specification passed to ``iml.links.convert_to_link``.
        plot_cmap: Colour map passed on to ``iml.visualize``.

    Returns:
        Whatever ``iml.visualize`` returns for the explanation(s) built here.

    Raises:
        AssertionError: If ``shap_values`` is a list (multi-output result).
    """

    # Explicit raise instead of `assert` so the check survives `python -O`;
    # the exception type is unchanged for callers.
    if isinstance(shap_values, list):
        raise AssertionError("The shap_values arg looks looks multi output, try shap_values[i].")

    link = iml.links.convert_to_link(link)

    if type(shap_values) != np.ndarray:
        return iml.visualize(shap_values)

    # convert from a DataFrame or other types
    # (pandas types are matched by name so pandas need not be imported here)
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif isinstance(features, list):
        # str(type(x)) is "<class 'list'>", never "list", so the previous
        # string comparison could not match and lists fell through to .shape
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(
            features.shape) == 1 and feature_names is None:
        # a bare 1-D array without names is taken to be the names themselves
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]

    if shap_values.shape[0] == 1:
        # the last column holds the base value, so there is one feature fewer
        if feature_names is None:
            feature_names = [
                labels['FEATURE'] % str(i)
                for i in range(shap_values.shape[1] - 1)
            ]
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        instance = Instance(np.zeros((1, len(feature_names))), features)
        e = AdditiveExplanation(
            shap_values[0, -1], np.sum(shap_values[0, :]), shap_values[0, :-1],
            None, instance, link, Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names)))
        return iml.visualize(e, plot_cmap)

    else:
        if shap_values.shape[0] > 3000:
            warnings.warn(
                "shap.force_plot is slow many thousands of rows, try subsampling your data."
            )

        # the default names do not depend on the row, so build them once
        if feature_names is None:
            feature_names = [
                labels['FEATURE'] % str(j)
                for j in range(shap_values.shape[1] - 1)
            ]

        exps = []
        for i in range(shap_values.shape[0]):
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[i, :]

            instance = Instance(np.ones((1, len(feature_names))),
                                display_features)
            e = AdditiveExplanation(
                shap_values[i, -1], np.sum(shap_values[i, :]),
                shap_values[i, :-1], None, instance, link,
                Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))),
                          list(feature_names)))
            exps.append(e)
        return iml.visualize(exps, plot_cmap=plot_cmap)
예제 #6
0
    def explain(self, incoming_instance, **kwargs):
        """Estimate SHAP values for one instance by per-feature sampling.

        Sampling happens in two rounds: the first spreads samples evenly over the
        varying features, the second allocates the remaining budget in proportion
        to the variance observed in round one. The estimates are then adjusted so
        that they sum to ``f(x) - self.fnull``.

        Args:
            incoming_instance: The input to explain; it is normalised through
                ``convert_to_instance``.
            **kwargs: Optional settings. ``nsamples`` is the total (even) sample
                budget (missing or 0 selects ``1000 * M``);
                ``min_samples_per_feature`` (default 100) bounds the size of the
                first round.

        Returns:
            An ``AdditiveExplanation`` built from ``self.fnull``, the model
            output, the attributions and a zero variance vector.

        Raises:
            AssertionError: If the data defines feature groups, or ``nsamples``
                is odd.
        """
        # convert incoming input to a standardized iml object
        instance = convert_to_instance(incoming_instance)
        match_instance_to_data(instance, self.data)

        # explicit raises (rather than `assert`) so the checks survive `python -O`
        if len(self.data.groups) != self.P:
            raise AssertionError("SamplingExplainer does not support feature groups!")

        # find the feature groups we will test. If a feature does not change from its
        # current value then we know it doesn't impact the model
        self.varyingInds = self.varying_groups(instance.x)
        #self.varyingFeatureGroups = [self.data.groups[i] for i in self.varyingInds]
        self.M = len(self.varyingInds)

        # find f(x)
        if self.keep_index:
            model_out = self.model.f(instance.convert_to_df())
        else:
            model_out = self.model.f(instance.x)
        if isinstance(model_out, (pd.DataFrame, pd.Series)):
            model_out = model_out.values[0]
        self.fx = model_out[0]

        if not self.vector_out:
            self.fx = np.array([self.fx])

        # if no features vary then no feature has an effect
        if self.M == 0:
            phi = np.zeros((len(self.data.groups), self.D))
            phi_var = np.zeros((len(self.data.groups), self.D))

        # if only one feature varies then it has all the effect
        elif self.M == 1:
            phi = np.zeros((len(self.data.groups), self.D))
            phi_var = np.zeros((len(self.data.groups), self.D))
            diff = self.fx - self.fnull
            for d in range(self.D):
                phi[self.varyingInds[0], d] = diff[d]

        # if more than one feature varies then we have to do real work
        else:

            # pick a reasonable number of samples if the user didn't specify how many they wanted
            self.nsamples = kwargs.get("nsamples", 0)
            if self.nsamples % 2 != 0:
                raise AssertionError("nsamples must be divisible by 2!")
            if self.nsamples == 0:
                self.nsamples = 1000 * self.M

            min_samples_per_feature = kwargs.get("min_samples_per_feature",
                                                 100)
            # cap round 1 at M * min_samples_per_feature; the surplus goes to round 2
            round1_samples = self.nsamples
            round2_samples = 0
            if round1_samples > self.M * min_samples_per_feature:
                round2_samples = round1_samples - self.M * min_samples_per_feature
                round1_samples -= round2_samples

            # divide up the samples among the features for round 1
            # (always in multiples of two; leftover pairs go to the first features)
            nsamples_each1 = np.ones(
                self.M, dtype=np.int64) * 2 * (round1_samples // (self.M * 2))
            for i in range((round1_samples % (self.M * 2)) // 2):
                nsamples_each1[i] += 2

            # explain every feature in round 1
            phi = np.zeros((self.P, self.D))
            phi_var = np.zeros((self.P, self.D))
            self.X_masked = np.zeros(
                (nsamples_each1.max(), self.data.data.shape[1]))
            for i, ind in enumerate(self.varyingInds):
                phi[ind, :], phi_var[ind, :] = self.sampling_estimate(
                    ind,
                    self.model.f,
                    instance.x,
                    self.data.data,
                    nsamples=nsamples_each1[i])

            # optimally allocate samples according to the variance
            # NOTE(review): if every round-1 variance is 0 this divides by zero
            # and the allocation below is built from NaNs - confirm whether that
            # case can occur and how it should be handled.
            phi_var /= phi_var.sum()
            # builtin int: the np.int alias was removed from NumPy in 1.24
            nsamples_each2 = (phi_var[self.varyingInds, :].mean(1) *
                              round2_samples).astype(int)
            # keep every allocation even, then nudge the total towards the budget
            for i in range(len(nsamples_each2)):
                if nsamples_each2[i] % 2 == 1: nsamples_each2[i] += 1
            for i in range(len(nsamples_each2)):
                if nsamples_each2.sum() > round2_samples:
                    nsamples_each2[i] -= 2
                elif nsamples_each2.sum() < round2_samples:
                    nsamples_each2[i] += 2
                else:
                    break

            self.X_masked = np.zeros(
                (nsamples_each2.max(), self.data.data.shape[1]))
            for i, ind in enumerate(self.varyingInds):
                if nsamples_each2[i] > 0:
                    val, var = self.sampling_estimate(
                        ind,
                        self.model.f,
                        instance.x,
                        self.data.data,
                        nsamples=nsamples_each2[i])

                    # merge both rounds as sample-count weighted averages
                    total_samples = nsamples_each1[i] + nsamples_each2[i]
                    phi[ind, :] = (phi[ind, :] * nsamples_each1[i] +
                                   val * nsamples_each2[i]) / total_samples
                    phi_var[ind, :] = (phi_var[ind, :] * nsamples_each1[i] +
                                       var * nsamples_each2[i]) / total_samples

            # convert from the variance of the differences to the variance of the mean (phi)
            for i, ind in enumerate(self.varyingInds):
                phi_var[ind, :] /= np.sqrt(nsamples_each1[i] +
                                           nsamples_each2[i])

            # correct the sum of the SHAP values to equal the output of the model using a linear
            # regression model with priors of the coefficients equal to the estimated variances for each
            # SHAP value (note that 1e6 is designed to increase the weight of the sample and so closely
            # match the correct sum)
            sum_error = self.fx - phi.sum(0) - self.fnull
            for i in range(self.D):
                # this is a ridge regression with one sample of all ones with sum_error[i] as the label
                # and 1/v as the ridge penalties. This simplified (and stable) form comes from the
                # Sherman-Morrison formula
                v = (phi_var[:, i] / phi_var[:, i].max()) * 1e6
                adj = sum_error[i] * (v - (v * v.sum()) / (1 + v.sum()))
                phi[:, i] += adj

        # single-output models get a flat vector of attributions
        if phi.shape[1] == 1:
            phi = phi[:, 0]

        return AdditiveExplanation(self.fnull, model_out, phi,
                                   np.zeros(len(phi)), instance, self.link,
                                   self.model, self.data)