Example #1
    def test_lime_explainer_with_data_stats(self):
        np.random.seed(1)

        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        i = np.random.randint(0, self.test.shape[0])

        # Generate stats using a quartile discretizer
        discretizer = QuartileDiscretizer(self.train, [], self.feature_names,
                                          self.target_names, random_state=20)

        d_means = discretizer.means
        d_stds = discretizer.stds
        d_mins = discretizer.mins
        d_maxs = discretizer.maxs
        d_bins = discretizer.bins(self.train, self.target_names)

        # Compute feature values and frequencies of all columns
        cat_features = np.arange(self.train.shape[1])
        discretized_training_data = discretizer.discretize(self.train)

        feature_values = {}
        feature_frequencies = {}
        for feature in cat_features:
            column = discretized_training_data[:, feature]
            feature_count = collections.Counter(column)
            values, frequencies = map(list, zip(*(feature_count.items())))
            feature_values[feature] = values
            feature_frequencies[feature] = frequencies

        # Convert each bins array to a list
        d_bins_revised = {index: bins.tolist()
                          for index, bins in enumerate(d_bins)}

        # Discretized stats
        data_stats = {
            "means": d_means,
            "stds": d_stds,
            "maxs": d_maxs,
            "mins": d_mins,
            "bins": d_bins_revised,
            "feature_values": feature_values,
            "feature_frequencies": feature_frequencies,
        }

        # With stats supplied, the training data only needs the right shape
        data = np.zeros((2, len(self.feature_names)))
        explainer = LimeTabularExplainer(
            data, feature_names=self.feature_names, random_state=10,
            training_data_stats=data_stats, training_labels=self.target_names)

        exp = explainer.explain_instance(self.test[i],
                                         rf.predict_proba,
                                         num_features=2,
                                         model_regressor=LinearRegression())

        self.assertIsNotNone(exp)
        keys = [x[0] for x in exp.as_list()]
        self.assertEqual(1,
                         sum([1 if 'petal width' in x else 0 for x in keys]),
                         "Petal Width is a major feature")
        self.assertEqual(1,
                         sum([1 if 'petal length' in x else 0 for x in keys]),
                         "Petal Length is a major feature")
Example #2
    def __init__(self,
                 training_data,
                 mode="classification",
                 training_labels=None,
                 feature_names=None,
                 categorical_features=None,
                 categorical_names=None,
                 kernel_width=None,
                 kernel=None,
                 verbose=False,
                 class_names=None,
                 feature_selection='auto',
                 discretize_continuous=True,
                 discretizer='quartile',
                 sample_around_instance=False,
                 random_state=None,
                 training_data_stats=None):
        """Init function.

        Args:
            training_data: numpy 2d array
            mode: "classification" or "regression"
            training_labels: labels for training data. Not required, but may be
                used by discretizer.
            feature_names: list of names (strings) corresponding to the columns
                in the training data.
            categorical_features: list of indices (ints) corresponding to the
                categorical columns. Everything else will be considered
                continuous. Values in these columns MUST be integers.
            categorical_names: map from int to list of names, where
                categorical_names[x][y] represents the name of the yth value of
                column x.
            kernel_width: kernel width for the exponential kernel.
                If None, defaults to sqrt(number of columns) * 0.75
            kernel: similarity kernel that takes euclidean distances and kernel
                width as input and outputs weights in (0,1). If None, defaults to
                an exponential kernel.
            verbose: if true, print local prediction values from linear model
            class_names: list of class names, ordered according to whatever the
                classifier is using. If not present, class names will be '0',
                '1', ...
            feature_selection: feature selection method. can be
                'forward_selection', 'lasso_path', 'none' or 'auto'.
                See function 'explain_instance_with_data' in lime_base.py for
                details on what each of the options does.
            discretize_continuous: if True, all non-categorical features will
                be discretized into quartiles.
            discretizer: only matters if discretize_continuous is True
                and data is not sparse. Options are 'quartile', 'decile',
                'entropy' or a BaseDiscretizer instance.
            sample_around_instance: if True, will sample continuous features
                in perturbed samples from a normal centered at the instance
                being explained. Otherwise, the normal is centered on the mean
                of the feature data.
            random_state: an integer or numpy.RandomState that will be used to
                generate random numbers. If None, the random state will be
                initialized using the internal numpy seed.
            training_data_stats: a dict containing statistics of the training
                data. If None, the statistics will be computed from the
                training data. Only matters if discretize_continuous is True.
                Must have the following keys:
                "means", "mins", "maxs", "stds", "feature_values",
                "feature_frequencies"
        """
        self.random_state = check_random_state(random_state)
        self.mode = mode
        self.categorical_names = categorical_names or {}
        self.sample_around_instance = sample_around_instance
        self.training_data_stats = training_data_stats

        # Validate user-supplied training data stats and raise a proper
        # error if they are malformed
        if self.training_data_stats:
            self.validate_training_data_stats(self.training_data_stats)

        if categorical_features is None:
            categorical_features = []
        if feature_names is None:
            feature_names = [str(i) for i in range(training_data.shape[1])]

        self.categorical_features = list(categorical_features)
        self.feature_names = list(feature_names)

        self.discretizer = None
        if discretize_continuous and not sp.sparse.issparse(training_data):
            # Set the discretizer if training data stats are provided; the
            # StatsDiscretizer instance is picked up by the
            # isinstance(discretizer, BaseDiscretizer) branch below
            if self.training_data_stats:
                discretizer = StatsDiscretizer(
                    training_data,
                    self.categorical_features,
                    self.feature_names,
                    labels=training_labels,
                    data_stats=self.training_data_stats,
                    random_state=self.random_state)

            if discretizer == 'quartile':
                self.discretizer = QuartileDiscretizer(
                    training_data,
                    self.categorical_features,
                    self.feature_names,
                    labels=training_labels,
                    random_state=self.random_state)
            elif discretizer == 'decile':
                self.discretizer = DecileDiscretizer(
                    training_data,
                    self.categorical_features,
                    self.feature_names,
                    labels=training_labels,
                    random_state=self.random_state)
            elif discretizer == 'entropy':
                self.discretizer = EntropyDiscretizer(
                    training_data,
                    self.categorical_features,
                    self.feature_names,
                    labels=training_labels,
                    random_state=self.random_state)
            elif isinstance(discretizer, BaseDiscretizer):
                self.discretizer = discretizer
            else:
                raise ValueError('''Discretizer must be 'quartile',''' +
                                 ''' 'decile', 'entropy' or a''' +
                                 ''' BaseDiscretizer instance''')
            self.categorical_features = list(range(training_data.shape[1]))

            # Discretize the training data only when stats are not provided
            if self.training_data_stats is None:
                discretized_training_data = self.discretizer.discretize(
                    training_data)

        if kernel_width is None:
            kernel_width = np.sqrt(training_data.shape[1]) * .75
        kernel_width = float(kernel_width)

        if kernel is None:

            def kernel(d, kernel_width):
                return np.sqrt(np.exp(-(d**2) / kernel_width**2))

        kernel_fn = partial(kernel, kernel_width=kernel_width)

        self.feature_selection = feature_selection
        self.base = lime_base.LimeBase(kernel_fn,
                                       verbose,
                                       random_state=self.random_state)
        self.class_names = class_names

        # The scaler is fit regardless, though it plays no role if training
        # data stats are provided
        self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        self.scaler.fit(training_data)
        self.feature_values = {}
        self.feature_frequencies = {}

        for feature in self.categorical_features:
            if training_data_stats is None:
                if self.discretizer is not None:
                    column = discretized_training_data[:, feature]
                else:
                    column = training_data[:, feature]

                feature_count = collections.Counter(column)
                values, frequencies = map(
                    list, zip(*(sorted(feature_count.items()))))
            else:
                values = training_data_stats["feature_values"][feature]
                frequencies = training_data_stats["feature_frequencies"][
                    feature]

            self.feature_values[feature] = values
            self.feature_frequencies[feature] = (np.array(frequencies) /
                                                 float(sum(frequencies)))
            self.scaler.mean_[feature] = 0
            self.scaler.scale_[feature] = 1
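As the docstring above notes, kernel may be any callable that takes euclidean distances and a kernel width and returns weights in (0, 1). A minimal sketch of supplying a custom kernel, assuming training_data is a numpy 2d array (the Gaussian form here is illustrative, not the library default):

import numpy as np

# Illustrative custom kernel: a plain Gaussian instead of the default
# sqrt-exponential kernel defined in __init__ above.
def gaussian_kernel(d, kernel_width):
    return np.exp(-(d ** 2) / (2 * kernel_width ** 2))

explainer = LimeTabularExplainer(training_data,
                                 kernel=gaussian_kernel,
                                 kernel_width=3.0)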
Example #3
    def __init__(
            self,
            training_data,
            training_labels=None,
            feature_names=None,
            categorical_features=None,
            categorical_names=None,
            kernel_width=None,
            verbose=False,
            class_names=None,
            feature_selection='auto',
            discretize_continuous=True,
            proposal_method="random",  # random proposal vs. kde proposal
            discretizer='quartile'):
        """Init function.

        Args:
            training_data: numpy 2d array
            training_labels: labels for training data. Not required, but may be
                used by discretizer.
            feature_names: list of names (strings) corresponding to the columns
                in the training data.
            categorical_features: list of indices (ints) corresponding to the
                categorical columns. Everything else will be considered
                continuous. Values in these columns MUST be integers.
            categorical_names: map from int to list of names, where
                categorical_names[x][y] represents the name of the yth value of
                column x.
            kernel_width: kernel width for the exponential kernel.
                If None, defaults to sqrt(number of columns) * 0.75
            verbose: if true, print local prediction values from linear model
            class_names: list of class names, ordered according to whatever the
                classifier is using. If not present, class names will be '0',
                '1', ...
            feature_selection: feature selection method. can be
                'forward_selection', 'lasso_path', 'none' or 'auto'.
                See function 'explain_instance_with_data' in lime_base.py for
                details on what each of the options does.
            discretize_continuous: if True, all non-categorical features will
                be discretized into quartiles.
            discretizer: only matters if discretize_continuous is True. Options
                are 'quartile', 'decile' or 'entropy'
        """
        #### jiaxuan's addition: a KDE proposal distribution ####
        self.proposal_method = proposal_method
        # standardize data
        X_train = training_data
        self.kde_scaler = StandardScaler()
        X_train = self.kde_scaler.fit_transform(X_train)

        # learn a kernel density estimator
        # use grid search cross-validation to optimize the bandwidth
        params = {'bandwidth': np.logspace(-1, 1, 20)}
        grid = GridSearchCV(KernelDensity(), params)
        grid.fit(X_train)

        print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

        # use the best estimator to compute the kernel density estimate
        self.kde = grid.best_estimator_
        #### end jiaxuan's addition ########

        self.categorical_names = categorical_names
        self.categorical_features = categorical_features
        if self.categorical_names is None:
            self.categorical_names = {}
        if self.categorical_features is None:
            self.categorical_features = []
        self.discretizer = None
        if discretize_continuous:
            if discretizer == 'quartile':
                self.discretizer = QuartileDiscretizer(
                    training_data,
                    self.categorical_features,
                    feature_names,
                    labels=training_labels)
            elif discretizer == 'decile':
                self.discretizer = DecileDiscretizer(training_data,
                                                     self.categorical_features,
                                                     feature_names,
                                                     labels=training_labels)
            elif discretizer == 'entropy':
                self.discretizer = EntropyDiscretizer(
                    training_data,
                    self.categorical_features,
                    feature_names,
                    labels=training_labels)
            else:
                raise ValueError('''Discretizer must be 'quartile',''' +
                                 ''' 'decile' or 'entropy' ''')
            self.categorical_features = list(
                range(training_data.shape[1]))  # so all categorical by the end!
            discretized_training_data = self.discretizer.discretize(
                training_data)

        if kernel_width is None:
            kernel_width = np.sqrt(training_data.shape[1]) * .75
        kernel_width = float(kernel_width)

        def kernel(d):
            return np.sqrt(np.exp(-(d**2) / kernel_width**2))

        self.feature_selection = feature_selection
        self.base = lime_base.LimeBase(kernel, verbose)
        self.class_names = class_names
        self.feature_names = feature_names
        self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        self.scaler.fit(training_data)
        self.feature_values = {}
        self.feature_frequencies = {}

        for feature in self.categorical_features:
            feature_count = collections.defaultdict(lambda: 0.0)
            column = training_data[:, feature]
            # for continuous features converted to categorical by the discretizer
            if self.discretizer is not None:
                column = discretized_training_data[:, feature]
                # pre-seed the four quartile bins so zero-count bins still
                # appear; note this only fits the quartile discretizer
                feature_count[0] = 0.
                feature_count[1] = 0.
                feature_count[2] = 0.
                feature_count[3] = 0.
            for value in column:
                feature_count[value] += 1
            values, frequencies = map(list, zip(*(feature_count.items())))
            self.feature_values[feature] = values
            self.feature_frequencies[feature] = (np.array(frequencies) /
                                                 sum(frequencies))
            self.scaler.mean_[feature] = 0  # not scaled for categorical data
            self.scaler.scale_[feature] = 1
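The snippet fits the KDE but does not show the proposal step itself; assuming perturbed samples are drawn in the standardized space and mapped back, that step might look like the sketch below (KernelDensity.sample supports the gaussian kernel used by default):

        # Sketch of the KDE proposal step (not shown above): draw perturbed
        # samples in standardized space, then undo the standardization.
        samples_std = self.kde.sample(n_samples=500, random_state=0)
        samples = self.kde_scaler.inverse_transform(samples_std)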
Example #4
    def __init__(self,
                 training_data,
                 mode="classification",
                 training_labels=None,
                 feature_names=None,
                 categorical_features=None,
                 categorical_names=None,
                 kernel_width=None,
                 verbose=False,
                 class_names=None,
                 feature_selection='auto',
                 discretize_continuous=True,
                 discretizer='quartile',
                 random_state=None):
        """Init function.

        Args:
            training_data: numpy 2d array
            mode: "classification" or "regression"
            training_labels: labels for training data. Not required, but may be
                used by discretizer.
            feature_names: list of names (strings) corresponding to the columns
                in the training data.
            categorical_features: list of indices (ints) corresponding to the
                categorical columns. Everything else will be considered
                continuous. Values in these columns MUST be integers.
            categorical_names: map from int to list of names, where
                categorical_names[x][y] represents the name of the yth value of
                column x.
            kernel_width: kernel width for the exponential kernel.
                If None, defaults to sqrt(number of columns) * 0.75
            verbose: if true, print local prediction values from linear model
            class_names: list of class names, ordered according to whatever the
                classifier is using. If not present, class names will be '0',
                '1', ...
            feature_selection: feature selection method. can be
                'forward_selection', 'lasso_path', 'none' or 'auto'.
                See function 'explain_instance_with_data' in lime_base.py for
                details on what each of the options does.
            discretize_continuous: if True, all non-categorical features will
                be discretized into quartiles.
            discretizer: only matters if discretize_continuous is True. Options
                are 'quartile', 'decile', 'entropy' or a BaseDiscretizer
                instance.
            random_state: an integer or numpy.RandomState that will be used to
                generate random numbers. If None, the random state will be
                initialized using the internal numpy seed.
        """
        self.random_state = check_random_state(random_state)
        self.mode = mode
        self.categorical_names = categorical_names or {}

        if categorical_features is None:
            categorical_features = []
        if feature_names is None:
            feature_names = [str(i) for i in range(training_data.shape[1])]

        self.categorical_features = list(categorical_features)
        self.feature_names = list(feature_names)

        self.discretizer = None
        if discretize_continuous:
            if discretizer == 'quartile':
                self.discretizer = QuartileDiscretizer(
                        training_data, self.categorical_features,
                        self.feature_names, labels=training_labels)
            elif discretizer == 'decile':
                self.discretizer = DecileDiscretizer(
                        training_data, self.categorical_features,
                        self.feature_names, labels=training_labels)
            elif discretizer == 'entropy':
                self.discretizer = EntropyDiscretizer(
                        training_data, self.categorical_features,
                        self.feature_names, labels=training_labels)
            elif isinstance(discretizer, BaseDiscretizer):
                self.discretizer = discretizer
            else:
                raise ValueError('''Discretizer must be 'quartile',''' +
                                 ''' 'decile', 'entropy' or a''' +
                                 ''' BaseDiscretizer instance''')
            self.categorical_features = list(range(training_data.shape[1]))
            discretized_training_data = self.discretizer.discretize(
                    training_data)

        if kernel_width is None:
            kernel_width = np.sqrt(training_data.shape[1]) * .75
        kernel_width = float(kernel_width)

        def kernel(d):
            return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

        self.feature_selection = feature_selection
        self.base = lime_base.LimeBase(kernel, verbose, random_state=self.random_state)
        self.class_names = class_names
        self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        self.scaler.fit(training_data)
        self.feature_values = {}
        self.feature_frequencies = {}

        for feature in self.categorical_features:
            if self.discretizer is not None:
                column = discretized_training_data[:, feature]
            else:
                column = training_data[:, feature]

            feature_count = collections.Counter(column)
            values, frequencies = map(list, zip(*(feature_count.items())))

            self.feature_values[feature] = values
            self.feature_frequencies[feature] = (np.array(frequencies) /
                                                 float(sum(frequencies)))
            self.scaler.mean_[feature] = 0
            self.scaler.scale_[feature] = 1
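Typical usage of this constructor, sketched from the tests elsewhere in this collection; X, feature_names, class_names, and model below are placeholders:

# Illustrative usage; assumes X is a numpy 2d array of training data and
# model exposes predict_proba, as in the tests above.
explainer = LimeTabularExplainer(X,
                                 feature_names=feature_names,
                                 class_names=class_names,
                                 discretize_continuous=True,
                                 random_state=10)
exp = explainer.explain_instance(X[0], model.predict_proba, num_features=5)
print(exp.as_list())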
Example #5
    def test_lime_tabular_explainer_not_equal_random_state(self):
        X, y = make_classification(n_samples=1000,
                                   n_features=20,
                                   n_informative=2,
                                   n_redundant=2,
                                   random_state=10)

        rf = RandomForestClassifier(n_estimators=500, random_state=10)
        rf.fit(X, y)
        instance = np.random.RandomState(10).randint(0, X.shape[0])
        feature_names = ["feature" + str(i) for i in range(20)]

        # ----------------------------------------------------------------------
        # -------------------------Quartile Discretizer-------------------------
        # ----------------------------------------------------------------------

        # ---------------------------------[1]----------------------------------
        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[2]----------------------------------
        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[3]----------------------------------
        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[4]----------------------------------
        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertFalse(exp_1.as_map() != exp_2.as_map())

        # ----------------------------------------------------------------------
        # --------------------------Decile Discretizer--------------------------
        # ----------------------------------------------------------------------

        # ---------------------------------[1]----------------------------------
        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[2]----------------------------------
        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[3]----------------------------------
        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[4]----------------------------------
        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertFalse(exp_1.as_map() != exp_2.as_map())

        # ----------------------------------------------------------------------
        # --------------------------Entropy Discretizer-------------------------
        # ----------------------------------------------------------------------

        # ---------------------------------[1]----------------------------------
        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[2]----------------------------------
        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[3]----------------------------------
        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[4]----------------------------------
        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertFalse(exp_1.as_map() != exp_2.as_map())
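The twelve blocks above differ only in the discretizer class and the two seed pairs; a sketch of the same checks in parameterized form (an illustrative refactor, not the original test):

        # Illustrative refactor: explanations should match exactly only when
        # both the discretizer and the explainer random states match.
        seed_cases = [((20, 10), (10, 10)),   # [1] discretizer seeds differ
                      ((20, 10), (20, 10)),   # [2] discretizer seeds differ
                      ((20, 20), (20, 10)),   # [3] explainer seeds differ
                      ((20, 20), (20, 20))]   # [4] all seeds identical
        for Discretizer in (QuartileDiscretizer, DecileDiscretizer,
                            EntropyDiscretizer):
            for disc_seeds, expl_seeds in seed_cases:
                exps = []
                for d_seed, e_seed in zip(disc_seeds, expl_seeds):
                    disc = Discretizer(X, [], feature_names, y,
                                       random_state=d_seed)
                    explainer = LimeTabularExplainer(
                        X, feature_names=feature_names,
                        discretize_continuous=True, discretizer=disc,
                        random_state=e_seed)
                    exps.append(explainer.explain_instance(
                        X[instance], rf.predict_proba, num_samples=500))
                same_seeds = (disc_seeds[0] == disc_seeds[1] and
                              expl_seeds[0] == expl_seeds[1])
                self.assertEqual(same_seeds,
                                 exps[0].as_map() == exps[1].as_map())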
Example #6
    def __init__(self, training_data, training_labels=None, feature_names=None,
                 categorical_features=None, categorical_names=None,
                 kernel_width=None, verbose=False, class_names=None,
                 feature_selection='auto', discretize_continuous=True,
                 discretizer='quartile'):
        """Init function.

        Args:
            training_data: numpy 2d array
            training_labels: labels for training data. Not required, but may be
                used by discretizer.
            feature_names: list of names (strings) corresponding to the columns
                in the training data.
            categorical_features: list of indices (ints) corresponding to the
                categorical columns. Everything else will be considered
                continuous. Values in these columns MUST be integers.
            categorical_names: map from int to list of names, where
                categorical_names[x][y] represents the name of the yth value of
                column x.
            kernel_width: kernel width for the exponential kernel.
                If None, defaults to sqrt(number of columns) * 0.75
            verbose: if true, print local prediction values from linear model
            class_names: list of class names, ordered according to whatever the
                classifier is using. If not present, class names will be '0',
                '1', ...
            feature_selection: feature selection method. can be
                'forward_selection', 'lasso_path', 'none' or 'auto'.
                See function 'explain_instance_with_data' in lime_base.py for
                details on what each of the options does.
            discretize_continuous: if True, all non-categorical features will
                be discretized into quartiles.
            discretizer: only matters if discretize_continuous is True. Options
                are 'quartile', 'decile' or 'entropy'
        """
        self.categorical_names = categorical_names
        self.categorical_features = categorical_features
        if self.categorical_names is None:
            self.categorical_names = {}
        if self.categorical_features is None:
            self.categorical_features = []
        self.discretizer = None
        if discretize_continuous:
            if discretizer == 'quartile':
                self.discretizer = QuartileDiscretizer(
                    training_data, self.categorical_features, feature_names,
                    labels=training_labels)
            elif discretizer == 'decile':
                self.discretizer = DecileDiscretizer(
                    training_data, self.categorical_features, feature_names,
                    labels=training_labels)
            elif discretizer == 'entropy':
                self.discretizer = EntropyDiscretizer(
                    training_data, self.categorical_features, feature_names,
                    labels=training_labels)
            else:
                raise ValueError('''Discretizer must be 'quartile',''' +
                                 ''' 'decile' or 'entropy' ''')
            self.categorical_features = range(training_data.shape[1])
            discretized_training_data = self.discretizer.discretize(
                training_data)

        if kernel_width is None:
            kernel_width = np.sqrt(training_data.shape[1]) * .75
        kernel_width = float(kernel_width)

        def kernel(d):
            return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

        self.feature_selection = feature_selection
        self.base = lime_base.LimeBase(kernel, verbose)
        self.scaler = None
        self.class_names = class_names
        self.feature_names = feature_names
        self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        self.scaler.fit(training_data)
        self.feature_values = {}
        self.feature_frequencies = {}

        for feature in self.categorical_features:
            feature_count = collections.defaultdict(lambda: 0.0)
            column = training_data[:, feature]
            if self.discretizer is not None:
                column = discretized_training_data[:, feature]
                feature_count[0] = 0.
                feature_count[1] = 0.
                feature_count[2] = 0.
                feature_count[3] = 0.
            for value in column:
                feature_count[value] += 1
            values, frequencies = map(list, zip(*(feature_count.items())))
            self.feature_values[feature] = values
            self.feature_frequencies[feature] = (np.array(frequencies) /
                                                 sum(frequencies))
            self.scaler.mean_[feature] = 0
            self.scaler.scale_[feature] = 1