def test_lime_explainer_with_data_stats(self):
    """Explainer fed precomputed training-data statistics should still
    surface petal width/length as the dominant iris features."""
    np.random.seed(1)
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(self.train, self.labels_train)
    i = np.random.randint(0, self.test.shape[0])

    # Derive the statistics from a quartile discretizer fitted on the
    # real training data.
    discretizer = QuartileDiscretizer(self.train, [], self.feature_names,
                                      self.target_names, random_state=20)

    # Per-column value/frequency tables over the discretized data.
    discretized_train = discretizer.discretize(self.train)
    feature_values = {}
    feature_frequencies = {}
    for feature in np.arange(self.train.shape[1]):
        counts = collections.Counter(discretized_train[:, feature])
        vals, freqs = map(list, zip(*(counts.items())))
        feature_values[feature] = vals
        feature_frequencies[feature] = freqs

    # Bin edges come back as arrays; the stats dict wants plain lists.
    bins_as_lists = {
        idx: edges.tolist()
        for idx, edges in enumerate(
            discretizer.bins(self.train, self.target_names))
    }

    # Assemble the training-data statistics dict.
    data_stats = {
        "means": discretizer.means,
        "stds": discretizer.stds,
        "maxs": discretizer.maxs,
        "mins": discretizer.mins,
        "bins": bins_as_lists,
        "feature_values": feature_values,
        "feature_frequencies": feature_frequencies,
    }

    # With stats supplied, the training data itself is irrelevant.
    data = np.zeros((2, len(self.feature_names)))
    explainer = LimeTabularExplainer(
        data,
        feature_names=self.feature_names,
        random_state=10,
        training_data_stats=data_stats,
        training_labels=self.target_names)
    exp = explainer.explain_instance(self.test[i],
                                     rf.predict_proba,
                                     num_features=2,
                                     model_regressor=LinearRegression())
    self.assertIsNotNone(exp)
    keys = [x[0] for x in exp.as_list()]
    self.assertEqual(1,
                     sum([1 if 'petal width' in x else 0 for x in keys]),
                     "Petal Width is a major feature")
    self.assertEqual(1,
                     sum([1 if 'petal length' in x else 0 for x in keys]),
                     "Petal Length is a major feature")
def __init__(self,
             training_data,
             mode="classification",
             training_labels=None,
             feature_names=None,
             categorical_features=None,
             categorical_names=None,
             kernel_width=None,
             kernel=None,
             verbose=False,
             class_names=None,
             feature_selection='auto',
             discretize_continuous=True,
             discretizer='quartile',
             sample_around_instance=False,
             random_state=None,
             training_data_stats=None):
    """Init function.

    Args:
        training_data: numpy 2d array
        mode: "classification" or "regression"
        training_labels: labels for training data. Not required, but may be
            used by discretizer.
        feature_names: list of names (strings) corresponding to the columns
            in the training data.
        categorical_features: list of indices (ints) corresponding to the
            categorical columns. Everything else will be considered
            continuous. Values in these columns MUST be integers.
        categorical_names: map from int to list of names, where
            categorical_names[x][y] represents the name of the yth value of
            column x.
        kernel_width: kernel width for the exponential kernel. If None,
            defaults to sqrt(number of columns) * 0.75.
        kernel: similarity kernel that takes euclidean distances and kernel
            width as input and outputs weights in (0,1). If None, defaults
            to an exponential kernel.
        verbose: if true, print local prediction values from linear model.
        class_names: list of class names, ordered according to whatever the
            classifier is using. If not present, class names will be
            '0', '1', ...
        feature_selection: feature selection method. Can be
            'forward_selection', 'lasso_path', 'none' or 'auto'. See
            function 'explain_instance_with_data' in lime_base.py for
            details on what each of the options does.
        discretize_continuous: if True, all non-categorical features will
            be discretized into quartiles.
        discretizer: only matters if discretize_continuous is True and data
            is not sparse. Options are 'quartile', 'decile', 'entropy' or a
            BaseDiscretizer instance.
        sample_around_instance: if True, will sample continuous features in
            perturbed samples from a normal centered at the instance being
            explained. Otherwise, the normal is centered on the mean of the
            feature data.
        random_state: an integer or numpy.RandomState that will be used to
            generate random numbers. If None, the random state will be
            initialized using the internal numpy seed.
        training_data_stats: a dict object holding training-data
            statistics. If None, training data information will be used;
            only matters if discretize_continuous is True. Must have the
            following keys: "means", "mins", "maxs", "stds",
            "feature_values", "feature_frequencies".
    """
    self.random_state = check_random_state(random_state)
    self.mode = mode
    self.categorical_names = categorical_names or {}
    self.sample_around_instance = sample_around_instance
    self.training_data_stats = training_data_stats

    # Fail fast with a clear error when a stats dict is supplied but
    # is missing required keys.
    if self.training_data_stats:
        self.validate_training_data_stats(self.training_data_stats)

    if categorical_features is None:
        categorical_features = []
    if feature_names is None:
        feature_names = [str(i) for i in range(training_data.shape[1])]

    self.categorical_features = list(categorical_features)
    self.feature_names = list(feature_names)

    self.discretizer = None
    if discretize_continuous and not sp.sparse.issparse(training_data):
        # When precomputed stats are provided, a StatsDiscretizer built
        # from them replaces whatever `discretizer` option was requested;
        # it is then picked up by the BaseDiscretizer branch below.
        if self.training_data_stats:
            discretizer = StatsDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels,
                data_stats=self.training_data_stats,
                random_state=self.random_state)
        if discretizer == 'quartile':
            self.discretizer = QuartileDiscretizer(
                    training_data, self.categorical_features,
                    self.feature_names, labels=training_labels,
                    random_state=self.random_state)
        elif discretizer == 'decile':
            self.discretizer = DecileDiscretizer(
                    training_data, self.categorical_features,
                    self.feature_names, labels=training_labels,
                    random_state=self.random_state)
        elif discretizer == 'entropy':
            self.discretizer = EntropyDiscretizer(
                    training_data, self.categorical_features,
                    self.feature_names, labels=training_labels,
                    random_state=self.random_state)
        elif isinstance(discretizer, BaseDiscretizer):
            self.discretizer = discretizer
        else:
            raise ValueError('''Discretizer must be 'quartile',''' +
                             ''' 'decile', 'entropy' or a''' +
                             ''' BaseDiscretizer instance''')
        # Once discretized, every column is treated as categorical.
        self.categorical_features = list(range(training_data.shape[1]))

        # Only bin the training data when stats were not supplied; with
        # stats, the value/frequency tables come from the dict below.
        if(self.training_data_stats is None):
            discretized_training_data = self.discretizer.discretize(
                training_data)

    if kernel_width is None:
        kernel_width = np.sqrt(training_data.shape[1]) * .75
    kernel_width = float(kernel_width)

    if kernel is None:
        def kernel(d, kernel_width):
            return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

    kernel_fn = partial(kernel, kernel_width=kernel_width)

    self.feature_selection = feature_selection
    self.base = lime_base.LimeBase(kernel_fn, verbose,
                                   random_state=self.random_state)
    self.class_names = class_names

    # The scaler's statistics play no real role when training data stats
    # are provided, but it is fitted anyway so sampling code can rely on
    # mean_/scale_ existing.
    self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    self.scaler.fit(training_data)
    self.feature_values = {}
    self.feature_frequencies = {}

    for feature in self.categorical_features:
        if training_data_stats is None:
            # Tally observed values on the (possibly discretized) data.
            if self.discretizer is not None:
                column = discretized_training_data[:, feature]
            else:
                column = training_data[:, feature]

            feature_count = collections.Counter(column)
            values, frequencies = map(list, zip(*(sorted(feature_count.items()))))
        else:
            values = training_data_stats["feature_values"][feature]
            frequencies = training_data_stats["feature_frequencies"][feature]

        self.feature_values[feature] = values
        self.feature_frequencies[feature] = (np.array(frequencies) /
                                             float(sum(frequencies)))
        # Categorical columns must not be rescaled during sampling.
        self.scaler.mean_[feature] = 0
        self.scaler.scale_[feature] = 1
def __init__(
        self,
        training_data,
        training_labels=None,
        feature_names=None,
        categorical_features=None,
        categorical_names=None,
        kernel_width=None,
        verbose=False,
        class_names=None,
        feature_selection='auto',
        discretize_continuous=True,
        proposal_method="random",  # random proposal vs. kde proposal
        discretizer='quartile'):
    """Init function.

    Args:
        training_data: numpy 2d array
        training_labels: labels for training data. Not required, but may be
            used by discretizer.
        feature_names: list of names (strings) corresponding to the columns
            in the training data.
        categorical_features: list of indices (ints) corresponding to the
            categorical columns. Everything else will be considered
            continuous. Values in these columns MUST be integers.
        categorical_names: map from int to list of names, where
            categorical_names[x][y] represents the name of the yth value of
            column x.
        kernel_width: kernel width for the exponential kernel. If None,
            defaults to sqrt(number of columns) * 0.75.
        verbose: if true, print local prediction values from linear model.
        class_names: list of class names, ordered according to whatever the
            classifier is using. If not present, class names will be
            '0', '1', ...
        feature_selection: feature selection method. Can be
            'forward_selection', 'lasso_path', 'none' or 'auto'. See
            function 'explain_instance_with_data' in lime_base.py for
            details on what each of the options does.
        discretize_continuous: if True, all non-categorical features will
            be discretized into quartiles.
        proposal_method: "random" (default) or "kde"; selects the proposal
            distribution for perturbed samples. NOTE(review): the KDE is
            fitted below unconditionally, even for "random" — presumably
            it is only consulted at sampling time; confirm at the call
            site whether the fit can be skipped for "random".
        discretizer: only matters if discretize_continuous is True.
            Options are 'quartile', 'decile' or 'entropy'.
    """
    # --- KDE proposal distribution ---
    self.proposal_method = proposal_method
    # Standardize the data before density estimation.
    X_train = training_data
    self.kde_scaler = StandardScaler()
    X_train = self.kde_scaler.fit_transform(X_train)
    # Learn a kernel density estimate; grid-search cross-validation
    # picks the bandwidth from a log-spaced candidate range.
    params = {'bandwidth': np.logspace(-1, 1, 20)}
    grid = GridSearchCV(KernelDensity(), params)
    grid.fit(X_train)
    # NOTE(review): debugging print left in; consider logging instead.
    print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
    # Keep the best estimator as the proposal density.
    self.kde = grid.best_estimator_
    # --- end KDE proposal distribution ---

    self.categorical_names = categorical_names
    self.categorical_features = categorical_features
    if self.categorical_names is None:
        self.categorical_names = {}
    if self.categorical_features is None:
        self.categorical_features = []

    self.discretizer = None
    if discretize_continuous:
        if discretizer == 'quartile':
            self.discretizer = QuartileDiscretizer(
                training_data, self.categorical_features,
                feature_names, labels=training_labels)
        elif discretizer == 'decile':
            self.discretizer = DecileDiscretizer(training_data,
                                                 self.categorical_features,
                                                 feature_names,
                                                 labels=training_labels)
        elif discretizer == 'entropy':
            self.discretizer = EntropyDiscretizer(
                training_data, self.categorical_features,
                feature_names, labels=training_labels)
        else:
            raise ValueError('''Discretizer must be 'quartile',''' +
                             ''' 'decile' or 'entropy' ''')
        # After discretization every column is categorical.
        self.categorical_features = range(
            training_data.shape[1])
        discretized_training_data = self.discretizer.discretize(
            training_data)

    if kernel_width is None:
        kernel_width = np.sqrt(training_data.shape[1]) * .75
    kernel_width = float(kernel_width)

    def kernel(d):
        # Exponential similarity kernel over euclidean distance.
        return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

    self.feature_selection = feature_selection
    self.base = lime_base.LimeBase(kernel, verbose)
    self.scaler = None
    self.class_names = class_names
    self.feature_names = feature_names
    self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    self.scaler.fit(training_data)
    self.feature_values = {}
    self.feature_frequencies = {}
    for feature in self.categorical_features:
        feature_count = collections.defaultdict(lambda: 0.0)
        column = training_data[:, feature]
        # Continuous columns were converted to categorical bins above;
        # count on the binned values instead.
        if self.discretizer is not None:
            column = discretized_training_data[:, feature]
            # NOTE(review): pre-seeding bins 0-3 with zero counts only
            # matches a quartile discretizer (4 bins); decile/entropy
            # yield different bin counts, so zero-frequency entries may
            # be wrong or missing for those — confirm intent.
            feature_count[0] = 0.
            feature_count[1] = 0.
            feature_count[2] = 0.
            feature_count[3] = 0.
        for value in column:
            feature_count[value] += 1
        values, frequencies = map(list, zip(*(feature_count.items())))
        self.feature_values[feature] = values
        self.feature_frequencies[feature] = (np.array(frequencies) /
                                             sum(frequencies))
        # Categorical columns must not be rescaled during sampling.
        self.scaler.mean_[feature] = 0
        self.scaler.scale_[feature] = 1
def __init__(self,
             training_data,
             mode="classification",
             training_labels=None,
             feature_names=None,
             categorical_features=None,
             categorical_names=None,
             kernel_width=None,
             verbose=False,
             class_names=None,
             feature_selection='auto',
             discretize_continuous=True,
             discretizer='quartile',
             random_state=None):
    """Init function.

    Args:
        training_data: numpy 2d array
        mode: "classification" or "regression"
        training_labels: labels for training data. Not required, but may be
            used by discretizer.
        feature_names: list of names (strings) corresponding to the columns
            in the training data.
        categorical_features: list of indices (ints) corresponding to the
            categorical columns. Everything else will be considered
            continuous. Values in these columns MUST be integers.
        categorical_names: map from int to list of names, where
            categorical_names[x][y] represents the name of the yth value of
            column x.
        kernel_width: kernel width for the exponential kernel. If None,
            defaults to sqrt(number of columns) * 0.75.
        verbose: if true, print local prediction values from linear model.
        class_names: list of class names, ordered according to whatever the
            classifier is using. If not present, class names will be
            '0', '1', ...
        feature_selection: feature selection method. Can be
            'forward_selection', 'lasso_path', 'none' or 'auto'. See
            function 'explain_instance_with_data' in lime_base.py for
            details on what each of the options does.
        discretize_continuous: if True, all non-categorical features will
            be discretized into quartiles.
        discretizer: only matters if discretize_continuous is True.
            Options are 'quartile', 'decile', 'entropy' or a
            BaseDiscretizer instance.
        random_state: an integer or numpy.RandomState that will be used to
            generate random numbers. If None, the random state will be
            initialized using the internal numpy seed.
    """
    self.random_state = check_random_state(random_state)
    self.mode = mode
    self.categorical_names = categorical_names or {}

    if categorical_features is None:
        categorical_features = []
    if feature_names is None:
        feature_names = [str(i) for i in range(training_data.shape[1])]
    self.categorical_features = list(categorical_features)
    self.feature_names = list(feature_names)

    self.discretizer = None
    if discretize_continuous:
        # Map the option string to its discretizer class; a ready-made
        # BaseDiscretizer instance is accepted as-is.
        named_discretizers = {'quartile': QuartileDiscretizer,
                              'decile': DecileDiscretizer,
                              'entropy': EntropyDiscretizer}
        if isinstance(discretizer, str) and discretizer in named_discretizers:
            self.discretizer = named_discretizers[discretizer](
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels)
        elif isinstance(discretizer, BaseDiscretizer):
            self.discretizer = discretizer
        else:
            raise ValueError('''Discretizer must be 'quartile',''' +
                             ''' 'decile', 'entropy' or a''' +
                             ''' BaseDiscretizer instance''')
        # Once discretized, every column is treated as categorical.
        self.categorical_features = list(range(training_data.shape[1]))
        discretized_training_data = self.discretizer.discretize(
            training_data)

    if kernel_width is None:
        kernel_width = np.sqrt(training_data.shape[1]) * .75
    kernel_width = float(kernel_width)

    def kernel(d):
        # Exponential similarity kernel over euclidean distance.
        return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

    self.feature_selection = feature_selection
    self.base = lime_base.LimeBase(kernel, verbose,
                                   random_state=self.random_state)
    self.class_names = class_names

    # Fit a scaler (no mean-centering) used when perturbing samples.
    self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    self.scaler.fit(training_data)

    self.feature_values = {}
    self.feature_frequencies = {}
    for feature in self.categorical_features:
        # Count observed values, on the binned data when discretizing.
        source = (discretized_training_data
                  if self.discretizer is not None else training_data)
        counts = collections.Counter(source[:, feature])
        values, frequencies = map(list, zip(*(counts.items())))
        self.feature_values[feature] = values
        self.feature_frequencies[feature] = (np.array(frequencies) /
                                             float(sum(frequencies)))
        # Categorical columns must not be rescaled during sampling.
        self.scaler.mean_[feature] = 0
        self.scaler.scale_[feature] = 1
def test_lime_tabular_explainer_not_equal_random_state(self):
    """Explanations must differ whenever the discretizer and/or explainer
    random seeds differ, and must be reproduced exactly when both match.

    The original body repeated the same four-stanza pattern twelve times
    (three discretizer classes x four seed combinations); it is collapsed
    here into one pipeline helper driven by a seed table.
    """
    X, y = make_classification(n_samples=1000,
                               n_features=20,
                               n_informative=2,
                               n_redundant=2,
                               random_state=10)
    rf = RandomForestClassifier(n_estimators=500, random_state=10)
    rf.fit(X, y)
    instance = np.random.RandomState(10).randint(0, X.shape[0])
    feature_names = ["feature" + str(i) for i in range(20)]

    def explanation_map(discretizer_cls, discretizer_rs, explainer_rs):
        # One full run: fresh discretizer -> explainer -> explanation map.
        discretizer = discretizer_cls(X, [], feature_names, y,
                                      random_state=discretizer_rs)
        explainer = LimeTabularExplainer(X,
                                         feature_names=feature_names,
                                         discretize_continuous=True,
                                         discretizer=discretizer,
                                         random_state=explainer_rs)
        exp = explainer.explain_instance(X[instance],
                                         rf.predict_proba,
                                         num_samples=500)
        return exp.as_map()

    for discretizer_cls in (QuartileDiscretizer,
                            DecileDiscretizer,
                            EntropyDiscretizer):
        # [1]-[3]: any seed mismatch (discretizer, explainer, or both)
        # must change the explanation.
        for seeds_1, seeds_2 in (((20, 10), (10, 10)),
                                 ((20, 20), (10, 10)),
                                 ((20, 20), (20, 10))):
            self.assertTrue(
                explanation_map(discretizer_cls, *seeds_1) !=
                explanation_map(discretizer_cls, *seeds_2))
        # [4]: identical seeds must reproduce the explanation exactly.
        self.assertFalse(
            explanation_map(discretizer_cls, 20, 20) !=
            explanation_map(discretizer_cls, 20, 20))
def __init__(self,
             training_data,
             training_labels=None,
             feature_names=None,
             categorical_features=None,
             categorical_names=None,
             kernel_width=None,
             verbose=False,
             class_names=None,
             feature_selection='auto',
             discretize_continuous=True,
             discretizer='quartile'):
    """Init function.

    Args:
        training_data: numpy 2d array
        training_labels: labels for training data. Not required, but may be
            used by discretizer.
        feature_names: list of names (strings) corresponding to the columns
            in the training data.
        categorical_features: list of indices (ints) corresponding to the
            categorical columns. Everything else will be considered
            continuous. Values in these columns MUST be integers.
        categorical_names: map from int to list of names, where
            categorical_names[x][y] represents the name of the yth value of
            column x.
        kernel_width: kernel width for the exponential kernel. If None,
            defaults to sqrt(number of columns) * 0.75.
        verbose: if true, print local prediction values from linear model.
        class_names: list of class names, ordered according to whatever the
            classifier is using. If not present, class names will be
            '0', '1', ...
        feature_selection: feature selection method. Can be
            'forward_selection', 'lasso_path', 'none' or 'auto'. See
            function 'explain_instance_with_data' in lime_base.py for
            details on what each of the options does.
        discretize_continuous: if True, all non-categorical features will
            be discretized into quartiles.
        discretizer: only matters if discretize_continuous is True.
            Options are 'quartile', 'decile' or 'entropy'.

    Raises:
        ValueError: if `discretizer` is not one of the supported options.
    """
    self.categorical_names = categorical_names
    self.categorical_features = categorical_features
    if self.categorical_names is None:
        self.categorical_names = {}
    if self.categorical_features is None:
        self.categorical_features = []

    self.discretizer = None
    if discretize_continuous:
        if discretizer == 'quartile':
            self.discretizer = QuartileDiscretizer(
                training_data, self.categorical_features,
                feature_names, labels=training_labels)
        elif discretizer == 'decile':
            self.discretizer = DecileDiscretizer(
                training_data, self.categorical_features,
                feature_names, labels=training_labels)
        elif discretizer == 'entropy':
            self.discretizer = EntropyDiscretizer(
                training_data, self.categorical_features,
                feature_names, labels=training_labels)
        else:
            # BUG FIX: the original `raise ('''...''')` raised a plain
            # string, which is a TypeError in Python 3 ("exceptions must
            # derive from BaseException") rather than the intended error.
            raise ValueError('''Discretizer must be 'quartile', 'decile' ''' +
                             '''or 'entropy' ''')
        # After discretization every column is categorical.
        self.categorical_features = range(training_data.shape[1])
        discretized_training_data = self.discretizer.discretize(
            training_data)

    if kernel_width is None:
        kernel_width = np.sqrt(training_data.shape[1]) * .75
    kernel_width = float(kernel_width)

    def kernel(d):
        # Exponential similarity kernel over euclidean distance.
        return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

    self.feature_selection = feature_selection
    self.base = lime_base.LimeBase(kernel, verbose)
    self.class_names = class_names
    self.feature_names = feature_names

    # Fit a scaler (no mean-centering) used when perturbing samples.
    self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    self.scaler.fit(training_data)

    self.feature_values = {}
    self.feature_frequencies = {}
    for feature in self.categorical_features:
        feature_count = collections.defaultdict(lambda: 0.0)
        column = training_data[:, feature]
        # Continuous columns were converted to categorical bins above;
        # count on the binned values instead.
        if self.discretizer is not None:
            column = discretized_training_data[:, feature]
            # NOTE(review): pre-seeding bins 0-3 with zero counts only
            # matches a quartile discretizer (4 bins) — confirm intent
            # for decile/entropy, which produce other bin counts.
            feature_count[0] = 0.
            feature_count[1] = 0.
            feature_count[2] = 0.
            feature_count[3] = 0.
        for value in column:
            feature_count[value] += 1
        values, frequencies = map(list, zip(*(feature_count.items())))
        self.feature_values[feature] = values
        self.feature_frequencies[feature] = (np.array(frequencies) /
                                             sum(frequencies))
        # Categorical columns must not be rescaled during sampling.
        self.scaler.mean_[feature] = 0
        self.scaler.scale_[feature] = 1
def test_lime_explainer_with_data_stats(self):
    """An explainer driven purely by precomputed data statistics should
    behave like one fitted on the data itself: petal width and petal
    length dominate the iris explanation."""
    np.random.seed(1)
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(self.train, self.labels_train)
    i = np.random.randint(0, self.test.shape[0])

    # Fit a quartile discretizer to harvest the statistics from.
    quartiles = QuartileDiscretizer(self.train, [], self.feature_names,
                                    self.target_names, random_state=20)
    binned = quartiles.discretize(self.train)

    # Collect per-column value and frequency tables.
    feature_values = {}
    feature_frequencies = {}
    for feature in np.arange(self.train.shape[1]):
        tally = collections.Counter(binned[:, feature])
        observed, counts = map(list, zip(*(tally.items())))
        feature_values[feature] = observed
        feature_frequencies[feature] = counts

    # Bin edges are numpy arrays; convert each to a plain list.
    bin_lists = {}
    for index, edges in enumerate(quartiles.bins(self.train,
                                                 self.target_names)):
        bin_lists[index] = edges.tolist()

    # The complete statistics payload expected by the explainer.
    data_stats = {}
    data_stats["means"] = quartiles.means
    data_stats["stds"] = quartiles.stds
    data_stats["maxs"] = quartiles.maxs
    data_stats["mins"] = quartiles.mins
    data_stats["bins"] = bin_lists
    data_stats["feature_values"] = feature_values
    data_stats["feature_frequencies"] = feature_frequencies

    # Dummy training data: the stats dict carries all the information.
    placeholder = np.zeros((2, len(self.feature_names)))
    explainer = LimeTabularExplainer(placeholder,
                                     feature_names=self.feature_names,
                                     random_state=10,
                                     training_data_stats=data_stats,
                                     training_labels=self.target_names)
    exp = explainer.explain_instance(self.test[i],
                                     rf.predict_proba,
                                     num_features=2,
                                     model_regressor=LinearRegression())
    self.assertIsNotNone(exp)
    keys = [x[0] for x in exp.as_list()]
    self.assertEqual(1,
                     sum([1 if 'petal width' in x else 0 for x in keys]),
                     "Petal Width is a major feature")
    self.assertEqual(1,
                     sum([1 if 'petal length' in x else 0 for x in keys]),
                     "Petal Length is a major feature")