def test_feature_names_2(self):
    self.maxDiff = None
    discretizer = DecileDiscretizer(self.x, [], self.feature_names,
                                    self.y, random_state=10)
    self.assertDictEqual(
        {
            0: ['sepal length (cm) <= 4.80',
                '4.80 < sepal length (cm) <= 5.00',
                '5.00 < sepal length (cm) <= 5.27',
                '5.27 < sepal length (cm) <= 5.60',
                '5.60 < sepal length (cm) <= 5.80',
                '5.80 < sepal length (cm) <= 6.10',
                '6.10 < sepal length (cm) <= 6.30',
                '6.30 < sepal length (cm) <= 6.52',
                '6.52 < sepal length (cm) <= 6.90',
                'sepal length (cm) > 6.90'],
            1: ['sepal width (cm) <= 2.50',
                '2.50 < sepal width (cm) <= 2.70',
                '2.70 < sepal width (cm) <= 2.80',
                '2.80 < sepal width (cm) <= 3.00',
                '3.00 < sepal width (cm) <= 3.10',
                '3.10 < sepal width (cm) <= 3.20',
                '3.20 < sepal width (cm) <= 3.40',
                '3.40 < sepal width (cm) <= 3.61',
                'sepal width (cm) > 3.61'],
            2: ['petal length (cm) <= 1.40',
                '1.40 < petal length (cm) <= 1.50',
                '1.50 < petal length (cm) <= 1.70',
                '1.70 < petal length (cm) <= 3.90',
                '3.90 < petal length (cm) <= 4.35',
                '4.35 < petal length (cm) <= 4.64',
                '4.64 < petal length (cm) <= 5.00',
                '5.00 < petal length (cm) <= 5.32',
                '5.32 < petal length (cm) <= 5.80',
                'petal length (cm) > 5.80'],
            3: ['petal width (cm) <= 0.20',
                '0.20 < petal width (cm) <= 0.40',
                '0.40 < petal width (cm) <= 1.16',
                '1.16 < petal width (cm) <= 1.30',
                '1.30 < petal width (cm) <= 1.50',
                '1.50 < petal width (cm) <= 1.80',
                '1.80 < petal width (cm) <= 1.90',
                '1.90 < petal width (cm) <= 2.20',
                'petal width (cm) > 2.20']
        },
        discretizer.names)
def __init__(self,
             training_data,
             mode="classification",
             training_labels=None,
             feature_names=None,
             categorical_features=None,
             categorical_names=None,
             kernel_width=None,
             verbose=False,
             class_names=None,
             feature_selection='auto',
             discretize_continuous=True,
             discretizer='quartile'):
    """Init function.

    Args:
        training_data: numpy 2d array
        mode: "classification" or "regression"
        training_labels: labels for training data. Not required, but may be
            used by the discretizer.
        feature_names: list of names (strings) corresponding to the columns
            in the training data.
        categorical_features: list of indices (ints) corresponding to the
            categorical columns. Everything else will be considered
            continuous. Values in these columns MUST be integers.
        categorical_names: map from int to list of names, where
            categorical_names[x][y] represents the name of the yth value of
            column x.
        kernel_width: kernel width for the exponential kernel. If None,
            defaults to sqrt(number of columns) * 0.75
        verbose: if true, print local prediction values from the linear
            model
        class_names: list of class names, ordered according to whatever the
            classifier is using. If not present, class names will be '0',
            '1', ...
        feature_selection: feature selection method. Can be
            'forward_selection', 'lasso_path', 'none' or 'auto'. See
            function 'explain_instance_with_data' in lime_base.py for
            details on what each of the options does.
        discretize_continuous: if True, all non-categorical features will
            be discretized into quartiles.
        discretizer: only matters if discretize_continuous is True. Options
            are 'quartile', 'decile' or 'entropy'
    """
    self.mode = mode
    # Resolve the optional arguments before using them.
    if feature_names is None:
        feature_names = [str(i) for i in range(training_data.shape[1])]
    self.feature_names = list(feature_names)
    self.categorical_names = categorical_names if categorical_names is not None else {}
    self.categorical_features = categorical_features if categorical_features is not None else []

    self.discretizer = None
    if discretize_continuous:
        if discretizer == 'quartile':
            self.discretizer = QuartileDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels)
        elif discretizer == 'decile':
            self.discretizer = DecileDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels)
        elif discretizer == 'entropy':
            self.discretizer = EntropyDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels)
        else:
            raise ValueError("Discretizer must be 'quartile', "
                             "'decile' or 'entropy'")
        # Once discretized, every column is treated as categorical.
        self.categorical_features = range(training_data.shape[1])
        discretized_training_data = self.discretizer.discretize(
            training_data)

    if kernel_width is None:
        kernel_width = np.sqrt(training_data.shape[1]) * .75
    kernel_width = float(kernel_width)

    def kernel(d):
        return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

    self.feature_selection = feature_selection
    self.base = lime_base.LimeBase(kernel, verbose)
    self.class_names = class_names
    self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    self.scaler.fit(training_data)
    self.feature_values = {}
    self.feature_frequencies = {}
    for feature in self.categorical_features:
        feature_count = collections.defaultdict(lambda: 0.0)
        column = training_data[:, feature]
        if self.discretizer is not None:
            column = discretized_training_data[:, feature]
            # Pre-seed the four quartile bins so that empty bins still
            # appear with zero frequency.
            feature_count[0] = 0.
            feature_count[1] = 0.
            feature_count[2] = 0.
            feature_count[3] = 0.
        for value in column:
            feature_count[value] += 1
        values, frequencies = map(list, zip(*(feature_count.items())))
        self.feature_values[feature] = values
        self.feature_frequencies[feature] = (np.array(frequencies) /
                                             sum(frequencies))
        # Categorical columns are left unscaled.
        self.scaler.mean_[feature] = 0
        self.scaler.scale_[feature] = 1
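
# A minimal usage sketch for the constructor above; a hypothetical driver,
# not part of the original source. It assumes the upstream lime package
# layout (lime.lime_tabular) and scikit-learn's iris data; the forest and
# its settings are illustrative.
import sklearn.datasets
import sklearn.ensemble
from lime.lime_tabular import LimeTabularExplainer

iris = sklearn.datasets.load_iris()
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(iris.data, iris.target)

explainer = LimeTabularExplainer(iris.data,
                                 feature_names=iris.feature_names,
                                 class_names=iris.target_names,
                                 discretize_continuous=True,
                                 discretizer='decile')
# Explain one instance using the fitted forest's probability output.
exp = explainer.explain_instance(iris.data[0], rf.predict_proba)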
def __init__(self,
             training_data,
             mode="classification",
             training_labels=None,
             feature_names=None,
             categorical_features=None,
             categorical_names=None,
             kernel_width=None,
             kernel=None,
             verbose=False,
             class_names=None,
             feature_selection='auto',
             discretize_continuous=True,
             discretizer='quartile',
             sample_around_instance=False,
             random_state=None,
             training_data_stats=None,
             generator="Perturb",
             generator_specs=None,
             dummies=None,
             integer_attributes=[]):
    """Init function.

    Args:
        training_data: numpy 2d array
        mode: "classification" or "regression"
        training_labels: labels for training data. Not required, but may be
            used by the discretizer.
        feature_names: list of names (strings) corresponding to the columns
            in the training data.
        categorical_features: list of indices (ints) corresponding to the
            categorical columns. Everything else will be considered
            continuous. Values in these columns MUST be integers.
        categorical_names: map from int to list of names, where
            categorical_names[x][y] represents the name of the yth value of
            column x.
        kernel_width: kernel width for the exponential kernel. If None,
            defaults to sqrt(number of columns) * 0.75
        kernel: similarity kernel that takes euclidean distances and kernel
            width as input and outputs weights in (0,1). If None, defaults
            to an exponential kernel.
        verbose: if true, print local prediction values from the linear
            model
        class_names: list of class names, ordered according to whatever the
            classifier is using. If not present, class names will be '0',
            '1', ...
        feature_selection: feature selection method. Can be
            'forward_selection', 'lasso_path', 'none' or 'auto'. See
            function 'explain_instance_with_data' in lime_base.py for
            details on what each of the options does.
        discretize_continuous: if True, all non-categorical features will
            be discretized into quartiles.
        discretizer: only matters if discretize_continuous is True and data
            is not sparse. Options are 'quartile', 'decile', 'entropy' or a
            BaseDiscretizer instance.
        sample_around_instance: if True, will sample continuous features in
            perturbed samples from a normal centered at the instance being
            explained. Otherwise, the normal is centered on the mean of the
            feature data.
        random_state: an integer or numpy.RandomState that will be used to
            generate random numbers. If None, the random state will be
            initialized using the internal numpy seed.
        training_data_stats: a dict object having the details of training
            data statistics. If None, training data information will be
            used; only matters if discretize_continuous is True. Must have
            the following keys: "means", "mins", "maxs", "stds",
            "feature_values", "feature_frequencies"
        generator: "Perturb", "VAE", "DropoutVAE", "RBF" or "Forest".
            Determines which data generator will be used to generate new
            samples.
        generator_specs: only matters if generator is not "Perturb".
            Dictionary with the values required by the chosen generator.
        dummies: list of lists of categorical feature indices, each inner
            list holding the dummy-variable columns that encode the same
            categorical feature.
        integer_attributes: list of indices of integer attributes
    """
    self.random_state = check_random_state(random_state)
    self.mode = mode
    self.categorical_names = categorical_names or {}
    self.sample_around_instance = sample_around_instance
    self.training_data_stats = training_data_stats

    # Check and raise a proper error if stats are supplied on the
    # non-discretized path.
    if self.training_data_stats:
        self.validate_training_data_stats(self.training_data_stats)

    if categorical_features is None:
        categorical_features = []
    if feature_names is None:
        feature_names = [str(i) for i in range(training_data.shape[1])]

    self.categorical_features = list(categorical_features)
    self.feature_names = list(feature_names)

    self.discretizer = None
    if discretize_continuous and not sp.sparse.issparse(training_data):
        # If training data stats are provided, use a StatsDiscretizer; the
        # isinstance branch below then picks it up.
        if self.training_data_stats:
            discretizer = StatsDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels,
                data_stats=self.training_data_stats,
                random_state=self.random_state)

        if discretizer == 'quartile':
            self.discretizer = QuartileDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels,
                random_state=self.random_state)
        elif discretizer == 'decile':
            self.discretizer = DecileDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels,
                random_state=self.random_state)
        elif discretizer == 'entropy':
            self.discretizer = EntropyDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels,
                random_state=self.random_state)
        elif isinstance(discretizer, BaseDiscretizer):
            self.discretizer = discretizer
        else:
            raise ValueError("Discretizer must be 'quartile', 'decile', "
                             "'entropy' or a BaseDiscretizer instance")
        self.categorical_features = list(range(training_data.shape[1]))

        # Get the discretized training data when the stats are not provided.
        if self.training_data_stats is None:
            discretized_training_data = self.discretizer.discretize(
                training_data)

    if kernel_width is None:
        kernel_width = np.sqrt(training_data.shape[1]) * .75
    kernel_width = float(kernel_width)

    if kernel is None:
        def kernel(d, kernel_width):
            return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

    kernel_fn = partial(kernel, kernel_width=kernel_width)

    self.feature_selection = feature_selection
    self.base = lime_base.LimeBase(kernel_fn, verbose,
                                   random_state=self.random_state)
    self.class_names = class_names

    # Create the generator. RBF and Forest are not yet implemented in
    # Python, so for those we only record which generator is to be used.
    if generator == "VAE":
        self.generator = VAE(
            original_dim=generator_specs["original_dim"],
            input_shape=(generator_specs["original_dim"],),
            intermediate_dim=generator_specs["intermediate_dim"],
            latent_dim=generator_specs["latent_dim"])
    elif generator == "DropoutVAE":
        self.generator = DropoutVAE(
            original_dim=generator_specs["original_dim"],
            input_shape=(generator_specs["original_dim"],),
            intermediate_dim=generator_specs["intermediate_dim"],
            dropout=generator_specs["dropout"],
            latent_dim=generator_specs["latent_dim"])
    elif generator in ["RBF", "Forest"]:
        self.generator = generator
        self.generator_specs = generator_specs
    else:
        self.generator = None

    # Add the integer attributes.
    self.integer_attributes = integer_attributes

    # The VAE generators are trained on min-max scaled data.
    if generator in ["VAE", "DropoutVAE"]:
        self.generator_scaler = sklearn.preprocessing.MinMaxScaler()
        self.generator_scaler.fit(training_data)

    self.dummies = dummies

    # Though set, the scaler has no role to play if training data stats
    # are provided.
    self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    self.scaler.fit(training_data)
    self.feature_values = {}
    self.feature_frequencies = {}

    for feature in self.categorical_features:
        if training_data_stats is None:
            if self.discretizer is not None:
                column = discretized_training_data[:, feature]
            else:
                column = training_data[:, feature]

            feature_count = collections.Counter(column)
            values, frequencies = map(list,
                                      zip(*(sorted(feature_count.items()))))
        else:
            values = training_data_stats["feature_values"][feature]
            frequencies = training_data_stats["feature_frequencies"][feature]

        self.feature_values[feature] = values
        self.feature_frequencies[feature] = (np.array(frequencies) /
                                             float(sum(frequencies)))
        self.scaler.mean_[feature] = 0
        self.scaler.scale_[feature] = 1

    # Generator training.
    if isinstance(self.generator, (VAE, DropoutVAE)):
        scaled_data = self.generator_scaler.transform(training_data)
        self.generator.fit_unsplit(scaled_data,
                                   epochs=generator_specs["epochs"])
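
# Hedged usage sketch for the generator-aware constructor above. The
# generator_specs keys ("original_dim", "intermediate_dim", "latent_dim",
# "dropout", "epochs") are exactly the ones the constructor reads; the
# synthetic data and network dimensions are illustrative assumptions, not
# values from the original source.
import numpy as np

X = np.random.RandomState(0).normal(size=(500, 12))
explainer = LimeTabularExplainer(
    X,
    mode="classification",
    discretize_continuous=True,
    generator="DropoutVAE",
    generator_specs={"original_dim": X.shape[1],
                     "intermediate_dim": 8,
                     "latent_dim": 4,
                     "dropout": 0.2,
                     "epochs": 50})
# Note: the DropoutVAE is trained inside __init__ (via fit_unsplit), so
# construction cost grows with the "epochs" setting.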
def test_lime_tabular_explainer_not_equal_random_state(self):
    X, y = make_classification(n_samples=1000,
                               n_features=20,
                               n_informative=2,
                               n_redundant=2,
                               random_state=10)
    rf = RandomForestClassifier(n_estimators=500, random_state=10)
    rf.fit(X, y)
    instance = np.random.RandomState(10).randint(0, X.shape[0])
    feature_names = ["feature" + str(i) for i in range(20)]

    # ----------------------------------------------------------------------
    # -------------------------Quartile Discretizer-------------------------
    # ----------------------------------------------------------------------
    # ---------------------------------[1]----------------------------------
    discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                      random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                      random_state=10)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertTrue(exp_1.as_map() != exp_2.as_map())

    # ---------------------------------[2]----------------------------------
    discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                      random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                      random_state=10)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertTrue(exp_1.as_map() != exp_2.as_map())

    # ---------------------------------[3]----------------------------------
    discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                      random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                      random_state=20)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertTrue(exp_1.as_map() != exp_2.as_map())

    # ---------------------------------[4]----------------------------------
    discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                      random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                      random_state=20)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertFalse(exp_1.as_map() != exp_2.as_map())

    # ----------------------------------------------------------------------
    # --------------------------Decile Discretizer--------------------------
    # ----------------------------------------------------------------------
    # ---------------------------------[1]----------------------------------
    discretizer = DecileDiscretizer(X, [], feature_names, y,
                                    random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = DecileDiscretizer(X, [], feature_names, y,
                                    random_state=10)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertTrue(exp_1.as_map() != exp_2.as_map())

    # ---------------------------------[2]----------------------------------
    discretizer = DecileDiscretizer(X, [], feature_names, y,
                                    random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = DecileDiscretizer(X, [], feature_names, y,
                                    random_state=10)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertTrue(exp_1.as_map() != exp_2.as_map())

    # ---------------------------------[3]----------------------------------
    discretizer = DecileDiscretizer(X, [], feature_names, y,
                                    random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = DecileDiscretizer(X, [], feature_names, y,
                                    random_state=20)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertTrue(exp_1.as_map() != exp_2.as_map())

    # ---------------------------------[4]----------------------------------
    discretizer = DecileDiscretizer(X, [], feature_names, y,
                                    random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = DecileDiscretizer(X, [], feature_names, y,
                                    random_state=20)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertFalse(exp_1.as_map() != exp_2.as_map())

    # ----------------------------------------------------------------------
    # --------------------------Entropy Discretizer-------------------------
    # ----------------------------------------------------------------------
    # ---------------------------------[1]----------------------------------
    discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                     random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                     random_state=10)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertTrue(exp_1.as_map() != exp_2.as_map())

    # ---------------------------------[2]----------------------------------
    discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                     random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                     random_state=10)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertTrue(exp_1.as_map() != exp_2.as_map())

    # ---------------------------------[3]----------------------------------
    discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                     random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                     random_state=20)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=10)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertTrue(exp_1.as_map() != exp_2.as_map())

    # ---------------------------------[4]----------------------------------
    discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                     random_state=20)
    explainer_1 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)

    discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                     random_state=20)
    explainer_2 = LimeTabularExplainer(X, feature_names=feature_names,
                                       discretize_continuous=True,
                                       discretizer=discretizer,
                                       random_state=20)
    exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                         num_samples=500)
    self.assertFalse(exp_1.as_map() != exp_2.as_map())
def __init__(self,
             training_data,
             mode="classification",
             training_labels=None,
             feature_names=None,
             categorical_features=None,
             categorical_names=None,
             kernel_width=None,
             kernel=None,
             verbose=False,
             class_names=None,
             feature_selection='auto',
             discretize_continuous=True,
             discretizer='quartile',
             sample_around_instance=False,
             random_state=None):
    """Init function.

    Args:
        training_data: numpy 2d array
        mode: "classification" or "regression"
        training_labels: labels for training data. Not required, but may be
            used by the discretizer.
        feature_names: list of names (strings) corresponding to the columns
            in the training data.
        categorical_features: list of indices (ints) corresponding to the
            categorical columns. Everything else will be considered
            continuous. Values in these columns MUST be integers.
        categorical_names: map from int to list of names, where
            categorical_names[x][y] represents the name of the yth value of
            column x.
        kernel_width: kernel width for the exponential kernel. If None,
            defaults to sqrt(number of columns) * 0.75
        kernel: similarity kernel that takes euclidean distances and kernel
            width as input and outputs weights in (0,1). If None, defaults
            to an exponential kernel.
        verbose: if true, print local prediction values from the linear
            model
        class_names: list of class names, ordered according to whatever the
            classifier is using. If not present, class names will be '0',
            '1', ...
        feature_selection: feature selection method. Can be
            'forward_selection', 'lasso_path', 'none' or 'auto'. See
            function 'explain_instance_with_data' in lime_base.py for
            details on what each of the options does.
        discretize_continuous: if True, all non-categorical features will
            be discretized into quartiles.
        discretizer: only matters if discretize_continuous is True. Options
            are 'quartile', 'decile', 'entropy' or a BaseDiscretizer
            instance.
        sample_around_instance: if True, will sample continuous features in
            perturbed samples from a normal centered at the instance being
            explained. Otherwise, the normal is centered on the mean of the
            feature data.
        random_state: an integer or numpy.RandomState that will be used to
            generate random numbers. If None, the random state will be
            initialized using the internal numpy seed.
    """
    self.random_state = check_random_state(random_state)
    self.mode = mode
    self.categorical_names = categorical_names or {}
    self.sample_around_instance = sample_around_instance

    if categorical_features is None:
        categorical_features = []
    if feature_names is None:
        feature_names = [str(i) for i in range(training_data.shape[1])]

    self.categorical_features = list(categorical_features)
    self.feature_names = list(feature_names)

    self.discretizer = None
    if discretize_continuous:
        if discretizer == 'quartile':
            self.discretizer = QuartileDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels)
        elif discretizer == 'decile':
            self.discretizer = DecileDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels)
        elif discretizer == 'entropy':
            self.discretizer = EntropyDiscretizer(
                training_data, self.categorical_features,
                self.feature_names, labels=training_labels)
        elif isinstance(discretizer, BaseDiscretizer):
            self.discretizer = discretizer
        else:
            raise ValueError("Discretizer must be 'quartile', 'decile', "
                             "'entropy' or a BaseDiscretizer instance")
        # After discretization every column is treated as categorical.
        self.categorical_features = list(range(training_data.shape[1]))
        discretized_training_data = self.discretizer.discretize(
            training_data)

    if kernel_width is None:
        kernel_width = np.sqrt(training_data.shape[1]) * .75
    kernel_width = float(kernel_width)

    if kernel is None:
        def kernel(d, kernel_width):
            return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

    kernel_fn = partial(kernel, kernel_width=kernel_width)

    self.feature_selection = feature_selection
    self.base = lime_base.LimeBase(kernel_fn, verbose,
                                   random_state=self.random_state)
    self.class_names = class_names
    self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    self.scaler.fit(training_data)
    self.feature_values = {}
    self.feature_frequencies = {}

    for feature in self.categorical_features:
        if self.discretizer is not None:
            column = discretized_training_data[:, feature]
        else:
            column = training_data[:, feature]

        feature_count = collections.Counter(column)
        values, frequencies = map(list, zip(*(feature_count.items())))

        self.feature_values[feature] = values
        self.feature_frequencies[feature] = (np.array(frequencies) /
                                             float(sum(frequencies)))
        # Categorical columns are left unscaled.
        self.scaler.mean_[feature] = 0
        self.scaler.scale_[feature] = 1
def __init__(self,
             training_data,
             training_labels=None,
             feature_names=None,
             categorical_features=None,
             categorical_names=None,
             kernel_width=None,
             verbose=False,
             class_names=None,
             feature_selection='auto',
             discretize_continuous=True,
             proposal_method="random",  # random proposal vs. kde proposal
             discretizer='quartile'):
    """Init function.

    Args:
        training_data: numpy 2d array
        training_labels: labels for training data. Not required, but may be
            used by the discretizer.
        feature_names: list of names (strings) corresponding to the columns
            in the training data.
        categorical_features: list of indices (ints) corresponding to the
            categorical columns. Everything else will be considered
            continuous. Values in these columns MUST be integers.
        categorical_names: map from int to list of names, where
            categorical_names[x][y] represents the name of the yth value of
            column x.
        kernel_width: kernel width for the exponential kernel. If None,
            defaults to sqrt(number of columns) * 0.75
        verbose: if true, print local prediction values from the linear
            model
        class_names: list of class names, ordered according to whatever the
            classifier is using. If not present, class names will be '0',
            '1', ...
        feature_selection: feature selection method. Can be
            'forward_selection', 'lasso_path', 'none' or 'auto'. See
            function 'explain_instance_with_data' in lime_base.py for
            details on what each of the options does.
        discretize_continuous: if True, all non-categorical features will
            be discretized into quartiles.
        proposal_method: "random" or "kde". With "kde", perturbed samples
            are proposed from a kernel density estimate fitted to the
            standardized training data.
        discretizer: only matters if discretize_continuous is True. Options
            are 'quartile', 'decile' or 'entropy'
    """
    # #### jiaxuan's addition for kde proposing distribution ####
    self.proposal_method = proposal_method

    # Standardize the data.
    X_train = training_data
    self.kde_scaler = StandardScaler()
    X_train = self.kde_scaler.fit_transform(X_train)

    # Learn a KDE model; use grid-search cross-validation to optimize the
    # bandwidth.
    params = {'bandwidth': np.logspace(-1, 1, 20)}
    grid = GridSearchCV(KernelDensity(), params)
    grid.fit(X_train)
    print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

    # Use the best estimator to compute the kernel density estimate.
    self.kde = grid.best_estimator_
    # #### end jiaxuan's addition ####

    self.categorical_names = categorical_names
    self.categorical_features = categorical_features
    if self.categorical_names is None:
        self.categorical_names = {}
    if self.categorical_features is None:
        self.categorical_features = []

    self.discretizer = None
    if discretize_continuous:
        if discretizer == 'quartile':
            self.discretizer = QuartileDiscretizer(
                training_data, self.categorical_features,
                feature_names, labels=training_labels)
        elif discretizer == 'decile':
            self.discretizer = DecileDiscretizer(
                training_data, self.categorical_features,
                feature_names, labels=training_labels)
        elif discretizer == 'entropy':
            self.discretizer = EntropyDiscretizer(
                training_data, self.categorical_features,
                feature_names, labels=training_labels)
        else:
            raise ValueError("Discretizer must be 'quartile', "
                             "'decile' or 'entropy'")
        self.categorical_features = range(
            training_data.shape[1])  # so all categorical by the end!
        discretized_training_data = self.discretizer.discretize(
            training_data)

    if kernel_width is None:
        kernel_width = np.sqrt(training_data.shape[1]) * .75
    kernel_width = float(kernel_width)

    def kernel(d):
        return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

    self.feature_selection = feature_selection
    self.base = lime_base.LimeBase(kernel, verbose)
    self.class_names = class_names
    self.feature_names = feature_names
    self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    self.scaler.fit(training_data)
    self.feature_values = {}
    self.feature_frequencies = {}

    for feature in self.categorical_features:
        feature_count = collections.defaultdict(lambda: 0.0)
        column = training_data[:, feature]
        # this is for continuously converted categorical data
        if self.discretizer is not None:
            column = discretized_training_data[:, feature]
            feature_count[0] = 0.  # only handles quantile? or useless?
            feature_count[1] = 0.
            feature_count[2] = 0.
            feature_count[3] = 0.
        for value in column:
            feature_count[value] += 1
        values, frequencies = map(list, zip(*(feature_count.items())))
        self.feature_values[feature] = values
        self.feature_frequencies[feature] = (np.array(frequencies) /
                                             sum(frequencies))
        self.scaler.mean_[feature] = 0  # not scaled for categorical data
        self.scaler.scale_[feature] = 1
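
# Usage sketch for the KDE-proposal variant above; the data and names are
# hypothetical. Note that the constructor fits a KernelDensity model
# (grid-searching the bandwidth) at construction time regardless of
# proposal_method, so building this explainer is noticeably slower than
# the plain version; the flag presumably selects which proposal
# distribution is used later when sampling.
import numpy as np

X = np.random.RandomState(0).normal(size=(300, 8))
feature_names = ["f" + str(i) for i in range(X.shape[1])]
explainer = LimeTabularExplainer(X,
                                 feature_names=feature_names,
                                 proposal_method="kde",
                                 discretizer='quartile')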