Пример #1
0
    def test_partial_dependence_multiclass(self):
        """Partial dependence on Iris must expose the formatted feature column.

        The same check is run for two multiclass estimators: a gradient
        boosting classifier and an SVC with probability estimates enabled.
        """
        # Iris data classes: ['setosa', 'versicolor', 'virginica']
        iris = datasets.load_iris()
        first_feature = iris.feature_names[0]

        def pdp_columns(estimator):
            # Fit on the whole Iris set and return the column labels of the
            # 1D partial dependence frame for the first feature.
            estimator.fit(iris.data, iris.target)
            predict_fn = InMemoryModel(estimator.predict_proba, examples=iris.data)
            interp = Interpretation()
            interp.load_data(iris.data, iris.feature_names)
            frame = interp.partial_dependence.partial_dependence(
                [first_feature], predict_fn, grid_resolution=25, sample=True)
            return frame.columns.values

        # 1. Using GB Classifier
        gb_columns = pdp_columns(GradientBoostingClassifier(n_estimators=10, random_state=1))
        expected_feature_name = PartialDependence.feature_column_name_formatter('sepal length (cm)')
        self.assertIn(expected_feature_name,
                      gb_columns,
                      "{0} not in columns {1}".format(expected_feature_name, gb_columns))

        # 2. Using SVC
        from sklearn import svm
        # With SVC, predict_proba is supported only if probability flag is enabled, by default it is false
        svc_columns = pdp_columns(svm.SVC(probability=True))
        self.assertIn(expected_feature_name,
                      svc_columns,
                      "{} not in columns {}".format(expected_feature_name, svc_columns))
Пример #2
0
    def test_partial_dependence_binary_classification(self):
        """Row count of a 1D PDP frame must follow the feature's unique values or the user grid.

        Uses ``self.sample_x`` / ``self.sample_y`` / ``self.sample_feature_name``
        prepared by the test fixture.
        """
        # In the default implementation of pdp on sklearn, there is an approx. done
        # if the number of unique values for a feature space < grid_resolution specified.
        # For now, we have decided to not have that approximation. In V2, we will be benchmarking for
        # performance as well. Around that time we will revisit the same.
        # Reference: https://github.com/scikit-learn/scikit-learn/blob/4d9a12d175a38f2bcb720389ad2213f71a3d7697/sklearn/ensemble/tests/test_partial_dependence.py
        # TODO: check on the feature space approximation (V2)
        # Test partial dependence for classifier
        clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
        clf.fit(self.sample_x, self.sample_y)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=self.sample_x)
        interpreter = Interpretation()
        interpreter.load_data(np.array(self.sample_x), self.sample_feature_name)
        pdp_df = interpreter.partial_dependence.partial_dependence(['0'],
                                                                   classifier_predict_fn,
                                                                   grid_resolution=5,
                                                                   sample=True)

        # assertEqual instead of the assertEquals alias, which was removed in Python 3.12
        self.assertEqual(pdp_df.shape[0], len(np.unique(interpreter.data_set['0'])))

        # now with our own grid
        ud_grid = np.unique(self.sample_x[:, 0])
        # input: array([-2, -1,  1,  2])
        # the returned grid should have exactly as many values as the user specified (4 here)
        pdp_df = interpreter.partial_dependence.partial_dependence(['0'], classifier_predict_fn,
                                                                   grid=ud_grid, sample=True)
        self.assertEqual(pdp_df.shape[0], len(ud_grid))
Пример #3
0
def plot_partial_dependence_skater(estimator, X_train, feature_names,
                                   n_samples=100, n_jobs=3,
                                   grid_resolution=50, figsize=(10, 15)):
    """Plot partial dependence of ``estimator.predict_proba`` for the given features.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict_proba``.
    X_train : array-like
        Data loaded into the interpreter and used as examples for the model wrapper.
    feature_names : list
        Names handed to the interpreter and used as the features to plot.
    n_samples, n_jobs, grid_resolution, figsize
        Forwarded unchanged to ``plot_partial_dependence``; the defaults are
        the values that used to be hard-coded here.

    Returns
    -------
    The value returned by ``plot_partial_dependence`` (previously discarded),
    so callers can save or further customise the plots.
    """
    # Initialize names and interpreter class (which serves as a 'data manager')
    interpreter = Interpretation()
    interpreter.load_data(X_train, feature_names=feature_names)
    model = InMemoryModel(estimator.predict_proba, examples=X_train)
    # Plot partial dependence plots
    return interpreter.partial_dependence.plot_partial_dependence(
        feature_names,
        model,
        n_samples=n_samples,
        n_jobs=n_jobs,
        grid_resolution=grid_resolution,
        figsize=figsize)
Пример #4
0
def _create_skater_stuff(mdl, test_x, test_z):
    """Build the skater model wrapper and interpreter for ``mdl``.

    Parameters
    ----------
    mdl
        Model providing ``get_state_lbl_lst()`` (used as class names) and
        ``get_obs_lbl_lst()`` (used as feature names).
    test_x
        Observations; converted with ``_boolean2str`` before being loaded.
    test_z
        Labels loaded into the interpreter as training labels.

    Returns
    -------
    tuple
        ``(skater_model, interpreter)``.
    """
    from skater.model import InMemoryModel
    from skater.core.explanations import Interpretation
    from hassbrain_algorithm.benchmark.interpretation import ModelWrapper
    from hassbrain_algorithm.benchmark.interpretation import _boolean2str

    wrapper = ModelWrapper(mdl)
    state_labels = mdl.get_state_lbl_lst()
    obs_labels = mdl.get_obs_lbl_lst()

    # skater only treats the values as categorical (not numerical) once they are strings
    test_x = _boolean2str(test_x)

    # interpretation object (class names are deliberately not passed)
    interpreter = Interpretation(test_x, feature_names=obs_labels)

    # model wrapper: classifier without probability scores, primed with the first ten rows
    skater_model = InMemoryModel(
        wrapper.predict,
        feature_names=obs_labels,
        model_type='classifier',
        unique_values=state_labels,
        probability=False,
        examples=test_x[:10])

    interpreter.load_data(test_x, training_labels=test_z, feature_names=obs_labels)

    # todo flag for deletion (the loop below), if this can safely be deleted
    # force every feature to be treated as non-numeric
    for info in interpreter.data_set.feature_info.values():
        info['numeric'] = False
    return skater_model, interpreter
Пример #5
0
def part_dep_plot(features):
    """Draw and save one partial dependence plot per entry of ``features``.

    Each plot is built from the module-level ``import_quest_demos`` data and
    ``rf_final.predict_proba``, titled and labelled through ``plt``, written to
    ``images/pdp_<feature>.png`` and then closed.
    """
    for question in features:
        interp = Interpretation()
        interp.load_data(import_quest_demos, feature_names=[question])
        predictor = InMemoryModel(rf_final.predict_proba,
                                  examples=import_quest_demos)
        interp.partial_dependence.plot_partial_dependence(
            [question],
            predictor,
            n_samples=100,
            n_jobs=-1,
            grid_resolution=50,
            figsize=(15, 15))

        out_path = "images/pdp_" + question + ".png"
        title_text = "Partial Dependency Plot of Question " + question
        plt.title(title_text, fontsize=20)
        plt.ylabel(
            "Average Predicted Probability of Attrition by Question Value (*0.1)",
            fontsize=15)
        x_text = "Question " + question + " Response Value"
        plt.xlabel(x_text, fontsize=15)
        plt.savefig(out_path)
        plt.close()
Пример #6
0
class TestFeatureImportance(unittest.TestCase):
    """Tests for skater's feature importance computation and plotting."""

    def setUp(self):
        """Create the synthetic data, fitted models and interpreter shared by the tests."""
        args = create_parser().parse_args()
        debug = args.debug
        self.seed = args.seed
        self.n = args.n
        self.dim = args.dim
        self.features = [str(i) for i in range(self.dim)]
        self.X = norm.rvs(0,
                          1,
                          size=(self.n, self.dim),
                          random_state=self.seed)
        self.B = np.array([-10.1, 2.2, 6.1])
        self.y = np.dot(self.X, self.B)
        self.y_as_int = np.round(expit(self.y))
        self.y_as_string = np.array([str(i) for i in self.y_as_int])
        # example dataset for y = B.X
        # X = array([[ 1.62434536, -0.61175641, -0.52817175], ... [-0.15065961, -1.40002289, -1.30106608]])  (1000 * 3)
        # B = array([-10.1,   2.2,   6.1])
        # y = array([ -2.09736000e+01,  -1.29850618e+00,  -1.73511155e+01, ...]) (1000 * 1)
        # features = ['0', '1', '2']
        ##
        # Other output types:
        # y_as_int = array[ 0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1., ...]
        # y_as_string = array['0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', ... ]

        # Another set of input
        # sample data
        self.sample_x = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2],
                                  [2, 1]])
        self.sample_y = np.array([-1, -1, -1, 1, 1, 1])
        self.sample_feature_name = [
            str(i) for i in range(self.sample_x.shape[1])
        ]

        if debug:
            self.interpreter = Interpretation(log_level='DEBUG')
        else:
            self.interpreter = Interpretation()  # default level is 'WARNING'
        self.interpreter.load_data(self.X, feature_names=self.features)

        self.regressor = LinearRegression()
        self.regressor.fit(self.X, self.y)
        self.regressor_predict_fn = InMemoryModel(self.regressor.predict,
                                                  examples=self.X)

        self.classifier = LogisticRegression()
        self.classifier.fit(self.X, self.y_as_int)
        self.classifier_predict_fn = InMemoryModel(
            self.classifier.predict,
            examples=self.X,
            unique_values=self.classifier.classes_)
        self.classifier_predict_proba_fn = InMemoryModel(
            self.classifier.predict_proba, examples=self.X)

        self.string_classifier = LogisticRegression()
        self.string_classifier.fit(self.X, self.y_as_string)
        self.string_classifier_predict_fn = InMemoryModel(
            self.string_classifier.predict_proba, examples=self.X)

    @staticmethod
    def feature_column_name_formatter(columnname):
        """Return ``columnname`` prefixed with ``"feature: "``."""
        return "feature: {}".format(columnname)

    def _assert_sums_to_one(self, importances):
        """Fail unless the importances add up to 1 (within ``np.isclose`` tolerance)."""
        total = importances.sum()
        # assertTrue replaces assertEquals(..., True); the assertEquals alias
        # was removed in Python 3.12.
        self.assertTrue(np.isclose(total, 1),
                        "importances sum to {}, expected 1".format(total))

    def _assert_conditional_permutation_normalized(self, predict_fn, labels):
        """Check 'conditional-permutation' importances, without and with scaling."""
        interpreter = Interpretation(self.X,
                                     feature_names=self.features,
                                     training_labels=labels)
        for use_scaling in (False, True):
            importances = interpreter.feature_importance.feature_importance(
                predict_fn,
                method='conditional-permutation',
                use_scaling=use_scaling)
            self._assert_sums_to_one(importances)

    def test_feature_importance(self):
        """Importances are normalized with one and with two jobs."""
        for n_jobs in (1, 2):
            importances = self.interpreter.feature_importance.feature_importance(
                self.regressor_predict_fn, n_jobs=n_jobs, progressbar=False)
            self._assert_sums_to_one(importances)

    def test_feature_importance_progressbar(self):
        """Importances are normalized when the progress bar is enabled."""
        importances = self.interpreter.feature_importance.feature_importance(
            self.regressor_predict_fn, progressbar=True)
        self._assert_sums_to_one(importances)

    def test_feature_importance_entropy_with_and_without_scaling(self):
        """Importances are normalized both with and without scaling."""
        for use_scaling in (True, False):
            importances = self.interpreter.feature_importance.feature_importance(
                self.regressor_predict_fn,
                progressbar=True,
                use_scaling=use_scaling)
            self._assert_sums_to_one(importances)

    def test_feature_importance_regression_via_preformance_decrease(self):
        """Conditional-permutation importances of the regressor are normalized."""
        self._assert_conditional_permutation_normalized(
            self.regressor_predict_fn, self.y)

    def test_feature_importance_classifier_via_preformance_decrease(self):
        """Conditional-permutation importances of the label classifier are normalized."""
        self._assert_conditional_permutation_normalized(
            self.classifier_predict_fn, self.y_as_int)

    def test_feature_importance_classifier_proba_via_preformance_decrease(
            self):
        """Conditional-permutation importances of the probability classifier are normalized."""
        self._assert_conditional_permutation_normalized(
            self.classifier_predict_proba_fn, self.y_as_int)

    def test_plot_feature_importance(self):
        """Plotting feature importance for the regressor runs without raising."""
        self.interpreter.feature_importance.plot_feature_importance(
            self.regressor_predict_fn)
Пример #7
0
class TestPartialDependence(unittest.TestCase):
    """Tests for skater's partial dependence computation and plotting."""

    def setUp(self):
        """Create the data sets, fitted models and interpreter shared by the tests."""
        args = create_parser().parse_args()
        debug = args.debug
        self.seed = args.seed
        self.n = args.n
        self.dim = args.dim
        self.features = [str(i) for i in range(self.dim)]
        self.X = norm.rvs(0, 1, size=(self.n, self.dim), random_state=self.seed)
        self.B = np.array([-10.1, 2.2, 6.1])
        self.y = np.dot(self.X, self.B)
        self.y_as_int = np.round(expit(self.y))
        self.y_as_string = np.array([str(i) for i in self.y_as_int])
        # example dataset for y = B.X
        # X = array([[ 1.62434536, -0.61175641, -0.52817175], ... [-0.15065961, -1.40002289, -1.30106608]])  (1000 * 3)
        # B = array([-10.1,   2.2,   6.1])
        # y = array([ -2.09736000e+01,  -1.29850618e+00,  -1.73511155e+01, ...]) (1000 * 1)
        # features = ['0', '1', '2']
        ##
        # Other output types:
        # y_as_int = array[ 0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1., ...]
        # y_as_string = array['0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', ... ]

        # Another set of input
        # sample data
        self.sample_x = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
        self.sample_y = np.array([-1, -1, -1, 1, 1, 1])
        self.sample_feature_name = [str(i) for i in range(self.sample_x.shape[1])]

        if debug:
            self.interpreter = Interpretation(log_level='DEBUG')
        else:
            self.interpreter = Interpretation()  # default level is 'WARNING'
        self.interpreter.load_data(self.X, feature_names=self.features)

        self.regressor = LinearRegression()
        self.regressor.fit(self.X, self.y)
        self.regressor_predict_fn = InMemoryModel(self.regressor.predict, examples=self.X)

        self.classifier = LogisticRegression()
        self.classifier.fit(self.X, self.y_as_int)
        self.classifier_predict_fn = InMemoryModel(self.classifier.predict,
                                                   examples=self.X,
                                                   unique_values=self.classifier.classes_)
        self.classifier_predict_proba_fn = InMemoryModel(self.classifier.predict_proba, examples=self.X)

        self.string_classifier = LogisticRegression()
        self.string_classifier.fit(self.X, self.y_as_string)
        self.string_classifier_predict_fn = InMemoryModel(self.string_classifier.predict_proba, examples=self.X)

        # Yet another set of input!!
        self.sample_x_categorical = np.array([['B', -1], ['A', -1], ['A', -2], ['C', 1], ['C', 2], ['A', 1]])
        self.sample_y_categorical = np.array(['A', 'A', 'A', 'B', 'B', 'B'])
        self.categorical_feature_names = ['Letters', 'Numbers']
        self.categorical_transformer = MultiColumnLabelBinarizer()
        self.categorical_transformer.fit(self.sample_x_categorical)
        self.sample_x_categorical_transormed = self.categorical_transformer.transform(self.sample_x_categorical)
        self.categorical_classifier = LogisticRegression()
        self.categorical_classifier.fit(self.sample_x_categorical_transormed, self.sample_y_categorical)
        # The wrapped model receives raw categorical rows, so binarize before predicting.
        self.categorical_predict_fn = lambda x: self.categorical_classifier.predict_proba(
            self.categorical_transformer.transform(x))
        self.categorical_model = InMemoryModel(self.categorical_predict_fn, examples=self.sample_x_categorical)

    def _assert_does_not_raise(self, failure_message, func, *args, **kwargs):
        """Call ``func(*args, **kwargs)`` and fail the test if it raises.

        Only ``Exception`` subclasses are converted into a test failure, so
        ``KeyboardInterrupt`` / ``SystemExit`` still propagate (the previous bare
        ``except:`` swallowed them). The original error is appended to
        ``failure_message`` so the cause is visible in the test report.
        """
        try:
            return func(*args, **kwargs)
        except Exception as err:
            self.fail("{0} ({1}: {2})".format(failure_message, type(err).__name__, err))

    def test_pdp_with_default_sampling(self):
        """A sampled 1D PDP of the regressor has the default 30 grid rows and 3 columns."""
        pdp_df = self.interpreter.partial_dependence.partial_dependence([self.features[0]],
                                                                        self.regressor_predict_fn,
                                                                        sample=True)
        self.assertEqual(pdp_df.shape, (30, 3))  # default grid resolution is 30

    def test_pd_with_categorical_features(self):
        """Computing and plotting partial dependence works with categorical features."""
        interpreter = Interpretation(self.sample_x_categorical, feature_names=self.categorical_feature_names)
        self._assert_does_not_raise("PD computation function failed with categorical features",
                                    interpreter.partial_dependence.partial_dependence,
                                    [self.categorical_feature_names[0]],
                                    self.categorical_model)
        self._assert_does_not_raise("PDP plotting function failed with categorical features",
                                    interpreter.partial_dependence.plot_partial_dependence,
                                    [self.categorical_feature_names],
                                    self.categorical_model)

    def test_partial_dependence_binary_classification(self):
        """Row count of a 1D PDP frame follows the feature's unique values or the user grid."""
        # In the default implementation of pdp on sklearn, there is an approx. done
        # if the number of unique values for a feature space < grid_resolution specified.
        # For now, we have decided to not have that approximation. In V2, we will be benchmarking for
        # performance as well. Around that time we will revisit the same.
        # Reference: https://github.com/scikit-learn/scikit-learn/blob/4d9a12d175a38f2bcb720389ad2213f71a3d7697/sklearn/ensemble/tests/test_partial_dependence.py
        # TODO: check on the feature space approximation (V2)
        # Test partial dependence for classifier
        clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
        clf.fit(self.sample_x, self.sample_y)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=self.sample_x)
        interpreter = Interpretation()
        interpreter.load_data(np.array(self.sample_x), self.sample_feature_name)
        pdp_df = interpreter.partial_dependence.partial_dependence(['0'],
                                                                   classifier_predict_fn,
                                                                   grid_resolution=5,
                                                                   sample=True)

        self.assertEqual(pdp_df.shape[0], len(np.unique(interpreter.data_set['0'])))

        # now with our own grid
        ud_grid = np.unique(self.sample_x[:, 0])
        # input: array([-2, -1,  1,  2])
        # the returned grid should have exactly as many values as the user specified (4 here)
        pdp_df = interpreter.partial_dependence.partial_dependence(['0'], classifier_predict_fn,
                                                                   grid=ud_grid, sample=True)
        self.assertEqual(pdp_df.shape[0], len(ud_grid))

    def test_partial_dependence_multiclass(self):
        """Partial dependence on Iris exposes the formatted feature column for GB and SVC."""
        # Iris data classes: ['setosa', 'versicolor', 'virginica']
        iris = datasets.load_iris()
        # 1. Using GB Classifier
        clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
        clf.fit(iris.data, iris.target)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=iris.data)
        interpreter = Interpretation()
        interpreter.load_data(iris.data, iris.feature_names)
        pdp_df = interpreter.partial_dependence.partial_dependence([iris.feature_names[0]], classifier_predict_fn,
                                                                   grid_resolution=25, sample=True)

        expected_feature_name = PartialDependence.feature_column_name_formatter('sepal length (cm)')

        self.assertIn(expected_feature_name,
                      pdp_df.columns.values,
                      "{0} not in columns {1}".format(expected_feature_name,
                                                      pdp_df.columns.values))
        # 2. Using SVC
        from sklearn import svm
        # With SVC, predict_proba is supported only if probability flag is enabled, by default it is false
        clf = svm.SVC(probability=True)
        clf.fit(iris.data, iris.target)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=iris.data)
        interpreter = Interpretation()
        interpreter.load_data(iris.data, iris.feature_names)
        pdp_df = interpreter.partial_dependence.partial_dependence([iris.feature_names[0]], classifier_predict_fn,
                                                                   grid_resolution=25, sample=True)
        self.assertIn(expected_feature_name,
                      pdp_df.columns.values,
                      "{} not in columns {}".format(expected_feature_name,
                                                    pdp_df.columns.values))

    def test_pdp_regression_coefs_closeness(self, epsilon=1):
        """The slope of the regressor's 1D PDP is within ``epsilon`` of the true coefficient."""
        pdp_df = self.interpreter.partial_dependence.partial_dependence([self.features[0]],
                                                                        self.regressor_predict_fn)
        val_col = PartialDependence.feature_column_name_formatter(self.features[0])

        y = np.array(pdp_df[self.regressor_predict_fn.target_names[0]])
        x = np.array(pdp_df[val_col])[:, np.newaxis]
        regressor = LinearRegression()
        regressor.fit(x, y)
        self.interpreter.logger.debug("Regressor coefs: {}".format(regressor.coef_))
        self.interpreter.logger.debug("Regressor coef shape: {}".format(regressor.coef_.shape))
        coef = regressor.coef_[0]
        # The second argument of assertTrue is the failure message; it used to be
        # a meaningless ``True``. assertLess reports both operands on failure.
        self.assertLess(abs(coef - self.B[0]), epsilon,
                        "PDP slope {0} is not within {1} of coefficient {2}".format(coef, epsilon, self.B[0]))

    def test_pdp_inputs(self):
        """Loading ``None`` as data is rejected with an 'Invalid Data' error."""
        interpreter = Interpretation()
        # assertRaisesRegex replaces the assertRaisesRegexp alias removed in Python 3.12.
        self.assertRaisesRegex(Exception, "Invalid Data", interpreter.load_data, None, self.sample_feature_name)

    def test_2D_pdp(self):
        """A sampled 2D PDP of the regressor can be computed."""
        self._assert_does_not_raise("2D regressor pd failed",
                                    self.interpreter.partial_dependence.partial_dependence,
                                    self.features[:2],
                                    self.regressor_predict_fn,
                                    grid_resolution=10,
                                    sample=True)

    def test_plot_1D_pdp(self):
        """A 1D PDP of the regressor can be plotted."""
        self._assert_does_not_raise("1D regressor plot failed",
                                    self.interpreter.partial_dependence.plot_partial_dependence,
                                    [self.features[0]],
                                    self.regressor_predict_fn,
                                    grid_resolution=10)

    def test_plot_1D_pdp_with_sampling(self):
        """A sampled 1D PDP of the regressor can be plotted."""
        # The message used to say "classifier" although the regressor is plotted.
        self._assert_does_not_raise("1D regressor plot with sampling failed",
                                    self.interpreter.partial_dependence.plot_partial_dependence,
                                    [self.features[0]],
                                    self.regressor_predict_fn,
                                    grid_resolution=10,
                                    sample=True)

    def test_plot_2D_pdp(self):
        """An unsampled 2D PDP of the regressor can be plotted."""
        self._assert_does_not_raise("2D partial dep plot failed",
                                    self.interpreter.partial_dependence.plot_partial_dependence,
                                    self.features[:2],
                                    self.regressor_predict_fn,
                                    grid_resolution=10,
                                    sample=False)

    def test_plot_2D_pdp_with_sampling(self):
        """A sampled 2D PDP of the regressor can be plotted."""
        self._assert_does_not_raise("2D regressor with sampling failed",
                                    self.interpreter.partial_dependence.plot_partial_dependence,
                                    self.features[:2],
                                    self.regressor_predict_fn,
                                    grid_resolution=10,
                                    sample=True)

    def test_fail_when_grid_range_is_outside_0_and_1(self):
        """A grid range with an upper bound above 1 raises MalformedGridRangeError."""
        with self.assertRaises(exceptions.MalformedGridRangeError):
            self.interpreter.partial_dependence.partial_dependence([self.features[0]],
                                                                   self.regressor_predict_fn,
                                                                   grid_range=(.01, 1.01))

    def test_pdp_1d_classifier_no_proba(self):
        """A 1D PDP works for a classifier that outputs labels instead of probabilities."""
        # The computation used to be run a second time outside the guard,
        # which made the guarded call redundant.
        self._assert_does_not_raise("1D pdp without proba failed",
                                    self.interpreter.partial_dependence.partial_dependence,
                                    self.features[:1],
                                    self.classifier_predict_fn,
                                    grid_resolution=10)

    def test_pdp_2d_classifier_no_proba(self):
        """A 2D PDP works for a classifier that outputs labels instead of probabilities."""
        self._assert_does_not_raise("2D pdp without proba failed",
                                    self.interpreter.partial_dependence.partial_dependence,
                                    self.features[:2],
                                    self.classifier_predict_fn,
                                    grid_resolution=10)

    def test_pdp_1d_classifier_with_proba(self):
        """A 1D PDP works for a classifier that outputs probabilities."""
        self._assert_does_not_raise("1D classifier with probability scores failed",
                                    self.interpreter.partial_dependence.partial_dependence,
                                    self.features[:1],
                                    self.classifier_predict_proba_fn,
                                    grid_resolution=10)

    def test_pdp_2d_classifier_with_proba(self):
        """A 2D PDP works for a classifier that outputs probabilities."""
        self._assert_does_not_raise("2D classifier with probability scores failed",
                                    self.interpreter.partial_dependence.partial_dependence,
                                    self.features[:2],
                                    self.classifier_predict_proba_fn,
                                    grid_resolution=10)

    def test_pdp_1d_string_classifier_no_proba(self):
        """String label predictions wrapped without ``unique_values`` raise ModelError."""
        with self.assertRaises(exceptions.ModelError):
            self.interpreter.partial_dependence.partial_dependence(self.features[:1],
                                                                   InMemoryModel(self.string_classifier.predict,
                                                                                 examples=self.X),
                                                                   grid_resolution=10)

    def test_pdp_1d_string_classifier_with_proba(self):
        """A 1D PDP works for the probability output of a classifier trained on string labels."""
        self._assert_does_not_raise('1D string classifier pd failed',
                                    self.interpreter.partial_dependence.partial_dependence,
                                    self.features[:1],
                                    self.string_classifier_predict_fn,
                                    grid_resolution=10)

    def test_pd_with_long_string_feature_name_issue_166(self):
        """A single multi-character feature name passed as a bare string is accepted (issue 166)."""
        feature_names = ['longstring_{}'.format(i) for i in range(self.X.shape[1])]
        interpreter = Interpretation(self.X, feature_names=feature_names)
        # The name is deliberately passed as a plain string, not wrapped in a list.
        self._assert_does_not_raise("1D Partial dependence failed when passing long string name",
                                    interpreter.partial_dependence.partial_dependence,
                                    feature_names[0],
                                    self.regressor_predict_fn)
Пример #8
0
def compute_feature_importance(model: Model, X: np.ndarray, z: np.ndarray):
    """Plot the feature importance of ``model`` on a data set and show the figure.

    Parameters
    ----------
    model : Model
        Model providing ``get_state_lbl_lst()`` (used as class names) and
        ``get_obs_lbl_lst()`` (used as feature names); it is wrapped in
        ``ModelWrapper`` for prediction.
    X : array-like
        the array the importance should be calculated on
    z : array-like
        the corresponding labels

    Returns
    -------
    None
        The figure produced by ``save_plot_feature_importance`` is displayed
        with ``fig.show()``; nothing is returned.
    """
    from skater.model import InMemoryModel
    from skater.core.explanations import Interpretation
    wrapped_model = ModelWrapper(model)
    class_names = model.get_state_lbl_lst()
    feature_names = model.get_obs_lbl_lst()

    # this has to be done in order for skater to recognize the values as categorical and not numerical
    X = _boolean2str(X)

    # create interpretation
    interpreter = Interpretation(X, feature_names=feature_names)

    # create model
    # supports classifiers with or without probability scores
    examples = X[:10]
    skater_model = InMemoryModel(
        wrapped_model.predict,
        feature_names=feature_names,
        model_type='classifier',
        unique_values=class_names,
        probability=False,
        examples=examples)

    # NOTE: one-hot encoding of the labels would only be needed for the
    # cross_entropy scorer, which is not used here.
    interpreter.load_data(X, training_labels=z, feature_names=feature_names)

    # todo flag for deletion (the loop below), if this can safely be deleted
    # force every feature to be treated as non-numeric
    for info in interpreter.data_set.feature_info.values():
        info['numeric'] = False

    fig, _axes = interpreter.feature_importance.save_plot_feature_importance(
        skater_model,
        ascending=True,
        ax=None,
        progressbar=False,
        # model-scoring: difference in log_loss or MAE of training_labels
        # given perturbations. Note this very rarely makes any significant
        # differences
        method='model-scoring')
    # scorer_type ('f1' or 'cross_entropy') is left at its default;
    # cross_entropy was observed to yield zero.

    fig.show()
Пример #9
0
    examples=examples)


def onehot(train_z, num_classes):
    """Return a ``(len(train_z), num_classes)`` float64 matrix of one-hot rows.

    Row ``t`` is all zeros except for a ``1.`` in column ``train_z[t]``.
    """
    encoded = np.zeros((len(train_z), num_classes), dtype=np.float64)
    # Iterating a 2D array yields row views, so writing to ``row`` updates ``encoded``.
    for t, row in enumerate(encoded):
        row[train_z[t]] = 1.
    return encoded


# One-hot encoding of the labels is only needed for the cross_entropy scorer,
# so it is left disabled here.
#train_z = onehot(train_z, model.K)

# Load the training observations, their labels and the feature names into the interpreter.
interpreter.load_data(train_x,
                      training_labels=train_z,
                      feature_names=feature_names)
# Flag every feature as non-numeric in the data set's feature metadata.
tmp = interpreter.data_set.feature_info
for key, val in tmp.items():
    val['numeric'] = False

# Build the feature importance plot for skater_model; the call yields a (fig, axes) pair.
fig, axes = interpreter.feature_importance.save_plot_feature_importance(
    skater_model,
    n_samples=18,
    ascending=True,
    ax=None,
    progressbar=False,
    # model-scoring: difference in log_loss or MAE of training_labels
    # given perturbations. Note this very rarely makes any significant
    # differences
    method='model-scoring')