Example #1
    def test_partial_dependence_multiclass(self):
        # Iris data classes: ['setosa', 'versicolor', 'virginica']
        iris = datasets.load_iris()
        # 1. Using GB Classifier
        clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
        clf.fit(iris.data, iris.target)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=iris.data)
        interpreter = Interpretation(training_data=iris.data, feature_names=iris.feature_names)
        pdp_df = interpreter.partial_dependence.partial_dependence([iris.feature_names[0]], classifier_predict_fn,
                                                                   grid_resolution=25, sample=True)

        expected_feature_name = PartialDependence.feature_column_name_formatter('sepal length (cm)')

        self.assertIn(expected_feature_name,
                      pdp_df.columns.values,
                      "{0} not in columns {1}".format(expected_feature_name,
                                                      pdp_df.columns.values))
        # 2. Using SVC
        from sklearn import svm
        # With SVC, predict_proba is supported only if probability flag is enabled, by default it is false
        clf = svm.SVC(probability=True)
        clf.fit(iris.data, iris.target)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=iris.data)
        interpreter = Interpretation(training_data=iris.data, feature_names=iris.feature_names)

        pdp_df = interpreter.partial_dependence.partial_dependence([iris.feature_names[0]], classifier_predict_fn,
                                                                   grid_resolution=25, sample=True)
        self.assertIn(expected_feature_name,
                      pdp_df.columns.values,
                      "{} not in columns {}".format(*[expected_feature_name,
                                                      pdp_df.columns.values]))
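
For orientation, the frame returned by partial_dependence above carries the formatted feature column plus one averaged-prediction column per class. A minimal inspection sketch (the printed names are assumptions based on the assertions above and on the "feature: {}" formatter shown in Example #27):

# Hypothetical follow-up inspection of the PD frame built above.
print(pdp_df.columns.values)
# Expected to include PartialDependence.feature_column_name_formatter('sepal length (cm)'),
# i.e. a formatted "sepal length (cm)" column, alongside one prediction column per iris class.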
Example #2
    def test_partial_dependence_binary_classification(self):
        # sklearn's default PDP implementation approximates the grid when a feature has
        # fewer unique values than the specified grid_resolution. We have decided not to
        # apply that approximation for now; we will revisit it in V2, when we also
        # benchmark performance.
        # Reference: https://github.com/scikit-learn/scikit-learn/blob/4d9a12d175a38f2bcb720389ad2213f71a3d7697/sklearn/ensemble/tests/test_partial_dependence.py
        # TODO: check on the feature space approximation (V2)
        # Test partial dependence for classifier
        clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
        clf.fit(self.sample_x, self.sample_y)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=self.sample_x)
        interpreter = Interpretation()
        interpreter.load_data(np.array(self.sample_x), self.sample_feature_name)
        pdp_df = interpreter.partial_dependence.partial_dependence(['0'],
                                                                   classifier_predict_fn,
                                                                   grid_resolution=5,
                                                                   sample=True)

        self.assertEqual(pdp_df.shape[0], len(np.unique(interpreter.data_set['0'])))

        # now with our own grid
        ud_grid = np.unique(self.sample_x[:, 0])
        # input: array([-2, -1,  1,  2])
        # the returned grid should have only 4 values as specified by the user
        pdp_df = interpreter.partial_dependence.partial_dependence(['0'], classifier_predict_fn,
                                                                   grid=ud_grid, sample=True)
        self.assertEqual(pdp_df.shape[0], 4)
Example #3
    def setUp(self):
        args = create_parser().parse_args()
        debug = args.debug
        self.seed = args.seed
        self.n = args.n
        self.dim = args.dim
        self.features = [str(i) for i in range(self.dim)]
        self.X = norm.rvs(0, 1, size=(self.n, self.dim), random_state=self.seed)
        self.B = np.array([-10.1, 2.2, 6.1])
        self.y = np.dot(self.X, self.B)
        self.y_as_int = np.round(expit(self.y))
        self.y_as_string = np.array([str(i) for i in self.y_as_int])
        # example dataset for y = B.X
        # X = array([[ 1.62434536, -0.61175641, -0.52817175], ... [-0.15065961, -1.40002289, -1.30106608]])  (1000 * 3)
        # B = array([-10.1,   2.2,   6.1])
        # y = array([ -2.09736000e+01,  -1.29850618e+00,  -1.73511155e+01, ...]) (1000 * 1)
        # features = ['0', '1', '2']
        ##
        # Other output types:
        # y_as_int = array[ 0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1., ...]
        # y_as_string = array['0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', ... ]


        # Another set of input
        # sample data
        self.sample_x = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
        self.sample_y = np.array([-1, -1, -1, 1, 1, 1])
        self.sample_feature_name = [str(i) for i in range(self.sample_x.shape[1])]

        if debug:
            self.interpreter = Interpretation(log_level='DEBUG')
        else:
            self.interpreter = Interpretation()  # default level is 'WARNING'
        self.interpreter.load_data(self.X, feature_names=self.features)

        self.regressor = LinearRegression()
        self.regressor.fit(self.X, self.y)
        self.regressor_predict_fn = InMemoryModel(self.regressor.predict, examples=self.X)

        self.classifier = LogisticRegression()
        self.classifier.fit(self.X, self.y_as_int)
        self.classifier_predict_fn = InMemoryModel(self.classifier.predict, examples=self.X, unique_values=self.classifier.classes_)
        self.classifier_predict_proba_fn = InMemoryModel(self.classifier.predict_proba, examples=self.X)

        self.string_classifier = LogisticRegression()
        self.string_classifier.fit(self.X, self.y_as_string)
        self.string_classifier_predict_fn = InMemoryModel(self.string_classifier.predict_proba, examples=self.X)


        # Yet another set of input!!
        self.sample_x_categorical = np.array([['B', -1], ['A', -1], ['A', -2], ['C', 1], ['C', 2], ['A', 1]])
        self.sample_y_categorical = np.array(['A', 'A', 'A', 'B', 'B', 'B'])
        self.categorical_feature_names = ['Letters', 'Numbers']
        self.categorical_transformer = MultiColumnLabelBinarizer()
        self.categorical_transformer.fit(self.sample_x_categorical)
        self.sample_x_categorical_transformed = self.categorical_transformer.transform(self.sample_x_categorical)
        self.categorical_classifier = LogisticRegression()
        self.categorical_classifier.fit(self.sample_x_categorical_transformed, self.sample_y_categorical)
        self.categorical_predict_fn = lambda x: self.categorical_classifier.predict_proba(self.categorical_transformer.transform(x))
        self.categorical_model = InMemoryModel(self.categorical_predict_fn, examples=self.sample_x_categorical)
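
MultiColumnLabelBinarizer above is a Skater test utility; as a rough mental model (a minimal sketch under that assumption, not Skater's actual implementation), it one-hot encodes each input column independently:

import numpy as np
from sklearn.preprocessing import LabelBinarizer

class ColumnwiseBinarizer:
    # Hypothetical stand-in: fit one LabelBinarizer per column of a 2D array.
    def fit(self, X):
        self.binarizers_ = [LabelBinarizer().fit(col) for col in np.asarray(X).T]
        return self

    def transform(self, X):
        cols = [b.transform(col) for b, col in zip(self.binarizers_, np.asarray(X).T)]
        return np.hstack(cols)

X = np.array([['B', -1], ['A', -1], ['A', -2], ['C', 1], ['C', 2], ['A', 1]])
print(ColumnwiseBinarizer().fit(X).transform(X).shape)  # (6, 7): 3 + 4 one-hot columns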
Example #4
    def setUp(self):
        args = create_parser().parse_args()
        debug = args.debug
        self.seed = args.seed
        self.n = args.n
        self.dim = args.dim
        self.features = [str(i) for i in range(self.dim)]
        self.X = norm.rvs(0, 1, size=(self.n, self.dim), random_state=self.seed)
        self.B = np.array([-10.1, 2.2, 6.1])
        self.y = np.dot(self.X, self.B)
        self.y_as_int = np.round(expit(self.y))
        self.y_as_string = np.array([str(i) for i in self.y_as_int])
        # example dataset for y = B.X
        # X = array([[ 1.62434536, -0.61175641, -0.52817175], ... [-0.15065961, -1.40002289, -1.30106608]])  (1000 * 3)
        # B = array([-10.1,   2.2,   6.1])
        # y = array([ -2.09736000e+01,  -1.29850618e+00,  -1.73511155e+01, ...]) (1000 * 1)
        # features = ['0', '1', '2']
        ##
        # Other output types:
        # y_as_int = array[ 0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1., ...]
        # y_as_string = array['0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', ... ]


        # Another set of input
        # sample data
        self.sample_x = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
        self.sample_y = np.array([-1, -1, -1, 1, 1, 1])
        self.sample_feature_name = [str(i) for i in range(self.sample_x.shape[1])]

        if debug:
            self.interpreter = Interpretation(training_data=self.X, feature_names=self.features, log_level='DEBUG')
        else:
            self.interpreter = Interpretation(training_data=self.X, feature_names=self.features)  # default level is 'WARNING'

        self.regressor = LinearRegression()
        self.regressor.fit(self.X, self.y)
        self.regressor_predict_fn = InMemoryModel(self.regressor.predict, examples=self.X)

        self.classifier = LogisticRegression()
        self.classifier.fit(self.X, self.y_as_int)
        self.classifier_predict_fn = InMemoryModel(self.classifier.predict,
                                                   examples=self.X,
                                                   unique_values=self.classifier.classes_,
                                                   probability=False)
        self.classifier_predict_proba_fn = InMemoryModel(self.classifier.predict_proba,
                                                         examples=self.X,
                                                         probability=True)

        self.string_classifier = LogisticRegression()
        self.string_classifier.fit(self.X, self.y_as_string)
        self.string_classifier_predict_fn = InMemoryModel(self.string_classifier.predict_proba,
                                                          examples=self.X,
                                                          probability=True)
Example #5
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel


def plot_partial_dependence_skater(estimator, X_train, feature_names):
    # Initialize names and interpreter class (which serves as a 'data manager')
    interpreter = Interpretation()
    interpreter.load_data(X_train, feature_names=feature_names)
    model = InMemoryModel(estimator.predict_proba, examples=X_train)
    # Plot partial dependence plots
    pdplots = interpreter.partial_dependence.plot_partial_dependence(
        feature_names,
        model,
        n_samples=100,
        n_jobs=3,
        grid_resolution=50,
        figsize=(10, 15))
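
A hypothetical way to drive this helper end to end; the iris data and the random forest here are our assumptions, not part of the original snippet:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(iris.data, iris.target)
# The helper needs predict_proba, which the fitted forest provides.
plot_partial_dependence_skater(rf, iris.data, list(iris.feature_names))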
Example #6
def explain(skater_exp: Explanation, training_df, test_df, explanation_target,
            prefix_target):
    job = skater_exp.job
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]

    features = list(training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    interpreter = Interpretation(training_df, feature_names=features)
    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    Y_train = training_df['label'].values

    model_inst = InMemoryModel(model.predict,
                               examples=X_train,
                               model_type=model._estimator_type,
                               unique_values=[1, 2],
                               feature_names=features,
                               target_names=['label'])
    surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)

    surrogate_explainer.fit(X_train,
                            Y_train,
                            use_oracle=True,
                            prune='post',
                            scorer_type='default')
    surrogate_explainer.class_names = features

    viz = dtreeviz(surrogate_explainer.estimator_,
                   X_train,
                   Y_train,
                   target_name='label',
                   feature_names=features,
                   orientation="TD",
                   class_names=list(surrogate_explainer.class_names),
                   fancy=True,
                   X=None,
                   label_fontsize=12,
                   ticks_fontsize=8,
                   fontname="Arial")
    name = create_unique_name("skater_plot.svg")
    viz.save(name)
    if os.path.getsize(name) > 15000000:
        return 'The file size is too big'
    f = open(name, "r")
    response = f.read()
    os.remove(name)
    if os.path.isfile(name.split('.svg')[0]):
        os.remove(name.split('.svg')[0])

    return response
Example #7
def analyze(model_prediction, X_train, y_train):
    skater_model = InMemoryModel(model_prediction, examples=X_train)
    interpreter = Interpretation(X_train, feature_names=X_train.columns)

    surrogate_explainer = interpreter.tree_surrogate(skater_model, seed=5)
    surrogate_explainer.fit(X_train,
                            y_train,
                            use_oracle=True,
                            prune='post',
                            scorer_type='default')
    surrogate_explainer.plot_global_decisions(
        colors=['coral', 'lightsteelblue', 'darkkhaki'],
        file_name='simple_tree_pre.png')

    return Image(filename='simple_tree_pre.png')
Example #8
    def __init__(self, model, train_x, test_x, train_y, test_y, feature_names):
        '''
        This class should be initialized with a scikit-learn fitted model
        and four dataframes (or vectors) for training and testing features and labels
        '''
        
        print("Initiating FeatureSelector...")
        self.model = model
        self.train_x = train_x
        self.test_x = test_x
        self.train_y = train_y
        self.test_y = test_y
        self.feature_names = feature_names
        print("Done.")
        print("-----------------------------------")

        print("Creating interpretator for model")
        self.interpreter = Interpretation(self.test_x, feature_names = self.test_x.columns)
        print("Done.")
        print("-----------------------------------")


        print("Building SHAP explainer for Tree Model")
        self.explainer = shap.TreeExplainer(self.model)
        print("Done.")
        print("-----------------------------------")

        print("Computing SHAP values for first 100 test samples...")
        print("Attention! For a large dataset this procedure could take a while.")
        self.shap_values = self.explainer.shap_values(test_x[:100])
        print("Done.")

        shap.initjs()
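
A sketch of a typical follow-up on an initialized instance; the variable fs and the choice of plot are ours, not part of the class:

# Hypothetical usage after construction:
#   fs = FeatureSelector(model, train_x, test_x, train_y, test_y, feature_names)
# Summarize the precomputed SHAP values for the same 100 test samples.
shap.summary_plot(fs.shap_values, fs.test_x[:100])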
Example #9
    def init_skater(self, target_names=None):
        """
        Initialize Skater: set up the Skater interpreter and the in-memory model.
        :return: void (Sets the values of the skater_interpreter and skater_model variables)
        """
        from skater.core.explanations import Interpretation
        from skater.model import InMemoryModel

        if not self.skater_interpreter or not self.skater_model:

            log.info(
                "Initializing Skater - generating new in-memory model."
                " This operation may be time-consuming so please be patient.")

            self.skater_interpreter = Interpretation(
                training_data=self.X_train_ohe,
                training_labels=self.y_train,
                feature_names=self.features_ohe)

            self.skater_model = InMemoryModel(
                self.model[1].predict_proba,
                examples=self.X_test_ohe,
                target_names=target_names,
                unique_values=self.y_train.unique())
        else:
            log.info("Skater is already initialized.")
Example #10
 def test_pdp_inputs(self):
     clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
     clf.fit(self.sample_x, self.sample_y)
     interpreter = Interpretation()
     self.assertRaisesRegex(Exception, "Invalid Data",
                            interpreter.load_data, None,
                            self.sample_feature_name)
Example #11
    def test_pd_with_long_string_feature_name_issue_166(self):

        feature_names = ['longstring_{}'.format(i) for i in range(self.X.shape[1])]
        interpreter = Interpretation(self.X, feature_names=feature_names)
        try:
            interpreter.partial_dependence.partial_dependence(feature_names[0],
                                                              self.regressor_predict_fn)
        except Exception:
            self.fail("1D Partial dependence failed when passing long string name")
Example #12
File: utils.py  Project: vpekar/forecastml
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel


def get_permuted_feature_scores(model, data):
    """Compute permuted feature importances using Skater.
    """
    interpreter = Interpretation(data.testX, feature_names=data.feature_names)
    pyint_model = InMemoryModel(model.predict, examples=data.testX)
    feature_scores = list(
        interpreter.feature_importance.feature_importance(
            pyint_model, ascending=False, progressbar=False).items())
    return feature_scores
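
A hedged usage sketch; data is assumed to be the project's data wrapper exposing testX and feature_names, as the function signature implies:

# Hypothetical: print the permuted importances of a fitted model, best first.
for name, score in get_permuted_feature_scores(model, data):
    print(name, score)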
Example #13
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel


def analyze(model_prediction, X_train, render=False):
    skater_model = InMemoryModel(model_prediction, examples=X_train)
    interpreter = Interpretation(X_train, feature_names=X_train.columns)
    
    result = interpreter.feature_importance.feature_importance(skater_model, ascending=False)
    
    if render:
        return render_feature_importance(result)
    else:
        return result
Example #14
 def test_pd_with_categorical_features(self):
     interpreter = Interpretation(self.sample_x_categorical, feature_names=self.categorical_feature_names)
     try:
         interpreter.partial_dependence.partial_dependence([self.categorical_feature_names[0]], self.categorical_model)
     except Exception:
         self.fail("PD computation function failed with categorical features")
     try:
         interpreter.partial_dependence.plot_partial_dependence([self.categorical_feature_names], self.categorical_model)
     except Exception:
         self.fail("PDP plotting function failed with categorical features")
Example #15
    def handle(self, *args, **kwargs):
        # get model
        TARGET_MODEL = 71
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)[0]
        # load data
        training_df, test_df = get_encoded_logs(job)

        features = list(
            training_df.drop(['trace_id', 'label'], axis=1).columns.values)
        interpreter = Interpretation(training_df, feature_names=features)
        X_train = training_df.drop(['trace_id', 'label'], axis=1)
        Y_train = training_df['label'].values

        model_inst = InMemoryModel(model.predict,
                                   examples=X_train,
                                   model_type='classifier',
                                   unique_values=[1, 2],
                                   feature_names=features,
                                   target_names=['label'])
        surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)

        surrogate_explainer.fit(X_train,
                                Y_train,
                                use_oracle=True,
                                prune='post',
                                scorer_type='default')
        surrogate_explainer.class_names = features

        viz = dtreeviz(surrogate_explainer.estimator_,
                       X_train,
                       Y_train,
                       target_name='label',
                       feature_names=features,
                       orientation="TD",
                       class_names=list(surrogate_explainer.class_names),
                       fancy=True,
                       X=None,
                       label_fontsize=12,
                       ticks_fontsize=8,
                       fontname="Arial")
        viz.save("skater_plot_train_2_2.svg")
Example #16
    def test_feature_importance_classifier_proba_via_performance_decrease(self):
        interpreter = Interpretation(self.X, feature_names=self.features, training_labels=self.y_as_int)
        importances = interpreter.feature_importance.feature_importance(self.classifier_predict_proba_fn,
                                                                        method='model-scoring',
                                                                        use_scaling=False)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

        importances = interpreter.feature_importance.feature_importance(self.classifier_predict_proba_fn,
                                                                        method='model-scoring',
                                                                        use_scaling=True)
        self.assertEqual(np.isclose(importances.sum(), 1), True)
Example #17
 def test_feature_importance_sampling(self):
     """
     https://github.com/datascienceinc/Skater/issues/192
     We should be able to sample the data and use training labels.
     :return:
     """
     interpreter = Interpretation(self.X, feature_names=self.features, training_labels=self.y_as_int)
     importances = interpreter.feature_importance.feature_importance(self.classifier_predict_proba_fn,
                                                                     n_samples=len(self.X) - 1,
                                                                     method='model-scoring',
                                                                     use_scaling=True)
      self.assertEqual(np.isclose(importances.sum(), 1), True)
Example #18
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel


def analyze(features, model_prediction, X_train, resolution=20, render=False):
    skater_model = InMemoryModel(model_prediction, examples=X_train)
    interpreter = Interpretation(X_train, feature_names=X_train.columns)

    result = interpreter.partial_dependence.partial_dependence(
        features, skater_model, grid_resolution=resolution)
    result.rename(columns={'predicted_1': 'Prediction'}, inplace=True)

    if render:
        return render_partial_dependence(result, features)
    else:
        return result
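
A hypothetical call; clf, X_train, and the feature name 'age' are placeholders for a fitted probabilistic classifier, its training frame, and one of its columns:

# Hypothetical: 20-point partial dependence of a single feature, returned as a frame.
pd_df = analyze(['age'], clf.predict_proba, X_train, resolution=20)
print(pd_df.head())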
Example #19
 def __init__(self, random_forest_model, x_train, y_train):
     self.rf_model = random_forest_model
     self.x_train = x_train
     self.y_train = y_train
     self.columns = list(x_train.columns)
     self.explainer = LimeTabularExplainer(x_train.values,
                                           feature_names=self.columns)
     self.model = InMemoryModel(self.rf_model.predict_proba,
                                examples=self.x_train)
     self.interpreter = Interpretation(training_data=self.x_train,
                                       feature_names=self.columns,
                                       training_labels=self.y_train)
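
A sketch of how the stored LIME explainer is typically used per instance; this method is hypothetical, but explain_instance and save_to_file are standard lime API (see also Example #25):

 def explain_row(self, row_idx):
     # Hypothetical method: explain a single training row with the stored explainer.
     exp = self.explainer.explain_instance(self.x_train.values[row_idx],
                                           self.rf_model.predict_proba)
     exp.save_to_file('lime_row{}.html'.format(row_idx))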
Example #20
def _create_skater_stuff(mdl, test_x, test_z):
    from skater.model import InMemoryModel
    from skater.core.explanations import Interpretation
    from hassbrain_algorithm.benchmark.interpretation import ModelWrapper
    from hassbrain_algorithm.benchmark.interpretation import _boolean2str

    wrapped_model = ModelWrapper(mdl)
    class_names = mdl.get_state_lbl_lst()
    feature_names = mdl.get_obs_lbl_lst()

    # This has to be done so that Skater recognizes the values as categorical rather than numerical
    test_x = _boolean2str(test_x)

    # create interpretation
    interpreter = Interpretation(
        test_x,
        #class_names=class_names,
        feature_names=feature_names)

    # create model
    # supports classifiers with or without probability scores
    examples = test_x[:10]
    skater_model = InMemoryModel(
        wrapped_model.predict,
        #target_names=class_names,
        feature_names=feature_names,
        model_type='classifier',
        unique_values=class_names,
        probability=False,
        examples=examples)

    interpreter.load_data(test_x,
                          training_labels=test_z,
                          feature_names=feature_names)
    # TODO: flag for deletion (3 lines below) if this can safely be deleted
    tmp = interpreter.data_set.feature_info
    for key, val in tmp.items():
        val['numeric'] = False
    return skater_model, interpreter
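
The returned pair plugs straight into Skater's global interpretation API; a hypothetical continuation:

# Hypothetical: rank observation features by importance with the returned pair.
skater_model, interpreter = _create_skater_stuff(mdl, test_x, test_z)
imps = interpreter.feature_importance.feature_importance(skater_model, progressbar=False)
print(imps.sort_values(ascending=False))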
Example #21
import matplotlib.pyplot as plt

from skater.core.explanations import Interpretation
from skater.model import InMemoryModel


def part_dep_plot(features):
    for feature in features:
        interpreter = Interpretation()
        interpreter.load_data(import_quest_demos, feature_names=[feature])
        model = InMemoryModel(rf_final.predict_proba,
                              examples=import_quest_demos)
        pdplots = interpreter.partial_dependence.plot_partial_dependence(
            [feature],
            model,
            n_samples=100,
            n_jobs=-1,
            grid_resolution=50,
            figsize=(15, 15))
        name = "images/pdp_" + feature + ".png"
        plt.title("Partial Dependency Plot of Question " + feature,
                  fontsize=20)
        plt.ylabel(
            "Average Predicted Probability of Attrition by Question Value (*0.1)",
            fontsize=15)
        plt.xlabel("Question " + feature + " Response Value", fontsize=15)
        plt.savefig(name)
        plt.close()
Example #22
    def test_feature_importance_regression_via_performance_decrease(self):
        interpreter = Interpretation(self.X,
                                     feature_names=self.features,
                                     training_labels=self.y)
        importances = interpreter.feature_importance.feature_importance(
            self.regressor_predict_fn,
            method='conditional-permutation',
            use_scaling=False)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

        importances = interpreter.feature_importance.feature_importance(
            self.regressor_predict_fn,
            method='conditional-permutation',
            use_scaling=True)
        self.assertEqual(np.isclose(importances.sum(), 1), True)
Example #23
 @classmethod
 def setUpClass(cls):
     # Classification use-case
     cls.X_c, cls.y_c = make_moons(1000, noise=0.5)
     cls.X_c = pd.DataFrame(cls.X_c, columns=['F1', 'F2'])
     cls.target_names = ['class 0', 'class 1']
     cls.X_train_c, cls.X_test_c, cls.y_train_c, cls.y_test_c = train_test_split(
         cls.X_c, cls.y_c)
     cls.classifier_est = DecisionTreeClassifier(max_depth=5,
                                                 random_state=5)
     cls.classifier_est.fit(cls.X_train_c, cls.y_train_c)
     cls.interpreter = Interpretation(cls.X_train_c,
                                      feature_names=cls.X_c.columns)
     cls.model_inst = InMemoryModel(cls.classifier_est.predict,
                                    examples=cls.X_train_c,
                                    model_type='classifier',
                                    unique_values=[0, 1],
                                    feature_names=cls.X_c.columns,
                                    target_names=cls.target_names,
                                    log_level=_INFO)
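
Elsewhere in these examples (e.g. #7 and #15) an interpreter/model pair like this feeds the tree-surrogate API; a sketch of that continuation under the same assumptions:

# Hypothetical: fit a surrogate tree against the classifier's predictions.
surrogate = cls.interpreter.tree_surrogate(cls.model_inst, seed=5)
surrogate.fit(cls.X_train_c, cls.y_train_c, use_oracle=True)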
Example #24
    def __init__(self, predictive_model, feature_labels=None, dataset=None):
        if isinstance(predictive_model, MatPipe):
            self.predictive_model = predictive_model
            self.feature_labels = predictive_model.learner.features
            self.target = predictive_model.learner.fitted_target
            self.dataset = predictive_model.post_fit_df.drop([self.target],
                                                             axis=1)
        if feature_labels is not None:
            self.feature_labels = feature_labels
        if dataset is not None:
            self.dataset = dataset

        self.interpreter = Interpretation(self.dataset,
                                          feature_names=self.feature_labels)

        def predict_func(x):
            prediction = self.predictive_model.learner.predict(x, self.target)
            return prediction[self.target + " predicted"].values

        self.model = InMemoryModel(
            predict_func, examples=self.dataset
        )
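
A hypothetical continuation using the attributes the constructor builds; the enclosing class name is not shown in the snippet, so explainer stands in for an instance:

# Hypothetical: global feature importance through the stored interpreter/model pair.
plots = explainer.interpreter.feature_importance.plot_feature_importance(
    explainer.model, ascending=True)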
Example #25
def run_explanations(csv_path, csv_columns, target_column, zero_value):
    # Read the dataset from the provided CSV and print out information about it.
    df = pd.read_csv(csv_path,
                     names=csv_columns,
                     skipinitialspace=True,
                     skiprows=1)
    #df = df.drop('Target',axis=1)
    input_features = [name for name in csv_columns if name != target_column]
    #data, labels = shap.datasets.adult(display=True)
    if target_column not in csv_columns:
        print("target column error")
        return ("target column error")
    elif zero_value not in df[target_column].tolist():
        if str.isdecimal(zero_value) and (
                np.int64(zero_value) in df[target_column].tolist()
                or np.float64(zero_value) in df[target_column].tolist()):
            print("happy")
            zero_value = np.int64(zero_value)
        else:
            print(zero_value, df[target_column].tolist(),
                  df[target_column].dtype)
            return ("zero value error")

    labels = df[target_column].tolist()
    #labels = np.array([int(label) for label in labels])
    labels2 = []
    for label in labels:
        if label == zero_value:
            labels2.append(0)
        else:
            labels2.append(1)
    labels = np.array(labels2)

    data = df[input_features].copy()  # copy so the dtype conversions below do not modify a view of df

    for feature in input_features:
        if data[feature].dtype not in (np.dtype(np.int64),
                                       np.dtype(np.float64),
                                       np.dtype(np.float32)):
            data[feature] = data[feature].astype('category')

    cat_cols = data.select_dtypes(['category']).columns
    data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)

    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42)

    data_disp, labels_disp = shap.datasets.adult(display=True)
    X_train_disp, X_test_disp, y_train_disp, y_test_disp = train_test_split(
        data_disp, labels_disp, test_size=0.3, random_state=42)

    xgc = xgb.XGBClassifier(n_estimators=500,
                            max_depth=5,
                            base_score=0.5,
                            objective='binary:logistic',
                            random_state=42)
    xgc.fit(X_train, y_train)
    predictions = xgc.predict(X_test)

    fig = plt.figure(figsize=(16, 12))
    title = fig.suptitle("Default Feature Importances from XGBoost",
                         fontsize=14)

    ax1 = fig.add_subplot(2, 2, 1)
    xgb.plot_importance(xgc, importance_type='weight', ax=ax1)
    t = ax1.set_title("Feature Importance - Feature Weight")

    ax2 = fig.add_subplot(2, 2, 2)
    xgb.plot_importance(xgc, importance_type='gain', ax=ax2)
    t = ax2.set_title("Feature Importance - Split Mean Gain")

    ax3 = fig.add_subplot(2, 2, 3)
    xgb.plot_importance(xgc, importance_type='cover', ax=ax3)
    t = ax3.set_title("Feature Importance - Sample Coverage")

    #plt.savefig('static/explanations.png')

    explanation = eli5.explain_weights(xgc.get_booster())
    explanation_html = eli5.formatters.html.format_as_html(explanation)
    print(explanation_html)

    with open("templates/explanation.html", "a+") as file:
        file.write(explanation_html)

    doc_num = 0
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    #eli5.show_prediction(xgc.get_booster(), X_test.iloc[doc_num],
    #                     feature_names=list(data.columns) ,show_feature_values=True)
    explanation2 = eli5.explain_prediction(xgc.get_booster(),
                                           X_test.iloc[doc_num],
                                           feature_names=list(data.columns))
    explanation_html2 = eli5.formatters.html.format_as_html(explanation2)
    with open("templates/explanation.html", "a") as file:
        file.write(explanation_html2)

    doc_num = 2
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    #eli5.show_predicon(xgc.get_booster(), X_test.iloc[doc_num], feature_names=list(data.columns) ,show_feature_values=True)
    explanation3 = eli5.explain_prediction(xgc.get_booster(),
                                           X_test.iloc[doc_num],
                                           feature_names=list(data.columns))
    explanation_html3 = eli5.formatters.html.format_as_html(explanation3)
    with open("templates/explanation.html", "a") as file:
        file.write(explanation_html3)

    #target_names = ['$50K or less', 'More than $50K']
    interpreter = Interpretation(training_data=X_test,
                                 training_labels=y_test,
                                 feature_names=list(data.columns))
    im_model = InMemoryModel(xgc.predict_proba, examples=X_train)

    plots = interpreter.feature_importance.plot_feature_importance(
        im_model, ascending=True, n_samples=23000)

    plots[0].savefig('skater.png')

    features_pdp = input_features

    xgc_np = xgb.XGBClassifier(n_estimators=500,
                               max_depth=5,
                               base_score=0.5,
                               objective='binary:logistic',
                               random_state=42)
    xgc_np.fit(X_train.values, y_train)

    from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer

    exp = LimeTabularExplainer(X_test.values,
                               feature_names=list(data.columns),
                               discretize_continuous=True)

    doc_num = 0
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    instance = exp.explain_instance(X_test.iloc[doc_num].values,
                                    xgc_np.predict_proba)
    instance.save_to_file('templates/lime.html', show_all=False)

    doc_num = 2
    print('Actual Label:', y_test[doc_num])
    print('Predicted Label:', predictions[doc_num])
    instance2 = exp.explain_instance(X_test.iloc[doc_num].values,
                                     xgc_np.predict_proba)
    instance2.save_to_file('templates/lime2.html', show_all=False)

    explainer = shap.TreeExplainer(xgc)
    shap_values = explainer.shap_values(X_test)
    pd.DataFrame(shap_values).head()

    #shap.force_plot(explainer.expected_value, shap_values[:,], X_test_disp.iloc[:,],show=False,matplotlib=True)
    #plt.savefig("static/force_plot.png")

    shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
    plt.savefig("static/summary_plot.png")

    shap.summary_plot(shap_values, X_test, show=False)
    plt.savefig("static/summary_plot2.png")

    return "Everyone Happy"
Example #26
# (truncated) The snippet opens partway through a model-construction call; only
# its trailing keyword arguments survive: n_classes, nlayers, nneurons,
# dropout_rate, l2_norm, loss, batch_size=256, epochs=35, verbose=1.

## Applying Model Agnostic Interpretation to Ensemble Models
# source:
#   - https://github.com/datascienceinc/Skater/blob/master/examples/ensemble_model.ipynb

interpreter = Interpretation(X_test, feature_names=features)

estimator = binary_pipe['kerasclassifier']
estimator.fit(X_train, y_train)

model = InMemoryModel(estimator.predict_proba, 
                      examples=X_train)

# Model-agnostic Variable Importance for global interpretation
plots = interpreter.feature_importance.plot_feature_importance(model, ascending=True)

# Use partial dependence to understand the relationship between a variable and a model's predictions
model = InMemoryModel(estimator.predict_proba,
                      examples=X_test,
                      #unique_values=model.classes_
                      target_names=list(set(y_train)))
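
The comment above says partial dependence comes next; a plausible continuation with the probability-output model (grid settings are our choice):

# Hypothetical: partial dependence of the first feature for the ensemble model.
interpreter.partial_dependence.plot_partial_dependence(
    [features[0]], model, grid_resolution=30, n_samples=500)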
Example #27
class TestFeatureImportance(unittest.TestCase):
    def setUp(self):
        args = create_parser().parse_args()
        debug = args.debug
        self.seed = args.seed
        self.n = args.n
        self.dim = args.dim
        self.features = [str(i) for i in range(self.dim)]
        self.X = norm.rvs(0,
                          1,
                          size=(self.n, self.dim),
                          random_state=self.seed)
        self.B = np.array([-10.1, 2.2, 6.1])
        self.y = np.dot(self.X, self.B)
        self.y_as_int = np.round(expit(self.y))
        self.y_as_string = np.array([str(i) for i in self.y_as_int])
        # example dataset for y = B.X
        # X = array([[ 1.62434536, -0.61175641, -0.52817175], ... [-0.15065961, -1.40002289, -1.30106608]])  (1000 * 3)
        # B = array([-10.1,   2.2,   6.1])
        # y = array([ -2.09736000e+01,  -1.29850618e+00,  -1.73511155e+01, ...]) (1000 * 1)
        # features = ['0', '1', '2']
        ##
        # Other output types:
        # y_as_int = array[ 0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1., ...]
        # y_as_string = array['0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', ... ]

        # Another set of input
        # sample data
        self.sample_x = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2],
                                  [2, 1]])
        self.sample_y = np.array([-1, -1, -1, 1, 1, 1])
        self.sample_feature_name = [
            str(i) for i in range(self.sample_x.shape[1])
        ]

        if debug:
            self.interpreter = Interpretation(log_level='DEBUG')
        else:
            self.interpreter = Interpretation()  # default level is 'WARNING'
        self.interpreter.load_data(self.X, feature_names=self.features)

        self.regressor = LinearRegression()
        self.regressor.fit(self.X, self.y)
        self.regressor_predict_fn = InMemoryModel(self.regressor.predict,
                                                  examples=self.X)

        self.classifier = LogisticRegression()
        self.classifier.fit(self.X, self.y_as_int)
        self.classifier_predict_fn = InMemoryModel(
            self.classifier.predict,
            examples=self.X,
            unique_values=self.classifier.classes_)
        self.classifier_predict_proba_fn = InMemoryModel(
            self.classifier.predict_proba, examples=self.X)

        self.string_classifier = LogisticRegression()
        self.string_classifier.fit(self.X, self.y_as_string)
        self.string_classifier_predict_fn = InMemoryModel(
            self.string_classifier.predict_proba, examples=self.X)

    @staticmethod
    def feature_column_name_formatter(columnname):
        return "feature: {}".format(columnname)

    def test_feature_importance(self):
        importances = self.interpreter.feature_importance.feature_importance(
            self.regressor_predict_fn, n_jobs=1, progressbar=False)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

        importances = self.interpreter.feature_importance.feature_importance(
            self.regressor_predict_fn, n_jobs=2, progressbar=False)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

    def test_feature_importance_progressbar(self):
        importances = self.interpreter.feature_importance.feature_importance(
            self.regressor_predict_fn, progressbar=True)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

    def test_feature_importance_entropy_with_and_without_scaling(self):
        importances = self.interpreter.feature_importance.feature_importance(
            self.regressor_predict_fn, progressbar=True, use_scaling=True)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

        importances = self.interpreter.feature_importance.feature_importance(
            self.regressor_predict_fn, progressbar=True, use_scaling=False)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

    def test_feature_importance_regression_via_performance_decrease(self):
        interpreter = Interpretation(self.X,
                                     feature_names=self.features,
                                     training_labels=self.y)
        importances = interpreter.feature_importance.feature_importance(
            self.regressor_predict_fn,
            method='conditional-permutation',
            use_scaling=False)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

        importances = interpreter.feature_importance.feature_importance(
            self.regressor_predict_fn,
            method='conditional-permutation',
            use_scaling=True)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

    def test_feature_importance_classifier_via_performance_decrease(self):
        interpreter = Interpretation(self.X,
                                     feature_names=self.features,
                                     training_labels=self.y_as_int)
        importances = interpreter.feature_importance.feature_importance(
            self.classifier_predict_fn,
            method='conditional-permutation',
            use_scaling=False)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

        importances = interpreter.feature_importance.feature_importance(
            self.classifier_predict_fn,
            method='conditional-permutation',
            use_scaling=True)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

    def test_feature_importance_classifier_proba_via_performance_decrease(
            self):
        interpreter = Interpretation(self.X,
                                     feature_names=self.features,
                                     training_labels=self.y_as_int)
        importances = interpreter.feature_importance.feature_importance(
            self.classifier_predict_proba_fn,
            method='conditional-permutation',
            use_scaling=False)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

        importances = interpreter.feature_importance.feature_importance(
            self.classifier_predict_proba_fn,
            method='conditional-permutation',
            use_scaling=True)
        self.assertEqual(np.isclose(importances.sum(), 1), True)

    def test_plot_feature_importance(self):
        self.interpreter.feature_importance.plot_feature_importance(
            self.regressor_predict_fn)
Example #28
        # (truncated) The snippet opens mid-statement: fold data sliced into train/test by index.
        dummy.fit(train_data, train_target)
        prediction = dummy.predict(test_data)
        print("Dummy prediction")
        print(classification_report(test_target, prediction))
        for model, title in zip(models, titles):
            clf = model.fit(train_data, train_target)
            prediction = clf.predict(test_data)
            print(f"{title}")
            print(classification_report(test_target, prediction))
            print(
                f"Confusion Matrix: \n {confusion_matrix(test_target, prediction)}"
            )

            # ax = axs[modelno - 1, fold - 1]
            interpreter = Interpretation(test_data,
                                         feature_names=featureNames[1:9])
            # model_no_proba = InMemoryModel(model.predict, examples=test_data, unique_values=model.classes_)
            pyint_model = InMemoryModel(
                model.predict_proba,
                examples=test_data,
                target_names=["CYT", "ME3", "MIT", "NUC"])
            # interpreter.feature_importance.plot_feature_importance(pyint_model, ascending=False, ax=ax,
            #                                                        progressbar=False)
            # ax.set_title(f"{title} on fold {fold}")
            # print("\n")

            # To avoid clutter, plots are produced only for gradient boosting and a single fold
            if fold == 2 and modelno == 5:
                # Plot PDPs of the variable "alm", the most important feature for 3 of the 4 models
                # ("alm" is not the most important feature for Gaussian Naive Bayes, though, which is worth explaining)
                # For other variables, just change the name
Example #29
class TestPartialDependence(unittest.TestCase):

    def setUp(self):
        args = create_parser().parse_args()
        debug = args.debug
        self.seed = args.seed
        self.n = args.n
        self.dim = args.dim
        self.features = [str(i) for i in range(self.dim)]
        self.X = norm.rvs(0, 1, size=(self.n, self.dim), random_state=self.seed)
        self.B = np.array([-10.1, 2.2, 6.1])
        self.y = np.dot(self.X, self.B)
        self.y_as_int = np.round(expit(self.y))
        self.y_as_string = np.array([str(i) for i in self.y_as_int])
        # example dataset for y = B.X
        # X = array([[ 1.62434536, -0.61175641, -0.52817175], ... [-0.15065961, -1.40002289, -1.30106608]])  (1000 * 3)
        # B = array([-10.1,   2.2,   6.1])
        # y = array([ -2.09736000e+01,  -1.29850618e+00,  -1.73511155e+01, ...]) (1000 * 1)
        # features = ['0', '1', '2']
        ##
        # Other output types:
        # y_as_int = array[ 0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1., ...]
        # y_as_string = array['0.0', '0.0', '0.0', '0.0', '1.0', '1.0', '0.0', '0.0', '0.0', ... ]


        # Another set of input
        # sample data
        self.sample_x = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
        self.sample_y = np.array([-1, -1, -1, 1, 1, 1])
        self.sample_feature_name = [str(i) for i in range(self.sample_x.shape[1])]

        if debug:
            self.interpreter = Interpretation(log_level='DEBUG')
        else:
            self.interpreter = Interpretation()  # default level is 'WARNING'
        self.interpreter.load_data(self.X, feature_names=self.features)

        self.regressor = LinearRegression()
        self.regressor.fit(self.X, self.y)
        self.regressor_predict_fn = InMemoryModel(self.regressor.predict, examples=self.X)

        self.classifier = LogisticRegression()
        self.classifier.fit(self.X, self.y_as_int)
        self.classifier_predict_fn = InMemoryModel(self.classifier.predict, examples=self.X, unique_values=self.classifier.classes_)
        self.classifier_predict_proba_fn = InMemoryModel(self.classifier.predict_proba, examples=self.X)

        self.string_classifier = LogisticRegression()
        self.string_classifier.fit(self.X, self.y_as_string)
        self.string_classifier_predict_fn = InMemoryModel(self.string_classifier.predict_proba, examples=self.X)


        # Yet another set of input!!
        self.sample_x_categorical = np.array([['B', -1], ['A', -1], ['A', -2], ['C', 1], ['C', 2], ['A', 1]])
        self.sample_y_categorical = np.array(['A', 'A', 'A', 'B', 'B', 'B'])
        self.categorical_feature_names = ['Letters', 'Numbers']
        self.categorical_transformer = MultiColumnLabelBinarizer()
        self.categorical_transformer.fit(self.sample_x_categorical)
        self.sample_x_categorical_transformed = self.categorical_transformer.transform(self.sample_x_categorical)
        self.categorical_classifier = LogisticRegression()
        self.categorical_classifier.fit(self.sample_x_categorical_transformed, self.sample_y_categorical)
        self.categorical_predict_fn = lambda x: self.categorical_classifier.predict_proba(self.categorical_transformer.transform(x))
        self.categorical_model = InMemoryModel(self.categorical_predict_fn, examples=self.sample_x_categorical)


    def test_pdp_with_default_sampling(self):
        pdp_df = self.interpreter.partial_dependence.partial_dependence([self.features[0]],
                                                                        self.regressor_predict_fn,
                                                                        sample=True)
        self.assertEqual(pdp_df.shape, (30, 3))  # default grid resolution is 30

    def test_pd_with_categorical_features(self):
        interpreter = Interpretation(self.sample_x_categorical, feature_names=self.categorical_feature_names)
        try:
            interpreter.partial_dependence.partial_dependence([self.categorical_feature_names[0]], self.categorical_model)
        except Exception:
            self.fail("PD computation function failed with categorical features")
        try:
            interpreter.partial_dependence.plot_partial_dependence([self.categorical_feature_names], self.categorical_model)
        except Exception:
            self.fail("PDP plotting function failed with categorical features")



    def test_partial_dependence_binary_classification(self):
        # sklearn's default PDP implementation approximates the grid when a feature has
        # fewer unique values than the specified grid_resolution. We have decided not to
        # apply that approximation for now; we will revisit it in V2, when we also
        # benchmark performance.
        # Reference: https://github.com/scikit-learn/scikit-learn/blob/4d9a12d175a38f2bcb720389ad2213f71a3d7697/sklearn/ensemble/tests/test_partial_dependence.py
        # TODO: check on the feature space approximation (V2)
        # Test partial dependence for classifier
        clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
        clf.fit(self.sample_x, self.sample_y)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=self.sample_x)
        interpreter = Interpretation()
        interpreter.load_data(np.array(self.sample_x), self.sample_feature_name)
        pdp_df = interpreter.partial_dependence.partial_dependence(['0'],
                                                                   classifier_predict_fn,
                                                                   grid_resolution=5,
                                                                   sample=True)

        self.assertEqual(pdp_df.shape[0], len(np.unique(interpreter.data_set['0'])))

        # now with our own grid
        ud_grid = np.unique(self.sample_x[:, 0])
        # input: array([-2, -1,  1,  2])
        # the returned grid should have only 4 values as specified by the user
        pdp_df = interpreter.partial_dependence.partial_dependence(['0'], classifier_predict_fn,
                                                                   grid=ud_grid, sample=True)
        self.assertEqual(pdp_df.shape[0], 4)


    def test_partial_dependence_multiclass(self):
        # Iris data classes: ['setosa', 'versicolor', 'virginica']
        iris = datasets.load_iris()
        # 1. Using GB Classifier
        clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
        clf.fit(iris.data, iris.target)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=iris.data)
        interpreter = Interpretation()
        interpreter.load_data(iris.data, iris.feature_names)
        pdp_df = interpreter.partial_dependence.partial_dependence([iris.feature_names[0]], classifier_predict_fn,
                                                                   grid_resolution=25, sample=True)

        expected_feature_name = PartialDependence.feature_column_name_formatter('sepal length (cm)')

        self.assertIn(expected_feature_name,
                      pdp_df.columns.values,
                      "{0} not in columns {1}".format(expected_feature_name,
                                                      pdp_df.columns.values))
        # 2. Using SVC
        from sklearn import svm
        # With SVC, predict_proba is supported only if probability flag is enabled, by default it is false
        clf = svm.SVC(probability=True)
        clf.fit(iris.data, iris.target)
        classifier_predict_fn = InMemoryModel(clf.predict_proba, examples=iris.data)
        interpreter = Interpretation()
        interpreter.load_data(iris.data, iris.feature_names)
        pdp_df = interpreter.partial_dependence.partial_dependence([iris.feature_names[0]], classifier_predict_fn,
                                                                   grid_resolution=25, sample=True)
        self.assertIn(expected_feature_name,
                      pdp_df.columns.values,
                      "{} not in columns {}".format(*[expected_feature_name,
                                                      pdp_df.columns.values]))




    def test_pdp_regression_coefs_closeness(self, epsilon=1):
        pdp_df = self.interpreter.partial_dependence.partial_dependence([self.features[0]],
                                                                        self.regressor_predict_fn)
        val_col = PartialDependence.feature_column_name_formatter(self.features[0])

        y = np.array(pdp_df[self.regressor_predict_fn.target_names[0]])
        x = np.array(pdp_df[val_col])[:, np.newaxis]
        regressor = LinearRegression()
        regressor.fit(x, y)
        self.interpreter.logger.debug("Regressor coefs: {}".format(regressor.coef_))
        self.interpreter.logger.debug("Regressor coef shape: {}".format(regressor.coef_.shape))
        coef = regressor.coef_[0]
        self.assertTrue(abs(coef - self.B[0]) < epsilon)


    def test_pdp_inputs(self):
        clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
        clf.fit(self.sample_x, self.sample_y)
        interpreter = Interpretation()
        self.assertRaisesRegex(Exception, "Invalid Data", interpreter.load_data, None, self.sample_feature_name)


    def test_2D_pdp(self):
        try:
            self.interpreter.partial_dependence.partial_dependence(self.features[:2],
                                                                   self.regressor_predict_fn,
                                                                   grid_resolution=10,
                                                                   sample=True)
        except Exception:
            self.fail("2D regressor pd failed")


    def test_plot_1D_pdp(self):
        try:
            self.interpreter.partial_dependence.plot_partial_dependence([self.features[0]],
                                                                        self.regressor_predict_fn,
                                                                        grid_resolution=10)
        except Exception:
            self.fail("1D regressor plot failed")


    def test_plot_1D_pdp_with_sampling(self):
        try:
            self.interpreter.partial_dependence.plot_partial_dependence(
                [self.features[0]],
                self.regressor_predict_fn,
                grid_resolution=10,
                sample=True)
        except Exception:
            self.fail("1D classifier plot with sampling failed")


    def test_plot_2D_pdp(self):
        try:
            self.interpreter.partial_dependence.plot_partial_dependence(self.features[:2],
                                                                        self.regressor_predict_fn,
                                                                        grid_resolution=10,
                                                                        sample=False)
        except Exception:
            self.fail("2D partial dep plot failed")

    def test_plot_2D_pdp_with_sampling(self):
        try:
            self.interpreter.partial_dependence.plot_partial_dependence(self.features[:2],
                                                                        self.regressor_predict_fn,
                                                                        grid_resolution=10,
                                                                        sample=True)
        except Exception:
            self.fail("2D regressor with sampling failed")


    def test_fail_when_grid_range_is_outside_0_and_1(self):
        pdp_func = partial(self.interpreter.partial_dependence.partial_dependence,
                           *[[self.features[0]], self.regressor_predict_fn],
                           **{'grid_range': (.01, 1.01)})
        self.assertRaises(exceptions.MalformedGridRangeError, pdp_func)


    def test_pdp_1d_classifier_no_proba(self):
        try:
            self.interpreter.partial_dependence.partial_dependence(self.features[:1],
                                                                   self.classifier_predict_fn,
                                                                   grid_resolution=10)
        except Exception:
            self.fail("1D pdp without proba failed")


    def test_pdp_2d_classifier_no_proba(self):
        try:
            self.interpreter.partial_dependence.partial_dependence(self.features[:2],
                                                                   self.classifier_predict_fn,
                                                                   grid_resolution=10)
        except Exception:
            self.fail("2D pdp without proba failed")


    def test_pdp_1d_classifier_with_proba(self):
        try:
            self.interpreter.partial_dependence.partial_dependence(self.features[:1],
                                                                   self.classifier_predict_proba_fn,
                                                                   grid_resolution=10)
        except Exception:
            self.fail("1D classifier with probability scores failed")


    def test_pdp_2d_classifier_with_proba(self):
        try:
            self.interpreter.partial_dependence.partial_dependence(self.features[:2],
                                                                   self.classifier_predict_proba_fn,
                                                                   grid_resolution=10)
        except Exception:
            self.fail("2D classifier with probability scores failed")


    def test_pdp_1d_string_classifier_no_proba(self):
        def fail_func():
            self.interpreter.partial_dependence.partial_dependence(self.features[:1],
                                                                   InMemoryModel(self.string_classifier.predict,
                                                                                 examples=self.X),
                                                                   grid_resolution=10)
        self.assertRaises(exceptions.ModelError, fail_func)


    def test_pdp_1d_string_classifier_with_proba(self):
        try:
            self.interpreter.partial_dependence.partial_dependence(self.features[:1],
                                                                   self.string_classifier_predict_fn,
                                                                   grid_resolution=10)
        except Exception:
            self.fail('1D string classifier pd failed')


    def test_pd_with_long_string_feature_name_issue_166(self):

        feature_names = ['longstring_{}'.format(i) for i in range(self.X.shape[1])]
        interpreter = Interpretation(self.X, feature_names=feature_names)
        try:
            interpreter.partial_dependence.partial_dependence(feature_names[0],
                                                              self.regressor_predict_fn)
        except Exception:
            self.fail("1D Partial dependence failed when passing long string name")
Example #30
# In[45]:

meu.display_model_performance_metrics(true_labels=wtp_test_y,
                                      predicted_labels=wtp_dnn_predictions,
                                      classes=['red', 'white'])

# # Model Interpretation

# ## View Feature importances

# In[14]:

from skater.core.explanations import Interpretation
from skater.model import InMemoryModel

wtp_interpreter = Interpretation(wtp_test_SX,
                                 feature_names=wtp_features.columns)
wtp_im_model = InMemoryModel(wtp_lr.predict_proba,
                             examples=wtp_train_SX,
                             target_names=wtp_lr.classes_)
plots = wtp_interpreter.feature_importance.plot_feature_importance(
    wtp_im_model, ascending=False)

# ## View model ROC curve

# In[15]:

meu.plot_model_roc_curve(wtp_lr, wtp_test_SX, wtp_test_y)

# ## Visualize Model Decision Surface

# In[59]: