Пример #1
0
 def test_generate_grid_1_variable(self):
     """Ensure generate_grid yields a grid of length 1 for a single feature."""
     data_set = DataManager(self.X,
                            feature_names=self.feature_names,
                            index=self.index)
     grid = data_set.generate_grid(data_set.feature_ids[0:1],
                                   grid_resolution=100)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(len(grid), 1)
Пример #2
0
 def test_generate_grid_2_variables(self):
     """Ensure generate_grid yields a grid of length 2 for two features."""
     data_set = DataManager(self.X,
                            feature_names=self.feature_names,
                            index=self.index)
     grid = data_set.generate_grid(self.feature_names[0:2],
                                   grid_resolution=100)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(len(grid), 2)
Пример #3
0
 def test_datamanager_data_returns_original(self):
     """
     Check that the ``data`` attribute of a DataManager equals the array
     it was constructed from.
     """
     manager = DataManager(self.X, feature_names=self.feature_names, index=self.index)
     stored = manager.data
     assert_array_equal(stored, self.X)
Пример #4
0
    def test_pandas_dataframe(self):
        """
        Verify that a DataManager built directly from a pd.DataFrame exposes
        the frame's column names, index and values.
        """
        frame = pd.DataFrame(self.X, columns=self.feature_names, index=self.index)
        manager = DataManager(frame, log_level=self.log_level)

        names_msg = "Feature Names from DataFrame not loaded properly"
        index_msg = "Index from DataFrame not loaded properly"

        assert manager.feature_ids == self.feature_names, names_msg
        assert manager.index == self.index, index_msg
        assert_array_equal(manager.data, self.X)
Пример #5
0
    def test_2d_numpy_array(self):
        """
        Verify that DataManager accepts a 2D np.ndarray and keeps both its
        values and the supplied feature names.
        """
        columns = [0, 1]
        subset = self.X[:, columns]
        expected_names = [self.feature_names[col] for col in columns]

        manager = DataManager(subset, feature_names=expected_names,
                              index=self.index, log_level=self.log_level)

        assert_array_equal(manager.data, subset)
        assert manager.feature_ids == expected_names
Пример #6
0
    def test_1d_numpy_array(self):
        """
        Verify that DataManager accepts a single feature column (one column
        of X with a trailing axis added) and keeps its values and name.
        """
        column_id = 0
        column = self.X[:, column_id]
        # add a trailing axis so the single feature is passed as a column
        single_feature = column[:, np.newaxis]
        expected_names = [self.feature_names[column_id]]

        manager = DataManager(single_feature, feature_names=expected_names,
                              index=self.index, log_level=self.log_level)

        assert_array_equal(manager.data, single_feature)
        assert manager.feature_ids == expected_names
Пример #7
0
    def test_pandas_dataframe(self):
        """
        Ensure DataManager(data) works when data is pd.DataFrame.

        Checks that feature ids, index and ``X`` match the inputs, and that
        every feature name is present in ``feature_info``.
        """
        X_as_dataframe = pd.DataFrame(self.X, columns=self.feature_names, index=self.index)
        data_set = DataManager(X_as_dataframe,
                               log_level=self.log_level,
                               index=self.index,
                               feature_names=self.feature_names)

        assert data_set.feature_ids == self.feature_names, "Feature Names from DataFrame " \
                                                           "not loaded properly"
        assert data_set.index == self.index, "Index from DataFrame not loaded properly"
        assert_array_equal(data_set.X, self.X)

        # No try/except wrapper here: the previous KeyError handler formatted
        # the loop variable, which is unbound if reading feature_info itself
        # raised, masking the real error. Any exception now surfaces with its
        # own traceback, and a missing feature is reported by the assert.
        feature_info = data_set.feature_info
        for feature in self.feature_names:
            assert feature in feature_info, "{:} not found in feature_info".format(feature)
Пример #8
0
    def load_data(self,
                  training_data,
                  training_labels=None,
                  feature_names=None,
                  index=None):
        """
        Wrap the inputs in a DataManager and store it on ``self.data_set``,
        logging the resulting shape and feature ids.

        Parameters
        ----------
        training_data: numpy.ndarray, pandas.DataFrame
            the dataset. can be 1D or 2D

        training_labels: optional
            forwarded to DataManager as ``y``.

        feature_names: array-type
            names to call features.

        index: array-type
            names to call rows.


        Returns
        --------
            None
        """

        self.logger.info("Loading Data")

        manager_options = dict(y=training_labels,
                               feature_names=feature_names,
                               index=index,
                               log_level=self._log_level)
        self.data_set = DataManager(training_data, **manager_options)
        self.logger.info("Data loaded")

        shape_msg = "Data shape: {}".format(self.data_set.X.shape)
        self.logger.info(shape_msg)

        ids_msg = "Dataset Feature_ids: {}".format(self.data_set.feature_ids)
        self.logger.info(ids_msg)
Пример #9
0
    def fit(self,
            X,
            Y,
            use_oracle=True,
            prune='post',
            cv=5,
            n_iter_search=10,
            scorer_type='default',
            n_jobs=1,
            param_grid=None,
            impurity_threshold=0.01,
            verbose=False):
        """ Learn an approximate representation by constructing a Decision Tree based on the results retrieved by
        querying the Oracle(base model). Instances used for training should belong to the base learners instance space.

        Parameters
        ----------
        X : numpy.ndarray, pandas.DataFrame
            Training input samples
        Y : numpy.ndarray
            Target values(ground truth)
        use_oracle : bool (default=True)
            Use of Oracle, helps the Surrogate model train on the decision boundaries learned by the base model.
            The closer the surrogate model is to the Oracle, more faithful are the explanations.

              - True, builds a surrogate model against the predictions of the base model(Oracle).
              - False, learns an interpretable tree based model using the supplied training examples and ground truth.

        prune : None, str (default="post")
            Pruning is a useful technique to control the complexity of the tree (keeping the trees comprehensive
            and interpretable) without compromising on model's accuracy. Avoiding to build large and deep trees
            also helps in preventing over-fitting.

              - None
              No pruning; the tree is fit directly.

              - "pre"
              Also known as forward/online pruning. This pruning process uses a termination
              condition(high and low thresholds) to prematurely terminate some of the branches and nodes.
              Cross Validation is applied to measure the goodness of the fit while the tree is pruned.

              - "post"
              Also known as backward pruning. The pruning process is applied post the construction of the
              tree using the specified model parameters. This involves reducing the branches and nodes using
              a cost function. The current implementation support cost optimization using
              Model's scoring metrics(e.g. r2, log-loss, f1, ...).
              Note: any value other than None or "pre" is handled as post pruning.

        cv : int, (default=5)
            Randomized cross validation used only for 'pre-pruning' right now.
        n_iter_search : int (default=10)
            Number of parameter setting combinations that are sampled for pre-pruning.
        scorer_type : str (default="default")
            Passed to the pruning routines and to ``self.oracle.scorers.get_scorer_function`` to select
            the metric used for scoring the oracle and the surrogate.
        n_jobs : int (default=1)
            Number of jobs to run in parallel.
        param_grid : dict
            Dictionary of parameters to specify the termination condition for pre-pruning.
        impurity_threshold : float (default=0.01)
            Specifies acceptable performance drop when using Tree based surrogates to replicate the decision policies
            learned by the Oracle
        verbose : bool (default=False)
            Helps control the verbosity.

        Returns
        -------
        impurity_score : float
            ``oracle_score - surrogate_score`` rounded to 3 decimals. A warning is logged when it exceeds
            ``impurity_threshold``.

        References
        ----------
        .. [1] Nikita Patel and Saurabh Upadhyay(2012)
               Study of Various Decision Tree Pruning Methods with their Empirical Comparison in WEKA
               (https://pdfs.semanticscholar.org/025b/8c109c38dc115024e97eb0ede5ea873fffdb.pdf)
        """

        if verbose:
            self.logger.setLevel(_DEBUG)
        else:
            self.logger.setLevel(_INFO)
        # DataManager does type checking as well
        dm = DataManager(X, Y)
        X, Y = dm.X, dm.y
        # Below is an anti-pattern but had to use it. Should fix it in the long term
        # The oracle is queried even when use_oracle is False because its
        # predictions are needed further down to compute the oracle score.
        y_hat_original = self.oracle._execute(X)

        # TODO: Revisit the check on using probability or class labels
        if use_oracle and self.oracle.probability:
            y_train = np.array(list(map(np.argmax, y_hat_original)))
        elif use_oracle:
            y_train = y_hat_original
        else:
            # this is when y_train is being passed and the desire is to build an interpretable tree based model
            y_train = Y

        if prune is None:
            self.logger.info("No pruning applied ...")
            self.__model.fit(X, y_train)
        elif prune == 'pre':
            # apply randomized cross validation for pruning
            self.logger.info("pre pruning applied ...")
            self._pre_pruning(X, y_train, scorer_type, cv, n_iter_search,
                              n_jobs, param_grid, verbose)
        else:
            self.logger.info("post pruning applied ...")
            # Since, this is post pruning, we first learn a model
            # and then try to prune the tree controling the model's score using the impurity_threshold
            self._post_pruning(X,
                               y_train,
                               scorer_type,
                               impurity_threshold,
                               needs_prob=self.oracle.probability)
        y_hat_surrogate = self.__pred_func(X, self.oracle.probability)
        self.logger.info(
            'Done generating prediction using the surrogate, shape {}'.format(
                y_hat_surrogate.shape))

        # Default metrics:
        # {Classification: if probability score used --> cross entropy(log-loss) else --> F1 score}
        # {Regression: Mean Absolute Error (MAE)}
        scorer = self.oracle.scorers.get_scorer_function(
            scorer_type=scorer_type)
        self.__scorer_name = scorer.name

        oracle_score = round(scorer(Y, y_hat_original), 3)
        # Since surrogate model is build against the base model's(Oracle's) predicted
        # behavior y_true=y_train
        surrogate_score = round(scorer(y_train, y_hat_surrogate), 3)
        self.logger.info(
            'Done scoring, surrogate score {}; oracle score {}'.format(
                surrogate_score, oracle_score))

        impurity_score = round(oracle_score - surrogate_score, 3)
        # Compare against the threshold supplied to this call, i.e. the same
        # value that is reported in the warning and handed to post pruning.
        # Previously the check read self.impurity_threshold, so the warning
        # could quote a threshold that was never actually applied.
        if impurity_score > impurity_threshold:
            self.logger.warning(
                'impurity score: {} of the surrogate model is higher than the impurity threshold: {}. '
                'The higher the impurity score, lower is the fidelity/faithfulness '
                'of the surrogate model'.format(impurity_score,
                                                impurity_threshold))
        return impurity_score
Пример #10
0
 def test_generate_grid_2_variables(self):
     """Ensure generate_grid yields a grid of length 2 for two features."""
     data_set = DataManager(self.X, feature_names=self.feature_names, index=self.index)
     grid = data_set.generate_grid(self.feature_names[0:2], grid_resolution=100)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(len(grid), 2)
Пример #11
0
 def test_generate_grid_1_variable(self):
     """Ensure generate_grid yields a grid of length 1 for a single feature."""
     data_set = DataManager(self.X, feature_names=self.feature_names, index=self.index)
     grid = data_set.generate_grid(data_set.feature_ids[0:1], grid_resolution=100)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(len(grid), 1)
    def partial_dependence(self,
                           feature_ids,
                           modelinstance,
                           filter_classes=None,
                           grid=None,
                           grid_resolution=30,
                           n_jobs=-1,
                           grid_range=None,
                           sample=True,
                           sampling_strategy='random-choice',
                           n_samples=1000,
                           bin_count=50,
                           return_metadata=False,
                           progressbar=True,
                           variance_type='estimate'):
        """
        Approximates the partial dependence of the predict_fn with respect to the
        variables passed.

        Parameters:
        -----------
        feature_ids: list
            the names/ids of the features for which partial dependence is to be computed.
            Note that the algorithm's complexity scales exponentially with additional
            features, so generally one should only look at one or two features at a
            time. These feature ids must be available in the class's associated DataSet.
            As of now, we only support looking at 1 or 2 features at a time.
        modelinstance: skater.model.model.Model subtype
            an estimator function of a fitted model used to derive prediction.
            Supports classification(binary, multi-class) and regression.
            predictions = predict_fn(data)

            Can either by a skater.model.remote.DeployedModel or a
            skater.model.local.InMemoryModel
        filter_classes: array type
            The classes to run partial dependence on. Default None invokes all classes.
            Only used in classification models. Every member must be in
            modelinstance.target_names.
        grid: numpy.ndarray
            2 dimensional array on which we fix values of features. Note this is
            determined automatically if not given (None or empty) based on the
            percentiles of the dataset. A 1 dimensional grid of strings or numbers
            is reshaped to a single-row 2 dimensional grid.
        grid_resolution: int
            how many unique values to include in the grid. If the percentile range
            is 5% to 95%, then that range will be cut into <grid_resolution>
            equally size bins. Defaults to 30. Overridden by the grid's length when
            a 1 dimensional grid is passed.
        n_jobs: int
            The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
            Defaults to using all cores(-1). With n_jobs=1 the computation runs in
            the current process.
        grid_range: tuple
            the percentile extrama to consider. 2 element tuple, increasing, bounded
            between 0 and 1. Defaults to (0.05, 0.95).
        sample: boolean
            Whether to sample from the original dataset.
        sampling_strategy: string
            If sampling, which approach to take. See DataSet.generate_sample for
            details.
        n_samples: int
            The number of samples to use from the original dataset. Forwarded to
            DataSet.generate_sample; see that method for which strategies use it.
        bin_count: int
            The number of bins to use when using the similarity based sampler.
            Forwarded to DataSet.generate_sample.
        return_metadata: boolean
            If True, the metadata dictionary built for this run is returned
            alongside the results.
        progressbar: boolean
            Whether to display a progress bar over the grid cells. Progress bars
            slow down runs (a warning is logged).
        variance_type: string
            Forwarded to the metadata builder (self._build_metadata_dict).

        Returns
        -------
        pandas.DataFrame
            one row per cell of the expanded (cartesian product) grid. When
            return_metadata is True, a tuple (DataFrame, metadata) is returned.

        Raises
        ------
        exceptions.DataSetNotLoadedError
            if no data set has been loaded.
        exceptions.ModelError
            if modelinstance is not a ModelType, or is a classifier without
            probability scores and without unique_values.
        KeyError
            if any feature id is not in the data set's feature_ids.
        exceptions.MalformedGridRangeError
            if grid_range is not iterable.
        exceptions.MalformedGridError
            if the expanded grid is empty.
        AssertionError
            if filter_classes contains unknown classes, or the sampled data has
            a different type than the data set.

        :Example:
        >>> from skater.model import InMemoryModel
        >>> from skater.core.explanations import Interpretation
        >>> from sklearn.ensemble import RandomForestRegressor
        >>> from sklearn.datasets import load_boston
        >>> boston = load_boston()
        >>> X = boston.data
        >>> y = boston.target
        >>> features = boston.feature_names

        >>> rf = RandomForestRegressor()
        >>> rf.fit(X,y)


        >>> model = InMemoryModel(rf, examples = X)
        >>> interpreter = Interpretation()
        >>> interpreter.load_data(X, feature_names=features)
        >>> feature_ids = ['ZN','CRIM']
        >>> interpreter.partial_dependence.partial_dependence(feature_ids, model)
        """

        if self.data_set is None:
            load_data_not_called_err_msg = "self.interpreter.data_set not found. \n" \
                                           "Please call Interpretation.load_data \n" \
                                           "before running this method."
            raise (
                exceptions.DataSetNotLoadedError(load_data_not_called_err_msg))

        feature_ids = self._check_features(feature_ids)

        if filter_classes:
            err_msg = "members of filter classes must be \n" \
                      "members of modelinstance.classes. \n" \
                      "Expected members of: \n" \
                      "{0}\n" \
                      "got: \n" \
                      "{1}".format(modelinstance.target_names,
                                   filter_classes)
            filter_classes = list(filter_classes)
            assert all([
                i in modelinstance.target_names for i in filter_classes
            ]), err_msg

        # TODO: There might be a better place to do this check
        if not isinstance(modelinstance, ModelType):
            raise (exceptions.ModelError(
                "Incorrect estimator function used for computing partial dependence, try one \n"
                "creating one with skater.model.local.InMemoryModel or \n"
                "skater.model.remote.DeployedModel"))

        if modelinstance.model_type == 'classifier' and modelinstance.probability is False:

            if modelinstance.unique_values is None:
                raise (exceptions.ModelError(
                    'If using classifier without probability scores, unique_values cannot \n'
                    'be None'))
            # Logger.warn is a deprecated alias of Logger.warning
            self.interpreter.logger.warning(
                "Classifiers with probability scores can be explained \n"
                "more granularly than those without scores. If a prediction method with \n"
                "scores is available, use that instead.")

        # TODO: This we can change easily to functional style
        missing_feature_ids = []
        for feature_id in feature_ids:
            if feature_id not in self.data_set.feature_ids:
                missing_feature_ids.append(feature_id)

        if missing_feature_ids:
            missing_feature_id_err_msg = "Features {0} not found in \n" \
                                         "Interpretation.data_set.feature_ids \n" \
                                         "{1}".format(missing_feature_ids, self.data_set.feature_ids)
            raise (KeyError(missing_feature_id_err_msg))

        if grid_range is None:
            grid_range = (.05, 0.95)
        else:
            if not hasattr(grid_range, "__iter__"):
                err_msg = "Grid range {} needs to be an iterable".format(
                    grid_range)
                raise (exceptions.MalformedGridRangeError(err_msg))

        self._check_grid_range(grid_range)

        if not modelinstance.has_metadata:
            # draw a small sample so the model can build its metadata from it
            examples = self.data_set.generate_sample(strategy='random-choice',
                                                     sample=True,
                                                     n_samples=10)

            examples = DataManager(examples,
                                   feature_names=self.data_set.feature_ids)
            modelinstance._build_model_metadata(examples)

        # if you dont pass a grid, build one.
        # "Not passed" means None or empty. The previous truthiness test
        # (np.array(grid).any()) also discarded a user grid whose values were
        # all zero and silently replaced it with a generated one.
        if grid is None or np.size(grid) == 0:
            # Currently, if a given feature has fewer unique values than the value
            # of grid resolution, then the grid will be set to those unique values.
            # Otherwise it will take the percentile
            # range according with grid_resolution bins.
            grid = self.data_set.generate_grid(feature_ids,
                                               grid_resolution=grid_resolution,
                                               grid_range=grid_range)
        else:
            grid = np.array(grid)
            # want to ensure all grids have 2 axes
            if len(grid.shape) == 1 and \
                    (StaticTypes.data_types.is_string(grid[0]) or StaticTypes.data_types.is_numeric(grid[0])):
                grid = grid[:, np.newaxis].T
                grid_resolution = grid.shape[1]

        self.interpreter.logger.debug("Grid shape used for pdp: {}".format(
            grid.shape))
        self.interpreter.logger.debug(
            "Grid resolution for pdp: {}".format(grid_resolution))

        # make sure data_set module is giving us correct data structure
        self._check_grid(grid, feature_ids)

        # generate data
        data_sample = self.data_set.generate_sample(strategy=sampling_strategy,
                                                    sample=sample,
                                                    n_samples=n_samples,
                                                    bin_count=bin_count)

        # message fixed: the original literal was garbled ("origina\nl")
        assert type(data_sample) == self.data_set.data_type, "Something went wrong\n" \
                                                             "Theres a type mismatch between\n" \
                                                             "the sampled data and the original\n" \
                                                             "training set. Check Skater.models\n"

        _pdp_metadata = self._build_metadata_dict(modelinstance, feature_ids,
                                                  self.data_set.feature_ids,
                                                  filter_classes,
                                                  variance_type)

        self.interpreter.logger.debug("Shape of sampled data: {}".format(
            data_sample.shape))
        self.interpreter.logger.debug("Feature Ids: {}".format(feature_ids))
        self.interpreter.logger.debug("PD metadata: {}".format(_pdp_metadata))

        # cartesian product of grid
        grid_expanded = pd.DataFrame(list(product(*grid))).values

        if grid_expanded.shape[0] <= 0:
            empty_grid_expanded_err_msg = "Must have at least 1 pdp value" \
                                          "grid shape: {}".format(grid_expanded.shape)
            raise (exceptions.MalformedGridError(empty_grid_expanded_err_msg))

        predict_fn = modelinstance._get_static_predictor()

        # a negative n_jobs is passed to Pool as None
        n_jobs = None if n_jobs < 0 else n_jobs
        pd_func = functools.partial(_compute_pd,
                                    estimator_fn=predict_fn,
                                    grid_expanded=grid_expanded,
                                    pd_metadata=_pdp_metadata,
                                    input_data=data_sample,
                                    filter_classes=filter_classes)
        # one task per row of the expanded grid
        arg_list = [i for i in range(grid_expanded.shape[0])]

        executor_instance = Pool(n_jobs)

        if progressbar:
            self.interpreter.logger.warning(
                "Progress bars slow down runs by 10-20%. For slightly "
                "faster runs, do progressbar=False")
            mapper = executor_instance.imap
            p = ProgressBar(len(arg_list), units='grid cells')
        else:
            mapper = executor_instance.map

        pd_list = []
        try:
            if n_jobs == 1:
                # deliberately jump to the single process branch below
                raise ValueError("Skipping to single processing")
            for pd_row in mapper(pd_func, arg_list):
                if progressbar:
                    p.animate()
                pd_list.append(pd_row)
        except Exception:
            # Only ordinary errors trigger the fallback; KeyboardInterrupt and
            # SystemExit propagate (the pool is still cleaned up in finally).
            self.interpreter.logger.info(
                "Multiprocessing failed, going single process")
            # Discard rows collected before the failure, otherwise the
            # fallback would append every grid cell a second time.
            pd_list = []
            if progressbar:
                p = ProgressBar(len(arg_list), units='grid cells')
            for pd_row in map(pd_func, arg_list):
                if progressbar:
                    p.animate()
                pd_list.append(pd_row)
        finally:
            executor_instance.close()
            executor_instance.join()
            executor_instance.terminate()

        if return_metadata:
            return pd.DataFrame(list(pd_list)), _pdp_metadata
        else:
            return pd.DataFrame(list(pd_list))