Example #1
    def test_serialize_json_safe_basic(self):
        values = [0, 1, 2, 3, 4, 5]
        result = serialize_json_safe(values)
        assert result == [0, 1, 2, 3, 4, 5]

        values = ['a', 'b', 'a', 'c', 'a', 'b']
        result = serialize_json_safe(values)
        assert result == ['a', 'b', 'a', 'c', 'a', 'b']
Example #2
    def test_serialize_via_json_timestamp(self):
        timestamp_obj = pd.Timestamp(2020, 1, 1)
        assert isinstance(timestamp_obj, pd.Timestamp)
        result = json.dumps(serialize_json_safe(timestamp_obj))
        assert result is not None
        assert "2020" in result

        timestamp_obj_array = np.array([pd.Timestamp(2020, 1, 1)])
        result = json.dumps(serialize_json_safe(timestamp_obj_array))
        assert result is not None
        assert "2020" in result
Example #3
    def test_serialize_json_safe_aggregate_types(self):
        o = {'a': [1, 2, 3], 'c': 'b'}
        result = serialize_json_safe(o)
        assert result == o

        o = ('a', [1, 2, 3])
        result = serialize_json_safe(o)
        assert result == o

        values = np.array([[1, 2, 3], [4, 5, 6]])
        result = serialize_json_safe(values)
        assert result == values.tolist()
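The last assertion relies on NumPy's ndarray.tolist(), which converts an array of any dimensionality into nested Python lists of native scalars; a standalone check of that behaviour:

import numpy as np

values = np.array([[1, 2, 3], [4, 5, 6]])
as_list = values.tolist()

# tolist() recurses through every dimension and converts NumPy scalars
# (e.g. np.int64) into plain Python ints, which json.dumps accepts directly.
assert as_list == [[1, 2, 3], [4, 5, 6]]
assert isinstance(as_list[0][0], int)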
Example #4
    def test_serialize_json_safe_missing(self):
        values = [0, np.nan, 2, 3, 4, 5]
        result = serialize_json_safe(values)
        assert result == [0, 0, 2, 3, 4, 5]

        values = [0, np.inf, 2, 3, 4, 5]
        result = serialize_json_safe(values)
        assert result == [0, 0, 2, 3, 4, 5]

        values = ['a', 'b', 'a', np.nan, 'a', 'b']
        result = serialize_json_safe(values)
        assert result == ['a', 'b', 'a', 0, 'a', 'b']
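These tests expect NaN and infinity to be replaced with 0, which keeps the output strictly JSON-compliant (json.dumps would otherwise emit the non-standard tokens NaN and Infinity). A minimal sketch of that replacement under the same convention (replace_nonfinite is a hypothetical helper, not the library's code):

import math

import numpy as np


def replace_nonfinite(values, fill=0):
    # Assumption: non-finite floats are replaced with a JSON-safe fill value.
    return [fill if isinstance(v, float) and not math.isfinite(v) else v
            for v in values]


assert replace_nonfinite([0, np.nan, 2, np.inf, 4]) == [0, 0, 2, 0, 4]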
Example #5
    def _get_dashboard_data(self):
        """Get the Python dict representation of the dashboard object."""
        if self._dashboard_data is None:
            dashboard_object = self._get_dashboard_object()
            self._dashboard_data = serialize_json_safe(dashboard_object)

        return self._dashboard_data
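_get_dashboard_data builds the serialized dict only once and caches it, so repeated calls reuse the same object instead of re-serializing. A minimal illustration of the same cache-on-first-access pattern (class and attribute names here are illustrative, not the library's):

class LazyData:
    def __init__(self):
        self._data = None

    def _build(self):
        # Stand-in for the expensive serialization step.
        return {"built": True}

    def get_data(self):
        if self._data is None:
            self._data = self._build()
        return self._data


obj = LazyData()
assert obj.get_data() is obj.get_data()  # second call returns the cached dict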
Example #6
    def test_embedded_object(self):
        class A:
            def __init__(self):
                self.a_data = 'a'

        class B:
            def __init__(self):
                self.b_data = A()

        result = serialize_json_safe({'B': B()})
        assert result == {'B': {'b_data': {'a_data': 'a'}}}
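This test shows nested user-defined objects being reduced to plain dicts of their attributes. A hedged sketch of how that behaviour could be implemented with vars()/__dict__ recursion (to_json_safe is a hypothetical helper, not the library's code):

def to_json_safe(obj):
    # Assumption: objects are reduced to their attribute dicts, recursively.
    if isinstance(obj, dict):
        return {key: to_json_safe(value) for key, value in obj.items()}
    if hasattr(obj, '__dict__'):
        return {key: to_json_safe(value) for key, value in vars(obj).items()}
    return obj


class A:
    def __init__(self):
        self.a_data = 'a'


class B:
    def __init__(self):
        self.b_data = A()


assert to_json_safe({'B': B()}) == {'B': {'b_data': {'a_data': 'a'}}}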
Example #7
    def __init__(self, explanation, model, dataset, true_y, classes, features):
        """Initialize the Explanation Dashboard Input.

        :param explanation: An object that represents an explanation.
        :type explanation: ExplanationMixin
        :param model: An object that represents a model.
            It is assumed that for the classification case
            it has a method of predict_proba() returning
            the prediction probabilities for each
            class and for the regression case a method of predict()
            returning the prediction value.
        :type model: object
        :param dataset: A matrix of feature vector examples
            (# examples x # features), the same samples
            used to build the explanation.
            Overrides any dataset already set on the explanation object.
            Must have fewer than 100000 rows and fewer than
            1000 columns.
        :type dataset: numpy.array or list[][] or pandas.DataFrame
        :param true_y: The true labels for the provided dataset.
            Overrides any labels already set on the explanation object.
        :type true_y: numpy.array or list[]
        :param classes: The class names.
        :type classes: numpy.array or list[]
        :param features: Feature names.
        :type features: numpy.array or list[]
        """
        self._model = model
        self._is_classifier = model is not None\
            and hasattr(model, SKLearn.PREDICT_PROBA) and \
            model.predict_proba is not None
        self._dataframeColumns = None
        self.dashboard_input = {}
        # List of explanations, key of explanation type is "explanation_type"
        if explanation is not None:
            self._mli_explanations = explanation.data(-1)["mli"]
        else:
            self._mli_explanations = None
        local_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_LOCAL_EXPLANATION_KEY)
        global_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_GLOBAL_EXPLANATION_KEY)
        ebm_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_EBM_GLOBAL_EXPLANATION_KEY)
        dataset_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_EXPLANATION_DATASET_KEY)

        if explanation is not None and hasattr(explanation, 'method'):
            self.dashboard_input[ExplanationDashboardInterface.
                                 EXPLANATION_METHOD] = explanation.method

        predicted_y = None
        feature_length = None
        if dataset_explanation is not None:
            if dataset is None:
                dataset = dataset_explanation[
                    ExplanationDashboardInterface.MLI_DATASET_X_KEY]
            if true_y is None:
                true_y = dataset_explanation[
                    ExplanationDashboardInterface.MLI_DATASET_Y_KEY]

        if isinstance(dataset, pd.DataFrame) and hasattr(dataset, 'columns'):
            self._dataframeColumns = dataset.columns
            self._dfdtypes = dataset.dtypes
        try:
            list_dataset = self._convert_to_list(dataset)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Unsupported dataset type, inner error: {}".format(ex_str))
        if dataset is not None and model is not None:
            try:
                predicted_y = model.predict(dataset)
            except Exception as ex:
                ex_str = _format_exception(ex)
                msg = "Model does not support predict method for given"
                "dataset type, inner error: {}".format(ex_str)
                raise ValueError(msg)
            try:
                predicted_y = self._convert_to_list(predicted_y)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Model prediction output of unsupported type,"
                                 "inner error: {}".format(ex_str))
        if predicted_y is not None:
            self.dashboard_input[
                ExplanationDashboardInterface.PREDICTED_Y] = predicted_y
        row_length = 0
        if list_dataset is not None:
            row_length, feature_length = np.shape(list_dataset)
            if row_length > 100000:
                raise ValueError("Exceeds maximum number of rows"
                                 "for visualization (100000)")
            if feature_length > 1000:
                raise ValueError("Exceeds maximum number of features for"
                                 " visualization (1000). Please regenerate the"
                                 " explanation using fewer features or"
                                 " initialize the dashboard without passing a"
                                 " dataset.")
            self.dashboard_input[ExplanationDashboardInterface.
                                 TRAINING_DATA] = serialize_json_safe(
                                     list_dataset)
            self.dashboard_input[ExplanationDashboardInterface.
                                 IS_CLASSIFIER] = self._is_classifier

        local_dim = None

        if true_y is not None and len(true_y) == row_length:
            self.dashboard_input[ExplanationDashboardInterface.
                                 TRUE_Y] = self._convert_to_list(true_y)

        if local_explanation is not None:
            try:
                local_explanation["scores"] = self._convert_to_list(
                    local_explanation["scores"])
                if np.shape(local_explanation["scores"])[-1] > 1000:
                    raise ValueError("Exceeds maximum number of features for "
                                     "visualization (1000). Please regenerate"
                                     " the explanation using fewer features.")
                local_explanation["intercept"] = self._convert_to_list(
                    local_explanation["intercept"])
                # We can ignore perf explanation data.
                # Note if it is added back at any point,
                # the numpy values will need to be converted to python,
                # otherwise serialization fails.
                local_explanation["perf"] = None
                self.dashboard_input[ExplanationDashboardInterface.
                                     LOCAL_EXPLANATIONS] = local_explanation
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Unsupported local explanation type,"
                                 "inner error: {}".format(ex_str))
            if list_dataset is not None:
                local_dim = np.shape(local_explanation["scores"])
                if len(local_dim) != 2 and len(local_dim) != 3:
                    raise ValueError(
                        "Local explanation expected to be a 2D or 3D list")
                if len(local_dim) == 2 and (local_dim[1] != feature_length
                                            or local_dim[0] != row_length):
                    raise ValueError("Shape mismatch: local explanation"
                                     "length differs from dataset")
                if len(local_dim) == 3 and (local_dim[2] != feature_length
                                            or local_dim[1] != row_length):
                    raise ValueError("Shape mismatch: local explanation"
                                     " length differs from dataset")
        if local_explanation is None and global_explanation is not None:
            try:
                global_explanation["scores"] = self._convert_to_list(
                    global_explanation["scores"])
                if 'intercept' in global_explanation:
                    global_explanation["intercept"] = self._convert_to_list(
                        global_explanation["intercept"])
                self.dashboard_input[ExplanationDashboardInterface.
                                     GLOBAL_EXPLANATION] = global_explanation
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Unsupported global explanation type,"
                                 "inner error: {}".format(ex_str))
        if ebm_explanation is not None:
            try:
                self.dashboard_input[ExplanationDashboardInterface.
                                     EBM_EXPLANATION] = ebm_explanation
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError(
                    "Unsupported ebm explanation type: {}".format(ex_str))

        if features is None\
                and explanation is not None\
                and hasattr(explanation, 'features')\
                and explanation.features is not None:
            features = explanation.features
        if features is not None:
            features = self._convert_to_list(features)
            if feature_length is not None and len(features) != feature_length:
                raise ValueError("Feature vector length mismatch:"
                                 " feature names length differs"
                                 " from local explanations dimension")
            self.dashboard_input[
                ExplanationDashboardInterface.FEATURE_NAMES] = features
        if classes is None\
                and explanation is not None\
                and hasattr(explanation, 'classes')\
                and explanation.classes is not None:
            classes = explanation.classes
        if classes is not None:
            classes = self._convert_to_list(classes)
            if local_dim is not None and len(classes) != local_dim[0]:
                raise ValueError("Class vector length mismatch:"
                                 "class names length differs from"
                                 "local explanations dimension")
            self.dashboard_input[
                ExplanationDashboardInterface.CLASS_NAMES] = classes
        if model is not None and hasattr(model, SKLearn.PREDICT_PROBA) \
                and model.predict_proba is not None and dataset is not None:
            try:
                probability_y = model.predict_proba(dataset)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Model does not support predict_proba method"
                                 " for given dataset type,"
                                 " inner error: {}".format(ex_str))
            try:
                probability_y = self._convert_to_list(probability_y)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError(
                    "Model predict_proba output of unsupported type,"
                    "inner error: {}".format(ex_str))
            self.dashboard_input[
                ExplanationDashboardInterface.PROBABILITY_Y] = probability_y
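The constructor above wires an explanation, a model and a dataset into the dashboard_input dict consumed by the widget. A hedged usage sketch, assuming this is raiwidgets' ExplanationDashboardInput (the import path and the explainer are assumptions; only the six-argument signature comes from the code above):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# from raiwidgets import ExplanationDashboard  # assumed entry point

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1000).fit(X, y)

# An explanation would typically come from an explainer such as
# interpret-community's TabularExplainer (not shown here); the input is
# then built with all six arguments from the signature above:
# dashboard_input = ExplanationDashboardInput(
#     explanation=explanation, model=model, dataset=X, true_y=y,
#     classes=['setosa', 'versicolor', 'virginica'],
#     features=['sepal length', 'sepal width',
#               'petal length', 'petal width'])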
Example #8
    def __init__(self, explanation, model, dataset, true_y, classes, features,
                 categorical_features, true_y_dataset, pred_y, model_task,
                 metric, max_depth, num_leaves):
        """Initialize the ErrorAnalysis Dashboard Input.

        :param explanation: An object that represents an explanation.
        :type explanation: ExplanationMixin
        :param model: An object that represents a model.
            It is assumed that for the classification case
            it has a method of predict_proba() returning
            the prediction probabilities for each
            class and for the regression case a method of predict()
            returning the prediction value.
        :type model: object
        :param dataset: A matrix of feature vector examples
            (# examples x # features), the same samples
            used to build the explanation.
            Overrides any dataset already set on the explanation object.
            Must have fewer than 100000 rows and fewer than
            1000 columns.
        :type dataset: numpy.array or list[][] or pandas.DataFrame
        :param true_y: The true labels for the provided explanation.
            Overrides any labels already set on the explanation object.
        :type true_y: numpy.array or list[]
        :param classes: The class names.
        :type classes: numpy.array or list[]
        :param features: Feature names.
        :type features: numpy.array or list[]
        :param categorical_features: The categorical feature names.
        :type categorical_features: list[str]
        :param true_y_dataset: The true labels for the provided dataset.
            Only needed if the explanation has a sample of instances from
            the original dataset. Otherwise specify the true_y parameter
            only.
        :type true_y_dataset: numpy.array or list[]
        :param pred_y: The predicted y values, can be passed in as an
            alternative to the model and explanation for a more limited
            view.
        :type pred_y: numpy.ndarray or list[]
        :param model_task: Optional parameter to specify whether the model
            is a classification or regression model. In most cases, the
            type of the model can be inferred based on the shape of the
            output, where a classifier has a predict_proba method and
            outputs a 2 dimensional array, while a regressor has a
            predict method and outputs a 1 dimensional array.
        :type model_task: str
        :param metric: The metric name to evaluate at each tree node or
            heatmap grid.  Currently supported classification metrics
            include 'error_rate', 'recall_score', 'precision_score',
            'f1_score', and 'accuracy_score'. Supported regression
            metrics include 'mean_absolute_error', 'mean_squared_error',
            'r2_score', and 'median_absolute_error'.
        :type metric: str
        :param max_depth: The maximum depth of the surrogate tree trained
            on errors.
        :type max_depth: int
        :param num_leaves: The number of leaves of the surrogate tree
            trained on errors.
        :type num_leaves: int
        """
        self._model = model
        full_dataset = dataset
        if true_y_dataset is None:
            full_true_y = true_y
        else:
            full_true_y = true_y_dataset
        self._categorical_features = categorical_features
        self._string_ind_data = None
        self._categories = []
        self._categorical_indexes = []
        self._is_classifier = model is not None\
            and hasattr(model, SKLearn.PREDICT_PROBA) and \
            model.predict_proba is not None
        self._dataframeColumns = None
        self.dashboard_input = {}
        has_explanation = explanation is not None
        feature_length = None
        self._max_depth = max_depth
        self._num_leaves = num_leaves

        if has_explanation:
            if classes is None:
                has_classes_attr = hasattr(explanation, 'classes')
                if has_classes_attr and explanation.classes is not None:
                    classes = explanation.classes
            dataset, true_y = self.input_explanation(explanation, dataset,
                                                     true_y)
            row_length = len(dataset)
            # Only check dataset on explanation for row length bounds
            if row_length > 100000:
                raise ValueError("Exceeds maximum number of rows"
                                 "for visualization (100000)")

        if classes is not None:
            classes = self._convert_to_list(classes)
            self.dashboard_input[
                ExplanationDashboardInterface.CLASS_NAMES] = classes
            class_to_index = {k: v for v, k in enumerate(classes)}

        if isinstance(dataset, pd.DataFrame) and hasattr(dataset, 'columns'):
            self._dataframeColumns = dataset.columns
            self._dfdtypes = dataset.dtypes
        try:
            list_dataset = self._convert_to_list(dataset)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Unsupported dataset type, inner error: {}".format(ex_str))

        if has_explanation:
            self.input_explanation_data(explanation, list_dataset, classes)
            if features is None and hasattr(explanation, 'features'):
                features = explanation.features

        model_available = model is not None

        if model_available and pred_y is not None:
            raise ValueError('Only model or pred_y can be specified, not both')

        self.dashboard_input[ENABLE_PREDICT] = model_available

        if model_available:
            predicted_y = self.compute_predicted_y(model, dataset)
        else:
            predicted_y = self.predicted_y_to_list(pred_y)

        if predicted_y is not None:
            # If classes specified, convert predicted_y to
            # numeric representation
            if classes is not None and predicted_y[0] in class_to_index:
                for i in range(len(predicted_y)):
                    predicted_y[i] = class_to_index[predicted_y[i]]
            self.dashboard_input[
                ExplanationDashboardInterface.PREDICTED_Y] = predicted_y
        row_length = 0
        if list_dataset is not None:
            row_length, feature_length = np.shape(list_dataset)
            if feature_length > 1000:
                raise ValueError("Exceeds maximum number of features for"
                                 " visualization (1000). Please regenerate the"
                                 " explanation using fewer features or"
                                 " initialize the dashboard without passing a"
                                 " dataset.")
            self.dashboard_input[ExplanationDashboardInterface.
                                 TRAINING_DATA] = serialize_json_safe(
                                     list_dataset)
            self.dashboard_input[ExplanationDashboardInterface.
                                 IS_CLASSIFIER] = self._is_classifier

        if true_y is not None and len(true_y) == row_length:
            list_true_y = self._convert_to_list(true_y)
            # If classes specified, convert true_y to numeric representation
            if classes is not None and list_true_y[0] in class_to_index:
                for i in range(len(list_true_y)):
                    list_true_y[i] = class_to_index[list_true_y[i]]
            self.dashboard_input[
                ExplanationDashboardInterface.TRUE_Y] = list_true_y

        if features is not None:
            features = self._convert_to_list(features)
            if feature_length is not None and len(features) != feature_length:
                raise ValueError("Feature vector length mismatch:"
                                 " feature names length differs"
                                 " from local explanations dimension")
            self.dashboard_input[FEATURE_NAMES] = features
        if model_available and hasattr(model, SKLearn.PREDICT_PROBA) \
                and model.predict_proba is not None and dataset is not None:
            try:
                probability_y = model.predict_proba(dataset)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Model does not support predict_proba method"
                                 " for given dataset type,"
                                 " inner error: {}".format(ex_str))
            try:
                probability_y = self._convert_to_list(probability_y)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError(
                    "Model predict_proba output of unsupported type,"
                    "inner error: {}".format(ex_str))
            self.dashboard_input[
                ExplanationDashboardInterface.PROBABILITY_Y] = probability_y
        if model_available:
            self._error_analyzer = ModelAnalyzer(model, full_dataset,
                                                 full_true_y, features,
                                                 categorical_features,
                                                 model_task, metric)
        else:
            # Model task cannot be unknown when passing predictions
            # Assume classification for backwards compatibility
            if model_task == ModelTask.UNKNOWN:
                model_task = ModelTask.CLASSIFICATION
            self._error_analyzer = PredictionsAnalyzer(pred_y, full_dataset,
                                                       full_true_y, features,
                                                       categorical_features,
                                                       model_task, metric)
        if self._categorical_features:
            self.dashboard_input[
                ExplanationDashboardInterface.
                CATEGORICAL_MAP] = self._error_analyzer.category_dictionary
        # Compute metrics on all data cohort
        if self._error_analyzer.model_task == ModelTask.CLASSIFICATION:
            if self._error_analyzer.metric is None:
                metric = Metrics.ERROR_RATE
            else:
                metric = self._error_analyzer.metric
        else:
            if self._error_analyzer.metric is None:
                metric = Metrics.MEAN_SQUARED_ERROR
            else:
                metric = self._error_analyzer.metric
        if model_available and true_y_dataset is not None:
            full_predicted_y = self.compute_predicted_y(model, full_dataset)
        else:
            full_predicted_y = predicted_y
        self.set_root_metric(full_predicted_y, full_true_y, metric)
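When classes are provided, predicted_y and true_y are remapped from label names to numeric indices through the class_to_index dict built earlier in the constructor. A small standalone illustration of that mapping step (self-contained; variable names only mirror the code above):

classes = ['setosa', 'versicolor', 'virginica']
class_to_index = {k: v for v, k in enumerate(classes)}

predicted_y = ['virginica', 'setosa', 'setosa']
# Remap labels to indices only when the predictions are expressed as names.
if predicted_y[0] in class_to_index:
    predicted_y = [class_to_index[label] for label in predicted_y]

assert class_to_index == {'setosa': 0, 'versicolor': 1, 'virginica': 2}
assert predicted_y == [2, 0, 0]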
Example #9
    def test_unknown(self):
        c = complex(1, 2)
        result = serialize_json_safe([c, 42])
        assert result == [c, 42]
Example #10
    def test_numpy(self):
        result = serialize_json_safe(np.array([1, 2, 3]))
        assert result == [1, 2, 3]
Example #11
    def test_serialize_timestamp(self):
        datetime_str = "2020-10-10"
        datetime_object = datetime.datetime.strptime(datetime_str, "%Y-%m-%d")
        result = serialize_json_safe(datetime_object)
        assert datetime_str in result
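The assertion only requires the original date string to appear in the result, which ISO-8601 output would satisfy; a standalone check of that assumption using the standard library:

import datetime

datetime_str = "2020-10-10"
datetime_object = datetime.datetime.strptime(datetime_str, "%Y-%m-%d")

# isoformat() yields "2020-10-10T00:00:00", which contains the original
# date string, so an ISO-8601 serialization would satisfy the assertion.
assert datetime_str in datetime_object.isoformat()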