Example #1
    def test_serialize_json_safe_basic(self):
        values = [0, 1, 2, 3, 4, 5]
        result = serialize_json_safe(values)
        assert result == [0, 1, 2, 3, 4, 5]

        values = ['a', 'b', 'a', 'c', 'a', 'b']
        result = serialize_json_safe(values)
        assert result == ['a', 'b', 'a', 'c', 'a', 'b']
Example #2
    def test_serialize_via_json_timestamp(self):
        timestamp_obj = pd.Timestamp(2020, 1, 1)
        assert isinstance(timestamp_obj, pd.Timestamp)
        result = json.dumps(serialize_json_safe(timestamp_obj))
        assert result is not None
        assert "2020" in result

        timestamp_obj_array = np.array([pd.Timestamp(2020, 1, 1)])
        result = json.dumps(serialize_json_safe(timestamp_obj_array))
        assert result is not None
        assert "2020" in result
Example #3
    def test_serialize_json_safe_aggregate_types(self):
        o = {'a': [1, 2, 3], 'c': 'b'}
        result = serialize_json_safe(o)
        assert result == o

        o = ('a', [1, 2, 3])
        result = serialize_json_safe(o)
        assert result == o

        values = np.array([[1, 2, 3], [4, 5, 6]])
        result = serialize_json_safe(values)
        assert result == values.tolist()
Example #4
    def test_serialize_json_safe_missing(self):
        values = [0, np.nan, 2, 3, 4, 5]
        result = serialize_json_safe(values)
        assert result == [0, 0, 2, 3, 4, 5]

        values = [0, np.inf, 2, 3, 4, 5]
        result = serialize_json_safe(values)
        assert result == [0, 0, 2, 3, 4, 5]

        values = ['a', 'b', 'a', np.nan, 'a', 'b']
        result = serialize_json_safe(values)
        assert result == ['a', 'b', 'a', 0, 'a', 'b']
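
The substitutions asserted above matter because NaN and infinity are not legal JSON values. A minimal sketch of what the standard json module does with them, independent of serialize_json_safe:

import json

# json.dumps emits the non-standard token NaN, which many parsers reject.
print(json.dumps(float("nan")))  # NaN
# With strict compliance requested, the encoder refuses outright.
try:
    json.dumps(float("inf"), allow_nan=False)
except ValueError:
    print("inf rejected when strict JSON is required")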
Example #5
    def _get_dashboard_data(self):
        """Get the Python dict representation of the dashboard object."""
        if self._dashboard_data is None:
            dashboard_object = self._get_dashboard_object()
            self._dashboard_data = serialize_json_safe(dashboard_object)

        return self._dashboard_data
Example #6
    def test_embedded_object(self):
        class A:
            def __init__(self):
                self.a_data = 'a'

        class B:
            def __init__(self):
                self.b_data = A()

        result = serialize_json_safe({'B': B()})
        assert result == {'B': {'b_data': {'a_data': 'a'}}}
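
The assertion implies that objects with no native JSON mapping are reduced to their __dict__, recursively. The helper below is a hypothetical, heavily simplified illustration of that idea only; it is not the library's implementation.

def to_json_safe_sketch(obj):
    """Hypothetical recursive __dict__ reduction (illustration only)."""
    if isinstance(obj, dict):
        return {k: to_json_safe_sketch(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_json_safe_sketch(v) for v in obj]
    if hasattr(obj, '__dict__'):
        return to_json_safe_sketch(vars(obj))
    return obj


class A:
    def __init__(self):
        self.a_data = 'a'


print(to_json_safe_sketch({'B': A()}))  # {'B': {'a_data': 'a'}}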
Example #7
def json_converter(obj):
    """Helper function to convert ErrorReport object to a dictionary.

    :param obj: Object to convert to a dictionary which can be saved as json.
    :type obj: object
    :return: The converted dictionary which can be saved as json.
    :rtype: dict
    """
    if isinstance(obj, ErrorReport):
        rdict = obj.__dict__
        return rdict
    return serialize_json_safe(obj)
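
Given the docstring, json_converter is presumably meant to be passed as the default hook of json.dumps when saving an ErrorReport. The call below is a usage sketch under that assumption; error_report stands for an already constructed ErrorReport instance and is hypothetical here.

import json

# Assumption: error_report is an ErrorReport produced elsewhere.
json_str = json.dumps(error_report, default=json_converter)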
Example #8
    def setup_visualization_input(self, classes, predicted_y, list_dataset,
                                  true_y, features):
        if classes is not None:
            classes = convert_to_list(classes)
            self.dashboard_input[
                ExplanationDashboardInterface.CLASS_NAMES] = classes
            class_to_index = {k: v for v, k in enumerate(classes)}

        if predicted_y is not None:
            # If classes specified, convert predicted_y to
            # numeric representation
            if classes is not None and predicted_y[0] in class_to_index:
                for i in range(len(predicted_y)):
                    predicted_y[i] = class_to_index[predicted_y[i]]
            self.dashboard_input[
                ExplanationDashboardInterface.PREDICTED_Y] = predicted_y

        row_length = 0
        feature_length = None

        if list_dataset is not None:
            row_length, feature_length = np.shape(list_dataset)
            if feature_length > 1000:
                raise ValueError("Exceeds maximum number of features for"
                                 " visualization (1000). Please regenerate the"
                                 " explanation using fewer features or"
                                 " initialize the dashboard without passing a"
                                 " dataset.")
            self.dashboard_input[ExplanationDashboardInterface.
                                 TRAINING_DATA] = serialize_json_safe(
                                     list_dataset)

        if true_y is not None and len(true_y) == row_length:
            list_true_y = convert_to_list(true_y)
            # If classes specified, convert true_y to numeric representation
            if classes is not None and list_true_y[0] in class_to_index:
                for i in range(len(list_true_y)):
                    list_true_y[i] = class_to_index[list_true_y[i]]
            self.dashboard_input[
                ExplanationDashboardInterface.TRUE_Y] = list_true_y

        if features is not None:
            features = convert_to_list(features)
            if feature_length is not None and len(features) != feature_length:
                raise ValueError("Feature vector length mismatch:"
                                 " feature names length differs"
                                 " from local explanations dimension")
            self.dashboard_input[
                ExplanationDashboardInterface.FEATURE_NAMES] = features
Example #9
    def __init__(self, explanation, model, dataset, true_y, classes, features):
        """Initialize the Explanation Dashboard Input.

        :param explanation: An object that represents an explanation.
        :type explanation: ExplanationMixin
        :param model: An object that represents a model. It is assumed that
            for the classification case it has a method of predict_proba()
            returning the prediction probabilities for each class and for
            the regression case a method of predict() returning the
            prediction value.
        :type model: object
        :param dataset: A matrix of feature vector examples
            (# examples x # features), the same samples used to build the
            explanation. Will overwrite any set on explanation object
            already. Must have fewer than 100000 rows and fewer than 1000
            columns. Note dashboard may become slow or crash for more than
            10000 rows.
        :type dataset: numpy.ndarray or list[][]
        :param true_y: The true labels for the provided dataset. Will
            overwrite any set on explanation object already.
        :type true_y: numpy.ndarray or list[]
        :param classes: The class names.
        :type classes: numpy.ndarray or list[]
        :param features: Feature names.
        :type features: numpy.ndarray or list[]
        """
        self._model = model
        self._is_classifier = is_classifier(model)
        self._dataframeColumns = None
        self.dashboard_input = {}
        # List of explanations, key of explanation type is "explanation_type"
        if explanation is not None:
            self._mli_explanations = explanation.data(-1)["mli"]
        else:
            self._mli_explanations = None
        local_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_LOCAL_EXPLANATION_KEY)
        global_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_GLOBAL_EXPLANATION_KEY)
        ebm_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_EBM_GLOBAL_EXPLANATION_KEY)
        dataset_explanation = self._find_first_explanation(
            ExplanationDashboardInterface.MLI_EXPLANATION_DATASET_KEY)

        if explanation is not None and hasattr(explanation, 'method'):
            self.dashboard_input[ExplanationDashboardInterface.
                                 EXPLANATION_METHOD] = explanation.method

        predicted_y = None
        feature_length = None
        if dataset_explanation is not None:
            if dataset is None:
                dataset = dataset_explanation[
                    ExplanationDashboardInterface.MLI_DATASET_X_KEY]
            if true_y is None:
                true_y = dataset_explanation[
                    ExplanationDashboardInterface.MLI_DATASET_Y_KEY]

        if isinstance(dataset, pd.DataFrame) and hasattr(dataset, 'columns'):
            self._dataframeColumns = dataset.columns
            self._dfdtypes = dataset.dtypes
        try:
            list_dataset = convert_to_list(dataset, EXP_VIZ_ERR_MSG)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Unsupported dataset type, inner error: {}".format(ex_str))
        if dataset is not None and model is not None:
            try:
                predicted_y = model.predict(dataset)
            except Exception as ex:
                ex_str = _format_exception(ex)
                msg = "Model does not support predict method for given"
                "dataset type, inner error: {}".format(ex_str)
                raise ValueError(msg)
            try:
                predicted_y = convert_to_list(predicted_y, EXP_VIZ_ERR_MSG)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Model prediction output of unsupported type,"
                                 "inner error: {}".format(ex_str))
        if predicted_y is not None:
            self.dashboard_input[
                ExplanationDashboardInterface.PREDICTED_Y] = predicted_y
        row_length = 0
        if list_dataset is not None:
            row_length, feature_length = np.shape(list_dataset)
            if row_length > 100000:
                raise ValueError("Exceeds maximum number of rows"
                                 "for visualization (100000)")
            if feature_length > 1000:
                warnings.warn("Exceeds maximum number of features for"
                              " visualization (1000)."
                              " Please regenerate the"
                              " explanation using fewer features or"
                              " initialize the dashboard without"
                              " passing a dataset. Dashboard will"
                              " show limited view.")
            else:
                self.dashboard_input[ExplanationDashboardInterface.
                                     TRAINING_DATA] = serialize_json_safe(
                                         list_dataset)
            self.dashboard_input[ExplanationDashboardInterface.
                                 IS_CLASSIFIER] = self._is_classifier

        local_dim = None

        if true_y is not None and len(true_y) == row_length:
            self.dashboard_input[
                ExplanationDashboardInterface.TRUE_Y] = convert_to_list(
                    true_y, EXP_VIZ_ERR_MSG)

        if local_explanation is not None:
            try:
                local_explanation["scores"] = convert_to_list(
                    local_explanation["scores"], EXP_VIZ_ERR_MSG)
                local_explanation["intercept"] = convert_to_list(
                    local_explanation["intercept"], EXP_VIZ_ERR_MSG)
                # We can ignore perf explanation data.
                # Note if it is added back at any point,
                # the numpy values will need to be converted to python,
                # otherwise serialization fails.
                local_explanation["perf"] = None
                self.dashboard_input[ExplanationDashboardInterface.
                                     LOCAL_EXPLANATIONS] = local_explanation
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Unsupported local explanation type,"
                                 "inner error: {}".format(ex_str))
            if list_dataset is not None:
                local_dim = np.shape(local_explanation["scores"])
                if len(local_dim) != 2 and len(local_dim) != 3:
                    raise ValueError(
                        "Local explanation expected to be a 2D or 3D list")
                if len(local_dim) == 2 and (local_dim[1] != feature_length
                                            or local_dim[0] != row_length):
                    raise ValueError("Shape mismatch: local explanation"
                                     "length differs from dataset")
                if len(local_dim) == 3 and (local_dim[2] != feature_length
                                            or local_dim[1] != row_length):
                    raise ValueError("Shape mismatch: local explanation"
                                     " length differs from dataset")
        if local_explanation is None and global_explanation is not None:
            try:
                global_explanation["scores"] = convert_to_list(
                    global_explanation["scores"], EXP_VIZ_ERR_MSG)
                if 'intercept' in global_explanation:
                    global_explanation["intercept"] = convert_to_list(
                        global_explanation["intercept"], EXP_VIZ_ERR_MSG)
                self.dashboard_input[ExplanationDashboardInterface.
                                     GLOBAL_EXPLANATION] = global_explanation
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Unsupported global explanation type,"
                                 "inner error: {}".format(ex_str))
        if ebm_explanation is not None:
            try:
                self.dashboard_input[ExplanationDashboardInterface.
                                     EBM_EXPLANATION] = ebm_explanation
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError(
                    "Unsupported ebm explanation type: {}".format(ex_str))

        if features is None\
                and explanation is not None\
                and hasattr(explanation, 'features')\
                and explanation.features is not None:
            features = explanation.features
        if features is not None:
            features = convert_to_list(features, EXP_VIZ_ERR_MSG)
            if feature_length is not None and len(features) != feature_length:
                raise ValueError("Feature vector length mismatch:"
                                 " feature names length differs"
                                 " from local explanations dimension")
            self.dashboard_input[
                ExplanationDashboardInterface.FEATURE_NAMES] = features
        if classes is None\
                and explanation is not None\
                and hasattr(explanation, 'classes')\
                and explanation.classes is not None:
            classes = explanation.classes
        if classes is not None:
            classes = convert_to_list(classes, EXP_VIZ_ERR_MSG)
            if local_dim is not None and len(classes) != local_dim[0]:
                raise ValueError("Class vector length mismatch:"
                                 "class names length differs from"
                                 "local explanations dimension")
            self.dashboard_input[
                ExplanationDashboardInterface.CLASS_NAMES] = classes
        if is_classifier(model) and dataset is not None:
            try:
                probability_y = model.predict_proba(dataset)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Model does not support predict_proba method"
                                 " for given dataset type,"
                                 " inner error: {}".format(ex_str))
            try:
                probability_y = convert_to_list(probability_y, EXP_VIZ_ERR_MSG)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError(
                    "Model predict_proba output of unsupported type,"
                    "inner error: {}".format(ex_str))
            self.dashboard_input[
                ExplanationDashboardInterface.PROBABILITY_Y] = probability_y
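
The docstring at the top of this example describes the expected model contract: predict_proba() for classification and predict() for regression. The class below is a hypothetical minimal stand-in that satisfies that contract, shown only for illustration; it is not part of the dashboard code.

import numpy as np


class TinyClassifier:
    """Hypothetical stand-in satisfying the documented model contract."""

    def predict(self, X):
        # Always predict class 0 for every row.
        return np.zeros(len(X), dtype=int)

    def predict_proba(self, X):
        # Constant probabilities for the two classes.
        return np.tile([0.7, 0.3], (len(X), 1))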
Example #10
    def setup_local(self, explanation, model, dataset, true_y, classes,
                    features, categorical_features, true_y_dataset, pred_y,
                    pred_y_dataset, model_task, metric, max_depth, num_leaves,
                    min_child_samples, sample_dataset, model_available):
        full_dataset = dataset
        if true_y_dataset is None:
            full_true_y = true_y
        else:
            full_true_y = true_y_dataset
        if pred_y_dataset is None:
            full_pred_y = pred_y
        else:
            full_pred_y = pred_y_dataset
        has_explanation = explanation is not None
        probability_y = None

        if has_explanation:
            if classes is None:
                has_classes_attr = hasattr(explanation, 'classes')
                if has_classes_attr and explanation.classes is not None:
                    classes = explanation.classes
            dataset, true_y = self.input_explanation(explanation, dataset,
                                                     true_y)
            row_length = len(dataset)
            # Only check dataset on explanation for row length bounds
            if row_length > 100000:
                raise ValueError("Exceeds maximum number of rows"
                                 "for visualization (100000)")
        elif sample_dataset is not None:
            dataset = sample_dataset

        if isinstance(dataset, pd.DataFrame) and hasattr(dataset, 'columns'):
            self._dataframeColumns = dataset.columns
            self._dfdtypes = dataset.dtypes
        try:
            list_dataset = convert_to_list(dataset)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Unsupported dataset type, inner error: {}".format(ex_str))

        if has_explanation:
            self.input_explanation_data(list_dataset, classes)
            if features is None and hasattr(explanation, 'features'):
                features = explanation.features

        if model_available:
            predicted_y = self.compute_predicted_y(model, dataset)
        else:
            predicted_y = self.predicted_y_to_list(pred_y)

        self.setup_visualization_input(classes, predicted_y, list_dataset,
                                       true_y, features)

        if model_available and is_classifier(model) and \
                dataset is not None:
            try:
                probability_y = model.predict_proba(dataset)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError("Model does not support predict_proba method"
                                 " for given dataset type,"
                                 " inner error: {}".format(ex_str))
            try:
                probability_y = convert_to_list(probability_y)
            except Exception as ex:
                ex_str = _format_exception(ex)
                raise ValueError(
                    "Model predict_proba output of unsupported type,"
                    "inner error: {}".format(ex_str))
            self.dashboard_input[
                ExplanationDashboardInterface.PROBABILITY_Y] = probability_y
        if model_available:
            self._error_analyzer = ModelAnalyzer(model, full_dataset,
                                                 full_true_y, features,
                                                 categorical_features,
                                                 model_task, metric, classes)
        else:
            # Model task cannot be unknown when passing predictions
            # Assume classification for backwards compatibility
            if model_task == ModelTask.UNKNOWN:
                model_task = ModelTask.CLASSIFICATION
            self._error_analyzer = PredictionsAnalyzer(
                full_pred_y, full_dataset, full_true_y, features,
                categorical_features, model_task, metric, classes)
        if self._categorical_features:
            self.dashboard_input[ExplanationDashboardInterface.
                                 CATEGORICAL_MAP] = serialize_json_safe(
                                     self._error_analyzer.category_dictionary)
        # Compute metrics on all data cohort
        if self._error_analyzer.model_task == ModelTask.CLASSIFICATION:
            if self._error_analyzer.metric is None:
                metric = Metrics.ERROR_RATE
            else:
                metric = self._error_analyzer.metric
        else:
            if self._error_analyzer.metric is None:
                metric = Metrics.MEAN_SQUARED_ERROR
            else:
                metric = self._error_analyzer.metric
        if model_available:
            full_pred_y = self.compute_predicted_y(model, full_dataset)
        # If we don't have an explanation or model/probabilities specified
        # we can try to use model task to figure out the method
        if not has_explanation and probability_y is None:
            method = MethodConstants.REGRESSION
            if self._error_analyzer.model_task == ModelTask.CLASSIFICATION:
                if (len(np.unique(predicted_y)) > 2):
                    method = MethodConstants.MULTICLASS
                else:
                    method = MethodConstants.BINARY
            self.dashboard_input[
                ErrorAnalysisDashboardInterface.METHOD] = method
Example #11
    def test_serialize_timestamp(self):
        datetime_str = "2020-10-10"
        datetime_object = datetime.datetime.strptime(datetime_str, "%Y-%m-%d")
        result = serialize_json_safe(datetime_object)
        assert datetime_str in result
Example #12
    def test_unknown(self):
        c = complex(1, 2)
        result = serialize_json_safe([c, 42])
        assert result == [c, 42]
Example #13
    def test_numpy(self):
        result = serialize_json_safe(np.array([1, 2, 3]))
        assert result == [1, 2, 3]
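
For completeness, the numpy case works because ndarray.tolist() produces built-in Python numbers that json can encode directly. A minimal sketch, independent of the library:

import json
import numpy as np

arr = np.array([1, 2, 3])
# tolist() converts numpy scalars to built-in ints, so json.dumps accepts it.
print(json.dumps(arr.tolist()))  # [1, 2, 3]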