Example #1
def _input_is_valid(dataset: np.ndarray,
                    model: object,
                    feature_index: Union[int, str],
                    treat_as_categorical: Optional[bool],
                    steps_number: Optional[int]) -> bool:  # yapf: disable
    """
    Validates input parameters of Individual Conditional Expectation function.

    For the input parameter description, warnings and exceptions please see the
    documentation of the :func:`fatf.transparency.model.feature_influence.
    individual_conditional_expectation` function.

    Returns
    -------
    is_input_ok : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    is_input_ok = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a 2-dimensional '
                                  'array.')

    if not fuav.is_base_array(dataset):
        raise ValueError('The input dataset must only contain base types '
                         '(textual and numerical).')

    if not fumv.check_model_functionality(model, require_probabilities=True):
        raise IncompatibleModelError('This functionality requires the model '
                                     'to be capable of outputting '
                                     'probabilities via predict_proba method.')

    if not fuat.are_indices_valid(dataset, np.array([feature_index])):
        raise IndexError('Provided feature index is not valid for the input '
                         'dataset.')

    if isinstance(steps_number, int):
        if steps_number < 2:
            raise ValueError('steps_number has to be at least 2.')
    elif steps_number is None:
        pass
    else:
        raise TypeError('steps_number parameter has to either be None or an '
                        'integer.')

    if (not isinstance(treat_as_categorical, bool)
            and treat_as_categorical is not None):
        raise TypeError('treat_as_categorical has to either be None or a '
                        'boolean.')

    is_input_ok = True
    return is_input_ok
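
A minimal usage sketch (hypothetical signature, not the library source) showing how such a guard-clause validator is typically invoked at the top of the public function it protects:

def individual_conditional_expectation(dataset, model, feature_index,
                                       treat_as_categorical=None,
                                       steps_number=None):
    # The validator raises a descriptive exception on malformed input and
    # returns True otherwise, so a plain assert suffices as a guard.
    assert _input_is_valid(dataset, model, feature_index,
                           treat_as_categorical, steps_number), \
        'Input is invalid.'
    # ... the actual ICE computation would follow here ...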
Example #2
def systemic_bias(dataset: np.ndarray, ground_truth: np.ndarray,
                  protected_features: List[Index]) -> np.ndarray:
    """
    Checks for systemic bias in a dataset.

    This function checks whether there exist pairs of data points that share
    the same unprotected features but differ in the protected features. For
    every such pair the labels (ground truth) are compared and, if they
    differ, the pair is marked as systemically biased. This dependency is
    represented as a boolean, square numpy array that indicates, for every
    pair of data points, whether systemic bias exists (``True``).

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be evaluated for systemic bias.
    ground_truth : numpy.ndarray
        The labels corresponding to the dataset.
    protected_features : List[column index]
        A list of column indices in the dataset that hold protected attributes.

    Raises
    ------
    IncorrectShapeError
        The dataset is not a 2-dimensional numpy array, the ground truth is not
        a 1-dimensional numpy array or the number of rows in the dataset is not
        equal to the number of elements in the ground truth array.
    IndexError
        Some of the column indices given in the ``protected_features`` list are
        not valid for the input dataset.
    TypeError
        The ``protected_features`` parameter is not a list.
    ValueError
        There are duplicate values in the protected feature indices list.

    Returns
    -------
    systemic_bias_matrix : numpy.ndarray
        A square, diagonally symmetric, boolean numpy array that indicates
        which pairs of data points share the same unprotected features but
        differ in the protected features and the ground truth annotation.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The dataset should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_1d_array(ground_truth):
        raise IncorrectShapeError('The ground truth should be a 1-dimensional '
                                  'numpy array.')
    if ground_truth.shape[0] != dataset.shape[0]:
        raise IncorrectShapeError('The number of rows in the dataset and the '
                                  'ground truth should be equal.')
    if isinstance(protected_features, list):
        pfa = np.asarray(protected_features)
        if not fuat.are_indices_valid(dataset, pfa):
            iid = np.sort(fuat.get_invalid_indices(dataset, pfa)).tolist()
            raise IndexError('The following protected feature indices are not '
                             'valid for the dataset array: {}.'.format(iid))
        if len(set(protected_features)) != len(protected_features):
            raise ValueError('Some of the protected indices are duplicated.')
    else:
        raise TypeError('The protected_features parameter should be a list.')

    is_structured = fuav.is_structured_array(dataset)

    if is_structured:
        unprotected_features_array = recfn.drop_fields(dataset,
                                                       protected_features)
        if unprotected_features_array is None:
            unprotected_features_array = np.ones((dataset.shape[0], ),
                                                 dtype=[('ones', int)])
    else:
        unprotected_features_array = np.delete(dataset,
                                               protected_features,
                                               axis=1)
        if not unprotected_features_array.size:
            unprotected_features_array = np.ones((dataset.shape[0], 1))

    assert unprotected_features_array.shape[0] == dataset.shape[0], \
        'The number of rows must be equal.'

    systemic_bias_columns = []
    for i in range(unprotected_features_array.shape[0]):
        if is_structured:
            equal_unprotected = (
                unprotected_features_array == unprotected_features_array[i])
        else:
            equal_unprotected = np.apply_along_axis(
                np.array_equal, 1, unprotected_features_array,
                unprotected_features_array[i, :])

        equal_unprotected_indices = np.where(equal_unprotected)

        # Check whether the ground truth is different for these rows
        equal_unprotected[equal_unprotected_indices] = (
            ground_truth[i] != ground_truth[equal_unprotected_indices])
        systemic_bias_columns.append(equal_unprotected)

    systemic_bias_matrix = np.stack(systemic_bias_columns, axis=1)
    assert np.array_equal(systemic_bias_matrix, systemic_bias_matrix.T), \
        'The matrix has to be diagonally symmetric.'
    assert not np.diagonal(systemic_bias_matrix).any(), \
        'Same elements cannot be systemically biased.'
    return systemic_bias_matrix
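
A small, self-contained numpy sketch (toy data, not the library code) illustrating what the returned matrix encodes: entry ``[i, j]`` is ``True`` when rows ``i`` and ``j`` agree on all unprotected features but carry different ground-truth labels.

import numpy as np

toy_data = np.array([[1, 0],    # column 0 is protected, column 1 is not
                     [0, 0],
                     [1, 1]])
toy_labels = np.array([0, 1, 0])
unprotected = np.delete(toy_data, [0], axis=1)

n_rows = toy_data.shape[0]
toy_matrix = np.zeros((n_rows, n_rows), dtype=bool)
for i in range(n_rows):
    for j in range(n_rows):
        same_unprotected = np.array_equal(unprotected[i], unprotected[j])
        toy_matrix[i, j] = same_unprotected and toy_labels[i] != toy_labels[j]

# Rows 0 and 1 share the unprotected feature value (0) but differ in their
# labels, hence toy_matrix[0, 1] and toy_matrix[1, 0] are the only True cells.
print(toy_matrix)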
Example #3
def group_by_column(
    dataset: np.ndarray,
    column_index: Index,
    groupings: Optional[List[Union[float, Tuple[str]]]] = None,
    numerical_bins_number: int = 5,
    treat_as_categorical: Optional[bool] = None
) -> Tuple[List[List[int]], List[str]]:
    """
    Groups row indices of an array based on value grouping of a chosen column.

    If the selected column is numerical, by default the values are grouped
    into 5 bins distributed uniformly between the minimum and the maximum
    value of the column. The number of bins can be changed with the
    ``numerical_bins_number`` parameter if desired. Alternatively, the exact
    bin boundaries can be given via the ``groupings`` parameter.

    For categorical columns, the default binning is one bin for every unique
    value in the selected column. This behaviour can be changed by providing
    the ``groupings`` parameter, where multiple values can be selected to
    create one bin.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be used for grouping the row indices.
    column_index : Union[string, integer]
        A column index (a string for structured numpy arrays or an integer for
        unstructured arrays) of the column based on which the row indices will
        be partitioned.
    groupings : List[Union[number, Tuple[string]]], optional (default=None)
        A list of user-specified groupings for the selected column. The default
        grouping for categorical (textual) columns is splitting them by all the
        unique values therein. The numerical columns are, by default, binned
        into 5 bins (see the ``numerical_bins_number`` parameter) uniformly
        distributed between the minimum and the maximum value of the column.
        To introduce custom binning for a categorical column ``groupings``
        parameter should be a list of tuples, where every tuple represents a
        single group. For example, a column with the following unique values
        ``['a', 'b', 'c', 'd']`` can be split into two groups: ``['a', 'd']``
        and ``['b', 'c']`` by providing ``[('a', 'd'), ('b', 'c')]`` grouping.
        For numerical columns a custom grouping should be introduced as a
        list of bucket boundaries. Every bucket includes all the values that
        are **less than or equal** to the specified bucket boundary and
        greater than the previous boundary, if one is given.
    numerical_bins_number : integer, optional (default=5)
        The number of bins used for default binning of numerical columns.
    treat_as_categorical : boolean, optional (default=None)
        Whether the selected column should be treated as a categorical or
        numerical feature. If set to ``None``, the type of the column will be
        inferred from the data therein. If set to ``False``, the column will
        be treated as numerical unless it is string-based, in which case a
        warning will be emitted and the column will be treated as categorical
        despite this setting. Finally, if set to ``True``, the column will be
        treated as categorical.

    Warns
    -----
    UserWarning
        When grouping is done on a categorical column a warning is emitted if
        some of the values in that column are not accounted for, i.e. they
        are not included in the ``groupings`` parameter. Also, if some of the
        rows are not included in any of the groupings, a warning is shown.
        Missing row indices may be a result of some of the values being
        not-a-number for a numerical column or of some of the unique values
        not being listed in the grouping for a categorical column. A warning
        is also emitted when the ``treat_as_categorical`` parameter is set to
        ``False`` but the selected feature is string-based (i.e. categorical),
        hence it cannot be treated as numerical.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not 2-dimensional.
    IndexError
        The supplied ``column_index`` is not valid for the input ``dataset``.
    TypeError
        The column index is neither a string nor an integer. The numerical bins
        number is not an integer. The ``groupings`` parameter is neither a list
        nor ``None``. One of the grouping bin boundaries (for a numerical
        feature column) is not a number. One of the groupings (for a
        categorical feature column) is not a tuple. The
        ``treat_as_categorical`` parameter is neither a boolean nor ``None``.
    ValueError
        The input ``dataset`` is not of a base type. The numerical bins number
        is less than 2. The ``groupings`` list is empty. The numbers in the
        ``groupings`` parameter are not monotonically increasing (for a
        numerical column). There are duplicate values shared among tuples in
        the ``grouping`` parameter or one of the values does not appear in the
        selected column (for a categorical column).

    Returns
    -------
    indices_per_bin : List[List[integer]]
        A list of lists, with each inner list holding the row indices of a
        particular group.
    bin_names : List[string]
        A list holding a description of each group.
    """
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input array should be 2-dimensional.')

    if not fuav.is_base_array(dataset):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    # Check index validity
    if isinstance(column_index, (str, int)):
        if not fuat.are_indices_valid(dataset, np.array([column_index])):
            raise IndexError('*{}* is not a valid column index for the input '
                             'dataset.'.format(column_index))
    else:
        raise TypeError('The column index can either be a string or an '
                        'integer.')

    # Check the number of numerical bins
    if isinstance(numerical_bins_number, int):
        if numerical_bins_number < 2:
            raise ValueError('The numerical_bins_number needs to be at least '
                             '2.')
    else:
        raise TypeError('The numerical_bins_number parameter has to be an '
                        'integer.')

    # Check treat_as_categorical
    if treat_as_categorical is not None:
        if not isinstance(treat_as_categorical, bool):
            raise TypeError('The treat_as_categorical parameter has to be a '
                            'boolean.')

    if fuav.is_structured_array(dataset):
        column = dataset[column_index]
    else:
        column = dataset[:, column_index]
    assert fuav.is_1d_array(column), 'This must be a 1D numpy array.'

    # Get a list of all the row indices
    all_row_indices = set(range(column.shape[0]))

    indices_per_bin = []
    bin_names = []

    is_numerical_column = fuav.is_numerical_array(column)
    is_categorical_column = fuav.is_textual_array(column)
    assert is_numerical_column is not is_categorical_column, \
        'The column must be a base array.'

    # Sort out numerical/categorical column treatment
    if treat_as_categorical is None:
        go_numerical = is_numerical_column
    else:
        if treat_as_categorical:
            go_numerical = False
        else:  # Treat as numerical
            if is_numerical_column:
                go_numerical = True
            else:  # Is not numerical
                warnings.warn(
                    'Selected feature is categorical, therefore cannot be '
                    'treated as numerical. The feature will be treated as '
                    'categorical despite the treat_as_categorical parameter '
                    'set to False.', UserWarning)
                go_numerical = False

    if go_numerical:
        if groupings is None:
            # Get default bins
            bins = np.linspace(column.min(),
                               column.max(),
                               num=numerical_bins_number,
                               endpoint=False)[1:].tolist()
        elif isinstance(groupings, list):
            if not groupings:
                raise ValueError('A numerical grouping list has to contain at '
                                 'least one element.')

            # Every element in the groupings list must be a number
            for i, number in enumerate(groupings):
                if not isinstance(number, Number):
                    raise TypeError('For a numerical column all of the '
                                    'grouping items must be numbers. *{}* '
                                    'is not a number.'.format(number))
                if i != 0:
                    if number <= groupings[i - 1]:
                        raise ValueError('The numbers in the groupings list '
                                         'have to be monotonically '
                                         'increasing.')
            bins = groupings
        else:
            raise TypeError('Since a numerical column was chosen the grouping '
                            'must be a list of bin boundaries or None.')

        lower_edge = 'x <= {}'
        middle = '{} < x <= {}'
        upper_edge = '{} < x'

        indices_seen_so_far = set()  # type: Set[int]

        for i, edge in enumerate(bins):
            if i == 0:
                indices = np.where(column <= edge)[0].tolist()

                indices_per_bin.append(indices)
                bin_names.append(lower_edge.format(edge))
            else:
                edge_lower = bins[i - 1]

                indices_l = set(np.where(column <= edge)[0].tolist())
                indices_u = set(np.where(column > edge_lower)[0].tolist())
                indices = list(indices_l.intersection(indices_u))

                indices_per_bin.append(indices)
                bin_names.append(middle.format(edge_lower, edge))

            assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
            indices_seen_so_far = indices_seen_so_far.union(indices)

        assert bins, 'If bins is empty, i and edge will not be defined.'
        # pylint: disable=undefined-loop-variable
        indices = np.where(column > edge)[0].tolist()

        indices_per_bin.append(indices)
        bin_names.append(upper_edge.format(edge))

        assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
        indices_seen_so_far = indices_seen_so_far.union(indices)
    else:
        unique_elements = np.sort(np.unique(column)).tolist()

        if groupings is None:
            bins = [(i, ) for i in unique_elements]
        elif isinstance(groupings, list):
            if not groupings:
                raise ValueError('A categorical grouping list has to contain '
                                 'at least one element.')

            values_seen_so_far = set()  # type: Set[str]

            # Every element in the groupings list must be a valid tuple
            for value_tuple in groupings:
                if not isinstance(value_tuple, tuple):
                    raise TypeError('For a categorical column all of the '
                                    'grouping items must be tuples. *{}* '
                                    'is not a tuple.'.format(value_tuple))
                for value in value_tuple:
                    if value not in unique_elements:
                        raise ValueError('*{}* value is not present in the '
                                         'selected column.'.format(value))

                if values_seen_so_far.intersection(value_tuple):
                    raise ValueError('Some values are duplicated across '
                                     'tuples.')
                values_seen_so_far = values_seen_so_far.union(value_tuple)

            unaccounted_values = set(unique_elements).difference(
                values_seen_so_far)
            if unaccounted_values:
                warnings.warn(
                    'The following values in the selected column were not '
                    'accounted for in the grouping '
                    'tuples:\n{}.'.format(unaccounted_values), UserWarning)

            bins = [tuple(sorted(i)) for i in groupings]  # type: ignore
            bins = sorted(bins)
        else:
            raise TypeError('Since a categorical column was chosen the '
                            'grouping must be a list of tuples representing '
                            'categorical values grouping or None for the '
                            'default grouping.')

        indices_seen_so_far = set()

        for bin_values in bins:
            indices = set()
            for value in bin_values:
                vid = np.where(column == value)[0].tolist()
                indices = indices.union(vid)

            indices_per_bin.append(list(indices))
            bin_names.append('{}'.format(bin_values))

            assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
            indices_seen_so_far = indices_seen_so_far.union(indices)

    # Validate that all of the row indices were accounted for
    missed_indices = all_row_indices.difference(indices_seen_so_far)
    if missed_indices:
        warnings.warn(
            'The following row indices could not be accounted for:\n{}.\n For '
            'a numerical column there may have been some numpy.nan therein. '
            'For a categorical column some of the column values were probably '
            'not specified in the grouping, in which case there should be a '
            'separate user warning.'.format(missed_indices), UserWarning)

    return indices_per_bin, bin_names
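
A standalone sketch (plain numpy, independent of the function above) of the two default grouping strategies: uniform half-open bins for a numerical column and one bin per unique value for a categorical one.

import numpy as np

# Numerical column: numerical_bins_number - 1 internal boundaries spread
# uniformly between the column minimum and maximum.
column = np.array([0.0, 1.0, 2.5, 4.0, 7.5, 10.0])
boundaries = np.linspace(column.min(), column.max(), num=5,
                         endpoint=False)[1:]  # [2., 4., 6., 8.]

indices_per_bin = [np.where(column <= boundaries[0])[0].tolist()]
for lower, upper in zip(boundaries[:-1], boundaries[1:]):
    indices_per_bin.append(
        np.where((column > lower) & (column <= upper))[0].tolist())
indices_per_bin.append(np.where(column > boundaries[-1])[0].tolist())
print(indices_per_bin)  # [[0, 1], [2, 3], [], [4], [5]]

# Categorical column: by default every unique value forms its own bin.
categorical = np.array(['a', 'b', 'a', 'c'])
default_bins = [(value, ) for value in np.sort(np.unique(categorical)).tolist()]
print(default_bins)  # [('a',), ('b',), ('c',)]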
Example #4
    def __init__(self,
                 data: np.ndarray,
                 local_explanation: bool = True,
                 model: object = None,
                 **kwargs: Any) -> None:
        """
        Initialises a tabular LIME wrapper.
        """
        # pylint: disable=too-many-branches,too-many-statements

        warnings.warn(
            'The LIME wrapper will be deprecated in FAT Forensics version '
            '0.0.3. Please consider using the TabularBlimeyLime explainer '
            'class implemented in the fatf.transparency.predictions.'
            'surrogate_explainers module instead. Alternatively, you may '
            'consider building a custom surrogate explainer using the '
            'functionality implemented in FAT Forensics -- see the *Tabular '
            'Surrogates* how-to guide for more details.', FutureWarning)

        valid_params = self._INIT_PARAMS.union(self._EXPLAIN_INSTANCE_PARAMS)
        invalid_params = set(kwargs.keys()).difference(valid_params)
        if invalid_params:
            raise AttributeError('The following named parameters are not '
                                 'valid: {}.'.format(invalid_params))

        # Split parameters
        init_params = {
            key: kwargs[key]
            for key in kwargs if key in self._INIT_PARAMS
        }
        explain_params = {
            key: kwargs[key]
            for key in kwargs if key in self._EXPLAIN_INSTANCE_PARAMS
        }

        # Check data
        if not fuav.is_2d_array(data):
            raise IncorrectShapeError('The data parameter must be a '
                                      '2-dimensional numpy array.')
        if not fuav.is_numerical_array(data):
            raise ValueError('LIME does not support non-numerical data '
                             'arrays.')

        # Honour native local explanation keyword
        local_explanation_keyword = 'sample_around_instance'
        if local_explanation_keyword not in init_params:
            init_params[local_explanation_keyword] = local_explanation

        # Sort out a structured data array
        if fuav.is_structured_array(data):
            categorical_indices_keyword = 'categorical_features'
            categorical_indices = init_params.get(categorical_indices_keyword,
                                                  None)

            if categorical_indices is not None:
                if isinstance(categorical_indices, list):
                    categorical_indices = np.array(categorical_indices)
                elif isinstance(categorical_indices, np.ndarray):
                    pass
                else:
                    raise TypeError('The {} parameter either has to be a '
                                    'list, a numpy array or None.'.format(
                                        categorical_indices_keyword))

                if not fuav.is_1d_array(categorical_indices):
                    raise IncorrectShapeError(
                        '{} array/list is not '
                        '1-dimensional.'.format(categorical_indices_keyword))
                if not fuav.is_textual_array(categorical_indices):
                    raise ValueError('Since {} is an array of indices for '
                                     'a structured array, all of its elements '
                                     'should be strings.'.format(
                                         categorical_indices_keyword))

                # Check categorical indices
                if not fuat.are_indices_valid(data, categorical_indices):
                    raise ValueError(
                        'Indices given in the {} parameter '
                        'are not valid for the input data '
                        'array.'.format(categorical_indices_keyword))
                init_params[categorical_indices_keyword] = np.array(
                    [data.dtype.names.index(y) for y in categorical_indices])

            data = fuat.as_unstructured(data)

        # Get a LIME tabular explainer
        self.mode = init_params.get('mode', 'classification')
        if self.mode not in ['classification', 'regression']:
            raise ValueError("The mode must be either 'classification' or "
                             "'regression'. '{}' given.".format(self.mode))

        self.tabular_explainer = lime.lime_tabular.LimeTabularExplainer(
            data, **init_params)

        # Check the model
        self.model = model
        self.model_is_probabilistic = False
        if model is not None:
            if fumv.check_model_functionality(
                    model, require_probabilities=True, suppress_warning=True):
                self.model_is_probabilistic = True
            elif fumv.check_model_functionality(
                    model, require_probabilities=False, suppress_warning=True):
                self.model_is_probabilistic = False
                logger.warning('The model can only be used for LIME in a '
                               'regressor mode.')
            else:
                raise IncompatibleModelError('LIME requires a model object to '
                                             'have a fit method and '
                                             'optionally a predict_proba '
                                             'method.')

        # Check the predictive function and memorise parameters that may be
        # useful for explaining an instance
        pred_fn_name = 'predict_fn'
        if pred_fn_name in explain_params:
            prediction_function = explain_params[pred_fn_name]
            # Make sure that it is a function
            if not callable(prediction_function):
                raise TypeError('The {} parameter is not callable -- it has '
                                'to be a function.'.format(pred_fn_name))

            # Warn the user if both a model and a function are provided
            if self.model is not None:
                warnings.warn(
                    'Since both a model and a predictive function are '
                    'provided, only the latter will be used.', UserWarning)

        self.explain_instance_params = explain_params
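
A standalone sketch (hypothetical parameter names, not the wrapper's actual attributes) of the **kwargs routing pattern used above: named arguments are either forwarded to the underlying explainer's constructor or memorised for later explain_instance calls, and anything unrecognised is rejected up front.

_INIT_PARAMS = {'mode', 'sample_around_instance', 'categorical_features'}
_EXPLAIN_INSTANCE_PARAMS = {'predict_fn', 'num_features'}

def _split_kwargs(**kwargs):
    valid_params = _INIT_PARAMS.union(_EXPLAIN_INSTANCE_PARAMS)
    invalid_params = set(kwargs).difference(valid_params)
    if invalid_params:
        raise AttributeError('The following named parameters are not '
                             'valid: {}.'.format(invalid_params))
    init_params = {key: value for key, value in kwargs.items()
                   if key in _INIT_PARAMS}
    explain_params = {key: value for key, value in kwargs.items()
                      if key in _EXPLAIN_INSTANCE_PARAMS}
    return init_params, explain_params

init_kwargs, explain_kwargs = _split_kwargs(mode='regression', num_features=3)
print(init_kwargs)     # {'mode': 'regression'}
print(explain_kwargs)  # {'num_features': 3}
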
def test_are_indices_valid():
    """
    Tests :func:`fatf.utils.array.tools.are_indices_valid` function.
    """
    type_error = 'Input arrays should be numpy array-like objects.'
    incorrect_shape_array = 'The input array should be 2-dimensional.'
    incorrect_shape_indices = 'The indices array should be 1-dimensional.'
    with pytest.raises(TypeError) as exin:
        fuat.are_indices_valid(None, np.ones((4, )))
    assert str(exin.value) == type_error
    with pytest.raises(TypeError) as exin:
        fuat.are_indices_valid(None, np.ones((4, 4)))
    assert str(exin.value) == type_error
    with pytest.raises(TypeError) as exin:
        fuat.are_indices_valid(np.ones((4, )), None)
    assert str(exin.value) == type_error
    with pytest.raises(TypeError) as exin:
        fuat.are_indices_valid(np.ones((4, 4)), None)
    assert str(exin.value) == type_error
    # Incorrect shape array
    with pytest.raises(IncorrectShapeError) as exin:
        fuat.are_indices_valid(np.ones((5, )), np.ones((4, 4)))
    assert str(exin.value) == incorrect_shape_array
    with pytest.raises(IncorrectShapeError) as exin:
        fuat.are_indices_valid(np.ones((5, )), np.ones((4, )))
    assert str(exin.value) == incorrect_shape_array
    with pytest.raises(IncorrectShapeError) as exin:
        fuat.are_indices_valid(np.ones((5, 3)), np.ones((4, 4)))
    assert str(exin.value) == incorrect_shape_indices

    assert not fuat.are_indices_valid(NUMERICAL_NP_ARRAY, np.array([0, 2]))
    assert not fuat.are_indices_valid(NUMERICAL_NP_ARRAY, np.array(['a', 1]))
    assert fuat.are_indices_valid(NUMERICAL_NP_ARRAY, np.array([1, 0]))
    #
    assert not fuat.are_indices_valid(NOT_NUMERICAL_NP_ARRAY, np.array([0, 2]))
    assert not fuat.are_indices_valid(NOT_NUMERICAL_NP_ARRAY,
                                      np.array(['a', 1]))  # yapf: disable
    assert fuat.are_indices_valid(NOT_NUMERICAL_NP_ARRAY, np.array([0, 1]))
    #
    assert not fuat.are_indices_valid(NUMERICAL_STRUCTURED_ARRAY,
                                      np.array([0, 'numbers']))
    assert not fuat.are_indices_valid(NUMERICAL_STRUCTURED_ARRAY,
                                      np.array([0]))  # yapf: disable
    assert fuat.are_indices_valid(NUMERICAL_STRUCTURED_ARRAY,
                                  np.array(['complex', 'numbers']))
    #
    assert fuat.are_indices_valid(WIDE_STRUCTURED_ARRAY,
                                  np.array(['complex', 'numbers']))
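
For reference, a small standalone illustration (plain numpy, not the test fixtures) of why structured arrays take string field names while classic arrays take integer column positions:

import numpy as np

structured = np.array([(1, 1.0), (2, 2.0)],
                      dtype=[('numbers', int), ('complex', float)])
print(structured['numbers'])  # field access by name -> [1 2]
# structured[:, 0] would raise an IndexError: positional column indexing is
# only meaningful for classic (unstructured) 2-dimensional arrays, which is
# why integer indices are reported as invalid for structured arrays above.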