Example #1
def systemic_bias(dataset: np.ndarray, ground_truth: np.ndarray,
                  protected_features: List[Index]) -> np.ndarray:
    """
    Checks for systemic bias in a dataset.

    This function checks whether there exist data points that share the same
    unprotected features but differ in protected features. For every such pair
    of data points the labels (ground truth) are compared and, if they differ,
    the pair is flagged as biased. This relation is represented as a square,
    boolean numpy array that indicates, for every pair of data points, whether
    systemic bias exists (``True``).

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be evaluated for systemic bias.
    ground_truth : numpy.ndarray
        The labels corresponding to the dataset.
    protected_features : List[column index]
        A list of column indices in the dataset that hold protected attributes.

    Raises
    ------
    IncorrectShapeError
        The dataset is not a 2-dimensional numpy array, the ground truth is not
        a 1-dimensional numpy array or the number of rows in the dataset is not
        equal to the number of elements in the ground truth array.
    IndexError
        Some of the column indices given in the ``protected_features`` list are
        not valid for the input dataset.
    TypeError
        The ``protected_features`` parameter is not a list.
    ValueError
        There are duplicate values in the protected feature indices list.

    Returns
    -------
    systemic_bias_matrix : numpy.ndarray
        A square, diagonally symmetric, boolean numpy array that indicates
        which pairs of data points share the same unprotected features but
        differ in protected features and in the ground truth annotation.
    """
    # pylint: disable=too-many-branches
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The dataset should be a 2-dimensional '
                                  'numpy array.')
    if not fuav.is_1d_array(ground_truth):
        raise IncorrectShapeError('The ground truth should be a 1-dimensional '
                                  'numpy array.')
    if ground_truth.shape[0] != dataset.shape[0]:
        raise IncorrectShapeError('The number of rows in the dataset and the '
                                  'ground truth should be equal.')
    if isinstance(protected_features, list):
        pfa = np.asarray(protected_features)
        if not fuat.are_indices_valid(dataset, pfa):
            iid = np.sort(fuat.get_invalid_indices(dataset, pfa)).tolist()
            raise IndexError('The following protected feature indices are not '
                             'valid for the dataset array: {}.'.format(iid))
        if len(set(protected_features)) != len(protected_features):
            raise ValueError('Some of the protected indices are duplicated.')
    else:
        raise TypeError('The protected_features parameter should be a list.')

    is_structured = fuav.is_structured_array(dataset)

    if is_structured:
        unprotected_features_array = recfn.drop_fields(dataset,
                                                       protected_features)
        # Needed for numpy<1.18
        if unprotected_features_array is None:
            unprotected_features_array = np.ones(  # pragma: nocover
                (dataset.shape[0], ),
                dtype=[('ones', int)])
    else:
        unprotected_features_array = np.delete(dataset,
                                               protected_features,
                                               axis=1)
        if not unprotected_features_array.size:
            unprotected_features_array = np.ones((dataset.shape[0], 1))

    assert unprotected_features_array.shape[0] == dataset.shape[0], \
        'Must share rows number.'

    systemic_bias_columns = []
    for i in range(unprotected_features_array.shape[0]):
        if is_structured:
            equal_unprotected = (
                unprotected_features_array == unprotected_features_array[i])
        else:
            equal_unprotected = np.apply_along_axis(
                np.array_equal, 1, unprotected_features_array,
                unprotected_features_array[i, :])

        equal_unprotected_indices = np.where(equal_unprotected)

        # Check whether the ground truth is different for these rows
        equal_unprotected[equal_unprotected_indices] = (
            ground_truth[i] != ground_truth[equal_unprotected_indices])
        systemic_bias_columns.append(equal_unprotected)

    systemic_bias_matrix = np.stack(systemic_bias_columns, axis=1)
    assert np.array_equal(systemic_bias_matrix, systemic_bias_matrix.T), \
        'The matrix has to be diagonally symmetric.'
    assert not np.diagonal(systemic_bias_matrix).any(), \
        'Same elements cannot be systemically biased.'
    return systemic_bias_matrix
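
A minimal usage sketch (illustrative; it assumes the function above and its
fatf helper imports are available). The first two rows share the unprotected
feature but differ in the protected feature and the label, so they form a
biased pair:

import numpy as np

# Column 0 is unprotected (e.g. age); column 1 is a protected attribute.
toy_data = np.array([[30, 0],
                     [30, 1],
                     [45, 0]])
toy_labels = np.array([1, 0, 1])

bias_matrix = systemic_bias(toy_data, toy_labels, protected_features=[1])
print(bias_matrix)
# [[False  True False]
#  [ True False False]
#  [False False False]]
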
Example #2
def get_point_distance(
        data_array: np.ndarray, data_point: Union[np.ndarray, np.void],
        distance_function: Callable[[np.ndarray, np.ndarray], float]
) -> np.ndarray:
    """
    Computes the distance between a data point and an array of data.

    This function computes the distances between the ``data_point`` and all
    rows of the ``data_array``.

    Parameters
    ----------
    data_array : numpy.ndarray
        A 2-dimensional numpy array to whose rows distances will be computed.
    data_point : Union[numpy.ndarray, numpy.void]
        A 1-dimensional numpy array or numpy void (for structured data points)
        for which distances to every row of the ``data_array`` will be
        computed.
    distance_function : Callable[[numpy.ndarray, numpy.ndarray], number]
        A Python function that takes as an input two 1-dimensional numpy arrays
        of equal length and outputs a number representing a distance between
        them. **The distance function is assumed to return the same distance
        regardless of the order in which parameters are given.**

    Raises
    ------
    AttributeError
        The distance function does not require exactly two parameters.
    IncorrectShapeError
        The data array is not a 2-dimensional numpy array. The data point is
        not 1-dimensional. The number of columns in the data array is different
        to the number of elements in the data point.
    TypeError
        The data array or the data point is not of a base type (numbers and/or
        strings). The data point and the data array have incomparable dtypes.
        The distance function is not a Python callable (function).

    Returns
    -------
    distances : numpy.ndarray
        A 1-dimensional numerical numpy array with distances between
        ``data_point`` and every row of the ``data_array``.
    """
    assert _validate_get_distance(data_array,
                                  distance_function), 'Invalid input.'

    is_structured = fuav.is_structured_array(data_array)

    if not fuav.is_1d_like(data_point):
        raise IncorrectShapeError('The data point has to be 1-dimensional '
                                  'numpy array or numpy void (for structured '
                                  'arrays).')
    data_point_array = np.asarray([data_point])
    if not fuav.is_base_array(data_point_array):
        raise TypeError('The data point has to be of a base type (strings '
                        'and/or numbers).')
    if not fuav.are_similar_dtype_arrays(data_array, data_point_array):
        raise TypeError('The dtypes of the data set and the data point are '
                        'too different.')
    # Testing only for unstructured as the dtype comparison picks up on a
    # different number of columns in a structured array
    if not is_structured:
        if data_array.shape[1] != data_point_array.shape[1]:
            raise IncorrectShapeError('The data point has different number of '
                                      'columns (features) than the data set.')

    if is_structured:
        distances = np.zeros((data_array.shape[0], ), dtype=np.float64)
        for row_i in range(data_array.shape[0]):
            distances[row_i] = distance_function(data_array[row_i], data_point)
    else:
        distances = np.apply_along_axis(distance_function, 1, data_array,
                                        data_point)

    return distances
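
A usage sketch with a symmetric Euclidean distance (illustrative; it assumes
the function above and its fatf validation helpers are available):

import numpy as np

def euclidean(a: np.ndarray, b: np.ndarray) -> float:
    # Takes exactly two 1-dimensional arrays and is order-independent,
    # as required by get_point_distance.
    return float(np.linalg.norm(a - b))

points = np.array([[0.0, 0.0], [3.0, 4.0], [6.0, 8.0]])
print(get_point_distance(points, np.array([0.0, 0.0]), euclidean))
# [ 0.  5. 10.]
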
Example #3
    def __init__(self,
                 data: np.ndarray,
                 local_explanation: bool = True,
                 model: object = None,
                 **kwargs: Any) -> None:
        """
        Initialises a tabular LIME wrapper.
        """
        # pylint: disable=too-many-branches,too-many-statements
        valid_params = self._INIT_PARAMS.union(self._EXPLAIN_INSTANCE_PARAMS)
        invalid_params = set(kwargs.keys()).difference(valid_params)
        if invalid_params:
            raise AttributeError('The following named parameters are not '
                                 'valid: {}.'.format(invalid_params))

        # Split parameters
        init_params = {
            key: kwargs[key]
            for key in kwargs if key in self._INIT_PARAMS
        }
        explain_params = {
            key: kwargs[key]
            for key in kwargs if key in self._EXPLAIN_INSTANCE_PARAMS
        }

        # Check data
        if not fuav.is_2d_array(data):
            raise IncorrectShapeError('The data parameter must be a '
                                      '2-dimensional numpy array.')
        if not fuav.is_numerical_array(data):
            raise ValueError('LIME does not support non-numerical data '
                             'arrays.')

        # Honour native local explanation keyword
        local_explanation_keyword = 'sample_around_instance'
        if local_explanation_keyword not in init_params:
            init_params[local_explanation_keyword] = local_explanation

        # Sort out a structured data array
        if fuav.is_structured_array(data):
            categorical_indices_keyword = 'categorical_features'
            categorical_indices = init_params.get(categorical_indices_keyword,
                                                  None)

            if categorical_indices is not None:
                if isinstance(categorical_indices, list):
                    categorical_indices = np.array(categorical_indices)
                elif isinstance(categorical_indices, np.ndarray):
                    pass
                else:
                    raise TypeError('The {} parameter has to be either a '
                                    'list, a numpy array or None.'.format(
                                        categorical_indices_keyword))

                if not fuav.is_1d_array(categorical_indices):
                    raise IncorrectShapeError(
                        '{} array/list is not '
                        '1-dimensional.'.format(categorical_indices_keyword))
                if not fuav.is_textual_array(categorical_indices):
                    raise ValueError('Since {} is an array of indices for '
                                     'a structured array, all of its elements '
                                     'should be strings.'.format(
                                         categorical_indices_keyword))

                # Check categorical indices
                if not fuat.are_indices_valid(data, categorical_indices):
                    raise ValueError(
                        'Indices given in the {} parameter '
                        'are not valid for the input data '
                        'array.'.format(categorical_indices_keyword))
                init_params[categorical_indices_keyword] = np.array(
                    [data.dtype.names.index(y) for y in categorical_indices])

            data = fuat.as_unstructured(data)

        # Get a LIME tabular explainer
        self.mode = init_params.get('mode', 'classification')
        if self.mode not in ['classification', 'regression']:
            raise ValueError("The mode must be either 'classification' or "
                             "'regression'. '{}' given.".format(self.mode))

        self.tabular_explainer = lime.lime_tabular.LimeTabularExplainer(
            data, **init_params)

        # Check the model
        self.model = model
        self.model_is_probabilistic = False
        if model is not None:
            if fumv.check_model_functionality(
                    model, require_probabilities=True, suppress_warning=True):
                self.model_is_probabilistic = True
            elif fumv.check_model_functionality(
                    model, require_probabilities=False, suppress_warning=True):
                self.model_is_probabilistic = False
                logger.warning('The model can only be used for LIME in a '
                               'regressor mode.')
            else:
                raise IncompatibleModelError('LIME requires a model object to '
                                             'have a fit method and '
                                             'optionally a predict_proba '
                                             'method.')

        # Check the predictive function and memorise parameters that may be
        # useful for explaining an instance
        pred_fn_name = 'predict_fn'
        if pred_fn_name in explain_params:
            prediction_function = explain_params[pred_fn_name]
            # Make sure that it's a function
            if not callable(prediction_function):
                raise TypeError('The {} parameter is not callable -- it has '
                                'to be a function.'.format(pred_fn_name))

            # Warn the user if both a model and a function are provided
            if self.model is not None:
                warnings.warn(
                    'Since both a model and a predictive function are '
                    'provided, only the latter will be used.', UserWarning)

        self.explain_instance_params = explain_params
Example #4
def _validate_input_discretiser(
        dataset: np.ndarray,
        categorical_indices: Optional[List[Index]] = None,
        feature_names: Optional[List[str]] = None) -> bool:
    """
    Validates the input parameters of an arbitrary discretiser class.

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array with a dataset to be discretised.
    categorical_indices : List[column indices], optional (default=None)
        A list of column indices that should be treated as categorical
        features.
    feature_names : List[strings], optional (default=None)
        A list of feature names in order they appear in the ``dataset`` array.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not a 2-dimensional numpy array.
    IndexError
        Some of the column indices given in the ``categorical_indices`` list
        are invalid for the input ``dataset``.
    TypeError
        The ``dataset`` is not of a base (numerical and/or string) type.
        The ``categorical_indices`` parameter is neither a Python list nor
        ``None``. The ``feature_names`` parameter is neither a Python list nor
        ``None``, or one of its elements (if it is a list) is not a string.
    ValueError
        The length of the ``feature_names`` list is different than the number
        of columns (features) in the input ``dataset``.

    Returns
    -------
    is_valid : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    # pylint: disable=too-many-branches
    is_valid = False

    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input dataset must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_base_array(dataset):
        raise TypeError('The input dataset must be of a base type.')

    if categorical_indices is not None:
        if isinstance(categorical_indices, list):
            invalid_indices = fuat.get_invalid_indices(
                dataset, np.asarray(categorical_indices))
            if invalid_indices.size:
                raise IndexError('The following indices are invalid for the '
                                 'input dataset: {}.'.format(
                                     invalid_indices.tolist()))
        else:
            raise TypeError('The categorical_indices parameter must be a '
                            'Python list or None.')

    if feature_names is not None:
        if isinstance(feature_names, list):
            if fuav.is_structured_array(dataset):
                features_number = len(dataset.dtype.names)
            else:
                features_number = dataset.shape[1]
            if len(feature_names) != features_number:
                raise ValueError('The length of feature_names list must be '
                                 'equal to the number of features (columns) '
                                 'in the input dataset.')

            for name in feature_names:
                if not isinstance(name, str):
                    raise TypeError('All of the feature_names must be '
                                    'strings. The *{}* feature name is not a '
                                    'string.'.format(name))
        else:
            raise TypeError('The feature_names parameter must be a Python '
                            'list or None.')

    is_valid = True
    return is_valid
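
A usage sketch (illustrative; it assumes the function above and its fatf
helpers are available). A plain string array is of a base type, so only the
index and name checks can fail here:

import numpy as np

data = np.array([['0.1', 'low'], ['0.7', 'high']])

# A valid call returns True.
assert _validate_input_discretiser(
    data, categorical_indices=[1], feature_names=['value', 'level'])

# An out-of-range column index raises an IndexError.
_validate_input_discretiser(data, categorical_indices=[5])
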
Example #5
def describe_array(
        array: np.ndarray,
        include: Optional[Union[str, int, List[Union[str, int]]]] = None,
        exclude: Optional[Union[str, int, List[Union[str, int]]]] = None,
        **kwargs: bool
) -> Dict[Union[str, int],
          Union[str, int, float, bool, np.ndarray,
                Dict[str, Union[str, int, float, bool, np.ndarray]]]
          ]:  # yapf: disable
    """
    Describes categorical (textual) and numerical columns in the input array.

    The details of numerical and categorical descriptions can be found in the
    documentation of the
    :func:`fatf.transparency.data.describe_functions.describe_numerical_array`
    and
    :func:`fatf.transparency.data.describe_functions.describe_categorical_array`
    functions respectively.

    To filter out the columns that will be described you can use the
    ``include`` and ``exclude`` parameters. Either of these can be a list of
    column indices; a string or an integer when including or excluding just
    one column; or one of the keywords ``'numerical'`` or ``'categorical'``,
    to indicate that only numerical or categorical columns should be
    included or excluded. By default all columns are described.

    Parameters
    ----------
    array : numpy.ndarray
        The array to be described.
    include : Union[str, int, List[Union[str, int]]], optional (default=None)
        A list of column indices to be included in the description. If
        ``None`` (the default value), all of the columns will be included.
        Alternatively this can be set to a single index (either a string or an
        integer) to compute statistics just for this one column. It is also
        possible to set it to ``'numerical'`` or ``'categorical'`` to just
        include numerical or categorical columns respectively.
    exclude : Union[str, int, List[Union[str, int]]], optional (default=None)
        A list of column indices to be excluded from the description. If
        ``None`` (the default value), none of the columns will be excluded.
        Alternatively this can be set to a single index (either a string or an
        integer) to exclude just one column. It is also possible to set it to
        ``'numerical'`` or ``'categorical'`` to exclude either all numerical or
        all categorical columns respectively.
    **kwargs : bool
        Keyword arguments that are passed to the
        :func:`fatf.transparency.data.describe_functions.describe_numerical_array`
        function responsible for describing numerical arrays.

    Warns
    -----
    UserWarning
        When using ``include`` or ``exclude`` parameters for 1-dimensional
        input arrays (in which case these parameters are ignored).

    Raises
    ------
    IncorrectShapeError
        The input array is neither 1- nor 2-dimensional.
    RuntimeError
        None of the columns were selected to be described.
    ValueError
        The input array is not of a base type (textual and numerical elements).
        The input array has 0 columns.

    Returns
    -------
    description : Dict[Union[str, int],
                       Dict[str, Union[str, int, float, bool, np.ndarray]]]
        For 2-dimensional arrays a dictionary describing every column under a
        key corresponding to its index in the input array. For a 1-dimensional
        input array a dictionary describing that array.
    """
    # pylint: disable=too-many-locals,too-many-branches
    is_1d = fuav.is_1d_like(array)
    if is_1d:
        array = fuat.as_unstructured(array)
        is_2d = False
    else:
        is_2d = fuav.is_2d_array(array)

    if not is_1d and not is_2d:
        raise IncorrectShapeError('The input array should be 1- or '
                                  '2-dimensional.')

    if not fuav.is_base_array(array):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    if is_1d:
        if include is not None or exclude is not None:
            warnings.warn(
                'The input array is 1-dimensional. Ignoring include and '
                'exclude parameters.',
                category=UserWarning)

        if fuav.is_numerical_array(array):
            description = describe_numerical_array(array, **kwargs)
        elif fuav.is_textual_array(array):
            description = describe_categorical_array(array)
        else:  # pragma: no cover
            assert False, 'A base array should either be numerical or textual.'
    elif is_2d:
        numerical_indices, categorical_indices = fuat.indices_by_type(array)
        is_structured_array = fuav.is_structured_array(array)

        if (numerical_indices.shape[0] + categorical_indices.shape[0]) == 0:
            raise ValueError('The input array cannot have 0 columns.')

        numerical_indices_set = set(numerical_indices)
        categorical_indices_set = set(categorical_indices)
        all_indices = categorical_indices_set.union(numerical_indices_set)
        # Indices to be included
        include_indices = _filter_include_indices(categorical_indices_set,
                                                  numerical_indices_set,
                                                  include, all_indices)
        categorical_indices_set, numerical_indices_set = include_indices

        # Indices to be excluded
        exclude_indices = _filter_exclude_indices(categorical_indices_set,
                                                  numerical_indices_set,
                                                  exclude, all_indices)
        categorical_indices_set, numerical_indices_set = exclude_indices

        all_indices = numerical_indices_set.union(categorical_indices_set)
        if len(all_indices) == 0:  # pylint: disable=len-as-condition
            raise RuntimeError('None of the columns were selected to be '
                               'described.')

        description = dict()
        for idx in numerical_indices_set:
            if is_structured_array:
                description[idx] = describe_numerical_array(  # type: ignore
                    array[idx], **kwargs)
            else:
                description[idx] = describe_numerical_array(  # type: ignore
                    array[:, idx], **kwargs)
        for idx in categorical_indices_set:
            if is_structured_array:
                description[idx] = describe_categorical_array(  # type: ignore
                    array[idx])
            else:
                description[idx] = describe_categorical_array(  # type: ignore
                    array[:, idx])
    else:  # pragma: no cover
        assert False, 'The input array can only be 1- or 2-dimensional.'

    return description  # type: ignore
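
A usage sketch for a structured array (illustrative; it assumes the function
above is available -- the exact statistic keys come from
describe_numerical_array and the 'mean' key is assumed here):

import numpy as np

mixed = np.array([(25, 'a'), (30, 'b'), (35, 'a')],
                 dtype=[('age', int), ('group', 'U1')])

# Describe only the numerical columns.
stats = describe_array(mixed, include='numerical')
print(stats['age']['mean'])  # assuming a 'mean' key -> 30.0
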
Example #6
def _validate_input(ice_pdp_array: np.ndarray,
                    feature_linespace: np.ndarray,
                    class_index: int,
                    feature_name: Union[None, str],
                    class_name: Union[None, str],
                    plot_axis: Union[None, plt.Axes],
                    test_partial_dependence: bool = False) -> bool:
    """
    Validates input parameters for ICE and PD plotting functions.

    Validates input parameters for
    :func:`fatf.vis.feature_influence.plot_individual_conditional_expectation`
    and :func:`fatf.vis.feature_influence.plot_partial_dependence` functions.

    Parameters
    ----------
    ice_pdp_array : numpy.ndarray
        An array that contains ICE or PD calculations.
    feature_linespace : numpy.ndarray
        An array that contains the values for which the selected feature was
        sampled.
    class_index : integer
        The index of the class for which the plot will be created.
    feature_name : string or None
        The name of the feature for which ICE or PD was originally calculated.
    class_name : string or None
        The name of the class that ``class_index`` parameter points to.
    plot_axis : matplotlib.pyplot.Axes or None
        A matplotlib axis object to plot on top of.
    test_partial_dependence : boolean
        Whether to treat the input array as PD or ICE calculation result.

    Raises
    ------
    IncorrectShapeError
        The ICE or the PD array has a wrong number of dimensions (3 and 2
        respectively). The feature linespace array has a wrong number of
        dimensions -- 1 is expected.
    IndexError
        The class index is invalid for the input array.
    TypeError
        The class index is not an integer; the feature name is not a string or
        a ``None``; the class name is not a string or a ``None``; the plot axis
        is not a matplotlib.pyplot.Axes type object or a ``None``.
    ValueError
        The input array is structured or is not numerical. The linespace array
        is structured, not numerical or its length does not agree with the
        number of steps in the input array.

    Returns
    -------
    input_is_valid : boolean
        ``True`` if the input is valid, ``False`` otherwise.
    """
    # pylint: disable=too-many-arguments,too-many-branches
    input_is_valid = False

    assert isinstance(test_partial_dependence, bool), \
        'test_partial_dependence is not a boolean.'

    if fuav.is_structured_array(ice_pdp_array):
        raise ValueError('The input array cannot be a structured array.')
    if not fuav.is_numerical_array(ice_pdp_array):
        raise ValueError('The input array has to be a numerical array.')

    if test_partial_dependence:
        if len(ice_pdp_array.shape) != 2:
            raise IncorrectShapeError('plot_partial_dependence expects a '
                                      '2-dimensional array of shape (n_steps, '
                                      'n_classes).')
    else:
        if len(ice_pdp_array.shape) != 3:
            raise IncorrectShapeError('plot_individual_conditional_expectation '
                                      'expects a 3-dimensional array of shape '
                                      '(n_samples, n_steps, n_classes).')

    if fuav.is_structured_array(feature_linespace):
        raise ValueError('The linespace array cannot be a structured array.')
    if not fuav.is_1d_array(feature_linespace):
        raise IncorrectShapeError('The linespace array has to be a '
                                  '1-dimensional array of shape (n_steps, ).')
    if not fuav.is_numerical_array(feature_linespace):
        raise ValueError('The linespace array has to be numerical.')
    if feature_linespace.shape[0] != ice_pdp_array.shape[-2]:
        raise ValueError('The length of the linespace array ({}) does not '
                         'agree with the number of linespace steps ({}) in '
                         'the input array.'.format(feature_linespace.shape[0],
                                                   ice_pdp_array.shape[-2]))

    # Is the index valid for the array
    if not isinstance(class_index, int):
        raise TypeError('Class index has to be an integer.')
    if class_index < 0 or class_index >= ice_pdp_array.shape[-1]:
        raise IndexError('Class index {} is not a valid index for the '
                         'input array. There are only {} classes '
                         'available.'.format(class_index,
                                             ice_pdp_array.shape[-1]))

    if feature_name is not None and not isinstance(feature_name, str):
        raise TypeError('The feature name has to be either None or a string.')

    if class_name is not None and not isinstance(class_name, str):
        raise TypeError('The class name has to be either None or a string.')

    if plot_axis is not None and not isinstance(plot_axis, plt.Axes):
        raise TypeError('The plot axis has to be either None or a matplotlib.'
                        'pyplot.Axes type object.')

    input_is_valid = True
    return input_is_valid
Example #7
def partial_dependence_ice(
        ice_array: np.ndarray,
        include_rows: Optional[Union[int, List[int]]] = None,
        exclude_rows: Optional[Union[int, List[int]]] = None) -> np.ndarray:
    """
    Calculates Partial Dependence based on Individual Conditional Expectations.

    .. note:: If you want to calculate Partial Dependence directly from a
       dataset and a classifier please see
       :func:`fatf.transparency.models.feature_influence.partial_dependence`
       function.

    Parameters
    ----------
    ice_array : numpy.ndarray
        Individual Conditional Expectation array for which Partial Dependence
        is desired.
    include_rows : Union[int, List[int]], optional (default=None)
        Indices of rows that will be included in the PD calculation. If this
        parameter is specified, PD will only be calculated for the selected
        rows. If additionally ``exclude_rows`` is specified the selected rows
        will be a set difference between the two. This parameter can either be
        a *list* of indices or a single index (integer).
    exclude_rows : Union[int, List[int]], optional (default=None)
        The indices of rows to be excluded from the PD calculation. If this
        parameter is specified and ``include_rows`` is not, these indices will
        be excluded from all of the rows. If both include and exclude
        parameters are specified, the rows included in the PD calculation will
        be a set difference of the two. This parameter can either be a *list*
        of indices or a single index (integer).

    Raises
    ------
    IncorrectShapeError
        The input array is not a 3-dimensional numpy array.
    TypeError
        Either ``include_rows`` or ``exclude_rows`` parameter is not ``None``,
        an integer or a list of integers.
    ValueError
        The ``ice_array`` is not an unstructured numpy array or it is not a
        numerical array. One of the ``include_rows`` or ``exclude_rows``
        indices is not valid for the input array.

    Returns
    -------
    partial_dependence_array : numpy.ndarray
        A 2-dimensional array of (steps_number, n_classes) shape representing
        Partial Dependence for all of the classes for selected rows (data
        points).
    """
    if fuav.is_structured_array(ice_array):
        raise ValueError('The ice_array should not be structured.')
    if not fuav.is_numerical_array(ice_array):
        raise ValueError('The ice_array should be purely numerical.')
    if len(ice_array.shape) != 3:
        raise IncorrectShapeError('The ice_array should be 3-dimensional.')

    rows_number = ice_array.shape[0]
    include_r = _filter_rows(include_rows, exclude_rows, rows_number)
    filtered_ice_array = ice_array[include_r]

    partial_dependence_array = filtered_ice_array.mean(axis=0)

    return partial_dependence_array
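
The reduction itself is a per-class mean over the instance axis; a
self-contained numpy sketch of what the function computes after row
filtering:

import numpy as np

# A toy ICE array: 4 instances, 3 interpolation steps, 2 classes.
ice = np.random.rand(4, 3, 2)

pd_all = ice.mean(axis=0)             # what partial_dependence_ice returns
pd_subset = ice[[0, 2]].mean(axis=0)  # the effect of include_rows=[0, 2]
assert pd_all.shape == (3, 2)         # (steps_number, n_classes)
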
Example #8
def _interpolate_array(
        dataset: np.ndarray,
        feature_index: Union[int, str],  # yapf: disable
        treat_as_categorical: bool,
        steps_number: Union[int, None]) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generates a 3-D array with interpolated values for the selected feature.

    If the selected feature is numerical the interpolated values are a
    numerical array with evenly spaced numbers between the minimum and the
    maximum value in that column. Otherwise, when the feature is categorical,
    the interpolated values are all the unique elements of that column.

    To get the interpolation the original 2-D dataset is stacked on top of
    itself the number of times equal to the number of desired interpolation
    samples. Then, for every copy of that dataset the selected feature is fixed
    to consecutive values of the interpolated array (the same value for the
    whole copy of the dataset).

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset based on which interpolation will be done.
    feature_index : Union[integer, string]
        An index of the feature column in the input dataset for which the
        interpolation will be computed.
    treat_as_categorical : boolean
        Whether to treat the selected feature as categorical or numerical.
    steps_number : Union[integer, None]
        The number of evenly spaced samples between the minimum and the maximum
        value of the selected feature for which the model's prediction will be
        evaluated. This parameter applies only to numerical features; for
        categorical features it is ignored regardless of whether it is a
        number or ``None``.

    Returns
    -------
    interpolated_data : numpy.ndarray
        Numpy array of shape (n_samples, steps_number, n_features) -- where the
        (n_samples, n_features) is the dimension of the input ``dataset`` --
        holding the input ``dataset`` augmented with the interpolated values.
    interpolated_values : numpy.ndarray
        A 1-dimensional array of shape (steps_number, ) holding the
        interpolated values. If a numerical column is selected this will be a
        series of uniformly distributed ``steps_number`` values between the
        minimum and the maximum value of that column. For categorical (textual)
        columns it will hold all the unique values from that column.
    """
    assert isinstance(dataset, np.ndarray), 'Dataset -> numpy array.'
    assert isinstance(feature_index, (int, str)), 'Feature index -> str/ int.'
    assert isinstance(treat_as_categorical, bool), 'As categorical -> bool.'
    assert steps_number is None or isinstance(steps_number, int), \
        'Steps number -> None/ int.'

    is_structured = fuav.is_structured_array(dataset)

    if is_structured:
        column = dataset[feature_index]
    else:
        column = dataset[:, feature_index]

    if treat_as_categorical:
        interpolated_values = np.unique(column)
        interpolated_values.sort()
        # Ignoring steps number -- not needed for categorical.
        steps_number = interpolated_values.shape[0]
    else:
        assert isinstance(steps_number, int), 'Steps number must be an int.'
        interpolated_values = np.linspace(column.min(), column.max(),
                                          steps_number)

        # Give float type to this column if it is a structured array
        if (is_structured
                and dataset.dtype[feature_index] != interpolated_values.dtype):
            new_types = []
            for name in dataset.dtype.names:
                if name == feature_index:
                    dtype = fuat.generalise_dtype(interpolated_values.dtype,
                                                  dataset.dtype[name])
                    new_types.append((name, dtype))
                else:
                    new_types.append((name, dataset.dtype[name]))
            dataset = dataset.astype(new_types)
        elif not is_structured and dataset.dtype != interpolated_values.dtype:
            dtype = fuat.generalise_dtype(interpolated_values.dtype,
                                          dataset.dtype)
            dataset = dataset.astype(dtype)

    interpolated_data = np.repeat(dataset[:, np.newaxis], steps_number, axis=1)
    assert len(interpolated_values) == steps_number, 'Required for broadcast.'
    if is_structured:
        for idx in range(steps_number):
            # Broadcast the new value.
            interpolated_data[:, idx][feature_index] = interpolated_values[idx]
    else:
        # Broadcast the new vector.
        interpolated_data[:, :, feature_index] = interpolated_values

    return interpolated_data, interpolated_values
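
The stacking described above can be sketched with plain numpy
(self-contained and illustrative, for an unstructured array with the first
feature selected):

import numpy as np

data = np.array([[1.0, 10.0],
                 [2.0, 20.0]])  # 2 samples, 2 features
steps = np.linspace(data[:, 0].min(), data[:, 0].max(), 3)  # [1., 1.5, 2.]

# Stack the dataset once per interpolation step...
stacked = np.repeat(data[:, np.newaxis], steps.shape[0], axis=1)
# ...and broadcast the interpolated values into the selected column.
stacked[:, :, 0] = steps
print(stacked.shape)  # (2, 3, 2) -- (n_samples, steps_number, n_features)
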
Example #9
def individual_conditional_expectation(
    dataset: np.ndarray,
    model: object,
    feature_index: Union[int, str],
    treat_as_categorical: Optional[bool] = None,
    steps_number: Optional[int] = None,
    include_rows: Optional[Union[int, List[int]]] = None,
    exclude_rows: Optional[Union[int, List[int]]] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Calculates Individual Conditional Expectation for a selected feature.

    Based on the provided dataset and model this function computes Individual
    Conditional Expectation (ICE) of a selected feature for all target classes.
    If ``treat_as_categorical`` parameter is not provided the function will
    infer the type of the selected feature and compute the appropriate ICE.
    Otherwise, the user can specify whether the selected feature should be
    treated as a categorical or numerical feature. If the selected feature is
    numerical, you can specify the number of samples between this feature's
    minimum and maximum value for which the input model will be evaluated.
    By default this value is set to 100.

    Finally, it is possible to filter the rows of the input dataset that will
    be used to calculate ICE with ``include_rows`` and ``exclude_rows``
    parameters. If ``include_rows`` is specified ICE will only be calculated
    for these rows. If both include and exclude parameters are given, ICE will
    be computed for the set difference. Finally, if only the exclude parameter
    is specified, these rows will be subtracted from the whole dataset.

    This approach is an implementation of a method introduced by
    [GOLDSTEIN2015PEEKING]_. It is intended to be used with probabilistic
    models, therefore the input model must have a ``predict_proba`` method.

    .. [GOLDSTEIN2015PEEKING] Goldstein, A., Kapelner, A., Bleich, J. and
       Pitkin, E., 2015. Peeking inside the black box: Visualizing statistical
       learning with plots of individual conditional expectation. Journal of
       Computational and Graphical Statistics, 24(1), pp.44-65.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset based on which ICE will be computed.
    model : object
        A fitted model whose predictions will be used to calculate ICE. (Please
        see :class:`fatf.utils.models.models.Model` class documentation for the
        expected model object specification.)
    feature_index : Union[integer, string]
        An index of the feature column in the input dataset for which ICE will
        be computed.
    treat_as_categorical : boolean, optional (default=None)
        Whether to treat the selected feature as categorical or numerical.
    steps_number : integer, optional (default=None, i.e. 100)
        The number of evenly spaced samples between the minimum and the maximum
        value of the selected feature for which the model's prediction will be
        evaluated. (This parameter applies only to numerical features.)
    include_rows : Union[int, List[int]], optional (default=None)
        Indices of rows that will be included in the ICE calculation. If this
        parameter is specified, ICE will only be calculated for the selected
        rows. If additionally ``exclude_rows`` is specified the selected rows
        will be a set difference between the two. This parameter can either be
        a *list* of indices or a single index (integer).
    exclude_rows : Union[int, List[int]], optional (default=None)
        The indices of rows to be excluded from the ICE calculation. If this
        parameter is specified and ``include_rows`` is not, these indices will
        be excluded from all of the rows. If both include and exclude
        parameters are specified, the rows included in the ICE calculation will
        be a set difference of the two. This parameter can either be a *list*
        of indices or a single index (integer).

    Warns
    -----
    UserWarning
        The feature is treated as categorical but the number of steps parameter
        is provided (not ``None``). In this case the ``steps_number`` parameter
        is ignored. Also, the user is warned when the selected feature is
        detected to be categorical (textual) while the user indicated that it
        is numerical.

    Raises
    ------
    IncompatibleModelError
        The model does not have required functionality -- it needs to be able
        to output probabilities via ``predict_proba`` method.
    IncorrectShapeError
        The input dataset is not a 2-dimensional numpy array.
    IndexError
        Provided feature (column) index is invalid for the input dataset.
    TypeError
        ``treat_as_categorical`` is not ``None`` or boolean. The
        ``steps_number`` parameter is not ``None`` or integer. Either
        ``include_rows`` or ``exclude_rows`` parameter is not ``None``, an
        integer or a list of integers.
    ValueError
        The input dataset must only contain base types (textual and numerical
        values). One of the ``include_rows`` or ``exclude_rows`` indices is not
        valid for the input dataset. The ``steps_number`` is smaller than 2.

    Returns
    -------
    ice : numpy.ndarray
        An array of Individual Conditional Expectations for all of the selected
        dataset rows and the feature (dataset column) of choice. It's of the
        (n_samples, steps_number, n_classes) shape where n_samples is the
        number of rows selected from the dataset for the ICE computation,
        steps_number is the number of generated samples for the selected
        feature and n_classes is the number of classes in the target of the
        dataset. The numbers in this array represent the probability of every
        class for every selected data point when the selected feature is fixed
        to one of the values in the generated feature linespace (see below).
    feature_linespace : numpy.ndarray
        A one-dimensional array -- (steps_number, ) -- with the values for
        which the selected feature was substituted when the dataset was
        evaluated with the specified model.
    """
    # pylint: disable=too-many-arguments,too-many-locals
    assert _input_is_valid(dataset, model, feature_index, treat_as_categorical,
                           steps_number), 'Input must be valid.'

    is_structured = fuav.is_structured_array(dataset)

    if is_structured:
        column = dataset[feature_index]
    else:
        column = dataset[:, feature_index]
    assert fuav.is_1d_array(column), 'Column must be a 1-dimensional array.'

    if fuav.is_numerical_array(column):
        is_categorical_column = False
    elif fuav.is_textual_array(column):
        is_categorical_column = True
    else:
        assert False, 'Must be an array of a base type.'  # pragma: nocover

    # If needed, infer the column type.
    if treat_as_categorical is None:
        treat_as_categorical = is_categorical_column
    elif not treat_as_categorical and is_categorical_column:
        message = ('Selected feature is categorical (string-based elements), '
                   'however the treat_as_categorical was set to False. Such '
                   'a combination is not possible. The feature will be '
                   'treated as categorical.')
        warnings.warn(message, category=UserWarning)
        treat_as_categorical = True
        steps_number = None

    if treat_as_categorical and steps_number is not None:
        warnings.warn(
            'The steps_number parameter will be ignored as the feature is '
            'being treated as categorical.',
            category=UserWarning)

    # If needed, get the default steps number.
    if not treat_as_categorical and steps_number is None:
        steps_number = 100

    rows_number = dataset.shape[0]
    include_r = _filter_rows(include_rows, exclude_rows, rows_number)
    filtered_dataset = dataset[include_r]

    sampled_data, feature_linespace = _interpolate_array(
        filtered_dataset, feature_index, treat_as_categorical, steps_number)

    ice = [
        model.predict_proba(data_slice)  # type: ignore
        for data_slice in sampled_data
    ]
    ice = np.stack(ice, axis=0)

    return ice, feature_linespace
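
A usage sketch (illustrative; ``dataset``, ``clf`` and ``n_classes`` are
assumed, not defined here: a 2-dimensional base-type array and a fitted
probabilistic model with a predict_proba method):

ice, linespace = individual_conditional_expectation(
    dataset, clf, feature_index=0, steps_number=50)
# ice.shape == (dataset.shape[0], 50, n_classes)
# linespace.shape == (50, )
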
Example #10
def merge_ice_arrays(ice_arrays_list: List[np.ndarray]) -> np.ndarray:
    """
    Merges multiple Individual Conditional Expectation arrays.

    This function allows you to merge Individual Conditional Expectation arrays
    into a single array as long as they were calculated for the same feature
    and for the same number of classes. This may be helpful when evaluating ICE
    for a model over multiple cross-validation folds or for multiple models.

    Parameters
    ----------
    ice_arrays_list : List[numpy.ndarray]
        A list of Individual Conditional Expectation arrays to be merged.

    Raises
    ------
    IncorrectShapeError
        One of the ICE arrays is not 3-dimensional.
    TypeError
        The ``ice_arrays_list`` input parameter is not a list.
    ValueError
        The list of ICE arrays to be merged is empty. One of the ICE arrays is
        not a numerical array. One of the ICE arrays is structured. Some of the
        ICE arrays do not share the same second (number of steps) or third
        (number of classes) dimension or type.

    Returns
    -------
    ice_arrays : numpy.ndarray
        All of the ICE arrays merged together alongside the first dimension
        (number of instances).
    """
    if isinstance(ice_arrays_list, list):
        if not ice_arrays_list:
            raise ValueError('Cannot merge 0 arrays.')

        previous_shape = None
        for ice_array in ice_arrays_list:
            if not fuav.is_numerical_array(ice_array):
                raise ValueError('The ice_array list should only contain '
                                 'numerical arrays.')
            if fuav.is_structured_array(ice_array):
                raise ValueError('The ice_array list should only contain '
                                 'unstructured arrays.')
            if len(ice_array.shape) != 3:
                raise IncorrectShapeError('The ice_array should be '
                                          '3-dimensional.')

            if previous_shape is None:
                previous_shape = (ice_array.shape[1], ice_array.shape[2],
                                  ice_array.dtype)  # yapf: disable
            elif (previous_shape[:2] != ice_array.shape[1:]
                  or previous_shape[2] != ice_array.dtype):
                raise ValueError('All of the ICE arrays need to be '
                                 'constructed for the same number of classes '
                                 'and the same number of samples for the '
                                 'selected feature (the second and the third '
                                 'dimension of the ice array).')
    else:
        raise TypeError('The ice_arrays_list should be a list of numpy arrays '
                        'that represent Individual Conditional Expectation.')

    ice_arrays = np.concatenate(ice_arrays_list, axis=0)
    return ice_arrays
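
A usage sketch merging two cross-validation folds (illustrative; it assumes
the function above is available):

import numpy as np

# ICE arrays for 4 and 6 instances, both with 10 steps and 3 classes.
fold_a = np.random.rand(4, 10, 3)
fold_b = np.random.rand(6, 10, 3)

merged = merge_ice_arrays([fold_a, fold_b])
assert merged.shape == (10, 10, 3)  # instances are concatenated
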
Example #11
def as_unstructured(
        array_like: Union[np.ndarray, np.void],
        **kwargs: Optional[np.dtype]) -> Union[np.dtype, np.ndarray]:
    """
    Converts an array like object into an unstructured array.

    If the input array is unstructured, it is returned without any
    transformation. Otherwise, if the input array is either a structured array
    or a structured array row, the appropriate structured-to-unstructured
    function is called.

    .. warning:: Since this function either calls a local implementation or a
       builtin numpy function there may be some inconsistencies in its
       behaviour. One that we are aware of is conversion of arrays that contain
       ``'V'`` -- raw data (void), ``'O'`` -- (Python) objects, ``'M'`` --
       datetime or ``'m'`` -- timedelta dtypes. These types are not supported
       by the local implementation, however some of them are supported by the
       numpy built-in, e.g. the ``'V'`` type.

    Parameters
    ----------
    array_like : Union[numpy.ndarray, numpy.void]
        An array, a structured array or a row of a structured numpy array to be
        converted into a plain numpy array representation.
    **kwargs : Optional[numpy.dtype]
        Named parameters that are passed to the appropriate structured to
        unstructured array converter. These parameters are ignored when calling
        any of the local implementations -- see either
        :func:`fatf.utils.array.tools.structured_to_unstructured_row` or
        :func:`fatf.utils.array.tools.structured_to_unstructured` documentation
        for details.

    Raises
    ------
    TypeError
        The input array is not a numpy array, a structured numpy array or a row
        of a structured numpy array.
    ValueError
        The input array consists of complex types such as numpy void and
        object-like types that are not supported by this function.

    Returns
    -------
    classic_array : Union[numpy.dtype, numpy.ndarray]
        A classic numpy array or numpy dtype (in case a structured row has
        just one element) representation of the ``array_like`` object with the
        most generic type out of the input's dtypes.
    """
    if isinstance(array_like, np.void):
        assert fuav.is_structured_row(array_like), \
            'numpy.void has to be a row of a structured numpy array.'
        classic_array = structured_to_unstructured_row(array_like, **kwargs)
    elif isinstance(array_like, np.ndarray):
        if fuav.is_structured_array(array_like):
            classic_array = structured_to_unstructured(array_like, **kwargs)
        else:
            if fuav.is_base_array(array_like):
                classic_array = array_like
            else:
                raise ValueError('as_unstructured only supports conversion of '
                                 'arrays that hold base numpy types, i.e. '
                                 'numerical and string-like -- numpy void and '
                                 'object-like types are not allowed.')
    else:
        raise TypeError('The input should either be a numpy (structured or '
                        'unstructured) array-like object (numpy.ndarray) or a '
                        'row of a structured numpy array (numpy.void).')
    return classic_array
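
A usage sketch (illustrative; it assumes the function above and its
converters are available). The two fields are promoted to the most generic
shared dtype:

import numpy as np

structured = np.array([(1, 2.5), (3, 4.5)],
                      dtype=[('a', int), ('b', float)])
print(as_unstructured(structured))
# [[1.  2.5]
#  [3.  4.5]]
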
Example #12
def indices_by_type(array: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Identifies indices of columns with numerical and non-numerical values.

    Checks whether a numpy array is purely numerical or a structured array
    and returns two numpy arrays: the first one with the indices of numerical
    columns and the second one with the indices of non-numerical columns.

    Parameters
    ----------
    array : numpy.ndarray
        A numpy array to be checked (it has to be a 2-dimensional array).

    Raises
    ------
    TypeError
        The input array is not a numpy array-like object.
    ValueError
        The input array consists of complex types such as numpy void and
        object-like types that are not supported by this function.
    IncorrectShapeError
        The input array is not 2-dimensional.

    Returns
    -------
    numerical_indices : numpy.ndarray
        A numpy array containing indices of the numerical columns of the input
        array.
    non_numerical_indices : numpy.ndarray
        A numpy array containing indices of the non-numerical columns of the
        input array.
    """
    if not isinstance(array, np.ndarray):
        raise TypeError('The input should be a numpy array-like.')
    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')
    if not fuav.is_base_array(array):
        raise ValueError('indices_by_type only supports input arrays that '
                         'hold base numpy types, i.e. numerical and '
                         'string-like -- numpy void and object-like types are '
                         'not allowed.')

    if fuav.is_structured_array(array):
        assert len(array.dtype) > 1, 'This should be a 2D array.'
        numerical_indices_list = []
        non_numerical_indices_list = []

        for column_name in array.dtype.names:
            column_dtype = array.dtype[column_name]
            if fuav.is_numerical_dtype(column_dtype):
                numerical_indices_list.append(column_name)
            else:
                non_numerical_indices_list.append(column_name)

        numerical_indices = np.array(numerical_indices_list)
        non_numerical_indices = np.array(non_numerical_indices_list)
    else:
        if fuav.is_numerical_array(array):
            numerical_indices = np.array(range(array.shape[1]))
            non_numerical_indices = np.empty((0, ), dtype='i8')
        else:
            numerical_indices = np.empty((0, ), dtype='i8')
            non_numerical_indices = np.array(range(array.shape[1]))

    return numerical_indices, non_numerical_indices
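
A usage sketch (illustrative; it assumes the function above and its fatf
helpers are available). Structured arrays yield column names, unstructured
arrays yield positional indices:

import numpy as np

mixed = np.array([(1, 'x'), (2, 'y')], dtype=[('num', int), ('cat', 'U1')])
print(indices_by_type(mixed))  # column names: ['num'] and ['cat']

plain = np.ones((2, 2))
print(indices_by_type(plain))  # positions: [0 1] and an empty array
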
Example #13
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """
        Calculates label probabilities for new instances with the fitted model.

        Parameters
        ----------
        X : numpy.ndarray
            The data for which labels probabilities will be predicted.

        Raises
        ------
        IncorrectShapeError
            X is not a 2-dimensional array, it has 0 rows or it has a different
            number of columns than the training data.
        UnfittedModelError
            Raised when trying to predict data when the model has not been
            fitted yet. Try using the ``fit`` method to fit the model first.
        RuntimeError
            Raised when trying to use this method when the predictor is
            initialised as a regressor.
        ValueError
            X has a different dtype than the data used to fit the model.

        Returns
        -------
        probabilities : numpy.ndarray
            Probabilities of each instance belonging to every class. The
            columns of the returned array correspond to the class labels in
            lexicographic order.
        """
        if not self._is_classifier:
            raise RuntimeError('This functionality is not available for a '
                               'regressor.')

        if not self._is_fitted:
            raise UnfittedModelError('This model has not been fitted yet.')
        if not fuav.is_2d_array(X):
            raise IncorrectShapeError('X must be a 2-dimensional array. If '
                                      'you want to predict a single data '
                                      'point please format it as a single row '
                                      'in a 2-dimensional array.')
        if not fuav.are_similar_dtype_arrays(X, self._X):
            raise ValueError('X must have the same dtype as the training '
                             'data.')
        if not X.shape[0]:
            raise IncorrectShapeError('X must have at least one row.')
        # No need to check for columns in a structured array -> this is handled
        # by the dtype checker.
        if not fuav.is_structured_array(X):
            if X.shape[1] != self._X.shape[1]:
                raise IncorrectShapeError(('X must have the same number of '
                                           'columns as the training data '
                                           '({}).').format(self._X.shape[1]))

        probabilities = np.empty((X.shape[0], self._unique_y.shape[0]))

        if self._k < self._X_n:
            distances = self._get_distances(X)
            knn = np.argpartition(distances, self._k, axis=0)
            probabilities = []
            for column in knn.T:
                close_labels = self._y[column[:self._k]]
                values, counts = np.unique(close_labels, return_counts=True)
                total_counts = np.sum(counts)
                probs = np.zeros((self._unique_y.shape[0], ))
                for i in range(values.shape[0]):
                    ind = np.where(self._unique_y == values[i])[0]
                    probs[ind] = counts[i] / total_counts
                probabilities.append(probs)
            probabilities = np.array(probabilities)
        else:
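            # k is at least the number of training points, so every query's
            # neighbourhood is the whole training set and each instance is
            # assigned the global class frequencies computed when fitting.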
            probabilities = np.tile(self._unique_y_probabilities,
                                    (X.shape[0], 1))
        return probabilities
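
The per-column counting above can be reproduced standalone. A sketch of how
a single query's probability vector lines up with the lexicographically
sorted class labels (plain numpy, mirroring the loop body rather than
calling the class):

import numpy as np

unique_y = np.array(['a', 'b', 'c'])      # plays the role of self._unique_y
close_labels = np.array(['b', 'a', 'b'])  # labels of the k=3 nearest points
values, counts = np.unique(close_labels, return_counts=True)
probs = np.zeros((unique_y.shape[0], ))
for value, count in zip(values, counts):
    probs[unique_y == value] = count / counts.sum()
print(probs)  # [0.3333... 0.6666... 0.] -- one entry per sorted label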
Example #14
    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predicts labels of new instances with the fitted model.

        Parameters
        ----------
        X : numpy.ndarray
            The data for which labels will be predicted.

        Raises
        ------
        IncorrectShapeError
            X is not a 2-dimensional array, has 0 rows, or has a different
            number of columns than the training data.
        UnfittedModelError
            Raised when trying to predict before the model has been fitted.
            Try using the ``fit`` method to fit the model first.
        ValueError
            X has a different dtype than the data used to fit the model.

        Returns
        -------
        predictions : numpy.ndarray
            Predicted class labels for each data point (or numerical
            predictions when the model is initialised as a regressor).
        """
        # pylint: disable=too-many-locals,too-many-branches
        if not self._is_fitted:
            raise UnfittedModelError('This model has not been fitted yet.')
        if not fuav.is_2d_array(X):
            raise IncorrectShapeError('X must be a 2-dimensional array. If '
                                      'you want to predict a single data '
                                      'point please format it as a single row '
                                      'in a 2-dimensional array.')
        if not fuav.are_similar_dtype_arrays(X, self._X):
            raise ValueError('X must have the same dtype as the training '
                             'data.')
        if not X.shape[0]:
            raise IncorrectShapeError('X must have at least one row.')
        # No need to check for columns in a structured array -> this is handled
        # by the dtype checker.
        if not fuav.is_structured_array(X):
            if X.shape[1] != self._X.shape[1]:
                raise IncorrectShapeError(('X must have the same number of '
                                           'columns as the training data '
                                           '({}).').format(self._X.shape[1]))

        if self._k < self._X_n:
            distances = self._get_distances(X)
            # If, for example, the 3 nearest neighbours are within distances
            # 1, 2 and 2 and k is set to 2, then argpartition will always
            # take the first point within distance 2.
            knn = np.argpartition(distances, self._k, axis=0)
            predictions = []
            for column in knn.T:
                close_labels = self._y[column[:self._k]]
                if self._is_classifier:
                    values, counts = np.unique(close_labels,
                                               return_counts=True)
                    # If there is a tie in the counts, resolve it by taking
                    # the overall label counts in the training data into
                    # consideration.
                    top_label_index = counts == counts.max()
                    top_label_unique_sorted = np.sort(values[top_label_index])
                    assert len(top_label_unique_sorted.shape) == 1, \
                        'This should be a flat array.'
                    if top_label_unique_sorted.shape[0] > 1:
                        # Resolve the tie.
                        # Get the counts of these labels in the training data.
                        labels_filter = np.array(self._unique_y.shape[0] *
                                                 [False])
                        for top_prediction in top_label_unique_sorted:
                            unique_y_filter = self._unique_y == top_prediction
                            np.logical_or(labels_filter,
                                          unique_y_filter,
                                          out=labels_filter)
                        g_top_label = self._unique_y[labels_filter]
                        g_top_label_counts = (
                            self._unique_y_counts[labels_filter])

                        # If several of the global labels share the maximum
                        # count, the lexicographically smallest one wins.
                        g_top_label_index = g_top_label_counts == np.max(
                            g_top_label_counts)
                        g_top_label_sorted = np.sort(
                            g_top_label[g_top_label_index])

                        prediction = g_top_label_sorted[0]
                    else:
                        prediction = top_label_unique_sorted[0]
                else:
                    prediction = close_labels.mean()

                predictions.append(prediction)
            predictions = np.array(predictions)
        else:
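            # k is at least the number of training points, so every instance
            # receives the majority label (or the mean of the target vector
            # for a regressor) computed when fitting.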
            predictions = np.array(X.shape[0] * [self._majority_label])

        return predictions
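
The two-stage tie-breaking above is easiest to see on a worked example. A
sketch assuming k=2 with a local tie (np.isin replaces the explicit
np.logical_or loop; the logic is otherwise the same):

import numpy as np

close_labels = np.array(['a', 'b'])  # the k=2 nearest labels -- a local tie
unique_y = np.array(['a', 'b'])      # self._unique_y
unique_y_counts = np.array([3, 5])   # self._unique_y_counts ('b' dominates)
values, counts = np.unique(close_labels, return_counts=True)
top = np.sort(values[counts == counts.max()])
if top.shape[0] > 1:
    # Local tie: fall back to the global training counts.
    mask = np.isin(unique_y, top)
    g_labels, g_counts = unique_y[mask], unique_y_counts[mask]
    # A remaining global tie is broken lexicographically.
    prediction = np.sort(g_labels[g_counts == g_counts.max()])[0]
else:
    prediction = top[0]
print(prediction)  # 'b' -- the globally more frequent label wins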
Example #15
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """
        Fits the model.

        Parameters
        ----------
        X : numpy.ndarray
            The KNN training data.
        y : numpy.ndarray
            The KNN training labels.

        Raises
        ------
        IncorrectShapeError
            Either the ``X`` array is not 2-dimensional, the ``y`` array is
            not 1-dimensional, the number of rows in ``X`` is not the same as
            the number of elements in ``y``, or the ``X`` array has 0 rows or
            0 columns.
        PrefittedModelError
            Trying to fit the model when it has already been fitted. Usually
            raised when calling the ``fit`` method for the second time without
            clearing the model first.
        TypeError
            Trying to fit a KNN predictor in regressor mode with a
            non-numerical target variable.
        """
        if self._is_fitted:
            raise PrefittedModelError('This model has already been fitted.')
        if not fuav.is_2d_array(X):
            raise IncorrectShapeError('The training data must be a 2-'
                                      'dimensional array.')
        if not fuav.is_1d_array(y):
            raise IncorrectShapeError('The training data labels must be a 1-'
                                      'dimensional array.')
        if X.shape[0] == 0:
            raise IncorrectShapeError('The data array has to have at least '
                                      'one data point.')
        # If the array is structured, the fuav.is_2d_array function takes
        # care of checking whether there is at least one column.
        if not fuav.is_structured_array(X) and X.shape[1] == 0:
            raise IncorrectShapeError('The data array has to have at least '
                                      'one feature.')
        if X.shape[0] != y.shape[0]:
            raise IncorrectShapeError('The number of samples in X must be the '
                                      'same as the number of labels in y.')
        if not self._is_classifier and not fuav.is_numerical_array(y):
            raise TypeError('Regressor can only be fitted for a numerical '
                            'target vector.')

        numerical_indices, categorical_indices = fuat.indices_by_type(X)
        self._numerical_indices = numerical_indices
        self._categorical_indices = categorical_indices

        self._is_structured = fuav.is_structured_array(X)
        self._X = X
        self._y = y

        if self._is_classifier:
            unique_y, unique_y_counts = np.unique(self._y, return_counts=True)
            # Order labels lexicographically (np.unique already returns
            # sorted values, so this sort is defensive).
            unique_y_sort_index = np.argsort(unique_y)
            self._unique_y = unique_y[unique_y_sort_index]
            self._unique_y_counts = unique_y_counts[unique_y_sort_index]

            # Find the labels that share the highest count; the majority
            # label is the lexicographically smallest of these.
            top_y_index = self._unique_y_counts == np.max(
                self._unique_y_counts)
            top_y_unique_sorted = np.sort(self._unique_y[top_y_index])
            self._majority_label = top_y_unique_sorted[0]

            self._unique_y_probabilities = (self._unique_y_counts /
                                            self._y.shape[0])
        else:
            self._majority_label = self._y.mean()
            self._unique_y = np.ndarray((0, ))
            self._unique_y_counts = np.ndarray((0, ))
            self._unique_y_probabilities = np.ndarray((0, ))

        self._X_n = self._X.shape[0]
        self._is_fitted = True
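
What ``fit`` caches for a classifier can be reproduced standalone (plain
numpy; the variable names mirror the attributes set above):

import numpy as np

y = np.array(['b', 'a', 'b', 'c', 'b', 'a'])
unique_y, unique_y_counts = np.unique(y, return_counts=True)  # sorted labels
# The majority label: highest count, ties broken lexicographically.
majority_label = np.sort(
    unique_y[unique_y_counts == unique_y_counts.max()])[0]
unique_y_probabilities = unique_y_counts / y.shape[0]
print(majority_label)          # 'b'
print(unique_y_probabilities)  # [0.3333... 0.5 0.1666...]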