Exemplo n.º 1
0
def hamming_point_distance(y: Union[np.ndarray, np.void], X: np.ndarray,
                           **kwargs: bool) -> np.ndarray:
    """
    Computes the Hamming distance from ``y`` to each row of ``X``.

    ``y`` must be a 1-dimensional textual numpy array or a single row of a
    structured numpy array (i.e. numpy's void), whereas ``X`` must be a
    2-dimensional textual numpy array. ``y`` needs to have as many elements as
    ``X`` has columns.

    Parameters
    ----------
    y : Union[numpy.ndarray, numpy.void]
        The reference point -- a 1-dimensional, textual numpy array (or a row
        of a structured array) from which the distances are measured.
    X : numpy.ndarray
        A 2-dimensional, textual numpy array; one distance is computed for
        each of its rows.
    **kwargs : boolean
        Keyword arguments handed over to ``hamming_distance`` for every row;
        they are intended for the
        :func:`fatf.utils.distances.hamming_distance_base` function, which
        calculates the Hamming distance.

    Raises
    ------
    IncorrectShapeError
        ``y`` is not 1-dimensional, ``X`` is not 2-dimensional or the number
        of elements in ``y`` differs from the number of columns in ``X``.
    ValueError
        ``y`` or ``X`` is not a purely textual array.

    Returns
    -------
    distances : numpy.ndarray
        Hamming distances between ``y`` and the consecutive rows of ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')

    # From here on work with the unstructured form of both inputs.
    point = fuat.as_unstructured(y)
    rows = fuat.as_unstructured(X)

    if not fuav.is_textual_array(point):
        raise ValueError('The y array should be textual.')
    if not fuav.is_textual_array(rows):
        raise ValueError('The X array should be textual.')

    # The point must provide exactly one value per column of the data array.
    if point.shape[0] != rows.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of elements '
                                  'in the y array.')

    # Every row of X is paired with the point and passed to hamming_distance.
    return np.apply_along_axis(hamming_distance, 1, rows, point, **kwargs)
Exemplo n.º 2
0
def hamming_distance(x: Union[np.ndarray, np.void],
                     y: Union[np.ndarray, np.void],
                     **kwargs: bool) -> Union[int, float]:
    """
    Calculates the Hamming distance between two 1-dimensional textual arrays.

    Either input may be a 1-dimensional numpy array or a single row of a
    structured numpy array (i.e. numpy's void). The result is the sum of the
    element-wise distances computed for the values sharing a position in ``x``
    and ``y``.

    Parameters
    ----------
    x : Union[numpy.ndarray, numpy.void]
        The first array; has to be 1-dimensional and textual.
    y : Union[numpy.ndarray, numpy.void]
        The second array; has to be 1-dimensional and textual.
    **kwargs : boolean
        Keyword arguments forwarded to every call of the
        :func:`fatf.utils.distances.hamming_distance_base` function, which
        calculates the Hamming distance of a single pair of values.

    Raises
    ------
    IncorrectShapeError
        ``x`` or ``y`` is not 1-dimensional, or their lengths differ.
    ValueError
        ``x`` or ``y`` is not a purely textual array.

    Returns
    -------
    distance : Union[integer, float]
        The Hamming distance between ``x`` and ``y``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(x):
        raise IncorrectShapeError('The x array should be 1-dimensional.')
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')

    # From here on work with the unstructured form of both inputs.
    first = fuat.as_unstructured(x)
    second = fuat.as_unstructured(y)

    if not fuav.is_textual_array(first):
        raise ValueError('The x array should be textual.')
    if not fuav.is_textual_array(second):
        raise ValueError('The y array should be textual.')

    if first.shape[0] != second.shape[0]:
        raise IncorrectShapeError('The x and y arrays should have the same '
                                  'length.')

    # Stack the inputs so that each column holds one (x, y) pair of values.
    paired = np.vstack((first, second))

    def pair_distance(pair):
        return hamming_distance_base(pair[0], pair[1], **kwargs)

    # One distance per column, added up into the overall distance.
    return np.apply_along_axis(pair_distance, 0, paired).sum()
Exemplo n.º 3
0
def hamming_array_distance(X: np.ndarray, Y: np.ndarray,
                           **kwargs: bool) -> np.ndarray:
    """
    Computes the matrix of Hamming distances between rows of ``X`` and ``Y``.

    ``X`` and ``Y`` both have to be 2-dimensional textual numpy arrays with
    the same number of columns.

    Parameters
    ----------
    X : numpy.ndarray
        A 2-dimensional, textual numpy array.
    Y : numpy.ndarray
        A 2-dimensional, textual numpy array.
    **kwargs : boolean
        Keyword arguments handed over to ``hamming_point_distance`` for every
        row of ``X``; they are intended for the
        :func:`fatf.utils.distances.hamming_distance_base` function, which
        calculates the Hamming distance.

    Raises
    ------
    IncorrectShapeError
        ``X`` or ``Y`` is not 2-dimensional, or the two arrays have a
        different number of columns.
    ValueError
        ``X`` or ``Y`` is not a purely textual array.

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of Hamming distances between the rows of ``X`` and the rows
        of ``Y``; each row of ``X`` contributes one row of distances.
    """
    # pylint: disable=invalid-name
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')
    if not fuav.is_2d_array(Y):
        raise IncorrectShapeError('The Y array should be 2-dimensional.')

    if not fuav.is_textual_array(X):
        raise ValueError('The X array should be textual.')
    if not fuav.is_textual_array(Y):
        raise ValueError('The Y array should be textual.')

    # From here on work with the unstructured form of both inputs.
    x_rows = fuat.as_unstructured(X)
    y_rows = fuat.as_unstructured(Y)

    # Rows can only be compared when both arrays are equally wide.
    if x_rows.shape[1] != y_rows.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of columns '
                                  'in Y array.')

    # Each row of X acts as the point measured against all of the rows of Y.
    return np.apply_along_axis(hamming_point_distance, 1, x_rows, y_rows,
                               **kwargs)
Exemplo n.º 4
0
def euclidean_point_distance(y: Union[np.ndarray, np.void],
                             X: np.ndarray) -> np.ndarray:
    """
    Computes the Euclidean distance from ``y`` to each row of ``X``.

    ``y`` must be a 1-dimensional numerical numpy array or a single row of a
    structured numpy array (i.e. numpy's void), whereas ``X`` must be a
    2-dimensional numerical numpy array. ``y`` needs to have as many elements
    as ``X`` has columns.

    Parameters
    ----------
    y : Union[numpy.ndarray, numpy.void]
        The reference point -- a 1-dimensional, purely numerical numpy array
        (or a row of a structured array) from which the distances are
        measured.
    X : numpy.ndarray
        A 2-dimensional, purely numerical numpy array; one distance is
        computed for each of its rows.

    Raises
    ------
    IncorrectShapeError
        ``y`` is not 1-dimensional, ``X`` is not 2-dimensional or the number
        of elements in ``y`` differs from the number of columns in ``X``.
    ValueError
        ``y`` or ``X`` is not a purely numerical array.

    Returns
    -------
    distances : numpy.ndarray
        Euclidean distances between ``y`` and the consecutive rows of ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')

    # From here on work with the unstructured form of both inputs.
    point = fuat.as_unstructured(y)
    rows = fuat.as_unstructured(X)

    if not fuav.is_numerical_array(point):
        raise ValueError('The y array should be purely numerical.')
    if not fuav.is_numerical_array(rows):
        raise ValueError('The X array should be purely numerical.')

    # The point must provide exactly one value per column of the data array.
    if point.shape[0] != rows.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of elements '
                                  'in the y array.')

    # Every row of X is paired with the point and passed to
    # euclidean_distance.
    return np.apply_along_axis(euclidean_distance, 1, rows, point)
Exemplo n.º 5
0
def binary_distance(x: Union[np.ndarray, np.void],
                    y: Union[np.ndarray, np.void],
                    normalise: bool = False) -> Union[int, float]:
    """
    Calculates the binary distance between two 1-dimensional arrays.

    The distance counts the positions at which the values of the two input
    arrays differ. Either input may be a 1-dimensional numpy array or a
    single row of a structured numpy array (i.e. numpy's void).

    Both input arrays are expected to be of a base dtype. (See
    :func:`fatf.utils.array.validation.is_base_array` function description for
    the explanation of a base dtype.)

    Parameters
    ----------
    x : Union[numpy.ndarray, numpy.void]
        The first array; has to be 1-dimensional.
    y : Union[numpy.ndarray, numpy.void]
        The second array; has to be 1-dimensional.
    normalise : boolean, optional (default=False)
        If ``True``, the count of mismatches is divided by the length of the
        input arrays.

    Raises
    ------
    IncorrectShapeError
        ``x`` or ``y`` is not 1-dimensional, or their lengths differ.

    Returns
    -------
    distance : Union[integer, float]
        The binary distance between ``x`` and ``y``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(x):
        raise IncorrectShapeError('The x array should be 1-dimensional.')
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')

    # From here on work with the unstructured form of both inputs.
    first = fuat.as_unstructured(x)
    second = fuat.as_unstructured(y)

    length = first.shape[0]
    if length != second.shape[0]:
        raise IncorrectShapeError('The x and y arrays should have the same '
                                  'length.')

    # Count the positions holding different values.
    mismatches = (first != second).sum()
    if normalise:
        logger.debug('Binary distance is being normalised.')
        mismatches /= length
    return mismatches
Exemplo n.º 6
0
def binary_array_distance(X: np.ndarray, Y: np.ndarray,
                          **kwargs: bool) -> np.ndarray:
    """
    Computes the matrix of binary distances between rows of ``X`` and ``Y``.

    ``X`` and ``Y`` both have to be 2-dimensional numpy arrays with the same
    number of columns.

    Both input arrays are expected to be of a base dtype. (See
    :func:`fatf.utils.array.validation.is_base_array` function description for
    the explanation of a base dtype.)

    Parameters
    ----------
    X : numpy.ndarray
        A 2-dimensional numpy array.
    Y : numpy.ndarray
        A 2-dimensional numpy array.
    **kwargs : boolean
        Keyword arguments handed over to ``binary_point_distance`` for every
        row of ``X``; they are intended for the
        :func:`fatf.utils.distances.binary_distance` function, which
        calculates the binary distance.

    Raises
    ------
    IncorrectShapeError
        ``X`` or ``Y`` is not 2-dimensional, or the two arrays have a
        different number of columns.

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of binary distances between the rows of ``X`` and the rows of
        ``Y``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')
    if not fuav.is_2d_array(Y):
        raise IncorrectShapeError('The Y array should be 2-dimensional.')

    # From here on work with the unstructured form of both inputs.
    x_rows = fuat.as_unstructured(X)
    y_rows = fuat.as_unstructured(Y)

    # Rows can only be compared when both arrays are equally wide.
    if x_rows.shape[1] != y_rows.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of columns '
                                  'in Y array.')

    # Each row of X is passed, together with the whole of Y, to
    # binary_point_distance.
    return np.apply_along_axis(binary_point_distance, 1, x_rows, y_rows,
                               **kwargs)
Exemplo n.º 7
0
def euclidean_array_distance(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """
    Computes the matrix of Euclidean distances between rows of ``X`` and ``Y``.

    ``X`` and ``Y`` both have to be 2-dimensional numerical numpy arrays with
    the same number of columns.

    Parameters
    ----------
    X : numpy.ndarray
        A 2-dimensional, purely numerical numpy array.
    Y : numpy.ndarray
        A 2-dimensional, purely numerical numpy array.

    Raises
    ------
    IncorrectShapeError
        ``X`` or ``Y`` is not 2-dimensional, or the two arrays have a
        different number of columns.
    ValueError
        ``X`` or ``Y`` is not a purely numerical array.

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of Euclidean distances between the rows of ``X`` and the rows
        of ``Y``; each row of ``X`` contributes one row of distances.
    """
    # pylint: disable=invalid-name
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')
    if not fuav.is_2d_array(Y):
        raise IncorrectShapeError('The Y array should be 2-dimensional.')

    if not fuav.is_numerical_array(X):
        raise ValueError('The X array should be purely numerical.')
    if not fuav.is_numerical_array(Y):
        raise ValueError('The Y array should be purely numerical.')

    # From here on work with the unstructured form of both inputs.
    y_rows = fuat.as_unstructured(Y)
    x_rows = fuat.as_unstructured(X)

    # Rows can only be compared when both arrays are equally wide.
    if y_rows.shape[1] != x_rows.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of columns '
                                  'in Y array.')

    # Each row of X acts as the point measured against all of the rows of Y.
    return np.apply_along_axis(euclidean_point_distance, 1, x_rows, y_rows)
Exemplo n.º 8
0
def euclidean_distance(x: Union[np.ndarray, np.void],
                       y: Union[np.ndarray, np.void]) -> float:
    """
    Calculates the Euclidean distance between two 1-dimensional numpy "arrays".

    Each of the input arrays can be either a 1D numpy array or a row of a
    structured numpy array, i.e. numpy's void.

    When both inputs are of an integer type, their difference is computed in
    floating point so that unsigned values do not wrap around and large
    signed values do not overflow.

    Parameters
    ----------
    x : Union[numpy.ndarray, numpy.void]
        The first numpy array (has to be 1-dimensional and purely numerical).
    y : Union[numpy.ndarray, numpy.void]
        The second numpy array (has to be 1-dimensional and purely numerical).

    Raises
    ------
    IncorrectShapeError
        Either of the input arrays is not 1-dimensional or they are not of the
        same length.
    ValueError
        Either of the input arrays is not purely numerical.

    Returns
    -------
    distance : float
        Euclidean distance between the two numpy arrays.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(x):
        raise IncorrectShapeError('The x array should be 1-dimensional.')
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')

    # Transform the arrays to unstructured
    x_array = fuat.as_unstructured(x)
    y_array = fuat.as_unstructured(y)

    if not fuav.is_numerical_array(x_array):
        raise ValueError('The x array should be purely numerical.')
    if not fuav.is_numerical_array(y_array):
        raise ValueError('The y array should be purely numerical.')

    if x_array.shape[0] != y_array.shape[0]:
        raise IncorrectShapeError('The x and y arrays should have the same '
                                  'length.')

    # Integer arrays are subtracted using integer arithmetic: unsigned values
    # wrap around (e.g. uint8 1 - 2 gives 255) and extreme signed values
    # overflow, both silently corrupting the distance. Whenever the
    # difference would be of an integer type, compute it in floating point.
    difference_dtype = np.result_type(x_array.dtype, y_array.dtype)
    if np.issubdtype(difference_dtype, np.integer):
        x_array = x_array.astype(np.float64)
        y_array = y_array.astype(np.float64)

    distance = np.linalg.norm(x_array - y_array)
    return distance
def test_as_unstructured():
    """
    Tests :func:`fatf.utils.array.tools.as_unstructured`.

    Checks an input of an unsupported type, a row of a structured array, a
    whole structured array and unstructured arrays of base and non-base types.
    """
    type_error = ('The input should either be a numpy (structured or '
                  'unstructured) array-like object (numpy.ndarray) or a row '
                  'of a structured numpy array (numpy.void).')
    value_error = ('as_unstructured only supports conversion of arrays that '
                   'hold base numpy types, i.e. numerical and string-like -- '
                   'numpy void and object-like types are not allowed.')

    # An input of an unsupported type (None) raises a TypeError.
    with pytest.raises(TypeError) as exc_info:
        fuat.as_unstructured(None)
    assert str(exc_info.value) == type_error

    # A single row of a structured array (np.void) is converted.
    converted = fuat.as_unstructured(NUMERICAL_STRUCTURED_ARRAY[0])
    assert _compare_nan_arrays(converted, NUMERICAL_UNSTRUCTURED_ARRAY[0])

    # A whole structured array is converted.
    converted = fuat.as_unstructured(NOT_NUMERICAL_STRUCTURED_ARRAY)
    assert np.array_equal(converted, NOT_NUMERICAL_UNSTRUCTURED_ARRAY)

    # An unstructured array of a base type comes back equal to the input.
    converted = fuat.as_unstructured(BASE_NP_ARRAY)
    assert np.array_equal(converted, BASE_NP_ARRAY)

    # An unstructured array that is not of a base type raises a ValueError.
    with pytest.raises(ValueError) as exc_info:
        fuat.as_unstructured(NOT_BASE_NP_ARRAY)
    assert str(exc_info.value) == value_error
Exemplo n.º 10
0
    def __init__(self,
                 dataset: np.ndarray,
                 categorical_indices: Optional[List[Index]] = None,
                 int_to_float: bool = True) -> None:
        """
        Constructs a ``NormalSampling`` data augmentation class.

        After the parent class is initialised, the sampling parameters are
        precomputed: the mean and the standard deviation of every numerical
        feature, and the unique values with their relative frequencies for
        every categorical feature.
        """
        # pylint: disable=too-many-locals,too-many-branches
        super().__init__(dataset,
                         categorical_indices=categorical_indices,
                         int_to_float=int_to_float)

        # Numerical features: feature index -> (mean, standard deviation).
        numerical_parameters = {}
        if self.numerical_indices:
            if self.is_structured:
                numerical_columns = fuat.as_unstructured(
                    self.dataset[self.numerical_indices])
            else:
                numerical_columns = self.dataset[:, self.numerical_indices]

            column_means = numerical_columns.mean(axis=0)
            column_stds = numerical_columns.std(axis=0)

            numerical_parameters = {
                index: (column_means[position], column_stds[position])
                for position, index in enumerate(self.numerical_indices)
            }
        self.numerical_sampling_values = numerical_parameters

        # Categorical features: feature index -> (unique values, frequencies).
        categorical_parameters = {}
        for feature in self.categorical_indices:
            if self.is_structured:
                column = self.dataset[feature]
            else:
                column = self.dataset[:, feature]

            unique_values, counts = np.unique(column, return_counts=True)
            # Normalise the counts so that they add up to one.
            categorical_parameters[feature] = (unique_values,
                                               counts / counts.sum())
        self.categorical_sampling_values = categorical_parameters
Exemplo n.º 11
0
def lasso_path(dataset: np.ndarray,
               target: np.ndarray,
               weights: Optional[np.ndarray] = None,
               features_number: Optional[int] = None,
               features_percentage: int = 100) -> List[Index]:
    """
    Picks the requested number of features using Lasso path coefficients.

    .. versionadded:: 0.0.2

    The lasso path may not offer a set of non-zero coefficients of exactly the
    requested size. In such a case the last step of the path that has at least
    one, but no more than the requested number of, non-zero coefficients is
    used and a message is logged. When no step of the path satisfies this
    condition -- all of the coefficients are 0 or every step has more non-zero
    coefficients than requested -- all of the features are selected and
    a message is logged. Moreover, if the value of ``features_percentage``
    translates to 0 features, 1 feature is selected and a warning is logged.

    The ``weights`` are incorporated into the selection procedure by centering
    the ``dataset`` around its weighted average (a plain average when no
    weights are given) and scaling it by the square root of the ``weights``.
    The ``target`` array is processed in the same way.

    This feature selection method is based on the default feature selection
    mechanism implemented by LIME_ (Local Interpretable Model-agnostic
    Explanations). The original implementation can be found in the
    ``lime.lime_base.LimeBase.feature_selection`` method in the
    `official LIME package`_.

    .. _LIME: https://github.com/marcotcr/lime
    .. _`official LIME package`: https://github.com/marcotcr/lime/blob/master/
       lime/lime_base.py#L116

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array holding a data set.
    target : numpy.ndarray
        The class/probabilities/regression values of each row in the input data
        set.
    weights : numpy.ndarray, optional (default=None)
        An array of (importance) weights for each data point in the input data
        set. If ``None``, all of the data points are equally important when
        computing the Lasso path.
    features_number : integer, optional (default=None)
        The number of (top) features to be selected. If ``None``, the top x% of
        the features are selected where x is given by the
        ``features_percentage`` parameter. When exactly this number of features
        cannot be extracted, a warning is logged and a smaller subset of
        features is selected.
    features_percentage : integer, optional (default=100)
        The percentage of (top) features to be selected; only used when
        ``features_number`` is ``None``. By default all of the features are
        returned.

    Warns
    -----
    UserWarning
        The specified ``features_number`` is larger than the number of features
        in the ``dataset`` array; all of the features are selected.

    Raises
    ------
    IncorrectShapeError
        The ``dataset`` array is not 2-dimensional. The ``target`` array is not
        1-dimensional. The number of labels in the ``target`` array is
        different than the number of samples in the ``dataset`` array. The
        ``weights`` array is not 1-dimensional. The number of weights in the
        ``weights`` array does not agree with the number of samples in the
        ``dataset`` array.
    TypeError
        One of the ``dataset``, ``target`` or ``weights`` arrays is not
        purely numerical. The ``features_number`` parameter is not an integer.
        The ``features_percentage`` parameter is not an integer.
    ValueError
        The ``features_number`` parameter is not a positive integer. The
        ``features_percentage`` parameter is outside of the allowed range
        0--100 (inclusive).

    Returns
    -------
    feature_indices : List[Index]
        Indices of the features selected by the Lasso path -- column numbers
        for a classic ``dataset`` and field names for a structured one. The
        indices are taken from a numpy array built inside of this function.
    """
    # pylint: disable=too-many-branches,too-many-locals
    assert _validate_input_lasso_path(
        dataset, target, weights, features_number, features_percentage), \
        'Input is invalid.'

    # Features are identified by field names in structured arrays and by
    # column numbers otherwise.
    if fuav.is_structured_array(dataset):
        all_features = np.array(dataset.dtype.names)
        data = fuat.as_unstructured(dataset)
    else:
        all_features = np.array(range(0, dataset.shape[1]))
        data = dataset

    features_total = all_features.shape[0]
    if features_number is None:
        # Translate the percentage into a count; never go below one feature.
        features_number = int((features_percentage / 100) * features_total)
        if not features_number:
            logger.warning(
                'Since the number of features to be extracted was not given '
                '%d%% of features will be used. This percentage translates to '
                '0 features, therefore the number of features to be used is '
                'overwritten to 1. To prevent this from happening, you should '
                'either explicitly set the number of features via the '
                'features_number parameter or increase the value of the '
                'features_percentage parameter.', features_percentage)
            features_number = 1

    # Nothing to select when (at least) all of the features are requested.
    if features_number == features_total:
        return all_features
    if features_number > features_total:
        warnings.warn(
            'The selected number of features is larger than the total number '
            'of features in the dataset array. All of the features are being '
            'selected.', UserWarning)
        return all_features

    if weights is None:
        scaling = np.ones_like(target)
    else:
        scaling = np.sqrt(weights)

    # Centre the data and the target around their (weighted) averages and
    # scale both by the square root of the weights.
    data_centre = np.average(data, axis=0, weights=weights)
    scaled_data = (data - data_centre) * scaling[:, np.newaxis]

    target_centre = np.average(target, weights=weights)
    scaled_target = (target - target_centre) * scaling

    # The third element returned by lars_path holds the coefficients.
    coefs = sklearn.linear_model.lars_path(
        scaled_data, scaled_target, method='lasso', verbose=False)[2]

    # Early versions of numpy return a scalar from numpy.count_nonzero even
    # when the axis is specified, hence the non-zero coefficients are counted
    # with a sum instead of np.count_nonzero(coefs, axis=0).
    nonzero_per_step = (coefs != 0).sum(axis=0)

    # Steps of the path with at least one, but not too many, non-zero
    # coefficients.
    not_too_many = (nonzero_per_step <= features_number)
    at_least_one = (nonzero_per_step > 0)
    usable_steps = np.where(np.logical_and(not_too_many, at_least_one))[0]

    if not usable_steps.size:
        logger.warning('The lasso path feature selection could not pick '
                       'any feature subset. All of the features were '
                       'selected.')
        return all_features

    # Use the last usable step of the path.
    chosen = coefs[:, usable_steps[-1]].nonzero()[0]
    feature_indices = all_features[chosen]
    if chosen.shape[0] != features_number:
        logger.warning(
            'The lasso path feature selection could not pick %d '
            'features. Only %d were selected.', features_number,
            chosen.shape[0])
    return feature_indices
Exemplo n.º 12
0
def forward_selection(dataset: np.ndarray,
                      target: np.ndarray,
                      weights: Optional[np.ndarray] = None,
                      features_number: Optional[int] = None,
                      features_percentage: int = 100) -> np.ndarray:
    """
    Picks the requested number of features with greedy forward selection.

    .. versionadded:: 0.1.0

    The selection starts with an empty set of features and grows it one
    feature at a time. In every round each of the remaining features is
    tentatively added to the current set, a ridge regression is fitted to this
    subset of the data, and the feature yielding the highest score is kept.

    The ``weights`` provided as the input parameter are incorporated into the
    feature selection via the ridge regression training (and scoring)
    procedure. If the value of ``features_percentage`` results in selecting
    0 features, 1 feature will be selected and a warning will be logged.

    .. note::

       This feature selection procedure is computationally expensive when
       the number of features to be selected is large.

    This feature selection method is based on LIME_ (Local Interpretable
    Model-agnostic Explanations). The original implementation can be found in
    the ``lime.lime_base.LimeBase.forward_selection`` method in the
    `official LIME package`_.

    .. _LIME: https://github.com/marcotcr/lime
    .. _`official LIME package`: https://github.com/marcotcr/lime/blob/0.2.0.0/
       lime/lime_base.py#L49

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array holding a data set.
    target : numpy.ndarray
        The class/probability/regression values of each row in the input data
        set.
    weights : numpy.ndarray, optional (default=None)
        An array of (importance) weights for each data point in the input data
        set. If ``None``, all of the data points are treated as equally
        important.
    features_number : integer, optional (default=None)
        The number of (top) features to be selected. If ``None``, the top x% of
        the features are selected where x is given by the
        ``features_percentage`` parameter.
    features_percentage : integer, optional (default=100)
        The percentage of (top) features to be selected; only used when
        ``features_number`` is ``None``. By default all of the features are
        returned.

    Warns
    -----
    UserWarning
        The specified ``features_number`` is larger than the number of features
        in the ``dataset`` array; all of the features are selected.

    Raises
    ------
    IncorrectShapeError
        The ``dataset`` array is not 2-dimensional. The ``target`` array is not
        1-dimensional. The number of elements in the ``target`` array is
        different than the number of samples in the ``dataset`` array. The
        ``weights`` array is not 1-dimensional. The number of weights in the
        ``weights`` array does not agree with the number of samples in the
        ``dataset`` array.
    TypeError
        One of the ``dataset``, ``target`` or ``weights`` arrays is not
        purely numerical. The ``features_number`` parameter is not an integer.
        The ``features_percentage`` parameter is not an integer.
    ValueError
        The ``features_number`` parameter is not a positive integer. The
        ``features_percentage`` parameter is outside of the allowed range
        0--100 (inclusive).

    Returns
    -------
    feature_indices : numpy.ndarray
        Array with indices of the features chosen with forward selection,
        ordered by their position in the ``dataset``.
    """
    # pylint: disable=too-many-locals
    assert _validate_input_lasso_path(dataset, target, weights,
                                      features_number,
                                      features_percentage), 'Input is invalid.'

    # Features are identified by field names in structured arrays and by
    # column numbers otherwise.
    if fuav.is_structured_array(dataset):
        all_features = np.array(dataset.dtype.names)
        data = fuat.as_unstructured(dataset)
    else:
        all_features = np.array(range(0, dataset.shape[1]))
        data = dataset

    features_total = all_features.shape[0]
    if features_number is None:
        features_number = _get_feature_proportion(features_percentage,
                                                  features_total)

    # Nothing to select when (at least) all of the features are requested.
    if features_number == features_total:
        return all_features
    if features_number > features_total:
        warnings.warn(
            'The selected number of features is larger than the total number '
            'of features in the dataset array. All of the features are being '
            'selected.', UserWarning)
        return all_features

    sample_weights = np.ones_like(target) if weights is None else weights

    clf = sklearn.linear_model.Ridge(alpha=0, fit_intercept=True)
    chosen = []  # type: List[int]

    for _ in range(features_number):
        best_score = -np.inf
        best_candidate = None

        # Try extending the current selection with each unused feature.
        remaining = [
            position for position in range(features_total)
            if position not in chosen
        ]
        for candidate in remaining:
            trial_data = data[:, chosen + [candidate]]

            clf.fit(trial_data, target, sample_weight=sample_weights)
            score = clf.score(trial_data,
                              target,
                              sample_weight=sample_weights)

            # The strict comparison keeps the earliest of equally good
            # candidates.
            if score > best_score:
                best_score = score
                best_candidate = candidate

        assert best_candidate is not None
        chosen.append(best_candidate)

    # Report the selected features in their data set order.
    return all_features[np.sort(chosen)]
Exemplo n.º 13
0
def highest_weights(dataset: np.ndarray,
                    target: np.ndarray,
                    weights: Optional[np.ndarray] = None,
                    features_number: Optional[int] = None,
                    features_percentage: int = 100) -> np.ndarray:
    """
    Selects the specified number of features based on their absolute weight.

    .. versionadded:: 0.1.0

    This feature selection procedure chooses the user-specified number of
    features based on their highest absolute weight given by a ridge regression
    fitted to all the features.

    The ``weights`` provided as the input parameter are incorporated into the
    feature selection via the ridge regression training procedure.
    If the value of ``features_percentage`` results in selecting
    0 features, 1 feature will be selected and a warning will be logged.

    This feature selection method is based on LIME_ (Local Interpretable
    Model-agnostic Explanations). The original implementation can be found in
    the ``lime.lime_base.LimeBase.feature_selection`` method in the
    `official LIME package`_.

    .. _LIME: https://github.com/marcotcr/lime
    .. _`official LIME package`: https://github.com/marcotcr/lime/blob/0.2.0.0/
       lime/lime_base.py#L77

    Parameters
    ----------
    dataset : numpy.ndarray
        A 2-dimensional numpy array holding a data set.
    target : numpy.ndarray
        The class/probability/regression values of each row in the input data
        set.
    weights : numpy.ndarray, optional (default=None)
        An array of (importance) weights for each data point in the input data
        set. If ``None``, all of the data points are treated equally important.
    features_number : integer, optional (default=None)
        The number of (top) features to be selected. If ``None``, the top x% of
        the features are selected where x is given by the
        ``features_percentage`` parameter.
    features_percentage : integer, optional (default=100)
        The percentage of (top) features to be selected. By default all of the
        features are returned if ``features_number`` is ``None``.

    Warns
    -----
    UserWarning
        The specified ``features_number`` is larger than the number of features
        in the ``dataset`` array; all of the features are selected.

    Raises
    ------
    IncorrectShapeError
        The ``dataset`` array is not 2-dimensional. The ``target`` array is not
        1-dimensional. The number of elements in the ``target`` array is
        different than the number of samples in the ``dataset`` array. The
        ``weights`` array is not 1-dimensional. The number of weights in the
        ``weights`` array does not agree with the number of samples in the
        ``dataset`` array.
    TypeError
        One of the ``dataset``, ``target`` or ``weights`` array is not
        purely numerical. The ``features_number`` parameter is not an integer.
        The ``features_percentage`` parameter is not an integer.
    ValueError
        The ``features_number`` parameter is not a positive integer. The
        ``features_percentage`` parameter is outside of the allowed range
        0--100 (inclusive).

    Returns
    -------
    feature_indices : numpy.ndarray
        Array with indices of features with highest coefficients. The indices
        are column names for a structured ``dataset`` and column positions
        otherwise; they are returned in ascending position order, not in
        importance order.
    """
    # The validation is executed outside of the ``assert`` statement so that
    # it still runs when Python is started with assertions disabled (``-O``);
    # otherwise invalid input would silently reach the regression below.
    input_is_valid = _validate_input_lasso_path(
        dataset, target, weights, features_number, features_percentage)
    assert input_is_valid, 'Input is invalid.'

    if fuav.is_structured_array(dataset):
        indices = np.array(dataset.dtype.names)
        dataset_array = fuat.as_unstructured(dataset)
    else:
        indices = np.arange(dataset.shape[1])
        dataset_array = dataset

    indices_number = indices.shape[0]
    if features_number is None:
        features_number = _get_feature_proportion(features_percentage,
                                                  indices_number)

    if features_number == indices_number:
        feature_indices = indices
    elif features_number > indices_number:
        feature_indices = indices
        warnings.warn(
            'The selected number of features is larger than the total number '
            'of features in the dataset array. All of the features are being '
            'selected.', UserWarning)
    else:
        if weights is None:
            weights_ = np.ones_like(target)
        else:
            weights_ = weights

        clf = sklearn.linear_model.Ridge(alpha=0.01, fit_intercept=True)
        clf.fit(dataset_array, target, sample_weight=weights_)

        # Rank the features by the magnitude of their ridge coefficient
        # (largest first), keep the requested number of them and report them
        # in their original column order.
        importance_ordering = np.flipud(np.argsort(np.abs(clf.coef_)))
        selected_indices = importance_ordering[:features_number]
        selected_indices_sorted = np.sort(selected_indices)
        feature_indices = indices[selected_indices_sorted]

    return feature_indices
Exemplo n.º 14
0
    def explain_instance(
            self, instance: np.ndarray, **kwargs: Any
    ) -> Union[Dict[str, Tuple[str, float]], List[Tuple[str, float]]]:
        """
        Explains an instance with the LIME tabular explainer.

        This method wraps around ``explain_instance`` method_ in the LIME
        tabular explainer object.

        .. warning::
            Contrarily to the LIME tabular explainer this wrapper produces
            explanations for all of the classes for a classification task by
            default.

        If any of the named parameters for this function were specified when
        initialising this object they will be used unless they are also defined
        when calling this method, in which case the latter take the precedence.

        If all: a class-wide model, a class-wide prediction function and a
        local prediction function (via named parameter to this function) are
        specified, they are used in the following order:

        - local prediction function,

        - global prediction function, and finally

        - the model.

        Based on whether the task at hand is classification or regression
        either ``predict`` (regression) or ``predict_proba`` (classification)
        method of the model is used.

        .. _method: https://lime-ml.readthedocs.io/en/latest/lime.html
           #lime.lime_tabular.LimeTabularExplainer.explain_instance

        Parameters
        ----------
        instance : numpy.ndarray
            A 1-dimensional data point (numpy array) to be explained.
        **kwargs : lime.lime_tabular.LimeTabularExplainer.explain_instance
            LIME tabular explainer's ``explain_instance`` optional parameters.

        Raises
        ------
        AttributeError
            One of the named parameters is invalid for the ``explain_instance``
            method of the LIME tabular explainer.
        IncorrectShapeError
            The input ``instance`` is not a 1-dimensional numpy array.
        RuntimeError
            A predictive function is not available (neither as a ``model``
            attribute of this class, nor as a ``predict_fn`` parameter).
        ValueError
            The input ``instance`` is not purely numerical.

        Returns
        -------
        explanation : Dictionary[string, Tuple[string, float]] or \
List[Tuple[string, float]]
            For classification a dictionary where the keys correspond to class
            names and the values are tuples (string and float), which represent
            an explanation in terms of one of the features and the importance
            of this explanation. For regression a list of tuples (string and
            float) with the same meaning.
        """
        # pylint: disable=too-many-locals,too-many-branches
        invalid_params = set(kwargs.keys()).difference(
            self._EXPLAIN_INSTANCE_PARAMS)
        if invalid_params:
            raise AttributeError('The following named parameters are not '
                                 'valid: {}.'.format(invalid_params))

        if not fuav.is_1d_like(instance):
            raise IncorrectShapeError('The instance to be explained should be '
                                      '1-dimensional.')
        instance = fuat.as_unstructured(instance)
        if not fuav.is_numerical_array(instance):
            raise ValueError('The instance to be explained should be purely '
                             'numerical -- LIME does not support categorical '
                             'features.')

        # Merge local kwargs and object's kwargs
        named_arguments = dict(self.explain_instance_params)
        for kwarg in self._EXPLAIN_INSTANCE_PARAMS:
            if kwarg in kwargs:
                named_arguments[kwarg] = kwargs[kwarg]

        # If both a model and a predictor function is supplied
        pred_fn_name = 'predict_fn'
        if pred_fn_name in named_arguments:
            pred_fn = named_arguments[pred_fn_name]
            del named_arguments[pred_fn_name]
        elif self.model is not None:
            if self.mode == 'classification':
                if self.model_is_probabilistic:
                    pred_fn = self.model.predict_proba  # type: ignore
                else:
                    raise RuntimeError('The predictive model is not '
                                       'probabilistic. Please specify a '
                                       'predictive function instead.')
            else:
                pred_fn = self.model.predict  # type: ignore
        else:
            raise RuntimeError('A predictive function is not available.')

        # If unspecified, get explanations for all classes for classification
        lbls_name = 'labels'
        if lbls_name not in named_arguments and self.mode == 'classification':
            # Since we cannot get all of the class names/indices/quantity,
            # we need to resort to this dirty trick
            n_classes = pred_fn(np.array([instance])).shape[1]
            named_arguments[lbls_name] = range(n_classes)

        exp = self.tabular_explainer.explain_instance(instance, pred_fn,
                                                      **named_arguments)

        if self.mode == 'classification':
            explanation = {}
            for label in exp.available_labels():
                class_name = exp.class_names[label]
                class_explanation = exp.as_list(label=label)

                explanation[class_name] = class_explanation
        else:
            explanation = exp.as_list()

        return explanation
Exemplo n.º 15
0
def describe_numerical_array(
        array: Union[np.ndarray, np.void],
        skip_nans: bool = True) -> Dict[str, Union[int, float, np.ndarray]]:
    """
    Describes a numerical numpy array with basic statistics.

    If the ``skip_nans`` parameter is set to ``True``, any ``numpy.nan``
    present in the input array is skipped for calculating the statistics.
    Otherwise, they are included, affecting most of the statistics and possibly
    equating them to ``numpy.nan``.

    The description output by this function is a dictionary with the
    following keys:

    ``count`` : integer
        The number of elements in the array (``numpy.nan``\\ s are counted
        regardless of the ``skip_nans`` setting).

    ``mean`` : float
        The *mean* (average) value of the array.

    ``std`` : float
        The *standard deviation* of the array.

    ``min`` : float
        The *minimum value* in the array.

    ``25%`` : float
        The *25 percentile* of the array.

    ``50%`` : float
        The *50 percentile* of the array, which is equivalent to its
        **median**.

    ``75%`` : float
        The *75 percentile* of the array.

    ``max`` : float
        The *maximum value* in the array.

    ``nan_count`` : integer
        The count of ``numpy.nan`` (not-a-number) values in the array.

    Parameters
    ----------
    array : Union[numpy.ndarray, numpy.void]
        An array for which a description is desired.
    skip_nans : boolean, optional (default=True)
        If set to ``True``, ``numpy.nan``\\ s present in the input array will
        be excluded while computing the statistics.

    Raises
    ------
    IncorrectShapeError
        The input array is not 1-dimensional.
    ValueError
        The input array is not purely numerical or it is empty. The input
        array holds nothing but ``numpy.nan``\\ s and ``skip_nans`` is
        ``True``, in which case there is nothing left to describe.

    Returns
    -------
    numerical_description : Dict[string, Union[integer, float, numpy.ndarray]]
        A dictionary describing the numerical input array.
    """
    if not fuav.is_1d_like(array):
        raise IncorrectShapeError('The input array should be 1-dimensional.')

    classic_array = fuat.as_unstructured(array)
    assert len(classic_array.shape) == 1, '1D arrays only at this point.'

    if not classic_array.shape[0]:
        raise ValueError('The input array cannot be empty.')
    if not fuav.is_numerical_array(classic_array):
        raise ValueError('The input array should be purely numerical.')

    nan_indices = np.isnan(classic_array)
    # Count the elements before any NaN is dropped
    n_elements = classic_array.shape[0]

    if skip_nans:
        classic_array = classic_array[~nan_indices]
        # Without this guard an all-NaN input triggers "empty slice"
        # RuntimeWarnings from the mean and standard deviation and then an
        # obscure zero-size reduction ValueError from the minimum.
        if not classic_array.shape[0]:
            raise ValueError('The input array holds only numpy.nan values, '
                             'hence it cannot be described when skip_nans is '
                             'set to True.')

    # One call computes all three percentiles instead of processing the
    # array three times.
    percentile_25, percentile_50, percentile_75 = np.percentile(
        classic_array, [25, 50, 75])

    numerical_description = {
        'count': n_elements,
        'mean': np.mean(classic_array),
        'std': np.std(classic_array),
        'min': np.min(classic_array),
        '25%': percentile_25,
        '50%': percentile_50,
        '75%': percentile_75,
        'max': np.max(classic_array),
        'nan_count': nan_indices.sum()
    }

    return numerical_description
Exemplo n.º 16
0
def describe_categorical_array(
    array: Union[np.ndarray, np.void]
) -> Dict[str, Union[str, int, bool, np.ndarray]]:
    """
    Summarises a categorical numpy array with basic statistics.

    The returned dictionary has the following keys:

    ``count`` : integer
        The number of elements in the array.

    ``unique`` : numpy.ndarray
        The unique values in the array, ordered lexicographically.

    ``unique_counts`` : numpy.ndarray
        The number of occurrences of each value listed under ``unique`` (in
        the same order).

    ``top`` : string
        The most frequent value in the array.

    ``freq`` : integer
        The number of occurrences of the most frequent value.

    ``is_top_unique`` : boolean
        ``True`` if no other value occurs ``freq`` times, i.e., the most
        frequent value is not tied with any other value.

    Parameters
    ----------
    array : Union[numpy.ndarray, numpy.void]
        An array for which a description is desired.

    Raises
    ------
    IncorrectShapeError
        The input array is not 1-dimensional.
    ValueError
        The input array is empty.

    Warns
    -----
    UserWarning
        The input array is not purely textual, therefore it is converted to a
        string type before being described.

    Returns
    -------
    categorical_description : Dict[string, Union[string, integer, \
boolean, numpy.ndarray]]
        A dictionary describing the categorical input array.
    """
    if not fuav.is_1d_like(array):
        raise IncorrectShapeError('The input array should be 1-dimensional.')

    values = fuat.as_unstructured(array)
    assert len(values.shape) == 1, '1D arrays only at this point.'

    if values.shape[0] == 0:
        raise ValueError('The input array cannot be empty.')

    if not fuav.is_textual_array(values):
        warnings.warn(
            'The input array is not purely categorical. Converting the input '
            'array into a textual type to facilitate a categorical '
            'description.',
            category=UserWarning)
        values = values.astype(str)

    categories, occurrences = np.unique(values, return_counts=True)

    # Keep the categories (and their counts) in lexicographical order
    ordering = np.argsort(categories)
    categories, occurrences = categories[ordering], occurrences[ordering]

    # With ties, argmax picks the first (lexicographically smallest) category
    most_frequent = np.argmax(occurrences)
    top_count = occurrences[most_frequent]
    categories_with_top_count = (occurrences == top_count).sum()

    return {
        'count': values.shape[0],
        'unique': categories,
        'unique_counts': occurrences,
        'top': categories[most_frequent],
        'freq': top_count,
        'is_top_unique': categories_with_top_count < 2
    }
Exemplo n.º 17
0
def describe_array(
        array: np.ndarray,
        include: Optional[Union[str, int, List[Union[str, int]]]] = None,
        exclude: Optional[Union[str, int, List[Union[str, int]]]] = None,
        **kwargs: bool
) -> Dict[Union[str, int],
          Union[str, int, float, bool, np.ndarray,
                Dict[str, Union[str, int, float, bool, np.ndarray]]]
          ]:  # yapf: disable
    """
    Describes categorical (textual) and numerical columns in the input array.

    Numerical columns are described with the
    :func:`fatf.transparency.data.describe_functions.describe_numerical_array`
    function and categorical columns with the
    :func:`fatf.transparency.data.describe_functions.\
describe_categorical_array` function; see their documentation for the details
    of each description.

    The ``include`` and ``exclude`` parameters select the columns to be
    described. Either of them can be a list of column indices, a single index
    (a string or an integer), or one of the keywords ``'numerical'`` or
    ``'categorical'`` to select all of the numerical or all of the categorical
    columns. The ``include`` filter is applied first and the ``exclude``
    filter is applied to its result. By default all columns are described.

    Parameters
    ----------
    array : numpy.ndarray
        The array to be described.
    include : Union[str, int, List[Union[str, int]]], optional (default=None)
        A list of column indices to be included in the description. If
        ``None`` (the default value), all of the columns will be included.
        Alternatively this can be set to a single index (either a string or an
        integer) to compute statistics just for this one column. It is also
        possible to set it to ``'numerical'`` or ``'categorical'`` to just
        include numerical or categorical columns respectively.
    exclude : Union[str, int, List[Union[str, int]]], optional (default=None)
        A list of column indices to be excluded from the description. If
        ``None`` (the default value), none of the columns will be excluded.
        Alternatively this can be set to a single index (either a string or an
        integer) to exclude just one column. It is also possible to set it to
        ``'numerical'`` or ``'categorical'`` to exclude either all numerical
        or all categorical columns respectively.
    **kwargs : bool
        Keyword arguments that are passed to the :func:`fatf.transparency.\
data.describe_functions.describe_numerical_array` function responsible for
        describing numerical arrays.

    Warns
    -----
    UserWarning
        When using ``include`` or ``exclude`` parameters for 1-dimensional
        input arrays (in which case these parameters are ignored).

    Raises
    ------
    IncorrectShapeError
        The input array is neither 1- nor 2-dimensional.
    RuntimeError
        None of the columns were selected to be described.
    ValueError
        The input array is not of a base type (textual and numerical elements).
        The input array has 0 columns.

    Returns
    -------
    description : Dict[Union[str, int], Dict[str, \
Union[str, int, float, bool, np.ndarray]]]
        For 2-dimensional arrays a dictionary describing every column under a
        key corresponding to its index in the input array. For a 1-dimensional
        input array a dictionary describing that array.
    """
    # pylint: disable=too-many-locals,too-many-branches
    one_dimensional = fuav.is_1d_like(array)
    if one_dimensional:
        array = fuat.as_unstructured(array)
    elif not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 1- or '
                                  '2-dimensional.')

    if not fuav.is_base_array(array):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    # A 1-dimensional array is described as a whole
    if one_dimensional:
        if not (include is None and exclude is None):
            warnings.warn(
                'The input array is 1-dimensional. Ignoring include and '
                'exclude parameters.',
                category=UserWarning)

        if fuav.is_numerical_array(array):
            description = describe_numerical_array(array, **kwargs)
        elif fuav.is_textual_array(array):
            description = describe_categorical_array(array)
        else:  # pragma: no cover
            assert False, 'A base array should either be numerical or textual.'

        return description  # type: ignore

    # A 2-dimensional array is described column by column
    numerical_columns, categorical_columns = fuat.indices_by_type(array)
    structured = fuav.is_structured_array(array)

    if not numerical_columns.shape[0] + categorical_columns.shape[0]:
        raise ValueError('The input array cannot have 0 columns.')

    numerical_set = set(numerical_columns)
    categorical_set = set(categorical_columns)
    every_column = categorical_set.union(numerical_set)

    # Keep only the columns that are to be included...
    categorical_set, numerical_set = _filter_include_indices(
        categorical_set, numerical_set, include, every_column)
    # ...and drop the columns that are to be excluded
    categorical_set, numerical_set = _filter_exclude_indices(
        categorical_set, numerical_set, exclude, every_column)

    if not numerical_set.union(categorical_set):
        raise RuntimeError('None of the columns were selected to be '
                           'described.')

    def get_column(index):
        """Extracts a column from a structured or an unstructured array."""
        return array[index] if structured else array[:, index]

    columns_description = {}  # type: ignore
    for index in numerical_set:
        columns_description[index] = describe_numerical_array(
            get_column(index), **kwargs)
    for index in categorical_set:
        columns_description[index] = describe_categorical_array(
            get_column(index))

    return columns_description  # type: ignore
Exemplo n.º 18
0
    def __init__(self,
                 data: np.ndarray,
                 local_explanation: bool = True,
                 model: object = None,
                 **kwargs: Any) -> None:
        """
        Initialises a tabular LIME wrapper.

        The named parameters are split into the ones used to build the LIME
        tabular explainer and the ones memorised for later calls to its
        ``explain_instance`` method. A structured ``data`` array is converted
        to an unstructured one, with any ``categorical_features`` column names
        translated into positional column indices.

        The following attributes are set: ``mode``, ``tabular_explainer``,
        ``model``, ``model_is_probabilistic`` and ``explain_instance_params``.

        Parameters
        ----------
        data : numpy.ndarray
            A 2-dimensional, purely numerical (classic or structured) numpy
            array used to build the LIME tabular explainer.
        local_explanation : boolean, optional (default=True)
            The value used for LIME's ``sample_around_instance`` parameter
            unless that parameter is given explicitly among the named
            parameters.
        model : object, optional (default=None)
            A predictive model; it is stored in the ``model`` attribute after
            its functionality has been checked.
        **kwargs : Any
            Named parameters listed in either ``self._INIT_PARAMS`` or
            ``self._EXPLAIN_INSTANCE_PARAMS``.

        Warns
        -----
        FutureWarning
            Always emitted; it announces the deprecation of this wrapper.
        UserWarning
            Both a ``model`` and a ``predict_fn`` are provided; only the
            latter will be used.

        Raises
        ------
        AttributeError
            One of the named parameters is in neither ``self._INIT_PARAMS``
            nor ``self._EXPLAIN_INSTANCE_PARAMS``.
        IncompatibleModelError
            The ``model`` passes neither of the model functionality checks.
        IncorrectShapeError
            The ``data`` array is not 2-dimensional. The
            ``categorical_features`` parameter given for a structured
            ``data`` array is not 1-dimensional.
        TypeError
            The ``categorical_features`` parameter given for a structured
            ``data`` array is neither a list, a numpy array nor ``None``. The
            ``predict_fn`` parameter is not callable.
        ValueError
            The ``data`` array is not purely numerical. For a structured
            ``data`` array the ``categorical_features`` are not strings or are
            not valid indices of that array. The ``mode`` is neither
            ``'classification'`` nor ``'regression'``.
        """
        # pylint: disable=too-many-branches,too-many-statements

        warnings.warn(
            'The LIME wrapper will be deprecated in FAT Forensics version '
            '0.0.3. Please consider using the TabularBlimeyLime explainer '
            'class implemented in the fatf.transparency.predictions.'
            'surrogate_explainers module instead. Alternatively, you may '
            'consider building a custom surrogate explainer using the '
            'functionality implemented in FAT Forensics -- see the *Tabular '
            'Surrogates* how-to guide for more details.', FutureWarning)

        # Reject any named parameter that is known to neither the initialiser
        # nor the explain_instance method
        valid_params = self._INIT_PARAMS.union(self._EXPLAIN_INSTANCE_PARAMS)
        invalid_params = set(kwargs.keys()).difference(valid_params)
        if invalid_params:
            raise AttributeError('The following named parameters are not '
                                 'valid: {}.'.format(invalid_params))

        # Split parameters
        init_params = {
            key: kwargs[key]
            for key in kwargs if key in self._INIT_PARAMS
        }
        explain_params = {
            key: kwargs[key]
            for key in kwargs if key in self._EXPLAIN_INSTANCE_PARAMS
        }

        # Check data
        if not fuav.is_2d_array(data):
            raise IncorrectShapeError('The data parameter must be a '
                                      '2-dimensional numpy array.')
        if not fuav.is_numerical_array(data):
            raise ValueError('LIME does not support non-numerical data '
                             'arrays.')

        # Honour native local explanation keyword
        # (an explicitly given sample_around_instance value takes precedence
        # over the local_explanation parameter)
        local_explanation_keyword = 'sample_around_instance'
        if local_explanation_keyword not in init_params:
            init_params[local_explanation_keyword] = local_explanation

        # Sort out a structured data array
        if fuav.is_structured_array(data):
            categorical_indices_keyword = 'categorical_features'
            categorical_indices = init_params.get(categorical_indices_keyword,
                                                  None)

            if categorical_indices is not None:
                # Only a list or a numpy array is accepted; a list is
                # converted to a numpy array so it can be validated below
                if isinstance(categorical_indices, list):
                    categorical_indices = np.array(categorical_indices)
                elif isinstance(categorical_indices, np.ndarray):
                    pass
                else:
                    raise TypeError('The {} parameter either has to be a '
                                    'list, a numpy array or None.'.format(
                                        categorical_indices_keyword))

                if not fuav.is_1d_array(categorical_indices):
                    raise IncorrectShapeError(
                        '{} array/list is not '
                        '1-dimensional.'.format(categorical_indices_keyword))
                if not fuav.is_textual_array(categorical_indices):
                    raise ValueError('Since {} is an array of indices for '
                                     'a structured array, all of its elements '
                                     'should be strings.'.format(
                                         categorical_indices_keyword))

                # Check categorical indices
                if not fuat.are_indices_valid(data, categorical_indices):
                    raise ValueError(
                        'Indices given in the {} parameter '
                        'are not valid for the input data '
                        'array.'.format(categorical_indices_keyword))
                # Translate the column names into their positions in the
                # structured array since the data are made unstructured below
                init_params[categorical_indices_keyword] = np.array(
                    [data.dtype.names.index(y) for y in categorical_indices])

            data = fuat.as_unstructured(data)

        # Get a LIME tabular explainer
        # (the mode defaults to classification when it is not given)
        self.mode = init_params.get('mode', 'classification')
        if self.mode not in ['classification', 'regression']:
            raise ValueError("The mode must be either 'classification' or "
                             "'regression'. '{}' given.".format(self.mode))

        self.tabular_explainer = lime.lime_tabular.LimeTabularExplainer(
            data, **init_params)

        # Check the model
        # NOTE(review): check_model_functionality is defined outside of this
        # block; presumably it reports whether the model exposes the required
        # methods (with or without probabilistic predictions) -- confirm.
        self.model = model
        self.model_is_probabilistic = False
        if model is not None:
            if fumv.check_model_functionality(
                    model, require_probabilities=True, suppress_warning=True):
                self.model_is_probabilistic = True
            elif fumv.check_model_functionality(
                    model, require_probabilities=False, suppress_warning=True):
                self.model_is_probabilistic = False
                logger.warning('The model can only be used for LIME in a '
                               'regressor mode.')
            else:
                raise IncompatibleModelError('LIME requires a model object to '
                                             'have a fit method and '
                                             'optionally a predict_proba '
                                             'method.')

        # Check the predictive function and memorise parameters that may be
        # useful for explaining an instance
        pred_fn_name = 'predict_fn'
        if pred_fn_name in explain_params:
            prediction_function = explain_params[pred_fn_name]
            # Make sure that its a function
            if not callable(prediction_function):
                raise TypeError('The {} parameter is not callable -- it has '
                                'to be a function.'.format(pred_fn_name))

            # Warn the user if both a model and a function are provided
            if self.model is not None:
                warnings.warn(
                    'Since both, a model and a predictive function, are '
                    'provided only the latter will be used.', UserWarning)

        # These parameters serve as defaults for later explain_instance calls
        self.explain_instance_params = explain_params