Python is_textual_array示例

编程语言: Python

命名空间/包名称: fatf.utils.array.validation

方法/功能: is_textual_array

hotexamples.com的示例: 9

Python is_textual_array - 已找到9个示例。这些是从开源项目中提取的最受好评的fatf.utils.array.validation.is_textual_array现实Python示例。您可以评价示例，以帮助我们提高示例质量。

示例#1

显示文件

文件： distances.py 项目： mattclifford1/fat-forensics-1

def hamming_point_distance(y: Union[np.ndarray, np.void], X: np.ndarray,
                           **kwargs: bool) -> np.ndarray:
    """
    Computes the Hamming distance between ``y`` and each row of ``X``.

    ``y`` must be 1-dimensional-like -- either a 1-dimensional numpy array or
    a single row of a structured numpy array (numpy's void) -- and ``X`` must
    be a 2-dimensional numpy array. Both inputs have to be purely textual and
    the number of elements in ``y`` has to match the number of columns in
    ``X``.

    Parameters
    ----------
    y : Union[numpy.ndarray, numpy.void]
        The textual, 1-dimensional data point from which the distances are
        measured.
    X : numpy.ndarray
        The textual, 2-dimensional array to whose rows the distances are
        measured.
    **kwargs : boolean
        Keyword arguments forwarded to ``hamming_distance``, which is applied
        to every row of ``X`` via ``numpy.apply_along_axis``.

    Raises
    ------
    IncorrectShapeError
        ``y`` is not 1-dimensional, ``X`` is not 2-dimensional or the number
        of elements in ``y`` differs from the number of columns in ``X``.
    ValueError
        ``y`` or ``X`` is not purely textual.

    Returns
    -------
    distances : numpy.ndarray
        The Hamming distances between ``y`` and every row of ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')

    # Both inputs are converted to unstructured arrays before any further
    # validation takes place.
    point = fuat.as_unstructured(y)
    data = fuat.as_unstructured(X)

    # The point is validated first, then the data array.
    for label, candidate in (('y', point), ('X', data)):
        if not fuav.is_textual_array(candidate):
            raise ValueError(
                'The {} array should be textual.'.format(label))

    # The point needs exactly one element per column of the data array.
    if point.shape[0] != data.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of elements '
                                  'in the y array.')

    return np.apply_along_axis(hamming_distance, 1, data, point, **kwargs)

示例#2

显示文件

文件： distances.py 项目： mattclifford1/fat-forensics-1

def hamming_distance(x: Union[np.ndarray, np.void],
                     y: Union[np.ndarray, np.void],
                     **kwargs: bool) -> Union[int, float]:
    """
    Calculates the Hamming distance between two 1-dimensional textual arrays.

    Either input may be a 1-dimensional numpy array or a single row of a
    structured numpy array (numpy's void). The distance is the sum of the
    element-wise distances computed by ``hamming_distance_base``.

    Parameters
    ----------
    x : Union[numpy.ndarray, numpy.void]
        The first array; it has to be 1-dimensional and textual.
    y : Union[numpy.ndarray, numpy.void]
        The second array; it has to be 1-dimensional and textual.
    **kwargs : boolean
        Keyword arguments forwarded to the
        :func:`fatf.utils.distances.hamming_distance_base` function for every
        pair of corresponding elements.

    Raises
    ------
    IncorrectShapeError
        Either input is not 1-dimensional or the two inputs differ in length.
    ValueError
        Either input is not purely textual.

    Returns
    -------
    distance : Union[integer, float]
        The Hamming distance between ``x`` and ``y``.
    """
    # pylint: disable=invalid-name
    for label, candidate in (('x', x), ('y', y)):
        if not fuav.is_1d_like(candidate):
            raise IncorrectShapeError(
                'The {} array should be 1-dimensional.'.format(label))

    # Convert both inputs to unstructured arrays before checking their type.
    first = fuat.as_unstructured(x)
    second = fuat.as_unstructured(y)

    for label, candidate in (('x', first), ('y', second)):
        if not fuav.is_textual_array(candidate):
            raise ValueError(
                'The {} array should be textual.'.format(label))

    if first.shape[0] != second.shape[0]:
        raise IncorrectShapeError('The x and y arrays should have the same '
                                  'length.')

    def _element_distance(pair):
        # ``pair`` is one column of the stacked array: (x element, y element).
        return hamming_distance_base(pair[0], pair[1], **kwargs)

    # Stack the inputs as two rows so that every column holds a pair of
    # corresponding elements, then add up the per-pair distances.
    stacked = np.vstack((first, second))
    per_element = np.apply_along_axis(_element_distance, 0, stacked)
    return per_element.sum()

示例#3

显示文件

文件： distances.py 项目： mattclifford1/fat-forensics-1

def hamming_array_distance(X: np.ndarray, Y: np.ndarray,
                           **kwargs: bool) -> np.ndarray:
    """
    Computes the matrix of Hamming distances between rows of ``X`` and ``Y``.

    ``X`` and ``Y`` both have to be 2-dimensional, purely textual numpy arrays
    with the same number of columns.

    Parameters
    ----------
    X : numpy.ndarray
        A 2-dimensional, textual numpy array.
    Y : numpy.ndarray
        A 2-dimensional, textual numpy array.
    **kwargs : boolean
        Keyword arguments forwarded to ``hamming_point_distance``, which is
        applied to every row of ``X`` via ``numpy.apply_along_axis``.

    Raises
    ------
    IncorrectShapeError
        ``X`` or ``Y`` is not 2-dimensional, or the two arrays have a
        different number of columns.
    ValueError
        ``X`` or ``Y`` is not purely textual.

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of Hamming distances between the rows of ``X`` and the rows
        of ``Y``.
    """
    # pylint: disable=invalid-name
    # Shape checks come first (X then Y), followed by the type checks, which
    # are run on the inputs as given, i.e. before they are converted.
    for label, candidate in (('X', X), ('Y', Y)):
        if not fuav.is_2d_array(candidate):
            raise IncorrectShapeError(
                'The {} array should be 2-dimensional.'.format(label))
    for label, candidate in (('X', X), ('Y', Y)):
        if not fuav.is_textual_array(candidate):
            raise ValueError(
                'The {} array should be textual.'.format(label))

    # Convert both inputs to unstructured arrays.
    left = fuat.as_unstructured(X)
    right = fuat.as_unstructured(Y)

    # Rows can only be compared when both arrays have the same width.
    if left.shape[1] != right.shape[1]:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of columns '
                                  'in Y array.')

    # Every row of X is compared against the whole of Y.
    return np.apply_along_axis(hamming_point_distance, 1, left, right,
                               **kwargs)

示例#4

显示文件

    def __init__(self,
                 clf: sklearn.base.BaseEstimator,
                 feature_names: Optional[List[str]] = None,
                 class_names: Optional[List[str]] = None) -> None:
        """
        Initialises the ``SKLearnExplainer`` class.

        The constructor stores the model together with the feature and class
        names, queries the instance's own ``_validate_kind_fitted``,
        ``_is_classifier``, ``_get_features_number`` and
        ``_get_classes_array`` methods, and reconciles their results with the
        user-supplied names. Missing names are generated when the required
        information is available.

        Parameters
        ----------
        clf : sklearn.base.BaseEstimator
            The model to be explained.
        feature_names : List[string], optional (default=None)
            Names of the features. When ``None`` and the number of features is
            known, ``'feature <index>'`` names are generated.
        class_names : List[string], optional (default=None)
            Names of the classes. When ``None`` and the array of classes is
            known, ``'class <value>'`` names are generated.

        Warns
        -----
        UserWarning
            The length of the supplied ``feature_names`` or ``class_names``
            cannot be validated because the corresponding getter returned
            ``None``.

        Raises
        ------
        ValueError
            The length of ``feature_names`` differs from the number of
            features, or the length of ``class_names`` differs from the
            length of the classes array.
        """
        # Sanity-check the constructor arguments before storing them.
        assert _validate_input(clf, feature_names,
                               class_names), 'Invalid init parameters.'
        self.clf = clf
        self.feature_names = feature_names
        self.class_names = class_names

        # The model has to be of a supported kind and already fitted.
        assert self._validate_kind_fitted(), 'Unfitted or wrong type model.'

        # Is the model a classifier or a regressor?
        self.is_classifier = self._is_classifier()
        assert isinstance(self.is_classifier, bool), 'Has to be boolean.'

        # Number of features (data array columns) expected by the model; this
        # may be None.
        self.features_number = self._get_features_number()
        if self.features_number is not None:
            assert isinstance(self.features_number, int), 'Wrong type.'

        # Classes that the model can output; this may be None.
        self.classes_array = self._get_classes_array()
        if self.classes_array is not None:
            assert isinstance(self.classes_array, np.ndarray), 'Bad type.'
            assert fuav.is_1d_array(self.classes_array), 'Must be 1-D array.'
            assert (fuav.is_numerical_array(self.classes_array)
                    or fuav.is_textual_array(self.classes_array)), 'Bad type.'

        # Regressors carry no class information at all.
        if not self.is_classifier:
            no_class_information = (self.classes_array is None
                                    and self.class_names is None)
            assert no_class_information, \
                "Regressor's class_names and classes_array must both be None."

        # Feature names: generate them when missing, otherwise validate their
        # number against what the model reports (if it reports anything).
        if self.feature_names is None:
            if self.features_number is not None:
                logger.info('Generating missing feature names from the number '
                            'of features using "feature %d" pattern.')
                self.feature_names = list(
                    map('feature {}'.format, range(self.features_number)))
        elif self.features_number is None:
            warnings.warn(
                'Cannot validate the length of feature names list since '
                'the _get_features_number method '
                'returned None.', UserWarning)
        elif len(self.feature_names) != self.features_number:
            raise ValueError('The length of the feature_names list '
                             'is different than the number of '
                             'features extracted from the classifier.')

        # Class names: the same procedure, based on the classes array.
        if self.class_names is None:
            if self.classes_array is not None:
                logger.info('Generating missing class names from the array of '
                            'classes output by the classifier using '
                            '"class %s" pattern.')
                self.class_names = list(
                    map('class {}'.format, self.classes_array))
        elif self.classes_array is None:
            warnings.warn(
                'Cannot validate the length of class names list since the '
                '_get_classes_array method returned None.', UserWarning)
        elif self.classes_array.shape[0] != len(self.class_names):
            raise ValueError('The length of the class_names list is '
                             'different than the length of the '
                             'classes array extracted from the '
                             'classifier.')

示例#5

显示文件

def individual_conditional_expectation(
        dataset: np.ndarray,
        model: object,
        feature_index: Union[int, str],
        treat_as_categorical: Optional[bool] = None,
        steps_number: Optional[int] = None,
        include_rows: Optional[Union[int, List[int]]] = None,
        exclude_rows: Optional[Union[int, List[int]]] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes Individual Conditional Expectation (ICE) for a chosen feature.

    Given a dataset and a model, ICE of the selected feature is calculated
    for all of the target classes. When ``treat_as_categorical`` is not given,
    the type of the selected feature is inferred from the data: textual
    columns are treated as categorical and numerical columns as numerical.
    Otherwise the user decides how the feature is treated, with one exception:
    a textual column is always treated as categorical (a warning is emitted if
    the user asked for a numerical treatment). For numerical features
    ``steps_number`` controls how many values of the feature the model is
    evaluated for; it defaults to 100.

    The rows of the dataset used for the computation can be narrowed down
    with ``include_rows`` and ``exclude_rows``. With only ``include_rows``
    given, ICE is computed for these rows; with only ``exclude_rows`` given,
    these rows are removed from the whole dataset; with both given, the set
    difference of the two is used.

    This approach is an implementation of a method introduced by
    [GOLDSTEIN2015PEEKING]_. It is intended to be used with probabilistic
    models, therefore the input model must have a ``predict_proba`` method.

    .. [GOLDSTEIN2015PEEKING] Goldstein, A., Kapelner, A., Bleich, J. and
       Pitkin, E., 2015. Peeking inside the black box: Visualizing statistical
       learning with plots of individual conditional expectation. Journal of
       Computational and Graphical Statistics, 24(1), pp.44-65.

    Parameters
    ----------
    dataset : numpy.ndarray
        The dataset used to compute ICE.
    model : object
        A fitted model whose predictions are used to compute ICE. (Please
        see :class:`fatf.utils.models.models.Model` class documentation for the
        expected model object specification.)
    feature_index : Union[integer, string]
        The index of the dataset column (feature) for which ICE is computed.
    treat_as_categorical : boolean, optional (default=None)
        Whether the selected feature should be treated as categorical or
        numerical. ``None`` means that this is inferred from the data.
    steps_number : integer, optional (default=None, i.e. 100)
        The number of evenly spaced samples between the minimum and the maximum
        value of the selected feature for which the model's prediction will be
        evaluated. (This parameter applies only to numerical features.)
    include_rows : Union[int, List[int]], optional (default=None)
        A single row index or a *list* of row indices to be used for the ICE
        computation. If ``exclude_rows`` is given as well, the set difference
        of the two is used.
    exclude_rows : Union[int, List[int]], optional (default=None)
        A single row index or a *list* of row indices to be left out of the
        ICE computation. Without ``include_rows`` these are removed from all
        of the dataset rows; with it, from the included rows.

    Warns
    -----
    UserWarning
        The feature is treated as categorical but ``steps_number`` is not
        ``None``, in which case ``steps_number`` is ignored. A warning is also
        emitted when the selected feature is textual (categorical) but
        ``treat_as_categorical`` was set to ``False``.

    Raises
    ------
    IncompatibleModelError
        The model does not have required functionality -- it needs to be able
        to output probabilities via ``predict_proba`` method.
    IncorrectShapeError
        The input dataset is not a 2-dimensional numpy array.
    IndexError
        Provided feature (column) index is invalid for the input dataset.
    TypeError
        ``treat_as_categorical`` is not ``None`` or boolean. The
        ``steps_number`` parameter is not ``None`` or integer. Either
        ``include_rows`` or ``exclude_rows`` parameter is not ``None``, an
        integer or a list of integers.
    ValueError
        The input dataset must only contain base types (textual and numerical
        values). One of the ``include_rows`` or ``exclude_rows`` indices is not
        valid for the input dataset. The ``steps_number`` is smaller than 2.

    Returns
    -------
    ice : numpy.ndarray
        The Individual Conditional Expectations for the selected rows and
        feature, shaped (n_samples, steps_number, n_classes): n_samples is
        the number of rows selected for the computation, steps_number is the
        number of values generated for the selected feature and n_classes is
        the number of target classes. Each entry is the probability of a
        class for a data point with the selected feature fixed to one of the
        values in ``feature_linespace``.
    feature_linespace : numpy.ndarray
        A 1-dimensional array -- (steps_number, ) -- holding the values that
        were substituted for the selected feature when the model was
        evaluated.
    """
    # pylint: disable=too-many-arguments,too-many-locals
    assert _input_is_valid(dataset, model, feature_index, treat_as_categorical,
                           steps_number), 'Input must be valid.'

    # Structured arrays are indexed by field name, classic ones by column.
    column = (dataset[feature_index] if fuav.is_structured_array(dataset)
              else dataset[:, feature_index])
    assert fuav.is_1d_array(column), 'Column must be a 1-dimensional array.'

    # A numerical column is not categorical; a textual one is.
    if fuav.is_numerical_array(column):
        is_categorical_column = False
    elif fuav.is_textual_array(column):
        is_categorical_column = True
    else:
        assert False, 'Must be an array of a base type.'  # pragma: nocover

    if treat_as_categorical is None:
        # Nothing was requested, so go with the type found in the data.
        treat_as_categorical = is_categorical_column
    elif is_categorical_column and not treat_as_categorical:
        # A textual column cannot be treated as numerical: override the
        # request and drop steps_number so that no second warning follows.
        message = ('Selected feature is categorical (string-base elements), '
                   'however the treat_as_categorical was set to False. Such '
                   'a combination is not possible. The feature will be '
                   'treated as categorical.')
        warnings.warn(message, category=UserWarning)
        treat_as_categorical = True
        steps_number = None

    if treat_as_categorical:
        if steps_number is not None:
            warnings.warn(
                'The steps_number parameter will be ignored as the feature is '
                'being treated as categorical.',
                category=UserWarning)
    elif steps_number is None:
        # Default number of steps for numerical features.
        steps_number = 100

    selected_rows = _filter_rows(include_rows, exclude_rows, dataset.shape[0])
    selected_data = dataset[selected_rows]

    sampled_data, feature_linespace = _interpolate_array(
        selected_data, feature_index, treat_as_categorical, steps_number)

    # One predict_proba call per element of the sampled data, stacked along a
    # new leading axis.
    ice = np.stack(
        [
            model.predict_proba(data_slice)  # type: ignore
            for data_slice in sampled_data
        ],
        axis=0)

    return ice, feature_linespace

示例#6

显示文件

def describe_array(
        array: np.ndarray,
        include: Optional[Union[str, int, List[Union[str, int]]]] = None,
        exclude: Optional[Union[str, int, List[Union[str, int]]]] = None,
        **kwargs: bool
) -> Dict[Union[str, int],
          Union[str, int, float, bool, np.ndarray,
                Dict[str, Union[str, int, float, bool, np.ndarray]]]
          ]:  # yapf: disable
    """
    Describes numerical and categorical (textual) columns of an array.

    Numerical columns are described with ``describe_numerical_array`` and
    categorical columns with ``describe_categorical_array``; see the
    documentation of these two functions (in the
    :mod:`fatf.transparency.data.describe_functions` module) for the details
    of each description.

    The columns to be described can be narrowed down with the ``include`` and
    ``exclude`` parameters. Either of them can be a list of column indices, a
    single index (a string or an integer), or one of the ``'numerical'`` and
    ``'categorical'`` keywords, which select all of the columns of the given
    kind. By default all of the columns are described.

    Parameters
    ----------
    array : numpy.ndarray
        The array to be described.
    include : Union[str, int, List[Union[str, int]]], optional (default=None)
        Column indices to be included in the description. ``None`` (the
        default) includes all of the columns. A single index (a string or an
        integer) describes just that one column. ``'numerical'`` or
        ``'categorical'`` includes only the numerical or only the categorical
        columns respectively.
    exclude : Union[str, int, List[Union[str, int]]], optional (default=None)
        Column indices to be excluded from the description. ``None`` (the
        default) excludes none of the columns. A single index (a string or an
        integer) excludes just that one column. ``'numerical'`` or
        ``'categorical'`` excludes either all of the numerical or all of the
        categorical columns respectively.
    **kwargs : bool
        Keyword arguments forwarded to ``describe_numerical_array`` whenever a
        numerical array or column is described.

    Warns
    -----
    UserWarning
        ``include`` or ``exclude`` is given for a 1-dimensional input array,
        in which case both parameters are ignored.

    Raises
    ------
    IncorrectShapeError
        The input array is neither 1- nor 2-dimensional.
    RuntimeError
        None of the columns were selected to be described.
    ValueError
        The input array is not of a base type (textual and numerical elements).
        The input array has 0 columns.

    Returns
    -------
    description : dictionary
        For a 2-dimensional array, a dictionary holding the description of
        every selected column under a key equal to the column's index in the
        input array. For a 1-dimensional array, the dictionary describing
        that array.
    """
    # pylint: disable=too-many-locals,too-many-branches
    is_1d = fuav.is_1d_like(array)
    if is_1d:
        array = fuat.as_unstructured(array)
    # The 2-D check is only run when the input is not 1-dimensional-like.
    is_2d = not is_1d and fuav.is_2d_array(array)

    if not (is_1d or is_2d):
        raise IncorrectShapeError('The input array should be 1- or '
                                  '2-dimensional.')

    if not fuav.is_base_array(array):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    if is_1d:
        # Column filters are meaningless for a single vector.
        if not (include is None and exclude is None):
            warnings.warn(
                'The input array is 1-dimensional. Ignoring include and '
                'exclude parameters.',
                category=UserWarning)

        if fuav.is_numerical_array(array):
            description = describe_numerical_array(array, **kwargs)
        elif fuav.is_textual_array(array):
            description = describe_categorical_array(array)
        else:  # pragma: no cover
            assert False, 'A base array should either be numerical or textual.'
    elif is_2d:
        numerical_indices, categorical_indices = fuat.indices_by_type(array)
        is_structured_array = fuav.is_structured_array(array)

        columns_number = (
            numerical_indices.shape[0] + categorical_indices.shape[0])
        if columns_number == 0:
            raise ValueError('The input array cannot have 0 columns.')

        numerical_columns = set(numerical_indices)
        categorical_columns = set(categorical_indices)
        all_columns = categorical_columns.union(numerical_columns)

        # Keep only the columns requested with ``include``...
        categorical_columns, numerical_columns = _filter_include_indices(
            categorical_columns, numerical_columns, include, all_columns)

        # ...and drop the ones listed in ``exclude``.
        categorical_columns, numerical_columns = _filter_exclude_indices(
            categorical_columns, numerical_columns, exclude, all_columns)

        remaining_columns = numerical_columns.union(categorical_columns)
        if not remaining_columns:
            raise RuntimeError('None of the columns were selected to be '
                               'described.')

        # Structured arrays are indexed by field name, classic ones by column.
        description = {}
        for idx in numerical_columns:
            column = array[idx] if is_structured_array else array[:, idx]
            description[idx] = describe_numerical_array(  # type: ignore
                column, **kwargs)
        for idx in categorical_columns:
            column = array[idx] if is_structured_array else array[:, idx]
            description[idx] = describe_categorical_array(  # type: ignore
                column)
    else:  # pragma: no cover
        assert False, 'The input array can only be 1- or 2-dimensional.'

    return description  # type: ignore

示例#7

显示文件

def describe_categorical_array(
    array: Union[np.ndarray, np.void]
) -> Dict[str, Union[str, int, bool, np.ndarray]]:
    """
    Computes basic statistics describing a categorical numpy array.

    The returned description is a dictionary with the following keys:

    ``count`` : integer
        The number of elements in the array.

    ``unique`` : numpy.ndarray
        The unique values found in the array, sorted lexicographically.

    ``unique_counts`` : numpy.ndarray
        The number of occurrences of every unique value, in the same order as
        ``unique``.

    ``top`` : string
        The most frequent value in the array.

    ``freq`` : integer
        The number of occurrences of the most frequent value.

    ``is_top_unique`` : boolean
        Indicates whether the most frequent value (``top``) is the only one
        with the ``freq`` count.

    Parameters
    ----------
    array : Union[numpy.ndarray, numpy.void]
        The array to be described.

    Raises
    ------
    IncorrectShapeError
        The input array is not 1-dimensional.
    ValueError
        The input array is empty.

    Warns
    -----
    UserWarning
        The input array is not purely textual, hence it is converted to a
        string type before being described.

    Returns
    -------
    categorical_description : Dict[string, Union[string, integer, \
boolean, numpy.ndarray]]
        A dictionary describing the categorical input array.
    """
    if not fuav.is_1d_like(array):
        raise IncorrectShapeError('The input array should be 1-dimensional.')

    values = fuat.as_unstructured(array)
    assert values.ndim == 1, '1D arrays only at this point.'

    elements_number = values.shape[0]
    if elements_number == 0:
        raise ValueError('The input array cannot be empty.')

    if not fuav.is_textual_array(values):
        warnings.warn(
            'The input array is not purely categorical. Converting the input '
            'array into a textual type to facilitate a categorical '
            'description.',
            category=UserWarning)
        values = values.astype(str)

    unique, unique_counts = np.unique(values, return_counts=True)

    # Order the unique values (and their counts accordingly).
    ordering = np.argsort(unique)
    unique, unique_counts = unique[ordering], unique_counts[ordering]

    # argmax returns the first maximum, so ties resolve to the earliest value.
    top_index = np.argmax(unique_counts)
    freq = unique_counts[top_index]

    # The top value is unique when no other value shares its count.
    is_top_unique = np.sum(unique_counts == freq) < 2

    categorical_description = {
        'count': elements_number,
        'unique': unique,
        'unique_counts': unique_counts,
        'top': unique[top_index],
        'freq': freq,
        'is_top_unique': is_top_unique
    }

    return categorical_description

示例#8

显示文件

    def __init__(self,
                 data: np.ndarray,
                 local_explanation: bool = True,
                 model: object = None,
                 **kwargs: Any) -> None:
        """
        Initialises a tabular LIME wrapper.

        A ``FutureWarning`` announcing the deprecation of this wrapper is
        emitted first. The named parameters are then split into those listed
        in ``_INIT_PARAMS`` (passed to ``LimeTabularExplainer``) and those
        listed in ``_EXPLAIN_INSTANCE_PARAMS`` (memorised in
        ``explain_instance_params``). A structured ``data`` array has its
        ``categorical_features`` names translated into positional indices and
        is converted to an unstructured array before the LIME explainer is
        created.

        Parameters
        ----------
        data : numpy.ndarray
            A 2-dimensional, numerical data array.
        local_explanation : boolean, optional (default=True)
            Used as the value of LIME's ``sample_around_instance`` parameter
            unless that parameter is given explicitly.
        model : object, optional (default=None)
            A model to be explained.
        **kwargs : Any
            Named parameters; each one has to be listed in ``_INIT_PARAMS`` or
            ``_EXPLAIN_INSTANCE_PARAMS``.

        Warns
        -----
        FutureWarning
            Always; the wrapper is going to be deprecated.
        UserWarning
            Both a model and a ``predict_fn`` function are given, in which
            case only the latter will be used.

        Raises
        ------
        AttributeError
            Some of the named parameters are not valid.
        IncompatibleModelError
            The model does not pass either of the model functionality checks.
        IncorrectShapeError
            ``data`` is not 2-dimensional or ``categorical_features`` is not
            1-dimensional.
        TypeError
            ``categorical_features`` is neither a list, a numpy array nor
            ``None``; or ``predict_fn`` is not callable.
        ValueError
            ``data`` is not numerical; ``categorical_features`` of a
            structured array is not textual or holds invalid indices; or
            ``mode`` is neither ``'classification'`` nor ``'regression'``.
        """
        # pylint: disable=too-many-branches,too-many-statements

        warnings.warn(
            'The LIME wrapper will be deprecated in FAT Forensics version '
            '0.0.3. Please consider using the TabularBlimeyLime explainer '
            'class implemented in the fatf.transparency.predictions.'
            'surrogate_explainers module instead. Alternatively, you may '
            'consider building a custom surrogate explainer using the '
            'functionality implemented in FAT Forensics -- see the *Tabular '
            'Surrogates* how-to guide for more details.', FutureWarning)

        # Reject any named parameter that neither group recognises.
        valid_params = self._INIT_PARAMS.union(self._EXPLAIN_INSTANCE_PARAMS)
        invalid_params = set(kwargs).difference(valid_params)
        if invalid_params:
            raise AttributeError('The following named parameters are not '
                                 'valid: {}.'.format(invalid_params))

        # Separate constructor parameters from explain-instance parameters.
        init_params = {
            name: value
            for name, value in kwargs.items() if name in self._INIT_PARAMS
        }
        explain_params = {
            name: value
            for name, value in kwargs.items()
            if name in self._EXPLAIN_INSTANCE_PARAMS
        }

        # Validate the data array.
        if not fuav.is_2d_array(data):
            raise IncorrectShapeError('The data parameter must be a '
                                      '2-dimensional numpy array.')
        if not fuav.is_numerical_array(data):
            raise ValueError('LIME does not support non-numerical data '
                             'arrays.')

        # An explicitly given native LIME keyword takes precedence over the
        # local_explanation parameter.
        init_params.setdefault('sample_around_instance', local_explanation)

        # Structured arrays need their categorical feature names mapped to
        # positions before the data are converted to an unstructured array.
        if fuav.is_structured_array(data):
            cat_keyword = 'categorical_features'
            cat_indices = init_params.get(cat_keyword)

            if cat_indices is not None:
                if isinstance(cat_indices, list):
                    cat_indices = np.array(cat_indices)
                elif not isinstance(cat_indices, np.ndarray):
                    raise TypeError('The {} parameter either has to be a '
                                    'list, a numpy array or None.'.format(
                                        cat_keyword))

                if not fuav.is_1d_array(cat_indices):
                    raise IncorrectShapeError(
                        '{} array/list is not '
                        '1-dimensional.'.format(cat_keyword))
                if not fuav.is_textual_array(cat_indices):
                    raise ValueError('Since {} is an array of indices for '
                                     'a structured array, all of its elements '
                                     'should be strings.'.format(cat_keyword))

                # The names have to exist in the data array.
                if not fuat.are_indices_valid(data, cat_indices):
                    raise ValueError(
                        'Indices given in the {} parameter '
                        'are not valid for the input data '
                        'array.'.format(cat_keyword))

                field_names = data.dtype.names
                init_params[cat_keyword] = np.array(
                    [field_names.index(name) for name in cat_indices])

            data = fuat.as_unstructured(data)

        # Build the LIME tabular explainer.
        self.mode = init_params.get('mode', 'classification')
        if self.mode not in ('classification', 'regression'):
            raise ValueError("The mode must be either 'classification' or "
                             "'regression'. '{}' given.".format(self.mode))

        self.tabular_explainer = lime.lime_tabular.LimeTabularExplainer(
            data, **init_params)

        # Validate the model: try the probabilistic check first and fall back
        # to the non-probabilistic one.
        self.model = model
        self.model_is_probabilistic = False
        if model is not None:
            if fumv.check_model_functionality(
                    model, require_probabilities=True, suppress_warning=True):
                self.model_is_probabilistic = True
            elif fumv.check_model_functionality(
                    model, require_probabilities=False, suppress_warning=True):
                self.model_is_probabilistic = False
                logger.warning('The model can only be used for LIME in a '
                               'regressor mode.')
            else:
                raise IncompatibleModelError('LIME requires a model object to '
                                             'have a fit method and '
                                             'optionally a predict_proba '
                                             'method.')

        # Validate the predictive function, if one was given.
        pred_fn_name = 'predict_fn'
        if pred_fn_name in explain_params:
            if not callable(explain_params[pred_fn_name]):
                raise TypeError('The {} parameter is not callable -- it has '
                                'to be a function.'.format(pred_fn_name))

            # Let the user know which of the two is going to be used.
            if self.model is not None:
                warnings.warn(
                    'Since both, a model and a predictive function, are '
                    'provided only the latter will be used.', UserWarning)

        # Memorised for later use when explaining an instance.
        self.explain_instance_params = explain_params

示例#9

显示文件

文件： tools.py 项目： enrsr/fat-forensics-1

def group_by_column(
    dataset: np.ndarray,
    column_index: Index,
    groupings: Optional[List[Union[float, Tuple[str]]]] = None,
    numerical_bins_number: int = 5,
    treat_as_categorical: Optional[bool] = None
) -> Tuple[List[List[int]], List[str]]:
    """
    Groups row indices of an array based on value grouping of a chosen column.

    If selected column is numerical, by default the values are grouped into 5
    bins equally distributed between the minimum and the maximum value of the
    column. The number of bins can be changed with the
    ``numerical_bins_number`` if desired. Alternatively, the exact bin
    boundaries can be given via the ``groupings`` parameter.

    For categorical columns, the default binning is one bin for every unique
    value in the selected column. This behaviour can be changed by providing
    the ``groupings`` parameter, where multiple values can be selected to
    create one bin.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be used for grouping the row indices.
    column_index : Union[string, integer]
        A column index (a string for structured numpy arrays or an integer for
        unstructured arrays) of the column based on which the row indices will
        be partitioned.
    groupings : List[Union[number, Tuple[string]]], optional (default=None)
        A list of user-specified groupings for the selected column. The default
        grouping for categorical (textual) columns is splitting them by all the
        unique values therein. The numerical columns are, by default, binned
        into 5 bins (see the ``numerical_bins_number`` parameter) uniformly
        distributed between the minimum and the maximum value of the column.
        To introduce custom binning for a categorical column ``groupings``
        parameter should be a list of tuples, where every tuple represents a
        single group. For example, a column with the following unique values
        ``['a', 'b', 'c', 'd']`` can be split into two groups: ``['a', 'd']``
        and ``['b', 'c']`` by providing ``[('a', 'd'), ('b', 'c')]`` grouping.
        For numerical columns custom grouping should be introduced as a list of
        bucket boundaries. Every bucket includes all the values that are
        **less or equal** to the specified bucket boundary and greater than the
        previous boundary if one is given.
    numerical_bins_number : integer, optional (default=5)
        The number of bins used for default binning of numerical columns.
    treat_as_categorical : boolean, optional (default=None)
        Whether the selected column should be treated as a categorical or
        numerical feature. If set to ``None``, the type of the column will be
        inferred from the data therein. If set to ``False``, the column will be
        treated as numerical unless it is string-based in which case a warning
        will be emitted and the column will be treated as categorical despite
        this setting. Finally, if set to ``True``, the column will be treated
        as categorical.

    Warns
    -----
    UserWarning
        When grouping is done on a categorical column a warning is emitted when
        some of the values in that column are not accounted for, i.e. they are
        not included in the ``groupings`` parameter. Also, if some of the rows
        are not included in any of the groupings, a warning is shown. Missing
        row indices may be a result of some of the values being not-a-number
        for a numerical column and missing some of the unique values for a
        categorical column. Finally, a warning is emitted when the
        ``treat_as_categorical`` parameter is set to ``False``, however the
        feature selected is string-based (i.e. categorical), therefore cannot
        be treated as a numerical one.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not 2-dimensional.
    IndexError
        The supplied ``column_index`` is not valid for the input ``dataset``.
    TypeError
        The column index is neither a string nor an integer. The numerical bins
        number is not an integer. The ``groupings`` parameter is neither a list
        nor ``None``. One of the grouping bin boundaries (for a numerical
        feature column) is not a number. One of the groupings (for a
        categorical feature column) is not a tuple. The
        ``treat_as_categorical`` parameter is neither a boolean nor ``None``.
    ValueError
        The input ``dataset`` is not of a base type. The numerical bins number
        is less than 2. The ``groupings`` list is empty. The numbers in the
        ``groupings`` parameter are not monotonically increasing (for a
        numerical column). There are duplicate values shared among tuples in
        the ``groupings`` parameter or one of the values does not appear in the
        selected column (for a categorical column).

    Returns
    -------
    indices_per_bin : List[List[integer]]
        A list of lists with the latter one holding row indices of a particular
        group. The row indices within every group are listed in ascending
        order.
    bin_names : List[string]
        A list holding a description of each group.
    """
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input array should be 2-dimensional.')

    if not fuav.is_base_array(dataset):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    # Check index validity
    if not isinstance(column_index, (str, int)):
        raise TypeError('The column index can either be a string or an '
                        'integer.')
    if not fuat.are_indices_valid(dataset, np.array([column_index])):
        raise IndexError('*{}* is not a valid column index for the input '
                         'dataset.'.format(column_index))

    # Check the number of numerical bins
    if not isinstance(numerical_bins_number, int):
        raise TypeError('The numerical_bins_number parameter has to be an '
                        'integer.')
    if numerical_bins_number < 2:
        raise ValueError('The numerical_bins_number needs to be at least '
                         '2.')

    # Check treat_as_categorical
    if (treat_as_categorical is not None
            and not isinstance(treat_as_categorical, bool)):
        raise TypeError('The treat_as_categorical parameter has to be a '
                        'boolean.')

    # Structured arrays are indexed by field name, classic ones by position
    if fuav.is_structured_array(dataset):
        column = dataset[column_index]
    else:
        column = dataset[:, column_index]
    assert fuav.is_1d_array(column), 'This must be a 1D numpy array.'

    # Get a list of all the row indices
    all_row_indices = set(range(column.shape[0]))

    indices_per_bin = []
    bin_names = []
    indices_seen_so_far = set()  # type: Set[int]

    is_numerical_column = fuav.is_numerical_array(column)
    is_categorical_column = fuav.is_textual_array(column)
    assert is_numerical_column is not is_categorical_column, \
        'The column must be a base array.'

    # Sort out numerical/categorical column treatment
    if treat_as_categorical is None:
        go_numerical = is_numerical_column
    elif treat_as_categorical:
        go_numerical = False
    elif is_numerical_column:
        go_numerical = True
    else:
        # A textual column cannot be binned by numerical boundaries, so the
        # user's request is overridden (with a warning).
        warnings.warn(
            'Selected feature is categorical, therefore cannot be '
            'treated as numerical. The feature will be treated as '
            'categorical despite the treat_as_categorical parameter '
            'set to False.', UserWarning)
        go_numerical = False

    if go_numerical:
        if groupings is None:
            # Get default bins: the first linspace point (the minimum) is
            # dropped, which leaves numerical_bins_number - 1 inner boundaries
            # and hence numerical_bins_number bins.
            # NOTE: min/max propagate NaN, so if the column holds any NaN all
            # of the default boundaries are NaN and no row lands in a bin;
            # these rows are reported by the warning at the end.
            bins = np.linspace(column.min(),
                               column.max(),
                               num=numerical_bins_number,
                               endpoint=False)[1:].tolist()
        elif isinstance(groupings, list):
            if not groupings:
                raise ValueError('A numerical grouping list has to contain at '
                                 'least one element.')

            # Every element in the groupings list must be a number
            for i, number in enumerate(groupings):
                if not isinstance(number, Number):
                    raise TypeError('For a numerical column all of the '
                                    'grouping items must be numbers. *{}* '
                                    'is not a number.'.format(number))
                if i != 0 and number <= groupings[i - 1]:
                    raise ValueError('The numbers in the groupings list '
                                     'have to be monotonically '
                                     'increasing.')
            bins = groupings
        else:
            raise TypeError('Since a numerical column was chosen the grouping '
                            'must be a list of bin boundaries or None.')

        assert bins, 'At least one bin boundary is required.'

        lower_edge = 'x <= {}'
        middle = '{} < x <= {}'
        upper_edge = '{} < x'

        for i, edge in enumerate(bins):
            if i == 0:
                in_bin = column <= edge
                bin_name = lower_edge.format(edge)
            else:
                edge_lower = bins[i - 1]
                # A single boolean mask keeps the row indices in ascending
                # order (set intersection would make it hash-dependent).
                in_bin = (column > edge_lower) & (column <= edge)
                bin_name = middle.format(edge_lower, edge)
            indices = np.where(in_bin)[0].tolist()

            indices_per_bin.append(indices)
            bin_names.append(bin_name)

            assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
            indices_seen_so_far.update(indices)

        # Everything above the last boundary goes into the final bin
        last_edge = bins[-1]
        indices = np.where(column > last_edge)[0].tolist()

        indices_per_bin.append(indices)
        bin_names.append(upper_edge.format(last_edge))

        assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
        indices_seen_so_far.update(indices)
    else:
        # numpy.unique already returns the unique values sorted
        unique_elements = np.unique(column).tolist()

        if groupings is None:
            bins = [(i, ) for i in unique_elements]
        elif isinstance(groupings, list):
            if not groupings:
                raise ValueError('A categorical grouping list has to contain '
                                 'at least one element.')

            values_seen_so_far = set()  # type: Set[str]

            # Every element in the groupings list must be a valid tuple
            for value_tuple in groupings:
                if not isinstance(value_tuple, tuple):
                    raise TypeError('For a categorical column all of the '
                                    'grouping items must be tuples. *{}* '
                                    'is not a tuple.'.format(value_tuple))
                for value in value_tuple:
                    if value not in unique_elements:
                        raise ValueError('*{}* value is not present in the '
                                         'selected column.'.format(value))

                if values_seen_so_far.intersection(value_tuple):
                    raise ValueError('Some values are duplicated across '
                                     'tuples.')
                values_seen_so_far.update(value_tuple)

            unaccounted_values = set(unique_elements).difference(
                values_seen_so_far)
            if unaccounted_values:
                warnings.warn(
                    'The following values in the selected column were not '
                    'accounted for in the grouping '
                    'tuples:\n{}.'.format(unaccounted_values), UserWarning)

            # Normalise the bins: values sorted within every tuple and the
            # tuples themselves sorted, so that the output order does not
            # depend on how the user ordered the groupings.
            bins = sorted(
                tuple(sorted(i)) for i in groupings)  # type: ignore
        else:
            raise TypeError('Since a categorical column was chosen the '
                            'grouping must be a list of tuples representing '
                            'categorical values grouping or None for the '
                            'default grouping.')

        for bin_values in bins:
            matched = set()  # type: Set[int]
            for value in bin_values:
                matched.update(np.where(column == value)[0].tolist())
            # Sorted for a deterministic, ascending order of row indices
            indices = sorted(matched)

            indices_per_bin.append(indices)
            bin_names.append('{}'.format(bin_values))

            assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
            indices_seen_so_far.update(indices)

    # Validate that all of the row indices were accounted for
    missed_indices = all_row_indices.difference(indices_seen_so_far)
    if missed_indices:
        warnings.warn(
            'The following row indices could not be accounted for:\n{}.\n For '
            'a numerical column there may have been some numpy.nan therein. '
            'For a categorical column some of the column values were probably '
            'not specified in the grouping, in which case there should be a '
            'separate user warning.'.format(missed_indices), UserWarning)

    return indices_per_bin, bin_names