def euclidean_point_distance(y: Union[np.ndarray, np.void],
                             X: np.ndarray) -> np.ndarray:
    """
    Computes Euclidean distances from a single point to each row of an array.

    The point ``y`` may be a 1-dimensional numerical numpy array or a single
    row of a structured numpy array (numpy's void). ``X`` is a 2-dimensional
    numerical numpy array whose number of columns matches the number of
    elements in ``y``.

    Parameters
    ----------
    y : Union[numpy.ndarray, numpy.void]
        The reference point (1-dimensional and purely numerical).
    X : numpy.ndarray
        The array (2-dimensional and purely numerical) whose rows are
        compared against ``y``.

    Raises
    ------
    IncorrectShapeError
        ``y`` is not 1-dimensional, ``X`` is not 2-dimensional or the number
        of elements in ``y`` differs from the number of columns in ``X``.
    ValueError
        Either of the inputs is not purely numerical.

    Returns
    -------
    distances : numpy.ndarray
        Euclidean distances between ``y`` and each row of ``X``.
    """
    # pylint: disable=invalid-name
    if not fuav.is_1d_like(y):
        raise IncorrectShapeError('The y array should be 1-dimensional.')
    if not fuav.is_2d_array(X):
        raise IncorrectShapeError('The X array should be 2-dimensional.')

    # Work on plain (unstructured) views of both inputs.
    point = fuat.as_unstructured(y)
    matrix = fuat.as_unstructured(X)

    numerical_checks = (
        (point, 'The y array should be purely numerical.'),
        (matrix, 'The X array should be purely numerical.'),
    )
    for array, message in numerical_checks:
        if not fuav.is_numerical_array(array):
            raise ValueError(message)

    # The point must have one element per column of the matrix.
    elements_number = point.shape[0]
    columns_number = matrix.shape[1]
    if elements_number != columns_number:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of elements '
                                  'in the y array.')

    # Row by row, delegate to the point-to-point distance function.
    return np.apply_along_axis(euclidean_distance, 1, matrix, point)
def euclidean_array_distance(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """
    Computes the matrix of Euclidean distances between rows of two arrays.

    ``X`` and ``Y`` both have to be 2-dimensional, purely numerical numpy
    arrays with the same number of columns.

    Parameters
    ----------
    X : numpy.ndarray
        The first array -- 2-dimensional and purely numerical.
    Y : numpy.ndarray
        The second array -- 2-dimensional and purely numerical.

    Raises
    ------
    IncorrectShapeError
        ``X`` or ``Y`` is not 2-dimensional, or the two arrays do not have
        the same number of columns.
    ValueError
        Either of the input arrays is not purely numerical.

    Returns
    -------
    distance_matrix : numpy.ndarray
        A matrix of Euclidean distances between rows in ``X`` and ``Y``; one
        row of distances (to every row of ``Y``) per row of ``X``.
    """
    # pylint: disable=invalid-name
    shape_checks = (
        (X, 'The X array should be 2-dimensional.'),
        (Y, 'The Y array should be 2-dimensional.'),
    )
    for array, message in shape_checks:
        if not fuav.is_2d_array(array):
            raise IncorrectShapeError(message)

    type_checks = (
        (X, 'The X array should be purely numerical.'),
        (Y, 'The Y array should be purely numerical.'),
    )
    for array, message in type_checks:
        if not fuav.is_numerical_array(array):
            raise ValueError(message)

    # Work on plain (unstructured) views of both inputs.
    right = fuat.as_unstructured(Y)
    left = fuat.as_unstructured(X)

    right_width = right.shape[1]
    left_width = left.shape[1]
    if right_width != left_width:
        raise IncorrectShapeError('The number of columns in the X array '
                                  'should the same as the number of columns '
                                  'in Y array.')

    # For every row of X compute its distances to all rows of Y.
    return np.apply_along_axis(euclidean_point_distance, 1, left, right)
def euclidean_distance(x: Union[np.ndarray, np.void],
                       y: Union[np.ndarray, np.void]) -> float:
    """
    Computes the Euclidean distance between two 1-dimensional "arrays".

    Either input may be a 1-dimensional numpy array or a single row of a
    structured numpy array (numpy's void).

    Parameters
    ----------
    x : Union[numpy.ndarray, numpy.void]
        The first point (1-dimensional and purely numerical).
    y : Union[numpy.ndarray, numpy.void]
        The second point (1-dimensional and purely numerical).

    Raises
    ------
    IncorrectShapeError
        Either input is not 1-dimensional or the two inputs differ in
        length.
    ValueError
        Either input is not purely numerical.

    Returns
    -------
    distance : float
        The Euclidean distance between ``x`` and ``y``.
    """
    # pylint: disable=invalid-name
    shape_checks = (
        (x, 'The x array should be 1-dimensional.'),
        (y, 'The y array should be 1-dimensional.'),
    )
    for point, message in shape_checks:
        if not fuav.is_1d_like(point):
            raise IncorrectShapeError(message)

    # Work on plain (unstructured) views of both inputs.
    first = fuat.as_unstructured(x)
    second = fuat.as_unstructured(y)

    type_checks = (
        (first, 'The x array should be purely numerical.'),
        (second, 'The y array should be purely numerical.'),
    )
    for point, message in type_checks:
        if not fuav.is_numerical_array(point):
            raise ValueError(message)

    first_length = first.shape[0]
    second_length = second.shape[0]
    if first_length != second_length:
        raise IncorrectShapeError(
            'The x and y arrays should have the same length.')

    difference = first - second
    return np.linalg.norm(difference)
def fatf_structured_to_unstructured(
        structured_array: np.ndarray) -> np.ndarray:
    """
    Converts a structured array into a plain array of the most generic type.

    If the input array is purely numerical, the output array is of the most
    generic numerical type. Otherwise, the output array is converted to a
    string type.

    Parameters
    ----------
    structured_array : numpy.ndarray
        A structured numpy array to be converted into a plain numpy array.

    Raises
    ------
    TypeError
        The input array is not a structured numpy array.
    ValueError
        The input array consists of complex types such as numpy void and
        object-like types that are not supported by this function.

    Returns
    -------
    classic_array : numpy.ndarray
        A classic numpy array representation of the ``structured_array`` with
        the most generic type out of the input array's dtypes. Each field of
        the input becomes one column of the output.
    """
    if not fuav.is_structured_array(structured_array):
        raise TypeError('structured_array should be a structured numpy array.')
    if not fuav.is_base_array(structured_array):
        raise ValueError('fatf_structured_to_unstructured only supports '
                         'conversion of arrays that hold base numpy types, '
                         'i.e. numerical and string-like -- numpy void and '
                         'object-like types are not allowed.')

    field_names = structured_array.dtype.names

    if fuav.is_numerical_array(structured_array):
        if structured_array.size:
            # Let numpy pick the common type of the values in the first row.
            dtype = np.array(list(structured_array[0])).dtype
        else:
            # There is no first row to inspect in an empty array (indexing it
            # would raise IndexError), so promote the declared field dtypes.
            dtype = np.result_type(
                *[structured_array.dtype[name] for name in field_names])
    else:
        dtype = str

    # pylint: disable=len-as-condition
    assert len(field_names) != 0, 'This should be structured.'
    dtyped_columns = [
        structured_array[name].astype(dtype) for name in field_names
    ]

    classic_array = np.column_stack(dtyped_columns)
    return classic_array
def _input_is_valid(distances: np.ndarray) -> bool:
    """
    Checks that the input of a kernel function is well-formed.

    Parameters
    ----------
    distances : numpy.ndarray
        A 1-dimensional numpy array of distances.

    Raises
    ------
    IncorrectShapeError
        The ``distances`` array is not a 1-dimensional numpy array.
    TypeError
        The ``distances`` array is a structured numpy array or it is not a
        purely numerical array.

    Returns
    -------
    is_input_ok : boolean
        ``True`` when every check passes; a failed check raises instead of
        returning ``False``.
    """
    if fuav.is_structured_array(distances):
        raise TypeError('The distances array cannot be a structured array.')

    if not fuav.is_1d_array(distances):
        raise IncorrectShapeError(
            'The distances array must be a 1-dimensional array.')

    if not fuav.is_numerical_array(distances):
        raise TypeError('The distances array must be of numerical type.')

    # Every failure path raises, so reaching this point means valid input.
    return True
def counterfactual_fairness_check(
        unfair_counterfactuals: Optional[np.ndarray] = None,
        distances: Optional[np.ndarray] = None,
        threshold: Optional[float] = None) -> bool:
    """
    Checks for counterfactual fairness using counterfactual fairness arrays.

    There are two different approaches to evaluate counterfactual fairness.
    The first one is to take the ``distances`` to the counterfactual examples
    and see whether there are any that exceed a certain ``threshold`` in which
    case a given instance is considered to be treated unfairly. Alternatively,
    by using the ``unfair_counterfactuals`` array this function checks whether
    there are any unfair counterfactual instances. In case all the input
    parameters are given **the distance-based approach takes the precedence**.

    Parameters
    ----------
    unfair_counterfactuals : numpy.ndarray, optional (default=None)
        A 2-dimensional numpy array with counterfactual examples that expose
        unfairness of a prediction.
    distances : numpy.ndarray, optional (default=None)
        A 1-dimensional numerical numpy array with distances to the
        counterfactual examples. It is only used when ``threshold`` is given
        as well.
    threshold : number, optional (default=None)
        A numerical threshold above which a counterfactual instance is too
        far, therefore it is considered to be an exemplar of individual
        unfairness. It is only used when ``distances`` is given as well.

    Raises
    ------
    IncorrectShapeError
        The ``unfair_counterfactuals`` parameter is not a 2-dimensional array.
        The ``distances`` parameter is not a 1-dimensional array.
    RuntimeError
        Either of the required input parameters were not given:
        ``unfair_counterfactuals`` or ``distances`` and ``threshold``.
    TypeError
        The ``threshold`` parameter is not a number.
    ValueError
        The ``distances`` array is not purely numerical.

    Returns
    -------
    counterfactually_unfair : boolean
        ``True`` if there are any counterfactually unfair instances,
        ``False`` otherwise.
    """
    if distances is not None and threshold is not None:
        if not fuav.is_1d_array(distances):
            raise IncorrectShapeError('The distances parameter has to be a '
                                      '1-dimensional array.')
        if not fuav.is_numerical_array(distances):
            raise ValueError('The distances array has to be purely numerical.')
        if not isinstance(threshold, Number):
            raise TypeError('The threshold parameter has to be a number.')

        # ``.any()`` yields a numpy.bool_; cast it so that both branches
        # return a builtin bool as the signature promises.
        counterfactually_unfair = bool((distances > threshold).any())
    elif unfair_counterfactuals is not None:
        if not fuav.is_2d_array(unfair_counterfactuals):
            raise IncorrectShapeError('The unfair counterfactuals parameter '
                                      'has to be a 2-dimensional numpy array.')

        # Any element at all means at least one unfair counterfactual.
        counterfactually_unfair = bool(unfair_counterfactuals.size)
    else:
        raise RuntimeError('Either of the two is required to run this '
                           'function: unfair_counterfactuals parameter or '
                           'both distances and threshold parameters.')

    return counterfactually_unfair
def partial_dependence_ice(
        ice_array: np.ndarray,
        include_rows: Optional[Union[int, List[int]]] = None,
        exclude_rows: Optional[Union[int, List[int]]] = None) -> np.ndarray:
    """
    Derives Partial Dependence from Individual Conditional Expectations.

    The Partial Dependence is the mean of the selected rows of the ICE array
    taken along its first dimension.

    .. note::
       If you want to calculate Partial Dependence directly from a dataset and
       a classifier please see
       :func:`transparency.models.feature_influence.partial_dependence`
       function.

    Parameters
    ----------
    ice_array : numpy.ndarray
        Individual Conditional Expectation array for which Partial Dependence
        is desired.
    include_rows : Union[int, List[int]], optional (default=None)
        Indices of rows to include in the PD calculation. When given, PD is
        computed only for these rows; when ``exclude_rows`` is given as well,
        the set difference of the two is used. Either a *list* of indices or
        a single index (integer).
    exclude_rows : Union[int, List[int]], optional (default=None)
        Indices of rows to leave out of the PD calculation. When given
        without ``include_rows``, these rows are removed from all of the
        rows; with ``include_rows`` the set difference of the two is used.
        Either a *list* of indices or a single index (integer).

    Raises
    ------
    IncorrectShapeError
        The input array is not a 3-dimensional numpy array.
    TypeError
        Either ``include_rows`` or ``exclude_rows`` parameter is not ``None``,
        an integer or a list of integers.
    ValueError
        The ``ice_array`` is not an unstructured numpy array or it is not a
        numerical array. One of the ``include_rows`` or ``exclude_rows``
        indices is not valid for the input array.

    Returns
    -------
    partial_dependence_array : numpy.ndarray
        A 2-dimensional array of (steps_number, n_classes) shape representing
        Partial Dependence for all of the classes for selected rows (data
        points).
    """
    if fuav.is_structured_array(ice_array):
        raise ValueError('The ice_array should not be structured.')
    if not fuav.is_numerical_array(ice_array):
        raise ValueError('The ice_array should be purely numerical.')

    dimensions_number = len(ice_array.shape)
    if dimensions_number != 3:
        raise IncorrectShapeError('The ice_array should be 3-dimensional.')

    # Resolve the include/exclude specification into row indices.
    selected_rows = _filter_rows(include_rows, exclude_rows,
                                 ice_array.shape[0])

    # Average the selected rows (data points).
    return ice_array[selected_rows].mean(axis=0)
def _validate_input(ice_pdp_array: np.ndarray,
                    feature_linespace: np.ndarray,
                    class_index: int,
                    feature_name: Union[None, str],
                    class_name: Union[None, str],
                    plot_axis: Union[None, plt.Axes],
                    test_partial_dependence: bool = False) -> bool:
    """
    Checks the input parameters of the ICE and PD plotting functions.

    Validates input parameters for
    :func:`fatf.vis.feature_influence.plot_individual_conditional_expectation`
    and :func:`fatf.vis.feature_influence.plot_partial_dependence` functions.

    Parameters
    ----------
    ice_pdp_array : numpy.ndarray
        An array that contains ICE or PD calculations.
    feature_linespace : numpy.ndarray
        An array that contains the values for which the selected feature was
        sampled.
    class_index : integer
        The index of the class for which the plot will be created.
    feature_name : string or None
        The name of the feature for which ICE or PD was originally calculated.
    class_name : string or None
        The name of the class that ``class_index`` parameter points to.
    plot_axis : matplotlib.pyplot.Axes or None
        A matplotlib axis object to plot on top of.
    test_partial_dependence : boolean
        ``True`` to validate the input array as a PD result (2 dimensions),
        ``False`` to validate it as an ICE result (3 dimensions).

    Raises
    ------
    IncorrectShapeError
        The ICE or the PD array has a wrong number of dimensions (3 and 2
        respectively). The feature linespace array has a wrong number of
        dimensions -- 1 is expected.
    IndexError
        The class index is invalid for the input array.
    TypeError
        The class index is not an integer; the feature name is not a string or
        a ``None``; the class name is not a string or a ``None``; the plot axis
        is not a matplotlib.pyplot.Axes type object or a ``None``.
    ValueError
        The input array is structured or is not numerical. The linespace array
        is structured, not numerical or its length does not agree with the
        number of steps in the input array.

    Returns
    -------
    input_is_valid : boolean
        ``True`` when every check passes; a failed check raises instead of
        returning ``False``.
    """
    # pylint: disable=too-many-arguments,too-many-branches
    assert isinstance(test_partial_dependence, bool), \
        'test_partial_dependence is not a boolean.'

    # --- the ICE/PD array ---
    if fuav.is_structured_array(ice_pdp_array):
        raise ValueError('The input array cannot be a structured array.')
    if not fuav.is_numerical_array(ice_pdp_array):
        raise ValueError('The input array has to be a numerical array.')

    dimensions_number = len(ice_pdp_array.shape)
    if test_partial_dependence and dimensions_number != 2:
        raise IncorrectShapeError('plot_partial_depenedence expects a '
                                  '2-dimensional array of shape (n_steps, '
                                  'n_classes).')
    if not test_partial_dependence and dimensions_number != 3:
        raise IncorrectShapeError('plot_individual_condtional_expectation '
                                  'expects a 3-dimensional array of shape '
                                  '(n_samples, n_steps, n_classes).')

    # --- the feature linespace ---
    if fuav.is_structured_array(feature_linespace):
        raise ValueError('The linespace array cannot be a structured array.')
    if not fuav.is_1d_array(feature_linespace):
        raise IncorrectShapeError('The linespace array has to be a '
                                  '1-dimensional array of shape (n_steps, ).')
    if not fuav.is_numerical_array(feature_linespace):
        raise ValueError('The linespace array has to be numerical.')

    # The steps are held in the second-to-last dimension of the input array.
    linespace_length = feature_linespace.shape[0]
    steps_number = ice_pdp_array.shape[-2]
    if linespace_length != steps_number:
        message = ('The length of the linespace array ({}) does not '
                   'agree with the number of linespace steps ({}) in '
                   'the input array.')
        raise ValueError(message.format(linespace_length, steps_number))

    # --- the class index (classes are held in the last dimension) ---
    if not isinstance(class_index, int):
        raise TypeError('Class index has to be an integer.')
    classes_number = ice_pdp_array.shape[-1]
    if not 0 <= class_index < classes_number:
        message = ('Class index {} is not a valid index for the '
                   'input array. There are only {} classes '
                   'available.')
        raise IndexError(message.format(class_index, classes_number))

    # --- names and the plotting axis ---
    if not (feature_name is None or isinstance(feature_name, str)):
        raise TypeError('The feature name has to be either None or a string.')
    if not (class_name is None or isinstance(class_name, str)):
        raise TypeError('The class name has to be either None or a string.')
    if not (plot_axis is None or isinstance(plot_axis, plt.Axes)):
        raise TypeError('The plot axis has to be either None or a matplotlib.'
                        'pyplot.Axes type object.')

    # Every failure path raises, so reaching this point means valid input.
    return True
def individual_conditional_expectation(
        dataset: np.ndarray,
        model: object,
        feature_index: Union[int, str],
        treat_as_categorical: Optional[bool] = None,
        steps_number: Optional[int] = None,
        include_rows: Optional[Union[int, List[int]]] = None,
        exclude_rows: Optional[Union[int, List[int]]] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes Individual Conditional Expectation (ICE) of a chosen feature.

    The ICE of the selected feature is computed for all target classes from
    the given dataset and model. When ``treat_as_categorical`` is not given,
    the type of the selected feature is inferred from the data; otherwise the
    user decides whether the feature is treated as categorical or numerical.
    For a numerical feature ``steps_number`` controls how many samples between
    the feature's minimum and maximum value the model is evaluated for (100
    by default).

    The rows used for the computation can be restricted with ``include_rows``
    and ``exclude_rows``. With ``include_rows`` alone only these rows are
    used; with both parameters the set difference is used; with
    ``exclude_rows`` alone these rows are removed from the whole dataset.

    This approach is an implementation of a method introduced by
    [GOLDSTEIN2015PEEKING]_. It is intended to be used with probabilistic
    models, therefore the input model must have a ``predict_proba`` method.

    .. [GOLDSTEIN2015PEEKING] Goldstein, A., Kapelner, A., Bleich, J. and
       Pitkin, E., 2015. Peeking inside the black box: Visualizing
       statistical learning with plots of individual conditional
       expectation. Journal of Computational and Graphical Statistics,
       24(1), pp.44-65.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset based on which ICE will be computed.
    model : object
        A fitted model which predictions will be used to calculate ICE.
        (Please see :class:`fatf.utils.models.models.Model` class
        documentation for the expected model object specification.)
    feature_index : Union[integer, string]
        An index of the feature column in the input dataset for which ICE
        will be computed.
    treat_as_categorical : boolean, optional (default=None)
        Whether to treat the selected feature as categorical or numerical.
    steps_number : integer, optional (default=None, i.e. 100)
        The number of evenly spaced samples between the minimum and the
        maximum value of the selected feature for which the model's
        prediction will be evaluated. (This parameter applies only to
        numerical features.)
    include_rows : Union[int, List[int]], optional (default=None)
        Indices of rows to include in the ICE calculation -- a *list* of
        indices or a single index (integer). When ``exclude_rows`` is given
        as well, the set difference of the two is used.
    exclude_rows : Union[int, List[int]], optional (default=None)
        Indices of rows to leave out of the ICE calculation -- a *list* of
        indices or a single index (integer). Without ``include_rows`` these
        rows are removed from all of the rows; with it the set difference of
        the two is used.

    Warns
    -----
    UserWarning
        The feature is treated as categorical but the number of steps
        parameter is provided (not ``None``); the ``steps_number`` parameter
        is then ignored. Also, the user is warned when the selected feature
        is detected to be categorical (textual) while the user indicated
        that it is numerical.

    Raises
    ------
    IncompatibleModelError
        The model does not have required functionality -- it needs to be able
        to output probabilities via ``predict_proba`` method.
    IncorrectShapeError
        The input dataset is not a 2-dimensional numpy array.
    IndexError
        Provided feature (column) index is invalid for the input dataset.
    TypeError
        ``treat_as_categorical`` is not ``None`` or boolean. The
        ``steps_number`` parameter is not ``None`` or integer. Either
        ``include_rows`` or ``exclude_rows`` parameter is not ``None``, an
        integer or a list of integers.
    ValueError
        The input dataset must only contain base types (textual and numerical
        values). One of the ``include_rows`` or ``exclude_rows`` indices is
        not valid for the input dataset. The ``steps_number`` is smaller
        than 2.

    Returns
    -------
    ice : numpy.ndarray
        An array of Individual Conditional Expectations of the
        (n_samples, steps_number, n_classes) shape, where n_samples is the
        number of dataset rows selected for the computation, steps_number is
        the number of values generated for the selected feature and
        n_classes is the number of classes in the target of the dataset. The
        values are the probabilities of every class for every selected data
        point when the selected feature is fixed to one of the values in the
        generated feature linespace (see below).
    feature_linespace : numpy.ndarray
        A one-dimensional array -- (steps_number, ) -- with the values for
        which the selected feature was substituted when the dataset was
        evaluated with the specified model.
    """
    # pylint: disable=too-many-arguments,too-many-locals
    assert _input_is_valid(dataset, model, feature_index, treat_as_categorical,
                           steps_number), 'Input must be valid.'

    # Structured arrays are indexed by field name, classic ones by column.
    column = (dataset[feature_index]
              if fuav.is_structured_array(dataset)
              else dataset[:, feature_index])
    assert fuav.is_1d_array(column), 'Column must be a 1-dimensional array.'

    if fuav.is_numerical_array(column):
        column_is_textual = False
    elif fuav.is_textual_array(column):
        column_is_textual = True
    else:
        assert False, 'Must be an array of a base type.'  # pragma: nocover

    if treat_as_categorical is None:
        # Nothing was requested, so go with the detected column type.
        treat_as_categorical = column_is_textual
    elif column_is_textual and not treat_as_categorical:
        # A textual column cannot be numerical: override the user's choice
        # and drop the steps number so that it does not trigger the second
        # warning below.
        warnings.warn(
            'Selected feature is categorical (string-base elements), '
            'however the treat_as_categorical was set to False. Such '
            'a combination is not possible. The feature will be '
            'treated as categorical.',
            category=UserWarning)
        treat_as_categorical = True
        steps_number = None

    if treat_as_categorical:
        if steps_number is not None:
            warnings.warn(
                'The steps_number parameter will be ignored as the feature is '
                'being treated as categorical.',
                category=UserWarning)
    elif steps_number is None:
        # The default number of steps for a numerical feature.
        steps_number = 100

    selected_rows = _filter_rows(include_rows, exclude_rows, dataset.shape[0])
    sampled_data, feature_linespace = _interpolate_array(
        dataset[selected_rows], feature_index, treat_as_categorical,
        steps_number)

    # One block of class probabilities per sampled slice of the data.
    predictions = [
        model.predict_proba(data_slice)  # type: ignore
        for data_slice in sampled_data
    ]
    return np.stack(predictions, axis=0), feature_linespace
def merge_ice_arrays(ice_arrays_list: List[np.ndarray]) -> np.ndarray:
    """
    Joins several Individual Conditional Expectation arrays into one.

    ICE arrays can be merged as long as they were calculated for the same
    feature and for the same number of classes. This may be helpful when
    evaluating ICE for a model over multiple cross-validation folds or for
    multiple models.

    Parameters
    ----------
    ice_arrays_list : List[numpy.ndarray]
        A list of Individual Conditional Expectation arrays to be merged.

    Raises
    ------
    IncorrectShapeError
        One of the ICE arrays is not 3-dimensional.
    TypeError
        The ``ice_arrays_list`` input parameter is not a list.
    ValueError
        The list of ICE arrays to be merged is empty. One of the ICE arrays is
        not a numerical array. One of the ICE arrays is structured. Some of the
        ICE arrays do not share the same second (number of steps) or third
        (number of classes) dimension or type.

    Returns
    -------
    ice_arrays : numpy.ndarray
        All of the ICE arrays merged together alongside the first dimension
        (number of instances).
    """
    if not isinstance(ice_arrays_list, list):
        raise TypeError('The ice_arrays_list should be a list of numpy arrays '
                        'that represent Individual Conditional Expectation.')
    if not ice_arrays_list:
        raise ValueError('Cannot merge 0 arrays.')

    # The trailing shape and the dtype of the first array; every other array
    # has to agree with them.
    reference = None
    for ice_array in ice_arrays_list:
        if not fuav.is_numerical_array(ice_array):
            raise ValueError('The ice_array list should only contain '
                             'numerical arrays.')
        if fuav.is_structured_array(ice_array):
            raise ValueError('The ice_array list should only contain '
                             'unstructured arrays.')
        if len(ice_array.shape) != 3:
            raise IncorrectShapeError('The ice_array should be '
                                      '3-dimensional.')

        steps_and_classes = (ice_array.shape[1], ice_array.shape[2])
        if reference is None:
            reference = (steps_and_classes, ice_array.dtype)
            continue

        expected_shape, expected_dtype = reference
        if (expected_shape != ice_array.shape[1:]
                or expected_dtype != ice_array.dtype):
            raise ValueError('All of the ICE arrays need to be '
                             'constructed for the same number of classes '
                             'and the same number of samples for the '
                             'selected feature (the second and the third '
                             'dimension of the ice array).')

    return np.concatenate(ice_arrays_list, axis=0)
def fit(self, X: np.ndarray, y: np.ndarray) -> None:
    """
    Fits the model to the training data.

    The training data and labels are stored on the model together with the
    column indices split by type. In the classifier mode the unique labels,
    their counts and frequencies, and the majority label are stored as well;
    in the regressor mode the majority label is the mean of the targets.

    Parameters
    ----------
    X : numpy.ndarray
        The KNN training data.
    y : numpy.ndarray
        The KNN training labels.

    Raises
    ------
    IncorrectShapeError
        Either the ``X`` array is not 2-dimensional, the ``y`` array is not
        1-dimensional, the number of rows in ``X`` is not the same as the
        number of elements in ``y`` or the ``X`` array has 0 rows or 0
        columns.
    PrefittedModelError
        Trying to fit the model when it has already been fitted. Usually
        raised when calling the ``fit`` method for the second time without
        clearing the model first.
    TypeError
        Trying to fit a KNN predictor in a regressor mode with non-numerical
        target variable.
    """
    if self._is_fitted:
        raise PrefittedModelError('This model has already been fitted.')

    if not fuav.is_2d_array(X):
        raise IncorrectShapeError(
            'The training data must be a 2-dimensional array.')
    if not fuav.is_1d_array(y):
        raise IncorrectShapeError(
            'The training data labels must be a 1-dimensional array.')

    if X.shape[0] == 0:
        raise IncorrectShapeError(
            'The data array has to have at least one data point.')
    # If the array is structured the fuav.is_2d_array function takes care
    # of checking whether there is at least one column
    if not fuav.is_structured_array(X) and X.shape[1] == 0:
        raise IncorrectShapeError(
            'The data array has to have at least one feature.')
    if X.shape[0] != y.shape[0]:
        raise IncorrectShapeError(
            'The number of samples in X must be the same as the number of '
            'labels in y.')

    if not self._is_classifier and not fuav.is_numerical_array(y):
        raise TypeError(
            'Regressor can only be fitted for a numerical target vector.')

    self._numerical_indices, self._categorical_indices = (
        fuat.indices_by_type(X))
    self._is_structured = fuav.is_structured_array(X)
    self._X = X
    self._y = y

    if not self._is_classifier:
        # Regressor: the mean target is the fallback value and the
        # label-related statistics are left empty.
        self._majority_label = self._y.mean()
        self._unique_y = np.ndarray((0, ))
        self._unique_y_counts = np.ndarray((0, ))
        self._unique_y_probabilities = np.ndarray((0, ))
    else:
        labels, label_counts = np.unique(self._y, return_counts=True)

        # Order labels lexicographically.
        label_order = np.argsort(labels)
        self._unique_y = labels[label_order]
        self._unique_y_counts = label_counts[label_order]

        # Several labels may share the highest count; the first one in the
        # sorted order is taken as the majority label.
        is_most_frequent = (
            self._unique_y_counts == np.max(self._unique_y_counts))
        most_frequent_labels = np.sort(self._unique_y[is_most_frequent])
        self._majority_label = most_frequent_labels[0]

        samples_number = self._y.shape[0]
        self._unique_y_probabilities = self._unique_y_counts / samples_number

    self._X_n = self._X.shape[0]
    self._is_fitted = True
def __init__(self,
             dataset: np.ndarray,
             ground_truth: Optional[np.ndarray] = None,
             categorical_indices: Optional[np.ndarray] = None,
             int_to_float: bool = True) -> None:
    """
    Constructs an ``Augmentation`` abstract class.

    Stores the dataset and the ground truth, splits the column indices of
    the dataset into numerical and categorical ones and works out the dtype
    (``sample_dtype``) to be used for sampled data.

    Parameters
    ----------
    dataset : numpy.ndarray
        The dataset to be stored; its first dimension is taken as the number
        of data points.
    ground_truth : numpy.ndarray, optional (default=None)
        Stored as-is in the ``ground_truth`` attribute.
    categorical_indices : numpy.ndarray, optional (default=None)
        Indices of the columns to be treated as categorical. When ``None``,
        the categorical columns are the ones reported as such by
        ``fuat.indices_by_type``. Columns reported as categorical by that
        function are always treated as categorical, even when they are
        missing from this parameter.
    int_to_float : boolean, optional (default=True)
        Selects the numerical type passed to ``fuat.generalise_dtype`` for
        numerical columns: ``numpy.float64`` if ``True``, ``numpy.int64``
        otherwise.

    Warns
    -----
    UserWarning
        Some of the string-based columns were not listed in the
        ``categorical_indices`` parameter; they are treated as categorical
        regardless.

    Raises
    ------
    AssertionError
        The ``_validate_input`` call returned a false value. (That function
        is not visible here and may raise more specific errors itself --
        TODO confirm.)
    """
    # pylint: disable=too-many-locals
    assert _validate_input(
        dataset,
        ground_truth=ground_truth,
        categorical_indices=categorical_indices,
        int_to_float=int_to_float), 'Invalid input.'

    self.dataset = dataset
    self.data_points_number = dataset.shape[0]
    self.is_structured = fuav.is_structured_array(dataset)

    self.ground_truth = ground_truth

    # Sort out column indices
    # (the first element is used as the numerical indices and the second
    # one as the categorical indices).
    indices = fuat.indices_by_type(dataset)
    num_indices = set(indices[0])
    cat_indices = set(indices[1])
    all_indices = num_indices.union(cat_indices)

    if categorical_indices is None:
        # No user preference: go with the detected split.
        categorical_indices = cat_indices
        numerical_indices = num_indices
    else:
        # Detected categorical columns that the user did not list are
        # added to the categorical set, with a warning.
        if cat_indices.difference(categorical_indices):
            msg = ('Some of the string-based columns in the input dataset '
                   'were not selected as categorical features via the '
                   'categorical_indices parameter. String-based columns '
                   'cannot be treated as numerical features, therefore '
                   'they will be also treated as categorical features '
                   '(in addition to the ones selected with the '
                   'categorical_indices parameter).')
            warnings.warn(msg, UserWarning)
            categorical_indices = cat_indices.union(categorical_indices)
        # Whatever is not categorical is numerical.
        numerical_indices = all_indices.difference(categorical_indices)

    self.categorical_indices = sorted(list(categorical_indices))
    self.numerical_indices = sorted(list(numerical_indices))
    self.features_number = len(all_indices)

    # Sort out the dtype of the sampled array.
    ntype = np.dtype(np.float64) if int_to_float else np.dtype(np.int64)
    if self.is_structured:
        # Structured array: build a per-column list of (name, dtype) pairs.
        # Numerical columns are passed through fuat.generalise_dtype with
        # ntype; categorical columns keep their original dtype.
        sample_dtype = []
        for column_name in self.dataset.dtype.names:
            if column_name in self.numerical_indices:
                new_dtype = fuat.generalise_dtype(
                    self.dataset.dtype[column_name], ntype)
                sample_dtype.append((column_name, new_dtype))
            elif column_name in self.categorical_indices:
                sample_dtype.append(
                    (column_name, self.dataset.dtype[column_name]))
            else:
                assert False, 'Unknown column name.'  # pragma: nocover
    else:
        # Classic array: a single dtype for the whole array, passed through
        # fuat.generalise_dtype only when the array is purely numerical.
        if fuav.is_numerical_array(self.dataset):
            sample_dtype = fuat.generalise_dtype(self.dataset.dtype, ntype)
        else:
            sample_dtype = self.dataset.dtype
    self.sample_dtype = sample_dtype
def group_by_column(
        dataset: np.ndarray,
        column_index: Index,
        groupings: Optional[List[Union[float, Tuple[str]]]] = None,
        numerical_bins_number: int = 5,
        treat_as_categorical: Optional[bool] = None
) -> Tuple[List[List[int]], List[str]]:
    """
    Groups row indices of an array based on value grouping of a chosen column.

    If the selected column is numerical, by default the values are grouped
    into 5 bins equally distributed between the minimum and the maximum value
    of the column. The number of bins can be changed with the
    ``numerical_bins_number`` parameter if desired. Alternatively, the exact
    bin boundaries can be given via the ``groupings`` parameter.

    For categorical columns, the default binning is one bin for every unique
    value in the selected column. This behaviour can be changed by providing
    the ``groupings`` parameter, where multiple values can be selected to
    create one bin.

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset to be used for grouping the row indices.
    column_index : Union[string, integer]
        A column index (a string for structured numpy arrays or an integer for
        unstructured arrays) of the column based on which the row indices will
        be partitioned.
    groupings : List[Union[number, Tuple[string]]], optional (default=None)
        A list of user-specified groupings for the selected column. The default
        grouping for categorical (textual) columns is splitting them by all the
        unique values therein. The numerical columns are, by default, binned
        into 5 bins (see the ``numerical_bins_number`` parameter) uniformly
        distributed between the minimum and the maximum value of the column.

        To introduce custom binning for a categorical column, the
        ``groupings`` parameter should be a list of tuples, where every tuple
        represents a single group. For example, a column with the following
        unique values ``['a', 'b', 'c', 'd']`` can be split into two groups:
        ``['a', 'd']`` and ``['b', 'c']`` by providing
        ``[('a', 'd'), ('b', 'c')]`` grouping. (The values within each tuple
        and the tuples themselves are sorted before the bins are built.)

        For numerical columns custom grouping should be introduced as a list of
        bucket boundaries. Every bucket includes all the values that are
        **less or equal** to the specified bucket boundary and greater than the
        previous boundary if one is given. One additional bucket is created
        for the values greater than the last boundary.
    numerical_bins_number : integer, optional (default=5)
        The number of bins used for default binning of numerical columns.
    treat_as_categorical : boolean, optional (default=None)
        Whether the selected column should be treated as a categorical or
        numerical feature. If set to ``None``, the type of the column will be
        inferred from the data therein. If set to ``False``, the column will
        be treated as numerical unless it is string-based in which case a
        warning will be emitted and the column will be treated as categorical
        despite this setting. Finally, if set to ``True``, the column will be
        treated as categorical.

    Warns
    -----
    UserWarning
        When grouping is done on a categorical column a warning is emitted when
        some of the values in that column are not accounted for, i.e. they are
        not included in the ``groupings`` parameter.

        Also, if some of the rows are not included in any of the groupings, a
        warning is shown. Missing row indices may be a result of some of the
        values being not-a-number for a numerical column and missing some of
        the unique values for a categorical column.

        ``treat_as_categorical`` parameter is set to ``False``, however the
        feature selected is string-based (i.e. categorical), therefore cannot
        be treated as a numerical one.

    Raises
    ------
    IncorrectShapeError
        The input ``dataset`` is not 2-dimensional.
    IndexError
        The supplied ``column_index`` is not valid for the input ``dataset``.
    TypeError
        The column index is neither a string nor an integer. The numerical bins
        number is not an integer. The ``groupings`` parameter is neither a list
        nor ``None``. One of the grouping bin boundaries (for a numerical
        feature column) is not a number. One of the groupings (for a
        categorical feature column) is not a tuple. The
        ``treat_as_categorical`` parameter is neither a boolean nor ``None``.
    ValueError
        The input ``dataset`` is not of a base type. The numerical bins number
        is less than 2. The ``groupings`` list is empty. The numbers in the
        ``groupings`` parameter are not monotonically increasing (for a
        numerical column). There are duplicate values shared among tuples in
        the ``grouping`` parameter or one of the values does not appear in the
        selected column (for a categorical column).

    Returns
    -------
    indices_per_bin : List[List[integer]]
        A list of lists with the latter one holding row indices of a particular
        group.
    bin_names : List[string]
        A list holding a description of each group.
    """
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input array should be 2-dimensional.')

    if not fuav.is_base_array(dataset):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    # Check index validity
    if isinstance(column_index, (str, int)):
        if not fuat.are_indices_valid(dataset, np.array([column_index])):
            raise IndexError('*{}* is not a valid column index for the input '
                             'dataset.'.format(column_index))
    else:
        raise TypeError('The column index can either be a string or an '
                        'integer.')

    # Check the number of numerical bins
    if isinstance(numerical_bins_number, int):
        if numerical_bins_number < 2:
            raise ValueError('The numerical_bins_number needs to be at least '
                             '2.')
    else:
        raise TypeError('The numerical_bins_number parameter has to be an '
                        'integer.')

    # Check treat_as_categorical
    if treat_as_categorical is not None:
        if not isinstance(treat_as_categorical, bool):
            raise TypeError('The treat_as_categorical parameter has to be a '
                            'boolean.')

    # Structured arrays are indexed by field name, classic ones by position
    if fuav.is_structured_array(dataset):
        column = dataset[column_index]
    else:
        column = dataset[:, column_index]
    assert fuav.is_1d_array(column), 'This must be a 1D numpy array.'

    # Get a list of all the row indices
    all_row_indices = set(range(column.shape[0]))

    indices_per_bin = []
    bin_names = []

    is_numerical_column = fuav.is_numerical_array(column)
    is_categorical_column = fuav.is_textual_array(column)
    assert is_numerical_column is not is_categorical_column, \
        'The column must be a base array.'

    # Sort out numerical/categorical column treatment
    if treat_as_categorical is None:
        # Infer the treatment from the data type of the column
        go_numerical = is_numerical_column
    else:
        if treat_as_categorical:
            go_numerical = False
        else:
            # The user asked for the numerical treatment...
            if is_numerical_column:
                go_numerical = True
            else:
                # ...but a textual column cannot be binned numerically, so
                # fall back to the categorical treatment and warn the user
                warnings.warn(
                    'Selected feature is categorical, therefore cannot be '
                    'treated as numerical. The feature will be treated as '
                    'categorical despite the treat_as_categorical parameter '
                    'set to False.', UserWarning)
                go_numerical = False

    if go_numerical:
        if groupings is None:
            # Get default bins: the leading element (the column minimum) is
            # dropped, leaving numerical_bins_number - 1 boundaries, which
            # translate into numerical_bins_number bins below
            bins = np.linspace(
                column.min(),
                column.max(),
                num=numerical_bins_number,
                endpoint=False)[1:].tolist()
        elif isinstance(groupings, list):
            if not groupings:
                raise ValueError('A numerical grouping list has to contain at '
                                 'least one element.')
            # Every element in the groupings list must be a number
            for i, number in enumerate(groupings):
                if not isinstance(number, Number):
                    raise TypeError('For a numerical column all of the '
                                    'grouping items must be numbers. *{}* '
                                    'is not a number.'.format(number))
                # Boundaries must be strictly increasing
                if i != 0:
                    if number <= groupings[i - 1]:
                        raise ValueError('The numbers in the groupings list '
                                         'have to be monotonically '
                                         'increasing.')
            bins = groupings
        else:
            raise TypeError('Since a numerical column was chosen the grouping '
                            'must be a list of bin boundaries or None.')

        # Templates for the bin descriptions
        lower_edge = 'x <= {}'
        middle = '{} < x <= {}'
        upper_edge = '{} < x'

        indices_seen_so_far = set()  # type: Set[int]
        for i, edge in enumerate(bins):
            if i == 0:
                # The first bin is unbounded from below
                indices = np.where(column <= edge)[0].tolist()

                indices_per_bin.append(indices)
                bin_names.append(lower_edge.format(edge))
            else:
                # Middle bins: (previous boundary, current boundary]
                edge_lower = bins[i - 1]
                indices_l = set(np.where(column <= edge)[0].tolist())
                indices_u = set(np.where(column > edge_lower)[0].tolist())
                indices = list(indices_l.intersection(indices_u))

                indices_per_bin.append(indices)
                bin_names.append(middle.format(edge_lower, edge))
            assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
            indices_seen_so_far = indices_seen_so_far.union(indices)

        # The last bin is unbounded from above; it deliberately reuses the
        # ``edge`` variable left over from the final iteration of the loop
        assert bins, 'If bins is empty, i and edge will not be defined.'
        # pylint: disable=undefined-loop-variable
        indices = np.where(column > edge)[0].tolist()
        indices_per_bin.append(indices)
        bin_names.append(upper_edge.format(edge))
        assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
        indices_seen_so_far = indices_seen_so_far.union(indices)
    else:
        unique_elements = np.sort(np.unique(column)).tolist()

        if groupings is None:
            # Default binning: one single-value bin per unique value
            bins = [(i, ) for i in unique_elements]
        elif isinstance(groupings, list):
            if not groupings:
                raise ValueError('A categorical grouping list has to contain '
                                 'at least one element.')
            values_seen_so_far = set()  # type: Set[str]
            # Every element in the groupings list must be a valid tuple
            for value_tuple in groupings:
                if not isinstance(value_tuple, tuple):
                    raise TypeError('For a categorical column all of the '
                                    'grouping items must be tuples. *{}* '
                                    'is not a tuple.'.format(value_tuple))
                # Every grouped value has to appear in the column
                for value in value_tuple:
                    if value not in unique_elements:
                        raise ValueError('*{}* value is not present in the '
                                         'selected column.'.format(value))
                # A value may belong to one tuple only
                if values_seen_so_far.intersection(value_tuple):
                    raise ValueError('Some values are duplicated across '
                                     'tuples.')
                values_seen_so_far = values_seen_so_far.union(value_tuple)

            unaccounted_values = set(unique_elements).difference(
                values_seen_so_far)
            if unaccounted_values:
                warnings.warn(
                    'The following values in the selected column were not '
                    'accounted for in the grouping '
                    'tuples:\n{}.'.format(unaccounted_values), UserWarning)

            # Normalise the ordering within every tuple and among the tuples
            bins = [tuple(sorted(i)) for i in groupings]  # type: ignore
            bins = sorted(bins)
        else:
            raise TypeError('Since a categorical column was chosen the '
                            'grouping must be a list of tuples representing '
                            'categorical values grouping or None for the '
                            'default grouping.')

        indices_seen_so_far = set()
        for bin_values in bins:
            # Collect the rows matching any of the values of this bin
            indices = set()
            for value in bin_values:
                vid = np.where(column == value)[0].tolist()
                indices = indices.union(vid)
            indices_per_bin.append(list(indices))
            bin_names.append('{}'.format(bin_values))
            assert not indices_seen_so_far.intersection(indices), 'Duplicates.'
            indices_seen_so_far = indices_seen_so_far.union(indices)

    # Validate that all of the row indices were accounted for
    missed_indices = all_row_indices.difference(indices_seen_so_far)
    if missed_indices:
        warnings.warn(
            'The following row indices could not be accounted for:\n{}.\n For '
            'a numerical column there may have been some numpy.nan therein. '
            'For a categorical column some of the column values were probably '
            'not specified in the grouping, in which case there should be a '
            'separate user warning.'.format(missed_indices), UserWarning)

    return indices_per_bin, bin_names
def __init__(self,
             data: np.ndarray,
             local_explanation: bool = True,
             model: object = None,
             **kwargs: Any) -> None:
    """
    Initialises a tabular LIME wrapper.

    The named parameters are split into the ones meant for the LIME tabular
    explainer initialiser (``_INIT_PARAMS``) and the ones meant for its
    ``explain_instance`` method (``_EXPLAIN_INSTANCE_PARAMS``); the latter
    are stored in the ``explain_instance_params`` attribute. A structured
    ``data`` array is converted to an unstructured one, with string
    ``categorical_features`` translated into column positions.

    Parameters
    ----------
    data : numpy.ndarray
        A 2-dimensional, purely numerical data array handed to LIME.
    local_explanation : boolean, optional (default=True)
        Used as LIME's ``sample_around_instance`` parameter unless that
        keyword is given explicitly.
    model : object, optional (default=None)
        A predictive model; stored in the ``model`` attribute.
    **kwargs : Any
        Named parameters of the LIME tabular explainer initialiser or of its
        ``explain_instance`` method.

    Warns
    -----
    FutureWarning
        Always emitted -- the wrapper is scheduled for deprecation.
    UserWarning
        Both a model and a ``predict_fn`` were provided.

    Raises
    ------
    AttributeError
        Some of the named parameters are not recognised.
    IncompatibleModelError
        The model does not pass the functionality check.
    IncorrectShapeError
        ``data`` is not 2-dimensional or ``categorical_features`` is not
        1-dimensional.
    TypeError
        ``categorical_features`` is neither a list, a numpy array nor
        ``None``; or ``predict_fn`` is not callable.
    ValueError
        ``data`` is not numerical; ``categorical_features`` of a structured
        array are not strings or are invalid indices; or ``mode`` is
        neither ``'classification'`` nor ``'regression'``.
    """
    # pylint: disable=too-many-branches,too-many-statements
    warnings.warn(
        'The LIME wrapper will be deprecated in FAT Forensics version '
        '0.0.3. Please consider using the TabularBlimeyLime explainer '
        'class implemented in the fatf.transparency.predictions.'
        'surrogate_explainers module instead. Alternatively, you may '
        'consider building a custom surrogate explainer using the '
        'functionality implemented in FAT Forensics -- see the *Tabular '
        'Surrogates* how-to guide for more details.', FutureWarning)

    recognised = self._INIT_PARAMS.union(self._EXPLAIN_INSTANCE_PARAMS)
    unrecognised = set(kwargs.keys()).difference(recognised)
    if unrecognised:
        raise AttributeError('The following named parameters are not '
                             'valid: {}.'.format(unrecognised))

    # Route every named parameter to the LIME call that understands it.
    init_params = {}
    explain_params = {}
    for name, value in kwargs.items():
        if name in self._INIT_PARAMS:
            init_params[name] = value
        if name in self._EXPLAIN_INSTANCE_PARAMS:
            explain_params[name] = value

    # Validate the data array.
    if not fuav.is_2d_array(data):
        raise IncorrectShapeError('The data parameter must be a '
                                  '2-dimensional numpy array.')
    if not fuav.is_numerical_array(data):
        raise ValueError('LIME does not support non-numerical data '
                         'arrays.')

    # An explicit LIME keyword wins over the local_explanation parameter.
    if 'sample_around_instance' not in init_params:
        init_params['sample_around_instance'] = local_explanation

    # LIME works on classic arrays, so structured input is translated.
    if fuav.is_structured_array(data):
        cat_keyword = 'categorical_features'
        cat_features = init_params.get(cat_keyword, None)

        if cat_features is not None:
            if isinstance(cat_features, list):
                cat_features = np.array(cat_features)
            elif not isinstance(cat_features, np.ndarray):
                raise TypeError('The {} parameter either has to be a '
                                'list, a numpy array or None.'.format(
                                    cat_keyword))

            if not fuav.is_1d_array(cat_features):
                raise IncorrectShapeError(
                    '{} array/list is not '
                    '1-dimensional.'.format(cat_keyword))
            if not fuav.is_textual_array(cat_features):
                raise ValueError('Since {} is an array of indices for '
                                 'a structured array, all of its elements '
                                 'should be strings.'.format(cat_keyword))
            if not fuat.are_indices_valid(data, cat_features):
                raise ValueError(
                    'Indices given in the {} parameter '
                    'are not valid for the input data '
                    'array.'.format(cat_keyword))

            # Field names become positions in the unstructured array.
            field_names = data.dtype.names
            init_params[cat_keyword] = np.array(
                [field_names.index(field) for field in cat_features])

        data = fuat.as_unstructured(data)

    # Build the underlying LIME tabular explainer.
    self.mode = init_params.get('mode', 'classification')
    if self.mode not in ('classification', 'regression'):
        raise ValueError("The mode must be either 'classification' or "
                         "'regression'. '{}' given.".format(self.mode))
    self.tabular_explainer = lime.lime_tabular.LimeTabularExplainer(
        data, **init_params)

    # Inspect the model: probabilistic, plain predictor or incompatible.
    self.model = model
    self.model_is_probabilistic = False
    if model is not None:
        if fumv.check_model_functionality(
                model, require_probabilities=True, suppress_warning=True):
            self.model_is_probabilistic = True
        elif fumv.check_model_functionality(
                model, require_probabilities=False, suppress_warning=True):
            self.model_is_probabilistic = False
            logger.warning('The model can only be used for LIME in a '
                           'regressor mode.')
        else:
            raise IncompatibleModelError('LIME requires a model object to '
                                         'have a fit method and '
                                         'optionally a predict_proba '
                                         'method.')

    # A class-wide predictive function has to be callable.
    fn_keyword = 'predict_fn'
    if fn_keyword in explain_params:
        if not callable(explain_params[fn_keyword]):
            raise TypeError('The {} parameter is not callable -- it has '
                            'to be a function.'.format(fn_keyword))
        if self.model is not None:
            warnings.warn(
                'Since both, a model and a predictive function, are '
                'provided only the latter will be used.', UserWarning)

    # Memorised for later use when explaining an instance.
    self.explain_instance_params = explain_params
def _validate_input_lasso_path(dataset: np.ndarray, target: np.ndarray,
                               weights: Union[np.ndarray, None],
                               features_number: Union[int, None],
                               features_percentage: int) -> bool:
    """
    Checks the validity of the ``lasso_path`` function's input parameters.

    The description of the input parameters, warnings and exceptions can be
    found in the documentation of the
    :func:`fatf.utils.data.feature_selection.sklearn.lasso_path` function.

    Returns
    -------
    input_is_valid : boolean
        ``True`` if the input is valid; invalid input results in an exception
        being raised.
    """
    # pylint: disable=too-many-branches
    # The data set: 2-D and numerical.
    if not fuav.is_2d_array(dataset):
        raise IncorrectShapeError('The input data set must be a 2-dimensional '
                                  'array.')
    if not fuav.is_numerical_array(dataset):
        raise TypeError('The input data set must be purely numerical. (The '
                        'lasso path feature selection is based on '
                        'sklearn.linear_model.lars_path function.)')

    # The target: 1-D, numerical and one label per data row.
    if not fuav.is_1d_array(target):
        raise IncorrectShapeError('The target array must be a 1-dimensional '
                                  'array.')
    if not fuav.is_numerical_array(target):
        raise TypeError('The target array must be numerical since this '
                        'feature selection method is based on Lasso '
                        'regression.')
    rows_number = dataset.shape[0]
    if target.shape[0] != rows_number:
        raise IncorrectShapeError('The number of labels in the target array '
                                  'must agree with the number of samples in '
                                  'the data set.')

    # The (optional) weights: 1-D, numerical and one weight per data row.
    if weights is not None:
        if not fuav.is_1d_array(weights):
            raise IncorrectShapeError('The weights array must 1-dimensional.')
        if not fuav.is_numerical_array(weights):
            raise TypeError('The weights array must be purely numerical.')
        if weights.shape[0] != rows_number:
            raise IncorrectShapeError('The number of weights in the weights '
                                      'array must be the same as the number '
                                      'of samples in the input data set.')

    # The (optional) number of features: a positive integer.
    if features_number is not None:
        if not isinstance(features_number, int):
            raise TypeError('The features_number parameter must be an '
                            'integer.')
        if features_number < 1:
            raise ValueError('The features_number parameter must be a '
                             'positive integer.')

    # The percentage of features: an integer within the [0, 100] range.
    if not isinstance(features_percentage, int):
        raise TypeError('The feature_percentage parameter must be an integer.')
    if not 0 <= features_percentage <= 100:
        raise ValueError('The feature_percentage parameter must be between 0 '
                         'and 100 (inclusive).')

    return True
def explain_instance(
        self, instance: np.ndarray, **kwargs: Any
) -> Union[Dict[str, Tuple[str, float]], List[Tuple[str, float]]]:
    """
    Explains an instance with the LIME tabular explainer.

    This is a wrapper around the ``explain_instance`` method_ of the LIME
    tabular explainer object.

    .. warning::
       Unlike the LIME tabular explainer, for a classification task this
       wrapper explains **all** of the classes by default.

    Named parameters given when initialising this object are reused here;
    if the same parameter is also passed to this method, the value passed
    to the method takes precedence.

    The predictive function is chosen in the following order:

    - the ``predict_fn`` passed to this method,
    - the ``predict_fn`` given when initialising the object, and finally
    - the model -- its ``predict_proba`` method for classification and its
      ``predict`` method for regression.

    .. _method: https://lime-ml.readthedocs.io/en/latest/lime.html
       #lime.lime_tabular.LimeTabularExplainer.explain_instance

    Parameters
    ----------
    instance : numpy.ndarray
        A 1-dimensional data point (numpy array) to be explained.
    **kwargs : lime.lime_tabular.LimeTabularExplainer.explain_instance
        Optional parameters of the LIME tabular explainer's
        ``explain_instance`` method.

    Raises
    ------
    AttributeError
        One of the named parameters is not a valid parameter of the LIME
        tabular explainer's ``explain_instance`` method.
    IncorrectShapeError
        The input ``instance`` is not a 1-dimensional numpy array.
    RuntimeError
        No predictive function is available (neither a ``predict_fn``
        parameter nor a ``model``) or, for classification, the model is not
        probabilistic.
    ValueError
        The input ``instance`` is not purely numerical.

    Returns
    -------
    explanation : Dictionary[string, Tuple[string, float]] or \
List[Tuple[string, float]]
        For classification, a dictionary keyed by class names with the
        explanation of each class as returned by LIME's ``as_list``
        (feature description and importance pairs). For regression, a
        single such list.
    """
    # pylint: disable=too-many-locals,too-many-branches
    unknown_params = set(kwargs.keys()).difference(
        self._EXPLAIN_INSTANCE_PARAMS)
    if unknown_params:
        raise AttributeError('The following named parameters are not '
                             'valid: {}.'.format(unknown_params))

    if not fuav.is_1d_like(instance):
        raise IncorrectShapeError('The instance to be explained should be '
                                  '1-dimensional.')
    instance = fuat.as_unstructured(instance)
    if not fuav.is_numerical_array(instance):
        raise ValueError('The instance to be explained should be purely '
                         'numerical -- LIME does not support categorical '
                         'features.')

    # Parameters given to this call override the ones stored in the object.
    named_arguments = {**self.explain_instance_params, **kwargs}

    is_classification = self.mode == 'classification'

    # Choose the predictive function: explicit function first, model second.
    if 'predict_fn' in named_arguments:
        pred_fn = named_arguments.pop('predict_fn')
    elif self.model is None:
        raise RuntimeError('A predictive function is not available.')
    elif not is_classification:
        pred_fn = self.model.predict  # type: ignore
    elif self.model_is_probabilistic:
        pred_fn = self.model.predict_proba  # type: ignore
    else:
        raise RuntimeError('The predictive model is not '
                           'probabilistic. Please specify a '
                           'predictive function instead.')

    # For classification all of the classes are explained by default.
    if is_classification and 'labels' not in named_arguments:
        # The number of classes is not available directly, so it is read
        # off the width of a prediction made for the explained instance.
        prediction = pred_fn(np.array([instance]))
        named_arguments['labels'] = range(prediction.shape[1])

    exp = self.tabular_explainer.explain_instance(instance, pred_fn,
                                                  **named_arguments)

    if not is_classification:
        return exp.as_list()

    explanation = {}
    for label in exp.available_labels():
        explanation[exp.class_names[label]] = exp.as_list(label=label)
    return explanation
def describe_numerical_array(
        array: Union[np.ndarray, np.void],
        skip_nans: bool = True) -> Dict[str, Union[int, float, np.ndarray]]:
    """
    Describes a numerical numpy array with basic statistics.

    If the ``skip_nans`` parameter is set to ``True``, any ``numpy.nan``
    present in the input array is skipped for calculating the statistics.
    Otherwise, they are included, affecting most of the statistics and possibly
    equating them to ``numpy.nan``.

    The description output by this function is a dictionary with the
    following keys:

    ``count`` : integer
        The number of elements in the array (``numpy.nan`` values are counted
        regardless of the ``skip_nans`` parameter).

    ``mean`` : float
        The *mean* (average) value of the array.

    ``std`` : float
        The *standard deviation* of the array.

    ``min`` : float
        The *minimum value* in the array.

    ``25%`` : float
        The *25 percentile* of the array.

    ``50%`` : float
        The *50 percentile* of the array, which is equivalent to its
        **median**.

    ``75%`` : float
        The *75 percentile* of the array.

    ``max`` : float
        The *maximum value* in the array.

    ``nan_count`` : integer
        The count of ``numpy.nan`` (not-a-number) values in the array.

    Parameters
    ----------
    array : Union[numpy.ndarray, numpy.void]
        An array for which a description is desired.
    skip_nans : boolean, optional (default=True)
        If set to ``True``, ``numpy.nan``\\ s present in the input array will
        be excluded while computing the statistics.

    Raises
    ------
    IncorrectShapeError
        The input array is not 1-dimensional.
    ValueError
        The input array is not purely numerical or it is empty. Also raised
        when ``skip_nans`` is ``True`` and the array holds nothing but
        ``numpy.nan``, in which case there is nothing to compute the
        statistics from.

    Returns
    -------
    numerical_description : Dict[string, Union[integer, float, numpy.ndarray]]
        A dictionary describing the numerical input array.
    """
    if not fuav.is_1d_like(array):
        raise IncorrectShapeError('The input array should be 1-dimensional.')

    classic_array = fuat.as_unstructured(array)
    assert len(classic_array.shape) == 1, '1D arrays only at this point.'

    if not classic_array.shape[0]:
        raise ValueError('The input array cannot be empty.')
    if not fuav.is_numerical_array(classic_array):
        raise ValueError('The input array should be purely numerical.')

    nan_indices = np.isnan(classic_array)
    # The count is taken before any filtering, so it includes the nans.
    n_elements = classic_array.shape[0]

    if skip_nans:
        classic_array = classic_array[~nan_indices]
        # Without this guard numpy.min/numpy.max fail on the empty array with
        # a cryptic "zero-size array to reduction operation" ValueError (after
        # emitting runtime warnings for the mean and standard deviation).
        if not classic_array.shape[0]:
            raise ValueError('The input array contains only numpy.nan values; '
                             'with skip_nans set to True there is nothing '
                             'left to describe.')

    # All three percentiles are computed in a single pass.
    perc_25, perc_50, perc_75 = np.percentile(classic_array, [25, 50, 75])

    numerical_description = {
        'count': n_elements,
        'mean': np.mean(classic_array),
        'std': np.std(classic_array),
        'min': np.min(classic_array),
        '25%': perc_25,
        '50%': perc_50,
        '75%': perc_75,
        'max': np.max(classic_array),
        'nan_count': nan_indices.sum()
    }

    return numerical_description
def occlude_segments_vectorised(
        self,
        vectorised_segments_subset: np.ndarray,
        image: Optional[np.ndarray] = None,
        colour: Optional[Union[str, int, RGBcolour]] = None) -> np.ndarray:
    """
    Generates multiple images with selected subsets of segments occluded.

    The segments to be occluded are provided as boolean vectors; either a 1-D
    numpy array of length equal to the number of segments to produce a single
    occluded image, or a 2-D array where each row represents a separate
    occlusion pattern. In this format the n-th element or column corresponds
    to the n+1 segment id; 1 indicates that the segment should be preserved
    and 0 that it should be occluded.

    The occlusion is applied on top of the image used to initialise this
    class; alternatively, an external ``image`` of the same type and
    dimensions can be supplied.
    If a colouring strategy different to the one of the class is desired,
    it can be specified via the ``colour`` parameter.

    Parameters
    ----------
    vectorised_segments_subset : numpy.ndarray
        A 1-D boolean occlusion vector of the length equal to the number of
        segments or a 2-D boolean matrix of the (number of occlusion images to
        generate X number of segments) shape. Numerical arrays are accepted
        as long as all of their values are exactly 0 or 1.
    image : numpy.ndarray, optional (default=None)
        If provided, this ``image`` will be occluded instead of the one used
        to initialise this class.
    colour : string, integer, tuple(integer, integer, integer), \
optional (default=None)
        A colour specifier. By default (``colour=None``) the colouring
        strategy of the class is used. See the documentation of the
        :func:`fatf.utils.data.occlusion.Occlusion.set_colouring_strategy`
        method for more details.

    Raises
    ------
    IncorrectShapeError
        The ``vectorised_segments_subset`` numpy array is neither 1- nor
        2-dimensional. The number of elements in
        ``vectorised_segments_subset`` (when it is 1-D) does not correspond to
        the number of segments. The number of columns in
        ``vectorised_segments_subset`` (when it is 2-D) does not correspond to
        the number of segments. The height, width or the number of channels in
        the ``image`` array does not agree with the same parameters of the
        class image.
    TypeError
        The ``vectorised_segments_subset`` numpy array is structured, is not
        numerical or holds values other than 0 and 1.

    Returns
    -------
    image_occluded : numpy.ndarray
        A numpy array holding the image(s) with the selected subset(s) of
        segments occluded. A single image (without the leading batch
        dimension) is returned whenever exactly one occlusion pattern is
        given, regardless of whether it was passed as a 1-D or a 2-D array.
    """
    # pylint: disable=too-many-branches
    if image is None:
        canvas = self.image
    else:
        assert (  # yapf: disable
            fuds._validate_image_array(  # pylint: disable=protected-access
                image, 'image')), 'Invalid image.'
        if image.shape != self.image.shape:
            raise IncorrectShapeError(
                'The width, height or number of channels of the input '
                'image does not agree with the same parameters of the '
                'original image.')
        canvas = image

    if colour is None:
        colouring_strategy = self._colouring_strategy
    else:
        colouring_strategy = self._generate_colouring_strategy(colour)

    if fuav.is_structured_array(vectorised_segments_subset):
        raise TypeError('The vector representation of segments cannot be '
                        'a structured numpy array.')
    if not fuav.is_numerical_array(vectorised_segments_subset):
        raise TypeError('The vector representation of segments should be '
                        'a numerical numpy array.')

    if fuav.is_1d_array(vectorised_segments_subset):
        if vectorised_segments_subset.shape[0] != self.segments_number:
            raise IncorrectShapeError(
                ('The number of elements ({}) in the vector '
                 'representation of segments should correspond to the '
                 'unique number of segments ({}).').format(
                     vectorised_segments_subset.shape[0],
                     self.segments_number))
        samples = 1
        # Promote the vector to a single-row matrix so that both input
        # formats share the code below.
        vectorised_segments_subset = np.asarray(
            [vectorised_segments_subset])
    elif fuav.is_2d_array(vectorised_segments_subset):
        if vectorised_segments_subset.shape[1] != self.segments_number:
            raise IncorrectShapeError(
                ('The number of columns ({}) in the vector representation '
                 'of segments should correspond to the unique number of '
                 'segments ({}).').format(
                     vectorised_segments_subset.shape[1],
                     self.segments_number))
        samples = vectorised_segments_subset.shape[0]
    else:
        raise IncorrectShapeError(
            'The vector representation of segments should be a 1- or '
            '2-dimensional numpy array.')

    # Every entry has to be exactly 0 or 1. The values are compared as they
    # are -- casting them to integers first would truncate entries such as
    # 0.5 or 1.9 to 0 or 1 and let non-binary vectors through.
    is_binary = np.logical_or(vectorised_segments_subset == 0,
                              vectorised_segments_subset == 1).all()
    if not is_binary:
        raise TypeError('The vector representation of segments should be '
                        'binary numpy array.')

    # One copy of the canvas per occlusion pattern.
    image_occluded = np.repeat(canvas[np.newaxis, :], samples, axis=0)
    for i, vec in enumerate(vectorised_segments_subset):
        # Get ids of segments to be occluded (0s) from a vector form
        # 1 is added as segments are numbered from 1, not 0
        segments_subset = np.where(vec == 0)[0] + 1
        occlusion_mask = fuds.get_segment_mask(segments_subset.tolist(),
                                               self.segments)
        image_occluded[i, occlusion_mask] = colouring_strategy(
            occlusion_mask)

    if samples == 1:
        image_occluded = image_occluded[0]

    return image_occluded
def describe_array(
        array: np.ndarray,
        include: Optional[Union[str, int, List[Union[str, int]]]] = None,
        exclude: Optional[Union[str, int, List[Union[str, int]]]] = None,
        **kwargs: bool
) -> Dict[Union[str, int],
          Union[str, int, float, bool, np.ndarray,
                Dict[str, Union[str, int, float, bool, np.ndarray]]]
          ]:  # yapf: disable
    """
    Describes categorical (textual) and numerical columns in the input array.

    Numerical columns are described with the
    :func:`fatf.transparency.data.describe_functions.describe_numerical_array`
    function and categorical columns with the
    :func:`fatf.transparency.data.describe_functions.\
describe_categorical_array` function; see their documentation for the details
    of each description.

    The ``include`` and ``exclude`` parameters select the columns to be
    described. Either of them can be a list of column indices, a single index
    (a string or an integer) or one of the ``'numerical'`` and
    ``'categorical'`` keywords, which stand for all of the numerical or all of
    the categorical columns. The ``include`` filter is applied first and the
    ``exclude`` filter second. By default all columns are described.

    Parameters
    ----------
    array : numpy.ndarray
        The array to be described.
    include : Union[str, int, List[Union[str, int]]], optional (default=None)
        Column indices to be included in the description: a list of indices,
        a single index, or the ``'numerical'``/``'categorical'`` keyword. If
        ``None`` (the default value), all of the columns are included.
    exclude : Union[str, int, List[Union[str, int]]], optional (default=None)
        Column indices to be excluded from the description: a list of
        indices, a single index, or the ``'numerical'``/``'categorical'``
        keyword. If ``None`` (the default value), none of the columns is
        excluded.
    **kwargs : bool
        Keyword arguments passed on to the :func:`fatf.transparency.\
data.describe_functions.describe_numerical_array` function, which describes
        the numerical columns.

    Warns
    -----
    UserWarning
        The ``include`` or ``exclude`` parameter was given for a
        1-dimensional input array, in which case they are ignored.

    Raises
    ------
    IncorrectShapeError
        The input array is neither 1- nor 2-dimensional.
    RuntimeError
        None of the columns were selected to be described.
    ValueError
        The input array is not of a base type (textual and numerical elements).
        The input array has 0 columns.

    Returns
    -------
    description : Dict[Union[str, int], Dict[str, \
Union[str, int, float bool, np.ndarray]]]
        For 2-dimensional arrays a dictionary describing every selected column
        under a key corresponding to its index in the input array. For a
        1-dimensional input array a dictionary describing that array.
    """
    # pylint: disable=too-many-locals,too-many-branches
    one_dimensional = fuav.is_1d_like(array)
    if one_dimensional:
        array = fuat.as_unstructured(array)
    elif not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 1- or '
                                  '2-dimensional.')

    if not fuav.is_base_array(array):
        raise ValueError('The input array should be of a base type (a mixture '
                         'of numerical and textual types).')

    # A 1-dimensional array is described as a whole.
    if one_dimensional:
        if not (include is None and exclude is None):
            warnings.warn(
                'The input array is 1-dimensional. Ignoring include and '
                'exclude parameters.',
                category=UserWarning)

        if fuav.is_numerical_array(array):
            description = describe_numerical_array(array, **kwargs)
        elif fuav.is_textual_array(array):
            description = describe_categorical_array(array)
        else:  # pragma: no cover
            assert False, 'A base array should either be numerical or textual.'
        return description  # type: ignore

    # A 2-dimensional array is described column by column.
    numerical_indices, categorical_indices = fuat.indices_by_type(array)
    structured = fuav.is_structured_array(array)

    columns_number = numerical_indices.shape[0] + categorical_indices.shape[0]
    if columns_number == 0:
        raise ValueError('The input array cannot have 0 columns.')

    numerical_selection = set(numerical_indices)
    categorical_selection = set(categorical_indices)
    every_index = categorical_selection.union(numerical_selection)

    # Keep only the indices to be included...
    categorical_selection, numerical_selection = _filter_include_indices(
        categorical_selection, numerical_selection, include, every_index)
    # ...and drop the indices to be excluded.
    categorical_selection, numerical_selection = _filter_exclude_indices(
        categorical_selection, numerical_selection, exclude, every_index)

    if not numerical_selection.union(categorical_selection):
        raise RuntimeError('None of the columns were selected to be '
                           'described.')

    def _column(index):
        """Extracts a column from a structured or a classic array."""
        return array[index] if structured else array[:, index]

    description = {}
    for index in numerical_selection:
        description[index] = describe_numerical_array(  # type: ignore
            _column(index), **kwargs)
    for index in categorical_selection:
        description[index] = describe_categorical_array(  # type: ignore
            _column(index))

    return description  # type: ignore
def __init__(self,
             clf: sklearn.base.BaseEstimator,
             feature_names: Optional[List[str]] = None,
             class_names: Optional[List[str]] = None) -> None:
    """
    Initialises the ``SKLearnExplainer`` class.

    Stores the model together with the (optional) feature and class names,
    queries the model-specific hooks (``_validate_kind_fitted``,
    ``_is_classifier``, ``_get_features_number`` and
    ``_get_classes_array``) and cross-checks the user-provided names
    against what these hooks report.

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        The predictive model; stored under ``self.clf``.
    feature_names : List[str], optional (default=None)
        Names of the features. When missing and the number of features is
        known, names following the ``feature <index>`` pattern are
        generated.
    class_names : List[str], optional (default=None)
        Names of the classes. When missing and the classes array is known,
        names following the ``class <label>`` pattern are generated.

    Warns
    -----
    UserWarning
        ``feature_names`` (or ``class_names``) was given but its length
        cannot be validated because ``_get_features_number`` (or
        ``_get_classes_array``) returned ``None``.

    Raises
    ------
    AssertionError
        One of the internal sanity checks failed: the input validation
        returned a falsy value, the model kind/fitted check failed, one of
        the hooks returned a value of an unexpected type, or a regressor
        was given class names or reports a classes array.
    ValueError
        The length of ``feature_names`` differs from the number of features
        or the length of ``class_names`` differs from the length of the
        classes array.
    """
    # The input parameters are checked before anything is stored
    assert _validate_input(clf, feature_names,
                           class_names), 'Invalid init parameters.'

    self.clf = clf
    self.feature_names = feature_names
    self.class_names = class_names

    # The model has to be of a supported kind and already fitted
    assert self._validate_kind_fitted(), 'Unfitted or wrong type model.'

    # Classifier (True) or regressor (False)
    self.is_classifier = self._is_classifier()
    assert isinstance(self.is_classifier, bool), 'Has to be boolean.'

    # The number of columns in a data array expected by the model; None
    # means that it could not be determined
    self.features_number = self._get_features_number()
    assert (self.features_number is None
            or isinstance(self.features_number, int)), 'Wrong type.'

    # The classes that the model can output; None means not available
    self.classes_array = self._get_classes_array()
    if self.classes_array is not None:
        assert isinstance(self.classes_array, np.ndarray), 'Bad type.'
        assert fuav.is_1d_array(self.classes_array), 'Must be 1-D array.'
        assert (fuav.is_numerical_array(self.classes_array)
                or fuav.is_textual_array(self.classes_array)), 'Bad type.'

    # Class-related attributes make no sense for a regressor
    if not self.is_classifier:
        assert self.classes_array is None and self.class_names is None, \
            "Regressor's class_names and classes_array must both be None."

    # Feature names: validate the user-provided ones or generate defaults
    if self.feature_names is not None:
        if self.features_number is None:
            warnings.warn(
                'Cannot validate the length of feature names list since '
                'the _get_features_number method '
                'returned None.', UserWarning)
        elif len(self.feature_names) != self.features_number:
            raise ValueError('The length of the feature_names list '
                             'is different than the number of '
                             'features extracted from the classifier.')
    elif self.features_number is not None:
        logger.info('Generating missing feature names from the number '
                    'of features using "feature %d" pattern.')
        self.feature_names = [
            'feature {}'.format(index)
            for index in range(self.features_number)
        ]

    # Class names: validate the user-provided ones or generate defaults
    if self.class_names is not None:
        if self.classes_array is None:
            warnings.warn(
                'Cannot validate the length of class names list since the '
                '_get_classes_array method returned None.', UserWarning)
        elif self.classes_array.shape[0] != len(self.class_names):
            raise ValueError('The length of the class_names list is '
                             'different than the length of the '
                             'classes array extracted from the '
                             'classifier.')
    elif self.classes_array is not None:
        logger.info('Generating missing class names from the array of '
                    'classes output by the classifier using '
                    '"class %s" pattern.')
        self.class_names = [
            'class {}'.format(label) for label in self.classes_array
        ]
def indices_by_type(array: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Splits column indices of an array into numerical and non-numerical.

    For a structured array every field is inspected separately and the
    field *names* are used as indices. A classic (unstructured) array has a
    single type, therefore either all of its column indices are numerical
    or all of them are non-numerical.

    Parameters
    ----------
    array : numpy.ndarray
        A 2-dimensional numpy array to be inspected.

    Raises
    ------
    TypeError
        The input is not a numpy array.
    IncorrectShapeError
        The input array is not 2-dimensional.
    ValueError
        The input array is not of a base type, i.e. it holds something
        else than numerical and string-like values (e.g. numpy void or
        object-like types).

    Returns
    -------
    numerical_indices : numpy.ndarray
        Indices (column names for structured arrays) of the numerical
        columns.
    non_numerical_indices : numpy.ndarray
        Indices (column names for structured arrays) of the non-numerical
        columns.
    """
    if not isinstance(array, np.ndarray):
        raise TypeError('The input should be a numpy array-like.')
    if not fuav.is_2d_array(array):
        raise IncorrectShapeError('The input array should be 2-dimensional.')
    if not fuav.is_base_array(array):
        raise ValueError('indices_by_type only supports input arrays that '
                         'hold base numpy types, i.e. numerical and '
                         'string-like -- numpy void and object-like types are '
                         'not allowed.')

    if fuav.is_structured_array(array):
        assert len(array.dtype) > 1, 'This should be a 2D array.'

        # Walk the fields once, dropping each name into the right bucket
        numerical_names = []  # type: list
        other_names = []  # type: list
        for field_name in array.dtype.names:
            if fuav.is_numerical_dtype(array.dtype[field_name]):
                bucket = numerical_names
            else:
                bucket = other_names
            bucket.append(field_name)

        return np.array(numerical_names), np.array(other_names)

    # A classic array is homogeneous: one side gets every column index and
    # the other side gets an empty integer array
    is_numerical = fuav.is_numerical_array(array)
    all_columns = np.array(range(array.shape[1]))
    no_columns = np.empty((0, ), dtype='i8')

    if is_numerical:
        return all_columns, no_columns
    return no_columns, all_columns