def confusion_matrix_per_subgroup(
        dataset: np.ndarray,
        #
        ground_truth: np.ndarray,
        predictions: np.ndarray,
        #
        column_index: Index,
        groupings: Optional[List[Union[float, Tuple[str]]]] = None,
        numerical_bins_number: int = 5,
        treat_as_categorical: Optional[bool] = None,
        #
        labels: Optional[List[Union[str, float]]] = None
) -> Tuple[List[np.ndarray], List[str]]:
    """
    Computes confusion matrices for every defined sub-population.

    This is useful for computing a variety of performance metrics for each
    sub-population. The sub-populations are defined by grouping the
    ``dataset`` on the selected column; the resulting index-based grouping is
    validated and then passed to the
    :func:`fatf.utils.metrics.tools.confusion_matrix_per_subgroup_indexed`
    function.

    For warnings raised by this method please see the documentation of
    :func:`fatf.utils.data.tools.validate_indices_per_bin` function.

    Parameters
    ----------
    dataset, column_index, groupings, numerical_bins_number, \
and treat_as_categorical
        These parameters are described in the documentation of
        :func:`fatf.utils.data.tools.group_by_column` function and are used to
        define a grouping (i.e. sub-populations). If you have your own
        index-based grouping and would like to get sub-population-based
        confusion matrices, please consider using
        :func:`fatf.utils.metrics.tools.confusion_matrix_per_subgroup_indexed`
        function.
    ground_truth, predictions, and labels
        These parameters are described in the documentation of
        :func:`fatf.utils.metrics.tools.get_confusion_matrix` function and are
        used to calculate confusion matrices.

    Raises
    ------
    AssertionError
        The validation of the binned indices did not return a truthy value.

    Returns
    -------
    population_confusion_matrix : List[numpy.ndarray]
        A list of confusion matrices for each sub-population.
    bin_names : List[strings]
        The name of every sub-population (binning results) defined by the
        feature ranges for a numerical feature and feature value sets for a
        categorical feature.
    """
    # pylint: disable=too-many-arguments
    indices_per_bin, bin_names = fudt.group_by_column(
        dataset, column_index, groupings, numerical_bins_number,
        treat_as_categorical)

    # The validation is deliberately called outside of the ``assert``
    # statement: an ``assert`` is compiled away when Python runs with
    # optimisation enabled (``-O``), which would silently skip the validation
    # call together with any warnings that it emits.
    is_valid = fudt.validate_indices_per_bin(indices_per_bin)
    assert is_valid, 'Binned indices list is invalid.'

    population_confusion_matrix = confusion_matrix_per_subgroup_indexed(
        indices_per_bin, ground_truth, predictions, labels)
    return population_confusion_matrix, bin_names
def test_group_by_column_errors():
    """
    Tests :func:`fatf.utils.data.tools.group_by_column` for errors.

    Every invalid call is expected to raise a given exception type carrying an
    exact message. The two closing scenarios check the user warnings that are
    emitted together with the grouping that is nevertheless returned.
    """
    # Expected messages -- input data array
    msg_shape = 'The input array should be 2-dimensional.'
    msg_base_type = ('The input array should be of a base type (a mixture '
                     'of numerical and textual types).')
    # Expected messages -- column index
    msg_index_invalid = ('*{}* is not a valid column index for the input '
                         'dataset.')
    msg_index_type = ('The column index can either be a string or an '
                      'integer.')
    # Expected messages -- number of numerical bins
    msg_bins_value = 'The numerical_bins_number needs to be at least 2.'
    msg_bins_type = ('The numerical_bins_number parameter has to be an '
                     'integer.')
    # Expected messages -- groupings of a numerical column
    msg_num_empty = ('A numerical grouping list has to '
                     'contain at least one element.')
    msg_num_inner = ('For a numerical column all of the '
                     'grouping items must be numbers. *{}* is '
                     'not a number.')
    msg_num_monotonic = ('The numbers in the groupings '
                         'list have to be monotonically '
                         'increasing.')
    msg_num_general = ('Since a numerical column was chosen '
                       'the grouping must be a list of bin '
                       'boundaries or None.')
    # Expected messages -- groupings of a categorical column
    msg_cat_general = ('Since a categorical column was chosen '
                       'the grouping must be a list of tuples '
                       'representing categorical values '
                       'grouping or None for the default '
                       'grouping.')
    msg_cat_tuple = ('For a categorical column all of the '
                     'grouping items must be tuples. *{}* '
                     'is not a tuple.')
    msg_cat_empty = ('A categorical grouping list has to '
                     'contain at least one element.')
    msg_cat_extra = ('*{}* value is not present in the '
                     'selected column.')
    msg_cat_unique = ('Some values are duplicated across '
                      'tuples.')
    # Expected messages -- treat_as_categorical flag
    msg_tac_type = 'The treat_as_categorical parameter has to be a boolean.'
    # Expected messages -- user warnings
    warn_values = ('The following values in the selected column were '
                   'not accounted for in the grouping tuples:\n{}.')
    warn_indices = ('The following row indices could not be accounted for:'
                    '\n{}.\n For a numerical column there may have been '
                    'some numpy.nan therein. For a categorical column '
                    'some of the column values were probably not '
                    'specified in the grouping, in which case there '
                    'should be a separate user warning.')

    numerical = np.array([[1, 2], [3, 4]])
    categorical = np.array([['a', 'b'], [3, 4]])

    def expect_error(error_type, message, *args, **kwargs):
        # Run the grouping and compare the raised exception with the
        # expected type and message.
        with pytest.raises(error_type) as exin:
            fudt.group_by_column(*args, **kwargs)
        assert str(exin.value) == message

    def expect_warnings(messages, *args, **kwargs):
        # Run the grouping, compare all of the recorded user warnings (in
        # order) with the expected messages and hand back the grouping.
        with pytest.warns(UserWarning) as caught:
            indices, names = fudt.group_by_column(*args, **kwargs)
        assert len(caught) == len(messages)
        for expected, record in zip(messages, caught):
            assert expected == str(record.message)
        return indices, names

    # Malformed data array
    expect_error(IncorrectShapeError, msg_shape, np.ones((2, 2, 2)), 1)
    expect_error(ValueError, msg_base_type,
                 np.array([[1, 2], [3, None]]), None)

    # Malformed column index
    expect_error(IndexError, msg_index_invalid.format(3), numerical, 3)
    expect_error(TypeError, msg_index_type, numerical, None)

    # Malformed number of bins
    expect_error(ValueError, msg_bins_value,
                 numerical, 1, numerical_bins_number=1)
    expect_error(TypeError, msg_bins_type,
                 numerical, 1, numerical_bins_number='1')

    # Malformed groupings -- numerical column
    expect_error(TypeError, msg_num_general, numerical, 1, groupings='a')
    expect_error(ValueError, msg_num_empty, numerical, 1, groupings=[])
    expect_error(TypeError, msg_num_inner.format('a'),
                 numerical, 1, groupings=[5, 7.3, 8, 'a'])
    expect_error(ValueError, msg_num_monotonic,
                 numerical, 1, groupings=[5, 7.3, 8, 7.9, 11])

    # Malformed groupings -- categorical column
    expect_error(TypeError, msg_cat_general, categorical, 1, groupings='a')
    expect_error(TypeError, msg_cat_tuple.format("['a']"),
                 categorical, 0, groupings=[('3', ), ['a'], ('a', )])
    expect_error(ValueError, msg_cat_empty, categorical, 1, groupings=[])
    expect_error(ValueError, msg_cat_extra.format('1'),
                 categorical, 0, groupings=[('3', 'a'), ('1', )])
    expect_error(ValueError, msg_cat_unique,
                 categorical, 0, groupings=[('3', 'a'), ('a', )])

    # Malformed treat_as_categorical flag
    expect_error(TypeError, msg_tac_type,
                 categorical, 0, treat_as_categorical='None')

    # A categorical value missing from the grouping: two warnings
    grp, grpn = expect_warnings(
        [warn_values.format("{'a'}"), warn_indices.format('{0}')],
        categorical, 0, groupings=[('3', )])
    assert grp == [[1]]
    assert grpn == ["('3',)"]

    # A numpy.nan in a numerical column: one warning
    with_nan = np.array([[0, np.inf], [0, 7], [0, -np.inf], [0, np.nan]])
    grp, grpn = expect_warnings(
        [warn_indices.format('{3}')], with_nan, 1, groupings=[1])
    assert grp == [[2], [0, 1]]
    assert grpn == ['x <= 1', '1 < x']
def test_group_by_column():
    """
    Tests :func:`fatf.utils.data.tools.group_by_column`.

    Numerical and categorical columns of classic and structured arrays are
    grouped with default and custom settings; both the row indices per group
    and the group names are compared with the expected values.
    """
    tac_warning = ('Selected feature is categorical, therefore cannot be '
                   'treated as numerical. The feature will be treated as '
                   'categorical despite the treat_as_categorical '
                   'parameter set to False.')

    # Expected groupings, stored as (row indices per group, group names)
    numerical_1 = (
        [[0, 1, 2, 5], [4], [], [], [3]],
        ['x <= 7.6', '7.6 < x <= 16.2', '16.2 < x <= 24.799999999999997',
         '24.799999999999997 < x <= 33.4', '33.4 < x']
    )  # yapf: disable
    numerical_0 = (
        [[0, 5], [4], [1, 2, 3]],
        ['x <= 0.05', '0.05 < x <= 7.7', '7.7 < x']
    )  # yapf: disable
    numerical_2 = (
        [[3], [0, 1, 2, 4, 5]],
        ['x <= -6.5', '-6.5 < x']
    )  # yapf: disable
    numerical_2_as_categorical = (
        [[3], [2], [1, 5], [0], [4]],
        ['(-22.0,)', '(2.0,)', '(5.0,)', '(6.0,)', '(9.0,)']
    )  # yapf: disable
    categorical_default = (
        [[0, 4], [3], [1, 2], [5]],
        ["('a+',)", "('a-',)", "('b+',)", "('b-',)"]
    )  # yapf: disable
    categorical_custom = (
        [[0, 3, 4], [1, 2, 5]],
        ["('a+', 'a-')", "('b+', 'b-')"]
    )  # yapf: disable

    num_array = np.array([
        [0, 5, 6],
        [9, -1, 5],
        [14, 7, 2],
        [55, 42, -22],
        [7.7, 8.8, 9],
        [0.01, 7.0001, 5]
    ])  # yapf: disable
    struct_array = np.array(
        [(0, 'a+', 6),
         (9, 'b+', 5),
         (14, 'b+', 2),
         (55, 'a-', -22),
         (7.7, 'a+', 9),
         (0.01, 'b-', 5)],
        dtype=[('a', np.float32), ('b', 'U2'), ('c', np.int32)]
    )  # yapf: disable
    cat_array = np.array([
        ['a', 'a+', '1'],
        ['b', 'b+', '2'],
        ['b', 'b+', '3'],
        ['a', 'a-', '3'],
        ['b', 'a+', '2'],
        ['b', 'b-', '1']
    ])  # yapf: disable

    def check(expected, *args, **kwargs):
        # Group with the given arguments and compare both outputs with the
        # expected (indices, names) pair.
        expected_indices, expected_names = expected
        indices, names = fudt.group_by_column(*args, **kwargs)
        assert indices == expected_indices
        assert names == expected_names

    # Classic array, numerical -- all default
    check(numerical_1, num_array, 1)
    check(numerical_1, num_array, 1, treat_as_categorical=False)
    check(numerical_2_as_categorical,
          num_array, 2, treat_as_categorical=True)

    # Structured array, numerical -- custom bins number (treat_as_categorical)
    check(numerical_2, struct_array, 'c', numerical_bins_number=2)

    # Structured array, numerical -- custom intervals
    check(numerical_0, struct_array, 'a', groupings=[0.05, 7.7])

    # Classic array, categorical -- default binning (treat_as_categorical)
    check(categorical_default, cat_array, 1)
    check(categorical_default, cat_array, 1, treat_as_categorical=True)
    # ...a categorical column cannot be treated as numerical, hence a warning
    with pytest.warns(UserWarning) as warning:
        check(categorical_default, cat_array, 1, treat_as_categorical=False)
    assert len(warning) == 1
    assert str(warning[0].message) == tac_warning
    # ...every value placed in its own, explicitly listed group
    check(categorical_default, cat_array, 1,
          groupings=[('a-', ), ('b+', ), ('a+', ), ('b-', )])

    # Structured array, categorical -- custom bins
    check(categorical_custom, struct_array, 'b',
          groupings=[('a-', 'a+'), ('b-', 'b+')])
# ---------------------
#
# Sample Size Disparity can be measured by grouping the data with the
# :func:`fatf.utils.data.tools.group_by_column` function and counting the
# number of instances that fall into each group. Repeating this count for the
# *target vector* (ground truth) shows whether the classes in our data set are
# balanced within each sub-group defined by a set of values of that feature.
#
# In the example below we first check whether roughly the same number of data
# points was collected for *males* and *females*. We then check whether the
# class distribution (*fail* and *success*) is similar for these two
# sub-populations.

# Use the unique values of the 'gender' column to define the groups
grouping_column = 'gender'
grouping_indices, grouping_names = fatf_data_tools.group_by_column(
    hr_X, grouping_column, treat_as_categorical=True)

# Report how many instances belong to each group
print('The grouping based on the *{}* feature has the '
      'following distribution:'.format(grouping_column))
for grouping_name, grouping_idx in zip(grouping_names, grouping_indices):
    print(' * "{}" grouping has {} instances.'.format(
        grouping_name, len(grouping_idx)))

# Collect the class distribution of every group, keyed by the group name
grouping_class_distribution = {}
for grouping_name, grouping_idx in zip(grouping_names, grouping_indices):
    # Target values of the rows that belong to the current group
    sg_y = hr_y[grouping_idx]
    # Unique classes within the group and their number of occurrences
    sg_classes, sg_counts = np.unique(sg_y, return_counts=True)

    grouping_class_distribution[grouping_name] = {}
def sampling_bias(
        dataset: np.ndarray,
        column_index: Index,
        groupings: Optional[List[Union[float, Tuple[str]]]] = None,
        numerical_bins_number: int = 5,
        treat_as_categorical: Optional[bool] = None
) -> Tuple[List[int], np.ndarray, List[str]]:
    """
    Computes information needed for evaluating and remedying sampling bias.

    Computes the *number of instances* per sub-population defined by the input
    parameters, the *weights* that can be used for cost-sensitive learning to
    mitigate the sampling bias and the *names* of each sub-population (in
    terms of the selected feature and its values).

    .. note::
       To evaluate the sampling bias in terms of a binary ``True``/``False``
       answer please use the
       :func:`fatf.accountability.data.measures.sampling_bias_check` function
       or :func:`fatf.accountability.data.measures.sampling_bias_grid_check`
       function to see sub-population pairwise sampling bias.

    For warnings raised by this method please see the documentation of
    :func:`fatf.utils.data.tools.validate_indices_per_bin` function.

    Parameters
    ----------
    dataset, column_index, groupings, numerical_bins_number, and \
treat_as_categorical
        These parameters are described in the documentation of
        :func:`fatf.utils.data.tools.group_by_column` function and are used to
        define a grouping (i.e. sub-populations). If you have your own
        index-based grouping and would like to get counts and weights for
        cost-sensitive learning, please consider using
        :func:`fatf.accountability.data.measures.sampling_bias_indexed`
        function.

    Raises
    ------
    AssertionError
        The validation of the binned indices did not return a truthy value.

    Returns
    -------
    counts : List[integers]
        A number of data points for each sub-population defined by
        partitioning of the selected feature.
    weights : numpy.ndarray
        A weight for every instance (that could be grouped, i.e. assigned to
        one of the sub-populations) in the input ``dataset``. The weights are
        useful for training a cost-sensitive classifier to mitigate the
        sampling bias. The weights are inversely proportional to the number of
        instance occurrences for every sub-population.
    bin_names : List[strings]
        The name of every sub-population (binning results) defined by the
        feature ranges for a numerical feature and feature value sets for a
        categorical feature.
    """
    indices_per_bin, bin_names = fudt.group_by_column(
        dataset, column_index, groupings, numerical_bins_number,
        treat_as_categorical)

    # The validation is deliberately called outside of the ``assert``
    # statement: an ``assert`` is compiled away when Python runs with
    # optimisation enabled (``-O``), which would silently skip the validation
    # call together with any warnings that it emits.
    is_valid = fudt.validate_indices_per_bin(indices_per_bin)
    assert is_valid, 'Binned indices list is invalid.'

    # One count per sub-population, in the same order as ``bin_names``
    counts = [len(i) for i in indices_per_bin]
    weights = _get_weights(indices_per_bin)

    return counts, weights, bin_names