def test_example_model(self):
  train_tf_file = self._write_tf_records(self._create_data())
  classifier = example_model.train_model(self._model_dir, train_tf_file,
                                         LABEL, TEXT_FEATURE, FEATURE_MAP)
  validate_tf_file = self._write_tf_records(self._create_data())
  tfma_eval_result_path = os.path.join(self._model_dir, 'tfma_eval_result')
  example_model.evaluate_model(classifier, validate_tf_file,
                               tfma_eval_result_path, SLICE, LABEL,
                               FEATURE_MAP)

  expected_slice_keys = [
      'Overall', 'slice:slice3', 'slice:slice1', 'slice:slice2'
  ]
  evaluation_results = tfma.load_eval_result(tfma_eval_result_path)

  self.assertLen(evaluation_results.slicing_metrics, 4)
  # Verify that false_positive_rate metrics are computed for all values of
  # the slice.
  for (slice_key, metric_value) in evaluation_results.slicing_metrics:
    slice_key = slicer.stringify_slice_key(slice_key)
    self.assertIn(slice_key, expected_slice_keys)
    self.assertGreaterEqual(
        1.0, metric_value['']['']['post_export_metrics/[email protected]']
        ['doubleValue'])
    self.assertLessEqual(
        0.0, metric_value['']['']['post_export_metrics/[email protected]']
        ['doubleValue'])
def process(self, element: types.Extracts) -> List[types.Extracts]:
  # Slice on transformed features if available.
  features_dicts = []
  if (constants.TRANSFORMED_FEATURES_KEY in element and
      element[constants.TRANSFORMED_FEATURES_KEY] is not None):
    transformed_features = element[constants.TRANSFORMED_FEATURES_KEY]
    # If there is only one model, the output is stored without keying on the
    # model name.
    if not self._eval_config or len(self._eval_config.model_specs) == 1:
      features_dicts.append(transformed_features)
    else:
      # Search for slices in each model's transformed features output.
      for spec in self._eval_config.model_specs:
        if spec.name in transformed_features:
          features_dicts.append(transformed_features[spec.name])
  # Search for slices first in the transformed features (if any). If a match
  # is not found there, then search in the raw features.
  slices = list(
      slicer.get_slices_for_features_dicts(
          features_dicts, util.get_features_from_extracts(element),
          self._slice_spec))

  # Make a shallow copy so we don't mutate the original.
  element_copy = copy.copy(element)
  element_copy[constants.SLICE_KEY_TYPES_KEY] = slices
  # Add a list of stringified slice keys to be materialized to the output
  # table.
  if self._materialize:
    element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
        name=constants.SLICE_KEYS_KEY,
        value=list(
            slicer.stringify_slice_key(x).encode('utf-8') for x in slices))
  return [element_copy]
def convert_slicing_metrics_to_ui_input(
    slicing_metrics: List[Tuple[slicer.SliceKeyOrCrossSliceKeyType,
                                view_types.MetricsByOutputName]],
    slicing_column: Optional[str] = None,
    slicing_spec: Optional[slicer.SingleSliceSpec] = None,
    output_name: str = '',
    multi_class_key: str = '') -> Optional[List[Dict[str, Any]]]:
  """Converts slicing metrics to the input format expected by the UI.

  Args:
    slicing_metrics: tfma.EvalResult.slicing_metrics.
    slicing_column: The slicing column to filter results on. If both
      slicing_column and slicing_spec are None, show all eval results.
    slicing_spec: The slicing spec to filter results on. If both
      slicing_column and slicing_spec are None, show all eval results.
    output_name: The output name associated with the metric (for multi-output
      models).
    multi_class_key: The multi-class key associated with the metric (for
      multi-class models).

  Returns:
    A list of dicts, one per slice, where each dict contains the keys
    'sliceValue', 'slice', and 'metrics'.

  Raises:
    ValueError: If no relevant eval result is found, or if both
      slicing_column and slicing_spec are set.
  """
  if slicing_column and slicing_spec:
    raise ValueError(
        'Only one of the "slicing_column" and "slicing_spec" parameters '
        'can be set.')
  if slicing_column:
    slicing_spec = slicer.SingleSliceSpec(columns=[slicing_column])

  data = []
  for (slice_key, metric_value) in slicing_metrics:
    if (metric_value is not None and output_name in metric_value and
        multi_class_key in metric_value[output_name]):
      metrics = metric_value[output_name][multi_class_key]
      # Add evaluation data for cross slice comparison.
      if slicer.is_cross_slice_key(slice_key):
        _add_cross_slice_key_data(slice_key, metrics, data)
      # Add evaluation data for regular slices.
      elif (slicing_spec is None or not slice_key or
            slicing_spec.is_slice_applicable(slice_key)):
        data.append({
            'sliceValue': stringify_slice_key_value(slice_key),
            'slice': slicer.stringify_slice_key(slice_key),
            'metrics': metrics
        })
  if not data:
    raise ValueError(
        'No eval result found for output_name:"%s" and '
        'multi_class_key:"%s" and slicing_column:"%s" and slicing_spec:"%s".' %
        (output_name, multi_class_key, slicing_column, slicing_spec))
  return data
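# Usage sketch (added for illustration, not part of the original source). The
# eval-result path and slicing column below are hypothetical;
# tfma.load_eval_result is the same loader used in test_example_model above.
import tensorflow_model_analysis as tfma

eval_result = tfma.load_eval_result('/path/to/tfma_eval_result')
ui_data = convert_slicing_metrics_to_ui_input(
    eval_result.slicing_metrics, slicing_column='gender')
for entry in ui_data:
  print(entry['slice'], entry['sliceValue'], entry['metrics'])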
def convert_eval_result_to_ui_input(
    eval_result: model_eval_lib.EvalResult,
    slicing_column: Optional[Text] = None,
    slicing_spec: Optional[slicer.SingleSliceSpec] = None,
    output_name: Text = '',
    multi_class_key: Text = '') -> Optional[List[Dict[Text, Any]]]:
  """Converts an eval result to the input format expected by the UI.

  Args:
    eval_result: A tfma.EvalResult.
    slicing_column: The slicing column to filter results on. If both
      slicing_column and slicing_spec are None, show all eval results.
    slicing_spec: The slicing spec to filter results on. If both
      slicing_column and slicing_spec are None, show all eval results.
    output_name: The output name associated with the metric (for multi-output
      models).
    multi_class_key: The multi-class key associated with the metric (for
      multi-class models).

  Returns:
    A list of dicts, one per slice, where each dict contains the keys
    'sliceValue', 'slice', and 'metrics'.

  Raises:
    ValueError: If no relevant eval result is found, or if both
      slicing_column and slicing_spec are set.
  """
  if slicing_column and slicing_spec:
    raise ValueError(
        'Only one of the "slicing_column" and "slicing_spec" parameters '
        'can be set.')
  if slicing_column:
    slicing_spec = slicer.SingleSliceSpec(columns=[slicing_column])

  data = []
  for (slice_key, metric_value) in eval_result.slicing_metrics:
    slice_key_ok = (
        slicing_spec is None or not slice_key or
        slicing_spec.is_slice_applicable(slice_key))
    metric_ok = (
        output_name in metric_value and
        multi_class_key in metric_value[output_name])
    if slice_key_ok and metric_ok:
      data.append({
          'sliceValue': stringify_slice_key_value(slice_key),
          'slice': slicer.stringify_slice_key(slice_key),
          'metrics': metric_value[output_name][multi_class_key]
      })
  if not data:
    raise ValueError(
        'No eval result found for output_name:"%s" and '
        'multi_class_key:"%s" and slicing_column:"%s" and slicing_spec:"%s".' %
        (output_name, multi_class_key, slicing_column, slicing_spec))
  return data
def testStringifySliceKey(self):
  test_cases = [
      ('overall', (), 'Overall'),
      ('one bytes feature', (('age_str', '5'),), 'age_str:5'),
      ('one int64 feature', (('age', 1),), 'age:1'),
      ('mixed', (('age', 1), ('gender', 'f')), 'age_X_gender:1_X_f'),
      ('more', (('age', 1), ('gender', 'f'), ('interest', 'cars')),
       'age_X_gender_X_interest:1_X_f_X_cars'),
      ('unicode', (('text', b'\xe4\xb8\xad\xe6\x96\x87'),),
       u'text:\u4e2d\u6587'),
  ]  # pyformat: disable
  for (name, slice_key, stringified_key) in test_cases:
    self.assertEqual(
        stringified_key, slicer.stringify_slice_key(slice_key), msg=name)
def _add_cross_slice_key_data(slice_key: slicer.CrossSliceKeyType,
                              metrics: view_types.MetricsByTextKey,
                              data: List[Any]):
  """Adds data for a cross slice key.

  The baseline and comparison slice keys are joined by '__XX__'.

  Args:
    slice_key: Cross slice key.
    metrics: Metrics data for the cross slice key.
    data: List where UI data is to be appended.
  """
  baseline_key = slice_key[0]
  comparison_key = slice_key[1]
  stringify_slice_value = (
      stringify_slice_key_value(baseline_key) + '__XX__' +
      stringify_slice_key_value(comparison_key))
  stringify_slice = (
      slicer.stringify_slice_key(baseline_key) + '__XX__' +
      slicer.stringify_slice_key(comparison_key))
  data.append({
      'sliceValue': stringify_slice_value,
      'slice': stringify_slice,
      'metrics': metrics
  })
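# Illustration (added; values are hypothetical). A cross slice key is a
# (baseline, comparison) pair of regular slice keys; given the
# stringification rules exercised in testStringifySliceKey above, the pair
# below renders as 'gender:f__XX__gender:m'.
data = []
_add_cross_slice_key_data(
    slice_key=((('gender', 'f'),), (('gender', 'm'),)),
    metrics={'accuracy': 0.9},  # hypothetical metrics payload
    data=data)
assert data[0]['slice'] == 'gender:f__XX__gender:m'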
def process(
    self, element: types.Extracts,
    slice_spec: List[slicer.SingleSliceSpec]) -> List[types.Extracts]:
  # Slice on transformed features if available.
  features_dicts = []
  if (constants.TRANSFORMED_FEATURES_KEY in element and
      element[constants.TRANSFORMED_FEATURES_KEY] is not None):
    transformed_features = element[constants.TRANSFORMED_FEATURES_KEY]
    # If there is only one model, the output is stored without keying on the
    # model name.
    if not self._eval_config or len(self._eval_config.model_specs) == 1:
      features_dicts.append(transformed_features)
    else:
      # Search for slices in each model's transformed features output.
      for spec in self._eval_config.model_specs:
        if spec.name in transformed_features:
          features_dicts.append(transformed_features[spec.name])
  # Search for slices first in the transformed features (if any). If a match
  # is not found there, then search in the raw features.
  slice_keys = list(
      slicer.get_slices_for_features_dicts(
          features_dicts, util.get_features_from_extracts(element),
          slice_spec))
  # If SLICE_KEY_TYPES_KEY already exists, the SqlSliceKeyExtractor has
  # already generated some slice keys; add them to the current slice_keys
  # list.
  if (constants.SLICE_KEY_TYPES_KEY in element and
      element[constants.SLICE_KEY_TYPES_KEY]):
    slice_keys.extend(element[constants.SLICE_KEY_TYPES_KEY])
  unique_slice_keys = list(set(slice_keys))
  if len(slice_keys) != len(unique_slice_keys):
    self._duplicate_slice_keys_counter.inc()

  # Make a shallow copy so we don't mutate the original.
  element_copy = copy.copy(element)
  element_copy[constants.SLICE_KEY_TYPES_KEY] = (
      slicer.slice_keys_to_numpy_array(unique_slice_keys))
  # Add a list of stringified slice keys to be materialized to the output
  # table.
  if self._materialize:
    element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
        name=constants.SLICE_KEYS_KEY,
        value=list(
            slicer.stringify_slice_key(x).encode('utf-8')
            for x in unique_slice_keys))
  return [element_copy]
def process(self, element: types.Extracts) -> List[types.Extracts]:
  features = util.get_features_from_extracts(element)
  slices = list(
      slicer.get_slices_for_features_dict(features, self._slice_spec))

  # Make a shallow copy so we don't mutate the original.
  element_copy = copy.copy(element)
  element_copy[constants.SLICE_KEY_TYPES_KEY] = slices
  # Add a list of stringified slice keys to be materialized to the output
  # table.
  if self._materialize:
    element_copy[constants.SLICE_KEYS_KEY] = types.MaterializedColumn(
        name=constants.SLICE_KEYS_KEY,
        value=list(
            slicer.stringify_slice_key(x).encode('utf-8') for x in slices))
  return [element_copy]
def get_slices_as_dataframe(
    slices: List[SliceComparisonResult],
    additional_metric_keys: Optional[List[metric_types.MetricKey]] = None
) -> pd.DataFrame:
  """Returns top slices as a dataframe.

  Args:
    slices: List of ordered slices.
    additional_metric_keys: An optional list of additional metric keys to
      display.

  Returns:
    Dataframe containing information about the slices.
  """
  dataframe_data = []
  for slice_info in slices:
    slice_metrics = _get_metrics_as_dict(slice_info.raw_slice_metrics)
    row_data = {
        'Slice': slicer_lib.stringify_slice_key(slice_info.slice_key),
        'Size': slice_info.num_examples,
        'Slice metric': slice_info.slice_metric,
        'Base metric': slice_info.base_metric,
        'P-Value': slice_info.p_value,
        'Effect size': slice_info.effect_size
    }
    if additional_metric_keys:
      for metric_key in additional_metric_keys:
        # The MetricKeys are converted to strings for the column names since
        # all of the other column names in the dataframe are strings.
        row_data[str(metric_key)] = slice_metrics[metric_key].unsampled_value
    dataframe_data.append(row_data)
  # The column labels are used to ensure that the order of the columns is
  # always the same.
  ordered_column_labels = [
      'Slice', 'Size', 'Slice metric', 'Base metric', 'P-Value', 'Effect size'
  ]
  if additional_metric_keys:
    ordered_column_labels.extend(
        [str(metric_key) for metric_key in additional_metric_keys])
  dataframe = pd.DataFrame(dataframe_data, columns=ordered_column_labels)
  dataframe.set_index('Slice', inplace=True)
  return dataframe
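# Usage sketch (added; 'top_slices' is assumed to be the output of
# find_top_slices, shown further below). Produces one row per slice, plus one
# extra column per additional MetricKey.
df = get_slices_as_dataframe(
    top_slices,
    additional_metric_keys=[metric_types.MetricKey('example_count')])
print(df.head())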
def get_slices_as_dataframe(
    slices: List[SliceComparisonResult],
    additional_metric_names: Optional[List[Text]] = None) -> pd.DataFrame:
  """Returns top slices as a dataframe.

  Args:
    slices: List of ordered slices.
    additional_metric_names: An optional list of additional metric names to
      display.

  Returns:
    Dataframe containing information about the slices.
  """
  rows = []
  for slice_info in slices:
    slice_metrics = _get_metrics_as_dict(slice_info.raw_slice_metrics)
    row = {
        'Slice': slicer_lib.stringify_slice_key(slice_info.slice_key),
        'Size': slice_info.num_examples,
        'Slice metric': slice_info.slice_metric,
        'Base metric': slice_info.base_metric,
        'P-Value': slice_info.p_value,
        'Effect size': slice_info.effect_size
    }
    if additional_metric_names:
      for metric_key in additional_metric_names:
        row[metric_key] = slice_metrics[metric_key].unsampled_value
    rows.append(row)
  # The column labels ensure that the order of the columns is always the same.
  ordered_columns = [
      'Slice', 'Size', 'Slice metric', 'Base metric', 'P-Value', 'Effect size'
  ]
  if additional_metric_names:
    ordered_columns.extend(additional_metric_names)
  dataframe = pd.DataFrame(rows, columns=ordered_columns)
  dataframe.set_index('Slice', inplace=True)
  return dataframe
def find_all_slices(
    results: List[Tuple[slicer.SliceKeyType, Dict[Text, Any]]],
    slicing_spec: slicer.SingleSliceSpec
) -> List[Dict[Text, Union[Dict[Text, Any], Text]]]:
  """Extracts the slicing metrics that match the given slicing spec.

  Args:
    results: A list of records. Each record is a tuple of (slice_name,
      {metric_name: metric_value}).
    slicing_spec: The spec to slice on.

  Returns:
    A list of {slice, metrics} dicts.
  """
  data = []
  for (slice_key, metric_value) in results:
    if slicing_spec.is_slice_applicable(slice_key):
      data.append({
          'slice': slicer.stringify_slice_key(slice_key),
          'metrics': metric_value
      })
  return data  # pytype: disable=bad-return-type
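# Usage sketch (added; 'eval_result' is assumed to be loaded as in the
# earlier tfma.load_eval_result example, and 'age' is a hypothetical column).
age_slices = find_all_slices(
    eval_result.slicing_metrics,
    slicing_spec=slicer.SingleSliceSpec(columns=['age']))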
def find_top_slices(metrics: List[metrics_for_slice_pb2.MetricsForSlice],
                    metric_key: Text,
                    statistics: statistics_pb2.DatasetFeatureStatisticsList,
                    comparison_type: Text = 'HIGHER',
                    min_num_examples: int = 10,
                    num_top_slices: int = 10,
                    rank_by: Text = 'EFFECT_SIZE'):
  """Finds the top-k slices.

  Args:
    metrics: List of slice metrics protos. We assume that the metrics have
      the MetricValue.confidence_interval field populated. This will be
      populated when the metrics are computed with confidence intervals
      enabled.
    metric_key: Name of the metric based on which significance testing is
      done.
    statistics: Data statistics used to configure AutoSliceKeyExtractor.
    comparison_type: Type of comparison indicating if we are looking for
      slices whose metric is higher (`HIGHER`) or lower (`LOWER`) than the
      metric of the base slice (overall dataset).
    min_num_examples: Minimum number of examples that a slice should have.
    num_top_slices: Number of top slices to return.
    rank_by: Indicates how the slices should be ordered in the result.

  Returns:
    List of ordered slices.
  """
  assert comparison_type in ['HIGHER', 'LOWER']
  assert min_num_examples > 0
  assert 0 < num_top_slices
  assert rank_by in ['EFFECT_SIZE', 'PVALUE']

  metrics_dict = {
      slicer_lib.deserialize_slice_key(slice_metrics.slice_key): slice_metrics
      for slice_metrics in metrics
  }
  overall_slice_metrics = metrics_dict[()]
  del metrics_dict[()]

  boundaries = auto_slice_key_extractor._get_bucket_boundaries(statistics)  # pylint: disable=protected-access
  overall_metrics_dict = _get_metrics_as_dict(overall_slice_metrics)
  to_be_sorted_slices = []
  for slice_key, slice_metrics in metrics_dict.items():
    slice_metrics_dict = _get_metrics_as_dict(slice_metrics)
    num_examples = slice_metrics_dict['example_count'].unsampled_value
    if num_examples < min_num_examples:
      continue

    # Prune non-interesting slices.
    if np.isnan(slice_metrics_dict[metric_key].unsampled_value):
      continue
    if comparison_type == 'HIGHER':
      comparison_fn = operator.le
    else:
      comparison_fn = operator.ge
    if comparison_fn(slice_metrics_dict[metric_key].unsampled_value,
                     overall_metrics_dict[metric_key].unsampled_value):
      continue

    # Only consider statistically significant slices.
    is_significant, pvalue = _is_significant_slice(
        slice_metrics_dict[metric_key].unsampled_value,
        slice_metrics_dict[metric_key].sample_standard_deviation,
        slice_metrics_dict['example_count'].unsampled_value,
        overall_metrics_dict[metric_key].unsampled_value,
        overall_metrics_dict[metric_key].sample_standard_deviation,
        overall_metrics_dict['example_count'].unsampled_value,
        comparison_type)
    if not is_significant:
      continue

    # Format the slice info (feature names, values) in the proto into a
    # slice key.
    transformed_slice_key = []
    for (feature, value) in slice_key:
      if feature.startswith(
          auto_slice_key_extractor.TRANSFORMED_FEATURE_PREFIX):
        feature = feature[len(
            auto_slice_key_extractor.TRANSFORMED_FEATURE_PREFIX):]
        value = _bucket_to_range(value, boundaries[feature])
      transformed_slice_key.append((feature, value))
    slice_key = slicer_lib.stringify_slice_key(tuple(transformed_slice_key))

    # Compute the effect size for the slice.
    effect_size = _compute_effect_size(
        slice_metrics_dict[metric_key].unsampled_value,
        slice_metrics_dict[metric_key].sample_standard_deviation,
        overall_metrics_dict[metric_key].unsampled_value,
        overall_metrics_dict[metric_key].sample_standard_deviation)
    to_be_sorted_slices.append(
        SliceComparisonResult(
            slice_key, num_examples,
            slice_metrics_dict[metric_key].unsampled_value,
            overall_metrics_dict[metric_key].unsampled_value, pvalue,
            effect_size))

  # Rank the slices.
  ranking_fn, reverse = operator.attrgetter('effect_size'), True
  if rank_by == 'PVALUE':
    ranking_fn, reverse = operator.attrgetter('pvalue'), False
  result = sorted(
      to_be_sorted_slices, key=ranking_fn, reverse=reverse)[:num_top_slices]
  return result
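# Usage sketch (added; inputs are assumptions, not from the original source):
# 'metrics_protos' are MetricsForSlice protos computed with confidence
# intervals enabled, and 'statistics' is the TFDV DatasetFeatureStatisticsList
# used to configure AutoSliceKeyExtractor.
top_slices = find_top_slices(
    metrics=metrics_protos,
    metric_key='accuracy',    # hypothetical metric name
    statistics=statistics,
    comparison_type='LOWER',  # slices doing worse than the overall dataset
    min_num_examples=50,
    num_top_slices=5,
    rank_by='EFFECT_SIZE')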
def test_get_slices_as_dataframe(self):
  input_slices = [
      auto_slicing_util.SliceComparisonResult(
          slice_key=(('native-country', 'United-States'),),
          num_examples=29170,
          slice_metric=0.09,
          base_metric=0.087,
          p_value=0,
          effect_size=0.46,
          raw_slice_metrics=text_format.Parse(
              """
              slice_key {
                single_slice_keys {
                  column: "native-country"
                  bytes_value: "United-States"
                }
              }
              metric_keys_and_values {
                key { name: "false_positives" }
                value {
                  bounded_value {
                    lower_bound { value: 1754.6514199722158 }
                    upper_bound { value: 2092.488580027784 }
                    value { value: 1923.57 }
                    methodology: POISSON_BOOTSTRAP
                  }
                  confidence_interval {
                    lower_bound { value: 1754.6514199722158 }
                    upper_bound { value: 2092.488580027784 }
                    t_distribution_value {
                      sample_mean { value: 1923.57 }
                      sample_standard_deviation { value: 85.13110418664061 }
                      sample_degrees_of_freedom { value: 99 }
                      unsampled_value { value: 1943.0 }
                    }
                  }
                }
              }
              metric_keys_and_values {
                key { name: "false_negatives" }
                value {
                  bounded_value {
                    lower_bound { value: 3595.413107983637 }
                    upper_bound { value: 4195.886892016363 }
                    value { value: 3895.65 }
                    methodology: POISSON_BOOTSTRAP
                  }
                  confidence_interval {
                    lower_bound { value: 3595.413107983637 }
                    upper_bound { value: 4195.886892016363 }
                    t_distribution_value {
                      sample_mean { value: 3895.65 }
                      sample_standard_deviation { value: 151.31253252729257 }
                      sample_degrees_of_freedom { value: 99 }
                      unsampled_value { value: 3935.0 }
                    }
                  }
                }
              }""", metrics_for_slice_pb2.MetricsForSlice())),
      auto_slicing_util.SliceComparisonResult(
          slice_key=(('age', '[58.0, 90.0)'),),
          num_examples=2999,
          slice_metric=0.09,
          base_metric=0.0875,
          p_value=7.8,
          effect_size=0.98,
          raw_slice_metrics=text_format.Parse(
              """
              slice_key {
                single_slice_keys {
                  column: "age"
                  bytes_value: "[58.0, 90.0)"
                }
              }
              metric_keys_and_values {
                key { name: "false_positives" }
                value {
                  bounded_value {
                    lower_bound { value: 167.54646972321814 }
                    upper_bound { value: 236.37353027678188 }
                    value { value: 201.96 }
                    methodology: POISSON_BOOTSTRAP
                  }
                  confidence_interval {
                    lower_bound { value: 167.54646972321814 }
                    upper_bound { value: 236.37353027678188 }
                    t_distribution_value {
                      sample_mean { value: 201.96 }
                      sample_standard_deviation { value: 17.343632837435358 }
                      sample_degrees_of_freedom { value: 99 }
                      unsampled_value { value: 204.0 }
                    }
                  }
                }
              }
              metric_keys_and_values {
                key { name: "false_negatives" }
                value {
                  bounded_value {
                    lower_bound { value: 486.4402337348782 }
                    upper_bound { value: 610.479766265122 }
                    value { value: 548.46 }
                    methodology: POISSON_BOOTSTRAP
                  }
                  confidence_interval {
                    lower_bound { value: 486.4402337348782 }
                    upper_bound { value: 610.479766265122 }
                    t_distribution_value {
                      sample_mean { value: 548.46 }
                      sample_standard_deviation { value: 31.256544914589938 }
                      sample_degrees_of_freedom { value: 99 }
                      unsampled_value { value: 554.0 }
                    }
                  }
                }
              }""", metrics_for_slice_pb2.MetricsForSlice()))
  ]
  additional_metric_keys = [
      metric_types.MetricKey('false_positives'),
      metric_types.MetricKey('false_negatives')
  ]
  expected_dataframe_data = [{
      'Slice': slicer_lib.stringify_slice_key(input_slices[0].slice_key),
      'Size': input_slices[0].num_examples,
      'Slice metric': input_slices[0].slice_metric,
      'Base metric': input_slices[0].base_metric,
      'P-Value': input_slices[0].p_value,
      'Effect size': input_slices[0].effect_size,
      str(additional_metric_keys[0]): 1923.57,
      str(additional_metric_keys[1]): 3895.65
  }, {
      'Slice': slicer_lib.stringify_slice_key(input_slices[1].slice_key),
      'Size': input_slices[1].num_examples,
      'Slice metric': input_slices[1].slice_metric,
      'Base metric': input_slices[1].base_metric,
      'P-Value': input_slices[1].p_value,
      'Effect size': input_slices[1].effect_size,
      str(additional_metric_keys[0]): 201.96,
      str(additional_metric_keys[1]): 548.46
  }]
  expected_dataframe_column_labels = [
      'Slice', 'Size', 'Slice metric', 'Base metric', 'P-Value',
      'Effect size',
      str(additional_metric_keys[0]),
      str(additional_metric_keys[1])
  ]
  expected_dataframe = pd.DataFrame(
      expected_dataframe_data, columns=expected_dataframe_column_labels)
  expected_dataframe.set_index('Slice', inplace=True)
  actual_dataframe = auto_slicing_util.get_slices_as_dataframe(
      input_slices, additional_metric_keys)
  assert_frame_equal(actual_dataframe, expected_dataframe)