def test_jackknife_merge_jackknife_samples_small_samples(self):
  metric_key = metric_types.MetricKey(u'metric')
  slice_key1 = (u'slice_feature', 1)
  slice_key2 = (u'slice_feature', 2)
  # The sample value is irrelevant for this test as we only verify counters.
  sample_value = {metric_key: 42}
  sliced_derived_metrics = [
      # Unsampled value for slice 1.
      ((slice_key1,
        (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
         jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
             metric_key: 2.1,
             jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 16
         }),
      # 5 sample values for slice 1.
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), sample_value),
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), sample_value),
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 2)), sample_value),
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 3)), sample_value),
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 4)), sample_value),
      # Unsampled value for slice 2.
      ((slice_key2,
        (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
         jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
             metric_key: 6.3,
             jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 10000
         }),
      # 5 sample values for slice 2.
      ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), sample_value),
      ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), sample_value),
      ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 2)), sample_value),
      ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 3)), sample_value),
      ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 4)), sample_value),
  ]

  with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | 'Create' >> beam.Create(sliced_derived_metrics, reshuffle=False)
        | 'MergeJackknifeSamples' >>
        jackknife.MergeJackknifeSamples(num_jackknife_samples=5))

  result = pipeline.run()
  # We expect one small-samples counter increment for slice1, since its
  # delete-d group size falls below sqrt(n):
  # slice1: num_samples=5, n=16, d=3.2, sqrt(n)=4, d < sqrt(n) -> True
  # slice2: num_samples=5, n=10000, d=2000, sqrt(n)=100, d < sqrt(n) -> False
  metric_filter = beam.metrics.metric.MetricsFilter().with_name(
      'num_slices_with_small_jackknife_samples')
  counters = result.metrics().query(filter=metric_filter)['counters']
  self.assertLen(counters, 1)
  self.assertEqual(1, counters[0].committed)

  # Verify the total slice counter.
  metric_filter = beam.metrics.metric.MetricsFilter().with_name('num_slices')
  counters = result.metrics().query(filter=metric_filter)['counters']
  self.assertLen(counters, 1)
  self.assertEqual(2, counters[0].committed)
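
# For reference, a minimal sketch of the small-sample criterion asserted on
# above, reconstructed from the inline comments (the helper name is
# hypothetical, not part of the jackknife module):

import math

def _is_small_jackknife_sample(n, num_samples):
  d = n / num_samples  # delete-d group size
  return d < math.sqrt(n)

assert _is_small_jackknife_sample(16, 5)         # slice1: 3.2 < sqrt(16) = 4
assert not _is_small_jackknife_sample(10000, 5)  # slice2: 2000 >= sqrt(10000) = 100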
def test_jackknife_merge_jackknife_samples_missing_samples(self):
  metric_key = metric_types.MetricKey(u'metric')
  slice_key1 = (u'slice_feature', 1)
  slice_key2 = (u'slice_feature', 2)
  # The sample value is irrelevant for this test as we only verify counters.
  sample_value = {metric_key: 42}
  sliced_derived_metrics = [
      # Unsampled value for slice 1.
      ((slice_key1,
        (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
         jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
             metric_key: 2.1,
             jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 16
         }),
      # 2 sample values for slice 1.
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), sample_value),
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), sample_value),
      # Unsampled value for slice 2.
      ((slice_key2,
        (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
         jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
             metric_key: 6.3,
             jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 10000
         }),
      # Only 1 sample value (missing sample ID 1) for slice 2.
      ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), sample_value),
  ]

  with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | 'Create' >> beam.Create(sliced_derived_metrics, reshuffle=False)
        | 'MergeJackknifeSamples' >>
        jackknife.MergeJackknifeSamples(num_jackknife_samples=2))

  result = pipeline.run()
  # We expect one missing-samples counter increment for slice2, since we
  # expected 2 samples but only saw 1.
  metric_filter = beam.metrics.metric.MetricsFilter().with_name(
      'num_slices_missing_jackknife_samples')
  counters = result.metrics().query(filter=metric_filter)['counters']
  self.assertLen(counters, 1)
  self.assertEqual(1, counters[0].committed)

  # Verify the total slice counter.
  metric_filter = beam.metrics.metric.MetricsFilter().with_name('num_slices')
  counters = result.metrics().query(filter=metric_filter)['counters']
  self.assertLen(counters, 1)
  self.assertEqual(2, counters[0].committed)
def test_jackknife_merge_jackknife_samples_numpy_overflow(self):
  sample_values = np.random.RandomState(seed=0).randint(0, int(1e10), 20)
  slice_key = (u'slice_feature', 1)
  metric_key = metric_types.MetricKey(u'metric')
  sliced_derived_metrics = [
      ((slice_key,
        (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
         jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
             metric_key: 1,
             jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 200,
         })
  ]
  for sample_id, value in enumerate(sample_values):
    sliced_derived_metrics.append(
        ((slice_key, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, sample_id)), {
            metric_key: value,
        }))

  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | 'Create' >> beam.Create(sliced_derived_metrics, reshuffle=False)
        | 'MergeJackknifeSamples' >>
        jackknife.MergeJackknifeSamples(num_jackknife_samples=20))

    def check_result(got_pcoll):
      expected_pcoll = [
          ((slice_key,), {
              metric_key:
                  types.ValueWithTDistribution(
                      sample_mean=5293977041.15,
                      sample_standard_deviation=12845957824.018991,
                      sample_degrees_of_freedom=19,
                      unsampled_value=1),
          }),
      ]
      self.assertCountEqual(expected_pcoll, got_pcoll)

    util.assert_that(result, check_result)
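
# A minimal illustration (not TFMA code) of the failure mode this test guards
# against: squaring metric values near 1e10 exceeds the int64 range, so the
# jackknife variance computation must accumulate in float64 rather than in
# numpy integer dtypes.

import numpy as np

big = np.int64(10**10)
print(np.iinfo(np.int64).max)   # 9223372036854775807, i.e. ~9.2e18
print(float(big) * float(big))  # 1e+20, representable in float64
print(big * big)                # wraps modulo 2**64 (may emit a RuntimeWarning)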
def _ComputePerSlice(  # pylint: disable=invalid-name
    sliced_extracts: beam.pvalue.PCollection,
    computations: List[metric_types.MetricComputation],
    derived_computations: List[metric_types.DerivedMetricComputation],
    cross_slice_specs: Optional[Iterable[config.CrossSlicingSpec]] = None,
    compute_with_sampling: Optional[bool] = False,
    num_jackknife_samples: int = 0,
    skip_ci_metric_keys: Set[metric_types.MetricKey] = frozenset(),
    random_seed_for_testing: Optional[int] = None,
    baseline_model_name: Optional[Text] = None) -> beam.pvalue.PCollection:
  """PTransform for computing, aggregating and combining metrics and plots.

  Args:
    sliced_extracts: Incoming PCollection consisting of slice key and extracts.
    computations: List of MetricComputations.
    derived_computations: List of DerivedMetricComputations.
    cross_slice_specs: List of CrossSlicingSpecs.
    compute_with_sampling: True to compute with bootstrap sampling. This allows
      _ComputePerSlice to be used to generate unsampled values from the whole
      data set, as well as bootstrap resamples, in which each element is
      treated as if it showed up p ~ poisson(1) times.
    num_jackknife_samples: Number of delete-d jackknife estimates to use in
      computing standard errors on metrics.
    skip_ci_metric_keys: Set of metric keys for which to skip confidence
      interval computation.
    random_seed_for_testing: Seed to use for unit testing.
    baseline_model_name: Name of the baseline model.

  Returns:
    PCollection of (slice key, dict of metrics).
  """
  # TODO(b/123516222): Remove this workaround per discussions in CL/227944001
  sliced_extracts.element_type = beam.typehints.Any

  def convert_and_add_derived_values(
      sliced_results: Tuple[slicer.SliceKeyType,
                            Tuple[metric_types.MetricsDict, ...]],
      derived_computations: List[metric_types.DerivedMetricComputation],
  ) -> Tuple[slicer.SliceKeyType, metric_types.MetricsDict]:
    """Converts per-slice tuple of dicts into a single dict and adds derived."""
    result = {}
    for v in sliced_results[1]:
      result.update(v)
    for c in derived_computations:
      result.update(c.result(result))
    # Remove private metrics.
    keys = list(result.keys())
    for k in keys:
      if k.name.startswith('_') and not k.name.startswith('__'):
        result.pop(k)
    return sliced_results[0], result

  def add_diff_metrics(
      sliced_metrics: Tuple[Union[slicer.SliceKeyType,
                                  slicer.CrossSliceKeyType],
                            Dict[metric_types.MetricKey, Any]],
      baseline_model_name: Optional[Text],
  ) -> Tuple[slicer.SliceKeyType, Dict[metric_types.MetricKey, Any]]:
    """Adds diff metrics if there is a baseline model."""
    result = copy.copy(sliced_metrics[1])
    if baseline_model_name:
      diff_result = {}
      for k, v in result.items():
        if k.model_name != baseline_model_name and k.make_baseline_key(
            baseline_model_name) in result:
          # Plots will not be diffed.
          if not isinstance(v, message.Message):
            diff_result[k.make_diff_key()] = v - result[k.make_baseline_key(
                baseline_model_name)]
      result.update(diff_result)
    return (sliced_metrics[0], result)

  combiner = _ComputationsCombineFn(
      computations=computations,
      compute_with_sampling=compute_with_sampling,
      random_seed_for_testing=random_seed_for_testing)
  if num_jackknife_samples:
    # We do not use the hot-key fanout hint used by the non-jackknife path
    # because the random jackknife partitioning naturally mitigates hot keys.
    sliced_combiner_outputs = (
        sliced_extracts
        | 'JackknifeCombinePerSliceKey' >> jackknife.JackknifeCombinePerKey(
            combiner, num_jackknife_samples))
  else:
    sliced_combiner_outputs = (
        sliced_extracts
        | 'CombinePerSliceKey' >> beam.CombinePerKey(combiner)
        .with_hot_key_fanout(_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT))

  sliced_derived_values_and_diffs = (
      sliced_combiner_outputs
      | 'ConvertAndAddDerivedValues' >> beam.Map(
          convert_and_add_derived_values, derived_computations)
      | 'AddCrossSliceMetrics' >> _AddCrossSliceMetrics(cross_slice_specs)  # pylint: disable=no-value-for-parameter
      | 'AddDiffMetrics' >> beam.Map(add_diff_metrics, baseline_model_name))

  if num_jackknife_samples:
    return (sliced_derived_values_and_diffs
            | 'MergeJackknifeSamples' >> jackknife.MergeJackknifeSamples(
                num_jackknife_samples, skip_ci_metric_keys))
  else:
    return sliced_derived_values_and_diffs
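
# A minimal sketch (an illustration, not the TFMA implementation) of the
# Poisson bootstrap mentioned in the docstring above: each element is treated
# as if it appeared k ~ Poisson(1) times in the resample.

import numpy as np

rng = np.random.RandomState(0)
elements = ['a', 'b', 'c', 'd']
counts = rng.poisson(lam=1.0, size=len(elements))  # per-element multiplicities
resample = [e for e, k in zip(elements, counts) for _ in range(k)]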
def test_jackknife_merge_jackknife_samples(self):
  x_key = metric_types.MetricKey(u'x')
  y_key = metric_types.MetricKey(u'y')
  cm_key = metric_types.MetricKey(u'confusion_matrix')
  cm_metric = binary_confusion_matrices.Matrices(
      thresholds=[0.5],
      tp=[0],
      fp=[1],
      tn=[2],
      fn=[3],
      tp_examples=[],
      tn_examples=[],
      fp_examples=[],
      fn_examples=[])
  example_count_key = metric_types.MetricKey(example_count.EXAMPLE_COUNT_NAME)
  slice_key1 = (u'slice_feature', 1)
  slice_key2 = (u'slice_feature', 2)
  sliced_derived_metrics = [
      # Unsampled value for slice 1.
      ((slice_key1,
        (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
         jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
             x_key: 1.6,
             y_key: 16,
             cm_key: cm_metric,
             example_count_key: 100,
             jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 100
         }),
      # Sample value 1 of 2 for slice 1.
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), {
          x_key: 1,
          y_key: 10,
          cm_key: cm_metric,
          example_count_key: 45,
      }),
      # Sample value 2 of 2 for slice 1.
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), {
          x_key: 2,
          y_key: 20,
          cm_key: cm_metric,
          example_count_key: 55,
      }),
      # Unsampled value for slice 2.
      ((slice_key2,
        (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
         jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
             x_key: 3.3,
             y_key: 33,
             cm_key: cm_metric,
             example_count_key: 1000,
             jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 1000
         }),
      # Sample value 1 of 2 for slice 2.
      ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), {
          x_key: 2,
          y_key: 20,
          cm_key: cm_metric,
          example_count_key: 450,
      }),
      # Sample value 2 of 2 for slice 2.
      ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), {
          x_key: 4,
          y_key: 40,
          cm_key: cm_metric,
          example_count_key: 550,
      }),
  ]

  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | 'Create' >> beam.Create(sliced_derived_metrics, reshuffle=False)
        | 'MergeJackknifeSamples' >> jackknife.MergeJackknifeSamples(
            num_jackknife_samples=2, skip_ci_metric_keys=[example_count_key]))

    # For the standard error calculations, see the delete-d jackknife formula
    # in https://www.stat.berkeley.edu/~hhuang/STAT152/Jackknife-Bootstrap.pdf.
    # Rather than normalize by all possible n-choose-d samples, we normalize
    # by the actual number of samples (2).
    def check_result(got_pcoll):
      expected_pcoll = [
          ((slice_key1,), {
              x_key:
                  types.ValueWithTDistribution(
                      sample_mean=1.5,
                      # (((100 - 100/2)/(100/2)) * np.var([1, 2]))**0.5
                      sample_standard_deviation=.5,
                      sample_degrees_of_freedom=1,
                      unsampled_value=1.6),
              y_key:
                  types.ValueWithTDistribution(
                      sample_mean=15,
                      # (((100 - 100/2)/(100/2)) * np.var([10, 20]))**0.5
                      sample_standard_deviation=5,
                      sample_degrees_of_freedom=1,
                      unsampled_value=16),
              cm_key: cm_metric,
              example_count_key: 100,
          }),
          ((slice_key2,), {
              x_key:
                  types.ValueWithTDistribution(
                      sample_mean=3,
                      # (((1000 - 1000/2)/(1000/2)) * np.var([2, 4]))**0.5
                      sample_standard_deviation=1,
                      sample_degrees_of_freedom=1,
                      unsampled_value=3.3),
              y_key:
                  types.ValueWithTDistribution(
                      sample_mean=30,
                      # (((1000 - 1000/2)/(1000/2)) * np.var([20, 40]))**0.5
                      sample_standard_deviation=10,
                      sample_degrees_of_freedom=1,
                      unsampled_value=33),
              cm_key: cm_metric,
              example_count_key: 1000,
          }),
      ]
      self.assertCountEqual(expected_pcoll, got_pcoll)

    util.assert_that(result, check_result)
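
# A quick numeric check of the delete-d standard errors used in the expected
# values above, using the same formula as the inline comments (nothing new
# assumed):

import numpy as np

n, num_samples = 100, 2
d = n / num_samples  # delete-d group size: 50
scale = (n - d) / d  # delete-d jackknife scaling factor: 1.0
assert (scale * np.var([1, 2]))**0.5 == 0.5    # x_key, slice 1
assert (scale * np.var([10, 20]))**0.5 == 5.0  # y_key, slice 1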