예제 #1
0
  def test_jackknife_merge_jackknife_samples_small_samples(self):
    """Checks the small-samples counter fires only for under-sized groups."""
    metric_key = metric_types.MetricKey(u'metric')
    slice_key1 = (u'slice_feature', 1)
    slice_key2 = (u'slice_feature', 2)
    # The sample value is irrelevant for this test as we only verify counters.
    sample_value = {metric_key: 42}
    sliced_derived_metrics = [
        # Unsampled value for slice 1.
        ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
                       jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
                           metric_key: 2.1,
                           jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 16
                       }),
        # 5 sample values for slice 1.
        ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), sample_value),
        ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), sample_value),
        ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 2)), sample_value),
        ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 3)), sample_value),
        ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 4)), sample_value),
        # Unsampled value for slice 2.
        ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
                       jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
                           metric_key: 6.3,
                           jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 10000
                       }),
        # 5 sample values for slice 2.
        ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), sample_value),
        ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), sample_value),
        ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 2)), sample_value),
        ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 3)), sample_value),
        ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 4)), sample_value),
    ]

    # Build and run the pipeline exactly once. Combining `with beam.Pipeline()`
    # with an explicit `pipeline.run()` inside the block would execute the
    # pipeline twice, because Pipeline.__exit__ calls run() again on exit.
    pipeline = beam.Pipeline()
    _ = (
        pipeline
        | 'Create' >> beam.Create(sliced_derived_metrics, reshuffle=False)
        | 'MergeJackknifeSamples' >>
        jackknife.MergeJackknifeSamples(num_jackknife_samples=5))
    result = pipeline.run()
    # Block until completion so committed metrics are available even on
    # runners where run() is non-blocking.
    result.wait_until_finish()

    # We expect one bad jackknife samples counter increment for slice1.
    # slice1: num_samples=5, n=16, d=3.2, sqrt(n)=4, d < sqrt(n) = True
    # slice2: num_samples=5, n=10000, d=2000, sqrt(n)=100, d < sqrt(n) = False
    metric_filter = beam.metrics.metric.MetricsFilter().with_name(
        'num_slices_with_small_jackknife_samples')
    counters = result.metrics().query(filter=metric_filter)['counters']
    self.assertLen(counters, 1)
    self.assertEqual(1, counters[0].committed)

    # Verify the total slice counter saw both slices.
    metric_filter = beam.metrics.metric.MetricsFilter().with_name(
        'num_slices')
    counters = result.metrics().query(filter=metric_filter)['counters']
    self.assertLen(counters, 1)
    self.assertEqual(2, counters[0].committed)
예제 #2
0
  def test_jackknife_merge_jackknife_samples_missing_samples(self):
    """Checks the missing-samples counter fires when a slice lacks a sample."""
    metric_key = metric_types.MetricKey(u'metric')
    slice_key1 = (u'slice_feature', 1)
    slice_key2 = (u'slice_feature', 2)
    # The sample value is irrelevant for this test as we only verify counters.
    sample_value = {metric_key: 42}
    sliced_derived_metrics = [
        # Unsampled value for slice 1.
        ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
                       jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
                           metric_key: 2.1,
                           jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 16
                       }),
        # Both expected sample values for slice 1.
        ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), sample_value),
        ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), sample_value),
        # Unsampled value for slice 2.
        ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
                       jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
                           metric_key: 6.3,
                           jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 10000
                       }),
        # Only 1 sample value (missing sample ID 1) for slice 2.
        ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), sample_value),
    ]

    # Build and run the pipeline exactly once. Combining `with beam.Pipeline()`
    # with an explicit `pipeline.run()` inside the block would execute the
    # pipeline twice, because Pipeline.__exit__ calls run() again on exit.
    pipeline = beam.Pipeline()
    _ = (
        pipeline
        | 'Create' >> beam.Create(sliced_derived_metrics, reshuffle=False)
        | 'MergeJackknifeSamples' >>
        jackknife.MergeJackknifeSamples(num_jackknife_samples=2))
    result = pipeline.run()
    # Block until completion so committed metrics are available even on
    # runners where run() is non-blocking.
    result.wait_until_finish()

    # We expect one missing samples counter increment for slice2, since we
    # expected 2 samples, but only saw 1.
    metric_filter = beam.metrics.metric.MetricsFilter().with_name(
        'num_slices_missing_jackknife_samples')
    counters = result.metrics().query(filter=metric_filter)['counters']
    self.assertLen(counters, 1)
    self.assertEqual(1, counters[0].committed)

    # Verify the total slice counter saw both slices.
    metric_filter = beam.metrics.metric.MetricsFilter().with_name(
        'num_slices')
    counters = result.metrics().query(filter=metric_filter)['counters']
    self.assertLen(counters, 1)
    self.assertEqual(2, counters[0].committed)
예제 #3
0
    def test_jackknife_merge_jackknife_samples_numpy_overflow(self):
        """Merging very large integer sample values must not overflow."""
        rng = np.random.RandomState(seed=0)
        sample_values = rng.randint(0, 1e10, 20)
        slice_key = (u'slice_feature', 1)
        metric_key = metric_types.MetricKey(u'metric')
        # One unsampled (full-sample) entry plus one entry per jackknife
        # sample, each carrying a large random metric value.
        unsampled_entry = (
            (slice_key, (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
                         jackknife._JACKKNIFE_FULL_SAMPLE_ID)),
            {
                metric_key: 1,
                jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 200,
            })
        sliced_derived_metrics = [unsampled_entry] + [
            ((slice_key, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, sample_id)),
             {metric_key: value})
            for sample_id, value in enumerate(sample_values)
        ]
        with beam.Pipeline() as pipeline:
            result = (
                pipeline
                | 'Create' >> beam.Create(sliced_derived_metrics,
                                          reshuffle=False)
                | 'JackknifeCombinePerKey' >>
                jackknife.MergeJackknifeSamples(num_jackknife_samples=20))

            def check_result(got_pcoll):
                # Mean/stddev below are fixed by the seeded sample values.
                expected_pcoll = [
                    ((slice_key,), {
                        metric_key:
                        types.ValueWithTDistribution(
                            sample_mean=5293977041.15,
                            sample_standard_deviation=12845957824.018991,
                            sample_degrees_of_freedom=19,
                            unsampled_value=1),
                    }),
                ]
                self.assertCountEqual(expected_pcoll, got_pcoll)

            util.assert_that(result, check_result)
def _ComputePerSlice(  # pylint: disable=invalid-name
        sliced_extracts: beam.pvalue.PCollection,
        computations: List[metric_types.MetricComputation],
        derived_computations: List[metric_types.DerivedMetricComputation],
        cross_slice_specs: Optional[Iterable[config.CrossSlicingSpec]] = None,
        compute_with_sampling: Optional[bool] = False,
        num_jackknife_samples: int = 0,
        skip_ci_metric_keys: Set[metric_types.MetricKey] = frozenset(),
        random_seed_for_testing: Optional[int] = None,
        baseline_model_name: Optional[Text] = None) -> beam.pvalue.PCollection:
    """PTransform for computing, aggregating and combining metrics and plots.

    Args:
      sliced_extracts: Incoming PCollection consisting of slice key and
        extracts.
      computations: List of MetricComputations.
      derived_computations: List of DerivedMetricComputations.
      cross_slice_specs: List of CrossSlicingSpec.
      compute_with_sampling: True to compute with bootstrap sampling. This
        allows _ComputePerSlice to be used to generate unsampled values from
        the whole data set, as well as bootstrap resamples, in which each
        element is treated as if it showed up p ~ poission(1) times.
      num_jackknife_samples: number of delete-d jackknife estimates to use in
        computing standard errors on metrics. When 0, no jackknife-based
        confidence intervals are computed.
      skip_ci_metric_keys: List of metric keys for which to skip confidence
        interval computation.
      random_seed_for_testing: Seed to use for unit testing.
      baseline_model_name: Name for baseline model.

    Returns:
      PCollection of (slice key, dict of metrics).
    """
    # TODO(b/123516222): Remove this workaround per discussions in CL/227944001
    sliced_extracts.element_type = beam.typehints.Any

    def convert_and_add_derived_values(
        sliced_results: Tuple[slicer.SliceKeyType,
                              Tuple[metric_types.MetricsDict, ...]],
        derived_computations: List[metric_types.DerivedMetricComputation],
    ) -> Tuple[slicer.SliceKeyType, metric_types.MetricsDict]:
        """Converts per slice tuple of dicts into single dict and adds derived."""
        result = {}
        for v in sliced_results[1]:
            result.update(v)
        for c in derived_computations:
            # Each derived computation sees all metrics accumulated so far
            # (including earlier derived values) and contributes its own.
            result.update(c.result(result))
        # Remove private metrics: single-underscore names are internal-only,
        # while double-underscore names are deliberately kept.
        keys = list(result.keys())
        for k in keys:
            if k.name.startswith('_') and not k.name.startswith('__'):
                result.pop(k)
        return sliced_results[0], result

    def add_diff_metrics(
        sliced_metrics: Tuple[Union[slicer.SliceKeyType,
                                    slicer.CrossSliceKeyType],
                              Dict[metric_types.MetricKey, Any]],
        baseline_model_name: Optional[Text],
    ) -> Tuple[slicer.SliceKeyType, Dict[metric_types.MetricKey, Any]]:
        """Add diff metrics if there is a baseline model."""

        # Shallow copy so the incoming metrics dict is not mutated in place.
        result = copy.copy(sliced_metrics[1])

        if baseline_model_name:
            diff_result = {}
            for k, v in result.items():
                # Only diff non-baseline metrics that have a matching
                # baseline counterpart in this slice's results.
                if k.model_name != baseline_model_name and k.make_baseline_key(
                        baseline_model_name) in result:
                    # plots will not be diffed.
                    if not isinstance(v, message.Message):
                        diff_result[k.make_diff_key()] = v - result[
                            k.make_baseline_key(baseline_model_name)]
            result.update(diff_result)

        return (sliced_metrics[0], result)

    combiner = _ComputationsCombineFn(
        computations=computations,
        compute_with_sampling=compute_with_sampling,
        random_seed_for_testing=random_seed_for_testing)
    if num_jackknife_samples:
        # We do not use the hotkey fanout hint used by the non-jacknife path because
        # the random jackknife partitioning naturally mitigates hot keys.
        sliced_combiner_outputs = (
            sliced_extracts
            | 'JackknifeCombinePerSliceKey' >>
            jackknife.JackknifeCombinePerKey(combiner, num_jackknife_samples))
    else:
        sliced_combiner_outputs = (
            sliced_extracts
            | 'CombinePerSliceKey' >> beam.CombinePerKey(combiner).
            with_hot_key_fanout(_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT))

    sliced_derived_values_and_diffs = (
        sliced_combiner_outputs
        | 'ConvertAndAddDerivedValues' >> beam.Map(
            convert_and_add_derived_values, derived_computations)
        | 'AddCrossSliceMetrics' >> _AddCrossSliceMetrics(cross_slice_specs)  # pylint: disable=no-value-for-parameter
        | 'AddDiffMetrics' >> beam.Map(add_diff_metrics, baseline_model_name))

    if num_jackknife_samples:
        # Merge the per-sample metric values back into a single value per
        # slice, skipping CI computation for keys in skip_ci_metric_keys.
        return (sliced_derived_values_and_diffs
                | 'MergeJackknifeSamples' >> jackknife.MergeJackknifeSamples(
                    num_jackknife_samples, skip_ci_metric_keys))
    else:
        return sliced_derived_values_and_diffs
예제 #5
0
    def test_jackknife_merge_jackknife_samples(self):
        """End-to-end check of merged jackknife samples across two slices."""
        x_key = metric_types.MetricKey(u'x')
        y_key = metric_types.MetricKey(u'y')
        cm_key = metric_types.MetricKey(u'confusion_matrix')
        cm_metric = binary_confusion_matrices.Matrices(
            thresholds=[0.5],
            tp=[0],
            fp=[1],
            tn=[2],
            fn=[3],
            tp_examples=[],
            tn_examples=[],
            fp_examples=[],
            fn_examples=[])
        example_count_key = metric_types.MetricKey(
            example_count.EXAMPLE_COUNT_NAME)
        slice_key1 = (u'slice_feature', 1)
        slice_key2 = (u'slice_feature', 2)

        def unsampled_entry(slice_key, metrics):
            # Entry carrying the full-sample (unsampled) metric values.
            return ((slice_key, (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
                                 jackknife._JACKKNIFE_FULL_SAMPLE_ID)),
                    metrics)

        def sample_entry(slice_key, sample_id, metrics):
            # Entry carrying metric values for one jackknife sample.
            return ((slice_key,
                     (jackknife._JACKKNIFE_SAMPLE_ID_KEY, sample_id)),
                    metrics)

        sliced_derived_metrics = [
            unsampled_entry(
                slice_key1, {
                    x_key: 1.6,
                    y_key: 16,
                    cm_key: cm_metric,
                    example_count_key: 100,
                    jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 100,
                }),
            sample_entry(slice_key1, 0, {
                x_key: 1,
                y_key: 10,
                cm_key: cm_metric,
                example_count_key: 45,
            }),
            sample_entry(slice_key1, 1, {
                x_key: 2,
                y_key: 20,
                cm_key: cm_metric,
                example_count_key: 55,
            }),
            unsampled_entry(
                slice_key2, {
                    x_key: 3.3,
                    y_key: 33,
                    cm_key: cm_metric,
                    example_count_key: 1000,
                    jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 1000,
                }),
            sample_entry(slice_key2, 0, {
                x_key: 2,
                y_key: 20,
                cm_key: cm_metric,
                example_count_key: 450,
            }),
            sample_entry(slice_key2, 1, {
                x_key: 4,
                y_key: 40,
                cm_key: cm_metric,
                example_count_key: 550,
            }),
        ]

        with beam.Pipeline() as pipeline:
            result = (
                pipeline
                | 'Create' >> beam.Create(sliced_derived_metrics,
                                          reshuffle=False)
                | 'JackknifeCombinePerKey' >> jackknife.MergeJackknifeSamples(
                    num_jackknife_samples=2,
                    skip_ci_metric_keys=[example_count_key]))

            # For standard error calculations, see delete-d jackknife formula
            # in:
            # https://www.stat.berkeley.edu/~hhuang/STAT152/Jackknife-Bootstrap.pdf
            # Rather than normalize by all possible n-choose-d samples, we
            # normalize by the actual number of samples (2).
            def check_result(got_pcoll):
                expected_pcoll = [
                    ((slice_key1,), {
                        # (((100 - 100/2)/(100/2))*np.var([1, 2]))**0.5
                        x_key:
                        types.ValueWithTDistribution(
                            sample_mean=1.5,
                            sample_standard_deviation=.5,
                            sample_degrees_of_freedom=1,
                            unsampled_value=1.6),
                        # (((100 - 100/2)/(100/2))*np.var([10, 20]))**0.5
                        y_key:
                        types.ValueWithTDistribution(
                            sample_mean=15,
                            sample_standard_deviation=5,
                            sample_degrees_of_freedom=1,
                            unsampled_value=16),
                        cm_key: cm_metric,
                        # example_count_key is in skip_ci_metric_keys, so it
                        # keeps its raw unsampled value.
                        example_count_key: 100,
                    }),
                    ((slice_key2,), {
                        # (((1000 - 1000/2)/(1000/2))*np.var([2, 4]))**0.5
                        x_key:
                        types.ValueWithTDistribution(
                            sample_mean=3,
                            sample_standard_deviation=1,
                            sample_degrees_of_freedom=1,
                            unsampled_value=3.3),
                        # (((1000 - 1000/2)/(1000/2))*np.var([20, 40]))**0.5
                        y_key:
                        types.ValueWithTDistribution(
                            sample_mean=30,
                            sample_standard_deviation=10,
                            sample_degrees_of_freedom=1,
                            unsampled_value=33),
                        cm_key: cm_metric,
                        example_count_key: 1000,
                    }),
                ]
                self.assertCountEqual(expected_pcoll, got_pcoll)

            util.assert_that(result, check_result)