Example #1
def _calibration_plot(
    num_buckets: int = DEFAULT_NUM_BUCKETS,
    left: Optional[float] = None,
    right: Optional[float] = None,
    name: Text = CALIBRATION_PLOT_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    schema: Optional[schema_pb2.Schema] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for calibration plot."""
  key = metric_types.PlotKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  label_left, label_right = None, None
  if (left is None or right is None) and eval_config and schema:
    label_left, label_right = _find_label_domain(eval_config, schema,
                                                 model_name, output_name)
  if left is None:
    left = label_left if label_left is not None else 0.0
  if right is None:
    right = label_right if label_right is not None else 1.0

  # Make sure calibration histogram is calculated. Note we are using the default
  # number of buckets assigned to the histogram instead of the value used for
  # the plots just in case the computation is shared with other metrics and
  # plots that need higher precision. It will be downsampled later.
  computations = calibration_histogram.calibration_histogram(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      left=left,
      right=right,
      aggregation_type=aggregation_type,
      class_weights=class_weights)
  histogram_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    thresholds = [
        left + i * (right - left) / num_buckets for i in range(num_buckets + 1)
    ]
    thresholds = [float('-inf')] + thresholds
    histogram = calibration_histogram.rebin(
        thresholds, metrics[histogram_key], left=left, right=right)
    return {key: _to_proto(thresholds, histogram)}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
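To make the threshold construction inside result() concrete, here is a minimal standalone sketch (plain Python, no TFMA imports). The values of num_buckets, left and right are arbitrary illustrative choices, not DEFAULT_NUM_BUCKETS or a real label domain:

# Illustrative only: mirrors the threshold list built in result() above.
num_buckets = 4
left, right = 0.0, 1.0

thresholds = [
    left + i * (right - left) / num_buckets for i in range(num_buckets + 1)
]
# A -inf threshold is prepended so that predictions below `left` still land in
# a bucket when the histogram is rebinned.
thresholds = [float('-inf')] + thresholds

print(thresholds)  # [-inf, 0.0, 0.25, 0.5, 0.75, 1.0]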
Example #2
def _calibration_plot(
    num_buckets: int = DEFAULT_NUM_BUCKETS,
    left: float = 0.0,
    right: float = 1.0,
    name: Text = CALIBRATION_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for calibration plot."""
    key = metric_types.PlotKey(name=name,
                               model_name=model_name,
                               output_name=output_name,
                               sub_key=sub_key)

    # Make sure calibration histogram is calculated. Note we are using the default
    # number of buckets assigned to the histogram instead of the value used for
    # the plots just in case the computation is shared with other metrics and
    # plots that need higher precision. It will be downsampled later.
    computations = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        left=left,
        right=right)
    histogram_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        thresholds = [
            left + i * (right - left) / num_buckets
            for i in range(num_buckets + 1)
        ]
        thresholds = [float('-inf')] + thresholds
        histogram = calibration_histogram.rebin(thresholds,
                                                metrics[histogram_key],
                                                left=left,
                                                right=right)
        return {key: _to_proto(thresholds, histogram)}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations.append(derived_computation)
    return computations
Example #3
    def testCalibrationHistogram(self):
        histogram = calibration_histogram.calibration_histogram()[0]

        example1 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.2]),
            'example_weights': np.array([1.0])
        }
        example2 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.8]),
            'example_weights': np.array([2.0])
        }
        example3 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.5]),
            'example_weights': np.array([3.0])
        }
        example4 = {
            'labels': np.array([1.0]),
            'predictions': np.array([-0.1]),
            'example_weights': np.array([4.0])
        }
        example5 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.5]),
            'example_weights': np.array([5.0])
        }
        example6 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.8]),
            'example_weights': np.array([6.0])
        }
        example7 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.2]),
            'example_weights': np.array([7.0])
        }
        example8 = {
            'labels': np.array([1.0]),
            'predictions': np.array([1.1]),
            'example_weights': np.array([8.0])
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create([
                    example1, example2, example3, example4, example5, example6,
                    example7, example8
                ])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner))

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_plots = got[0]
                    self.assertEqual(got_slice_key, ())
                    self.assertLen(got_plots, 1)
                    key = metric_types.PlotKey('_calibration_histogram_10000')
                    self.assertIn(key, got_plots)
                    got_histogram = got_plots[key]
                    self.assertLen(got_histogram, 5)
                    self.assertEqual(
                        got_histogram[0],
                        calibration_histogram.Bucket(
                            bucket_id=0,
                            weighted_labels=1.0 * 4.0,
                            weighted_predictions=-0.1 * 4.0,
                            weighted_examples=4.0))
                    self.assertEqual(
                        got_histogram[1],
                        calibration_histogram.Bucket(
                            bucket_id=2001,
                            weighted_labels=0.0 + 0.0,
                            weighted_predictions=0.2 + 7 * 0.2,
                            weighted_examples=1.0 + 7.0))
                    self.assertEqual(
                        got_histogram[2],
                        calibration_histogram.Bucket(
                            bucket_id=5001,
                            weighted_labels=1.0 * 5.0,
                            weighted_predictions=0.5 * 3.0 + 0.5 * 5.0,
                            weighted_examples=3.0 + 5.0))
                    self.assertEqual(
                        got_histogram[3],
                        calibration_histogram.Bucket(
                            bucket_id=8001,
                            weighted_labels=1.0 * 2.0 + 1.0 * 6.0,
                            weighted_predictions=0.8 * 2.0 + 0.8 * 6.0,
                            weighted_examples=2.0 + 6.0))
                    self.assertEqual(
                        got_histogram[4],
                        calibration_histogram.Bucket(bucket_id=10001,
                                                     weighted_labels=1.0 * 8.0,
                                                     weighted_predictions=1.1 *
                                                     8.0,
                                                     weighted_examples=8.0))

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
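The bucket ids asserted above (0.2 -> 2001, 0.5 -> 5001, 0.8 -> 8001, with out-of-range predictions landing in the boundary buckets 0 and 10001) are consistent with the bucketing rule sketched below. This is inferred from the test expectations only; the bucket_id helper is hypothetical, not the actual calibration_histogram implementation:

import math

def bucket_id(prediction, num_buckets=10000, left=0.0, right=1.0):
  # Hypothetical reconstruction: bucket 0 catches predictions below `left`,
  # bucket num_buckets + 1 catches predictions at or above `right`, and
  # everything in between maps to buckets 1..num_buckets.
  scaled = (prediction - left) / (right - left)
  return min(max(int(math.floor(scaled * num_buckets)) + 1, 0), num_buckets + 1)

assert bucket_id(-0.1) == 0
assert bucket_id(0.2) == 2001
assert bucket_id(0.5) == 5001
assert bucket_id(0.8) == 8001
assert bucket_id(1.1) == 10001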
Example #4
def binary_confusion_matrices(
        num_thresholds: Optional[int] = None,
        thresholds: Optional[List[float]] = None,
        name: Optional[Text] = None,
        eval_config: Optional[config.EvalConfig] = None,
        model_name: Text = '',
        output_name: Text = '',
        sub_key: Optional[metric_types.SubKey] = None,
        aggregation_type: Optional[metric_types.AggregationType] = None,
        class_weights: Optional[Dict[int, float]] = None,
        use_histogram: Optional[bool] = None,
        extract_label_prediction_and_weight: Optional[Callable[
            ..., Any]] = metric_util.to_label_prediction_example_weight,
        preprocessor: Optional[Callable[..., Any]] = None,
        example_id_key: Optional[Text] = None,
        example_ids_count: Optional[int] = None,
        fractional_labels: bool = True) -> metric_types.MetricComputations:
    """Returns metric computations for computing binary confusion matrices.

  Args:
    num_thresholds: Number of thresholds to use. Thresholds will be calculated
      using linear interpolation between 0.0 and 1.0 with equidistant values and
      boundaries at -epsilon and 1.0+epsilon. Values must be > 0. Only one of
      num_thresholds or thresholds should be used. If used, num_thresholds must
      be > 1.
    thresholds: A specific set of thresholds to use. The caller is responsible
      for marking the boundaries with +/-epsilon if desired. Only one of
      num_thresholds or thresholds should be used. For metrics computed at top k
      this may be a single negative threshold value (i.e. -inf).
    name: Metric name.
    eval_config: Eval config.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    sub_key: Optional sub key.
    aggregation_type: Optional aggregation type.
    class_weights: Optional class weights to apply to multi-class / multi-label
      labels and predictions prior to flattening (when micro averaging is used).
    use_histogram: If true, matrices will be derived from calibration
      histograms.
    extract_label_prediction_and_weight: User-provided function argument that
      yields label, prediction, and example weights for use in calculations
      (relevant only when use_histogram flag is not true).
    preprocessor: User-provided preprocessor for including additional extracts
      in StandardMetricInputs (relevant only when use_histogram flag is not
      true).
    example_id_key: Feature key containing example id (relevant only when
      use_histogram flag is not true).
    example_ids_count: Max number of example ids to be extracted for false
      positives and false negatives (relevant only when use_histogram flag is
      not true).
    fractional_labels: If true, each incoming tuple of (label, prediction, and
      example weight) will be split into two tuples as follows (where l, p, w
      represent the resulting label, prediction, and example weight values):
        (1) l = 0.0, p = prediction, and w = example_weight * (1.0 - label)
        (2) l = 1.0, p = prediction, and w = example_weight * label
      If enabled, an exception will be raised if labels are not within [0, 1].
      The implementation is such that tuples associated with a weight of zero
      are not yielded. This means it is safe to enable fractional_labels even
      when the labels only take on the values of 0.0 or 1.0.

  Raises:
    ValueError: If both num_thresholds and thresholds are set at the same time.
  """
    if num_thresholds is not None and thresholds is not None:
        raise ValueError(
            'only one of thresholds or num_thresholds can be set at a time')
    if num_thresholds is None and thresholds is None:
        num_thresholds = DEFAULT_NUM_THRESHOLDS
    # Keras AUC turns its num_thresholds parameter into explicit thresholds, which
    # circumvents sharing of settings. If the thresholds match the interpolated
    # version of the thresholds then reset back to num_thresholds.
    if (name is None and thresholds
            and thresholds == _interpolated_thresholds(len(thresholds))):
        num_thresholds = len(thresholds)
        thresholds = None
    if num_thresholds is not None:
        if num_thresholds <= 1:
            raise ValueError('num_thresholds must be > 1')
        # The interpolation strategy used here matches that used by keras for AUC.
        thresholds = _interpolated_thresholds(num_thresholds)
        if name is None:
            name = '{}_{}'.format(BINARY_CONFUSION_MATRICES_NAME,
                                  num_thresholds)
    elif name is None:
        name = '{}_{}'.format(BINARY_CONFUSION_MATRICES_NAME, list(thresholds))

    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    computations = []
    metric_key = None

    if use_histogram is None:
        use_histogram = (num_thresholds is not None
                         or (len(thresholds) == 1 and thresholds[0] < 0))

    if use_histogram:
        # Use calibration histogram to calculate matrices. For efficiency (unless
        # all predictions are matched - i.e. thresholds <= 0) we will assume that
        # other metrics will make use of the calibration histogram and re-use the
        # default histogram for the given model_name/output_name/sub_key. This is
        # also required to get accurate counts at the threshold boundaries. If this
        # becomes an issue, then calibration histogram can be updated to support
        # non-linear boundaries.
        computations = calibration_histogram.calibration_histogram(
            eval_config=eval_config,
            num_buckets=(
                # For precision/recall_at_k, where a single large negative threshold
                # is used, we only need one bucket. Note that the histogram will
                # actually have 2 buckets: one that we set (which handles
                # predictions > -1.0) and a default catch-all bucket (i.e. bucket 0)
                # that the histogram creates for large negative predictions (i.e.
                # predictions <= -1.0).
                1 if len(thresholds) == 1 and thresholds[0] <= 0 else None),
            model_name=model_name,
            output_name=output_name,
            sub_key=sub_key,
            aggregation_type=aggregation_type,
            class_weights=class_weights)
        metric_key = computations[-1].keys[-1]
    else:
        computations = _binary_confusion_matrix_computation(
            eval_config=eval_config,
            thresholds=thresholds,
            model_name=model_name,
            output_name=output_name,
            sub_key=sub_key,
            extract_label_prediction_and_weight=
            extract_label_prediction_and_weight,
            preprocessor=preprocessor,
            example_id_key=example_id_key,
            example_ids_count=example_ids_count,
            aggregation_type=aggregation_type,
            class_weights=class_weights,
            fractional_labels=fractional_labels)
        metric_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Matrices]:
        """Returns binary confusion matrices."""
        matrices = None
        if use_histogram:
            if len(thresholds) == 1 and thresholds[0] < 0:
                # This case is used when all positive prediction values are relevant
                # matches (e.g. when calculating top_k for precision/recall where the
                # non-top_k values are expected to have been set to float('-inf')).
                histogram = metrics[metric_key]
            else:
                # Calibration histogram uses intervals of the form [start, end) where
                # the prediction >= start. The confusion matrices want intervals of the
                # form (start, end] where the prediction > start. Add a small epsilon so
                # that >= checks don't match. This correction shouldn't be needed in
                # practice but allows for correctness in small tests.
                rebin_thresholds = [
                    t + _EPSILON if t != 0 else t for t in thresholds
                ]
                if thresholds[0] >= 0:
                    # Add -epsilon bucket to account for differences in histogram vs
                    # confusion matrix intervals mentioned above. If the epsilon bucket is
                    # missing the false negatives and false positives will be 0 for the
                    # first threshold.
                    rebin_thresholds = [-_EPSILON] + rebin_thresholds
                if thresholds[-1] < 1.0:
                    # If the last threshold < 1.0, then add a fence post at 1.0 + epsilon
                    # otherwise true negatives and true positives will be overcounted.
                    rebin_thresholds = rebin_thresholds + [1.0 + _EPSILON]
                histogram = calibration_histogram.rebin(
                    rebin_thresholds, metrics[metric_key])
            matrices = _historgram_to_binary_confusion_matrices(
                thresholds, histogram)
        else:
            matrices = _matrix_to_binary_confusion_matrices(
                thresholds, metrics[metric_key])
        return {key: matrices}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations.append(derived_computation)
    return computations
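The helper _interpolated_thresholds is not shown in this example. Based on the inlined computation in the later variants of this function (and the comment that the interpolation matches keras AUC), it presumably behaves roughly like the sketch below; the _EPSILON value here is an assumption:

_EPSILON = 1e-7  # assumed value; the real constant is defined elsewhere

def _interpolated_thresholds(num_thresholds):
  # Equidistant interior thresholds plus -epsilon / 1.0 + epsilon fence posts,
  # mirroring the inline version used in the later examples.
  thresholds = [
      (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
  ]
  return [-_EPSILON] + thresholds + [1.0 + _EPSILON]

print(_interpolated_thresholds(5))  # approximately [-1e-07, 0.25, 0.5, 0.75, 1.0000001]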
Example #5
def binary_confusion_matrices(
    num_thresholds: Optional[int] = None,
    thresholds: Optional[List[float]] = None,
    name: Text = BINARY_CONFUSION_MATRICES_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing binary confusion matrices.

  Args:
    num_thresholds: Number of thresholds to use. Thresholds will be calculated
      using linear interpolation between 0.0 and 1.0 with equidistant values and
      boundaries at -epsilon and 1.0+epsilon. Values must be > 0. Only one of
      num_thresholds or thresholds should be used.
    thresholds: A specific set of thresholds to use. The caller is responsible
      for marking the boundaries with +/-epsilon if desired. Only one of
      num_thresholds or thresholds should be used.
    name: Metric name.
    eval_config: Eval config.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    sub_key: Optional sub key.
    class_weights: Optional class weights to apply to multi-class / multi-label
      labels and predictions prior to flattening (when micro averaging is used).

  Raises:
    ValueError: If both num_thresholds and thresholds are set at the same time.
  """
    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    if num_thresholds is not None and thresholds is not None:
        raise ValueError(
            'only one of thresholds or num_thresholds can be set at a time')
    if num_thresholds is None and thresholds is None:
        num_thresholds = DEFAULT_NUM_THRESHOLDS
    if num_thresholds is not None:
        # The interpolation strategy used here matches that used by keras for AUC.
        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
                      for i in range(num_thresholds - 2)]
        thresholds = [-_EPSILON] + thresholds + [1.0 + _EPSILON]

    # Use calibration histogram to calculate matrices. For efficiency (unless all
    # predictions are matched - i.e. thresholds <= 0) we will assume that other
    # metrics will make use of the calibration histogram and re-use the default
    # histogram for the given model_name/output_name/sub_key. This is also
    # required to get accurate counts at the threshold boundaries. If this becomes
    # an issue, then calibration histogram can be updated to support non-linear
    # boundaries.
    num_buckets = 1 if len(thresholds) == 1 and thresholds[0] <= 0 else None
    histogram_computations = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        num_buckets=num_buckets,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        class_weights=class_weights)
    histogram_key = histogram_computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Matrices]:
        """Returns binary confusion matrices."""
        # Calibration histogram uses intervals of the form [start, end) where the
        # prediction >= start. The confusion matrices want intervals of the form
        # (start, end] where the prediction > start. Add a small epsilon so that >=
        # checks don't match. This correction shouldn't be needed in practice but
        # allows for correctness in small tests.
        if len(thresholds) == 1:
            # When there is only one threshold, we need to make adjustments so that
            # we have proper boundaries around the threshold for <, >= comparisons.
            if thresholds[0] < 0:
                # This case is used when all prediction values are considered matches
                # (e.g. when calculating top_k for precision/recall).
                rebin_thresholds = [thresholds[0], thresholds[0] + _EPSILON]
            else:
                # This case is used for a single threshold within [0, 1] (e.g. 0.5).
                rebin_thresholds = [
                    -_EPSILON, thresholds[0] + _EPSILON, 1.0 + _EPSILON
                ]
        else:
            rebin_thresholds = ([thresholds[0]] +
                                [t + _EPSILON for t in thresholds[1:]])
        histogram = calibration_histogram.rebin(rebin_thresholds,
                                                metrics[histogram_key])
        matrices = _to_binary_confusion_matrices(thresholds, histogram)
        if len(thresholds) == 1:
            # Reset back to 1 bucket
            matrices = Matrices(thresholds,
                                tp=[matrices.tp[1]],
                                fp=[matrices.fp[1]],
                                tn=[matrices.tn[1]],
                                fn=[matrices.fn[1]])
        return {key: matrices}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = histogram_computations
    computations.append(derived_computation)
    return computations
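To make the single-threshold branch of result() concrete, here is a worked example for a hypothetical threshold of 0.5; the _EPSILON value is assumed, the real constant is defined elsewhere in the module:

_EPSILON = 1e-7  # assumed value

thresholds = [0.5]
if thresholds[0] < 0:
  rebin_thresholds = [thresholds[0], thresholds[0] + _EPSILON]
else:
  rebin_thresholds = [-_EPSILON, thresholds[0] + _EPSILON, 1.0 + _EPSILON]
print(rebin_thresholds)  # approximately [-1e-07, 0.5000001, 1.0000001]
# The matrices computed from the rebinned histogram are then sliced at index 1
# so that only the counts for the 0.5 threshold itself are returned.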
Example #6
def binary_confusion_matrices(
    num_thresholds: Optional[int] = None,
    thresholds: Optional[List[float]] = None,
    name: Text = BINARY_CONFUSION_MATRICES_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing binary confusion matrices.

  Args:
    num_thresholds: Number of thresholds to use. Thresholds will be calculated
      using linear interpolation between 0.0 and 1.0 with equidistant values and
      boundaries at -epsilon and 1.0+epsilon. Values must be > 0. Only one of
      num_thresholds or thresholds should be used. If used, num_thresholds must
      be > 1.
    thresholds: A specific set of thresholds to use. The caller is responsible
      for marking the boundaries with +/-epsilon if desired. Only one of
      num_thresholds or thresholds should be used. For metrics computed at top k
      this may be a single negative threshold value (i.e. -inf).
    name: Metric name.
    eval_config: Eval config.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    sub_key: Optional sub key.
    aggregation_type: Optional aggregation type.
    class_weights: Optional class weights to apply to multi-class / multi-label
      labels and predictions prior to flattening (when micro averaging is used).

  Raises:
    ValueError: If both num_thresholds and thresholds are set at the same time.
  """
    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    if num_thresholds is not None and thresholds is not None:
        raise ValueError(
            'only one of thresholds or num_thresholds can be set at a time')
    if num_thresholds is None and thresholds is None:
        num_thresholds = DEFAULT_NUM_THRESHOLDS
    if num_thresholds is not None:
        if num_thresholds <= 1:
            raise ValueError('num_thresholds must be > 1')
        # The interpolation strategy used here matches that used by keras for AUC.
        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
                      for i in range(num_thresholds - 2)]
        thresholds = [-_EPSILON] + thresholds + [1.0 + _EPSILON]

    # Use calibration histogram to calculate matrices. For efficiency (unless all
    # predictions are matched - i.e. thresholds <= 0) we will assume that other
    # metrics will make use of the calibration histogram and re-use the default
    # histogram for the given model_name/output_name/sub_key. This is also
    # required to get accurate counts at the threshold boundaries. If this becomes
    # an issue, then calibration histogram can be updated to support non-linear
    # boundaries.
    histogram_computations = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        num_buckets=(
            # For precision/recall_at_k, where a single large negative threshold is
            # used, we only need one bucket. Note that the histogram will actually
            # have 2 buckets: one that we set (which handles predictions > -1.0)
            # and a default catch-all bucket (i.e. bucket 0) that the histogram
            # creates for large negative predictions (i.e. predictions <= -1.0).
            1 if len(thresholds) == 1 and thresholds[0] <= 0 else None),
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights)
    histogram_key = histogram_computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Matrices]:
        """Returns binary confusion matrices."""
        if len(thresholds) == 1 and thresholds[0] < 0:
            # This case is used when all positive prediction values are considered
            # matches (e.g. when calculating top_k for precision/recall where the
            # non-top_k values are expected to have been set to float('-inf')).
            histogram = metrics[histogram_key]
        else:
            # Calibration histogram uses intervals of the form [start, end) where the
            # prediction >= start. The confusion matrices want intervals of the form
            # (start, end] where the prediction > start. Add a small epsilon so that
            # >= checks don't match. This correction shouldn't be needed in practice
            # but allows for correctness in small tests.
            rebin_thresholds = [
                t + _EPSILON if t != 0 else t for t in thresholds
            ]
            if thresholds[0] >= 0:
                # Add -epsilon bucket to account for differences in histogram vs
                # confusion matrix intervals mentioned above. If the epsilon bucket is
                # missing the false negatives and false positives will be 0 for the
                # first threshold.
                rebin_thresholds = [-_EPSILON] + rebin_thresholds
            if thresholds[-1] < 1.0:
                # If the last threshold < 1.0, then add a fence post at 1.0 + epsilon
                # otherwise true negatives and true positives will be overcounted.
                rebin_thresholds = rebin_thresholds + [1.0 + _EPSILON]
            histogram = calibration_histogram.rebin(rebin_thresholds,
                                                    metrics[histogram_key])
        matrices = _to_binary_confusion_matrices(thresholds, histogram)
        return {key: matrices}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = histogram_computations
    computations.append(derived_computation)
    return computations
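For the default interpolated thresholds, the rebin adjustment in result() reduces to an epsilon shift of the interior thresholds. A small worked example, assuming _EPSILON = 1e-7 and the thresholds produced for num_thresholds = 5:

_EPSILON = 1e-7  # assumed value

thresholds = [-_EPSILON, 0.25, 0.5, 0.75, 1.0 + _EPSILON]
rebin_thresholds = [t + _EPSILON if t != 0 else t for t in thresholds]
# thresholds[0] < 0 and thresholds[-1] >= 1.0, so neither the -epsilon bucket
# nor the 1.0 + epsilon fence post needs to be added in this case.
print(rebin_thresholds)  # approximately [0.0, 0.2500001, 0.5000001, 0.7500001, 1.0000002]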
Example #7
def _lift_metrics(
    num_buckets: Optional[int] = None,
    left: Optional[float] = None,
    right: Optional[float] = None,
    name: Optional[str] = None,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    aggregation_type: Optional[metric_types.AggregationType] = None,
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None,
    example_weighted: bool = False,
    ignore_out_of_bound_examples: bool = False,
) -> metric_types.MetricComputations:
    """Returns computations for lift metrics."""
    if eval_config is None or not eval_config.cross_slicing_specs:
        raise ValueError(
            'tfma.CrossSlicingSpec with a baseline and at least one comparison '
            'slicing spec must be provided for Lift metrics')

    if num_buckets is None:
        num_buckets = DEFAULT_NUM_BUCKETS

    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key,
                                 example_weighted=example_weighted)

    computations = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        num_buckets=num_buckets,
        left=left,
        right=right,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights,
        example_weighted=example_weighted,
        prediction_based_bucketing=False,
        fractional_labels=False)
    metric_key = computations[-1].keys[-1]

    def cross_slice_comparison(
        baseline_metrics: Dict[metric_types.MetricKey, Any],
        comparison_metrics: Dict[metric_types.MetricKey, Any],
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns lift metrics values."""
        baseline_histogram = baseline_metrics[metric_key]
        comparison_histogram = comparison_metrics[metric_key]

        baseline_bucket = {}
        comparison_bucket = {}
        bucket_ids = set()

        for bucket in baseline_histogram:
            baseline_bucket[bucket.bucket_id] = bucket
            bucket_ids.add(bucket.bucket_id)

        for bucket in comparison_histogram:
            comparison_bucket[bucket.bucket_id] = bucket
            bucket_ids.add(bucket.bucket_id)

        baseline_pred_values = 0.0
        comparison_pred_values = 0.0
        comparison_num_examples = 0.0

        for bucket_id in bucket_ids:
            if ignore_out_of_bound_examples:
                # Ignore buckets having examples with out of bound label values.
                if bucket_id <= 0 or bucket_id > num_buckets:
                    continue
            num_examples = 0.0
            if bucket_id in comparison_bucket:
                num_examples = comparison_bucket[bucket_id].weighted_examples
                comparison_pred_values += comparison_bucket[
                    bucket_id].weighted_predictions
                comparison_num_examples += num_examples

            if bucket_id in baseline_bucket:
                # Compute the background/baseline re-weighted average prediction values.
                # Background re-weighting is done by dividing the in-slice ground truth
                # density by the background density so that the marginal ground truth
                # distributions of in-slice items and background items appear similar.
                weight = num_examples / baseline_bucket[
                    bucket_id].weighted_examples
                baseline_pred_values += weight * baseline_bucket[
                    bucket_id].weighted_predictions

        lift_value = (comparison_pred_values -
                      baseline_pred_values) / comparison_num_examples
        return {key: lift_value}

    cross_slice_computation = metric_types.CrossSliceMetricComputation(
        keys=[key], cross_slice_comparison=cross_slice_comparison)

    computations.append(cross_slice_computation)
    return computations
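A minimal numeric sketch of the background re-weighting performed in cross_slice_comparison. The per-bucket (weighted_predictions, weighted_examples) values below are hypothetical and chosen only to illustrate the arithmetic:

# Hypothetical bucket contents, illustrative only:
# bucket_id -> (weighted_predictions, weighted_examples)
baseline = {1: (2.0, 10.0), 2: (7.0, 10.0)}
comparison = {1: (1.2, 4.0), 2: (4.8, 6.0)}

baseline_pred_values = 0.0
comparison_pred_values = 0.0
comparison_num_examples = 0.0
for bucket_id, (preds, examples) in comparison.items():
  comparison_pred_values += preds
  comparison_num_examples += examples
  if bucket_id in baseline:
    base_preds, base_examples = baseline[bucket_id]
    # Re-weight the baseline bucket by the comparison slice's example count so
    # both slices are compared under the same per-bucket label distribution.
    baseline_pred_values += (examples / base_examples) * base_preds

lift = (comparison_pred_values - baseline_pred_values) / comparison_num_examples
print(lift)  # (1.2 + 4.8 - (0.8 + 4.2)) / 10.0 = 0.1, up to floating point rounding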
Example #8
  def testTopKCalibrationHistogramWithTopK(self):
    histogram = calibration_histogram.calibration_histogram(
        sub_key=metric_types.SubKey(top_k=2), example_weighted=True)[0]

    example1 = {
        'labels': np.array([2]),
        'predictions': np.array([0.2, 0.05, 0.5, 0.05]),
        'example_weights': np.array([1.0])
    }
    example2 = {
        'labels': np.array([2]),
        'predictions': np.array([0.8, 0.1, 0.8, 0.5]),
        'example_weights': np.array([2.0])
    }
    example3 = {
        'labels': np.array([3]),
        'predictions': np.array([0.2, 0.5, 0.1, 0.1]),
        'example_weights': np.array([3.0])
    }
    example4 = {
        'labels': np.array([0]),
        'predictions': np.array([-0.1, 1.1, -0.7, -0.4]),
        'example_weights': np.array([4.0])
    }

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create([example1, example2, example3, example4])
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner))

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_plots = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_plots, 1)
          key = metric_types.PlotKey(
              name='_calibration_histogram_10000',
              sub_key=metric_types.SubKey(top_k=2),
              example_weighted=True)
          self.assertIn(key, got_plots)
          got_histogram = got_plots[key]
          self.assertLen(got_histogram, 5)
          self.assertEqual(
              got_histogram[0],
              calibration_histogram.Bucket(
                  bucket_id=0,
                  weighted_labels=3.0 + 4.0,
                  weighted_predictions=(2 * 1.0 * float('-inf') +
                                        2 * 2.0 * float('-inf') +
                                        2 * 3.0 * float('-inf') +
                                        2 * 4.0 * float('-inf') + -0.1 * 4.0),
                  weighted_examples=(1.0 * 2.0 + 2.0 * 2.0 + 3.0 * 2.0 +
                                     4.0 * 3.0)))
          self.assertEqual(
              got_histogram[1],
              calibration_histogram.Bucket(
                  bucket_id=2001,
                  weighted_labels=0.0 + 0.0,
                  weighted_predictions=0.2 + 3 * 0.2,
                  weighted_examples=1.0 + 3.0))
          self.assertEqual(
              got_histogram[2],
              calibration_histogram.Bucket(
                  bucket_id=5001,
                  weighted_labels=1.0 + 0.0 * 3.0,
                  weighted_predictions=0.5 * 1.0 + 0.5 * 3.0,
                  weighted_examples=1.0 + 3.0))
          self.assertEqual(
              got_histogram[3],
              calibration_histogram.Bucket(
                  bucket_id=8001,
                  weighted_labels=0.0 * 2.0 + 1.0 * 2.0,
                  weighted_predictions=0.8 * 2.0 + 0.8 * 2.0,
                  weighted_examples=2.0 + 2.0))
          self.assertEqual(
              got_histogram[4],
              calibration_histogram.Bucket(
                  bucket_id=10001,
                  weighted_labels=0.0 * 4.0,
                  weighted_predictions=1.1 * 4.0,
                  weighted_examples=4.0))

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
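The -inf weighted predictions expected in bucket 0 above come from the top_k handling: prediction values outside the top k are replaced with float('-inf') before bucketing, so they land in the catch-all bucket. A hedged standalone sketch of that behaviour (not the actual metric_util implementation):

import numpy as np

def keep_top_k(predictions, top_k):
  # Sketch only: non-top-k values are replaced with -inf so that they fall
  # into bucket 0 of the calibration histogram.
  out = np.full(predictions.shape, float('-inf'))
  top_indices = np.argsort(predictions)[-top_k:]
  out[top_indices] = predictions[top_indices]
  return out

print(keep_top_k(np.array([0.2, 0.05, 0.5, 0.05]), top_k=2))
# [ 0.2 -inf  0.5 -inf]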