Example #1
def _total_attributions(
    absolute: bool = True,
    name: Text = '',
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
) -> metric_types.MetricComputations:
    """Returns metric computations for total attributions."""

    key = metric_types.AttributionsKey(name=name,
                                       model_name=model_name,
                                       output_name=output_name,
                                       sub_key=sub_key)

    # Make sure total_attributions is calculated.
    computations = _total_attributions_computations(
        absolute=absolute,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
    )
    private_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.AttributionsKey, Dict[Text, Union[float,
                                                             np.ndarray]]]:
        """Returns total attributions."""
        return {key: metrics[private_key]}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations.append(derived_computation)
    return computations
Example #2
def _confusion_matrix_at_thresholds(
    thresholds: List[float],
    name: Text = CONFUSION_MATRIX_AT_THRESHOLDS_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for confusion matrix at thresholds."""
    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    # Make sure matrices are calculated.
    matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        thresholds=thresholds)
    matrices_key = matrices_computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey,
                      metrics_for_slice_pb2.ConfusionMatrixAtThresholds]
    ) -> Dict[metric_types.MetricKey, Any]:
        return {key: to_proto(thresholds, metrics[matrices_key])}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = matrices_computations
    computations.append(derived_computation)
    return computations
Example #3
def _calibration_plot(
    num_buckets: int = DEFAULT_NUM_BUCKETS,
    left: Optional[float] = None,
    right: Optional[float] = None,
    name: Text = CALIBRATION_PLOT_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    schema: Optional[schema_pb2.Schema] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for calibration plot."""
  key = metric_types.PlotKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  label_left, label_right = None, None
  if (left is None or right is None) and eval_config and schema:
    label_left, label_right = _find_label_domain(eval_config, schema,
                                                 model_name, output_name)
  if left is None:
    left = label_left if label_left is not None else 0.0
  if right is None:
    right = label_right if label_right is not None else 1.0

  # Make sure the calibration histogram is calculated. Note that we use the
  # default number of buckets assigned to the histogram instead of the value
  # used for the plots, in case the computation is shared with other metrics
  # and plots that need higher precision. It will be downsampled later.
  computations = calibration_histogram.calibration_histogram(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      left=left,
      right=right,
      aggregation_type=aggregation_type,
      class_weights=class_weights)
  histogram_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
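    # num_buckets + 1 evenly spaced bucket boundaries over [left, right], with
    # an extra leading -inf boundary so values below `left` are still binned.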
    thresholds = [
        left + i * (right - left) / num_buckets for i in range(num_buckets + 1)
    ]
    thresholds = [float('-inf')] + thresholds
    histogram = calibration_histogram.rebin(
        thresholds, metrics[histogram_key], left=left, right=right)
    return {key: _to_proto(thresholds, histogram)}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
Example #4
def _confusion_matrix_plot(
    num_thresholds: int = DEFAULT_NUM_THRESHOLDS,
    name: Text = CONFUSION_MATRIX_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for confusion matrix plots."""
    key = metric_types.PlotKey(name=name,
                               model_name=model_name,
                               output_name=output_name,
                               sub_key=sub_key)

    # The interpolation strategy used here matches how the legacy post export
    # metrics calculated their plots.
    thresholds = [
        i * 1.0 / num_thresholds for i in range(0, num_thresholds + 1)
    ]
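    # Prepend a threshold just below zero so the first confusion matrix is
    # computed with effectively every prediction counted as positive.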
    thresholds = [-1e-6] + thresholds

    # Make sure matrices are calculated.
    matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
        # Use a custom name since we have a custom interpolation strategy which
        # will cause the default naming used by the binary confusion matrix to be
        # very long.
        name=(binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME + '_' +
              name),
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights,
        thresholds=thresholds)
    matrices_key = matrices_computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey,
              metrics_for_slice_pb2.ConfusionMatrixAtThresholds]:
        return {
            key:
            confusion_matrix_metrics.to_proto(thresholds,
                                              metrics[matrices_key])
        }

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = matrices_computations
    computations.append(derived_computation)
    return computations
Example #5
def _mean_attributions(
    absolute: bool = True,
    name: str = MEAN_ATTRIBUTIONS_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    sub_key: Optional[metric_types.SubKey] = None,
    example_weighted: bool = False,
) -> metric_types.MetricComputations:
    """Returns metric computations for mean attributions."""
    key = metric_types.AttributionsKey(name=name,
                                       model_name=model_name,
                                       output_name=output_name,
                                       sub_key=sub_key,
                                       example_weighted=example_weighted)

    # Make sure total_attributions is calculated.
    computations = _total_attributions_computations(
        absolute=absolute,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        example_weighted=example_weighted)
    total_attributions_key = computations[-1].keys[-1]
    # Make sure example_count is calculated
    computations.extend(
        example_count.example_count(model_names=[model_name],
                                    output_names=[output_name],
                                    sub_keys=[sub_key],
                                    example_weighted=example_weighted))
    example_count_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.AttributionsKey, Dict[str, Union[float,
                                                            np.ndarray]]]:
        """Returns mean attributions."""
        total_attributions = metrics[total_attributions_key]
        count = metrics[example_count_key]
        attributions = {}
        for k, v in total_attributions.items():
            if np.isclose(count, 0.0):
                attributions[k] = float('nan')
            else:
                attributions[k] = v / count
        return {key: attributions}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations.append(derived_computation)
    return computations
Example #6
def _coefficient_of_discrimination(
    name: str = COEFFICIENT_OF_DISCRIMINATION_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
  """Returns metric computations for coefficient of discrimination."""
  key = metric_types.MetricKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      example_weighted=example_weighted)

  # Compute shared Tjur discrimination metrics.
  computations = _tjur_discrimination(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      aggregation_type=aggregation_type,
      class_weights=class_weights,
      example_weighted=example_weighted)
  # Shared metrics are based on a single computation and key.
  tjur_discrimination_key = computations[0].keys[0]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, float]:
    """Returns coefficient of discrimination."""
    metric = metrics[tjur_discrimination_key]
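    # Tjur's coefficient of discrimination: average prediction over positive
    # examples minus average prediction over negative examples (NaN if either
    # class has no weighted labels).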
    if (metric.total_negative_weighted_labels == 0 or
        metric.total_positive_weighted_labels == 0):
      value = float('nan')
    else:
      avg_pos_label = (
          metric.total_positive_weighted_predictions /
          metric.total_positive_weighted_labels)
      avg_neg_label = (
          metric.total_negative_weighted_predictions /
          metric.total_negative_weighted_labels)
      value = avg_pos_label - avg_neg_label
    return {key: value}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
Example #7
def macro_average(
    metric_name: Text,
    sub_keys: List[metric_types.SubKey],
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing macro average of given metric.

  Args:
    metric_name: Name of underlying metric average is being computed for.
    sub_keys: Sub keys used to compute the metric.
    eval_config: Eval config.
    model_name: Optional model name.
    output_name: Optional output name.
    class_weights: Optional class weights to apply. If sub_key.class_id is not
      set or not found in the dictionary then 1.0 is assumed.

  Returns:
    Computation for performing the macro average.
  """
    del eval_config

    key = metric_types.MetricKey(name=metric_name,
                                 model_name=model_name,
                                 output_name=output_name)

    def result(
        metrics: Dict[metric_types.MetricKey, float]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns macro average."""
        total_value = 0.0
        total_weight = 0.0
        for sub_key in sub_keys:
            child_key = metric_types.MetricKey(name=metric_name,
                                               model_name=model_name,
                                               output_name=output_name,
                                               sub_key=sub_key)
            weight = 1.0
            if (class_weights and child_key.sub_key is not None
                    and child_key.sub_key.class_id is not None
                    and child_key.sub_key.class_id in class_weights):
                weight = class_weights[child_key.sub_key.class_id]
            total_value += _to_float(metrics[child_key]) * weight
            total_weight += weight
        average = total_value / total_weight if total_weight else float('nan')
        return {key: average}

    return [metric_types.DerivedMetricComputation(keys=[key], result=result)]
Example #8
    def _metric_computation(
        self,
        thresholds: Optional[List[float]] = None,
        name: Text = '',
        eval_config: Optional[config.EvalConfig] = None,
        model_name: Text = '',
        output_name: Text = '',
        sub_key: Optional[metric_types.SubKey] = None,
        aggregation_type: Optional[metric_types.AggregationType] = None,
        class_weights: Optional[Dict[int, float]] = None
    ) -> metric_types.MetricComputations:
        """Returns metric computations for specificity."""
        key = metric_types.MetricKey(name=name,
                                     model_name=model_name,
                                     output_name=output_name,
                                     sub_key=sub_key)

        if not thresholds:
            thresholds = [0.5]

        # Make sure matrices are calculated.
        matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
            eval_config=eval_config,
            model_name=model_name,
            output_name=output_name,
            sub_key=sub_key,
            aggregation_type=aggregation_type,
            class_weights=class_weights,
            thresholds=thresholds)
        matrices_key = matrices_computations[-1].keys[-1]

        def result(
            metrics: Dict[metric_types.MetricKey, Any]
        ) -> Dict[metric_types.MetricKey, Union[float, np.ndarray]]:
            matrices = metrics[matrices_key]
            values = []
            for i in range(len(thresholds)):
                values.append(
                    self.result(matrices.tp[i], matrices.tn[i], matrices.fp[i],
                                matrices.fn[i]))
            return {
                key: values[0] if len(thresholds) == 1 else np.array(values)
            }

        derived_computation = metric_types.DerivedMetricComputation(
            keys=[key], result=result)
        computations = matrices_computations
        computations.append(derived_computation)
        return computations
Example #9
def _auc_plot(
    num_thresholds: int = DEFAULT_NUM_THRESHOLDS,
    name: Text = AUC_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for AUC plots."""
    key = metric_types.PlotKey(name=name,
                               model_name=model_name,
                               output_name=output_name,
                               sub_key=sub_key)

    # The interpolation strategy used here matches how the legacy post export
    # metrics calculated their plots.
    thresholds = [
        i * 1.0 / num_thresholds for i in range(0, num_thresholds + 1)
    ]
    thresholds = [-1e-6] + thresholds

    # Make sure matrices are calculated.
    matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        class_weights=class_weights,
        thresholds=thresholds)
    matrices_key = matrices_computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey,
              metrics_for_slice_pb2.ConfusionMatrixAtThresholds]:
        return {
            key:
            confusion_matrix_at_thresholds.to_proto(thresholds,
                                                    metrics[matrices_key])
        }

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = matrices_computations
    computations.append(derived_computation)
    return computations
Example #10
def output_average(
    metric_name: str,
    output_weights: Dict[str, float],
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    sub_key: Optional[metric_types.SubKey] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
  """Returns metric computations for computing output average of given metric.

  Args:
    metric_name: Name of underlying metric average is being computed for.
    output_weights: Output weights to use to compute metric.
    eval_config: Eval config.
    model_name: Optional model name.
    sub_key: Optional sub key associated with metric (e.g. top_k).
    example_weighted: True if example weights should be applied.

  Returns:
    Computation for performing the output average.
  """
  del eval_config

  key = metric_types.MetricKey(
      name=metric_name,
      model_name=model_name,
      sub_key=sub_key,
      example_weighted=example_weighted)

  def result(
      metrics: Dict[metric_types.MetricKey, float]
  ) -> Dict[metric_types.MetricKey, float]:
    """Returns output average."""
    total_value = 0.0
    total_weight = 0.0
    for output_name, output_weight in output_weights.items():
      child_key = metric_types.MetricKey(
          name=metric_name,
          model_name=model_name,
          output_name=output_name,
          sub_key=sub_key,
          example_weighted=example_weighted)
      total_value += _to_float(metrics[child_key]) * output_weight
      total_weight += output_weight
    average = total_value / total_weight if total_weight else float('nan')
    return {key: average}

  return [metric_types.DerivedMetricComputation(keys=[key], result=result)]
Example #11
def _calibration_plot(
    num_buckets: int = DEFAULT_NUM_BUCKETS,
    left: float = 0.0,
    right: float = 1.0,
    name: Text = CALIBRATION_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for calibration plot."""
    key = metric_types.PlotKey(name=name,
                               model_name=model_name,
                               output_name=output_name,
                               sub_key=sub_key)

    # Make sure the calibration histogram is calculated. Note that we use the
    # default number of buckets assigned to the histogram instead of the value
    # used for the plots, in case the computation is shared with other metrics
    # and plots that need higher precision. It will be downsampled later.
    computations = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        left=left,
        right=right)
    histogram_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        thresholds = [
            left + i * (right - left) / num_buckets
            for i in range(num_buckets + 1)
        ]
        thresholds = [float('-inf')] + thresholds
        histogram = calibration_histogram.rebin(thresholds,
                                                metrics[histogram_key],
                                                left=left,
                                                right=right)
        return {key: _to_proto(thresholds, histogram)}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations.append(derived_computation)
    return computations
Example #12
def _multi_class_confusion_matrix_plot(
    thresholds: Optional[List[float]] = None,
    num_thresholds: Optional[int] = None,
    name: str = MULTI_CLASS_CONFUSION_MATRIX_PLOT_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    example_weighted: bool = False) -> metric_types.MetricComputations:
  """Returns computations for multi-class confusion matrix plot."""
  if num_thresholds is None and thresholds is None:
    thresholds = [0.0]

  key = metric_types.PlotKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      example_weighted=example_weighted)

  # Make sure matrices are calculated.
  matrices_computations = (
      multi_class_confusion_matrix_metrics.multi_class_confusion_matrices(
          thresholds=thresholds,
          num_thresholds=num_thresholds,
          eval_config=eval_config,
          model_name=model_name,
          output_name=output_name,
          example_weighted=example_weighted))
  matrices_key = matrices_computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey,
                    multi_class_confusion_matrix_metrics.Matrices]
  ) -> Dict[metric_types.PlotKey,
            metrics_for_slice_pb2.MultiClassConfusionMatrixAtThresholds]:
    return {
        key:
            metrics[matrices_key].to_proto()
            .multi_class_confusion_matrix_at_thresholds
    }

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations = matrices_computations
  computations.append(derived_computation)
  return computations
Example #13
def _calibration(
    name: str = CALIBRATION_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
  """Returns metric computations for calibration."""
  key = metric_types.MetricKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      example_weighted=example_weighted)

  # Make sure weighted_labels_predictions_examples are calculated.
  computations = _weighted_labels_predictions_examples(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      aggregation_type=aggregation_type,
      class_weights=class_weights,
      example_weighted=example_weighted)
  weighted_labels_predictions_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns calibration."""
    metric = metrics[weighted_labels_predictions_key]
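    # Calibration here is the ratio of total weighted predictions to total
    # weighted labels; it is NaN when no weighted labels were seen.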
    if np.isclose(metric.total_weighted_labels, 0.0):
      value = float('nan')
    else:
      value = metric.total_weighted_predictions / metric.total_weighted_labels

    return {key: value}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
Example #14
def _relative_coefficient_of_discrimination(
    name: Text = RELATIVE_COEFFICIENT_OF_DISCRIMINATION_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    class_weights: Optional[Dict[float, int]] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for coefficient of discrimination."""
  key = metric_types.MetricKey(
      name=name, model_name=model_name, output_name=output_name)

  # Compute shared Tjur discrimination metrics.
  computations = _tjur_discrimination(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      class_weights=class_weights)
  # Shared metrics are based on a single computation and key.
  tjur_discrimination_key = computations[0].keys[0]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, float]:
    """Returns coefficient of discrimination."""
    metric = metrics[tjur_discrimination_key]
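    # Relative variant of Tjur's coefficient: the ratio (rather than the
    # difference) of the average prediction on positive vs. negative examples.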
    if (metric.total_negative_weighted_labels == 0 or
        metric.total_positive_weighted_labels == 0 or
        metric.total_negative_weighted_predictions == 0):
      value = float('nan')
    else:
      avg_pos_label = (
          metric.total_positive_weighted_predictions /
          metric.total_positive_weighted_labels)
      avg_neg_label = (
          metric.total_negative_weighted_predictions /
          metric.total_negative_weighted_labels)
      value = avg_pos_label / avg_neg_label
    return {key: value}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
Example #15
def _mean_label(
    name: Text = MEAN_LABEL_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for mean label."""

    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    # Make sure weighted_labels_predictions_examples are calculated.
    computations = _weighted_labels_predictions_examples(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights)
    weighted_labels_predictions_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns mean label."""
        metric = metrics[weighted_labels_predictions_key]
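        # Mean label is the total of the weighted labels divided by the total
        # weighted example count; NaN when no weighted examples were seen.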
        if np.isclose(metric.total_weighted_examples, 0.0):
            value = float('nan')
        else:
            value = metric.total_weighted_labels / metric.total_weighted_examples
        return {key: value}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations.append(derived_computation)
    return computations
Example #16
def _multi_class_confusion_matrix_at_thresholds(
    thresholds: Optional[List[float]] = None,
    name: Text = MULTI_CLASS_CONFUSION_MATRIX_AT_THRESHOLDS_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
) -> metric_types.MetricComputations:
    """Returns computations for multi-class confusion matrix at thresholds."""
    if not thresholds:
        thresholds = [0.5]

    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name)

    # Make sure matrices are calculated.
    matrices_computations = multi_class_confusion_matrices(
        thresholds=thresholds,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name)
    matrices_key = matrices_computations[-1].keys[-1]

    def result(
        metrics: Dict[
            metric_types.MetricKey,
            metrics_for_slice_pb2.MultiClassConfusionMatrixAtThresholds]
    ) -> Dict[metric_types.MetricKey,
              metrics_for_slice_pb2.MultiClassConfusionMatrixAtThresholds]:
        return {key: metrics[matrices_key]}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = matrices_computations
    computations.append(derived_computation)
    return computations
Example #17
def _total_attributions(
        absolute: bool = True,
        name: str = '',
        eval_config: Optional[config_pb2.EvalConfig] = None,
        model_name: str = '',
        output_name: str = '',
        sub_key: Optional[metric_types.SubKey] = None,
        example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns metric computations for total attributions."""
    key = metric_types.AttributionsKey(name=name,
                                       model_name=model_name,
                                       output_name=output_name,
                                       sub_key=sub_key,
                                       example_weighted=example_weighted)

    # Make sure total_attributions is calculated.
    computations = _total_attributions_computations(
        absolute=absolute,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        example_weighted=example_weighted)
    private_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.AttributionsKey, Dict[str, Union[float,
                                                            np.ndarray]]]:
        """Returns total attributions."""
        return {key: metrics[private_key]}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations.append(derived_computation)
    return computations
Example #18
def _fairness_indicators_metrics_at_thresholds(
    thresholds: List[float],
    name: Text = FAIRNESS_INDICATORS_METRICS_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns computations for fairness metrics at thresholds."""
    metric_key_by_name_by_threshold = collections.defaultdict(dict)
    keys = []
    digits_num = calculate_digits(thresholds)
    for t in thresholds:
        for m in FAIRNESS_INDICATORS_SUB_METRICS:
            key = metric_types.MetricKey(
                name='%s/%s@%.*f' %
                (name, m, digits_num,
                 t),  # e.g. "fairness_indicators_metrics/[email protected]"
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key)
            keys.append(key)
            metric_key_by_name_by_threshold[t][m] = key

    # Make sure matrices are calculated.
    computations = binary_confusion_matrices.binary_confusion_matrices(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        class_weights=class_weights,
        thresholds=thresholds)
    confusion_matrices_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns fairness metrics values."""
        metric = metrics[confusion_matrices_key]
        output = {}

        for i, threshold in enumerate(thresholds):
            num_positives = metric.tp[i] + metric.fn[i]
            num_negatives = metric.tn[i] + metric.fp[i]

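            # The `x or float('nan')` pattern guards each division: a zero
            # denominator produces NaN instead of raising ZeroDivisionError.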
            tpr = metric.tp[i] / (num_positives or float('nan'))
            tnr = metric.tn[i] / (num_negatives or float('nan'))
            fpr = metric.fp[i] / (num_negatives or float('nan'))
            fnr = metric.fn[i] / (num_positives or float('nan'))
            pr = (metric.tp[i] + metric.fp[i]) / (
                (num_positives + num_negatives) or float('nan'))
            nr = (metric.tn[i] + metric.fn[i]) / (
                (num_positives + num_negatives) or float('nan'))

            fdr = metric.fp[i] / (
                (metric.fp[i] + metric.tp[i]) or float('nan'))
            fomr = metric.fn[i] / (
                (metric.fn[i] + metric.tn[i]) or float('nan'))

            output[metric_key_by_name_by_threshold[threshold]
                   ['false_positive_rate']] = fpr
            output[metric_key_by_name_by_threshold[threshold]
                   ['false_negative_rate']] = fnr
            output[metric_key_by_name_by_threshold[threshold]
                   ['true_positive_rate']] = tpr
            output[metric_key_by_name_by_threshold[threshold]
                   ['true_negative_rate']] = tnr
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_rate']] = pr
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_rate']] = nr
            output[metric_key_by_name_by_threshold[threshold]
                   ['false_discovery_rate']] = fdr
            output[metric_key_by_name_by_threshold[threshold]
                   ['false_omission_rate']] = fomr

        return output

    derived_computation = metric_types.DerivedMetricComputation(keys=keys,
                                                                result=result)

    computations.append(derived_computation)
    return computations
Example #19
def _wrap_confusion_matrix_metric(
    metric: tf.keras.metrics.Metric, eval_config: config.EvalConfig,
    model_name: Text, output_name: Text, sub_key: Optional[metric_types.SubKey],
    class_weights: Optional[Dict[int,
                                 float]]) -> metric_types.MetricComputations:
  """Returns confusion matrix metric wrapped in a more efficient computation."""

  # Special handling for AUC metric which supports aggregation inherently via
  # multi_label flag.
  if (isinstance(metric, tf.keras.metrics.AUC) and
      hasattr(metric, 'label_weights')):
    if metric.label_weights:
      if class_weights:
        raise ValueError(
            'class weights are configured in two different places: (1) via the '
            'tf.keras.metrics.AUC class (using "label_weights") and (2) via '
            'the MetricsSpecs (using "aggregate.class_weights"). Either remove '
            'the label_weights settings in the AUC class or remove the '
            'class_weights from the AggregationOptions: metric={}, '
            'class_weights={}'.format(metric, class_weights))
      class_weights = {i: v for i, v in enumerate(metric.label_weights)}
    if metric.multi_label:
      raise NotImplementedError('AUC.multi_label=True is not implemented yet.')

  sub_key = _verify_and_update_sub_key(model_name, output_name, sub_key, metric)
  key = metric_types.MetricKey(
      name=metric.name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  metric_config = tf.keras.metrics.serialize(metric)

  thresholds = None
  num_thresholds = None
  if hasattr(metric, _THRESHOLDS_KEY):
    if (len(
        metric.thresholds) == binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS):
      num_thresholds = binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS
    else:
      thresholds = metric.thresholds
  # Only one of either thresholds or num_thresholds should be used. Keras AUC
  # allows both, but thresholds takes precedence.
  if thresholds is None and hasattr(metric, _NUM_THRESHOLDS_KEY):
    num_thresholds = metric.num_thresholds

  # By default use separate computations for the confusion matrices since the
  # metrics might be using different thresholds (note, the underlying histogram
  # the confusion matrices are based on will still only be calculated once).
  if (num_thresholds is not None and
      num_thresholds == binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS):
    name = binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME
  else:
    name = '_{}{}'.format(
        metric.name, binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME)

  # Make sure matrices are calculated. Note that the use of class_weights here
  # implies that micro averaging is being performed.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      num_thresholds=num_thresholds,
      thresholds=thresholds,
      name=name,
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      class_weights=class_weights)
  matrices_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns AUC derived from binary confustion matrices."""
    matrices = metrics[matrices_key]

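    # Rebuild the Keras metric from its serialized config and overwrite its
    # internal accumulator variables with the precomputed confusion matrix
    # counts, so metric.result() can be evaluated without replaying examples.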
    metric = tf.keras.metrics.deserialize(metric_config)
    if (isinstance(metric, tf.keras.metrics.AUC) or
        isinstance(metric, tf.keras.metrics.SpecificityAtSensitivity) or
        isinstance(metric, tf.keras.metrics.SensitivityAtSpecificity)):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.true_negatives.assign(np.array(matrices.tn))
      metric.false_positives.assign(np.array(matrices.fp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.Precision):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_positives.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.Recall):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.TruePositives):
      metric.accumulator.assign(np.array(matrices.tp))
    elif isinstance(metric, tf.keras.metrics.FalsePositives):
      metric.accumulator.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.TrueNegatives):
      metric.accumulator.assign(np.array(matrices.tn))
    elif isinstance(metric, tf.keras.metrics.FalseNegatives):
      metric.accumulator.assign(np.array(matrices.fn))
    return {key: metric.result().numpy()}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
Example #20
def _wrap_confusion_matrix_metric(
    metric: tf.keras.metrics.Metric, model_name: Text, output_name: Text,
    sub_key: Optional[metric_types.SubKey],
    class_weights: Optional[Dict[int,
                                 float]]) -> metric_types.MetricComputations:
  """Returns confusion matrix metric wrapped in a more efficient computation."""

  sub_key = _verify_and_update_sub_key(model_name, output_name, sub_key, metric)
  key = metric_types.MetricKey(
      name=metric.name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  metric_config = tf.keras.metrics.serialize(metric)

  # By default use separate computations for the confusion matrices since the
  # metrics might be using different thresholds (note, the underlying histogram
  # the confusion matrices are based on will still only be calculated once).
  name = '_{}{}'.format(
      metric.name, binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME)
  thresholds = None
  if hasattr(metric, _THRESHOLDS_KEY):
    thresholds = metric.thresholds
  num_thresholds = None
  if hasattr(metric, _NUM_THRESHOLDS_KEY):
    num_thresholds = metric.num_thresholds
  # Increase the default number of thresholds if keras defaults were used (this
  # also allows us to share the computation with other confusion based metrics).
  if (num_thresholds == _DEFAULT_NUM_THRESHOLDS_IN_KERAS and
      _CONFIG_KEY in metric_config and
      _NUM_THRESHOLDS_KEY in metric_config[_CONFIG_KEY]):
    name = binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME
    num_thresholds = binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS
    metric_config[_CONFIG_KEY][_NUM_THRESHOLDS_KEY] = num_thresholds
    thresholds = None
    if _THRESHOLDS_KEY in metric_config[_CONFIG_KEY]:
      metric_config[_CONFIG_KEY][_THRESHOLDS_KEY] = None
  # Only one of either thresholds or num_thresholds should be used. Keras AUC
  # allows both, but thresholds takes precedence.
  if thresholds is not None and num_thresholds is not None:
    num_thresholds = None

  # Make sure matrices are calculated.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      num_thresholds=num_thresholds,
      thresholds=thresholds,
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      class_weights=class_weights)
  matrices_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns AUC derived from binary confustion matrices."""
    matrices = metrics[matrices_key]

    metric = tf.keras.metrics.deserialize(metric_config)
    if (isinstance(metric, tf.keras.metrics.AUC) or
        isinstance(metric, tf.keras.metrics.SpecificityAtSensitivity) or
        isinstance(metric, tf.keras.metrics.SensitivityAtSpecificity)):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.true_negatives.assign(np.array(matrices.tn))
      metric.false_positives.assign(np.array(matrices.fp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.Precision):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_positives.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.Recall):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.TruePositives):
      metric.accumulator.assign(np.array(matrices.tp))
    elif isinstance(metric, tf.keras.metrics.FalsePositives):
      metric.accumulator.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.TrueNegatives):
      metric.accumulator.assign(np.array(matrices.tn))
    elif isinstance(metric, tf.keras.metrics.FalseNegatives):
      metric.accumulator.assign(np.array(matrices.fn))
    return {key: metric.result().numpy()}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
Example #21
def _flip_rate(
        counterfactual_prediction_key: Optional[str] = None,
        example_id_key: Optional[str] = None,
        example_ids_count: int = flip_count.DEFAULT_NUM_EXAMPLE_IDS,
        name: str = FLIP_RATE_NAME,
        thresholds: Sequence[float] = flip_count.DEFAULT_THRESHOLDS,
        model_name: str = '',
        output_name: str = '',
        eval_config: Optional[config_pb2.EvalConfig] = None,
        example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns computations for flip rate."""
    keys, metric_key_by_name_by_threshold = flip_count.create_metric_keys(
        thresholds, _METRICS_LIST, name, model_name, output_name,
        example_weighted)

    computations = flip_count.flip_count(
        thresholds=thresholds,
        counterfactual_prediction_key=counterfactual_prediction_key,
        example_id_key=example_id_key,
        example_ids_count=example_ids_count,
        model_name=model_name,
        output_name=output_name,
        eval_config=eval_config,
        example_weighted=example_weighted)

    _, flip_count_metric_key_by_name_by_threshold = flip_count.create_metric_keys(
        thresholds, flip_count.METRICS_LIST, flip_count.FLIP_COUNT_NAME,
        model_name, output_name, example_weighted)

    def pick_overall_flip_examples(ntp_examples: np.ndarray,
                                   ptn_examples: np.ndarray) -> np.ndarray:
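        # Pool the negative-to-positive and positive-to-negative flip example
        # ids and sample at most example_ids_count of them without replacement.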
        output_size = min(example_ids_count,
                          ntp_examples.size + ptn_examples.size)
        examples = np.vstack([ntp_examples, ptn_examples])
        return np.random.choice(examples.flatten(),
                                size=output_size,
                                replace=False)

    def result(
        metrics: Dict[metric_types.MetricKey, Union[float, np.ndarray]]
    ) -> Dict[metric_types.MetricKey, Union[float, np.ndarray]]:
        """Returns flip rate metrics values."""
        output = {}
        for threshold in thresholds:
            ptn = flip_count_metric_key_by_name_by_threshold[threshold][
                _POSITIVE_TO_NEGATIVE]
            ntp = flip_count_metric_key_by_name_by_threshold[threshold][
                _NEGATIVE_TO_POSITIVE]
            pos_examples = flip_count_metric_key_by_name_by_threshold[
                threshold][_POSITIVE_TO_NEGATIVE_EXAMPLE_IDS]
            neg_examples = flip_count_metric_key_by_name_by_threshold[
                threshold][_NEGATIVE_TO_POSITIVE_EXAMPLE_IDS]
            pos = flip_count_metric_key_by_name_by_threshold[threshold][
                _POSITIVE_EXAMPLES_COUNT]
            neg = flip_count_metric_key_by_name_by_threshold[threshold][
                _NEGATIVE_EXAMPLES_COUNT]
            output[metric_key_by_name_by_threshold[threshold]
                   [_OVERALL]] = (metrics[ntp] + metrics[ptn]) / (
                       (metrics[pos] + metrics[neg]) or float('NaN'))
            output[metric_key_by_name_by_threshold[threshold]
                   [_POSITIVE_TO_NEGATIVE]] = metrics[ptn] / (
                       (metrics[pos] + metrics[neg]) or float('NaN'))
            output[metric_key_by_name_by_threshold[threshold]
                   [_NEGATIVE_TO_POSITIVE]] = metrics[ntp] / (
                       (metrics[pos] + metrics[neg]) or float('NaN'))
            output[metric_key_by_name_by_threshold[threshold]
                   [_POSITIVE_TO_NEGATIVE_EXAMPLE_IDS]] = metrics[pos_examples]
            output[metric_key_by_name_by_threshold[threshold]
                   [_NEGATIVE_TO_POSITIVE_EXAMPLE_IDS]] = metrics[neg_examples]
            # TODO(sokeefe): Should this depend on example_weighted?
            if not example_weighted:
                assert isinstance(metrics[neg_examples], np.ndarray)
                assert isinstance(metrics[pos_examples], np.ndarray)
                output[metric_key_by_name_by_threshold[threshold]
                       [_SAMPLE_EXAMPLES_IDS]] = pick_overall_flip_examples(
                           ntp_examples=metrics[neg_examples],
                           ptn_examples=metrics[pos_examples])

        return output

    derived_computation = metric_types.DerivedMetricComputation(keys=keys,
                                                                result=result)

    computations.append(derived_computation)
    return computations
Example #22
def weighted_macro_average(
    metric_name: Text,
    sub_keys: List[metric_types.SubKey],
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing weighted macro average of metric.

  The weights per class are based on the percentage of positive labels for each
  class.

  Args:
    metric_name: Name of metric weighted average is being computed for.
    sub_keys: Sub keys used to compute the metric.
    eval_config: Eval config.
    model_name: Optional model name.
    output_name: Optional output name.
    class_weights: Optional class weights to apply. If sub_key.class_id is not
      set or not found in the dictionary then 1.0 is assumed. Note that these
      weights are applied in addition to the weights based on the positive
      labels for each class.

  Returns:
    Computation for performing the weighted macro average.
  """
    key = metric_types.MetricKey(name=metric_name,
                                 model_name=model_name,
                                 output_name=output_name)

    class_ids = [k.class_id for k in sub_keys if k.class_id is not None]

    # Compute the weights for labels.
    computations = _class_weights_from_labels(class_ids=class_ids,
                                              eval_config=eval_config,
                                              model_name=model_name,
                                              output_name=output_name)
    # Class weights metrics are based on a single computation and key.
    class_weights_from_labels_key = computations[0].keys[0]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns weighted macro average."""
        class_weights_from_labels = metrics[class_weights_from_labels_key]
        total_value = 0.0
        total_weight = 0.0
        for sub_key in sub_keys:
            child_key = metric_types.MetricKey(name=metric_name,
                                               model_name=model_name,
                                               output_name=output_name,
                                               sub_key=sub_key)
            weight = 1.0
            if (child_key.sub_key is not None
                    and child_key.sub_key.class_id is not None):
                if (class_weights_from_labels and child_key.sub_key.class_id
                        in class_weights_from_labels):
                    weight = class_weights_from_labels[
                        child_key.sub_key.class_id]
                if class_weights and child_key.sub_key.class_id in class_weights:
                    weight *= class_weights[child_key.sub_key.class_id]
            total_value += _to_float(metrics[child_key]) * weight
            total_weight += weight
        average = total_value / total_weight if total_weight else float('nan')
        return {key: average}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations.append(derived_computation)
    return computations
Example #23
def weighted_macro_average(
    metric_name: Text,
    sub_keys: Iterable[metric_types.SubKey],
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing weighted macro average of metric.

  The weights per class are based on the percentage of positive labels for each
  class.

  Args:
    metric_name: Name of metric weighted average is being computed for.
    sub_keys: Sub keys used to compute the metric (e.g. class_ids, etc).
    eval_config: Eval config.
    model_name: Optional model name.
    output_name: Optional output name.
    sub_key: Optional sub key associated with aggregation metric (e.g. top_k).
    class_weights: Optional class weights to apply. Required if sub_key is not
      provided. If class_weights are provided, but a sub_key.class_id (if
      sub_key is None) or sub_key.k (if sub_key is top_k) is not set or not
      found in the dictionary then 0.0 is assumed. Note that these weights are
      applied in addition to the weights based on the positive labels for each
      class.

  Returns:
    Computation for performing the weighted macro average.
  """
    key = metric_types.MetricKey(name=metric_name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    class_ids = [k.class_id for k in sub_keys if k.class_id is not None]

    # Compute the weights for labels.
    computations = _class_weights_from_labels(class_ids=class_ids,
                                              eval_config=eval_config,
                                              model_name=model_name,
                                              output_name=output_name)
    # Class weights metrics are based on a single computation and key.
    class_weights_from_labels_key = computations[0].keys[0]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns weighted macro average."""
        class_weights_from_labels = metrics[class_weights_from_labels_key]
        total_value = 0.0
        total_weight = 0.0
        for sub_key in sub_keys:
            child_key = metric_types.MetricKey(name=metric_name,
                                               model_name=model_name,
                                               output_name=output_name,
                                               sub_key=sub_key)
            if child_key not in metrics:
                # Use private name if not found under metric name
                child_key = metric_types.MetricKey(name='_' + metric_name,
                                                   model_name=model_name,
                                                   output_name=output_name,
                                                   sub_key=sub_key)
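            # When class_weights are provided, classes missing from the dict
            # default to 0.0 (i.e. they are excluded); otherwise every class
            # gets a default weight of 1.0.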
            weight = 1.0 if not class_weights else 0.0
            offset = None
            if (child_key.sub_key is not None
                    and child_key.sub_key.class_id is not None):
                offset = child_key.sub_key.class_id
            elif child_key.sub_key is not None and child_key.sub_key.k is not None:
                offset = child_key.sub_key.k
            if offset is not None:
                if (class_weights_from_labels and child_key.sub_key.class_id
                        in class_weights_from_labels):
                    weight = class_weights_from_labels[offset]
                if class_weights and child_key.sub_key.class_id in class_weights:
                    weight *= class_weights[offset]
            total_value += _to_float(metrics[child_key]) * weight
            total_weight += weight
        average = total_value / total_weight if total_weight else float('nan')
        return {key: average}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations.append(derived_computation)
    return computations
Example #24
def _wrap_confusion_matrix_metric(
    metric: tf.keras.metrics.Metric, eval_config: config_pb2.EvalConfig,
    model_name: Text, output_name: Text, sub_key: Optional[metric_types.SubKey],
    aggregation_type: Optional[metric_types.AggregationType],
    class_weights: Optional[Dict[int,
                                 float]]) -> metric_types.MetricComputations:
  """Returns confusion matrix metric wrapped in a more efficient computation."""

  # Special handling for AUC metric which supports aggregation inherently via
  # multi_label flag.
  if (isinstance(metric, tf.keras.metrics.AUC) and
      hasattr(metric, 'label_weights')):
    if metric.label_weights:
      if class_weights:
        raise ValueError(
            'class weights are configured in two different places: (1) via the '
            'tf.keras.metrics.AUC class (using "label_weights") and (2) via '
            'the MetricsSpecs (using "aggregate.class_weights"). Either remove '
            'the label_weights settings in the AUC class or remove the '
            'class_weights from the AggregationOptions: metric={}, '
            'class_weights={}'.format(metric, class_weights))
      class_weights = {i: v for i, v in enumerate(metric.label_weights)}
    if metric.multi_label:
      raise NotImplementedError('AUC.multi_label=True is not implemented yet.')

  sub_key = _verify_and_update_sub_key(model_name, output_name, sub_key, metric)
  key = metric_types.MetricKey(
      name=metric.name,
      model_name=model_name,
      output_name=output_name,
      aggregation_type=aggregation_type,
      sub_key=sub_key)

  metric_config = tf.keras.metrics.serialize(metric)

  thresholds = None
  num_thresholds = None
  # The top_k metrics have special settings. If we are setting the top_k value
  # outside of keras (i.e. using BinarizeOptions), then we need to set the
  # special threshold ourselves otherwise the default threshold of 0.5 is used.
  if (sub_key and sub_key.top_k is not None and
      _get_config_value(_TOP_K_KEY, metric_config) is None and
      _get_config_value(_THRESHOLDS_KEY, metric_config) is None and
      _get_config_value(_NUM_THRESHOLDS_KEY, metric_config) is None):
    thresholds = [float('-inf')]
  elif hasattr(metric, _THRESHOLDS_KEY):
    thresholds = metric.thresholds
  # Only one of either thresholds or num_thresholds should be used. Keras AUC
  # allows both, but thresholds takes precedence.
  if thresholds is None and hasattr(metric, _NUM_THRESHOLDS_KEY):
    num_thresholds = metric.num_thresholds

  # Make sure matrices are calculated.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      num_thresholds=num_thresholds,
      thresholds=thresholds,
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      aggregation_type=aggregation_type,
      class_weights=class_weights)
  matrices_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns result derived from binary confustion matrices."""
    matrices = metrics[matrices_key]

    metric = tf.keras.metrics.deserialize(metric_config)
    if (isinstance(metric, tf.keras.metrics.AUC) or
        isinstance(metric, tf.keras.metrics.SpecificityAtSensitivity) or
        isinstance(metric, tf.keras.metrics.SensitivityAtSpecificity)):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.true_negatives.assign(np.array(matrices.tn))
      metric.false_positives.assign(np.array(matrices.fp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.Precision):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_positives.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.Recall):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.TruePositives):
      metric.accumulator.assign(np.array(matrices.tp))
    elif isinstance(metric, tf.keras.metrics.FalsePositives):
      metric.accumulator.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.TrueNegatives):
      metric.accumulator.assign(np.array(matrices.tn))
    elif isinstance(metric, tf.keras.metrics.FalseNegatives):
      metric.accumulator.assign(np.array(matrices.fn))
    return {key: metric.result().numpy()}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
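
The result() above relies on injecting externally computed confusion-matrix counts into a deserialized Keras metric and then calling result(). A minimal sketch of that trick outside TFMA, using made-up tp/fp counts at two thresholds:

import numpy as np
import tensorflow as tf

# Hypothetical tp/fp counts at thresholds 0.3 and 0.7 (made-up values).
tp = np.array([80.0, 50.0], dtype=np.float32)
fp = np.array([20.0, 5.0], dtype=np.float32)

metric = tf.keras.metrics.Precision(thresholds=[0.3, 0.7])
# Precision keeps one true_positives/false_positives entry per threshold, so
# externally computed counts can be assigned directly before reading result().
metric.true_positives.assign(tp)
metric.false_positives.assign(fp)
print(metric.result().numpy())  # ~ [0.8, 0.909]
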
def binary_confusion_matrices(
    num_thresholds: Optional[int] = None,
    thresholds: Optional[List[float]] = None,
    name: Text = BINARY_CONFUSION_MATRICES_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing binary confusion matrices.

  Args:
    num_thresholds: Number of thresholds to use. Thresholds will be calculated
      using linear interpolation between 0.0 and 1.0 with equidistant values and
      boundaries at -epsilon and 1.0+epsilon. Values must be > 0. Only one of
      num_thresholds or thresholds should be used. If used, num_thresholds must
      be > 1.
    thresholds: A specific set of thresholds to use. The caller is responsible
      for marking the boundaries with +/-epsilon if desired. Only one of
      num_thresholds or thresholds should be used. For metrics computed at top k
      this may be a single negative threshold value (i.e. -inf).
    name: Metric name.
    eval_config: Eval config.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    sub_key: Optional sub key.
    aggregation_type: Optional aggregation type.
    class_weights: Optional class weights to apply to multi-class / multi-label
      labels and predictions prior to flattening (when micro averaging is used).

  Raises:
    ValueError: If both num_thresholds and thresholds are set at the same time.
  """
    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    if num_thresholds is not None and thresholds is not None:
        raise ValueError(
            'only one of thresholds or num_thresholds can be set at a time')
    if num_thresholds is None and thresholds is None:
        num_thresholds = DEFAULT_NUM_THRESHOLDS
    if num_thresholds is not None:
        if num_thresholds <= 1:
            raise ValueError('num_thresholds must be > 1')
        # The interpolation strategy used here matches that used by keras for AUC.
        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
                      for i in range(num_thresholds - 2)]
        thresholds = [-_EPSILON] + thresholds + [1.0 + _EPSILON]

    # Use calibration histogram to calculate matrices. For efficiency (unless all
    # predictions are matched - i.e. thresholds <= 0) we will assume that other
    # metrics will make use of the calibration histogram and re-use the default
    # histogram for the given model_name/output_name/sub_key. This is also
    # required to get accurate counts at the threshold boundaries. If this becomes
    # an issue, then calibration histogram can be updated to support non-linear
    # boundaries.
    histogram_computations = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        num_buckets=(
            # For precision/recall_at_k, where a single large negative threshold is
            # used, we only need one bucket. Note that the histogram will actually
            # have 2 buckets: one that we set (which handles predictions > -1.0)
            # and a default catch-all bucket (i.e. bucket 0) that the histogram
            # creates for large negative predictions (i.e. predictions <= -1.0).
            1 if len(thresholds) == 1 and thresholds[0] <= 0 else None),
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights)
    histogram_key = histogram_computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Matrices]:
        """Returns binary confusion matrices."""
        if len(thresholds) == 1 and thresholds[0] < 0:
            # This case is used when all positive prediction values are considered
            # matches (e.g. when calculating top_k for precision/recall where the
            # non-top_k values are expected to have been set to float('-inf')).
            histogram = metrics[histogram_key]
        else:
            # Calibration histogram uses intervals of the form [start, end) where the
            # prediction >= start. The confusion matrices want intervals of the form
            # (start, end] where the prediction > start. Add a small epsilon so that
            # >= checks don't match. This correction shouldn't be needed in practice
            # but allows for correctness in small tests.
            rebin_thresholds = [
                t + _EPSILON if t != 0 else t for t in thresholds
            ]
            if thresholds[0] >= 0:
                # Add -epsilon bucket to account for differences in histogram vs
                # confusion matrix intervals mentioned above. If the epsilon bucket is
                # missing the false negatives and false positives will be 0 for the
                # first threshold.
                rebin_thresholds = [-_EPSILON] + rebin_thresholds
            if thresholds[-1] < 1.0:
                # If the last threshold < 1.0, then add a fence post at 1.0 + epsilon
                # otherwise true negatives and true positives will be overcounted.
                rebin_thresholds = rebin_thresholds + [1.0 + _EPSILON]
            histogram = calibration_histogram.rebin(rebin_thresholds,
                                                    metrics[histogram_key])
        matrices = _to_binary_confusion_matrices(thresholds, histogram)
        return {key: matrices}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = histogram_computations
    computations.append(derived_computation)
    return computations
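
The threshold interpolation near the top of binary_confusion_matrices mirrors the Keras AUC scheme: num_thresholds - 2 equidistant interior points plus fence posts just below 0.0 and just above 1.0. A standalone sketch (the epsilon value is an assumption, standing in for the module-level _EPSILON):

_EPSILON = 1e-7  # assumed value

def interpolated_thresholds(num_thresholds):
    """Returns num_thresholds values: -eps, equidistant interior points, 1+eps."""
    assert num_thresholds > 1
    inner = [(i + 1) / (num_thresholds - 1) for i in range(num_thresholds - 2)]
    return [-_EPSILON] + inner + [1.0 + _EPSILON]

print(interpolated_thresholds(5))  # [-1e-07, 0.25, 0.5, 0.75, 1.0000001]
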
Example #26
def _flip_rate(
    counterfactual_prediction_key: str,
    example_id_key: Optional[str] = None,
    example_ids_count: int = flip_count.DEFAULT_NUM_EXAMPLE_IDS,
    name: str = FLIP_RATE_NAME,
    thresholds: Sequence[float] = flip_count.DEFAULT_THRESHOLDS,
    model_name: str = '',
    output_name: str = '',
    eval_config: Optional[config.EvalConfig] = None,
) -> metric_types.MetricComputations:
    """Returns computations for flip rate."""
    keys, metric_key_by_name_by_threshold = flip_count.create_metric_keys(
        thresholds, _METRICS_LIST, name, model_name, output_name)

    computations = flip_count.flip_count(
        thresholds=thresholds,
        counterfactual_prediction_key=counterfactual_prediction_key,
        example_id_key=example_id_key,
        example_ids_count=example_ids_count,
        model_name=model_name,
        output_name=output_name,
        eval_config=eval_config)

    _, flip_count_metric_key_by_name_by_threshold = flip_count.create_metric_keys(
        thresholds, flip_count.METRICS_LIST, flip_count.FLIP_COUNT_NAME,
        model_name, output_name)

    def result(
        metrics: Dict[metric_types.MetricKey, float]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns flip rate metrics values."""
        output = {}
        for threshold in thresholds:
            ptn = flip_count_metric_key_by_name_by_threshold[threshold][
                'positive_to_negative']
            ntp = flip_count_metric_key_by_name_by_threshold[threshold][
                'negative_to_positive']
            pos_examples = flip_count_metric_key_by_name_by_threshold[
                threshold]['positive_to_negative_examples_ids']
            neg_examples = flip_count_metric_key_by_name_by_threshold[
                threshold]['negative_to_positive_examples_ids']
            pos = flip_count_metric_key_by_name_by_threshold[threshold][
                'positive_examples_count']
            neg = flip_count_metric_key_by_name_by_threshold[threshold][
                'negative_examples_count']
            output[metric_key_by_name_by_threshold[threshold]['overall']] = (
                metrics[ntp] + metrics[ptn]) / (metrics[pos] + metrics[neg])
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_to_negative']] = metrics[ptn] / metrics[pos]
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_to_positive']] = metrics[ntp] / metrics[neg]
            output[metric_key_by_name_by_threshold[threshold][
                'positive_to_negative_examples_ids']] = metrics[pos_examples]
            output[metric_key_by_name_by_threshold[threshold][
                'negative_to_positive_examples_ids']] = metrics[neg_examples]

        return output

    derived_computation = metric_types.DerivedMetricComputation(keys=keys,
                                                                result=result)

    computations.append(derived_computation)
    return computations
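
The flip rates in result() are simple ratios of the flip counts; a tiny worked example with made-up counts at one threshold:

# Made-up flip counts at a single threshold.
ptn, ntp = 4.0, 6.0    # positive_to_negative and negative_to_positive flips
pos, neg = 40.0, 60.0  # positive and negative example counts

overall = (ntp + ptn) / (pos + neg)      # 0.10
positive_to_negative_rate = ptn / pos    # 0.10
negative_to_positive_rate = ntp / neg    # 0.10
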
Example #27
def macro_average(
    metric_name: Text,
    sub_keys: Iterable[metric_types.SubKey],
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing macro average of given metric.

  Args:
    metric_name: Name of underlying metric average is being computed for.
    sub_keys: Sub keys used to compute the metric (e.g. class_ids, etc).
    eval_config: Eval config.
    model_name: Optional model name.
    output_name: Optional output name.
    sub_key: Optional sub key associated with aggregation metric (e.g. top_k).
    class_weights: Optional class weights to apply. Required if sub_key is not
      provided. If class_weights are provided, but a sub_key.class_id (if
      sub_key is None) or sub_key.k (if sub_key is top_k) is not set or not
      found in the dictionary then 0.0 is assumed.

  Returns:
    Computation for performing the macro average.
  """
    del eval_config

    key = metric_types.MetricKey(name=metric_name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    def result(
        metrics: Dict[metric_types.MetricKey, float]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns macro average."""
        total_value = 0.0
        total_weight = 0.0
        for sub_key in sub_keys:
            child_key = metric_types.MetricKey(name=metric_name,
                                               model_name=model_name,
                                               output_name=output_name,
                                               sub_key=sub_key)
            if child_key not in metrics:
                # Use private name if not found under metric name
                child_key = metric_types.MetricKey(name='_' + metric_name,
                                                   model_name=model_name,
                                                   output_name=output_name,
                                                   sub_key=sub_key)
            weight = 1.0 if not class_weights else 0.0
            offset = None
            if (child_key.sub_key is not None
                    and child_key.sub_key.class_id is not None):
                offset = child_key.sub_key.class_id
            elif child_key.sub_key is not None and child_key.sub_key.k is not None:
                offset = child_key.sub_key.k
            if offset is not None and offset in class_weights:
                weight = class_weights[offset]
            total_value += _to_float(metrics[child_key]) * weight
            total_weight += weight
        average = total_value / total_weight if total_weight else float('nan')
        return {key: average}

    return [metric_types.DerivedMetricComputation(keys=[key], result=result)]
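
A small worked example of the averaging in the result() above (per-class metric values and weights are made up); when class_weights are provided, a class missing from the dictionary contributes with weight 0.0, and without class_weights the average is unweighted:

per_class_value = {0: 0.9, 1: 0.5, 2: 0.7}
class_weights = {0: 1.0, 1: 2.0}  # class 2 missing -> weight 0.0

total_value = sum(v * class_weights.get(c, 0.0) for c, v in per_class_value.items())
total_weight = sum(class_weights.get(c, 0.0) for c in per_class_value)
print(total_value / total_weight)  # (0.9 * 1.0 + 0.5 * 2.0) / 3.0 ~= 0.633
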
def binary_confusion_matrices(
        num_thresholds: Optional[int] = None,
        thresholds: Optional[List[float]] = None,
        name: Optional[Text] = None,
        eval_config: Optional[config.EvalConfig] = None,
        model_name: Text = '',
        output_name: Text = '',
        sub_key: Optional[metric_types.SubKey] = None,
        aggregation_type: Optional[metric_types.AggregationType] = None,
        class_weights: Optional[Dict[int, float]] = None,
        use_histogram: Optional[bool] = None,
        extract_label_prediction_and_weight: Optional[Callable[
            ..., Any]] = metric_util.to_label_prediction_example_weight,
        preprocessor: Optional[Callable[..., Any]] = None,
        example_id_key: Optional[Text] = None,
        example_ids_count: Optional[int] = None,
        fractional_labels: bool = True) -> metric_types.MetricComputations:
    """Returns metric computations for computing binary confusion matrices.

  Args:
    num_thresholds: Number of thresholds to use. Thresholds will be calculated
      using linear interpolation between 0.0 and 1.0 with equidistant values and
      boundaries at -epsilon and 1.0+epsilon. Values must be > 0. Only one of
      num_thresholds or thresholds should be used. If used, num_thresholds must
      be > 1.
    thresholds: A specific set of thresholds to use. The caller is responsible
      for marking the boundaries with +/-epsilon if desired. Only one of
      num_thresholds or thresholds should be used. For metrics computed at top k
      this may be a single negative threshold value (i.e. -inf).
    name: Metric name.
    eval_config: Eval config.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    sub_key: Optional sub key.
    aggregation_type: Optional aggregation type.
    class_weights: Optional class weights to apply to multi-class / multi-label
      labels and predictions prior to flattening (when micro averaging is used).
    use_histogram: If true, matrices will be derived from calibration
      histograms.
    extract_label_prediction_and_weight: User-provided function argument that
      yields label, prediction, and example weights for use in calculations
      (relevant only when use_histogram flag is not true).
    preprocessor: User-provided preprocessor for including additional extracts
      in StandardMetricInputs (relevant only when use_histogram flag is not
      true).
    example_id_key: Feature key containing example id (relevant only when
      use_histogram flag is not true).
    example_ids_count: Max number of example ids to be extracted for false
      positives and false negatives (relevant only when use_histogram flag is
      not true).
    fractional_labels: If true, each incoming tuple of (label, prediction, and
      example weight) will be split into two tuples as follows (where l, p, w
      represent the resulting label, prediction, and example weight values): (1)
        l = 0.0, p = prediction, and w = example_weight * (1.0 - label) (2) l =
        1.0, p = prediction, and w = example_weight * label If enabled, an
        exception will be raised if labels are not within [0, 1]. The
        implementation is such that tuples associated with a weight of zero are
        not yielded. This means it is safe to enable fractional_labels even when
        the labels only take on the values of 0.0 or 1.0.

  Raises:
    ValueError: If both num_thresholds and thresholds are set at the same time.
  """
    if num_thresholds is not None and thresholds is not None:
        raise ValueError(
            'only one of thresholds or num_thresholds can be set at a time')
    if num_thresholds is None and thresholds is None:
        num_thresholds = DEFAULT_NUM_THRESHOLDS
    # Keras AUC turns the num_thresholds parameter into explicit thresholds,
    # which circumvents sharing of settings. If the thresholds match the
    # interpolated version of those thresholds then reset back to num_thresholds.
    if (name is None and thresholds
            and thresholds == _interpolated_thresholds(len(thresholds))):
        num_thresholds = len(thresholds)
        thresholds = None
    if num_thresholds is not None:
        if num_thresholds <= 1:
            raise ValueError('num_thresholds must be > 1')
        # The interpolation strategy used here matches that used by keras for AUC.
        thresholds = _interpolated_thresholds(num_thresholds)
        if name is None:
            name = '{}_{}'.format(BINARY_CONFUSION_MATRICES_NAME,
                                  num_thresholds)
    elif name is None:
        name = '{}_{}'.format(BINARY_CONFUSION_MATRICES_NAME, list(thresholds))

    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    computations = []
    metric_key = None

    if use_histogram is None:
        use_histogram = (num_thresholds is not None
                         or (len(thresholds) == 1 and thresholds[0] < 0))

    if use_histogram:
        # Use calibration histogram to calculate matrices. For efficiency (unless
        # all predictions are matched - i.e. thresholds <= 0) we will assume that
        # other metrics will make use of the calibration histogram and re-use the
        # default histogram for the given model_name/output_name/sub_key. This is
        # also required to get accurate counts at the threshold boundaries. If this
        # becomes an issue, then calibration histogram can be updated to support
        # non-linear boundaries.
        computations = calibration_histogram.calibration_histogram(
            eval_config=eval_config,
            num_buckets=(
                # For precision/recall_at_k, where a single large negative threshold
                # is used, we only need one bucket. Note that the histogram will
                # actually have 2 buckets: one that we set (which handles
                # predictions > -1.0) and a default catch-all bucket (i.e. bucket 0)
                # that the histogram creates for large negative predictions (i.e.
                # predictions <= -1.0).
                1 if len(thresholds) == 1 and thresholds[0] <= 0 else None),
            model_name=model_name,
            output_name=output_name,
            sub_key=sub_key,
            aggregation_type=aggregation_type,
            class_weights=class_weights)
        metric_key = computations[-1].keys[-1]
    else:
        computations = _binary_confusion_matrix_computation(
            eval_config=eval_config,
            thresholds=thresholds,
            model_name=model_name,
            output_name=output_name,
            sub_key=sub_key,
            extract_label_prediction_and_weight=
            extract_label_prediction_and_weight,
            preprocessor=preprocessor,
            example_id_key=example_id_key,
            example_ids_count=example_ids_count,
            aggregation_type=aggregation_type,
            class_weights=class_weights,
            fractional_labels=fractional_labels)
        metric_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Matrices]:
        """Returns binary confusion matrices."""
        matrices = None
        if use_histogram:
            if len(thresholds) == 1 and thresholds[0] < 0:
                # This case is used when all positive prediction values are relevant
                # matches (e.g. when calculating top_k for precision/recall where the
                # non-top_k values are expected to have been set to float('-inf')).
                histogram = metrics[metric_key]
            else:
                # Calibration histogram uses intervals of the form [start, end) where
                # the prediction >= start. The confusion matrices want intervals of the
                # form (start, end] where the prediction > start. Add a small epsilon so
                # that >= checks don't match. This correction shouldn't be needed in
                # practice but allows for correctness in small tests.
                rebin_thresholds = [
                    t + _EPSILON if t != 0 else t for t in thresholds
                ]
                if thresholds[0] >= 0:
                    # Add -epsilon bucket to account for differences in histogram vs
                    # confusion matrix intervals mentioned above. If the epsilon bucket is
                    # missing the false negatives and false positives will be 0 for the
                    # first threshold.
                    rebin_thresholds = [-_EPSILON] + rebin_thresholds
                if thresholds[-1] < 1.0:
                    # If the last threshold < 1.0, then add a fence post at 1.0 + epsilon
                    # otherwise true negatives and true positives will be overcounted.
                    rebin_thresholds = rebin_thresholds + [1.0 + _EPSILON]
                histogram = calibration_histogram.rebin(
                    rebin_thresholds, metrics[metric_key])
            matrices = _historgram_to_binary_confusion_matrices(
                thresholds, histogram)
        else:
            matrices = _matrix_to_binary_confusion_matrices(
                thresholds, metrics[metric_key])
        return {key: matrices}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations.append(derived_computation)
    return computations
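
The rebin-threshold adjustment in result() shifts every non-zero threshold by +epsilon so that the histogram's [start, end) buckets behave like the confusion matrix's (start, end] intervals, then adds fence posts at -epsilon and 1.0 + epsilon where needed. A standalone sketch (the epsilon value is an assumption):

_EPSILON = 1e-7  # assumed value

thresholds = [0.0, 0.25, 0.5, 0.75]
rebin_thresholds = [t + _EPSILON if t != 0 else t for t in thresholds]
if thresholds[0] >= 0:
    rebin_thresholds = [-_EPSILON] + rebin_thresholds       # epsilon bucket below 0
if thresholds[-1] < 1.0:
    rebin_thresholds = rebin_thresholds + [1.0 + _EPSILON]  # fence post above 1
print(rebin_thresholds)
# [-1e-07, 0.0, 0.2500001, 0.5000001, 0.7500001, 1.0000001]
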
Example #29
def binary_confusion_matrices(
    num_thresholds: Optional[int] = None,
    thresholds: Optional[List[float]] = None,
    name: Text = BINARY_CONFUSION_MATRICES_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing binary confusion matrices.

  Args:
    num_thresholds: Number of thresholds to use. Thresholds will be calculated
      using linear interpolation between 0.0 and 1.0 with equidistant values and
      boundaries at -epsilon and 1.0+epsilon. Values must be > 0. Only one of
      num_thresholds or thresholds should be used.
    thresholds: A specific set of thresholds to use. The caller is responsible
      for marking the boundaries with +/-epsilon if desired. Only one of
      num_thresholds or thresholds should be used.
    name: Metric name.
    eval_config: Eval config.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    sub_key: Optional sub key.
    class_weights: Optional class weights to apply to multi-class / multi-label
      labels and predictions prior to flattening (when micro averaging is used).

  Raises:
    ValueError: If both num_thresholds and thresholds are set at the same time.
  """
    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    if num_thresholds is not None and thresholds is not None:
        raise ValueError(
            'only one of thresholds or num_thresholds can be set at a time')
    if num_thresholds is None and thresholds is None:
        num_thresholds = DEFAULT_NUM_THRESHOLDS
    if num_thresholds is not None:
        # The interpolation strategy used here matches that used by keras for AUC.
        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
                      for i in range(num_thresholds - 2)]
        thresholds = [-_EPSILON] + thresholds + [1.0 + _EPSILON]

    # Use calibration histogram to calculate matrices. For efficiency (unless all
    # predictions are matched - i.e. thresholds <= 0) we will assume that other
    # metrics will make use of the calibration histogram and re-use the default
    # histogram for the given model_name/output_name/sub_key. This is also
    # required to get accurate counts at the threshold boundaries. If this becomes
    # an issue, then calibration histogram can be updated to support non-linear
    # boundaries.
    num_buckets = 1 if len(thresholds) == 1 and thresholds[0] <= 0 else None
    histogram_computations = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        num_buckets=num_buckets,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        class_weights=class_weights)
    histogram_key = histogram_computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Matrices]:
        """Returns binary confusion matrices."""
        # Calibration histogram uses intervals of the form [start, end) where the
        # prediction >= start. The confusion matrices want intervals of the form
        # (start, end] where the prediction > start. Add a small epsilon so that >=
        # checks don't match. This correction shouldn't be needed in practice but
        # allows for correctness in small tests.
        if len(thresholds) == 1:
            # When there is only one threshold, we need to make adjustments so that
            # we have proper boundaries around the threshold for <, >= comparisons.
            if thresholds[0] < 0:
                # This case is used when all prediction values are considered matches
                # (e.g. when calculating top_k for precision/recall).
                rebin_thresholds = [thresholds[0], thresholds[0] + _EPSILON]
            else:
                # This case is used for a single threshold within [0, 1] (e.g. 0.5).
                rebin_thresholds = [
                    -_EPSILON, thresholds[0] + _EPSILON, 1.0 + _EPSILON
                ]
        else:
            rebin_thresholds = ([thresholds[0]] +
                                [t + _EPSILON for t in thresholds[1:]])
        histogram = calibration_histogram.rebin(rebin_thresholds,
                                                metrics[histogram_key])
        matrices = _to_binary_confusion_matrices(thresholds, histogram)
        if len(thresholds) == 1:
            # Reset back to 1 bucket
            matrices = Matrices(thresholds,
                                tp=[matrices.tp[1]],
                                fp=[matrices.fp[1]],
                                tn=[matrices.tn[1]],
                                fn=[matrices.fn[1]])
        return {key: matrices}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = histogram_computations
    computations.append(derived_computation)
    return computations
Example #30
def flip_count(
        counterfactual_prediction_key: Optional[str] = None,
        example_id_key: Optional[str] = None,
        example_ids_count: int = DEFAULT_NUM_EXAMPLE_IDS,
        name: str = FLIP_COUNT_NAME,
        thresholds: Sequence[float] = DEFAULT_THRESHOLDS,
        model_name: str = '',
        output_name: str = '',
        eval_config: Optional[config_pb2.EvalConfig] = None,
        example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns metric computations for computing flip counts."""
    keys, metric_key_by_name_by_threshold = create_metric_keys(
        thresholds, METRICS_LIST, name, model_name, output_name,
        example_weighted)

    feature_keys = [counterfactual_prediction_key]
    if example_id_key:
        feature_keys.append(example_id_key)

    def extract_label_prediction_and_weight(
        inputs: metric_types.StandardMetricInputs,
        eval_config: Optional[config_pb2.EvalConfig] = None,
        model_name: str = '',
        output_name: str = '',
        sub_key: Optional[metric_types.SubKey] = None,
        aggregation_type: Optional[metric_types.AggregationType] = None,
        class_weights: Optional[Dict[int, float]] = None,
        example_weighted: bool = False,
        fractional_labels: bool = False,
        flatten: bool = True,
    ) -> Iterator[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
        """Yields label, prediction, and example weights to be used in calculations.

    This function is a customized metric_util.to_label_prediction_example_weight
    function which yields the original prediction as the label and the
    counterfactual prediction as the prediction, so that flip count metrics can
    be derived from the false positives and false negatives of the binary
    confusion matrix.

    Args:
      inputs: Standard metric inputs.
      eval_config: Eval config
      model_name: Optional model name (if multi-model evaluation).
      output_name: Optional output name (if multi-output model type).
      sub_key: Optional sub key. (unused)
      aggregation_type: Optional aggregation type. (unused)
      class_weights: Optional class weights to apply to multi-class /
        multi-label labels and predictions. (unused)
      example_weighted: True if example weights should be applied.
      fractional_labels: If true, each incoming tuple of (label, prediction,
        example weight) will be split into two tuples as follows (where l, p, w
        represent the resulting label, prediction, and example weight values):
          (1) l = 0.0, p = prediction, and w = example_weight * (1.0 - label)
          (2) l = 1.0, p = prediction, and w = example_weight * label If
          enabled, an exception will be raised if labels are not within [0, 1].
          The implementation is such that tuples associated with a weight of
          zero are not yielded. This means it is safe to enable
          fractional_labels even when the labels only take on the values of 0.0
          or 1.0. (unused)
      flatten: True to flatten the final label and prediction outputs so that
        the yielded values are always arrays of size 1. For example, multi-class
        /multi-label outputs would be converted into label and prediction pairs
        that could then be processed by a binary classification metric in order
        to compute a micro average over all classes. (unused)

    Yields:
      Tuple of (label, prediction, example_weight).

    Raises:
      ValueError: If counterfactual prediction key is not found within either
        the features or predictions.
      ValueError: If predictions is None or empty.
    """
        del (sub_key, aggregation_type, class_weights, fractional_labels,
             flatten)  # unused

        # TODO(sokeefe): Look into removing the options to pass counterfactual
        # predictions in a feature and instead as a baseline model.
        if (counterfactual_prediction_key is not None
                and counterfactual_prediction_key in inputs.features):
            counterfactual_prediction = inputs.features[
                counterfactual_prediction_key]
        elif eval_config is not None:
            counterfactual_model_spec = model_util.get_baseline_model_spec(
                eval_config)
            if counterfactual_model_spec is not None:
                _, counterfactual_prediction, _ = next(
                    metric_util.to_label_prediction_example_weight(
                        inputs,
                        eval_config=eval_config,
                        model_name=counterfactual_model_spec.name,
                        output_name=output_name,
                        example_weighted=example_weighted,
                        fractional_labels=
                        False,  # Labels are ignored for flip counts.
                        flatten=False,  # Flattened below
                        allow_none=True,  # Allow None labels
                        require_single_example_weight=True))
            else:
                raise ValueError(
                    'The Counterfactual model must be listed with '
                    f'`is_baseline` equal to `True`. Found: {eval_config}')
        else:
            raise ValueError(
                '`counterfactual_prediction` was not found within the provided '
                'inputs. It must be included as either a feature key or within the '
                'predictions. Found:\n'
                f'`counterfactual_prediction_key`: {counterfactual_prediction_key}\n'
                f'`inputs.prediction`:{inputs.prediction}')

        if counterfactual_prediction is None:
            raise ValueError(
                '%s feature key is None (required for FlipCount metric)' %
                counterfactual_prediction_key)

        def get_by_keys(value: Any, keys: List[str]) -> Any:
            if isinstance(value, dict):
                new_value = util.get_by_keys(value, keys, optional=True)
                if new_value is not None:
                    return new_value
            return value

        if model_name:
            counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                                    [model_name])
        if output_name:
            counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                                    [output_name])

        _, prediction, example_weight = next(
            metric_util.to_label_prediction_example_weight(
                inputs,
                eval_config=eval_config,
                model_name=model_name,
                output_name=output_name,
                example_weighted=example_weighted,
                fractional_labels=False,  # Labels are ignored for flip counts.
                flatten=False,  # Flattened below
                allow_none=True,  # Allow None labels
                require_single_example_weight=True))

        if prediction.size != counterfactual_prediction.size:
            raise ValueError(
                'prediction and counterfactual_prediction size should be same for '
                'FlipCount metric, %f != %f' %
                (prediction.size, counterfactual_prediction.size))

        if prediction.size == 0:
            raise ValueError(
                'prediction is empty (required for FlipCount metric)')
        else:  # Always flatten
            example_weight = np.array(
                [float(example_weight) for i in range(prediction.shape[-1])])
            for p, cfp, w in zip(prediction.flatten(),
                                 counterfactual_prediction.flatten(),
                                 example_weight.flatten()):
                yield np.array([p]), np.array([cfp]), np.array([w])

    # fractional_labels is set to False since the prediction is being used as
    # the label and it can be a non-binary value.
    computations = binary_confusion_matrices.binary_confusion_matrices(
        thresholds=list(thresholds),
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        example_weighted=example_weighted,
        extract_label_prediction_and_weight=extract_label_prediction_and_weight,
        preprocessor=metric_types.FeaturePreprocessor(
            feature_keys=feature_keys),
        example_id_key=example_id_key,
        example_ids_count=example_ids_count,
        fractional_labels=False)
    examples_metric_key, matrices_metric_key = computations[-1].keys

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns flip count metrics values."""
        matrix = metrics[matrices_metric_key]
        examples = metrics[examples_metric_key]

        output = {}
        for i, threshold in enumerate(matrix.thresholds):
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_to_negative']] = matrix.fn[i]
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_to_positive']] = matrix.fp[i]
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_to_negative_examples_ids']] = np.array(
                       examples.fn_examples[i])
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_to_positive_examples_ids']] = np.array(
                       examples.fp_examples[i])
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_examples_count']] = matrix.fn[i] + matrix.tp[i]
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_examples_count']] = matrix.fp[i] + matrix.tn[i]

        return output

    derived_computation = metric_types.DerivedMetricComputation(keys=keys,
                                                                result=result)

    computations.append(derived_computation)
    return computations
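
Conceptually, treating the original prediction as the label and the counterfactual prediction as the prediction makes positive-to-negative flips show up as false negatives and negative-to-positive flips as false positives. A small illustration with made-up scores (exact boundary handling at the threshold aside):

import numpy as np

original = np.array([0.9, 0.8, 0.2, 0.1])        # acts as the "label"
counterfactual = np.array([0.4, 0.9, 0.7, 0.2])  # acts as the "prediction"
threshold = 0.5

orig_pos = original > threshold
cf_pos = counterfactual > threshold

positive_to_negative = np.sum(orig_pos & ~cf_pos)  # "false negatives" -> 1
negative_to_positive = np.sum(~orig_pos & cf_pos)  # "false positives" -> 1
positive_examples_count = np.sum(orig_pos)         # 2
negative_examples_count = np.sum(~orig_pos)        # 2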