Example #1
def _min_label_position(
        name: Text = MIN_LABEL_POSITION_NAME,
        label_key: Optional[Text] = None,
        eval_config: Optional[config.EvalConfig] = None,
        model_names: Optional[List[Text]] = None,
        output_names: Optional[List[Text]] = None,
        query_key: Text = '') -> metric_types.MetricComputations:
    """Returns metric computations for min label position."""
    if not query_key:
        raise ValueError(
            'a query_key is required to use MinLabelPosition metric')
    if model_names is None:
        model_names = ['']
    if output_names is None:
        output_names = ['']
    keys = []
    computations = []
    preprocessor = None
    if label_key:
        preprocessor = metric_types.FeaturePreprocessor(
            feature_keys=[label_key])
    for model_name in model_names:
        for output_name in output_names:
            key = metric_types.MetricKey(name=name,
                                         model_name=model_name,
                                         output_name=output_name)
            keys.append(key)
            computations.append(
                metric_types.MetricComputation(
                    keys=[key],
                    preprocessor=preprocessor,
                    combiner=_MinLabelPositionCombiner(key, eval_config,
                                                       label_key)))
    return computations
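
Since a query_key is mandatory here, this computation is normally reached through TFMA's query-based metric configuration rather than by calling the private helper directly. A minimal sketch of that wiring, assuming the public tfma.metrics.MinLabelPosition wrapper and tfma.metrics.specs_from_metrics; the 'relevance' and 'query_id' feature names are made up for illustration:

# Sketch only: configure MinLabelPosition as a query/ranking metric.
import tensorflow_model_analysis as tfma

metrics_specs = tfma.metrics.specs_from_metrics(
    [tfma.metrics.MinLabelPosition(label_key='relevance')],
    query_key='query_id')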
Example #2
def _fixed_size_sample(
        sampled_key: Text,
        size: int,
        name: Text,
        random_seed: Optional[int],
        model_names: Optional[List[Text]] = None,
        output_names: Optional[List[Text]] = None,
        sub_keys: Optional[List[metric_types.SubKey]] = None,
        example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns metrics computations for FixedSizeSample metrcs."""
    keys = []
    for model_name in model_names or ['']:
        for output_name in output_names or ['']:
            for sub_key in sub_keys or [None]:
                keys.append(
                    metric_types.MetricKey(name,
                                           model_name=model_name,
                                           output_name=output_name,
                                           sub_key=sub_key,
                                           example_weighted=example_weighted))
    return [
        metric_types.MetricComputation(
            keys=keys,
            preprocessor=metric_types.FeaturePreprocessor(
                feature_keys=[sampled_key]),
            combiner=_FixedSizeSampleCombineFn(
                metric_keys=keys,
                sampled_key=sampled_key,
                size=size,
                example_weighted=example_weighted,
                random_seed=random_seed))
    ]
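
A minimal sketch of invoking the helper above directly; in practice it is created for you by the corresponding FixedSizeSample metric wrapper, and the 'example_id' feature name is an assumption for illustration:

# Sketch only: one MetricComputation that keeps a fixed-size random sample
# of the 'example_id' feature values (all names illustrative).
computations = _fixed_size_sample(
    sampled_key='example_id',
    size=100,
    name='fixed_size_sample',
    random_seed=None)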
Example #3
def _ndcg(gain_key: str,
          top_k_list: Optional[List[int]] = None,
          name: str = NDCG_NAME,
          eval_config: Optional[config_pb2.EvalConfig] = None,
          model_names: Optional[List[str]] = None,
          output_names: Optional[List[str]] = None,
          sub_keys: Optional[List[metric_types.SubKey]] = None,
          example_weighted: bool = False,
          query_key: str = '') -> metric_types.MetricComputations:
  """Returns metric computations for NDCG."""
  if not query_key:
    raise ValueError('a query_key is required to use NDCG metric')
  if sub_keys is None:
    sub_keys = []
  sub_keys = [k for k in sub_keys if k is not None]
  if top_k_list:
    for k in top_k_list:
      if not any([sub_key.top_k == k for sub_key in sub_keys]):
        sub_keys.append(metric_types.SubKey(top_k=k))
  if not sub_keys or any([sub_key.top_k is None for sub_key in sub_keys]):
    raise ValueError(
        'top_k values are required to use NDCG metric: {}'.format(sub_keys))
  computations = []
  for model_name in model_names if model_names else ['']:
    for output_name in output_names if output_names else ['']:
      keys = []
      for sub_key in sub_keys:
        keys.append(
            metric_types.MetricKey(
                name,
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key,
                example_weighted=example_weighted))
      computations.append(
          metric_types.MetricComputation(
              keys=keys,
              preprocessor=metric_types.FeaturePreprocessor(
                  feature_keys=[query_key, gain_key]),
              combiner=_NDCGCombiner(
                  metric_keys=keys,
                  eval_config=eval_config,
                  model_name=model_name,
                  output_name=output_name,
                  example_weighted=example_weighted,
                  query_key=query_key,
                  gain_key=gain_key)))
  return computations
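
Like MinLabelPosition, NDCG is query-based, so the usual entry point is the metrics config rather than this private builder. A hedged sketch using the public tfma.metrics.NDCG wrapper; the 'relevance' and 'query_id' feature names are assumptions:

# Sketch only: NDCG over the top 5 and top 10 results of each query.
import tensorflow_model_analysis as tfma

metrics_specs = tfma.metrics.specs_from_metrics(
    [tfma.metrics.NDCG(gain_key='relevance', top_k_list=[5, 10])],
    query_key='query_id')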
Example #4
def testPreprocessorsWithoutDefaults(self):
    preprocessor = metric_types.StandardMetricInputsPreprocessorList([
        metric_types.FeaturePreprocessor(
            feature_keys=['feature1', 'feature2'],
            include_default_inputs=False),
        metric_types.TransformedFeaturePreprocessor(
            feature_keys=['feature1'], include_default_inputs=False),
        metric_types.AttributionPreprocessor(feature_keys=['feature1'],
                                             include_default_inputs=False)
    ])
    self.assertEqual(
        preprocessor.include_filter, {
            'features': {
                'feature1': {},
                'feature2': {},
            },
            'transformed_features': {
                'feature1': {},
            },
            'attributions': {
                'feature1': {},
            },
        })
Example #5
def _ndcg(gain_key: Text,
          name: Text = NDCG_NAME,
          eval_config: Optional[config.EvalConfig] = None,
          model_names: Optional[List[Text]] = None,
          output_names: Optional[List[Text]] = None,
          sub_keys: Optional[List[metric_types.SubKey]] = None,
          query_key: Text = '') -> metric_types.MetricComputations:
  """Returns metric computations for NDCG."""
  if not query_key:
    raise ValueError('a query_key is required to use NDCG metric')
  if sub_keys is None or any([sub_key.top_k is None for sub_key in sub_keys]):
    raise ValueError(
        'top_k values are required to use NDCG metric: {}'.format(sub_keys))
  computations = []
  for model_name in model_names if model_names else ['']:
    for output_name in output_names if output_names else ['']:
      keys = []
      for sub_key in sub_keys:
        keys.append(
            metric_types.MetricKey(
                name,
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key))
      computations.append(
          metric_types.MetricComputation(
              keys=keys,
              preprocessor=metric_types.FeaturePreprocessor(
                  feature_keys=[query_key, gain_key]),
              combiner=_NDCGCombiner(
                  metric_keys=keys,
                  eval_config=eval_config,
                  model_name=model_name,
                  output_name=output_name,
                  query_key=query_key,
                  gain_key=gain_key)))
  return computations
Example #6
def testPreprocessors(self):
    preprocessor = metric_types.StandardMetricInputsPreprocessorList([
        metric_types.FeaturePreprocessor(
            feature_keys=['feature1', 'feature2']),
        metric_types.TransformedFeaturePreprocessor(
            feature_keys=['feature1']),
        metric_types.AttributionPreprocessor(feature_keys=['feature1'])
    ])
    self.assertEqual(
        preprocessor.include_filter, {
            'labels': {},
            'predictions': {},
            'example_weights': {},
            'features': {
                'feature1': {},
                'feature2': {},
            },
            'transformed_features': {
                'feature1': {},
            },
            'attributions': {
                'feature1': {},
            },
        })
Example #7
def metric_computations_using_keras_saved_model(
        model_name: str,
        model_loader: types.ModelLoader,
        eval_config: Optional[config_pb2.EvalConfig],
        batch_size: Optional[int] = None) -> metric_types.MetricComputations:
    """Returns computations for computing metrics natively using keras.

  Args:
    model_name: Name of model.
    model_loader: Loader for shared model containing keras saved model to use
      for metric computations.
    eval_config: Eval config.
    batch_size: Batch size to use during evaluation (testing only).
  """
    model = model_loader.load()
    # If metrics were only added using model.compile then use
    # model.compiled_metrics and model.compiled_loss to compute the metrics,
    # otherwise custom metrics added via model.add_metric were also used and we
    # need to call model.evaluate.
    if not model.metrics:
        return []
    elif (hasattr(model, 'compiled_metrics') and model.compiled_metrics
          and hasattr(model, 'compiled_loss') and model.compiled_loss
          and len(model.compiled_metrics.metrics) +
          len(model.compiled_loss.metrics) == len(model.metrics)):
        if hasattr(model, 'output_names') and model.output_names:
            output_names = model.output_names
        else:
            output_names = []
        keys = _metric_keys(
            chain(model.compiled_metrics.metrics, model.compiled_loss.metrics),
            model_name, output_names)
        return [
            metric_types.MetricComputation(
                keys=keys,
                preprocessor=None,
                combiner=_KerasCompiledMetricsCombiner(keys, model_name,
                                                       model_loader,
                                                       eval_config,
                                                       batch_size))
        ]
    else:
        if hasattr(model, 'output_names') and model.output_names:
            output_names = model.output_names
        else:
            output_names = []
        keys = _metric_keys(model.metrics, model_name, output_names)
        specs = model_util.get_input_specs(model_name, signature_name=None)
        feature_keys = list(specs.keys()) if specs else []
        return [
            metric_types.MetricComputation(
                keys=keys,
                preprocessor=metric_types.StandardMetricInputsPreprocessorList(
                    [
                        metric_types.FeaturePreprocessor(
                            feature_keys=feature_keys,
                            model_names=[model_name]),
                        metric_types.TransformedFeaturePreprocessor(
                            feature_keys=feature_keys,
                            model_names=[model_name])
                    ]),
                combiner=_KerasEvaluateCombiner(keys, model_name, model_loader,
                                                eval_config, batch_size))
        ]
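
Which branch is taken depends on how metrics were attached to the model: metrics registered only through compile() can be replayed via compiled_metrics and compiled_loss, while anything added with add_metric() forces a full model.evaluate(). A small, assumed tf.keras setup illustrating the two cases (not part of this module):

# Sketch only (tf.keras 2.x): the two kinds of models distinguished above.
import tensorflow as tf

class MeanPrediction(tf.keras.layers.Layer):
    """Adds a custom metric via add_metric, forcing the model.evaluate path."""

    def call(self, inputs):
        self.add_metric(tf.reduce_mean(inputs), name='mean_prediction')
        return inputs

inputs = tf.keras.Input(shape=(4,), name='features')
scores = tf.keras.layers.Dense(1, activation='sigmoid')(inputs)

# Case 1: only compile()-time metrics -> compiled_metrics/compiled_loss branch.
compiled_only = tf.keras.Model(inputs, scores)
compiled_only.compile(loss='binary_crossentropy',
                      metrics=[tf.keras.metrics.AUC(name='auc')])

# Case 2: an add_metric() metric as well -> model.evaluate branch.
with_custom = tf.keras.Model(inputs, MeanPrediction()(scores))
with_custom.compile(loss='binary_crossentropy',
                    metrics=[tf.keras.metrics.AUC(name='auc')])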
Example #8
def flip_count(
        counterfactual_prediction_key: Optional[str] = None,
        example_id_key: Optional[str] = None,
        example_ids_count: int = DEFAULT_NUM_EXAMPLE_IDS,
        name: str = FLIP_COUNT_NAME,
        thresholds: Sequence[float] = DEFAULT_THRESHOLDS,
        model_name: str = '',
        output_name: str = '',
        eval_config: Optional[config_pb2.EvalConfig] = None,
        example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns metric computations for computing flip counts."""
    keys, metric_key_by_name_by_threshold = create_metric_keys(
        thresholds, METRICS_LIST, name, model_name, output_name,
        example_weighted)

    feature_keys = [counterfactual_prediction_key]
    if example_id_key:
        feature_keys.append(example_id_key)

    def extract_label_prediction_and_weight(
        inputs: metric_types.StandardMetricInputs,
        eval_config: Optional[config_pb2.EvalConfig] = None,
        model_name: str = '',
        output_name: str = '',
        sub_key: Optional[metric_types.SubKey] = None,
        aggregation_type: Optional[metric_types.AggregationType] = None,
        class_weights: Optional[Dict[int, float]] = None,
        example_weighted: bool = False,
        fractional_labels: bool = False,
        flatten: bool = True,
    ) -> Iterator[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
        """Yields label, prediction, and example weights to be used in calculations.

    This function is a customized version of
    metric_util.to_label_prediction_example_weight: it yields the original
    prediction as the label and the counterfactual prediction as the
    prediction, so that flip count metrics can be derived from the false
    positives and false negatives of the binary confusion matrix.

    Args:
      inputs: Standard metric inputs.
      eval_config: Eval config
      model_name: Optional model name (if multi-model evaluation).
      output_name: Optional output name (if multi-output model type).
      sub_key: Optional sub key. (unused)
      aggregation_type: Optional aggregation type. (unused)
      class_weights: Optional class weights to apply to multi-class /
        multi-label labels and predictions. (unused)
      example_weighted: True if example weights should be applied.
      fractional_labels: If true, each incoming tuple of (label, prediction,
        example weight) will be split into two tuples as follows (where l, p, w
        represent the resulting label, prediction, and example weight values):
          (1) l = 0.0, p = prediction, and w = example_weight * (1.0 - label)
          (2) l = 1.0, p = prediction, and w = example_weight * label If
          enabled, an exception will be raised if labels are not within [0, 1].
          The implementation is such that tuples associated with a weight of
          zero are not yielded. This means it is safe to enable
          fractional_labels even when the labels only take on the values of 0.0
          or 1.0. (unused)
      flatten: True to flatten the final label and prediction outputs so that
        the yielded values are always arrays of size 1. For example, multi-class
        /multi-label outputs would be converted into label and prediction pairs
        that could then be processed by a binary classification metric in order
        to compute a micro average over all classes. (unused)

    Yields:
      Tuple of (label, prediction, example_weight).

    Raises:
      ValueError: If counterfactual prediction key is not found within either
        the features or predictions.
      ValueError: If predictions is None or empty.
    """
        del (sub_key, aggregation_type, class_weights, fractional_labels,
             flatten)  # unused

        # TODO(sokeefe): Look into removing the options to pass counterfactual
        # predictions in a feature and instead as a baseline model.
        if (counterfactual_prediction_key is not None
                and counterfactual_prediction_key in inputs.features):
            counterfactual_prediction = inputs.features[
                counterfactual_prediction_key]
        elif eval_config is not None:
            counterfactual_model_spec = model_util.get_baseline_model_spec(
                eval_config)
            if counterfactual_model_spec is not None:
                _, counterfactual_prediction, _ = next(
                    metric_util.to_label_prediction_example_weight(
                        inputs,
                        eval_config=eval_config,
                        model_name=counterfactual_model_spec.name,
                        output_name=output_name,
                        example_weighted=example_weighted,
                        fractional_labels=
                        False,  # Labels are ignored for flip counts.
                        flatten=False,  # Flattened below
                        allow_none=True,  # Allow None labels
                        require_single_example_weight=True))
            else:
                raise ValueError(
                    'The Counterfactual model must be listed with '
                    f'`is_baseline` equal to `True`. Found: {eval_config}')
        else:
            raise ValueError(
                '`counterfactual_prediction` was not found within the provided '
                'inputs. It must be included as either a feature key or within the '
                'predictions. Found:\n'
                f'`counterfactual_prediction_key`: {counterfactual_prediction_key}\n'
                f'`inputs.prediction`:{inputs.prediction}')

        if counterfactual_prediction is None:
            raise ValueError(
                '%s feature key is None (required for FlipCount metric)' %
                counterfactual_prediction_key)

        def get_by_keys(value: Any, keys: List[str]) -> Any:
            if isinstance(value, dict):
                new_value = util.get_by_keys(value, keys, optional=True)
                if new_value is not None:
                    return new_value
            return value

        if model_name:
            counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                                    [model_name])
        if output_name:
            counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                                    [output_name])

        _, prediction, example_weight = next(
            metric_util.to_label_prediction_example_weight(
                inputs,
                eval_config=eval_config,
                model_name=model_name,
                output_name=output_name,
                example_weighted=example_weighted,
                fractional_labels=False,  # Labels are ignored for flip counts.
                flatten=False,  # Flattened below
                allow_none=True,  # Allow None labels
                require_single_example_weight=True))

        if prediction.size != counterfactual_prediction.size:
            raise ValueError(
                'prediction and counterfactual_prediction size should be same for '
                'FlipCount metric, %d != %d' %
                (prediction.size, counterfactual_prediction.size))

        if prediction.size == 0:
            raise ValueError(
                'prediction is empty (required for FlipCount metric)')
        else:  # Always flatten
            example_weight = np.array(
                [float(example_weight) for i in range(prediction.shape[-1])])
            for p, cfp, w in zip(prediction.flatten(),
                                 counterfactual_prediction.flatten(),
                                 example_weight.flatten()):
                yield np.array([p]), np.array([cfp]), np.array([w])

    # Setting fractional label to false, since prediction is being used as label
    # and it can be a non-binary value.
    computations = binary_confusion_matrices.binary_confusion_matrices(
        thresholds=list(thresholds),
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        example_weighted=example_weighted,
        extract_label_prediction_and_weight=extract_label_prediction_and_weight,
        preprocessor=metric_types.FeaturePreprocessor(
            feature_keys=feature_keys),
        example_id_key=example_id_key,
        example_ids_count=example_ids_count,
        fractional_labels=False)
    examples_metric_key, matrices_metric_key = computations[-1].keys

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns flip count metrics values."""
        matrix = metrics[matrices_metric_key]
        examples = metrics[examples_metric_key]

        output = {}
        for i, threshold in enumerate(matrix.thresholds):
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_to_negative']] = matrix.fn[i]
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_to_positive']] = matrix.fp[i]
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_to_negative_examples_ids']] = np.array(
                       examples.fn_examples[i])
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_to_positive_examples_ids']] = np.array(
                       examples.fp_examples[i])
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_examples_count']] = matrix.fn[i] + matrix.tp[i]
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_examples_count']] = matrix.fp[i] + matrix.tn[i]

        return output

    derived_computation = metric_types.DerivedMetricComputation(keys=keys,
                                                                result=result)

    computations.append(derived_computation)
    return computations
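
Because the original prediction stands in for the label and the counterfactual prediction for the prediction, result() above only relabels confusion-matrix cells: false negatives become positive-to-negative flips and false positives become negative-to-positive flips. A tiny NumPy-only illustration of that bookkeeping at a single threshold (all values made up):

# Sketch only: flip counting with plain numpy, independent of TFMA.
import numpy as np

threshold = 0.5
prediction = np.array([0.9, 0.8, 0.2, 0.4])      # original model scores
counterfactual = np.array([0.3, 0.7, 0.6, 0.1])  # counterfactual scores

orig_pos = prediction > threshold
cf_pos = counterfactual > threshold
positive_to_negative = int(np.sum(orig_pos & ~cf_pos))  # FN cell above -> 1
negative_to_positive = int(np.sum(~orig_pos & cf_pos))  # FP cell above -> 1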