def testBatchSizeLimit(self):
    """Runs four examples through PredictExtractor for the batch-size-limited model.

    The model comes from simple_batch_size_limited_classifier. Prediction
    values are not verified; only the presence of the predictions key in
    each of the four outputs is checked.
    """
    num_examples = 4
    export_root = self._getExportDir()
    _, saved_model_dir = (
        batch_size_limited_classifier.simple_batch_size_limited_classifier(
            None, export_root))
    shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=saved_model_dir,
        tags=[tf.saved_model.SERVING])
    extractor_under_test = predict_extractor_v2.PredictExtractor(
        eval_config=config.EvalConfig(model_specs=[config.ModelSpec()]),
        eval_shared_model=shared_model)

    def check_result(got):
        try:
            self.assertLen(got, num_examples)
            # We can't verify the actual predictions, but we can verify the keys.
            for extracts in got:
                self.assertIn(constants.PREDICTIONS_KEY, extracts)
        except AssertionError as err:
            raise util.BeamAssertException(err)

    with beam.Pipeline() as pipeline:
        # All inputs are identical on purpose; only their count matters here.
        serialized_inputs = [
            self._makeExample(
                classes='first', scores=0.0,
                labels='third').SerializeToString()
            for _ in range(num_examples)
        ]

        inputs = pipeline | 'Create' >> beam.Create(
            serialized_inputs, reshuffle=False)
        extracts = (
            inputs | 'FeaturesToExtracts' >> model_eval_lib.InputsToExtracts())
        predict_extracts = (
            extracts
            | extractor_under_test.stage_name >> extractor_under_test.ptransform)

        util.assert_that(predict_extracts, check_result, label='result')
示例#2
0
    def assertMetricsComputedWithBeamAre(self,
                                         eval_saved_model_path,
                                         serialized_examples,
                                         expected_metrics,
                                         add_metrics_callbacks=None):
        """Asserts on the metrics Beam computes over the given examples.

        All examples are evaluated together, with no slicing. To supply your
        own PCollection (e.g. a large number of examples read from a file),
        to check metrics over specific slices, or to add further post-export
        metrics, use the more general assertGeneralMetricsComputedWithBeamAre.

        Example usage:
          self.assertMetricsComputedWithBeamAre(
            eval_saved_model_path=path,
            serialized_examples=[self.makeExample(age=5, label=1.0),
                                 self.makeExample(age=10, label=0.0)],
            expected_metrics={'average_loss': 0.1})

        Args:
          eval_saved_model_path: Path to the directory containing the
            EvalSavedModel.
          serialized_examples: List of serialized example bytes.
          expected_metrics: Dictionary of expected metric values.
          add_metrics_callbacks: Optional. Callbacks for adding additional
            metrics.
        """
        shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks)
        default_extractors = model_eval_lib.default_extractors(
            eval_shared_model=shared_model)

        def check_metrics(got):
            """Check metrics callback."""
            try:
                num_slices = len(got)
                self.assertEqual(
                    1, num_slices,
                    'expecting metrics for exactly one slice, but got %d '
                    'slices instead. metrics were: %s' % (num_slices, got))
                slice_key, metrics_for_slice = got[0]
                # The only slice present must be the overall (empty) slice.
                self.assertEqual((), slice_key)
                self.assertDictElementsWithinBounds(
                    got_values_dict=metrics_for_slice,
                    expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise beam_util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            extracts = (
                pipeline
                | 'CreateExamples' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> Extract(extractors=default_extractors))
            metrics, _ = (
                extracts
                | 'ComputeMetricsAndPlots' >>
                metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                    eval_shared_model=shared_model))
            # pylint: enable=no-value-for-parameter

            beam_util.assert_that(metrics, check_metrics)
    def testPredictExtractorWithMultiOutputModel(self):
        """Checks the per-head prediction keys produced for the multi-head model.

        Prediction values are not verified; the test only requires that each
        '<head>/<key>' entry is present under the predictions key of every
        output.
        """
        export_root = self._getExportDir()
        _, saved_model_dir = multi_head.simple_multi_head(None, export_root)

        shared_model = self.createTestEvalSharedModel(
            model_path=saved_model_dir)
        extractor_under_test = predict_extractor_v2.PredictExtractor(
            shared_model)

        # Columns: age, language, english_label, chinese_label, other_label.
        example_rows = [
            (1.0, 'english', 1.0, 0.0, 0.0),
            (1.0, 'chinese', 0.0, 1.0, 0.0),
            (2.0, 'english', 1.0, 0.0, 0.0),
            (2.0, 'other', 0.0, 1.0, 1.0),
        ]
        examples = [
            self._makeExample(age=age,
                              language=language,
                              english_label=english,
                              chinese_label=chinese,
                              other_label=other)
            for age, language, english, chinese, other in example_rows
        ]

        def check_result(got):
            try:
                self.assertEqual(4, len(got), 'got: %s' % got)
                # We can't verify the actual predictions, but we can verify the keys.
                expected_keys = [
                    head + '/' + key
                    for head in ('chinese_head', 'english_head', 'other_head')
                    for key in ('logistic', 'probabilities', 'all_classes')
                ]
                for extracts in got:
                    self.assertIn(constants.PREDICTIONS_KEY, extracts)
                    predictions = extracts[constants.PREDICTIONS_KEY]
                    for expected_key in expected_keys:
                        self.assertIn(expected_key, predictions)

            except AssertionError as err:
                raise util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            serialized_inputs = [ex.SerializeToString() for ex in examples]
            # pylint: disable=no-value-for-parameter
            extracts = (
                pipeline
                | 'Create' >> beam.Create(serialized_inputs)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts())
            result = (
                extracts
                | extractor_under_test.stage_name >>
                extractor_under_test.ptransform)
            # pylint: enable=no-value-for-parameter

            util.assert_that(result, check_result, label='result')
示例#4
0
    def testEvaluateQueryBasedMetrics(self):
        """Evaluates query-based metrics and checks them for the overall slice.

        Queries are grouped by the 'fixed_string' feature and three combine
        fns are applied: query statistics, NDCG at 1 and 2, and min label
        position.
        """
        export_root = self._getEvalExportDir()
        _, saved_model_dir = (
            fixed_prediction_estimator_extra_fields.
            simple_fixed_prediction_estimator_extra_fields(None, export_root))
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=saved_model_dir)
        extractors = [
            legacy_predict_extractor.PredictExtractor(shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        def check_metrics(got):
            try:
                self.assertEqual(1, len(got), 'got: %s' % got)
                got_slice_key, got_metrics = got[0]
                self.assertEqual(got_slice_key, ())
                expected_metrics = {
                    'post_export_metrics/total_queries': 3.0,
                    'post_export_metrics/total_documents': 6.0,
                    'post_export_metrics/min_documents': 1.0,
                    'post_export_metrics/max_documents': 3.0,
                    'post_export_metrics/ndcg@1': 0.9166667,
                    'post_export_metrics/ndcg@2': 0.9766198,
                    'post_export_metrics/average_min_label_position/__labels':
                    0.6666667,
                }
                self.assertDictElementsAlmostEqual(got_metrics,
                                                   expected_metrics)

            except AssertionError as err:
                raise util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            extracts = (
                pipeline
                | 'Create' >> beam.Create(self._get_examples())
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors))  # pylint: disable=no-value-for-parameter

            combine_fns = [
                query_statistics.QueryStatisticsCombineFn(),
                ndcg.NdcgMetricCombineFn(at_vals=[1, 2],
                                         gain_key='fixed_float',
                                         weight_key='fixed_int'),
                min_label_position.MinLabelPositionCombineFn(
                    label_key='', weight_key='fixed_int'),
            ]
            metrics = (
                extracts
                | 'EvaluateQueryBasedMetrics' >>
                query_based_metrics_evaluator.EvaluateQueryBasedMetrics(
                    prediction_key='',
                    query_id='fixed_string',
                    combine_fns=combine_fns))

            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
  def testPredictExtractorWithRegressionModel(self):
    """Checks predictions emitted for the fixed-prediction regression model.

    Inputs are created with reshuffle=False and the outputs are compared
    positionally against the 'prediction' feature of each input example.
    """
    export_root = self._getExportDir()
    saved_model_dir, _ = (
        fixed_prediction_estimator_extra_fields
        .simple_fixed_prediction_estimator_extra_fields(export_root, None))

    model_config = config.EvalConfig(model_specs=[config.ModelSpec()])
    shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=saved_model_dir, tags=[tf.saved_model.SERVING])
    extractor_under_test = predict_extractor_v2.PredictExtractor(
        eval_config=model_config, eval_shared_model=shared_model)

    # Columns: prediction, label, fixed_int, fixed_string (fixed_float is 1.0).
    example_rows = [
        (0.2, 1.0, 1, 'fixed_string1'),
        (0.8, 0.0, 1, 'fixed_string2'),
        (0.5, 0.0, 2, 'fixed_string3'),
    ]
    examples = [
        self._makeExample(
            prediction=prediction,
            label=label,
            fixed_int=fixed_int,
            fixed_float=1.0,
            fixed_string=fixed_string)
        for prediction, label, fixed_int, fixed_string in example_rows
    ]

    def check_result(got_preds):
      try:
        self.assertLen(got_preds, 3)
        expected_preds = [0.2, 0.8, 0.5]
        for extracts, wanted in zip(got_preds, expected_preds):
          self.assertIn(constants.PREDICTIONS_KEY, extracts)
          self.assertAlmostEqual(extracts[constants.PREDICTIONS_KEY], wanted)

      except AssertionError as err:
        raise util.BeamAssertException(err)

    with beam.Pipeline() as pipeline:
      serialized_inputs = [ex.SerializeToString() for ex in examples]
      # pylint: disable=no-value-for-parameter
      extracts = (
          pipeline
          | 'Create' >> beam.Create(serialized_inputs, reshuffle=False)
          | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts())
      result = (
          extracts
          | extractor_under_test.stage_name >> extractor_under_test.ptransform)
      # pylint: enable=no-value-for-parameter

      util.assert_that(result, check_result, label='result')
    def testEvaluateNoSlicing(self):
        """Computes metrics for the linear classifier with no slice columns.

        A single overall slice (empty slice key) is expected, carrying the
        model metrics plus the count added by _addExampleCountMetricCallback.
        """
        export_root = self._getEvalExportDir()
        _, saved_model_dir = linear_classifier.simple_linear_classifier(
            None, export_root)
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=saved_model_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        extractors = [
            predict_extractor.PredictExtractor(shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        def check_result(got):
            try:
                self.assertEqual(1, len(got), 'got: %s' % got)
                slice_key, metrics_for_slice = got[0]
                self.assertEqual((), slice_key)
                expected_metrics = {
                    'accuracy': 1.0,
                    'label/mean': 0.5,
                    'my_mean_age': 3.75,
                    'my_mean_age_times_label': 1.75,
                    'added_example_count': 4.0
                }
                self.assertDictElementsAlmostEqual(metrics_for_slice,
                                                   expected_metrics)
            except AssertionError as err:
                raise util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            # Columns: age, language, label.
            example_rows = [
                (3.0, 'english', 1.0),
                (3.0, 'chinese', 0.0),
                (4.0, 'english', 1.0),
                (5.0, 'chinese', 0.0),
            ]
            examples = [
                self._makeExample(age=age, language=language, label=label)
                for age, language, label in example_rows
            ]
            serialized_inputs = [ex.SerializeToString() for ex in examples]

            extracts = (
                pipeline
                | 'Create' >> beam.Create(serialized_inputs)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors))  # pylint: disable=no-value-for-parameter
            metrics, _ = (
                extracts
                | 'ComputeMetricsAndPlots' >>
                metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                    eval_shared_model=shared_model))

            util.assert_that(metrics, check_result)
    def testEvaluateNoSlicingAddPostExportAndCustomMetricsUnsupervisedModel(
            self):
        """Checks example count and weight metrics on a model without labels.

        Mainly for testing that the ExampleCount post export metric works
        with unsupervised models. The plots output is expected to be empty.
        """
        export_root = self._getEvalExportDir()
        _, saved_model_dir = (
            fixed_prediction_estimator_no_labels.
            simple_fixed_prediction_estimator_no_labels(None, export_root))
        metric_callbacks = [
            post_export_metrics.example_count(),
            post_export_metrics.example_weight(
                example_weight_key='prediction')
        ]
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=saved_model_dir,
            add_metrics_callbacks=metric_callbacks)
        extractors = [
            predict_extractor.PredictExtractor(shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        def check_result(got):
            try:
                self.assertEqual(1, len(got), 'got: %s' % got)
                slice_key, metrics_for_slice = got[0]
                self.assertEqual((), slice_key)
                expected_metrics = {
                    'average_loss': 2.5,
                    metric_keys.EXAMPLE_COUNT: 2.0,
                    metric_keys.EXAMPLE_WEIGHT: 3.0
                }
                self.assertDictElementsAlmostEqual(
                    got_values_dict=metrics_for_slice,
                    expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            examples = [
                self._makeExample(prediction=prediction)
                for prediction in (1.0, 2.0)
            ]
            serialized_inputs = [ex.SerializeToString() for ex in examples]

            extracts = (
                pipeline
                | 'Create' >> beam.Create(serialized_inputs)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors))  # pylint: disable=no-value-for-parameter
            metrics, plots = (
                extracts
                | 'ComputeMetricsAndPlots' >>
                metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                    eval_shared_model=shared_model))

            util.assert_that(metrics, check_result, label='metrics')
            util.assert_that(plots, util.is_empty(), label='plots')
    def _runTestWithCustomCheck(self,
                                examples,
                                eval_export_dir,
                                metrics_callbacks,
                                slice_spec=None,
                                custom_metrics_check=None,
                                custom_plots_check=None,
                                custom_result_check=None):
        """Runs the metrics-and-plots pipeline over `examples` and applies checks.

        At least one of the three custom checks must be supplied. The
        confidence-interval setting and random seed are read from
        self.compute_confidence_intervals and self.deterministic_test_seed.

        Args:
          examples: Iterable of objects exposing SerializeToString(); each one
            is serialized and fed into the pipeline.
          eval_export_dir: Path used both as the ModelSpec location and as the
            eval saved model path of the shared model.
          metrics_callbacks: Forwarded as add_metrics_callbacks when creating
            the shared model.
          slice_spec: Optional iterable of objects exposing to_proto(); when
            non-empty it is converted into the EvalConfig slicing specs.
          custom_metrics_check: Optional callable passed to util.assert_that
            for the metrics output.
          custom_plots_check: Optional callable passed to util.assert_that for
            the plots output.
          custom_result_check: Optional callable invoked with the object
            returned by pipeline.run().
        """
        # make sure we are doing some checks
        self.assertTrue(custom_metrics_check is not None
                        or custom_plots_check is not None
                        or custom_result_check is not None)
        serialized_examples = [ex.SerializeToString() for ex in examples]
        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec()],
            model_specs=[config.ModelSpec(location=eval_export_dir)],
            output_data_specs=[config.OutputDataSpec()],
            slicing_specs=slicing_specs)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_models=[eval_shared_model])
        with beam.Pipeline() as pipeline:
            (metrics, plots), _ = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                | 'ComputeMetricsAndPlots' >>
                metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                    eval_shared_model=eval_shared_model,
                    compute_confidence_intervals=self.
                    compute_confidence_intervals,
                    random_seed_for_testing=self.deterministic_test_seed))
            if custom_metrics_check is not None:
                util.assert_that(metrics,
                                 custom_metrics_check,
                                 label='metrics')
            if custom_plots_check is not None:
                util.assert_that(plots, custom_plots_check, label='plot')

        # Leaving the `with` block above has already executed the pipeline
        # (including the assert_that checks). A second, explicit run is only
        # worth paying for when the caller wants to inspect the result object.
        if custom_result_check is not None:
            custom_result_check(pipeline.run())
示例#9
0
def BuildAnalysisTable(  # pylint: disable=invalid-name
    examples: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    desired_batch_size: Optional[int] = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None
) -> beam.pvalue.PCollection:
    """Builds an analysis table from data extracted from the input.

    Use this function to build an example-oriented PCollection of output data
    useful for debugging models.

    Args:
      examples: PCollection of input examples. Can be any format the model
        accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
      eval_shared_model: Shared model parameters for EvalSavedModel.
      slice_spec: Optional list of SingleSliceSpec specifying the slices to
        slice the data into. If None or empty, defaults to the overall slice.
      desired_batch_size: Optional batch size for batching in Predict and
        Aggregate.
      extractors: Optional list of Extractors to execute prior to slicing and
        aggregating the metrics. If not provided, a default set will be run.
      evaluators: Optional list of Evaluators for evaluating Extracts. If not
        provided, a default set will be used.

    Returns:
      beam.pvalue.PCollection of Extracts. The caller is responsible for
      committing to file for now.
    """
    # Any falsy argument (None or an empty list) selects the default.
    slice_spec = slice_spec or [slicer.SingleSliceSpec()]
    extractors = extractors or [
        predict_extractor.PredictExtractor(eval_shared_model,
                                           desired_batch_size),
        feature_extractor.FeatureExtractor(),
        slice_key_extractor.SliceKeyExtractor(slice_spec)
    ]
    evaluators = evaluators or [
        analysis_table_evaluator.AnalysisTableEvaluator()
    ]

    # pylint: disable=no-value-for-parameter
    extracts = (
        examples | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts())
    return extracts | model_eval_lib.ExtractAndEvaluate(
        extractors=extractors, evaluators=evaluators)
示例#10
0
    def testPredictExtractorWithBinaryClassificationModel(self):
        """Checks the prediction keys emitted for the two-class DNN classifier.

        Prediction values are not verified; each of the three outputs must
        expose 'logistic', 'probabilities' and 'all_classes' under the
        predictions key.
        """
        export_root = self._getExportDir()
        saved_model_dir, _ = dnn_classifier.simple_dnn_classifier(
            export_root, None, n_classes=2)

        model_config = config.EvalConfig(
            model_specs=[config.ModelSpec(location=saved_model_dir)])
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=saved_model_dir,
            tags=[tf.saved_model.SERVING])
        extractor_under_test = predict_extractor_v2.PredictExtractor(
            eval_config=model_config, eval_shared_models=[shared_model])

        # Columns: age, language, label.
        example_rows = [
            (1.0, 'english', 0),
            (2.0, 'chinese', 1),
            (3.0, 'chinese', 0),
        ]
        examples = [
            self._makeExample(age=age, language=language, label=label)
            for age, language, label in example_rows
        ]

        def check_result(got):
            try:
                self.assertLen(got, 3)
                # We can't verify the actual predictions, but we can verify the keys.
                expected_keys = ('logistic', 'probabilities', 'all_classes')
                for extracts in got:
                    self.assertIn(constants.PREDICTIONS_KEY, extracts)
                    predictions = extracts[constants.PREDICTIONS_KEY]
                    for expected_key in expected_keys:
                        self.assertIn(expected_key, predictions)

            except AssertionError as err:
                raise util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            serialized_inputs = [ex.SerializeToString() for ex in examples]
            # pylint: disable=no-value-for-parameter
            extracts = (
                pipeline
                | 'Create' >> beam.Create(serialized_inputs)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts())
            result = (
                extracts
                | extractor_under_test.stage_name >>
                extractor_under_test.ptransform)
            # pylint: enable=no-value-for-parameter

            util.assert_that(result, check_result, label='result')
    def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
        """Checks custom and post-export metrics together on the overall slice.

        The linear classifier is evaluated with two custom callbacks plus the
        example_count and example_weight post-export metrics. The plots
        output is expected to be empty.
        """
        export_root = self._getEvalExportDir()
        _, saved_model_dir = linear_classifier.simple_linear_classifier(
            None, export_root)
        metric_callbacks = [
            _addExampleCountMetricCallback,
            # Note that since everything runs in-process this doesn't
            # actually test that the py_func can be correctly recreated
            # on workers in a distributed context.
            _addPyFuncMetricCallback,
            post_export_metrics.example_count(),
            post_export_metrics.example_weight(example_weight_key='age')
        ]
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=saved_model_dir,
            add_metrics_callbacks=metric_callbacks)
        extractors = [
            predict_extractor.PredictExtractor(shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        def check_result(got):
            try:
                self.assertEqual(1, len(got), 'got: %s' % got)
                slice_key, metrics_for_slice = got[0]
                self.assertEqual((), slice_key)
                expected_metrics = {
                    'accuracy': 1.0,
                    'label/mean': 0.5,
                    'my_mean_age': 3.75,
                    'my_mean_age_times_label': 1.75,
                    'added_example_count': 4.0,
                    'py_func_label_sum': 2.0,
                    metric_keys.EXAMPLE_COUNT: 4.0,
                    metric_keys.EXAMPLE_WEIGHT: 15.0
                }
                self.assertDictElementsAlmostEqual(
                    got_values_dict=metrics_for_slice,
                    expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            # Columns: age, language, label.
            example_rows = [
                (3.0, 'english', 1.0),
                (3.0, 'chinese', 0.0),
                (4.0, 'english', 1.0),
                (5.0, 'chinese', 0.0),
            ]
            examples = [
                self._makeExample(age=age, language=language, label=label)
                for age, language, label in example_rows
            ]
            serialized_inputs = [ex.SerializeToString() for ex in examples]

            extracts = (
                pipeline
                | 'Create' >> beam.Create(serialized_inputs)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors))  # pylint: disable=no-value-for-parameter
            metrics, plots = (
                extracts
                | 'ComputeMetricsAndPlots' >>
                metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                    eval_shared_model=shared_model))

            util.assert_that(metrics, check_result, label='metrics')
            util.assert_that(plots, util.is_empty(), label='plots')
    def testEvaluateWithSlicingAndDifferentBatchSizes(self):
        """Checks sliced metrics are the same for several desired batch sizes.

        For each batch size in 1, 2, 4 and 8 a fresh pipeline evaluates the
        same five examples, sliced overall and by the 'slice_key' column, and
        the three resulting slices are compared against fixed expectations.
        """
        export_root = self._getEvalExportDir()
        _, saved_model_dir = linear_classifier.simple_linear_classifier(
            None, export_root)
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=saved_model_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        slice_specs = [
            slicer.SingleSliceSpec(),
            slicer.SingleSliceSpec(columns=['slice_key'])
        ]
        extractors = [
            predict_extractor.PredictExtractor(shared_model),
            slice_key_extractor.SliceKeyExtractor(slice_specs)
        ]

        # Columns: age, language, label, slice_key.
        example_rows = [
            (3.0, 'english', 1.0, 'first_slice'),
            (3.0, 'chinese', 0.0, 'first_slice'),
            (4.0, 'english', 0.0, 'second_slice'),
            (5.0, 'chinese', 1.0, 'second_slice'),
            (5.0, 'chinese', 1.0, 'second_slice'),
        ]

        for batch_size in [1, 2, 4, 8]:

            with beam.Pipeline() as pipeline:
                examples = [
                    self._makeExample(age=age,
                                      language=language,
                                      label=label,
                                      slice_key=slice_value)
                    for age, language, label, slice_value in example_rows
                ]
                serialized_inputs = [
                    ex.SerializeToString() for ex in examples
                ]

                extracts = (
                    pipeline
                    | 'Create' >> beam.Create(serialized_inputs)
                    | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                    | 'Extract' >> tfma_unit.Extract(extractors=extractors))  # pylint: disable=no-value-for-parameter
                metrics, plots = (
                    extracts
                    | 'ComputeMetricsAndPlots' >>
                    metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                        eval_shared_model=shared_model,
                        desired_batch_size=batch_size))

                def check_result(got):
                    try:
                        self.assertEqual(3, len(got), 'got: %s' % got)
                        metrics_by_slice = {}
                        for slice_key, value in got:
                            metrics_by_slice[slice_key] = value
                        overall_slice = ()
                        first_slice = (('slice_key', b'first_slice'), )
                        second_slice = (('slice_key', b'second_slice'), )
                        self.assertItemsEqual(
                            list(metrics_by_slice.keys()),
                            [overall_slice, first_slice, second_slice])
                        # Checked in this fixed order: overall, first, second.
                        expectations = [
                            (overall_slice, {
                                'accuracy': 0.4,
                                'label/mean': 0.6,
                                'my_mean_age': 4.0,
                                'my_mean_age_times_label': 2.6,
                                'added_example_count': 5.0
                            }),
                            (first_slice, {
                                'accuracy': 1.0,
                                'label/mean': 0.5,
                                'my_mean_age': 3.0,
                                'my_mean_age_times_label': 1.5,
                                'added_example_count': 2.0
                            }),
                            (second_slice, {
                                'accuracy': 0.0,
                                'label/mean': 2.0 / 3.0,
                                'my_mean_age': 14.0 / 3.0,
                                'my_mean_age_times_label': 10.0 / 3.0,
                                'added_example_count': 3.0
                            }),
                        ]
                        for expected_key, expected_metrics in expectations:
                            self.assertDictElementsAlmostEqual(
                                metrics_by_slice[expected_key],
                                expected_metrics)

                    except AssertionError as err:
                        # This function is redefined every iteration, so it will have the
                        # right value of batch_size.
                        raise util.BeamAssertException(
                            'batch_size = %d, error: %s' % (batch_size, err))  # pylint: disable=cell-var-from-loop

                util.assert_that(metrics, check_result, label='metrics')
                util.assert_that(plots, util.is_empty(), label='plots')
    def testEvaluateWithBinaryClassificationModel(self):
        """Runs a two-class DNN classifier through extract-and-evaluate.

        Three examples weighted by their 'age' feature are evaluated on a
        single default slicing spec. The metrics output is checked for the
        example counts and the weighted mean label, and the plots output is
        checked for the bucket count of the calibration plot.
        """
        export_root = self._getExportDir()
        _, model_dir = dnn_classifier.simple_dnn_classifier(
            None, export_root, n_classes=2)

        # The 'age' feature doubles as the example weight.
        model_spec = config.ModelSpec(location=model_dir,
                                      label_key='label',
                                      example_weight_key='age')
        # Requested explicitly: mean_label and calibration_plot. The checks
        # below additionally expect example_count and weighted_example_count.
        requested_metrics = [
            calibration.MeanLabel('mean_label'),
            calibration_plot.CalibrationPlot(name='calibration_plot',
                                             num_buckets=10),
        ]
        eval_config = config.EvalConfig(
            model_specs=[model_spec],
            slicing_specs=[config.SlicingSpec()],
            metrics_specs=metric_specs.specs_from_metrics(requested_metrics))
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=model_dir, tags=[tf.saved_model.SERVING])

        single_slice_specs = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config, eval_shared_models=[shared_model]),
            slice_key_extractor.SliceKeyExtractor(
                slice_spec=single_slice_specs),
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_models=[shared_model]),
        ]

        # (age, language, label) for each input example.
        example_rows = (
            (1.0, 'english', 0.0),
            (2.0, 'chinese', 1.0),
            (3.0, 'chinese', 0.0),
        )
        serialized_examples = [
            self._makeExample(age=age, language=language,
                              label=label).SerializeToString()
            for age, language, label in example_rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            evaluations = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                try:
                    self.assertLen(got, 1)
                    slice_key, metric_values = got[0]
                    self.assertEqual(slice_key, ())
                    # Weights are the ages 1, 2, 3; labels are 0, 1, 0.
                    total_weight = 1.0 + 2.0 + 3.0
                    weighted_label_sum = 0 * 1.0 + 1 * 2.0 + 0 * 3.0
                    expected = {
                        metric_types.MetricKey(name='example_count'):
                        3,
                        metric_types.MetricKey(name='weighted_example_count'):
                        total_weight,
                        metric_types.MetricKey(name='mean_label'):
                        weighted_label_sum / total_weight,
                    }
                    self.assertDictElementsAlmostEqual(metric_values, expected)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            def check_plots(got):
                try:
                    self.assertLen(got, 1)
                    slice_key, plot_values = got[0]
                    self.assertEqual(slice_key, ())
                    calibration_key = metric_types.PlotKey('calibration_plot')
                    self.assertIn(calibration_key, plot_values)
                    # 10 buckets + 2 for edge cases
                    self.assertLen(plot_values[calibration_key].buckets, 12)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(evaluations[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
            util.assert_that(evaluations[constants.PLOTS_KEY],
                             check_plots,
                             label='plots')
示例#14
0
    def testInputExtractor(self):
        """Checks the features, labels and example weights from InputExtractor.

        Three serialized examples are created with reshuffle=False and passed
        through the extractor; each resulting extract is then compared, by
        position, with the values its example was built from.
        """
        eval_config = config.EvalConfig(model_specs=[
            config.ModelSpec(label_key='label',
                             example_weight_key='example_weight')
        ])
        extractor = input_extractor.InputExtractor(eval_config=eval_config)

        # (label, example_weight, fixed_int, fixed_float, fixed_string)
        example_rows = (
            (1.0, 0.5, 1, 1.0, 'fixed_string1'),
            (0.0, 0.0, 1, 1.0, 'fixed_string2'),
            (0.0, 1.0, 2, 0.0, 'fixed_string3'),
        )
        serialized_examples = [
            self._makeExample(label=label,
                              example_weight=example_weight,
                              fixed_int=fixed_int,
                              fixed_float=fixed_float,
                              fixed_string=fixed_string).SerializeToString()
            for (label, example_weight, fixed_int, fixed_float,
                 fixed_string) in example_rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            extracts_pcoll = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples,
                                          reshuffle=False)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | extractor.stage_name >> extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                # Expected values per extract, in input order:
                # (fixed_int, fixed_float, fixed_string, label, example_weight)
                expected_rows = (
                    (1, 1.0, b'fixed_string1', 1.0, 0.5),
                    (1, 1.0, b'fixed_string2', 0.0, 0.0),
                    (2, 0.0, b'fixed_string3', 0.0, 1.0),
                )
                try:
                    self.assertLen(got, 3)
                    for index, expected in enumerate(expected_rows):
                        (fixed_int, fixed_float, fixed_string, label,
                         example_weight) = expected
                        extracts = got[index]
                        features = extracts[constants.FEATURES_KEY]
                        self.assertDictElementsAlmostEqual(
                            features, {
                                'fixed_int': np.array([fixed_int]),
                                'fixed_float': np.array([fixed_float]),
                            })
                        self.assertEqual(features['fixed_string'],
                                         np.array([fixed_string]))
                        self.assertAlmostEqual(
                            extracts[constants.LABELS_KEY],
                            np.array([label]))
                        self.assertAlmostEqual(
                            extracts[constants.EXAMPLE_WEIGHTS_KEY],
                            np.array([example_weight]))

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(extracts_pcoll, check_result, label='result')
示例#15
0
    def testInputExtractorMultiModel(self):
        """Checks InputExtractor output when two model specs are configured.

        'model1' uses single label / example-weight / prediction keys, while
        'model2' maps two outputs ('output1', 'output2') to their own keys.
        For both input examples the extracts must hold, per model name, the
        labels, example weights and predictions taken from those features.
        """
        single_output_spec = config.ModelSpec(
            name='model1',
            label_key='label',
            example_weight_key='example_weight',
            prediction_key='fixed_float')
        multi_output_spec = config.ModelSpec(
            name='model2',
            label_keys={
                'output1': 'label1',
                'output2': 'label2'
            },
            example_weight_keys={
                'output1': 'example_weight1',
                'output2': 'example_weight2'
            },
            prediction_keys={
                'output1': 'fixed_float',
                'output2': 'fixed_float'
            })
        eval_config = config.EvalConfig(
            model_specs=[single_output_spec, multi_output_spec])
        extractor = input_extractor.InputExtractor(eval_config=eval_config)

        example_features = [
            dict(label=1.0,
                 label1=1.0,
                 label2=0.0,
                 example_weight=0.5,
                 example_weight1=0.5,
                 example_weight2=0.5,
                 fixed_int=1,
                 fixed_float=1.0,
                 fixed_string='fixed_string1'),
            dict(label=1.0,
                 label1=1.0,
                 label2=1.0,
                 example_weight=0.0,
                 example_weight1=0.0,
                 example_weight2=1.0,
                 fixed_int=1,
                 fixed_float=2.0,
                 fixed_string='fixed_string2'),
        ]
        serialized_examples = [
            self._makeExample(**features).SerializeToString()
            for features in example_features
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            extracts_pcoll = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples,
                                          reshuffle=False)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | extractor.stage_name >> extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                # Extract keys that are expected to be keyed by model name.
                per_model_keys = (
                    constants.LABELS_KEY,
                    constants.EXAMPLE_WEIGHTS_KEY,
                    constants.PREDICTIONS_KEY,
                )
                # One row per extract, in input order. Each triple holds
                # (model1 value, model2 output1 value, model2 output2 value):
                # (fixed_string, labels, example weights, predictions)
                expected_rows = (
                    (b'fixed_string1', (1.0, 1.0, 0.0), (0.5, 0.5, 0.5),
                     (1.0, 1.0, 1.0)),
                    (b'fixed_string2', (1.0, 1.0, 1.0), (0.0, 0.0, 1.0),
                     (2.0, 2.0, 2.0)),
                )
                try:
                    self.assertLen(got, 2)
                    for index, expected in enumerate(expected_rows):
                        fixed_string, labels, weights, predictions = expected
                        extracts = got[index]
                        features = extracts[constants.FEATURES_KEY]
                        self.assertDictElementsAlmostEqual(
                            features, {
                                'fixed_int': np.array([1]),
                            })
                        self.assertEqual(features['fixed_string'],
                                         np.array([fixed_string]))
                        for model_name in ('model1', 'model2'):
                            for extracts_key in per_model_keys:
                                self.assertIn(model_name,
                                              extracts[extracts_key])
                        expected_triples = (labels, weights, predictions)
                        for extracts_key, triple in zip(
                                per_model_keys, expected_triples):
                            model1_value, output1_value, output2_value = triple
                            self.assertAlmostEqual(
                                extracts[extracts_key]['model1'],
                                np.array([model1_value]))
                            self.assertDictElementsAlmostEqual(
                                extracts[extracts_key]['model2'], {
                                    'output1': np.array([output1_value]),
                                    'output2': np.array([output2_value])
                                })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(extracts_pcoll, check_result, label='result')
    def testEvaluateWithMultiClassModel(self):
        """Evaluates a three-class DNN classifier with per-class mean_label.

        mean_label is binarized over class ids 0..2 and the 'age' feature is
        used as the example weight. For the single expected slice, the test
        checks the example counts and the weighted mean label of each class.
        """
        n_classes = 3
        export_root = self._getExportDir()
        _, model_dir = dnn_classifier.simple_dnn_classifier(
            None, export_root, n_classes=n_classes)

        model_spec = config.ModelSpec(location=model_dir,
                                      label_key='label',
                                      example_weight_key='age')
        # Only mean_label is requested explicitly; the checks below also
        # expect example_count and weighted_example_count in the output.
        binarized_metrics_specs = metric_specs.specs_from_metrics(
            [calibration.MeanLabel('mean_label')],
            binarize=config.BinarizationOptions(class_ids=range(n_classes)))
        eval_config = config.EvalConfig(
            model_specs=[model_spec],
            slicing_specs=[config.SlicingSpec()],
            metrics_specs=binarized_metrics_specs)
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=model_dir, tags=[tf.saved_model.SERVING])

        single_slice_specs = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config, eval_shared_models=[shared_model]),
            slice_key_extractor.SliceKeyExtractor(
                slice_spec=single_slice_specs),
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_models=[shared_model]),
        ]

        # (age, language, label) for each input example.
        example_rows = (
            (1.0, 'english', 0),
            (2.0, 'chinese', 1),
            (3.0, 'english', 2),
            (4.0, 'chinese', 1),
        )
        serialized_examples = [
            self._makeExample(age=age, language=language,
                              label=label).SerializeToString()
            for age, language, label in example_rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            evaluations = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                try:
                    self.assertLen(got, 1)
                    slice_key, metric_values = got[0]
                    class_0_key, class_1_key, class_2_key = [
                        metric_types.MetricKey(
                            name='mean_label',
                            sub_key=metric_types.SubKey(class_id=class_id))
                        for class_id in (0, 1, 2)
                    ]
                    self.assertEqual(slice_key, ())
                    # Weights are the ages 1..4; each numerator sums the
                    # weights of the examples labelled with that class.
                    total_weight = 1.0 + 2.0 + 3.0 + 4.0
                    expected = {
                        metric_types.MetricKey(name='example_count'):
                        4,
                        metric_types.MetricKey(name='weighted_example_count'):
                        total_weight,
                        class_0_key:
                        (1 * 1.0 + 0 * 2.0 + 0 * 3.0 + 0 * 4.0) /
                        total_weight,
                        class_1_key:
                        (0 * 1.0 + 1 * 2.0 + 0 * 3.0 + 1 * 4.0) /
                        total_weight,
                        class_2_key:
                        (0 * 1.0 + 0 * 2.0 + 1 * 3.0 + 0 * 4.0) /
                        total_weight,
                    }
                    self.assertDictElementsAlmostEqual(metric_values, expected)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(evaluations[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
    def testEvaluateWithSlicing(self):
        """Evaluates a fixed-prediction model overall and per 'fixed_string'.

        Three examples are evaluated with 'fixed_float' as the example weight.
        The metrics output must contain exactly three slices (overall,
        fixed_string1, fixed_string2), each with the expected example counts
        and weighted mean label / mean prediction.
        """
        temp_export_dir = self._getExportDir()
        _, export_dir = (fixed_prediction_estimator_extra_fields.
                         simple_fixed_prediction_estimator_extra_fields(
                             None, temp_export_dir))
        eval_config = config.EvalConfig(
            model_specs=[
                config.ModelSpec(location=export_dir,
                                 label_key='label',
                                 example_weight_key='fixed_float')
            ],
            slicing_specs=[
                config.SlicingSpec(),
                config.SlicingSpec(feature_keys=['fixed_string']),
            ],
            metrics_specs=metric_specs.specs_from_metrics([
                calibration.MeanLabel('mean_label'),
                calibration.MeanPrediction('mean_prediction')
            ]))
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir)
        slice_spec = [
            slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
        ]
        extractors = [
            predict_extractor.PredictExtractor(
                eval_shared_model=eval_shared_model),
            slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model])
        ]

        # fixed_float used as example_weight key
        examples = [
            self._makeExample(prediction=0.2,
                              label=1.0,
                              fixed_int=1,
                              fixed_float=1.0,
                              fixed_string='fixed_string1'),
            self._makeExample(prediction=0.8,
                              label=0.0,
                              fixed_int=1,
                              fixed_float=1.0,
                              fixed_string='fixed_string1'),
            self._makeExample(prediction=0.5,
                              label=0.0,
                              fixed_int=2,
                              fixed_float=2.0,
                              fixed_string='fixed_string2')
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            metrics = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples])
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                try:
                    self.assertLen(got, 3)
                    # Index the (slice_key, metrics) pairs by slice key.
                    slices = {}
                    for slice_key, value in got:
                        slices[slice_key] = value
                    overall_slice = ()
                    fixed_string1_slice = (('fixed_string',
                                            b'fixed_string1'), )
                    fixed_string2_slice = (('fixed_string',
                                            b'fixed_string2'), )
                    self.assertCountEqual(list(slices.keys()), [
                        overall_slice, fixed_string1_slice, fixed_string2_slice
                    ])
                    example_count_key = metric_types.MetricKey(
                        name='example_count')
                    weighted_example_count_key = metric_types.MetricKey(
                        name='weighted_example_count')
                    label_key = metric_types.MetricKey(name='mean_label')
                    pred_key = metric_types.MetricKey(name='mean_prediction')
                    # Expected means are weighted by fixed_float (1, 1, 2).
                    self.assertDictElementsAlmostEqual(
                        slices[overall_slice], {
                            example_count_key: 3,
                            weighted_example_count_key: 4.0,
                            label_key:
                            (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                            pred_key:
                            (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
                        })
                    self.assertDictElementsAlmostEqual(
                        slices[fixed_string1_slice], {
                            example_count_key: 2,
                            weighted_example_count_key: 2.0,
                            label_key: (1.0 + 0.0) / (1.0 + 1.0),
                            pred_key: (0.2 + 0.8) / (1.0 + 1.0),
                        })
                    self.assertDictElementsAlmostEqual(
                        slices[fixed_string2_slice], {
                            example_count_key: 1,
                            weighted_example_count_key: 2.0,
                            label_key: (2 * 0.0) / 2.0,
                            pred_key: (2 * 0.5) / 2.0,
                        })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            # Must be registered at pipeline scope, not inside check_metrics;
            # otherwise the checker is never attached and nothing is verified.
            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
    def testEvaluateWithEvalSavedModel(self):
        """Evaluates a linear classifier overall and per 'slice_key' value.

        Five examples spread over two 'slice_key' values are evaluated with
        the 'eval' signature and an added example-count metric callback. The
        metrics output must contain the overall slice and one slice per
        'slice_key' value, each with the expected metric values.
        """
        export_root = self._getExportDir()
        _, model_dir = linear_classifier.simple_linear_classifier(
            None, export_root)
        slicing_specs = [
            config.SlicingSpec(),
            config.SlicingSpec(feature_keys=['slice_key']),
        ]
        eval_config = config.EvalConfig(
            model_specs=[config.ModelSpec(signature_name='eval')],
            slicing_specs=slicing_specs)
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=model_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        single_slice_specs = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        extractors = [
            predict_extractor.PredictExtractor(shared_model),
            slice_key_extractor.SliceKeyExtractor(
                slice_spec=single_slice_specs),
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_model=shared_model),
        ]

        # (age, language, label, slice_key) for each input example.
        example_rows = (
            (3.0, 'english', 1.0, 'first_slice'),
            (3.0, 'chinese', 0.0, 'first_slice'),
            (4.0, 'english', 0.0, 'second_slice'),
            (5.0, 'chinese', 1.0, 'second_slice'),
            (5.0, 'chinese', 1.0, 'second_slice'),
        )
        serialized_examples = [
            self._makeExample(age=age,
                              language=language,
                              label=label,
                              slice_key=slice_value).SerializeToString()
            for age, language, label, slice_value in example_rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            evaluations = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                try:
                    self.assertLen(got, 3)
                    metrics_by_slice = {key: value for key, value in got}
                    overall_slice = ()
                    first_slice = (('slice_key', b'first_slice'), )
                    second_slice = (('slice_key', b'second_slice'), )
                    self.assertCountEqual(
                        list(metrics_by_slice.keys()),
                        [overall_slice, first_slice, second_slice])
                    # Expected values by metric name; the names are wrapped
                    # in MetricKey objects just before each comparison.
                    expected_by_slice = (
                        (overall_slice, {
                            'accuracy': 0.4,
                            'label/mean': 0.6,
                            'my_mean_age': 4.0,
                            'my_mean_age_times_label': 2.6,
                            'added_example_count': 5.0
                        }),
                        (first_slice, {
                            'accuracy': 1.0,
                            'label/mean': 0.5,
                            'my_mean_age': 3.0,
                            'my_mean_age_times_label': 1.5,
                            'added_example_count': 2.0
                        }),
                        (second_slice, {
                            'accuracy': 0.0,
                            'label/mean': 2.0 / 3.0,
                            'my_mean_age': 14.0 / 3.0,
                            'my_mean_age_times_label': 10.0 / 3.0,
                            'added_example_count': 3.0
                        }),
                    )
                    for expected_slice, expected_values in expected_by_slice:
                        self.assertDictElementsAlmostEqual(
                            metrics_by_slice[expected_slice], {
                                metric_types.MetricKey(name=metric_name):
                                metric_value
                                for metric_name, metric_value in
                                expected_values.items()
                            })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(evaluations[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
    def testModelAgnosticConstructFn(self):
        """Runs the model-agnostic flow end to end and checks per-slice metrics.

        Serialized tf.Examples go through the ModelAgnosticExtractor and a
        SliceKeyExtractor (the overall slice plus one slice per 'language'
        value). ComputeMetricsAndPlots is then run with add_mean_callback
        registered, and the resulting metrics are verified for every slice.
        """
        # End to end test for the entire flow going from tf.Examples -> metrics
        # with slicing.
        with beam.Pipeline() as pipeline:
            # Set up the inputs. All we need are tf.Examples and an example
            # parsing spec with an explicit mapping from keys to (Features,
            # Predictions, Labels).
            examples = [
                self._makeExample(age=3.0,
                                  language='english',
                                  probabilities=1.0,
                                  labels=1.0),
                self._makeExample(age=3.0,
                                  language='chinese',
                                  probabilities=3.0,
                                  labels=0.0),
                self._makeExample(age=4.0,
                                  language='english',
                                  probabilities=2.0,
                                  labels=1.0),
                self._makeExample(age=5.0,
                                  language='chinese',
                                  probabilities=3.0,
                                  labels=0.0),
                # Add some examples with no language.
                self._makeExample(age=5.0, probabilities=2.0, labels=10.0),
                self._makeExample(age=6.0, probabilities=1.0, labels=0.0)
            ]
            serialized_examples = [e.SerializeToString() for e in examples]

            # Set up a config to bucket our example keys.
            feature_map = {
                'age': tf.FixedLenFeature([], tf.float32),
                'language': tf.VarLenFeature(tf.string),
                'probabilities': tf.FixedLenFeature([], tf.float32),
                'labels': tf.FixedLenFeature([], tf.float32)
            }

            model_agnostic_config = agnostic_predict.ModelAgnosticConfig(
                label_keys=['labels'],
                prediction_keys=['probabilities'],
                feature_spec=feature_map)

            # Set up the Model Agnostic Extractor
            extractors = [
                model_agnostic_extractor.ModelAgnosticExtractor(
                    model_agnostic_config=model_agnostic_config,
                    desired_batch_size=3),
                slice_key_extractor.SliceKeyExtractor([
                    slicer.SingleSliceSpec(),
                    slicer.SingleSliceSpec(columns=['language'])
                ])
            ]

            # Set up the metrics we wish to calculate via a metric callback.
            # Judging by the expectations below, the callback reports a mean
            # and a total over labels and predictions together.
            fpl_feed_config = (
                model_agnostic_extractor.ModelAgnosticGetFPLFeedConfig(
                    model_agnostic_config))
            eval_shared_model = types.EvalSharedModel(
                add_metrics_callbacks=[add_mean_callback],
                construct_fn=model_agnostic_evaluate_graph.make_construct_fn(
                    add_metrics_callbacks=[add_mean_callback],
                    fpl_feed_config=fpl_feed_config))

            # Run our pipeline doing Extract -> Slice -> Fanout -> Calculate Metrics.
            metrics, _ = (
                pipeline
                | 'Create Examples' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                | 'ComputeMetricsAndPlots' >>
                metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                    eval_shared_model=eval_shared_model))

            # Verify our metrics are properly generated per slice.
            def check_result(got):
                # Re-raise assertion failures as BeamAssertException, as the
                # other check_* matchers in this file do.
                try:
                    self.assertEqual(3, len(got), 'got: %s' % got)
                    # `value` (rather than `metrics`) avoids shadowing the
                    # PCollection defined in the enclosing scope.
                    slices = {}
                    for slice_key, value in got:
                        slices[slice_key] = value
                    overall_slice = ()
                    english_slice = (('language', b'english'), )
                    chinese_slice = (('language', b'chinese'), )

                    # assertCountEqual is the supported spelling of the
                    # deprecated assertItemsEqual.
                    self.assertCountEqual(
                        list(slices.keys()),
                        [overall_slice, english_slice, chinese_slice])
                    # Overall slice has label/predictions sum = 24 and 12
                    # elements.
                    self.assertDictElementsAlmostEqual(
                        slices[overall_slice], {
                            'tf_metric_mean': 2.0,
                            'py_func_total_label': 24.0,
                        })
                    # English slice has label/predictions sum = 5 and 4
                    # elements.
                    self.assertDictElementsAlmostEqual(
                        slices[english_slice], {
                            'tf_metric_mean': 1.25,
                            'py_func_total_label': 5.0,
                        })
                    # Chinese slice has label/predictions sum = 6 and 4
                    # elements.
                    self.assertDictElementsAlmostEqual(
                        slices[chinese_slice], {
                            'tf_metric_mean': 1.5,
                            'py_func_total_label': 6.0,
                        })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result)
    def testEvaluateWithMultiOutputModel(self):
        """Evaluates a multi-head model and verifies the per-output metrics.

        Each of the three heads gets its own label key, every head uses the
        'age' feature as its example weight, and a MeanLabel metric named
        'mean_label' is requested per head over the single overall slice.
        """
        export_root = self._getExportDir()
        _, export_dir = multi_head.simple_multi_head(None, export_root)

        head_names = ('chinese_head', 'english_head', 'other_head')
        model_spec = config.ModelSpec(
            location=export_dir,
            label_keys={
                'chinese_head': 'chinese_label',
                'english_head': 'english_label',
                'other_head': 'other_label'
            },
            # Every head is weighted by the same 'age' feature.
            example_weight_keys={head: 'age' for head in head_names})
        eval_config = config.EvalConfig(
            model_specs=[model_spec],
            slicing_specs=[config.SlicingSpec()],
            metrics_specs=metric_specs.specs_from_metrics({
                head: [calibration.MeanLabel('mean_label')]
                for head in head_names
            }))
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        slice_spec = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model]),
            slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model])
        ]

        # Rows are (age, language, english_label, chinese_label, other_label).
        example_rows = [
            (1.0, 'english', 1.0, 0.0, 0.0),
            (1.0, 'chinese', 0.0, 1.0, 0.0),
            (2.0, 'english', 1.0, 0.0, 0.0),
            (2.0, 'other', 0.0, 1.0, 1.0),
        ]
        serialized_examples = [
            self._makeExample(age=age,
                              language=language,
                              english_label=english_label,
                              chinese_label=chinese_label,
                              other_label=other_label).SerializeToString()
            for (age, language, english_label, chinese_label,
                 other_label) in example_rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            metrics = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())

                    def head_key(metric_name, head):
                        # Metric key scoped to one model output (head).
                        return metric_types.MetricKey(name=metric_name,
                                                      output_name=head)

                    # Sum of the 'age' weights over the four examples.
                    total_weight = 1.0 + 1.0 + 2.0 + 2.0
                    # Mean labels are age-weighted: sum(age * label) / sum(age).
                    expected_metrics = {
                        metric_types.MetricKey(name='example_count'):
                        4,
                        head_key('mean_label', 'chinese_head'):
                        (0.0 + 1.0 + 2 * 0.0 + 2 * 1.0) / total_weight,
                        head_key('weighted_example_count', 'chinese_head'):
                        total_weight,
                        head_key('mean_label', 'english_head'):
                        (1.0 + 0.0 + 2 * 1.0 + 2 * 0.0) / total_weight,
                        head_key('weighted_example_count', 'english_head'):
                        total_weight,
                        head_key('mean_label', 'other_head'):
                        (0.0 + 0.0 + 2 * 0.0 + 2 * 1.0) / total_weight,
                        head_key('weighted_example_count', 'other_head'):
                        total_weight,
                    }
                    self.assertDictElementsAlmostEqual(got_metrics,
                                                       expected_metrics)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
    def testEvaluateWithKerasModel(self):
        """Evaluates a small two-input Keras model saved in the 'tf' format.

        The model is compiled, fit for a single step, saved, and then run
        through the extract/evaluate pipeline. The metrics verified here
        (example counts and the weighted mean label) are computed from the
        labels and example weights only, so the expected values do not depend
        on what the model learned.
        """
        input1 = tf.keras.layers.Input(shape=(1, ), name='input1')
        input2 = tf.keras.layers.Input(shape=(1, ), name='input2')
        inputs = [input1, input2]
        input_layer = tf.keras.layers.concatenate(inputs)
        output_layer = tf.keras.layers.Dense(1,
                                             activation=tf.nn.sigmoid,
                                             name='output')(input_layer)
        model = tf.keras.models.Model(inputs, output_layer)
        # `learning_rate` is the supported argument name; `lr` is a
        # deprecated alias.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
                      loss=tf.keras.losses.binary_crossentropy,
                      metrics=['accuracy'])

        features = {'input1': [[0.0], [1.0]], 'input2': [[1.0], [0.0]]}
        labels = [[1], [0]]
        example_weights = [1.0, 0.5]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
        model.fit(dataset, steps_per_epoch=1)

        export_dir = self._getExportDir()
        model.save(export_dir, save_format='tf')

        eval_config = config.EvalConfig(
            model_specs=[
                config.ModelSpec(location=export_dir,
                                 label_key='label',
                                 example_weight_key='example_weight')
            ],
            slicing_specs=[config.SlicingSpec()],
            metrics_specs=metric_specs.specs_from_metrics(
                [calibration.MeanLabel('mean_label')]))
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        slice_spec = [
            slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model]),
            slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model])
        ]

        # 'extra_feature' is not one of the model's inputs; the examples carry
        # it in addition to the inputs, the label and the example weight.
        examples = [
            self._makeExample(input1=0.0,
                              input2=1.0,
                              label=1.0,
                              example_weight=1.0,
                              extra_feature='non_model_feature'),
            self._makeExample(input1=1.0,
                              input2=0.0,
                              label=0.0,
                              example_weight=0.5,
                              extra_feature='non_model_feature'),
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            metrics = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples])
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                try:
                    # Only the overall (empty) slice is configured.
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    example_count_key = metric_types.MetricKey(
                        name='example_count')
                    weighted_example_count_key = metric_types.MetricKey(
                        name='weighted_example_count')
                    label_key = metric_types.MetricKey(name='mean_label')
                    self.assertDictElementsAlmostEqual(
                        got_metrics, {
                            example_count_key: 2,
                            weighted_example_count_key: (1.0 + 0.5),
                            # sum(weight * label) / sum(weight)
                            label_key: (1.0 * 1.0 + 0.0 * 0.5) / (1.0 + 0.5),
                        })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
    def testEvaluateWithQueryBasedMetrics(self):
        """Evaluates NDCG@1 and NDCG@2 grouped by query and checks every slice.

        'fixed_string' is both the query key and a slicing feature,
        'fixed_float' is the NDCG gain and 'fixed_int' is the example weight.
        Metrics are verified for the overall slice and for each query slice.
        """
        export_root = self._getExportDir()
        estimator_lib = fixed_prediction_estimator_extra_fields
        _, export_dir = (
            estimator_lib.simple_fixed_prediction_estimator_extra_fields(
                None, export_root))

        ndcg_specs = metric_specs.specs_from_metrics(
            [ndcg.NDCG(gain_key='fixed_float', name='ndcg')],
            binarize=config.BinarizationOptions(top_k_list=[1, 2]),
            query_key='fixed_string')
        eval_config = config.EvalConfig(
            model_specs=[
                config.ModelSpec(location=export_dir,
                                 label_key='label',
                                 example_weight_key='fixed_int')
            ],
            slicing_specs=[
                config.SlicingSpec(),
                config.SlicingSpec(feature_keys=['fixed_string']),
            ],
            metrics_specs=ndcg_specs)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
        slice_spec = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model]),
            slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model])
        ]

        # Rows are (prediction, label, fixed_float, fixed_string, fixed_int):
        #   fixed_string is the query_key,
        #   fixed_float is the gain_key for NDCG,
        #   fixed_int is the example_weight_key for NDCG.
        example_rows = [
            (0.2, 1.0, 1.0, 'query1', 1),
            (0.8, 0.0, 0.5, 'query1', 1),
            (0.5, 0.0, 0.5, 'query2', 2),
            (0.9, 1.0, 1.0, 'query2', 2),
            (0.1, 0.0, 0.1, 'query2', 2),
            (0.9, 1.0, 1.0, 'query3', 3),
        ]
        serialized_examples = [
            self._makeExample(prediction=prediction,
                              label=label,
                              fixed_float=gain,
                              fixed_string=query,
                              fixed_int=weight).SerializeToString()
            for prediction, label, gain, query, weight in example_rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            metrics = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                try:
                    self.assertLen(got, 4)
                    slices = {slice_key: value for slice_key, value in got}
                    overall_slice = ()
                    query1_slice = (('fixed_string', b'query1'), )
                    query2_slice = (('fixed_string', b'query2'), )
                    query3_slice = (('fixed_string', b'query3'), )
                    self.assertCountEqual(list(slices.keys()), [
                        overall_slice, query1_slice, query2_slice, query3_slice
                    ])
                    example_count_key = metric_types.MetricKey(
                        name='example_count')
                    weighted_example_count_key = metric_types.MetricKey(
                        name='weighted_example_count')
                    ndcg1_key = metric_types.MetricKey(
                        name='ndcg', sub_key=metric_types.SubKey(top_k=1))
                    ndcg2_key = metric_types.MetricKey(
                        name='ndcg', sub_key=metric_types.SubKey(top_k=2))
                    # Examples per query, ordered by descending prediction:
                    #   Query1 (weight=1): (p=0.8, g=0.5) (p=0.2, g=1.0)
                    #   Query2 (weight=2): (p=0.9, g=1.0) (p=0.5, g=0.5)
                    #                      (p=0.1, g=0.1)
                    #   Query3 (weight=3): (p=0.9, g=1.0)
                    #
                    # DCG@1:  0.5, 1.0, 1.0
                    # NDCG@1: 0.5, 1.0, 1.0
                    # Average NDCG@1:
                    #   (1 * 0.5 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.92
                    #
                    # DCG@2 (log is base 2, so 1.0/log(3) ~ 0.630930):
                    #   Query1: 0.5 + 1.0/log(3) ~ 1.130930
                    #   Query2: 1.0 + 0.5/log(3) ~ 1.315465
                    #   Query3: 1.0
                    # NDCG@2:
                    #   Query1: (0.5 + 1.0/log(3)) / (1.0 + 0.5/log(3))
                    #           ~ 0.85972
                    #   Query2: (1.0 + 0.5/log(3)) / (1.0 + 0.5/log(3)) = 1.0
                    #   Query3: 1.0
                    # Average NDCG@2:
                    #   (1 * 0.860 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.97
                    expected_by_slice = [
                        (overall_slice, {
                            example_count_key: 6,
                            weighted_example_count_key: 11.0,
                            ndcg1_key: 0.9166667,
                            ndcg2_key: 0.9766198
                        }),
                        (query1_slice, {
                            example_count_key: 2,
                            weighted_example_count_key: 2.0,
                            ndcg1_key: 0.5,
                            ndcg2_key: 0.85972
                        }),
                        (query2_slice, {
                            example_count_key: 3,
                            weighted_example_count_key: 6.0,
                            ndcg1_key: 1.0,
                            ndcg2_key: 1.0
                        }),
                        (query3_slice, {
                            example_count_key: 1,
                            weighted_example_count_key: 3.0,
                            ndcg1_key: 1.0,
                            ndcg2_key: 1.0
                        }),
                    ]
                    for expected_slice, expected_metrics in expected_by_slice:
                        self.assertDictElementsAlmostEqual(
                            slices[expected_slice], expected_metrics)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
    def testEvaluateWithPlots(self):
        """Computes metrics and plots for a fixed-prediction model.

        Four examples are evaluated with the example_count and auc_plots
        post-export metrics. The metrics output must report four examples
        and the plots output must contain the expected AUC matrix row, both
        under the overall (empty) slice.
        """
        export_root = self._getEvalExportDir()
        _, eval_export_dir = (
            fixed_prediction_estimator.simple_fixed_prediction_estimator(
                None, export_root))
        metric_callbacks = [
            post_export_metrics.example_count(),
            post_export_metrics.auc_plots()
        ]
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=metric_callbacks)
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        # (prediction, label) for each example.
        prediction_label_pairs = [(0.0, 1.0), (0.7, 0.0), (0.8, 1.0),
                                  (1.0, 1.0)]

        with beam.Pipeline() as pipeline:
            serialized_examples = [
                self._makeExample(prediction=prediction,
                                  label=label).SerializeToString()
                for prediction, label in prediction_label_pairs
            ]

            metrics, plots = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                | 'ComputeMetricsAndPlots' >>
                metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                    eval_shared_model=eval_shared_model))

            def check_metrics(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    slice_key, value = got[0]
                    self.assertEqual((), slice_key)
                    expected_metrics = {metric_keys.EXAMPLE_COUNT: 4.0}
                    self.assertDictElementsAlmostEqual(
                        got_values_dict=value,
                        expected_values_dict=expected_metrics)
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_metrics, label='metrics')

            def check_plots(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    slice_key, value = got[0]
                    self.assertEqual((), slice_key)
                    # Single (row index, expected row) pair to compare.
                    # NOTE(review): the row presumably holds
                    # [fn, tn, fp, tp, precision, recall] -- confirm against
                    # the auc_plots implementation.
                    expected_row = [2, 1, 0, 1, 1.0 / 1.0, 1.0 / 3.0]
                    expected_plots = {
                        _full_key(metric_keys.AUC_PLOTS_MATRICES):
                        [(8001, expected_row)],
                    }
                    self.assertDictMatrixRowsAlmostEqual(
                        got_values_dict=value,
                        expected_values_dict=expected_plots)
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(plots, check_plots, label='plots')
  def testPredictExtractorWithMultiModels(self):
    """Runs the PredictExtractor with two named multi-head models.

    The predictions themselves are not checked. The test only verifies that
    every extract carries, for both 'model1' and 'model2', the expected
    '<head>/<prediction key>' entries.
    """
    export_root = self._getExportDir()
    export_dir1, _ = multi_head.simple_multi_head(export_root, None)
    export_dir2, _ = multi_head.simple_multi_head(export_root, None)

    model_names = ('model1', 'model2')
    eval_config = config.EvalConfig(model_specs=[
        config.ModelSpec(name='model1'),
        config.ModelSpec(name='model2')
    ])
    shared_models = {
        'model1':
            self.createTestEvalSharedModel(
                eval_saved_model_path=export_dir1,
                tags=[tf.saved_model.SERVING]),
        'model2':
            self.createTestEvalSharedModel(
                eval_saved_model_path=export_dir2,
                tags=[tf.saved_model.SERVING]),
    }
    predict_extractor = predict_extractor_v2.PredictExtractor(
        eval_config=eval_config, eval_shared_model=shared_models)

    # Rows are (age, language, english_label, chinese_label, other_label).
    example_rows = [
        (1.0, 'english', 1.0, 0.0, 0.0),
        (1.0, 'chinese', 0.0, 1.0, 0.0),
        (2.0, 'english', 1.0, 0.0, 0.0),
        (2.0, 'other', 0.0, 1.0, 1.0),
    ]
    serialized_examples = [
        self._makeExample(
            age=age,
            language=language,
            english_label=english_label,
            chinese_label=chinese_label,
            other_label=other_label).SerializeToString()
        for (age, language, english_label, chinese_label,
             other_label) in example_rows
    ]

    # Every '<head>/<prediction key>' entry expected for each model.
    expected_prediction_keys = [
        output_name + '/' + pred_key
        for output_name in ('chinese_head', 'english_head', 'other_head')
        for pred_key in ('logistic', 'probabilities', 'all_classes')
    ]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create(serialized_examples, reshuffle=False)
          | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
          | predict_extractor.stage_name >> predict_extractor.ptransform)

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 4)
          for item in got:
            # We can't verify the actual predictions, but we can verify the
            # keys.
            self.assertIn(constants.PREDICTIONS_KEY, item)
            for model_name in model_names:
              self.assertIn(model_name, item[constants.PREDICTIONS_KEY])
              for prediction_key in expected_prediction_keys:
                self.assertIn(prediction_key,
                              item[constants.PREDICTIONS_KEY][model_name])

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
示例#25
0
    def assertGeneralMetricsComputedWithBeamAre(
            self, eval_saved_model_path: Text,
            examples_pcollection: beam.pvalue.PCollection,
            slice_spec: List[slicer.SingleSliceSpec],
            add_metrics_callbacks: List[types.AddMetricsCallbackType],
            expected_slice_metrics: Dict[Any, Dict[Text, Any]]):
        """Checks metrics computed using Beam.

        A more general version of assertMetricsComputedWithBeamAre. Note that
        the caller is responsible for setting up and running the Beam
        pipeline.

        Example usage:
          def add_metrics(features, predictions, labels):
            metric_ops = {
              'mse': tf.metrics.mean_squared_error(labels, predictions['logits']),
              'mae': tf.metrics.mean_absolute_error(labels, predictions['logits']),
            }
            return metric_ops

          with beam.Pipeline() as pipeline:
            expected_slice_metrics = {
                (): {
                  'mae': 0.1,
                  'mse': 0.2,
                  tfma.post_export_metrics.metric_keys.AUC:
                    tfma.test.BoundedValue(lower_bound=0.5)
                },
                (('age', 10),): {
                  'mae': 0.2,
                  'mse': 0.3,
                  tfma.post_export_metrics.metric_keys.AUC:
                    tfma.test.BoundedValue(lower_bound=0.5)
                },
            }
            examples = pipeline | 'ReadExamples' >> beam.io.ReadFromTFRecord(path)
            self.assertGeneralMetricsComputedWithBeamAre(
              eval_saved_model_path=path,
              examples_pcollection=examples,
              slice_spec=[tfma.slicer.SingleSliceSpec(),
                          tfma.slicer.SingleSliceSpec(columns=['age'])],
              add_metrics_callbacks=[
                add_metrics, tfma.post_export_metrics.auc()],
              expected_slice_metrics=expected_slice_metrics)

        Args:
          eval_saved_model_path: Path to the directory containing the
            EvalSavedModel.
          examples_pcollection: A PCollection of serialized example bytes.
          slice_spec: List of slice specifications.
          add_metrics_callbacks: Callbacks for adding additional metrics.
          expected_slice_metrics: Dictionary of dictionaries describing the
            expected metrics for each slice. The outer dictionary maps slice
            keys to the expected metrics for that slice.
        """
        def check_metrics(got):
            """Check metrics callback."""
            try:
                slices = {slice_key: value for slice_key, value in got}
                # assertCountEqual is the supported spelling of the
                # deprecated assertItemsEqual, which Python 3's unittest does
                # not provide.
                self.assertCountEqual(list(slices.keys()),
                                      list(expected_slice_metrics.keys()))
                for slice_key, expected_metrics in (
                        expected_slice_metrics.items()):
                    self.assertDictElementsWithinBounds(
                        got_values_dict=slices[slice_key],
                        expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise beam_util.BeamAssertException(err)

        # An empty or missing slice_spec leaves slicing_specs unset in the
        # EvalConfig.
        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        eval_config = config.EvalConfig(slicing_specs=slicing_specs)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_model=eval_shared_model)

        # Only the first element of the nested result tuple (the metrics) is
        # checked; the other outputs are discarded.
        # pylint: disable=no-value-for-parameter
        (metrics,
         _), _ = (examples_pcollection
                  | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                  | 'Extract' >> Extract(extractors=extractors)
                  | 'ComputeMetricsAndPlots' >>
                  legacy_metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                      eval_shared_model=eval_shared_model))
        # pylint: enable=no-value-for-parameter

        beam_util.assert_that(metrics, check_metrics)
    def testEvaluateWithConfidenceIntervals(self):
        # NOTE: This test only checks that the proto output produced with
        #   confidence intervals enabled is well formed; it does not check that
        #   the intervals are accurate. A confidence interval implementation
        #   that merely computed the unsampled value would still pass.
        export_root = self._getExportDir()
        estimator_module = fixed_prediction_estimator_extra_fields
        _, export_dir = (
            estimator_module.simple_fixed_prediction_estimator_extra_fields(
                None, export_root))

        # Evaluation config: confidence intervals on, overall slice plus one
        # slice per 'fixed_string' value, and two calibration metrics.
        ci_options = config.Options()
        ci_options.compute_confidence_intervals.value = True
        model_spec = config.ModelSpec(label_key='label',
                                      example_weight_key='fixed_float')
        slicing_specs = [
            config.SlicingSpec(),
            config.SlicingSpec(feature_keys=['fixed_string']),
        ]
        metrics_specs = metric_specs.specs_from_metrics([
            calibration.MeanLabel('mean_label'),
            calibration.MeanPrediction('mean_prediction')
        ])
        eval_config = config.EvalConfig(model_specs=[model_spec],
                                        slicing_specs=slicing_specs,
                                        metrics_specs=metrics_specs,
                                        options=ci_options)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        single_slice_specs = [
            slicer.SingleSliceSpec(spec=proto_spec)
            for proto_spec in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config, eval_shared_model=eval_shared_model),
            slice_key_extractor.SliceKeyExtractor(
                slice_spec=single_slice_specs)
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_model=eval_shared_model)
        ]

        # One row per example; fixed_float doubles as the example weight
        # (see example_weight_key in the model spec above).
        # Columns: prediction, label, fixed_int, fixed_float, fixed_string.
        example_rows = [
            (0.2, 1.0, 1, 1.0, 'fixed_string1'),
            (0.8, 0.0, 1, 1.0, 'fixed_string1'),
            (0.5, 0.0, 2, 2.0, 'fixed_string2'),
        ]
        examples = [
            self._makeExample(prediction=prediction,
                              label=label,
                              fixed_int=fixed_int,
                              fixed_float=fixed_float,
                              fixed_string=fixed_string)
            for prediction, label, fixed_int, fixed_float, fixed_string in
            example_rows
        ]

        with beam.Pipeline() as pipeline:
            serialized_examples = [
                example.SerializeToString() for example in examples
            ]
            # pylint: disable=no-value-for-parameter
            metrics = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                # Assertion failures are re-raised as BeamAssertException.
                try:
                    self.assertLen(got, 3)
                    metrics_by_slice = {
                        slice_key: value
                        for slice_key, value in got
                    }

                    overall_slice = ()
                    fixed_string1_slice = (('fixed_string',
                                            b'fixed_string1'), )
                    fixed_string2_slice = (('fixed_string',
                                            b'fixed_string2'), )
                    self.assertCountEqual(list(metrics_by_slice.keys()), [
                        overall_slice, fixed_string1_slice, fixed_string2_slice
                    ])

                    count_key = metric_types.MetricKey(name='example_count')
                    weighted_count_key = metric_types.MetricKey(
                        name='weighted_example_count')
                    mean_label_key = metric_types.MetricKey(name='mean_label')
                    mean_prediction_key = metric_types.MetricKey(
                        name='mean_prediction')

                    # Expected means are written out as
                    # sum(weight * value) / sum(weight) over the slice.
                    expected_by_slice = {
                        overall_slice: {
                            count_key: 3,
                            weighted_count_key: 4.0,
                            mean_label_key:
                            (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                            mean_prediction_key:
                            (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
                        },
                        fixed_string1_slice: {
                            count_key: 2,
                            weighted_count_key: 2.0,
                            mean_label_key: (1.0 + 0.0) / (1.0 + 1.0),
                            mean_prediction_key: (0.2 + 0.8) / (1.0 + 1.0),
                        },
                        fixed_string2_slice: {
                            count_key: 1,
                            weighted_count_key: 2.0,
                            mean_label_key: (2 * 0.0) / 2.0,
                            mean_prediction_key: (2 * 0.5) / 2.0,
                        },
                    }
                    for slice_key, expected in expected_by_slice.items():
                        self.assertDictElementsWithTDistributionAlmostEqual(
                            metrics_by_slice[slice_key], expected)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')