Example #1
 def testRunModelAnalysisWithUncertainty(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0, language='english', label=1.0),
         self._makeExample(age=3.0, language='chinese', label=0.0),
         self._makeExample(age=4.0, language='english', label=1.0),
         self._makeExample(age=5.0, language='chinese', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slice_spec = [slicer.SingleSliceSpec(columns=['language'])]
     eval_result = model_eval_lib.run_model_analysis(
         model_eval_lib.default_eval_shared_model(
             eval_saved_model_path=model_location,
             example_weight_key='age'),
         data_location,
         slice_spec=slice_spec,
         num_bootstrap_samples=20)
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         ((b'language', b'chinese'), ): {
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 8.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         ((b'language', b'english'), ): {
             'accuracy': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             'my_mean_label': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 7.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         }
     }
     self.assertEqual(eval_result.config.model_location, model_location)
     self.assertEqual(eval_result.config.data_location, data_location)
     self.assertEqual(eval_result.config.slice_spec, slice_spec)
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
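The boundedValue entries above are how the bootstrap run reports confidence intervals. A minimal sketch, assuming the flat {metric_name: {'boundedValue': ...}} layout used in this test's expectations (the helper name is hypothetical), of reading those bounds back out of eval_result.slicing_metrics:

# Hypothetical helper: print every metric that carries a bootstrap
# confidence interval, assuming the legacy flat metrics layout shown above.
def print_bounded_metrics(slicing_metrics):
    for slice_key, metrics in slicing_metrics:
        for name, value in metrics.items():
            bounded = value.get('boundedValue')
            if bounded:
                print('%s %s: %.3f [%.3f, %.3f]' %
                      (slice_key, name, bounded['value'],
                       bounded['lowerBound'], bounded['upperBound']))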
Example #2
  def testBuildAnalysisTable(self):
    model_location = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    eval_shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model_location)

    example1 = self._makeExample(
        age=3.0, language='english', label=1.0, slice_key='first_slice')

    with beam.Pipeline() as pipeline:
      result = (
          pipeline
          | 'CreateInput' >> beam.Create([example1.SerializeToString()])
          | 'BuildTable' >>
          contrib.BuildAnalysisTable(eval_shared_model=eval_shared_model))

      def check_result(got):
        self.assertEqual(1, len(got), 'got: %s' % got)
        extracts = got[0]

        # Values of type MaterializedColumn are emitted to signal to
        # downstream sink components to output the data to file.
        materialized_dict = dict((k, v)
                                 for k, v in extracts.items()
                                 if isinstance(v, types.MaterializedColumn))
        self._assertMaterializedColumns(
            materialized_dict,
            {
                # Slice key
                'features__slice_key':
                    types.MaterializedColumn(
                        name='features__slice_key', value=[b'first_slice']),

                # Features
                'features__language':
                    types.MaterializedColumn(
                        name='features__language', value=[b'english']),
                'features__age':
                    types.MaterializedColumn(
                        name='features__age',
                        value=np.array([3.], dtype=np.float32)),

                # Label
                'features__label':
                    types.MaterializedColumn(
                        name='features__label',
                        value=np.array([1.], dtype=np.float32)),
                'labels':
                    types.MaterializedColumn(
                        name='labels', value=np.array([1.], dtype=np.float32)),
            })
        self._assertMaterializedColumnsExist(materialized_dict, [
            'predictions__logits', 'predictions__probabilities',
            'predictions__classes', 'predictions__logistic',
            'predictions__class_ids', constants.SLICE_KEYS_KEY
        ])

      util.assert_that(result[constants.ANALYSIS_KEY], check_result)
Example #3
  def assertMetricsComputedWithBeamAre(self, eval_saved_model_path,
                                       serialized_examples,
                                       expected_metrics):
    """Checks metrics computed using Beam.

    Metrics will be computed over all examples, without any slicing. If you
    want to provide your own PCollection (e.g. read a large number of examples
    from a file), if you want to check metrics over certain slices, or if you
    want to add additional post-export metrics, use the more general
    assertGeneralMetricsComputedWithBeamAre.

    Example usage:
      self.assertMetricsComputedWithBeamAre(
        eval_saved_model_path=path,
        serialized_examples=[self.makeExample(age=5, label=1.0),
                             self.makeExample(age=10, label=0.0)],
        expected_metrics={'average_loss': 0.1})

    Args:
      eval_saved_model_path: Path to the directory containing the
        EvalSavedModel.
      serialized_examples: List of serialized example bytes.
      expected_metrics: Dictionary of expected metric values.
    """

    def check_metrics(got):
      """Check metrics callback."""
      try:
        self.assertEqual(
            1, len(got), 'expecting metrics for exactly one slice, but got %d '
            'slices instead. metrics were: %s' % (len(got), got))
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsWithinBounds(
            got_values_dict=value, expected_values_dict=expected_metrics)
      except AssertionError as err:
        raise beam_util.BeamAssertException(err)

    eval_shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=eval_saved_model_path)
    extractors = model_eval_lib.default_extractors(
        eval_shared_model=eval_shared_model)

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      metrics, _ = (
          pipeline
          | 'CreateExamples' >> beam.Create(serialized_examples)
          | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
          | 'Extract' >> Extract(extractors=extractors)
          | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
          .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))
      # pylint: enable=no-value-for-parameter

      beam_util.assert_that(metrics, check_metrics)
Example #4
    def testRunModelAnalysisWithMultiplePlots(self):
        model_location = self._exportEvalSavedModel(
            fixed_prediction_estimator.simple_fixed_prediction_estimator)
        examples = [
            self._makeExample(prediction=0.0, label=1.0),
            self._makeExample(prediction=0.7, label=0.0),
            self._makeExample(prediction=0.8, label=1.0),
            self._makeExample(prediction=1.0, label=1.0),
            self._makeExample(prediction=1.0, label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[config.ModelSpec(location=model_location)],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ])
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location,
            add_metrics_callbacks=[
                post_export_metrics.auc_plots(),
                post_export_metrics.auc_plots(metric_tag='test')
            ])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config, eval_shared_models=[eval_shared_model])

        # We only check some of the metrics to ensure that the end-to-end
        # pipeline works.
        expected_metrics = {
            (): {
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 5.0
                },
            }
        }
        expected_matrix = {
            'threshold': 0.8,
            'falseNegatives': 2.0,
            'trueNegatives': 1.0,
            'truePositives': 2.0,
            'precision': 1.0,
            'recall': 0.5
        }
        self.assertMetricsAlmostEqual(eval_result.slicing_metrics,
                                      expected_metrics)
        self.assertEqual(len(eval_result.plots), 1)
        slice_key, plots = eval_result.plots[0]
        self.assertEqual((), slice_key)
        tf.compat.v1.logging.info(plots.keys())
        self.assertDictElementsAlmostEqual(
            plots['']['']['post_export_metrics']['confusionMatrixAtThresholds']
            ['matrices'][8001], expected_matrix)
        self.assertDictElementsAlmostEqual(
            plots['']['']['post_export_metrics/test']
            ['confusionMatrixAtThresholds']['matrices'][8001], expected_matrix)
Example #5
    def testBuildAnalysisTableWithSlices(self):
        model_location = self._exportEvalSavedModel(
            linear_classifier.simple_linear_classifier)
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location)

        example1 = self._makeExample(age=3.0,
                                     language='english',
                                     label=1.0,
                                     slice_key='first_slice')
        slice_spec = [
            slicer.SingleSliceSpec(columns=['age']),
            slicer.SingleSliceSpec(features=[('age', 3)]),
            slicer.SingleSliceSpec(columns=['age'],
                                   features=[('language', 'english')])
        ]

        with beam.Pipeline() as pipeline:
            result = (
                pipeline
                | 'CreateInput' >> beam.Create([example1.SerializeToString()])
                | 'BuildTable' >> contrib.BuildAnalysisTable(
                    eval_shared_model, slice_spec))

            def check_result(got):
                self.assertEqual(1, len(got), 'got: %s' % got)
                extracts = got[0]

                # Values of type MaterializedColumn are emitted to signal to
                # downstream sink components to output the data to file.
                materialized_dict = dict(
                    (k, v) for k, v in extracts.items()
                    if isinstance(v, types.MaterializedColumn))
                self._assertMaterializedColumns(
                    materialized_dict, {
                        constants.SLICE_KEYS_KEY:
                        types.MaterializedColumn(
                            name=constants.SLICE_KEYS_KEY,
                            value=[
                                b'age:3.0', b'age:3',
                                b'age_X_language:3.0_X_english'
                            ])
                    })
                self._assertMaterializedColumnsExist(materialized_dict, [
                    'predictions__logits', 'predictions__probabilities',
                    'predictions__classes', 'predictions__logistic',
                    'predictions__class_ids'
                ])

            util.assert_that(result[constants.ANALYSIS_KEY], check_result)
Example #6
    def testPredict(self, features_blacklist):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=eval_export_dir,
            blacklist_feature_fetches=features_blacklist)
        with beam.Pipeline() as pipeline:
            examples = [
                self._makeExample(age=3.0, language='english', label=1.0),
                self._makeExample(age=3.0, language='chinese', label=0.0),
                self._makeExample(age=4.0, language='english', label=1.0),
                self._makeExample(age=5.0, language='chinese', label=0.0),
            ]
            serialized_examples = [e.SerializeToString() for e in examples]

            predict_extracts = (
                pipeline
                | beam.Create(serialized_examples, reshuffle=False)
                # Our diagnostic outputs pass types.Extracts throughout; however,
                # our aggregating functions do not use this interface.
                | beam.Map(lambda x: {constants.INPUT_KEY: x})
                | 'Predict' >> predict_extractor._TFMAPredict(
                    eval_shared_models={'': eval_shared_model},
                    desired_batch_size=3))

            def check_result(got):
                try:
                    self.assertLen(got, 4)
                    for item in got:
                        self.assertIn(
                            constants.FEATURES_PREDICTIONS_LABELS_KEY, item)
                        fpl = item[constants.FEATURES_PREDICTIONS_LABELS_KEY]
                        # Verify fpl contains features, probabilities, and correct labels.
                        blacklisted_features = set(features_blacklist or [])
                        expected_features = (
                            set(['language', 'age', 'label']) -
                            blacklisted_features)
                        for feature in expected_features:
                            self.assertIn(feature, fpl.features)
                        for feature in blacklisted_features:
                            self.assertNotIn(feature, fpl.features)
                        self.assertAlmostEqual(fpl.features['label'],
                                               fpl.labels['__labels'])

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result)
Example #7
 def testRunModelAnalysisWithMultiplePlots(self):
     model_location = self._exportEvalSavedModel(
         fixed_prediction_estimator.simple_fixed_prediction_estimator)
     examples = [
         self._makeExample(prediction=0.0, label=1.0),
         self._makeExample(prediction=0.7, label=0.0),
         self._makeExample(prediction=0.8, label=1.0),
         self._makeExample(prediction=1.0, label=1.0),
         self._makeExample(prediction=1.0, label=1.0)
     ]
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location,
         add_metrics_callbacks=[
             post_export_metrics.auc_plots(),
             post_export_metrics.auc_plots(metric_tag='test')
         ])
     data_location = self._writeTFExamplesToTFRecords(examples)
     eval_result = model_eval_lib.run_model_analysis(
         eval_shared_model, data_location)
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected_metrics = {
         (): {
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 5.0
             },
         }
     }
     expected_matrix = {
         'threshold': 0.8,
         'falseNegatives': 2.0,
         'trueNegatives': 1.0,
         'truePositives': 2.0,
         'precision': 1.0,
         'recall': 0.5
     }
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics,
                                   expected_metrics)
     self.assertEqual(len(eval_result.plots), 1)
     slice_key, plots = eval_result.plots[0]
     self.assertEqual((), slice_key)
     tf.logging.info(plots.keys())
     self.assertDictElementsAlmostEqual(
         plots['post_export_metrics']['confusionMatrixAtThresholds']
         ['matrices'][8001], expected_matrix)
     self.assertDictElementsAlmostEqual(
         plots['post_export_metrics/test']['confusionMatrixAtThresholds']
         ['matrices'][8001], expected_matrix)
Example #8
    def testBatchedPredict(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=eval_export_dir)
        eval_config = config_pb2.EvalConfig(
            model_specs=[config_pb2.ModelSpec()])
        with beam.Pipeline() as pipeline:
            examples = [
                self._makeExample(age=3.0, language='english', label=1.0),
                self._makeExample(age=3.0, language='chinese', label=0.0),
                self._makeExample(age=4.0, language='english', label=1.0),
                self._makeExample(age=5.0, language='chinese', label=0.0),
            ]
            serialized_examples = [e.SerializeToString() for e in examples]

            tfx_io = raw_tf_record.RawBeamRecordTFXIO(
                physical_format='inmemory',
                raw_record_column_name=constants.ARROW_INPUT_COLUMN,
                telemetry_descriptors=['TFMATest'])
            extractor = predict_extractor.PredictExtractor(
                eval_shared_model, eval_config=eval_config)
            predict_extracts = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples, reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | 'Predict' >> extractor.ptransform)

            def check_result(got):
                try:
                    self.assertLen(got, 2)
                    for item in got:
                        self.assertIn(constants.FEATURES_KEY, item)
                        for feature in ('language', 'age'):
                            for features_dict in item[constants.FEATURES_KEY]:
                                self.assertIn(feature, features_dict)
                        self.assertIn(constants.LABELS_KEY, item)
                        self.assertIn(constants.PREDICTIONS_KEY, item)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result, label='result')
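A quick sketch of the batch arithmetic behind assertLen(got, 2) above: four serialized examples batched with batch_size=2 produce two batched extracts.

# Sketch only: ceiling division of example count by batch size.
num_examples = 4
batch_size = 2
expected_batches = (num_examples + batch_size - 1) // batch_size
assert expected_batches == 2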
Example #9
  def testNoConstructFn(self):
    model_location = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    examples = [self._makeExample(age=3.0, language='english', label=1.0)]
    data_location = self._writeTFExamplesToTFRecords(examples)
    # Having no construct_fn should fail when Beam attempts to call it.
    eval_shared_model = types.EvalSharedModel(model_path=model_location)
    with self.assertRaisesRegexp(TypeError,
                                 '\'NoneType\' object is not callable'):
      model_eval_lib.run_model_analysis(
          eval_shared_model=eval_shared_model, data_location=data_location)

    # Using the default_eval_shared_model should pass as it has a construct_fn.
    eval_shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model_location)
    model_eval_lib.run_model_analysis(
        eval_shared_model=eval_shared_model, data_location=data_location)
Example #10
    def testPredictMultipleExampleRefPerRawExampleBytes(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = (fake_multi_examples_per_input_estimator.
                              fake_multi_examples_per_input_estimator(
                                  None, temp_eval_export_dir))
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=eval_export_dir)

        # The trailing zeros make an "empty" output batch.
        raw_example_bytes = ['0', '3', '1', '0', '2', '0', '0', '0', '0']

        def check_result(got):
            try:
                self.assertLen(got, 6)
                self.assertEqual(
                    ['3', '3', '3', '1', '2', '2'],
                    [extracts[constants.INPUT_KEY] for extracts in got])

                for item in got:
                    self.assertIn(constants.FEATURES_PREDICTIONS_LABELS_KEY,
                                  item)
                    fpl = item[constants.FEATURES_PREDICTIONS_LABELS_KEY]
                    self.assertIn('input_index', fpl.features)
                    self.assertIn('example_count', fpl.features)
                    self.assertIn('intra_input_index', fpl.features)

            except AssertionError as err:
                raise util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            predict_extracts = (
                pipeline
                | beam.Create(raw_example_bytes, reshuffle=False)
                # Our diagnostic outputs pass types.Extracts throughout; however,
                # our aggregating functions do not use this interface.
                | beam.Map(lambda x: {constants.INPUT_KEY: x})
                | 'Predict' >> predict_extractor._TFMAPredict(
                    eval_shared_models={'': eval_shared_model},
                    desired_batch_size=3))

            util.assert_that(predict_extracts, check_result)
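As the expected INPUT_KEY values ['3', '3', '3', '1', '2', '2'] suggest, the fake estimator fans each raw input string out into that many tf.Examples, so the '0' inputs contribute nothing. A small sketch of the arithmetic behind assertLen(got, 6):

# Sketch only: total extracts is the sum of the per-input fan-out counts.
raw_example_bytes = ['0', '3', '1', '0', '2', '0', '0', '0', '0']
expected_output_count = sum(int(s) for s in raw_example_bytes)
assert expected_output_count == 6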
Example #11
  def testNoConstructFn(self):
    model_location = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    examples = [self._makeExample(age=3.0, language='english', label=1.0)]
    data_location = self._writeTFExamplesToTFRecords(examples)
    eval_config = config.EvalConfig(
        input_data_specs=[config.InputDataSpec(location=data_location)],
        model_specs=[config.ModelSpec(location=model_location)],
        output_data_specs=[
            config.OutputDataSpec(default_location=self._getTempDir())
        ])
    # Having no construct_fn should fail when Beam attempts to call it.
    eval_shared_model = types.EvalSharedModel(model_path=model_location)
    with self.assertRaisesRegexp(AttributeError,
                                 '\'NoneType\' object has no attribute'):
      model_eval_lib.run_model_analysis(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])

    # Using the default_eval_shared_model should pass as it has a construct_fn.
    eval_shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model_location)
    model_eval_lib.run_model_analysis(
        eval_config=eval_config, eval_shared_models=[eval_shared_model])
Example #12
 def testRunModelAnalysisForCSVText(self):
     model_location = self._exportEvalSavedModel(
         csv_linear_classifier.simple_csv_linear_classifier)
     examples = [
         '3.0,english,1.0', '3.0,chinese,0.0', '4.0,english,1.0',
         '5.0,chinese,1.0'
     ]
     data_location = self._writeCSVToTextFile(examples)
     eval_config = config.EvalConfig(
         input_data_specs=[
             config.InputDataSpec(location=data_location,
                                  file_format='text')
         ],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ])
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location)
         ])
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (): {
             'accuracy': {
                 'doubleValue': 0.75
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 4.0
             }
         }
     }
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
Example #13
 def testRunModelAnalysisWithUncertainty(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0, language='english', label=1.0),
         self._makeExample(age=3.0, language='chinese', label=0.0),
         self._makeExample(age=4.0, language='english', label=1.0),
         self._makeExample(age=5.0, language='chinese', label=1.0),
         self._makeExample(age=5.0, language='hindi', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
     options = config.Options()
     options.compute_confidence_intervals.value = True
     options.k_anonymization_count.value = 2
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs,
         options=options)
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location,
                 example_weight_key='age')
         ])
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (('language', 'hindi'), ): {
             u'__ERROR__': {
                 'debugMessage':
                 u'Example count for this slice key is lower than the '
                 u'minimum required value: 2. No data is aggregated for '
                 u'this slice.'
             },
         },
         (('language', 'chinese'), ): {
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 8.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         (('language', 'english'), ): {
             'accuracy': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             'my_mean_label': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 7.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         }
     }
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['language']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
Example #14
    def testRunModelAnalysisWithQueryBasedMetrics(self):
        input_layer = tf.keras.layers.Input(shape=(1, ), name='age')
        output_layer = tf.keras.layers.Dense(
            1, activation=tf.nn.sigmoid)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.binary_crossentropy)

        features = {'age': [[20.0]]}
        labels = [[1]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=0.0),
            self._makeExample(age=3.0, language='english', label=0.0),
            self._makeExample(age=5.0, language='chinese', label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        slicing_specs = [config.SlicingSpec()]
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            slicing_specs=slicing_specs,
            metrics_specs=metric_specs.specs_from_metrics(
                [ndcg.NDCG(gain_key='age', name='ndcg')],
                binarize=config.BinarizationOptions(top_k_list=[1]),
                query_key='language'))
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location,
            tags=[tf.saved_model.SERVING])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[eval_shared_model],
            evaluators=[
                metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                    eval_config=eval_config,
                    eval_shared_models=[eval_shared_model])
            ])

        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']
        expected_metrics = {
            '': {
                'example_count': True,
                'weighted_example_count': True,
            },
            'topK:1': {
                'ndcg': True,
            },
        }
        for group in expected_metrics:
            self.assertIn(group, got_metrics)
            for k in expected_metrics[group]:
                self.assertIn(k, got_metrics[group])
Example #15
    def testRunModelAnalysisWithKerasModel(self):
        input_layer = tf.keras.layers.Input(shape=(28 * 28, ), name='data')
        output_layer = tf.keras.layers.Dense(
            10, activation=tf.nn.softmax)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.categorical_crossentropy)

        features = {'data': [[0.0] * 28 * 28]}
        labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(data=[0.0] * 28 * 28, label=1.0),
            self._makeExample(data=[1.0] * 28 * 28, label=5.0),
            self._makeExample(data=[1.0] * 28 * 28, label=9.0),
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        metrics_spec = config.MetricsSpec()
        for metric in (tf.keras.metrics.AUC(), ):
            cfg = tf.keras.utils.serialize_keras_object(metric)
            metrics_spec.metrics.append(
                config.MetricConfig(class_name=cfg['class_name'],
                                    config=json.dumps(cfg['config'])))
        for class_id in (0, 5, 9):
            metrics_spec.binarize.class_ids.append(class_id)
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            metrics_specs=[metrics_spec])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[
                model_eval_lib.default_eval_shared_model(
                    eval_saved_model_path=model_location,
                    tags=[tf.saved_model.SERVING])
            ])
        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']
        expected_metrics = {
            'classId:0': {
                'auc': True,
            },
            'classId:5': {
                'auc': True,
            },
            'classId:9': {
                'auc': True,
            },
        }
        for class_id in expected_metrics:
            self.assertIn(class_id, got_metrics)
            for k in expected_metrics[class_id]:
                self.assertIn(k, got_metrics[class_id])
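The manual MetricsSpec/MetricConfig loop in this example spells out what the metric_specs.specs_from_metrics helper (used in example #14) builds for you. A hedged sketch, assuming the helper also accepts Keras metric instances and a BinarizationOptions with repeated class_ids as above; this is not the exact spec this test constructs:

# Hedged equivalent of the manual metrics_spec construction above.
metrics_specs = metric_specs.specs_from_metrics(
    [tf.keras.metrics.AUC()],
    binarize=config.BinarizationOptions(class_ids=[0, 5, 9]))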
Example #16
    def assertMetricsComputedWithBeamAre(
        self,
        eval_saved_model_path: str,
        serialized_examples: List[bytes],
        expected_metrics: Dict[str, Any],
        add_metrics_callbacks: Optional[List[
            types.AddMetricsCallbackType]] = None):
        """Checks metrics computed using Beam.

        Metrics will be computed over all examples, without any slicing. If you
        want to provide your own PCollection (e.g. read a large number of examples
        from a file), if you want to check metrics over certain slices, or if you
        want to add additional post-export metrics, use the more general
        assertGeneralMetricsComputedWithBeamAre.

        Example usage:
          self.assertMetricsComputedWithBeamAre(
            eval_saved_model_path=path,
            serialized_examples=[self.makeExample(age=5, label=1.0),
                                 self.makeExample(age=10, label=0.0)],
            expected_metrics={'average_loss': 0.1})

        Args:
          eval_saved_model_path: Path to the directory containing the
            EvalSavedModel.
          serialized_examples: List of serialized example bytes.
          expected_metrics: Dictionary of expected metric values.
          add_metrics_callbacks: Optional. Callbacks for adding additional metrics.
        """
        def check_metrics(got):
            """Check metrics callback."""
            try:
                self.assertEqual(
                    1, len(got),
                    'expecting metrics for exactly one slice, but got %d '
                    'slices instead. metrics were: %s' % (len(got), got))
                (slice_key, value) = got[0]
                self.assertEqual((), slice_key)
                self.assertDictElementsWithinBounds(
                    got_values_dict=value,
                    expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise beam_util.BeamAssertException(err)

        eval_config = config_pb2.EvalConfig()
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_model=eval_shared_model)

        tfx_io = raw_tf_record.RawBeamRecordTFXIO(
            physical_format='inmemory',
            raw_record_column_name=constants.ARROW_INPUT_COLUMN,
            telemetry_descriptors=['TFMATest'])
        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            (metrics, _), _ = (
                pipeline
                | 'CreateExamples' >> beam.Create(serialized_examples)
                | 'BatchExamples' >> tfx_io.BeamSource()
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | 'Extract' >> Extract(extractors=extractors)
                | 'ComputeMetricsAndPlots' >>
                legacy_metrics_and_plots_evaluator._ComputeMetricsAndPlots(  # pylint: disable=protected-access
                    eval_shared_model=eval_shared_model))
            # pylint: enable=no-value-for-parameter

            beam_util.assert_that(metrics, check_metrics)
Example #17
 def testRunModelAnalysis(self):
   model_location = self._exportEvalSavedModel(
       linear_classifier.simple_linear_classifier)
   examples = [
       self._makeExample(age=3.0, language='english', label=1.0),
       self._makeExample(age=3.0, language='chinese', label=0.0),
       self._makeExample(age=4.0, language='english', label=1.0),
       self._makeExample(age=5.0, language='chinese', label=1.0),
       self._makeExample(age=5.0, language='hindi', label=1.0)
   ]
   data_location = self._writeTFExamplesToTFRecords(examples)
   slice_spec = [slicer.SingleSliceSpec(columns=['language'])]
   eval_result = model_eval_lib.run_model_analysis(
       model_eval_lib.default_eval_shared_model(
           eval_saved_model_path=model_location, example_weight_key='age'),
       data_location,
       slice_spec=slice_spec,
       k_anonymization_count=2)
   # We only check some of the metrics to ensure that the end-to-end
   # pipeline works.
   expected = {
       (('language', b'hindi'),): {
           u'__ERROR__': {
               'debugMessage':
                   u'Example count for this slice key is lower than the '
                   u'minimum required value: 2. No data is aggregated for '
                   u'this slice.'
           },
       },
       (('language', b'chinese'),): {
           'accuracy': {
               'doubleValue': 0.5
           },
           'my_mean_label': {
               'doubleValue': 0.5
           },
           metric_keys.EXAMPLE_WEIGHT: {
               'doubleValue': 8.0
           },
           metric_keys.EXAMPLE_COUNT: {
               'doubleValue': 2.0
           },
       },
       (('language', b'english'),): {
           'accuracy': {
               'doubleValue': 1.0
           },
           'my_mean_label': {
               'doubleValue': 1.0
           },
           metric_keys.EXAMPLE_WEIGHT: {
               'doubleValue': 7.0
           },
           metric_keys.EXAMPLE_COUNT: {
               'doubleValue': 2.0
           },
       }
   }
   self.assertEqual(eval_result.config.model_location, model_location)
   self.assertEqual(eval_result.config.data_location, data_location)
   self.assertEqual(eval_result.config.slice_spec, slice_spec)
   self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
   self.assertFalse(eval_result.plots)
Example #18
    def testRunModelAnalysisWithDeterministicConfidenceIntervals(self):
        model_location = self._exportEvalSavedModel(
            linear_classifier.simple_linear_classifier)
        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=3.0, language='chinese', label=0.0),
            self._makeExample(age=4.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=1.0),
            self._makeExample(age=5.0, language='hindi', label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
        options = config.Options()
        options.compute_confidence_intervals.value = True
        options.k_anonymization_count.value = 2
        eval_config = config.EvalConfig(slicing_specs=slicing_specs,
                                        options=options)
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_model=model_eval_lib.default_eval_shared_model(
                eval_saved_model_path=model_location,
                example_weight_key='age'),
            data_location=data_location,
            output_path=self._getTempDir(),
            random_seed_for_testing=_TEST_SEED)
        # We only check some of the metrics to ensure that the end-to-end
        # pipeline works.
        expected = {
            (('language', 'hindi'), ): {
                u'__ERROR__': {
                    'debugMessage':
                    u'Example count for this slice key is lower than the '
                    u'minimum required value: 2. No data is aggregated for '
                    u'this slice.'
                },
            },
            (('language', 'chinese'), ): {
                metric_keys.EXAMPLE_WEIGHT: {
                    'doubleValue': 8.0
                },
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 2.0
                },
            },
            (('language', 'english'), ): {
                'accuracy': {
                    'boundedValue': {
                        'value': 1.0,
                        'lowerBound': 1.0,
                        'upperBound': 1.0,
                        'methodology': 'POISSON_BOOTSTRAP'
                    }
                },
                'my_mean_label': {
                    'boundedValue': {
                        'value': 1.0,
                        'lowerBound': 1.0,
                        'upperBound': 1.0,
                        'methodology': 'POISSON_BOOTSTRAP'
                    }
                },
                metric_keys.EXAMPLE_WEIGHT: {
                    'doubleValue': 7.0
                },
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 2.0
                },
            }
        }
        self.assertEqual(eval_result.model_location, model_location.decode())
        self.assertEqual(eval_result.data_location, data_location)
        self.assertEqual(eval_result.config.slicing_specs[0],
                         config.SlicingSpec(feature_keys=['language']))
        self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)

        for key, value in eval_result.slicing_metrics:
            if (('language', 'english'), ) == key:
                metric = value['']['']['average_loss']
                self.assertAlmostEqual(0.171768754720,
                                       metric['boundedValue']['value'],
                                       delta=0.1)

                metric = value['']['']['auc_precision_recall']
                self.assertAlmostEqual(0.99999940395,
                                       metric['boundedValue']['value'],
                                       delta=0.1)

        self.assertFalse(eval_result.plots)
Example #19
 def testRunModelAnalysisWithQueryExtractor(self):
   model_location = self._exportEvalSavedModel(
       linear_classifier.simple_linear_classifier)
   examples = [
       self._makeExample(age=3.0, language='english', label=1.0),
       self._makeExample(age=3.0, language='chinese', label=0.0),
       self._makeExample(age=4.0, language='english', label=0.0),
       self._makeExample(age=5.0, language='chinese', label=1.0)
   ]
   data_location = self._writeTFExamplesToTFRecords(examples)
   slice_spec = [slicer.SingleSliceSpec()]
   eval_shared_model = model_eval_lib.default_eval_shared_model(
       eval_saved_model_path=model_location, example_weight_key='age')
   eval_result = model_eval_lib.run_model_analysis(
       eval_shared_model=eval_shared_model,
       data_location=data_location,
       slice_spec=slice_spec,
       evaluators=[
           metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
               eval_shared_model),
           query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
               query_id='language',
               prediction_key='logistic',
               combine_fns=[
                   query_statistics.QueryStatisticsCombineFn(),
                   ndcg.NdcgMetricCombineFn(
                       at_vals=[1], gain_key='label', weight_key='')
               ]),
       ])
   # We only check some of the metrics to ensure that the end-to-end
   # pipeline works.
   expected = {
       (): {
           'post_export_metrics/total_queries': {
               'doubleValue': 2.0
           },
           'post_export_metrics/min_documents': {
               'doubleValue': 2.0
           },
           'post_export_metrics/max_documents': {
               'doubleValue': 2.0
           },
           'post_export_metrics/total_documents': {
               'doubleValue': 4.0
           },
           'post_export_metrics/ndcg@1': {
               'doubleValue': 0.5
           },
           'post_export_metrics/example_weight': {
               'doubleValue': 15.0
           },
           'post_export_metrics/example_count': {
               'doubleValue': 4.0
           },
       }
   }
   self.assertEqual(eval_result.config.model_location, model_location)
   self.assertEqual(eval_result.config.data_location, data_location)
   self.assertEqual(eval_result.config.slice_spec, slice_spec)
   self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
   self.assertFalse(eval_result.plots)
Example #20
 def testRunModelAnalysisExtraFieldsPlusFeatureExtraction(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0,
                           language='english',
                           label=1.0,
                           my_slice='a'),
         self._makeExample(age=3.0,
                           language='chinese',
                           label=0.0,
                           my_slice='a'),
         self._makeExample(age=4.0,
                           language='english',
                           label=1.0,
                           my_slice='b'),
         self._makeExample(age=5.0,
                           language='chinese',
                           label=1.0,
                           my_slice='c'),
         self._makeExample(age=5.0, language='hindi', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec(feature_keys=['my_slice'])]
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs)
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location, example_weight_key='age')
     slice_spec = [slicer.SingleSliceSpec(spec=slicing_specs[0])]
     extractors_with_feature_extraction = [
         predict_extractor.PredictExtractor(eval_shared_model,
                                            desired_batch_size=3,
                                            materialize=False),
         feature_extractor.FeatureExtractor(
             extract_source=constants.INPUT_KEY,
             extract_dest=constants.FEATURES_PREDICTIONS_LABELS_KEY),
         slice_key_extractor.SliceKeyExtractor(slice_spec,
                                               materialize=False)
     ]
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location,
                 example_weight_key='age')
         ],
         extractors=extractors_with_feature_extraction)
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (('my_slice', 'a'), ): {
             'accuracy': {
                 'doubleValue': 1.0
             },
             'my_mean_label': {
                 'doubleValue': 0.5
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 6.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         (('my_slice', 'b'), ): {
             'accuracy': {
                 'doubleValue': 1.0
             },
             'my_mean_label': {
                 'doubleValue': 1.0
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 4.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 1.0
             },
         },
         (('my_slice', 'c'), ): {
             'accuracy': {
                 'doubleValue': 0.0
             },
             'my_mean_label': {
                 'doubleValue': 1.0
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 5.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 1.0
             },
         },
     }
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['my_slice']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)