def testBatchSizeLimit(self):
  temp_export_dir = self._getExportDir()
  _, export_dir = batch_size_limited_classifier.simple_batch_size_limited_classifier(
      None, temp_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
  predict_extractor = predict_extractor_v2.PredictExtractor(
      eval_config=eval_config, eval_shared_model=eval_shared_model)
  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(classes='first', scores=0.0, labels='third'),
        self._makeExample(classes='first', scores=0.0, labels='third'),
        self._makeExample(classes='first', scores=0.0, labels='third'),
        self._makeExample(classes='first', scores=0.0, labels='third'),
    ]

    predict_extracts = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples],
                                  reshuffle=False)
        | 'FeaturesToExtracts' >> model_eval_lib.InputsToExtracts()
        | predict_extractor.stage_name >> predict_extractor.ptransform)

    def check_result(got):
      try:
        self.assertLen(got, 4)
        # We can't verify the actual predictions, but we can verify the keys.
        for item in got:
          self.assertIn(constants.PREDICTIONS_KEY, item)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result, label='result')
def assertMetricsComputedWithBeamAre(self, eval_saved_model_path, serialized_examples, expected_metrics, add_metrics_callbacks=None): """Checks metrics computed using Beam. Metrics will be computed over all examples, without any slicing. If you want to provide your own PCollection (e.g. read a large number of examples from a file), if you want to check metrics over certain slices, or if you want to add additional post-export metrics, use the more general assertGeneralMetricsComputedWithBeamAre. Example usage: self.assertMetricsComputedWithBeamAre( eval_saved_model_path=path, serialized_examples=[self.makeExample(age=5, label=1.0), self.makeExample(age=10, label=0.0)], expected_metrics={'average_loss': 0.1}) Args: eval_saved_model_path: Path to the directory containing the EvalSavedModel. serialized_examples: List of serialized example bytes. expected_metrics: Dictionary of expected metric values. add_metrics_callbacks: Optional. Callbacks for adding additional metrics. """ def check_metrics(got): """Check metrics callback.""" try: self.assertEqual( 1, len(got), 'expecting metrics for exactly one slice, but got %d ' 'slices instead. metrics were: %s' % (len(got), got)) (slice_key, value) = got[0] self.assertEqual((), slice_key) self.assertDictElementsWithinBounds( got_values_dict=value, expected_values_dict=expected_metrics) except AssertionError as err: raise beam_util.BeamAssertException(err) eval_shared_model = model_eval_lib.default_eval_shared_model( eval_saved_model_path=eval_saved_model_path, add_metrics_callbacks=add_metrics_callbacks) extractors = model_eval_lib.default_extractors( eval_shared_model=eval_shared_model) with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter metrics, _ = ( pipeline | 'CreateExamples' >> beam.Create(serialized_examples) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> Extract(extractors=extractors) | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator. ComputeMetricsAndPlots(eval_shared_model=eval_shared_model)) # pylint: enable=no-value-for-parameter beam_util.assert_that(metrics, check_metrics)
def testPredictExtractorWithMultiOutputModel(self):
  temp_export_dir = self._getExportDir()
  _, export_dir = multi_head.simple_multi_head(None, temp_export_dir)

  eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  predict_extractor = predict_extractor_v2.PredictExtractor(
      eval_config=eval_config, eval_shared_model=eval_shared_model)

  examples = [
      self._makeExample(age=1.0, language='english', english_label=1.0,
                        chinese_label=0.0, other_label=0.0),
      self._makeExample(age=1.0, language='chinese', english_label=0.0,
                        chinese_label=1.0, other_label=0.0),
      self._makeExample(age=2.0, language='english', english_label=1.0,
                        chinese_label=0.0, other_label=0.0),
      self._makeExample(age=2.0, language='other', english_label=0.0,
                        chinese_label=1.0, other_label=1.0)
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | predict_extractor.stage_name >> predict_extractor.ptransform)
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertEqual(4, len(got), 'got: %s' % got)
        # We can't verify the actual predictions, but we can verify the keys.
        for item in got:
          self.assertIn(constants.PREDICTIONS_KEY, item)
          for output_name in ('chinese_head', 'english_head', 'other_head'):
            for pred_key in ('logistic', 'probabilities', 'all_classes'):
              self.assertIn(output_name + '/' + pred_key,
                            item[constants.PREDICTIONS_KEY])
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def testEvaluateQueryBasedMetrics(self): temp_eval_export_dir = self._getEvalExportDir() _, eval_export_dir = (fixed_prediction_estimator_extra_fields. simple_fixed_prediction_estimator_extra_fields( None, temp_eval_export_dir)) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir) extractors = [ legacy_predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor() ] with beam.Pipeline() as pipeline: metrics = ( pipeline | 'Create' >> beam.Create(self._get_examples()) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> tfma_unit.Extract(extractors=extractors) # pylint: disable=no-value-for-parameter | 'EvaluateQueryBasedMetrics' >> query_based_metrics_evaluator.EvaluateQueryBasedMetrics( prediction_key='', query_id='fixed_string', combine_fns=[ query_statistics.QueryStatisticsCombineFn(), ndcg.NdcgMetricCombineFn(at_vals=[1, 2], gain_key='fixed_float', weight_key='fixed_int'), min_label_position.MinLabelPositionCombineFn( label_key='', weight_key='fixed_int'), ])) def check_metrics(got): try: self.assertEqual(1, len(got), 'got: %s' % got) got_slice_key, got_metrics = got[0] self.assertEqual(got_slice_key, ()) self.assertDictElementsAlmostEqual( got_metrics, { 'post_export_metrics/total_queries': 3.0, 'post_export_metrics/total_documents': 6.0, 'post_export_metrics/min_documents': 1.0, 'post_export_metrics/max_documents': 3.0, 'post_export_metrics/ndcg@1': 0.9166667, 'post_export_metrics/ndcg@2': 0.9766198, 'post_export_metrics/average_min_label_position/__labels': 0.6666667, }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testPredictExtractorWithRegressionModel(self): temp_export_dir = self._getExportDir() export_dir, _ = ( fixed_prediction_estimator_extra_fields .simple_fixed_prediction_estimator_extra_fields(temp_export_dir, None)) eval_config = config.EvalConfig(model_specs=[config.ModelSpec()]) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) predict_extractor = predict_extractor_v2.PredictExtractor( eval_config=eval_config, eval_shared_model=eval_shared_model) examples = [ self._makeExample( prediction=0.2, label=1.0, fixed_int=1, fixed_float=1.0, fixed_string='fixed_string1'), self._makeExample( prediction=0.8, label=0.0, fixed_int=1, fixed_float=1.0, fixed_string='fixed_string2'), self._makeExample( prediction=0.5, label=0.0, fixed_int=2, fixed_float=1.0, fixed_string='fixed_string3') ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create([e.SerializeToString() for e in examples], reshuffle=False) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | predict_extractor.stage_name >> predict_extractor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got_preds): try: self.assertLen(got_preds, 3) expected_preds = [0.2, 0.8, 0.5] for got_pred, expected_pred in zip(got_preds, expected_preds): self.assertIn(constants.PREDICTIONS_KEY, got_pred) self.assertAlmostEqual(got_pred[constants.PREDICTIONS_KEY], expected_pred) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, check_result, label='result')
def testEvaluateNoSlicing(self): temp_eval_export_dir = self._getEvalExportDir() _, eval_export_dir = linear_classifier.simple_linear_classifier( None, temp_eval_export_dir) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir, add_metrics_callbacks=[_addExampleCountMetricCallback]) extractors = [ predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor() ] with beam.Pipeline() as pipeline: example1 = self._makeExample(age=3.0, language='english', label=1.0) example2 = self._makeExample(age=3.0, language='chinese', label=0.0) example3 = self._makeExample(age=4.0, language='english', label=1.0) example4 = self._makeExample(age=5.0, language='chinese', label=0.0) metrics, _ = ( pipeline | 'Create' >> beam.Create([ example1.SerializeToString(), example2.SerializeToString(), example3.SerializeToString(), example4.SerializeToString() ]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> tfma_unit.Extract(extractors=extractors) # pylint: disable=no-value-for-parameter | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator. ComputeMetricsAndPlots(eval_shared_model=eval_shared_model)) def check_result(got): try: self.assertEqual(1, len(got), 'got: %s' % got) (slice_key, value) = got[0] self.assertEqual((), slice_key) self.assertDictElementsAlmostEqual( value, { 'accuracy': 1.0, 'label/mean': 0.5, 'my_mean_age': 3.75, 'my_mean_age_times_label': 1.75, 'added_example_count': 4.0 }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics, check_result)
def testEvaluateNoSlicingAddPostExportAndCustomMetricsUnsupervisedModel( self): # Mainly for testing that the ExampleCount post export metric works with # unsupervised models. temp_eval_export_dir = self._getEvalExportDir() _, eval_export_dir = (fixed_prediction_estimator_no_labels. simple_fixed_prediction_estimator_no_labels( None, temp_eval_export_dir)) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir, add_metrics_callbacks=[ post_export_metrics.example_count(), post_export_metrics.example_weight( example_weight_key='prediction') ]) extractors = [ predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor() ] with beam.Pipeline() as pipeline: example1 = self._makeExample(prediction=1.0) example2 = self._makeExample(prediction=2.0) metrics, plots = ( pipeline | 'Create' >> beam.Create([ example1.SerializeToString(), example2.SerializeToString(), ]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> tfma_unit.Extract(extractors=extractors) # pylint: disable=no-value-for-parameter | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator. ComputeMetricsAndPlots(eval_shared_model=eval_shared_model)) def check_result(got): try: self.assertEqual(1, len(got), 'got: %s' % got) (slice_key, value) = got[0] self.assertEqual((), slice_key) self.assertDictElementsAlmostEqual( got_values_dict=value, expected_values_dict={ 'average_loss': 2.5, metric_keys.EXAMPLE_COUNT: 2.0, metric_keys.EXAMPLE_WEIGHT: 3.0 }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics, check_result, label='metrics') util.assert_that(plots, util.is_empty(), label='plots')
def _runTestWithCustomCheck(self, examples, eval_export_dir, metrics_callbacks, slice_spec=None, custom_metrics_check=None, custom_plots_check=None, custom_result_check=None): # make sure we are doing some checks self.assertTrue(custom_metrics_check is not None or custom_plots_check is not None or custom_result_check is not None) serialized_examples = [ex.SerializeToString() for ex in examples] slicing_specs = None if slice_spec: slicing_specs = [s.to_proto() for s in slice_spec] eval_config = config.EvalConfig( input_data_specs=[config.InputDataSpec()], model_specs=[config.ModelSpec(location=eval_export_dir)], output_data_specs=[config.OutputDataSpec()], slicing_specs=slicing_specs) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir, add_metrics_callbacks=metrics_callbacks) extractors = model_eval_lib.default_extractors( eval_config=eval_config, eval_shared_models=[eval_shared_model]) with beam.Pipeline() as pipeline: (metrics, plots), _ = ( pipeline | 'Create' >> beam.Create(serialized_examples) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> tfma_unit.Extract(extractors=extractors) # pylint: disable=no-value-for-parameter | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator.ComputeMetricsAndPlots( eval_shared_model=eval_shared_model, compute_confidence_intervals=self. compute_confidence_intervals, random_seed_for_testing=self.deterministic_test_seed)) if custom_metrics_check is not None: util.assert_that(metrics, custom_metrics_check, label='metrics') if custom_plots_check is not None: util.assert_that(plots, custom_plots_check, label='plot') result = pipeline.run() if custom_result_check is not None: custom_result_check(result)
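# A minimal sketch of how a test might call _runTestWithCustomCheck. The test
# name, the example values, and the use of the linear_classifier fixture below
# are illustrative assumptions rather than part of the helper's contract; the
# helper only requires examples, an export directory, metric callbacks, and at
# least one custom check.
def testExampleCountWithCustomCheck(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0),
  ]

  def check_metrics(got):
    # With no slice_spec, only the overall slice is produced.
    (slice_key, value) = got[0]
    self.assertEqual((), slice_key)
    self.assertIn(metric_keys.EXAMPLE_COUNT, value)

  self._runTestWithCustomCheck(
      examples,
      eval_export_dir,
      [post_export_metrics.example_count()],
      custom_metrics_check=check_metrics)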
def BuildAnalysisTable(  # pylint: disable=invalid-name
    examples: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    desired_batch_size: Optional[int] = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None
) -> beam.pvalue.PCollection:
  """Builds an analysis table from data extracted from the input.

  Use this function to build an example-oriented PCollection of output data
  useful for debugging models.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Shared model parameters for EvalSavedModel.
    slice_spec: Optional list of SingleSliceSpec specifying the slices to slice
      the data into. If None, defaults to the overall slice.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.
    extractors: Optional list of Extractors to execute prior to slicing and
      aggregating the metrics. If not provided, a default set will be run.
    evaluators: Optional list of Evaluators for evaluating Extracts. If not
      provided, a default set will be used.

  Returns:
    beam.pvalue.PCollection of Extracts. The caller is responsible for writing
    these out to a file for now.
  """
  if not slice_spec:
    slice_spec = [slicer.SingleSliceSpec()]
  if not extractors:
    extractors = [
        predict_extractor.PredictExtractor(eval_shared_model,
                                           desired_batch_size),
        feature_extractor.FeatureExtractor(),
        slice_key_extractor.SliceKeyExtractor(slice_spec)
    ]
  if not evaluators:
    evaluators = [analysis_table_evaluator.AnalysisTableEvaluator()]

  # pylint: disable=no-value-for-parameter
  return (examples
          | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
          | model_eval_lib.ExtractAndEvaluate(
              extractors=extractors, evaluators=evaluators))
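# A minimal usage sketch for BuildAnalysisTable. The wrapper function and the
# paths `examples_path`, `model_path`, and `output_path` are hypothetical. It
# is assumed here that the function is applied as a PTransform on the examples
# PCollection (as the invalid-name pylint disable above suggests) and that, per
# the docstring, it yields a PCollection of Extracts which the caller is
# responsible for writing out.
def build_analysis_table_example(examples_path, model_path, output_path):
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_path)
  with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | 'ReadExamples' >> beam.io.ReadFromTFRecord(examples_path)
        | 'BuildAnalysisTable' >> BuildAnalysisTable(
            eval_shared_model=eval_shared_model,
            slice_spec=[slicer.SingleSliceSpec(columns=['language'])])
        | 'FormatRows' >> beam.Map(str)
        | 'WriteRows' >> beam.io.WriteToText(output_path))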
def testPredictExtractorWithBinaryClassificationModel(self): temp_export_dir = self._getExportDir() export_dir, _ = dnn_classifier.simple_dnn_classifier(temp_export_dir, None, n_classes=2) eval_config = config.EvalConfig( model_specs=[config.ModelSpec(location=export_dir)]) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) predict_extractor = predict_extractor_v2.PredictExtractor( eval_config=eval_config, eval_shared_models=[eval_shared_model]) examples = [ self._makeExample(age=1.0, language='english', label=0), self._makeExample(age=2.0, language='chinese', label=1), self._makeExample(age=3.0, language='chinese', label=0), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | predict_extractor.stage_name >> predict_extractor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 3) # We can't verify the actual predictions, but we can verify the keys. for item in got: self.assertIn(constants.PREDICTIONS_KEY, item) for pred_key in ('logistic', 'probabilities', 'all_classes'): self.assertIn(pred_key, item[constants.PREDICTIONS_KEY]) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, check_result, label='result')
def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self): temp_eval_export_dir = self._getEvalExportDir() _, eval_export_dir = linear_classifier.simple_linear_classifier( None, temp_eval_export_dir) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir, add_metrics_callbacks=[ _addExampleCountMetricCallback, # Note that since everything runs in-process this doesn't # actually test that the py_func can be correctly recreated # on workers in a distributed context. _addPyFuncMetricCallback, post_export_metrics.example_count(), post_export_metrics.example_weight(example_weight_key='age') ]) extractors = [ predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor() ] with beam.Pipeline() as pipeline: example1 = self._makeExample(age=3.0, language='english', label=1.0) example2 = self._makeExample(age=3.0, language='chinese', label=0.0) example3 = self._makeExample(age=4.0, language='english', label=1.0) example4 = self._makeExample(age=5.0, language='chinese', label=0.0) metrics, plots = ( pipeline | 'Create' >> beam.Create([ example1.SerializeToString(), example2.SerializeToString(), example3.SerializeToString(), example4.SerializeToString() ]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> tfma_unit.Extract(extractors=extractors) # pylint: disable=no-value-for-parameter | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator. ComputeMetricsAndPlots(eval_shared_model=eval_shared_model)) def check_result(got): try: self.assertEqual(1, len(got), 'got: %s' % got) (slice_key, value) = got[0] self.assertEqual((), slice_key) self.assertDictElementsAlmostEqual( got_values_dict=value, expected_values_dict={ 'accuracy': 1.0, 'label/mean': 0.5, 'my_mean_age': 3.75, 'my_mean_age_times_label': 1.75, 'added_example_count': 4.0, 'py_func_label_sum': 2.0, metric_keys.EXAMPLE_COUNT: 4.0, metric_keys.EXAMPLE_WEIGHT: 15.0 }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics, check_result, label='metrics') util.assert_that(plots, util.is_empty(), label='plots')
def testEvaluateWithSlicingAndDifferentBatchSizes(self): temp_eval_export_dir = self._getEvalExportDir() _, eval_export_dir = linear_classifier.simple_linear_classifier( None, temp_eval_export_dir) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir, add_metrics_callbacks=[_addExampleCountMetricCallback]) extractors = [ predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor([ slicer.SingleSliceSpec(), slicer.SingleSliceSpec(columns=['slice_key']) ]) ] for batch_size in [1, 2, 4, 8]: with beam.Pipeline() as pipeline: example1 = self._makeExample(age=3.0, language='english', label=1.0, slice_key='first_slice') example2 = self._makeExample(age=3.0, language='chinese', label=0.0, slice_key='first_slice') example3 = self._makeExample(age=4.0, language='english', label=0.0, slice_key='second_slice') example4 = self._makeExample(age=5.0, language='chinese', label=1.0, slice_key='second_slice') example5 = self._makeExample(age=5.0, language='chinese', label=1.0, slice_key='second_slice') metrics, plots = ( pipeline | 'Create' >> beam.Create([ example1.SerializeToString(), example2.SerializeToString(), example3.SerializeToString(), example4.SerializeToString(), example5.SerializeToString(), ]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> tfma_unit.Extract(extractors=extractors) # pylint: disable=no-value-for-parameter | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator.ComputeMetricsAndPlots( eval_shared_model=eval_shared_model, desired_batch_size=batch_size)) def check_result(got): try: self.assertEqual(3, len(got), 'got: %s' % got) slices = {} for slice_key, value in got: slices[slice_key] = value overall_slice = () first_slice = (('slice_key', b'first_slice'), ) second_slice = (('slice_key', b'second_slice'), ) self.assertItemsEqual( list(slices.keys()), [overall_slice, first_slice, second_slice]) self.assertDictElementsAlmostEqual( slices[overall_slice], { 'accuracy': 0.4, 'label/mean': 0.6, 'my_mean_age': 4.0, 'my_mean_age_times_label': 2.6, 'added_example_count': 5.0 }) self.assertDictElementsAlmostEqual( slices[first_slice], { 'accuracy': 1.0, 'label/mean': 0.5, 'my_mean_age': 3.0, 'my_mean_age_times_label': 1.5, 'added_example_count': 2.0 }) self.assertDictElementsAlmostEqual( slices[second_slice], { 'accuracy': 0.0, 'label/mean': 2.0 / 3.0, 'my_mean_age': 14.0 / 3.0, 'my_mean_age_times_label': 10.0 / 3.0, 'added_example_count': 3.0 }) except AssertionError as err: # This function is redefined every iteration, so it will have the # right value of batch_size. raise util.BeamAssertException( 'batch_size = %d, error: %s' % (batch_size, err)) # pylint: disable=cell-var-from-loop util.assert_that(metrics, check_result, label='metrics') util.assert_that(plots, util.is_empty(), label='plots')
def testEvaluateWithBinaryClassificationModel(self): n_classes = 2 temp_export_dir = self._getExportDir() _, export_dir = dnn_classifier.simple_dnn_classifier( None, temp_export_dir, n_classes=n_classes) # Add mean_label, example_count, weighted_example_count, calibration_plot eval_config = config.EvalConfig( model_specs=[ config.ModelSpec(location=export_dir, label_key='label', example_weight_key='age') ], slicing_specs=[config.SlicingSpec()], metrics_specs=metric_specs.specs_from_metrics([ calibration.MeanLabel('mean_label'), calibration_plot.CalibrationPlot(name='calibration_plot', num_buckets=10) ])) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) slice_spec = [ slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs ] extractors = [ input_extractor.InputExtractor(eval_config=eval_config), predict_extractor_v2.PredictExtractor( eval_config=eval_config, eval_shared_models=[eval_shared_model]), slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec) ] evaluators = [ metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator( eval_config=eval_config, eval_shared_models=[eval_shared_model]) ] examples = [ self._makeExample(age=1.0, language='english', label=0.0), self._makeExample(age=2.0, language='chinese', label=1.0), self._makeExample(age=3.0, language='chinese', label=0.0), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter metrics_and_plots = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate( extractors=extractors, evaluators=evaluators)) # pylint: enable=no-value-for-parameter def check_metrics(got): try: self.assertLen(got, 1) got_slice_key, got_metrics = got[0] self.assertEqual(got_slice_key, ()) example_count_key = metric_types.MetricKey( name='example_count') weighted_example_count_key = metric_types.MetricKey( name='weighted_example_count') label_key = metric_types.MetricKey(name='mean_label') self.assertDictElementsAlmostEqual( got_metrics, { example_count_key: 3, weighted_example_count_key: (1.0 + 2.0 + 3.0), label_key: (0 * 1.0 + 1 * 2.0 + 0 * 3.0) / (1.0 + 2.0 + 3.0), }) except AssertionError as err: raise util.BeamAssertException(err) def check_plots(got): try: self.assertLen(got, 1) got_slice_key, got_plots = got[0] self.assertEqual(got_slice_key, ()) plot_key = metric_types.PlotKey('calibration_plot') self.assertIn(plot_key, got_plots) # 10 buckets + 2 for edge cases self.assertLen(got_plots[plot_key].buckets, 12) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics_and_plots[constants.METRICS_KEY], check_metrics, label='metrics') util.assert_that(metrics_and_plots[constants.PLOTS_KEY], check_plots, label='plots')
def testInputExtractor(self): model_spec = config.ModelSpec(label_key='label', example_weight_key='example_weight') extractor = input_extractor.InputExtractor( eval_config=config.EvalConfig(model_specs=[model_spec])) examples = [ self._makeExample(label=1.0, example_weight=0.5, fixed_int=1, fixed_float=1.0, fixed_string='fixed_string1'), self._makeExample(label=0.0, example_weight=0.0, fixed_int=1, fixed_float=1.0, fixed_string='fixed_string2'), self._makeExample(label=0.0, example_weight=1.0, fixed_int=2, fixed_float=0.0, fixed_string='fixed_string3') ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples], reshuffle=False) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | extractor.stage_name >> extractor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 3) self.assertDictElementsAlmostEqual( got[0][constants.FEATURES_KEY], { 'fixed_int': np.array([1]), 'fixed_float': np.array([1.0]), }) self.assertEqual( got[0][constants.FEATURES_KEY]['fixed_string'], np.array([b'fixed_string1'])) self.assertAlmostEqual(got[0][constants.LABELS_KEY], np.array([1.0])) self.assertAlmostEqual( got[0][constants.EXAMPLE_WEIGHTS_KEY], np.array([0.5])) self.assertDictElementsAlmostEqual( got[1][constants.FEATURES_KEY], { 'fixed_int': np.array([1]), 'fixed_float': np.array([1.0]), }) self.assertEqual( got[1][constants.FEATURES_KEY]['fixed_string'], np.array([b'fixed_string2'])) self.assertAlmostEqual(got[1][constants.LABELS_KEY], np.array([0.0])) self.assertAlmostEqual( got[1][constants.EXAMPLE_WEIGHTS_KEY], np.array([0.0])) self.assertDictElementsAlmostEqual( got[2][constants.FEATURES_KEY], { 'fixed_int': np.array([2]), 'fixed_float': np.array([0.0]), }) self.assertEqual( got[2][constants.FEATURES_KEY]['fixed_string'], np.array([b'fixed_string3'])) self.assertAlmostEqual(got[2][constants.LABELS_KEY], np.array([0.0])) self.assertAlmostEqual( got[2][constants.EXAMPLE_WEIGHTS_KEY], np.array([1.0])) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, check_result, label='result')
def testInputExtractorMultiModel(self): model_spec1 = config.ModelSpec(name='model1', label_key='label', example_weight_key='example_weight', prediction_key='fixed_float') model_spec2 = config.ModelSpec(name='model2', label_keys={ 'output1': 'label1', 'output2': 'label2' }, example_weight_keys={ 'output1': 'example_weight1', 'output2': 'example_weight2' }, prediction_keys={ 'output1': 'fixed_float', 'output2': 'fixed_float' }) extractor = input_extractor.InputExtractor( eval_config=config.EvalConfig( model_specs=[model_spec1, model_spec2])) examples = [ self._makeExample(label=1.0, label1=1.0, label2=0.0, example_weight=0.5, example_weight1=0.5, example_weight2=0.5, fixed_int=1, fixed_float=1.0, fixed_string='fixed_string1'), self._makeExample(label=1.0, label1=1.0, label2=1.0, example_weight=0.0, example_weight1=0.0, example_weight2=1.0, fixed_int=1, fixed_float=2.0, fixed_string='fixed_string2'), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples], reshuffle=False) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | extractor.stage_name >> extractor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 2) self.assertDictElementsAlmostEqual( got[0][constants.FEATURES_KEY], { 'fixed_int': np.array([1]), }) self.assertEqual( got[0][constants.FEATURES_KEY]['fixed_string'], np.array([b'fixed_string1'])) for model_name in ('model1', 'model2'): self.assertIn(model_name, got[0][constants.LABELS_KEY]) self.assertIn(model_name, got[0][constants.EXAMPLE_WEIGHTS_KEY]) self.assertIn(model_name, got[0][constants.PREDICTIONS_KEY]) self.assertAlmostEqual( got[0][constants.LABELS_KEY]['model1'], np.array([1.0])) self.assertDictElementsAlmostEqual( got[0][constants.LABELS_KEY]['model2'], { 'output1': np.array([1.0]), 'output2': np.array([0.0]) }) self.assertAlmostEqual( got[0][constants.EXAMPLE_WEIGHTS_KEY]['model1'], np.array([0.5])) self.assertDictElementsAlmostEqual( got[0][constants.EXAMPLE_WEIGHTS_KEY]['model2'], { 'output1': np.array([0.5]), 'output2': np.array([0.5]) }) self.assertAlmostEqual( got[0][constants.PREDICTIONS_KEY]['model1'], np.array([1.0])) self.assertDictElementsAlmostEqual( got[0][constants.PREDICTIONS_KEY]['model2'], { 'output1': np.array([1.0]), 'output2': np.array([1.0]) }) self.assertDictElementsAlmostEqual( got[1][constants.FEATURES_KEY], { 'fixed_int': np.array([1]), }) self.assertEqual( got[1][constants.FEATURES_KEY]['fixed_string'], np.array([b'fixed_string2'])) for model_name in ('model1', 'model2'): self.assertIn(model_name, got[1][constants.LABELS_KEY]) self.assertIn(model_name, got[1][constants.EXAMPLE_WEIGHTS_KEY]) self.assertIn(model_name, got[1][constants.PREDICTIONS_KEY]) self.assertAlmostEqual( got[1][constants.LABELS_KEY]['model1'], np.array([1.0])) self.assertDictElementsAlmostEqual( got[1][constants.LABELS_KEY]['model2'], { 'output1': np.array([1.0]), 'output2': np.array([1.0]) }) self.assertAlmostEqual( got[1][constants.EXAMPLE_WEIGHTS_KEY]['model1'], np.array([0.0])) self.assertDictElementsAlmostEqual( got[1][constants.EXAMPLE_WEIGHTS_KEY]['model2'], { 'output1': np.array([0.0]), 'output2': np.array([1.0]) }) self.assertAlmostEqual( got[1][constants.PREDICTIONS_KEY]['model1'], np.array([2.0])) self.assertDictElementsAlmostEqual( got[1][constants.PREDICTIONS_KEY]['model2'], { 'output1': np.array([2.0]), 'output2': np.array([2.0]) }) except AssertionError as err: raise util.BeamAssertException(err) 
util.assert_that(result, check_result, label='result')
def testEvaluateWithMultiClassModel(self): n_classes = 3 temp_export_dir = self._getExportDir() _, export_dir = dnn_classifier.simple_dnn_classifier( None, temp_export_dir, n_classes=n_classes) # Add example_count and weighted_example_count eval_config = config.EvalConfig( model_specs=[ config.ModelSpec(location=export_dir, label_key='label', example_weight_key='age') ], slicing_specs=[config.SlicingSpec()], metrics_specs=metric_specs.specs_from_metrics( [calibration.MeanLabel('mean_label')], binarize=config.BinarizationOptions( class_ids=range(n_classes)))) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) slice_spec = [ slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs ] extractors = [ input_extractor.InputExtractor(eval_config=eval_config), predict_extractor_v2.PredictExtractor( eval_config=eval_config, eval_shared_models=[eval_shared_model]), slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec) ] evaluators = [ metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator( eval_config=eval_config, eval_shared_models=[eval_shared_model]) ] examples = [ self._makeExample(age=1.0, language='english', label=0), self._makeExample(age=2.0, language='chinese', label=1), self._makeExample(age=3.0, language='english', label=2), self._makeExample(age=4.0, language='chinese', label=1), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter metrics = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate( extractors=extractors, evaluators=evaluators)) # pylint: enable=no-value-for-parameter def check_metrics(got): try: self.assertLen(got, 1) got_slice_key, got_metrics = got[0] example_count_key = metric_types.MetricKey( name='example_count') weighted_example_count_key = metric_types.MetricKey( name='weighted_example_count') label_key_class_0 = metric_types.MetricKey( name='mean_label', sub_key=metric_types.SubKey(class_id=0)) label_key_class_1 = metric_types.MetricKey( name='mean_label', sub_key=metric_types.SubKey(class_id=1)) label_key_class_2 = metric_types.MetricKey( name='mean_label', sub_key=metric_types.SubKey(class_id=2)) self.assertEqual(got_slice_key, ()) self.assertDictElementsAlmostEqual( got_metrics, { example_count_key: 4, weighted_example_count_key: (1.0 + 2.0 + 3.0 + 4.0), label_key_class_0: (1 * 1.0 + 0 * 2.0 + 0 * 3.0 + 0 * 4.0) / (1.0 + 2.0 + 3.0 + 4.0), label_key_class_1: (0 * 1.0 + 1 * 2.0 + 0 * 3.0 + 1 * 4.0) / (1.0 + 2.0 + 3.0 + 4.0), label_key_class_2: (0 * 1.0 + 0 * 2.0 + 1 * 3.0 + 0 * 4.0) / (1.0 + 2.0 + 3.0 + 4.0) }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testEvaluateWithSlicing(self):
  temp_export_dir = self._getExportDir()
  _, export_dir = (
      fixed_prediction_estimator_extra_fields
      .simple_fixed_prediction_estimator_extra_fields(None, temp_export_dir))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              location=export_dir,
              label_key='label',
              example_weight_key='fixed_float')
      ],
      slicing_specs=[
          config.SlicingSpec(),
          config.SlicingSpec(feature_keys=['fixed_string']),
      ],
      metrics_specs=metric_specs.specs_from_metrics([
          calibration.MeanLabel('mean_label'),
          calibration.MeanPrediction('mean_prediction')
      ]))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir)
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model=eval_shared_model),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  # fixed_float used as example_weight key
  examples = [
      self._makeExample(prediction=0.2, label=1.0, fixed_int=1,
                        fixed_float=1.0, fixed_string='fixed_string1'),
      self._makeExample(prediction=0.8, label=0.0, fixed_int=1,
                        fixed_float=1.0, fixed_string='fixed_string1'),
      self._makeExample(prediction=0.5, label=0.0, fixed_int=2,
                        fixed_float=2.0, fixed_string='fixed_string2')
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 3)
        slices = {}
        for slice_key, value in got:
          slices[slice_key] = value
        overall_slice = ()
        fixed_string1_slice = (('fixed_string', b'fixed_string1'),)
        fixed_string2_slice = (('fixed_string', b'fixed_string2'),)
        self.assertCountEqual(
            list(slices.keys()),
            [overall_slice, fixed_string1_slice, fixed_string2_slice])
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key = metric_types.MetricKey(name='mean_label')
        pred_key = metric_types.MetricKey(name='mean_prediction')
        self.assertDictElementsAlmostEqual(
            slices[overall_slice], {
                example_count_key: 3,
                weighted_example_count_key: 4.0,
                label_key: (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                pred_key: (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
            })
        self.assertDictElementsAlmostEqual(
            slices[fixed_string1_slice], {
                example_count_key: 2,
                weighted_example_count_key: 2.0,
                label_key: (1.0 + 0.0) / (1.0 + 1.0),
                pred_key: (0.2 + 0.8) / (1.0 + 1.0),
            })
        self.assertDictElementsAlmostEqual(
            slices[fixed_string2_slice], {
                example_count_key: 1,
                weighted_example_count_key: 2.0,
                label_key: (2 * 0.0) / 2.0,
                pred_key: (2 * 0.5) / 2.0,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testEvaluateWithEvalSavedModel(self): temp_export_dir = self._getExportDir() _, export_dir = linear_classifier.simple_linear_classifier( None, temp_export_dir) eval_config = config.EvalConfig( model_specs=[config.ModelSpec(signature_name='eval')], slicing_specs=[ config.SlicingSpec(), config.SlicingSpec(feature_keys=['slice_key']), ]) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, add_metrics_callbacks=[_addExampleCountMetricCallback]) slice_spec = [ slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs ] extractors = [ predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec) ] evaluators = [ metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator( eval_config=eval_config, eval_shared_model=eval_shared_model) ] examples = [ self._makeExample(age=3.0, language='english', label=1.0, slice_key='first_slice'), self._makeExample(age=3.0, language='chinese', label=0.0, slice_key='first_slice'), self._makeExample(age=4.0, language='english', label=0.0, slice_key='second_slice'), self._makeExample(age=5.0, language='chinese', label=1.0, slice_key='second_slice'), self._makeExample(age=5.0, language='chinese', label=1.0, slice_key='second_slice') ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter metrics = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate( extractors=extractors, evaluators=evaluators)) # pylint: enable=no-value-for-parameter def check_metrics(got): try: self.assertLen(got, 3) slices = {} for slice_key, value in got: slices[slice_key] = value overall_slice = () first_slice = (('slice_key', b'first_slice'), ) second_slice = (('slice_key', b'second_slice'), ) self.assertCountEqual( list(slices.keys()), [overall_slice, first_slice, second_slice]) self.assertDictElementsAlmostEqual( slices[overall_slice], { metric_types.MetricKey(name='accuracy'): 0.4, metric_types.MetricKey(name='label/mean'): 0.6, metric_types.MetricKey(name='my_mean_age'): 4.0, metric_types.MetricKey(name='my_mean_age_times_label'): 2.6, metric_types.MetricKey(name='added_example_count'): 5.0 }) self.assertDictElementsAlmostEqual( slices[first_slice], { metric_types.MetricKey(name='accuracy'): 1.0, metric_types.MetricKey(name='label/mean'): 0.5, metric_types.MetricKey(name='my_mean_age'): 3.0, metric_types.MetricKey(name='my_mean_age_times_label'): 1.5, metric_types.MetricKey(name='added_example_count'): 2.0 }) self.assertDictElementsAlmostEqual( slices[second_slice], { metric_types.MetricKey(name='accuracy'): 0.0, metric_types.MetricKey(name='label/mean'): 2.0 / 3.0, metric_types.MetricKey(name='my_mean_age'): 14.0 / 3.0, metric_types.MetricKey(name='my_mean_age_times_label'): 10.0 / 3.0, metric_types.MetricKey(name='added_example_count'): 3.0 }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testModelAgnosticConstructFn(self):
  # End to end test for the entire flow going from tf.Examples -> metrics
  # with slicing.
  with beam.Pipeline() as pipeline:
    # Set up the inputs. All we need are tf.Examples and an example parsing
    # spec with an explicit mapping of keys to (Features, Predictions, Labels).
    examples = [
        self._makeExample(
            age=3.0, language='english', probabilities=1.0, labels=1.0),
        self._makeExample(
            age=3.0, language='chinese', probabilities=3.0, labels=0.0),
        self._makeExample(
            age=4.0, language='english', probabilities=2.0, labels=1.0),
        self._makeExample(
            age=5.0, language='chinese', probabilities=3.0, labels=0.0),
        # Add some examples with no language.
        self._makeExample(age=5.0, probabilities=2.0, labels=10.0),
        self._makeExample(age=6.0, probabilities=1.0, labels=0.0)
    ]
    serialized_examples = [e.SerializeToString() for e in examples]

    # Set up a config to bucket our example keys.
    feature_map = {
        'age': tf.FixedLenFeature([], tf.float32),
        'language': tf.VarLenFeature(tf.string),
        'probabilities': tf.FixedLenFeature([], tf.float32),
        'labels': tf.FixedLenFeature([], tf.float32)
    }
    model_agnostic_config = agnostic_predict.ModelAgnosticConfig(
        label_keys=['labels'],
        prediction_keys=['probabilities'],
        feature_spec=feature_map)

    # Set up the Model Agnostic Extractor.
    extractors = [
        model_agnostic_extractor.ModelAgnosticExtractor(
            model_agnostic_config=model_agnostic_config,
            desired_batch_size=3),
        slice_key_extractor.SliceKeyExtractor([
            slicer.SingleSliceSpec(),
            slicer.SingleSliceSpec(columns=['language'])
        ])
    ]

    # Set up the metrics we wish to calculate via a metric callback. In
    # particular, this metric calculates the mean and sum of all labels.
    eval_shared_model = types.EvalSharedModel(
        add_metrics_callbacks=[add_mean_callback],
        construct_fn=model_agnostic_evaluate_graph.make_construct_fn(
            add_metrics_callbacks=[add_mean_callback],
            fpl_feed_config=model_agnostic_extractor
            .ModelAgnosticGetFPLFeedConfig(model_agnostic_config)))

    # Run our pipeline doing Extract -> Slice -> Fanout -> Calculate Metrics.
    metrics, _ = (
        pipeline
        | 'Create Examples' >> beam.Create(serialized_examples)
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
        .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

    # Verify our metrics are properly generated per slice.
    def check_result(got):
      self.assertEqual(3, len(got), 'got: %s' % got)
      slices = {}
      for slice_key, metrics in got:
        slices[slice_key] = metrics
      overall_slice = ()
      english_slice = (('language', b'english'),)
      chinese_slice = (('language', b'chinese'),)
      self.assertItemsEqual(
          list(slices.keys()),
          [overall_slice, english_slice, chinese_slice])
      # Overall slice has label/predictions sum = 24 and 12 elements.
      self.assertDictElementsAlmostEqual(slices[overall_slice], {
          'tf_metric_mean': 2.0,
          'py_func_total_label': 24.0,
      })
      # English slice has label/predictions sum = 5 and 4 elements.
      self.assertDictElementsAlmostEqual(slices[english_slice], {
          'tf_metric_mean': 1.25,
          'py_func_total_label': 5.0,
      })
      # Chinese slice has label/predictions sum = 6 and 4 elements.
      self.assertDictElementsAlmostEqual(slices[chinese_slice], {
          'tf_metric_mean': 1.5,
          'py_func_total_label': 6.0,
      })

    util.assert_that(metrics, check_result)
def testEvaluateWithMultiOutputModel(self): temp_export_dir = self._getExportDir() _, export_dir = multi_head.simple_multi_head(None, temp_export_dir) eval_config = config.EvalConfig( model_specs=[ config.ModelSpec(location=export_dir, label_keys={ 'chinese_head': 'chinese_label', 'english_head': 'english_label', 'other_head': 'other_label' }, example_weight_keys={ 'chinese_head': 'age', 'english_head': 'age', 'other_head': 'age' }) ], slicing_specs=[config.SlicingSpec()], metrics_specs=metric_specs.specs_from_metrics({ 'chinese_head': [calibration.MeanLabel('mean_label')], 'english_head': [calibration.MeanLabel('mean_label')], 'other_head': [calibration.MeanLabel('mean_label')], })) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) slice_spec = [ slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs ] extractors = [ input_extractor.InputExtractor(eval_config=eval_config), predict_extractor_v2.PredictExtractor( eval_config=eval_config, eval_shared_models=[eval_shared_model]), slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec) ] evaluators = [ metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator( eval_config=eval_config, eval_shared_models=[eval_shared_model]) ] examples = [ self._makeExample(age=1.0, language='english', english_label=1.0, chinese_label=0.0, other_label=0.0), self._makeExample(age=1.0, language='chinese', english_label=0.0, chinese_label=1.0, other_label=0.0), self._makeExample(age=2.0, language='english', english_label=1.0, chinese_label=0.0, other_label=0.0), self._makeExample(age=2.0, language='other', english_label=0.0, chinese_label=1.0, other_label=1.0), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter metrics = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate( extractors=extractors, evaluators=evaluators)) # pylint: enable=no-value-for-parameter def check_metrics(got): try: self.assertLen(got, 1) got_slice_key, got_metrics = got[0] self.assertEqual(got_slice_key, ()) example_count_key = metric_types.MetricKey( name='example_count') chinese_weighted_example_count_key = metric_types.MetricKey( name='weighted_example_count', output_name='chinese_head') chinese_label_key = metric_types.MetricKey( name='mean_label', output_name='chinese_head') english_weighted_example_count_key = metric_types.MetricKey( name='weighted_example_count', output_name='english_head') english_label_key = metric_types.MetricKey( name='mean_label', output_name='english_head') other_weighted_example_count_key = metric_types.MetricKey( name='weighted_example_count', output_name='other_head') other_label_key = metric_types.MetricKey( name='mean_label', output_name='other_head') self.assertDictElementsAlmostEqual( got_metrics, { example_count_key: 4, chinese_label_key: (0.0 + 1.0 + 2 * 0.0 + 2 * 1.0) / (1.0 + 1.0 + 2.0 + 2.0), chinese_weighted_example_count_key: (1.0 + 1.0 + 2.0 + 2.0), english_label_key: (1.0 + 0.0 + 2 * 1.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0 + 2.0), english_weighted_example_count_key: (1.0 + 1.0 + 2.0 + 2.0), other_label_key: (0.0 + 0.0 + 2 * 0.0 + 2 * 1.0) / (1.0 + 1.0 + 2.0 + 2.0), other_weighted_example_count_key: (1.0 + 1.0 + 2.0 + 2.0) }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testEvaluateWithKerasModel(self): input1 = tf.keras.layers.Input(shape=(1, ), name='input1') input2 = tf.keras.layers.Input(shape=(1, ), name='input2') inputs = [input1, input2] input_layer = tf.keras.layers.concatenate(inputs) output_layer = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid, name='output')(input_layer) model = tf.keras.models.Model(inputs, output_layer) model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001), loss=tf.keras.losses.binary_crossentropy, metrics=['accuracy']) features = {'input1': [[0.0], [1.0]], 'input2': [[1.0], [0.0]]} labels = [[1], [0]] example_weights = [1.0, 0.5] dataset = tf.data.Dataset.from_tensor_slices( (features, labels, example_weights)) dataset = dataset.shuffle(buffer_size=1).repeat().batch(2) model.fit(dataset, steps_per_epoch=1) export_dir = self._getExportDir() model.save(export_dir, save_format='tf') eval_config = config.EvalConfig( model_specs=[ config.ModelSpec(location=export_dir, label_key='label', example_weight_key='example_weight') ], slicing_specs=[config.SlicingSpec()], metrics_specs=metric_specs.specs_from_metrics( [calibration.MeanLabel('mean_label')])) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) slice_spec = [ slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs ] extractors = [ input_extractor.InputExtractor(eval_config=eval_config), predict_extractor_v2.PredictExtractor( eval_config=eval_config, eval_shared_models=[eval_shared_model]), slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec) ] evaluators = [ metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator( eval_config=eval_config, eval_shared_models=[eval_shared_model]) ] examples = [ self._makeExample(input1=0.0, input2=1.0, label=1.0, example_weight=1.0, extra_feature='non_model_feature'), self._makeExample(input1=1.0, input2=0.0, label=0.0, example_weight=0.5, extra_feature='non_model_feature'), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter metrics = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate( extractors=extractors, evaluators=evaluators)) # pylint: enable=no-value-for-parameter def check_metrics(got): try: self.assertLen(got, 1) got_slice_key, got_metrics = got[0] self.assertEqual(got_slice_key, ()) example_count_key = metric_types.MetricKey( name='example_count') weighted_example_count_key = metric_types.MetricKey( name='weighted_example_count') label_key = metric_types.MetricKey(name='mean_label') self.assertDictElementsAlmostEqual( got_metrics, { example_count_key: 2, weighted_example_count_key: (1.0 + 0.5), label_key: (1.0 * 1.0 + 0.0 * 0.5) / (1.0 + 0.5), }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testEvaluateWithQueryBasedMetrics(self): temp_export_dir = self._getExportDir() _, export_dir = (fixed_prediction_estimator_extra_fields. simple_fixed_prediction_estimator_extra_fields( None, temp_export_dir)) eval_config = config.EvalConfig( model_specs=[ config.ModelSpec(location=export_dir, label_key='label', example_weight_key='fixed_int') ], slicing_specs=[ config.SlicingSpec(), config.SlicingSpec(feature_keys=['fixed_string']), ], metrics_specs=metric_specs.specs_from_metrics( [ndcg.NDCG(gain_key='fixed_float', name='ndcg')], binarize=config.BinarizationOptions(top_k_list=[1, 2]), query_key='fixed_string')) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) slice_spec = [ slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs ] extractors = [ input_extractor.InputExtractor(eval_config=eval_config), predict_extractor_v2.PredictExtractor( eval_config=eval_config, eval_shared_models=[eval_shared_model]), slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec) ] evaluators = [ metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator( eval_config=eval_config, eval_shared_models=[eval_shared_model]) ] # fixed_string used as query_key # fixed_float used as gain_key for NDCG # fixed_int used as example_weight_key for NDCG examples = [ self._makeExample(prediction=0.2, label=1.0, fixed_float=1.0, fixed_string='query1', fixed_int=1), self._makeExample(prediction=0.8, label=0.0, fixed_float=0.5, fixed_string='query1', fixed_int=1), self._makeExample(prediction=0.5, label=0.0, fixed_float=0.5, fixed_string='query2', fixed_int=2), self._makeExample(prediction=0.9, label=1.0, fixed_float=1.0, fixed_string='query2', fixed_int=2), self._makeExample(prediction=0.1, label=0.0, fixed_float=0.1, fixed_string='query2', fixed_int=2), self._makeExample(prediction=0.9, label=1.0, fixed_float=1.0, fixed_string='query3', fixed_int=3) ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter metrics = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate( extractors=extractors, evaluators=evaluators)) # pylint: enable=no-value-for-parameter def check_metrics(got): try: self.assertLen(got, 4) slices = {} for slice_key, value in got: slices[slice_key] = value overall_slice = () query1_slice = (('fixed_string', b'query1'), ) query2_slice = (('fixed_string', b'query2'), ) query3_slice = (('fixed_string', b'query3'), ) self.assertCountEqual(list(slices.keys()), [ overall_slice, query1_slice, query2_slice, query3_slice ]) example_count_key = metric_types.MetricKey( name='example_count') weighted_example_count_key = metric_types.MetricKey( name='weighted_example_count') ndcg1_key = metric_types.MetricKey( name='ndcg', sub_key=metric_types.SubKey(top_k=1)) ndcg2_key = metric_types.MetricKey( name='ndcg', sub_key=metric_types.SubKey(top_k=2)) # Query1 (weight=1): (p=0.8, g=0.5) (p=0.2, g=1.0) # Query2 (weight=2): (p=0.9, g=1.0) (p=0.5, g=0.5) (p=0.1, g=0.1) # Query3 (weight=3): (p=0.9, g=1.0) # # DCG@1: 0.5, 1.0, 1.0 # NDCG@1: 0.5, 1.0, 1.0 # Average NDCG@1: (1 * 0.5 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.92 # # DCG@2: (0.5 + 1.0/log(3) ~ 0.630930 # (1.0 + 0.5/log(3) ~ 1.315465 # 1.0 # NDCG@2: (0.5 + 1.0/log(3)) / (1.0 + 0.5/log(3)) ~ 0.85972 # (1.0 + 0.5/log(3)) / (1.0 + 0.5/log(3)) = 1.0 # 1.0 # Average NDCG@2: (1 * 0.860 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.97 
self.assertDictElementsAlmostEqual( slices[overall_slice], { example_count_key: 6, weighted_example_count_key: 11.0, ndcg1_key: 0.9166667, ndcg2_key: 0.9766198 }) self.assertDictElementsAlmostEqual( slices[query1_slice], { example_count_key: 2, weighted_example_count_key: 2.0, ndcg1_key: 0.5, ndcg2_key: 0.85972 }) self.assertDictElementsAlmostEqual( slices[query2_slice], { example_count_key: 3, weighted_example_count_key: 6.0, ndcg1_key: 1.0, ndcg2_key: 1.0 }) self.assertDictElementsAlmostEqual( slices[query3_slice], { example_count_key: 1, weighted_example_count_key: 3.0, ndcg1_key: 1.0, ndcg2_key: 1.0 }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics[constants.METRICS_KEY], check_metrics, label='metrics')
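# A small standalone check (not part of the test above) that reproduces the
# NDCG figures used in the comments and assertions, assuming base-2
# logarithmic discounting and the per-query weights (fixed_int) from the
# examples; it depends only on the standard library.
import math

def dcg(gains, k):
  # Discounted cumulative gain over the top-k gains with base-2 discounting.
  return sum(g / math.log2(i + 2) for i, g in enumerate(gains[:k]))

def ndcg(gains_in_prediction_order, k):
  ideal = sorted(gains_in_prediction_order, reverse=True)
  return dcg(gains_in_prediction_order, k) / dcg(ideal, k)

# Per-query gains ordered by descending prediction, with the query weights
# used in the test (fixed_int = 1, 2, 3).
queries = {
    'query1': ([0.5, 1.0], 1.0),
    'query2': ([1.0, 0.5, 0.1], 2.0),
    'query3': ([1.0], 3.0),
}
for k in (1, 2):
  weighted_sum = sum(w * ndcg(g, k) for g, w in queries.values())
  total_weight = sum(w for _, w in queries.values())
  print('ndcg@%d = %.7f' % (k, weighted_sum / total_weight))
# Expected output: ndcg@1 = 0.9166667, ndcg@2 = 0.9766198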
def testEvaluateWithPlots(self): temp_eval_export_dir = self._getEvalExportDir() _, eval_export_dir = ( fixed_prediction_estimator.simple_fixed_prediction_estimator( None, temp_eval_export_dir)) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir, add_metrics_callbacks=[ post_export_metrics.example_count(), post_export_metrics.auc_plots() ]) extractors = [ predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor() ] with beam.Pipeline() as pipeline: example1 = self._makeExample(prediction=0.0, label=1.0) example2 = self._makeExample(prediction=0.7, label=0.0) example3 = self._makeExample(prediction=0.8, label=1.0) example4 = self._makeExample(prediction=1.0, label=1.0) metrics, plots = ( pipeline | 'Create' >> beam.Create([ example1.SerializeToString(), example2.SerializeToString(), example3.SerializeToString(), example4.SerializeToString() ]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'Extract' >> tfma_unit.Extract(extractors=extractors) # pylint: disable=no-value-for-parameter | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator. ComputeMetricsAndPlots(eval_shared_model=eval_shared_model)) def check_metrics(got): try: self.assertEqual(1, len(got), 'got: %s' % got) (slice_key, value) = got[0] self.assertEqual((), slice_key) self.assertDictElementsAlmostEqual( got_values_dict=value, expected_values_dict={ metric_keys.EXAMPLE_COUNT: 4.0, }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics, check_metrics, label='metrics') def check_plots(got): try: self.assertEqual(1, len(got), 'got: %s' % got) (slice_key, value) = got[0] self.assertEqual((), slice_key) self.assertDictMatrixRowsAlmostEqual( got_values_dict=value, expected_values_dict={ _full_key(metric_keys.AUC_PLOTS_MATRICES): [(8001, [2, 1, 0, 1, 1.0 / 1.0, 1.0 / 3.0])], }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(plots, check_plots, label='plots')
def testPredictExtractorWithMultiModels(self): temp_export_dir = self._getExportDir() export_dir1, _ = multi_head.simple_multi_head(temp_export_dir, None) export_dir2, _ = multi_head.simple_multi_head(temp_export_dir, None) eval_config = config.EvalConfig(model_specs=[ config.ModelSpec(name='model1'), config.ModelSpec(name='model2') ]) eval_shared_model1 = self.createTestEvalSharedModel( eval_saved_model_path=export_dir1, tags=[tf.saved_model.SERVING]) eval_shared_model2 = self.createTestEvalSharedModel( eval_saved_model_path=export_dir2, tags=[tf.saved_model.SERVING]) predict_extractor = predict_extractor_v2.PredictExtractor( eval_config=eval_config, eval_shared_model={ 'model1': eval_shared_model1, 'model2': eval_shared_model2 }) examples = [ self._makeExample( age=1.0, language='english', english_label=1.0, chinese_label=0.0, other_label=0.0), self._makeExample( age=1.0, language='chinese', english_label=0.0, chinese_label=1.0, other_label=0.0), self._makeExample( age=2.0, language='english', english_label=1.0, chinese_label=0.0, other_label=0.0), self._makeExample( age=2.0, language='other', english_label=0.0, chinese_label=1.0, other_label=1.0) ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create([e.SerializeToString() for e in examples], reshuffle=False) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | predict_extractor.stage_name >> predict_extractor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 4) for item in got: # We can't verify the actual predictions, but we can verify the keys self.assertIn(constants.PREDICTIONS_KEY, item) for model_name in ('model1', 'model2'): self.assertIn(model_name, item[constants.PREDICTIONS_KEY]) for output_name in ('chinese_head', 'english_head', 'other_head'): for pred_key in ('logistic', 'probabilities', 'all_classes'): self.assertIn(output_name + '/' + pred_key, item[constants.PREDICTIONS_KEY][model_name]) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, check_result, label='result')
def assertGeneralMetricsComputedWithBeamAre(
    self, eval_saved_model_path: Text,
    examples_pcollection: beam.pvalue.PCollection,
    slice_spec: List[slicer.SingleSliceSpec],
    add_metrics_callbacks: List[types.AddMetricsCallbackType],
    expected_slice_metrics: Dict[Any, Dict[Text, Any]]):
  """Checks metrics computed using Beam.

  A more general version of assertMetricsComputedWithBeamAre. Note that the
  caller is responsible for setting up and running the Beam pipeline.

  Example usage:

    def add_metrics(features, predictions, labels):
      metric_ops = {
          'mse': tf.metrics.mean_squared_error(labels, predictions['logits']),
          'mae': tf.metrics.mean_absolute_error(labels, predictions['logits']),
      }
      return metric_ops

    with beam.Pipeline() as pipeline:
      expected_slice_metrics = {
          (): {
              'mae': 0.1,
              'mse': 0.2,
              tfma.post_export_metrics.metric_keys.AUC:
                  tfma.test.BoundedValue(lower_bound=0.5)
          },
          (('age', 10),): {
              'mae': 0.2,
              'mse': 0.3,
              tfma.post_export_metrics.metric_keys.AUC:
                  tfma.test.BoundedValue(lower_bound=0.5)
          },
      }
      examples = pipeline | 'ReadExamples' >> beam.io.ReadFromTFRecord(path)
      self.assertGeneralMetricsComputedWithBeamAre(
          eval_saved_model_path=path,
          examples_pcollection=examples,
          slice_spec=[tfma.slicer.SingleSliceSpec(),
                      tfma.slicer.SingleSliceSpec(columns=['age'])],
          add_metrics_callbacks=[
              add_metrics, tfma.post_export_metrics.auc()],
          expected_slice_metrics=expected_slice_metrics)

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    examples_pcollection: A PCollection of serialized example bytes.
    slice_spec: List of slice specifications.
    add_metrics_callbacks: Callbacks for adding additional metrics.
    expected_slice_metrics: Dictionary of dictionaries describing the expected
      metrics for each slice. The outer dictionary maps slice keys to the
      expected metrics for that slice.
  """

  def check_metrics(got):
    """Check metrics callback."""
    try:
      slices = {}
      for slice_key, value in got:
        slices[slice_key] = value
      self.assertItemsEqual(
          list(slices.keys()), list(expected_slice_metrics.keys()))
      for slice_key, expected_metrics in expected_slice_metrics.items():
        self.assertDictElementsWithinBounds(
            got_values_dict=slices[slice_key],
            expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  slicing_specs = None
  if slice_spec:
    slicing_specs = [s.to_proto() for s in slice_spec]
  eval_config = config.EvalConfig(slicing_specs=slicing_specs)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_config=eval_config, eval_shared_model=eval_shared_model)

  # pylint: disable=no-value-for-parameter
  (metrics, _), _ = (
      examples_pcollection
      | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
      | 'Extract' >> Extract(extractors=extractors)
      | 'ComputeMetricsAndPlots' >>
      legacy_metrics_and_plots_evaluator.ComputeMetricsAndPlots(
          eval_shared_model=eval_shared_model))
  # pylint: enable=no-value-for-parameter

  beam_util.assert_that(metrics, check_metrics)
def testEvaluateWithConfidenceIntervals(self):
  # NOTE: This test does not actually test that confidence intervals are
  # accurate; it only tests that the proto output by the test is well formed.
  # This test would pass if the confidence interval implementation did nothing
  # at all except compute the unsampled value.
  temp_export_dir = self._getExportDir()
  _, export_dir = (
      fixed_prediction_estimator_extra_fields
      .simple_fixed_prediction_estimator_extra_fields(None, temp_export_dir))
  options = config.Options()
  options.compute_confidence_intervals.value = True
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(label_key='label', example_weight_key='fixed_float')
      ],
      slicing_specs=[
          config.SlicingSpec(),
          config.SlicingSpec(feature_keys=['fixed_string']),
      ],
      metrics_specs=metric_specs.specs_from_metrics([
          calibration.MeanLabel('mean_label'),
          calibration.MeanPrediction('mean_prediction')
      ]),
      options=options)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_model=eval_shared_model),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_model=eval_shared_model)
  ]

  # fixed_float is used as the example_weight key.
  examples = [
      self._makeExample(
          prediction=0.2,
          label=1.0,
          fixed_int=1,
          fixed_float=1.0,
          fixed_string='fixed_string1'),
      self._makeExample(
          prediction=0.8,
          label=0.0,
          fixed_int=1,
          fixed_float=1.0,
          fixed_string='fixed_string1'),
      self._makeExample(
          prediction=0.5,
          label=0.0,
          fixed_int=2,
          fixed_float=2.0,
          fixed_string='fixed_string2')
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 3)
        slices = {}
        for slice_key, value in got:
          slices[slice_key] = value
        overall_slice = ()
        fixed_string1_slice = (('fixed_string', b'fixed_string1'),)
        fixed_string2_slice = (('fixed_string', b'fixed_string2'),)
        self.assertCountEqual(
            list(slices.keys()),
            [overall_slice, fixed_string1_slice, fixed_string2_slice])
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key = metric_types.MetricKey(name='mean_label')
        pred_key = metric_types.MetricKey(name='mean_prediction')
        self.assertDictElementsWithTDistributionAlmostEqual(
            slices[overall_slice], {
                example_count_key: 3,
                weighted_example_count_key: 4.0,
                label_key: (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                pred_key: (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
            })
        self.assertDictElementsWithTDistributionAlmostEqual(
            slices[fixed_string1_slice], {
                example_count_key: 2,
                weighted_example_count_key: 2.0,
                label_key: (1.0 + 0.0) / (1.0 + 1.0),
                pred_key: (0.2 + 0.8) / (1.0 + 1.0),
            })
        self.assertDictElementsWithTDistributionAlmostEqual(
            slices[fixed_string2_slice], {
                example_count_key: 1,
                weighted_example_count_key: 2.0,
                label_key: (2 * 0.0) / 2.0,
                pred_key: (2 * 0.5) / 2.0,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')