def _runTestWithCustomCheck(self,
                            examples,
                            eval_export_dir,
                            metrics,
                            custom_metrics_check=None,
                            custom_plots_check=None):
  """Runs the extract + evaluate pipeline and applies caller-supplied checks.

  Args:
    examples: Iterable of objects exposing SerializeToString(); each one is
      serialized and fed into the pipeline.
    eval_export_dir: Used as model_path of the EvalSharedModel.
    metrics: Used as add_metrics_callbacks of the EvalSharedModel.
    custom_metrics_check: Optional matcher given to util.assert_that for the
      metrics output.
    custom_plots_check: Optional matcher given to util.assert_that for the
      plots output.
  """
  # A run that verifies nothing is almost certainly a mistake in the test.
  self.assertFalse(custom_metrics_check is None and custom_plots_check is None)

  serialized = [example.SerializeToString() for example in examples]
  shared_model = types.EvalSharedModel(
      model_path=eval_export_dir, add_metrics_callbacks=metrics)
  extractors = model_eval_lib.default_extractors(
      eval_shared_model=shared_model)

  with beam.Pipeline() as pipeline:
    extracts = (
        pipeline
        | 'Create' >> beam.Create(serialized)
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors))
    computed_metrics, computed_plots = (
        extracts
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=shared_model))

    if custom_metrics_check is not None:
      util.assert_that(
          computed_metrics, custom_metrics_check, label='metrics')
    if custom_plots_check is not None:
      util.assert_that(computed_plots, custom_plots_check, label='plot')
def testEvaluateWithPlots(self):
  """Evaluates a fixed-prediction model and checks both metrics and plots."""
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))

  # (prediction, label) pairs, one per example.
  prediction_label_pairs = [(0.0, 1.0), (0.7, 0.0), (0.8, 1.0), (1.0, 1.0)]

  def check_metrics(got):
    try:
      self.assertEqual(1, len(got), 'got: %s' % got)
      slice_key, slice_metrics = got[0]
      self.assertEqual((), slice_key)
      self.assertDictElementsAlmostEqual(
          got_values_dict=slice_metrics,
          expected_values_dict={
              metric_keys.EXAMPLE_COUNT: 4.0,
          })
    except AssertionError as err:
      raise util.BeamAssertException(err)

  def check_plots(got):
    try:
      self.assertEqual(1, len(got), 'got: %s' % got)
      slice_key, slice_plots = got[0]
      self.assertEqual((), slice_key)
      expected_row = (8001, [2, 1, 0, 1, 1.0 / 1.0, 1.0 / 3.0])
      self.assertDictMatrixRowsAlmostEqual(
          got_values_dict=slice_plots,
          expected_values_dict={
              metric_keys.AUC_PLOTS_MATRICES: [expected_row],
          })
    except AssertionError as err:
      raise util.BeamAssertException(err)

  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(prediction=prediction, label=label)
        for prediction, label in prediction_label_pairs
    ]
    serialized = [example.SerializeToString() for example in examples]
    callbacks = [
        post_export_metrics.example_count(),
        post_export_metrics.auc_plots()
    ]

    metrics, plots = (
        pipeline
        | beam.Create(serialized)
        | evaluate.Evaluate(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=callbacks))

    util.assert_that(metrics, check_metrics, label='metrics')
    util.assert_that(plots, check_plots, label='plots')
def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
  """Checks custom and post-export metrics on the overall slice."""
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)

  # (age, language, label) triples, one per example.
  rows = [
      (3.0, 'english', 1.0),
      (3.0, 'chinese', 0.0),
      (4.0, 'english', 1.0),
      (5.0, 'chinese', 0.0),
  ]

  def check_result(got):
    try:
      self.assertEqual(1, len(got), 'got: %s' % got)
      slice_key, slice_metrics = got[0]
      self.assertEqual((), slice_key)
      expected = {
          'accuracy': 1.0,
          'label/mean': 0.5,
          'my_mean_age': 3.75,
          'my_mean_age_times_label': 1.75,
          'added_example_count': 4.0,
          metric_keys.EXAMPLE_COUNT: 4.0,
          metric_keys.EXAMPLE_WEIGHT: 15.0
      }
      self.assertDictElementsAlmostEqual(
          got_values_dict=slice_metrics, expected_values_dict=expected)
    except AssertionError as err:
      raise util.BeamAssertException(err)

  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(age=age, language=language, label=label)
        for age, language, label in rows
    ]
    serialized = [example.SerializeToString() for example in examples]
    callbacks = [
        _addExampleCountMetricCallback,
        post_export_metrics.example_count(),
        post_export_metrics.example_weight(example_weight_key='age')
    ]

    metrics, plots = (
        pipeline
        | beam.Create(serialized)
        | evaluate.Evaluate(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=callbacks))

    util.assert_that(metrics, check_result, label='metrics')
    util.assert_that(plots, util.is_empty(), label='plots')
def testEvaluateNoSlicing(self):
  """Evaluates four examples with explicit extractors; checks one slice.

  The pipeline output is expected to contain exactly one entry whose slice
  key is the empty tuple, carrying the linear classifier's metrics plus the
  metric added by _addExampleCountMetricCallback.
  """
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = types.EvalSharedModel(
      model_path=eval_export_dir,
      add_metrics_callbacks=[_addExampleCountMetricCallback])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    # The step label is spelled 'ToExampleAndExtracts' to match the transform
    # name and the label used by the other pipelines in this code base.
    metrics, _ = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors)
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))

    def check_result(got):
      # Re-raise as BeamAssertException so the failure is reported through
      # util.assert_that.
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            value, {
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
                'added_example_count': 4.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result)
def assertMetricsComputedWithBeamAre(self, eval_saved_model_path,
                                     serialized_examples, expected_metrics):
  """Asserts on metrics computed with Beam over all examples, unsliced.

  For anything more elaborate - supplying your own PCollection (e.g. reading
  many examples from a file), checking particular slices, or adding extra
  post-export metrics - use assertGeneralMetricsComputedWithBeamAre.

  Example usage:
    self.assertMetricsComputedWithBeamAre(
      eval_saved_model_path=path,
      serialized_examples=[example1.SerializeToString(),
                           example2.SerializeToString()],
      expected_metrics={'average_loss': 0.1})

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    serialized_examples: List of serialized example bytes.
    expected_metrics: Dictionary of expected metric values.
  """

  def check_metrics(got):
    """Matcher invoked by assert_that with the computed metrics."""
    try:
      self.assertEqual(
          1, len(got), 'expecting metrics for exactly one slice, but got %d '
          'slices instead. metrics were: %s' % (len(got), got))
      slice_key, slice_metrics = got[0]
      self.assertEqual((), slice_key)
      self.assertDictElementsWithinBounds(
          got_values_dict=slice_metrics,
          expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  shared_model = types.EvalSharedModel(model_path=eval_saved_model_path)
  extractors = model_eval_lib.default_extractors(
      eval_shared_model=shared_model)

  with beam.Pipeline() as pipeline:
    extracts = (
        pipeline
        | 'CreateExamples' >> beam.Create(serialized_examples)
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors))
    # Plots are produced as well but are not inspected here.
    computed_metrics, _ = (
        extracts
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=shared_model))

    beam_util.assert_that(computed_metrics, check_metrics)
def testEvaluateNoSlicingAddPostExportAndCustomMetricsUnsupervisedModel(
    self):
  """Checks example count/weight post-export metrics without labels.

  Mainly for testing that the ExampleCount post export metric works with
  unsupervised models. Two examples carrying only a `prediction` feature are
  evaluated; one overall slice with the expected metrics and no plots is
  asserted.
  """
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator_no_labels
      .simple_fixed_prediction_estimator_no_labels(None, temp_eval_export_dir))
  eval_shared_model = types.EvalSharedModel(
      model_path=eval_export_dir,
      add_metrics_callbacks=[
          post_export_metrics.example_count(),
          post_export_metrics.example_weight(example_weight_key='prediction')
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(prediction=1.0)
    example2 = self._makeExample(prediction=2.0)

    # The step label is spelled 'ToExampleAndExtracts' to match the transform
    # name and the label used by the other pipelines in this code base.
    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
        ])
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors)
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))

    def check_result(got):
      # Re-raise as BeamAssertException so the failure is reported through
      # util.assert_that.
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                'average_loss': 2.5,
                metric_keys.EXAMPLE_COUNT: 2.0,
                metric_keys.EXAMPLE_WEIGHT: 3.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result, label='metrics')
    util.assert_that(plots, util.is_empty(), label='plots')
def _runTestWithCustomCheck(self,
                            examples,
                            eval_export_dir,
                            metrics,
                            custom_metrics_check=None,
                            custom_plots_check=None):
  """Runs evaluate.Evaluate on `examples` and applies caller-supplied checks.

  Args:
    examples: Iterable of objects exposing SerializeToString(); each one is
      serialized and fed into the pipeline.
    eval_export_dir: Passed to evaluate.Evaluate as eval_saved_model_path.
    metrics: Passed to evaluate.Evaluate as add_metrics_callbacks.
    custom_metrics_check: Optional matcher given to util.assert_that for the
      metrics output.
    custom_plots_check: Optional matcher given to util.assert_that for the
      plots output.
  """
  # A run that verifies nothing is almost certainly a mistake in the test.
  self.assertFalse(custom_metrics_check is None and custom_plots_check is None)

  serialized = [example.SerializeToString() for example in examples]

  with beam.Pipeline() as pipeline:
    evaluation = evaluate.Evaluate(
        eval_saved_model_path=eval_export_dir,
        add_metrics_callbacks=metrics)
    computed_metrics, computed_plots = (
        pipeline | beam.Create(serialized) | evaluation)

    if custom_metrics_check is not None:
      util.assert_that(
          computed_metrics, custom_metrics_check, label='metrics')
    if custom_plots_check is not None:
      util.assert_that(computed_plots, custom_plots_check, label='plot')
def assertGeneralMetricsComputedWithBeamAre(self, eval_saved_model_path,
                                            examples_pcollection, slice_spec,
                                            add_metrics_callbacks,
                                            expected_slice_metrics):
  """Asserts on per-slice metrics computed with Beam.

  This is the more general sibling of assertMetricsComputedWithBeamAre. The
  caller owns the Beam pipeline: it must create it, build the examples
  PCollection on it, and run it.

  Example usage:
    def add_metrics(features, predictions, labels):
      metric_ops = {
        'mse': tf.metrics.mean_squared_error(labels, predictions['logits']),
        'mae': tf.metrics.mean_absolute_error(labels, predictions['logits']),
      }
      return metric_ops

    with beam.Pipeline() as pipeline:
      expected_slice_metrics = {
          (): {
            'mae': 0.1,
            'mse': 0.2,
            tfma.post_export_metrics.metric_keys.AUC:
              tfma.test.BoundedValue(lower_bound=0.5)
          },
          (('age', 10),): {
            'mae': 0.2,
            'mse': 0.3,
            tfma.post_export_metrics.metric_keys.AUC:
              tfma.test.BoundedValue(lower_bound=0.5)
          },
      }
      examples = pipeline | 'ReadExamples' >> beam.io.ReadFromTFRecord(path)
      self.assertGeneralMetricsComputedWithBeamAre(
        eval_saved_model_path=path,
        examples_pcollection=examples,
        slice_spec=[tfma.SingleSliceSpec(),
                    tfma.SingleSliceSpec(columns=['age'])],
        add_metrics_callbacks=[
          add_metrics, tfma.post_export_metrics.post_export_metrics.auc()],
        expected_slice_metrics=expected_slice_metrics)

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    examples_pcollection: A PCollection of serialized example bytes.
    slice_spec: List of slice specifications.
    add_metrics_callbacks: Callbacks for adding additional metrics.
    expected_slice_metrics: Dictionary of dictionaries describing the
      expected metrics for each slice. The outer dictionary maps slice keys
      to the expected metrics for that slice.
  """

  def check_metrics(got):
    """Matcher invoked by assert_that with (slice_key, metrics) pairs."""
    try:
      metrics_by_slice = {slice_key: value for slice_key, value in got}
      self.assertItemsEqual(metrics_by_slice.keys(),
                            expected_slice_metrics.keys())
      for slice_key, expected_metrics in expected_slice_metrics.items():
        self.assertDictElementsWithinBounds(
            got_values_dict=metrics_by_slice[slice_key],
            expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  evaluation = evaluate.Evaluate(
      eval_saved_model_path=eval_saved_model_path,
      slice_spec=slice_spec,
      add_metrics_callbacks=add_metrics_callbacks)
  # Plots are produced as well but are not inspected here.
  computed_metrics, _ = examples_pcollection | 'Evaluate' >> evaluation

  beam_util.assert_that(computed_metrics, check_metrics)
def EvaluateAndWriteResults(  # pylint: disable=invalid-name
    examples,
    eval_saved_model_path,
    output_path,
    display_only_data_location=None,
    slice_spec=None,
    example_weight_key=None,
    add_metrics_callbacks=None,
    desired_batch_size=None,
):
  """Public API version of evaluate.Evaluate that handles example weights.

  Users who want to construct their own Beam pipelines instead of using the
  lightweight run_model_analysis functions should use this PTransform.

  Example usage:

    with beam.Pipeline(runner=...) as p:
      _ = (p
           | 'ReadData' >> beam.io.ReadFromTFRecord(data_location)
           | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
               eval_saved_model_path=model_location,
               output_path=output_path,
               display_only_data_location=data_location,
               slice_spec=slice_spec,
               example_weight_key=example_weight_key,
               ...))
    result = tfma.load_eval_result(output_path=output_path)
    tfma.view.render_slicing_metrics(result)

  Note that the exact serialization format is an internal implementation
  detail and subject to change. Users should only use the TFMA functions to
  write and read the results.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_saved_model_path: Path to EvalSavedModel. This directory should
      contain the saved_model.pb file.
    output_path: Path to output metrics and plots results.
    display_only_data_location: Optional path indicating where the examples
      were read from. This is used only for display purposes - data will not
      actually be read from this path.
    slice_spec: Optional list of SingleSliceSpec specifying the slices to
      slice the data into. If None, defaults to the overall slice.
    example_weight_key: The key of the example weight column. If None, weight
      will be 1 for each example.
    add_metrics_callbacks: Optional list of callbacks for adding additional
      metrics to the graph. The names of the metrics added by the callbacks
      should not conflict with existing metrics, or metrics added by other
      callbacks. The list passed in is never modified; the example count (and,
      when example_weight_key is set, example weight) callbacks are appended
      to an internal copy.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.

  Returns:
    PDone.
  """
  # Work on a private copy: appending to the caller's list would leak the
  # built-in callbacks into it and duplicate them each time the same list is
  # reused for another evaluation.
  if add_metrics_callbacks is None:
    callbacks = []
  else:
    callbacks = list(add_metrics_callbacks)

  # Always compute example weight and example count.
  # pytype: disable=module-attr
  example_count_callback = post_export_metrics.example_count()
  example_weight_metric_key = metric_keys.EXAMPLE_COUNT
  callbacks.append(example_count_callback)
  if example_weight_key:
    example_weight_metric_key = metric_keys.EXAMPLE_WEIGHT
    example_weight_callback = post_export_metrics.example_weight(
        example_weight_key)
    callbacks.append(example_weight_callback)
  # pytype: enable=module-attr

  metrics, plots = examples | 'Evaluate' >> evaluate.Evaluate(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=callbacks,
      slice_spec=slice_spec,
      desired_batch_size=desired_batch_size)

  # The location is recorded in the config for display only.
  data_location = '<user provided PCollection>'
  if display_only_data_location is not None:
    data_location = display_only_data_location

  eval_config = api_types.EvalConfig(
      model_location=eval_saved_model_path,
      data_location=data_location,
      slice_spec=slice_spec,
      example_weight_metric_key=example_weight_metric_key)

  _ = ((metrics, plots)
       | 'SerializeMetricsAndPlots' >> serialization.SerializeMetricsAndPlots(
           post_export_metrics=callbacks)
       | 'WriteMetricsPlotsAndConfig' >>
       serialization.WriteMetricsPlotsAndConfig(
           output_path=output_path, eval_config=eval_config))

  return beam.pvalue.PDone(examples.pipeline)
def testEvaluateWithSlicingAndDifferentBatchSizes(self):
  """Checks sliced metrics are the same for several desired batch sizes."""
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)

  # (age, language, label, slice_key) tuples, one per example.
  rows = [
      (3.0, 'english', 1.0, 'first_slice'),
      (3.0, 'chinese', 0.0, 'first_slice'),
      (4.0, 'english', 0.0, 'second_slice'),
      (5.0, 'chinese', 1.0, 'second_slice'),
      (5.0, 'chinese', 1.0, 'second_slice'),
  ]

  overall_slice = ()
  first_slice = (('slice_key', 'first_slice'),)
  second_slice = (('slice_key', 'second_slice'),)
  expected_by_slice = [
      (overall_slice, {
          'accuracy': 0.4,
          'label/mean': 0.6,
          'my_mean_age': 4.0,
          'my_mean_age_times_label': 2.6,
          'added_example_count': 5.0
      }),
      (first_slice, {
          'accuracy': 1.0,
          'label/mean': 0.5,
          'my_mean_age': 3.0,
          'my_mean_age_times_label': 1.5,
          'added_example_count': 2.0
      }),
      (second_slice, {
          'accuracy': 0.0,
          'label/mean': 2.0 / 3.0,
          'my_mean_age': 14.0 / 3.0,
          'my_mean_age_times_label': 10.0 / 3.0,
          'added_example_count': 3.0
      }),
  ]

  for batch_size in [1, 2, 4, 8]:
    with beam.Pipeline() as pipeline:
      examples = [
          self._makeExample(
              age=age, language=language, label=label, slice_key=slice_key)
          for age, language, label, slice_key in rows
      ]
      serialized = [example.SerializeToString() for example in examples]

      metrics, plots = (
          pipeline
          | beam.Create(serialized)
          | evaluate.Evaluate(
              eval_saved_model_path=eval_export_dir,
              add_metrics_callbacks=[_addExampleCountMetricCallback],
              slice_spec=[
                  slicer.SingleSliceSpec(),
                  slicer.SingleSliceSpec(columns=['slice_key'])
              ],
              desired_batch_size=batch_size))

      def check_result(got):
        try:
          self.assertEqual(3, len(got), 'got: %s' % got)
          metrics_by_slice = {slice_key: value for slice_key, value in got}
          self.assertItemsEqual(
              metrics_by_slice.keys(),
              [overall_slice, first_slice, second_slice])
          for slice_key, expected in expected_by_slice:
            self.assertDictElementsAlmostEqual(
                metrics_by_slice[slice_key], expected)
        except AssertionError as err:
          # This function is redefined every iteration, so it will have the
          # right value of batch_size.
          raise util.BeamAssertException(
              'batch_size = %d, error: %s' %
              (batch_size, err))  # pylint: disable=cell-var-from-loop

      util.assert_that(metrics, check_result, label='metrics')
      util.assert_that(plots, util.is_empty(), label='plots')
def ExtractEvaluateAndWriteResults(  # pylint: disable=invalid-name
    examples,
    eval_shared_model,
    output_path,
    display_only_data_location=None,
    slice_spec=None,
    desired_batch_size=None,
    extractors=None,
    fanout=16,
):
  """Extracts, evaluates and writes metrics, plots and config to output_path.

  This is the public API version of evaluate.Evaluate that handles example
  weights. It is intended for users who build their own Beam pipelines rather
  than relying on the lightweight run_model_analysis functions.

  Example usage:
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=model_location,
        add_metrics_callbacks=[...],
        example_weight_key=example_weight_key)
    with beam.Pipeline(runner=...) as p:
      _ = (p
           | 'ReadData' >> beam.io.ReadFromTFRecord(data_location)
           | 'ExtractEvaluateAndWriteResults' >>
           tfma.ExtractEvaluateAndWriteResults(
               eval_shared_model=eval_shared_model,
               output_path=output_path,
               display_only_data_location=data_location,
               slice_spec=slice_spec,
               ...))
    result = tfma.load_eval_result(output_path=output_path)
    tfma.view.render_slicing_metrics(result)

  The exact serialization format is an internal implementation detail and
  subject to change. Only the TFMA functions should be used to write and read
  the results.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Shared model parameters for EvalSavedModel including
      any additional metrics (see EvalSharedModel for more information on how
      to configure additional metrics).
    output_path: Path to output metrics and plots results.
    display_only_data_location: Optional path indicating where the examples
      were read from. Used only for display purposes - no data is read from
      this path.
    slice_spec: Optional list of SingleSliceSpec specifying the slices to
      slice the data into. If None, defaults to the overall slice.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.
    extractors: Optional list of Extractors to apply to ExampleAndExtracts.
      If provided, the extracts MUST contain a FeaturesPredictionsLabels
      extract with key 'fpl' and a list of SliceKeyType extracts with key
      'slice_keys'. Typically these will be added by calling the
      default_extractors function. If no extractors are provided,
      default_extractors (non-materialized) will be used.
    fanout: Forwarded unchanged to evaluate.Evaluate; defaults to 16.

  Raises:
    ValueError: If PredictExtractor or SliceKeyExtractor is not present in
      extractors. NOTE(review): no such check is visible in this function's
      body - presumably raised by a downstream transform; confirm.

  Returns:
    PDone.
  """
  if not extractors:
    extractors = default_extractors(
        eval_shared_model=eval_shared_model,
        slice_spec=slice_spec,
        desired_batch_size=desired_batch_size,
        materialize=False)

  extracts = (
      examples
      | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
      | 'Extract' >> evaluate.Extract(extractors=extractors))
  metrics, plots = (
      extracts
      | 'Evaluate' >> evaluate.Evaluate(
          eval_shared_model=eval_shared_model,
          desired_batch_size=desired_batch_size,
          fanout=fanout))

  # The location is recorded in the config for display only.
  data_location = (
      display_only_data_location
      if display_only_data_location is not None else
      '<user provided PCollection>')
  # Weighted evaluations report the example weight metric; unweighted ones
  # fall back to the example count.
  example_weight_metric_key = (
      metric_keys.EXAMPLE_WEIGHT
      if eval_shared_model.example_weight_key else metric_keys.EXAMPLE_COUNT)

  eval_config = api_types.EvalConfig(
      model_location=eval_shared_model.model_path,
      data_location=data_location,
      slice_spec=slice_spec,
      example_weight_metric_key=example_weight_metric_key)

  serialize = serialization.SerializeMetricsAndPlots(
      post_export_metrics=eval_shared_model.add_metrics_callbacks)
  write = serialization.WriteMetricsPlotsAndConfig(
      output_path=output_path, eval_config=eval_config)
  _ = ((metrics, plots)
       | 'SerializeMetricsAndPlots' >> serialize
       | 'WriteMetricsPlotsAndConfig' >> write)

  return beam.pvalue.PDone(examples.pipeline)
def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
  """Checks custom, py_func and post-export metrics on the overall slice.

  Four examples are evaluated with explicitly built extractors; exactly one
  slice with the empty-tuple key, the expected metric values, and no plots
  is asserted.
  """
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = types.EvalSharedModel(
      model_path=eval_export_dir,
      add_metrics_callbacks=[
          _addExampleCountMetricCallback,
          # Note that since everything runs in-process this doesn't
          # actually test that the py_func can be correctly recreated
          # on workers in a distributed context.
          _addPyFuncMetricCallback,
          post_export_metrics.example_count(),
          post_export_metrics.example_weight(example_weight_key='age')
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    # The step label is spelled 'ToExampleAndExtracts' to match the transform
    # name and the label used by the other pipelines in this code base.
    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors)
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))

    def check_result(got):
      # Re-raise as BeamAssertException so the failure is reported through
      # util.assert_that.
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
                'added_example_count': 4.0,
                'py_func_label_sum': 2.0,
                metric_keys.EXAMPLE_COUNT: 4.0,
                metric_keys.EXAMPLE_WEIGHT: 15.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result, label='metrics')
    util.assert_that(plots, util.is_empty(), label='plots')