def load_eval_config(output_path: Text) -> config.EvalConfig:
  """Loads eval config."""
  path = os.path.join(output_path, _EVAL_CONFIG_FILE)
  if tf.io.gfile.exists(path):
    with tf.io.gfile.GFile(path, 'r') as f:
      pb = json_format.Parse(f.read(), config_pb2.EvalConfigAndVersion())
      _check_version(pb.version, output_path)
      return pb.eval_config
  else:
    # Legacy support (to be removed in future).
    # The previous version did not include a file extension.
    path = os.path.splitext(path)[0]
    serialized_record = six.next(
        tf.compat.v1.python_io.tf_record_iterator(path))
    final_dict = pickle.loads(serialized_record)
    _check_version(final_dict, output_path)
    old_config = final_dict['eval_config']
    slicing_specs = None
    if old_config.slice_spec:
      slicing_specs = [s.to_proto() for s in old_config.slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = (
        old_config.compute_confidence_intervals)
    options.k_anonymization_count.value = old_config.k_anonymization_count
    return config.EvalConfig(
        input_data_specs=[
            config.InputDataSpec(location=old_config.data_location)
        ],
        model_specs=[config.ModelSpec(location=old_config.model_location)],
        output_data_specs=[
            config.OutputDataSpec(default_location=output_path)
        ],
        slicing_specs=slicing_specs,
        options=options)
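# Example usage of load_eval_config (a minimal sketch, not part of the
# library API; the output directory below is hypothetical and assumes a
# prior analysis run already wrote an eval_config.json or legacy
# eval_config record there):
#
#   eval_config = load_eval_config('/tmp/tfma_output')
#   print(eval_config.model_specs[0].location)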
def default_writers(
    eval_shared_model: Optional[types.EvalSharedModel] = None,
    eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
    output_path: Optional[Text] = None,
    eval_config: config.EvalConfig = None,
) -> List[writer.Writer]:  # pylint: disable=invalid-name
  """Returns the default writers for use in WriteResults.

  Args:
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    output_path: Deprecated (use EvalConfig).
    eval_config: Eval config.
  """
  # TODO(b/141016373): Add support for multiple models.
  if eval_config is not None:
    output_spec = eval_config.output_data_specs[0]
  elif output_path is not None:
    output_spec = config.OutputDataSpec(default_location=output_path)
  if eval_shared_model is not None:
    eval_shared_models = [eval_shared_model]
  output_paths = {
      constants.METRICS_KEY:
          output_filename(output_spec, constants.METRICS_KEY),
      constants.PLOTS_KEY:
          output_filename(output_spec, constants.PLOTS_KEY)
  }
  return [
      metrics_and_plots_writer.MetricsAndPlotsWriter(
          eval_shared_model=eval_shared_models[0], output_paths=output_paths)
  ]
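# A short illustration (sketch only; the output location and shared model are
# hypothetical) of how default_writers is typically used when no custom
# writers are supplied to WriteResults:
#
#   output_spec = config.OutputDataSpec(default_location='/tmp/tfma_output')
#   eval_config = config.EvalConfig(output_data_specs=[output_spec])
#   writers = default_writers(
#       eval_shared_models=[shared_model], eval_config=eval_config)
#   # The single returned writer receives output paths derived from
#   # output_filename() for the metrics and plots keys.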
def testRunModelAnalysisForCSVText(self):
  model_location = self._exportEvalSavedModel(
      csv_linear_classifier.simple_csv_linear_classifier)
  examples = [
      '3.0,english,1.0', '3.0,chinese,0.0', '4.0,english,1.0',
      '5.0,chinese,1.0'
  ]
  data_location = self._writeCSVToTextFile(examples)
  eval_config = config.EvalConfig(
      input_data_specs=[
          config.InputDataSpec(location=data_location, file_format='text')
      ],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location)
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (): {
          'accuracy': {
              'doubleValue': 0.75
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 4.0
          }
      }
  }
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
def testSerializeDeserializeEvalConfig(self):
  output_path = self._getTempDir()
  options = config.Options()
  options.compute_confidence_intervals.value = False
  options.k_anonymization_count.value = 1
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location='/path/to/data')],
      model_specs=[config.ModelSpec(location='/path/to/model')],
      output_data_specs=[
          config.OutputDataSpec(default_location=output_path)
      ],
      slicing_specs=[
          config.SlicingSpec(
              feature_keys=['country'],
              feature_values={
                  'age': '5',
                  'gender': 'f'
              }),
          config.SlicingSpec(
              feature_keys=['interest'],
              feature_values={
                  'age': '6',
                  'gender': 'm'
              })
      ],
      options=options)
  with tf.io.gfile.GFile(os.path.join(output_path, 'eval_config.json'),
                         'w') as f:
    f.write(model_eval_lib._serialize_eval_config(eval_config))
  got_eval_config = model_eval_lib.load_eval_config(output_path)
  self.assertEqual(eval_config, got_eval_config)
def testRunModelAnalysisWithMultiplePlots(self):
  model_location = self._exportEvalSavedModel(
      fixed_prediction_estimator.simple_fixed_prediction_estimator)
  examples = [
      self._makeExample(prediction=0.0, label=1.0),
      self._makeExample(prediction=0.7, label=0.0),
      self._makeExample(prediction=0.8, label=1.0),
      self._makeExample(prediction=1.0, label=1.0),
      self._makeExample(prediction=1.0, label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ])
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location,
      add_metrics_callbacks=[
          post_export_metrics.auc_plots(),
          post_export_metrics.auc_plots(metric_tag='test')
      ])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config, eval_shared_models=[eval_shared_model])

  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected_metrics = {
      (): {
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 5.0
          },
      }
  }
  expected_matrix = {
      'threshold': 0.8,
      'falseNegatives': 2.0,
      'trueNegatives': 1.0,
      'truePositives': 2.0,
      'precision': 1.0,
      'recall': 0.5
  }
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics,
                                expected_metrics)
  self.assertEqual(len(eval_result.plots), 1)
  slice_key, plots = eval_result.plots[0]
  self.assertEqual((), slice_key)
  tf.compat.v1.logging.info(plots.keys())
  self.assertDictElementsAlmostEqual(
      plots['']['']['post_export_metrics']['confusionMatrixAtThresholds']
      ['matrices'][8001], expected_matrix)
  self.assertDictElementsAlmostEqual(
      plots['']['']['post_export_metrics/test']
      ['confusionMatrixAtThresholds']['matrices'][8001], expected_matrix)
def _runTestWithCustomCheck(self,
                            examples,
                            eval_export_dir,
                            metrics_callbacks,
                            slice_spec=None,
                            custom_metrics_check=None,
                            custom_plots_check=None,
                            custom_result_check=None):
  # make sure we are doing some checks
  self.assertTrue(custom_metrics_check is not None or
                  custom_plots_check is not None or
                  custom_result_check is not None)
  serialized_examples = [ex.SerializeToString() for ex in examples]
  slicing_specs = None
  if slice_spec:
    slicing_specs = [s.to_proto() for s in slice_spec]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec()],
      model_specs=[config.ModelSpec(location=eval_export_dir)],
      output_data_specs=[config.OutputDataSpec()],
      slicing_specs=slicing_specs)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_config=eval_config, eval_shared_models=[eval_shared_model])
  with beam.Pipeline() as pipeline:
    (metrics, plots), _ = (
        pipeline
        | 'Create' >> beam.Create(serialized_examples)
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >>
        metrics_and_plots_evaluator.ComputeMetricsAndPlots(
            eval_shared_model=eval_shared_model,
            compute_confidence_intervals=self.compute_confidence_intervals,
            random_seed_for_testing=self.deterministic_test_seed))
    if custom_metrics_check is not None:
      util.assert_that(metrics, custom_metrics_check, label='metrics')
    if custom_plots_check is not None:
      util.assert_that(plots, custom_plots_check, label='plot')

  result = pipeline.run()
  if custom_result_check is not None:
    custom_result_check(result)
def testSerializeDeserializeLegacyEvalConfig(self):
  output_path = self._getTempDir()
  old_config = LegacyConfig(
      model_location='/path/to/model',
      data_location='/path/to/data',
      slice_spec=[
          slicer.SingleSliceSpec(
              columns=['country'], features=[('age', 5), ('gender', 'f')]),
          slicer.SingleSliceSpec(
              columns=['interest'], features=[('age', 6), ('gender', 'm')])
      ],
      example_count_metric_key=None,
      example_weight_metric_key='key',
      compute_confidence_intervals=False,
      k_anonymization_count=1)
  final_dict = {}
  final_dict['tfma_version'] = tfma_version.VERSION_STRING
  final_dict['eval_config'] = old_config
  with tf.io.TFRecordWriter(os.path.join(output_path, 'eval_config')) as w:
    w.write(pickle.dumps(final_dict))
  got_eval_config = model_eval_lib.load_eval_config(output_path)
  options = config.Options()
  options.compute_confidence_intervals.value = (
      old_config.compute_confidence_intervals)
  options.k_anonymization_count.value = old_config.k_anonymization_count
  eval_config = config.EvalConfig(
      input_data_specs=[
          config.InputDataSpec(location=old_config.data_location)
      ],
      model_specs=[config.ModelSpec(location=old_config.model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=output_path)
      ],
      slicing_specs=[
          config.SlicingSpec(
              feature_keys=['country'],
              feature_values={
                  'age': '5',
                  'gender': 'f'
              }),
          config.SlicingSpec(
              feature_keys=['interest'],
              feature_values={
                  'age': '6',
                  'gender': 'm'
              })
      ],
      options=options)
  self.assertEqual(eval_config, got_eval_config)
def single_model_analysis(
    model_location: Text,
    data_location: Text,
    output_path: Text = None,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None) -> EvalResult:
  """Run model analysis for a single model on a single data set.

  This is a convenience wrapper around run_model_analysis for a single model
  with a single data set. For more complex use cases, use
  tfma.run_model_analysis.

  Args:
    model_location: Path to the exported eval saved model.
    data_location: The location of the data files.
    output_path: The directory to output metrics and results to. If None, we
      use a temporary directory.
    slice_spec: A list of tfma.slicer.SingleSliceSpec.

  Returns:
    An EvalResult that can be used with the TFMA visualization functions.
  """
  # Get working_dir ready.
  if output_path is None:
    output_path = tempfile.mkdtemp()
  if not tf.io.gfile.exists(output_path):
    tf.io.gfile.makedirs(output_path)

  # Guard against slice_spec=None (mirrors the handling elsewhere in this
  # module), since the parameter is optional.
  slicing_specs = None
  if slice_spec:
    slicing_specs = [s.to_proto() for s in slice_spec]

  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=output_path)
      ],
      slicing_specs=slicing_specs)

  return run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          default_eval_shared_model(eval_saved_model_path=model_location)
      ])
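# Example usage of single_model_analysis (a minimal sketch; the model and
# data paths are hypothetical):
#
#   result = single_model_analysis(
#       model_location='/path/to/eval_saved_model',
#       data_location='/path/to/examples.tfrecord',
#       slice_spec=[slicer.SingleSliceSpec(columns=['language'])])
#   # `result` is an EvalResult usable with the TFMA visualization functions,
#   # e.g. tfma.view.render_slicing_metrics(result).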
def testNoConstructFn(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [self._makeExample(age=3.0, language='english', label=1.0)]
  data_location = self._writeTFExamplesToTFRecords(examples)
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ])
  # No construct_fn should fail when Beam attempts to call the construct_fn.
  eval_shared_model = types.EvalSharedModel(model_path=model_location)
  with self.assertRaisesRegexp(AttributeError,
                               '\'NoneType\' object has no attribute'):
    model_eval_lib.run_model_analysis(
        eval_config=eval_config, eval_shared_models=[eval_shared_model])
  # Using the default_eval_shared_model should pass as it has a construct_fn.
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location)
  model_eval_lib.run_model_analysis(
      eval_config=eval_config, eval_shared_models=[eval_shared_model])
def assertGeneralMetricsComputedWithBeamAre(
    self, eval_saved_model_path: Text,
    examples_pcollection: beam.pvalue.PCollection,
    slice_spec: List[slicer.SingleSliceSpec],
    add_metrics_callbacks: List[types.AddMetricsCallbackType],
    expected_slice_metrics: Dict[Any, Dict[Text, Any]]):
  """Checks metrics computed using Beam.

  A more general version of assertMetricsComputedWithBeamAre. Note that the
  caller is responsible for setting up and running the Beam pipeline.

  Example usage:

    def add_metrics(features, predictions, labels):
      metric_ops = {
          'mse': tf.metrics.mean_squared_error(labels,
                                               predictions['logits']),
          'mae': tf.metrics.mean_absolute_error(labels,
                                                predictions['logits']),
      }
      return metric_ops

    with beam.Pipeline() as pipeline:
      expected_slice_metrics = {
          (): {
              'mae': 0.1,
              'mse': 0.2,
              tfma.post_export_metrics.metric_keys.AUC:
                  tfma.test.BoundedValue(lower_bound=0.5)
          },
          (('age', 10),): {
              'mae': 0.2,
              'mse': 0.3,
              tfma.post_export_metrics.metric_keys.AUC:
                  tfma.test.BoundedValue(lower_bound=0.5)
          },
      }
      examples = pipeline | 'ReadExamples' >> beam.io.ReadFromTFRecord(path)
      self.assertGeneralMetricsComputedWithBeamAre(
          eval_saved_model_path=path,
          examples_pcollection=examples,
          slice_spec=[tfma.slicer.SingleSliceSpec(),
                      tfma.slicer.SingleSliceSpec(columns=['age'])],
          add_metrics_callbacks=[
              add_metrics, tfma.post_export_metrics.auc()],
          expected_slice_metrics=expected_slice_metrics)

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    examples_pcollection: A PCollection of serialized example bytes.
    slice_spec: List of slice specifications.
    add_metrics_callbacks: Callbacks for adding additional metrics.
    expected_slice_metrics: Dictionary of dictionaries describing the expected
      metrics for each slice. The outer dictionary maps slice keys to the
      expected metrics for that slice.
  """

  def check_metrics(got):
    """Check metrics callback."""
    try:
      slices = {}
      for slice_key, value in got:
        slices[slice_key] = value
      self.assertItemsEqual(
          list(slices.keys()), list(expected_slice_metrics.keys()))
      for slice_key, expected_metrics in expected_slice_metrics.items():
        self.assertDictElementsWithinBounds(
            got_values_dict=slices[slice_key],
            expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  slicing_specs = None
  if slice_spec:
    slicing_specs = [s.to_proto() for s in slice_spec]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec()],
      model_specs=[config.ModelSpec(location=eval_saved_model_path)],
      output_data_specs=[config.OutputDataSpec()],
      slicing_specs=slicing_specs)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_config=eval_config, eval_shared_model=eval_shared_model)

  # pylint: disable=no-value-for-parameter
  (metrics, _), _ = (
      examples_pcollection
      | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
      | 'Extract' >> Extract(extractors=extractors)
      | 'ComputeMetricsAndPlots' >>
      metrics_and_plots_evaluator.ComputeMetricsAndPlots(
          eval_shared_model=eval_shared_model))
  # pylint: enable=no-value-for-parameter

  beam_util.assert_that(metrics, check_metrics)
def testRunModelAnalysisWithUncertainty(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=1.0),
      self._makeExample(age=5.0, language='hindi', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
  options = config.Options()
  options.compute_confidence_intervals.value = True
  options.k_anonymization_count.value = 2
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs,
      options=options)
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location,
              example_weight_key='age')
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (('language', 'hindi'),): {
          u'__ERROR__': {
              'debugMessage':
                  u'Example count for this slice key is lower than the '
                  u'minimum required value: 2. No data is aggregated for '
                  u'this slice.'
          },
      },
      (('language', 'chinese'),): {
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 8.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      },
      (('language', 'english'),): {
          'accuracy': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          'my_mean_label': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 7.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      }
  }
  self.assertEqual(eval_result.config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertEqual(eval_result.config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['language']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def ExtractEvaluateAndWriteResults(  # pylint: disable=invalid-name
    examples: beam.pvalue.PCollection,
    eval_shared_model: Optional[types.EvalSharedModel] = None,
    eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
    eval_config: config.EvalConfig = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None,
    writers: Optional[List[writer.Writer]] = None,
    output_path: Optional[Text] = None,
    display_only_data_location: Optional[Text] = None,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    desired_batch_size: Optional[int] = None,
    write_config: Optional[bool] = True,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1) -> beam.pvalue.PDone:
  """PTransform for performing extraction, evaluation, and writing results.

  Users who want to construct their own Beam pipelines instead of using the
  lightweight run_model_analysis functions should use this PTransform.

  Example usage:

    eval_config = tfma.EvalConfig(
        input_data_specs=[tfma.InputDataSpec(location=data_location)],
        model_specs=[tfma.ModelSpec(location=model_location)],
        output_data_specs=[
            tfma.OutputDataSpec(default_location=output_path)
        ],
        slicing_specs=[...],
        metrics_specs=[...])
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=model_location,
        add_metrics_callbacks=[...])
    with beam.Pipeline(runner=...) as p:
      _ = (p
           | 'ReadData' >> beam.io.ReadFromTFRecord(data_location)
           | 'ExtractEvaluateAndWriteResults' >>
           tfma.ExtractEvaluateAndWriteResults(
               eval_config=eval_config,
               eval_shared_models=[eval_shared_model],
               ...))
    result = tfma.load_eval_result(output_path=output_path)
    tfma.view.render_slicing_metrics(result)

  Note that the exact serialization format is an internal implementation
  detail and subject to change. Users should only use the TFMA functions to
  write and read the results.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no
      writers are provided, default_writers will be used.
    output_path: Deprecated (use EvalConfig).
    display_only_data_location: Deprecated (use EvalConfig).
    slice_spec: Deprecated (use EvalConfig).
    desired_batch_size: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).

  Raises:
    ValueError: If matching Extractor not found for an Evaluator.

  Returns:
    PDone.
""" if eval_shared_model is not None: eval_shared_models = [eval_shared_model] if eval_config is None: data_location = '<user provided PCollection>' if display_only_data_location is not None: data_location = display_only_data_location disabled_outputs = None if not write_config: disabled_outputs = [_EVAL_CONFIG_FILE] model_specs = [] for m in eval_shared_models: example_weight_key = m.example_weight_key example_weight_keys = {} if example_weight_key and isinstance(example_weight_key, dict): example_weight_keys = example_weight_key example_weight_key = '' model_specs.append( config.ModelSpec(location=m.model_path, example_weight_key=example_weight_key, example_weight_keys=example_weight_keys)) slicing_specs = None if slice_spec: slicing_specs = [s.to_proto() for s in slice_spec] options = config.Options() options.compute_confidence_intervals.value = compute_confidence_intervals options.k_anonymization_count.value = k_anonymization_count if desired_batch_size: options.desired_batch_size.value = desired_batch_size eval_config = config.EvalConfig( input_data_specs=[config.InputDataSpec(location=data_location)], model_specs=model_specs, output_data_specs=[ config.OutputDataSpec(default_location=output_path, disabled_outputs=disabled_outputs) ], slicing_specs=slicing_specs, options=options) if not extractors: extractors = default_extractors(eval_config=eval_config, eval_shared_models=eval_shared_models, materialize=False) if not evaluators: evaluators = default_evaluators(eval_config=eval_config, eval_shared_models=eval_shared_models) for v in evaluators: evaluator.verify_evaluator(v, extractors) if not writers: writers = default_writers(eval_config=eval_config, eval_shared_models=eval_shared_models) # pylint: disable=no-value-for-parameter _ = (examples | 'InputsToExtracts' >> InputsToExtracts() | 'ExtractAndEvaluate' >> ExtractAndEvaluate(extractors=extractors, evaluators=evaluators) | 'WriteResults' >> WriteResults(writers=writers)) # TODO(b/141016373): Add support for multiple models. if _EVAL_CONFIG_FILE not in eval_config.output_data_specs[ 0].disabled_outputs: _ = examples.pipeline | WriteEvalConfig(eval_config) # pylint: enable=no-value-for-parameter return beam.pvalue.PDone(examples.pipeline)
def testRunModelAnalysisWithQueryBasedMetrics(self):
  input_layer = tf.keras.layers.Input(shape=(1,), name='age')
  output_layer = tf.keras.layers.Dense(
      1, activation=tf.nn.sigmoid)(input_layer)
  model = tf.keras.models.Model(input_layer, output_layer)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.binary_crossentropy)

  features = {'age': [[20.0]]}
  labels = [[1]]
  example_weights = [1.0]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
  model.fit(dataset, steps_per_epoch=1)

  model_location = os.path.join(self._getTempDir(), 'export_dir')
  model.save(model_location, save_format='tf')

  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0),
      self._makeExample(age=3.0, language='english', label=0.0),
      self._makeExample(age=5.0, language='chinese', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[
          config.ModelSpec(location=model_location, label_key='label')
      ],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs,
      metrics_specs=metric_specs.specs_from_metrics(
          [ndcg.NDCG(gain_key='age', name='ndcg')],
          binarize=config.BinarizationOptions(top_k_list=[1]),
          query_key='language'))
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, tags=[tf.saved_model.SERVING])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[eval_shared_model],
      evaluators=[
          metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
              eval_config=eval_config,
              eval_shared_models=[eval_shared_model])
      ])

  self.assertEqual(eval_result.config.model_specs[0].location,
                   model_location)
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertLen(eval_result.slicing_metrics, 1)
  got_slice_key, got_metrics = eval_result.slicing_metrics[0]
  self.assertEqual(got_slice_key, ())
  self.assertIn('', got_metrics)  # output_name
  got_metrics = got_metrics['']
  expected_metrics = {
      '': {
          'example_count': True,
          'weighted_example_count': True,
      },
      'topK:1': {
          'ndcg': True,
      },
  }
  for group in expected_metrics:
    self.assertIn(group, got_metrics)
    for k in expected_metrics[group]:
      self.assertIn(k, got_metrics[group])
def testRunModelAnalysisWithLegacyQueryExtractor(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=0.0),
      self._makeExample(age=5.0, language='chinese', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[eval_shared_model],
      evaluators=[
          metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
              eval_shared_model),
          query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
              query_id='language',
              prediction_key='logistic',
              combine_fns=[
                  query_statistics.QueryStatisticsCombineFn(),
                  legacy_ndcg.NdcgMetricCombineFn(
                      at_vals=[1], gain_key='label', weight_key='')
              ]),
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (): {
          'post_export_metrics/total_queries': {
              'doubleValue': 2.0
          },
          'post_export_metrics/min_documents': {
              'doubleValue': 2.0
          },
          'post_export_metrics/max_documents': {
              'doubleValue': 2.0
          },
          'post_export_metrics/total_documents': {
              'doubleValue': 4.0
          },
          'post_export_metrics/ndcg@1': {
              'doubleValue': 0.5
          },
          'post_export_metrics/example_weight': {
              'doubleValue': 15.0
          },
          'post_export_metrics/example_count': {
              'doubleValue': 4.0
          },
      }
  }
  self.assertEqual(eval_result.config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertEqual(eval_result.config.slicing_specs[0], config.SlicingSpec())
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def testRunModelAnalysisWithKerasModel(self):
  input_layer = tf.keras.layers.Input(shape=(28 * 28,), name='data')
  output_layer = tf.keras.layers.Dense(
      10, activation=tf.nn.softmax)(input_layer)
  model = tf.keras.models.Model(input_layer, output_layer)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.categorical_crossentropy)

  features = {'data': [[0.0] * 28 * 28]}
  labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
  example_weights = [1.0]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
  model.fit(dataset, steps_per_epoch=1)

  model_location = os.path.join(self._getTempDir(), 'export_dir')
  model.save(model_location, save_format='tf')

  examples = [
      self._makeExample(data=[0.0] * 28 * 28, label=1.0),
      self._makeExample(data=[1.0] * 28 * 28, label=5.0),
      self._makeExample(data=[1.0] * 28 * 28, label=9.0),
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  metrics_spec = config.MetricsSpec()
  for metric in (tf.keras.metrics.AUC(),):
    cfg = tf.keras.utils.serialize_keras_object(metric)
    metrics_spec.metrics.append(
        config.MetricConfig(
            class_name=cfg['class_name'], config=json.dumps(cfg['config'])))
  for class_id in (0, 5, 9):
    metrics_spec.binarize.class_ids.append(class_id)
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[
          config.ModelSpec(location=model_location, label_key='label')
      ],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      metrics_specs=[metrics_spec])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location,
              tags=[tf.saved_model.SERVING])
      ])
  self.assertEqual(eval_result.config.model_specs[0].location,
                   model_location)
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertLen(eval_result.slicing_metrics, 1)
  got_slice_key, got_metrics = eval_result.slicing_metrics[0]
  self.assertEqual(got_slice_key, ())
  self.assertIn('', got_metrics)  # output_name
  got_metrics = got_metrics['']
  expected_metrics = {
      'classId:0': {
          'auc': True,
      },
      'classId:5': {
          'auc': True,
      },
      'classId:9': {
          'auc': True,
      },
  }
  for class_id in expected_metrics:
    self.assertIn(class_id, got_metrics)
    for k in expected_metrics[class_id]:
      self.assertIn(k, got_metrics[class_id])
def testRunModelAnalysisExtraFieldsPlusFeatureExtraction(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0,
                        my_slice='a'),
      self._makeExample(age=3.0, language='chinese', label=0.0,
                        my_slice='a'),
      self._makeExample(age=4.0, language='english', label=1.0,
                        my_slice='b'),
      self._makeExample(age=5.0, language='chinese', label=1.0,
                        my_slice='c'),
      self._makeExample(age=5.0, language='hindi', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['my_slice'])]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')
  slice_spec = [slicer.SingleSliceSpec(spec=slicing_specs[0])]
  extractors_with_feature_extraction = [
      predict_extractor.PredictExtractor(
          eval_shared_model, desired_batch_size=3, materialize=False),
      feature_extractor.FeatureExtractor(
          extract_source=constants.INPUT_KEY,
          extract_dest=constants.FEATURES_PREDICTIONS_LABELS_KEY),
      slice_key_extractor.SliceKeyExtractor(slice_spec, materialize=False)
  ]
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location,
              example_weight_key='age')
      ],
      extractors=extractors_with_feature_extraction)
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (('my_slice', 'a'),): {
          'accuracy': {
              'doubleValue': 1.0
          },
          'my_mean_label': {
              'doubleValue': 0.5
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 6.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      },
      (('my_slice', 'b'),): {
          'accuracy': {
              'doubleValue': 1.0
          },
          'my_mean_label': {
              'doubleValue': 1.0
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 4.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 1.0
          },
      },
      (('my_slice', 'c'),): {
          'accuracy': {
              'doubleValue': 0.0
          },
          'my_mean_label': {
              'doubleValue': 1.0
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 5.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 1.0
          },
      },
  }
  self.assertEqual(eval_result.config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertEqual(eval_result.config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['my_slice']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def run_model_analysis(
    eval_shared_model: Optional[types.EvalSharedModel] = None,
    eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
    eval_config: config.EvalConfig = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None,
    writers: Optional[List[writer.Writer]] = None,
    pipeline_options: Optional[Any] = None,
    data_location: Optional[Text] = None,
    file_format: Optional[Text] = 'tfrecords',
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    output_path: Optional[Text] = None,
    write_config: Optional[bool] = True,
    desired_batch_size: Optional[int] = None,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1) -> EvalResult:
  """Runs TensorFlow model analysis.

  It runs a Beam pipeline to compute the slicing metrics exported in the
  TensorFlow Eval SavedModel and returns the results.

  This is a simplified API for users who want to quickly get something
  running locally. Users who wish to create their own Beam pipelines can use
  the Evaluate PTransform instead.

  Args:
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no
      writers are provided, default_writers will be used.
    pipeline_options: Optional arguments to run the Pipeline, for instance
      whether to run directly.
    data_location: Deprecated (use EvalConfig).
    file_format: Deprecated (use EvalConfig).
    slice_spec: Deprecated (use EvalConfig).
    output_path: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    desired_batch_size: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).

  Returns:
    An EvalResult that can be used with the TFMA visualization functions.

  Raises:
    ValueError: If the file_format is unknown to us.
""" _assert_tensorflow_version() if eval_shared_model is not None: eval_shared_models = [eval_shared_model] if eval_config is None: if output_path is None: output_path = tempfile.mkdtemp() if not tf.io.gfile.exists(output_path): tf.io.gfile.makedirs(output_path) disabled_outputs = None if not write_config: disabled_outputs = [_EVAL_CONFIG_FILE] model_specs = [] for m in eval_shared_models: example_weight_key = m.example_weight_key example_weight_keys = {} if example_weight_key and isinstance(example_weight_key, dict): example_weight_keys = example_weight_key example_weight_key = '' model_specs.append( config.ModelSpec(location=m.model_path, example_weight_key=example_weight_key, example_weight_keys=example_weight_keys)) slicing_specs = None if slice_spec: slicing_specs = [s.to_proto() for s in slice_spec] options = config.Options() options.compute_confidence_intervals.value = compute_confidence_intervals options.k_anonymization_count.value = k_anonymization_count if desired_batch_size: options.desired_batch_size.value = desired_batch_size eval_config = config.EvalConfig( input_data_specs=[ config.InputDataSpec(location=data_location, file_format=file_format) ], model_specs=model_specs, output_data_specs=[ config.OutputDataSpec(default_location=output_path, disabled_outputs=disabled_outputs) ], slicing_specs=slicing_specs, options=options) if len(eval_config.input_data_specs) != 1: raise NotImplementedError( 'multiple input_data_specs are not yet supported.') if len(eval_config.model_specs) != 1: raise NotImplementedError( 'multiple model_specs are not yet supported.') if len(eval_config.output_data_specs) != 1: raise NotImplementedError( 'multiple output_data_specs are not yet supported.') with beam.Pipeline(options=pipeline_options) as p: if (not eval_config.input_data_specs[0].file_format or eval_config.input_data_specs[0].file_format == 'tfrecords'): data = p | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord( file_pattern=eval_config.input_data_specs[0].location, compression_type=beam.io.filesystem.CompressionTypes.AUTO) elif eval_config.input_data_specs[0].file_format == 'text': data = p | 'ReadFromText' >> beam.io.textio.ReadFromText( eval_config.input_data_specs[0].location) else: raise ValueError('unknown file_format: {}'.format( eval_config.input_data_specs[0].file_format)) # pylint: disable=no-value-for-parameter _ = ( data | 'ExtractEvaluateAndWriteResults' >> ExtractEvaluateAndWriteResults( eval_config=eval_config, eval_shared_models=eval_shared_models, extractors=extractors, evaluators=evaluators, writers=writers)) # pylint: enable=no-value-for-parameter # TODO(b/141016373): Add support for multiple models. return load_eval_result(eval_config.output_data_specs[0].default_location)
def testWriteMetricsAndPlots(self):
  metrics_file = os.path.join(self._getTempDir(), 'metrics')
  plots_file = os.path.join(self._getTempDir(), 'plots')
  temp_eval_export_dir = os.path.join(self._getTempDir(), 'eval_export_dir')

  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec()],
      model_specs=[config.ModelSpec()],
      output_data_specs=[
          config.OutputDataSpec(disabled_outputs=['eval_config.json'])
      ])
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[
          post_export_metrics.example_count(),
          post_export_metrics.calibration_plot_and_prediction_histogram(
              num_buckets=2)
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]
  evaluators = [
      metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(eval_shared_model)
  ]
  output_paths = {
      constants.METRICS_KEY: metrics_file,
      constants.PLOTS_KEY: plots_file
  }
  writers = [
      metrics_and_plots_writer.MetricsAndPlotsWriter(
          eval_shared_model, output_paths)
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(prediction=0.0, label=1.0)
    example2 = self._makeExample(prediction=1.0, label=1.0)

    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
        ])
        | 'ExtractEvaluateAndWriteResults' >>
        model_eval_lib.ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_models=[eval_shared_model],
            extractors=extractors,
            evaluators=evaluators,
            writers=writers))
    # pylint: enable=no-value-for-parameter

  expected_metrics_for_slice = text_format.Parse(
      """
      slice_key {}
      metrics {
        key: "average_loss"
        value {
          double_value {
            value: 0.5
          }
        }
      }
      metrics {
        key: "post_export_metrics/example_count"
        value {
          double_value {
            value: 2.0
          }
        }
      }
      """, metrics_for_slice_pb2.MetricsForSlice())

  metric_records = []
  for record in tf.compat.v1.python_io.tf_record_iterator(metrics_file):
    metric_records.append(
        metrics_for_slice_pb2.MetricsForSlice.FromString(record))
  self.assertEqual(1, len(metric_records), 'metrics: %s' % metric_records)
  self.assertProtoEquals(expected_metrics_for_slice, metric_records[0])

  expected_plots_for_slice = text_format.Parse(
      """
      slice_key {}
      plots {
        key: "post_export_metrics"
        value {
          calibration_histogram_buckets {
            buckets {
              lower_threshold_inclusive: -inf
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              upper_threshold_exclusive: 0.5
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 1.0
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {
                value: 1.0
              }
            }
          }
        }
      }
      """, metrics_for_slice_pb2.PlotsForSlice())

  plot_records = []
  for record in tf.compat.v1.python_io.tf_record_iterator(plots_file):
    plot_records.append(
        metrics_for_slice_pb2.PlotsForSlice.FromString(record))
  self.assertEqual(1, len(plot_records), 'plots: %s' % plot_records)
  self.assertProtoEquals(expected_plots_for_slice, plot_records[0])
def assertMetricsComputedWithBeamAre(
    self,
    eval_saved_model_path: Text,
    serialized_examples: List[bytes],
    expected_metrics: Dict[Text, Any],
    add_metrics_callbacks: Optional[List[
        types.AddMetricsCallbackType]] = None):
  """Checks metrics computed using Beam.

  Metrics will be computed over all examples, without any slicing. If you
  want to provide your own PCollection (e.g. read a large number of examples
  from a file), if you want to check metrics over certain slices, or if you
  want to add additional post-export metrics, use the more general
  assertGeneralMetricsComputedWithBeamAre.

  Example usage:

    self.assertMetricsComputedWithBeamAre(
        eval_saved_model_path=path,
        serialized_examples=[self.makeExample(age=5, label=1.0),
                             self.makeExample(age=10, label=0.0)],
        expected_metrics={'average_loss': 0.1})

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    serialized_examples: List of serialized example bytes.
    expected_metrics: Dictionary of expected metric values.
    add_metrics_callbacks: Optional. Callbacks for adding additional metrics.
  """

  def check_metrics(got):
    """Check metrics callback."""
    try:
      self.assertEqual(
          1, len(got), 'expecting metrics for exactly one slice, but got %d '
          'slices instead. metrics were: %s' % (len(got), got))
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertDictElementsWithinBounds(
          got_values_dict=value, expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec()],
      model_specs=[config.ModelSpec(location=eval_saved_model_path)],
      output_data_specs=[config.OutputDataSpec()])
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_config=eval_config, eval_shared_model=eval_shared_model)

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    (metrics, _), _ = (
        pipeline
        | 'CreateExamples' >> beam.Create(serialized_examples)
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> Extract(extractors=extractors)
        | 'ComputeMetricsAndPlots' >>
        metrics_and_plots_evaluator.ComputeMetricsAndPlots(
            eval_shared_model=eval_shared_model))
    # pylint: enable=no-value-for-parameter

    beam_util.assert_that(metrics, check_metrics)