def testRunModelAnalysisWithUncertainty(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slice_spec = [slicer.SingleSliceSpec(columns=['language'])]
  eval_result = model_eval_lib.run_model_analysis(
      model_eval_lib.default_eval_shared_model(
          eval_saved_model_path=model_location, example_weight_key='age'),
      data_location,
      slice_spec=slice_spec,
      num_bootstrap_samples=20)
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      ((b'language', b'chinese'),): {
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 8.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      },
      ((b'language', b'english'),): {
          'accuracy': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          'my_mean_label': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 7.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      }
  }
  self.assertEqual(eval_result.config.model_location, model_location)
  self.assertEqual(eval_result.config.data_location, data_location)
  self.assertEqual(eval_result.config.slice_spec, slice_spec)
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def testBuildAnalysisTable(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location)
  example1 = self._makeExample(
      age=3.0, language='english', label=1.0, slice_key='first_slice')

  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | 'CreateInput' >> beam.Create([example1.SerializeToString()])
        | 'BuildTable' >> contrib.BuildAnalysisTable(
            eval_shared_model=eval_shared_model))

    def check_result(got):
      self.assertEqual(1, len(got), 'got: %s' % got)
      extracts = got[0]

      # Values of type MaterializedColumn are emitted to signal to
      # downstream sink components to output the data to file.
      materialized_dict = dict((k, v)
                               for k, v in extracts.items()
                               if isinstance(v, types.MaterializedColumn))
      self._assertMaterializedColumns(
          materialized_dict,
          {
              # Slice key
              'features__slice_key':
                  types.MaterializedColumn(
                      name='features__slice_key', value=[b'first_slice']),

              # Features
              'features__language':
                  types.MaterializedColumn(
                      name='features__language', value=[b'english']),
              'features__age':
                  types.MaterializedColumn(
                      name='features__age',
                      value=np.array([3.], dtype=np.float32)),

              # Label
              'features__label':
                  types.MaterializedColumn(
                      name='features__label',
                      value=np.array([1.], dtype=np.float32)),
              'labels':
                  types.MaterializedColumn(
                      name='labels', value=np.array([1.], dtype=np.float32)),
          })
      self._assertMaterializedColumnsExist(materialized_dict, [
          'predictions__logits', 'predictions__probabilities',
          'predictions__classes', 'predictions__logistic',
          'predictions__class_ids', constants.SLICE_KEYS_KEY
      ])

    util.assert_that(result[constants.ANALYSIS_KEY], check_result)
def assertMetricsComputedWithBeamAre(self, eval_saved_model_path,
                                     serialized_examples, expected_metrics):
  """Checks metrics computed using Beam.

  Metrics will be computed over all examples, without any slicing.

  If you want to provide your own PCollection (e.g. read a large number of
  examples from a file), if you want to check metrics over certain slices,
  or if you want to add additional post-export metrics, use the more general
  assertGeneralMetricsComputedWithBeamAre.

  Example usage:
    self.assertMetricsComputedWithBeamAre(
      eval_saved_model_path=path,
      serialized_examples=[self.makeExample(age=5, label=1.0),
                           self.makeExample(age=10, label=0.0)],
      expected_metrics={'average_loss': 0.1})

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    serialized_examples: List of serialized example bytes.
    expected_metrics: Dictionary of expected metric values.
  """

  def check_metrics(got):
    """Check metrics callback."""
    try:
      self.assertEqual(
          1, len(got), 'expecting metrics for exactly one slice, but got %d '
          'slices instead. metrics were: %s' % (len(got), got))
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertDictElementsWithinBounds(
          got_values_dict=value, expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=eval_saved_model_path)
  extractors = model_eval_lib.default_extractors(
      eval_shared_model=eval_shared_model)

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics, _ = (
        pipeline
        | 'CreateExamples' >> beam.Create(serialized_examples)
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> Extract(extractors=extractors)
        | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
        .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))
    # pylint: enable=no-value-for-parameter

    beam_util.assert_that(metrics, check_metrics)
def testRunModelAnalysisWithMultiplePlots(self):
  model_location = self._exportEvalSavedModel(
      fixed_prediction_estimator.simple_fixed_prediction_estimator)
  examples = [
      self._makeExample(prediction=0.0, label=1.0),
      self._makeExample(prediction=0.7, label=0.0),
      self._makeExample(prediction=0.8, label=1.0),
      self._makeExample(prediction=1.0, label=1.0),
      self._makeExample(prediction=1.0, label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ])
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location,
      add_metrics_callbacks=[
          post_export_metrics.auc_plots(),
          post_export_metrics.auc_plots(metric_tag='test')
      ])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config, eval_shared_models=[eval_shared_model])

  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected_metrics = {
      (): {
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 5.0
          },
      }
  }
  expected_matrix = {
      'threshold': 0.8,
      'falseNegatives': 2.0,
      'trueNegatives': 1.0,
      'truePositives': 2.0,
      'precision': 1.0,
      'recall': 0.5
  }
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected_metrics)
  self.assertEqual(len(eval_result.plots), 1)
  slice_key, plots = eval_result.plots[0]
  self.assertEqual((), slice_key)
  tf.compat.v1.logging.info(plots.keys())
  self.assertDictElementsAlmostEqual(
      plots['']['']['post_export_metrics']['confusionMatrixAtThresholds']
      ['matrices'][8001], expected_matrix)
  self.assertDictElementsAlmostEqual(
      plots['']['']['post_export_metrics/test']['confusionMatrixAtThresholds']
      ['matrices'][8001], expected_matrix)
def testBuildAnalysisTableWithSlices(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location)
  example1 = self._makeExample(
      age=3.0, language='english', label=1.0, slice_key='first_slice')
  slice_spec = [
      slicer.SingleSliceSpec(columns=['age']),
      slicer.SingleSliceSpec(features=[('age', 3)]),
      slicer.SingleSliceSpec(
          columns=['age'], features=[('language', 'english')])
  ]

  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | 'CreateInput' >> beam.Create([example1.SerializeToString()])
        | 'BuildTable' >> contrib.BuildAnalysisTable(eval_shared_model,
                                                     slice_spec))

    def check_result(got):
      self.assertEqual(1, len(got), 'got: %s' % got)
      extracts = got[0]

      # Values of type MaterializedColumn are emitted to signal to
      # downstream sink components to output the data to file.
      materialized_dict = dict((k, v)
                               for k, v in extracts.items()
                               if isinstance(v, types.MaterializedColumn))
      self._assertMaterializedColumns(
          materialized_dict, {
              constants.SLICE_KEYS_KEY:
                  types.MaterializedColumn(
                      name=constants.SLICE_KEYS_KEY,
                      value=[
                          b'age:3.0', b'age:3',
                          b'age_X_language:3.0_X_english'
                      ])
          })
      self._assertMaterializedColumnsExist(materialized_dict, [
          'predictions__logits', 'predictions__probabilities',
          'predictions__classes', 'predictions__logistic',
          'predictions__class_ids'
      ])

    util.assert_that(result[constants.ANALYSIS_KEY], check_result)
def testPredict(self, features_blacklist):
  # features_blacklist is supplied via test parameterization (the decorator is
  # not shown here); it is either None or a list of feature names to exclude.
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=eval_export_dir,
      blacklist_feature_fetches=features_blacklist)
  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=0.0),
    ]
    serialized_examples = [e.SerializeToString() for e in examples]

    predict_extracts = (
        pipeline
        | beam.Create(serialized_examples, reshuffle=False)
        # Our diagnostic outputs pass types.Extracts throughout; however, our
        # aggregating functions do not use this interface.
        | beam.Map(lambda x: {constants.INPUT_KEY: x})
        | 'Predict' >> predict_extractor._TFMAPredict(
            eval_shared_models={'': eval_shared_model},
            desired_batch_size=3))

    def check_result(got):
      try:
        self.assertLen(got, 4)
        for item in got:
          self.assertIn(constants.FEATURES_PREDICTIONS_LABELS_KEY, item)
          fpl = item[constants.FEATURES_PREDICTIONS_LABELS_KEY]
          # Verify fpl contains features, probabilities, and correct labels.
          blacklisted_features = set(features_blacklist or [])
          expected_features = (
              set(['language', 'age', 'label']) - blacklisted_features)
          for feature in expected_features:
            self.assertIn(feature, fpl.features)
          for feature in blacklisted_features:
            self.assertNotIn(feature, fpl.features)
          self.assertAlmostEqual(fpl.features['label'],
                                 fpl.labels['__labels'])
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result)
def testRunModelAnalysisWithMultiplePlots(self):
  model_location = self._exportEvalSavedModel(
      fixed_prediction_estimator.simple_fixed_prediction_estimator)
  examples = [
      self._makeExample(prediction=0.0, label=1.0),
      self._makeExample(prediction=0.7, label=0.0),
      self._makeExample(prediction=0.8, label=1.0),
      self._makeExample(prediction=1.0, label=1.0),
      self._makeExample(prediction=1.0, label=1.0)
  ]
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location,
      add_metrics_callbacks=[
          post_export_metrics.auc_plots(),
          post_export_metrics.auc_plots(metric_tag='test')
      ])
  data_location = self._writeTFExamplesToTFRecords(examples)
  eval_result = model_eval_lib.run_model_analysis(eval_shared_model,
                                                  data_location)

  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected_metrics = {
      (): {
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 5.0
          },
      }
  }
  expected_matrix = {
      'threshold': 0.8,
      'falseNegatives': 2.0,
      'trueNegatives': 1.0,
      'truePositives': 2.0,
      'precision': 1.0,
      'recall': 0.5
  }
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected_metrics)
  self.assertEqual(len(eval_result.plots), 1)
  slice_key, plots = eval_result.plots[0]
  self.assertEqual((), slice_key)
  tf.logging.info(plots.keys())
  self.assertDictElementsAlmostEqual(
      plots['post_export_metrics']['confusionMatrixAtThresholds']
      ['matrices'][8001], expected_matrix)
  self.assertDictElementsAlmostEqual(
      plots['post_export_metrics/test']['confusionMatrixAtThresholds']
      ['matrices'][8001], expected_matrix)
def testBatchedPredict(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=eval_export_dir)
  eval_config = config_pb2.EvalConfig(model_specs=[config_pb2.ModelSpec()])
  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=0.0),
    ]
    serialized_examples = [e.SerializeToString() for e in examples]

    tfx_io = raw_tf_record.RawBeamRecordTFXIO(
        physical_format='inmemory',
        raw_record_column_name=constants.ARROW_INPUT_COLUMN,
        telemetry_descriptors=['TFMATest'])
    extractor = predict_extractor.PredictExtractor(
        eval_shared_model, eval_config=eval_config)
    predict_extracts = (
        pipeline
        | 'Create' >> beam.Create(serialized_examples, reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | 'Predict' >> extractor.ptransform)

    def check_result(got):
      try:
        self.assertLen(got, 2)
        for item in got:
          self.assertIn(constants.FEATURES_KEY, item)
          for feature in ('language', 'age'):
            for features_dict in item[constants.FEATURES_KEY]:
              self.assertIn(feature, features_dict)
          self.assertIn(constants.LABELS_KEY, item)
          self.assertIn(constants.PREDICTIONS_KEY, item)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result, label='result')
def testNoConstructFn(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [self._makeExample(age=3.0, language='english', label=1.0)]
  data_location = self._writeTFExamplesToTFRecords(examples)
  # No construct_fn should fail when Beam attempts to call the construct_fn.
  eval_shared_model = types.EvalSharedModel(model_path=model_location)
  with self.assertRaisesRegexp(TypeError,
                               '\'NoneType\' object is not callable'):
    model_eval_lib.run_model_analysis(
        eval_shared_model=eval_shared_model, data_location=data_location)

  # Using the default_eval_shared_model should pass as it has a construct_fn.
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location)
  model_eval_lib.run_model_analysis(
      eval_shared_model=eval_shared_model, data_location=data_location)
def testPredictMultipleExampleRefPerRawExampleBytes(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fake_multi_examples_per_input_estimator
      .fake_multi_examples_per_input_estimator(None, temp_eval_export_dir))
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=eval_export_dir)

  # The trailing zeros make an "empty" output batch.
  raw_example_bytes = ['0', '3', '1', '0', '2', '0', '0', '0', '0']

  def check_result(got):
    try:
      self.assertLen(got, 6)
      self.assertEqual(['3', '3', '3', '1', '2', '2'],
                       [extracts[constants.INPUT_KEY] for extracts in got])

      for item in got:
        self.assertIn(constants.FEATURES_PREDICTIONS_LABELS_KEY, item)
        fpl = item[constants.FEATURES_PREDICTIONS_LABELS_KEY]
        self.assertIn('input_index', fpl.features)
        self.assertIn('example_count', fpl.features)
        self.assertIn('intra_input_index', fpl.features)
    except AssertionError as err:
      raise util.BeamAssertException(err)

  with beam.Pipeline() as pipeline:
    predict_extracts = (
        pipeline
        | beam.Create(raw_example_bytes, reshuffle=False)
        # Our diagnostic outputs pass types.Extracts throughout; however, our
        # aggregating functions do not use this interface.
        | beam.Map(lambda x: {constants.INPUT_KEY: x})
        | 'Predict' >> predict_extractor._TFMAPredict(
            eval_shared_models={'': eval_shared_model},
            desired_batch_size=3))

    util.assert_that(predict_extracts, check_result)
def testNoConstructFn(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [self._makeExample(age=3.0, language='english', label=1.0)]
  data_location = self._writeTFExamplesToTFRecords(examples)
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ])
  # No construct_fn should fail when Beam attempts to call the construct_fn.
  eval_shared_model = types.EvalSharedModel(model_path=model_location)
  with self.assertRaisesRegexp(AttributeError,
                               '\'NoneType\' object has no attribute'):
    model_eval_lib.run_model_analysis(
        eval_config=eval_config, eval_shared_models=[eval_shared_model])

  # Using the default_eval_shared_model should pass as it has a construct_fn.
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location)
  model_eval_lib.run_model_analysis(
      eval_config=eval_config, eval_shared_models=[eval_shared_model])
def testRunModelAnalysisForCSVText(self):
  model_location = self._exportEvalSavedModel(
      csv_linear_classifier.simple_csv_linear_classifier)
  examples = [
      '3.0,english,1.0', '3.0,chinese,0.0', '4.0,english,1.0',
      '5.0,chinese,1.0'
  ]
  data_location = self._writeCSVToTextFile(examples)
  eval_config = config.EvalConfig(
      input_data_specs=[
          config.InputDataSpec(location=data_location, file_format='text')
      ],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location)
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (): {
          'accuracy': {
              'doubleValue': 0.75
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 4.0
          }
      }
  }
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
def testRunModelAnalysisWithUncertainty(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=1.0),
      self._makeExample(age=5.0, language='hindi', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
  options = config.Options()
  options.compute_confidence_intervals.value = True
  options.k_anonymization_count.value = 2
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs,
      options=options)
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location, example_weight_key='age')
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (('language', 'hindi'),): {
          u'__ERROR__': {
              'debugMessage':
                  u'Example count for this slice key is lower than the '
                  u'minimum required value: 2. No data is aggregated for '
                  u'this slice.'
          },
      },
      (('language', 'chinese'),): {
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 8.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      },
      (('language', 'english'),): {
          'accuracy': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          'my_mean_label': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 7.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      }
  }
  self.assertEqual(eval_result.config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertEqual(eval_result.config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['language']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def testRunModelAnalysisWithQueryBasedMetrics(self):
  input_layer = tf.keras.layers.Input(shape=(1,), name='age')
  output_layer = tf.keras.layers.Dense(
      1, activation=tf.nn.sigmoid)(
          input_layer)
  model = tf.keras.models.Model(input_layer, output_layer)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.binary_crossentropy)

  features = {'age': [[20.0]]}
  labels = [[1]]
  example_weights = [1.0]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
  model.fit(dataset, steps_per_epoch=1)

  model_location = os.path.join(self._getTempDir(), 'export_dir')
  model.save(model_location, save_format='tf')

  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0),
      self._makeExample(age=3.0, language='english', label=0.0),
      self._makeExample(age=5.0, language='chinese', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[
          config.ModelSpec(location=model_location, label_key='label')
      ],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs,
      metrics_specs=metric_specs.specs_from_metrics(
          [ndcg.NDCG(gain_key='age', name='ndcg')],
          binarize=config.BinarizationOptions(top_k_list=[1]),
          query_key='language'))
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, tags=[tf.saved_model.SERVING])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[eval_shared_model],
      evaluators=[
          metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
              eval_config=eval_config,
              eval_shared_models=[eval_shared_model])
      ])

  self.assertEqual(eval_result.config.model_specs[0].location, model_location)
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertLen(eval_result.slicing_metrics, 1)
  got_slice_key, got_metrics = eval_result.slicing_metrics[0]
  self.assertEqual(got_slice_key, ())
  self.assertIn('', got_metrics)  # output_name
  got_metrics = got_metrics['']
  expected_metrics = {
      '': {
          'example_count': True,
          'weighted_example_count': True,
      },
      'topK:1': {
          'ndcg': True,
      },
  }
  for group in expected_metrics:
    self.assertIn(group, got_metrics)
    for k in expected_metrics[group]:
      self.assertIn(k, got_metrics[group])
def testRunModelAnalysisWithKerasModel(self):
  input_layer = tf.keras.layers.Input(shape=(28 * 28,), name='data')
  output_layer = tf.keras.layers.Dense(
      10, activation=tf.nn.softmax)(
          input_layer)
  model = tf.keras.models.Model(input_layer, output_layer)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.categorical_crossentropy)

  features = {'data': [[0.0] * 28 * 28]}
  labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
  example_weights = [1.0]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
  model.fit(dataset, steps_per_epoch=1)

  model_location = os.path.join(self._getTempDir(), 'export_dir')
  model.save(model_location, save_format='tf')

  examples = [
      self._makeExample(data=[0.0] * 28 * 28, label=1.0),
      self._makeExample(data=[1.0] * 28 * 28, label=5.0),
      self._makeExample(data=[1.0] * 28 * 28, label=9.0),
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  metrics_spec = config.MetricsSpec()
  for metric in (tf.keras.metrics.AUC(),):
    cfg = tf.keras.utils.serialize_keras_object(metric)
    metrics_spec.metrics.append(
        config.MetricConfig(
            class_name=cfg['class_name'], config=json.dumps(cfg['config'])))
  for class_id in (0, 5, 9):
    metrics_spec.binarize.class_ids.append(class_id)
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[
          config.ModelSpec(location=model_location, label_key='label')
      ],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      metrics_specs=[metrics_spec])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location,
              tags=[tf.saved_model.SERVING])
      ])
  self.assertEqual(eval_result.config.model_specs[0].location, model_location)
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertLen(eval_result.slicing_metrics, 1)
  got_slice_key, got_metrics = eval_result.slicing_metrics[0]
  self.assertEqual(got_slice_key, ())
  self.assertIn('', got_metrics)  # output_name
  got_metrics = got_metrics['']
  expected_metrics = {
      'classId:0': {
          'auc': True,
      },
      'classId:5': {
          'auc': True,
      },
      'classId:9': {
          'auc': True,
      },
  }
  for class_id in expected_metrics:
    self.assertIn(class_id, got_metrics)
    for k in expected_metrics[class_id]:
      self.assertIn(k, got_metrics[class_id])
def assertMetricsComputedWithBeamAre(
    self,
    eval_saved_model_path: str,
    serialized_examples: List[bytes],
    expected_metrics: Dict[str, Any],
    add_metrics_callbacks: Optional[List[
        types.AddMetricsCallbackType]] = None):
  """Checks metrics computed using Beam.

  Metrics will be computed over all examples, without any slicing.

  If you want to provide your own PCollection (e.g. read a large number of
  examples from a file), if you want to check metrics over certain slices,
  or if you want to add additional post-export metrics, use the more general
  assertGeneralMetricsComputedWithBeamAre.

  Example usage:
    self.assertMetricsComputedWithBeamAre(
      eval_saved_model_path=path,
      serialized_examples=[self.makeExample(age=5, label=1.0),
                           self.makeExample(age=10, label=0.0)],
      expected_metrics={'average_loss': 0.1})

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    serialized_examples: List of serialized example bytes.
    expected_metrics: Dictionary of expected metric values.
    add_metrics_callbacks: Optional. Callbacks for adding additional metrics.
  """

  def check_metrics(got):
    """Check metrics callback."""
    try:
      self.assertEqual(
          1, len(got), 'expecting metrics for exactly one slice, but got %d '
          'slices instead. metrics were: %s' % (len(got), got))
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertDictElementsWithinBounds(
          got_values_dict=value, expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  eval_config = config_pb2.EvalConfig()
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_config=eval_config, eval_shared_model=eval_shared_model)
  tfx_io = raw_tf_record.RawBeamRecordTFXIO(
      physical_format='inmemory',
      raw_record_column_name=constants.ARROW_INPUT_COLUMN,
      telemetry_descriptors=['TFMATest'])

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    (metrics, _), _ = (
        pipeline
        | 'CreateExamples' >> beam.Create(serialized_examples)
        | 'BatchExamples' >> tfx_io.BeamSource()
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | 'Extract' >> Extract(extractors=extractors)
        | 'ComputeMetricsAndPlots' >>
        legacy_metrics_and_plots_evaluator._ComputeMetricsAndPlots(  # pylint: disable=protected-access
            eval_shared_model=eval_shared_model))
    # pylint: enable=no-value-for-parameter

    beam_util.assert_that(metrics, check_metrics)
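# A minimal usage sketch (not from the original suite) of the helper above,
# exercising its optional add_metrics_callbacks argument. It assumes the
# surrounding test class also provides the _exportEvalSavedModel and
# _makeExample helpers used by the analysis tests in this file, and that
# linear_classifier, metric_keys, and post_export_metrics are imported at
# module level.
def testExampleCountComputedWithBeamSketch(self):
  # Export a trivial EvalSavedModel and build two serialized examples
  # (hypothetical inputs chosen only for illustration).
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0),
  ]
  # The example_count post-export metric is added via add_metrics_callbacks
  # and then checked against the number of examples fed to the pipeline.
  self.assertMetricsComputedWithBeamAre(
      eval_saved_model_path=model_location,
      serialized_examples=[e.SerializeToString() for e in examples],
      expected_metrics={metric_keys.EXAMPLE_COUNT: 2.0},
      add_metrics_callbacks=[post_export_metrics.example_count()])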
def testRunModelAnalysis(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=1.0),
      self._makeExample(age=5.0, language='hindi', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slice_spec = [slicer.SingleSliceSpec(columns=['language'])]
  eval_result = model_eval_lib.run_model_analysis(
      model_eval_lib.default_eval_shared_model(
          eval_saved_model_path=model_location, example_weight_key='age'),
      data_location,
      slice_spec=slice_spec,
      k_anonymization_count=2)
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (('language', b'hindi'),): {
          u'__ERROR__': {
              'debugMessage':
                  u'Example count for this slice key is lower than the '
                  u'minimum required value: 2. No data is aggregated for '
                  u'this slice.'
          },
      },
      (('language', b'chinese'),): {
          'accuracy': {
              'doubleValue': 0.5
          },
          'my_mean_label': {
              'doubleValue': 0.5
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 8.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      },
      (('language', b'english'),): {
          'accuracy': {
              'doubleValue': 1.0
          },
          'my_mean_label': {
              'doubleValue': 1.0
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 7.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      }
  }
  self.assertEqual(eval_result.config.model_location, model_location)
  self.assertEqual(eval_result.config.data_location, data_location)
  self.assertEqual(eval_result.config.slice_spec, slice_spec)
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def testRunModelAnalysisWithDeterministicConfidenceIntervals(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=1.0),
      self._makeExample(age=5.0, language='hindi', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
  options = config.Options()
  options.compute_confidence_intervals.value = True
  options.k_anonymization_count.value = 2
  eval_config = config.EvalConfig(
      slicing_specs=slicing_specs, options=options)
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_model=model_eval_lib.default_eval_shared_model(
          eval_saved_model_path=model_location, example_weight_key='age'),
      data_location=data_location,
      output_path=self._getTempDir(),
      random_seed_for_testing=_TEST_SEED)
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (('language', 'hindi'),): {
          u'__ERROR__': {
              'debugMessage':
                  u'Example count for this slice key is lower than the '
                  u'minimum required value: 2. No data is aggregated for '
                  u'this slice.'
          },
      },
      (('language', 'chinese'),): {
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 8.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      },
      (('language', 'english'),): {
          'accuracy': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          'my_mean_label': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 7.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      }
  }
  self.assertEqual(eval_result.model_location, model_location.decode())
  self.assertEqual(eval_result.data_location, data_location)
  self.assertEqual(eval_result.config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['language']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  for key, value in eval_result.slicing_metrics:
    if (('language', 'english'),) == key:
      metric = value['']['']['average_loss']
      self.assertAlmostEqual(
          0.171768754720, metric['boundedValue']['value'], delta=0.1)
      metric = value['']['']['auc_precision_recall']
      self.assertAlmostEqual(
          0.99999940395, metric['boundedValue']['value'], delta=0.1)
  self.assertFalse(eval_result.plots)
def testRunModelAnalysisWithQueryExtractor(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=0.0),
      self._makeExample(age=5.0, language='chinese', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slice_spec = [slicer.SingleSliceSpec()]
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')
  eval_result = model_eval_lib.run_model_analysis(
      eval_shared_model=eval_shared_model,
      data_location=data_location,
      slice_spec=slice_spec,
      evaluators=[
          metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
              eval_shared_model),
          query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
              query_id='language',
              prediction_key='logistic',
              combine_fns=[
                  query_statistics.QueryStatisticsCombineFn(),
                  ndcg.NdcgMetricCombineFn(
                      at_vals=[1], gain_key='label', weight_key='')
              ]),
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (): {
          'post_export_metrics/total_queries': {
              'doubleValue': 2.0
          },
          'post_export_metrics/min_documents': {
              'doubleValue': 2.0
          },
          'post_export_metrics/max_documents': {
              'doubleValue': 2.0
          },
          'post_export_metrics/total_documents': {
              'doubleValue': 4.0
          },
          'post_export_metrics/ndcg@1': {
              'doubleValue': 0.5
          },
          'post_export_metrics/example_weight': {
              'doubleValue': 15.0
          },
          'post_export_metrics/example_count': {
              'doubleValue': 4.0
          },
      }
  }
  self.assertEqual(eval_result.config.model_location, model_location)
  self.assertEqual(eval_result.config.data_location, data_location)
  self.assertEqual(eval_result.config.slice_spec, slice_spec)
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def testRunModelAnalysisExtraFieldsPlusFeatureExtraction(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(
          age=3.0, language='english', label=1.0, my_slice='a'),
      self._makeExample(
          age=3.0, language='chinese', label=0.0, my_slice='a'),
      self._makeExample(
          age=4.0, language='english', label=1.0, my_slice='b'),
      self._makeExample(
          age=5.0, language='chinese', label=1.0, my_slice='c'),
      self._makeExample(age=5.0, language='hindi', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['my_slice'])]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')
  slice_spec = [slicer.SingleSliceSpec(spec=slicing_specs[0])]
  extractors_with_feature_extraction = [
      predict_extractor.PredictExtractor(
          eval_shared_model, desired_batch_size=3, materialize=False),
      feature_extractor.FeatureExtractor(
          extract_source=constants.INPUT_KEY,
          extract_dest=constants.FEATURES_PREDICTIONS_LABELS_KEY),
      slice_key_extractor.SliceKeyExtractor(slice_spec, materialize=False)
  ]
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location, example_weight_key='age')
      ],
      extractors=extractors_with_feature_extraction)
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (('my_slice', 'a'),): {
          'accuracy': {
              'doubleValue': 1.0
          },
          'my_mean_label': {
              'doubleValue': 0.5
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 6.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      },
      (('my_slice', 'b'),): {
          'accuracy': {
              'doubleValue': 1.0
          },
          'my_mean_label': {
              'doubleValue': 1.0
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 4.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 1.0
          },
      },
      (('my_slice', 'c'),): {
          'accuracy': {
              'doubleValue': 0.0
          },
          'my_mean_label': {
              'doubleValue': 1.0
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 5.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 1.0
          },
      },
  }
  self.assertEqual(eval_result.config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertEqual(eval_result.config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['my_slice']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)