def single_model_analysis(
    model_location: Text,
    data_location: Text,
    output_path: Optional[Text] = None,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None) -> EvalResult:
  """Run model analysis for a single model on a single data set.

  This is a convenience wrapper around run_model_analysis for a single model
  with a single data set. For more complex use cases, use
  tfma.run_model_analysis.

  Args:
    model_location: Path to the export eval saved model.
    data_location: The location of the data files.
    output_path: The directory to output metrics and results to. If None, we
      use a temporary directory.
    slice_spec: A list of tfma.slicer.SingleSliceSpec.

  Returns:
    An EvalResult that can be used with the TFMA visualization functions.
  """
  # Get working_dir ready.
  if output_path is None:
    output_path = tempfile.mkdtemp()
  if not tf.io.gfile.exists(output_path):
    tf.io.gfile.makedirs(output_path)

  eval_config = config.EvalConfig(
      slicing_specs=[s.to_proto() for s in slice_spec] if slice_spec else None)

  return run_model_analysis(
      eval_config=eval_config,
      eval_shared_model=default_eval_shared_model(
          eval_saved_model_path=model_location),
      data_location=data_location,
      output_path=output_path)  # pytype: disable=bad-return-type
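# The snippet below is a minimal usage sketch of the convenience wrapper
# above, assuming a hypothetical exported EvalSavedModel directory and a
# TFRecord file of tf.Examples; the paths and the slice column name are
# illustrative only, not part of the library.
#
#   import tensorflow_model_analysis as tfma
#
#   eval_result = single_model_analysis(
#       model_location='/path/to/eval_saved_model',  # hypothetical path
#       data_location='/path/to/examples.tfrecord',  # hypothetical path
#       slice_spec=[tfma.slicer.SingleSliceSpec(columns=['language'])])
#   # The returned EvalResult can then be passed to the TFMA visualization
#   # functions, e.g. tfma.view.render_slicing_metrics(eval_result).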
def testValidateMetricsChangeThresholdAbsoluteFail(self, slicing_specs,
                                                   slice_key):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.LOWER_IS_BETTER,
          absolute={'value': -1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      # Diff = 0 - .333 = -.333 > -1, NOT OK.
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ])
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.333,
      metric_types.MetricKey(name='mean_prediction', is_diff=True):
          -0.333,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
def testSliceKeys(self, model_names, extracts, slice_specs, expected_slices):
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec(name=name) for name in model_names])
  with beam.Pipeline() as pipeline:
    slice_keys_extracts = (
        pipeline
        | 'CreateTestInput' >> beam.Create(extracts)
        | 'ExtractSlices' >> slice_key_extractor.ExtractSliceKeys(
            slice_spec=slice_specs, eval_config=eval_config))

    def check_result(got):
      try:
        self.assertLen(got, 2)
        got_results = []
        for item in got:
          self.assertIn(constants.SLICE_KEY_TYPES_KEY, item)
          got_results.append(sorted(item[constants.SLICE_KEY_TYPES_KEY]))
        self.assertCountEqual(got_results, expected_slices)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(slice_keys_extracts, check_result)
def test_features_extractor_no_features(self):
  model_spec = config.ModelSpec()
  eval_config = config.EvalConfig(model_specs=[model_spec])
  feature_extractor = features_extractor.FeaturesExtractor(eval_config)
  tfx_io = tf_example_record.TFExampleBeamRecord(
      raw_record_column_name=constants.ARROW_INPUT_COLUMN,
      physical_format='inmem',
      telemetry_descriptors=['testing'])
  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | 'Create' >> beam.Create([b''] * 3)
        | 'DecodeToRecordBatch' >> tfx_io.BeamSource(batch_size=3)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | feature_extractor.stage_name >> feature_extractor.ptransform)

    def check_result(got):
      self.assertLen(got, 1)
      self.assertLen(got[0], 3)
      for d in got[0][constants.FEATURES_KEY]:
        self.assertEmpty(d)

    util.assert_that(result, check_result, label='CheckResult')
def testValidateMetricsDivByZero(self):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.HIGHER_IS_BETTER,
          relative={'value': 0.1}))
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(name='candidate'),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ])
              ],
              model_names=['baseline', 'candidate']),
      ],
  )
  sliced_metrics = ((), {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.0,
      metric_types.MetricKey(
          name='mean_prediction', model_name='candidate', is_diff=True):
          0.1,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
def testRunModelAnalysisForCSVText(self):
  model_location = self._exportEvalSavedModel(
      csv_linear_classifier.simple_csv_linear_classifier)
  examples = [
      '3.0,english,1.0', '3.0,chinese,0.0', '4.0,english,1.0',
      '5.0,chinese,1.0'
  ]
  data_location = self._writeCSVToTextFile(examples)
  eval_config = config.EvalConfig(
      input_data_specs=[
          config.InputDataSpec(location=data_location, file_format='text')
      ],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location)
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (): {
          'accuracy': {
              'doubleValue': 0.75
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 4.0
          }
      }
  }
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
def testNoConstructFn(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [self._makeExample(age=3.0, language='english', label=1.0)]
  data_location = self._writeTFExamplesToTFRecords(examples)
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ])
  # No construct_fn should fail when Beam attempts to call the construct_fn.
  eval_shared_model = types.EvalSharedModel(model_path=model_location)
  with self.assertRaisesRegexp(AttributeError,
                               '\'NoneType\' object has no attribute'):
    model_eval_lib.run_model_analysis(
        eval_config=eval_config, eval_shared_models=[eval_shared_model])
  # Using the default_eval_shared_model should pass as it has a construct_fn.
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location)
  model_eval_lib.run_model_analysis(
      eval_config=eval_config, eval_shared_models=[eval_shared_model])
def testValidateMetricsValueThresholdUpperBoundFail(self):
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 > 1, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': 1}))),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((), {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
def assertMetricsComputedWithBeamAre(
    self,
    eval_saved_model_path: Text,
    serialized_examples: List[bytes],
    expected_metrics: Dict[Text, Any],
    add_metrics_callbacks: Optional[List[
        types.AddMetricsCallbackType]] = None):
  """Checks metrics computed using Beam.

  Metrics will be computed over all examples, without any slicing. If you
  want to provide your own PCollection (e.g. to read a large number of
  examples from a file), check metrics over certain slices, or add
  additional post-export metrics, use the more general
  assertGeneralMetricsComputedWithBeamAre.

  Example usage:
    self.assertMetricsComputedWithBeamAre(
        eval_saved_model_path=path,
        serialized_examples=[self.makeExample(age=5, label=1.0),
                             self.makeExample(age=10, label=0.0)],
        expected_metrics={'average_loss': 0.1})

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    serialized_examples: List of serialized example bytes.
    expected_metrics: Dictionary of expected metric values.
    add_metrics_callbacks: Optional. Callbacks for adding additional metrics.
  """

  def check_metrics(got):
    """Check metrics callback."""
    try:
      self.assertEqual(
          1, len(got), 'expecting metrics for exactly one slice, but got %d '
          'slices instead. metrics were: %s' % (len(got), got))
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertDictElementsWithinBounds(
          got_values_dict=value, expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  eval_config = config.EvalConfig()
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_config=eval_config, eval_shared_model=eval_shared_model)

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    (metrics, _), _ = (
        pipeline
        | 'CreateExamples' >> beam.Create(serialized_examples)
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> Extract(extractors=extractors)
        | 'ComputeMetricsAndPlots' >>
        legacy_metrics_and_plots_evaluator.ComputeMetricsAndPlots(
            eval_shared_model=eval_shared_model))
    # pylint: enable=no-value-for-parameter

    beam_util.assert_that(metrics, check_metrics)
def testLabelsExtractorMultiModel(self): model_spec1 = config.ModelSpec(name='model1', label_key='label') model_spec2 = config.ModelSpec(name='model2', label_keys={ 'output1': 'label1', 'output2': 'label2' }) eval_config = config.EvalConfig(model_specs=[model_spec1, model_spec2]) feature_extractor = features_extractor.FeaturesExtractor(eval_config) label_extractor = labels_extractor.LabelsExtractor(eval_config) schema = text_format.Parse( """ feature { name: "label" type: FLOAT } feature { name: "label1" type: FLOAT } feature { name: "label2" type: FLOAT } feature { name: "fixed_int" type: INT } """, schema_pb2.Schema()) tfx_io = test_util.InMemoryTFExampleRecord( schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN) examples = [ self._makeExample(label=1.0, label1=1.0, label2=0.0, fixed_int=1), self._makeExample(label=1.0, label1=1.0, label2=1.0, fixed_int=1) ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples], reshuffle=False) | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2) | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts() | feature_extractor.stage_name >> feature_extractor.ptransform | label_extractor.stage_name >> label_extractor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 1) for model_name in ('model1', 'model2'): self.assertIn(model_name, got[0][constants.LABELS_KEY][0]) self.assertAlmostEqual( got[0][constants.LABELS_KEY][0]['model1'], np.array([1.0])) self.assertDictElementsAlmostEqual( got[0][constants.LABELS_KEY][0]['model2'], { 'output1': np.array([1.0]), 'output2': np.array([0.0]) }) for model_name in ('model1', 'model2'): self.assertIn(model_name, got[0][constants.LABELS_KEY][1]) self.assertAlmostEqual( got[0][constants.LABELS_KEY][1]['model1'], np.array([1.0])) self.assertDictElementsAlmostEqual( got[0][constants.LABELS_KEY][1]['model2'], { 'output1': np.array([1.0]), 'output2': np.array([1.0]) }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, check_result, label='result')
def testWriteMetricsAndPlots(self): metrics_file = os.path.join(self._getTempDir(), 'metrics') plots_file = os.path.join(self._getTempDir(), 'plots') temp_eval_export_dir = os.path.join(self._getTempDir(), 'eval_export_dir') _, eval_export_dir = ( fixed_prediction_estimator.simple_fixed_prediction_estimator( None, temp_eval_export_dir)) eval_config = config.EvalConfig( model_specs=[config.ModelSpec()], options=config.Options( disabled_outputs={'values': ['eval_config.json']})) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=eval_export_dir, add_metrics_callbacks=[ post_export_metrics.example_count(), post_export_metrics.calibration_plot_and_prediction_histogram( num_buckets=2) ]) extractors = [ predict_extractor.PredictExtractor(eval_shared_model), slice_key_extractor.SliceKeyExtractor() ] evaluators = [ metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(eval_shared_model) ] output_paths = { constants.METRICS_KEY: metrics_file, constants.PLOTS_KEY: plots_file } writers = [ metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter( output_paths, eval_shared_model.add_metrics_callbacks) ] with beam.Pipeline() as pipeline: example1 = self._makeExample(prediction=0.0, label=1.0) example2 = self._makeExample(prediction=1.0, label=1.0) # pylint: disable=no-value-for-parameter _ = ( pipeline | 'Create' >> beam.Create([ example1.SerializeToString(), example2.SerializeToString(), ]) | 'ExtractEvaluateAndWriteResults' >> model_eval_lib.ExtractEvaluateAndWriteResults( eval_config=eval_config, eval_shared_model=eval_shared_model, extractors=extractors, evaluators=evaluators, writers=writers)) # pylint: enable=no-value-for-parameter expected_metrics_for_slice = text_format.Parse( """ slice_key {} metrics { key: "average_loss" value { double_value { value: 0.5 } } } metrics { key: "post_export_metrics/example_count" value { double_value { value: 2.0 } } } """, metrics_for_slice_pb2.MetricsForSlice()) metric_records = [] for record in tf.compat.v1.python_io.tf_record_iterator(metrics_file): metric_records.append( metrics_for_slice_pb2.MetricsForSlice.FromString(record)) self.assertEqual(1, len(metric_records), 'metrics: %s' % metric_records) self.assertProtoEquals(expected_metrics_for_slice, metric_records[0]) expected_plots_for_slice = text_format.Parse( """ slice_key {} plots { key: "post_export_metrics" value { calibration_histogram_buckets { buckets { lower_threshold_inclusive: -inf num_weighted_examples {} total_weighted_label {} total_weighted_refined_prediction {} } buckets { upper_threshold_exclusive: 0.5 num_weighted_examples { value: 1.0 } total_weighted_label { value: 1.0 } total_weighted_refined_prediction {} } buckets { lower_threshold_inclusive: 0.5 upper_threshold_exclusive: 1.0 num_weighted_examples { } total_weighted_label {} total_weighted_refined_prediction {} } buckets { lower_threshold_inclusive: 1.0 upper_threshold_exclusive: inf num_weighted_examples { value: 1.0 } total_weighted_label { value: 1.0 } total_weighted_refined_prediction { value: 1.0 } } } } } """, metrics_for_slice_pb2.PlotsForSlice()) plot_records = [] for record in tf.compat.v1.python_io.tf_record_iterator(plots_file): plot_records.append( metrics_for_slice_pb2.PlotsForSlice.FromString(record)) self.assertEqual(1, len(plot_records), 'plots: %s' % plot_records) self.assertProtoEquals(expected_plots_for_slice, plot_records[0])
class ModelUtilTest(testutil.TensorflowModelAnalysisTest, parameterized.TestCase): def createDenseInputsSchema(self): return text_format.Parse( """ tensor_representation_group { key: "" value { tensor_representation { key: "input_1" value { dense_tensor { column_name: "input_1" shape { dim { size: 1 } } } } } tensor_representation { key: "input_2" value { dense_tensor { column_name: "input_2" shape { dim { size: 1 } } } } } } } feature { name: "input_1" type: FLOAT } feature { name: "input_2" type: FLOAT } feature { name: "non_model_feature" type: INT } """, schema_pb2.Schema()) def createModelWithSingleInput(self, save_as_keras): input_layer = tf.keras.layers.Input(shape=(1, ), name='input') output_layer = tf.keras.layers.Dense( 1, activation=tf.nn.sigmoid)(input_layer) model = tf.keras.models.Model(input_layer, output_layer) @tf.function def serving_default(s): return model(s) input_spec = { 'input': tf.TensorSpec(shape=(None, 1), dtype=tf.string, name='input'), } signatures = { 'serving_default': serving_default.get_concrete_function(input_spec), 'custom_signature': serving_default.get_concrete_function(input_spec), } export_path = tempfile.mkdtemp() if save_as_keras: model.save(export_path, save_format='tf', signatures=signatures) else: tf.saved_model.save(model, export_path, signatures=signatures) return export_path def createModelWithMultipleDenseInputs(self, save_as_keras): input1 = tf.keras.layers.Input(shape=(1, ), name='input_1') input2 = tf.keras.layers.Input(shape=(1, ), name='input_2') inputs = [input1, input2] input_layer = tf.keras.layers.concatenate(inputs) output_layer = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid, name='output')(input_layer) model = tf.keras.models.Model(inputs, output_layer) # Add custom attribute to model to test callables stored as attributes model.custom_attribute = tf.keras.models.Model(inputs, output_layer) @tf.function def serving_default(serialized_tf_examples): parsed_features = tf.io.parse_example( serialized_tf_examples, { 'input_1': tf.io.FixedLenFeature([1], dtype=tf.float32), 'input_2': tf.io.FixedLenFeature([1], dtype=tf.float32) }) return model(parsed_features) @tf.function def custom_single_output(features): return model(features) @tf.function def custom_multi_output(features): return {'output1': model(features), 'output2': model(features)} input_spec = tf.TensorSpec(shape=(None, ), dtype=tf.string, name='examples') custom_input_spec = { 'input_1': tf.TensorSpec(shape=(None, 1), dtype=tf.float32, name='input_1'), 'input_2': tf.TensorSpec(shape=(None, 1), dtype=tf.float32, name='input_2') } signatures = { 'serving_default': serving_default.get_concrete_function(input_spec), 'custom_single_output': custom_single_output.get_concrete_function(custom_input_spec), 'custom_multi_output': custom_multi_output.get_concrete_function(custom_input_spec) } export_path = tempfile.mkdtemp() if save_as_keras: model.save(export_path, save_format='tf', signatures=signatures) else: tf.saved_model.save(model, export_path, signatures=signatures) return export_path def createModelWithMultipleMixedInputs(self, save_as_keras): dense_input = tf.keras.layers.Input(shape=(2, ), name='input_1', dtype=tf.int64) dense_float_input = tf.cast(dense_input, tf.float32) sparse_input = tf.keras.layers.Input(shape=(1, ), name='input_2', sparse=True) dense_sparse_input = tf.keras.layers.Dense( 1, name='dense_input2')(sparse_input) ragged_input = tf.keras.layers.Input(shape=(None, ), name='input_3', ragged=True) dense_ragged_input = tf.keras.layers.Lambda(lambda x: 
x.to_tensor())( ragged_input) dense_ragged_input.set_shape((None, 1)) inputs = [dense_input, sparse_input, ragged_input] input_layer = tf.keras.layers.concatenate( [dense_float_input, dense_sparse_input, dense_ragged_input]) output_layer = tf.keras.layers.Dense( 1, activation=tf.nn.sigmoid)(input_layer) model = tf.keras.models.Model(inputs, output_layer) @tf.function def serving_default(features): return model(features) input_spec = { 'input_1': tf.TensorSpec(shape=(None, 2), dtype=tf.int64, name='input_1'), 'input_2': tf.SparseTensorSpec(shape=(None, 1), dtype=tf.float32), 'input_3': tf.RaggedTensorSpec(shape=(None, 1), dtype=tf.float32) } signatures = { 'serving_default': serving_default.get_concrete_function(input_spec), 'custom_signature': serving_default.get_concrete_function(input_spec), } export_path = tempfile.mkdtemp() if save_as_keras: model.save(export_path, save_format='tf', signatures=signatures) else: tf.saved_model.save(model, export_path, signatures=signatures) return export_path def testFilterByInputNames(self): tensors = { 'f1': tf.constant([[1.1], [2.1]], dtype=tf.float32), 'f2': tf.constant([[1], [2]], dtype=tf.int64), 'f3': tf.constant([['hello'], ['world']], dtype=tf.string) } filtered_tensors = model_util.filter_by_input_names( tensors, ['f1', 'f3']) self.assertLen(filtered_tensors, 2) self.assertAllEqual(tf.constant([[1.1], [2.1]], dtype=tf.float32), filtered_tensors['f1']) self.assertAllEqual( tf.constant([['hello'], ['world']], dtype=tf.string), filtered_tensors['f3']) @parameterized.named_parameters( ('one_baseline', text_format.Parse( """ model_specs { name: "candidate" } model_specs { name: "baseline" is_baseline: true } """, config.EvalConfig()), text_format.Parse( """ name: "baseline" is_baseline: true """, config.ModelSpec())), ('no_baseline', text_format.Parse( """ model_specs { name: "candidate" } """, config.EvalConfig()), None), ) def test_get_baseline_model(self, eval_config, expected_baseline_model_spec): self.assertEqual(expected_baseline_model_spec, model_util.get_baseline_model_spec(eval_config)) @parameterized.named_parameters( ('one_non_baseline', text_format.Parse( """ model_specs { name: "candidate" } model_specs { name: "baseline" is_baseline: true } """, config.EvalConfig()), [ text_format.Parse( """ name: "candidate" """, config.ModelSpec()) ]), ('no_non_baseline', text_format.Parse( """ model_specs { name: "baseline" is_baseline: true } """, config.EvalConfig()), []), ) def test_get_non_baseline_model(self, eval_config, expected_non_baseline_model_specs): self.assertCountEqual( expected_non_baseline_model_specs, model_util.get_non_baseline_model_specs(eval_config)) def testFilterByInputNamesKeras(self): tensors = { 'f1': tf.constant([[1.1], [2.1]], dtype=tf.float32), 'f2': tf.constant([[1], [2]], dtype=tf.int64), 'f3': tf.constant([['hello'], ['world']], dtype=tf.string) } filtered_tensors = model_util.filter_by_input_names( tensors, [ 'f1' + model_util.KERAS_INPUT_SUFFIX, 'f3' + model_util.KERAS_INPUT_SUFFIX ]) self.assertLen(filtered_tensors, 2) self.assertAllEqual( tf.constant([[1.1], [2.1]], dtype=tf.float32), filtered_tensors['f1' + model_util.KERAS_INPUT_SUFFIX]) self.assertAllEqual( tf.constant([['hello'], ['world']], dtype=tf.string), filtered_tensors['f3' + model_util.KERAS_INPUT_SUFFIX]) @parameterized.named_parameters( ('output_name_and_label_key', config.ModelSpec(label_key='label'), 'output', 'label'), ('output_name_and_label_keys', config.ModelSpec(label_keys={'output': 'label'}), 'output', 'label'), 
('output_name_and_no_label_keys', config.ModelSpec(), 'output', None), ('no_output_name_and_label_key', config.ModelSpec(label_key='label'), '', 'label'), ('no_output_name_and_no_label_keys', config.ModelSpec(), '', None)) def testGetLabelKey(self, model_spec, output_name, expected_label_key): self.assertEqual(expected_label_key, model_util.get_label_key(model_spec, output_name)) def testGetLabelKeyNoOutputAndLabelKeys(self): with self.assertRaises(ValueError): model_util.get_label_key( config.ModelSpec(label_keys={'output1': 'label'}), '') @parameterized.named_parameters( { 'testcase_name': 'single_model_single_key', 'model_specs': [config.ModelSpec(label_key='feature1')], 'field': 'label_key', 'multi_output_field': 'label_keys', 'expected_values': [ [1.0, 1.1, 1.2], ] }, { 'testcase_name': 'single_model_multi_key', 'model_specs': [ config.ModelSpec(label_keys={ 'output1': 'feature1', 'output2': 'feature2' }) ], 'field': 'label_key', 'multi_output_field': 'label_keys', 'expected_values': [ { 'output1': [1.0, 1.1, 1.2], 'output2': [2.0, 2.1, 2.2] }, ] }, { 'testcase_name': 'multi_model_single_key', 'model_specs': [ config.ModelSpec(name='model1', example_weight_key='feature2'), config.ModelSpec(name='model2', example_weight_key='feature3') ], 'field': 'example_weight_key', 'multi_output_field': 'example_weight_keys', 'expected_values': [ { 'model1': [2.0, 2.1, 2.2], 'model2': [3.0, 3.1, 3.2] }, ] }, { 'testcase_name': 'multi_model_multi_key', 'model_specs': [ config.ModelSpec(name='model1', prediction_keys={ 'output1': 'feature1', 'output2': 'feature2' }), config.ModelSpec(name='model2', prediction_keys={ 'output1': 'feature1', 'output3': 'feature3' }) ], 'field': 'prediction_key', 'multi_output_field': 'prediction_keys', 'expected_values': [ { 'model1': { 'output1': [1.0, 1.1, 1.2], 'output2': [2.0, 2.1, 2.2] }, 'model2': { 'output1': [1.0, 1.1, 1.2], 'output3': [3.0, 3.1, 3.2] } }, ] }, ) def testGetFeatureValuesForModelSpecField(self, model_specs, field, multi_output_field, expected_values): extracts = { # Only need the num_rows from RecordBatch so use fake array of same len # as features. constants.ARROW_RECORD_BATCH_KEY: pa.RecordBatch.from_arrays([pa.array([1])], ['dummy']), constants.FEATURES_KEY: [ { 'feature1': [1.0, 1.1, 1.2], 'feature2': [2.0, 2.1, 2.2], 'feature3': [3.0, 3.1, 3.2], }, ] } got = model_util.get_feature_values_for_model_spec_field( model_specs, field, multi_output_field, extracts) self.assertAlmostEqual(expected_values, got) @parameterized.named_parameters( { 'testcase_name': 'single_model_single_key', 'model_specs': [config.ModelSpec(label_key='feature2')], 'field': 'label_key', 'multi_output_field': 'label_keys', 'expected_values': [ [4.0, 4.1, 4.2], ] }, { 'testcase_name': 'single_model_multi_key', 'model_specs': [ config.ModelSpec(label_keys={ 'output1': 'feature1', 'output2': 'feature2' }) ], 'field': 'label_key', 'multi_output_field': 'label_keys', 'expected_values': [ { 'output1': [1.0, 1.1, 1.2], 'output2': [4.0, 4.1, 4.2] }, ] }, ) def testGetFeatureValuesForModelSpecFieldWithSingleModelTransforedFeatures( self, model_specs, field, multi_output_field, expected_values): extracts = { # Only need the num_rows from RecordBatch so use fake array of same len # as features. 
constants.ARROW_RECORD_BATCH_KEY: pa.RecordBatch.from_arrays([pa.array([1])], ['dummy']), constants.FEATURES_KEY: [ { 'feature1': [1.0, 1.1, 1.2], 'feature2': [2.0, 2.1, 2.2], }, ], constants.TRANSFORMED_FEATURES_KEY: [ { 'feature2': [4.0, 4.1, 4.2], }, ] } got = model_util.get_feature_values_for_model_spec_field( model_specs, field, multi_output_field, extracts) self.assertAlmostEqual(expected_values, got) @parameterized.named_parameters( { 'testcase_name': 'multi_model_single_key', 'model_specs': [ config.ModelSpec(name='model1', example_weight_key='feature2'), config.ModelSpec(name='model2', example_weight_key='feature3') ], 'field': 'example_weight_key', 'multi_output_field': 'example_weight_keys', 'expected_values': [ { 'model1': [4.0, 4.1, 4.2], 'model2': [7.0, 7.1, 7.2] }, ] }, { 'testcase_name': 'multi_model_multi_key', 'model_specs': [ config.ModelSpec(name='model1', example_weight_keys={ 'output1': 'feature1', 'output2': 'feature2' }), config.ModelSpec(name='model2', example_weight_keys={ 'output1': 'feature1', 'output3': 'feature3' }) ], 'field': 'example_weight_key', 'multi_output_field': 'example_weight_keys', 'expected_values': [ { 'model1': { 'output1': [1.0, 1.1, 1.2], 'output2': [4.0, 4.1, 4.2] }, 'model2': { 'output1': [1.0, 1.1, 1.2], 'output3': [7.0, 7.1, 7.2] } }, ] }, ) def testGetFeatureValuesForModelSpecFieldWithMultiModelTransforedFeatures( self, model_specs, field, multi_output_field, expected_values): extracts = { # Only need the num_rows from RecordBatch so use fake array of same len # as features. constants.ARROW_RECORD_BATCH_KEY: pa.RecordBatch.from_arrays([pa.array([1])], ['dummy']), constants.FEATURES_KEY: [ { 'feature1': [1.0, 1.1, 1.2], 'feature2': [2.0, 2.1, 2.2], }, ], constants.TRANSFORMED_FEATURES_KEY: [ { 'model1': { 'feature2': [4.0, 4.1, 4.2], 'feature3': [5.0, 5.1, 5.2] }, 'model2': { 'feature2': [6.0, 6.1, 6.2], 'feature3': [7.0, 7.1, 7.2] } }, ] } got = model_util.get_feature_values_for_model_spec_field( model_specs, field, multi_output_field, extracts) self.assertAlmostEqual(expected_values, got) def testGetFeatureValuesForModelSpecFieldNoValues(self): model_spec = config.ModelSpec(name='model1', example_weight_key='feature2') extracts = { constants.ARROW_RECORD_BATCH_KEY: pa.RecordBatch.from_arrays([pa.array([1])], ['dummy']), } got = model_util.get_feature_values_for_model_spec_field( [model_spec], 'example_weight', 'example_weights', extracts) self.assertIsNone(got) @parameterized.named_parameters( ('keras_serving_default', True, 'serving_default'), ('keras_custom_signature', True, 'custom_signature'), ('tf2_serving_default', False, 'serving_default'), ('tf2_custom_signature', False, 'custom_signature')) def testGetCallableWithSignatures(self, save_as_keras, signature_name): export_path = self.createModelWithSingleInput(save_as_keras) if save_as_keras: model = tf.keras.models.load_model(export_path) else: model = tf.compat.v1.saved_model.load_v2(export_path) self.assertIsNotNone(model_util.get_callable(model, signature_name)) @parameterized.named_parameters(('keras', True), ('tf2', False)) def testGetCallableWithMissingSignatures(self, save_as_keras): export_path = self.createModelWithSingleInput(save_as_keras) if save_as_keras: model = tf.keras.models.load_model(export_path) else: model = tf.compat.v1.saved_model.load_v2(export_path) with self.assertRaises(ValueError): model_util.get_callable(model, 'non_existent') @unittest.skipIf(_TF_MAJOR_VERSION < 2, 'not all input types supported for TF1') def testGetCallableWithKerasModel(self): 
export_path = self.createModelWithMultipleMixedInputs(True) model = tf.keras.models.load_model(export_path) self.assertEqual(model, model_util.get_callable(model)) @parameterized.named_parameters( ('keras_serving_default', True, 'serving_default'), ('keras_custom_signature', True, 'custom_signature'), ('tf2_serving_default', False, None), ('tf2_custom_signature', False, 'custom_signature')) def testGetInputSpecsWithSignatures(self, save_as_keras, signature_name): export_path = self.createModelWithSingleInput(save_as_keras) if save_as_keras: model = tf.keras.models.load_model(export_path) else: model = tf.compat.v1.saved_model.load_v2(export_path) self.assertEqual( { 'input': tf.TensorSpec(name='input', shape=(None, 1), dtype=tf.string), }, model_util.get_input_specs(model, signature_name)) @parameterized.named_parameters(('keras', True), ('tf2', False)) def testGetInputSpecsWithMissingSignatures(self, save_as_keras): export_path = self.createModelWithSingleInput(save_as_keras) if save_as_keras: model = tf.keras.models.load_model(export_path) else: model = tf.compat.v1.saved_model.load_v2(export_path) with self.assertRaises(ValueError): model_util.get_callable(model, 'non_existent') @unittest.skipIf(_TF_MAJOR_VERSION < 2, 'not all input types supported for TF1') def testGetInputSpecsWithKerasModel(self): export_path = self.createModelWithMultipleMixedInputs(True) model = tf.keras.models.load_model(export_path) # Some versions of TF set the TensorSpec.name and others do not. Since we # don't care about the name, clear it from the output for testing purposes specs = model_util.get_input_specs(model) for k, v in specs.items(): if isinstance(v, tf.TensorSpec): specs[k] = tf.TensorSpec(shape=v.shape, dtype=v.dtype) self.assertEqual( { 'input_1': tf.TensorSpec(shape=(None, 2), dtype=tf.int64), 'input_2': tf.SparseTensorSpec(shape=(None, 1), dtype=tf.float32), 'input_3': tf.RaggedTensorSpec(shape=(None, None), dtype=tf.float32), }, specs) def testInputSpecsToTensorRepresentations(self): tensor_representations = model_util.input_specs_to_tensor_representations( { 'input_1': tf.TensorSpec(shape=(None, 2), dtype=tf.int64), 'input_2': tf.SparseTensorSpec(shape=(None, 1), dtype=tf.float32), 'input_3': tf.RaggedTensorSpec(shape=(None, None), dtype=tf.float32), }) dense_tensor_representation = text_format.Parse( """ dense_tensor { column_name: "input_1" shape { dim { size: 2 } } } """, schema_pb2.TensorRepresentation()) sparse_tensor_representation = text_format.Parse( """ varlen_sparse_tensor { column_name: "input_2" } """, schema_pb2.TensorRepresentation()) ragged_tensor_representation = text_format.Parse( """ ragged_tensor { feature_path { step: "input_3" } } """, schema_pb2.TensorRepresentation()) self.assertEqual( { 'input_1': dense_tensor_representation, 'input_2': sparse_tensor_representation, 'input_3': ragged_tensor_representation }, tensor_representations) def testInputSpecsToTensorRepresentationsRaisesWithUnknownDims(self): with self.assertRaises(ValueError): model_util.input_specs_to_tensor_representations({ 'input_1': tf.TensorSpec(shape=(None, None), dtype=tf.int64), }) @parameterized.named_parameters( ('keras_default', True, { constants.PREDICTIONS_KEY: { '': [None] } }, None, False, True, 1), ('tf_default', False, { constants.PREDICTIONS_KEY: { '': [None] } }, None, False, True, 1), ('keras_serving_default', True, { constants.PREDICTIONS_KEY: { '': ['serving_default'] } }, None, False, True, 1), ('tf_serving_default', False, { constants.PREDICTIONS_KEY: { '': ['serving_default'] } }, 
None, False, True, 1), ('keras_custom_single_output', True, { constants.PREDICTIONS_KEY: { '': ['custom_single_output'] } }, None, False, True, 1), ('tf_custom_single_output', False, { constants.PREDICTIONS_KEY: { '': ['custom_single_output'] } }, None, False, True, 1), ('keras_custom_multi_output', True, { constants.PREDICTIONS_KEY: { '': ['custom_multi_output'] } }, None, False, True, 2), ('tf_custom_multi_output', False, { constants.PREDICTIONS_KEY: { '': ['custom_multi_output'] } }, None, False, True, 2), ('multi_model', True, { constants.PREDICTIONS_KEY: { 'model1': ['custom_multi_output'], 'model2': ['custom_multi_output'] } }, None, False, True, 2), ('default_signatures', True, { constants.PREDICTIONS_KEY: { '': [], } }, ['unknown', 'custom_single_output'], False, True, 1), ('keras_prefer_dict_outputs', True, { constants.FEATURES_KEY: { '': [], } }, ['unknown', 'custom_single_output', 'custom_multi_output' ], True, True, 3), ('tf_prefer_dict_outputs', False, { constants.FEATURES_KEY: { '': [], } }, ['unknown', 'custom_single_output', 'custom_multi_output' ], True, True, 3), ('custom_attribute', True, { constants.FEATURES_KEY: { '': ['custom_attribute'], } }, None, True, True, 1), ('keras_no_schema', True, { constants.PREDICTIONS_KEY: { '': [None] } }, None, False, False, 1), ('tf_no_schema', False, { constants.PREDICTIONS_KEY: { '': [None] } }, None, False, False, 1), ) @unittest.skipIf(_TF_MAJOR_VERSION < 2, 'not all signatures supported for TF1') def testModelSignaturesDoFn(self, save_as_keras, signature_names, default_signature_names, prefer_dict_outputs, use_schema, expected_num_outputs): export_path = self.createModelWithMultipleDenseInputs(save_as_keras) eval_shared_models = {} model_specs = [] for sigs in signature_names.values(): for model_name in sigs: if model_name not in eval_shared_models: eval_shared_models[ model_name] = self.createTestEvalSharedModel( eval_saved_model_path=export_path, model_name=model_name, tags=[tf.saved_model.SERVING]) model_specs.append(config.ModelSpec(name=model_name)) eval_config = config.EvalConfig(model_specs=model_specs) schema = self.createDenseInputsSchema() if use_schema else None tfx_io = tf_example_record.TFExampleBeamRecord( physical_format='text', schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN) tensor_adapter_config = None if use_schema: tensor_adapter_config = tensor_adapter.TensorAdapterConfig( arrow_schema=tfx_io.ArrowSchema(), tensor_representations=tfx_io.TensorRepresentations()) examples = [ self._makeExample(input_1=1.0, input_2=2.0), self._makeExample(input_1=3.0, input_2=4.0), self._makeExample(input_1=5.0, input_2=6.0), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = (pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples]) | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3) | 'ToExtracts' >> beam.Map(_record_batch_to_extracts) | 'ModelSignatures' >> beam.ParDo( model_util.ModelSignaturesDoFn( eval_config=eval_config, eval_shared_models=eval_shared_models, signature_names=signature_names, default_signature_names=default_signature_names, prefer_dict_outputs=prefer_dict_outputs, tensor_adapter_config=tensor_adapter_config))) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 1) for key in signature_names: self.assertIn(key, got[0]) if prefer_dict_outputs: for entry in got[0][key]: self.assertIsInstance(entry, dict) self.assertLen(entry, expected_num_outputs) except AssertionError as err: raise 
util.BeamAssertException(err) util.assert_that(result, check_result, label='result') def testHasRubberStamp(self): # Model agnostic. self.assertFalse(model_util.has_rubber_stamp(None)) # All non baseline models has rubber stamp. baseline = self.createTestEvalSharedModel( model_name=constants.BASELINE_KEY, is_baseline=True) candidate = self.createTestEvalSharedModel( model_name=constants.CANDIDATE_KEY, rubber_stamp=True) self.assertTrue(model_util.has_rubber_stamp([baseline, candidate])) # Not all non baseline has rubber stamp. candidate_nr = self.createTestEvalSharedModel( model_name=constants.CANDIDATE_KEY) self.assertFalse(model_util.has_rubber_stamp([candidate_nr])) self.assertFalse( model_util.has_rubber_stamp([baseline, candidate, candidate_nr]))
def testValidateMetricsMetricTDistributionChangeAndThreshold(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.LOWER_IS_BETTER,
          absolute={'value': -1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='AUC',
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (
      slice_key,
      {
          # This is the mean of the diff.
          metric_types.MetricKey(name='auc', model_name='baseline'):
              types.ValueWithTDistribution(
                  sample_mean=0.91, unsampled_value=0.6),
          metric_types.MetricKey(name='auc', is_diff=True):
              types.ValueWithTDistribution(
                  sample_mean=0.1, unsampled_value=0.1),
      })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        failures {
          metric_key {
            name: "auc"
            is_diff: true
          }
          metric_value {
            double_value {
              value: 0.1
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  expected.metric_validations_per_slice[0].failures[
      0].metric_threshold.CopyFrom(threshold)
  expected.metric_validations_per_slice[0].slice_key.CopyFrom(
      slicer.serialize_slice_key(slice_key))
  for spec in slicing_specs or [None]:
    if (spec is None or
        slicer.SingleSliceSpec(spec=spec).is_slice_applicable(slice_key)):
      slicing_details = expected.validation_details.slicing_details.add()
      if spec is not None:
        slicing_details.slicing_spec.CopyFrom(spec)
      else:
        slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
      slicing_details.num_matching_slices = 1
  self.assertEqual(result, expected)
def testPredictExtractorWithSequentialKerasModel(self): # Note that the input will be called 'test_input' model = tf.keras.models.Sequential([ tf.keras.layers.Dense(1, activation=tf.nn.sigmoid, input_shape=(2, ), name='test') ]) model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001), loss=tf.keras.losses.binary_crossentropy, metrics=['accuracy']) train_features = {'test_input': [[0.0, 0.0], [1.0, 1.0]]} labels = [[1], [0]] example_weights = [1.0, 0.5] dataset = tf.data.Dataset.from_tensor_slices( (train_features, labels, example_weights)) dataset = dataset.shuffle(buffer_size=1).repeat().batch(2) model.fit(dataset, steps_per_epoch=1) export_dir = self._getExportDir() model.save(export_dir, save_format='tf') eval_config = config.EvalConfig(model_specs=[config.ModelSpec()]) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) schema = text_format.Parse( """ tensor_representation_group { key: "" value { tensor_representation { key: "test" value { dense_tensor { column_name: "test" shape { dim { size: 2 } } } } } } } feature { name: "test" type: FLOAT } feature { name: "non_model_feature" type: INT } """, schema_pb2.Schema()) tfx_io = test_util.InMemoryTFExampleRecord( schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY) tensor_adapter_config = tensor_adapter.TensorAdapterConfig( arrow_schema=tfx_io.ArrowSchema(), tensor_representations=tfx_io.TensorRepresentations()) input_extractor = batched_input_extractor.BatchedInputExtractor( eval_config) predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor( eval_config=eval_config, eval_shared_model=eval_shared_model, tensor_adapter_config=tensor_adapter_config) # Notice that the features are 'test' but the model expects 'test_input'. # This tests that the PredictExtractor properly handles this case. examples = [ self._makeExample( test=[0.0, 0.0], non_model_feature=0), # should be ignored by model self._makeExample( test=[1.0, 1.0], non_model_feature=1), # should be ignored by model ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples], reshuffle=False) | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2) | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts() | input_extractor.stage_name >> input_extractor.ptransform | predict_extractor.stage_name >> predict_extractor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 1) # We can't verify the actual predictions, but we can verify the keys. for item in got: self.assertIn(constants.BATCHED_PREDICTIONS_KEY, item) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, check_result, label='result')
def testTFlitePredictExtractorWithKerasModel(self, multi_model, multi_output): input1 = tf.keras.layers.Input(shape=(1, ), name='input1') input2 = tf.keras.layers.Input(shape=(1, ), name='input2') inputs = [input1, input2] input_layer = tf.keras.layers.concatenate(inputs) output_layers = {} output_layers['output1'] = (tf.keras.layers.Dense( 1, activation=tf.nn.sigmoid, name='output1')(input_layer)) if multi_output: output_layers['output2'] = (tf.keras.layers.Dense( 1, activation=tf.nn.sigmoid, name='output2')(input_layer)) model = tf.keras.models.Model(inputs, output_layers) model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001), loss=tf.keras.losses.binary_crossentropy, metrics=['accuracy']) train_features = {'input1': [[0.0], [1.0]], 'input2': [[1.0], [0.0]]} labels = {'output1': [[1], [0]]} if multi_output: labels['output2'] = [[1], [0]] example_weights = {'output1': [1.0, 0.5]} if multi_output: example_weights['output2'] = [1.0, 0.5] dataset = tf.data.Dataset.from_tensor_slices( (train_features, labels, example_weights)) dataset = dataset.shuffle(buffer_size=1).repeat().batch(2) model.fit(dataset, steps_per_epoch=1) converter = tf.compat.v2.lite.TFLiteConverter.from_keras_model(model) tflite_model = converter.convert() tflite_model_dir = tempfile.mkdtemp() with tf.io.gfile.GFile(os.path.join(tflite_model_dir, 'tflite'), 'wb') as f: f.write(tflite_model) model_specs = [config.ModelSpec(name='model1', model_type='tf_lite')] if multi_model: model_specs.append( config.ModelSpec(name='model2', model_type='tf_lite')) eval_config = config.EvalConfig(model_specs=model_specs) eval_shared_models = [ self.createTestEvalSharedModel( model_name='model1', eval_saved_model_path=tflite_model_dir, model_type='tf_lite') ] if multi_model: eval_shared_models.append( self.createTestEvalSharedModel( model_name='model2', eval_saved_model_path=tflite_model_dir, model_type='tf_lite')) schema = text_format.Parse( """ feature { name: "input1" type: FLOAT } feature { name: "input2" type: FLOAT } feature { name: "non_model_feature" type: INT } """, schema_pb2.Schema()) tfx_io = test_util.InMemoryTFExampleRecord( schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN) feature_extractor = features_extractor.FeaturesExtractor(eval_config) predictor = tflite_predict_extractor.TFLitePredictExtractor( eval_config=eval_config, eval_shared_model=eval_shared_models) examples = [ self._makeExample(input1=0.0, input2=1.0, non_model_feature=0), self._makeExample(input1=1.0, input2=0.0, non_model_feature=1), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples], reshuffle=False) | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2) | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts() | feature_extractor.stage_name >> feature_extractor.ptransform | predictor.stage_name >> predictor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 1) got = got[0] self.assertIn(constants.PREDICTIONS_KEY, got) self.assertLen(got[constants.PREDICTIONS_KEY], 2) for item in got[constants.PREDICTIONS_KEY]: if multi_model: self.assertIn('model1', item) self.assertIn('model2', item) if multi_output: self.assertIn('Identity', item['model1']) self.assertIn('Identity_1', item['model1']) elif multi_output: self.assertIn('Identity', item) self.assertIn('Identity_1', item) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, 
check_result, label='result')
def testPredictExtractorWithMultiModels(self): temp_export_dir = self._getExportDir() export_dir1, _ = multi_head.simple_multi_head(temp_export_dir, None) export_dir2, _ = multi_head.simple_multi_head(temp_export_dir, None) eval_config = config.EvalConfig(model_specs=[ config.ModelSpec(name='model1'), config.ModelSpec(name='model2') ]) eval_shared_model1 = self.createTestEvalSharedModel( eval_saved_model_path=export_dir1, tags=[tf.saved_model.SERVING]) eval_shared_model2 = self.createTestEvalSharedModel( eval_saved_model_path=export_dir2, tags=[tf.saved_model.SERVING]) schema = text_format.Parse( """ feature { name: "age" type: FLOAT } feature { name: "langauge" type: BYTES } feature { name: "english_label" type: FLOAT } feature { name: "chinese_label" type: FLOAT } feature { name: "other_label" type: FLOAT } """, schema_pb2.Schema()) tfx_io = test_util.InMemoryTFExampleRecord( schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY) tensor_adapter_config = tensor_adapter.TensorAdapterConfig( arrow_schema=tfx_io.ArrowSchema(), tensor_representations=tfx_io.TensorRepresentations()) input_extractor = batched_input_extractor.BatchedInputExtractor( eval_config) predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor( eval_config=eval_config, eval_shared_model={ 'model1': eval_shared_model1, 'model2': eval_shared_model2 }, tensor_adapter_config=tensor_adapter_config) examples = [ self._makeExample(age=1.0, language='english', english_label=1.0, chinese_label=0.0, other_label=0.0), self._makeExample(age=1.0, language='chinese', english_label=0.0, chinese_label=1.0, other_label=0.0), self._makeExample(age=2.0, language='english', english_label=1.0, chinese_label=0.0, other_label=0.0), self._makeExample(age=2.0, language='other', english_label=0.0, chinese_label=1.0, other_label=1.0) ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples], reshuffle=False) | 'BatchExamples' >> tfx_io.BeamSource(batch_size=4) | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts() | input_extractor.stage_name >> input_extractor.ptransform | predict_extractor.stage_name >> predict_extractor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 1) for item in got: # We can't verify the actual predictions, but we can verify the keys self.assertIn(constants.BATCHED_PREDICTIONS_KEY, item) for pred in item[constants.BATCHED_PREDICTIONS_KEY]: for model_name in ('model1', 'model2'): self.assertIn(model_name, pred) for output_name in ('chinese_head', 'english_head', 'other_head'): for pred_key in ('logistic', 'probabilities', 'all_classes'): self.assertIn( output_name + '/' + pred_key, pred[model_name]) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, check_result, label='result')
def testPredictExtractorWithRegressionModel(self): temp_export_dir = self._getExportDir() export_dir, _ = (fixed_prediction_estimator_extra_fields. simple_fixed_prediction_estimator_extra_fields( temp_export_dir, None)) eval_config = config.EvalConfig(model_specs=[config.ModelSpec()]) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) schema = text_format.Parse( """ feature { name: "prediction" type: FLOAT } feature { name: "label" type: FLOAT } feature { name: "fixed_int" type: INT } feature { name: "fixed_float" type: FLOAT } feature { name: "fixed_string" type: BYTES } """, schema_pb2.Schema()) tfx_io = test_util.InMemoryTFExampleRecord( schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY) tensor_adapter_config = tensor_adapter.TensorAdapterConfig( arrow_schema=tfx_io.ArrowSchema(), tensor_representations=tfx_io.TensorRepresentations()) input_extractor = batched_input_extractor.BatchedInputExtractor( eval_config) predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor( eval_config=eval_config, eval_shared_model=eval_shared_model, tensor_adapter_config=tensor_adapter_config) examples = [ self._makeExample(prediction=0.2, label=1.0, fixed_int=1, fixed_float=1.0, fixed_string='fixed_string1'), self._makeExample(prediction=0.8, label=0.0, fixed_int=1, fixed_float=1.0, fixed_string='fixed_string2'), self._makeExample(prediction=0.5, label=0.0, fixed_int=2, fixed_float=1.0, fixed_string='fixed_string3') ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples], reshuffle=False) | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3) | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts() | input_extractor.stage_name >> input_extractor.ptransform | predict_extractor.stage_name >> predict_extractor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 1) self.assertIn(constants.BATCHED_PREDICTIONS_KEY, got[0]) expected_preds = [0.2, 0.8, 0.5] self.assertAlmostEqual( got[0][constants.BATCHED_PREDICTIONS_KEY], expected_preds) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, check_result, label='result')
def testPreprocessedFeaturesExtractor(self, save_as_keras, preprocessing_function_names, expected_extract_keys): export_path = self.createModelWithMultipleDenseInputs(save_as_keras) eval_config = config.EvalConfig(model_specs=[ config.ModelSpec( preprocessing_function_names=preprocessing_function_names) ]) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_path, tags=[tf.saved_model.SERVING]) schema = self.createDenseInputsSchema() tfx_io = test_util.InMemoryTFExampleRecord( schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN) tensor_adapter_config = tensor_adapter.TensorAdapterConfig( arrow_schema=tfx_io.ArrowSchema(), tensor_representations=tfx_io.TensorRepresentations()) feature_extractor = features_extractor.FeaturesExtractor(eval_config) transformation_extractor = ( transformed_features_extractor.TransformedFeaturesExtractor( eval_config=eval_config, eval_shared_model=eval_shared_model, tensor_adapter_config=tensor_adapter_config)) examples = [ self._makeExample(input_1=1.0, input_2=2.0), self._makeExample(input_1=3.0, input_2=4.0), self._makeExample(input_1=5.0, input_2=6.0), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter result = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples], reshuffle=False) | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2) | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts() | feature_extractor.stage_name >> feature_extractor.ptransform | transformation_extractor.stage_name >> transformation_extractor.ptransform) # pylint: enable=no-value-for-parameter def check_result(got): try: self.assertLen(got, 2) for item in got: for extracts_key, feature_keys in expected_extract_keys.items( ): self.assertIn(extracts_key, item) for value in item[extracts_key]: self.assertEqual(set(feature_keys), set(value.keys()), msg='got={}'.format(item)) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(result, check_result, label='result')
def testBatchSizeLimit(self): temp_export_dir = self._getExportDir() _, export_dir = batch_size_limited_classifier.simple_batch_size_limited_classifier( None, temp_export_dir) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) eval_config = config.EvalConfig(model_specs=[config.ModelSpec()]) schema = text_format.Parse( """ feature { name: "classes" type: BYTES } feature { name: "scores" type: FLOAT } feature { name: "labels" type: BYTES } """, schema_pb2.Schema()) tfx_io = test_util.InMemoryTFExampleRecord( schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY) tensor_adapter_config = tensor_adapter.TensorAdapterConfig( arrow_schema=tfx_io.ArrowSchema(), tensor_representations=tfx_io.TensorRepresentations()) input_extractor = batched_input_extractor.BatchedInputExtractor( eval_config) predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor( eval_config=eval_config, eval_shared_model=eval_shared_model, tensor_adapter_config=tensor_adapter_config) examples = [] for _ in range(4): examples.append( self._makeExample(classes='first', scores=0.0, labels='third')) with beam.Pipeline() as pipeline: predict_extracts = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples], reshuffle=False) | 'BatchExamples' >> tfx_io.BeamSource(batch_size=1) | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts() | input_extractor.stage_name >> input_extractor.ptransform | predict_extractor.stage_name >> predict_extractor.ptransform) def check_result(got): try: self.assertLen(got, 4) # We can't verify the actual predictions, but we can verify the keys. for item in got: self.assertIn(constants.BATCHED_PREDICTIONS_KEY, item) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(predict_extracts, check_result, label='result')
def testRunModelAnalysisExtraFieldsPlusFeatureExtraction(self): model_location = self._exportEvalSavedModel( linear_classifier.simple_linear_classifier) examples = [ self._makeExample(age=3.0, language='english', label=1.0, my_slice='a'), self._makeExample(age=3.0, language='chinese', label=0.0, my_slice='a'), self._makeExample(age=4.0, language='english', label=1.0, my_slice='b'), self._makeExample(age=5.0, language='chinese', label=1.0, my_slice='c'), self._makeExample(age=5.0, language='hindi', label=1.0) ] data_location = self._writeTFExamplesToTFRecords(examples) slicing_specs = [config.SlicingSpec(feature_keys=['my_slice'])] eval_config = config.EvalConfig( input_data_specs=[config.InputDataSpec(location=data_location)], model_specs=[config.ModelSpec(location=model_location)], output_data_specs=[ config.OutputDataSpec(default_location=self._getTempDir()) ], slicing_specs=slicing_specs) eval_shared_model = model_eval_lib.default_eval_shared_model( eval_saved_model_path=model_location, example_weight_key='age') slice_spec = [slicer.SingleSliceSpec(spec=slicing_specs[0])] extractors_with_feature_extraction = [ predict_extractor.PredictExtractor(eval_shared_model, desired_batch_size=3, materialize=False), feature_extractor.FeatureExtractor( extract_source=constants.INPUT_KEY, extract_dest=constants.FEATURES_PREDICTIONS_LABELS_KEY), slice_key_extractor.SliceKeyExtractor(slice_spec, materialize=False) ] eval_result = model_eval_lib.run_model_analysis( eval_config=eval_config, eval_shared_models=[ model_eval_lib.default_eval_shared_model( eval_saved_model_path=model_location, example_weight_key='age') ], extractors=extractors_with_feature_extraction) # We only check some of the metrics to ensure that the end-to-end # pipeline works. expected = { (('my_slice', 'a'), ): { 'accuracy': { 'doubleValue': 1.0 }, 'my_mean_label': { 'doubleValue': 0.5 }, metric_keys.EXAMPLE_WEIGHT: { 'doubleValue': 6.0 }, metric_keys.EXAMPLE_COUNT: { 'doubleValue': 2.0 }, }, (('my_slice', 'b'), ): { 'accuracy': { 'doubleValue': 1.0 }, 'my_mean_label': { 'doubleValue': 1.0 }, metric_keys.EXAMPLE_WEIGHT: { 'doubleValue': 4.0 }, metric_keys.EXAMPLE_COUNT: { 'doubleValue': 1.0 }, }, (('my_slice', 'c'), ): { 'accuracy': { 'doubleValue': 0.0 }, 'my_mean_label': { 'doubleValue': 1.0 }, metric_keys.EXAMPLE_WEIGHT: { 'doubleValue': 5.0 }, metric_keys.EXAMPLE_COUNT: { 'doubleValue': 1.0 }, }, } self.assertEqual(eval_result.config.model_specs[0].location, model_location.decode()) self.assertEqual(eval_result.config.input_data_specs[0].location, data_location) self.assertEqual(eval_result.config.slicing_specs[0], config.SlicingSpec(feature_keys=['my_slice'])) self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected) self.assertFalse(eval_result.plots)
def testValidateMetricsMetricValueAndThreshold(self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        failures {
          metric_key {
            name: "weighted_example_count"
          }
          metric_value {
            double_value {
              value: 1.5
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  expected.metric_validations_per_slice[0].failures[
      0].metric_threshold.CopyFrom(threshold)
  expected.metric_validations_per_slice[0].slice_key.CopyFrom(
      slicer.serialize_slice_key(slice_key))
  for spec in slicing_specs or [None]:
    if (spec is None or
        slicer.SingleSliceSpec(spec=spec).is_slice_applicable(slice_key)):
      slicing_details = expected.validation_details.slicing_details.add()
      if spec is not None:
        slicing_details.slicing_spec.CopyFrom(spec)
      else:
        slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
      slicing_details.num_matching_slices = 1
  self.assertEqual(result, expected)
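# Editor's sketch (illustrative, not part of the original suite): a standalone
# MetricsSpec carrying the same kind of upper-bound value threshold that the
# test above validates against. Assumes the `config` module import used above.
def _sketch_value_threshold_metrics_spec():
  return config.MetricsSpec(
      metrics=[
          config.MetricConfig(
              class_name='WeightedExampleCount',
              threshold=config.MetricThreshold(
                  value_threshold=config.GenericValueThreshold(
                      upper_bound={'value': 1})))
      ],
      model_names=[''])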
def testRunModelAnalysisWithKerasModel(self):
  input_layer = tf.keras.layers.Input(shape=(28 * 28,), name='data')
  output_layer = tf.keras.layers.Dense(
      10, activation=tf.nn.softmax)(input_layer)
  model = tf.keras.models.Model(input_layer, output_layer)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.categorical_crossentropy)

  features = {'data': [[0.0] * 28 * 28]}
  labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
  example_weights = [1.0]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
  model.fit(dataset, steps_per_epoch=1)

  model_location = os.path.join(self._getTempDir(), 'export_dir')
  model.save(model_location, save_format='tf')

  examples = [
      self._makeExample(data=[0.0] * 28 * 28, label=1.0),
      self._makeExample(data=[1.0] * 28 * 28, label=5.0),
      self._makeExample(data=[1.0] * 28 * 28, label=9.0),
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)

  metrics_spec = config.MetricsSpec()
  for metric in (tf.keras.metrics.AUC(),):
    cfg = tf.keras.utils.serialize_keras_object(metric)
    metrics_spec.metrics.append(
        config.MetricConfig(
            class_name=cfg['class_name'], config=json.dumps(cfg['config'])))
  for class_id in (0, 5, 9):
    metrics_spec.binarize.class_ids.append(class_id)

  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[
          config.ModelSpec(location=model_location, label_key='label')
      ],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      metrics_specs=[metrics_spec])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location,
              tags=[tf.saved_model.SERVING])
      ])

  self.assertEqual(eval_result.config.model_specs[0].location, model_location)
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertLen(eval_result.slicing_metrics, 1)
  got_slice_key, got_metrics = eval_result.slicing_metrics[0]
  self.assertEqual(got_slice_key, ())
  self.assertIn('', got_metrics)  # output_name
  got_metrics = got_metrics['']
  expected_metrics = {
      'classId:0': {
          'auc': True,
      },
      'classId:5': {
          'auc': True,
      },
      'classId:9': {
          'auc': True,
      },
  }
  for class_id in expected_metrics:
    self.assertIn(class_id, got_metrics)
    for k in expected_metrics[class_id]:
      self.assertIn(k, got_metrics[class_id])
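# Editor's sketch (illustrative, not part of the original suite): the
# Keras-metric-to-MetricConfig conversion used in the test above, factored
# into a helper. Assumes the same `tf`, `json`, and `config` imports.
def _sketch_metrics_spec_from_keras_metrics(keras_metrics):
  """Serializes Keras metric objects into a TFMA MetricsSpec (sketch only)."""
  metrics_spec = config.MetricsSpec()
  for metric in keras_metrics:
    cfg = tf.keras.utils.serialize_keras_object(metric)
    metrics_spec.metrics.append(
        config.MetricConfig(
            class_name=cfg['class_name'], config=json.dumps(cfg['config'])))
  return metrics_spec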
def testModelSignaturesDoFn(self, save_as_keras, signature_names,
                            default_signature_names, prefer_dict_outputs,
                            use_schema, expected_num_outputs):
  export_path = self.createModelWithMultipleDenseInputs(save_as_keras)
  eval_shared_models = {}
  model_specs = []
  for sigs in signature_names.values():
    for model_name in sigs:
      if model_name not in eval_shared_models:
        eval_shared_models[model_name] = self.createTestEvalSharedModel(
            eval_saved_model_path=export_path,
            model_name=model_name,
            tags=[tf.saved_model.SERVING])
        model_specs.append(config.ModelSpec(name=model_name))
  eval_config = config.EvalConfig(model_specs=model_specs)
  schema = self.createDenseInputsSchema() if use_schema else None
  tfx_io = tf_example_record.TFExampleBeamRecord(
      physical_format='text',
      schema=schema,
      raw_record_column_name=constants.ARROW_INPUT_COLUMN)
  tensor_adapter_config = None
  if use_schema:
    tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
        arrow_schema=tfx_io.ArrowSchema(),
        tensor_representations=tfx_io.TensorRepresentations())

  examples = [
      self._makeExample(input_1=1.0, input_2=2.0),
      self._makeExample(input_1=3.0, input_2=4.0),
      self._makeExample(input_1=5.0, input_2=6.0),
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
        | 'ToExtracts' >> beam.Map(_record_batch_to_extracts)
        | 'ModelSignatures' >> beam.ParDo(
            model_util.ModelSignaturesDoFn(
                eval_config=eval_config,
                eval_shared_models=eval_shared_models,
                signature_names=signature_names,
                default_signature_names=default_signature_names,
                prefer_dict_outputs=prefer_dict_outputs,
                tensor_adapter_config=tensor_adapter_config)))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        for key in signature_names:
          self.assertIn(key, got[0])
          if prefer_dict_outputs:
            for entry in got[0][key]:
              self.assertIsInstance(entry, dict)
              self.assertLen(entry, expected_num_outputs)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def testRunModelAnalysisWithQueryBasedMetrics(self):
  input_layer = tf.keras.layers.Input(shape=(1,), name='age')
  output_layer = tf.keras.layers.Dense(
      1, activation=tf.nn.sigmoid)(input_layer)
  model = tf.keras.models.Model(input_layer, output_layer)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.binary_crossentropy)

  features = {'age': [[20.0]]}
  labels = [[1]]
  example_weights = [1.0]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
  model.fit(dataset, steps_per_epoch=1)

  model_location = os.path.join(self._getTempDir(), 'export_dir')
  model.save(model_location, save_format='tf')

  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0),
      self._makeExample(age=3.0, language='english', label=0.0),
      self._makeExample(age=5.0, language='chinese', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[
          config.ModelSpec(location=model_location, label_key='label')
      ],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs,
      metrics_specs=metric_specs.specs_from_metrics(
          [ndcg.NDCG(gain_key='age', name='ndcg')],
          binarize=config.BinarizationOptions(top_k_list=[1]),
          query_key='language'))
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, tags=[tf.saved_model.SERVING])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[eval_shared_model],
      evaluators=[
          metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
              eval_config=eval_config,
              eval_shared_models=[eval_shared_model])
      ])

  self.assertEqual(eval_result.config.model_specs[0].location, model_location)
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertLen(eval_result.slicing_metrics, 1)
  got_slice_key, got_metrics = eval_result.slicing_metrics[0]
  self.assertEqual(got_slice_key, ())
  self.assertIn('', got_metrics)  # output_name
  got_metrics = got_metrics['']
  expected_metrics = {
      '': {
          'example_count': True,
          'weighted_example_count': True,
      },
      'topK:1': {
          'ndcg': True,
      },
  }
  for group in expected_metrics:
    self.assertIn(group, got_metrics)
    for k in expected_metrics[group]:
      self.assertIn(k, got_metrics[group])
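# Editor's sketch (illustrative, not part of the original suite): declaring
# query/ranking metrics via specs_from_metrics with a query_key, mirroring the
# config built in the test above. Assumes the same metric_specs, ndcg, and
# config imports; the default argument values are placeholders.
def _sketch_query_based_metrics_specs(gain_key='age', query_key='language'):
  return metric_specs.specs_from_metrics(
      [ndcg.NDCG(gain_key=gain_key, name='ndcg')],
      binarize=config.BinarizationOptions(top_k_list=[1]),
      query_key=query_key)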
def testWriteValidationResults(self):
  model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
  eval_shared_model = self._build_keras_model(model_dir, mul=0)
  baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
  validations_file = os.path.join(self._getTempDir(),
                                  constants.VALIDATIONS_KEY)
  examples = [
      self._makeExample(
          input=0.0,
          label=1.0,
          example_weight=1.0,
          extra_feature='non_model_feature'),
      self._makeExample(
          input=1.0,
          label=0.0,
          example_weight=0.5,
          extra_feature='non_model_feature'),
  ]

  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              name='candidate',
              label_key='label',
              example_weight_key='example_weight'),
          config.ModelSpec(
              name='baseline',
              label_key='label',
              example_weight_key='example_weight',
              is_baseline=True)
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': 1}))),
                  config.MetricConfig(
                      class_name='ExampleCount',
                      # 2 > 10, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              lower_bound={'value': 10}))),
                  config.MetricConfig(
                      class_name='MeanLabel',
                      # 0 > 0 and 0 > 0%?: NOT OK.
                      threshold=config.MetricThreshold(
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .HIGHER_IS_BETTER,
                              relative={'value': 0},
                              absolute={'value': 0}))),
                  config.MetricConfig(
                      # MeanPrediction = (0+0)/(1+0.5) = 0
                      class_name='MeanPrediction',
                      # -.01 < 0 < .01, OK.
                      # Diff% = -.333/.333 = -100% < -99%, OK.
                      # Diff = 0 - .333 = -.333 < 0, OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': .01},
                              lower_bound={'value': -.01}),
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .LOWER_IS_BETTER,
                              relative={'value': -.99},
                              absolute={'value': 0})))
              ],
              model_names=['candidate', 'baseline']),
      ],
      options=config.Options(
          disabled_outputs={'values': ['eval_config.json']}),
  )
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  eval_shared_models = {
      'candidate': eval_shared_model,
      'baseline': baseline_eval_shared_model
  }
  extractors = [
      input_extractor.InputExtractor(eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_shared_model=eval_shared_models, eval_config=eval_config),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_model=eval_shared_models)
  ]
  output_paths = {
      constants.VALIDATIONS_KEY: validations_file,
  }
  writers = [
      metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
          output_paths, add_metrics_callbacks=[])
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'ExtractEvaluateAndWriteResults' >>
        model_eval_lib.ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            extractors=extractors,
            evaluators=evaluators,
            writers=writers))
    # pylint: enable=no-value-for-parameter

  validation_result = model_eval_lib.load_validation_result(
      os.path.dirname(validations_file))

  expected_validations = [
      text_format.Parse(
          """
          metric_key {
            name: "weighted_example_count"
            model_name: "candidate"
          }
          metric_threshold {
            value_threshold {
              upper_bound {
                value: 1.0
              }
            }
          }
          metric_value {
            double_value {
              value: 1.5
            }
          }
          """, validation_result_pb2.ValidationFailure()),
      text_format.Parse(
          """
          metric_key {
            name: "example_count"
          }
          metric_threshold {
            value_threshold {
              lower_bound {
                value: 10.0
              }
            }
          }
          metric_value {
            double_value {
              value: 2.0
            }
          }
          """, validation_result_pb2.ValidationFailure()),
      text_format.Parse(
          """
          metric_key {
            name: "mean_label"
            model_name: "candidate"
            is_diff: true
          }
          metric_threshold {
            change_threshold {
              absolute {
                value: 0.0
              }
              relative {
                value: 0.0
              }
              direction: HIGHER_IS_BETTER
            }
          }
          metric_value {
            double_value {
              value: 0.0
            }
          }
          """, validation_result_pb2.ValidationFailure()),
  ]

  self.assertFalse(validation_result.validation_ok)
  self.assertLen(validation_result.metric_validations_per_slice, 1)
  self.assertCountEqual(
      expected_validations,
      validation_result.metric_validations_per_slice[0].failures)
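# Editor's sketch (illustrative, not part of the original suite): reading a
# written validation result back, as done at the end of the test above.
# `output_dir` is assumed to be the directory containing the validations file.
def _sketch_load_validation_ok(output_dir):
  validation_result = model_eval_lib.load_validation_result(output_dir)
  return validation_result.validation_ok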
def testRunModelAnalysisWithLegacyQueryExtractor(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=0.0),
      self._makeExample(age=5.0, language='chinese', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[eval_shared_model],
      evaluators=[
          metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
              eval_shared_model),
          query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
              query_id='language',
              prediction_key='logistic',
              combine_fns=[
                  query_statistics.QueryStatisticsCombineFn(),
                  legacy_ndcg.NdcgMetricCombineFn(
                      at_vals=[1], gain_key='label', weight_key='')
              ]),
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (): {
          'post_export_metrics/total_queries': {
              'doubleValue': 2.0
          },
          'post_export_metrics/min_documents': {
              'doubleValue': 2.0
          },
          'post_export_metrics/max_documents': {
              'doubleValue': 2.0
          },
          'post_export_metrics/total_documents': {
              'doubleValue': 4.0
          },
          'post_export_metrics/ndcg@1': {
              'doubleValue': 0.5
          },
          'post_export_metrics/example_weight': {
              'doubleValue': 15.0
          },
          'post_export_metrics/example_count': {
              'doubleValue': 4.0
          },
      }
  }
  self.assertEqual(eval_result.config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertEqual(eval_result.config.slicing_specs[0], config.SlicingSpec())
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def testLabelsExtractor(self, label):
  model_spec = config.ModelSpec(label_key=label)
  eval_config = config.EvalConfig(model_specs=[model_spec])
  feature_extractor = features_extractor.FeaturesExtractor(eval_config)
  label_extractor = labels_extractor.LabelsExtractor(eval_config)

  label_feature = ''
  if label is not None:
    label_feature = """
        feature {
          name: "%s"
          type: FLOAT
        }
        """ % label
  schema = text_format.Parse(
      label_feature + """
      feature {
        name: "fixed_int"
        type: INT
      }
      """, schema_pb2.Schema())
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)

  def maybe_add_key(d, key, value):
    if key is not None:
      d[key] = value
    return d

  example_kwargs = [
      maybe_add_key({
          'fixed_int': 1,
      }, label, 1.0),
      maybe_add_key({
          'fixed_int': 1,
      }, label, 0.0),
      maybe_add_key({
          'fixed_int': 2,
      }, label, 0.0),
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(
            [
                self._makeExample(**kwargs).SerializeToString()
                for kwargs in example_kwargs
            ],
            reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | feature_extractor.stage_name >> feature_extractor.ptransform
        | label_extractor.stage_name >> label_extractor.ptransform)
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        self.assertAlmostEqual(
            got[0][constants.LABELS_KEY][0],
            np.array([1.0]) if label is not None else None)
        self.assertAlmostEqual(
            got[0][constants.LABELS_KEY][1],
            np.array([0.0]) if label is not None else None)
        self.assertAlmostEqual(
            got[0][constants.LABELS_KEY][2],
            np.array([0.0]) if label is not None else None)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def testRunModelAnalysisWithUncertainty(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=1.0),
      self._makeExample(age=5.0, language='hindi', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
  options = config.Options()
  options.compute_confidence_intervals.value = True
  options.k_anonymization_count.value = 2
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs,
      options=options)
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location, example_weight_key='age')
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (('language', 'hindi'),): {
          u'__ERROR__': {
              'debugMessage':
                  u'Example count for this slice key is lower than the '
                  u'minimum required value: 2. No data is aggregated for '
                  u'this slice.'
          },
      },
      (('language', 'chinese'),): {
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 8.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      },
      (('language', 'english'),): {
          'accuracy': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          'my_mean_label': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 7.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      }
  }
  self.assertEqual(eval_result.config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertEqual(eval_result.config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['language']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
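# Editor's sketch (illustrative, not part of the original suite): the Options
# flags used above to turn on bootstrap confidence intervals and to suppress
# slices with fewer than `min_slice_size` examples. Assumes the `config`
# import used above; the helper name and default value are placeholders.
def _sketch_uncertainty_options(min_slice_size=2):
  options = config.Options()
  options.compute_confidence_intervals.value = True
  options.k_anonymization_count.value = min_slice_size
  return options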
def assertGeneralMetricsComputedWithBeamAre(
    self, eval_saved_model_path: Text,
    examples_pcollection: beam.pvalue.PCollection,
    slice_spec: List[slicer.SingleSliceSpec],
    add_metrics_callbacks: List[types.AddMetricsCallbackType],
    expected_slice_metrics: Dict[Any, Dict[Text, Any]]):
  """Checks metrics computed using Beam.

  A more general version of assertMetricsComputedWithBeamAre. Note that the
  caller is responsible for setting up and running the Beam pipeline.

  Example usage:

    def add_metrics(features, predictions, labels):
      metric_ops = {
          'mse': tf.metrics.mean_squared_error(labels, predictions['logits']),
          'mae': tf.metrics.mean_absolute_error(labels, predictions['logits']),
      }
      return metric_ops

    with beam.Pipeline() as pipeline:
      expected_slice_metrics = {
          (): {
              'mae': 0.1,
              'mse': 0.2,
              tfma.post_export_metrics.metric_keys.AUC:
                  tfma.test.BoundedValue(lower_bound=0.5)
          },
          (('age', 10),): {
              'mae': 0.2,
              'mse': 0.3,
              tfma.post_export_metrics.metric_keys.AUC:
                  tfma.test.BoundedValue(lower_bound=0.5)
          },
      }
      examples = pipeline | 'ReadExamples' >> beam.io.ReadFromTFRecord(path)
      self.assertGeneralMetricsComputedWithBeamAre(
          eval_saved_model_path=path,
          examples_pcollection=examples,
          slice_spec=[tfma.slicer.SingleSliceSpec(),
                      tfma.slicer.SingleSliceSpec(columns=['age'])],
          add_metrics_callbacks=[
              add_metrics, tfma.post_export_metrics.auc()],
          expected_slice_metrics=expected_slice_metrics)

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    examples_pcollection: A PCollection of serialized example bytes.
    slice_spec: List of slice specifications.
    add_metrics_callbacks: Callbacks for adding additional metrics.
    expected_slice_metrics: Dictionary of dictionaries describing the expected
      metrics for each slice. The outer dictionary maps slice keys to the
      expected metrics for that slice.
  """

  def check_metrics(got):
    """Check metrics callback."""
    try:
      slices = {}
      for slice_key, value in got:
        slices[slice_key] = value
      self.assertItemsEqual(
          list(slices.keys()), list(expected_slice_metrics.keys()))
      for slice_key, expected_metrics in expected_slice_metrics.items():
        self.assertDictElementsWithinBounds(
            got_values_dict=slices[slice_key],
            expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  slicing_specs = None
  if slice_spec:
    slicing_specs = [s.to_proto() for s in slice_spec]
  eval_config = config.EvalConfig(slicing_specs=slicing_specs)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_config=eval_config, eval_shared_model=eval_shared_model)

  # pylint: disable=no-value-for-parameter
  (metrics, _), _ = (
      examples_pcollection
      | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
      | 'Extract' >> Extract(extractors=extractors)
      | 'ComputeMetricsAndPlots' >>
      legacy_metrics_and_plots_evaluator.ComputeMetricsAndPlots(
          eval_shared_model=eval_shared_model))
  # pylint: enable=no-value-for-parameter

  beam_util.assert_that(metrics, check_metrics)
def testUpdateConfigWithDefaultsRemoveBaselineModel(self):
  eval_config_pbtxt = """
    model_specs { name: "candidate" }
    model_specs { name: "baseline" is_baseline: true }
    metrics_specs {
      metrics {
        class_name: "MeanLabel"
        threshold {
          value_threshold {
            lower_bound { value: 0.9 }
          }
          change_threshold {
            direction: HIGHER_IS_BETTER
            absolute { value: -1e-10 }
          }
        }
      }
      thresholds {
        key: "my_metric"
        value {
          value_threshold {
            lower_bound { value: 0.9 }
          }
          change_threshold {
            direction: HIGHER_IS_BETTER
            absolute { value: -1e-10 }
          }
        }
      }
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs {}
    metrics_specs {
      metrics {
        class_name: "MeanLabel"
        threshold {
          value_threshold {
            lower_bound { value: 0.9 }
          }
        }
      }
      thresholds {
        key: "my_metric"
        value {
          value_threshold {
            lower_bound { value: 0.9 }
          }
        }
      }
      model_names: [""]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config.EvalConfig())

  got_eval_config = config.update_eval_config_with_defaults(
      eval_config, maybe_remove_baseline=True)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
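# Editor's sketch (illustrative, not part of the original suite): the call
# under test applied to any parsed EvalConfig. With maybe_remove_baseline=True
# the baseline ModelSpec and the change thresholds that depend on it are
# dropped, as the expected proto above shows. Assumes the same text_format and
# config imports; the helper name is a placeholder.
def _sketch_update_config_with_defaults(eval_config_pbtxt):
  eval_config = text_format.Parse(eval_config_pbtxt, config.EvalConfig())
  return config.update_eval_config_with_defaults(
      eval_config, maybe_remove_baseline=True)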