def testUpdateConfigWithDefaultsMultiModel(self):
  eval_config_pbtxt = """
    model_specs { name: "model1" }
    model_specs { name: "model2" }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
    }
    metrics_specs {
      metrics { class_name: "MeanLabel" }
      model_names: ["model1"]
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config_pb2.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs { name: "model1" }
    model_specs { name: "model2" }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
      model_names: ["model1", "model2"]
    }
    metrics_specs {
      metrics { class_name: "MeanLabel" }
      model_names: ["model1"]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config_pb2.EvalConfig())

  got_eval_config = config_util.update_eval_config_with_defaults(eval_config)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
def testUpdateConfigWithoutBaselineModelWhenModelNameProvided(self):
  eval_config_pbtxt = """
    model_specs { name: "candidate" }
    model_specs { name: "baseline" is_baseline: true }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
      model_names: "candidate"
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config_pb2.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs { name: "candidate" }
    model_specs { name: "baseline" is_baseline: true }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
      model_names: ["candidate"]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config_pb2.EvalConfig())

  got_eval_config = config_util.update_eval_config_with_defaults(
      eval_config, has_baseline=True)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
def testUpdateConfigWithDefaultsDoesNotAutomaticallyAddBaselineModel(self):
  eval_config_pbtxt = """
    model_specs { name: "model1" }
    model_specs { name: "model2" is_baseline: true }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config_pb2.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs { name: "model1" }
    model_specs { name: "model2" is_baseline: true }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
      model_names: ["model1", "model2"]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config_pb2.EvalConfig())

  got_eval_config = config_util.update_eval_config_with_defaults(
      eval_config, has_baseline=True)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
def testHasChangeThreshold(self):
  eval_config = text_format.Parse(
      """
      metrics_specs {
        metrics {
          class_name: "MeanLabel"
          threshold {
            change_threshold {
              direction: HIGHER_IS_BETTER
              absolute { value: 0.1 }
            }
          }
        }
      }
      """, config_pb2.EvalConfig())
  self.assertTrue(config_util.has_change_threshold(eval_config))

  eval_config = text_format.Parse(
      """
      metrics_specs {
        thresholds {
          key: "my_metric"
          value {
            change_threshold {
              direction: HIGHER_IS_BETTER
              absolute { value: 0.1 }
            }
          }
        }
      }
      """, config_pb2.EvalConfig())
  self.assertTrue(config_util.has_change_threshold(eval_config))

  eval_config = text_format.Parse(
      """
      metrics_specs {
        metrics {
          class_name: "MeanLabel"
          threshold {
            value_threshold {
              lower_bound { value: 0.9 }
            }
          }
        }
      }
      """, config_pb2.EvalConfig())
  self.assertFalse(config_util.has_change_threshold(eval_config))
def test_features_extractor_no_features(self):
  model_spec = config_pb2.ModelSpec()
  eval_config = config_pb2.EvalConfig(model_specs=[model_spec])
  feature_extractor = features_extractor.FeaturesExtractor(eval_config)
  tfx_io = tf_example_record.TFExampleBeamRecord(
      raw_record_column_name=constants.ARROW_INPUT_COLUMN,
      physical_format='inmem',
      telemetry_descriptors=['testing'])

  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | 'Create' >> beam.Create([b''] * 3)
        | 'DecodeToRecordBatch' >> tfx_io.BeamSource(batch_size=3)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | feature_extractor.stage_name >> feature_extractor.ptransform)

    def check_result(got):
      self.assertLen(got, 1)
      self.assertIn(constants.FEATURES_KEY, got[0])
      self.assertEmpty(got[0][constants.FEATURES_KEY])
      self.assertIn(constants.INPUT_KEY, got[0])
      self.assertLen(got[0][constants.INPUT_KEY], 3)

    util.assert_that(result, check_result, label='CheckResult')
def testSerializeDeserializeEvalConfig(self):
  output_path = self._getTempDir()
  options = config_pb2.Options()
  options.compute_confidence_intervals.value = False
  options.min_slice_size.value = 1
  eval_config = config_pb2.EvalConfig(
      slicing_specs=[
          config_pb2.SlicingSpec(
              feature_keys=['country'],
              feature_values={
                  'age': '5',
                  'gender': 'f'
              }),
          config_pb2.SlicingSpec(
              feature_keys=['interest'],
              feature_values={
                  'age': '6',
                  'gender': 'm'
              })
      ],
      options=options)
  data_location = '/path/to/data'
  file_format = 'tfrecords'
  model_location = '/path/to/model'
  with tf.io.gfile.GFile(os.path.join(output_path, 'eval_config.json'),
                         'w') as f:
    f.write(
        eval_config_writer._serialize_eval_run(eval_config, data_location,
                                               file_format,
                                               {'': model_location}))

  got_eval_config, got_data_location, got_file_format, got_model_locations = (
      eval_config_writer.load_eval_run(output_path))
  self.assertEqual(eval_config, got_eval_config)
  self.assertEqual(data_location, got_data_location)
  self.assertEqual(file_format, got_file_format)
  self.assertEqual({'': model_location}, got_model_locations)
def testMetricKeysToSkipForConfidenceIntervals(self):
  metrics_specs = [
      config_pb2.MetricsSpec(
          metrics=[
              config_pb2.MetricConfig(
                  class_name='ExampleCount',
                  config=json.dumps({'name': 'example_count'}),
                  threshold=config_pb2.MetricThreshold(
                      value_threshold=config_pb2.GenericValueThreshold())),
              config_pb2.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}),
                  threshold=config_pb2.MetricThreshold(
                      change_threshold=config_pb2.GenericChangeThreshold())),
              config_pb2.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({'name': 'mse'}),
                  threshold=config_pb2.MetricThreshold(
                      change_threshold=config_pb2.GenericChangeThreshold()))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
  ]
  metrics_specs += metric_specs.specs_from_metrics(
      [tf.keras.metrics.MeanSquaredError('mse')],
      model_names=['model_name1', 'model_name2'])

  keys = metric_specs.metric_keys_to_skip_for_confidence_intervals(
      metrics_specs, eval_config=config_pb2.EvalConfig())
  self.assertLen(keys, 8)
  self.assertIn(
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name1',
          output_name='output_name1'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name1',
          output_name='output_name2'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name2',
          output_name='output_name1'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name2',
          output_name='output_name2'), keys)
  self.assertIn(
      metric_types.MetricKey(name='example_count', model_name='model_name1'),
      keys)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name1',
          example_weighted=True), keys)
  self.assertIn(
      metric_types.MetricKey(name='example_count', model_name='model_name2'),
      keys)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name2',
          example_weighted=True), keys)
def testSliceKeys(self, model_names, extracts, slice_specs, expected_slices):
  eval_config = config_pb2.EvalConfig(
      model_specs=[config_pb2.ModelSpec(name=name) for name in model_names])
  with beam.Pipeline() as pipeline:
    slice_keys_extracts = (
        pipeline
        | 'CreateTestInput' >> beam.Create(extracts)
        | 'ExtractSlices' >> slice_key_extractor.ExtractSliceKeys(
            slice_spec=slice_specs, eval_config=eval_config))

    def check_result(got):
      try:
        self.assertLen(got, 2)
        got_results = []
        for item in got:
          self.assertIn(constants.SLICE_KEY_TYPES_KEY, item)
          got_results.append(sorted(item[constants.SLICE_KEY_TYPES_KEY]))
        self.assertCountEqual(got_results, expected_slices)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(slice_keys_extracts, check_result)
def testGetModelAndOutputNamesEmptyPredictions(self):
  eval_config = config_pb2.EvalConfig(model_specs=[config_pb2.ModelSpec()])
  self.assertEmpty(
      util.StandardExtracts({
          constants.PREDICTIONS_KEY: {}
      }).get_model_and_output_names(eval_config))
def _assert_test(self,
                 num_buckets,
                 baseline_examples,
                 comparison_examples,
                 lift_metric_value,
                 ignore_out_of_bound_examples=False):
  eval_config = config_pb2.EvalConfig(
      cross_slicing_specs=[config_pb2.CrossSlicingSpec()])
  computations = lift.Lift(
      num_buckets=num_buckets,
      ignore_out_of_bound_examples=ignore_out_of_bound_examples).computations(
          eval_config=eval_config)
  histogram = computations[0]
  lift_metrics = computations[1]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    baseline_result = (
        pipeline
        | 'CreateB' >> beam.Create(baseline_examples)
        | 'ProcessB' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSliceB' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogramB' >> beam.CombinePerKey(histogram.combiner)
    )  # pyformat: ignore
    comparison_result = (
        pipeline
        | 'CreateC' >> beam.Create(comparison_examples)
        | 'ProcessC' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSliceC' >> beam.Map(lambda x: (('slice'), x))
        | 'ComputeHistogramC' >> beam.CombinePerKey(histogram.combiner)
    )  # pyformat: ignore
    # pylint: enable=no-value-for-parameter
    merged_result = ((baseline_result, comparison_result)
                     | 'MergePCollections' >> beam.Flatten())

    def check_result(got):
      try:
        self.assertLen(got, 2)
        slice_1, metric_1 = got[0]
        slice_2, metric_2 = got[1]
        lift_value = None
        if not slice_1:
          lift_value = lift_metrics.cross_slice_comparison(metric_1, metric_2)
        else:
          lift_value = lift_metrics.cross_slice_comparison(metric_2, metric_1)

        self.assertDictElementsAlmostEqual(
            lift_value, {
                metric_types.MetricKey(name=f'lift@{num_buckets}'):
                    lift_metric_value,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(merged_result, check_result, label='result')
def testStandardMetricInputsWithCustomLabelKeys(self):
  example = metric_types.StandardMetricInputs(
      labels={
          'custom_label': np.array([2]),
          'other_label': np.array([0])
      },
      predictions={'custom_prediction': np.array([0, 0.5, 0.3, 0.9])},
      example_weights=np.array([1.0]))
  eval_config = config_pb2.EvalConfig(model_specs=[
      config_pb2.ModelSpec(
          label_key='custom_label', prediction_key='custom_prediction')
  ])
  iterator = metric_util.to_label_prediction_example_weight(
      example, eval_config=eval_config)

  for expected_label, expected_prediction in zip((0.0, 0.0, 1.0, 0.0),
                                                 (0.0, 0.5, 0.3, 0.9)):
    got_label, got_pred, got_example_weight = next(iterator)
    self.assertAllClose(got_label, np.array([expected_label]), atol=0, rtol=0)
    self.assertAllClose(
        got_pred, np.array([expected_prediction]), atol=0, rtol=0)
    self.assertAllClose(got_example_weight, np.array([1.0]), atol=0, rtol=0)
def testUpdateConfigWithDefaultsNoBaselineModelNonRubberstamp(self):
  eval_config_pbtxt = """
    model_specs { name: "" }
    metrics_specs {
      metrics {
        class_name: "MeanLabel"
        per_slice_thresholds {
          slicing_specs: {}
          threshold {
            value_threshold { lower_bound { value: 0.9 } }
            change_threshold {
              direction: HIGHER_IS_BETTER
              absolute { value: -1e-10 }
            }
          }
        }
      }
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config_pb2.EvalConfig())

  with self.assertRaises(RuntimeError):
    config_util.update_eval_config_with_defaults(
        eval_config, has_baseline=False, rubber_stamp=False)
def testModelSignaturesDoFn(self, save_as_keras, signature_names,
                            default_signature_names, prefer_dict_outputs,
                            use_schema, expected_num_outputs):
  export_path = self.createModelWithMultipleDenseInputs(save_as_keras)
  eval_shared_models = {}
  model_specs = []
  for sigs in signature_names.values():
    for model_name in sigs:
      if model_name not in eval_shared_models:
        eval_shared_models[model_name] = self.createTestEvalSharedModel(
            eval_saved_model_path=export_path,
            model_name=model_name,
            tags=[tf.saved_model.SERVING])
        model_specs.append(config_pb2.ModelSpec(name=model_name))
  eval_config = config_pb2.EvalConfig(model_specs=model_specs)
  schema = self.createDenseInputsSchema() if use_schema else None
  tfx_io = tf_example_record.TFExampleBeamRecord(
      physical_format='text',
      schema=schema,
      raw_record_column_name=constants.ARROW_INPUT_COLUMN)

  examples = [
      self._makeExample(input_1=1.0, input_2=2.0),
      self._makeExample(input_1=3.0, input_2=4.0),
      self._makeExample(input_1=5.0, input_2=6.0),
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
        | 'ToExtracts' >> beam.Map(_record_batch_to_extracts)
        | 'ModelSignatures' >> beam.ParDo(
            model_util.ModelSignaturesDoFn(
                eval_config=eval_config,
                eval_shared_models=eval_shared_models,
                signature_names=signature_names,
                default_signature_names=default_signature_names,
                prefer_dict_outputs=prefer_dict_outputs)))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        for key in signature_names:
          self.assertIn(key, got[0])
          if prefer_dict_outputs:
            self.assertIsInstance(got[0][key], dict)
            self.assertEqual(
                tfma_util.batch_size(got[0][key]), expected_num_outputs)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def testBatchSizeLimit(self):
  temp_export_dir = self._getExportDir()
  _, export_dir = batch_size_limited_classifier.simple_batch_size_limited_classifier(
      None, temp_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  eval_config = config_pb2.EvalConfig(model_specs=[config_pb2.ModelSpec()])
  schema = text_format.Parse(
      """
      feature {
        name: "classes"
        type: BYTES
      }
      feature {
        name: "scores"
        type: FLOAT
      }
      feature {
        name: "labels"
        type: BYTES
      }
      """, schema_pb2.Schema())
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
  tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
      arrow_schema=tfx_io.ArrowSchema(),
      tensor_representations=tfx_io.TensorRepresentations())
  feature_extractor = features_extractor.FeaturesExtractor(
      eval_config=eval_config,
      tensor_representations=tensor_adapter_config.tensor_representations)
  prediction_extractor = predictions_extractor.PredictionsExtractor(
      eval_config=eval_config, eval_shared_model=eval_shared_model)

  examples = []
  for _ in range(4):
    examples.append(
        self._makeExample(classes='first', scores=0.0, labels='third'))

  with beam.Pipeline() as pipeline:
    predict_extracts = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples],
                                  reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=1)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | feature_extractor.stage_name >> feature_extractor.ptransform
        | prediction_extractor.stage_name >> prediction_extractor.ptransform)

    def check_result(got):
      try:
        self.assertLen(got, 4)
        # We can't verify the actual predictions, but we can verify the keys.
        for item in got:
          self.assertIn(constants.PREDICTIONS_KEY, item)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result, label='result')
def _makeEvalResults(self):
  result_a = view_types.EvalResult(
      slicing_metrics=self._makeTestData(),
      plots=None,
      attributions=None,
      config=config_pb2.EvalConfig(),
      data_location=self.data_location_1,
      file_format='tfrecords',
      model_location=self.model_location_1)
  result_b = view_types.EvalResult(
      slicing_metrics=[self.result_c2],
      plots=None,
      attributions=None,
      config=config_pb2.EvalConfig(),
      data_location=self.full_data_location_2,
      file_format='tfrecords',
      model_location=self.full_model_location_2)
  return view_types.EvalResults([result_a, result_b],
                                constants.MODEL_CENTRIC_MODE)
def testCustomTFMetricWithPadding(self, example_indices, expected):
  computation = tf_metric_wrapper.tf_metric_computations(
      [
          _CustomMetric(name='custom_label', update_y_pred=False),
          _CustomMetric(name='custom_pred', update_y_pred=True),
      ],
      eval_config=config_pb2.EvalConfig(model_specs=[
          config_pb2.ModelSpec(
              padding_options=config_pb2.PaddingOptions(
                  label_int_padding=-1,
                  prediction_float_padding=-1.0,
              ))
      ]),
      example_weighted=True)[0]

  examples = [{
      'labels': np.array([1], dtype=np.int64),
      'predictions': np.array([0.1, 0.2, 0.3, 0.0]),
      'example_weights': np.array([1.0])
  }, {
      'labels': np.array([1, 2], dtype=np.int64),
      'predictions': np.array([0.1, 0.2, 0.0]),
      'example_weights': np.array([1.0])
  }, {
      'labels': np.array([1, 2, 3], dtype=np.int64),
      'predictions': np.array([0.1, 0.2, 0.3]),
      'example_weights': np.array([2.0])
  }]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([examples[i] for i in example_indices])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        custom_label_key = metric_types.MetricKey(
            name='custom_label', example_weighted=True)
        custom_pred_key = metric_types.MetricKey(
            name='custom_pred', example_weighted=True)
        self.assertDictElementsAlmostEqual(got_metrics, expected)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def testGetModelAndOutputNamesMultiOutput(self):
  eval_config = config_pb2.EvalConfig(model_specs=[config_pb2.ModelSpec()])
  self.assertEqual([(None, 'output1'), (None, 'output2')],
                   util.StandardExtracts({
                       constants.PREDICTIONS_KEY: {
                           'output1': np.array([]),
                           'output2': np.array([])
                       }
                   }).get_model_and_output_names(eval_config))
def testSqlSliceKeyExtractorWithMultipleSchema(self):
  eval_config = config_pb2.EvalConfig(slicing_specs=[
      config_pb2.SlicingSpec(slice_keys_sql="""
        SELECT
          STRUCT(fixed_string)
        FROM example.fixed_string, example.fixed_int
        WHERE fixed_int = 1
        """)
  ])
  slice_key_extractor = sql_slice_key_extractor.SqlSliceKeyExtractor(
      eval_config)
  record_batch_1 = pa.RecordBatch.from_arrays([
      pa.array([[1], [1], [2]], type=pa.list_(pa.int64())),
      pa.array([[1.0], [1.0], [2.0]], type=pa.list_(pa.float64())),
      pa.array([['fixed_string1'], ['fixed_string2'], ['fixed_string3']],
               type=pa.list_(pa.string())),
  ], ['fixed_int', 'fixed_float', 'fixed_string'])
  record_batch_2 = pa.RecordBatch.from_arrays([
      pa.array([[1], [1], [2]], type=pa.list_(pa.int64())),
      pa.array([[1.0], [1.0], [2.0]], type=pa.list_(pa.float64())),
      pa.array([['fixed_string1'], ['fixed_string2'], ['fixed_string3']],
               type=pa.list_(pa.string())),
      pa.array([['extra_field1'], ['extra_field2'], ['extra_field3']],
               type=pa.list_(pa.string())),
  ], ['fixed_int', 'fixed_float', 'fixed_string', 'extra_field'])

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([record_batch_1, record_batch_2],
                                  reshuffle=False)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | slice_key_extractor.stage_name >> slice_key_extractor.ptransform)
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 2)
        self.assertEqual(got[0][constants.SLICE_KEY_TYPES_KEY],
                         [[(('fixed_string', 'fixed_string1'),)],
                          [(('fixed_string', 'fixed_string2'),)], []])
        self.assertEqual(got[1][constants.SLICE_KEY_TYPES_KEY],
                         [[(('fixed_string', 'fixed_string1'),)],
                          [(('fixed_string', 'fixed_string2'),)], []])
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result)
def testSqlSliceKeyExtractorWithEmptySqlConfig(self):
  eval_config = config_pb2.EvalConfig()
  feature_extractor = features_extractor.FeaturesExtractor(
      eval_config=eval_config)
  slice_key_extractor = sql_slice_key_extractor.SqlSliceKeyExtractor(
      eval_config)
  tfx_io = tf_example_record.TFExampleBeamRecord(
      physical_format='inmem',
      telemetry_descriptors=['test', 'component'],
      schema=_SCHEMA,
      raw_record_column_name=constants.ARROW_INPUT_COLUMN)
  examples = [
      self._makeExample(
          fixed_int=1, fixed_float=1.0, fixed_string='fixed_string1'),
      self._makeExample(
          fixed_int=1, fixed_float=1.0, fixed_string='fixed_string2'),
      self._makeExample(
          fixed_int=2, fixed_float=0.0, fixed_string='fixed_string3')
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples],
                                  reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | feature_extractor.stage_name >> feature_extractor.ptransform
        | slice_key_extractor.stage_name >> slice_key_extractor.ptransform)
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        np.testing.assert_equal(
            got[0][constants.SLICE_KEY_TYPES_KEY],
            types.VarLenTensorValue.from_dense_rows(
                [np.array([]), np.array([]), np.array([])]))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result)
def get_evaluation_config(
    problem_type: constants.ProblemType,
    evaluation_column_specs: EvaluationColumnSpecs,
    slice_features: List[List[ColumnSpec]],
    class_names: Optional[List[Text]] = None,
    positive_class_names: Optional[List[Text]] = None,
    top_k_list: Optional[List[int]] = None
) -> model_evaluation_pb2.EvaluationConfig:
  """Builds a Model Evaluation configuration.

  Args:
    problem_type: One of the ProblemType enum.
    evaluation_column_specs: Column specs necessary for parsing evaluation
      data.
    slice_features: List of slice specs, each a list of keys to slice. The
      default slice over all values will automatically be added.
    class_names: For classification-type problems, a list of string names for
      classes.
    positive_class_names: For classification-type problems, a list of string
      names for classes to be treated as positively valued.
    top_k_list: For classification-type problems, if specified, a list of
      top-k aggregations.

  Returns:
    An EvaluationConfig.
  """
  tfma_eval_config = config_pb2.EvalConfig()
  tfma_eval_config.model_specs.append(
      config_pb2.ModelSpec(
          prediction_key=evaluation_column_specs.predicted_score_column_spec
          .as_string(),
          prediction_keys=None,
          label_key=evaluation_column_specs.ground_truth_column_spec
          .as_string(),
          label_keys=None))
  metric_specs = _get_metric_specs(problem_type, class_names,
                                   positive_class_names, top_k_list)
  assert metric_specs, (
      'At least one metric_spec must be defined %r' % metric_specs)
  tfma_eval_config.metrics_specs.extend(metric_specs)
  slicing_specs = _get_tfma_slicing_specs(slice_features)
  assert slicing_specs, (
      'At least one slicing_spec must be defined %r' % slicing_specs)
  tfma_eval_config.slicing_specs.extend(slicing_specs)
  adapter = tfma_adapter.TFMAToME(
      class_name_list=class_names,
      predicted_label_column_spec=evaluation_column_specs
      .predicted_label_column_spec,
      predicted_label_id_column_spec=evaluation_column_specs
      .predicted_label_id_column_spec)
  return adapter.eval_config(tfma_eval_config)
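# A minimal usage sketch for get_evaluation_config, not taken from the
# surrounding code: the ProblemType value, the ColumnSpec constructor
# arguments, the EvaluationColumnSpecs keyword names, and the class names
# below are illustrative assumptions about the column layout; only the
# attribute names used inside get_evaluation_config are taken from it.
_example_evaluation_config = get_evaluation_config(
    problem_type=constants.ProblemType.MULTICLASS,  # assumed enum value
    evaluation_column_specs=EvaluationColumnSpecs(
        ground_truth_column_spec=ColumnSpec('label'),
        predicted_score_column_spec=ColumnSpec('prediction_score'),
        predicted_label_column_spec=ColumnSpec('predicted_label'),
        predicted_label_id_column_spec=ColumnSpec('predicted_label_id')),
    slice_features=[[ColumnSpec('country')]],
    class_names=['negative', 'positive'],
    positive_class_names=['positive'],
    top_k_list=[1])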
def testGetModelAndOutputNamesMultiModel(self):
  eval_config = config_pb2.EvalConfig(model_specs=[
      config_pb2.ModelSpec(name=constants.BASELINE_KEY),
      config_pb2.ModelSpec(name=constants.CANDIDATE_KEY)
  ])
  self.assertEqual([(constants.BASELINE_KEY, None),
                    (constants.CANDIDATE_KEY, None)],
                   util.StandardExtracts({
                       constants.PREDICTIONS_KEY: {
                           constants.BASELINE_KEY: np.array([]),
                           constants.CANDIDATE_KEY: np.array([])
                       }
                   }).get_model_and_output_names(eval_config))
def testSqlSliceKeyExtractor(self):
  eval_config = config_pb2.EvalConfig(slicing_specs=[
      config_pb2.SlicingSpec(slice_keys_sql="""
        SELECT
          STRUCT(fixed_string)
        FROM example.fixed_string, example.fixed_int
        WHERE fixed_int = 1
        """)
  ])
  slice_key_extractor = sql_slice_key_extractor.SqlSliceKeyExtractor(
      eval_config)
  tfx_io = tf_example_record.TFExampleBeamRecord(
      physical_format='inmem',
      telemetry_descriptors=['test', 'component'],
      schema=_SCHEMA,
      raw_record_column_name=constants.ARROW_INPUT_COLUMN)
  examples = [
      self._makeExample(
          fixed_int=1, fixed_float=1.0, fixed_string='fixed_string1'),
      self._makeExample(
          fixed_int=1, fixed_float=1.0, fixed_string='fixed_string2'),
      self._makeExample(
          fixed_int=2, fixed_float=0.0, fixed_string='fixed_string3')
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples],
                                  reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | slice_key_extractor.stage_name >> slice_key_extractor.ptransform)
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        self.assertEqual(got[0][constants.SLICE_KEY_TYPES_KEY],
                         [[(('fixed_string', 'fixed_string1'),)],
                          [(('fixed_string', 'fixed_string2'),)], []])
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result)
def _runTestWithCustomCheck(self,
                            examples,
                            eval_export_dir,
                            metrics_callbacks,
                            slice_spec=None,
                            custom_metrics_check=None,
                            custom_plots_check=None,
                            custom_result_check=None):
  # make sure we are doing some checks
  self.assertTrue(custom_metrics_check is not None or
                  custom_plots_check is not None or
                  custom_result_check is not None)
  serialized_examples = [ex.SerializeToString() for ex in examples]
  slicing_specs = None
  if slice_spec:
    slicing_specs = [s.to_proto() for s in slice_spec]
  eval_config = config_pb2.EvalConfig(slicing_specs=slicing_specs)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_config=eval_config, eval_shared_model=eval_shared_model)
  tfx_io = raw_tf_record.RawBeamRecordTFXIO(
      physical_format='inmemory',
      raw_record_column_name=constants.ARROW_INPUT_COLUMN,
      telemetry_descriptors=['TFMATest'])

  with beam.Pipeline() as pipeline:
    (metrics, plots), _ = (
        pipeline
        | 'Create' >> beam.Create(serialized_examples)
        | 'BatchExamples' >> tfx_io.BeamSource()
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >>
        legacy_metrics_and_plots_evaluator._ComputeMetricsAndPlots(  # pylint: disable=protected-access
            eval_shared_model=eval_shared_model,
            compute_confidence_intervals=self.compute_confidence_intervals,
            random_seed_for_testing=self.deterministic_test_seed))

    if custom_metrics_check is not None:
      util.assert_that(metrics, custom_metrics_check, label='metrics')
    if custom_plots_check is not None:
      util.assert_that(plots, custom_plots_check, label='plot')

  result = pipeline.run()
  if custom_result_check is not None:
    custom_result_check(result)
def testUpdateConfigWithDefaultsEmptyModelName(self):
  eval_config_pbtxt = """
    model_specs { name: "" }
    metrics_specs {
      metrics { class_name: "ExampleCount" }
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config_pb2.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs { name: "" }
    metrics_specs {
      metrics { class_name: "ExampleCount" }
      model_names: [""]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config_pb2.EvalConfig())

  got_eval_config = config_util.update_eval_config_with_defaults(eval_config)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
def testUpdateConfigWithDefaultsAutomaticallyAddsBaselineModel(self):
  eval_config_pbtxt = """
    model_specs { label_key: "my_label" }
    metrics_specs {
      metrics { class_name: "ExampleCount" }
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config_pb2.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs { name: "candidate" label_key: "my_label" }
    model_specs { name: "baseline" label_key: "my_label" is_baseline: true }
    metrics_specs {
      metrics { class_name: "ExampleCount" }
      model_names: ["candidate", "baseline"]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config_pb2.EvalConfig())

  got_eval_config = config_util.update_eval_config_with_defaults(
      eval_config, has_baseline=True)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
def testBatchedPredict(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=eval_export_dir)
  eval_config = config_pb2.EvalConfig(model_specs=[config_pb2.ModelSpec()])

  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=0.0),
    ]
    serialized_examples = [e.SerializeToString() for e in examples]

    tfx_io = raw_tf_record.RawBeamRecordTFXIO(
        physical_format='inmemory',
        raw_record_column_name=constants.ARROW_INPUT_COLUMN,
        telemetry_descriptors=['TFMATest'])
    extractor = predict_extractor.PredictExtractor(
        eval_shared_model, eval_config=eval_config)
    predict_extracts = (
        pipeline
        | 'Create' >> beam.Create(serialized_examples, reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | 'Predict' >> extractor.ptransform)

    def check_result(got):
      try:
        self.assertLen(got, 2)
        for item in got:
          self.assertIn(constants.FEATURES_KEY, item)
          for feature in ('language', 'age'):
            for features_dict in item[constants.FEATURES_KEY]:
              self.assertIn(feature, features_dict)
          self.assertIn(constants.LABELS_KEY, item)
          self.assertIn(constants.PREDICTIONS_KEY, item)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result, label='result')
def testToComputationsWithMixedAggregationAndNonAggregationMetrics(self):
  computations = metric_specs.to_computations(
      [
          config_pb2.MetricsSpec(
              metrics=[config_pb2.MetricConfig(class_name='CategoricalAccuracy')]),
          config_pb2.MetricsSpec(
              metrics=[config_pb2.MetricConfig(class_name='BinaryCrossentropy')],
              binarize=config_pb2.BinarizationOptions(class_ids={'values': [1]}),
              aggregate=config_pb2.AggregationOptions(micro_average=True))
      ], config_pb2.EvalConfig())
  # 3 separate computations should be used (one for aggregated metrics, one
  # for non-aggregated metrics, and one for metrics associated with class 1)
  self.assertLen(computations, 3)
def testMacroAverage(self):
  metric_name = 'test'
  class_ids = [0, 1, 2]
  sub_keys = [metric_types.SubKey(class_id=i) for i in class_ids]
  sub_key_values = [0.1, 0.2, 0.3]
  computations = aggregation.macro_average(
      metric_name,
      sub_keys,
      eval_config=config_pb2.EvalConfig(),
      class_weights={
          0: 1.0,
          1: 1.0,
          2: 1.0
      })
  metric = computations[0]

  sub_metrics = {}
  for sub_key, value in zip(sub_keys, sub_key_values):
    key = metric_types.MetricKey(name=metric_name, sub_key=sub_key)
    sub_metrics[key] = value

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([((), sub_metrics)])
        | 'ComputeMetric' >> beam.Map(lambda x: (x[0], metric.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        expected_value = (0.1 + 0.2 + 0.3) / 3.0
        self.assertDictElementsAlmostEqual(
            got_metrics, {key: expected_value}, places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def testSerializeDeserializeLegacyEvalConfig(self):
  output_path = self._getTempDir()
  old_config = LegacyConfig(
      model_location='/path/to/model',
      data_location='/path/to/data',
      slice_spec=[
          slicer.SingleSliceSpec(
              columns=['country'], features=[('age', 5), ('gender', 'f')]),
          slicer.SingleSliceSpec(
              columns=['interest'], features=[('age', 6), ('gender', 'm')])
      ],
      example_count_metric_key=None,
      example_weight_metric_key='key',
      compute_confidence_intervals=False,
      k_anonymization_count=1)
  final_dict = {}
  final_dict['tfma_version'] = tfma_version.VERSION
  final_dict['eval_config'] = old_config
  with tf.io.TFRecordWriter(os.path.join(output_path, 'eval_config')) as w:
    w.write(pickle.dumps(final_dict))

  got_eval_config, got_data_location, _, got_model_locations = (
      eval_config_writer.load_eval_run(output_path))

  options = config_pb2.Options()
  options.compute_confidence_intervals.value = (
      old_config.compute_confidence_intervals)
  options.min_slice_size.value = old_config.k_anonymization_count
  eval_config = config_pb2.EvalConfig(
      slicing_specs=[
          config_pb2.SlicingSpec(
              feature_keys=['country'],
              feature_values={
                  'age': '5',
                  'gender': 'f'
              }),
          config_pb2.SlicingSpec(
              feature_keys=['interest'],
              feature_values={
                  'age': '6',
                  'gender': 'm'
              })
      ],
      options=options)
  self.assertEqual(eval_config, got_eval_config)
  self.assertEqual(old_config.data_location, got_data_location)
  self.assertLen(got_model_locations, 1)
  self.assertEqual(old_config.model_location,
                   list(got_model_locations.values())[0])
def testModelSignaturesDoFnError(self):
  export_path = self.createModelWithInvalidOutputShape()
  signature_names = {constants.PREDICTIONS_KEY: {'': [None]}}
  eval_shared_models = {
      '': self.createTestEvalSharedModel(
          eval_saved_model_path=export_path, tags=[tf.saved_model.SERVING])
  }
  model_specs = [config_pb2.ModelSpec()]
  eval_config = config_pb2.EvalConfig(model_specs=model_specs)
  schema = self.createDenseInputsSchema()
  tfx_io = tf_example_record.TFExampleBeamRecord(
      physical_format='text',
      schema=schema,
      raw_record_column_name=constants.ARROW_INPUT_COLUMN)
  tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
      arrow_schema=tfx_io.ArrowSchema(),
      tensor_representations=tfx_io.TensorRepresentations())

  examples = [
      self._makeExample(input_1=1.0, input_2=2.0),
      self._makeExample(input_1=3.0, input_2=4.0),
      self._makeExample(input_1=5.0, input_2=6.0),
  ]

  with self.assertRaisesRegex(
      ValueError, 'First dimension does not correspond with batch size.'):
    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
          | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
          | 'ToExtracts' >> beam.Map(_record_batch_to_extracts)
          | 'ModelSignatures' >> beam.ParDo(
              model_util.ModelSignaturesDoFn(
                  eval_config=eval_config,
                  eval_shared_models=eval_shared_models,
                  signature_names=signature_names,
                  default_signature_names=None,
                  prefer_dict_outputs=False,
                  tensor_adapter_config=tensor_adapter_config)))