def _init_model(self, multi_model, validation):
  # The benchmark runner will instantiate this class twice - once to determine
  # the benchmarks to run, and once to actually run them. However, Keras
  # freezes if we try to load the same model twice. As such, we have to pull
  # the model loading out of the constructor into a separate method which we
  # call before each benchmark.
  if multi_model:
    metric_specs = metric_specs_util.specs_from_metrics(
        [tf.keras.metrics.AUC(name="auc", num_thresholds=10000)],
        model_names=["candidate", "baseline"])
    if validation:
      # Only one metric, adding a threshold for all slices.
      metric_specs[0].metrics[0].threshold.CopyFrom(
          tfma.MetricThreshold(
              value_threshold=tfma.GenericValueThreshold(
                  lower_bound={"value": 0.5}, upper_bound={"value": 0.5}),
              change_threshold=tfma.GenericChangeThreshold(
                  absolute={"value": -0.001},
                  direction=tfma.MetricDirection.HIGHER_IS_BETTER)))
    self._eval_config = tfma.EvalConfig(
        model_specs=[
            tfma.ModelSpec(name="candidate", label_key="tips"),
            tfma.ModelSpec(name="baseline", label_key="tips",
                           is_baseline=True)
        ],
        metrics_specs=metric_specs)
    self._eval_shared_models = {
        "candidate":
            tfma.default_eval_shared_model(
                self._dataset.trained_saved_model_path(),
                eval_config=self._eval_config,
                model_name="candidate"),
        "baseline":
            tfma.default_eval_shared_model(
                self._dataset.trained_saved_model_path(),
                eval_config=self._eval_config,
                model_name="baseline")
    }
  else:
    metric_specs = metric_specs_util.specs_from_metrics(
        [tf.keras.metrics.AUC(name="auc", num_thresholds=10000)])
    if validation:
      # Only one metric, adding a threshold for all slices.
      metric_specs[0].metrics[0].threshold.CopyFrom(
          tfma.MetricThreshold(
              value_threshold=tfma.GenericValueThreshold(
                  lower_bound={"value": 0.5}, upper_bound={"value": 0.5})))
    self._eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key="tips")],
        metrics_specs=metric_specs)
    self._eval_shared_models = {
        "": tfma.default_eval_shared_model(
            self._dataset.trained_saved_model_path(),
            eval_config=self._eval_config)
    }
def testHasAttributionsMetrics(self):
  specs_with_attributions = metric_specs.specs_from_metrics({
      'output_name': [
          tf.keras.metrics.MeanSquaredError('mse'),
          attributions.TotalAttributions()
      ]
  })
  self.assertTrue(
      attributions.has_attributions_metrics(specs_with_attributions))
  specs_without_attributions = metric_specs.specs_from_metrics([
      tf.keras.metrics.MeanSquaredError('mse'),
  ])
  self.assertFalse(
      attributions.has_attributions_metrics(specs_without_attributions))
def testMetricKeysToSkipForConfidenceIntervals(self):
  metrics_specs = [
      config_pb2.MetricsSpec(
          metrics=[
              config_pb2.MetricConfig(
                  class_name='ExampleCount',
                  config=json.dumps({'name': 'example_count'}),
                  threshold=config_pb2.MetricThreshold(
                      value_threshold=config_pb2.GenericValueThreshold())),
              config_pb2.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}),
                  threshold=config_pb2.MetricThreshold(
                      change_threshold=config_pb2.GenericChangeThreshold())),
              config_pb2.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({'name': 'mse'}),
                  threshold=config_pb2.MetricThreshold(
                      change_threshold=config_pb2.GenericChangeThreshold()))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
  ]
  metrics_specs += metric_specs.specs_from_metrics(
      [tf.keras.metrics.MeanSquaredError('mse')],
      model_names=['model_name1', 'model_name2'])
  keys = metric_specs.metric_keys_to_skip_for_confidence_intervals(
      metrics_specs, eval_config=config_pb2.EvalConfig())
  self.assertLen(keys, 8)
  self.assertIn(
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name1',
          output_name='output_name1'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name1',
          output_name='output_name2'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name2',
          output_name='output_name1'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name2',
          output_name='output_name2'), keys)
  self.assertIn(
      metric_types.MetricKey(name='example_count', model_name='model_name1'),
      keys)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name1',
          example_weighted=True), keys)
  self.assertIn(
      metric_types.MetricKey(name='example_count', model_name='model_name2'),
      keys)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name2',
          example_weighted=True), keys)
def testMetricKeysToSkipForConfidenceIntervals(self):
  metrics_specs = [
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='ExampleCount',
                  config=json.dumps({'name': 'example_count'}),
                  threshold=config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold())),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold())),
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({'name': 'mse'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold()))
          ],
          # Model names and output_names should be ignored because
          # ExampleCount is model independent.
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
  ]
  metrics_specs += metric_specs.specs_from_metrics(
      [tf.keras.metrics.MeanSquaredError('mse')])
  keys = metric_specs.metric_keys_to_skip_for_confidence_intervals(
      metrics_specs)
  self.assertLen(keys, 1)
  self.assertIn(metric_types.MetricKey(name='example_count'), keys)
def benchmarkMetricsPlotsAndValidationsEvaluatorAUC10k(self):
  self._runMetricsPlotsAndValidationsEvaluatorManualActuation(
      with_confidence_intervals=False,
      multi_model=False,
      metrics_specs=metric_specs.specs_from_metrics([
          tf.keras.metrics.AUC(name="auc", num_thresholds=10000),
      ]))
def testToComputations(self):
  computations = metric_specs.to_computations(
      metric_specs.specs_from_metrics(
          {
              'output_name': [
                  tf.keras.metrics.MeanSquaredError('mse'),
                  calibration.MeanLabel('mean_label')
              ]
          },
          model_names=['model_name'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True)),
      config.EvalConfig())

  keys = []
  for m in computations:
    for k in m.keys:
      if not k.name.startswith('_'):
        keys.append(k)
  self.assertLen(keys, 8)
  self.assertIn(metric_types.MetricKey(name='example_count'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name',
          output_name='output_name'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse', model_name='model_name', output_name='output_name'),
      keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name'), keys)
def benchmarkMetricsPlotsAndValidationsEvaluatorAUC10kMultiModel(self):
  self._runMetricsPlotsAndValidationsEvaluatorManualActuation(
      with_confidence_intervals=False,
      multi_model=True,
      metrics_specs=metric_specs_util.specs_from_metrics(
          [
              tf.keras.metrics.AUC(name="auc", num_thresholds=10000),
          ],
          model_names=["candidate", "baseline"]),
      validation=True)
def _init_model(self, multi_model):
  # The benchmark runner will instantiate this class twice - once to determine
  # the benchmarks to run, and once to actually run them. However, Keras
  # freezes if we try to load the same model twice. As such, we have to pull
  # the model loading out of the constructor into a separate method which we
  # call before each benchmark.
  if multi_model:
    self._eval_config = tfma.EvalConfig(
        model_specs=[
            tfma.ModelSpec(name="candidate", label_key="tips"),
            tfma.ModelSpec(name="baseline", label_key="tips",
                           is_baseline=True)
        ],
        metrics_specs=metric_specs.specs_from_metrics(
            [
                tf.keras.metrics.AUC(name="auc", num_thresholds=10000),
            ],
            model_names=["candidate", "baseline"]))
    self._eval_shared_models = {
        "candidate":
            tfma.default_eval_shared_model(
                self._dataset.trained_saved_model_path(),
                eval_config=self._eval_config,
                model_name="candidate"),
        "baseline":
            tfma.default_eval_shared_model(
                self._dataset.trained_saved_model_path(),
                eval_config=self._eval_config,
                model_name="baseline")
    }
  else:
    self._eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key="tips")],
        metrics_specs=metric_specs.specs_from_metrics([
            tf.keras.metrics.AUC(name="auc", num_thresholds=10000),
        ]))
    self._eval_shared_models = {
        "": tfma.default_eval_shared_model(
            self._dataset.trained_saved_model_path(),
            eval_config=self._eval_config)
    }
def _init_model(self):
  # The benchmark runner will instantiate this class twice - once to determine
  # the benchmarks to run, and once to actually run them. However, Keras
  # freezes if we try to load the same model twice. As such, we have to pull
  # the model loading out of the constructor into a separate method which we
  # call before each benchmark.
  self._eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(label_key="tips")],
      metrics_specs=metric_specs.specs_from_metrics([
          tf.keras.metrics.AUC(name="auc", num_thresholds=10000),
      ]))
  # metrics_specs=metric_specs.example_count_specs())
  self._eval_shared_model = tfma.default_eval_shared_model(
      self._dataset.trained_saved_model_path(),
      eval_config=self._eval_config)
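# A minimal, self-contained sketch (not TFMA code) of the lazy-initialization
# pattern the comment above describes: the expensive model load happens in a
# separate method invoked at the start of each benchmark, so constructing the
# class a second time never touches Keras. All names below are invented for
# illustration.
class LazyInitBenchmark:

  def __init__(self):
    # Deliberately avoid loading the model here; the runner may instantiate
    # this class just to enumerate the benchmarks.
    self._model = None

  def _init_model(self):
    # Hypothetical stand-in for tfma.default_eval_shared_model(...).
    self._model = object()

  def benchmarkSomething(self):
    self._init_model()  # (Re)load immediately before measuring.
    assert self._model is not None


if __name__ == "__main__":
  LazyInitBenchmark().benchmarkSomething()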
def benchmarkMetricsPlotsAndValidationsEvaluatorBinaryClassification(self):
  self._runMetricsPlotsAndValidationsEvaluatorManualActuation(
      with_confidence_intervals=False,
      metrics_specs=metric_specs.specs_from_metrics([
          tf.keras.metrics.BinaryAccuracy(name="accuracy"),
          tf.keras.metrics.AUC(name="auc", num_thresholds=10000),
          tf.keras.metrics.AUC(
              name="auc_precision_recall", curve="PR", num_thresholds=10000),
          tf.keras.metrics.Precision(name="precision"),
          tf.keras.metrics.Recall(name="recall"),
          tfma.metrics.MeanLabel(name="mean_label"),
          tfma.metrics.MeanPrediction(name="mean_prediction"),
          tfma.metrics.Calibration(name="calibration"),
          tfma.metrics.ConfusionMatrixPlot(name="confusion_matrix_plot"),
          tfma.metrics.CalibrationPlot(name="calibration_plot"),
      ]))
def testEvaluateWithQueryBasedMetrics(self):
  temp_export_dir = self._getExportDir()
  _, export_dir = (
      fixed_prediction_estimator_extra_fields
      .simple_fixed_prediction_estimator_extra_fields(None, temp_export_dir))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              location=export_dir,
              label_key='label',
              example_weight_key='fixed_int')
      ],
      slicing_specs=[
          config.SlicingSpec(),
          config.SlicingSpec(feature_keys=['fixed_string']),
      ],
      metrics_specs=metric_specs.specs_from_metrics(
          [ndcg.NDCG(gain_key='fixed_float', name='ndcg')],
          binarize=config.BinarizationOptions(top_k_list=[1, 2]),
          query_key='fixed_string'))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_models=[eval_shared_model]),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  # fixed_string used as query_key
  # fixed_float used as gain_key for NDCG
  # fixed_int used as example_weight_key for NDCG
  examples = [
      self._makeExample(
          prediction=0.2,
          label=1.0,
          fixed_float=1.0,
          fixed_string='query1',
          fixed_int=1),
      self._makeExample(
          prediction=0.8,
          label=0.0,
          fixed_float=0.5,
          fixed_string='query1',
          fixed_int=1),
      self._makeExample(
          prediction=0.5,
          label=0.0,
          fixed_float=0.5,
          fixed_string='query2',
          fixed_int=2),
      self._makeExample(
          prediction=0.9,
          label=1.0,
          fixed_float=1.0,
          fixed_string='query2',
          fixed_int=2),
      self._makeExample(
          prediction=0.1,
          label=0.0,
          fixed_float=0.1,
          fixed_string='query2',
          fixed_int=2),
      self._makeExample(
          prediction=0.9,
          label=1.0,
          fixed_float=1.0,
          fixed_string='query3',
          fixed_int=3)
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 4)
        slices = {}
        for slice_key, value in got:
          slices[slice_key] = value
        overall_slice = ()
        query1_slice = (('fixed_string', b'query1'),)
        query2_slice = (('fixed_string', b'query2'),)
        query3_slice = (('fixed_string', b'query3'),)
        self.assertCountEqual(
            list(slices.keys()),
            [overall_slice, query1_slice, query2_slice, query3_slice])
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        ndcg1_key = metric_types.MetricKey(
            name='ndcg', sub_key=metric_types.SubKey(top_k=1))
        ndcg2_key = metric_types.MetricKey(
            name='ndcg', sub_key=metric_types.SubKey(top_k=2))
        # Query1 (weight=1): (p=0.8, g=0.5) (p=0.2, g=1.0)
        # Query2 (weight=2): (p=0.9, g=1.0) (p=0.5, g=0.5) (p=0.1, g=0.1)
        # Query3 (weight=3): (p=0.9, g=1.0)
        #
        # DCG@1:  0.5, 1.0, 1.0
        # NDCG@1: 0.5, 1.0, 1.0
        # Average NDCG@1: (1 * 0.5 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.92
        #
        # DCG@2: (0.5 + 1.0/log(3)) ~ 1.130930
        #        (1.0 + 0.5/log(3)) ~ 1.315465
        #        1.0
        # NDCG@2: (0.5 + 1.0/log(3)) / (1.0 + 0.5/log(3)) ~ 0.85972
        #         (1.0 + 0.5/log(3)) / (1.0 + 0.5/log(3)) = 1.0
        #         1.0
        # Average NDCG@2: (1 * 0.860 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.98
        self.assertDictElementsAlmostEqual(
            slices[overall_slice], {
                example_count_key: 6,
                weighted_example_count_key: 11.0,
                ndcg1_key: 0.9166667,
                ndcg2_key: 0.9766198
            })
        self.assertDictElementsAlmostEqual(
            slices[query1_slice], {
                example_count_key: 2,
                weighted_example_count_key: 2.0,
                ndcg1_key: 0.5,
                ndcg2_key: 0.85972
            })
        self.assertDictElementsAlmostEqual(
            slices[query2_slice], {
                example_count_key: 3,
                weighted_example_count_key: 6.0,
                ndcg1_key: 1.0,
                ndcg2_key: 1.0
            })
        self.assertDictElementsAlmostEqual(
            slices[query3_slice], {
                example_count_key: 1,
                weighted_example_count_key: 3.0,
                ndcg1_key: 1.0,
                ndcg2_key: 1.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')
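# A quick, standalone re-derivation (not TFMA code) of the NDCG values
# asserted above, assuming base-2 logarithms and a query-weighted average.
# The helper names below are invented for illustration.
import math


def dcg(gains):
  # Positions are 1-based: gain / log2(position + 1).
  return sum(g / math.log2(i + 2) for i, g in enumerate(gains))


def ndcg_at_k(gains, k):
  # Normalize the truncated DCG by the DCG of the ideal (sorted) ordering.
  return dcg(gains[:k]) / dcg(sorted(gains, reverse=True)[:k])


# Gains in prediction order for query1..query3, with example weights 1, 2, 3.
queries = [([0.5, 1.0], 1), ([1.0, 0.5, 0.1], 2), ([1.0], 3)]
total_weight = sum(w for _, w in queries)
for k in (1, 2):
  value = sum(w * ndcg_at_k(g, k) for g, w in queries) / total_weight
  print('NDCG@%d = %.7f' % (k, value))  # ~0.9166667 and ~0.9766198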
def testEvaluateWithKerasModel(self):
  input1 = tf.keras.layers.Input(shape=(1,), name='input1')
  input2 = tf.keras.layers.Input(shape=(1,), name='input2')
  inputs = [input1, input2]
  input_layer = tf.keras.layers.concatenate(inputs)
  output_layer = tf.keras.layers.Dense(
      1, activation=tf.nn.sigmoid, name='output')(input_layer)
  model = tf.keras.models.Model(inputs, output_layer)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.binary_crossentropy,
      metrics=['accuracy'])

  features = {'input1': [[0.0], [1.0]], 'input2': [[1.0], [0.0]]}
  labels = [[1], [0]]
  example_weights = [1.0, 0.5]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
  model.fit(dataset, steps_per_epoch=1)

  export_dir = self._getExportDir()
  model.save(export_dir, save_format='tf')

  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              location=export_dir,
              label_key='label',
              example_weight_key='example_weight')
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=metric_specs.specs_from_metrics(
          [calibration.MeanLabel('mean_label')]))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_models=[eval_shared_model]),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  examples = [
      self._makeExample(
          input1=0.0,
          input2=1.0,
          label=1.0,
          example_weight=1.0,
          extra_feature='non_model_feature'),
      self._makeExample(
          input1=1.0,
          input2=0.0,
          label=0.0,
          example_weight=0.5,
          extra_feature='non_model_feature'),
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key = metric_types.MetricKey(name='mean_label')
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                example_count_key: 2,
                weighted_example_count_key: (1.0 + 0.5),
                label_key: (1.0 * 1.0 + 0.0 * 0.5) / (1.0 + 0.5),
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def metrics_specs_from_keras(
    model_name: Text,
    model_loader: types.ModelLoader,
) -> List[config.MetricsSpec]:
  """Returns metrics specs for metrics and losses associated with the model."""
  model = model_loader.construct_fn(lambda x: None)()
  if model is None:
    return []

  metric_names = []
  metrics = []
  if hasattr(model, 'loss_functions'):
    # Legacy keras metrics separate the losses from the metrics and store them
    # under loss_functions. The first name in metrics_names is always 'loss',
    # followed by the loss_function names (prefixed by output_name if there are
    # multiple outputs) and then the metric names (also prefixed by output
    # name). Note that names in loss_functions will not have any output name
    # prefixes (if used) while the metrics will, so we need to use the names in
    # metrics_names for matching with outputs, not the names in the functions.
    metric_names = model.metrics_names
    metrics.extend(model.loss_functions)
    metrics.extend(model.metrics)
    if len(metric_names) > len(metrics) and metric_names[0] == 'loss':
      metric_names = metric_names[1:]
  elif hasattr(model, 'compiled_loss') and hasattr(model, 'compiled_metrics'):
    # In the new keras metric setup the metrics include the losses (in the form
    # of a metric type, not a loss type) and the metric names align with the
    # names in the metric classes. The model's metrics contain compiled_loss,
    # compiled_metrics, and custom metrics (added via add_metric). Since we
    # only care about compiled metrics we use these APIs instead. Note that the
    # overall loss metric is an average of the other losses which doesn't take
    # y_true, y_pred as inputs, so it can't be calculated via standard inputs
    # and we remove it.
    metrics.extend(model.compiled_loss.metrics[1:])
    metrics.extend(model.compiled_metrics.metrics)
    metric_names = [m.name for m in metrics]

  specs = []
  if hasattr(model, 'output_names') and len(model.output_names) > 1:
    unmatched_metrics = {m for m in metrics}
    for output_name in model.output_names:
      per_output_metrics = []
      for (name, metric) in zip(metric_names, metrics):
        if name.startswith(output_name + '_'):
          per_output_metrics.append(metric)
          unmatched_metrics.remove(metric)
      if per_output_metrics:
        specs.extend(
            metric_specs.specs_from_metrics(
                metrics=per_output_metrics,
                model_names=[model_name],
                output_names=[output_name],
                include_example_count=False,
                include_weighted_example_count=False))
    metrics = list(unmatched_metrics)

  if metrics:
    specs.extend(
        metric_specs.specs_from_metrics(
            metrics=metrics,
            model_names=[model_name],
            include_example_count=False,
            include_weighted_example_count=False))

  return specs
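# A standalone sketch of the output-name matching rule described in the
# comments above: for multi-output models, compiled metric names carry an
# '<output_name>_' prefix, and that prefix (rather than the metric object's
# own name) decides which output a metric belongs to. The names below are
# invented for illustration.
output_names = ['price', 'rating']
metric_names = ['price_mse', 'rating_mse', 'accuracy']

per_output = {name: [] for name in output_names}
unmatched = []
for metric_name in metric_names:
  for output_name in output_names:
    if metric_name.startswith(output_name + '_'):
      per_output[output_name].append(metric_name)
      break
  else:
    # Falls through to a spec without output_names, as in the code above.
    unmatched.append(metric_name)

print(per_output)  # {'price': ['price_mse'], 'rating': ['rating_mse']}
print(unmatched)   # ['accuracy']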
def testSpecsFromMetrics(self):
  metrics_specs = metric_specs.specs_from_metrics(
      {
          'output_name1': [
              tf.keras.metrics.MeanSquaredError('mse'),
              calibration.MeanLabel('mean_label')
          ],
          'output_name2': [
              tf.keras.metrics.RootMeanSquaredError('rmse'),
              calibration.MeanPrediction('mean_prediction')
          ]
      },
      model_names=['model_name1', 'model_name2'],
      binarize=config.BinarizationOptions(class_ids=[0, 1]),
      aggregate=config.AggregationOptions(macro_average=True))

  self.assertLen(metrics_specs, 5)
  self.assertProtoEquals(
      metrics_specs[0],
      config.MetricsSpec(metrics=[
          config.MetricConfig(
              class_name='ExampleCount',
              config=json.dumps({'name': 'example_count'})),
      ]))
  self.assertProtoEquals(
      metrics_specs[1],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1']))
  self.assertProtoEquals(
      metrics_specs[2],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({
                      'name': 'mse',
                      'dtype': 'float32'
                  })),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1'],
          binarize=config.BinarizationOptions(class_ids=[0, 1]),
          aggregate=config.AggregationOptions(macro_average=True)))
  self.assertProtoEquals(
      metrics_specs[3],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2']))
  self.assertProtoEquals(
      metrics_specs[4],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='RootMeanSquaredError',
                  config=json.dumps({
                      'name': 'rmse',
                      'dtype': 'float32'
                  })),
              config.MetricConfig(
                  class_name='MeanPrediction',
                  config=json.dumps({'name': 'mean_prediction'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2'],
          binarize=config.BinarizationOptions(class_ids=[0, 1]),
          aggregate=config.AggregationOptions(macro_average=True)))
def testEvaluateWithBinaryClassificationModel(self):
  n_classes = 2
  temp_export_dir = self._getExportDir()
  _, export_dir = dnn_classifier.simple_dnn_classifier(
      None, temp_export_dir, n_classes=n_classes)

  # Add mean_label, example_count, weighted_example_count, calibration_plot
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              location=export_dir, label_key='label',
              example_weight_key='age')
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=metric_specs.specs_from_metrics([
          calibration.MeanLabel('mean_label'),
          calibration_plot.CalibrationPlot(
              name='calibration_plot', num_buckets=10)
      ]))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_models=[eval_shared_model]),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  examples = [
      self._makeExample(age=1.0, language='english', label=0.0),
      self._makeExample(age=2.0, language='chinese', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics_and_plots = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key = metric_types.MetricKey(name='mean_label')
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                example_count_key: 3,
                weighted_example_count_key: (1.0 + 2.0 + 3.0),
                label_key: (0 * 1.0 + 1 * 2.0 + 0 * 3.0) / (1.0 + 2.0 + 3.0),
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    def check_plots(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_plots = got[0]
        self.assertEqual(got_slice_key, ())
        plot_key = metric_types.PlotKey('calibration_plot')
        self.assertIn(plot_key, got_plots)
        # 10 buckets + 2 for edge cases
        self.assertLen(got_plots[plot_key].buckets, 12)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics_and_plots[constants.METRICS_KEY],
        check_metrics,
        label='metrics')
    util.assert_that(
        metrics_and_plots[constants.PLOTS_KEY], check_plots, label='plots')
def testEvaluateWithConfidenceIntervals(self):
  # NOTE: This test does not actually test that the confidence intervals are
  # accurate; it only tests that the proto output by the test is well formed.
  # This test would pass if the confidence interval implementation did nothing
  # at all except compute the unsampled value.
  temp_export_dir = self._getExportDir()
  _, export_dir = (
      fixed_prediction_estimator_extra_fields
      .simple_fixed_prediction_estimator_extra_fields(None, temp_export_dir))
  options = config.Options()
  options.compute_confidence_intervals.value = True
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(label_key='label', example_weight_key='fixed_float')
      ],
      slicing_specs=[
          config.SlicingSpec(),
          config.SlicingSpec(feature_keys=['fixed_string']),
      ],
      metrics_specs=metric_specs.specs_from_metrics([
          calibration.MeanLabel('mean_label'),
          calibration.MeanPrediction('mean_prediction')
      ]),
      options=options)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_model=eval_shared_model),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_model=eval_shared_model)
  ]

  # fixed_float used as example_weight key
  examples = [
      self._makeExample(
          prediction=0.2,
          label=1.0,
          fixed_int=1,
          fixed_float=1.0,
          fixed_string='fixed_string1'),
      self._makeExample(
          prediction=0.8,
          label=0.0,
          fixed_int=1,
          fixed_float=1.0,
          fixed_string='fixed_string1'),
      self._makeExample(
          prediction=0.5,
          label=0.0,
          fixed_int=2,
          fixed_float=2.0,
          fixed_string='fixed_string2')
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 3)
        slices = {}
        for slice_key, value in got:
          slices[slice_key] = value
        overall_slice = ()
        fixed_string1_slice = (('fixed_string', b'fixed_string1'),)
        fixed_string2_slice = (('fixed_string', b'fixed_string2'),)
        self.assertCountEqual(
            list(slices.keys()),
            [overall_slice, fixed_string1_slice, fixed_string2_slice])
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key = metric_types.MetricKey(name='mean_label')
        pred_key = metric_types.MetricKey(name='mean_prediction')
        self.assertDictElementsWithTDistributionAlmostEqual(
            slices[overall_slice], {
                example_count_key: 3,
                weighted_example_count_key: 4.0,
                label_key: (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                pred_key: (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
            })
        self.assertDictElementsWithTDistributionAlmostEqual(
            slices[fixed_string1_slice], {
                example_count_key: 2,
                weighted_example_count_key: 2.0,
                label_key: (1.0 + 0.0) / (1.0 + 1.0),
                pred_key: (0.2 + 0.8) / (1.0 + 1.0),
            })
        self.assertDictElementsWithTDistributionAlmostEqual(
            slices[fixed_string2_slice], {
                example_count_key: 1,
                weighted_example_count_key: 2.0,
                label_key: (2 * 0.0) / 2.0,
                pred_key: (2 * 0.5) / 2.0,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testToComputations(self):
  computations = metric_specs.to_computations(
      metric_specs.specs_from_metrics(
          {
              'output_name': [
                  tf.keras.metrics.MeanSquaredError('mse'),
                  # Add a loss exactly the same as the metric
                  # (https://github.com/tensorflow/tfx/issues/1550)
                  tf.keras.losses.MeanSquaredError(name='loss'),
                  calibration.MeanLabel('mean_label')
              ]
          },
          model_names=['model_name'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(
              macro_average=True, class_weights={
                  0: 1.0,
                  1: 1.0
              })),
      config.EvalConfig())

  keys = []
  for m in computations:
    for k in m.keys:
      if not k.name.startswith('_'):
        keys.append(k)
  self.assertLen(keys, 11)
  self.assertIn(
      metric_types.MetricKey(name='example_count', model_name='model_name'),
      keys)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name',
          output_name='output_name'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse', model_name='model_name', output_name='output_name'),
      keys)
  self.assertIn(
      metric_types.MetricKey(
          name='loss',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='loss',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='loss', model_name='model_name', output_name='output_name'),
      keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name'), keys)
def testEvaluateWithMultiClassModel(self):
  n_classes = 3
  temp_export_dir = self._getExportDir()
  _, export_dir = dnn_classifier.simple_dnn_classifier(
      None, temp_export_dir, n_classes=n_classes)

  # Add example_count and weighted_example_count
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              location=export_dir, label_key='label',
              example_weight_key='age')
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=metric_specs.specs_from_metrics(
          [calibration.MeanLabel('mean_label')],
          binarize=config.BinarizationOptions(class_ids=range(n_classes))))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_models=[eval_shared_model]),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  examples = [
      self._makeExample(age=1.0, language='english', label=0),
      self._makeExample(age=2.0, language='chinese', label=1),
      self._makeExample(age=3.0, language='english', label=2),
      self._makeExample(age=4.0, language='chinese', label=1),
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key_class_0 = metric_types.MetricKey(
            name='mean_label', sub_key=metric_types.SubKey(class_id=0))
        label_key_class_1 = metric_types.MetricKey(
            name='mean_label', sub_key=metric_types.SubKey(class_id=1))
        label_key_class_2 = metric_types.MetricKey(
            name='mean_label', sub_key=metric_types.SubKey(class_id=2))
        self.assertEqual(got_slice_key, ())
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                example_count_key: 4,
                weighted_example_count_key: (1.0 + 2.0 + 3.0 + 4.0),
                label_key_class_0: (1 * 1.0 + 0 * 2.0 + 0 * 3.0 + 0 * 4.0) /
                                   (1.0 + 2.0 + 3.0 + 4.0),
                label_key_class_1: (0 * 1.0 + 1 * 2.0 + 0 * 3.0 + 1 * 4.0) /
                                   (1.0 + 2.0 + 3.0 + 4.0),
                label_key_class_2: (0 * 1.0 + 0 * 2.0 + 1 * 3.0 + 0 * 4.0) /
                                   (1.0 + 2.0 + 3.0 + 4.0)
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testRunModelAnalysisWithQueryBasedMetrics(self):
  input_layer = tf.keras.layers.Input(shape=(1,), name='age')
  output_layer = tf.keras.layers.Dense(
      1, activation=tf.nn.sigmoid)(input_layer)
  model = tf.keras.models.Model(input_layer, output_layer)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.binary_crossentropy)

  features = {'age': [[20.0]]}
  labels = [[1]]
  example_weights = [1.0]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
  model.fit(dataset, steps_per_epoch=1)

  model_location = os.path.join(self._getTempDir(), 'export_dir')
  model.save(model_location, save_format='tf')

  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0),
      self._makeExample(age=3.0, language='english', label=0.0),
      self._makeExample(age=5.0, language='chinese', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[
          config.ModelSpec(location=model_location, label_key='label')
      ],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs,
      metrics_specs=metric_specs.specs_from_metrics(
          [ndcg.NDCG(gain_key='age', name='ndcg')],
          binarize=config.BinarizationOptions(top_k_list=[1]),
          query_key='language'))
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, tags=[tf.saved_model.SERVING])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[eval_shared_model],
      evaluators=[
          metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
              eval_config=eval_config,
              eval_shared_models=[eval_shared_model])
      ])

  self.assertEqual(eval_result.config.model_specs[0].location, model_location)
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertLen(eval_result.slicing_metrics, 1)
  got_slice_key, got_metrics = eval_result.slicing_metrics[0]
  self.assertEqual(got_slice_key, ())
  self.assertIn('', got_metrics)  # output_name
  got_metrics = got_metrics['']
  expected_metrics = {
      '': {
          'example_count': True,
          'weighted_example_count': True,
      },
      'topK:1': {
          'ndcg': True,
      },
  }
  for group in expected_metrics:
    self.assertIn(group, got_metrics)
    for k in expected_metrics[group]:
      self.assertIn(k, got_metrics[group])
def testEvaluateWithSlicing(self):
  temp_export_dir = self._getExportDir()
  _, export_dir = (
      fixed_prediction_estimator_extra_fields
      .simple_fixed_prediction_estimator_extra_fields(None, temp_export_dir))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              location=export_dir,
              label_key='label',
              example_weight_key='fixed_float')
      ],
      slicing_specs=[
          config.SlicingSpec(),
          config.SlicingSpec(feature_keys=['fixed_string']),
      ],
      metrics_specs=metric_specs.specs_from_metrics([
          calibration.MeanLabel('mean_label'),
          calibration.MeanPrediction('mean_prediction')
      ]))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir)
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model=eval_shared_model),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  # fixed_float used as example_weight key
  examples = [
      self._makeExample(
          prediction=0.2,
          label=1.0,
          fixed_int=1,
          fixed_float=1.0,
          fixed_string='fixed_string1'),
      self._makeExample(
          prediction=0.8,
          label=0.0,
          fixed_int=1,
          fixed_float=1.0,
          fixed_string='fixed_string1'),
      self._makeExample(
          prediction=0.5,
          label=0.0,
          fixed_int=2,
          fixed_float=2.0,
          fixed_string='fixed_string2')
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 3)
        slices = {}
        for slice_key, value in got:
          slices[slice_key] = value
        overall_slice = ()
        fixed_string1_slice = (('fixed_string', b'fixed_string1'),)
        fixed_string2_slice = (('fixed_string', b'fixed_string2'),)
        self.assertCountEqual(
            list(slices.keys()),
            [overall_slice, fixed_string1_slice, fixed_string2_slice])
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key = metric_types.MetricKey(name='mean_label')
        pred_key = metric_types.MetricKey(name='mean_prediction')
        self.assertDictElementsAlmostEqual(
            slices[overall_slice], {
                example_count_key: 3,
                weighted_example_count_key: 4.0,
                label_key: (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                pred_key: (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
            })
        self.assertDictElementsAlmostEqual(
            slices[fixed_string1_slice], {
                example_count_key: 2,
                weighted_example_count_key: 2.0,
                label_key: (1.0 + 0.0) / (1.0 + 1.0),
                pred_key: (0.2 + 0.8) / (1.0 + 1.0),
            })
        self.assertDictElementsAlmostEqual(
            slices[fixed_string2_slice], {
                example_count_key: 1,
                weighted_example_count_key: 2.0,
                label_key: (2 * 0.0) / 2.0,
                pred_key: (2 * 0.5) / 2.0,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testSpecsFromMetrics(self):
  metrics_specs = metric_specs.specs_from_metrics(
      {
          'output_name1': [
              tf.keras.metrics.MeanSquaredError('mse'),
              tf.keras.losses.MeanAbsoluteError(name='mae'),
              calibration.MeanLabel('mean_label')
          ],
          'output_name2': [
              tf.keras.metrics.RootMeanSquaredError('rmse'),
              tf.keras.losses.MeanAbsolutePercentageError(name='mape'),
              calibration.MeanPrediction('mean_prediction')
          ]
      },
      model_names=['model_name1', 'model_name2'],
      binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
      aggregate=config.AggregationOptions(macro_average=True))

  self.assertLen(metrics_specs, 5)
  self.assertProtoEquals(
      metrics_specs[0],
      config.MetricsSpec(metrics=[
          config.MetricConfig(
              class_name='ExampleCount',
              config=json.dumps({'name': 'example_count'})),
      ]))
  self.assertProtoEquals(
      metrics_specs[1],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1']))
  self.assertProtoEquals(
      metrics_specs[2],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({
                      'name': 'mse',
                      'dtype': 'float32'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanAbsoluteError',
                  module=metric_specs._TF_LOSSES_MODULE,
                  config=json.dumps({
                      'reduction': 'auto',
                      'name': 'mae'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True)))
  self.assertProtoEquals(
      metrics_specs[3],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2']))
  self.assertProtoEquals(
      metrics_specs[4],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='RootMeanSquaredError',
                  config=json.dumps({
                      'name': 'rmse',
                      'dtype': 'float32'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanAbsolutePercentageError',
                  module=metric_specs._TF_LOSSES_MODULE,
                  config=json.dumps({
                      'reduction': 'auto',
                      'name': 'mape'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanPrediction',
                  config=json.dumps({'name': 'mean_prediction'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True)))
def testEvaluateWithMultiOutputModel(self):
  temp_export_dir = self._getExportDir()
  _, export_dir = multi_head.simple_multi_head(None, temp_export_dir)

  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              location=export_dir,
              label_keys={
                  'chinese_head': 'chinese_label',
                  'english_head': 'english_label',
                  'other_head': 'other_label'
              },
              example_weight_keys={
                  'chinese_head': 'age',
                  'english_head': 'age',
                  'other_head': 'age'
              })
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=metric_specs.specs_from_metrics({
          'chinese_head': [calibration.MeanLabel('mean_label')],
          'english_head': [calibration.MeanLabel('mean_label')],
          'other_head': [calibration.MeanLabel('mean_label')],
      }))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_models=[eval_shared_model]),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  examples = [
      self._makeExample(
          age=1.0,
          language='english',
          english_label=1.0,
          chinese_label=0.0,
          other_label=0.0),
      self._makeExample(
          age=1.0,
          language='chinese',
          english_label=0.0,
          chinese_label=1.0,
          other_label=0.0),
      self._makeExample(
          age=2.0,
          language='english',
          english_label=1.0,
          chinese_label=0.0,
          other_label=0.0),
      self._makeExample(
          age=2.0,
          language='other',
          english_label=0.0,
          chinese_label=1.0,
          other_label=1.0),
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        example_count_key = metric_types.MetricKey(name='example_count')
        chinese_weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count', output_name='chinese_head')
        chinese_label_key = metric_types.MetricKey(
            name='mean_label', output_name='chinese_head')
        english_weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count', output_name='english_head')
        english_label_key = metric_types.MetricKey(
            name='mean_label', output_name='english_head')
        other_weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count', output_name='other_head')
        other_label_key = metric_types.MetricKey(
            name='mean_label', output_name='other_head')
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                example_count_key:
                    4,
                chinese_label_key:
                    (0.0 + 1.0 + 2 * 0.0 + 2 * 1.0) / (1.0 + 1.0 + 2.0 + 2.0),
                chinese_weighted_example_count_key:
                    (1.0 + 1.0 + 2.0 + 2.0),
                english_label_key:
                    (1.0 + 0.0 + 2 * 1.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0 + 2.0),
                english_weighted_example_count_key:
                    (1.0 + 1.0 + 2.0 + 2.0),
                other_label_key:
                    (0.0 + 0.0 + 2 * 0.0 + 2 * 1.0) / (1.0 + 1.0 + 2.0 + 2.0),
                other_weighted_example_count_key: (1.0 + 1.0 + 2.0 + 2.0)
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')