def testAucUnweighted(self):
  """Checks AUC and AUPRC (with CI bounds) on a fixed-prediction estimator.

  Five unweighted examples with known predictions/labels; expected values
  are the Riemann-sum AUC/AUPRC estimates with their lower/upper bounds.
  """
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))
  examples = [
      self._makeExample(prediction=0.0000, label=0.0000),
      self._makeExample(prediction=0.0000, label=1.0000),
      self._makeExample(prediction=0.7000, label=1.0000),
      self._makeExample(prediction=0.8000, label=0.0000),
      self._makeExample(prediction=1.0000, label=1.0000),
  ]
  expected_values_dict = {
      metric_keys.AUC: 0.58333,
      metric_keys.lower_bound(metric_keys.AUC): 0.5,
      metric_keys.upper_bound(metric_keys.AUC): 0.66667,
      # BUG FIX: the original dict listed lower_bound(AUPRC) twice, so the
      # 0.74075 entry was silently overwritten by 0.70000 and never checked.
      # 0.74075 is the AUPRC point estimate and belongs under the plain
      # AUPRC key, mirroring the AUC entries above.
      metric_keys.AUPRC: 0.74075,
      metric_keys.lower_bound(metric_keys.AUPRC): 0.70000,
      metric_keys.upper_bound(metric_keys.AUPRC): 0.77778,
  }
  self._runTest(
      examples, eval_export_dir,
      [post_export_metrics.auc(),
       post_export_metrics.auc(curve='PR')], expected_values_dict)
def testConvertSliceMetricsToProtoEmptyMetrics(self):
  """An error-only metrics dict converts to a proto carrying debug_message."""
  slice_key = _make_slice_key('age', 5, 'language', 'english', 'price', 0.3)
  metrics_with_error = {metric_keys.ERROR_METRIC: 'error_message'}
  callbacks = [
      post_export_metrics.auc(),
      post_export_metrics.auc(curve='PR'),
  ]

  actual_metrics = (
      metrics_plots_and_validations_writer.convert_slice_metrics_to_proto(
          (slice_key, metrics_with_error), callbacks))

  # Build the expected proto: serialized slice key plus the error metric
  # surfaced as a debug message.
  expected_metrics = metrics_for_slice_pb2.MetricsForSlice()
  expected_metrics.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))
  expected_metrics.metrics[
      metric_keys.ERROR_METRIC].debug_message = 'error_message'
  self.assertProtoEquals(expected_metrics, actual_metrics)
def testSerializeMetrics_emptyMetrics(self):
  """Serializing an error-only metrics dict yields only a debug_message."""
  slice_key = _make_slice_key('age', 5, 'language', 'english', 'price', 0.3)
  metrics_with_error = {metric_keys.ERROR_METRIC: 'error_message'}
  callbacks = [
      post_export_metrics.auc(),
      post_export_metrics.auc(curve='PR'),
  ]

  serialized = metrics_and_plots_serialization._serialize_metrics(
      (slice_key, metrics_with_error), callbacks)

  # Expected proto: the serialized slice key plus the error surfaced as a
  # debug message under ERROR_METRIC.
  expected_metrics = metrics_for_slice_pb2.MetricsForSlice()
  expected_metrics.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))
  expected_metrics.metrics[
      metric_keys.ERROR_METRIC].debug_message = 'error_message'
  self.assertProtoEquals(
      expected_metrics,
      metrics_for_slice_pb2.MetricsForSlice.FromString(serialized))
def testAucUnweightedSerialization(self):
  """Checks AUPRC values and their serialization into a BoundedValue proto."""
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))
  examples = [
      self._makeExample(prediction=0.0000, label=0.0000),
      self._makeExample(prediction=0.0000, label=1.0000),
      self._makeExample(prediction=0.7000, label=1.0000),
      self._makeExample(prediction=0.8000, label=0.0000),
      self._makeExample(prediction=1.0000, label=1.0000),
  ]
  expected_values_dict = {
      # BUG FIX: lower_bound(AUPRC) appeared twice in the original dict, so
      # the 0.74075 entry was silently dropped. It is the AUPRC point
      # estimate (matching `value { value: 0.7407472 }` in the proto below)
      # and belongs under the plain AUPRC key.
      metric_keys.AUPRC: 0.74075,
      metric_keys.lower_bound(metric_keys.AUPRC): 0.70000,
      metric_keys.upper_bound(metric_keys.AUPRC): 0.77778,
  }
  auc_metric = post_export_metrics.auc(curve='PR')

  def check_result(got):  # pylint: disable=invalid-name
    try:
      self.assertEqual(1, len(got), 'got: %s' % got)
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertDictElementsAlmostEqual(value, expected_values_dict)

      # Check serialization too.
      # Note that we can't just make this a dict, since proto maps
      # allow uninitialized key access, i.e. they act like defaultdicts.
      output_metrics = metrics_for_slice_pb2.MetricsForSlice().metrics
      auc_metric.populate_stats_and_pop(value, output_metrics)
      self.assertProtoEquals(
          """
          bounded_value {
            lower_bound {
              value: 0.6999999
            }
            upper_bound {
              value: 0.7777776
            }
            value {
              value: 0.7407472
            }
          }
          """, output_metrics[metric_keys.AUPRC])
    except AssertionError as err:
      raise util.BeamAssertException(err)

  self._runTestWithCustomCheck(
      examples,
      eval_export_dir, [auc_metric],
      custom_metrics_check=check_result)
def testMetricComputedBeamCounter(self):
  """The per-metric Beam counter is incremented exactly once for AUC."""
  with beam.Pipeline() as pipeline:
    auc_callback = post_export_metrics.auc()
    _ = pipeline | counter_util.IncrementMetricsComputationCounters(
        [auc_callback])
  result = pipeline.run()

  # Query the pipeline's committed counters for the AUC computation counter.
  metric_filter = beam.metrics.metric.MetricsFilter().with_namespace(
      constants.METRICS_NAMESPACE).with_name('metric_computed_auc')
  counters = result.metrics().query(filter=metric_filter)['counters']
  self.assertEqual(counters[0].committed, 1)
def testSerializeMetrics(self):
  """Metrics with AUC/AUPRC bounds serialize to the expected proto bytes.

  Plain floats serialize as double_value; the AUC/AUPRC entries (keyed via
  _full_key) are folded into bounded_value messages with RIEMANN_SUM
  methodology by the auc post-export-metric callbacks.
  """
  slice_key = _make_slice_key('age', 5, 'language', 'english', 'price', 0.3)
  slice_metrics = {
      'accuracy': 0.8,
      _full_key(metric_keys.AUPRC): 0.1,
      _full_key(metric_keys.lower_bound(metric_keys.AUPRC)): 0.05,
      _full_key(metric_keys.upper_bound(metric_keys.AUPRC)): 0.17,
      _full_key(metric_keys.AUC): 0.2,
      _full_key(metric_keys.lower_bound(metric_keys.AUC)): 0.1,
      _full_key(metric_keys.upper_bound(metric_keys.AUC)): 0.3
  }
  # $auc / $auprc are substituted with the full metric keys so the template
  # stays valid if the key prefixes change.
  expected_metrics_for_slice = text_format.Parse(
      string.Template("""
      slice_key {
        single_slice_keys {
          column: 'age'
          int64_value: 5
        }
        single_slice_keys {
          column: 'language'
          bytes_value: 'english'
        }
        single_slice_keys {
          column: 'price'
          float_value: 0.3
        }
      }
      metrics {
        key: "accuracy"
        value {
          double_value {
            value: 0.8
          }
        }
      }
      metrics {
        key: "$auc"
        value {
          bounded_value {
            lower_bound {
              value: 0.1
            }
            upper_bound {
              value: 0.3
            }
            value {
              value: 0.2
            }
            methodology: RIEMANN_SUM
          }
        }
      }
      metrics {
        key: "$auprc"
        value {
          bounded_value {
            lower_bound {
              value: 0.05
            }
            upper_bound {
              value: 0.17
            }
            value {
              value: 0.1
            }
            methodology: RIEMANN_SUM
          }
        }
      }""").substitute(
          auc=_full_key(metric_keys.AUC), auprc=_full_key(metric_keys.AUPRC)),
      metrics_for_slice_pb2.MetricsForSlice())
  got = metrics_and_plots_evaluator._serialize_metrics(
      (slice_key, slice_metrics),
      [post_export_metrics.auc(),
       post_export_metrics.auc(curve='PR')])
  self.assertProtoEquals(
      expected_metrics_for_slice,
      metrics_for_slice_pb2.MetricsForSlice.FromString(got))
def testAssertGeneralMetricsComputedWithBeamAre(self):
  """End-to-end check of sliced metrics via the Beam test helper.

  Exports a fixed-prediction estimator with extra fields, feeds six examples
  split across a 'negative_slice' and a 'positive_slice' on fixed_string,
  and asserts average_loss / mae (and AUC on the overall slice) per slice.
  """
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator_extra_fields
      .simple_fixed_prediction_estimator_extra_fields(None,
                                                      temp_eval_export_dir))
  examples = [
      self.makeExample(
          prediction=0.0,
          label=0.0,
          fixed_string='negative_slice',
          fixed_float=0.0,
          fixed_int=0),
      self.makeExample(
          prediction=0.2,
          label=0.0,
          fixed_string='negative_slice',
          fixed_float=0.0,
          fixed_int=0),
      self.makeExample(
          prediction=0.4,
          label=0.0,
          fixed_string='negative_slice',
          fixed_float=0.0,
          fixed_int=0),
      self.makeExample(
          prediction=0.8,
          label=1.0,
          fixed_string='positive_slice',
          fixed_float=0.0,
          fixed_int=0),
      self.makeExample(
          prediction=0.9,
          label=1.0,
          fixed_string='positive_slice',
          fixed_float=0.0,
          fixed_int=0),
      self.makeExample(
          prediction=1.0,
          label=1.0,
          fixed_string='positive_slice',
          fixed_float=0.0,
          fixed_int=0),
  ]

  expected_slice_metrics = {}
  # Overall slice: average squared error over all six examples; mae is the
  # mean absolute prediction error.
  expected_slice_metrics[()] = {
      'average_loss': (0.00 + 0.04 + 0.16 + 0.04 + 0.01 + 0.00) / 6.0,
      'mae': 0.15,
      # Note that we don't check the exact value because of numerical errors.
      metric_keys.AUC: tfma_unit.BoundedValue(0.98, 1.00),
  }
  # We don't check AUC for the positive / negative only slices because
  # it's not clear what the value should be.
  expected_slice_metrics[(('fixed_string', b'negative_slice'),)] = {
      'average_loss': (0.00 + 0.04 + 0.16) / 3.0,
      'mae': 0.2,
  }
  expected_slice_metrics[(('fixed_string', b'positive_slice'),)] = {
      'average_loss': (0.04 + 0.01 + 0.00) / 3.0,
      'mae': 0.1,
  }

  def add_metrics(features, predictions, labels):
    # Custom metric callback: adds mean absolute error as 'mae'.
    del features
    metric_ops = {
        'mae':
            tf.metrics.mean_absolute_error(labels,
                                           predictions['predictions']),
    }
    return metric_ops

  with beam.Pipeline() as pipeline:
    examples_pcollection = pipeline | 'Create' >> beam.Create(examples)
    self.assertGeneralMetricsComputedWithBeamAre(
        eval_saved_model_path=eval_export_dir,
        examples_pcollection=examples_pcollection,
        slice_spec=[
            slicer.SingleSliceSpec(),
            slicer.SingleSliceSpec(columns=['fixed_string'])
        ],
        add_metrics_callbacks=[add_metrics,
                               post_export_metrics.auc()],
        expected_slice_metrics=expected_slice_metrics)
def testSerializeMetricsRanges(self):
  """Serialization of t-distribution values and AUC/AUPRC bounds.

  A ValueWithTDistribution serializes to a bounded_value with
  POISSON_BOOTSTRAP methodology; the AUC/AUPRC key triples serialize to
  bounded_value messages with RIEMANN_SUM methodology.
  """
  slice_key = _make_slice_key('age', 5, 'language', 'english', 'price', 0.3)
  slice_metrics = {
      # (sample_mean, sample_standard_deviation, degrees_of_freedom,
      #  unsampled_value) — the confidence bounds below derive from these.
      'accuracy': types.ValueWithTDistribution(0.8, 0.1, 9, 0.8),
      metric_keys.AUPRC: 0.1,
      metric_keys.lower_bound_key(metric_keys.AUPRC): 0.05,
      metric_keys.upper_bound_key(metric_keys.AUPRC): 0.17,
      metric_keys.AUC: 0.2,
      metric_keys.lower_bound_key(metric_keys.AUC): 0.1,
      metric_keys.upper_bound_key(metric_keys.AUC): 0.3
  }
  expected_metrics_for_slice = text_format.Parse(
      string.Template("""
      slice_key {
        single_slice_keys {
          column: 'age'
          int64_value: 5
        }
        single_slice_keys {
          column: 'language'
          bytes_value: 'english'
        }
        single_slice_keys {
          column: 'price'
          float_value: 0.3
        }
      }
      metrics {
        key: "accuracy"
        value {
          bounded_value {
            value {
              value: 0.8
            }
            lower_bound {
              value: 0.5737843
            }
            upper_bound {
              value: 1.0262157
            }
            methodology: POISSON_BOOTSTRAP
          }
        }
      }
      metrics {
        key: "$auc"
        value {
          bounded_value {
            lower_bound {
              value: 0.1
            }
            upper_bound {
              value: 0.3
            }
            value {
              value: 0.2
            }
            methodology: RIEMANN_SUM
          }
        }
      }
      metrics {
        key: "$auprc"
        value {
          bounded_value {
            lower_bound {
              value: 0.05
            }
            upper_bound {
              value: 0.17
            }
            value {
              value: 0.1
            }
            methodology: RIEMANN_SUM
          }
        }
      }""").substitute(auc=metric_keys.AUC, auprc=metric_keys.AUPRC),
      metrics_for_slice_pb2.MetricsForSlice())
  got = metrics_and_plots_serialization._serialize_metrics(
      (slice_key, slice_metrics),
      [post_export_metrics.auc(),
       post_export_metrics.auc(curve='PR')])
  self.assertProtoEquals(
      expected_metrics_for_slice,
      metrics_for_slice_pb2.MetricsForSlice.FromString(got))
def _counter_inc(self, data):
  """Bumps the Beam computation counters for a single AUC callback.

  Args:
    data: Element passed through by the calling transform; unused here.
  """
  auc_callback = post_export_metrics.auc()
  counter_util.update_beam_counters([auc_callback])
def testConvertSliceMetricsToProtoFromLegacyStrings(self):
  """Legacy string metric keys convert to the expected MetricsForSlice.

  Plain floats become double_value; AUC/AUPRC key triples become
  bounded_value messages with RIEMANN_SUM methodology.
  """
  slice_key = _make_slice_key('age', 5, 'language', 'english', 'price', 0.3)
  slice_metrics = {
      'accuracy': 0.8,
      metric_keys.AUPRC: 0.1,
      metric_keys.lower_bound_key(metric_keys.AUPRC): 0.05,
      metric_keys.upper_bound_key(metric_keys.AUPRC): 0.17,
      metric_keys.AUC: 0.2,
      metric_keys.lower_bound_key(metric_keys.AUC): 0.1,
      metric_keys.upper_bound_key(metric_keys.AUC): 0.3
  }
  expected_metrics_for_slice = text_format.Parse(
      string.Template("""
      slice_key {
        single_slice_keys {
          column: 'age'
          int64_value: 5
        }
        single_slice_keys {
          column: 'language'
          bytes_value: 'english'
        }
        single_slice_keys {
          column: 'price'
          float_value: 0.3
        }
      }
      metrics {
        key: "accuracy"
        value {
          double_value {
            value: 0.8
          }
        }
      }
      metrics {
        key: "$auc"
        value {
          bounded_value {
            lower_bound {
              value: 0.1
            }
            upper_bound {
              value: 0.3
            }
            value {
              value: 0.2
            }
            methodology: RIEMANN_SUM
          }
        }
      }
      metrics {
        key: "$auprc"
        value {
          bounded_value {
            lower_bound {
              value: 0.05
            }
            upper_bound {
              value: 0.17
            }
            value {
              value: 0.1
            }
            methodology: RIEMANN_SUM
          }
        }
      }""").substitute(auc=metric_keys.AUC, auprc=metric_keys.AUPRC),
      metrics_for_slice_pb2.MetricsForSlice())
  got = metrics_plots_and_validations_writer.convert_slice_metrics_to_proto(
      (slice_key, slice_metrics),
      [post_export_metrics.auc(),
       post_export_metrics.auc(curve='PR')])
  self.assertProtoEquals(expected_metrics_for_slice, got)
def testConvertSliceMetricsToProtoMetricsRanges(self):
  """t-distribution values convert to bounded_value + confidence_interval.

  A ValueWithTDistribution produces both a POISSON_BOOTSTRAP bounded_value
  and a confidence_interval carrying the raw t-distribution statistics;
  AUC/AUPRC key triples produce RIEMANN_SUM bounded_value messages.
  """
  slice_key = _make_slice_key('age', 5, 'language', 'english', 'price', 0.3)
  slice_metrics = {
      # (sample_mean, sample_standard_deviation, degrees_of_freedom,
      #  unsampled_value) — echoed in t_distribution_value below.
      'accuracy': types.ValueWithTDistribution(0.8, 0.1, 9, 0.8),
      metric_keys.AUPRC: 0.1,
      metric_keys.lower_bound_key(metric_keys.AUPRC): 0.05,
      metric_keys.upper_bound_key(metric_keys.AUPRC): 0.17,
      metric_keys.AUC: 0.2,
      metric_keys.lower_bound_key(metric_keys.AUC): 0.1,
      metric_keys.upper_bound_key(metric_keys.AUC): 0.3
  }
  expected_metrics_for_slice = text_format.Parse(
      string.Template("""
      slice_key {
        single_slice_keys {
          column: 'age'
          int64_value: 5
        }
        single_slice_keys {
          column: 'language'
          bytes_value: 'english'
        }
        single_slice_keys {
          column: 'price'
          float_value: 0.3
        }
      }
      metrics {
        key: "accuracy"
        value {
          bounded_value {
            value {
              value: 0.8
            }
            lower_bound {
              value: 0.5737843
            }
            upper_bound {
              value: 1.0262157
            }
            methodology: POISSON_BOOTSTRAP
          }
          confidence_interval {
            lower_bound {
              value: 0.5737843
            }
            upper_bound {
              value: 1.0262157
            }
            t_distribution_value {
              sample_mean {
                value: 0.8
              }
              sample_standard_deviation {
                value: 0.1
              }
              sample_degrees_of_freedom {
                value: 9
              }
              unsampled_value {
                value: 0.8
              }
            }
          }
        }
      }
      metrics {
        key: "$auc"
        value {
          bounded_value {
            lower_bound {
              value: 0.1
            }
            upper_bound {
              value: 0.3
            }
            value {
              value: 0.2
            }
            methodology: RIEMANN_SUM
          }
        }
      }
      metrics {
        key: "$auprc"
        value {
          bounded_value {
            lower_bound {
              value: 0.05
            }
            upper_bound {
              value: 0.17
            }
            value {
              value: 0.1
            }
            methodology: RIEMANN_SUM
          }
        }
      }""").substitute(auc=metric_keys.AUC, auprc=metric_keys.AUPRC),
      metrics_for_slice_pb2.MetricsForSlice())
  got = metrics_plots_and_validations_writer.convert_slice_metrics_to_proto(
      (slice_key, slice_metrics),
      [post_export_metrics.auc(),
       post_export_metrics.auc(curve='PR')])
  self.assertProtoEquals(expected_metrics_for_slice, got)