def testValidateMetricsCrossSliceThresholdFail(
    self, cross_slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      cross_slicing_specs=cross_slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=(threshold
                                 if cross_slicing_specs is None else None),
                      cross_slice_thresholds=[
                          config.CrossSliceMetricThreshold(
                              cross_slicing_specs=cross_slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)

def testValidateMetricsValueThresholdLowerBoundPass(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(lower_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 2 > 1, OK.
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='weighted_example_count'): 2,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)

def testValidateMetricsChangeThresholdHigherIsBetterFail(self):
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      # Diff = -.333 > 0, NOT OK.
                      threshold=config.MetricThreshold(
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection.HIGHER_IS_BETTER,
                              absolute={'value': 0}))),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.333,
      metric_types.MetricKey(name='mean_prediction', is_diff=True):
          -0.333,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)

def testValidateMetricsMetricValueAndThresholdIgnoreUnmatchedSlice(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)

def testValidateMetricsMetricTDistributionValueAndThreshold(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(lower_bound={'value': 0.9}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='AUC',
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  # Validation fails: the unsampled value (0.8) is below the 0.9 lower bound.
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='auc'):
          types.ValueWithTDistribution(sample_mean=0.91, unsampled_value=0.8)
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        failures {
          metric_key {
            name: "auc"
          }
          metric_value {
            double_value {
              value: 0.8
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  expected.metric_validations_per_slice[0].failures[
      0].metric_threshold.CopyFrom(threshold)
  expected.metric_validations_per_slice[0].slice_key.CopyFrom(
      slicer.serialize_slice_key(slice_key))
  for spec in slicing_specs or [None]:
    if (spec is None or
        slicer.SingleSliceSpec(spec=spec).is_slice_applicable(slice_key)):
      slicing_details = expected.validation_details.slicing_details.add()
      if spec is not None:
        slicing_details.slicing_spec.CopyFrom(spec)
      else:
        slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
      slicing_details.num_matching_slices = 1
  self.assertEqual(result, expected)

def testValidateMetricsChangeThresholdRelativePass(self):
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      # Diff = -.333
                      # Diff% = -.333/.333 = -100% < 0%, OK.
                      threshold=config.MetricThreshold(
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection.LOWER_IS_BETTER,
                              relative={'value': 0}))),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.333,
      metric_types.MetricKey(name='mean_prediction', is_diff=True):
          -0.333,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)

def testValidateMetricsValueThresholdLowerBoundPass(self):
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 2 > 1, OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              lower_bound={'value': 1}))),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='weighted_example_count'): 2,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)

def testGetMissingSlices(self):
  slicing_specs = [
      config.SlicingSpec(),
      config.SlicingSpec(feature_values={'feature1': 'value1'}),
      config.SlicingSpec(feature_values={'feature2': 'value2'})
  ]
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((('feature1', 'value1'),), {
      metric_types.MetricKey(name='weighted_example_count'): 0,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  expected_checks = text_format.Parse(
      """
      validation_ok: true
      validation_details {
        slicing_details {
          slicing_spec {
            feature_values {
              key: "feature1"
              value: "value1"
            }
          }
          num_matching_slices: 1
        }
      }""", validation_result_pb2.ValidationResult())
  self.assertProtoEquals(expected_checks, result)
  missing = metrics_validator.get_missing_slices(
      result.validation_details.slicing_details, eval_config)
  self.assertLen(missing, 2)
  self.assertProtoEquals(missing[0], slicing_specs[0])
  self.assertProtoEquals(missing[1], slicing_specs[2])

def testValidateMetricsMetricValueAndThreshold(self):
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': 1}))),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        slice_key {
        }
        failures {
          metric_key {
            name: "weighted_example_count"
          }
          metric_threshold {
            value_threshold {
              upper_bound {
                value: 1.0
              }
            }
          }
          metric_value {
            double_value {
              value: 1.5
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  self.assertEqual(result, expected)

def testValidateMetricsMetricValueAndThreshold(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        failures {
          metric_key {
            name: "weighted_example_count"
          }
          metric_value {
            double_value {
              value: 1.5
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  expected.metric_validations_per_slice[0].failures[
      0].metric_threshold.CopyFrom(threshold)
  expected.metric_validations_per_slice[0].slice_key.CopyFrom(
      slicer.serialize_slice_key(slice_key))
  self.assertEqual(result, expected)

def testValidateMetricsInvalidThreshold(self):
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              thresholds={
                  'invalid_threshold':
                      config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              lower_bound={'value': 0.2}))
              })
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        slice_key {
        }
        failures {
          metric_key {
            name: "invalid_threshold"
          }
          metric_threshold {
            value_threshold {
              lower_bound {
                value: 0.2
              }
            }
          }
          message: 'Metric not found.'
        }
      }""", validation_result_pb2.ValidationResult())
  self.assertProtoEquals(expected, result)

def testValidateMetricsChangeThresholdRelativeFail(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.LOWER_IS_BETTER,
          relative={'value': -2}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      # Diff = -.333
                      # Diff% = -.333/.333 = -100% < -200%, NOT OK.
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ])
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.333,
      metric_types.MetricKey(name='mean_prediction', is_diff=True):
          -0.333,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)

def testValidateMetricsChangeThresholdHigherIsBetterPass(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      # Diff = -.333 > -1, OK.
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ])
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.333,
      metric_types.MetricKey(name='mean_prediction', is_diff=True):
          -0.333,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)

def testValidateMetricsDivByZero(self):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.HIGHER_IS_BETTER,
          relative={'value': 0.1}))
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(name='candidate'),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ])
              ],
              model_names=['baseline', 'candidate']),
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.0,
      metric_types.MetricKey(
          name='mean_prediction', model_name='candidate', is_diff=True):
          0.1,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)

def testValidateMetricsMetricTDistributionChangeAndThreshold(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.LOWER_IS_BETTER,
          absolute={'value': -1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='AUC',
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (
      slice_key,
      {
          # This is the mean of the diff.
          metric_types.MetricKey(name='auc', model_name='baseline'):
              types.ValueWithTDistribution(
                  sample_mean=0.91, unsampled_value=0.6),
          metric_types.MetricKey(name='auc', is_diff=True):
              types.ValueWithTDistribution(
                  sample_mean=0.1, unsampled_value=0.1),
      })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        failures {
          metric_key {
            name: "auc"
            is_diff: true
          }
          metric_value {
            double_value {
              value: 0.1
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  expected.metric_validations_per_slice[0].failures[
      0].metric_threshold.CopyFrom(threshold)
  expected.metric_validations_per_slice[0].slice_key.CopyFrom(
      slicer.serialize_slice_key(slice_key))
  for spec in slicing_specs or [None]:
    if (spec is None or
        slicer.SingleSliceSpec(spec=spec).is_slice_applicable(slice_key)):
      slicing_details = expected.validation_details.slicing_details.add()
      if spec is not None:
        slicing_details.slicing_spec.CopyFrom(spec)
      else:
        slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
      slicing_details.num_matching_slices = 1
  self.assertAlmostEqual(result, expected)

def testValidateMetricsChangeThresholdEqualPass(
    self, slicing_specs, slice_key):
  # Change thresholds.
  threshold1 = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -.333},
          relative={'value': -.333}))
  threshold2 = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.LOWER_IS_BETTER,
          absolute={'value': -.333},
          relative={'value': -.333}))
  # Value thresholds.
  threshold3 = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(lower_bound={'value': 1}))
  threshold4 = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(name='candidate'),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      # Diff = -.333 == -.333, OK.
                      threshold=threshold1 if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold1)
                      ]),
                  config.MetricConfig(
                      class_name='MeanLabel',
                      # Diff = -.333 == -.333, OK.
                      threshold=threshold2 if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold2)
                      ]),
                  config.MetricConfig(
                      class_name='ExampleCount',
                      # 1 == 1, OK.
                      threshold=threshold3 if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold3)
                      ]),
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1 == 1, OK.
                      threshold=threshold4 if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold4)
                      ]),
              ],
              model_names=['candidate']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='mean_prediction', model_name='candidate'):
          0.677,
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          1,
      metric_types.MetricKey(
          name='mean_prediction', is_diff=True, model_name='candidate'):
          -0.333,
      metric_types.MetricKey(name='mean_label', model_name='candidate'):
          0.677,
      metric_types.MetricKey(name='mean_label', model_name='baseline'):
          1,
      metric_types.MetricKey(
          name='mean_label', is_diff=True, model_name='candidate'):
          -0.333,
      metric_types.MetricKey(name='example_count', model_name='candidate'):
          1,
      metric_types.MetricKey(
          name='weighted_example_count', model_name='candidate'):
          1,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)