def default_regression_specs(
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    loss_functions: Optional[List[Union[tf.keras.metrics.Metric,
                                        tf.keras.losses.Loss]]] = None,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None) -> List[config.MetricsSpec]:
  """Returns default metric specs for regression problems.

  Args:
    model_names: Optional model names (if multi-model evaluation).
    output_names: Optional list of output names (if multi-output model).
    loss_functions: Loss functions to use (if None MSE is used).
    min_value: Min value for calibration plot (if None no plot will be
      created).
    max_value: Max value for calibration plot (if None no plot will be
      created).
  """
  if loss_functions is None:
    loss_functions = [tf.keras.metrics.MeanSquaredError(name='mse')]
  metrics = [
      tf.keras.metrics.Accuracy(name='accuracy'),
      calibration.MeanLabel(name='mean_label'),
      calibration.MeanPrediction(name='mean_prediction'),
      calibration.Calibration(name='calibration'),
  ]
  for fn in loss_functions:
    metrics.append(fn)
  if min_value is not None and max_value is not None:
    metrics.append(
        calibration_plot.CalibrationPlot(
            name='calibration_plot', left=min_value, right=max_value))
  return specs_from_metrics(
      metrics, model_names=model_names, output_names=output_names)
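# Minimal usage sketch (illustrative only, not part of the original module):
# the specs returned above are plain config.MetricsSpec messages, so they can
# be dropped straight into an EvalConfig. The label key and value range below
# are hypothetical.
def _example_regression_eval_config() -> config.EvalConfig:
  return config.EvalConfig(
      model_specs=[config.ModelSpec(label_key='label')],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=default_regression_specs(min_value=0.0, max_value=1.0))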
def testCalibrationPlotWithSchema(self, eval_config, schema, model_names,
                                  output_names, expected_left, expected_range):
  computations = calibration_plot.CalibrationPlot(num_buckets=10).computations(
      eval_config=eval_config,
      schema=schema,
      model_names=model_names,
      output_names=output_names)
  histogram = computations[0]
  self.assertEqual(expected_left, histogram.combiner._left)
  self.assertEqual(expected_range, histogram.combiner._range)
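# Note (inferred from the assertions above, not stated in the original source):
# when an eval_config and schema are supplied, the calibration plot derives its
# bucket range from them; the test verifies this via the private _left and
# _range attributes of the histogram combiner.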
def default_binary_classification_specs(
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    output_weights: Optional[Dict[Text, float]] = None,
    binarize: Optional[config.BinarizationOptions] = None,
    aggregate: Optional[config.AggregationOptions] = None,
    include_loss: bool = True) -> List[config.MetricsSpec]:
  """Returns default metric specs for binary classification problems.

  Args:
    model_names: Optional model names (if multi-model evaluation).
    output_names: Optional list of output names (if multi-output model).
    output_weights: Optional output weights for creating overall metric
      aggregated across outputs (if multi-output model). If a weight is not
      provided for an output, its weight defaults to 0.0 (i.e. output ignored).
    binarize: Optional settings for binarizing multi-class/multi-label metrics.
    aggregate: Optional settings for aggregating multi-class/multi-label
      metrics.
    include_loss: True to include loss.
  """
  metrics = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.AUC(
          name='auc',
          num_thresholds=binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS),
      tf.keras.metrics.AUC(
          name='auc_precison_recall',  # Matches default name used by estimator.
          curve='PR',
          num_thresholds=binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      calibration.MeanLabel(name='mean_label'),
      calibration.MeanPrediction(name='mean_prediction'),
      calibration.Calibration(name='calibration'),
      confusion_matrix_plot.ConfusionMatrixPlot(name='confusion_matrix_plot'),
      calibration_plot.CalibrationPlot(name='calibration_plot')
  ]
  if include_loss:
    metrics.append(tf.keras.metrics.BinaryCrossentropy(name='loss'))
  return specs_from_metrics(
      metrics,
      model_names=model_names,
      output_names=output_names,
      output_weights=output_weights,
      binarize=binarize,
      aggregate=aggregate)
def default_binary_classification_specs(
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    class_ids: Optional[List[int]] = None,
    k_list: Optional[List[int]] = None,
    top_k_list: Optional[List[int]] = None,
    include_loss: bool = True) -> List[config.MetricsSpec]:
  """Returns default metric specs for binary classification problems.

  Args:
    model_names: Optional model names (if multi-model evaluation).
    output_names: Optional list of output names (if multi-output model).
    class_ids: Optional class IDs to compute metrics for particular classes in
      a multi-class model. If output_names are provided, all outputs are
      assumed to use the same class IDs.
    k_list: Optional list of k values to compute metrics for the kth predicted
      values of a multi-class model prediction. If output_names are provided,
      all outputs are assumed to use the same k value.
    top_k_list: Optional list of top_k values to compute metrics for the top k
      predicted values in a multi-class model prediction. If output_names are
      provided, all outputs are assumed to use the same top_k value. Metrics
      and plots will be based on treating each predicted value in the top_k as
      though they were separate predictions.
    include_loss: True to include loss.
  """
  metrics = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.AUC(name='auc'),
      tf.keras.metrics.AUC(name='auc_pr', curve='PR'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      calibration.MeanLabel(name='mean_label'),
      calibration.MeanPrediction(name='mean_prediction'),
      calibration.Calibration(name='calibration'),
      auc_plot.AUCPlot(name='auc_plot'),
      calibration_plot.CalibrationPlot(name='calibration_plot')
  ]
  if include_loss:
    metrics.append(tf.keras.metrics.BinaryCrossentropy(name='loss'))
  return specs_from_metrics(
      metrics,
      model_names=model_names,
      output_names=output_names,
      class_ids=class_ids,
      k_list=k_list,
      top_k_list=top_k_list)
def default_binary_classification_specs(
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    binarize: Optional[config.BinarizationOptions] = None,
    aggregate: Optional[config.AggregationOptions] = None,
    include_loss: bool = True) -> List[config.MetricsSpec]:
  """Returns default metric specs for binary classification problems.

  Args:
    model_names: Optional model names (if multi-model evaluation).
    output_names: Optional list of output names (if multi-output model).
    binarize: Optional settings for binarizing multi-class/multi-label metrics.
    aggregate: Optional settings for aggregating multi-class/multi-label
      metrics.
    include_loss: True to include loss.
  """
  metrics = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.AUC(name='auc'),
      tf.keras.metrics.AUC(name='auc_pr', curve='PR'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      calibration.MeanLabel(name='mean_label'),
      calibration.MeanPrediction(name='mean_prediction'),
      calibration.Calibration(name='calibration'),
      auc_plot.AUCPlot(name='auc_plot'),
      calibration_plot.CalibrationPlot(name='calibration_plot')
  ]
  if include_loss:
    metrics.append(tf.keras.metrics.BinaryCrossentropy(name='loss'))
  return specs_from_metrics(
      metrics,
      model_names=model_names,
      output_names=output_names,
      binarize=binarize,
      aggregate=aggregate)
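# Usage sketch (illustrative only, not part of the original module): the
# binary classification defaults can be requested per model and the loss
# omitted. The model names below are hypothetical; only parameters from the
# signature above are used.
def _example_binary_classification_specs() -> List[config.MetricsSpec]:
  return default_binary_classification_specs(
      model_names=['candidate', 'baseline'], include_loss=False)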
def testCalibrationPlot(self):
  computations = calibration_plot.CalibrationPlot(
      num_buckets=10).computations()
  histogram = computations[0]
  plot = computations[1]

  example1 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([1.0])
  }
  example2 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.8]),
      'example_weights': np.array([2.0])
  }
  example3 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([3.0])
  }
  example4 = {
      'labels': np.array([1.0]),
      'predictions': np.array([-0.1]),
      'example_weights': np.array([4.0])
  }
  example5 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([5.0])
  }
  example6 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.8]),
      'example_weights': np.array([6.0])
  }
  example7 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([7.0])
  }
  example8 = {
      'labels': np.array([1.0]),
      'predictions': np.array([1.1]),
      'example_weights': np.array([8.0])
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([
            example1, example2, example3, example4, example5, example6,
            example7, example8
        ])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputePlot' >> beam.Map(lambda x: (x[0], plot.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_plots = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_plots, 1)
        key = metric_types.PlotKey(name='calibration_plot')
        self.assertIn(key, got_plots)
        got_plot = got_plots[key]
        self.assertProtoEquals(
            """
            buckets {
              lower_threshold_inclusive: -inf
              upper_threshold_exclusive: 0.0
              total_weighted_label { value: 4.0 }
              total_weighted_refined_prediction { value: -0.4 }
              num_weighted_examples { value: 4.0 }
            }
            buckets {
              lower_threshold_inclusive: 0.0
              upper_threshold_exclusive: 0.1
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.1
              upper_threshold_exclusive: 0.2
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.2
              upper_threshold_exclusive: 0.3
              total_weighted_label {}
              total_weighted_refined_prediction { value: 1.6 }
              num_weighted_examples { value: 8.0 }
            }
            buckets {
              lower_threshold_inclusive: 0.3
              upper_threshold_exclusive: 0.4
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.4
              upper_threshold_exclusive: 0.5
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 0.6
              total_weighted_label { value: 5.0 }
              total_weighted_refined_prediction { value: 4.0 }
              num_weighted_examples { value: 8.0 }
            }
            buckets {
              lower_threshold_inclusive: 0.6
              upper_threshold_exclusive: 0.7
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.7
              upper_threshold_exclusive: 0.8
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.8
              upper_threshold_exclusive: 0.9
              total_weighted_label { value: 8.0 }
              total_weighted_refined_prediction { value: 6.4 }
              num_weighted_examples { value: 8.0 }
            }
            buckets {
              lower_threshold_inclusive: 0.9
              upper_threshold_exclusive: 1.0
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              total_weighted_label { value: 8.0 }
              total_weighted_refined_prediction { value: 8.8 }
              num_weighted_examples { value: 8.0 }
            }
            """, got_plot)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
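# Worked check of the expected proto above (derived from the example weights):
# predictions of 0.2 come from example1 (weight 1.0) and example7 (weight 7.0),
# so the [0.2, 0.3) bucket holds num_weighted_examples = 1.0 + 7.0 = 8.0,
# total_weighted_refined_prediction = 0.2 * 1.0 + 0.2 * 7.0 = 1.6, and
# total_weighted_label = 0.0 (both labels are 0.0). The out-of-range
# predictions -0.1 and 1.1 land in the (-inf, 0.0) and [1.0, inf) edge buckets,
# which is why num_buckets=10 produces 12 buckets in total.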
def testEvaluateWithBinaryClassificationModel(self):
  n_classes = 2
  temp_export_dir = self._getExportDir()
  _, export_dir = dnn_classifier.simple_dnn_classifier(
      None, temp_export_dir, n_classes=n_classes)

  # Add mean_label, example_count, weighted_example_count, calibration_plot.
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              location=export_dir, label_key='label', example_weight_key='age')
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=metric_specs.specs_from_metrics([
          calibration.MeanLabel('mean_label'),
          calibration_plot.CalibrationPlot(
              name='calibration_plot', num_buckets=10)
      ]))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_models=[eval_shared_model]),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  examples = [
      self._makeExample(age=1.0, language='english', label=0.0),
      self._makeExample(age=2.0, language='chinese', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics_and_plots = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key = metric_types.MetricKey(name='mean_label')
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                example_count_key: 3,
                weighted_example_count_key: (1.0 + 2.0 + 3.0),
                label_key: (0 * 1.0 + 1 * 2.0 + 0 * 3.0) / (1.0 + 2.0 + 3.0),
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    def check_plots(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_plots = got[0]
        self.assertEqual(got_slice_key, ())
        plot_key = metric_types.PlotKey('calibration_plot')
        self.assertIn(plot_key, got_plots)
        # 10 buckets + 2 for edge cases.
        self.assertLen(got_plots[plot_key].buckets, 12)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics_and_plots[constants.METRICS_KEY],
        check_metrics,
        label='metrics')
    util.assert_that(
        metrics_and_plots[constants.PLOTS_KEY], check_plots, label='plots')