def testMetricSpecsFromKerasSequential(self):
  export_dir = os.path.join(self._getTempDir(), 'export_dir')
  model = tf.keras.models.Sequential([
      tf.keras.layers.InputLayer(input_shape=(1,), name='test'),
      tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
  ])
  model.compile(
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[tf.keras.metrics.MeanSquaredError(name='mse')])
  features = [[0.0], [1.0]]
  labels = [[1], [0]]
  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
  model.fit(dataset, steps_per_epoch=1)
  model.save(export_dir, save_format='tf')

  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

  metrics_specs = (
      keras_util.metrics_specs_from_keras('', eval_shared_model.model_loader))

  # TODO(b/149995449): Keras does not support re-loading metrics with the new
  # API. Re-enable after this is fixed.
  model = eval_shared_model.model_loader.construct_fn(lambda x: None)()
  if not hasattr(model, 'loss_functions'):
    return

  self.assertLen(metrics_specs, 1)
  self.assertProtoEquals(
      self._comparable_spec(metrics_specs[0]),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='BinaryCrossentropy',
                  config=json.dumps(
                      {
                          'from_logits': False,
                          'label_smoothing': 0,
                          'reduction': 'auto',
                          'name': 'binary_crossentropy'
                      },
                      sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({
                      'name': 'mse',
                      'dtype': 'float32'
                  }, sort_keys=True))
          ],
          model_names=['']))
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config.EvalConfig,
    metrics_specs: List[config.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY,
    schema: Optional[schema_pb2.Schema] = None,
    random_seed_for_testing: Optional[int] = None) -> evaluator.Evaluation:
  """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same
      query_key.
    eval_shared_models: Optional dict of shared models keyed by model name.
      Only required if there are metrics to be computed in-graph using the
      model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics')
    or plots_key (e.g. 'plots') depending on what the results_dict contains.
  """
  computations = []
  # Add default metric computations.
  if eval_shared_models:
    for model_name, eval_shared_model in eval_shared_models.items():
      if not eval_shared_model.include_default_metrics:
        continue
      if eval_shared_model.model_type == constants.TF_KERAS:
        keras_specs = keras_util.metrics_specs_from_keras(
            model_name, eval_shared_model.model_loader)
        metrics_specs = keras_specs + metrics_specs[:]
        # TODO(mdreves): Add support for calling keras.evaluate().
      elif (eval_shared_model.model_type == constants.TF_ESTIMATOR and
            eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags):
        # Note that there is the possibility for metric naming collisions here
        # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
        # metric computation performed outside the model). Currently all the
        # overlapping metrics such as AUC that are computed outside the model
        # are all derived metrics so they will override the metrics calculated
        # by the model which is the desired behavior.
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, eval_shared_model.model_loader))
  # Add metric computations from specs.
  computations_from_specs, derived_computations = (
      _filter_and_separate_computations(
          metric_specs.to_computations(
              metrics_specs, eval_config=eval_config, schema=schema)))
  computations.extend(computations_from_specs)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from upstream
  #        extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  extracts = (
      extracts
      | 'Preprocess' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs. If
  #        query_key is used the extract represents multiple examples with the
  #        same query_key, otherwise the extract represents a single example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list of examples if query_key used) input extract turns
  #         into n logical extracts, references to which are replicated once
  #         per applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  _ = (
      extracts.pipeline
      | 'IncrementMetricsSpecsCounters' >>
      counter_util.IncrementMetricsSpecsCounters(metrics_specs), slices_count
      | 'IncrementSliceSpecCounters' >>
      counter_util.IncrementSliceSpecCounters())

  ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

  cross_slice_specs = []
  if eval_config.cross_slicing_specs:
    cross_slice_specs = eval_config.cross_slicing_specs

  # TODO(b/151482616): Make bootstrap and jackknife confidence interval
  # implementations more parallel.

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts
  #         will be keyed by MetricKey/PlotKey and the values will be the
  #         result of the associated computations. A given MetricComputation
  #         can perform computations for multiple keys, but the keys should be
  #         unique across computations.
  sliced_metrics_and_plots = (
      slices
      | 'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
          _ComputePerSlice,
          computations=computations,
          derived_computations=derived_computations,
          baseline_model_name=baseline_model_name,
          cross_slice_specs=cross_slice_specs,
          num_jackknife_samples=ci_params.num_jackknife_samples,
          num_bootstrap_samples=ci_params.num_bootstrap_samples,
          skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
          random_seed_for_testing=random_seed_for_testing))

  if eval_config.options.min_slice_size.value > 1:
    sliced_metrics_and_plots = (
        sliced_metrics_and_plots
        | 'FilterSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.min_slice_size.value))

  sliced_metrics = (
      sliced_metrics_and_plots
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_and_plots
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

  # pylint: enable=no-value-for-parameter

  return {metrics_key: sliced_metrics, plots_key: sliced_plots}
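

# A minimal usage sketch (not part of the library API) illustrating how the
# transform above is typically driven: `extracts` is assumed to be a
# PCollection of Extracts produced by upstream extractors and `eval_config` a
# populated config.EvalConfig. The resulting Evaluation dict is keyed by the
# metrics and plots keys. The helper name below is hypothetical.
def _example_compute_metrics_and_plots(extracts, eval_config):
  evaluation = _ComputeMetricsAndPlots(
      extracts=extracts,
      eval_config=eval_config,
      metrics_specs=list(eval_config.metrics_specs))
  # Each value is a PCollection of (slice_key, results_dict) tuples.
  return evaluation[constants.METRICS_KEY], evaluation[constants.PLOTS_KEY]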
def testMetricSpecsFromKerasWithMultipleOutputs(self):
  export_dir = os.path.join(self._getTempDir(), 'export_dir')
  input_layer = tf.keras.layers.Input(shape=(1,))
  output_layer1 = tf.keras.layers.Dense(1, name='output_1')(input_layer)
  output_layer2 = tf.keras.layers.Dense(1, name='output_2')(input_layer)
  model = tf.keras.models.Model([input_layer],
                                [output_layer1, output_layer2])
  model.compile(
      loss={
          'output_1':
              tf.keras.losses.BinaryCrossentropy(name='binary_crossentropy'),
          'output_2':
              tf.keras.losses.BinaryCrossentropy(name='binary_crossentropy')
      },
      metrics=[tf.keras.metrics.MeanSquaredError(name='mse')])
  features = [[0.0], [1.0]]
  labels = [[1], [0]]
  dataset = tf.data.Dataset.from_tensor_slices((features, {
      'output_1': labels,
      'output_2': labels
  }))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
  model.fit(dataset, steps_per_epoch=1)
  model.save(export_dir, save_format='tf')

  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir)

  metrics_specs = (
      keras_util.metrics_specs_from_keras('', eval_shared_model.model_loader))

  # TODO(b/149995449): Keras does not support re-loading metrics with the new
  # API. Re-enable after this is fixed.
  model = eval_shared_model.model_loader.construct_fn(lambda x: None)()
  if not hasattr(model, 'loss_functions'):
    return

  self.assertLen(metrics_specs, 2)
  self.assertProtoEquals(
      self._comparable_spec(metrics_specs[0]),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='BinaryCrossentropy',
                  config=json.dumps(
                      {
                          'from_logits': False,
                          'label_smoothing': 0,
                          'reduction': 'auto',
                          'name': self._loss_name(model, 'binary_crossentropy',
                                                  'output_1')
                      },
                      sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({
                      'name': 'output_1_mse',
                      'dtype': 'float32'
                  }, sort_keys=True))
          ],
          model_names=[''],
          output_names=['output_1']))
  self.assertProtoEquals(
      self._comparable_spec(metrics_specs[1]),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='BinaryCrossentropy',
                  config=json.dumps(
                      {
                          'from_logits': False,
                          'label_smoothing': 0,
                          'reduction': 'auto',
                          'name': self._loss_name(model, 'binary_crossentropy',
                                                  'output_2')
                      },
                      sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({
                      'name': 'output_2_mse',
                      'dtype': 'float32'
                  }, sort_keys=True))
          ],
          model_names=[''],
          output_names=['output_2']))