def testUpdateConfigWithoutBaselineModelWhenModelNameProvided(self):
  eval_config_pbtxt = """
    model_specs { name: "candidate" }
    model_specs { name: "baseline" is_baseline: true }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
      model_names: "candidate"
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs { name: "candidate" }
    model_specs { name: "baseline" is_baseline: true }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
      model_names: ["candidate"]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config.EvalConfig())

  got_eval_config = config.update_eval_config_with_defaults(
      eval_config, has_baseline=True)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
def testUpdateConfigWithDefaultsSingleModel(self):
  eval_config_pbtxt = """
    model_specs { name: "model1" }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
    }
    metrics_specs {
      metrics { class_name: "MeanLabel" }
      model_names: ["model1"]
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs { name: "" }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
      model_names: [""]
    }
    metrics_specs {
      metrics { class_name: "MeanLabel" }
      model_names: [""]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config.EvalConfig())

  got_eval_config = config.update_eval_config_with_defaults(eval_config)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
def _update_eval_config_with_defaults(
    eval_config: config.EvalConfig,
    eval_shared_model: Optional[types.MaybeMultipleEvalSharedModels]
) -> config.EvalConfig:
  """Returns updated eval config with default values."""
  eval_shared_models = model_util.verify_and_update_eval_shared_models(
      eval_shared_model)
  maybe_add_baseline = eval_shared_models and len(eval_shared_models) == 2
  return config.update_eval_config_with_defaults(
      eval_config, maybe_add_baseline=maybe_add_baseline)
def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model: Optional[Union[types.EvalSharedModel,
                                      Dict[Text,
                                           types.EvalSharedModel]]] = None,
    eval_config: config.EvalConfig = None,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    desired_batch_size: Optional[int] = None,
    serialize: bool = False,
    random_seed_for_testing: Optional[int] = None
) -> List[evaluator.Evaluator]:
  """Returns the default evaluators for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Optional shared model (single-model evaluation) or dict
      of shared models keyed by model name (multi-model evaluation). Only
      required if there are metrics to be computed in-graph using the model.
    eval_config: Eval config.
    compute_confidence_intervals: Deprecated (use eval_config).
    k_anonymization_count: Deprecated (use eval_config).
    desired_batch_size: Optional batch size for batching in combiner.
    serialize: Deprecated.
    random_seed_for_testing: Provide for deterministic tests only.
  """
  disabled_outputs = []
  if eval_config:
    eval_config = config.update_eval_config_with_defaults(eval_config)
    disabled_outputs = eval_config.options.disabled_outputs.values
  if (constants.METRICS_KEY in disabled_outputs and
      constants.PLOTS_KEY in disabled_outputs):
    return []
  if _is_legacy_eval(eval_shared_model, eval_config):
    # Backwards compatibility for previous add_metrics_callbacks
    # implementation.
    if eval_config is not None:
      if eval_config.options.HasField('compute_confidence_intervals'):
        compute_confidence_intervals = (
            eval_config.options.compute_confidence_intervals.value)
      if eval_config.options.HasField('k_anonymization_count'):
        k_anonymization_count = (
            eval_config.options.k_anonymization_count.value)
    return [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
            eval_shared_model,
            compute_confidence_intervals=compute_confidence_intervals,
            k_anonymization_count=k_anonymization_count,
            desired_batch_size=desired_batch_size,
            serialize=serialize,
            random_seed_for_testing=random_seed_for_testing)
    ]
  else:
    return [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=eval_shared_model)
    ]
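# Illustrative sketch, not part of the original module: one plausible way to
# obtain the default evaluators for a single saved model. The model path and
# label key below are hypothetical placeholders; the helper itself exists only
# for illustration.
def _example_default_evaluators_usage():  # pragma: no cover
  example_eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec(label_key='label')],
      metrics_specs=[
          config.MetricsSpec(metrics=[
              config.MetricConfig(class_name='WeightedExampleCount')
          ])
      ])
  example_shared_model = default_eval_shared_model(
      eval_saved_model_path='/tmp/hypothetical/model',
      eval_config=example_eval_config)
  # Returns the v2 MetricsAndPlotsEvaluator unless the legacy path applies.
  return default_evaluators(
      eval_shared_model=example_shared_model,
      eval_config=example_eval_config)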
def testUpdateConfigWithDefaultsDoesNotAutomaticallyAddBaselineModel(self):
  eval_config_pbtxt = """
    model_specs { name: "model1" }
    model_specs { name: "model2" is_baseline: true }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs { name: "model1" }
    model_specs { name: "model2" is_baseline: true }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
      model_names: ["model1", "model2"]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config.EvalConfig())

  got_eval_config = config.update_eval_config_with_defaults(eval_config)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
def testUpdateConfigWithDefaultsAutomaticallyAddsBaselineModel(self):
  eval_config_pbtxt = """
    model_specs { label_key: "my_label" }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs { name: "candidate" label_key: "my_label" }
    model_specs { name: "baseline" label_key: "my_label" is_baseline: true }
    metrics_specs {
      metrics { class_name: "WeightedExampleCount" }
      model_names: ["candidate", "baseline"]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config.EvalConfig())

  got_eval_config = config.update_eval_config_with_defaults(
      eval_config, maybe_add_baseline=True)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
def testUpdateConfigWithDefaultsRemoveBaselineModel(self):
  eval_config_pbtxt = """
    model_specs { name: "candidate" }
    model_specs { name: "baseline" is_baseline: true }
    metrics_specs {
      metrics {
        class_name: "MeanLabel"
        threshold {
          value_threshold { lower_bound { value: 0.9 } }
          change_threshold {
            direction: HIGHER_IS_BETTER
            absolute { value: -1e-10 }
          }
        }
      }
      thresholds {
        key: "my_metric"
        value {
          value_threshold { lower_bound { value: 0.9 } }
          change_threshold {
            direction: HIGHER_IS_BETTER
            absolute { value: -1e-10 }
          }
        }
      }
    }
  """
  eval_config = text_format.Parse(eval_config_pbtxt, config.EvalConfig())

  expected_eval_config_pbtxt = """
    model_specs {}
    metrics_specs {
      metrics {
        class_name: "MeanLabel"
        threshold {
          value_threshold { lower_bound { value: 0.9 } }
        }
      }
      thresholds {
        key: "my_metric"
        value {
          value_threshold { lower_bound { value: 0.9 } }
        }
      }
      model_names: [""]
    }
  """
  expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                           config.EvalConfig())

  got_eval_config = config.update_eval_config_with_defaults(
      eval_config, maybe_remove_baseline=True)
  self.assertProtoEquals(got_eval_config, expected_eval_config)
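# Illustrative sketch, not part of the original tests: the same defaulting can
# be exercised on an EvalConfig built in Python rather than parsed from pbtxt.
# The spec values below are hypothetical.
def _example_update_defaults_usage():  # pragma: no cover
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec(label_key='my_label')],
      metrics_specs=[
          config.MetricsSpec(metrics=[
              config.MetricConfig(class_name='WeightedExampleCount')
          ])
      ])
  # With maybe_add_baseline=True a "candidate"/"baseline" pair is filled in,
  # mirroring testUpdateConfigWithDefaultsAutomaticallyAddsBaselineModel above.
  return config.update_eval_config_with_defaults(
      eval_config, maybe_add_baseline=True)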
def ExtractEvaluateAndWriteResults(  # pylint: disable=invalid-name
    examples: beam.pvalue.PCollection,
    eval_shared_model: Optional[Union[types.EvalSharedModel,
                                      Dict[Text,
                                           types.EvalSharedModel]]] = None,
    eval_config: config.EvalConfig = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None,
    writers: Optional[List[writer.Writer]] = None,
    output_path: Optional[Text] = None,
    display_only_data_location: Optional[Text] = None,
    display_only_file_format: Optional[Text] = None,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    write_config: Optional[bool] = True,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    desired_batch_size: Optional[int] = None,
    random_seed_for_testing: Optional[int] = None) -> beam.pvalue.PDone:
  """PTransform for performing extraction, evaluation, and writing results.

  Users who want to construct their own Beam pipelines instead of using the
  lightweight run_model_analysis functions should use this PTransform.

  Example usage:

    eval_config = tfma.EvalConfig(slicing_specs=[...], metrics_specs=[...])
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=model_location, eval_config=eval_config)
    with beam.Pipeline(runner=...) as p:
      _ = (p
           | 'ReadData' >> beam.io.ReadFromTFRecord(data_location)
           | 'ExtractEvaluateAndWriteResults' >>
           tfma.ExtractEvaluateAndWriteResults(
               eval_shared_model=eval_shared_model,
               eval_config=eval_config,
               ...))
    result = tfma.load_eval_result(output_path=output_path)
    tfma.view.render_slicing_metrics(result)

  Note that the exact serialization format is an internal implementation
  detail and subject to change. Users should only use the TFMA functions to
  write and read the results.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Optional shared model (single-model evaluation) or dict
      of shared models keyed by model name (multi-model evaluation). Only
      required if needed by default extractors, evaluators, or writers and for
      display purposes of the model path.
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no
      writers are provided, default_writers will be used.
    output_path: Path to output metrics and plots results.
    display_only_data_location: Optional path indicating where the examples
      were read from. This is used only for display purposes - data will not
      actually be read from this path.
    display_only_file_format: Optional format of the examples. This is used
      only for display purposes.
    slice_spec: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).
    desired_batch_size: Optional batch size for batching in Predict.
    random_seed_for_testing: Provide for deterministic tests only.

  Raises:
    ValueError: If EvalConfig invalid or matching Extractor not found for an
      Evaluator.

  Returns:
    PDone.
  """
  eval_shared_models = eval_shared_model
  if not isinstance(eval_shared_model, dict):
    eval_shared_models = {'': eval_shared_model}

  if eval_config is None:
    model_specs = []
    for model_name, shared_model in eval_shared_models.items():
      example_weight_key = shared_model.example_weight_key
      example_weight_keys = {}
      if example_weight_key and isinstance(example_weight_key, dict):
        example_weight_keys = example_weight_key
        example_weight_key = ''
      model_specs.append(
          config.ModelSpec(
              name=model_name,
              example_weight_key=example_weight_key,
              example_weight_keys=example_weight_keys))
    slicing_specs = None
    if slice_spec:
      slicing_specs = [s.to_proto() for s in slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = compute_confidence_intervals
    options.k_anonymization_count.value = k_anonymization_count
    if not write_config:
      options.disabled_outputs.values.append(_EVAL_CONFIG_FILE)
    eval_config = config.EvalConfig(
        model_specs=model_specs, slicing_specs=slicing_specs, options=options)
  else:
    eval_config = config.update_eval_config_with_defaults(eval_config)

  config.verify_eval_config(eval_config)

  if not extractors:
    extractors = default_extractors(
        eval_config=eval_config,
        eval_shared_model=eval_shared_model,
        materialize=False,
        desired_batch_size=desired_batch_size)

  if not evaluators:
    evaluators = default_evaluators(
        eval_config=eval_config,
        eval_shared_model=eval_shared_model,
        random_seed_for_testing=random_seed_for_testing)

  for v in evaluators:
    evaluator.verify_evaluator(v, extractors)

  if not writers:
    writers = default_writers(
        output_path=output_path, eval_shared_model=eval_shared_model)

  # pylint: disable=no-value-for-parameter
  _ = (
      examples
      | 'InputsToExtracts' >> InputsToExtracts()
      | 'ExtractAndEvaluate' >> ExtractAndEvaluate(
          extractors=extractors, evaluators=evaluators)
      | 'WriteResults' >> WriteResults(writers=writers))

  if _EVAL_CONFIG_FILE not in eval_config.options.disabled_outputs.values:
    data_location = '<user provided PCollection>'
    if display_only_data_location is not None:
      data_location = display_only_data_location
    file_format = '<unknown>'
    if display_only_file_format is not None:
      file_format = display_only_file_format
    model_locations = {}
    for k, v in eval_shared_models.items():
      model_locations[k] = ('<unknown>'
                            if v is None or v.model_path is None
                            else v.model_path)
    _ = (
        examples.pipeline
        | WriteEvalConfig(eval_config, output_path, data_location,
                          file_format, model_locations))
  # pylint: enable=no-value-for-parameter

  return beam.pvalue.PDone(examples.pipeline)
def default_extractors(  # pylint: disable=invalid-name
    eval_shared_model: Union[types.EvalSharedModel,
                             Dict[Text, types.EvalSharedModel]] = None,
    eval_config: config.EvalConfig = None,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    desired_batch_size: Optional[int] = None,
    materialize: Optional[bool] = True) -> List[extractor.Extractor]:
  """Returns the default extractors for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Shared model (single-model evaluation) or dict of
      shared models keyed by model name (multi-model evaluation). Required
      unless the predictions are provided alongside of the features (i.e.
      model-agnostic evaluations).
    eval_config: Eval config.
    slice_spec: Deprecated (use EvalConfig).
    desired_batch_size: Optional batch size for batching in Predict.
    materialize: True to have extractors create materialized output.

  Raises:
    NotImplementedError: If eval_config contains mixed serving and eval
      models.
  """
  if eval_config is not None:
    eval_config = config.update_eval_config_with_defaults(eval_config)
    slice_spec = [
        slicer.SingleSliceSpec(spec=spec)
        for spec in eval_config.slicing_specs
    ]
  if _is_legacy_eval(eval_shared_model, eval_config):
    # Backwards compatibility for previous add_metrics_callbacks
    # implementation.
    return [
        predict_extractor.PredictExtractor(
            eval_shared_model, desired_batch_size, materialize=materialize),
        slice_key_extractor.SliceKeyExtractor(
            slice_spec, materialize=materialize)
    ]
  elif eval_shared_model:
    model_types = model_util.get_model_types(eval_config)
    if not model_types.issubset(constants.VALID_MODEL_TYPES):
      raise NotImplementedError(
          'model type must be one of: {}. evalconfig={}'.format(
              str(constants.VALID_MODEL_TYPES), eval_config))
    if model_types == set([constants.TF_LITE]):
      return [
          input_extractor.InputExtractor(eval_config=eval_config),
          tflite_predict_extractor.TFLitePredictExtractor(
              eval_config=eval_config,
              eval_shared_model=eval_shared_model,
              desired_batch_size=desired_batch_size),
          slice_key_extractor.SliceKeyExtractor(
              slice_spec, materialize=materialize)
      ]
    elif constants.TF_LITE in model_types:
      raise NotImplementedError(
          'support for mixing tf_lite and non-tf_lite models is not '
          'implemented: eval_config={}'.format(eval_config))
    elif (eval_config and all(s.signature_name == eval_constants.EVAL_TAG
                              for s in eval_config.model_specs)):
      return [
          predict_extractor.PredictExtractor(
              eval_shared_model,
              desired_batch_size,
              materialize=materialize,
              eval_config=eval_config),
          slice_key_extractor.SliceKeyExtractor(
              slice_spec, materialize=materialize)
      ]
    elif (eval_config and any(s.signature_name == eval_constants.EVAL_TAG
                              for s in eval_config.model_specs)):
      raise NotImplementedError(
          'support for mixing eval and non-eval models is not implemented: '
          'eval_config={}'.format(eval_config))
    else:
      return [
          input_extractor.InputExtractor(eval_config=eval_config),
          predict_extractor_v2.PredictExtractor(
              eval_config=eval_config,
              eval_shared_model=eval_shared_model,
              desired_batch_size=desired_batch_size),
          slice_key_extractor.SliceKeyExtractor(
              slice_spec, materialize=materialize)
      ]
  else:
    return [
        input_extractor.InputExtractor(eval_config=eval_config),
        slice_key_extractor.SliceKeyExtractor(
            slice_spec, materialize=materialize)
    ]
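# Illustrative sketch, not part of the original module: a plausible call into
# default_extractors for a single serving model, which (assuming the legacy
# path does not apply) typically yields the input, v2 predict, and slice-key
# extractor chain. The model path and label key are hypothetical placeholders.
def _example_default_extractors_usage():  # pragma: no cover
  example_eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec(label_key='label')])
  example_shared_model = default_eval_shared_model(
      eval_saved_model_path='/tmp/hypothetical/model',
      eval_config=example_eval_config)
  return default_extractors(
      eval_config=example_eval_config,
      eval_shared_model=example_shared_model,
      materialize=False)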