def benchmarkPredictExtractorManualActuation(self):
  """Benchmark PredictExtractorV2 "manually"."""
  self._init_model()
  records = self._readDatasetIntoExtracts()
  extracts = []
  for elem in records:
    extracts.append(
        input_extractor._ParseExample(elem, self._eval_config))  # pylint: disable=protected-access

  prediction_do_fn = predict_extractor_v2._PredictionDoFn(  # pylint: disable=protected-access
      eval_config=self._eval_config,
      eval_shared_models={"": self._eval_shared_model})
  prediction_do_fn.setup()

  start = time.time()
  predict_result = []
  predict_batch_size = 1000
  for batch in benchmark_utils.batched_iterator(extracts,
                                                predict_batch_size):
    predict_result.extend(prediction_do_fn.process(batch))
  end = time.time()
  delta = end - start
  self.report_benchmark(
      iters=1, wall_time=delta, extras={"num_examples": len(records)})
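# The benchmark above (and those that follow) drive batches through
# `benchmark_utils.batched_iterator`, whose implementation is not shown in
# this section. The sketch below is an assumption about its behavior
# (fixed-size batches, with a smaller final batch), included only to make
# the batching pattern concrete.


def batched_iterator(records, batch_size):
  """Yields successive `batch_size`-sized lists of records (sketch)."""
  batch = []
  for record in records:
    batch.append(record)
    if len(batch) == batch_size:
      yield batch
      batch = []
  if batch:  # Final, possibly smaller, batch.
    yield batch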
def benchmarkPredict(self):
  """Benchmark the predict stage "manually".

  Runs _TFMAPredictionDoFn "manually" outside a Beam pipeline. Records the
  wall time taken.
  """
  # Run InputsToExtracts manually.
  records = []
  for x in self._dataset.read_raw_dataset(
      deserialize=False, limit=MAX_NUM_EXAMPLES):
    records.append({tfma.constants.INPUT_KEY: x})

  fn = tfma.extractors.predict_extractor._TFMAPredictionDoFn(  # pylint: disable=protected-access
      eval_shared_models={
          "": tfma.default_eval_shared_model(
              eval_saved_model_path=self._dataset.tfma_saved_model_path())
      },
      eval_config=None)
  fn.setup()

  # Predict.
  predict_batch_size = 1000
  predict_result = []
  start = time.time()
  for batch in benchmark_utils.batched_iterator(records, predict_batch_size):
    predict_result.extend(fn.process(batch))
  end = time.time()
  delta = end - start
  self.report_benchmark(
      iters=1,
      wall_time=delta,
      extras={
          "batch_size": predict_batch_size,
          "num_examples": self._dataset.num_examples(limit=MAX_NUM_EXAMPLES)
      })
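# The "manual actuation" pattern above runs a Beam DoFn outside any pipeline:
# construct it, call setup() once, then call process() per element (or per
# batch) and collect the yielded results. A minimal self-contained sketch of
# the same pattern, using a toy DoFn rather than TFMA's:


import apache_beam as beam


class _TimesTwoDoFn(beam.DoFn):
  """Toy DoFn used only to illustrate manual actuation."""

  def setup(self):
    # One-time initialization, normally invoked by the Beam runner.
    self._factor = 2

  def process(self, element):
    yield element * self._factor


fn = _TimesTwoDoFn()
fn.setup()
results = []
for x in [1, 2, 3]:
  results.extend(fn.process(x))
assert results == [2, 4, 6]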
def benchmarkEvalSavedModelPredict(self):
  """Benchmark using the EvalSavedModel to make predictions.

  Runs EvalSavedModel.predict_list and records the wall time taken.
  """
  batch_size = 1000
  eval_saved_model = load.EvalSavedModel(
      path=self._dataset.tfma_saved_model_path(),
      include_default_metrics=True)
  records = self._dataset.read_raw_dataset(
      deserialize=False, limit=MAX_NUM_EXAMPLES)
  start = time.time()
  for batch in benchmark_utils.batched_iterator(records, batch_size):
    eval_saved_model.predict_list(batch)
  end = time.time()
  delta = end - start
  self.report_benchmark(
      iters=1,
      wall_time=delta,
      extras={
          "batch_size": batch_size,
          "num_examples": self._dataset.num_examples(limit=MAX_NUM_EXAMPLES)
      })
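# All of the benchmark* methods in this section share the same harness shape:
# a tf.test.Benchmark subclass times the measured region with time.time() and
# reports it via report_benchmark(). A minimal self-contained sketch of that
# shape (BenchmarkSketch and the summed range are made up for illustration):


import time

import tensorflow as tf


class BenchmarkSketch(tf.test.Benchmark):

  def benchmarkSum(self):
    start = time.time()
    total = sum(range(1000000))  # Stand-in for the measured work.
    delta = time.time() - start
    self.report_benchmark(
        iters=1, wall_time=delta, extras={"num_examples": 1000000})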
def benchmarkEvalSavedModelMetricsResetUpdateGetList(self):
  """Benchmark using the EvalSavedModel to compute metrics.

  Runs EvalSavedModel.metrics_reset_update_get_list and records the wall
  time taken.
  """
  batch_size = 1000
  eval_saved_model = load.EvalSavedModel(
      path=self._dataset.tfma_saved_model_path(),
      include_default_metrics=True)
  records = self._dataset.read_raw_dataset(
      deserialize=False, limit=self._max_num_examples())
  start = time.time()
  accumulators = []
  for batch in benchmark_utils.batched_iterator(records, batch_size):
    accumulators.append(eval_saved_model.metrics_reset_update_get_list(batch))
  end = time.time()
  delta = end - start

  # Sanity check.
  metric_variables_sum = accumulators[0]
  for acc in accumulators[1:]:
    if len(metric_variables_sum) != len(acc):
      raise ValueError(
          "all metric variable value lists should have the same length, but "
          "got lists with different lengths: %d and %d" %
          (len(metric_variables_sum), len(acc)))
    metric_variables_sum = [a + b for a, b in zip(metric_variables_sum, acc)]

  metrics = eval_saved_model.metrics_set_variables_and_get_values(
      metric_variables_sum)
  if "average_loss" not in metrics:
    raise ValueError(
        "metrics should contain average_loss metric, but it did not. "
        "metrics were: %s" % metrics)

  self.report_benchmark(
      iters=1,
      wall_time=delta,
      extras={
          "batch_size": batch_size,
          "num_examples": self._dataset.num_examples(
              limit=self._max_num_examples())
      })
def _get_batched_records(dataset):
  """Returns a (batch_size, iterator for batched records) tuple for the dataset.

  Args:
    dataset: BenchmarkDataset object.

  Returns:
    Tuple of (batch_size, iterator for batched records), where records are
    decoded tf.train.Examples.
  """
  batch_size = 1000
  common_variables = _get_common_variables(dataset)
  converter = tft.coders.ExampleProtoCoder(
      common_variables.tf_metadata_schema, serialized=False)
  records = [converter.decode(x) for x in dataset.read_raw_dataset()]
  return batch_size, benchmark_utils.batched_iterator(records, batch_size)
def _get_batched_records(dataset, max_num_examples=None):
  """Returns a (batch_size, list of batched records) tuple for the dataset.

  Args:
    dataset: BenchmarkDataset object.
    max_num_examples: Maximum number of examples to read from the dataset.

  Returns:
    Tuple of (batch_size, list of batched records), where records are decoded
    Arrow RecordBatches.
  """
  batch_size = 1000
  common_variables = _get_common_variables(dataset)
  converter = example_coder.ExamplesToRecordBatchDecoder(
      common_variables.transform_input_dataset_metadata.schema
      .SerializeToString())
  serialized_records = benchmark_utils.batched_iterator(
      dataset.read_raw_dataset(deserialize=False, limit=max_num_examples),
      batch_size)
  records = [converter.DecodeBatch(x) for x in serialized_records]
  return batch_size, records
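# `example_coder.ExamplesToRecordBatchDecoder` above comes from tfx_bsl and
# decodes serialized tf.train.Examples into Arrow RecordBatches. Below is a
# minimal sketch of that decoding step in isolation, under the assumption
# that a one-feature schema suffices for illustration (the feature name "x"
# is made up):


import tensorflow as tf
from tensorflow_metadata.proto.v0 import schema_pb2
from tfx_bsl.coders import example_coder


def _serialized_example(value):
  return tf.train.Example(
      features=tf.train.Features(
          feature={
              "x": tf.train.Feature(
                  int64_list=tf.train.Int64List(value=[value]))
          })).SerializeToString()


schema = schema_pb2.Schema()
feature = schema.feature.add()
feature.name = "x"
feature.type = schema_pb2.INT

decoder = example_coder.ExamplesToRecordBatchDecoder(
    schema.SerializeToString())
record_batch = decoder.DecodeBatch(
    [_serialized_example(1), _serialized_example(2)])
print(record_batch.num_rows)  # 2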
def _runMetricsAndPlotsEvaluatorManualActuation(self,
                                                with_confidence_intervals,
                                                metrics_specs=None):
  """Benchmark MetricsAndPlotsEvaluatorV2 "manually"."""
  self._init_model()
  if not metrics_specs:
    metrics_specs = self._eval_config.metrics_specs

  records = self._readDatasetIntoExtracts()
  extracts = []
  for elem in records:
    extracts.append(
        input_extractor._ParseExample(elem, self._eval_config))  # pylint: disable=protected-access

  prediction_do_fn = predict_extractor_v2._PredictionDoFn(  # pylint: disable=protected-access
      eval_config=self._eval_config,
      eval_shared_models={"": self._eval_shared_model})
  prediction_do_fn.setup()

  # Have to predict first.
  predict_result = []
  predict_batch_size = 1000
  for batch in benchmark_utils.batched_iterator(extracts,
                                                predict_batch_size):
    predict_result.extend(prediction_do_fn.process(batch))

  # Now evaluate.
  inputs_per_accumulator = 1000
  start = time.time()
  computations, _ = (
      metrics_and_plots_evaluator_v2._filter_and_separate_computations(  # pylint: disable=protected-access
          metric_specs.to_computations(
              metrics_specs, eval_config=self._eval_config)))

  processed = []
  for elem in predict_result:
    processed.append(
        next(
            metrics_and_plots_evaluator_v2._PreprocessorDoFn(  # pylint: disable=protected-access
                computations).process(elem)))

  combiner = metrics_and_plots_evaluator_v2._ComputationsCombineFn(  # pylint: disable=protected-access
      computations=computations,
      compute_with_sampling=with_confidence_intervals)

  accumulators = []
  for batch in benchmark_utils.batched_iterator(processed,
                                                inputs_per_accumulator):
    accumulator = combiner.create_accumulator()
    for elem in batch:
      accumulator = combiner.add_input(accumulator, elem)
    accumulators.append(accumulator)

  final_accumulator = combiner.merge_accumulators(accumulators)
  final_output = combiner.extract_output(final_accumulator)
  end = time.time()
  delta = end - start

  # Sanity check the example count. This is not timed.
  example_count_key = metric_types.MetricKey(name="example_count")
  example_count = None
  for x in final_output:
    if example_count_key in x:
      example_count = x[example_count_key]
      break
  if example_count is None:
    raise ValueError(
        "example_count was not in the final list of metrics. "
        "metrics were: %s" % str(final_output))

  if with_confidence_intervals:
    # If we're computing using confidence intervals, the example count will
    # not be exact.
    lower_bound = int(0.9 * len(records))
    upper_bound = int(1.1 * len(records))
    if example_count < lower_bound or example_count > upper_bound:
      raise ValueError("example count out of bounds: expecting "
                       "%d < example_count < %d, but got %d" %
                       (lower_bound, upper_bound, example_count))
  else:
    # If we're not using confidence intervals, we expect the example count
    # to be exact.
    if example_count != len(records):
      raise ValueError("example count mismatch: expecting %d got %d" %
                       (len(records), example_count))

  self.report_benchmark(
      iters=1,
      wall_time=delta,
      extras={
          "inputs_per_accumulator": inputs_per_accumulator,
          "num_examples": len(records)
      })
def benchmarkAggregateCombineManualActuation(self):
  """Benchmark the aggregate combine stage "manually".

  Runs _AggregateCombineFn "manually" outside a Beam pipeline. Records the
  wall time taken.
  """
  # Run InputsToExtracts manually.
  records = []
  for x in self._dataset.read_raw_dataset(
      deserialize=False, limit=MAX_NUM_EXAMPLES):
    records.append({tfma.constants.INPUT_KEY: x})

  fn = tfma.extractors.predict_extractor._TFMAPredictionDoFn(  # pylint: disable=protected-access
      eval_shared_models={
          "": tfma.default_eval_shared_model(
              eval_saved_model_path=self._dataset.tfma_saved_model_path())
      },
      eval_config=None)
  fn.setup()

  # Predict.
  predict_batch_size = 1000
  predict_result = []
  for batch in benchmark_utils.batched_iterator(records, predict_batch_size):
    predict_result.extend(fn.process(batch))

  # AggregateCombineFn.
  #
  # We simulate accumulating records into multiple different accumulators,
  # each with inputs_per_accumulator records, and then merging the resulting
  # accumulators together at one go.

  # Number of elements to feed into a single accumulator.
  # (This means we will have len(records) / inputs_per_accumulator
  # accumulators to merge).
  inputs_per_accumulator = 1000

  combiner = tfma.evaluators.aggregate._AggregateCombineFn(  # pylint: disable=protected-access
      eval_shared_model=tfma.default_eval_shared_model(
          eval_saved_model_path=self._dataset.tfma_saved_model_path()))
  accumulators = []

  start = time.time()
  for batch in benchmark_utils.batched_iterator(predict_result,
                                                inputs_per_accumulator):
    accumulator = combiner.create_accumulator()
    for elem in batch:
      # add_input may return a new accumulator rather than mutating in
      # place, so always carry the returned value forward.
      accumulator = combiner.add_input(accumulator, elem)
    accumulators.append(accumulator)

  final_accumulator = combiner.merge_accumulators(accumulators)
  final_output = combiner.extract_output(final_accumulator)
  end = time.time()
  delta = end - start

  # Extract output to sanity check example count. This is not timed.
  extract_fn = tfma.evaluators.aggregate._ExtractOutputDoFn(  # pylint: disable=protected-access
      eval_shared_model=tfma.default_eval_shared_model(
          eval_saved_model_path=self._dataset.tfma_saved_model_path()))
  extract_fn.setup()
  interpreted_output = list(extract_fn.process(((), final_output)))
  if len(interpreted_output) != 1:
    raise ValueError("expecting exactly 1 interpreted output, got %d" %
                     (len(interpreted_output)))
  got_example_count = interpreted_output[0][1].get(
      "post_export_metrics/example_count")
  if got_example_count != self._dataset.num_examples(limit=MAX_NUM_EXAMPLES):
    raise ValueError(
        "example count mismatch: expecting %d got %d" %
        (self._dataset.num_examples(limit=MAX_NUM_EXAMPLES),
         got_example_count))

  self.report_benchmark(
      iters=1,
      wall_time=delta,
      extras={
          "inputs_per_accumulator": inputs_per_accumulator,
          "num_examples": self._dataset.num_examples(limit=MAX_NUM_EXAMPLES)
      })
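# The accumulator simulation above follows the standard Beam CombineFn
# lifecycle: create_accumulator() per bundle, add_input() per element,
# merge_accumulators() across bundles, then extract_output() once. A
# self-contained sketch of the same lifecycle with a toy counting CombineFn
# (inputs_per_accumulator shrunk to 2 for illustration):


import apache_beam as beam


class _CountCombineFn(beam.CombineFn):
  """Toy CombineFn that counts its inputs."""

  def create_accumulator(self):
    return 0

  def add_input(self, accumulator, element):
    return accumulator + 1

  def merge_accumulators(self, accumulators):
    return sum(accumulators)

  def extract_output(self, accumulator):
    return accumulator


combiner = _CountCombineFn()
elements = ["a", "b", "c", "d", "e"]
inputs_per_accumulator = 2
accumulators = []
for start in range(0, len(elements), inputs_per_accumulator):
  accumulator = combiner.create_accumulator()
  for elem in elements[start:start + inputs_per_accumulator]:
    accumulator = combiner.add_input(accumulator, elem)
  accumulators.append(accumulator)
final_output = combiner.extract_output(
    combiner.merge_accumulators(accumulators))
assert final_output == len(elements)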
def _runMetricsPlotsAndValidationsEvaluatorManualActuation(
    self,
    with_confidence_intervals,
    multi_model,
    metrics_specs=None,
    validation=False):
  """Benchmark MetricsPlotsAndValidationsEvaluator "manually"."""
  self._init_model(multi_model, validation)
  if not metrics_specs:
    metrics_specs = self._eval_config.metrics_specs

  extracts = self._readDatasetIntoBatchedExtracts()
  num_examples = sum(
      e[constants.ARROW_RECORD_BATCH_KEY].num_rows for e in extracts)
  extracts = [self._extract_features_and_labels(e) for e in extracts]

  prediction_do_fn = model_util.ModelSignaturesDoFn(
      eval_config=self._eval_config,
      eval_shared_models=self._eval_shared_models,
      signature_names={
          constants.PREDICTIONS_KEY: {
              name: [None] for name in self._eval_shared_models
          }
      },
      prefer_dict_outputs=False)
  prediction_do_fn.setup()

  # Have to predict first.
  predict_result = []
  for e in extracts:
    predict_result.extend(prediction_do_fn.process(e))

  # Unbatch extracts.
  unbatched_extracts = []
  for e in predict_result:
    unbatched_extracts.extend(
        unbatch_extractor._extract_unbatched_inputs(e))  # pylint: disable=protected-access

  # Add global slice key.
  for e in unbatched_extracts:
    e[tfma.SLICE_KEY_TYPES_KEY] = ()

  # Now evaluate.
  inputs_per_accumulator = 1000
  start = time.time()
  for _ in range(_ITERS):
    computations, _, _, _ = (
        # pylint: disable=protected-access
        metrics_plots_and_validations_evaluator
        ._filter_and_separate_computations(
            metric_specs_util.to_computations(
                metrics_specs, eval_config=self._eval_config)))
    # pylint: enable=protected-access

    processed = []
    for elem in unbatched_extracts:
      processed.append(
          next(
              metrics_plots_and_validations_evaluator._PreprocessorDoFn(  # pylint: disable=protected-access
                  computations).process(elem)))

    combiner = metrics_plots_and_validations_evaluator._ComputationsCombineFn(  # pylint: disable=protected-access
        computations=computations)
    if with_confidence_intervals:
      combiner = poisson_bootstrap._BootstrapCombineFn(combiner)  # pylint: disable=protected-access
    combiner.setup()

    accumulators = []
    for batch in benchmark_utils.batched_iterator(processed,
                                                  inputs_per_accumulator):
      accumulator = combiner.create_accumulator()
      for elem in batch:
        accumulator = combiner.add_input(accumulator, elem)
      accumulators.append(accumulator)

    final_accumulator = combiner.merge_accumulators(accumulators)
    final_output = combiner.extract_output(final_accumulator)
  end = time.time()
  delta = end - start

  # Sanity check the example count. This is not timed.
  example_count_key = metric_types.MetricKey(
      name="example_count", model_name="candidate" if multi_model else "")
  if example_count_key in final_output:
    example_count = final_output[example_count_key]
  else:
    raise ValueError(
        "example_count_key ({}) was not in the final list of "
        "metrics. metrics were: {}".format(example_count_key, final_output))

  if with_confidence_intervals:
    # If we're computing using confidence intervals, the example count will
    # not be exact.
    lower_bound = int(0.9 * num_examples)
    upper_bound = int(1.1 * num_examples)
    if example_count < lower_bound or example_count > upper_bound:
      raise ValueError("example count out of bounds: expecting "
                       "%d < example_count < %d, but got %d" %
                       (lower_bound, upper_bound, example_count))
  else:
    # If we're not using confidence intervals, we expect the example count
    # to be exact.
    if example_count != num_examples:
      raise ValueError("example count mismatch: expecting %d got %d" %
                       (num_examples, example_count))

  self.report_benchmark(
      iters=_ITERS,
      wall_time=delta,
      extras={
          "inputs_per_accumulator": inputs_per_accumulator,
          "num_examples": num_examples
      })
def _runMetricsAndPlotsEvaluatorManualActuation(self,
                                                with_confidence_intervals,
                                                metrics_specs=None):
  """Benchmark MetricsAndPlotsEvaluatorV2 "manually"."""
  self._init_model()
  if not metrics_specs:
    metrics_specs = self._eval_config.metrics_specs

  extracts = self._readDatasetIntoBatchedExtracts()
  num_examples = sum(
      e[constants.ARROW_RECORD_BATCH_KEY].num_rows for e in extracts)
  extracts = [
      batched_input_extractor._ExtractInputs(e, self._eval_config)  # pylint: disable=protected-access
      for e in extracts
  ]

  prediction_do_fn = batched_predict_extractor_v2._BatchedPredictionDoFn(  # pylint: disable=protected-access
      eval_config=self._eval_config,
      eval_shared_models={"": self._eval_shared_model})
  prediction_do_fn.setup()

  # Have to predict first.
  predict_result = []
  for e in extracts:
    predict_result.extend(prediction_do_fn.process(e))

  # Unbatch extracts.
  unbatched_extracts = []
  for e in predict_result:
    unbatched_extracts.extend(
        unbatch_extractor._ExtractUnbatchedInputs(e))  # pylint: disable=protected-access

  # Add global slice key.
  for e in unbatched_extracts:
    e[tfma.SLICE_KEY_TYPES_KEY] = ()

  # Now evaluate.
  inputs_per_accumulator = 1000
  start = time.time()
  computations, _ = (
      # pylint: disable=protected-access
      metrics_plots_and_validations_evaluator
      ._filter_and_separate_computations(
          metric_specs.to_computations(
              metrics_specs, eval_config=self._eval_config)))
  # pylint: enable=protected-access

  processed = []
  for elem in unbatched_extracts:
    processed.append(
        next(
            metrics_plots_and_validations_evaluator._PreprocessorDoFn(  # pylint: disable=protected-access
                computations).process(elem)))

  combiner = metrics_plots_and_validations_evaluator._ComputationsCombineFn(  # pylint: disable=protected-access
      computations=computations,
      compute_with_sampling=with_confidence_intervals)

  accumulators = []
  for batch in benchmark_utils.batched_iterator(processed,
                                                inputs_per_accumulator):
    accumulator = combiner.create_accumulator()
    for elem in batch:
      accumulator = combiner.add_input(accumulator, elem)
    accumulators.append(accumulator)

  final_accumulator = combiner.merge_accumulators(accumulators)
  final_output = combiner.extract_output(final_accumulator)
  end = time.time()
  delta = end - start

  # Sanity check the example count. This is not timed.
  example_count_key = metric_types.MetricKey(name="example_count")
  example_count = None
  for x in final_output:
    if example_count_key in x:
      example_count = x[example_count_key]
      break
  if example_count is None:
    raise ValueError(
        "example_count was not in the final list of metrics. "
        "metrics were: %s" % str(final_output))

  if with_confidence_intervals:
    # If we're computing using confidence intervals, the example count will
    # not be exact.
    lower_bound = int(0.9 * num_examples)
    upper_bound = int(1.1 * num_examples)
    if example_count < lower_bound or example_count > upper_bound:
      raise ValueError("example count out of bounds: expecting "
                       "%d < example_count < %d, but got %d" %
                       (lower_bound, upper_bound, example_count))
  else:
    # If we're not using confidence intervals, we expect the example count
    # to be exact.
    if example_count != num_examples:
      raise ValueError("example count mismatch: expecting %d got %d" %
                       (num_examples, example_count))

  self.report_benchmark(
      iters=1,
      wall_time=delta,
      extras={
          "inputs_per_accumulator": inputs_per_accumulator,
          "num_examples": num_examples
      })