def run(self): """ Evaluation on a existing model. Step 1: Build model. Step 2: Builds evaluation dataset. Step 3: Restore checkpoints. Step 4: Evaluate and reduce metric. """ with training_utils.get_strategy_scope(self.strategy): tfds = training_utils.build_datasets(compat.ModeKeys.EVAL, self.strategy, self.custom_dataset, self.task, cache=True) keras_model = self.build_evaluation_model(self.task, self.model, self._criterion) keras_model.summary() summary_model_variables(keras_model) # Step 4: Restore checkpoints. stat = restore_checkpoint_if_possible(self.model, self.model_dir) if not stat: logging.info( f"WARNING: Fail to restore checkpoint from {self.model_dir}. " "We assume this was done on purpose. ") # Step 5: Evaluate and reduce metric. start_time = time.time() results, avg_res, whole_res = training_utils.reduce_eval_results( self._criterion, self.custom_dataset, training_utils.make_predictions(self.strategy, keras_model, tfds, self.custom_dataset)) logging.info("Evaluation elapsed: %.2fs", time.time() - start_time) def _display(res, name=None): if name: logging.info(f"Evaluation Results ({name}):") for k, v in res.items(): logging.info(" %s: %.2f", k, v) if not isinstance(self.custom_dataset, MultipleDataset): _display(results) else: for name, res in results.items(): _display(res, name) _display( avg_res, f"on average by weights {self.custom_dataset.sample_weights}") _display(whole_res, "mixed")
def validate(self, step):
    if not self._validate_criterion:
        return
    start_time = time.time()
    results, avg_res, mixed_res = training_utils.reduce_eval_results(
        self._criterion, self._custom_dataset,
        training_utils.make_predictions(self._strategy, self._criterion_model,
                                        self._eval_tfds, self._custom_dataset))
    elapsed = time.time() - start_time
    elapsed_from_start = time.time() - self._criterion_start_time

    def _display(res, best, name=None, tb_name=None):
        if tb_name is None:
            tb_name = name
        tb_name = "" if tb_name is None else (tb_name + "_")
        name = "" if name is None else f" ({name})"
        for k, v in res.items():
            logging.info("Evaluating (%s) validation set%s: %s=%.2f (Best %.2f) "
                         "step=%d\tElapsed %.2fs FromSTART %.2fs",
                         self._criterion_metric.flag, name, k, v, best[k],
                         step, elapsed, elapsed_from_start)
            tf.summary.scalar(compat.GlobalKeys.TBPREFIX_VALID + f"/{tb_name}{k}",
                              v, step=step)

    if isinstance(self._custom_dataset, MultipleDataset):
        for name, res in results.items():
            self._criterion_recorder[name].record(step, res)
            _display(res, self._criterion_recorder[name].best, name=name)
        self._avg_criterion_recorder.record(step, avg_res)
        _display(avg_res, self._avg_criterion_recorder.best,
                 f"on average by weights {self._custom_dataset.sample_weights}",
                 tb_name="AVERAGE")
        self._mixed_criterion_recorder.record(step, mixed_res)
        _display(mixed_res, self._mixed_criterion_recorder.best, "MIXED")
    else:
        self._criterion_recorder.record(step, results)
        _display(results, self._criterion_recorder.best)
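
# The validator only relies on the recorder contract `record(step, res)` and
# `.best` used above.  The class below is a minimal stand-in illustrating that
# contract, assuming higher metric values are better; it is not the library's
# recorder implementation.
class _BestRecorderSketch(object):
    def __init__(self):
        self.best = {}
        self.best_step = {}

    def record(self, step, res):
        # Keep the best value (and the step it occurred at) for each metric key.
        for k, v in res.items():
            if k not in self.best or v > self.best[k]:
                self.best[k] = v
                self.best_step[k] = step


# recorder = _BestRecorderSketch()
# recorder.record(100, {"BLEU": 20.1})
# recorder.record(200, {"BLEU": 22.4})
# recorder.best  # -> {"BLEU": 22.4}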
def run(self): """ Sequence generation from an existing model checkpoint. Step 1: Build model and restore checkpoints. Step 2: Build test dataset. Step 3: Sequence generation. Step 4: Evaluation using metric. """ # Step 3: Build model. with training_utils.get_strategy_scope(self.strategy): model = self._build_and_restore_model() keras_model = self.build_generation_model(self.task, model, self._search_layer) tfds = training_utils.build_datasets(compat.ModeKeys.INFER, self.strategy, self.custom_dataset, self.task) keras_model.summary() summary_model_variables(keras_model) # Step 5: Sequence Generation. start_time = time.time() results = training_utils.make_predictions( self.strategy, keras_model, tfds, self.custom_dataset, map_func=lambda y: SequenceGenerator.postprocess_generation( self.task, y)) logging.info("Generation elapsed: %.2fs", time.time() - start_time) if self._output_file: if isinstance(self.custom_dataset, MultipleDataset): if isinstance(self._output_file, dict): for name in results: if self._output_file.get(name, None): with tf.io.gfile.GFile(self._output_file[name], "w") as fw: fw.write("\n".join(results[name]) + "\n") logging.info( "Saving generation of dataset {} results into {}" .format(name, self._output_file[name])) else: logging.info( "Unsupported type of `output_file`={}({}) for MultipleDataset." .format(self._output_file, type(self._output_file))) else: if isinstance(self._output_file, str): with tf.io.gfile.GFile(self._output_file, "w") as fw: fw.write("\n".join(results) + "\n") logging.info("Saving generation results into {}".format( self._output_file)) else: logging.info( f"WARNING: No generation results are saved due to unsupported type " f"of `output_file`: {self._output_file} ({type(self._output_file)})" ) # Step 6: evaluation using metric def _display(res, name=None): if name: logging.info(f"Evaluation Result ({name}):") else: logging.info("Evaluation Result:") for k, v in res.items(): logging.info(" %s=%.2f", k, v) if self._metric is not None: saving_metrics = dict() if isinstance(self.custom_dataset, MultipleDataset): on_average = {} mixed_dsnames = [] mixed_hypos = [] mixed_refs = [] for name in tfds: assert isinstance(self.custom_dataset.datasets[name], TextGenDataset) if self.custom_dataset.datasets[name].targets: metric_result = self._metric( results[name], self.custom_dataset.datasets[name].targets) for k, v in metric_result.items(): if k not in on_average: on_average[k] = 0. on_average[ k] += self.custom_dataset.sample_weights[ name] * v _display(metric_result, name) mixed_dsnames.append(name) mixed_hypos.extend(results[name]) mixed_refs.extend( self.custom_dataset.datasets[name].targets) saving_metrics[name] = metric_result if len(mixed_dsnames) > 1: _display( on_average, f"on average by weights {self._custom_dataset.sample_weights}" ) mixed_metric_result = self._metric(mixed_refs, mixed_hypos) _display(mixed_metric_result, "mixed of {}".format(",".join(mixed_dsnames))) saving_metrics["MIXED"] = mixed_metric_result else: assert isinstance(self.custom_dataset, TextGenDataset) if self.custom_dataset.targets is not None: metric_result = self._metric(results, self.custom_dataset.targets) _display(metric_result) saving_metrics = metric_result if self._save_metric is not None: logging.info(f"Saving metric results into {self._save_metric}") with tf.io.gfile.GFile(self._save_metric, "w") as fw: json.dump(saving_metrics, fw)
def validate(self, step):
    super(SeqGenerationValidator, self).validate(step)
    if not self._validate_gen:
        return
    start_time = time.time()
    results = training_utils.make_predictions(
        self._strategy, self._gen_model, self._gen_tfds,
        self._custom_dataset, map_func=self._postprocess_fn)
    elapsed = time.time() - start_time
    elapsed_from_start = time.time() - self._gen_start_time

    def _display_hypo(custom_ds, hypos, name=None):
        if name:
            logging.info(f"===== Generation examples from {name} (Total {len(hypos)}) =====")
        else:
            logging.info(f"===== Generation examples (Total {len(hypos)}) =====")
        for sample_idx in random.sample(list(range(0, len(hypos))), 5):
            logging.info("Sample %d", sample_idx)
            if hasattr(custom_ds, "sources") and custom_ds.sources is not None:
                logging.info(" Data: %s", custom_ds.sources[sample_idx])
            logging.info(" Reference: %s", custom_ds.targets[sample_idx])
            logging.info(" Hypothesis: %s", hypos[sample_idx])

    def _display(res, best, name=None, tb_name=None):
        if tb_name is None:
            tb_name = name
        tb_name = "" if tb_name is None else (tb_name + "_")
        name = "" if name is None else f" ({name})"
        for k, v in res.items():
            logging.info("Evaluating (%s) validation set%s: %s=%.2f (Best %.2f) "
                         "step=%d\tElapsed %.2fs FromSTART %.2fs",
                         self._gen_metric.flag, name, k, v, best[k],
                         step, elapsed, elapsed_from_start)
            tf.summary.scalar(compat.GlobalKeys.TBPREFIX_VALID + f"/{tb_name}{k}",
                              v, step=step)

    if isinstance(self._custom_dataset, MultipleDataset):
        on_average = {}
        mixed_dsnames = []
        mixed_hypos = []
        mixed_refs = []
        sample_weights = {name: self._custom_dataset.sample_weights[name]
                          for name in self._gen_tfds}
        sample_weight_sum = sum(sample_weights.values()) * 1.
        sample_weights = {name: weight / sample_weight_sum
                          for name, weight in sample_weights.items()}
        for name, res in results.items():
            metric_res = self._gen_metric(res, self._custom_dataset.datasets[name].targets)
            self._gen_recorder[name].record(step, metric_res)
            for k, v in metric_res.items():
                if k not in on_average:
                    on_average[k] = 0.
                on_average[k] += sample_weights[name] * v
            _display_hypo(self._custom_dataset.datasets[name], res, name=name)
            _display(metric_res, self._gen_recorder[name].best, name=name)
            mixed_dsnames.append(name)
            mixed_hypos.extend(res)
            mixed_refs.extend(self._custom_dataset.datasets[name].targets)
        if len(mixed_dsnames) >= 1:
            self._avg_gen_recorder.record(step, on_average)
        if len(mixed_dsnames) > 1:
            _display(on_average, self._avg_gen_recorder.best,
                     f"on average by weights {sample_weights}",
                     tb_name="AVERAGE")
            mixed_metric_result = self._gen_metric(mixed_hypos, mixed_refs)
            self._mixed_gen_recorder.record(step, mixed_metric_result)
            _display(mixed_metric_result, self._mixed_gen_recorder.best,
                     "mixed of {}".format(",".join(mixed_dsnames)),
                     tb_name="MIXED")
    else:
        metric_res = self._gen_metric(results, self._custom_dataset.targets)
        _display_hypo(self._custom_dataset, results)
        self._gen_recorder.record(step, metric_res)
        _display(metric_res, self._gen_recorder.best)
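
# The method above renormalizes sample weights over only the datasets that are
# actually generated (`self._gen_tfds`), so the kept weights sum to 1 again.
# A standalone sketch of that renormalization, with made-up dataset names and a
# hypothetical helper name:
def renormalize_weights(sample_weights, active_names):
    """Keeps the weights of `active_names` and rescales them to sum to 1."""
    kept = {name: sample_weights[name] for name in active_names}
    total = float(sum(kept.values()))
    return {name: w / total for name, w in kept.items()}


# renormalize_weights({"mt": 0.5, "asr": 0.3, "st": 0.2}, ["mt", "st"])
# -> {"mt": 0.714..., "st": 0.285...}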