Example #1
0
 def continuous_eval_on_train_data(self):
     """Evaluate on train data until checkpoints stop being produced."""
     checkpoints = next_checkpoint(self._hparams.model_dir,
                                   self._hparams.eval_timeout_mins)
     for ckpt in checkpoints:
         step = decoding.get_step_from_ckpt_path(ckpt)
         if step != 0:
             self.evaluate_on_train_data()
         else:
             # Step 0 is the untrained model; there is nothing to evaluate.
             tf.logging.info("Skipping evaluation at step 0")
Example #2
0
    def continuous_decode_on_eval_data(self):
        """Decode from dataset on new checkpoint."""
        # In mlperf mode, only checkpoints not yet decoded are yielded;
        # otherwise every new checkpoint is decoded.
        checkpoint_source = (
            next_undecoded_checkpoint if self._hparams.mlperf_mode
            else next_checkpoint)
        checkpoints = checkpoint_source(
            self._hparams.model_dir,
            self._decode_hparams.decode_timeout_mins)

        for ckpt_path in checkpoints:
            step = decoding.get_step_from_ckpt_path(ckpt_path)
            tf.logging.info("Decoding step %d" % step)
            if step == 0:
                # Checkpoint 0 is the untrained model; skip it.
                continue
            if self._hparams.mlperf_mode:
                # Pin the exact checkpoint so mlperf logs the decoded step.
                self._decode_hparams.mlperf_decode_step = step
                checkpoint_path = ckpt_path
            else:
                # Decode the latest checkpoint by default.
                checkpoint_path = None

            mlperf_log.transformer_print(key=mlperf_log.EVAL_START)
            self.decode(dataset_split=tf.estimator.ModeKeys.EVAL,
                        checkpoint_path=checkpoint_path)
            if (self._hparams.mlperf_mode
                    and self._decode_hparams.mlperf_success):
                mlperf_log.transformer_print(key=mlperf_log.RUN_STOP,
                                             value={"success": "true"})
                break

        # If the loop ended without a success signal, report failure.
        if (self._hparams.mlperf_mode
                and not self._decode_hparams.mlperf_success):
            mlperf_log.transformer_print(key=mlperf_log.RUN_STOP,
                                         value={"success": "false"})