def valid(self, force=False):
    """Validate the model every few steps.

    Validation is due when ``self._current_step + 1`` is a multiple of
    ``self._valid_freq`` or when ``force`` is truthy. When it is due and
    ``self._is_root_node()`` is true, the model is switched out of training
    mode, ``run_valid`` is executed, the scores are passed to
    ``check_improvement`` and to the scheduler's ``after_valid`` hook, the
    model is put back in training mode, and the result is logged.

    When both ``self._multigpu`` and ``self._horovod`` are set, the method
    additionally calls ``synchronize_learning_rate`` whenever validation is
    due, and every 1000 steps broadcasts the optimizer state and the model
    parameters from ``ROOT_RANK``.

    Args:
        force: When truthy, validation is treated as due regardless of the
            step counter.
    """
    valid_condition = (self._current_step + 1) % self._valid_freq == 0 or force
    if valid_condition and self._is_root_node():
        self._model.train(False)
        try:
            score_map = self.run_valid()
            is_improved = self.check_improvement(score_map)
            self._scheduler.after_valid(is_improved, score_map)
        finally:
            # Always leave evaluation mode, even when validation raises, so
            # a caller that catches the error does not keep training a model
            # that is still switched to evaluation mode.
            self._model.train(True)
        self.log(
            "valid",
            "{}{} (epoch {}, step {})".format(
                self._dict_str(score_map),
                " *" if is_improved else "",
                self._current_epoch + 1,
                self._global_step + 1,
            ),
        )
    # Check new trainer settings when using horovod
    if valid_condition and self._multigpu and self._horovod:
        self.synchronize_learning_rate()
    if (self._current_step + 1) % 1000 == 0 and self._multigpu and self._horovod:
        # Imported lazily so horovod is only required when it is in use.
        import horovod.torch as hvd
        # NOTE(review): init() is invoked on every resync; presumably horovod
        # is already initialised at start-up -- confirm before removing.
        hvd.init()
        from nmtlab.trainers.hvd_utils import broadcast_optimizer_state
        broadcast_optimizer_state(self._optimizer, ROOT_RANK)
        hvd.broadcast_parameters(self._model.state_dict(), ROOT_RANK)
def valid(self):
    """Validate the model every few steps.

    Validation is due when ``self._current_step + 1`` is a multiple of
    ``self._valid_freq``. When it is due and ``self._is_root_node()`` is
    true, the model is switched out of training mode, ``run_valid`` is
    executed, the scores are passed to ``check_improvement`` and to the
    scheduler's ``after_valid`` hook, the model is put back in training mode,
    and the result is logged.

    When ``self._multigpu`` is set, the method additionally calls
    ``synchronize_learning_rate`` whenever validation is due, and every 100
    steps broadcasts the optimizer state from ``ROOT_RANK``.
    """
    # Evaluated once; the same schedule gates validation and the
    # learning-rate synchronisation below.
    valid_due = (self._current_step + 1) % self._valid_freq == 0
    if valid_due and self._is_root_node():
        self._model.train(False)
        try:
            score_map = self.run_valid()
            is_improved = self.check_improvement(score_map)
            self._scheduler.after_valid(is_improved, score_map)
        finally:
            # Always leave evaluation mode, even when validation raises, so
            # a caller that catches the error does not keep training a model
            # that is still switched to evaluation mode.
            self._model.train(True)
        self.log(
            "valid",
            "{}{} (epoch {}, step {})".format(
                self._dict_str(score_map),
                " *" if is_improved else "",
                self._current_epoch + 1,
                self._global_step + 1,
            ),
        )
    # Check new trainer settings
    if valid_due and self._multigpu:
        self.synchronize_learning_rate()
    if (self._current_step + 1) % 100 == 0 and self._multigpu:
        # Imported lazily so the helper is only required for multi-GPU runs.
        from nmtlab.trainers.hvd_utils import broadcast_optimizer_state
        broadcast_optimizer_state(self._optimizer, ROOT_RANK)