def attach(self: TrainerType) -> TrainerType:
    ra = RunningAverage(output_transform=lambda x: x)
    ra.attach(self.trainer, "Train Loss")
    self.pbar.attach(self.trainer, ['Train Loss'])
    self.val_pbar.attach(self.train_evaluator)
    self.val_pbar.attach(self.valid_evaluator)
    self.valid_evaluator.add_event_handler(Events.COMPLETED, self.early_stop)
    ckpt = {'model': self.model, 'optimizer': self.optimizer}
    self.valid_evaluator.add_event_handler(Events.COMPLETED, self.checkpoint, ckpt)

    def graceful_exit(engine, e):
        if isinstance(e, KeyboardInterrupt):
            engine.terminate()
            LOGGER.warning("CTRL-C caught. Exiting gracefully...")
        else:
            raise e

    self.trainer.add_event_handler(Events.EXCEPTION_RAISED, graceful_exit)
    self.train_evaluator.add_event_handler(Events.EXCEPTION_RAISED, graceful_exit)
    self.valid_evaluator.add_event_handler(Events.EXCEPTION_RAISED, graceful_exit)
    return self
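# A minimal, self-contained sketch of the graceful-exit pattern above, assuming
# only pytorch-ignite is installed; the engine and update function are
# hypothetical placeholders. Ignite forwards a raised exception to
# EXCEPTION_RAISED handlers, so terminating there lets a run stop cleanly
# on CTRL-C instead of crashing.
from ignite.engine import Engine, Events

def _update(engine, batch):
    return batch  # placeholder training step

trainer = Engine(_update)

@trainer.on(Events.EXCEPTION_RAISED)
def _graceful_exit(engine, e):
    if isinstance(e, KeyboardInterrupt):
        engine.terminate()
    else:
        raise e

trainer.run(range(10), max_epochs=1)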
def setup_ignite(
    engine: Engine,
    params: SimpleNamespace,
    exp_source,
    run_name: str,
    extra_metrics: Iterable[str] = (),
):
    warnings.simplefilter("ignore", category=UserWarning)
    handler = ptan_ignite.EndOfEpisodeHandler(exp_source, bound_avg_reward=params.stop_reward)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        print(
            "Episode %d: reward=%.2f, steps=%s, "
            "speed=%.1f f/s, elapsed=%s" % (
                trainer.state.episode,
                trainer.state.episode_reward,
                trainer.state.episode_steps,
                trainer.state.metrics.get("avg_fps", 0),
                timedelta(seconds=int(passed)),
            )
        )

    @engine.on(ptan_ignite.EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine):
        passed = trainer.state.metrics["time_passed"]
        print(
            "Game solved in %s, after %d episodes "
            "and %d iterations!" % (timedelta(seconds=int(passed)),
                                    trainer.state.episode,
                                    trainer.state.iteration)
        )
        trainer.should_terminate = True

    now = datetime.now().isoformat(timespec="minutes")
    logdir = f"runs/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    metrics = ["reward", "steps", "avg_reward"]
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(
        tag="train", metric_names=metrics,
        output_transform=lambda a: a
    )
    event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
def _test_distrib_on_output(device):
    rank = idist.get_rank()
    n_iters = 10
    n_epochs = 3
    batch_size = 10

    # Data per rank
    data = list(range(n_iters))
    k = n_epochs * batch_size * n_iters
    all_loss_values = torch.arange(0, k * idist.get_world_size(), dtype=torch.float64).to(device)
    loss_values = iter(all_loss_values[k * rank:k * (rank + 1)])

    def update_fn(engine, batch):
        loss_value = next(loss_values)
        return loss_value.item()

    trainer = Engine(update_fn)
    alpha = 0.98
    metric_device = idist.device() if torch.device(device).type != "xla" else "cpu"
    avg_output = RunningAverage(output_transform=lambda x: x, alpha=alpha,
                                epoch_bound=False, device=metric_device)
    avg_output.attach(trainer, "running_avg_output")

    @trainer.on(Events.STARTED)
    def running_avg_output_init(engine):
        engine.state.running_avg_output = None

    @trainer.on(Events.ITERATION_COMPLETED)
    def running_avg_output_update(engine):
        i = engine.state.iteration - 1
        o = sum([all_loss_values[i + j * k] for j in range(idist.get_world_size())]).item()
        o /= idist.get_world_size()
        if engine.state.running_avg_output is None:
            engine.state.running_avg_output = o
        else:
            engine.state.running_avg_output = \
                engine.state.running_avg_output * alpha + (1.0 - alpha) * o

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_output_values(engine):
        it = engine.state.iteration
        assert engine.state.running_avg_output == pytest.approx(
            engine.state.metrics["running_avg_output"]
        ), f"{it}: {engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}"

    trainer.run(data, max_epochs=3)
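# A minimal single-process sketch of the recurrence RunningAverage implements,
# v_t = alpha * v_{t-1} + (1 - alpha) * x_t (the first value initializes v),
# checked against a manual loop. Everything here is standard ignite; the
# values are made up for illustration.
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage

values = [1.0, 2.0, 3.0, 4.0]
alpha = 0.98

trainer = Engine(lambda engine, batch: batch)
RunningAverage(output_transform=lambda x: x, alpha=alpha).attach(trainer, "ra")

manual = [None]

@trainer.on(Events.ITERATION_COMPLETED)
def check(engine):
    x = engine.state.output
    manual[0] = x if manual[0] is None else manual[0] * alpha + (1 - alpha) * x
    assert abs(manual[0] - engine.state.metrics["ra"]) < 1e-12

trainer.run(values, max_epochs=1)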
def setup_ignite(engine: Engine, exp_source, run_name: str,
                 extra_metrics: Iterable[str] = ()):
    # get rid of missing metrics warning
    warnings.simplefilter("ignore", category=UserWarning)
    handler = ptan_ignite.EndOfEpisodeHandler(exp_source)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        avg_steps = trainer.state.metrics.get("avg_steps", 50)
        avg_reward = trainer.state.metrics.get("avg_reward", 0.0)
        print("Episode %d: reward=%.0f (avg %.2f), "
              "steps=%s (avg %.2f), speed=%.1f f/s, "
              "elapsed=%s" % (
                  trainer.state.episode,
                  trainer.state.episode_reward,
                  avg_reward,
                  trainer.state.episode_steps,
                  avg_steps,
                  trainer.state.metrics.get("avg_fps", 0),
                  timedelta(seconds=int(passed)),
              ))
        if avg_steps < 15 and trainer.state.episode > 100:
            print("Average steps has fallen below 15, stop training")
            trainer.should_terminate = True

    now = datetime.now().isoformat(timespec="minutes")
    logdir = f"runs/{now}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    metrics = ["reward", "steps", "avg_reward", "avg_steps"]
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(tag="train", metric_names=metrics,
                                      output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
def setup_ignite(engine: Engine, exp_source, run_name: str,
                 extra_metrics: Iterable[str] = ()):
    warnings.simplefilter('ignore', category=UserWarning)
    handler = ptan_ignite.EndOfEpisodeHandler(exp_source, subsample_end_of_episode=100)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get('time_passed', 0)
        print('Episode %d: reward=%.0f, steps=%s, speed=%.1f f/s, elapsed=%s' % (
            trainer.state.episode, trainer.state.episode_reward,
            trainer.state.episode_steps,
            trainer.state.metrics.get('avg_fps', 0),
            timedelta(seconds=int(passed))))

    now = datetime.now().isoformat(timespec='minutes')
    logdir = f'runs-{now}-{run_name}'.replace(':', '')
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v['loss'])
    run_avg.attach(engine, 'avg_loss')

    metrics = ['reward', 'steps', 'avg_reward']
    handler = tb_logger.OutputHandler(tag='episodes', metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 1000 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ['avg_loss', 'avg_fps']
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(tag='train', metric_names=metrics,
                                      output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
    return tb
def setup_ignite(engine: Engine, exp_source, run_name: str,
                 extra_metrics: Iterable[str] = ()):
    # get rid of missing metrics warning
    warnings.simplefilter("ignore", category=UserWarning)
    handler = ptan_ignite.EndOfEpisodeHandler(exp_source, subsample_end_of_episode=100)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        print("Episode %d: reward=%.0f, steps=%s, "
              "speed=%.1f f/s, elapsed=%s" % (
                  trainer.state.episode, trainer.state.episode_reward,
                  trainer.state.episode_steps,
                  trainer.state.metrics.get("avg_fps", 0),
                  timedelta(seconds=int(passed)),
              ))

    now = datetime.now().isoformat(timespec="minutes")
    logdir = f"runs/{now}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    metrics = ["reward", "steps", "avg_reward"]
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(tag="train", metric_names=metrics,
                                      output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
    return tb
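# A minimal sketch of the logging pattern the setup_ignite variants above
# share, assuming pytorch-ignite and tensorboard are installed (no ptan
# needed). It attaches a RunningAverage of the loss to a plain Engine and
# streams it to TensorBoard; the update function and log directory are
# hypothetical placeholders.
import random
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage
from ignite.contrib.handlers import tensorboard_logger as tb_logger

trainer = Engine(lambda engine, batch: {"loss": random.random()})
RunningAverage(output_transform=lambda v: v["loss"]).attach(trainer, "avg_loss")

tb = tb_logger.TensorboardLogger(log_dir="runs/sketch")
handler = tb_logger.OutputHandler(tag="train", metric_names=["avg_loss"])
tb.attach(trainer, log_handler=handler,
          event_name=Events.ITERATION_COMPLETED(every=100))

trainer.run([None] * 1000, max_epochs=1)
tb.close()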
def setup_ignite(engine: Engine, params: SimpleNamespace, exp_source,
                 run_name: str, net, extra_metrics: Iterable[str] = ()):
    warnings.simplefilter("ignore", category=UserWarning)
    handler = ptan_ignite.EndOfEpisodeHandler(exp_source, subsample_end_of_episode=100)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get('time_passed', 0)
        print("Episode %d: reward=%.0f, steps=%s, "
              "elapsed=%s" % (trainer.state.episode,
                              trainer.state.episode_reward,
                              trainer.state.episode_steps,
                              timedelta(seconds=int(passed))))
        path = './saves/episode-%d.data' % trainer.state.episode
        torch.save(net.state_dict(), path)

    now = datetime.now().isoformat(timespec='minutes').replace(':', '')
    logdir = f"runs2/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v['loss'])
    run_avg.attach(engine, 'avg_loss')

    metrics = ['reward', 'steps', 'avg_reward']
    handler = tb_logger.OutputHandler(tag='episodes', metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tb every 1000 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ['avg_loss', 'avg_fps']
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(tag="train", metric_names=metrics,
                                      output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
    return tb
def main_test_mnist():
    from torchvision.datasets import MNIST
    from torchvision.transforms import Compose, ToTensor, ToPILImage, Normalize

    transform = Compose([ToTensor()])
    train_dataset = MNIST(root="/tmp", train=True, download=True, transform=transform)
    test_dataset = MNIST(root="/tmp", train=False, download=True, transform=transform)

    vae = VAE(x_dim=784, z_dim=50,
              device='cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"\n{vae}")
    optimizer = ClippedAdam({"lr": 1e-3})
    svi = SVI(vae.model, vae.guide, optimizer, loss=Trace_ELBO())

    def _update(engine, batch):
        vae.train()
        x, y = batch
        loss = svi.step(x.view(-1, 784).to(vae.device, non_blocking=True))
        return loss / len(x), len(x)

    def _evaluate(engine, batch):
        vae.eval()
        x, y = batch
        elbo = svi.evaluate_loss(x.view(-1, 784).to(vae.device, non_blocking=True))
        return elbo / len(x), len(x)

    trainer = Engine(_update)
    evaluator = Engine(_evaluate)

    train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True,
                                  pin_memory=True, drop_last=True, num_workers=8)
    test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=True,
                                 pin_memory=True, drop_last=True, num_workers=8)

    timer = Timer(average=True)
    timer.attach(engine=trainer,
                 start=Events.EPOCH_STARTED,
                 pause=Events.ITERATION_COMPLETED,
                 resume=Events.ITERATION_STARTED,
                 step=Events.ITERATION_COMPLETED)

    loss_metric = RunningAverage(output_transform=lambda outputs: -outputs[0], alpha=1)
    loss_metric.attach(engine=trainer, name="ELBO")
    loss_metric.attach(engine=evaluator, name="ELBO")

    vis = Visdom(server="gpu1.cluster.peidan.me", port=10697, env='Imp-pyro--vae-MNIST')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_train_loss(engine):
        elbo = engine.state.metrics['ELBO']
        logger.info(f"epoch:{engine.state.epoch}, ELBO: {elbo:.2f}, "
                    f"step time: {timer.value():.3f}s")
        vis.line(Y=[elbo], X=[engine.state.epoch], win="Train-ELBO",
                 update='append', opts={"title": "Train-ELBO"})

    def plot_vae_samples(title):
        x = torch.zeros([1, 784]).to(vae.device)
        for i in range(10):
            images = []
            for rr in range(100):
                # get loc from the model
                sample_loc_i = vae.model(x)
                img = sample_loc_i[0].view(1, 28, 28).cpu().data.numpy()
                images.append(img)
            vis.images(images, 10, 2, win=title, opts={'title': title})

    @trainer.on(Events.EPOCH_COMPLETED)
    def generate_samples(engine):
        epoch = engine.state.epoch
        if epoch % 10 == 0:
            logger.info(f"epoch: {epoch}, plot samples")
            plot_vae_samples(f"epoch-{epoch}")

    @trainer.on(Events.EPOCH_COMPLETED)
    def validation(engine):
        epoch = engine.state.epoch
        if epoch % 5 == 0:
            evaluator.run(test_dataloader)
            elbo = evaluator.state.metrics['ELBO']
            logger.info(f"epoch: {epoch}, validation ELBO: {elbo}")
            vis.line(Y=[elbo], X=[engine.state.epoch], win="Validation-ELBO",
                     update='append', opts={'title': "Validation-ELBO"})

    trainer.run(train_dataloader, max_epochs=2500)
class BasicTrainTask(BaseTask):
    name = "Train Task"

    def _validate(self, config):
        """
        Method to check if specific configuration is correct.
        Raises AssertionError if it is incorrect.
        """
        assert isinstance(config, BasicTrainConfig), \
            "Configuration should be instance of `BasicTrainConfig`, but given {}".format(type(config))

    def _start(self):
        """Method to run the task
        """
        if 'cuda' in self.device:
            self.model = self.model.to(self.device)
        mlflow.log_param("model", get_object_name(self.model))

        self.logger.debug("Setup criterion")
        if "cuda" in self.device:
            self.criterion = self.criterion.to(self.device)
        mlflow.log_param("criterion", get_object_name(self.criterion))
        mlflow.log_param("optimizer", get_object_name(self.optimizer))

        self.logger.debug("Setup ignite trainer")
        trainer = self._setup_trainer()
        self._setup_trainer_handlers(trainer)

        metrics = {'loss': Loss(self.criterion)}
        metrics.update(self.metrics)

        self.logger.debug("Input data info: ")
        msg = "- train data loader: {} number of batches".format(len(self.train_dataloader))
        if isinstance(self.train_dataloader, DataLoader):
            msg += " | {} number of samples".format(len(self.train_dataloader.sampler))
        self.logger.debug(msg)

        if isinstance(self.train_dataloader, DataLoader):
            write_model_graph(self.writer, model=self.model,
                              data_loader=self.train_dataloader,
                              device=self.device)

        self.pbar_eval = None
        if self.train_eval_dataloader is not None:
            self.pbar_eval = ProgressBar()
            self._setup_offline_train_metrics_computation(trainer, metrics)

        if self.val_dataloader is not None:
            if self.val_metrics is None:
                self.val_metrics = metrics
            if self.pbar_eval is None:
                self.pbar_eval = ProgressBar()
            val_evaluator = self._setup_val_metrics_computation(trainer)

            if self.reduce_lr_on_plateau is not None:
                assert self.reduce_lr_on_plateau_var in self.val_metrics, \
                    "Monitor variable {} is not found in metrics {}" \
                    .format(self.reduce_lr_on_plateau_var, metrics)

                @val_evaluator.on(Events.COMPLETED)
                def update_reduce_on_plateau(engine):
                    val_var = engine.state.metrics[self.reduce_lr_on_plateau_var]
                    self.reduce_lr_on_plateau.step(val_var)

            def default_score_function(engine):
                val_loss = engine.state.metrics['loss']
                # Objects with highest scores will be retained.
                return -val_loss

            # Setup early stopping:
            if self.early_stopping_kwargs is not None:
                if 'score_function' in self.early_stopping_kwargs:
                    es_score_function = self.early_stopping_kwargs['score_function']
                else:
                    es_score_function = default_score_function
                self._setup_early_stopping(trainer, val_evaluator, es_score_function)

            # Setup model checkpoint:
            if self.model_checkpoint_kwargs is None:
                self.model_checkpoint_kwargs = {
                    "filename_prefix": "model",
                    "score_name": "val_loss",
                    "score_function": default_score_function,
                    "n_saved": 3,
                    "atomic": True,
                    "create_dir": True,
                    "save_as_state_dict": True
                }
            self._setup_best_model_checkpointing(val_evaluator)

        self.logger.debug("Setup other handlers")
        if self.lr_scheduler is not None:
            @trainer.on(Events.ITERATION_STARTED)
            def update_lr_scheduler(engine):
                self.lr_scheduler.step()

        self._setup_log_learning_rate(trainer)

        self.logger.info("Start training: {} epochs".format(self.num_epochs))
        mlflow.log_param("num_epochs", self.num_epochs)
        trainer.run(self.train_dataloader, max_epochs=self.num_epochs)
        self.logger.debug("Training is ended")

    def _setup_trainer(self):
        trainer = create_supervised_trainer(self.model, self.optimizer, self.criterion,
                                            device=self.device,
                                            non_blocking='cuda' in self.device)
        return trainer

    def _setup_trainer_handlers(self, trainer):
        # Setup timer to measure training time
        timer = setup_timer(trainer)
        self._setup_log_training_loss(trainer)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_time(engine):
            self.logger.info("One epoch training time (seconds): {}".format(timer.value()))

        last_model_saver = ModelCheckpoint(self.log_dir.as_posix(),
                                           filename_prefix="checkpoint",
                                           save_interval=self.trainer_checkpoint_interval,
                                           n_saved=1,
                                           atomic=True,
                                           create_dir=True,
                                           save_as_state_dict=True)
        model_name = get_object_name(self.model)
        to_save = {
            model_name: self.model,
            "optimizer": self.optimizer,
        }
        if self.lr_scheduler is not None:
            to_save['lr_scheduler'] = self.lr_scheduler
        trainer.add_event_handler(Events.ITERATION_COMPLETED, last_model_saver, to_save)
        trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

    def _setup_log_training_loss(self, trainer):
        self.avg_output = RunningAverage(output_transform=lambda out: out)
        self.avg_output.attach(trainer, 'running_avg_loss')
        self.pbar.attach(trainer, ['running_avg_loss'])

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_training_loss(engine):
            iteration = (engine.state.iteration - 1) % len(self.train_dataloader) + 1
            if iteration % self.log_interval == 0:
                # self.logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(engine.state.epoch, iteration,
                #                                                                   len(self.train_dataloader),
                #                                                                   engine.state.output))
                self.writer.add_scalar("training/loss_vs_iterations",
                                       engine.state.output, engine.state.iteration)
                mlflow.log_metric("training_loss_vs_iterations", engine.state.output)

    def _setup_log_learning_rate(self, trainer):
        @trainer.on(Events.EPOCH_STARTED)
        def log_lrs(engine):
            if len(self.optimizer.param_groups) == 1:
                lr = float(self.optimizer.param_groups[0]['lr'])
                self.logger.debug("Learning rate: {}".format(lr))
                self.writer.add_scalar("learning_rate", lr, engine.state.epoch)
                mlflow.log_metric("learning_rate", lr)
            else:
                for i, param_group in enumerate(self.optimizer.param_groups):
                    lr = float(param_group['lr'])
                    self.logger.debug("Learning rate (group {}): {}".format(i, lr))
                    self.writer.add_scalar("learning_rate_group_{}".format(i),
                                           lr, engine.state.epoch)
                    mlflow.log_metric("learning_rate_group_{}".format(i), lr)

    def _setup_offline_train_metrics_computation(self, trainer, metrics):
        train_eval_loader = self.train_eval_dataloader
        msg = "- train evaluation data loader: {} number of batches".format(len(train_eval_loader))
        if isinstance(train_eval_loader, DataLoader):
            msg += " | {} number of samples".format(len(train_eval_loader.sampler))
        self.logger.debug(msg)

        train_evaluator = create_supervised_evaluator(self.model, metrics=metrics,
                                                      device=self.device,
                                                      non_blocking="cuda" in self.device)
        self.pbar_eval.attach(train_evaluator)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_metrics(engine):
            epoch = engine.state.epoch
            if epoch % self.val_interval_epochs == 0:
                self.logger.debug("Compute training metrics")
                metrics_results = train_evaluator.run(train_eval_loader).metrics
                self.logger.info("Training Results - Epoch: {}".format(epoch))
                for name in metrics_results:
                    self.logger.info("\tAverage {}: {:.5f}".format(name, metrics_results[name]))
                    self.writer.add_scalar("training/avg_{}".format(name),
                                           metrics_results[name], epoch)
                    mlflow.log_metric("training_avg_{}".format(name), metrics_results[name])

        return train_evaluator

    def _setup_val_metrics_computation(self, trainer):
        val_evaluator = create_supervised_evaluator(self.model, metrics=self.val_metrics,
                                                    device=self.device,
                                                    non_blocking="cuda" in self.device)
        self.pbar_eval.attach(val_evaluator)
        msg = "- validation data loader: {} number of batches".format(len(self.val_dataloader))
        if isinstance(self.val_dataloader, DataLoader):
            msg += " | {} number of samples".format(len(self.val_dataloader.sampler))
        self.logger.debug(msg)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(engine):
            epoch = engine.state.epoch
            if epoch % self.val_interval_epochs == 0:
                self.logger.debug("Compute validation metrics")
                metrics_results = val_evaluator.run(self.val_dataloader).metrics
                self.logger.info("Validation Results - Epoch: {}".format(epoch))
                for name in metrics_results:
                    self.logger.info("\tAverage {}: {:.5f}".format(name, metrics_results[name]))
                    self.writer.add_scalar("validation/avg_{}".format(name),
                                           metrics_results[name], epoch)
                    mlflow.log_metric("validation_avg_{}".format(name), metrics_results[name])

        return val_evaluator

    def _setup_early_stopping(self, trainer, val_evaluator, score_function):
        kwargs = dict(self.early_stopping_kwargs)
        if 'score_function' not in kwargs:
            kwargs['score_function'] = score_function
        handler = EarlyStopping(trainer=trainer, **kwargs)
        setup_logger(handler._logger, self.log_filepath, self.log_level)
        val_evaluator.add_event_handler(Events.COMPLETED, handler)

    def _setup_best_model_checkpointing(self, val_evaluator):
        model_name = get_object_name(self.model)
        best_model_saver = ModelCheckpoint(self.log_dir.as_posix(),
                                           **self.model_checkpoint_kwargs)
        val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver,
                                        {model_name: self.model})
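# A short sketch of best-model checkpointing driven by validation loss, in the
# spirit of _setup_best_model_checkpointing above. It uses only pytorch-ignite
# and torch; the model, evaluator, and score function are stand-ins. Note the
# negated loss: ModelCheckpoint retains the checkpoints with the HIGHEST scores.
import torch.nn as nn
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint

model = nn.Linear(2, 1)
evaluator = Engine(lambda engine, batch: batch)

def neg_val_loss(engine):
    # placeholder: a real evaluator would expose engine.state.metrics['loss']
    return -engine.state.output

saver = ModelCheckpoint("/tmp/ckpts", filename_prefix="best",
                        score_function=neg_val_loss, score_name="neg_loss",
                        n_saved=3, create_dir=True, require_empty=False)
evaluator.add_event_handler(Events.COMPLETED, saver, {"model": model})

evaluator.run([0.5], max_epochs=1)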
def train(model, train_loader, eval_loaders, optimizer, loss_fn, n_it_max, patience,
          split_names, select_metric='Val accuracy_0', select_mode='max', viz=None,
          device='cpu', lr_scheduler=None, name=None, log_steps=None, log_epoch=False,
          _run=None, prepare_batch=_prepare_batch, single_pass=False, n_ep_max=None):
    # print(model)
    if not log_steps and not log_epoch:
        logger.warning('/!\\ No logging during training /!\\')
    if log_steps is None:
        log_steps = []

    epoch_steps = len(train_loader)
    if log_epoch:
        log_steps.append(epoch_steps)

    if single_pass:
        max_epoch = 1
    elif n_ep_max is None:
        assert n_it_max is not None
        max_epoch = int(n_it_max / epoch_steps) + 1
    else:
        assert n_it_max is None
        max_epoch = n_ep_max

    all_metrics = defaultdict(dict)
    trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device,
                                        prepare_batch=prepare_batch)
    if hasattr(model, 'new_epoch_hook'):
        trainer.add_event_handler(Events.EPOCH_STARTED, model.new_epoch_hook)
    if hasattr(model, 'new_iter_hook'):
        trainer.add_event_handler(Events.ITERATION_STARTED, model.new_iter_hook)

    trainer._logger.setLevel(logging.WARNING)

    # trainer output is in the format (x, y, y_pred, loss, optionals)
    train_loss = RunningAverage(output_transform=lambda out: out[3].item(),
                                epoch_bound=True)
    train_loss.attach(trainer, 'Trainer loss')

    if hasattr(model, 's'):
        met = Average(output_transform=lambda _: float('nan') if model.s is None else model.s)
        met.attach(trainer, 'cur_s')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, met.completed, 'cur_s')

    if hasattr(model, 'arch_sampler') and model.arch_sampler.distrib_dim > 0:
        met = Average(output_transform=lambda _: float('nan')
                      if model.cur_split is None else model.cur_split)
        met.attach(trainer, 'Trainer split')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, met.completed,
                                  'Trainer split')
        # trainer.add_event_handler(Events.EPOCH_STARTED, met.started)

        all_ent = Average(output_transform=lambda out: out[-1]['arch_entropy_avg'].item())
        all_ent.attach(trainer, 'Trainer all entropy')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, all_ent.completed,
                                  'Trainer all entropy')

        train_ent = Average(output_transform=lambda out: out[-1]['arch_entropy_sample'].item())
        train_ent.attach(trainer, 'Trainer sampling entropy')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, train_ent.completed,
                                  'Trainer sampling entropy')

        trainer.add_event_handler(
            Events.EPOCH_COMPLETED,
            lambda engine: model.check_arch_freezing(
                ent=train_ent.compute(),
                epoch=engine.state.iteration / (epoch_steps * max_epoch)))

        def log_always(engine, name):
            val = engine.state.output[-1][name]
            all_metrics[name][engine.state.iteration / epoch_steps] = val.mean().item()

        def log_always_dict(engine, name):
            for node, val in engine.state.output[-1][name].items():
                all_metrics['node {} {}'.format(node, name)][
                    engine.state.iteration / epoch_steps] = val.mean().item()

        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always_dict, name='arch_grads')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always_dict, name='arch_probas')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always_dict, name='node_grads')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always, name='task all_loss')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always, name='arch all_loss')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always, name='entropy all_loss')

    if n_it_max is not None:
        StopAfterIterations([n_it_max]).attach(trainer)

    # epoch_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                          persist=True, disable=not (_run or viz))
    # epoch_pbar.attach(trainer, metric_names=['Train loss'])
    #
    # training_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                             persist=True, disable=not (_run or viz))
    # training_pbar.attach(trainer, event_name=Events.EPOCH_COMPLETED,
    #                      closing_event_name=Events.COMPLETED)

    total_time = Timer(average=False)
    eval_time = Timer(average=False)
    eval_time.pause()
    data_time = Timer(average=False)
    forward_time = Timer(average=False)
    forward_time.attach(trainer, start=Events.EPOCH_STARTED,
                        pause=Events.ITERATION_COMPLETED,
                        resume=Events.ITERATION_STARTED,
                        step=Events.ITERATION_COMPLETED)
    epoch_time = Timer(average=False)
    epoch_time.attach(trainer, start=Events.EPOCH_STARTED,
                      pause=Events.EPOCH_COMPLETED,
                      resume=Events.EPOCH_STARTED,
                      step=Events.EPOCH_COMPLETED)

    def get_loss(y_pred, y):
        l = loss_fn(y_pred, y)
        if not torch.is_tensor(l):
            l, *l_details = l
        return l.mean()

    def get_member(x, n=0):
        if isinstance(x, (list, tuple)):
            return x[n]
        return x

    eval_metrics = {'loss': Loss(get_loss)}

    for i in range(model.n_out):
        out_trans = get_attr_transform(i)

        def extract_ys(out, out_trans=out_trans):
            # default argument binds the current transform; a plain closure
            # would see only the last out_trans after the loop finishes
            x, y, y_pred, loss, _ = out
            return out_trans((y_pred, y))

        train_acc = Accuracy(extract_ys)
        train_acc.attach(trainer, 'Trainer accuracy_{}'.format(i))
        trainer.add_event_handler(Events.ITERATION_COMPLETED, train_acc.completed,
                                  'Trainer accuracy_{}'.format(i))
        eval_metrics['accuracy_{}'.format(i)] = Accuracy(output_transform=out_trans)

    # if isinstance(model, SSNWrapper):
    #     model.arch_sampler.entropy().mean()

    evaluator = create_supervised_evaluator(model, metrics=eval_metrics,
                                            device=device, prepare_batch=prepare_batch)

    last_iteration = 0
    patience_counter = 0
    best = {
        'value': float('inf') * 1 if select_mode == 'min' else -1,
        'iter': -1,
        'state_dict': None
    }

    def is_better(new, old):
        if select_mode == 'min':
            return new < old
        else:
            return new > old

    def log_results(evaluator, data_loader, iteration, split_name):
        evaluator.run(data_loader)
        metrics = evaluator.state.metrics
        log_metrics = {}
        for metric_name, metric_val in metrics.items():
            log_name = '{} {}'.format(split_name, metric_name)
            if viz:
                first = iteration == 0 and split_name == split_names[0]
                viz.line([metric_val], X=[iteration], win=metric_name, name=log_name,
                         update=None if first else 'append',
                         opts={'title': metric_name, 'showlegend': True,
                               'width': 500, 'xlabel': 'iterations'})
                viz.line([metric_val], X=[iteration / epoch_steps],
                         win='{}epoch'.format(metric_name), name=log_name,
                         update=None if first else 'append',
                         opts={'title': metric_name, 'showlegend': True,
                               'width': 500, 'xlabel': 'epoch'})
            if _run:
                _run.log_scalar(log_name, metric_val, iteration)
            log_metrics[log_name] = metric_val
            all_metrics[log_name][iteration] = metric_val
        return log_metrics

    if lr_scheduler is not None:
        @trainer.on(Events.EPOCH_COMPLETED)
        def step(_):
            lr_scheduler.step()
            # logger.warning('current lr {:.5e}'.format(
            #     optimizer.param_groups[0]['lr']))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_event(trainer):
        iteration = trainer.state.iteration if trainer.state else 0
        nonlocal last_iteration, patience_counter, best

        if not log_steps or not \
                (iteration in log_steps or iteration % log_steps[-1] == 0):
            return

        epoch_time.pause()
        eval_time.resume()

        all_metrics['training_epoch'][iteration] = iteration / epoch_steps
        all_metrics['training_iteration'][iteration] = iteration
        if hasattr(model, 'arch_sampler'):
            all_metrics['training_archs'][iteration] = \
                model.arch_sampler().squeeze().detach()
        # if hasattr(model, 'distrib_gen'):
        #     entropy = model.distrib_gen.entropy()
        #     all_metrics['entropy'][iteration] = entropy.mean().item()

        # if trainer.state and len(trainer.state.metrics) > 1:
        #     raise ValueError(trainer.state.metrics)

        all_metrics['data time'][iteration] = data_time.value()
        all_metrics['data time_ps'][iteration] = data_time.value() / max(
            data_time.step_count, 1.)
        all_metrics['forward time'][iteration] = forward_time.value()
        all_metrics['forward time_ps'][iteration] = forward_time.value() / max(
            forward_time.step_count, 1.)
        all_metrics['epoch time'][iteration] = epoch_time.value()
        all_metrics['epoch time_ps'][iteration] = epoch_time.value() / max(
            epoch_time.step_count, 1.)

        if trainer.state:
            # logger.warning(trainer.state.metrics)
            for metric, value in trainer.state.metrics.items():
                all_metrics[metric][iteration] = value
                if viz:
                    viz.line([value], X=[iteration], win=metric.split()[-1],
                             name=metric,
                             update=None if iteration == 0 else 'append',
                             opts={'title': metric, 'showlegend': True,
                                   'width': 500, 'xlabel': 'iterations'})

        iter_this_step = iteration - last_iteration
        for d_loader, name in zip(eval_loaders, split_names):
            if name == 'Train':
                if iteration == 0:
                    all_metrics['Trainer loss'][iteration] = float('nan')
                    all_metrics['Trainer accuracy_0'][iteration] = float('nan')
                    if hasattr(model, 'arch_sampler'):
                        all_metrics['Trainer all entropy'][iteration] = float('nan')
                        all_metrics['Trainer sampling entropy'][iteration] = float('nan')
                        # if hasattr(model, 'cur_split'):
                        all_metrics['Trainer split'][iteration] = float('nan')
                continue
            split_metrics = log_results(evaluator, d_loader, iteration, name)
            if select_metric not in split_metrics:
                continue
            if is_better(split_metrics[select_metric], best['value']):
                best['value'] = split_metrics[select_metric]
                best['iter'] = iteration
                best['state_dict'] = copy.deepcopy(model.state_dict())
                if patience > 0:
                    patience_counter = 0
            elif patience > 0:
                patience_counter += iter_this_step
                if patience_counter >= patience:
                    logger.info('#####')
                    logger.info('# Early stopping Run')
                    logger.info('#####')
                    trainer.terminate()

        last_iteration = iteration
        eval_time.pause()
        eval_time.step()
        all_metrics['eval time'][iteration] = eval_time.value()
        all_metrics['eval time_ps'][iteration] = eval_time.value() / eval_time.step_count
        all_metrics['total time'][iteration] = total_time.value()
        epoch_time.resume()

    log_event(trainer)

    # @trainer.on(Events.EPOCH_COMPLETED)
    # def log_epoch(trainer):
    #     iteration = trainer.state.iteration if trainer.state else 0
    #     epoch = iteration / epoch_steps
    #     fw_t = forward_time.value()
    #     fw_t_ps = fw_t / forward_time.step_count
    #     d_t = data_time.value()
    #     d_t_ps = d_t / data_time.step_count
    #     e_t = epoch_time.value()
    #     e_t_ps = e_t / epoch_time.step_count
    #     ev_t = eval_time.value()
    #     ev_t_ps = ev_t / eval_time.step_count
    #     logger.warning('<{}> Epoch {}/{} finished (Forward: {:.3f}s({:.3f}), '
    #                    'data: {:.3f}s({:.3f}), epoch: {:.3f}s({:.3f}),'
    #                    ' Eval: {:.3f}s({:.3f}), Total: '
    #                    '{:.3f}s)'.format(type(model).__name__, epoch,
    #                                      max_epoch, fw_t, fw_t_ps, d_t, d_t_ps,
    #                                      e_t, e_t_ps, ev_t, ev_t_ps,
    #                                      total_time.value()))

    data_time.attach(trainer, start=Events.STARTED,
                     pause=Events.ITERATION_STARTED,
                     resume=Events.ITERATION_COMPLETED,
                     step=Events.ITERATION_STARTED)

    if hasattr(model, 'iter_per_epoch'):
        model.iter_per_epoch = len(train_loader)

    trainer.run(train_loader, max_epochs=max_epoch)
    return trainer.state.iteration, all_metrics, best
def setup_meters(self):
    # meters
    avg_output = RunningAverage(output_transform=lambda x: x)
    avg_output.attach(self.engine, 'running_avg_loss')
def setup_ignite(
    engine: Engine,
    params: SimpleNamespace,
    exp_source,
    run_name: str,
    model,
    optimizer,
    extra_metrics: Iterable[str] = (),
):
    warnings.simplefilter("ignore", category=UserWarning)
    handler = ptan_ignite.EndOfEpisodeHandler(exp_source, bound_avg_reward=params.stop_reward)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    objects_to_checkpoint = {
        'model': model,
        'optimizer': optimizer,
        'trainer': engine
    }
    checkpoint_dir = Path("models")
    saver = DiskSaver(str(checkpoint_dir), create_dir=True, require_empty=False)
    handler = Checkpoint(objects_to_checkpoint, saver, n_saved=2)
    engine.add_event_handler(Events.ITERATION_COMPLETED(every=1000), handler)

    # resume from the most recently written checkpoint, if any
    checkpoints_paths = sorted(checkpoint_dir.iterdir(),
                               key=lambda p: p.stat().st_mtime)
    if checkpoints_paths:
        checkpoint = torch.load(checkpoints_paths[-1])
        print(f"Loading checkpoint {checkpoints_paths[-1].name}")
        Checkpoint.load_objects(to_load=objects_to_checkpoint, checkpoint=checkpoint)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get('time_passed', 0)
        print("Episode %d: reward=%.2f, steps=%s, "
              "speed=%.1f f/s, elapsed=%s" % (
                  trainer.state.episode, trainer.state.episode_reward,
                  trainer.state.episode_steps,
                  trainer.state.metrics.get('avg_fps', 0),
                  timedelta(seconds=int(passed))))

    @engine.on(ptan_ignite.EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine):
        passed = trainer.state.metrics['time_passed']
        print("Game solved in %s, after %d episodes "
              "and %d iterations!" % (timedelta(seconds=int(passed)),
                                      trainer.state.episode,
                                      trainer.state.iteration))
        trainer.should_terminate = True

    now = datetime.now().isoformat(timespec='minutes').replace(":", "-")
    logdir = f"runs/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v['loss'])
    run_avg.attach(engine, "avg_loss")

    metrics = ['reward', 'steps', 'avg_reward']
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ['avg_loss', 'avg_fps']
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(tag="train", metric_names=metrics,
                                      output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
def setup_ignite(
    engine: Engine,
    params: SimpleNamespace,
    exp_source,
    run_name: str,
    model,
    optimizer,
    buffer,
    target_net,
    extra_metrics: Iterable[str] = (),
):
    simplefilter("ignore", category=UserWarning)
    handler = EndOfEpisodeHandler(exp_source, bound_avg_reward=params.stop_reward)
    handler.attach(engine)
    EpisodeFPSHandler().attach(engine)

    objects_to_checkpoint = {
        'model': model,
        'optimizer': optimizer,
        'trainer': engine,
        "buffer": buffer,
        "target_net": target_net
    }
    checkpoint_dir = Path("models backup")
    saver = LightDiskSaver(str(checkpoint_dir), create_dir=True, require_empty=False)
    handler = Checkpoint(objects_to_checkpoint, saver, n_saved=1)
    engine.add_event_handler(Events.ITERATION_COMPLETED(every=30000), handler)

    # resume from the most recently written checkpoint, if any
    checkpoints_paths = sorted(checkpoint_dir.iterdir(),
                               key=lambda p: p.stat().st_mtime)
    if checkpoints_paths:
        checkpoint = joblib.load(checkpoints_paths[-1])
        print(f"Loading checkpoint {checkpoints_paths[-1].name}")
        Checkpoint.load_objects(to_load=objects_to_checkpoint, checkpoint=checkpoint)

    @engine.on(EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        print("Episode {}: reward={:.0f}, steps={}, speed={:.1f} f/s, elapsed={}"
              .format(trainer.state.episode, trainer.state.episode_reward,
                      trainer.state.episode_steps,
                      trainer.state.metrics.get("avg_fps", 0),
                      timedelta(seconds=int(passed))))

    @engine.on(EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine):
        passed = trainer.state.metrics["time_passed"]
        print(f"Game solved in {timedelta(seconds=int(passed))} after {trainer.state.episode}"
              f" episodes and {trainer.state.iteration} iterations!")
        trainer.should_terminate = True

    now = datetime.now().isoformat(timespec="minutes").replace(":", "-")
    logdir = f"runs/{now}-{params.run_name}-{run_name}"
    tb = TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    metrics = ["reward", "steps", "avg_reward"]
    handler = OutputHandler(tag="episodes", metric_names=metrics)
    event = EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = OutputHandler(tag="train", metric_names=metrics,
                            output_transform=lambda a: a)
    event = PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
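# A self-contained sketch of the save/resume pattern above, using plain ignite
# Checkpoint + DiskSaver instead of the custom LightDiskSaver; names and paths
# are illustrative. Sorting by mtime matters because Path.iterdir() yields
# entries in arbitrary order, so taking "[-1]" alone is not guaranteed to
# pick the newest file.
from pathlib import Path
import torch
import torch.nn as nn
from ignite.engine import Engine, Events
from ignite.handlers import Checkpoint, DiskSaver

model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
trainer = Engine(lambda engine, batch: None)

# the engine itself could be added here as well: its state_dict carries
# the epoch/iteration counters
to_checkpoint = {"model": model, "optimizer": optimizer}
ckpt_dir = Path("/tmp/sketch_models")
saver = DiskSaver(str(ckpt_dir), create_dir=True, require_empty=False)
trainer.add_event_handler(Events.ITERATION_COMPLETED(every=5),
                          Checkpoint(to_checkpoint, saver, n_saved=2))

# resume from the newest checkpoint, if one exists
paths = sorted(ckpt_dir.iterdir(), key=lambda p: p.stat().st_mtime)
if paths:
    Checkpoint.load_objects(to_load=to_checkpoint,
                            checkpoint=torch.load(paths[-1]))

trainer.run(range(10), max_epochs=1)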
def evaluate_model(run_name, model, optimizer, device, loss_name, loss_params,
                   chosen_diseases, dataloader,
                   experiment_mode="debug", base_dir=utils.BASE_DIR):
    # Create tester engine
    tester = Engine(utilsT.get_step_fn(model, optimizer, device,
                                       loss_name, loss_params, training=False))

    loss_metric = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    loss_metric.attach(tester, loss_name)

    utilsT.attach_metrics(tester, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(tester, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(tester, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(tester, chosen_diseases, "roc_auc", utilsT.RocAucMetric, False)
    utilsT.attach_metrics(tester, chosen_diseases, "cm", ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm,
                          metric_args=(2,))

    timer = Timer(average=True)
    timer.attach(tester, start=Events.EPOCH_STARTED, step=Events.EPOCH_COMPLETED)

    # Save metrics
    log_metrics = list(ALL_METRICS)

    # Run test
    print("Testing...")
    tester.run(dataloader, 1)

    # Capture time
    secs_per_epoch = timer.value()
    duration_per_epoch = utils.duration_to_str(int(secs_per_epoch))
    print("Time elapsed in epoch: ", duration_per_epoch)

    # Copy metrics dict
    metrics = dict()
    original_metrics = tester.state.metrics
    for metric_name in log_metrics:
        for disease_name in chosen_diseases:
            key = metric_name + "_" + disease_name
            if key not in original_metrics:
                print("Metric not found in tester, skipping: ", key)
                continue
            metrics[key] = original_metrics[key]

    # Copy CMs
    for disease_name in chosen_diseases:
        key = "cm_" + disease_name
        if key not in original_metrics:
            print("CM not found in tester, skipping: ", key)
            continue
        cm = original_metrics[key]
        metrics[key] = cm.numpy().tolist()

    # Save to file
    folder = os.path.join(base_dir, "results", experiment_mode)
    os.makedirs(folder, exist_ok=True)
    fname = os.path.join(folder, run_name + ".json")
    with open(fname, "w+") as f:
        json.dump(metrics, f)
    print("Saved metrics to: ", fname)

    return metrics
def _add_metrics(self):
    train_loss = RunningAverage(Loss(self.get_loss))
    train_loss.attach(self.trainer, 'avg_train_loss')
    val_loss = Loss(self.get_loss)
    val_loss.attach(self.evaluator, 'val_loss')
def _test(metric_device):
    data = list(range(n_iters))
    np.random.seed(12)
    all_y_true_batch_values = np.random.randint(
        0, n_classes, size=(idist.get_world_size(), n_epochs * n_iters, batch_size))
    all_y_pred_batch_values = np.random.rand(
        idist.get_world_size(), n_epochs * n_iters, batch_size, n_classes)
    y_true_batch_values = iter(all_y_true_batch_values[rank, ...])
    y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...])

    def update_fn(engine, batch):
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)

    trainer = Engine(update_fn)
    alpha = 0.98
    acc_metric = RunningAverage(
        Accuracy(output_transform=lambda x: [x[0], x[1]], device=metric_device),
        alpha=alpha, epoch_bound=False)
    acc_metric.attach(trainer, "running_avg_accuracy")

    running_avg_acc = [None]
    true_acc_metric = Accuracy(device=metric_device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def manual_running_avg_acc(engine):
        i = engine.state.iteration - 1
        true_acc_metric.reset()
        for j in range(idist.get_world_size()):
            output = (
                torch.from_numpy(all_y_pred_batch_values[j, i, :, :]),
                torch.from_numpy(all_y_true_batch_values[j, i, :]),
            )
            true_acc_metric.update(output)
        batch_acc = true_acc_metric._num_correct.item() * 1.0 / true_acc_metric._num_examples
        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_acc_values(engine):
        assert engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"], \
            f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}"

    trainer.run(data, max_epochs=3)
def main(cfg):
    """
    Performs training, validation and testing.
    """
    assert isdir(cfg.data_dir), '`data_dir` must be a valid path.'

    cfg.cuda = torch.cuda.is_available() and not cfg.no_cuda
    cfg.model_dir = os.getcwd()

    # setting random seed for reproducibility
    if cfg.seed:
        set_random_seed(cfg)

    device = torch.device('cuda' if cfg.cuda else 'cpu')
    os.makedirs(cfg.model_dir, exist_ok=True)

    label2id = create_label2id(cfg)
    cfg.num_labels = len(label2id)
    xlmr = create_pretrained(cfg.model_type, cfg.force_download)

    # creating dataset split loaders
    datasets = create_dataset(cfg, xlmr, label2id)
    train_dataset, valid_dataset = datasets

    def compute_loss(batch):
        """
        Computes the forward pass and returns the cross entropy loss.
        """
        inputs, labels = [
            torch.from_numpy(tensor).to(device).long() for tensor in batch
        ]
        logits = model(inputs)
        logits = logits.view(-1, logits.size(-1))
        labels = labels.view(-1)
        loss = torch.nn.functional.cross_entropy(logits, labels, ignore_index=-1)
        return loss

    def train_step(engine, batch):
        """
        Propagates the inputs forward and updates the parameters.
        """
        step = engine.state.iteration
        model.train()
        loss = compute_loss(batch)
        backward(loss)
        if cfg.clip_grad_norm is not None:
            clip_grad_norm(cfg.clip_grad_norm)
        if step % cfg.grad_accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
        # restoring the averaged loss across steps
        loss *= cfg.grad_accum_steps
        return loss.item()

    def eval_step(engine, batch):
        """
        Propagates the inputs forward without storing any gradients.
        """
        model.eval()
        with torch.no_grad():
            loss = compute_loss(batch)
        return loss.item()

    def backward(loss):
        """
        Backpropagates the loss in either mixed or normal precision mode.
        """
        if cfg.fp16:
            with amp.scale_loss(loss, optimizer) as sc:
                sc.backward()
        else:
            loss.backward()

    def clip_grad_norm(max_norm):
        """
        Applies gradient clipping.
        """
        if cfg.fp16:
            params = amp.master_params(optimizer)
        else:
            params = model.parameters()
        torch.nn.utils.clip_grad_norm_(params, max_norm)

    trainer = Engine(train_step)
    validator = Engine(eval_step)

    checkpoint = ModelCheckpoint(
        cfg.model_dir, cfg.model_type, n_saved=5,
        save_as_state_dict=True,
        score_function=lambda e: -e.state.metrics['loss'])

    last_ckpt_path = cfg.ckpt_path
    if last_ckpt_path is not None:
        msg = 'Loading state from {}'
        print(msg.format(basename(last_ckpt_path)))
        last_state = torch.load(last_ckpt_path, map_location=device)

    model = create_model(xlmr, len(label2id), cfg)
    model = model.to(device)
    del xlmr.model

    optimizer = create_optimizer(cfg, model)
    scheduler = create_scheduler(cfg, optimizer, len(train_dataset))

    # using apex if required and loading its state
    if cfg.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
        if last_ckpt_path is not None and 'amp' in last_state:
            amp.load_state_dict(last_state['amp'])

    if last_ckpt_path is not None:
        model.load_state_dict(last_state['model'])
        optimizer.load_state_dict(last_state['optimizer'])
        scheduler.load_state_dict(last_state['scheduler'])

    checkpoint_dict = {
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler
    }
    if cfg.fp16:
        checkpoint_dict['amp'] = amp

    validator.add_event_handler(Events.COMPLETED, checkpoint, checkpoint_dict)

    # separate instances: a shared RunningAverage attached to both engines
    # would be reset when the validator starts and corrupt the trainer's
    # running loss
    train_metric = RunningAverage(output_transform=lambda x: x)
    train_metric.attach(trainer, 'loss')
    valid_metric = RunningAverage(output_transform=lambda x: x)
    valid_metric.attach(validator, 'loss')

    pbar = ProgressBar()
    pbar.attach(trainer, metric_names=['loss'])

    history_path = join(cfg.model_dir, 'history.json')
    history = collections.defaultdict(list)
    headers = ['epoch', 'train_loss', 'valid_loss']
    if exists(history_path):
        with open(history_path, 'r') as fh:
            history = json.load(fh)

    def record_history(results):
        """
        Records the results to the history.
        """
        for header in headers:
            history[header].append(results[header])
        with open(history_path, 'w') as fh:
            json.dump(history, fh)

    @trainer.on(Events.EPOCH_COMPLETED)
    def print_results(engine):
        """
        Logs the training results.
        """
        validator.run(valid_dataset)
        record_history({
            'epoch': engine.state.epoch,
            'train_loss': engine.state.metrics['loss'],
            'valid_loss': validator.state.metrics['loss']
        })
        data = list(zip(*[history[h] for h in headers]))
        table = tabulate(data, headers, floatfmt='.3f')
        print(table.split('\n')[-1])

    data = list(zip(*[history[h] for h in headers]))
    print()
    print(cfg.pretty())
    print()
    print('***** Running training *****')
    print()
    print(tabulate(data, headers, floatfmt='.3f'))

    trainer.run(train_dataset, cfg.max_epochs)
def train_model(
    name="",
    resume="",
    base_dir=utils.BASE_DIR,
    model_name="v0",
    chosen_diseases=None,
    n_epochs=10,
    batch_size=4,
    oversample=False,
    max_os=None,
    shuffle=False,
    opt="sgd",
    opt_params={},
    loss_name="wbce",
    loss_params={},
    train_resnet=False,
    log_metrics=None,
    flush_secs=120,
    train_max_images=None,
    val_max_images=None,
    test_max_images=None,
    experiment_mode="debug",
    save=True,
    # Note that in this case, save_cms (to disk) includes write_cms (to TB)
    save_cms=True,
    write_graph=False,
    write_emb=False,
    write_emb_img=False,
    write_img=False,
    image_format="RGB",
    multiple_gpu=False,
):
    # Choose GPU
    device = utilsT.get_torch_device()
    print("Using device: ", device)

    # Common folders
    dataset_dir = os.path.join(base_dir, "dataset")

    # Dataset handling
    print("Loading train dataset...")
    train_dataset, train_dataloader = utilsT.prepare_data(
        dataset_dir, "train", chosen_diseases, batch_size,
        oversample=oversample, max_os=max_os, shuffle=shuffle,
        max_images=train_max_images, image_format=image_format,
    )
    train_samples, _ = train_dataset.size()

    print("Loading val dataset...")
    val_dataset, val_dataloader = utilsT.prepare_data(
        dataset_dir, "val", chosen_diseases, batch_size,
        max_images=val_max_images, image_format=image_format,
    )
    val_samples, _ = val_dataset.size()

    # Should be the same than chosen_diseases
    chosen_diseases = list(train_dataset.classes)
    print("Chosen diseases: ", chosen_diseases)

    if resume:
        # Load model and optimizer
        model, model_name, optimizer, opt, loss_name, loss_params, chosen_diseases = \
            models.load_model(base_dir, resume, experiment_mode="", device=device)
        model.train(True)
    else:
        # Create model
        model = models.init_empty_model(model_name, chosen_diseases,
                                        train_resnet=train_resnet).to(device)
        # Create optimizer
        OptClass = optimizers.get_optimizer_class(opt)
        optimizer = OptClass(model.parameters(), **opt_params)
        # print("OPT: ", opt_params)

    # Allow multiple GPUs
    if multiple_gpu:
        model = DataParallel(model)

    # Tensorboard log options
    run_name = utils.get_timestamp()
    if name:
        run_name += "_{}".format(name)
    if len(chosen_diseases) == 1:
        run_name += "_{}".format(chosen_diseases[0])
    elif len(chosen_diseases) == 14:
        run_name += "_all"

    log_dir = get_log_dir(base_dir, run_name, experiment_mode=experiment_mode)
    print("Run name: ", run_name)
    print("Saved TB in: ", log_dir)
    writer = SummaryWriter(log_dir=log_dir, flush_secs=flush_secs)

    # Create validator engine
    validator = Engine(utilsT.get_step_fn(model, optimizer, device,
                                          loss_name, loss_params, False))
    val_loss = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    val_loss.attach(validator, loss_name)
    utilsT.attach_metrics(validator, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(validator, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(validator, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(validator, chosen_diseases, "roc_auc", utilsT.RocAucMetric, False)
    utilsT.attach_metrics(validator, chosen_diseases, "cm", ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm,
                          metric_args=(2,))
    utilsT.attach_metrics(validator, chosen_diseases, "positives", RunningAverage,
                          get_transform_fn=utilsT.get_count_positives)

    # Create trainer engine
    trainer = Engine(utilsT.get_step_fn(model, optimizer, device,
                                        loss_name, loss_params, True))
    train_loss = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    train_loss.attach(trainer, loss_name)
    utilsT.attach_metrics(trainer, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "roc_auc", utilsT.RocAucMetric, False)
    utilsT.attach_metrics(trainer, chosen_diseases, "cm", ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm,
                          metric_args=(2,))
    utilsT.attach_metrics(trainer, chosen_diseases, "positives", RunningAverage,
                          get_transform_fn=utilsT.get_count_positives)

    timer = Timer(average=True)
    timer.attach(trainer, start=Events.EPOCH_STARTED, step=Events.EPOCH_COMPLETED)

    # TODO: Early stopping
    # def score_function(engine):
    #     val_loss = engine.state.metrics[loss_name]
    #     return -val_loss
    # handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
    # validator.add_event_handler(Events.COMPLETED, handler)

    # Metrics callbacks
    if log_metrics is None:
        log_metrics = list(ALL_METRICS)

    def _write_metrics(run_type, metrics, epoch, wall_time):
        loss = metrics.get(loss_name, 0)
        writer.add_scalar("Loss/" + run_type, loss, epoch, wall_time)
        for metric_base_name in log_metrics:
            for disease in chosen_diseases:
                metric_value = metrics.get("{}_{}".format(metric_base_name, disease), -1)
                writer.add_scalar("{}_{}/{}".format(metric_base_name, disease, run_type),
                                  metric_value, epoch, wall_time)

    @trainer.on(Events.EPOCH_COMPLETED)
    def tb_write_metrics(trainer):
        epoch = trainer.state.epoch
        max_epochs = trainer.state.max_epochs

        # Run on evaluation
        validator.run(val_dataloader, 1)

        # Common time
        wall_time = time.time()

        # Log all metrics to TB
        _write_metrics("train", trainer.state.metrics, epoch, wall_time)
        _write_metrics("val", validator.state.metrics, epoch, wall_time)

        train_loss = trainer.state.metrics.get(loss_name, 0)
        val_loss = validator.state.metrics.get(loss_name, 0)

        tb_write_histogram(writer, model, epoch, wall_time)

        print("Finished epoch {}/{}, loss {:.3f}, val loss {:.3f} (took {})"
              .format(epoch, max_epochs, train_loss, val_loss,
                      utils.duration_to_str(int(timer._elapsed()))))

    # Hparam dict
    hparam_dict = {
        "resume": resume,
        "n_diseases": len(chosen_diseases),
        "diseases": ",".join(chosen_diseases),
        "n_epochs": n_epochs,
        "batch_size": batch_size,
        "shuffle": shuffle,
        "model_name": model_name,
        "opt": opt,
        "loss": loss_name,
        "samples (train, val)": "{},{}".format(train_samples, val_samples),
        "train_resnet": train_resnet,
        "multiple_gpu": multiple_gpu,
    }

    def copy_params(params_dict, base_name):
        for name, value in params_dict.items():
            hparam_dict["{}_{}".format(base_name, name)] = value

    copy_params(loss_params, "loss")
    copy_params(opt_params, "opt")
    print("HPARAM: ", hparam_dict)

    # Train
    print("-" * 50)
    print("Training...")
    trainer.run(train_dataloader, n_epochs)

    # Capture time
    secs_per_epoch = timer.value()
    duration_per_epoch = utils.duration_to_str(int(secs_per_epoch))
    print("Average time per epoch: ", duration_per_epoch)
    print("-" * 50)

    ## Write all hparams
    hparam_dict["duration_per_epoch"] = duration_per_epoch

    # FIXME: this is commented to avoid having too many hparams in TB frontend
    # def copy_metrics(engine, engine_name):
    #     for metric_name, metric_value in engine.state.metrics.items():
    #         hparam_dict["{}_{}".format(engine_name, metric_name)] = metric_value
    # copy_metrics(trainer, "train")
    # copy_metrics(validator, "val")

    print("Writing TB hparams")
    writer.add_hparams(hparam_dict, {})

    # Save model to disk
    if save:
        print("Saving model...")
        models.save_model(base_dir, run_name, model_name, experiment_mode,
                          hparam_dict, trainer, model, optimizer)

    # Write graph to TB
    if write_graph:
        print("Writing TB graph...")
        tb_write_graph(writer, model, train_dataloader, device)

    # Write embeddings to TB
    if write_emb:
        print("Writing TB embeddings...")
        image_size = 256 if write_emb_img else 0

        # FIXME: be able to select images (balanced, train vs val, etc)
        image_list = list(train_dataset.label_index["FileName"])[:1000]
        # disease = chosen_diseases[0]
        # positive = train_dataset.label_index[train_dataset.label_index[disease] == 1]
        # negative = train_dataset.label_index[train_dataset.label_index[disease] == 0]
        # positive_images = list(positive["FileName"])[:25]
        # negative_images = list(negative["FileName"])[:25]
        # image_list = positive_images + negative_images

        all_images, all_embeddings, all_predictions, all_ground_truths = gen_embeddings(
            model, train_dataset, device,
            image_list=image_list, image_size=image_size)
        tb_write_embeddings(
            writer, chosen_diseases,
            all_images, all_embeddings, all_predictions, all_ground_truths,
            global_step=n_epochs,
            use_images=write_emb_img,
            tag="1000_{}".format("img" if write_emb_img else "no_img"),
        )

    # Save confusion matrices (is expensive to calculate them afterwards)
    if save_cms:
        print("Saving confusion matrices...")
        # Assure folder
        cms_dir = os.path.join(base_dir, "cms", experiment_mode)
        os.makedirs(cms_dir, exist_ok=True)
        base_fname = os.path.join(cms_dir, run_name)

        n_diseases = len(chosen_diseases)

        def extract_cms(metrics):
            """Extract confusion matrices from a metrics dict."""
            cms = []
            for disease in chosen_diseases:
                key = "cm_" + disease
                if key not in metrics:
                    cm = np.array([[-1, -1], [-1, -1]])
                else:
                    cm = metrics[key].numpy()
                cms.append(cm)
            return np.array(cms)

        # Train confusion matrix
        train_cms = extract_cms(trainer.state.metrics)
        np.save(base_fname + "_train", train_cms)
        tb_write_cms(writer, "train", chosen_diseases, train_cms)

        # Validation confusion matrix
        val_cms = extract_cms(validator.state.metrics)
        np.save(base_fname + "_val", val_cms)
        tb_write_cms(writer, "val", chosen_diseases, val_cms)

        # All confusion matrix (train + val)
        all_cms = train_cms + val_cms
        np.save(base_fname + "_all", all_cms)

        # Print to console
        if len(chosen_diseases) == 1:
            print("Train CM: ")
            print(train_cms[0])
            print("Val CM: ")
            print(val_cms[0])
            # print("Train CM 2: ")
            # print(trainer.state.metrics["cm_" + chosen_diseases[0]])
            # print("Val CM 2: ")
            # print(validator.state.metrics["cm_" + chosen_diseases[0]])

    if write_img:
        # NOTE: this option is not recommended, use Testing notebook to plot and analyze images
        print("Writing images to TB...")

        test_dataset, test_dataloader = utilsT.prepare_data(
            dataset_dir, "test", chosen_diseases, batch_size,
            max_images=test_max_images,
        )

        # TODO: add a way to select images?
        # image_list = list(test_dataset.label_index["FileName"])[:3]
        # Examples in test_dataset (with bboxes available):
        image_list = [
            # "00010277_000.png",  # (Effusion, Infiltrate, Mass, Pneumonia)
            # "00018427_004.png",  # (Atelectasis, Effusion, Mass)
            # "00021703_001.png",  # (Atelectasis, Effusion, Infiltrate)
            # "00028640_008.png",  # (Effusion, Infiltrate)
            # "00019124_104.png",  # (Pneumothorax)
            # "00019124_090.png",  # (Nodule)
            # "00020318_007.png",  # (Pneumothorax)
            "00000003_000.png",  # (0)
            # "00000003_001.png",  # (0)
            # "00000003_002.png",  # (0)
            "00000732_005.png",  # (Cardiomegaly, Pneumothorax)
            # "00012261_001.png",  # (Cardiomegaly, Pneumonia)
            # "00013249_033.png",  # (Cardiomegaly, Pneumonia)
            # "00029808_003.png",  # (Cardiomegaly, Pneumonia)
            # "00022215_012.png",  # (Cardiomegaly, Pneumonia)
            # "00011402_007.png",  # (Cardiomegaly, Pneumonia)
            # "00019018_007.png",  # (Cardiomegaly, Infiltrate)
            # "00021009_001.png",  # (Cardiomegaly, Infiltrate)
            # "00013670_151.png",  # (Cardiomegaly, Infiltrate)
            # "00005066_030.png",  # (Cardiomegaly, Infiltrate, Effusion)
            "00012288_000.png",  # (Cardiomegaly)
            "00008399_007.png",  # (Cardiomegaly)
            "00005532_000.png",  # (Cardiomegaly)
            "00005532_014.png",  # (Cardiomegaly)
            "00005532_016.png",  # (Cardiomegaly)
            "00005827_000.png",  # (Cardiomegaly)
            # "00006912_007.png",  # (Cardiomegaly)
            # "00007037_000.png",  # (Cardiomegaly)
            # "00007043_000.png",  # (Cardiomegaly)
            # "00012741_004.png",  # (Cardiomegaly)
            # "00007551_020.png",  # (Cardiomegaly)
            # "00007735_040.png",  # (Cardiomegaly)
            # "00008339_010.png",  # (Cardiomegaly)
            # "00008365_000.png",  # (Cardiomegaly)
            # "00012686_003.png",  # (Cardiomegaly)
        ]

        tb_write_images(writer, model, test_dataset, chosen_diseases,
                        n_epochs, device, image_list)

    # Close TB writer
    if experiment_mode != "debug":
        writer.close()

    # Run post_train
    print("-" * 50)
    print("Running post_train...")

    print("Loading test dataset...")
    test_dataset, test_dataloader = utilsT.prepare_data(
        dataset_dir, "test", chosen_diseases, batch_size,
        max_images=test_max_images)

    save_cms_with_names(run_name, experiment_mode, model, test_dataset,
                        test_dataloader, chosen_diseases)
    evaluate_model(run_name, model, optimizer, device, loss_name, loss_params,
                   chosen_diseases, test_dataloader,
                   experiment_mode=experiment_mode, base_dir=base_dir)

    # Return values for debugging
    model_run = ModelRun(model, run_name, model_name, chosen_diseases)
    if experiment_mode == "debug":
        model_run.save_debug_data(writer, trainer, validator,
                                  train_dataset, train_dataloader,
                                  val_dataset, val_dataloader)

    return model_run
        num_workers=6)

    model = BaselineModel(
        config=experiment_config.pop("model"),
        embeddings_result_file=data_config.get("embeddings_result_file"),
        vocab=vocab)
    optimizer = Adam(model.parameters(), training_config.pop("lr"))
    loss = CrossEntropyLoss()

    trainer = create_supervised_trainer(model, optimizer, loss, device=device)
    evaluator = create_supervised_evaluator(
        model, metrics={'accuracy': VisualQAAccuracy()}, device=device)

    # Create and add handlers
    run_avg = RunningAverage(output_transform=lambda x: x)
    run_avg.attach(trainer, 'loss')

    pbar = ProgressBar(persist=False, bar_format=None)
    pbar.attach(trainer, ['loss'])
    pbar.attach(evaluator)

    eval_handler = EvalHandler(evaluator=evaluator, data_loader=val_loader)
    eval_handler.attach(trainer)

    if not DEBUGGING_MODE:
        tb_handler = TensorboardHandler(evaluator=evaluator)
        tb_handler.attach(trainer)
        mlflow_handler = MlflowHandler(evaluator=evaluator)
        mlflow_handler.attach(trainer)

    # Finally, run the training process
    trainer.run(train_loader, max_epochs=training_config.pop("n_epochs"))
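The same trainer/evaluator wiring works without the project-specific handlers (EvalHandler, TensorboardHandler, MlflowHandler). A minimal sketch using only ignite built-ins; the model and data here are placeholders, not the VQA setup above:

# Minimal sketch of the pattern above, with placeholder model and data.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, RunningAverage
from ignite.contrib.handlers import ProgressBar

model = nn.Linear(10, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss = nn.CrossEntropyLoss()

data = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
train_loader = DataLoader(data, batch_size=8)
val_loader = DataLoader(data, batch_size=8)

trainer = create_supervised_trainer(model, optimizer, loss)
evaluator = create_supervised_evaluator(model, metrics={"accuracy": Accuracy()})

# Smooth the per-iteration loss and surface it in the progress bar.
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
ProgressBar(persist=False).attach(trainer, ["loss"])

@trainer.on(Events.EPOCH_COMPLETED)
def validate(engine):
    evaluator.run(val_loader)
    print(f"epoch {engine.state.epoch}: {evaluator.state.metrics}")

trainer.run(train_loader, max_epochs=2)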
def run(train_dir, val_dir=None, learning_rate=1e-4, num_workers=1, num_epochs=100,
        batch_size=16, shuffle=False, num_controls=2, num_intentions=4, hidden_dim=256,
        log_interval=10, log_dir='./logs', seed=2605, accumulation_steps=4,
        save_model='models', resume=None):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    cudnn.benchmark = True

    train_loader, val_loader = get_dataloader(train_dir, val_dir, num_workers=num_workers,
                                              batch_size=batch_size, shuffle=shuffle)

    if resume:
        model = torch.load(resume)
    else:
        model = DepthIntentionEncodeModel(num_controls=num_controls,
                                          num_intentions=num_intentions,
                                          hidden_dim=hidden_dim)
    model = model.to(device)

    writer = create_summary_writer(model, train_loader, log_dir)
    criterion = nn.MSELoss()
    check_manual_seed(seed)

    # optim = RAdam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999))
    optim = SGD(model.parameters(), lr=learning_rate)
    lr_scheduler = ExponentialLR(optim, gamma=0.95)
    checkpoints = ModelCheckpoint(save_model, 'Model', save_interval=1, n_saved=3,
                                  create_dir=True, require_empty=False,
                                  save_as_state_dict=False)

    # NOTE: accumulation_steps is accepted above but never used by update_fn.
    def update_fn(engine, batch):
        model.train()
        optim.zero_grad()
        x, y = batch
        x = [t.to(device) for t in x]
        y = y.to(device)
        y_pred = model(*x)
        loss = criterion(y_pred, y)
        loss.backward()
        optim.step()
        return loss.item()

    def evaluate_fn(engine, batch):
        engine.state.metrics = dict()
        model.eval()
        x, y = batch
        x = [t.to(device) for t in x]
        y = y.to(device)
        y_pred = model(*x)
        mse_loss = F.mse_loss(y_pred, y)
        mae_loss = F.l1_loss(y_pred, y)
        engine.state.metrics['mse'] = mse_loss
        engine.state.metrics['mae'] = mae_loss

    trainer = Engine(update_fn)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoints, {'model': model})

    avg_loss = RunningAverage(output_transform=lambda x: x, alpha=0.1)
    avg_loss.attach(trainer, 'running_avg_loss')

    pbar = ProgressBar()
    pbar.attach(trainer, ['running_avg_loss'])

    evaluator = Engine(evaluate_fn)
    pbar.attach(evaluator)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        it = (engine.state.iteration - 1) % len(train_loader) + 1
        if it % log_interval == 0:
            print("[Epoch: {}][Iteration: {}/{}] loss: {:.4f}".format(
                engine.state.epoch, it, len(train_loader), engine.state.output))
            writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)
            for name, param in model.named_parameters():
                writer.add_histogram(name, param.clone().cpu().data.numpy(), it)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        mse = metrics['mse']
        mae = metrics['mae']
        print("Training Results - Epoch: {} mse: {:.5f} mae: {:.5f}".format(
            engine.state.epoch, mse, mae))
        writer.add_scalar("training/mse", mse, engine.state.epoch)
        writer.add_scalar("training/mae", mae, engine.state.epoch)

    # @trainer.on(Events.EPOCH_COMPLETED)
    # def log_validation_results(engine):
    #     evaluator.run(val_loader)
    #     metrics = evaluator.state.metrics
    #     mse = metrics['mse']
    #     mae = metrics['mae']
    #     print("Validation Results - Epoch: {} mse: {:.2f} mae: {:.2f}".format(
    #         engine.state.epoch, mse, mae))
    #     writer.add_scalar("valid/mse", mse, engine.state.epoch)
    #     writer.add_scalar("valid/mae", mae, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def update_lr_scheduler(engine):
        lr_scheduler.step()
        print('learning rate is: {:.6f}'.format(lr_scheduler.get_lr()[0]))

    trainer.run(train_loader, max_epochs=num_epochs)
    writer.close()
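Since `accumulation_steps` is accepted by `run` but never used, here is a hedged sketch of how gradient accumulation could be wired into an ignite update function, if it were actually wanted. This is not what `update_fn` above does; `model`, `optim`, `criterion`, `device`, and `accumulation_steps` are assumed to exist as in the snippet:

# Sketch: gradient accumulation inside an ignite update function.
def update_fn_with_accumulation(engine, batch):
    model.train()
    x, y = batch
    x = [t.to(device) for t in x]
    y = y.to(device)
    y_pred = model(*x)
    # Scale the loss so the summed gradients match one large-batch step.
    loss = criterion(y_pred, y) / accumulation_steps
    loss.backward()
    # engine.state.iteration starts at 1, so this steps every N iterations.
    if engine.state.iteration % accumulation_steps == 0:
        optim.step()
        optim.zero_grad()
    return loss.item() * accumulation_steps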
def test_integration():
    n_iters = 100
    batch_size = 10
    n_classes = 10
    y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    loss_values = iter(range(n_iters))

    def update_fn(engine, batch):
        loss_value = next(loss_values)
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)

    trainer = Engine(update_fn)
    alpha = 0.98

    acc_metric = RunningAverage(Accuracy(output_transform=lambda x: [x[1], x[2]]), alpha=alpha)
    acc_metric.attach(trainer, "running_avg_accuracy")

    avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha)
    avg_output.attach(trainer, "running_avg_output")

    running_avg_acc = [None]

    @trainer.on(Events.ITERATION_COMPLETED)
    def manual_running_avg_acc(engine):
        _, y_pred, y = engine.state.output
        indices = torch.max(y_pred, 1)[1]
        correct = torch.eq(indices, y).view(-1)
        num_correct = torch.sum(correct).item()
        num_examples = correct.shape[0]
        batch_acc = num_correct * 1.0 / num_examples
        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    @trainer.on(Events.EPOCH_STARTED)
    def running_avg_output_init(engine):
        engine.state.running_avg_output = None

    @trainer.on(Events.ITERATION_COMPLETED)
    def running_avg_output_update(engine):
        if engine.state.running_avg_output is None:
            engine.state.running_avg_output = engine.state.output[0]
        else:
            engine.state.running_avg_output = (
                engine.state.running_avg_output * alpha + (1.0 - alpha) * engine.state.output[0])

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_acc_values(engine):
        assert (
            engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"]
        ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}"

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_output_values(engine):
        assert (
            engine.state.running_avg_output == engine.state.metrics["running_avg_output"]
        ), f"{engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}"

    np.random.seed(10)
    # First run: rebinding these names also resets what the handlers see,
    # since the closures capture the variables, not their values.
    running_avg_acc = [None]
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)

    # Second run, with freshly reset state.
    running_avg_acc = [None]
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)
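The manual handlers in this test reimplement the exponential moving average that RunningAverage computes: avg = alpha * avg + (1 - alpha) * value, seeded with the first value. A tiny standalone reference, useful when sanity-checking tests like this one:

# Standalone reference for the smoothing RunningAverage applies.
def running_average(values, alpha=0.98):
    avg = None
    for v in values:
        avg = v if avg is None else avg * alpha + (1.0 - alpha) * v
        yield avg

# e.g. list(running_average([1.0, 2.0, 3.0], alpha=0.5)) == [1.0, 1.5, 2.25]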
def train(model, train_loader, eval_loaders, optimizer, loss_fn, n_it_max, patience,
          split_names, viz=None, device='cpu', name=None, log_steps=None,
          log_epoch=False, _run=None):
    """
    :param model: model to train
    :param train_loader: DataLoader over the training split
    :param eval_loaders: DataLoaders for the splits to evaluate on (train, val[, test])
    :param optimizer:
    :param loss_fn:
    :param n_it_max: maximum number of training iterations
    :param patience: early-stopping patience in iterations (<= 0 disables early stopping)
    :param split_names: names of the splits in eval_loaders
    :param viz: optional visdom instance for plotting
    :param device:
    :param name:
    :param log_steps: iterations at which evaluation metrics are logged
    :param log_epoch: if True, also log at the end of every epoch
    :param _run: optional sacred Run for metric logging
    :return: (last iteration, all_metrics, best_state_dict)
    """
    if not log_steps and not log_epoch:
        logger.warning('/!\\ No logging during training /!\\')

    if log_steps is None:
        log_steps = []
    if log_epoch:
        log_steps.append(len(train_loader))

    trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device)
    trainer._logger.setLevel(logging.WARNING)

    train_loss = RunningAverage(output_transform=lambda loss: loss, epoch_bound=False)
    train_loss.attach(trainer, 'train_loss')

    StopAfterIterations([n_it_max]).attach(trainer)

    # epoch_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                          persist=True, disable=not (_run or viz))
    # epoch_pbar.attach(trainer, metric_names=['train_loss'])
    # training_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                             persist=True, disable=not (_run or viz))
    # training_pbar.attach(trainer, event_name=Events.EPOCH_COMPLETED,
    #                      closing_event_name=Events.COMPLETED)

    eval_metrics = {'nll': Loss(lambda y_pred, y: loss_fn(y_pred, y).mean())}
    for i in range(model.n_out):
        eval_metrics['accuracy_{}'.format(i)] = Accuracy(output_transform=get_attr_transform(i))

    evaluator = create_supervised_evaluator(model, metrics=eval_metrics, device=device)

    all_metrics = defaultdict(dict)

    last_iteration = 0
    patience_counter = 0
    best_loss = float('inf')
    best_state_dict = None
    best_iter = -1

    def log_results(evaluator, data_loader, iteration, split_name):
        evaluator.run(data_loader)
        metrics = evaluator.state.metrics

        log_metrics = {}
        for metric_name, metric_val in metrics.items():
            log_name = '{} {}'.format(split_name, metric_name)
            if viz:
                viz.line([metric_val], X=[iteration], win=metric_name, name=log_name,
                         update='append',
                         opts={'title': metric_name, 'showlegend': True, 'width': 500})
            if _run:
                _run.log_scalar(log_name, metric_val, iteration)
            log_metrics[log_name] = metric_val
            all_metrics[log_name][iteration] = metric_val
        return log_metrics

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_event(trainer):
        iteration = trainer.state.iteration if trainer.state else 0
        nonlocal last_iteration, patience_counter, best_state_dict, best_loss, best_iter

        if not log_steps or not (iteration in log_steps or iteration % log_steps[-1] == 0):
            return

        all_metrics['training_epoch'][iteration] = iteration / len(train_loader)
        all_metrics['training_iterations'][iteration] = iteration

        if trainer.state and 'train_loss' in trainer.state.metrics:
            all_metrics['train_loss'][iteration] = trainer.state.metrics['train_loss']
        else:
            all_metrics['train_loss'][iteration] = float('nan')

        iter_this_step = iteration - last_iteration
        for d_loader, name in zip(eval_loaders, split_names):
            if name == 'Train':
                continue
            split_metrics = log_results(evaluator, d_loader, iteration, name)
            if name == 'Val' and patience > 0:
                if split_metrics['Val nll'] < best_loss:
                    best_loss = split_metrics['Val nll']
                    best_iter = iteration
                    patience_counter = 0
                    best_state_dict = copy.deepcopy(model.state_dict())
                else:
                    patience_counter += iter_this_step
                    if patience_counter >= patience:
                        logger.info('#####')
                        logger.info('# Early stopping run')
                        logger.info('#####')
                        trainer.terminate()
        last_iteration = iteration

    # Log initial metrics once before training starts (iteration 0).
    log_event(trainer)

    max_epoch = int(n_it_max / len(train_loader)) + 1

    trainer.run(train_loader, max_epochs=max_epoch)

    # all_metrics['mean_loss'] = all_metrics['Val nll']
    all_metrics['mean_loss'] = best_loss
    all_metrics['training_iteration'] = best_iter

    return trainer.state.iteration, all_metrics, best_state_dict
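The manual patience counter above could also be expressed with ignite's built-in EarlyStopping handler, at the cost of counting patience in evaluator runs rather than training iterations. A hedged sketch under that assumption; `trainer` and `evaluator` are assumed to exist as above, and the score function negates the loss because EarlyStopping treats higher scores as better:

# Sketch: ignite's EarlyStopping as an alternative to the manual patience counter.
from ignite.engine import Events
from ignite.handlers import EarlyStopping

def score_function(engine):
    # EarlyStopping stops when the score stops improving; higher is better,
    # so negate the validation loss.
    return -engine.state.metrics['nll']

handler = EarlyStopping(patience=5, score_function=score_function, trainer=trainer)
evaluator.add_event_handler(Events.COMPLETED, handler)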