Example #1
File: trainer.py  Project: Hadryan/slp
    def attach(self: TrainerType) -> TrainerType:
        ra = RunningAverage(output_transform=lambda x: x)
        ra.attach(self.trainer, "Train Loss")
        self.pbar.attach(self.trainer, ['Train Loss'])
        self.val_pbar.attach(self.train_evaluator)
        self.val_pbar.attach(self.valid_evaluator)
        self.valid_evaluator.add_event_handler(Events.COMPLETED,
                                               self.early_stop)
        ckpt = {'model': self.model, 'optimizer': self.optimizer}
        self.valid_evaluator.add_event_handler(Events.COMPLETED,
                                               self.checkpoint, ckpt)

        def graceful_exit(engine, e):
            if isinstance(e, KeyboardInterrupt):
                engine.terminate()
                LOGGER.warning("CTRL-C caught. Exiting gracefully...")
            else:
                raise e

        self.trainer.add_event_handler(Events.EXCEPTION_RAISED, graceful_exit)
        self.train_evaluator.add_event_handler(Events.EXCEPTION_RAISED,
                                               graceful_exit)
        self.valid_evaluator.add_event_handler(Events.EXCEPTION_RAISED,
                                               graceful_exit)
        return self
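
The snippets on this page share one pattern: build a RunningAverage from the raw engine output (or from another metric), attach it to an Engine under a name, and read the smoothed value back from engine.state.metrics. A minimal, self-contained sketch of that pattern (the synthetic loss values and the metric name "running_loss" are made up for illustration):

from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage


def update_fn(engine, batch):
    # stand-in for a real training step; the "loss" is just the batch value
    return float(batch)


trainer = Engine(update_fn)
# smooth the raw engine output and expose it as a named metric
RunningAverage(output_transform=lambda loss: loss).attach(trainer, "running_loss")


@trainer.on(Events.ITERATION_COMPLETED)
def print_running_loss(engine):
    print(engine.state.iteration, engine.state.metrics["running_loss"])


trainer.run([1.0, 2.0, 3.0, 4.0], max_epochs=1)
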
Example #2
def setup_ignite(
    engine: Engine,
    params: SimpleNamespace,
    exp_source,
    run_name: str,
    extra_metrics: Iterable[str] = (),
):
    warnings.simplefilter("ignore", category=UserWarning)
    handler = ptan_ignite.EndOfEpisodeHandler(exp_source, bound_avg_reward=params.stop_reward)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        print(
            "Episode %d: reward=%.2f, steps=%s, "
            "speed=%.1f f/s, elapsed=%s"
            % (
                trainer.state.episode,
                trainer.state.episode_reward,
                trainer.state.episode_steps,
                trainer.state.metrics.get("avg_fps", 0),
                timedelta(seconds=int(passed)),
            )
        )

    @engine.on(ptan_ignite.EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine):
        passed = trainer.state.metrics["time_passed"]
        print(
            "Game solved in %s, after %d episodes "
            "and %d iterations!"
            % (timedelta(seconds=int(passed)), trainer.state.episode, trainer.state.iteration)
        )
        trainer.should_terminate = True

    now = datetime.now().isoformat(timespec="minutes")
    logdir = f"runs/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    metrics = ["reward", "steps", "avg_reward"]
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(
        tag="train", metric_names=metrics, output_transform=lambda a: a
    )
    event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
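
The TensorBoard wiring in these examples is independent of ptan: an OutputHandler is built from metric names and attached to the logger for a given event. A sketch of the same wiring with a plain ignite filtered event standing in for ptan_ignite.PeriodEvents (the dummy engine, log directory, and metric name are assumptions for illustration):

from ignite.contrib.handlers import tensorboard_logger as tb_logger
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage

engine = Engine(lambda e, batch: {"loss": float(batch)})
RunningAverage(output_transform=lambda v: v["loss"]).attach(engine, "avg_loss")

tb = tb_logger.TensorboardLogger(log_dir="runs/sketch")
handler = tb_logger.OutputHandler(tag="train", metric_names=["avg_loss"])
# log every 100 iterations; ptan_ignite.PeriodEvents plays this role above
tb.attach(engine, log_handler=handler,
          event_name=Events.ITERATION_COMPLETED(every=100))

engine.run([0.5] * 200, max_epochs=1)
tb.close()
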
Example #3
def _test_distrib_on_output(device):

    rank = idist.get_rank()
    n_iters = 10
    n_epochs = 3
    batch_size = 10

    # Data per rank
    data = list(range(n_iters))
    k = n_epochs * batch_size * n_iters
    all_loss_values = torch.arange(0,
                                   k * idist.get_world_size(),
                                   dtype=torch.float64).to(device)
    loss_values = iter(all_loss_values[k * rank:k * (rank + 1)])

    def update_fn(engine, batch):
        loss_value = next(loss_values)
        return loss_value.item()

    trainer = Engine(update_fn)
    alpha = 0.98

    metric_device = idist.device() if torch.device(device).type != "xla" else "cpu"
    avg_output = RunningAverage(output_transform=lambda x: x,
                                alpha=alpha,
                                epoch_bound=False,
                                device=metric_device)
    avg_output.attach(trainer, "running_avg_output")

    @trainer.on(Events.STARTED)
    def running_avg_output_init(engine):
        engine.state.running_avg_output = None

    @trainer.on(Events.ITERATION_COMPLETED)
    def running_avg_output_update(engine):
        i = engine.state.iteration - 1
        o = sum([
            all_loss_values[i + j * k] for j in range(idist.get_world_size())
        ]).item()
        o /= idist.get_world_size()
        if engine.state.running_avg_output is None:
            engine.state.running_avg_output = o
        else:
            engine.state.running_avg_output = engine.state.running_avg_output * alpha + (
                1.0 - alpha) * o

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_output_values(engine):
        it = engine.state.iteration
        assert engine.state.running_avg_output == pytest.approx(
            engine.state.metrics["running_avg_output"]
        ), f"{it}: {engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}"

    trainer.run(data, max_epochs=3)
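
The handlers above re-derive the exponential moving average that RunningAverage maintains, expected = alpha * expected + (1 - alpha) * x after the first value, and compare it against the attached metric. A single-process sketch of the same check, with made-up loss values and no distributed setup:

from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage

alpha = 0.98
losses = [0.5, 0.4, 0.35, 0.3]

engine = Engine(lambda e, batch: batch)
RunningAverage(output_transform=lambda x: x, alpha=alpha).attach(engine, "avg")

expected = None


@engine.on(Events.ITERATION_COMPLETED)
def check(e):
    global expected
    x = e.state.output
    # same update rule as RunningAverage: keep the first value, then decay
    expected = x if expected is None else expected * alpha + (1.0 - alpha) * x
    assert abs(expected - e.state.metrics["avg"]) < 1e-6


engine.run(losses, max_epochs=1)
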
Example #4
def setup_ignite(engine: Engine,
                 exp_source,
                 run_name: str,
                 extra_metrics: Iterable[str] = ()):
    # get rid of missing metrics warning
    warnings.simplefilter("ignore", category=UserWarning)

    handler = ptan_ignite.EndOfEpisodeHandler(exp_source)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        avg_steps = trainer.state.metrics.get("avg_steps", 50)
        avg_reward = trainer.state.metrics.get("avg_reward", 0.0)
        print("Episode %d: reward=%.0f (avg %.2f), "
              "steps=%s (avg %.2f), speed=%.1f f/s, "
              "elapsed=%s" % (
                  trainer.state.episode,
                  trainer.state.episode_reward,
                  avg_reward,
                  trainer.state.episode_steps,
                  avg_steps,
                  trainer.state.metrics.get("avg_fps", 0),
                  timedelta(seconds=int(passed)),
              ))

        if avg_steps < 15 and trainer.state.episode > 100:
            print("Average steps has fallen below 10, stop training")
            trainer.should_terminate = True

    now = datetime.now().isoformat(timespec="minutes")
    logdir = f"runs/{now}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    metrics = ["reward", "steps", "avg_reward", "avg_steps"]
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(tag="train",
                                      metric_names=metrics,
                                      output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
Example #5
def setup_ignite(engine: Engine,
                 exp_source,
                 run_name: str,
                 extra_metrics: Iterable[str] = ()):
    warnings.simplefilter('ignore', category=UserWarning)

    handler = ptan_ignite.EndOfEpisodeHandler(exp_source,
                                              subsample_end_of_episode=100)

    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get('time_passed', 0)
        print('Episode %d: reward=%.0f, steps=%s, speed=%.1f f/s, elapsed=%s' %
              (trainer.state.episode, trainer.state.episode_reward,
               trainer.state.episode_steps,
               trainer.state.metrics.get('avg_fps',
                                         0), timedelta(seconds=int(passed))))

    now = datetime.now().isoformat(timespec='minutes')
    logdir = f'runs-{now}-{run_name}'.replace(':', '')
    tb = tb_logger.TensorboardLogger(logdir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v['loss'])
    run_avg.attach(engine, 'avg_loss')

    metrics = ['reward', 'steps', 'avg_reward']
    handler = tb_logger.OutputHandler(tag='episodes', metric_names=metrics)

    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 1000 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ['avg_loss', 'avg_fps']
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(tag='train',
                                      metric_names=metrics,
                                      output_transform=lambda a: a)

    event = ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    return tb
Example #6
def setup_ignite(engine: Engine,
                 exp_source,
                 run_name: str,
                 extra_metrics: Iterable[str] = ()):
    # get rid of missing metrics warning
    warnings.simplefilter("ignore", category=UserWarning)

    handler = ptan_ignite.EndOfEpisodeHandler(exp_source,
                                              subsample_end_of_episode=100)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        print("Episode %d: reward=%.0f, steps=%s, "
              "speed=%.1f f/s, elapsed=%s" % (
                  trainer.state.episode,
                  trainer.state.episode_reward,
                  trainer.state.episode_steps,
                  trainer.state.metrics.get("avg_fps", 0),
                  timedelta(seconds=int(passed)),
              ))

    now = datetime.now().isoformat(timespec="minutes")
    logdir = f"runs/{now}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")

    metrics = ["reward", "steps", "avg_reward"]
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(tag="train",
                                      metric_names=metrics,
                                      output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
    return tb
Example #7
def setup_ignite(engine: Engine,
                 params: SimpleNamespace,
                 exp_source,
                 run_name: str,
                 net,
                 extra_metrics: Iterable[str] = ()):
    warnings.simplefilter("ignore", category=UserWarning)
    handler = ptan_ignite.EndOfEpisodeHandler(exp_source,
                                              subsample_end_of_episode=100)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get('time_passed', 0)
        print("Episode %d: reward=%.0f, steps=%s, "
              "elapsed=%s" %
              (trainer.state.episode, trainer.state.episode_reward,
               trainer.state.episode_steps, timedelta(seconds=int(passed))))
        path = './saves/episode-%.3f.data' % trainer.state.episode
        torch.save(net.state_dict(), path)

    now = datetime.now().isoformat(timespec='minutes').replace(':', '')
    logdir = f"runs2/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v['loss'])
    run_avg.attach(engine, 'avg_loss')

    metrics = ['reward', 'steps', 'avg_reward']
    handler = tb_logger.OutputHandler(tag='episodes', metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tb every 1000 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ['avg_loss', 'avg_fps']
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(tag="train",
                                      metric_names=metrics,
                                      output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_1000_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
    return tb
Example #8
def main_test_mnist():
    from torchvision.datasets import MNIST
    from torchvision.transforms import Compose, ToTensor, ToPILImage, Normalize
    transform = Compose([ToTensor()])
    train_dataset = MNIST(root="/tmp",
                          train=True,
                          download=True,
                          transform=transform)
    test_dataset = MNIST(root="/tmp",
                         train=False,
                         download=True,
                         transform=transform)
    vae = VAE(x_dim=784,
              z_dim=50,
              device='cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"\n{vae}")
    optimizer = ClippedAdam({"lr": 1e-3})
    svi = SVI(vae.model, vae.guide, optimizer, loss=Trace_ELBO())

    def _update(engine, batch):
        vae.train()
        x, y = batch
        loss = svi.step(x.view(-1, 784).to(vae.device, non_blocking=True))
        return loss / len(x), len(x)

    def _evaluate(engine, batch):
        vae.eval()
        x, y = batch
        elbo = svi.evaluate_loss(
            x.view(-1, 784).to(vae.device, non_blocking=True))
        return elbo / len(x), len(x)

    trainer = Engine(_update)
    evaluater = Engine(_evaluate)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=256,
                                  shuffle=True,
                                  pin_memory=True,
                                  drop_last=True,
                                  num_workers=8)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=256,
                                 shuffle=True,
                                 pin_memory=True,
                                 drop_last=True,
                                 num_workers=8)
    timer = Timer(average=True)
    timer.attach(engine=trainer,
                 start=Events.EPOCH_STARTED,
                 pause=Events.ITERATION_COMPLETED,
                 resume=Events.ITERATION_STARTED,
                 step=Events.ITERATION_COMPLETED)
    loss_metric = RunningAverage(output_transform=lambda outputs: -outputs[0],
                                 alpha=1)
    loss_metric.attach(engine=trainer, name="ELBO")
    loss_metric.attach(engine=evaluater, name="ELBO")
    vis = Visdom(server="gpu1.cluster.peidan.me",
                 port=10697,
                 env='Imp-pyro--vae-MNIST')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_train_loss(engine):
        elbo = engine.state.metrics['ELBO']
        logger.info(
            f"epoch:{engine.state.epoch}, ELBO: {elbo:.2f}, step time: {timer.value():.3f}s"
        )
        vis.line(Y=[elbo],
                 X=[engine.state.epoch],
                 win="Train-ELBO",
                 update='append',
                 opts={"title": "Train-ELBO"})

    def plot_vae_samples(title):
        x = torch.zeros([1, 784]).to(vae.device)
        for i in range(10):
            images = []
            for rr in range(100):
                # get loc from the model
                sample_loc_i = vae.model(x)
                img = sample_loc_i[0].view(1, 28, 28).cpu().data.numpy()
                images.append(img)
            vis.images(images, 10, 2, win=title, opts={'title': title})

    @trainer.on(Events.EPOCH_COMPLETED)
    def generate_samples(engine):
        epoch = engine.state.epoch
        if epoch % 10 == 0:
            logger.info(f"epoch: {epoch}, plot samples")
            plot_vae_samples(f"epoch-{epoch}")

    @trainer.on(Events.EPOCH_COMPLETED)
    def validation(engine):
        epoch = engine.state.epoch
        if epoch % 5 == 0:
            evaluater.run(test_dataloader)
            elbo = evaluater.state.metrics['ELBO']
            logger.info(f"epoch: {epoch}, validation ELBO: {elbo}")
            vis.line(Y=[elbo],
                     X=[engine.state.epoch],
                     win="Validation-ELBO",
                     update='append',
                     opts={'title': "Validation-ELBO"})

    trainer.run(train_dataloader, max_epochs=2500)
Example #9
class BasicTrainTask(BaseTask):

    name = "Train Task"

    def _validate(self, config):
        """
        Method to check if specific configuration is correct. Raises AssertError if is incorrect.
        """
        assert isinstance(config, BasicTrainConfig), \
            "Configuration should be instance of `BasicTrainConfig`, but given {}".format(type(config))

    def _start(self):
        """Method to run the task
        """
        if 'cuda' in self.device:
            self.model = self.model.to(self.device)

        mlflow.log_param("model", get_object_name(self.model))

        self.logger.debug("Setup criterion")
        if "cuda" in self.device:
            self.criterion = self.criterion.to(self.device)

        mlflow.log_param("criterion", get_object_name(self.criterion))
        mlflow.log_param("optimizer", get_object_name(self.optimizer))

        self.logger.debug("Setup ignite trainer")
        trainer = self._setup_trainer()
        self._setup_trainer_handlers(trainer)

        metrics = {'loss': Loss(self.criterion)}
        metrics.update(self.metrics)

        self.logger.debug("Input data info: ")
        msg = "- train data loader: {} number of batches".format(
            len(self.train_dataloader))
        if isinstance(self.train_dataloader, DataLoader):
            msg += " | {} number of samples".format(
                len(self.train_dataloader.sampler))
        self.logger.debug(msg)

        if isinstance(self.train_dataloader, DataLoader):
            write_model_graph(self.writer,
                              model=self.model,
                              data_loader=self.train_dataloader,
                              device=self.device)

        self.pbar_eval = None
        if self.train_eval_dataloader is not None:
            self.pbar_eval = ProgressBar()
            self._setup_offline_train_metrics_computation(trainer, metrics)

        if self.val_dataloader is not None:
            if self.val_metrics is None:
                self.val_metrics = metrics

            if self.pbar_eval is None:
                self.pbar_eval = ProgressBar()

            val_evaluator = self._setup_val_metrics_computation(trainer)

            if self.reduce_lr_on_plateau is not None:
                assert self.reduce_lr_on_plateau_var in self.val_metrics, \
                    "Monitor variable {} is not found in metrics {}" \
                    .format(self.reduce_lr_on_plateau_var, self.val_metrics)

                @val_evaluator.on(Events.COMPLETED)
                def update_reduce_on_plateau(engine):
                    val_var = engine.state.metrics[
                        self.reduce_lr_on_plateau_var]
                    self.reduce_lr_on_plateau.step(val_var)

            def default_score_function(engine):
                val_loss = engine.state.metrics['loss']
                # Objects with highest scores will be retained.
                return -val_loss

            # Setup early stopping:
            if self.early_stopping_kwargs is not None:
                if 'score_function' in self.early_stopping_kwargs:
                    es_score_function = self.early_stopping_kwargs[
                        'score_function']
                else:
                    es_score_function = default_score_function
                self._setup_early_stopping(trainer, val_evaluator,
                                           es_score_function)

            # Setup model checkpoint:
            if self.model_checkpoint_kwargs is None:
                self.model_checkpoint_kwargs = {
                    "filename_prefix": "model",
                    "score_name": "val_loss",
                    "score_function": default_score_function,
                    "n_saved": 3,
                    "atomic": True,
                    "create_dir": True,
                    "save_as_state_dict": True
                }
            self._setup_best_model_checkpointing(val_evaluator)

        self.logger.debug("Setup other handlers")

        if self.lr_scheduler is not None:

            @trainer.on(Events.ITERATION_STARTED)
            def update_lr_scheduler(engine):
                self.lr_scheduler.step()

        self._setup_log_learning_rate(trainer)

        self.logger.info("Start training: {} epochs".format(self.num_epochs))
        mlflow.log_param("num_epochs", self.num_epochs)
        trainer.run(self.train_dataloader, max_epochs=self.num_epochs)
        self.logger.debug("Training is ended")

    def _setup_trainer(self):
        trainer = create_supervised_trainer(self.model,
                                            self.optimizer,
                                            self.criterion,
                                            device=self.device,
                                            non_blocking='cuda' in self.device)
        return trainer

    def _setup_trainer_handlers(self, trainer):
        # Setup timer to measure training time
        timer = setup_timer(trainer)
        self._setup_log_training_loss(trainer)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_time(engine):
            self.logger.info("One epoch training time (seconds): {}".format(
                timer.value()))

        last_model_saver = ModelCheckpoint(
            self.log_dir.as_posix(),
            filename_prefix="checkpoint",
            save_interval=self.trainer_checkpoint_interval,
            n_saved=1,
            atomic=True,
            create_dir=True,
            save_as_state_dict=True)

        model_name = get_object_name(self.model)

        to_save = {
            model_name: self.model,
            "optimizer": self.optimizer,
        }

        if self.lr_scheduler is not None:
            to_save['lr_scheduler'] = self.lr_scheduler

        trainer.add_event_handler(Events.ITERATION_COMPLETED, last_model_saver,
                                  to_save)
        trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

    def _setup_log_training_loss(self, trainer):

        self.avg_output = RunningAverage(output_transform=lambda out: out)
        self.avg_output.attach(trainer, 'running_avg_loss')
        self.pbar.attach(trainer, ['running_avg_loss'])

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_training_loss(engine):
            iteration = (engine.state.iteration - 1) % len(
                self.train_dataloader) + 1
            if iteration % self.log_interval == 0:
                # self.logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(engine.state.epoch, iteration,
                #                                                                   len(self.train_dataloader),
                #                                                                   engine.state.output))
                self.writer.add_scalar("training/loss_vs_iterations",
                                       engine.state.output,
                                       engine.state.iteration)
                mlflow.log_metric("training_loss_vs_iterations",
                                  engine.state.output)

    def _setup_log_learning_rate(self, trainer):
        @trainer.on(Events.EPOCH_STARTED)
        def log_lrs(engine):
            if len(self.optimizer.param_groups) == 1:
                lr = float(self.optimizer.param_groups[0]['lr'])
                self.logger.debug("Learning rate: {}".format(lr))
                self.writer.add_scalar("learning_rate", lr, engine.state.epoch)
                mlflow.log_metric("learning_rate", lr)
            else:
                for i, param_group in enumerate(self.optimizer.param_groups):
                    lr = float(param_group['lr'])
                    self.logger.debug("Learning rate (group {}): {}".format(
                        i, lr))
                    self.writer.add_scalar("learning_rate_group_{}".format(i),
                                           lr, engine.state.epoch)
                    mlflow.log_metric("learning_rate_group_{}".format(i), lr)

    def _setup_offline_train_metrics_computation(self, trainer, metrics):

        train_eval_loader = self.train_eval_dataloader
        msg = "- train evaluation data loader: {} number of batches".format(
            len(train_eval_loader))
        if isinstance(train_eval_loader, DataLoader):
            msg += " | {} number of samples".format(
                len(train_eval_loader.sampler))
        self.logger.debug(msg)

        train_evaluator = create_supervised_evaluator(self.model,
                                                      metrics=metrics,
                                                      device=self.device,
                                                      non_blocking="cuda"
                                                      in self.device)

        self.pbar_eval.attach(train_evaluator)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_metrics(engine):
            epoch = engine.state.epoch
            if epoch % self.val_interval_epochs == 0:
                self.logger.debug("Compute training metrics")
                metrics_results = train_evaluator.run(
                    train_eval_loader).metrics
                self.logger.info("Training Results - Epoch: {}".format(epoch))
                for name in metrics_results:
                    self.logger.info("\tAverage {}: {:.5f}".format(
                        name, metrics_results[name]))
                    self.writer.add_scalar("training/avg_{}".format(name),
                                           metrics_results[name], epoch)
                    mlflow.log_metric("training_avg_{}".format(name),
                                      metrics_results[name])

        return train_evaluator

    def _setup_val_metrics_computation(self, trainer):
        val_evaluator = create_supervised_evaluator(self.model,
                                                    metrics=self.val_metrics,
                                                    device=self.device,
                                                    non_blocking="cuda"
                                                    in self.device)
        self.pbar_eval.attach(val_evaluator)

        msg = "- validation data loader: {} number of batches".format(
            len(self.val_dataloader))
        if isinstance(self.val_dataloader, DataLoader):
            msg += " | {} number of samples".format(
                len(self.val_dataloader.sampler))
        self.logger.debug(msg)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(engine):
            epoch = engine.state.epoch
            if epoch % self.val_interval_epochs == 0:
                self.logger.debug("Compute validation metrics")
                metrics_results = val_evaluator.run(
                    self.val_dataloader).metrics
                self.logger.info(
                    "Validation Results - Epoch: {}".format(epoch))
                for name in metrics_results:
                    self.logger.info("\tAverage {}: {:.5f}".format(
                        name, metrics_results[name]))
                    self.writer.add_scalar("validation/avg_{}".format(name),
                                           metrics_results[name], epoch)
                    mlflow.log_metric("validation_avg_{}".format(name),
                                      metrics_results[name])

        return val_evaluator

    def _setup_early_stopping(self, trainer, val_evaluator, score_function):
        kwargs = dict(self.early_stopping_kwargs)
        if 'score_function' not in kwargs:
            kwargs['score_function'] = score_function
        handler = EarlyStopping(trainer=trainer, **kwargs)
        setup_logger(handler._logger, self.log_filepath, self.log_level)
        val_evaluator.add_event_handler(Events.COMPLETED, handler)

    def _setup_best_model_checkpointing(self, val_evaluator):
        model_name = get_object_name(self.model)
        best_model_saver = ModelCheckpoint(self.log_dir.as_posix(),
                                           **self.model_checkpoint_kwargs)
        val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver,
                                        {model_name: self.model})
Example #10
def train(model,
          train_loader,
          eval_loaders,
          optimizer,
          loss_fn,
          n_it_max,
          patience,
          split_names,
          select_metric='Val accuracy_0',
          select_mode='max',
          viz=None,
          device='cpu',
          lr_scheduler=None,
          name=None,
          log_steps=None,
          log_epoch=False,
          _run=None,
          prepare_batch=_prepare_batch,
          single_pass=False,
          n_ep_max=None):

    # print(model)

    if not log_steps and not log_epoch:
        logger.warning('/!\\ No logging during training /!\\')

    if log_steps is None:
        log_steps = []

    epoch_steps = len(train_loader)
    if log_epoch:
        log_steps.append(epoch_steps)

    if single_pass:
        max_epoch = 1
    elif n_ep_max is None:
        assert n_it_max is not None
        max_epoch = int(n_it_max / epoch_steps) + 1
    else:
        assert n_it_max is None
        max_epoch = n_ep_max

    all_metrics = defaultdict(dict)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        loss_fn,
                                        device=device,
                                        prepare_batch=prepare_batch)

    if hasattr(model, 'new_epoch_hook'):
        trainer.add_event_handler(Events.EPOCH_STARTED, model.new_epoch_hook)
    if hasattr(model, 'new_iter_hook'):
        trainer.add_event_handler(Events.ITERATION_STARTED,
                                  model.new_iter_hook)

    trainer._logger.setLevel(logging.WARNING)

    # trainer output is in the format (x, y, y_pred, loss, optionals)
    train_loss = RunningAverage(output_transform=lambda out: out[3].item(),
                                epoch_bound=True)
    train_loss.attach(trainer, 'Trainer loss')
    if hasattr(model, 's'):
        met = Average(output_transform=lambda _: float('nan')
                      if model.s is None else model.s)
        met.attach(trainer, 'cur_s')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, met.completed,
                                  'cur_s')

    if hasattr(model, 'arch_sampler') and model.arch_sampler.distrib_dim > 0:
        met = Average(output_transform=lambda _: float('nan')
                      if model.cur_split is None else model.cur_split)
        met.attach(trainer, 'Trainer split')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, met.completed,
                                  'Trainer split')
        # trainer.add_event_handler(Events.EPOCH_STARTED, met.started)
        all_ent = Average(
            output_transform=lambda out: out[-1]['arch_entropy_avg'].item())
        all_ent.attach(trainer, 'Trainer all entropy')
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  all_ent.completed, 'Trainer all entropy')
        train_ent = Average(
            output_transform=lambda out: out[-1]['arch_entropy_sample'].item())
        train_ent.attach(trainer, 'Trainer sampling entropy')
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  train_ent.completed,
                                  'Trainer sampling entropy')
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, lambda engine: model.check_arch_freezing(
                ent=train_ent.compute(),
                epoch=engine.state.iteration / (epoch_steps * max_epoch)))

        def log_always(engine, name):
            val = engine.state.output[-1][name]
            all_metrics[name][engine.state.iteration /
                              epoch_steps] = val.mean().item()

        def log_always_dict(engine, name):
            for node, val in engine.state.output[-1][name].items():
                all_metrics['node {} {}'.format(
                    node, name)][engine.state.iteration /
                                 epoch_steps] = val.mean().item()

        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  log_always_dict,
                                  name='arch_grads')
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  log_always_dict,
                                  name='arch_probas')
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  log_always_dict,
                                  name='node_grads')
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  log_always,
                                  name='task all_loss')
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  log_always,
                                  name='arch all_loss')
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  log_always,
                                  name='entropy all_loss')

    if n_it_max is not None:
        StopAfterIterations([n_it_max]).attach(trainer)
    # epoch_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                          persist=True, disable=not (_run or viz))
    # epoch_pbar.attach(trainer, metric_names=['Train loss'])
    #
    # training_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                             persist=True, disable=not (_run or viz))
    # training_pbar.attach(trainer, event_name=Events.EPOCH_COMPLETED,
    #                      closing_event_name=Events.COMPLETED)
    total_time = Timer(average=False)
    eval_time = Timer(average=False)
    eval_time.pause()
    data_time = Timer(average=False)
    forward_time = Timer(average=False)
    forward_time.attach(trainer,
                        start=Events.EPOCH_STARTED,
                        pause=Events.ITERATION_COMPLETED,
                        resume=Events.ITERATION_STARTED,
                        step=Events.ITERATION_COMPLETED)
    epoch_time = Timer(average=False)
    epoch_time.attach(trainer,
                      start=Events.EPOCH_STARTED,
                      pause=Events.EPOCH_COMPLETED,
                      resume=Events.EPOCH_STARTED,
                      step=Events.EPOCH_COMPLETED)

    def get_loss(y_pred, y):
        l = loss_fn(y_pred, y)
        if not torch.is_tensor(l):
            l, *l_details = l
        return l.mean()

    def get_member(x, n=0):
        if isinstance(x, (list, tuple)):
            return x[n]
        return x

    eval_metrics = {'loss': Loss(get_loss)}

    for i in range(model.n_out):
        out_trans = get_attr_transform(i)

        def extract_ys(out, out_trans=out_trans):
            # bind the current head's transform now; a plain closure would only
            # see the last out_trans once the loop has finished
            x, y, y_pred, loss, _ = out
            return out_trans((y_pred, y))

        train_acc = Accuracy(extract_ys)
        train_acc.attach(trainer, 'Trainer accuracy_{}'.format(i))
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  train_acc.completed,
                                  'Trainer accuracy_{}'.format(i))
        eval_metrics['accuracy_{}'.format(i)] = \
            Accuracy(output_transform=out_trans)
        # if isinstance(model, SSNWrapper):
        #     model.arch_sampler.entropy().mean()

    evaluator = create_supervised_evaluator(model,
                                            metrics=eval_metrics,
                                            device=device,
                                            prepare_batch=prepare_batch)
    last_iteration = 0
    patience_counter = 0

    best = {
        'value': float('inf') if select_mode == 'min' else -float('inf'),
        'iter': -1,
        'state_dict': None
    }

    def is_better(new, old):
        if select_mode == 'min':
            return new < old
        else:
            return new > old

    def log_results(evaluator, data_loader, iteration, split_name):
        evaluator.run(data_loader)
        metrics = evaluator.state.metrics

        log_metrics = {}

        for metric_name, metric_val in metrics.items():
            log_name = '{} {}'.format(split_name, metric_name)
            if viz:
                first = iteration == 0 and split_name == split_names[0]
                viz.line(
                    [metric_val],
                    X=[iteration],
                    win=metric_name,
                    name=log_name,
                    update=None if first else 'append',
                    opts={
                        'title': metric_name,
                        'showlegend': True,
                        'width': 500,
                        'xlabel': 'iterations'
                    })
                viz.line(
                    [metric_val],
                    X=[iteration / epoch_steps],
                    win='{}epoch'.format(metric_name),
                    name=log_name,
                    update=None if first else 'append',
                    opts={
                        'title': metric_name,
                        'showlegend': True,
                        'width': 500,
                        'xlabel': 'epoch'
                    })
            if _run:
                _run.log_scalar(log_name, metric_val, iteration)
            log_metrics[log_name] = metric_val
            all_metrics[log_name][iteration] = metric_val

        return log_metrics

    if lr_scheduler is not None:

        @trainer.on(Events.EPOCH_COMPLETED)
        def step(_):
            lr_scheduler.step()
            # logger.warning('current lr {:.5e}'.format(
            #     optimizer.param_groups[0]['lr']))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_event(trainer):
        iteration = trainer.state.iteration if trainer.state else 0
        nonlocal last_iteration, patience_counter, best

        if not log_steps or not \
                (iteration in log_steps or iteration % log_steps[-1] == 0):
            return
        epoch_time.pause()
        eval_time.resume()
        all_metrics['training_epoch'][iteration] = iteration / epoch_steps
        all_metrics['training_iteration'][iteration] = iteration
        if hasattr(model, 'arch_sampler'):
            all_metrics['training_archs'][iteration] = \
                model.arch_sampler().squeeze().detach()
        # if hasattr(model, 'distrib_gen'):
        #     entropy = model.distrib_gen.entropy()
        #     all_metrics['entropy'][iteration] = entropy.mean().item()
        # if trainer.state and len(trainer.state.metrics) > 1:
        #     raise ValueError(trainer.state.metrics)
        all_metrics['data time'][iteration] = data_time.value()
        all_metrics['data time_ps'][iteration] = data_time.value() / max(
            data_time.step_count, 1.)
        all_metrics['forward time'][iteration] = forward_time.value()
        all_metrics['forward time_ps'][iteration] = forward_time.value() / max(
            forward_time.step_count, 1.)
        all_metrics['epoch time'][iteration] = epoch_time.value()
        all_metrics['epoch time_ps'][iteration] = epoch_time.value() / max(
            epoch_time.step_count, 1.)

        if trainer.state:
            # logger.warning(trainer.state.metrics)
            for metric, value in trainer.state.metrics.items():
                all_metrics[metric][iteration] = value
                if viz:
                    viz.line(
                        [value],
                        X=[iteration],
                        win=metric.split()[-1],
                        name=metric,
                        update=None if iteration == 0 else 'append',
                        opts={
                            'title': metric,
                            'showlegend': True,
                            'width': 500,
                            'xlabel': 'iterations'
                        })

        iter_this_step = iteration - last_iteration
        for d_loader, name in zip(eval_loaders, split_names):
            if name == 'Train':
                if iteration == 0:
                    all_metrics['Trainer loss'][iteration] = float('nan')
                    all_metrics['Trainer accuracy_0'][iteration] = float('nan')
                    if hasattr(model, 'arch_sampler'):
                        all_metrics['Trainer all entropy'][iteration] = float(
                            'nan')
                        all_metrics['Trainer sampling entropy'][
                            iteration] = float('nan')
                        # if hasattr(model, 'cur_split'):
                        all_metrics['Trainer split'][iteration] = float('nan')
                continue
            split_metrics = log_results(evaluator, d_loader, iteration, name)
            if select_metric not in split_metrics:
                continue
            if is_better(split_metrics[select_metric], best['value']):
                best['value'] = split_metrics[select_metric]
                best['iter'] = iteration
                best['state_dict'] = copy.deepcopy(model.state_dict())
                if patience > 0:
                    patience_counter = 0
            elif patience > 0:
                patience_counter += iter_this_step
                if patience_counter >= patience:
                    logger.info('#####')
                    logger.info('# Early stopping Run')
                    logger.info('#####')
                    trainer.terminate()
        last_iteration = iteration
        eval_time.pause()
        eval_time.step()
        all_metrics['eval time'][iteration] = eval_time.value()
        all_metrics['eval time_ps'][iteration] = (
            eval_time.value() / eval_time.step_count)
        all_metrics['total time'][iteration] = total_time.value()
        epoch_time.resume()

    log_event(trainer)

    #
    # @trainer.on(Events.EPOCH_COMPLETED)
    # def log_epoch(trainer):
    #     iteration = trainer.state.iteration if trainer.state else 0
    #     epoch = iteration/epoch_steps
    #     fw_t = forward_time.value()
    #     fw_t_ps = fw_t / forward_time.step_count
    #     d_t = data_time.value()
    #     d_t_ps = d_t / data_time.step_count
    #     e_t = epoch_time.value()
    #     e_t_ps = e_t / epoch_time.step_count
    #     ev_t = eval_time.value()
    #     ev_t_ps = ev_t / eval_time.step_count
    #     logger.warning('<{}> Epoch {}/{} finished (Forward: {:.3f}s({:.3f}), '
    #                    'data: {:.3f}s({:.3f}), epoch: {:.3f}s({:.3f}),'
    #                    ' Eval: {:.3f}s({:.3f}), Total: '
    #                    '{:.3f}s)'.format(type(model).__name__, epoch,
    #                                      max_epoch, fw_t, fw_t_ps, d_t, d_t_ps,
    #                                      e_t, e_t_ps, ev_t, ev_t_ps,
    #                                      total_time.value()))

    data_time.attach(trainer,
                     start=Events.STARTED,
                     pause=Events.ITERATION_STARTED,
                     resume=Events.ITERATION_COMPLETED,
                     step=Events.ITERATION_STARTED)

    if hasattr(model, 'iter_per_epoch'):
        model.iter_per_epoch = len(train_loader)
    trainer.run(train_loader, max_epochs=max_epoch)
    return trainer.state.iteration, all_metrics, best
Example #11
    def setup_meters(self):
        # meters
        avg_output = RunningAverage(output_transform=lambda x: x)
        avg_output.attach(self.engine, 'running_avg_loss')
Example #12
def setup_ignite(
        engine: Engine,
        params: SimpleNamespace,
        exp_source,
        run_name: str,
        model,
        optimizer,
        extra_metrics: Iterable[str] = (),
):
    warnings.simplefilter("ignore", category=UserWarning)
    handler = ptan_ignite.EndOfEpisodeHandler(
        exp_source, bound_avg_reward=params.stop_reward)
    handler.attach(engine)
    ptan_ignite.EpisodeFPSHandler().attach(engine)

    objects_to_checkpoint = {
        'model': model,
        'optimizer': optimizer,
        'trainer': engine
    }
    checkpoint_dir = Path("models")
    saver = DiskSaver(str(checkpoint_dir),
                      create_dir=True,
                      require_empty=False)
    handler = Checkpoint(objects_to_checkpoint, saver, n_saved=2)
    engine.add_event_handler(Events.ITERATION_COMPLETED(every=1000), handler)

    checkpoints_paths = list(checkpoint_dir.iterdir())
    if checkpoints_paths:
        checkpoint = torch.load(checkpoints_paths[-1])
        print(f"Loading checkpoint {checkpoints_paths[-1].name}")
        Checkpoint.load_objects(to_load=objects_to_checkpoint,
                                checkpoint=checkpoint)

    @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get('time_passed', 0)
        print("Episode %d: reward=%.2f, steps=%s, "
              "speed=%.1f f/s, elapsed=%s" %
              (trainer.state.episode, trainer.state.episode_reward,
               trainer.state.episode_steps,
               trainer.state.metrics.get('avg_fps',
                                         0), timedelta(seconds=int(passed))))

    @engine.on(ptan_ignite.EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine):
        passed = trainer.state.metrics['time_passed']
        print("Game solved in %s, after %d episodes "
              "and %d iterations!" %
              (timedelta(seconds=int(passed)), trainer.state.episode,
               trainer.state.iteration))
        trainer.should_terminate = True

    now = datetime.now().isoformat(timespec='minutes').replace(":", "-")
    logdir = f"runs/{now}-{params.run_name}-{run_name}"
    tb = tb_logger.TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v['loss'])
    run_avg.attach(engine, "avg_loss")

    metrics = ['reward', 'steps', 'avg_reward']
    handler = tb_logger.OutputHandler(tag="episodes", metric_names=metrics)
    event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    ptan_ignite.PeriodicEvents().attach(engine)
    metrics = ['avg_loss', 'avg_fps']
    metrics.extend(extra_metrics)
    handler = tb_logger.OutputHandler(tag="train",
                                      metric_names=metrics,
                                      output_transform=lambda a: a)
    event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
Example #13
def setup_ignite(
        engine: Engine,
        params: SimpleNamespace,
        exp_source,
        run_name: str,
        model,
        optimizer,
        buffer,
        target_net,
        extra_metrics: Iterable[str] = (),
):
    simplefilter("ignore", category=UserWarning)
    handler = EndOfEpisodeHandler(exp_source,
                                  bound_avg_reward=params.stop_reward)
    handler.attach(engine)
    EpisodeFPSHandler().attach(engine)

    objects_to_checkpoint = {
        'model': model,
        'optimizer': optimizer,
        'trainer': engine,
        "buffer": buffer,
        "target_net": target_net
    }
    checkpoint_dir = Path("models backup")
    saver = LightDiskSaver(str(checkpoint_dir),
                           create_dir=True,
                           require_empty=False)
    handler = Checkpoint(objects_to_checkpoint, saver, n_saved=1)
    engine.add_event_handler(Events.ITERATION_COMPLETED(every=30000), handler)

    checkpoints_paths = list(checkpoint_dir.iterdir())
    if checkpoints_paths:
        checkpoint = joblib.load(checkpoints_paths[-1])
        print(f"Loading checkpoint {checkpoints_paths[-1].name}")
        Checkpoint.load_objects(to_load=objects_to_checkpoint,
                                checkpoint=checkpoint)

    @engine.on(EpisodeEvents.EPISODE_COMPLETED)
    def episode_completed(trainer: Engine):
        passed = trainer.state.metrics.get("time_passed", 0)
        print(
            "Episode {}: reward={:.0f}, steps={}, speed={:.1f} f/s, elapsed={}"
            .format(trainer.state.episode, trainer.state.episode_reward,
                    trainer.state.episode_steps,
                    trainer.state.metrics.get("avg_fps", 0),
                    timedelta(seconds=int(passed))))

    @engine.on(EpisodeEvents.BOUND_REWARD_REACHED)
    def game_solved(trainer: Engine):
        passed = trainer.state.metrics["time_passed"]
        print(
            f"Game solved in {timedelta(seconds=int(passed))} after {trainer.state.episode}"
            f" episodes and {trainer.state.iteration} iterations!")
        trainer.should_terminate = True

    now = datetime.now().isoformat(timespec="minutes").replace(":", "-")
    logdir = f"runs/{now}-{params.run_name}-{run_name}"
    tb = TensorboardLogger(log_dir=logdir)
    run_avg = RunningAverage(output_transform=lambda v: v["loss"])
    run_avg.attach(engine, "avg_loss")
    metrics = ["reward", "steps", "avg_reward"]
    handler = OutputHandler(tag="episodes", metric_names=metrics)
    event = EpisodeEvents.EPISODE_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)

    # write to tensorboard every 100 iterations
    PeriodicEvents().attach(engine)
    metrics = ["avg_loss", "avg_fps"]
    metrics.extend(extra_metrics)
    handler = OutputHandler(tag="train",
                            metric_names=metrics,
                            output_transform=lambda a: a)
    event = PeriodEvents.ITERS_100_COMPLETED
    tb.attach(engine, log_handler=handler, event_name=event)
Example #14
def evaluate_model(run_name, model, optimizer, device, loss_name, loss_params, chosen_diseases, dataloader,
                   experiment_mode="debug", base_dir=utils.BASE_DIR):
    # Create tester engine
    tester = Engine(utilsT.get_step_fn(model, optimizer, device, loss_name, loss_params, training=False))

    loss_metric = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    loss_metric.attach(tester, loss_name)
    
    utilsT.attach_metrics(tester, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(tester, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(tester, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(tester, chosen_diseases, "roc_auc", utilsT.RocAucMetric, False)
    utilsT.attach_metrics(tester, chosen_diseases, "cm", ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm, metric_args=(2,))

    timer = Timer(average=True)
    timer.attach(tester, start=Events.EPOCH_STARTED, step=Events.EPOCH_COMPLETED)

    # Save metrics
    log_metrics = list(ALL_METRICS)

    # Run test
    print("Testing...")
    tester.run(dataloader, 1)
    

    # Capture time
    secs_per_epoch = timer.value()
    duration_per_epoch = utils.duration_to_str(int(secs_per_epoch))
    print("Time elapsed in epoch: ", duration_per_epoch)

    # Copy metrics dict
    metrics = dict()
    original_metrics = tester.state.metrics
    for metric_name in log_metrics:
        for disease_name in chosen_diseases:
            key = metric_name + "_" + disease_name
            if key not in original_metrics:
                print("Metric not found in tester, skipping: ", key)
                continue

            metrics[key] = original_metrics[key]
            
    # Copy CMs
    for disease_name in chosen_diseases:
        key = "cm_" + disease_name
        if key not in original_metrics:
            print("CM not found in tester, skipping: ", key)
            continue
        
        cm = original_metrics[key]
        metrics[key] = cm.numpy().tolist()
    
    # Save to file
    folder = os.path.join(base_dir, "results", experiment_mode)
    os.makedirs(folder, exist_ok=True)
    
    fname = os.path.join(folder, run_name + ".json")
    with open(fname, "w+") as f:
        json.dump(metrics, f)   
    print("Saved metrics to: ", fname)
    
    return metrics
Example #15
    def _add_metrics(self):
        train_loss = RunningAverage(Loss(self.get_loss))
        train_loss.attach(self.trainer, 'avg_train_loss')

        val_loss = Loss(self.get_loss)
        val_loss.attach(self.evaluator, 'val_loss')
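
Besides output_transform, RunningAverage can wrap another Metric, which is what the snippet above does with Loss. A small sketch of that form, using Accuracy as a stand-in metric and an engine whose output is already a (y_pred, y) pair (the toy batch is made up):

import torch
from ignite.engine import Engine
from ignite.metrics import Accuracy, RunningAverage


def infer(engine, batch):
    # stand-in for a real forward pass: the batch already is (y_pred, y)
    return batch


engine = Engine(infer)
RunningAverage(Accuracy()).attach(engine, "running_acc")

batches = [(torch.tensor([[0.9, 0.1], [0.2, 0.8]]), torch.tensor([0, 1]))]
engine.run(batches, max_epochs=1)
print(engine.state.metrics["running_acc"])
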
Example #16
    def _test(metric_device):
        data = list(range(n_iters))
        np.random.seed(12)
        all_y_true_batch_values = np.random.randint(
            0,
            n_classes,
            size=(idist.get_world_size(), n_epochs * n_iters, batch_size))
        all_y_pred_batch_values = np.random.rand(idist.get_world_size(),
                                                 n_epochs * n_iters,
                                                 batch_size, n_classes)

        y_true_batch_values = iter(all_y_true_batch_values[rank, ...])
        y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...])

        def update_fn(engine, batch):
            y_true_batch = next(y_true_batch_values)
            y_pred_batch = next(y_pred_batch_values)
            return torch.from_numpy(y_pred_batch), torch.from_numpy(
                y_true_batch)

        trainer = Engine(update_fn)
        alpha = 0.98

        acc_metric = RunningAverage(Accuracy(
            output_transform=lambda x: [x[0], x[1]], device=metric_device),
                                    alpha=alpha,
                                    epoch_bound=False)
        acc_metric.attach(trainer, "running_avg_accuracy")

        running_avg_acc = [
            None,
        ]
        true_acc_metric = Accuracy(device=metric_device)

        @trainer.on(Events.ITERATION_COMPLETED)
        def manual_running_avg_acc(engine):
            i = engine.state.iteration - 1

            true_acc_metric.reset()
            for j in range(idist.get_world_size()):
                output = (
                    torch.from_numpy(all_y_pred_batch_values[j, i, :, :]),
                    torch.from_numpy(all_y_true_batch_values[j, i, :]),
                )
                true_acc_metric.update(output)

            batch_acc = (true_acc_metric._num_correct.item() * 1.0 /
                         true_acc_metric._num_examples)

            if running_avg_acc[0] is None:
                running_avg_acc[0] = batch_acc
            else:
                running_avg_acc[0] = running_avg_acc[0] * alpha + (
                    1.0 - alpha) * batch_acc
            engine.state.running_avg_acc = running_avg_acc[0]

        @trainer.on(Events.ITERATION_COMPLETED)
        def assert_equal_running_avg_acc_values(engine):
            assert (
                engine.state.running_avg_acc ==
                engine.state.metrics["running_avg_accuracy"]
            ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}"

        trainer.run(data, max_epochs=3)
Example #17
def main(cfg):
    """
    Performs training, validation and testing.
    """
    assert isdir(cfg.data_dir), \
        '`data_dir` must be a valid path.'

    cfg.cuda = torch.cuda.is_available() \
        and not cfg.no_cuda

    cfg.model_dir = os.getcwd()

    # setting random seed for reproducibility
    if cfg.seed: set_random_seed(cfg)

    device = torch.device('cuda' if cfg.cuda else 'cpu')

    os.makedirs(cfg.model_dir, exist_ok=True)

    label2id = create_label2id(cfg)
    cfg.num_labels = len(label2id)

    xlmr = create_pretrained(cfg.model_type, cfg.force_download)

    # creating dataset split loaders
    datasets = create_dataset(cfg, xlmr, label2id)

    train_dataset, valid_dataset = datasets

    def compute_loss(batch):
        """
        Computes the forward pass and returns the
        cross entropy loss.
        """
        inputs, labels = [
            torch.from_numpy(tensor).to(device).long() for tensor in batch
        ]

        logits = model(inputs)

        logits = logits.view(-1, logits.size(-1))
        labels = labels.view(-1)

        loss = torch.nn.functional.cross_entropy(logits,
                                                 labels,
                                                 ignore_index=-1)

        return loss

    def train_step(engine, batch):
        """
        Propagates the inputs forward and updates
        the parameters.
        """
        step = engine.state.iteration

        model.train()

        loss = compute_loss(batch)

        backward(loss)

        if cfg.clip_grad_norm is not None:
            clip_grad_norm(cfg.clip_grad_norm)

        if step % cfg.grad_accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        # restoring the averaged loss across steps
        loss *= cfg.grad_accum_steps

        return loss.item()

    def eval_step(engine, batch):
        """
        Propagates the inputs forward without
        storing any gradients.
        """
        model.eval()

        with torch.no_grad():
            loss = compute_loss(batch)

        return loss.item()

    def backward(loss):
        """
        Backpropagates the loss in either mixed or
        normal precision mode.
        """
        if cfg.fp16:
            with amp.scale_loss(loss, optimizer) as sc:
                sc.backward()

        else:
            loss.backward()

    def clip_grad_norm(max_norm):
        """
        Applies gradient clipping.
        """
        if cfg.fp16:
            params = amp.master_params(optimizer)
        else:
            params = model.parameters()

        torch.nn.utils.clip_grad_norm_(params, max_norm)

    trainer = Engine(train_step)
    validator = Engine(eval_step)

    checkpoint = ModelCheckpoint(
        cfg.model_dir,
        cfg.model_type,
        n_saved=5,
        save_as_state_dict=True,
        score_function=lambda e: -e.state.metrics['loss'])

    last_ckpt_path = cfg.ckpt_path

    if last_ckpt_path is not None:
        msg = 'Loading state from {}'
        print(msg.format(basename(last_ckpt_path)))

        last_state = torch.load(last_ckpt_path, map_location=device)

    model = create_model(xlmr, len(label2id), cfg)
    model = model.to(device)

    del xlmr.model

    optimizer = create_optimizer(cfg, model)

    scheduler = create_scheduler(cfg, optimizer, len(train_dataset))

    # using apex if required and loading its state
    if cfg.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

        if last_ckpt_path is not None and \
                'amp' in last_state:
            amp.load_state_dict(last_state['amp'])

    if last_ckpt_path is not None:
        model.load_state_dict(last_state['model'])
        optimizer.load_state_dict(last_state['optimizer'])
        scheduler.load_state_dict(last_state['scheduler'])

    checkpoint_dict = {
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler
    }

    if cfg.fp16: checkpoint_dict['amp'] = amp

    validator.add_event_handler(Events.COMPLETED, checkpoint, checkpoint_dict)

    metric = RunningAverage(output_transform=lambda x: x)
    metric.attach(trainer, 'loss')
    metric.attach(validator, 'loss')

    pbar = ProgressBar()
    pbar.attach(trainer, metric_names=['loss'])

    history_path = join(cfg.model_dir, 'history.json')
    history = collections.defaultdict(list)
    headers = ['epoch', 'train_loss', 'valid_loss']

    if exists(history_path):
        with open(history_path, 'r') as fh:
            history = json.load(fh)

    def record_history(results):
        """
        Records the results to the history.
        """
        for header in headers:
            history[header].append(results[header])

        with open(history_path, 'w') as fh:
            json.dump(history, fh)

    @trainer.on(Events.EPOCH_COMPLETED)
    def print_results(engine):
        """
        Logs the training results.
        """
        validator.run(valid_dataset)

        record_history({
            'epoch': engine.state.epoch,
            'train_loss': engine.state.metrics['loss'],
            'valid_loss': validator.state.metrics['loss']
        })

        data = list(zip(*[history[h] for h in headers]))
        table = tabulate(data, headers, floatfmt='.3f')

        print(table.split('\n')[-1])

    data = list(zip(*[history[h] for h in headers]))

    print()
    print(cfg.pretty())

    print()
    print('***** Running training *****')

    print()
    print(tabulate(data, headers, floatfmt='.3f'))

    trainer.run(train_dataset, cfg.max_epochs)
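
The train_step above only calls optimizer.step() every cfg.grad_accum_steps iterations; a minimal sketch of that gradient-accumulation pattern in plain PyTorch (toy model and data, hypothetical grad_accum_steps value):

import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
grad_accum_steps = 4  # hypothetical value

for step, batch in enumerate(torch.randn(16, 8, 10), start=1):
    # scale the loss so the accumulated gradient approximates the average over the window
    loss = model(batch).pow(2).mean() / grad_accum_steps
    loss.backward()  # gradients accumulate in .grad across iterations
    if step % grad_accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()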
Example #18
0
def train_model(
    name="",
    resume="",
    base_dir=utils.BASE_DIR,
    model_name="v0",
    chosen_diseases=None,
    n_epochs=10,
    batch_size=4,
    oversample=False,
    max_os=None,
    shuffle=False,
    opt="sgd",
    opt_params={},
    loss_name="wbce",
    loss_params={},
    train_resnet=False,
    log_metrics=None,
    flush_secs=120,
    train_max_images=None,
    val_max_images=None,
    test_max_images=None,
    experiment_mode="debug",
    save=True,
    save_cms=True,  # Note that in this case, save_cms (to disk) includes write_cms (to TB)
    write_graph=False,
    write_emb=False,
    write_emb_img=False,
    write_img=False,
    image_format="RGB",
    multiple_gpu=False,
):

    # Choose GPU
    device = utilsT.get_torch_device()
    print("Using device: ", device)

    # Common folders
    dataset_dir = os.path.join(base_dir, "dataset")

    # Dataset handling
    print("Loading train dataset...")
    train_dataset, train_dataloader = utilsT.prepare_data(
        dataset_dir,
        "train",
        chosen_diseases,
        batch_size,
        oversample=oversample,
        max_os=max_os,
        shuffle=shuffle,
        max_images=train_max_images,
        image_format=image_format,
    )
    train_samples, _ = train_dataset.size()

    print("Loading val dataset...")
    val_dataset, val_dataloader = utilsT.prepare_data(
        dataset_dir,
        "val",
        chosen_diseases,
        batch_size,
        max_images=val_max_images,
        image_format=image_format,
    )
    val_samples, _ = val_dataset.size()

    # Should be the same as chosen_diseases
    chosen_diseases = list(train_dataset.classes)
    print("Chosen diseases: ", chosen_diseases)

    if resume:
        # Load model and optimizer
        model, model_name, optimizer, opt, loss_name, loss_params, chosen_diseases = models.load_model(
            base_dir, resume, experiment_mode="", device=device)
        model.train(True)
    else:
        # Create model
        model = models.init_empty_model(model_name,
                                        chosen_diseases,
                                        train_resnet=train_resnet).to(device)

        # Create optimizer
        OptClass = optimizers.get_optimizer_class(opt)
        optimizer = OptClass(model.parameters(), **opt_params)
        # print("OPT: ", opt_params)

    # Allow multiple GPUs
    if multiple_gpu:
        model = DataParallel(model)

    # Tensorboard log options
    run_name = utils.get_timestamp()
    if name:
        run_name += "_{}".format(name)

    if len(chosen_diseases) == 1:
        run_name += "_{}".format(chosen_diseases[0])
    elif len(chosen_diseases) == 14:
        run_name += "_all"

    log_dir = get_log_dir(base_dir, run_name, experiment_mode=experiment_mode)

    print("Run name: ", run_name)
    print("Saved TB in: ", log_dir)

    writer = SummaryWriter(log_dir=log_dir, flush_secs=flush_secs)

    # Create validator engine
    validator = Engine(
        utilsT.get_step_fn(model, optimizer, device, loss_name, loss_params,
                           False))

    val_loss = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    val_loss.attach(validator, loss_name)

    utilsT.attach_metrics(validator, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(validator, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(validator, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(validator, chosen_diseases, "roc_auc",
                          utilsT.RocAucMetric, False)
    utilsT.attach_metrics(validator,
                          chosen_diseases,
                          "cm",
                          ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm,
                          metric_args=(2, ))
    utilsT.attach_metrics(validator,
                          chosen_diseases,
                          "positives",
                          RunningAverage,
                          get_transform_fn=utilsT.get_count_positives)

    # Create trainer engine
    trainer = Engine(
        utilsT.get_step_fn(model, optimizer, device, loss_name, loss_params,
                           True))

    train_loss = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    train_loss.attach(trainer, loss_name)

    utilsT.attach_metrics(trainer, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "roc_auc",
                          utilsT.RocAucMetric, False)
    utilsT.attach_metrics(trainer,
                          chosen_diseases,
                          "cm",
                          ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm,
                          metric_args=(2, ))
    utilsT.attach_metrics(trainer,
                          chosen_diseases,
                          "positives",
                          RunningAverage,
                          get_transform_fn=utilsT.get_count_positives)

    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 step=Events.EPOCH_COMPLETED)

    # TODO: Early stopping
    #     def score_function(engine):
    #         val_loss = engine.state.metrics[loss_name]
    #         return -val_loss

    #     handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
    #     validator.add_event_handler(Events.COMPLETED, handler)

    # Metrics callbacks
    if log_metrics is None:
        log_metrics = list(ALL_METRICS)

    def _write_metrics(run_type, metrics, epoch, wall_time):
        loss = metrics.get(loss_name, 0)

        writer.add_scalar("Loss/" + run_type, loss, epoch, wall_time)

        for metric_base_name in log_metrics:
            for disease in chosen_diseases:
                metric_value = metrics.get(
                    "{}_{}".format(metric_base_name, disease), -1)
                writer.add_scalar(
                    "{}_{}/{}".format(metric_base_name, disease, run_type),
                    metric_value, epoch, wall_time)

    @trainer.on(Events.EPOCH_COMPLETED)
    def tb_write_metrics(trainer):
        epoch = trainer.state.epoch
        max_epochs = trainer.state.max_epochs

        # Run on evaluation
        validator.run(val_dataloader, 1)

        # Common time
        wall_time = time.time()

        # Log all metrics to TB
        _write_metrics("train", trainer.state.metrics, epoch, wall_time)
        _write_metrics("val", validator.state.metrics, epoch, wall_time)

        train_loss = trainer.state.metrics.get(loss_name, 0)
        val_loss = validator.state.metrics.get(loss_name, 0)

        tb_write_histogram(writer, model, epoch, wall_time)

        print("Finished epoch {}/{}, loss {:.3f}, val loss {:.3f} (took {})".
              format(epoch, max_epochs, train_loss, val_loss,
                     utils.duration_to_str(int(timer._elapsed()))))

    # Hparam dict
    hparam_dict = {
        "resume": resume,
        "n_diseases": len(chosen_diseases),
        "diseases": ",".join(chosen_diseases),
        "n_epochs": n_epochs,
        "batch_size": batch_size,
        "shuffle": shuffle,
        "model_name": model_name,
        "opt": opt,
        "loss": loss_name,
        "samples (train, val)": "{},{}".format(train_samples, val_samples),
        "train_resnet": train_resnet,
        "multiple_gpu": multiple_gpu,
    }

    def copy_params(params_dict, base_name):
        for name, value in params_dict.items():
            hparam_dict["{}_{}".format(base_name, name)] = value

    copy_params(loss_params, "loss")
    copy_params(opt_params, "opt")
    print("HPARAM: ", hparam_dict)

    # Train
    print("-" * 50)
    print("Training...")
    trainer.run(train_dataloader, n_epochs)

    # Capture time
    secs_per_epoch = timer.value()
    duration_per_epoch = utils.duration_to_str(int(secs_per_epoch))
    print("Average time per epoch: ", duration_per_epoch)
    print("-" * 50)

    ## Write all hparams
    hparam_dict["duration_per_epoch"] = duration_per_epoch

    # FIXME: this is commented to avoid having too many hparams in TB frontend
    # metrics
    #     def copy_metrics(engine, engine_name):
    #         for metric_name, metric_value in engine.state.metrics.items():
    #             hparam_dict["{}_{}".format(engine_name, metric_name)] = metric_value
    #     copy_metrics(trainer, "train")
    #     copy_metrics(validator, "val")

    print("Writing TB hparams")
    writer.add_hparams(hparam_dict, {})

    # Save model to disk
    if save:
        print("Saving model...")
        models.save_model(base_dir, run_name, model_name, experiment_mode,
                          hparam_dict, trainer, model, optimizer)

    # Write graph to TB
    if write_graph:
        print("Writing TB graph...")
        tb_write_graph(writer, model, train_dataloader, device)

    # Write embeddings to TB
    if write_emb:
        print("Writing TB embeddings...")
        image_size = 256 if write_emb_img else 0

        # FIXME: be able to select images (balanced, train vs val, etc)
        image_list = list(train_dataset.label_index["FileName"])[:1000]
        # disease = chosen_diseases[0]
        # positive = train_dataset.label_index[train_dataset.label_index[disease] == 1]
        # negative = train_dataset.label_index[train_dataset.label_index[disease] == 0]
        # positive_images = list(positive["FileName"])[:25]
        # negative_images = list(negative["FileName"])[:25]
        # image_list = positive_images + negative_images

        all_images, all_embeddings, all_predictions, all_ground_truths = gen_embeddings(
            model,
            train_dataset,
            device,
            image_list=image_list,
            image_size=image_size)
        tb_write_embeddings(
            writer,
            chosen_diseases,
            all_images,
            all_embeddings,
            all_predictions,
            all_ground_truths,
            global_step=n_epochs,
            use_images=write_emb_img,
            tag="1000_{}".format("img" if write_emb_img else "no_img"),
        )

    # Save confusion matrices (it is expensive to calculate them afterwards)
    if save_cms:
        print("Saving confusion matrices...")
        # Ensure the folder exists
        cms_dir = os.path.join(base_dir, "cms", experiment_mode)
        os.makedirs(cms_dir, exist_ok=True)
        base_fname = os.path.join(cms_dir, run_name)

        n_diseases = len(chosen_diseases)

        def extract_cms(metrics):
            """Extract confusion matrices from a metrics dict."""
            cms = []
            for disease in chosen_diseases:
                key = "cm_" + disease
                if key not in metrics:
                    cm = np.array([[-1, -1], [-1, -1]])
                else:
                    cm = metrics[key].numpy()

                cms.append(cm)
            return np.array(cms)

        # Train confusion matrix
        train_cms = extract_cms(trainer.state.metrics)
        np.save(base_fname + "_train", train_cms)
        tb_write_cms(writer, "train", chosen_diseases, train_cms)

        # Validation confusion matrix
        val_cms = extract_cms(validator.state.metrics)
        np.save(base_fname + "_val", val_cms)
        tb_write_cms(writer, "val", chosen_diseases, val_cms)

        # All confusion matrix (train + val)
        all_cms = train_cms + val_cms
        np.save(base_fname + "_all", all_cms)

        # Print to console
        if len(chosen_diseases) == 1:
            print("Train CM: ")
            print(train_cms[0])
            print("Val CM: ")
            print(val_cms[0])


#             print("Train CM 2: ")
#             print(trainer.state.metrics["cm_" + chosen_diseases[0]])
#             print("Val CM 2: ")
#             print(validator.state.metrics["cm_" + chosen_diseases[0]])

    if write_img:
        # NOTE: this option is not recommended, use Testing notebook to plot and analyze images

        print("Writing images to TB...")

        test_dataset, test_dataloader = utilsT.prepare_data(
            dataset_dir,
            "test",
            chosen_diseases,
            batch_size,
            max_images=test_max_images,
        )

        # TODO: add a way to select images?
        # image_list = list(test_dataset.label_index["FileName"])[:3]

        # Examples in test_dataset (with bboxes available):
        image_list = [
            # "00010277_000.png", # (Effusion, Infiltrate, Mass, Pneumonia)
            # "00018427_004.png", # (Atelectasis, Effusion, Mass)
            # "00021703_001.png", # (Atelectasis, Effusion, Infiltrate)
            # "00028640_008.png", # (Effusion, Infiltrate)
            # "00019124_104.png", # (Pneumothorax)
            # "00019124_090.png", # (Nodule)
            # "00020318_007.png", # (Pneumothorax)
            "00000003_000.png",  # (0)
            # "00000003_001.png", # (0)
            # "00000003_002.png", # (0)
            "00000732_005.png",  # (Cardiomegaly, Pneumothorax)
            # "00012261_001.png", # (Cardiomegaly, Pneumonia)
            # "00013249_033.png", # (Cardiomegaly, Pneumonia)
            # "00029808_003.png", # (Cardiomegaly, Pneumonia)
            # "00022215_012.png", # (Cardiomegaly, Pneumonia)
            # "00011402_007.png", # (Cardiomegaly, Pneumonia)
            # "00019018_007.png", # (Cardiomegaly, Infiltrate)
            # "00021009_001.png", # (Cardiomegaly, Infiltrate)
            # "00013670_151.png", # (Cardiomegaly, Infiltrate)
            # "00005066_030.png", # (Cardiomegaly, Infiltrate, Effusion)
            "00012288_000.png",  # (Cardiomegaly)
            "00008399_007.png",  # (Cardiomegaly)
            "00005532_000.png",  # (Cardiomegaly)
            "00005532_014.png",  # (Cardiomegaly)
            "00005532_016.png",  # (Cardiomegaly)
            "00005827_000.png",  # (Cardiomegaly)
            # "00006912_007.png", # (Cardiomegaly)
            # "00007037_000.png", # (Cardiomegaly)
            # "00007043_000.png", # (Cardiomegaly)
            # "00012741_004.png", # (Cardiomegaly)
            # "00007551_020.png", # (Cardiomegaly)
            # "00007735_040.png", # (Cardiomegaly)
            # "00008339_010.png", # (Cardiomegaly)
            # "00008365_000.png", # (Cardiomegaly)
            # "00012686_003.png", # (Cardiomegaly)
        ]

        tb_write_images(writer, model, test_dataset, chosen_diseases, n_epochs,
                        device, image_list)

    # Close TB writer
    if experiment_mode != "debug":
        writer.close()

    # Run post_train
    print("-" * 50)
    print("Running post_train...")

    print("Loading test dataset...")
    test_dataset, test_dataloader = utilsT.prepare_data(
        dataset_dir,
        "test",
        chosen_diseases,
        batch_size,
        max_images=test_max_images)

    save_cms_with_names(run_name, experiment_mode, model, test_dataset,
                        test_dataloader, chosen_diseases)

    evaluate_model(run_name,
                   model,
                   optimizer,
                   device,
                   loss_name,
                   loss_params,
                   chosen_diseases,
                   test_dataloader,
                   experiment_mode=experiment_mode,
                   base_dir=base_dir)

    # Return values for debugging
    model_run = ModelRun(model, run_name, model_name, chosen_diseases)
    if experiment_mode == "debug":
        model_run.save_debug_data(writer, trainer, validator, train_dataset,
                                  train_dataloader, val_dataset,
                                  val_dataloader)

    return model_run
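
The save_cms branch above relies on ignite's ConfusionMatrix producing a (num_classes x num_classes) tensor per disease; a minimal sketch with toy 2-class outputs and a hypothetical metric name:

import numpy as np
import torch
from ignite.engine import Engine
from ignite.metrics import ConfusionMatrix

def eval_step(engine, batch):
    # toy scores for a binary label; ConfusionMatrix expects (y_pred, y) with per-class scores
    y_pred = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.3, 0.7]])
    y = torch.tensor([0, 1, 0])
    return y_pred, y

evaluator = Engine(eval_step)
ConfusionMatrix(num_classes=2).attach(evaluator, "cm_SomeDisease")  # hypothetical key

evaluator.run(range(1), max_epochs=1)
cm = evaluator.state.metrics["cm_SomeDisease"]
np.save("cm_val_example", cm.numpy())  # same np.save pattern as above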
Example #19
0
                        num_workers=6)

model = BaselineModel(
    config=experiment_config.pop("model"),
    embeddings_result_file=data_config.get("embeddings_result_file"),
    vocab=vocab)

optimizer = Adam(model.parameters(), training_config.pop("lr"))
loss = CrossEntropyLoss()

trainer = create_supervised_trainer(model, optimizer, loss, device=device)
evaluator = create_supervised_evaluator(
    model, metrics={'accuracy': VisualQAAccuracy()}, device=device)

# create and add handlers
run_avg = RunningAverage(output_transform=lambda x: x)
run_avg.attach(trainer, 'loss')
pbar = ProgressBar(persist=False, bar_format=None)
pbar.attach(trainer, ['loss'])
pbar.attach(evaluator)
eval_handler = EvalHandler(evaluator=evaluator, data_loader=val_loader)
eval_handler.attach(trainer)
if not DEBUGGING_MODE:
    tb_handler = TensorboardHandler(evaluator=evaluator)
    tb_handler.attach(trainer)
    mlflow_handler = MlflowHandler(evaluator=evaluator)
    mlflow_handler.attach(trainer)

# finally run training process
trainer.run(train_loader, max_epochs=training_config.pop("n_epochs"))
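
The snippet above wires create_supervised_trainer to a RunningAverage-fed ProgressBar; a self-contained sketch of just that wiring with a toy model and dataset (all names here are placeholders):

import torch
from torch.utils.data import DataLoader, TensorDataset
from ignite.engine import create_supervised_trainer
from ignite.metrics import RunningAverage
from ignite.contrib.handlers import ProgressBar

model = torch.nn.Linear(10, 3)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss = torch.nn.CrossEntropyLoss()

train_loader = DataLoader(
    TensorDataset(torch.randn(64, 10), torch.randint(0, 3, (64,))), batch_size=8)

trainer = create_supervised_trainer(model, optimizer, loss)
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
ProgressBar(persist=False).attach(trainer, ["loss"])

trainer.run(train_loader, max_epochs=2)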
Example #20
0
def run(train_dir,
        val_dir=None,
        learning_rate=1e-4,
        num_workers=1,
        num_epochs=100,
        batch_size=16,
        shuffle=False,
        num_controls=2,
        num_intentions=4,
        hidden_dim=256,
        log_interval=10,
        log_dir='./logs',
        seed=2605,
        accumulation_steps=4,
        save_model='models',
        resume=None):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    cudnn.benchmark = True
    train_loader, val_loader = get_dataloader(train_dir,
                                              val_dir,
                                              num_workers=num_workers,
                                              batch_size=batch_size,
                                              shuffle=shuffle)
    if resume:
        model = torch.load(resume)
    else:
        model = DepthIntentionEncodeModel(num_controls=num_controls,
                                          num_intentions=num_intentions,
                                          hidden_dim=hidden_dim)
    model = model.to(device)
    writer = create_summary_writer(model, train_loader, log_dir)
    criterion = nn.MSELoss()
    check_manual_seed(seed)

    # optim = RAdam(model.parameters(),lr=learning_rate,betas=(0.9,0.999))
    optim = SGD(model.parameters(), lr=learning_rate)

    lr_scheduler = ExponentialLR(optim, gamma=0.95)
    checkpoints = ModelCheckpoint(save_model,
                                  'Model',
                                  save_interval=1,
                                  n_saved=3,
                                  create_dir=True,
                                  require_empty=False,
                                  save_as_state_dict=False)

    def update_fn(engine, batch):
        model.train()
        optim.zero_grad()

        x, y = batch
        x = list(map(lambda x: x.to(device), x))
        y = y.to(device)
        y_pred = model(*x)

        loss = criterion(y_pred, y)
        loss.backward()
        optim.step()

        return loss.item()

    def evaluate_fn(engine, batch):
        engine.state.metrics = dict()
        model.eval()

        x, y = batch

        x = list(map(lambda x: x.to(device), x))
        y = y.to(device)

        y_pred = model(*x)
        mse_loss = F.mse_loss(y_pred, y)
        mae_loss = F.l1_loss(y_pred, y)

        engine.state.metrics['mse'] = mse_loss
        engine.state.metrics['mae'] = mae_loss

    trainer = Engine(update_fn)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoints,
                              {'model': model})
    avg_loss = RunningAverage(output_transform=lambda x: x, alpha=0.1)
    avg_loss.attach(trainer, 'running_avg_loss')
    pbar = ProgressBar()
    pbar.attach(trainer, ['running_avg_loss'])

    evaluator = Engine(evaluate_fn)
    pbar.attach(evaluator)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iteration = (engine.state.iteration - 1) % len(train_loader) + 1
        if iteration % log_interval == 0:
            print("[Epoch: {}][Iteration: {}/{}] loss: {:.4f}".format(
                engine.state.epoch, iteration, len(train_loader),
                engine.state.output))
            writer.add_scalar("training/loss", engine.state.output,
                              engine.state.iteration)
            for name, param in model.named_parameters():
                writer.add_histogram(name,
                                     param.clone().cpu().data.numpy(),
                                     iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        mse = metrics['mse']
        mae = metrics['mae']
        print("Training Results - Epoch: {}  mae: {:.5f} mse: {:.5f}".format(
            engine.state.epoch, mse, mae))
        writer.add_scalar("training/mse", mse, engine.state.epoch)
        writer.add_scalar("training/mae", mae, engine.state.epoch)

    # @trainer.on(Events.EPOCH_COMPLETED)
    # def log_validation_results(engine):
    #     evaluator.run(val_loader)
    #     metrics = evaluator.state.metrics
    #     mse = metrics['mse']
    #     mae = metrics['mae']
    #     print("Validation Results - Epoch: {}  mae: {:.2f} mse: {:.2f}".format(engine.state.epoch, mse, mae))
    #     writer.add_scalar("valid/mse", mse, engine.state.epoch)
    #     writer.add_scalar("valid/mae", mae, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def update_lr_scheduler(engine):
        lr_scheduler.step()
        print('learning rate is: {:.6f}'.format(lr_scheduler.get_lr()[0]))

    trainer.run(train_loader, max_epochs=num_epochs)
    writer.close()
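
A compact sketch of the per-epoch ExponentialLR stepping used above, with a dummy update function standing in for the real model (toy names throughout):

import torch
from ignite.engine import Engine, Events
from torch.optim.lr_scheduler import ExponentialLR

model = torch.nn.Linear(4, 1)
optim = torch.optim.SGD(model.parameters(), lr=1e-2)
lr_scheduler = ExponentialLR(optim, gamma=0.95)

trainer = Engine(lambda engine, batch: 0.0)  # dummy step returning a fake loss

@trainer.on(Events.EPOCH_COMPLETED)
def update_lr_scheduler(engine):
    lr_scheduler.step()
    print("lr:", optim.param_groups[0]["lr"])  # current learning rate after decay

trainer.run(range(5), max_epochs=3)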
Example #21
0
def test_integration():

    n_iters = 100
    batch_size = 10
    n_classes = 10
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    loss_values = iter(range(n_iters))

    def update_fn(engine, batch):
        loss_value = next(loss_values)
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy(
            y_true_batch)

    trainer = Engine(update_fn)
    alpha = 0.98

    acc_metric = RunningAverage(
        Accuracy(output_transform=lambda x: [x[1], x[2]]), alpha=alpha)
    acc_metric.attach(trainer, "running_avg_accuracy")

    avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha)
    avg_output.attach(trainer, "running_avg_output")

    running_avg_acc = [
        None,
    ]

    @trainer.on(Events.ITERATION_COMPLETED)
    def manual_running_avg_acc(engine):
        _, y_pred, y = engine.state.output
        indices = torch.max(y_pred, 1)[1]
        correct = torch.eq(indices, y).view(-1)
        num_correct = torch.sum(correct).item()
        num_examples = correct.shape[0]
        batch_acc = num_correct * 1.0 / num_examples
        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (
                1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    @trainer.on(Events.EPOCH_STARTED)
    def running_avg_output_init(engine):
        engine.state.running_avg_output = None

    @trainer.on(Events.ITERATION_COMPLETED)
    def running_avg_output_update(engine):
        if engine.state.running_avg_output is None:
            engine.state.running_avg_output = engine.state.output[0]
        else:
            engine.state.running_avg_output = (
                engine.state.running_avg_output * alpha +
                (1.0 - alpha) * engine.state.output[0])

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_acc_values(engine):
        assert (
            engine.state.running_avg_acc ==
            engine.state.metrics["running_avg_accuracy"]
        ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}"

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_output_values(engine):
        assert (
            engine.state.running_avg_output ==
            engine.state.metrics["running_avg_output"]
        ), f"{engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}"

    np.random.seed(10)
    running_avg_acc = [
        None,
    ]
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)

    running_avg_acc = [
        None,
    ]
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)


def train(model, train_loader, eval_loaders, optimizer, loss_fn,
          n_it_max, patience, split_names, viz=None, device='cpu', name=None,
          log_steps=None, log_epoch=False, _run=None):
    """

    :param model:
    :param datasets: list containing the datasets corresponding to the different
            datasplits (train, val[, test])
    :param task_id:
    :param batch_sizes:
    :param optimizer:
    :param max_epoch:
    :param patience:
    :param log_interval:
    :param viz:
    :param device:
    :param name:
    :param log_steps:
    :param log_epoch:
    :param _run:
    :return:
    """
    if not log_steps and not log_epoch:
        logger.warning('/!\\ No logging during training /!\\')

    if log_steps is None:
        log_steps = []
    if log_epoch:
        log_steps.append(len(train_loader))

    trainer = create_supervised_trainer(model, optimizer, loss_fn,
                                        device=device)
    trainer._logger.setLevel(logging.WARNING)

    train_loss = RunningAverage(output_transform=lambda loss: loss,
                                epoch_bound=False)
    train_loss.attach(trainer, 'train_loss')

    StopAfterIterations([n_it_max]).attach(trainer)
    # epoch_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                          persist=True, disable=not (_run or viz))
    # epoch_pbar.attach(trainer, metric_names=['train_loss'])

    # training_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                             persist=True, disable=not (_run or viz))
    # training_pbar.attach(trainer, event_name=Events.EPOCH_COMPLETED,
    #                      closing_event_name=Events.COMPLETED)

    eval_metrics = {'nll': Loss(lambda y_pred, y: loss_fn(y_pred, y).mean())}
    for i in range(model.n_out):
        eval_metrics['accuracy_{}'.format(i)] = \
            Accuracy(output_transform=get_attr_transform(i))

    evaluator = create_supervised_evaluator(model, metrics=eval_metrics,
                                            device=device)
    all_metrics = defaultdict(dict)
    last_iteration = 0
    patience_counter = 0
    best_loss = float('inf')
    best_state_dict = None
    best_iter = -1

    def log_results(evaluator, data_loader, iteration, split_name):
        evaluator.run(data_loader)
        metrics = evaluator.state.metrics

        log_metrics = {}

        for metric_name, metric_val in metrics.items():
            log_name = '{} {}'.format(split_name, metric_name)
            if viz:
                viz.line([metric_val], X=[iteration], win=metric_name,
                         name=log_name,
                         update='append', opts={'title': metric_name,
                                                'showlegend': True,
                                                'width': 500})
            if _run:
                _run.log_scalar(log_name, metric_val, iteration)
            log_metrics[log_name] = metric_val
            all_metrics[log_name][iteration] = metric_val

        return log_metrics

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_event(trainer):
        iteration = trainer.state.iteration if trainer.state else 0
        nonlocal last_iteration, patience_counter, \
            best_state_dict, best_loss, best_iter

        if not log_steps or not \
                (iteration in log_steps or iteration % log_steps[-1] == 0):
            return
        all_metrics['training_epoch'][iteration] = iteration / len(train_loader)
        all_metrics['training_iterations'][iteration] = iteration
        if trainer.state and 'train_loss' in trainer.state.metrics:
            all_metrics['train_loss'][iteration] = trainer.state.metrics['train_loss']
        else:
            all_metrics['train_loss'][iteration] = float('nan')
        iter_this_step = iteration - last_iteration
        for d_loader, name in zip(eval_loaders, split_names):
            if name == 'Train':
                continue
            split_metrics = log_results(evaluator, d_loader, iteration, name)
            if name == 'Val' and patience > 0:
                if split_metrics['Val nll'] < best_loss:
                    best_loss = split_metrics['Val nll']
                    best_iter = iteration
                    patience_counter = 0
                    best_state_dict = copy.deepcopy(model.state_dict())
                else:
                    patience_counter += iter_this_step
                    if patience_counter >= patience:
                        logger.info('#####')
                        logger.info('# Early stopping Run')
                        logger.info('#####')
                        trainer.terminate()
        last_iteration = iteration

    log_event(trainer)
    max_epoch = int(n_it_max / len(train_loader)) + 1
    trainer.run(train_loader, max_epochs=max_epoch)

    # all_metrics['mean_loss'] = all_metrics['Val nll']
    all_metrics['mean_loss'] = best_loss
    all_metrics['training_iteration'] = best_iter
    return trainer.state.iteration, all_metrics, best_state_dict
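
The patience logic above is hand-rolled; ignite also ships an EarlyStopping handler that implements the same idea, sketched here with toy engines and an assumed 'nll' metric name:

from ignite.engine import Engine, Events
from ignite.handlers import EarlyStopping

trainer = Engine(lambda engine, batch: 0.0)
evaluator = Engine(lambda engine, batch: None)

def score_function(engine):
    # EarlyStopping treats higher scores as better, so negate a loss-like metric
    return -engine.state.metrics["nll"]

handler = EarlyStopping(patience=5, score_function=score_function, trainer=trainer)
evaluator.add_event_handler(Events.COMPLETED, handler)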