Example #1
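This snippet shows a `fit` method for a Trainer-style class built on PyTorch Ignite. It creates an engine, registers epoch-level event handlers for logging, validation, checkpointing, and user callbacks, runs training, and returns the metric history for the run. It assumes `Events` and `ModelCheckpoint` from ignite and `keyfilter` from toolz are imported at module level.
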
    def fit(self,
            train_loader,
            epochs,
            val_loader=None,
            send_weixin=False,
            save_per_epochs=None,
            callbacks=None):
        # Avoid the shared-mutable-default pitfall of `callbacks=[]`
        callbacks = callbacks or []
        validate = val_loader is not None
        # Optionally push training progress notifications via Weixin (WeChat)
        if send_weixin:
            self._enable_send_weixin()

        # Create engine
        engine = self._create_engine()

        # Register events
        engine.add_event_handler(Events.EPOCH_STARTED, self._log_epochs,
                                 epochs)

        if validate:
            engine.add_event_handler(Events.EPOCH_COMPLETED, self._evaluate,
                                     val_loader)
        engine.add_event_handler(Events.EPOCH_COMPLETED, self._log_results,
                                 validate)

        # Set up checkpointing (legacy ignite ModelCheckpoint API, which
        # still accepted save_interval and save_as_state_dict)
        if save_per_epochs:
            checkpoint_handler = ModelCheckpoint(self.save_path,
                                                 self.name,
                                                 save_per_epochs,
                                                 save_as_state_dict=True,
                                                 require_empty=False)
            # Continue checkpoint numbering from the epochs already trained
            # (relies on a private attribute of ModelCheckpoint)
            checkpoint_handler._iteration = self.epochs()
            engine.add_event_handler(Events.EPOCH_COMPLETED,
                                     checkpoint_handler, {"trainer": self})

        for callback in callbacks:
            engine.add_event_handler(Events.EPOCH_COMPLETED,
                                     _callback_wrapper(callback), self)

        # Run
        engine.run(train_loader, epochs)

        # Tear down: stop Weixin notifications (if they were enabled)
        self._disable_send_weixin()

        # Return only this run's slice of the history
        # (keyfilter comes from toolz)
        hist = {
            metric: hist[-epochs:]
            for metric, hist in self.metric_history.items()
        }
        if not validate:
            # Drop stale validation metrics when no val_loader was given
            hist = keyfilter(lambda k: not k.startswith("val_"), hist)
        return hist
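
For orientation, a minimal sketch of how this method might be called. The `Trainer` construction below is an assumption (the class definition is not part of the snippet); only the `fit` keyword arguments are taken from the code above, and each callback appears to receive the trainer after every epoch via `_callback_wrapper`.

# Hypothetical usage: the Trainer constructor arguments are assumed
trainer = Trainer(model, criterion, optimizer,
                  save_path="./checkpoints", name="resnet")
hist = trainer.fit(train_loader,
                   epochs=10,
                   val_loader=val_loader,
                   save_per_epochs=5,
                   callbacks=[lambda t: print(t.metric_history["loss"][-1])])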
Example #2
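This `do_train` function drives training for a ray-based renderer ("RFRender"): it builds a project-specific supervised trainer, attaches checkpointing, an iteration timer, and a running-average loss metric, then registers decorator-based handlers for resuming, periodic logging, validation, and epoch timing.
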
import logging

from ignite.engine import Events
from ignite.handlers import ModelCheckpoint, Timer
from ignite.metrics import RunningAverage


# `create_supervised_trainer` and `evaluator` are project-local helpers
# (note the non-standard coarse_stage/swriter arguments), not the stock
# ignite versions.
def do_train(cfg,
             model,
             train_loader,
             val_loader,
             optimizer,
             scheduler,
             loss_fn,
             swriter,
             resume_iter=0):
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    output_dir = cfg.OUTPUT_DIR
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("RFRender.%s.train" %
                               cfg.OUTPUT_DIR.split('/')[-1])
    logger.info("Start training")
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        loss_fn,
                                        coarse_stage=cfg.SOLVER.COARSE_STAGE,
                                        swriter=swriter)

    checkpointer = ModelCheckpoint(output_dir,
                                   'rfnr',
                                   n_saved=10,
                                   require_empty=False)
    # Resume checkpoint numbering (relies on a private ignite attribute)
    checkpointer._iteration = resume_iter

    timer = Timer(average=True)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler
    })
    # Time each training iteration, averaged over the epoch
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED,
                 step=Events.ITERATION_COMPLETED)

    # Expose the raw per-iteration output as a running-average 'avg_loss' metric
    RunningAverage(output_transform=lambda x: x).attach(trainer, 'avg_loss')

    def val_vis(engine):
        avg_loss = evaluator(val_loader, model, loss_fn, swriter,
                             engine.state.iteration)
        logger.info("Validation Results - Epoch: {} Avg Loss: {:.3f}".format(
            engine.state.epoch, avg_loss))
        swriter.add_scalar('Loss/val_loss', avg_loss, engine.state.epoch)

        # Optional density-visualization debugging, left disabled:
        #xyz, density = vis_density(model)
        #res = torch.cat([xyz[0],density[0]],dim=1).detach().cpu().numpy()
        #np.savetxt(os.path.join(output_dir,'voxels_%d.txt' % engine.state.epoch),res)

    @trainer.on(Events.STARTED)
    def resume_training(engine):
        # Restore the engine's counters when resuming from a checkpoint
        if resume_iter > 0:
            engine.state.iteration = resume_iter
            engine.state.epoch = resume_iter // len(train_loader)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # 1-based iteration index within the current epoch
        # (named `step` to avoid shadowing the built-in `iter`)
        step = (engine.state.iteration - 1) % len(train_loader) + 1

        if step % log_period == 0:
            # All parameter groups follow one schedule; report the last lr
            lr = optimizer.param_groups[-1]['lr']
            logger.info(
                "Epoch[{}] Iteration[{}/{}] Loss: {:.3e} Lr: {:.2e} Speed: {:.1f}[rays/s]"
                .format(engine.state.epoch, step, len(train_loader),
                        engine.state.metrics['avg_loss'], lr,
                        float(cfg.SOLVER.BUNCH) / timer.value()))
        if step % 1000 == 1:
            val_vis(engine)

        # The scheduler steps once per iteration, not once per epoch
        scheduler.step()

    # Disabled: the scheduler now steps every iteration (see log_training_loss)
    #@trainer.on(Events.EPOCH_COMPLETED)
    #def adjust_learning_rate(engine):
    #    scheduler.step()

    # Add handlers using the `trainer.on` decorator API
    @trainer.on(Events.EPOCH_COMPLETED)
    def print_times(engine):
        # timer.value() is the average per-iteration time, so the product
        # with step_count is the total time spent in the epoch
        logger.info(
            'Epoch {} done. Epoch time: {:.3f}[s] Speed: {:.1f}[rays/s]'.format(
                engine.state.epoch,
                timer.value() * timer.step_count,
                float(cfg.SOLVER.BUNCH) / timer.value()))
        timer.reset()

    if val_loader is not None:

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(engine):
            val_vis(engine)

    trainer.run(train_loader, max_epochs=epochs)
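
The `@trainer.on(...)` registration used throughout is standard ignite API and works on any engine; here is a self-contained toy sketch (the names are illustrative, not from the project):

from ignite.engine import Engine, Events

def process(engine, batch):
    # The process function's return value becomes engine.state.output
    return float(batch)

toy_engine = Engine(process)

@toy_engine.on(Events.EPOCH_COMPLETED)
def on_epoch_done(engine):
    print("epoch", engine.state.epoch, "last output:", engine.state.output)

toy_engine.run([0.5, 0.25], max_epochs=2)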