Example #1
    def warm_up(self, scaler, model, dataloader, cfg, prefix='train'):
        # Warm-up phase: a temporary optimizer is built only for these iterations
        # and its learning rate is set manually at every step.
        optimizer = build_optimizer(cfg, model)
        model.train()

        cur_iter = 0
        while cur_iter < cfg.WARMUP.ITERS:
            for i, sample in enumerate(dataloader):
                cur_iter += 1
                if cur_iter >= cfg.WARMUP.ITERS:
                    break
                # Override the learning rate of every parameter group with the
                # warm-up schedule before running the step.
                lr = get_warmup_lr(cur_iter, cfg)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                # No loss/performance loggers are passed during warm-up.
                losses = self.run_step(scaler, model, sample, optimizer, None, None, prefix)

                if self.cfg.local_rank == 0:
                    template = "[iter {}/{}, lr {}] Total train loss: {:.4f} \n" "{}"
                    logger.info(
                        template.format(
                            cur_iter, cfg.WARMUP.ITERS, round(get_current_lr(optimizer), 6),
                            losses["loss"].item(),
                            "\n".join(
                                ["{}: {:.4f}".format(n, l.item()) for n, l in losses.items() if n != "loss"]),
                        )
                    )
        # Discard the temporary warm-up optimizer.
        del optimizer
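The helper get_warmup_lr is not shown on this page. A minimal sketch of what a linear warm-up schedule could look like, assuming hypothetical config fields cfg.OPTIMIZER.LR (target learning rate after warm-up) and cfg.WARMUP.FACTOR (starting fraction):

def get_warmup_lr(cur_iter, cfg):
    # Sketch only: linearly ramp the learning rate from base_lr * warmup_factor
    # up to base_lr over cfg.WARMUP.ITERS iterations.
    base_lr = cfg.OPTIMIZER.LR           # assumed config field
    warmup_factor = cfg.WARMUP.FACTOR    # assumed config field, e.g. 0.001
    alpha = cur_iter / float(cfg.WARMUP.ITERS)
    return base_lr * (warmup_factor * (1.0 - alpha) + alpha)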
Example #2
    def train_epoch(self, scaler, epoch, model, dataset, dataloader, optimizer, prefix="train"):
        model.train()

        # Per-step timer, running loss meters and a dataset-specific evaluator.
        _timer = Timer()
        lossLogger = LossLogger()
        performanceLogger = build_evaluator(self.cfg, dataset)

        num_iters = len(dataloader)
        for i, sample in enumerate(dataloader):
            self.n_iters_elapsed += 1
            _timer.tic()
            self.run_step(scaler, model, sample, optimizer, lossLogger, performanceLogger, prefix)
            # Synchronize so the timer measures the full GPU step, not just the kernel launch.
            torch.cuda.synchronize()
            _timer.toc()

            if (i + 1) % self.cfg.N_ITERS_TO_DISPLAY_STATUS == 0:
                # Only the main process prints the periodic status line.
                if self.cfg.local_rank == 0:
                    template = "[epoch {}/{}, iter {}/{}, lr {}] Total train loss: {:.4f} " "(ips = {:.2f})\n" "{}"
                    logger.info(
                        template.format(
                            epoch, self.cfg.N_MAX_EPOCHS - 1, i, num_iters - 1,
                            round(get_current_lr(optimizer), 6),
                            lossLogger.meters["loss"].value,
                            self.batch_size * self.cfg.N_ITERS_TO_DISPLAY_STATUS / _timer.diff,
                            "\n".join(
                                ["{}: {:.4f}".format(n, l.value) for n, l in lossLogger.meters.items() if n != "loss"]),
                        )
                    )

        if self.cfg.TENSORBOARD and self.cfg.local_rank == 0:
            # Log the epoch-averaged train losses and evaluation metrics.
            for n, l in lossLogger.meters.items():
                self.tb_writer.add_scalar(f"loss/{prefix}_{n}", l.global_avg, epoch)
            performances = performanceLogger.evaluate()
            if performances is not None and len(performances):
                for k, v in performances.items():
                    self.tb_writer.add_scalar(f"performance/{prefix}_{k}", v, epoch)

        # Weight histograms are currently disabled by the hard-coded `and False`.
        if self.cfg.TENSORBOARD_WEIGHT and False:
            for name, param in model.named_parameters():
                # e.g. "backbone.conv1.weight" -> layer "backbone.conv1", attr "weight"
                layer, attr = os.path.splitext(name)
                attr = attr[1:]
                self.tb_writer.add_histogram("{}/{}".format(layer, attr), param, epoch)
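Both examples delegate the actual optimization to self.run_step, which is not shown here. A minimal sketch of what such a step could look like with torch.cuda.amp, assuming the sample unpacks into inputs and targets and the model returns a dict of named loss tensors (both assumptions for illustration):

import torch

def run_step_sketch(scaler, model, sample, optimizer, loss_logger, performance_logger, prefix):
    # Sketch only; not the implementation used above. Metric updates via
    # performance_logger are omitted for brevity.
    inputs, targets = sample                         # assumed sample layout
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=scaler is not None):
        losses = model(inputs, targets)              # assumed: dict of named loss tensors
        total_loss = sum(losses.values())
    if scaler is not None:
        # Mixed-precision path: scale the loss to avoid underflow in fp16 gradients.
        scaler.scale(total_loss).backward()
        scaler.step(optimizer)
        scaler.update()
    else:
        total_loss.backward()
        optimizer.step()
    if loss_logger is not None:
        loss_logger.update(loss=total_loss, **losses)  # assumed LossLogger API
    return {"loss": total_loss, **losses}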