Example #1
    def __call__(self, global_step, loss: AverageMeter, epoch: int, fp16: bool,
                 grad_scaler: torch.cuda.amp.GradScaler):
        if self.rank == 0 and global_step > 0 and global_step % self.frequent == 0:
            if self.init:
                try:
                    speed: float = self.frequent * self.batch_size / (
                        time.time() - self.tic)
                    speed_total = speed * self.world_size
                except ZeroDivisionError:
                    speed_total = float('inf')

                time_now = (time.time() - self.time_start) / 3600
                time_total = time_now / ((global_step + 1) / self.total_step)
                time_for_end = time_total - time_now
                if self.writer is not None:
                    self.writer.add_scalar('time_for_end', time_for_end,
                                           global_step)
                    self.writer.add_scalar('loss', loss.avg, global_step)
                if fp16:
                    msg = "Speed %.2f samples/sec   Loss %.4f   Epoch: %d   Global Step: %d   "\
                          "Fp16 Grad Scale: %2.f   Required: %1.f hours" % (
                        speed_total, loss.avg, epoch, global_step, grad_scaler.get_scale(), time_for_end
                    )
                else:
                    msg = "Speed %.2f samples/sec   Loss %.4f   Epoch: %d   Global Step: %d   Required: %1.f hours" % (
                        speed_total, loss.avg, epoch, global_step,
                        time_for_end)
                logging.info(msg)
                loss.reset()
                self.tic = time.time()
            else:
                self.init = True
                self.tic = time.time()
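All of the snippets on this page rely on an AverageMeter helper plus the time, logging, and (for the fp16 variants) torch modules, none of which are shown. Below is a minimal sketch of such a meter under the usual val/avg/sum/count convention; the exact implementation behind these examples may differ.

import time
import logging

import torch  # only needed by the fp16 / GradScaler variants


class AverageMeter(object):
    """Keeps a running average of a scalar value, e.g. the training loss."""

    def __init__(self):
        self.reset()

    def reset(self):
        # Clear all accumulated statistics.
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        # Record a value observed over n samples and refresh the running mean.
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count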
Example #2
    def __call__(self, global_step, loss: AverageMeter, epoch: int,
                 lr_backbone_value, lr_pfc_value):
        if self.rank == 0 and global_step > 0 and global_step % self.frequent == 0:
            if self.init:
                try:
                    speed: float = self.frequent * self.batch_size / (
                        time.time() - self.tic)
                    speed_total = speed * self.world_size
                except ZeroDivisionError:
                    speed_total = float('inf')

                time_now = (time.time() - self.time_start) / 3600
                time_total = time_now / ((global_step + 1) / self.total_step)
                time_for_end = time_total - time_now
                if self.writer is not None:
                    self.writer.add_scalar('time_for_end', time_for_end,
                                           global_step)
                    self.writer.add_scalar('loss', loss.avg, global_step)
                msg = "Speed %.2f samples/sec   Loss %.4f   Epoch: %d   Global Step: %d   Required: %1.f hours, lr_backbone_value: %f, lr_pfc_value: %f" % (
                    speed_total, loss.avg, epoch, global_step, time_for_end,
                    lr_backbone_value, lr_pfc_value)
                logging.info(msg)
                loss.reset()
                self.tic = time.time()
            else:
                self.init = True
                self.tic = time.time()
Example #3
class CallBackLogging(object):
    def __init__(self,
                 frequent,
                 total_step,
                 batch_size,
                 world_size,
                 writer=None):
        self.frequent: int = frequent

        self.time_start = time.time()
        self.total_step: int = total_step
        self.batch_size: int = batch_size
        self.world_size: int = world_size
        self.writer = writer

        self.init = False
        self.tic = 0
        self.losses = AverageMeter()

    def metric_cb(self, global_step: int, epoch: int, learning_rate: float):
        def callback(loss):
            loss = loss.mean()
            self.losses.update(loss, 1)
            if global_step % self.frequent == 0:

                if self.init:
                    try:
                        speed: float = self.frequent * self.batch_size / (
                            time.time() - self.tic)
                        speed_total = speed * self.world_size
                    except ZeroDivisionError:
                        speed_total = float('inf')

                    time_now = (time.time() - self.time_start) / 3600
                    time_total = time_now / (
                        (global_step + 1) / self.total_step)
                    time_for_end = time_total - time_now
                    # Write scalars to TensorBoard when a writer is available,
                    # then always build and log the progress message.
                    if self.writer is not None:
                        self.writer.add_scalar('time_for_end', time_for_end,
                                               global_step)
                        self.writer.add_scalar('learning_rate', learning_rate,
                                               global_step)
                        self.writer.add_scalar('loss', self.losses.avg,
                                               global_step)
                    msg = "Speed %.2f samples/sec   Loss %.4f   LearningRate %.4f   Epoch: %d   Global Step: %d   " \
                          "Required: %1.f hours" % (
                              speed_total, self.losses.avg, learning_rate, epoch, global_step, time_for_end
                          )
                    logging.info(msg)
                    self.losses.reset()
                    self.tic = time.time()
                else:
                    self.init = True
                    self.tic = time.time()

        return callback
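For context, a closure returned by metric_cb could be driven from a training loop roughly as follows. This is only a sketch: num_epochs, train_loader, and train_step are hypothetical names, not part of the example, and the per-step loss is assumed to be an array-like object with a .mean() method.

# Hypothetical wiring of CallBackLogging.metric_cb into a training loop.
callback_logging = CallBackLogging(frequent=50, total_step=100_000,
                                   batch_size=128, world_size=1, writer=None)

global_step = 0
for epoch in range(num_epochs):          # num_epochs is assumed
    for batch in train_loader:           # train_loader is assumed
        loss = train_step(batch)         # assumed to return per-sample losses
        cb = callback_logging.metric_cb(global_step, epoch, learning_rate=0.1)
        cb(loss)                         # updates the meter; logs every `frequent` steps
        global_step += 1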
Example #4
    def __call__(
        self,
        global_step: int,
        loss: AverageMeter,
        epoch: int,
        fp16: bool,
        learning_rate: float,
        grad_scaler=None,
    ):
        if self.rank == 0 and global_step % self.frequent == 0:
            if self.init:
                try:
                    speed: float = self.frequent * self.batch_size / (
                        time.time() - self.tic)
                    speed_total = speed * self.world_size
                except ZeroDivisionError:
                    speed_total = float("inf")

                time_now = (time.time() - self.time_start) / 3600
                time_total = time_now / ((global_step + 1) / self.total_step)
                time_for_end = time_total - time_now
                if self.writer is not None:
                    self.writer.add_scalar("time_for_end", time_for_end,
                                           global_step)
                    self.writer.add_scalar("learning_rate", learning_rate,
                                           global_step)
                    self.writer.add_scalar("loss", loss.avg, global_step)
                if fp16:
                    msg = (
                        "Speed %.2f samples/sec   Loss %.4f   LearningRate %.4f   Epoch: %d   Global Step: %d   "
                        "Fp16 Grad Scale: %2.f   Required: %1.f hours" % (
                            speed_total,
                            loss.avg,
                            learning_rate,
                            epoch,
                            global_step,
                            grad_scaler.get_scale(),
                            time_for_end,
                        ))
                else:
                    msg = (
                        "Speed %.2f samples/sec   Loss %.4f   LearningRate %.4f   Epoch: %d   Global Step: %d   "
                        "Required: %1.f hours" % (
                            speed_total,
                            loss.avg,
                            learning_rate,
                            epoch,
                            global_step,
                            time_for_end,
                        ))
                logging.info(msg)
                loss.reset()
                self.tic = time.time()
            else:
                self.init = True
                self.tic = time.time()
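The fp16-aware __call__ in Example #4 would typically be invoked once per optimizer step from a PyTorch AMP loop, for example as sketched below. The model, criterion, optimizer, train_loader, and num_epochs names are assumptions, and the constructor is assumed to also set self.rank (the __init__ shown in Example #3 does not).

# Hypothetical PyTorch AMP training loop around the __call__ variant above.
loss_meter = AverageMeter()
grad_scaler = torch.cuda.amp.GradScaler()
callback_logging = CallBackLogging(frequent=50, total_step=100_000,
                                   batch_size=128, world_size=1)

global_step = 0
for epoch in range(num_epochs):                      # num_epochs is assumed
    for images, labels in train_loader:              # train_loader is assumed
        optimizer.zero_grad()                        # optimizer is assumed
        with torch.cuda.amp.autocast():
            loss = criterion(model(images), labels)  # model/criterion are assumed
        grad_scaler.scale(loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()

        loss_meter.update(loss.item(), 1)
        callback_logging(global_step, loss_meter, epoch,
                         fp16=True, learning_rate=0.1, grad_scaler=grad_scaler)
        global_step += 1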