def __call__(self, global_step, loss: AverageMeter, epoch: int, fp16: bool,
             grad_scaler: torch.cuda.amp.GradScaler):
    """Log throughput, loss, and ETA every ``self.frequent`` steps on rank 0.

    Args:
        global_step: Current global optimizer step (monotonically increasing).
        loss: Running loss meter; ``loss.avg`` is reported and the meter is
            reset after each report.
        epoch: Current epoch index (included in the log line only).
        fp16: When True, the AMP gradient scale is included in the message.
        grad_scaler: AMP scaler queried for its current scale when ``fp16``.
    """
    # BUG FIX: original used `self.rank is 0` — identity comparison with an
    # int literal only works by CPython small-int caching; use `==`.
    if self.rank == 0 and global_step > 0 and global_step % self.frequent == 0:
        if self.init:
            try:
                # Per-rank samples/sec since the previous report.
                speed: float = self.frequent * self.batch_size / (
                    time.time() - self.tic)
                speed_total = speed * self.world_size
            except ZeroDivisionError:
                speed_total = float('inf')

            # Elapsed hours, extrapolated total from step progress, and ETA.
            time_now = (time.time() - self.time_start) / 3600
            time_total = time_now / ((global_step + 1) / self.total_step)
            time_for_end = time_total - time_now
            if self.writer is not None:
                self.writer.add_scalar('time_for_end', time_for_end, global_step)
                self.writer.add_scalar('loss', loss.avg, global_step)
            if fp16:
                msg = "Speed %.2f samples/sec Loss %.4f Epoch: %d Global Step: %d "\
                    "Fp16 Grad Scale: %2.f Required: %1.f hours" % (
                        speed_total, loss.avg, epoch, global_step,
                        grad_scaler.get_scale(), time_for_end
                    )
            else:
                msg = "Speed %.2f samples/sec Loss %.4f Epoch: %d Global Step: %d Required: %1.f hours" % (
                    speed_total, loss.avg, epoch, global_step, time_for_end)
            logging.info(msg)
            loss.reset()
            self.tic = time.time()
        else:
            # First eligible step only primes the timer — no stable speed yet.
            self.init = True
            self.tic = time.time()
def __call__(self, global_step, loss: AverageMeter, epoch: int,
             lr_backbone_value, lr_pfc_value):
    """Log throughput, loss, ETA, and both learning rates every
    ``self.frequent`` steps on rank 0.

    Args:
        global_step: Current global optimizer step.
        loss: Running loss meter; reported via ``loss.avg`` and then reset.
        epoch: Current epoch index (for the log line only).
        lr_backbone_value: Backbone learning rate to report.
        lr_pfc_value: Partial-FC learning rate to report.
    """
    # BUG FIX: original used `self.rank is 0` — identity comparison with an
    # int literal only works by CPython small-int caching; use `==`.
    if self.rank == 0 and global_step > 0 and global_step % self.frequent == 0:
        if self.init:
            try:
                # Per-rank samples/sec since the previous report.
                speed: float = self.frequent * self.batch_size / (
                    time.time() - self.tic)
                speed_total = speed * self.world_size
            except ZeroDivisionError:
                speed_total = float('inf')

            # Elapsed hours, extrapolated total from step progress, and ETA.
            time_now = (time.time() - self.time_start) / 3600
            time_total = time_now / ((global_step + 1) / self.total_step)
            time_for_end = time_total - time_now
            if self.writer is not None:
                self.writer.add_scalar('time_for_end', time_for_end, global_step)
                self.writer.add_scalar('loss', loss.avg, global_step)
            msg = "Speed %.2f samples/sec Loss %.4f Epoch: %d Global Step: %d Required: %1.f hours, lr_backbone_value: %f, lr_pfc_value: %f" % (
                speed_total, loss.avg, epoch, global_step, time_for_end,
                lr_backbone_value, lr_pfc_value)
            logging.info(msg)
            loss.reset()
            self.tic = time.time()
        else:
            # First eligible step only primes the timer — no stable speed yet.
            self.init = True
            self.tic = time.time()
class CallBackLogging(object):
    """Periodic console/TensorBoard training logger driven by a per-step
    callback (callback-style API, e.g. for OneFlow-like training loops)."""

    def __init__(self, frequent, total_step, batch_size, world_size, writer=None):
        self.frequent: int = frequent      # report once every `frequent` steps
        self.time_start = time.time()      # wall-clock start, for ETA
        self.total_step: int = total_step  # planned total steps, for ETA
        self.batch_size: int = batch_size  # per-rank batch size
        self.world_size: int = world_size  # number of ranks
        self.writer = writer               # optional summary writer (TensorBoard-like)
        self.init = False                  # True once the speed timer is primed
        self.tic = 0
        self.losses = AverageMeter()       # accumulates loss between reports

    def metric_cb(self, global_step: int, epoch: int, learning_rate: float):
        """Return a callback that records `loss` and reports periodically.

        Args:
            global_step: Step at which the returned callback will fire.
            epoch: Current epoch index (for the log line only).
            learning_rate: Current LR, logged to the writer and the console.
        """
        def callback(loss):
            loss = loss.mean()  # reduce to a scalar before accumulating
            self.losses.update(loss, 1)
            if global_step % self.frequent == 0:
                if self.init:
                    try:
                        # Per-rank samples/sec since the previous report.
                        speed: float = self.frequent * self.batch_size / (
                            time.time() - self.tic)
                        speed_total = speed * self.world_size
                    except ZeroDivisionError:
                        speed_total = float('inf')

                    # Elapsed hours, extrapolated total, and remaining ETA.
                    time_now = (time.time() - self.time_start) / 3600
                    time_total = time_now / (
                        (global_step + 1) / self.total_step)
                    time_for_end = time_total - time_now
                    if self.writer is not None:
                        self.writer.add_scalar('time_for_end', time_for_end,
                                               global_step)
                        self.writer.add_scalar('learning_rate', learning_rate,
                                               global_step)
                        # BUG FIX: original wrote `loss.avg`, but `loss` here
                        # is the already-reduced scalar; the running average
                        # lives on the meter.
                        self.writer.add_scalar('loss', self.losses.avg,
                                               global_step)
                    # BUG FIX: console logging, meter reset, and timer update
                    # were in the `else:` of the writer check, so with a writer
                    # attached nothing was ever logged, the meter never reset,
                    # and `self.tic` froze (corrupting all later speed values).
                    # They must run on every report, as in the other loggers.
                    msg = "Speed %.2f samples/sec Loss %.4f LearningRate %.4f Epoch: %d Global Step: %d " \
                          "Required: %1.f hours" % (
                              speed_total, self.losses.avg, learning_rate,
                              epoch, global_step, time_for_end
                          )
                    logging.info(msg)
                    self.losses.reset()
                    self.tic = time.time()
                else:
                    # First eligible step only primes the timer.
                    self.init = True
                    self.tic = time.time()
        return callback
def __call__(
    self,
    global_step: int,
    loss: AverageMeter,
    epoch: int,
    fp16: bool,
    learning_rate: float,
    grad_scaler=None,
):
    """Log throughput, loss, learning rate, and ETA every ``self.frequent``
    steps on rank 0.

    Args:
        global_step: Current global optimizer step.
        loss: Running loss meter; reported via ``loss.avg`` and then reset.
        epoch: Current epoch index (for the log line only).
        fp16: When True, the AMP gradient scale is included in the message
            (``grad_scaler`` must then be provided).
        learning_rate: Current learning rate to report.
        grad_scaler: Optional AMP scaler queried for its scale when ``fp16``.
    """
    if self.rank == 0 and global_step % self.frequent == 0:
        if self.init:
            try:
                # Per-rank samples/sec since the previous report.
                speed: float = self.frequent * self.batch_size / (
                    time.time() - self.tic)
                speed_total = speed * self.world_size
            except ZeroDivisionError:
                speed_total = float("inf")

            # Elapsed hours, extrapolated total, and remaining ETA.
            time_now = (time.time() - self.time_start) / 3600
            time_total = time_now / ((global_step + 1) / self.total_step)
            time_for_end = time_total - time_now
            if self.writer is not None:
                self.writer.add_scalar("time_for_end", time_for_end, global_step)
                self.writer.add_scalar("learning_rate", learning_rate, global_step)
                self.writer.add_scalar("loss", loss.avg, global_step)
            if fp16:
                # BUG FIX: the format string has 7 specifiers but the original
                # supplied only 6 arguments (grad scale was missing), raising
                # "TypeError: not enough arguments for format string".
                msg = (
                    "Speed %.2f samples/sec Loss %.4f LearningRate %.4f Epoch: %d Global Step: %d "
                    "Fp16 Grad Scale: %2.f Required: %1.f hours" % (
                        speed_total,
                        loss.avg,
                        learning_rate,
                        epoch,
                        global_step,
                        grad_scaler.get_scale(),
                        time_for_end,
                    ))
            else:
                msg = (
                    "Speed %.2f samples/sec Loss %.4f LearningRate %.4f Epoch: %d Global Step: %d "
                    "Required: %1.f hours" % (
                        speed_total,
                        loss.avg,
                        learning_rate,
                        epoch,
                        global_step,
                        time_for_end,
                    ))
            logging.info(msg)
            loss.reset()
            self.tic = time.time()
        else:
            # First eligible step only primes the timer — no stable speed yet.
            self.init = True
            self.tic = time.time()