class TrainingStats(object): """Track vital training statistics.""" def __init__(self, model): # Window size for smoothing tracked values (with median filtering) self.WIN_SZ = int(20 / cfg.NUM_GPUS) # Output logging period in SGD iterations self.LOG_PERIOD = int(20 /cfg.NUM_GPUS) self.smoothed_losses_and_metrics = { key: SmoothedValue(self.WIN_SZ) for key in model.losses + model.metrics } self.losses_and_metrics = { key: 0 for key in model.losses + model.metrics } self.smoothed_total_loss = SmoothedValue(self.WIN_SZ) self.smoothed_mb_qsize = SmoothedValue(self.WIN_SZ) self.iter_total_loss = np.nan self.iter_timer = Timer() self.model = model def IterTic(self): self.iter_timer.tic() def IterToc(self): return self.iter_timer.toc(average=False) def ResetIterTimer(self): self.iter_timer.reset() def UpdateIterStats(self): """Update tracked iteration statistics.""" for k in self.losses_and_metrics.keys(): if k in self.model.losses: self.losses_and_metrics[k] = nu.sum_multi_gpu_blob(k) else: self.losses_and_metrics[k] = nu.average_multi_gpu_blob(k) for k, v in self.smoothed_losses_and_metrics.items(): v.AddValue(self.losses_and_metrics[k]) self.iter_total_loss = np.sum( np.array([self.losses_and_metrics[k] for k in self.model.losses]) ) self.smoothed_total_loss.AddValue(self.iter_total_loss) self.smoothed_mb_qsize.AddValue( self.model.roi_data_loader._minibatch_queue.qsize() ) def LogIterStats(self, cur_iter, lr): """Log the tracked statistics.""" num_iter_per_epoch = self.model.roi_data_loader.get_num_iter_per_epoch() if (cur_iter % self.LOG_PERIOD == 0 or cur_iter == cfg.SOLVER.MAX_ITER * num_iter_per_epoch - 1): stats = self.GetStats(cur_iter, lr) log_json_stats(stats) def GetStats(self, cur_iter, lr): num_iter_per_epoch = self.model.roi_data_loader.get_num_iter_per_epoch() eta_seconds = self.iter_timer.average_time * ( cfg.SOLVER.MAX_ITER * num_iter_per_epoch - cur_iter ) eta = str(datetime.timedelta(seconds=int(eta_seconds))) mem_stats = c2_py_utils.GetGPUMemoryUsageStats() mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS]) stats = dict( iter=cur_iter, lr=float(lr), time=self.iter_timer.average_time, loss=self.smoothed_total_loss.GetAverageValue(), eta=eta, mb_qsize=int( np.round(self.smoothed_mb_qsize.GetAverageValue()) ), mem=int(np.ceil(mem_usage / 1024 / 1024)) ) for k, v in self.smoothed_losses_and_metrics.items(): stats[k] = v.GetAverageValue() return stats
class TrainingStats(object): """Track vital training statistics.""" def __init__(self, model): # Window size for smoothing tracked values (with median filtering) self.WIN_SZ = int(1280 / cfg.NUM_GPUS) # Output logging period in SGD iterations self.LOG_PERIOD = int(1280 / cfg.NUM_GPUS) self.smoothed_losses_and_metrics = { key: SmoothedValue(self.WIN_SZ) for key in model.losses + model.metrics } self.losses_and_metrics = { key: 0 for key in model.losses + model.metrics } self.smoothed_total_loss = SmoothedValue(self.WIN_SZ) self.smoothed_mb_qsize = SmoothedValue(self.WIN_SZ) self.iter_total_loss = np.nan self.iter_timer = Timer() self.model = model self.mem = dict() self.mem = None def IterTic(self): self.iter_timer.tic() def IterToc(self): return self.iter_timer.toc(average=False) def ResetIterTimer(self): self.iter_timer.reset() def UpdateIterStats(self): """Update tracked iteration statistics.""" for k in self.losses_and_metrics.keys(): self.losses_and_metrics[k] = nu.average_multi_gpu_blob(k) for k, v in self.smoothed_losses_and_metrics.items(): v.AddValue(self.losses_and_metrics[k]) self.iter_total_loss = np.sum( np.array([self.losses_and_metrics[k] for k in self.model.losses])) self.smoothed_total_loss.AddValue(self.iter_total_loss) self.smoothed_mb_qsize.AddValue( self.model.roi_data_loader._minibatch_queue.qsize()) if self.mem is not None: self.GetMem() def LogIterStats(self, cur_iter, lr): """Log the tracked statistics.""" if (cur_iter % self.LOG_PERIOD == 0 or cur_iter == cfg.SOLVER.MAX_ITER - 1): stats = self.GetStats(cur_iter, lr) log_json_stats(stats) if self.mem is not None: mem_sorted = sorted(self.mem.items(), key=lambda d: d[1]) print(mem_sorted) def GetStats(self, cur_iter, lr): eta_seconds = self.iter_timer.average_time * (cfg.SOLVER.MAX_ITER - cur_iter) eta = str(datetime.timedelta(seconds=int(eta_seconds))) mem_stats = c2_py_utils.GetGPUMemoryUsageStats() mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS]) stats = dict(iter=cur_iter, lr=float(lr), time=self.iter_timer.average_time, loss=self.smoothed_total_loss.GetAverageValue(), eta=eta, mb_qsize=int( np.round(self.smoothed_mb_qsize.GetAverageValue())), mem=int(np.ceil(mem_usage / 1024 / 1024))) for k, v in self.smoothed_losses_and_metrics.items(): stats[k] = v.GetAverageValue() return stats def is_grad(self, b): name = str(b) return name.endswith("_grad") def is_shared(self, b): name = str(b) return name.endswith("_shared") def GetMem(self): for op_idx in range(len(self.model.net._net.op)): op = self.model.net._net.op[op_idx] for b in list(op.output): if self.is_grad(b): pass elif self.is_shared(b): pass else: continue blob = workspace.FetchBlob(str(b)) if b not in self.mem.keys(): self.mem[str(b)] = 0 self.mem[str(b)] = max(self.mem[str(b)], blob.size)