# Imports for the code below; the module paths follow pycls conventions and
# are assumptions here. `ScalarMeter`, `gpu_mem_usage`, and `time_string` are
# assumed to be defined earlier in this module (as in pycls-style meter files).
import numpy as np
import torch

import pycls.core.logging as logging
from pycls.core.config import cfg
from pycls.core.timer import Timer

logger = logging.get_logger(__name__)


def compute_full_loader(data_loader, epoch=1):
    """Times full passes over the loader; returns per-epoch average fetch time."""
    timer = Timer()
    epoch_avg = []
    data_loader_len = len(data_loader)
    for j in range(epoch):
        timer.tic()
        for i, (inputs, labels) in enumerate(data_loader):
            inputs = inputs.cuda()
            labels = labels.cuda()
            timer.toc()
            logger.info(
                "Epoch {}/{}, Iter {}/{}: Dataloader time is: {}".format(
                    j + 1, epoch, i + 1, data_loader_len, timer.diff))
            timer.tic()
        epoch_avg.append(timer.average_time)
        timer.reset()
    return epoch_avg
def compute_time_loader(data_loader):
    """Computes loader time."""
    timer = Timer()
    data_loader_iterator = iter(data_loader)
    total_iter = cfg.PREC_TIME.NUM_ITER + cfg.PREC_TIME.WARMUP_ITER
    total_iter = min(total_iter, len(data_loader))
    for cur_iter in range(total_iter):
        # Reset the timer after the warmup phase
        if cur_iter == cfg.PREC_TIME.WARMUP_ITER:
            timer.reset()
        timer.tic()
        next(data_loader_iterator)
        timer.toc()
    return timer.average_time
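
# A minimal usage sketch (not part of the original file) combining the two
# loader-timing helpers above. `train_loader` is a hypothetical
# torch.utils.data.DataLoader yielding (inputs, labels) batches.
def _example_benchmark_loader(train_loader):
    # Per-epoch average fetch time over two full passes
    epoch_avgs = compute_full_loader(train_loader, epoch=2)
    logger.info("Full-loader avg fetch time per epoch: {}".format(epoch_avgs))
    # Warmed-up average fetch time over a fixed number of iterations
    avg_iter_time = compute_time_loader(train_loader)
    logger.info("Loader time: {:.4f}s/iter".format(avg_iter_time))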
def compute_time_train(model, loss_fun):
    """Computes precise model forward + backward time using dummy data."""
    # Use train mode
    model.train()
    # Generate a dummy mini-batch and copy data to GPU
    im_size, batch_size = cfg.TRAIN.IM_SIZE, int(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS)
    inputs = torch.rand(batch_size, 3, im_size, im_size).cuda(non_blocking=False)
    labels = torch.zeros(batch_size, dtype=torch.int64).cuda(non_blocking=False)
    # Cache BatchNorm2D running stats so timing does not perturb them
    bns = [m for m in model.modules() if isinstance(m, torch.nn.BatchNorm2d)]
    bn_stats = [[bn.running_mean.clone(), bn.running_var.clone()] for bn in bns]
    # Compute precise forward backward pass time
    fw_timer, bw_timer = Timer(), Timer()
    total_iter = cfg.PREC_TIME.NUM_ITER + cfg.PREC_TIME.WARMUP_ITER
    for cur_iter in range(total_iter):
        # Reset the timers after the warmup phase
        if cur_iter == cfg.PREC_TIME.WARMUP_ITER:
            fw_timer.reset()
            bw_timer.reset()
        # Forward
        fw_timer.tic()
        if cfg.DARTS.AUX_WEIGHT > 0 and cfg.MODEL.TYPE == 'darts_cnn':
            # DARTS-style models also return auxiliary-head predictions
            preds, aux_preds = model(inputs)
            loss = loss_fun(preds, labels)
            loss += cfg.DARTS.AUX_WEIGHT * loss_fun(aux_preds, labels)
        else:
            preds = model(inputs)
            loss = loss_fun(preds, labels)
        torch.cuda.synchronize()
        fw_timer.toc()
        # Backward
        bw_timer.tic()
        loss.backward()
        torch.cuda.synchronize()
        bw_timer.toc()
    # Restore BatchNorm2D running stats
    for bn, (mean, var) in zip(bns, bn_stats):
        bn.running_mean, bn.running_var = mean, var
    return fw_timer.average_time, bw_timer.average_time
def compute_time_eval(model):
    """Computes precise model forward test time using dummy data."""
    # Use eval mode
    model.eval()
    # Generate a dummy mini-batch and copy data to GPU
    im_size, batch_size = cfg.TRAIN.IM_SIZE, int(cfg.TEST.BATCH_SIZE / cfg.NUM_GPUS)
    inputs = torch.zeros(batch_size, 3, im_size, im_size).cuda(non_blocking=False)
    # Compute precise forward pass time
    timer = Timer()
    total_iter = cfg.PREC_TIME.NUM_ITER + cfg.PREC_TIME.WARMUP_ITER
    for cur_iter in range(total_iter):
        # Reset the timer after the warmup phase
        if cur_iter == cfg.PREC_TIME.WARMUP_ITER:
            timer.reset()
        # Forward
        timer.tic()
        model(inputs)
        torch.cuda.synchronize()
        timer.toc()
    return timer.average_time
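
# A minimal usage sketch (not part of the original file) reporting all three
# precise timings together. `model` and `loss_fun` are hypothetical stand-ins
# for a constructed network and its criterion; note that the model is left in
# train mode afterwards and accumulates gradients from the timed backward passes.
def _example_compute_time_model(model, loss_fun):
    test_fw_time = compute_time_eval(model)
    train_fw_time, train_bw_time = compute_time_train(model, loss_fun)
    logger.info(
        "test_fw: {:.4f}s  train_fw: {:.4f}s  train_bw: {:.4f}s  "
        "train_fw+bw: {:.4f}s".format(
            test_fw_time, train_fw_time, train_bw_time,
            train_fw_time + train_bw_time
        )
    )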
class TrainMeter(object):
    """Measures training stats."""

    def __init__(self, epoch_iters):
        self.epoch_iters = epoch_iters
        self.max_iter = cfg.OPTIM.MAX_EPOCH * epoch_iters
        self.iter_timer = Timer()
        self.loss = ScalarMeter(cfg.LOG_PERIOD)
        self.loss_total = 0.0
        self.lr = None
        # Current minibatch errors (smoothed over a window)
        self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD)
        # Number of misclassified examples
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0

    def reset(self, timer=False):
        if timer:
            self.iter_timer.reset()
        self.loss.reset()
        self.loss_total = 0.0
        self.lr = None
        self.mb_top1_err.reset()
        self.mb_top5_err.reset()
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0

    def iter_tic(self):
        self.iter_timer.tic()

    def iter_toc(self):
        self.iter_timer.toc()

    def update_stats(self, top1_err, top5_err, loss, lr, mb_size):
        # Current minibatch stats
        self.mb_top1_err.add_value(top1_err)
        self.mb_top5_err.add_value(top5_err)
        self.loss.add_value(loss)
        self.lr = lr
        # Aggregate stats
        self.num_top1_mis += top1_err * mb_size
        self.num_top5_mis += top5_err * mb_size
        self.loss_total += loss * mb_size
        self.num_samples += mb_size

    def get_iter_stats(self, cur_epoch, cur_iter):
        cur_iter_total = cur_epoch * self.epoch_iters + cur_iter + 1
        eta_sec = self.iter_timer.average_time * (self.max_iter - cur_iter_total)
        mem_usage = gpu_mem_usage()
        stats = {
            "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.epoch_iters),
            "time_avg": self.iter_timer.average_time,
            "time_diff": self.iter_timer.diff,
            "eta": time_string(eta_sec),
            "top1_err": self.mb_top1_err.get_win_median(),
            "top5_err": self.mb_top5_err.get_win_median(),
            "loss": self.loss.get_win_median(),
            "lr": self.lr,
            "mem": int(np.ceil(mem_usage)),
        }
        return stats

    def log_iter_stats(self, cur_epoch, cur_iter):
        if (cur_iter + 1) % cfg.LOG_PERIOD != 0:
            return
        stats = self.get_iter_stats(cur_epoch, cur_iter)
        logger.info(logging.dump_log_data(stats, "train_iter"))

    def get_epoch_stats(self, cur_epoch):
        cur_iter_total = (cur_epoch + 1) * self.epoch_iters
        eta_sec = self.iter_timer.average_time * (self.max_iter - cur_iter_total)
        mem_usage = gpu_mem_usage()
        top1_err = self.num_top1_mis / self.num_samples
        top5_err = self.num_top5_mis / self.num_samples
        avg_loss = self.loss_total / self.num_samples
        stats = {
            "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH),
            "time_avg": self.iter_timer.average_time,
            "eta": time_string(eta_sec),
            "top1_err": top1_err,
            "top5_err": top5_err,
            "loss": avg_loss,
            "lr": self.lr,
            "mem": int(np.ceil(mem_usage)),
        }
        return stats

    def log_epoch_stats(self, cur_epoch, tensorboard_writer=None):
        stats = self.get_epoch_stats(cur_epoch)
        logger.info(logging.dump_log_data(stats, "train_epoch"))
        if tensorboard_writer is not None:
            tensorboard_writer.add_scalar('train/top1', stats['top1_err'], cur_epoch)
            tensorboard_writer.add_scalar('train/top5', stats['top5_err'], cur_epoch)
            tensorboard_writer.add_scalar('train/loss', stats['loss'], cur_epoch)
            tensorboard_writer.add_scalar('train/lr', stats['lr'], cur_epoch)
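
# A minimal usage sketch (not part of the original file): one training epoch
# driven by a TrainMeter. `model`, `loss_fun`, `optimizer`, and `train_loader`
# are hypothetical stand-ins for the trainer's usual objects; real code would
# typically use a shared topk-errors helper instead of the inline computation.
def _example_train_epoch(model, loss_fun, optimizer, train_loader, meter, cur_epoch):
    model.train()
    lr = optimizer.param_groups[0]["lr"]
    meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        preds = model(inputs)
        loss = loss_fun(preds, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Top-k errors in percent, matching what update_stats() expects
        _, topk_idx = preds.topk(5, dim=1)
        match = topk_idx.eq(labels.view(-1, 1))
        top1_err = 100.0 * (1.0 - match[:, :1].any(dim=1).float().mean().item())
        top5_err = 100.0 * (1.0 - match.any(dim=1).float().mean().item())
        meter.iter_toc()
        meter.update_stats(top1_err, top5_err, loss.item(), lr, inputs.size(0))
        meter.log_iter_stats(cur_epoch, cur_iter)
        meter.iter_tic()
    meter.log_epoch_stats(cur_epoch)
    meter.reset()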
class TestMeter(object):
    """Measures testing stats."""

    def __init__(self, max_iter):
        self.max_iter = max_iter
        self.iter_timer = Timer()
        # Current minibatch errors (smoothed over a window)
        self.mb_top1_err = ScalarMeter(cfg.LOG_PERIOD)
        self.mb_top5_err = ScalarMeter(cfg.LOG_PERIOD)
        # Min errors (over the full test set)
        self.min_top1_err = 100.0
        self.min_top5_err = 100.0
        # Number of misclassified examples
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0
        self.is_best = False

    def reset(self, min_errs=False):
        if min_errs:
            self.min_top1_err = 100.0
            self.min_top5_err = 100.0
        self.iter_timer.reset()
        self.mb_top1_err.reset()
        self.mb_top5_err.reset()
        self.num_top1_mis = 0
        self.num_top5_mis = 0
        self.num_samples = 0

    def iter_tic(self):
        self.iter_timer.tic()

    def iter_toc(self):
        self.iter_timer.toc()

    def update_stats(self, top1_err, top5_err, mb_size):
        self.mb_top1_err.add_value(top1_err)
        self.mb_top5_err.add_value(top5_err)
        self.num_top1_mis += top1_err * mb_size
        self.num_top5_mis += top5_err * mb_size
        self.num_samples += mb_size

    def get_iter_stats(self, cur_epoch, cur_iter):
        mem_usage = gpu_mem_usage()
        iter_stats = {
            "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH),
            "iter": "{}/{}".format(cur_iter + 1, self.max_iter),
            "time_avg": self.iter_timer.average_time,
            "time_diff": self.iter_timer.diff,
            "top1_err": self.mb_top1_err.get_win_median(),
            "top5_err": self.mb_top5_err.get_win_median(),
            "mem": int(np.ceil(mem_usage)),
        }
        return iter_stats

    def log_iter_stats(self, cur_epoch, cur_iter):
        if (cur_iter + 1) % cfg.LOG_PERIOD != 0:
            return
        stats = self.get_iter_stats(cur_epoch, cur_iter)
        logger.info(logging.dump_log_data(stats, "test_iter"))

    def get_epoch_stats(self, cur_epoch):
        top1_err = self.num_top1_mis / self.num_samples
        top5_err = self.num_top5_mis / self.num_samples
        # Recompute is_best every epoch so a stale True value does not persist
        self.is_best = top1_err < self.min_top1_err
        self.min_top1_err = min(self.min_top1_err, top1_err)
        self.min_top5_err = min(self.min_top5_err, top5_err)
        mem_usage = gpu_mem_usage()
        stats = {
            "epoch": "{}/{}".format(cur_epoch + 1, cfg.OPTIM.MAX_EPOCH),
            "time_avg": self.iter_timer.average_time,
            "top1_err": top1_err,
            "top5_err": top5_err,
            "min_top1_err": self.min_top1_err,
            "min_top5_err": self.min_top5_err,
            "mem": int(np.ceil(mem_usage)),
        }
        return stats

    def log_epoch_stats(self, cur_epoch, tensorboard_writer=None):
        stats = self.get_epoch_stats(cur_epoch)
        logger.info(logging.dump_log_data(stats, "test_epoch"))
        if tensorboard_writer is not None:
            tensorboard_writer.add_scalar('test/top1', stats['top1_err'], cur_epoch)
            tensorboard_writer.add_scalar('test/top5', stats['top5_err'], cur_epoch)
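
# A minimal usage sketch (not part of the original file): one evaluation pass
# driven by a TestMeter. `model` and `test_loader` are hypothetical stand-ins.
@torch.no_grad()
def _example_test_epoch(model, test_loader, meter, cur_epoch):
    model.eval()
    meter.iter_tic()
    for cur_iter, (inputs, labels) in enumerate(test_loader):
        inputs, labels = inputs.cuda(), labels.cuda(non_blocking=True)
        preds = model(inputs)
        # Top-k errors in percent, matching what update_stats() expects
        _, topk_idx = preds.topk(5, dim=1)
        match = topk_idx.eq(labels.view(-1, 1))
        top1_err = 100.0 * (1.0 - match[:, :1].any(dim=1).float().mean().item())
        top5_err = 100.0 * (1.0 - match.any(dim=1).float().mean().item())
        meter.iter_toc()
        meter.update_stats(top1_err, top5_err, inputs.size(0))
        meter.log_iter_stats(cur_epoch, cur_iter)
        meter.iter_tic()
    meter.log_epoch_stats(cur_epoch)
    # meter.is_best can now be consulted, e.g., to checkpoint the best model
    meter.reset()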