def reduce_gradients(model, sync=False):
    """ average gradients """
    if sync:
        # sync mode: explicitly allreduce every gradient after backward
        for name, param in model.named_parameters():
            if param.requires_grad:
                link.allreduce(param.grad.data)
    else:
        # non-sync mode: gradients are reduced asynchronously elsewhere; wait for completion
        link.synchronize()
def sync_gradients(self):
    """ average gradients """
    if self.sync and link.get_world_size() > 1:
        for name, param in self.module.named_parameters():
            if param.requires_grad:
                link.allreduce(param.grad.data)
    else:
        link.synchronize()
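# --- hedged sketch, not part of the original code ----------------------------
# `link` here looks like a linklink-style wrapper whose allreduce sums tensors
# across ranks; gradient *averaging* falls out of the 1/world_size loss scaling
# done in train() below. The hypothetical helper below shows the same pattern
# with plain torch.distributed (process group assumed to be initialized
# elsewhere); it is an illustration, not the project's actual sync path.
import torch.distributed as dist


def reduce_gradients_dist(model, world_size):
    """Average gradients across ranks: sum-allreduce, then rescale."""
    for param in model.parameters():
        if param.requires_grad and param.grad is not None:
            dist.all_reduce(param.grad.data)   # defaults to a SUM reduction
            param.grad.data.div_(world_size)   # turn the sum into an average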
def evaluate(self):
    batch_time = AverageMeter(0)
    losses = AverageMeter(0)
    top1 = AverageMeter(0)
    top5 = AverageMeter(0)

    self.model.eval()
    criterion = torch.nn.CrossEntropyLoss()
    val_iter = len(self.val_data['loader'])

    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(self.val_data['loader']):
            input = input.cuda().half() if self.fp16 else input.cuda()
            target = target.squeeze().view(-1).cuda().long()
            logits = self.model(input)

            # measure accuracy and record loss
            # loss should not be scaled by 1/world_size here; it is reduced later
            loss = criterion(logits, target)
            prec1, prec5 = accuracy(logits.data, target, topk=(1, 5))

            num = input.size(0)
            losses.update(loss.item(), num)
            top1.update(prec1.item(), num)
            top5.update(prec5.item(), num)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i + 1) % self.config.saver.print_freq == 0:
                self.logger.info(
                    f'Test: [{i+1}/{val_iter}]\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})')

    # gather final results across ranks
    total_num = torch.Tensor([losses.count])
    loss_sum = torch.Tensor([losses.avg * losses.count])
    top1_sum = torch.Tensor([top1.avg * top1.count])
    top5_sum = torch.Tensor([top5.avg * top5.count])
    link.allreduce(total_num)
    link.allreduce(loss_sum)
    link.allreduce(top1_sum)
    link.allreduce(top5_sum)
    final_loss = loss_sum.item() / total_num.item()
    final_top1 = top1_sum.item() / total_num.item()
    final_top5 = top5_sum.item() / total_num.item()

    self.logger.info(
        f' * Prec@1 {final_top1:.3f}\tPrec@5 {final_top5:.3f}\t'
        f'Loss {final_loss:.3f}\ttotal_num={total_num.item()}')

    self.model.train()
    return final_loss, final_top1, final_top5
def validate(val_loader, model, fusion_list=None, fuse_prob=False):
    batch_time = AverageMeter(0)
    losses = AverageMeter(0)
    top1 = AverageMeter(0)
    top5 = AverageMeter(0)

    # switch to evaluate mode
    if fusion_list is not None:
        model_list = []
        for i in range(len(fusion_list)):
            model_list.append(model_entry(config.model))
            model_list[i].cuda()
            model_list[i] = DistModule(model_list[i], args.sync)
            load_state(fusion_list[i], model_list[i])
            model_list[i].eval()
        if fuse_prob:
            softmax = nn.Softmax(dim=1)
    else:
        model.eval()

    rank = link.get_rank()
    world_size = link.get_world_size()
    logger = logging.getLogger('global_logger')
    criterion = nn.CrossEntropyLoss()

    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            input = input.cuda() if not args.fp16 else input.half().cuda()
            target = target.cuda()

            # compute output (average the ensemble if fusion_list is given)
            if fusion_list is not None:
                output_list = []
                for model_idx in range(len(fusion_list)):
                    output = model_list[model_idx](input)
                    if fuse_prob:
                        output = softmax(output)
                    output_list.append(output)
                output = torch.stack(output_list, 0)
                output = torch.mean(output, 0)
            else:
                output = model(input)

            # measure accuracy and record loss
            # loss should not be scaled by 1/world_size here; it is reduced later
            loss = criterion(output, target)
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

            num = input.size(0)
            losses.update(loss.item(), num)
            top1.update(prec1.item(), num)
            top5.update(prec5.item(), num)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0 and rank == 0:
                logger.info('Test: [{0}/{1}]\t'
                            'Time {batch_time.val:.3f} ({batch_time.avg:.3f})'.format(
                                i, len(val_loader), batch_time=batch_time))

    # gather final results across ranks
    total_num = torch.Tensor([losses.count])
    loss_sum = torch.Tensor([losses.avg * losses.count])
    top1_sum = torch.Tensor([top1.avg * top1.count])
    top5_sum = torch.Tensor([top5.avg * top5.count])
    link.allreduce(total_num)
    link.allreduce(loss_sum)
    link.allreduce(top1_sum)
    link.allreduce(top5_sum)
    final_loss = loss_sum.item() / total_num.item()
    final_top1 = top1_sum.item() / total_num.item()
    final_top5 = top5_sum.item() / total_num.item()

    if rank == 0:
        logger.info(' * Prec@1 {:.3f}\tPrec@5 {:.3f}\tLoss {:.3f}\ttotal_num={}'.format(
            final_top1, final_top5, final_loss, total_num.item()))

    model.train()
    return final_loss, final_top1, final_top5
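# --- hedged note, not part of the original code -------------------------------
# The reduction above sums per-rank weighted totals (count, loss*count,
# prec*count) and divides once at the end, so the reported numbers are
# sample-weighted global averages rather than averages of per-rank averages.
# Illustrative two-rank example (made-up numbers):
#   rank 0: count=40, top1.avg=90.0  ->  contributes 40 and 3600
#   rank 1: count=60, top1.avg=80.0  ->  contributes 60 and 4800
#   final_top1 = (3600 + 4800) / (40 + 60) = 84.0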
def train(train_loader, val_loader, model, optimizer, lr_scheduler, start_iter, tb_logger):

    global best_prec1

    batch_time = AverageMeter(config.print_freq)
    fw_time = AverageMeter(config.print_freq)
    bp_time = AverageMeter(config.print_freq)
    sy_time = AverageMeter(config.print_freq)
    step_time = AverageMeter(config.print_freq)
    data_time = AverageMeter(config.print_freq)
    losses = AverageMeter(config.print_freq)
    top1 = AverageMeter(config.print_freq)
    top5 = AverageMeter(config.print_freq)

    # switch to train mode
    model.train()

    world_size = link.get_world_size()
    rank = link.get_rank()
    logger = logging.getLogger('global_logger')

    end = time.time()

    label_smooth = config.get('label_smooth', 0.0)
    if label_smooth > 0:
        logger.info('using label_smooth: {}'.format(label_smooth))
        criterion = LabelSmoothCELoss(label_smooth, 1000)
    else:
        criterion = nn.CrossEntropyLoss()

    for i, (input, target) in enumerate(train_loader):
        curr_step = start_iter + i
        lr_scheduler.step(curr_step)
        current_lr = lr_scheduler.get_lr()[0]

        # measure data loading time
        data_time.update(time.time() - end)

        # transfer input to gpu
        target = target.cuda()
        input = input.cuda() if not args.fp16 else input.cuda().half()

        # forward
        output = model(input)
        loss = criterion(output, target) / world_size

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))

        reduced_loss = loss.clone()
        reduced_prec1 = prec1.clone() / world_size
        reduced_prec5 = prec5.clone() / world_size

        link.allreduce(reduced_loss)
        link.allreduce(reduced_prec1)
        link.allreduce(reduced_prec5)

        losses.update(reduced_loss.item())
        top1.update(reduced_prec1.item())
        top5.update(reduced_prec5.item())

        # backward
        optimizer.zero_grad()

        if isinstance(optimizer, FusedFP16SGD):
            optimizer.backward(loss)
            reduce_gradients(model, args.sync)
            optimizer.step()
        elif isinstance(optimizer, FP16SGD):
            def closure():
                # backward
                optimizer.backward(loss, False)
                # sync gradients
                reduce_gradients(model, args.sync)
                # check overflow, convert to fp32 grads, downscale
                optimizer.update_master_grads()
                return loss
            optimizer.step(closure)
        else:
            loss.backward()
            reduce_gradients(model, args.sync)
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        if curr_step % config.print_freq == 0 and rank == 0:
            tb_logger.add_scalar('loss_train', losses.avg, curr_step)
            tb_logger.add_scalar('acc1_train', top1.avg, curr_step)
            tb_logger.add_scalar('acc5_train', top5.avg, curr_step)
            tb_logger.add_scalar('lr', current_lr, curr_step)
            logger.info('Iter: [{0}/{1}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                        'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                        'LR {lr:.4f}'.format(curr_step, len(train_loader),
                                             batch_time=batch_time,
                                             data_time=data_time,
                                             loss=losses,
                                             top1=top1,
                                             top5=top5,
                                             lr=current_lr))

        if curr_step > 0 and curr_step % config.val_freq == 0:
            val_loss, prec1, prec5 = validate(val_loader, model)

            if tb_logger is not None:
                tb_logger.add_scalar('loss_val', val_loss, curr_step)
                tb_logger.add_scalar('acc1_val', prec1, curr_step)
                tb_logger.add_scalar('acc5_val', prec5, curr_step)

            if rank == 0:
                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)
                save_checkpoint(
                    {
                        'step': curr_step,
                        'arch': config.model.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                    }, is_best, config.save_path + '/ckpt')

        end = time.time()
def train(train_loader, val_loader, model, optimizer, lr_scheduler, start_iter, tb_logger):

    global best_prec1

    batch_time = AverageMeter(config.print_freq)
    fw_time = AverageMeter(config.print_freq)
    bp_time = AverageMeter(config.print_freq)
    sy_time = AverageMeter(config.print_freq)
    step_time = AverageMeter(config.print_freq)
    data_time = AverageMeter(config.print_freq)
    losses = AverageMeter(config.print_freq)
    top1 = AverageMeter(config.print_freq)
    top5 = AverageMeter(config.print_freq)

    # switch to train mode
    model.train()

    world_size = link.get_world_size()
    rank = link.get_rank()
    logger = logging.getLogger('global_logger')

    end = time.time()

    label_smooth = config.get('label_smooth', 0.0)
    if label_smooth > 0:
        logger.info('using label_smooth: {}'.format(label_smooth))
        criterion = LabelSmoothCELoss(label_smooth, 1000)
    else:
        criterion = nn.CrossEntropyLoss()

    T_min, T_max = args.Tmin, args.Tmax

    def Log_UP(K_min, K_max, ITEMS, ALL_ITEMS):
        # log-linear ramp from K_min to K_max over ALL_ITEMS iterations
        Kmin, Kmax = math.log(K_min) / math.log(10), math.log(K_max) / math.log(10)
        return torch.tensor([math.pow(10, Kmin + (Kmax - Kmin) / ALL_ITEMS * ITEMS)]).float().cuda()

    TIME = time.time()

    for i, (input, target) in enumerate(train_loader):
        curr_step = start_iter + i
        lr_scheduler.step(curr_step)
        current_lr = lr_scheduler.get_lr()[0]

        if curr_step % config.print_freq == 0:
            t = Log_UP(T_min, T_max, curr_step, len(train_loader))
            if t < 1:
                k = 1 / t
            else:
                k = torch.tensor([1]).float().cuda()

            # refresh the temperature t and scale k on conv1/conv2 of every
            # residual block (3/4/6/3 blocks per stage)
            for j in range(3):
                model.module.layer1[j].conv1.k = k
                model.module.layer1[j].conv2.k = k
                model.module.layer1[j].conv1.t = t
                model.module.layer1[j].conv2.t = t
            for j in range(4):
                model.module.layer2[j].conv1.k = k
                model.module.layer2[j].conv2.k = k
                model.module.layer2[j].conv1.t = t
                model.module.layer2[j].conv2.t = t
            for j in range(6):
                model.module.layer3[j].conv1.k = k
                model.module.layer3[j].conv2.k = k
                model.module.layer3[j].conv1.t = t
                model.module.layer3[j].conv2.t = t
            for j in range(3):
                model.module.layer4[j].conv1.k = k
                model.module.layer4[j].conv2.k = k
                model.module.layer4[j].conv1.t = t
                model.module.layer4[j].conv2.t = t

        # measure data loading time
        data_time.update(time.time() - end)

        # transfer input to gpu
        target = target.cuda()
        input = input.cuda() if not args.fp16 else input.cuda().half()

        # forward
        output = model(input)
        loss = criterion(output, target) / world_size

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))

        reduced_loss = loss.clone()
        reduced_prec1 = prec1.clone() / world_size
        reduced_prec5 = prec5.clone() / world_size

        link.allreduce(reduced_loss)
        link.allreduce(reduced_prec1)
        link.allreduce(reduced_prec5)

        losses.update(reduced_loss.item())
        top1.update(reduced_prec1.item())
        top5.update(reduced_prec5.item())

        # backward
        optimizer.zero_grad()

        if isinstance(optimizer, FusedFP16SGD):
            optimizer.backward(loss)
            reduce_gradients(model, args.sync)
            optimizer.step()
        elif isinstance(optimizer, FP16SGD):
            def closure():
                # backward
                optimizer.backward(loss, False)
                # sync gradients
                reduce_gradients(model, args.sync)
                # check overflow, convert to fp32 grads, downscale
                optimizer.update_master_grads()
                return loss
            optimizer.step(closure)
        else:
            loss.backward()
            reduce_gradients(model, args.sync)
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        if curr_step % config.print_freq == 0 and rank == 0:
            tb_logger.add_scalar('loss_train', losses.avg, curr_step)
            tb_logger.add_scalar('acc1_train', top1.avg, curr_step)
            tb_logger.add_scalar('acc5_train', top5.avg, curr_step)
            tb_logger.add_scalar('lr', current_lr, curr_step)
            logger.info('Iter: [{0}/{1}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                        'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                        'LR {lr:.4f}'.format(curr_step, len(train_loader),
                                             batch_time=batch_time,
                                             data_time=data_time,
                                             loss=losses,
                                             top1=top1,
                                             top5=top5,
                                             lr=current_lr))

        if curr_step > 0 and curr_step % config.val_freq == 0:
            val_loss, prec1, prec5 = validate(val_loader, model)

            if tb_logger is not None:
                tb_logger.add_scalar('loss_val', val_loss, curr_step)
                tb_logger.add_scalar('acc1_val', prec1, curr_step)
                tb_logger.add_scalar('acc5_val', prec5, curr_step)

            if rank == 0:
                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)
                # save_checkpoint({
                #     'step': curr_step,
                #     'arch': config.model.arch,
                #     'state_dict': model.state_dict(),
                #     'best_prec1': best_prec1,
                #     'optimizer': optimizer.state_dict(),
                # }, is_best, config.save_path + '/ckpt' + str(TIME % 100000))

        end = time.time()
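# --- hedged sketch, not part of the original training code --------------------
# Log_UP ramps the temperature t log-linearly from T_min to T_max over the full
# iteration budget:
#   t(step) = 10 ** (log10(T_min) + (log10(T_max) - log10(T_min)) * step / total)
# and the scale k is 1/t while t < 1, otherwise 1. The standalone snippet below
# just prints a few points of that schedule; T_min=1e-3, T_max=1e1 and
# total=1000 are made-up values chosen to show the shape, not the project's
# actual settings.
import math


def log_up(k_min, k_max, step, total_steps):
    lo, hi = math.log10(k_min), math.log10(k_max)
    return 10 ** (lo + (hi - lo) * step / total_steps)


if __name__ == '__main__':
    T_min, T_max, total = 1e-3, 1e1, 1000
    for step in (0, 250, 500, 750, 1000):
        t = log_up(T_min, T_max, step, total)
        k = 1 / t if t < 1 else 1.0
        print(f'step={step:4d}  t={t:10.4f}  k={k:8.2f}')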
def reduce_update(self, tensor, num=1):
    link.allreduce(tensor)
    self.update(tensor.item(), num=num)
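# --- hedged note, not part of the original code -------------------------------
# reduce_update() sums a one-element tensor across ranks in place and then feeds
# the summed value into the meter, so it fits quantities that should be
# accumulated globally. Hypothetical call site:
#   correct = torch.Tensor([num_correct_on_this_rank])
#   meter.reduce_update(correct)   # meter now tracks the global total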