def train(train_loader, model, criterion, optimizer, epoch, scheduler, args):
    '''Train model on data in train_loader for a single epoch'''
    print('Starting training epoch {}'.format(epoch))

    # Prepare value counters and timers
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    # Switch model to train mode
    model.train()

    # Train for a single epoch
    end = time.time()
    for i, (input_gray, input_ab, target) in enumerate(train_loader):
        if args.mixup:
            input_gray, target_a, target_b, lam = mixup_data(
                input_gray, target, args.alpha)

        # Use GPU if available
        input_gray_variable = Variable(
            input_gray).cuda() if use_gpu else Variable(input_gray)
        input_ab_variable = Variable(
            input_ab).cuda() if use_gpu else Variable(input_ab)
        target_variable = Variable(
            target).cuda() if use_gpu else Variable(target)

        # Record time to load data (above)
        data_time.update(time.time() - end)

        # Run forward pass
        output_ab = model(input_gray_variable)  # throw away class predictions
        if args.mixup:
            loss = mixup_criterion(criterion, output_ab, target_a, target_b,
                                   lam, args.smooth)
        else:
            loss = criterion(output_ab, input_ab_variable)  # MSE

        # Record loss and measure accuracy
        losses.update(loss.item(), input_gray.size(0))

        # Compute gradient and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Record time to do forward and backward passes
        batch_time.update(time.time() - end)
        end = time.time()

        # Print model accuracy -- in the code below, val refers to value, not validation
        if i % args.print_freq == 0:
            for param_group in optimizer.param_groups:
                current_lr = param_group['lr']
            print('({0}) lr:[{1}] '
                  'Epoch: [{2}][{3}/{4}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.6f} ({loss.avg:.6f})\t'.format(
                      args.optmzr, current_lr, epoch, i, len(train_loader),
                      batch_time=batch_time, data_time=data_time, loss=losses))

    print('Finished training epoch {}'.format(epoch))
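

# NOTE: train() above and qtrain() below rely on an AverageMeter helper that is
# not defined in this section. If the project does not already provide one, a
# minimal version could look like the sketch below; it is an assumption modeled
# on the standard PyTorch ImageNet example and only mirrors how .val, .avg and
# .update(value, n) are used above. The actual implementation may differ.
class AverageMeter(object):
    """Track the most recent value and the running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        # val: latest measurement; n: number of samples it covers
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count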
def qtrain(train_loader, criterion, optimizer, scheduler, epoch, model, args,
           layers, rew_layers, eps):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    idx_loss_dict = {}

    # switch to train mode
    model.train()

    if args.masked_retrain and not args.combine_progressive:
        print("full acc re-train masking")
        masks = {}
        for name, W in model.named_parameters():
            weight = W.cpu().detach().numpy()
            non_zeros = weight != 0
            non_zeros = non_zeros.astype(np.float32)
            zero_mask = torch.from_numpy(non_zeros).cuda()
            W = torch.from_numpy(weight).cuda()
            W.data = W
            masks[name] = zero_mask
    elif args.combine_progressive:
        print("progressive rew-train/re-train masking")
        masks = {}
        for name, W in model.named_parameters():
            weight = W.cpu().detach().numpy()
            non_zeros = weight != 0
            non_zeros = non_zeros.astype(np.float32)
            zero_mask = torch.from_numpy(non_zeros).cuda()
            W = torch.from_numpy(weight).cuda()
            W.data = W
            masks[name] = zero_mask

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # adjust learning rate
        if args.masked_retrain:
            scheduler.step()

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        if args.mixup:
            input, target_a, target_b, lam = mixup_data(
                input, target, args.alpha)

        # compute output
        output = model(input)
        if args.mixup:
            ce_loss = mixup_criterion(criterion, output, target_a, target_b,
                                      lam, args.smooth)
        else:
            ce_loss = criterion(output, target, smooth=args.smooth)

        if args.rew:
            if i == 0:
                print("reweighted l1 training...\n")
            adjust_rew_learning_rate2(optimizer, epoch, rew_milestone, args)

            l1_loss = 0
            # add reweighted l1 loss
            if i == 0 and epoch - 1 in rew_milestone:
                print("reweighted l1 update")
                for j in range(len(layers)):
                    if args.sparsity_type == "irregular":
                        rew_layers[j] = (1 / (layers[j].data + eps))
                    elif args.sparsity_type == "column":
                        rew_layers[j] = (
                            1 / (torch.norm(layers[j].data, dim=0) + eps))
                    elif args.sparsity_type == "kernel":
                        rew_layers[j] = (
                            1 / (torch.norm(layers[j].data, dim=[2, 3]) + eps))
                    elif args.sparsity_type == "filter":
                        rew_layers[j] = (
                            1 / (torch.norm(torch.norm(layers[j].data, dim=1),
                                            dim=[1, 2]) + eps))

            for j in range(len(layers)):
                rew = rew_layers[j]
                conv_layer = layers[j]
                if args.sparsity_type == "irregular":
                    l1_loss = l1_loss + 1e-6 * torch.sum(
                        torch.abs(rew * conv_layer))
                elif args.sparsity_type == "column":
                    l1_loss = l1_loss + penalty_para[j] * torch.sum(
                        rew * torch.norm(conv_layer, dim=0))
                elif args.sparsity_type == "kernel":
                    l1_loss = l1_loss + 1e-5 * torch.sum(
                        rew * torch.norm(conv_layer, dim=[2, 3]))
                elif args.sparsity_type == "filter":
                    l1_loss = l1_loss + 1e-3 * torch.sum(rew * torch.norm(
                        torch.norm(conv_layer, dim=1), dim=[1, 2]))

            ce_loss = l1_loss + ce_loss

        # measure accuracy and record loss
        acc1, _ = accuracy(output, target, topk=(1, 5))
        losses.update(ce_loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        ce_loss.backward()

        if args.combine_progressive:
            with torch.no_grad():
                for name, W in model.named_parameters():
                    if name in masks:
                        W.grad *= masks[name]
                        # W.grad = hard_quant(W.grad, args.param_bits)  # make sure gradient is quantized
        if args.masked_retrain:
            with torch.no_grad():
                for name, W in model.named_parameters():
                    if name in masks:
                        W.grad *= masks[name]
                        # W.grad = hard_quant(W.grad, args.param_bits)  # make sure gradient is quantized

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        # print(i)
        if i % args.log_interval == 0:
            for param_group in optimizer.param_groups:
                current_lr = param_group['lr']
            print('({0}) lr:[{1:.5f}] '
                  'Epoch: [{2}][{3}/{4}]\t'
                  'Status: rew-[{5}] retrain-[{6}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Acc@1 {top1.val:.3f}% ({top1.avg:.3f}%)\t'.format(
                      args.optmzr, current_lr, epoch, i, len(train_loader),
                      args.rew, args.masked_retrain,
                      batch_time=batch_time, loss=losses, top1=top1))
        if i % 100 == 0:
            idx_loss_dict[i] = losses.avg

    return idx_loss_dict
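

# NOTE: mixup_data(), mixup_criterion() and accuracy() are called above but not
# defined in this section. The sketches below are assumptions, not the project's
# own helpers: mixup follows the standard recipe of Zhang et al. ("mixup: Beyond
# Empirical Risk Minimization"), mixup_criterion forwards a `smooth` argument to
# a label-smoothing-aware criterion as the qtrain() call sites suggest, and
# accuracy() is the usual top-k helper. numpy (np) and torch are assumed to be
# imported at the top of the module, as they are used elsewhere in this file.
def mixup_data(x, y, alpha=1.0):
    # Mix each example with a randomly permuted partner from the same batch,
    # using a mixing coefficient drawn from Beta(alpha, alpha).
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    index = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1 - lam) * x[index]
    return mixed_x, y, y[index], lam


def mixup_criterion(criterion, pred, y_a, y_b, lam, smooth):
    # Convex combination of the losses against both sets of targets.
    return (lam * criterion(pred, y_a, smooth=smooth)
            + (1 - lam) * criterion(pred, y_b, smooth=smooth))


def accuracy(output, target, topk=(1,)):
    # Top-k classification accuracy, returned as percentages (one tensor per k).
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res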