def save_json(args, model, reglog, optimizer, loader):
    """Evaluate ``model`` + ``reglog`` over ``loader`` and dump predictions to JSON.

    For every batch the top-1 predicted class index is collected; predictions
    are then keyed by the image names read from ``./val_targets.json``
    (assumes loader order matches the JSON key order -- TODO confirm) and
    written to ``args.json_save_path / args.json_save_name``.

    Returns:
        float: running top-1 accuracy average over the evaluated batches.
    """
    pred_label = []
    log_top1 = AverageMeter()
    for iter_epoch, (inp, target) in enumerate(loader):
        learning_rate_decay(optimizer, len(loader) * args.epoch + iter_epoch, args.lr)

        # start at iter start_iter
        if iter_epoch < args.start_iter:
            continue

        # move to gpu
        inp = inp.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        if 'VOC2007' in args.data_path:
            # multi-label targets must be float for BCE-style evaluation
            target = target.float()

        # forward (frozen backbone, then linear head)
        with torch.no_grad():
            output = model(inp)
        output = reglog(output)

        _, pred = output.topk(1, 1, True, True)
        pred = pred.t()
        # FIX: legacy `.data` access replaced; manual per-element append
        # loop replaced with a single extend over the flattened predictions.
        pred_label.extend(pred.detach().cpu().numpy().reshape(-1).tolist())

        prec1 = accuracy(args, output, target)
        log_top1.update(prec1.item(), output.size(0))

    def load_json(file_path):
        """Return the list of image names (top-level keys) in a JSON file."""
        # FIX: `assert` is stripped under `python -O`; raise explicitly instead.
        if not os.path.exists(file_path):
            raise FileNotFoundError("{} does not exist".format(file_path))
        with open(file_path, 'r') as fp:
            data = json.load(fp)
        return list(data.keys())

    # FIX: removed dead `img_names = []` initialisation that was
    # immediately overwritten by the load_json() result.
    img_names = load_json('./val_targets.json')
    json_predictions = {
        img_names[idx]: int(pred_label[idx]) for idx in range(len(pred_label))
    }

    output_file = os.path.join(args.json_save_path, args.json_save_name)
    with open(output_file, 'w') as fp:
        json.dump(json_predictions, fp)

    return log_top1.avg
def train_network(args, model, reglog, optimizer, loader):
    """Train the linear head ``reglog`` on top of a frozen ``model`` for one epoch.

    The backbone is run under ``torch.no_grad`` so only ``reglog`` receives
    gradients. On the Pascal VOC path a per-element BCE loss is used so that
    entries labelled 255 (ignore label -- presumably; verify against the
    dataset) can be masked out before reduction.

    Returns:
        tuple: (finished epoch index, total iterations seen,
                top-1 accuracy average, loss average).
    """
    # running statistics
    batch_time = AverageMeter()
    data_time = AverageMeter()
    # training statistics
    log_top1 = AverageMeter()
    log_loss = AverageMeter()
    end = time.perf_counter()

    if 'pascal' in args.data_path:
        # reduction='none' keeps per-element losses so ignore labels can be masked
        criterion = nn.BCEWithLogitsLoss(reduction='none')
    else:
        criterion = nn.CrossEntropyLoss().cuda()

    for iter_epoch, (inp, target) in enumerate(loader):
        # measure data loading time
        data_time.update(time.perf_counter() - end)

        learning_rate_decay(optimizer, len(loader) * args.epoch + iter_epoch, args.lr)

        # start at iter start_iter (resume mid-epoch after requeue)
        if iter_epoch < args.start_iter:
            continue

        # move to gpu
        inp = inp.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        if 'pascal' in args.data_path:
            target = target.float()

        # forward: backbone frozen, only the linear head produces gradients
        with torch.no_grad():
            output = model(inp)
        output = reglog(output)

        # compute cross entropy loss
        loss = criterion(output, target)
        if 'pascal' in args.data_path:
            # zero-out losses at ignore-label positions, average over batch
            mask = (target == 255)
            loss = torch.sum(loss.masked_fill_(mask, 0)) / target.size(0)

        optimizer.zero_grad()
        # compute the gradients
        loss.backward()
        # step
        optimizer.step()

        # signal received, relaunch experiment
        # FIX: .get() avoids KeyError when SIGNAL_RECEIVED is not set
        if os.environ.get('SIGNAL_RECEIVED') == 'True':
            if not args.rank:
                torch.save({
                    'epoch': args.epoch,
                    'start_iter': iter_epoch + 1,
                    'state_dict': reglog.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, os.path.join(args.dump_path, 'checkpoint.pth.tar'))
            trigger_job_requeue(os.path.join(args.dump_path, 'checkpoint.pth.tar'))

        # update stats
        log_loss.update(loss.item(), output.size(0))
        if 'pascal' not in args.data_path:  # FIX: idiomatic `not in`
            prec1 = accuracy(args, output, target)
            log_top1.update(prec1.item(), output.size(0))

        batch_time.update(time.perf_counter() - end)
        end = time.perf_counter()

        # verbose
        if iter_epoch % 100 == 0:
            logger.info('Epoch[{0}] - Iter: [{1}/{2}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Prec {log_top1.val:.3f} ({log_top1.avg:.3f})\t'
                        .format(args.epoch, iter_epoch, len(loader),
                                batch_time=batch_time, data_time=data_time,
                                loss=log_loss, log_top1=log_top1))

    # end of epoch
    args.start_iter = 0
    args.epoch += 1

    # dump checkpoint
    if not args.rank:
        torch.save({
            'epoch': args.epoch,
            'start_iter': 0,
            'state_dict': reglog.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, os.path.join(args.dump_path, 'checkpoint.pth.tar'))

    return (args.epoch - 1, args.epoch * len(loader), log_top1.avg, log_loss.avg)
def train_network(args, model, optimizer, dataset):
    """Train ``model`` end-to-end on ``dataset`` for one epoch (distributed).

    Builds a ``DistributedSampler`` + ``DataLoader`` locally, runs one full
    pass with cross-entropy, supports mid-epoch checkpointing/requeue via the
    SIGNAL_RECEIVED environment variable, and dumps an end-of-epoch checkpoint
    (copied to a numbered file every ``args.checkpoint_freq`` epochs).

    Returns:
        tuple: (finished epoch index, total iterations seen,
                top-1 accuracy average, loss average).
    """
    # switch to train mode
    model.train()

    sampler = torch.utils.data.distributed.DistributedSampler(dataset)
    # FIX: without set_epoch the sampler reuses the identical shuffle
    # order every epoch (per PyTorch DistributedSampler docs).
    sampler.set_epoch(args.epoch)
    loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=args.batch_size,
        num_workers=args.workers,
        pin_memory=True,
    )

    # running statistics
    batch_time = AverageMeter()
    data_time = AverageMeter()
    # training statistics
    log_top1 = AverageMeter()
    log_loss = AverageMeter()
    end = time.perf_counter()

    cel = nn.CrossEntropyLoss().cuda()

    for iter_epoch, (inp, target) in enumerate(loader):
        # measure data loading time
        data_time.update(time.perf_counter() - end)

        # start at iter start_iter (resume mid-epoch after requeue)
        if iter_epoch < args.start_iter:
            continue

        # move to gpu
        inp = inp.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # forward
        output = model(inp)

        # compute cross entropy loss
        loss = cel(output, target)

        optimizer.zero_grad()
        # compute the gradients
        loss.backward()
        # step
        optimizer.step()

        # signal received, relaunch experiment
        # FIX: .get() avoids KeyError when SIGNAL_RECEIVED is not set
        if os.environ.get('SIGNAL_RECEIVED') == 'True':
            if not args.rank:
                torch.save(
                    {
                        'epoch': args.epoch,
                        'start_iter': iter_epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }, os.path.join(args.dump_path, 'checkpoint.pth.tar'))
            trigger_job_requeue(
                os.path.join(args.dump_path, 'checkpoint.pth.tar'))

        # update stats
        log_loss.update(loss.item(), output.size(0))
        prec1 = accuracy(args, output, target)
        log_top1.update(prec1.item(), output.size(0))

        batch_time.update(time.perf_counter() - end)
        end = time.perf_counter()

        # verbose
        if iter_epoch % 100 == 0:
            logger.info(
                'Epoch[{0}] - Iter: [{1}/{2}]\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                'Prec {log_top1.val:.3f} ({log_top1.avg:.3f})\t'.format(
                    args.epoch, iter_epoch, len(loader),
                    batch_time=batch_time, data_time=data_time,
                    loss=log_loss, log_top1=log_top1))

    # end of epoch
    args.start_iter = 0
    args.epoch += 1

    # dump checkpoint (rank 0 only); archive a numbered copy periodically
    if not args.rank:
        torch.save(
            {
                'epoch': args.epoch,
                'start_iter': 0,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, os.path.join(args.dump_path, 'checkpoint.pth.tar'))
        if not (args.epoch - 1) % args.checkpoint_freq:
            shutil.copyfile(
                os.path.join(args.dump_path, 'checkpoint.pth.tar'),
                os.path.join(args.dump_checkpoints,
                             'checkpoint' + str(args.epoch - 1) + '.pth.tar'),
            )

    return (args.epoch - 1, args.epoch * len(loader), log_top1.avg, log_loss.avg)