def train(train_loader, pfld_backbone, auxiliarynet, criterion, optimizer, cur_epoch):
    losses = AverageMeter()
    for img, landmark_gt, euler_angle_gt in train_loader:
        img.requires_grad = False
        img = img.cuda(non_blocking=True)

        landmark_gt.requires_grad = False
        landmark_gt = landmark_gt.cuda(non_blocking=True)

        euler_angle_gt.requires_grad = False
        euler_angle_gt = euler_angle_gt.cuda(non_blocking=True)

        pfld_backbone = pfld_backbone.cuda()
        auxiliarynet = auxiliarynet.cuda()

        features, landmarks = pfld_backbone(img)
        angle = auxiliarynet(features)
        weighted_loss, loss = criterion(landmark_gt, euler_angle_gt, angle,
                                        landmarks, args.train_batchsize)

        optimizer.zero_grad()
        weighted_loss.backward()
        optimizer.step()

        losses.update(loss.item())
    return weighted_loss, loss
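The loops in this collection all rely on an AverageMeter helper that is not defined here. A minimal sketch of the usual running-average implementation follows; it is an assumption about the helper, not necessarily the exact class used by these repositories.

class AverageMeter(object):
    """Tracks the latest value, running sum, count and average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        # `val` is the metric for the current batch, `n` the number of samples it covers.
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count > 0 else 0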
def eval_32bit(model, test_loader):
    device = model.device
    criterion = model.criterion
    model.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    end = time.time()
    for i, data in enumerate(tqdm(test_loader)):
        image = data[0].type(torch.FloatTensor).to(device)
        label = data[1].type(torch.LongTensor).to(device)

        pred_label = model(image)
        loss = criterion(pred_label, label)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(pred_label.data, label.data, topk=(1, 5))
        losses.update(loss.item(), image.size(0))
        top1.update(prec1.item(), image.size(0))
        top5.update(prec5.item(), image.size(0))

        # timing
        batch_time.update(time.time() - end)
        end = time.time()

    print('Loss: {:.3f} | Acc1: {:.3f}% | Acc5: {:.3f}%'.format(
        losses.avg, top1.avg, top5.avg))
    acc = top1.avg
    loss = losses.avg
    return acc, loss
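The accuracy helper used here and in the classification loops below is not shown; it is assumed to be the standard top-k precision routine, sketched under that assumption:

import torch

def accuracy(output, target, topk=(1,)):
    """Computes precision@k for the specified values of k (assumed helper)."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # top-k class indices per sample, shape (maxk, batch)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res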
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        prec1 = accuracy(output, target)
        losses.update(loss.item(), input.size(0))
        top1.update(prec1[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1))

    savelogloss = save_loss_log(epoch, losses.avg, args.result,
                                filename='losslog.txt')
    save_log_graph(log_file=savelogloss)
def train(train_loader, linear_backbone, criterion, optimizer, cur_epoch):
    losses = AverageMeter()
    for samples in train_loader:
        img = samples['image']
        landmark_gt = samples['landmarks']

        img.requires_grad = False
        img = img.cuda(non_blocking=True)

        landmark_gt.requires_grad = False
        landmark_gt = landmark_gt.cuda(non_blocking=True)

        linear_backbone = linear_backbone.cuda()
        landmarks = linear_backbone(img)
        loss = criterion(landmark_gt, landmarks, args.train_batchsize)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.update(loss.item())
    return loss
def test_epoch(self, epoch, data_loader, model, criterion, opt, logger):
    print('test at epoch {}'.format(epoch))

    model.eval()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accuracies = AverageMeter()

    end_time = time.time()
    for i, (inputs, targets, _) in enumerate(data_loader):
        data_time.update(time.time() - end_time)

        if not opt.no_cuda:
            targets = targets.cuda(non_blocking=True)
        with torch.no_grad():
            inputs = Variable(inputs)
            targets = Variable(targets)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            acc = calculate_accuracy(outputs, targets)

        losses.update(loss.data, inputs.size(0))
        accuracies.update(acc, inputs.size(0))

        batch_time.update(time.time() - end_time)
        end_time = time.time()

        print('Epoch: [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                  epoch, i + 1, len(data_loader), batch_time=batch_time,
                  data_time=data_time, loss=losses, acc=accuracies))

    logger.log({
        'epoch': epoch,
        'loss': losses.avg,
        'acc': accuracies.avg
    })

    return losses.avg
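calculate_accuracy is not defined in this collection; a plausible sketch that returns the fraction of arg-max predictions matching the targets, assuming that is what the accuracy meter expects:

import torch

def calculate_accuracy(outputs, targets):
    """Fraction of samples whose arg-max prediction equals the target (assumed helper)."""
    with torch.no_grad():
        batch_size = targets.size(0)
        _, pred = outputs.topk(1, 1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(targets.view(1, -1))
        n_correct = correct.float().sum().item()
        return n_correct / batch_size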
def validate(val_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        target_index_output, target_index_target = list(), list()
        end = time.time()
        for i, (input, target) in enumerate(val_loader):
            input = input.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1 = accuracy(output, target)
            losses.update(loss.item(), input.size(0))
            top1.update(prec1[0], input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # for auroc get value from target index
            output_cpu = output.cpu().data.numpy()
            output_cpu = np.array(
                [softmax(out)[args.target_index] for out in output_cpu])
            target_index_output.extend(output_cpu.astype(np.float))
            target_index_target.extend(
                np.equal(target.cpu().data.numpy(),
                         args.target_index).astype(np.int))
            # --------------------------------------

            if i % args.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time,
                          loss=losses, top1=top1))

        auc, roc = compute_auroc(target_index_output, target_index_target)

    print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))
    save_auroc(auc, roc, os.path.join(args.result, 'rocgraph' + '.png'))

    return top1.avg
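The softmax applied per sample before the AUROC bookkeeping above is assumed to be a plain NumPy softmax over a single logit vector; a minimal, numerically stable sketch under that assumption:

import numpy as np

def softmax(x):
    """Numerically stable softmax over a 1-D NumPy array of logits."""
    e = np.exp(x - np.max(x))
    return e / e.sum()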
def validate(loader, model, criterion_lidar, criterion_rgb, criterion_local,
             criterion_guide, epoch=0):
    # batch_time = AverageMeter()
    losses = AverageMeter()
    metric = Metrics(max_depth=args.max_depth, disp=args.use_disp,
                     normal=args.normal)
    score = AverageMeter()
    score_1 = AverageMeter()

    # Evaluate model
    model.eval()
    weight_map = None

    # Only forward pass, hence no grads needed
    with torch.no_grad():
        # end = time.time()
        for i, (input, gt) in tqdm(enumerate(loader)):
            if not args.no_cuda:
                input, gt = input.cuda(non_blocking=True), gt.cuda(
                    non_blocking=True)

            prediction, lidar_out, precise, guide = model(input, epoch)

            weight_map = torch.ceil(gt / 50)
            loss = criterion_local(prediction, gt, weight_map, epoch)
            loss_lidar = criterion_lidar(lidar_out, gt, weight_map, epoch)
            loss_rgb = criterion_rgb(precise, gt, weight_map, epoch)
            loss_guide = criterion_guide(guide, gt, weight_map, epoch)
            loss = args.wpred * loss + args.wlid * loss_lidar + \
                args.wrgb * loss_rgb + args.wguide * loss_guide

            if args.wpred == 0.0 and args.wlid == 0.0 and args.wrgb == 1.0:
                prediction = precise
            elif args.wpred == 0.0 and args.wlid == 1.0 and args.wrgb == 0.0:
                prediction = lidar_out

            losses.update(loss.item(), input.size(0))
            metric.calculate(prediction[:, 0:1], gt)
            score.update(metric.get_metric(args.metric), metric.num)
            score_1.update(metric.get_metric(args.metric_1), metric.num)

            if (i + 1) % args.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Metric {score.val:.4f} ({score.avg:.4f})'.format(
                          i + 1, len(loader), loss=losses, score=score))

    if args.evaluate:
        print("===> Average RMSE score on validation set is {:.4f}".format(
            score.avg))
        print("===> Average MAE score on validation set is {:.4f}".format(
            score_1.avg))

    return score.avg, score_1.avg, losses.avg
def validate(loader, model, criterion_rgb, criterion_local, epoch=0):
    # batch_time = AverageMeter()
    losses = AverageMeter()
    metric = Metrics(max_depth=args.max_depth, disp=args.use_disp,
                     normal=args.normal)
    score = AverageMeter()
    score_1 = AverageMeter()
    loss_rgb = torch.zeros(1)

    # Evaluate model
    model.eval()

    # Only forward pass, hence no grads needed
    with torch.no_grad():
        # end = time.time()
        for i, (input, gt) in enumerate(loader):
            if not args.no_cuda:
                input, gt = input.cuda(non_blocking=True), gt.cuda(non_blocking=True)

            prediction = model(input)

            if 'mod' in args.mod or 'stacked' in args.mod:
                loss = criterion_local(prediction[0], gt)
                loss_rgb = criterion_rgb(prediction[1], gt)
                loss += args.wrgb * loss_rgb
                prediction = prediction[0]
            else:
                loss = criterion_local(prediction, gt)

            losses.update(loss.item(), input.size(0))
            metric.calculate(prediction[:, 0:1], gt)
            score.update(metric.get_metric(args.metric), metric.num)
            score_1.update(metric.get_metric(args.metric_1), metric.num)

            if (i + 1) % args.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Metric {score.val:.4f} ({score.avg:.4f})'.format(
                          i + 1, len(loader), loss=losses, score=score))

    # Synchronization needed
    if args.world_size > 1:
        score.synchronize_between_processes()
        score_1.synchronize_between_processes()

    if args.evaluate:
        print("===> Average RMSE score on validation set is {:.4f}".format(score.avg))
        print("===> Average MAE score on validation set is {:.4f}".format(score_1.avg))

    return score.avg, score_1.avg, losses.avg
def main():
    global args
    args = parser.parse_args()
    if args.num_samples == 0:
        args.num_samples = None
    if args.val_batch_size is None:
        args.val_batch_size = args.batch_size
    if args.seed:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        # torch.backends.cudnn.deterministic = True
        # warnings.warn('You have chosen to seed training. '
        #               'This will turn on the CUDNN deterministic setting, '
        #               'which can slow down your training considerably! '
        #               'You may see unexpected behavior when restarting from checkpoints.')

    # For distributed training
    # init_distributed_mode(args)

    if not args.no_cuda and not torch.cuda.is_available():
        raise Exception("No gpu available for usage")
    torch.backends.cudnn.benchmark = args.cudnn

    # Init model
    channels_in = 1 if args.input_type == 'depth' else 4
    model = Models.define_model(mod=args.mod, in_channels=channels_in,
                                thres=args.thres)
    define_init_weights(model, args.weight_init)

    # Load on gpu before passing params to optimizer
    if not args.no_cuda:
        if not args.multi:
            model = model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
            # model.cuda()
            # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            # model = model.module

    save_id = '{}_{}_{}_{}_{}_batch{}_pretrain{}_wlid{}_wrgb{}_wguide{}_wpred{}_patience{}_num_samples{}_multi{}'.\
        format(args.mod, args.optimizer, args.loss_criterion,
               args.learning_rate, args.input_type, args.batch_size,
               args.pretrained, args.wlid, args.wrgb, args.wguide, args.wpred,
               args.lr_decay_iters, args.num_samples, args.multi)

    # INIT optimizer/scheduler/loss criterion
    optimizer = define_optim(args.optimizer, model.parameters(),
                             args.learning_rate, args.weight_decay)
    scheduler = define_scheduler(optimizer, args)

    # Optional to use different losses
    criterion_local = define_loss(args.loss_criterion)
    criterion_lidar = define_loss(args.loss_criterion)
    criterion_rgb = define_loss(args.loss_criterion)
    criterion_guide = define_loss(args.loss_criterion)

    # INIT dataset
    dataset = Datasets.define_dataset(args.dataset, args.data_path,
                                      args.input_type, args.side_selection)
    dataset.prepare_dataset()
    train_loader, valid_loader, valid_selection_loader = get_loader(
        args, dataset)

    # Resume training
    best_epoch = 0
    lowest_loss = np.inf
    args.save_path = os.path.join(args.save_path, save_id)
    mkdir_if_missing(args.save_path)
    log_file_name = 'log_train_start_0.txt'
    args.resume = first_run(args.save_path)
    if args.resume and not args.test_mode and not args.evaluate:
        path = os.path.join(
            args.save_path,
            'checkpoint_model_epoch_{}.pth.tar'.format(int(args.resume)))
        if os.path.isfile(path):
            log_file_name = 'log_train_start_{}.txt'.format(args.resume)
            # stdout
            sys.stdout = Logger(os.path.join(args.save_path, log_file_name))
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(path)
            args.start_epoch = checkpoint['epoch']
            lowest_loss = checkpoint['loss']
            best_epoch = checkpoint['best epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            log_file_name = 'log_train_start_0.txt'
            # stdout
            sys.stdout = Logger(os.path.join(args.save_path, log_file_name))
            print("=> no checkpoint found at '{}'".format(path))

    # Only evaluate
    elif args.evaluate:
        print("Evaluate only")
        best_file_lst = glob.glob(os.path.join(args.save_path, 'model_best*'))
        if len(best_file_lst) != 0:
            best_file_name = best_file_lst[0]
            print(best_file_name)
            if os.path.isfile(best_file_name):
                sys.stdout = Logger(
                    os.path.join(args.save_path, 'Evaluate.txt'))
                print("=> loading checkpoint '{}'".format(best_file_name))
                checkpoint = torch.load(best_file_name)
                model.load_state_dict(checkpoint['state_dict'])
            else:
                print("=> no checkpoint found at '{}'".format(best_file_name))
        else:
            print("=> no checkpoint found due to empty list in folder {}".
                  format(args.save_path))
        validate(valid_selection_loader, model, criterion_lidar,
                 criterion_rgb, criterion_local, criterion_guide)
        return

    # Start training from clean slate
    else:
        # Redirect stdout
        sys.stdout = Logger(os.path.join(args.save_path, log_file_name))

    # INIT MODEL
    print(40 * "=" + "\nArgs:{}\n".format(args) + 40 * "=")
    print("Init model: '{}'".format(args.mod))
    print("Number of parameters in model {} is {:.3f}M".format(
        args.mod.upper(),
        sum(tensor.numel() for tensor in model.parameters()) / 1e6))

    # Load pretrained state for cityscapes in GLOBAL net
    if args.pretrained and not args.resume:
        if not args.load_external_mod:
            if not args.multi:
                target_state = model.depthnet.state_dict()
            else:
                target_state = model.module.depthnet.state_dict()
            check = torch.load('erfnet_pretrained.pth')
            for name, val in check.items():
                # Exclude multi GPU prefix
                mono_name = name[7:]
                if mono_name not in target_state:
                    continue
                try:
                    target_state[mono_name].copy_(val)
                except RuntimeError:
                    continue
            print('Successfully loaded pretrained model')
        else:
            check = torch.load('external_mod.pth.tar')
            lowest_loss_load = check['loss']
            target_state = model.state_dict()
            for name, val in check['state_dict'].items():
                if name not in target_state:
                    continue
                try:
                    target_state[name].copy_(val)
                except RuntimeError:
                    continue
            print("=> loaded EXTERNAL checkpoint with best rmse {}".format(
                lowest_loss_load))

    # Start training
    for epoch in range(args.start_epoch, args.nepochs):
        print("\n => Start EPOCH {}".format(epoch + 1))
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print(args.save_path)

        # Adjust learning rate
        if args.lr_policy is not None and args.lr_policy != 'plateau':
            scheduler.step()
            lr = optimizer.param_groups[0]['lr']
            print('lr is set to {}'.format(lr))

        # Define container objects
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        score_train = AverageMeter()
        score_train_1 = AverageMeter()
        metric_train = Metrics(max_depth=args.max_depth, disp=args.use_disp,
                               normal=args.normal)

        # Train model for args.nepochs
        model.train()

        # compute timing
        end = time.time()

        # Load dataset
        for i, (input, gt) in tqdm(enumerate(train_loader)):
            # Time dataloader
            data_time.update(time.time() - end)

            # Put inputs on gpu if possible
            if not args.no_cuda:
                input, gt = input.cuda(), gt.cuda()

            prediction, lidar_out, precise, guide = model(input, epoch)
            loss = criterion_local(prediction, gt)
            loss_lidar = criterion_lidar(lidar_out, gt)
            loss_rgb = criterion_rgb(precise, gt)
            loss_guide = criterion_guide(guide, gt)
            loss = args.wpred * loss + args.wlid * loss_lidar + \
                args.wrgb * loss_rgb + args.wguide * loss_guide

            losses.update(loss.item(), input.size(0))
            metric_train.calculate(prediction[:, 0:1].detach(), gt.detach())
            score_train.update(metric_train.get_metric(args.metric),
                               metric_train.num)
            score_train_1.update(metric_train.get_metric(args.metric_1),
                                 metric_train.num)

            # Clip gradients (useful for instabilities or mistakes in ground truth)
            if args.clip_grad_norm != 0:
                nn.utils.clip_grad_norm(model.parameters(),
                                        args.clip_grad_norm)

            # Setup backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Time training iteration
            batch_time.update(time.time() - end)
            end = time.time()

            # Print info
            if (i + 1) % args.print_freq == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Metric {score.val:.4f} ({score.avg:.4f})'.format(
                          epoch + 1, i + 1, len(train_loader),
                          batch_time=batch_time, loss=losses,
                          score=score_train))

        print("===> Average RMSE score on training set is {:.4f}".format(
            score_train.avg))
        print("===> Average MAE score on training set is {:.4f}".format(
            score_train_1.avg))

        # Evaluate model on validation set
        print("=> Start validation set")
        score_valid, score_valid_1, losses_valid = validate(
            valid_loader, model, criterion_lidar, criterion_rgb,
            criterion_local, criterion_guide, epoch)
        print("===> Average RMSE score on validation set is {:.4f}".format(
            score_valid))
        print("===> Average MAE score on validation set is {:.4f}".format(
            score_valid_1))

        # Evaluate model on selected validation set
        if args.subset is None:
            print("=> Start selection validation set")
            score_selection, score_selection_1, losses_selection = validate(
                valid_selection_loader, model, criterion_lidar, criterion_rgb,
                criterion_local, criterion_guide, epoch)
            total_score = score_selection
            print("===> Average RMSE score on selection set is {:.4f}".format(
                score_selection))
            print("===> Average MAE score on selection set is {:.4f}".format(
                score_selection_1))
        else:
            total_score = score_valid

        print("===> Last best score was RMSE of {:.4f} in epoch {}".format(
            lowest_loss, best_epoch))

        # Adjust lr if loss plateaued
        if args.lr_policy == 'plateau':
            scheduler.step(total_score)
            lr = optimizer.param_groups[0]['lr']
            print('LR plateaued, hence is set to {}'.format(lr))

        # File to keep latest epoch
        with open(os.path.join(args.save_path, 'first_run.txt'), 'w') as f:
            f.write(str(epoch))

        # Save model
        to_save = False
        if total_score < lowest_loss:
            to_save = True
            best_epoch = epoch + 1
            lowest_loss = total_score
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'best epoch': best_epoch,
                'arch': args.mod,
                'state_dict': model.state_dict(),
                'loss': lowest_loss,
                'optimizer': optimizer.state_dict()
            }, to_save, epoch)

    if not args.no_tb:
        writer.close()
def train(gpu, config):
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=config['num_gpus'], rank=gpu)
    torch.cuda.set_device(gpu)

    """ @ build the dataset for training """
    dataset = get_data(config)
    trainset = dataset(config, "train")
    testset = dataset(config, "test")
    sampler_train = DistributedSampler(trainset,
                                       num_replicas=config['num_gpus'],
                                       rank=gpu)
    sampler_val = DistributedSampler(testset,
                                     num_replicas=config['num_gpus'],
                                     rank=gpu)
    batch_size = config['batch_size']
    loader_train = DataLoader(dataset=trainset,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=config['num_threads'],
                              pin_memory=True,
                              sampler=sampler_train,
                              drop_last=True)
    loader_val = DataLoader(dataset=testset,
                            batch_size=1,
                            shuffle=False,
                            num_workers=1,
                            pin_memory=True,
                            sampler=sampler_val,
                            drop_last=True)

    model = UNet(config["in_channels"], config["out_channels"],
                 post_processing=True)
    model.cuda(gpu)
    mask_sampling = masksamplingv2()

    """ @ init parameter """
    save_folder = os.path.join(
        config['save_root'],
        'batch_{}_lr_{}'.format(config['batch_size'], config['lr']))
    best_epoch = 0
    lowest_loss = 0.
    resume = 0
    print('=> Save folder: {}\n'.format(save_folder))
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    optimizer = define_optim(config['optimizer'], model.parameters(),
                             float(config['lr']), 0)
    criterion_1 = define_loss(config['loss_type'])
    criterion_2 = define_loss("Multimse")
    scheduler = define_scheduler(optimizer, config)

    """ @ justify the resume model """
    if config['resume'] != 'None':
        checkpoint = torch.load(config['resume'],
                                map_location=torch.device('cpu'))
        resume = checkpoint['epoch']
        lowest_loss = checkpoint['loss']
        best_epoch = checkpoint['best_epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        model, optimizer = amp.initialize(model, optimizer, opt_level='O0',
                                          verbosity=0)
        amp.load_state_dict(checkpoint['amp'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            resume, checkpoint['epoch']))
        del checkpoint
    log_file = 'log_train_start_{}.txt'.format(resume)

    """ @ convert model to multi-gpus modes for training """
    model = apex.parallel.convert_syncbn_model(model)
    if config['resume'] == 'None':
        model, optimizer = amp.initialize(model, optimizer, opt_level='O0',
                                          verbosity=0)
    model = DDP(model)
    if gpu == 0:
        sys.stdout = Logger(os.path.join(save_folder, log_file))
        print("Number of parameters in model is {:.3f}M".format(
            sum(tensor.numel() for tensor in model.parameters()) / 1e6))

    """ @ start to train """
    for epoch in range(resume + 1, config['epoches'] + 1):
        print('=> Start Epoch {}\n'.format(epoch))
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print('learning rate is set to {}.\n'.format(
            optimizer.param_groups[0]['lr']))

        model.train()
        sampler_train.set_epoch(epoch)
        batch_time = AverageMeter()
        losses = AverageMeter()
        metric_train = Metrics()
        rmse_train = AverageMeter()
        mae_train = AverageMeter()
        time_snap = time.time()

        for i, inputs in tqdm(enumerate(loader_train)):
            gt, noise = inputs['gt'].cuda(gpu), inputs['noise'].cuda(gpu)
            optimizer.zero_grad()

            """ update the train inputs """
            # patten = np.random.randint(0, 4, 1)
            patten = torch.randint(0, 8, (1, ))
            redinput, blueinput = mask_sampling(noise, patten)
            # redinput, blueinput = generator(noise, mask1, mask2)
            output = model(redinput)
            loss = criterion_1(output, blueinput)
            fulloutput = model(noise)
            redoutput, blueoutput = mask_sampling(fulloutput, patten)
            # redoutput, blueoutput = generator(fulloutput, mask1, mask2)
            loss2 = criterion_2(output, blueinput, redoutput, blueoutput)
            losssum = config["gamma"] * loss2 + loss
            with amp.scale_loss(losssum, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()

            "@ map-reduce tensor"
            rt = reduce_tensor(losssum.data)
            torch.cuda.synchronize()
            losses.update(rt.item(), loader_train.batch_size)
            metric_train.calculate(fulloutput.detach(), gt)
            rmse_train.update(metric_train.get_metric('mse'),
                              metric_train.num)
            mae_train.update(metric_train.get_metric('mae'),
                             metric_train.num)
            batch_time.update(time.time() - time_snap)
            time_snap = time.time()

            if (i + 1) % config['print_freq'] == 0:
                if gpu == 0:
                    print('Epoch: [{0}][{1}/{2}]\t'
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                          'Loss {loss.val:.6f} ({loss.avg:.6f})\t'
                          'Metric {rmse_train.val:.6f} ({rmse_train.avg:.6f})'.
                          format(epoch, i + 1, len(loader_train),
                                 batch_time=batch_time, loss=losses,
                                 rmse_train=rmse_train))

            if (i + 1) % config['save_freq'] == 0:
                print('=> Start sub-selection validation set')
                rmse, mae = val(model, loader_val, epoch, gpu)
                model.train()
                if gpu == 0:
                    print("===> Average RMSE score on selection set is {:.6f}".
                          format(rmse))
                    print("===> Average MAE score on selection set is {:.6f}".
                          format(mae))
                    print("===> Last best score was RMSE of {:.6f} in epoch {}".
                          format(lowest_loss, best_epoch))
                if rmse > lowest_loss:
                    lowest_loss = rmse
                    best_epoch = epoch
                    states = {
                        'epoch': epoch,
                        'best_epoch': best_epoch,
                        'loss': lowest_loss,
                        'state_dict': model.module.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'amp': amp.state_dict()
                    }
                    save_checkpoints(states, save_folder, epoch, gpu, True)

        # save checkpoints
        print('=> Start selection validation set')
        rmse, mae = val(model, loader_val, epoch, gpu)
        model.train()
        if gpu == 0:
            print("===> Average RMSE score on selection set is {:.6f}".format(
                rmse))
            print("===> Average MAE score on selection set is {:.6f}".format(
                mae))
            print("===> Last best score was RMSE of {:.6f} in epoch {}".format(
                lowest_loss, best_epoch))
        if rmse > lowest_loss:
            best_epoch = epoch
            lowest_loss = rmse
            states = {
                'epoch': epoch,
                'best_epoch': best_epoch,
                'loss': lowest_loss,
                'state_dict': model.module.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'amp': amp.state_dict()
            }
            save_checkpoints(states, save_folder, epoch, gpu, True)

        if config['lr_policy'] == 'plateau':
            scheduler.step(rmse)
        else:
            scheduler.step()
        # if (epoch) % 10 == 0:
        #     config["gamma"] += 0.5

    print('=>> the model training finished!')
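reduce_tensor in the distributed loop above is assumed to average a scalar loss tensor across all workers; a common sketch using torch.distributed (the world_size parameter is an assumption for illustration):

import torch.distributed as dist

def reduce_tensor(tensor, world_size=None):
    """All-reduce `tensor` over every process and divide by the world size."""
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    if world_size is None:
        world_size = dist.get_world_size()
    rt /= world_size
    return rt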
def val_epoch(self, epoch, data_loader, model, criterion, opt, logger):
    print('validation at epoch {}'.format(epoch))

    model.eval()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accuracies = AverageMeter()

    end_time = time.time()
    confusion_matrix = np.zeros((opt.n_classes, opt.n_classes))
    confidence_for_each_validation = {}
    ###########################################################################
    # pdb.set_trace()
    for i, (inputs, targets, paths) in enumerate(data_loader):
        data_time.update(time.time() - end_time)

        targets = targets.cuda(non_blocking=True)
        with torch.no_grad():
            inputs = Variable(inputs)
            targets = Variable(targets)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            acc = calculate_accuracy(outputs, targets)

        ######## temp line, needs to be removed ##################################
        for j in range(len(targets)):
            key = paths[j].split('/')[-1]
            confidence_for_each_validation[key] = [
                x.item() for x in outputs[j]
            ]

        rows = [int(x) for x in targets]
        columns = [int(x) for x in np.argmax(outputs.data.cpu(), 1)]
        assert len(rows) == len(columns)
        for idx in range(len(rows)):
            confusion_matrix[rows[idx]][columns[idx]] += 1
        ###########################################################################

        losses.update(loss.data, inputs.size(0))
        accuracies.update(acc, inputs.size(0))

        batch_time.update(time.time() - end_time)
        end_time = time.time()

        print('Epoch: [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                  epoch, i + 1, len(data_loader), batch_time=batch_time,
                  data_time=data_time, loss=losses, acc=accuracies))

    ######### temp line, needs to be removed ##################################
    # print(confusion_matrix)
    confusion_matrix = pd.DataFrame(confusion_matrix)
    # confusion_matrix.to_csv(file)
    confidence_matrix = pd.DataFrame.from_dict(
        confidence_for_each_validation, orient='index')
    # confidence_matrix.to_csv('confidence_matrix.csv')
    ######### temp line, needs to be removed ##################################

    logger.log({'epoch': epoch, 'loss': losses.avg, 'acc': accuracies.avg})

    return losses.avg, confusion_matrix, confidence_matrix
def train_epoch(self, epoch, data_loader, model, criterion, optimizer, opt,
                epoch_logger, batch_logger):
    print('train at epoch {}'.format(epoch))

    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accuracies = AverageMeter()

    end_time = time.time()
    for i, (inputs, targets, _) in enumerate(data_loader):
        data_time.update(time.time() - end_time)

        targets = targets.cuda(non_blocking=True)
        inputs = Variable(inputs)
        targets = Variable(targets)
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        acc = calculate_accuracy(outputs, targets)

        losses.update(loss.data, inputs.size(0))
        accuracies.update(acc, inputs.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - end_time)
        end_time = time.time()

        batch_logger.log({
            'epoch': epoch,
            'batch': i + 1,
            'iter': (epoch - 1) * len(data_loader) + (i + 1),
            'loss': losses.val,
            'acc': accuracies.val,
            'lr': optimizer.param_groups[0]['lr']
        })

        print('Epoch: [{0}][{1}/{2}]\t'
              'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                  epoch, i + 1, len(data_loader), batch_time=batch_time,
                  data_time=data_time, loss=losses, acc=accuracies))

    epoch_logger.log({
        'epoch': epoch,
        'loss': losses.avg,
        'acc': accuracies.avg,
        'lr': optimizer.param_groups[0]['lr']
    })

    if epoch % opt.checkpoint == 0:
        save_file_path = os.path.join(opt.Results_directory,
                                      'save_{}.pth'.format(epoch))
        states = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        torch.save(states, save_file_path)
def validate(loader, model, criterion_rgb, criterion_local, epoch=0):
    # batch_time = AverageMeter()
    losses = AverageMeter()
    metric = Metrics(max_depth=args.max_depth, disp=args.use_disp)
    score = AverageMeter()
    score_1 = AverageMeter()
    loss_rgb = torch.zeros(1)

    # Evaluate model
    model.eval()

    # Only forward pass, hence no grads needed
    with torch.no_grad():
        # end = time.time()
        for i, (input, gt) in tqdm(enumerate(loader)):
            if not args.no_cuda:
                input, gt = input.cuda(non_blocking=True), gt.cuda(
                    non_blocking=True)

            prediction, hidden = model(input, hidden=(None, None))

            if 'mod' in args.mod or 'stacked' in args.mod:
                loss = criterion_local(prediction[0], gt)
                loss_rgb = criterion_rgb(prediction[1], gt)
                loss += args.wrgb * loss_rgb
                prediction = prediction[0]
            else:
                loss = criterion_local(prediction, gt)

            losses.update(loss.item(), input.size(0))
            metric.calculate(prediction[:, 0:1], gt)
            score.update(metric.get_metric(args.metric), metric.num)
            score_1.update(metric.get_metric(args.metric_1), metric.num)

            # save 8 images for visualization
            skip = 50
            if args.modality == 'd':
                img_merge = None
            else:
                if args.modality == 'rgb':
                    rgb = input
                elif args.modality == 'rgbd':
                    rgb = input[:, :3, :, :]
                    depth = input[:, 3, :, :]

                if i == 0:
                    if args.modality == 'rgbd':
                        img_merge = merge_into_row_with_gt(
                            rgb, depth, gt, prediction)
                    else:
                        img_merge = merge_into_row(rgb, gt, prediction)
                elif (i < 8 * skip) and (i % skip == 0):
                    if args.modality == 'rgbd':
                        row = merge_into_row_with_gt(rgb, depth, gt,
                                                     prediction)
                    else:
                        row = merge_into_row(rgb, gt, prediction)
                    img_merge = add_row(img_merge, row)
                elif i == 8 * skip:
                    filename = args.save_path + '/comparison_' + str(
                        epoch) + '.png'
                    save_image(img_merge, filename)

            if (i + 1) % args.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Metric {score.val:.4f} ({score.avg:.4f})'.format(
                          i + 1, len(loader), loss=losses, score=score))

    if args.evaluate:
        print("===> Average RMSE score on validation set is {:.4f}".format(
            score.avg))
        print("===> Average MAE score on validation set is {:.4f}".format(
            score_1.avg))

    return score.avg, score_1.avg, losses.avg
def main():
    global args
    args = parser.parse_args()
    if args.num_samples == 0:
        # Use all lidar points
        args.num_samples = None
    else:
        args.data_path = ""  # path to precomputed 500 samples
        assert args.num_samples == 500
        print("changed path to samples500 dataset")
    if args.val_batch_size is None:
        args.val_batch_size = args.batch_size
    if args.seed:
        random.seed(args.seed)
        torch.manual_seed(args.seed)

    # init_distributed_mode(args)

    if not args.no_cuda and not torch.cuda.is_available():
        raise Exception("No gpu available for usage")
    torch.backends.cudnn.benchmark = args.cudnn

    # Init model
    args.channels_in = 3 if args.input_type == 'rgb' else 4
    model = Models.define_model(args.mod, args)
    # define_init_weights(model, args.weight_init)

    # Load on gpu before passing params to optimizer
    if not args.no_cuda:
        if not args.multi:
            model = model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
            # model.cuda()
            # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            # model = model.module

    save_id = '{}_{}_{}_{}_{}_batch{}_pretrain{}_wrgb{}_drop{}_patience{}_num_samples{}_multi{}_submod{}'.\
        format(args.mod, args.optimizer, args.loss_criterion,
               args.learning_rate, args.input_type, args.batch_size,
               args.pretrained, args.wrgb, args.drop, args.lr_decay_iters,
               args.num_samples, args.multi, args.submod)

    # INIT optimizer/scheduler/loss criterion
    optimizer = define_optim(args.optimizer, model.parameters(),
                             args.learning_rate, args.weight_decay)
    scheduler = define_scheduler(optimizer, args)

    # Optional to use different losses
    criterion_local = define_loss(args.loss_criterion)
    criterion_rgb = define_loss(args.loss_criterion)

    # INIT dataset
    dataset = Datasets.define_dataset(args.dataset, args.data_path,
                                      args.input_type)
    dataset.prepare_dataset()
    train_loader, _, valid_loader, valid_selection_loader = get_loader(
        args, dataset)

    # Resume training
    best_epoch = 0
    lowest_loss = np.inf
    args.save_path = os.path.join(args.save_path, save_id)
    mkdir_if_missing(args.save_path)
    log_file_name = 'log_train_start_0.txt'
    args.resume = first_run(args.save_path)
    if args.resume and not args.test_mode and not args.evaluate:
        path = os.path.join(
            args.save_path,
            'checkpoint_model_epoch_{}.pth.tar'.format(int(args.resume)))
        if os.path.isfile(path):
            log_file_name = 'log_train_start_{}.txt'.format(args.resume)
            # stdout
            sys.stdout = Logger(os.path.join(args.save_path, log_file_name))
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(path)
            args.start_epoch = checkpoint['epoch']
            lowest_loss = checkpoint['loss']
            best_epoch = checkpoint['best epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            log_file_name = 'log_train_start_0.txt'
            # stdout
            sys.stdout = Logger(os.path.join(args.save_path, log_file_name))
            print("=> no checkpoint found at '{}'".format(path))

    # Only evaluate
    elif args.evaluate:
        print("Evaluate only")
        best_file_lst = glob.glob(os.path.join(args.save_path, 'model_best*'))
        if len(best_file_lst) != 0:
            best_file_name = best_file_lst[0]
            print(best_file_name)
            if os.path.isfile(best_file_name):
                sys.stdout = Logger(
                    os.path.join(args.save_path, 'Evaluate.txt'))
                print("=> loading checkpoint '{}'".format(best_file_name))
                checkpoint = torch.load(best_file_name)
                model.load_state_dict(checkpoint['state_dict'])
            else:
                print("=> no checkpoint found at '{}'".format(best_file_name))
        else:
            print("=> no checkpoint found due to empty list in folder {}".
                  format(args.save_path))
        validate(valid_selection_loader, model, criterion_rgb,
                 criterion_local)
        return

    # Start training from clean slate
    else:
        # Redirect stdout
        sys.stdout = Logger(os.path.join(args.save_path, log_file_name))

    # INIT MODEL
    print(40 * "=" + "\nArgs:{}\n".format(args) + 40 * "=")
    print("Init model: '{}'".format(args.mod))
    print("Number of parameters in model {} is {:.3f}M".format(
        args.mod.upper(),
        sum(tensor.numel() for tensor in model.parameters()) / 1e6))

    # Load pretrained state for cityscapes in GLOBAL net
    # if args.pretrained and not args.resume:
    #     if not args.multi:
    #         target_state = model.depthnet.state_dict()
    #     else:
    #         target_state = model.module.depthnet.state_dict()
    #     check = torch.load('erfnet_pretrained.pth')
    #     for name, val in check.items():
    #         # Exclude multi GPU prefix
    #         mono_name = name[7:]
    #         if mono_name not in target_state:
    #             continue
    #         try:
    #             target_state[mono_name].copy_(val)
    #         except RuntimeError:
    #             continue
    #     print('Successfully loaded pretrained model')

    # Create summary writer
    log_path = os.path.join(args.save_path, "logs")
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    logger = SummaryWriter(log_path)

    with open(os.path.join(args.save_path, 'commandline_args.txt'), 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    # Start training
    for epoch in range(args.start_epoch, args.nepochs):
        print("\n => Start EPOCH {}".format(epoch + 1))
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print(args.save_path)

        # Adjust learning rate
        if args.lr_policy is not None and args.lr_policy != 'plateau':
            scheduler.step()
            lr = optimizer.param_groups[0]['lr']
            print('lr is set to {}'.format(lr))

        # Define container objects
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        score_train = AverageMeter()
        score_train_1 = AverageMeter()
        metric_train = Metrics(max_depth=args.max_depth, disp=args.use_disp)

        # Train model for args.nepochs
        model.train()

        # compute timing
        end = time.time()

        # Load dataset
        for i, (input, gt) in tqdm(enumerate(train_loader)):
            # Time dataloader
            data_time.update(time.time() - end)

            # Put inputs on gpu if possible
            if not args.no_cuda:
                input, gt = input.cuda(), gt.cuda()

            prediction, hidden = model(input, hidden=(None, None))

            if 'mod' in args.mod or 'stacked' in args.mod:
                loss = criterion_local(prediction[0], gt)
                loss_rgb = criterion_rgb(prediction[1], gt)
                loss += args.wrgb * loss_rgb
                prediction = prediction[0]
            else:
                loss = criterion_local(prediction, gt)

            losses.update(loss.item(), input.size(0))
            metric_train.calculate(prediction[:, 0:1].detach(), gt.detach())
            score_train.update(metric_train.get_metric(args.metric),
                               metric_train.num)
            score_train_1.update(metric_train.get_metric(args.metric_1),
                                 metric_train.num)

            # Clip gradients (useful for instabilities or mistakes in ground truth)
            if args.clip_grad_norm != 0:
                nn.utils.clip_grad_norm(model.parameters(),
                                        args.clip_grad_norm)

            # Setup backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Time training iteration
            batch_time.update(time.time() - end)
            end = time.time()

            # Print info
            if (i + 1) % args.print_freq == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Metric {score.val:.4f} ({score.avg:.4f})'.format(
                          epoch + 1, i + 1, len(train_loader),
                          batch_time=batch_time, loss=losses,
                          score=score_train))

            batch_num = len(train_loader)
            if ((i + 1) % 10 == 0) and (logger is not None):
                current_step = epoch * batch_num + i
                # Add scalar summaries
                logger.add_scalar('Train_loss/Loss', loss.item(),
                                  current_step)
                # utils.record_scalar_summary(result, average_meter, current_step, logger, "Train")
                # # Add system info
                # logger.add_scalar('System/gpu_time', average_meter.average().gpu_time, current_step)
                # logger.add_scalar('System/data_time', average_meter.average().data_time, current_step)

            if (i + 1) % 200 == 0:
                # Add some image summary
                if args.modality == "rgb":
                    input_images = input.cpu()
                else:
                    input_images = input[:, :3, :, :].cpu()
                    input_depth = torch.unsqueeze(input[:, 3, :, :],
                                                  dim=1).cpu()
                rgb_grid = make_grid(input_images[0:6, :, :, :], nrow=3,
                                     normalize=True)
                target_grid = make_grid(gt.cpu()[0:6, :, :, :], nrow=3,
                                        normalize=True)
                pred_grid = make_grid(prediction.cpu()[0:6, :, :, :], nrow=3,
                                      normalize=True)
                logger.add_image('Train/RGB', rgb_grid.data.numpy(),
                                 current_step)
                logger.add_image('Train/Depth_gt', target_grid.data.numpy(),
                                 current_step)
                logger.add_image('Train/Depth_pred', pred_grid.data.numpy(),
                                 current_step)
                if args.modality == "rgbd":
                    depth_grid = make_grid(input_depth[0:6, :, :, :], nrow=3,
                                           normalize=True)
                    logger.add_image('Train/Depth_input',
                                     depth_grid.data.numpy(), current_step)

        print("===> Average RMSE score on training set is {:.4f}".format(
            score_train.avg))
        print("===> Average MAE score on training set is {:.4f}".format(
            score_train_1.avg))

        # Evaluate model on validation set
        print("=> Start validation set")
        score_valid, score_valid_1, losses_valid = validate(
            valid_loader, model, criterion_rgb, criterion_local, epoch)
        print("===> Average RMSE score on validation set is {:.4f}".format(
            score_valid))
        print("===> Average MAE score on validation set is {:.4f}".format(
            score_valid_1))

        # Evaluate model on selected validation set
        if args.subset is None:
            print("=> Start selection validation set")
            score_selection, score_selection_1, losses_selection = validate(
                valid_selection_loader, model, criterion_rgb, criterion_local,
                epoch)
            total_score = score_selection
            print("===> Average RMSE score on selection set is {:.4f}".format(
                score_selection))
            print("===> Average MAE score on selection set is {:.4f}".format(
                score_selection_1))
        else:
            total_score = score_valid

        print("===> Last best score was RMSE of {:.4f} in epoch {}".format(
            lowest_loss, best_epoch))

        # Adjust lr if loss plateaued
        if args.lr_policy == 'plateau':
            scheduler.step(total_score)
            lr = optimizer.param_groups[0]['lr']
            print('LR plateaued, hence is set to {}'.format(lr))

        # File to keep latest epoch
        with open(os.path.join(args.save_path, 'first_run.txt'), 'w') as f:
            f.write(str(epoch))

        # Save model
        to_save = False
        if total_score < lowest_loss:
            to_save = True
            best_epoch = epoch + 1
            lowest_loss = total_score
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'best epoch': best_epoch,
                'arch': args.mod,
                'state_dict': model.state_dict(),
                'loss': lowest_loss,
                'optimizer': optimizer.state_dict()
            }, to_save, epoch)

    if not args.no_tb:
        writer.close()
def main():
    global args
    args = parser.parse_args()
    if args.num_samples == 0:
        args.num_samples = None
    if args.cuda and not torch.cuda.is_available():
        raise Exception("No gpu available for usage")

    # Init model
    channels_in = 1 if args.input_type == 'depth' else 4
    model = Models.define_model(mod=args.mod, in_channels=channels_in)
    if args.mod == 'mod':
        define_init_weights(model, args.weight_init)

    # Load on gpu before passing params to optimizer
    if args.cuda:
        model = model.cuda()

    save_id = '{}_{}_{}_{}_batch{}_pretrain{}_wlid{}_wrgb{}_wguide{}_wpred{}_num_samples{}'.\
        format(args.mod, args.loss_criterion_source, args.learning_rate,
               args.input_type, args.batch_size, args.load_path != '',
               args.wlid, args.wrgb, args.wguide, args.wpred,
               args.num_samples)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    # Optional to use different losses
    criterion_source = define_loss(args.loss_criterion_source)
    criterion_target = define_loss(args.loss_criterion_target)

    # INIT KITTI dataset
    print('Load KITTI')
    dataset = Datasets.define_dataset('kitti', args.data_path_target,
                                      args.input_type)
    dataset.prepare_dataset()
    train_loader = get_loader(args, dataset, only_train=True)

    # INIT Carla dataset
    print('Load Carla')
    dataset = Datasets.define_dataset('carla', args.data_path_source,
                                      args.input_type)
    dataset.prepare_dataset()
    # The sparsification of the data and projection from the LiDAR reference
    # frame to the RGB camera explained in the paper happens in the dataloader
    train_loader_carla = get_loader(args, dataset, is_carla=True,
                                    only_train=True)
    train_loader_iter = iter(train_loader)

    # Resume training
    if args.save_name == '':
        args.save_path = os.path.join(args.save_path, save_id)
    else:
        args.save_path = os.path.join(args.save_path, args.save_name)
    if os.path.exists(args.save_path):
        raise Exception('Save path already exists')
    mkdir_if_missing(args.save_path)

    # INIT MODEL
    print(40 * "=" + "\nArgs:{}\n".format(args) + 40 * "=")
    print("Init model: '{}'".format(args.mod))
    print("Number of parameters in model {} is {:.3f}M".format(
        args.mod.upper(),
        sum(tensor.numel() for tensor in model.parameters()) / 1e6))

    # Load pretrained state
    if args.load_path != '':
        print("=> loading checkpoint {:s}".format(args.load_path))
        check = torch.load(
            args.load_path,
            map_location=lambda storage, loc: storage)['state_dict']
        model.load_state_dict(check)

    if args.use_image_translation:
        image_trans_net = ResnetGeneratorCycle(3, 3, 64, n_blocks=9)
        state_dict = torch.load('./image_translation_weights.pth')
        image_trans_net.load_state_dict(state_dict)
        image_trans_net.eval()
        if args.cuda:
            image_trans_net = image_trans_net.cuda()

    # Start training
    global_step = 0
    for epoch in range(args.start_epoch, args.nepochs):
        print("\n => Start EPOCH {}".format(epoch + 1))

        # Define container objects
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        score_train_rmse = AverageMeter()
        score_train_mae = AverageMeter()
        metric_train = Metrics(max_depth=args.max_depth)

        # Train model for args.nepochs
        model.train()

        # compute timing
        end = time.time()

        for i, (input, gt, filepath) in tqdm(enumerate(train_loader_carla)):
            # Time dataloader
            data_time.update(time.time() - end)
            loss_extra = 0

            # Put inputs on gpu if possible
            if args.cuda:
                input, gt = input.cuda(), gt.cuda()

            # The LiDAR depths have large regions where no input depth is given.
            # We remove all of the GT in the synthetic data where no input
            # information is given in a NxN window around the GT point (we set
            # N=41) to avoid the model trying to estimate depth for areas
            # without any input guidance
            input_depth = input[:, 0:1]
            input_depth, gt = filter_data(input_depth, gt,
                                          max_depth=args.max_depth)
            input[:, 0:1] = input_depth

            # Load target set (KITTI) data
            if args.train_target:
                try:
                    input_target, gt_target, filepath_t = next(
                        train_loader_iter)
                except StopIteration:
                    train_loader_iter = iter(train_loader)
                    input_target, gt_target, filepath_t = next(
                        train_loader_iter)
                if args.cuda:
                    input_target, gt_target = input_target.cuda(), \
                        gt_target.cuda()

            if args.use_image_translation:
                # The CycleGAN model was trained with inputs in the range of [-1, 1]
                with torch.no_grad():
                    rgb_trans = image_trans_net(input[:, 1:] / 128.5 - 1)
                    rgb_trans = 128.5 * (rgb_trans + 1)
                    rgb_trans = rgb_trans.clamp(0, 255)
                    input = torch.cat([input[:, :1], rgb_trans], 1)

            if args.train_target:
                input_joint = torch.cat([input, input_target])
                prediction, lidar_out, precise, guide = model(
                    input_joint, epoch)
                # We separate predictions from the target domain and source domain
                prediction_target, lidar_out_target, precise_target, guide_target = \
                    prediction[args.batch_size:], lidar_out[args.batch_size:], \
                    precise[args.batch_size:], guide[args.batch_size:]
                prediction, lidar_out, precise, guide = \
                    prediction[:args.batch_size], lidar_out[:args.batch_size], \
                    precise[:args.batch_size], guide[:args.batch_size]
            else:
                prediction, lidar_out, precise, guide = model(input, epoch)

            # We compute the loss for the source domain data
            loss = criterion_source(prediction, gt)
            loss_lidar = criterion_source(lidar_out, gt)
            loss_rgb = criterion_source(precise, gt)
            loss_guide = criterion_source(guide, gt)
            loss = args.wpred * loss + args.wlid * loss_lidar + \
                args.wrgb * loss_rgb + args.wguide * loss_guide

            if args.train_target:
                loss_target = 0
                # We filter the input data for supervision as explained in the paper
                filtered_sparse_data = filter_sparse_guidance(
                    input_target[:, :1], args.filter_window, args.filter_th)
                # We compute the loss for the target domain data
                loss_target += args.wpred * criterion_target(
                    prediction_target, filtered_sparse_data)
                loss_target += args.wlid * criterion_target(
                    lidar_out_target, filtered_sparse_data)
                loss_target += args.wrgb * criterion_target(
                    precise_target, filtered_sparse_data)
                loss_target += args.wguide * criterion_target(
                    guide_target, filtered_sparse_data)
                loss = loss + loss_target

            metric_train.calculate(prediction[:, 0:1].detach(), gt.detach())
            score_train_rmse.update(metric_train.get_metric('rmse'),
                                    metric_train.num)
            score_train_mae.update(metric_train.get_metric('mae'),
                                   metric_train.num)
            losses.update(loss.item(), input.size(0))

            # Optimization step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()
            global_step += 1

            # Print info
            if (i + 1) % args.print_freq == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'RMSE Train {score.val:.4f} ({score.avg:.4f})'.format(
                          epoch + 1, i + 1, len(train_loader_carla),
                          batch_time=batch_time, loss=losses,
                          score=score_train_rmse))

            if global_step == args.n_training_iterations:
                dict_save = {
                    'epoch': epoch + 1,
                    'arch': args.mod,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                }
                save_checkpoint(dict_save, False, epoch + 1, global_step)
                return 1

        print("===> Average RMSE score on training set is {:.4f}".format(
            score_train_rmse.avg))
        print("===> Average MAE score on training set is {:.4f}".format(
            score_train_mae.avg))

        dict_save = {
            'epoch': epoch + 1,
            'arch': args.mod,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_checkpoint(dict_save, False, epoch + 1)