def train(config, num_workers, num_threads, cuda, restart_train, mGPU):
    """Train a ``KPN`` model on ``TrainDataSet`` and checkpoint it periodically.

    Args:
        config: Parsed configuration; its ``'training'`` and ``'architecture'``
            sections are read.
        num_workers: Number of worker processes handed to the ``DataLoader``.
        num_threads: Unused; kept so existing callers keep working.
        cuda: If truthy, the model and the ``burst_noise``/``gt`` batches are
            moved to the GPU.
        restart_train: If truthy, no checkpoint is loaded and training starts
            at epoch 0.
        mGPU: If truthy, the model is wrapped in ``nn.DataParallel``.

    Raises:
        ValueError: If the configured optimizer is neither ``'adam'`` nor
            ``'sgd'``.
    """
    train_config = config['training']
    arch_config = config['architecture']

    batch_size = train_config['batch_size']
    lr = train_config['learning_rate']
    weight_decay = train_config['weight_decay']
    lr_decay = train_config['lr_decay']
    n_epoch = train_config['num_epochs']
    save_freq = train_config['save_freq']
    min_lr = 5e-6  # the learning rate is never decayed below this floor

    print('Configs:', config)

    # checkpoint path
    checkpoint_dir = train_config['checkpoint_dir']
    os.makedirs(checkpoint_dir, exist_ok=True)

    # logs path: any previous log directory is wiped, so the writer is always
    # handed a path that does not exist yet (same as before).
    logs_dir = train_config['logs_dir']
    if os.path.exists(logs_dir):
        shutil.rmtree(logs_dir)
    log_writer = SummaryWriter(logs_dir)

    # dataset and dataloader
    data_set = TrainDataSet(
        train_config['dataset_configs'],
        img_format='.bmp',
        degamma=True,
        color=False,
        blind=arch_config['blind_est'],
    )
    data_loader = DataLoader(
        data_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    )
    dataset_config = read_config(
        train_config['dataset_configs'], _configspec_path()
    )['dataset_configs']
    burst_length = dataset_config['burst_length']

    # model
    model = KPN(
        color=False,
        burst_length=burst_length,
        blind_est=arch_config['blind_est'],
        kernel_size=list(map(int, arch_config['kernel_size'].split())),
        sep_conv=arch_config['sep_conv'],
        channel_att=arch_config['channel_att'],
        spatial_att=arch_config['spatial_att'],
        upMode=arch_config['upMode'],
        core_bias=arch_config['core_bias'],
    )
    if cuda:
        model = model.cuda()
    if mGPU:
        model = nn.DataParallel(model)
    model.train()

    # loss function
    loss_func = LossFunc(
        coeff_basic=1.0,
        coeff_anneal=1.0,
        gradient_L1=True,
        alpha=arch_config['alpha'],
        beta=arch_config['beta'],
    )

    # optimizer
    optimizer_name = train_config['optimizer']
    if optimizer_name == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(
            model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay
        )
    else:
        raise ValueError(
            "Optimizer must be 'sgd' or 'adam', but received {}.".format(
                optimizer_name))
    optimizer.zero_grad()

    # learning rate scheduler
    # NOTE(review): the config carries 'decay_steps', but the step size has
    # always been hard-coded to 10 epochs here -- confirm which is intended.
    scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=lr_decay)

    average_loss = MovingAverage(save_freq)

    # Resume from the best checkpoint unless a restart was requested.
    start_epoch = 0
    global_step = 0
    best_loss = np.inf
    if not restart_train:
        try:
            checkpoint = load_checkpoint(checkpoint_dir, 'best')
            resumed = (
                checkpoint['epoch'],
                checkpoint['global_iter'],
                checkpoint['best_loss'],
            )
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['lr_scheduler'])
        except Exception as err:
            # Best effort: fall back to a fresh run, but say why, and let
            # KeyboardInterrupt / SystemExit propagate.
            print('=> no checkpoint file to be loaded.')
            print('   reason: {!r}'.format(err))
        else:
            start_epoch, global_step, best_loss = resumed
            print('=> loaded checkpoint (epoch {}, global_step {})'.format(
                start_epoch, global_step))

    print('=> training')

    for epoch in range(start_epoch, n_epoch):
        epoch_start_time = time.time()
        print(
            '=' * 20,
            'lr={}'.format([group['lr'] for group in optimizer.param_groups]),
            '=' * 20)

        t1 = time.time()
        for step, (burst_noise, gt, white_level) in enumerate(data_loader):
            if cuda:
                burst_noise = burst_noise.cuda()
                gt = gt.cuda()

            pred_i, pred = model(
                burst_noise, burst_noise[:, 0:burst_length, ...], white_level)

            loss_basic, loss_anneal = loss_func(
                sRGBGamma(pred_i), sRGBGamma(pred), sRGBGamma(gt), global_step)
            loss = loss_basic + loss_anneal

            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Track the value only; the moving average does not need the
            # autograd history attached to the loss.
            average_loss.update(loss.detach())

            # calculate PSNR / SSIM
            psnr = calculate_psnr(pred.unsqueeze(1), gt.unsqueeze(1))
            ssim = calculate_ssim(pred.unsqueeze(1), gt.unsqueeze(1))

            # add scalars to tensorboardX
            log_writer.add_scalar('loss_basic', loss_basic, global_step)
            log_writer.add_scalar('loss_anneal', loss_anneal, global_step)
            log_writer.add_scalar('loss_total', loss, global_step)
            log_writer.add_scalar('psnr', psnr, global_step)
            log_writer.add_scalar('ssim', ssim, global_step)

            print(
                '{:-4d}\t| epoch {:2d}\t| step {:4d}\t| loss_basic: {:.4f}\t| loss_anneal: {:.4f}\t|'
                ' loss: {:.4f}\t| PSNR: {:.2f}dB\t| SSIM: {:.4f}\t| time:{:.2f} seconds.'
                .format(global_step, epoch, step, loss_basic, loss_anneal,
                        loss, psnr, ssim, time.time() - t1))
            t1 = time.time()

            global_step += 1

            if global_step % save_freq == 0:
                current = average_loss.get_value()
                is_best = current < best_loss
                if is_best:
                    best_loss = current
                save_dict = {
                    'epoch': epoch,
                    'global_iter': global_step,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': scheduler.state_dict()
                }
                save_checkpoint(save_dict, is_best, checkpoint_dir,
                                global_step,
                                max_keep=train_config['ckpt_to_keep'])

        print('Epoch {} is finished, time elapsed {:.2f} seconds.'.format(
            epoch, time.time() - epoch_start_time))

        # Decay once the epoch's optimizer steps are done (the scheduler must
        # follow optimizer.step()); hold the rate at the floor once reached.
        if optimizer.param_groups[0]['lr'] > min_lr:
            scheduler.step()
        else:
            for group in optimizer.param_groups:
                group['lr'] = min_lr
def train(args):
    """Train ``MWRN_lv3`` on three-level DWT decompositions of image pairs.

    Args:
        args: Namespace providing ``noise_dir``, ``gt_dir``, ``image_size``,
            ``batch_size``, ``num_workers``, ``checkpoint``, ``save_every``,
            ``loss_every`` and ``epoch``.
    """
    data_set = SingleLoader(noise_dir=args.noise_dir,
                            gt_dir=args.gt_dir,
                            image_size=args.image_size)
    data_loader = DataLoader(data_set,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)
    loss_basic = BasicLoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    checkpoint_dir = args.checkpoint
    os.makedirs(checkpoint_dir, exist_ok=True)

    model = MWRN_lv3().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, [5, 10, 15, 20, 25, 30], 0.5)
    optimizer.zero_grad()
    average_loss = MovingAverage(args.save_every)

    # Resume from the latest checkpoint when one can be loaded.
    start_epoch = 0
    global_step = 0
    best_loss = np.inf
    try:
        # A torch.device never compares equal to a str, so test its type to
        # hand load_checkpoint a meaningful cuda flag.
        checkpoint = load_checkpoint(
            checkpoint_dir, device.type == 'cuda', 'latest')
        resumed = (
            checkpoint['epoch'],
            checkpoint['global_iter'],
            checkpoint['best_loss'],
        )
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
    except Exception as err:
        # Best effort: start fresh, report why, and let KeyboardInterrupt /
        # SystemExit propagate.
        print('=> no checkpoint file to be loaded.')
        print('   reason: {!r}'.format(err))
    else:
        start_epoch, global_step, best_loss = resumed
        print('=> loaded checkpoint (epoch {}, global_step {})'.format(
            start_epoch, global_step))

    dwt = common.DWT()

    def dwt_level3(tensor):
        """Apply the wavelet transform three times in a row."""
        for _ in range(3):
            tensor = dwt(tensor).to(device)
        return tensor

    param = list(model.parameters())
    clip_grad_D = 1e4  # clipping threshold, tightened after every epoch
    grad_norm_D = 0    # running mean of the gradient norm within an epoch

    for epoch in range(start_epoch, args.epoch):
        for step, (noise, gt) in enumerate(data_loader):
            noise = noise.to(device)
            gt = gt.to(device)
            x3 = dwt_level3(gt)
            y3 = dwt_level3(noise)

            _, img_lv3 = model(y3, None)
            loss = loss_basic(x3, img_lv3)

            optimizer.zero_grad()
            loss.backward()
            total_norm_D = nn.utils.clip_grad_norm_(param, clip_grad_D)
            # Incremental mean; the step / (step + 1) weight is 0 at step 0,
            # so the mean restarts with every epoch.
            grad_norm_D = (grad_norm_D * (step / (step + 1)) +
                           total_norm_D / (step + 1))
            optimizer.step()

            # Track the value only, without the autograd history.
            average_loss.update(loss.detach())

            if global_step % args.save_every == 0:
                current = average_loss.get_value()
                print("Save : epoch ", epoch, " step : ", global_step,
                      " with avg loss : ", current,
                      ", best loss : ", best_loss)
                is_best = current < best_loss
                if is_best:
                    best_loss = current
                save_dict = {
                    'epoch': epoch,
                    'global_iter': global_step,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                }
                save_checkpoint(save_dict, is_best, checkpoint_dir,
                                global_step)
            if global_step % args.loss_every == 0:
                print(global_step, ": ", average_loss.get_value())
            global_step += 1

        # Never let the clipping threshold exceed the mean norm just observed.
        clip_grad_D = min(clip_grad_D, grad_norm_D)
        scheduler.step()
        print("Epoch : ", epoch, "end at step: ", global_step)
def train():
    """Train ``Network`` with an L1 loss between frames of ``TrainDataSet``.

    Command-line arguments are parsed here: without ``--restart``/``-r`` the
    best checkpoint in ``./noise_models`` is loaded before training (loading
    errors are not caught). Each batch's first slice along dimension 1 is the
    input and the last slice is the target. A checkpoint is written every
    200 iterations.
    """
    import os

    checkpoint_dir = './noise_models'

    log_writer = SummaryWriter('./logs')

    parser = argparse.ArgumentParser()
    parser.add_argument('--restart', '-r', action='store_true')
    args = parser.parse_args()

    config = read_config('kpn_specs/att_kpn_config.conf',
                         'kpn_specs/configspec.conf')
    train_config = config['training']

    data_set = TrainDataSet(
        train_config['dataset_configs'],
        img_format='.bmp',
        degamma=True,
        color=True,
        blind=False
    )
    data_loader = DataLoader(
        dataset=data_set,
        batch_size=32,
        shuffle=True,
        num_workers=4
    )

    loss_fn = nn.L1Loss()
    model = Network(True).cuda()
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=5e-5)

    if not args.restart:
        model.load_state_dict(
            load_checkpoint(checkpoint_dir, best_or_latest='best'))

    global_iter = 0
    min_loss = np.inf
    loss_ave = MovingAverage(200)

    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    for epoch in range(100):
        for step, (data, _, _) in enumerate(data_loader):
            feed = data[:, 0, ...].cuda()
            gt = data[:, -1, ...].cuda()

            pred = model(feed)
            loss = loss_fn(pred, gt)

            global_iter += 1

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            log_writer.add_scalar('loss', loss, global_iter)
            # Track the value only, so neither the moving average nor
            # min_loss keeps the autograd history of old iterations.
            loss_ave.update(loss.detach())

            if global_iter % 200 == 0:
                loss_t = loss_ave.get_value()
                min_loss = min(min_loss, loss_t)
                is_best = min_loss == loss_t
                save_checkpoint(
                    model.state_dict(),
                    is_best=is_best,
                    checkpoint_dir=checkpoint_dir,
                    n_iter=global_iter
                )
            print('{: 6d}, epoch {: 3d}, iter {: 4d}, loss {:.4f}'.format(
                global_iter, epoch, step, loss))
def train(args):
    """Train ``MIRNet`` or ``MIRNet_kpn`` with a Charbonnier loss.

    Args:
        args: Namespace providing ``num_workers``, ``data_type`` (``'rgb'`` or
            ``'raw'``), ``noise_dir``, ``gt_dir``, ``image_size``,
            ``batch_size``, ``checkpoint``, ``model_type`` (``"MIR"`` or
            ``"KPN"``), ``n_colors``, ``out_channels``, ``lr``, ``save_every``,
            ``loss_every``, ``restart`` and ``epoch``.

    An unknown ``data_type`` prints a message and exits the interpreter; an
    unknown ``model_type`` prints a message and returns.
    """
    torch.set_num_threads(args.num_workers)
    torch.manual_seed(0)

    if args.data_type == 'rgb':
        data_set = SingleLoader(noise_dir=args.noise_dir,
                                gt_dir=args.gt_dir,
                                image_size=args.image_size)
    elif args.data_type == 'raw':
        data_set = SingleLoader_raw(noise_dir=args.noise_dir,
                                    gt_dir=args.gt_dir,
                                    image_size=args.image_size)
    else:
        print("Data type not valid")
        exit()

    data_loader = DataLoader(data_set,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers,
                             pin_memory=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_func = losses.CharbonnierLoss().to(device)

    checkpoint_dir = args.checkpoint
    os.makedirs(checkpoint_dir, exist_ok=True)

    if args.model_type == "MIR":
        model = MIRNet(in_channels=args.n_colors,
                       out_channels=args.out_channels).to(device)
    elif args.model_type == "KPN":
        model = MIRNet_kpn(in_channels=args.n_colors,
                           out_channels=args.out_channels).to(device)
    else:
        print(" Model type not valid")
        return

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    optimizer.zero_grad()
    average_loss = MovingAverage(args.save_every)
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, [2, 4, 6, 8, 10, 12, 14, 16], 0.8)

    start_epoch = 0
    global_step = 0
    best_loss = np.inf
    if args.restart:
        print('=> no checkpoint file to be loaded.')
    else:
        try:
            # A torch.device never compares equal to a str, so test its type
            # to hand load_checkpoint a meaningful cuda flag.
            checkpoint = load_checkpoint(
                checkpoint_dir, device.type == 'cuda', 'latest')
            resumed = (
                checkpoint['epoch'],
                checkpoint['global_iter'],
                checkpoint['best_loss'],
            )
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        except Exception as err:
            # Best effort: start fresh, report why, and let
            # KeyboardInterrupt / SystemExit propagate.
            print('=> no checkpoint file to be loaded.')
            print('   reason: {!r}'.format(err))
        else:
            start_epoch, global_step, best_loss = resumed
            print('=> loaded checkpoint (epoch {}, global_step {})'.format(
                start_epoch, global_step))

    for epoch in range(start_epoch, args.epoch):
        for step, (noise, gt) in enumerate(data_loader):
            noise = noise.to(device)
            gt = gt.to(device)

            pred = model(noise)
            loss = loss_func(pred, gt)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Track the value only, without the autograd history.
            average_loss.update(loss.detach())

            if global_step % args.save_every == 0:
                print(len(average_loss._cache))
                current = average_loss.get_value()
                is_best = current < best_loss
                if is_best:
                    best_loss = current
                save_dict = {
                    'epoch': epoch,
                    'global_iter': global_step,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                }
                save_checkpoint(save_dict, is_best, checkpoint_dir,
                                global_step)
            if global_step % args.loss_every == 0:
                print(global_step, "PSNR : ", calculate_psnr(pred, gt))
                print(average_loss.get_value())
            global_step += 1

        print('Epoch {} is finished.'.format(epoch))
        scheduler.step()
def train(num_workers, cuda, restart_train, mGPU):
    """Train one of the ``*_noise_DGF`` kernel-prediction models.

    Most settings come from the module-level ``args`` namespace
    (``batch_size``, ``epoch``, ``save_every``, ``loss_every``,
    ``burst_length``, ``checkpoint``, ``noise_dir``, ``gt_dir``,
    ``image_size``, ``model_type``, ``wavelet_loss``, ``load_type``).

    Args:
        num_workers: Number of worker processes handed to the ``DataLoader``.
        cuda: If truthy, the model and every batch are moved to the GPU.
        restart_train: If truthy, no checkpoint is loaded and training starts
            at epoch 0.
        mGPU: If truthy, the model is wrapped in ``nn.DataParallel``.

    An unknown ``args.model_type`` prints a message and returns.
    """
    color = True
    batch_size = args.batch_size
    lr = 2e-4
    lr_decay = 0.89125093813
    n_epoch = args.epoch
    save_freq = args.save_every
    loss_freq = args.loss_every
    lr_step_size = 100
    burst_length = args.burst_length
    min_lr = 5e-6  # the learning rate is never decayed below this floor

    # checkpoint path
    checkpoint_dir = "checkpoints/" + args.checkpoint
    os.makedirs(checkpoint_dir, exist_ok=True)

    # logs path: any previous log directory is wiped, so the writer is always
    # handed a path that does not exist yet (same as before).
    logs_dir = "checkpoints/logs/" + args.checkpoint
    if os.path.exists(logs_dir):
        shutil.rmtree(logs_dir)
    log_writer = SummaryWriter(logs_dir)

    # dataset and dataloader
    data_set = SingleLoader_DGF(noise_dir=args.noise_dir,
                                gt_dir=args.gt_dir,
                                image_size=args.image_size,
                                burst_length=burst_length)
    data_loader = DataLoader(
        data_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )

    # model: the variants differ only in class and whether attention is on
    model_variants = {
        "attKPN": (Att_KPN_noise_DGF, True),
        "attWKPN": (Att_Weight_KPN_noise_DGF, True),
        "KPN": (KPN_noise_DGF, False),
    }
    if args.model_type not in model_variants:
        print(" Model type not valid")
        return
    model_cls, use_attention = model_variants[args.model_type]
    model = model_cls(
        color=color,
        burst_length=burst_length,
        blind_est=False,
        kernel_size=[5],
        sep_conv=False,
        channel_att=use_attention,
        spatial_att=use_attention,
        upMode="bilinear",
        core_bias=False
    )
    if cuda:
        model = model.cuda()
    if mGPU:
        model = nn.DataParallel(model)
    model.train()

    # loss functions
    loss_func = LossBasic()
    if args.wavelet_loss:
        print("Use wavelet loss")
        loss_func2 = WaveletLoss()

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)
    optimizer.zero_grad()

    # learning rate scheduler
    scheduler = lr_scheduler.StepLR(
        optimizer, step_size=lr_step_size, gamma=lr_decay)

    average_loss = MovingAverage(save_freq)

    # Resume from a checkpoint unless a restart was requested.
    start_epoch = 0
    global_step = 0
    best_loss = np.inf
    if not restart_train:
        try:
            checkpoint = load_checkpoint(
                checkpoint_dir, cuda, best_or_latest=args.load_type)
            resumed = (
                checkpoint['epoch'],
                checkpoint['global_iter'],
                checkpoint['best_loss'],
            )
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['lr_scheduler'])
        except Exception as err:
            # Best effort: start fresh, report why, and let
            # KeyboardInterrupt / SystemExit propagate.
            print('=> no checkpoint file to be loaded.')
            print('   reason: {!r}'.format(err))
        else:
            start_epoch, global_step, best_loss = resumed
            print('=> loaded checkpoint (epoch {}, global_step {})'.format(
                start_epoch, global_step))

    print('=> training')

    for epoch in range(start_epoch, n_epoch):
        epoch_start_time = time.time()
        t1 = time.time()
        for step, (image_noise_hr, image_noise_lr, image_gt_hr, _) in enumerate(data_loader):
            if cuda:
                image_noise_lr = image_noise_lr.cuda()
                image_noise_hr = image_noise_hr.cuda()
                image_gt_hr = image_gt_hr.cuda()
            burst_noise = image_noise_lr
            gt = image_gt_hr
            # Both operands are on the same device at this point; subtracting
            # before moving the ground truth mixed a GPU and a CPU tensor.
            noise_gt = image_noise_hr - gt

            _, pred, noise = model(burst_noise, image_noise_hr)

            loss_basic = loss_func(pred, gt)
            loss_noise = loss_func(noise, noise_gt)
            loss = loss_basic + loss_noise
            if args.wavelet_loss:
                loss_wave = loss_func2(pred, gt)
                loss_wave_noise = loss_func2(noise, noise_gt)
                loss = loss_basic + loss_wave + loss_noise + loss_wave_noise

            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Track the value only, without the autograd history.
            average_loss.update(loss.detach())

            if not color:
                pred = pred.unsqueeze(1)
                gt = gt.unsqueeze(1)

            if global_step % loss_freq == 0:
                # calculate PSNR / SSIM
                print("burst_noise : ", burst_noise.size())
                print("gt : ", gt.size())
                psnr = calculate_psnr(pred, gt)
                ssim = calculate_ssim(pred, gt)

                # add scalars to tensorboardX
                log_writer.add_scalar('loss_basic', loss_basic, global_step)
                log_writer.add_scalar('loss_total', loss, global_step)
                log_writer.add_scalar('psnr', psnr, global_step)
                log_writer.add_scalar('ssim', ssim, global_step)

                print('{:-4d}\t| epoch {:2d}\t| step {:4d}\t| loss_basic: {:.4f}\t|'
                      ' loss: {:.4f}\t| PSNR: {:.2f}dB\t| SSIM: {:.4f}\t| time:{:.2f} seconds.'
                      .format(global_step, epoch, step, loss_basic, loss,
                              psnr, ssim, time.time() - t1))
                t1 = time.time()

            if global_step % save_freq == 0:
                current = average_loss.get_value()
                is_best = current < best_loss
                if is_best:
                    best_loss = current
                save_dict = {
                    'epoch': epoch,
                    'global_iter': global_step,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': scheduler.state_dict()
                }
                save_checkpoint(
                    save_dict, is_best, checkpoint_dir, global_step,
                    max_keep=10
                )
            global_step += 1

        print('Epoch {} is finished, time elapsed {:.2f} seconds.'.format(
            epoch, time.time() - epoch_start_time))

        # Decay after the epoch; hold the rate at the floor once reached.
        if optimizer.param_groups[0]['lr'] > min_lr:
            scheduler.step()
        else:
            for group in optimizer.param_groups:
                group['lr'] = min_lr
def train(config, restart_training, num_workers, num_threads):
    """Train the configured model on ``OnTheFlyDataset`` with a smooth-L1 loss.

    Args:
        config: Parsed configuration; ``config["training"]`` holds the
            hyper-parameters and paths, ``config["architecture"]`` is passed
            to ``get_model``.
        restart_training: If truthy, no checkpoint is loaded and training
            starts at epoch 0.
        num_workers: Number of worker processes handed to the ``DataLoader``.
        num_threads: Passed to ``torch.set_num_threads``.

    Raises:
        ValueError: If the configured optimizer is neither ``'adam'`` nor
            ``'sgd'``.
    """
    torch.set_num_threads(num_threads)
    print("Using {} CPU threads".format(torch.get_num_threads()))

    train_config = config["training"]

    batch_size = train_config["batch_size"]
    lr = train_config["learning_rate"]
    w_decay = train_config["weight_decay"]
    step_size = train_config["decay_steps"]
    gamma = train_config["lr_decay"]
    betas = (train_config["beta1"], train_config["beta2"])
    n_epochs = train_config["num_epochs"]
    use_cache = train_config["use_cache"]

    print("Configs:", config)

    # create dir for model
    checkpoint_dir = train_config["checkpoint_dir"]
    os.makedirs(checkpoint_dir, exist_ok=True)
    logger = Logger(train_config["logs_dir"])

    use_gpu = torch.cuda.is_available()
    num_gpu = list(range(torch.cuda.device_count()))

    print("Using On the fly TRAIN datasets")
    train_data = OnTheFlyDataset(
        train_config["dataset_configs"],
        im_size=(train_config["image_width"], train_config["image_height"]),
        use_cache=use_cache)
    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=num_workers)

    model = get_model(config["architecture"])
    l1_loss = nn.SmoothL1Loss()

    if use_gpu:
        ts = time.time()
        model = model.cuda()
        model = nn.DataParallel(model, device_ids=num_gpu)
        print("Finish cuda loading, time elapsed {}".format(time.time() - ts))

    # Only parameters that require gradients are optimised.
    all_parameters = [
        p for _, p in model.named_parameters() if p.requires_grad
    ]

    if train_config["optimizer"] == "adam":
        print("Using Adam.")
        optimizer = optim.Adam([{'params': all_parameters}],
                               lr=lr,
                               betas=betas,
                               weight_decay=w_decay,
                               amsgrad=True)
    elif train_config["optimizer"] == "sgd":
        print("Using SGD.")
        optimizer = optim.SGD([{'params': all_parameters}],
                              lr=lr,
                              momentum=betas[0],
                              weight_decay=w_decay)
    else:
        raise ValueError(
            "Optimizer must be 'sgd' or 'adam', received '{}'".format(
                train_config["optimizer"]))

    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=step_size,
                                    gamma=gamma)

    average_loss = MovingAverage(train_config["n_loss_average"])

    # Resume from the best checkpoint unless a restart was requested.
    start_epoch = 0
    n_global_iter = 0
    best_loss = np.inf
    if restart_training:
        print("=> training from scratch")
    else:
        try:
            checkpoint = load_checkpoint(checkpoint_dir, 'best')
            resumed = (
                checkpoint['epoch'],
                checkpoint['global_iter'],
                checkpoint['best_loss'],
            )
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        except Exception as err:
            # Best effort: start fresh, report why, and let
            # KeyboardInterrupt / SystemExit propagate.
            print("=> load checkpoint failed, training from scratch")
            print("   reason: {!r}".format(err))
        else:
            start_epoch, n_global_iter, best_loss = resumed
            print("=> loaded checkpoint (epoch {})".format(start_epoch))

    # The timing labels are historical and are kept verbatim in the output.
    timing_names = ("t_generate_data", "t_train_disc", "t_train_gen",
                    "t_vis", "t_save")
    N_report = 100
    N_print = 1000

    for epoch in range(start_epoch, n_epochs):
        ts = time.time()
        timings = {name: [] for name in timing_names}
        # t0..t4 are the checkpoints of the previous iteration; t4 stays None
        # until one full iteration of this epoch has completed.
        t0 = t1 = t2 = t3 = t4 = None

        for step, batch in enumerate(train_loader):
            now = time.time()
            if t4 is not None:
                timings["t_generate_data"].append(now - t4)
                timings["t_train_disc"].append(t1 - t0)
                timings["t_train_gen"].append(t2 - t1)
                timings["t_vis"].append(t3 - t2)
                timings["t_save"].append(t4 - t3)
                # Reporting sits inside the guard so the averages are never
                # taken over empty lists.
                if (step % N_report) == 0:
                    means = {
                        name: np.mean(timings[name]) for name in timing_names
                    }
                    t_total = sum(means[name] for name in timing_names)
                    if (step % N_print) == 0:
                        for name in timing_names:
                            print("{}: {:0.4g} s ({:0.4g}%)".format(
                                name, means[name],
                                means[name] / t_total * 100))
                    logger.scalar_summary('Steps per sec', 1.0 / t_total,
                                          n_global_iter)
                    timings = {name: [] for name in timing_names}
            t0 = now

            should_vis = (
                (n_global_iter + 1) % train_config["vis_freq"]) == 0

            if use_gpu:
                degraded_img = batch['degraded_img'].cuda()
                target_img = batch['original_img'].cuda()
            else:
                degraded_img = batch['degraded_img']
                target_img = batch['original_img']

            t1 = time.time()

            optimizer.zero_grad()
            # Run the input through the model.
            output_img = model(degraded_img)
            loss = l1_loss(output_img, target_img)
            loss.backward()
            optimizer.step()

            # .item() reads the scalar loss as a Python float; indexing
            # loss.data[0] fails on 0-dim tensors in current PyTorch.
            loss_value = loss.item()
            logger.scalar_summary('Loss', loss_value, n_global_iter)
            psnr = calculate_psnr(output_img, target_img)
            logger.scalar_summary('Train PSNR', psnr, n_global_iter)
            average_loss.update(loss_value)

            t2 = time.time()

            if step % 10 == 0:
                print("epoch{}, iter{}, loss: {}".format(
                    epoch, step, loss_value))
            n_global_iter += 1

            if should_vis:
                exp = batch['vis_exposure'] if 'vis_exposure' in batch else None
                img = create_vis(degraded_img[:, :3, ...], target_img,
                                 output_img, exp)
                logger.image_summary("Train Images", img, n_global_iter)

            t3 = time.time()

            if (n_global_iter % train_config["save_freq"]) == 0:
                current = average_loss.get_value()
                is_best = current < best_loss
                if is_best:
                    best_loss = current
                save_dict = {
                    'epoch': epoch,
                    'global_iter': n_global_iter,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                }
                save_checkpoint(save_dict, is_best, checkpoint_dir,
                                n_global_iter)

            t4 = time.time()

        print("Finish epoch {}, time elapsed {}".format(
            epoch, time.time() - ts))
        # The scheduler must follow the epoch's optimizer steps.
        scheduler.step()