# get the options
options['dataset_options'] = dynsys_params.get_dataset_options(options['dataset'])
options['model_options'] = model_params.get_model_options(options['model'],
                                                          options['dataset'],
                                                          options['dataset_options'])
options['train_options'] = train_params.get_train_options(options['dataset'])
options['test_options'] = train_params.get_test_options()

# optimal model parameters
options['model_options'].h_dim = options['optValue']['h_opt']
options['model_options'].z_dim = options['optValue']['z_opt']
options['model_options'].n_layers = options['optValue']['n_opt']

# save the options
save_options(options, path_general, 'options.txt')

# allocation
rmse_all = np.zeros([options['MCsamples']])
vaf_all = np.zeros([options['MCsamples']])
logLikelihood_all = np.zeros([options['MCsamples']])
rmse_KF_all = np.zeros([options['MCsamples']])
vaf_KF_all = np.zeros([options['MCsamples']])

# print model type and dynamic system type
print('\n\tModel Type: {}'.format(options['model']))
print('\tDynamic System: {}\n'.format(options['dataset']))

file_name_general = '{}_h{}_z{}_n{}'.format(options['dataset'],
                                            options['model_options'].h_dim,
                                            options['model_options'].z_dim,
                                            options['model_options'].n_layers)
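# --- Illustrative sketch (not part of the source) --------------------------
# The fragment above expects `options` to already carry the experiment
# selection, the Monte Carlo sample count, and the tuned hyperparameters
# under 'optValue'. A minimal example of such a dict; every concrete value
# below is a placeholder assumption, not taken from the source:
options_example = {
    'dataset': 'narendra_li',   # which dynamic-system benchmark to load
    'model': 'VRNN-Gauss',      # model-type string resolved by model_params
    'MCsamples': 20,            # number of Monte Carlo runs to allocate for
    'optValue': {               # best values found by a hyperparameter search
        'h_opt': 60,            # hidden-state dimension
        'z_opt': 10,            # latent-state dimension
        'n_opt': 1,             # number of recurrent layers
    },
}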
# Standard-library and PyTorch imports needed by main(); project-local helpers
# (get_model, init_weights, load_weights, setup_dataloaders,
# initialize_distributed, save_options, log_iter, run_val, run_visualization,
# save_weights, reduce_tensor, get_learning_rate, apply_GAN_criterion,
# Vgg19PerceptualLoss, MultiscaleDiscriminator, GANLoss) are assumed to be
# imported from elsewhere in the repository.
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter  # could equally be tensorboardX
# Assumption: `delay_allreduce=True` below matches NVIDIA Apex's
# DistributedDataParallel, not torch.nn.parallel.DistributedDataParallel.
from apex.parallel import DistributedDataParallel as DDP


def main(config):
    save_path = config['save_path']
    epochs = config['epochs']
    os.environ['TORCH_HOME'] = config['torch_home']
    distributed = config['use_DDP']
    start_ep = 0
    start_cnt = 0
    rank = 0  # default for the single-process case; overwritten under DDP

    # initialize model
    print("Initializing model...")
    if distributed:
        initialize_distributed(config)
        rank = config['rank']

    # map string name to class constructor
    model = get_model(config)
    model.apply(init_weights)
    if config['resume_ckpt'] is not None:
        # load weights from checkpoint
        state_dict = load_weights(config['resume_ckpt'])
        model.load_state_dict(state_dict)

    print("Moving model to GPU")
    model.cuda(torch.cuda.current_device())

    print("Setting up losses")
    if config['use_vgg']:
        criterionVGG = Vgg19PerceptualLoss(config['reduced_w'])
        criterionVGG.cuda()
        validationLoss = criterionVGG
    if config['use_gan']:
        use_sigmoid = config['no_lsgan']
        disc_input_channels = 3
        discriminator = MultiscaleDiscriminator(disc_input_channels, config['ndf'],
                                                config['n_layers_D'], 'instance',
                                                use_sigmoid, config['num_D'],
                                                False, False)
        discriminator.apply(init_weights)
        if config['resume_ckpt_D'] is not None:
            # load discriminator weights from checkpoint
            print("Resuming discriminator from %s" % (config['resume_ckpt_D']))
            state_dict = load_weights(config['resume_ckpt_D'])
            discriminator.load_state_dict(state_dict)
        discriminator.cuda(torch.cuda.current_device())
        criterionGAN = GANLoss(use_lsgan=not config['no_lsgan'])
        criterionGAN.cuda()
        # feature-matching criterion; loss_G_feat is left at 0 below
        criterionFeat = nn.L1Loss().cuda()
    if config['use_l2']:
        criterionMSE = nn.MSELoss()
        criterionMSE.cuda()
        # note: when both use_vgg and use_l2 are set, MSE wins for validation
        validationLoss = criterionMSE

    # initialize dataloader
    print("Setting up dataloaders...")
    train_dataloader, val_dataloader, train_sampler = setup_dataloaders(config)
    print("Done!")

    # set up the optimizers
    print("Initializing optimizers...")
    optimizer_G = optim.Adam(model.parameters(), lr=config['learning_rate'],
                             weight_decay=config['weight_decay'])
    if config['resume_ckpt_opt_G'] is not None:
        optimizer_G_state_dict = torch.load(
            config['resume_ckpt_opt_G'],
            map_location=lambda storage, loc: storage)
        optimizer_G.load_state_dict(optimizer_G_state_dict)
    if config['use_gan']:
        optimizer_D = optim.Adam(discriminator.parameters(),
                                 lr=config['learning_rate'])
        if config['resume_ckpt_opt_D'] is not None:
            optimizer_D_state_dict = torch.load(
                config['resume_ckpt_opt_D'],
                map_location=lambda storage, loc: storage)
            optimizer_D.load_state_dict(optimizer_D_state_dict)
    print("Done!")

    if distributed:
        print("Moving model to DDP...")
        model = DDP(model)
        if config['use_gan']:
            discriminator = DDP(discriminator, delay_allreduce=True)
        print("Done!")

    tb_logger = None
    if rank == 0:
        tb_logdir = os.path.join(save_path, 'tbdir')
        if not os.path.exists(tb_logdir):
            os.makedirs(tb_logdir)
        tb_logger = SummaryWriter(tb_logdir)

    # run training
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    log_name = os.path.join(save_path, 'loss_log.txt')
    opt_name = os.path.join(save_path, 'opt.yaml')
    print(config)
    save_options(opt_name, config)
    log_handle = open(log_name, 'a')

    print("Starting training")
    cnt = start_cnt
    assert config['use_warped'] or config['use_temporal']

    for ep in range(start_ep, epochs):
        if train_sampler is not None:
            train_sampler.set_epoch(ep)

        for curr_batch in train_dataloader:
            optimizer_G.zero_grad()
            input_a = curr_batch['input_a'].cuda()
            target = curr_batch['target'].cuda()
            if config['use_warped'] and config['use_temporal']:
                # double the batch: one half warped pairs, one half temporal pairs
                input_a = torch.cat((input_a, input_a), 0)
                input_b = torch.cat((curr_batch['input_b'].cuda(),
                                     curr_batch['input_temporal'].cuda()), 0)
                target = torch.cat((target, target), 0)
            elif config['use_temporal']:
                input_b = curr_batch['input_temporal'].cuda()
            elif config['use_warped']:
                input_b = curr_batch['input_b'].cuda()

            output_dict = model(input_a, input_b)
            output_recon = output_dict['reconstruction']

            loss_vgg = loss_G_GAN = loss_G_feat = loss_l2 = 0
            if config['use_vgg']:
                loss_vgg = criterionVGG(output_recon, target) * config['vgg_lambda']
            if config['use_gan']:
                predicted_landmarks = output_dict['input_a_gauss_maps']
                # output_dict['reconstruction'] can be considered normalized
                loss_G_GAN, loss_D_real, loss_D_fake = apply_GAN_criterion(
                    output_recon, target, predicted_landmarks.detach(),
                    discriminator, criterionGAN)
                loss_D = (loss_D_fake + loss_D_real) * 0.5
            if config['use_l2']:
                loss_l2 = criterionMSE(output_recon, target) * config['l2_lambda']

            loss_G = loss_G_GAN + loss_G_feat + loss_vgg + loss_l2
            loss_G.backward()
            # grad-norm clipping
            if not config['no_grad_clip']:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer_G.step()

            if config['use_gan']:
                optimizer_D.zero_grad()
                loss_D.backward()
                # grad-norm clipping
                if not config['no_grad_clip']:
                    torch.nn.utils.clip_grad_norm_(discriminator.parameters(), 1.0)
                optimizer_D.step()

            if distributed:
                if config['use_vgg']:
                    loss_vgg = reduce_tensor(loss_vgg, config['world_size'])

            if rank == 0:
                if cnt % 10 == 0:
                    run_visualization(output_dict, output_recon, target, input_a,
                                      input_b, save_path, tb_logger, cnt)
                print_dict = {"learning_rate": get_learning_rate(optimizer_G)}
                if config['use_vgg']:
                    tb_logger.add_scalar('vgg.loss', loss_vgg, cnt)
                    print_dict['Loss_VGG'] = loss_vgg.data
                if config['use_gan']:
                    tb_logger.add_scalar('gan.loss', loss_G_GAN, cnt)
                    tb_logger.add_scalar('d_real.loss', loss_D_real, cnt)
                    tb_logger.add_scalar('d_fake.loss', loss_D_fake, cnt)
                    print_dict['Loss_G_GAN'] = loss_G_GAN
                    print_dict['Loss_real'] = loss_D_real.data
                    print_dict['Loss_fake'] = loss_D_fake.data
                if config['use_l2']:
                    tb_logger.add_scalar('l2.loss', loss_l2, cnt)
                    print_dict['Loss_L2'] = loss_l2.data
                log_iter(ep, cnt % len(train_dataloader), len(train_dataloader),
                         print_dict, log_handle=log_handle)

            if loss_G != loss_G:  # NaN check: NaN is the only value unequal to itself
                print("NaN!!")
                exit(-2)
            cnt = cnt + 1
        # end of train iter loop

        if cnt % config['val_freq'] == 0 and config['val_freq'] > 0:
            val_loss = run_val(
                model, validationLoss, val_dataloader,
                os.path.join(save_path, 'val_%d_renders' % (ep)))
            if distributed:
                val_loss = reduce_tensor(val_loss, config['world_size'])
            if rank == 0:
                tb_logger.add_scalar('validation.loss', val_loss, cnt)
                log_iter(ep, cnt % len(train_dataloader), len(train_dataloader),
                         {"Loss_VGG": val_loss}, header="Validation loss: ",
                         log_handle=log_handle)

        if rank == 0:
            if ep % config['save_freq'] == 0:
                fname = 'checkpoint_%d.ckpt' % (ep)
                fname = os.path.join(save_path, fname)
                print("Saving model...")
                save_weights(model, fname, distributed)
                optimizer_g_fname = os.path.join(save_path,
                                                 'latest_optimizer_g_state.ckpt')
                torch.save(optimizer_G.state_dict(), optimizer_g_fname)
                if config['use_gan']:
                    fname = 'checkpoint_D_%d.ckpt' % (ep)
                    fname = os.path.join(save_path, fname)
                    save_weights(discriminator, fname, distributed)
                    optimizer_d_fname = os.path.join(save_path,
                                                     'latest_optimizer_d_state.ckpt')
                    torch.save(optimizer_D.state_dict(), optimizer_d_fname)
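# --- Illustrative sketch (not part of the source) --------------------------
# main() consumes a flat config dict. A minimal driver might load it from a
# YAML file; the path, the defaults, and the key list below are assumptions:
if __name__ == '__main__':
    import yaml  # PyYAML, assumed available

    with open('configs/train_example.yaml') as f:  # placeholder path
        config = yaml.safe_load(f)
    # resume keys that main() dereferences even when resuming is unused:
    for key in ('resume_ckpt', 'resume_ckpt_D',
                'resume_ckpt_opt_G', 'resume_ckpt_opt_D'):
        config.setdefault(key, None)
    config.setdefault('use_DDP', False)  # single-GPU by default
    main(config)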
# Standard imports for run_main_single(); project modules (dynsys_params,
# model_params, train_params, loader, training, testing, ModelState,
# compute_normalizer, save_options, set_redirects) are assumed to be
# imported from elsewhere in the repository.
import os
import time

import pandas as pd
import torch


def run_main_single(options, path_general, file_name_general):
    start_time = time.time()
    print('Run file: main_single.py')
    print(time.strftime("%c"))

    # get correct computing device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Device: {}'.format(device))

    # get the options
    options['device'] = device
    options['dataset_options'] = dynsys_params.get_dataset_options(options['dataset'])
    options['model_options'] = model_params.get_model_options(options['model'],
                                                              options['dataset'],
                                                              options['dataset_options'])
    options['train_options'] = train_params.get_train_options(options['dataset'])
    options['test_options'] = train_params.get_test_options()

    # print model type and dynamic system type
    print('\n\tModel Type: {}'.format(options['model']))
    print('\tDynamic System: {}\n'.format(options['dataset']))

    file_name_general = file_name_general + '_h{}_z{}_n{}'.format(
        options['model_options'].h_dim,
        options['model_options'].z_dim,
        options['model_options'].n_layers)

    path = path_general + 'data/'
    # check if path exists and create otherwise
    if not os.path.exists(path):
        os.makedirs(path)

    # set logger
    set_redirects(path, file_name_general)

    # Specifying datasets
    loaders = loader.load_dataset(
        dataset=options["dataset"],
        dataset_options=options["dataset_options"],
        train_batch_size=options["train_options"].batch_size,
        test_batch_size=options["test_options"].batch_size,
    )

    # Compute normalizers
    if options["normalize"]:
        normalizer_input, normalizer_output = compute_normalizer(loaders['train'])
    else:
        normalizer_input = normalizer_output = None

    # Define model
    modelstate = ModelState(seed=options["seed"],
                            nu=loaders["train"].nu,
                            ny=loaders["train"].ny,
                            model=options["model"],
                            options=options,
                            normalizer_input=normalizer_input,
                            normalizer_output=normalizer_output)
    modelstate.model.to(options['device'])

    # save the options
    save_options(options, path_general, 'options.txt')

    # allocation
    df = {}
    if options['do_train']:
        # train the model
        df = training.run_train(modelstate=modelstate,
                                loader_train=loaders['train'],
                                loader_valid=loaders['valid'],
                                options=options,
                                dataframe=df,
                                path_general=path_general,
                                file_name_general=file_name_general)

    if options['do_test']:
        # test the model
        df = testing.run_test(options, loaders, df, path_general, file_name_general)

    # save data
    # get saving path
    path = path_general + 'data/'
    # check if path exists and create otherwise
    if not os.path.exists(path):
        os.makedirs(path)
    # to pandas
    df = pd.DataFrame(df)
    # filename
    file_name = file_name_general + '.csv'
    # save data
    df.to_csv(path + file_name)

    # time output
    time_el = time.time() - start_time
    hours = time_el // 3600
    mins = time_el // 60 - hours * 60
    secs = time_el - mins * 60 - hours * 3600
    print('Total time of file execution: {}:{:2.0f}:{:2.0f} [h:min:sec]'.format(
        hours, mins, secs))
    print(time.strftime("%c"))
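# --- Illustrative sketch (not part of the source) --------------------------
# A minimal invocation of run_main_single(); all option values and paths
# below are placeholder assumptions, not defaults taken from the source:
if __name__ == '__main__':
    options = {
        'dataset': 'narendra_li',
        'model': 'VRNN-Gauss',
        'normalize': True,
        'seed': 1234,
        'do_train': True,
        'do_test': True,
    }
    # trailing slash matters: the function builds paths as path_general + 'data/'
    run_main_single(options,
                    path_general='log/narendra_li/',
                    file_name_general='VRNN-Gauss')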