def train_model(args, metadata, device='cuda'):
    print('training on {}'.format(torch.cuda.get_device_name(device) if args.cuda else 'cpu'))

    # load data
    if not args.preload:
        # keep the dataset on the CPU; batches are moved to the GPU inside the training loop
        dset = SyntheticDataset(args.file, 'cpu')
        train_loader = DataLoader(dset, shuffle=True, batch_size=args.batch_size)
        data_dim, latent_dim, aux_dim = dset.get_dims()
        args.N = len(dset)
        metadata.update(dset.get_metadata())
    else:
        train_loader = DataLoaderGPU(args.file, shuffle=True, batch_size=args.batch_size)
        data_dim, latent_dim, aux_dim = train_loader.get_dims()
        args.N = train_loader.dataset_len
        metadata.update(train_loader.get_metadata())

    if args.max_iter is None:
        args.max_iter = len(train_loader) * args.epochs

    if args.latent_dim is not None:
        latent_dim = args.latent_dim
        metadata.update({"train_latent_dim": latent_dim})

    # define model and optimizer
    model = None
    if args.i_what == 'iVAE':
        model = iVAE(latent_dim,
                     data_dim,
                     aux_dim,
                     n_layers=args.depth,
                     activation='lrelu',
                     device=device,
                     hidden_dim=args.hidden_dim,
                     anneal=args.anneal,  # False by default
                     file=metadata['file'],  # dataset location, stored for easier checkpoint loading
                     seed=1,
                     epochs=args.epochs)
    elif args.i_what == 'iFlow':
        metadata.update({"device": device})
        model = iFlow(args=metadata).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=args.lr_drop_factor,
                                                     patience=args.lr_patience,
                                                     verbose=True)  # previously fixed at factor=0.1, patience=4

    ste = time.time()
    print('setup time: {}s'.format(ste - st))  # st: script start time, set at module level

    # set up loggers
    logger = Logger(logdir=LOG_FOLDER)  # 'log/'
    exp_id = logger.get_id()
    # e.g. 'runs/exp1_64_12500_0.001_50_3_False'
    tensorboard_run_name = TENSORBOARD_RUN_FOLDER + 'exp' + str(exp_id) + '_'.join(
        map(str, ['', args.batch_size, args.max_iter, args.lr, args.hidden_dim, args.depth, args.anneal]))
    writer = SummaryWriter(logdir=tensorboard_run_name)

    if args.i_what == 'iFlow':
        logger.add('log_normalizer')
        logger.add('neg_log_det')
        logger.add('neg_trace')
    logger.add('loss')
    logger.add('perf')
    print('Beginning training for exp: {}'.format(exp_id))

    # training loop
    epoch = 0
    model.train()
    while epoch < args.epochs:
        est = time.time()
        for itr, (x, u, z) in enumerate(train_loader):
            acc_itr = itr + epoch * len(train_loader)
            # x: observations, e.g. of shape [64, 4]
            # u: one-hot segment labels, e.g. of shape [64, 40] for 40 segments
            # z: true latent sources, e.g. of shape [64, 2]

            optimizer.zero_grad()
            if args.cuda and not args.preload:
                x = x.cuda(device=device, non_blocking=True)
                u = u.cuda(device=device, non_blocking=True)

            if args.i_what == 'iVAE':
                # elbo is a scalar; z_est has shape [batch_size, latent_dim]
                elbo, z_est = model.elbo(x, u)
                loss = elbo.mul(-1)
            elif args.i_what == 'iFlow':
                (log_normalizer, neg_trace, neg_log_det), z_est = model.neg_log_likelihood(x, u)
                loss = log_normalizer + neg_trace + neg_log_det

            loss.backward()
            optimizer.step()

            logger.update('loss', loss.item())
            if args.i_what == 'iFlow':
                logger.update('log_normalizer', log_normalizer.item())
                logger.update('neg_trace', neg_trace.item())
                logger.update('neg_log_det', neg_log_det.item())

            perf = mcc(z.cpu().numpy(), z_est.cpu().detach().numpy())
            logger.update('perf', perf)

            if acc_itr % args.log_freq == 0:  # every 25 iterations by default
                logger.log()
                writer.add_scalar('data/performance', logger.get_last('perf'), acc_itr)
                writer.add_scalar('data/loss', logger.get_last('loss'), acc_itr)
                if args.i_what == 'iFlow':
                    writer.add_scalar('data/log_normalizer', logger.get_last('log_normalizer'), acc_itr)
                    writer.add_scalar('data/neg_trace', logger.get_last('neg_trace'), acc_itr)
                    writer.add_scalar('data/neg_log_det', logger.get_last('neg_log_det'), acc_itr)
                scheduler.step(logger.get_last('loss'))

            # checkpoint five times over the course of training
            if acc_itr % int(args.max_iter / 5) == 0 and not args.no_log:
                checkpoint(TORCH_CHECKPOINT_FOLDER,
                           exp_id,
                           acc_itr,
                           model,
                           optimizer,
                           logger.get_last('loss'),
                           logger.get_last('perf'))

        epoch += 1
        eet = time.time()
        if args.i_what == 'iVAE':
            print('epoch {}: {:.4f}s;\tloss: {:.4f};\tperf: {:.4f}'.format(
                epoch, eet - est, logger.get_last('loss'), logger.get_last('perf')))
        elif args.i_what == 'iFlow':
            print('epoch {}: {:.4f}s;\tloss: {:.4f} (l1: {:.4f}, l2: {:.4f}, l3: {:.4f});\tperf: {:.4f}'.format(
                epoch, eet - est, logger.get_last('loss'), logger.get_last('log_normalizer'),
                logger.get_last('neg_trace'), logger.get_last('neg_log_det'), logger.get_last('perf')))

    et = time.time()
    print('training time: {}s'.format(et - ste))

    # save final model
    checkpoint(PT_MODELS_FOLDER, "", 'final', model, optimizer,
               logger.get_last('loss'), logger.get_last('perf'))

    writer.close()
    if not args.no_log:
        logger.add_metadata(**metadata)
        logger.save_to_json()
        logger.save_to_npz()

    print('total time: {}s'.format(et - st))
    return model
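# Usage sketch (not part of the original pipeline; the helper name `_example_train`
# and the dataset path are hypothetical). train_model is driven by an argparse-style
# namespace: the attribute names below are exactly those read in the function body,
# and the example values mirror the defaults quoted in the inline comments and in
# load_model_from_checkpoint below. Seeding `metadata` from the namespace is an
# assumption; the real main.py may construct it differently.
def _example_train():
    from argparse import Namespace
    args = Namespace(
        file='data/1/dataset.npz',  # hypothetical path to a SyntheticDataset .npz file
        preload=True, cuda=True, batch_size=64, epochs=20,
        max_iter=None, latent_dim=None, i_what='iFlow',
        depth=3, hidden_dim=50, lr=1e-3, anneal=False,
        lr_drop_factor=0.25, lr_patience=10, log_freq=25, no_log=False,
        # iFlow-specific settings, consumed through the metadata dict:
        flow_type='RQNSF_AG', num_bins=8, nat_param_act='Softplus',
        gpu_id='0', flow_length=10, seed=1)
    return train_model(args, metadata=vars(args).copy(), device='cuda')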
def load_model_from_checkpoint(ckpt_path, device, model_seed=1):
    print('checkpoint path:', ckpt_path)
    # the checkpoint folder name encodes the dataset and model properties:
    # '<nps>_<ns>_<latent_dim>_<data_dim>_..._<model_name>_<epochs>'
    model_args = ckpt_path.split('/')[1]
    ckpt_filename = ckpt_path.split('/')[-1]
    model_args = model_args.split('_')
    epochs = int(model_args[-1])  # cast: the folder name stores epochs as text
    model_name = model_args[-2]
    data_args = model_args[:-2]

    data_file = create_if_not_exist_dataset(root='data/{}/'.format(model_seed),
                                            arg_str="_".join(data_args))
    nps = int(data_args[0])      # points per segment
    ns = int(data_args[1])       # number of segments
    aux_dim = int(data_args[1])  # auxiliary dim equals the number of segments (one-hot labels)
    n = nps * ns
    latent_dim = int(data_args[2])
    data_dim = int(data_args[3])

    print('Loading model', model_name)
    model_path = ckpt_path

    # the arrays are loaded here only to verify and report their shapes
    print('Loading data', data_file)
    A = np.load(data_file)
    x = A['x']  # observations, of shape [n, data_dim]
    x = torch.from_numpy(x).to(device)
    print("x.shape ==", x.shape)
    s = A['s']  # true sources, of shape [n, latent_dim]; kept as a NumPy array
    print("s.shape ==", s.shape)
    u = A['u']  # one-hot segment labels, of shape [n, ns]
    u = torch.from_numpy(u).to(device)
    print("u.shape ==", u.shape)

    checkpoint = torch.load(model_path)

    # Arguments (metadata, from argparse in main.py) have to correspond to the selected
    # dataset and model properties. Hyperparameters and configuration as prescribed in the paper.
    metadata = {
        'file': data_file,
        'path': data_file,
        'batch_size': 64,
        'epochs': epochs,
        'device': device,
        'seed': 1,
        'i_what': model_name,
        'max_iter': None,
        'hidden_dim': 50,
        'depth': 3,
        'lr': 1e-3,
        'cuda': True,
        'preload': True,
        'anneal': False,
        'log_freq': 25,
        'flow_type': 'RQNSF_AG',
        'num_bins': 8,
        'nat_param_act': 'Softplus',
        'gpu_id': '0',
        'flow_length': 10,
        'lr_drop_factor': 0.25,
        'lr_patience': 10
    }

    # dataset properties
    metadata.update({
        'nps': nps,
        'ns': ns,
        'n': n,
        'latent_dim': latent_dim,
        'data_dim': data_dim,
        'aux_dim': aux_dim
    })

    if model_name == 'iFlow':
        model = iFlow(args=metadata).to(device)
    elif model_name == 'iVAE':
        model = iVAE(latent_dim,
                     data_dim,
                     aux_dim,
                     n_layers=metadata['depth'],
                     activation='lrelu',
                     device=device,
                     hidden_dim=metadata['hidden_dim'],
                     anneal=metadata['anneal'],  # False
                     file=metadata['file'],
                     seed=1)

    model.load_state_dict(checkpoint['model_state_dict'])
    return model
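# Usage sketch (hypothetical; the helper name and path are illustrative only).
# load_model_from_checkpoint splits the second path component on '_' and reads
# <nps>, <ns>, <latent_dim>, <data_dim> from the first four fields, so the
# checkpoint must live under a folder following that naming convention. Whether
# create_if_not_exist_dataset accepts the short arg string below depends on its
# implementation; real runs may encode more dataset fields in the folder name.
def _example_load():
    ckpt_path = 'ckpt/1000_40_2_4_iFlow_20/model.pt'  # hypothetical checkpoint location
    model = load_model_from_checkpoint(ckpt_path, device='cuda')
    model.eval()  # switch to inference mode before evaluating MCC on held-out data
    return model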