def train(model, opt, train_loader, args, discriminators, writer):
    """Run one pass over ``train_loader``, updating the model and discriminators.

    For every batch the losses are obtained from ``get_losses``; the model is
    stepped on ``loss_dict["recons_loss"]`` and then each discriminator is
    stepped on ``loss_dict[f"{name}_loss"]``.

    Args:
        model: module being trained; put into training mode.
        opt: optimizer that steps ``model``.
        train_loader: iterable of batches; ``data[0]`` is used as the images.
        args: run configuration. Reads ``device``, ``print_interval`` and
            ``num_epochs``; reads and increments ``steps``.
        discriminators: dict mapping a name to ``[module, optimizer]``.
        writer: summary writer forwarded to the ``train_util`` log helpers.

    Raises:
        ValueError: if a discriminator entry lacks its module or optimizer.
    """
    model.train()
    for name, entry in discriminators.items():
        # Fail before any optimisation step instead of printing "exiting" and
        # then crashing later on ``None.zero_grad()``.
        if entry[0] is None:
            raise ValueError(f"no module defined for discriminator:{name}")
        if entry[1] is None:
            raise ValueError(f"define an optimizer for discriminator:{name}")
        entry[0].train()

    last_disc_idx = len(discriminators) - 1
    for idx, data in enumerate(train_loader):
        images = data[0].to(args.device)
        loss_dict, z = get_losses(model, images, args, discriminators)

        # Auto-encoder update. All losses come from one get_losses() call, so
        # the graph is retained whenever a discriminator backward pass follows.
        opt.zero_grad()
        loss_dict["recons_loss"].backward(retain_graph=bool(discriminators))
        opt.step()

        # Discriminator updates; only the final backward may free the graph.
        for disc_idx, name in enumerate(discriminators):
            disc_opt = discriminators[name][1]
            disc_opt.zero_grad()
            loss_dict[f"{name}_loss"].backward(
                retain_graph=disc_idx < last_disc_idx)
            disc_opt.step()

        if idx % args.print_interval == 0:
            print("iter:",
                  str(args.steps) + "/" +
                  str(args.num_epochs * (len(train_loader))),
                  "iter loss:", loss_dict["recons_loss"].item())
            for name in discriminators:
                disc_loss = loss_dict[f"{name}_loss"]
                print(f"{name} loss:{disc_loss}")
            # The messages above go to stdout, so that is the stream to flush.
            sys.stdout.flush()

        # NOTE(review): summaries are written on every iteration; confirm this
        # is the intended logging cadence.
        train_util.log_losses("train", loss_dict, args.steps, writer)
        train_util.log_latent_metrics("train", z, args.steps, writer)

        # Advance the global step so successive summaries do not all land on
        # the same step value.
        args.steps += 1
def train(epoch, data_loader, model, optimizer, args, writer, discriminators):
    """Run one pass over ``data_loader``, updating the model and discriminators.

    Args:
        epoch: current epoch index (accepted for the caller; not used here).
        data_loader: iterable of batches; ``data[0]`` is used as the images.
        model: module being trained; put into training mode.
        optimizer: optimizer that steps ``model`` on ``recons_loss``.
        args: run configuration. Reads ``device``; reads and increments
            ``steps``.
        writer: summary writer forwarded to the ``train_util`` log helpers.
        discriminators: dict mapping a name to ``[module, optimizer]``; each is
            stepped on ``loss_dict[f"{name}_loss"]``.

    Raises:
        ValueError: if a discriminator entry lacks its module or optimizer.
    """
    # Make sure the model is in training mode at the start of every epoch,
    # regardless of what mode a preceding evaluation left it in.
    model.train()
    for name, entry in discriminators.items():
        # Fail up front instead of printing "exiting" and crashing later on
        # ``None.zero_grad()``.
        if entry[0] is None:
            raise ValueError(f"no module defined for discriminator:{name}")
        if entry[1] is None:
            raise ValueError(f"define an optimizer for discriminator:{name}")
        entry[0].train()

    torch.cuda.empty_cache()
    last_disc_idx = len(discriminators) - 1
    for idx, data in enumerate(data_loader):
        images = data[0].to(args.device)
        loss_dict, z = get_losses(model, images, args, discriminators)

        # All losses come from one get_losses() call, so the graph is retained
        # whenever a discriminator backward pass still has to run.
        optimizer.zero_grad()
        loss_dict["recons_loss"].backward(retain_graph=bool(discriminators))
        optimizer.step()

        # Discriminator updates; only the final backward may free the graph.
        for disc_idx, name in enumerate(discriminators):
            disc_opt = discriminators[name][1]
            disc_opt.zero_grad()
            loss_dict[f"{name}_loss"].backward(
                retain_graph=disc_idx < last_disc_idx)
            disc_opt.step()

        if idx % 1000 == 0:
            print(f"iter:{args.steps}\trecons loss:{loss_dict['recons_loss']}",
                  file=sys.stderr)
            for name in discriminators:
                disc_loss = loss_dict[f"{name}_loss"]
                print(f"{name} loss:{disc_loss}", file=sys.stderr)
            sys.stderr.flush()

        # NOTE(review): summaries are written on every iteration; confirm this
        # is the intended logging cadence.
        train_util.log_losses("train", loss_dict, args.steps, writer)
        train_util.log_latent_metrics("train", z, args.steps, writer)

        # Peak-memory report after the first batch; skipped on hosts without
        # CUDA, where querying "cuda:0" is not possible.
        if idx == 0 and torch.cuda.is_available():
            print(torch.cuda.max_memory_allocated(torch.device("cuda:0")),
                  file=sys.stderr)
            sys.stderr.flush()

        args.steps += 1
def val_test(args, num_epochs=4):
    """Evaluate a VQ-VAE on the validation split repeatedly and log the results.

    No training step is performed: each iteration runs ``train_util.test`` on
    the validation loader, logs image grids, losses and latent metrics, saves
    the state and prints the loss dictionary.

    Args:
        args: run configuration. Reads ``output_folder``, ``hidden_size``,
            ``k``, ``enc_type``, ``dec_type``, ``lr``, ``recons_loss``,
            ``img_res``, ``input_type``, ``disc_lr``, ``device`` and
            ``weights``.
        num_epochs: exclusive upper bound of the epoch loop (default 4, the
            value that used to be hard-coded).
    """
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_filename = './models/{0}'.format(args.output_folder)

    _, valid_loader, test_loader = train_util.get_dataloaders(args)
    recons_input_img = train_util.log_input_img_grid(test_loader, writer)

    input_dim = 3
    model = VectorQuantizedVAE(input_dim, args.hidden_size, args.k,
                               args.enc_type, args.dec_type)
    # Move the model to its device before the optimizer is constructed.
    model.to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    discriminators = {}
    if args.recons_loss == "gan":
        recons_disc = Discriminator(input_dim, args.img_res,
                                    args.input_type).to(args.device)
        recons_disc_opt = torch.optim.Adam(recons_disc.parameters(),
                                           lr=args.disc_lr, amsgrad=True)
        discriminators["recons_disc"] = [recons_disc, recons_disc_opt]

    if args.weights == "load":
        start_epoch = train_util.load_state(save_filename, model, optimizer,
                                            discriminators)
    else:
        start_epoch = 0

    # NOTE(review): best_loss is never reassigned in this function, so
    # save_state always receives infinity; confirm that is intended.
    best_loss = torch.tensor(np.inf)

    for epoch in tqdm(range(start_epoch, num_epochs), file=sys.stdout):
        val_loss_dict, z = train_util.test(get_losses, model, valid_loader,
                                           args, discriminators, True)

        train_util.log_recons_img_grid(recons_input_img, model, epoch + 1,
                                       args.device, writer)
        train_util.log_interp_img_grid(recons_input_img, model, epoch + 1,
                                       args.device, writer)
        train_util.log_losses("val", val_loss_dict, epoch + 1, writer)
        train_util.log_latent_metrics("val", z, epoch + 1, writer)
        train_util.save_state(model, optimizer, discriminators,
                              val_loss_dict["recons_loss"], best_loss,
                              args.recons_loss, epoch, save_filename)
        print(val_loss_dict)
def main(args):
    """Build an ACAI model with its discriminators and run the training loop.

    Args:
        args: run configuration. Reads ``output_folder``, ``img_res``,
            ``hidden_size``, ``enc_type``, ``dec_type``, ``input_type``,
            ``device``, ``lr``, ``disc_lr``, ``recons_loss``, ``weights`` and
            ``num_epochs``.

    Raises:
        ValueError: if ``args.recons_loss`` is not ``"mse"`` and matches none
            of the supported reconstruction-discriminator kinds.
    """
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_filename = './models/{0}'.format(args.output_folder)

    train_loader, val_loader, test_loader = train_util.get_dataloaders(args)
    recons_input_img = train_util.log_input_img_grid(test_loader, writer)

    input_dim = 3
    model = ACAI(args.img_res, input_dim, args.hidden_size, args.enc_type,
                 args.dec_type).to(args.device)
    disc = Discriminator(input_dim, args.img_res,
                         args.input_type).to(args.device)
    disc_opt = torch.optim.Adam(disc.parameters(), lr=args.disc_lr,
                                amsgrad=True)
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    discriminators = {"interp_disc": [disc, disc_opt]}

    if args.recons_loss != "mse":
        if args.recons_loss == "gan":
            recons_disc = Discriminator(input_dim, args.img_res,
                                        args.input_type)
        elif args.recons_loss == "comp":
            recons_disc = AnchorComparator(input_dim * 2, args.img_res,
                                           args.input_type)
        elif "comp_2" in args.recons_loss:
            recons_disc = ClubbedPermutationComparator(
                input_dim * 2, args.img_res, args.input_type)
        elif "comp_6" in args.recons_loss:
            recons_disc = FullPermutationComparator(
                input_dim * 2, args.img_res, args.input_type)
        else:
            # Without this branch an unknown value would surface as an
            # unbound-local error on recons_disc a few lines below.
            raise ValueError(
                f"unsupported recons_loss: {args.recons_loss!r}")
        recons_disc = recons_disc.to(args.device)
        recons_disc_opt = torch.optim.Adam(recons_disc.parameters(),
                                           lr=args.disc_lr, amsgrad=True)
        discriminators["recons_disc"] = [recons_disc, recons_disc_opt]

    # Generate the samples first once
    train_util.save_recons_img_grid("test", recons_input_img, model, 0, args)

    if args.weights == "load":
        start_epoch = train_util.load_state(save_filename, model, opt,
                                            discriminators)
    else:
        start_epoch = 0

    # NOTE(review): best_loss is never reassigned in this function, so
    # save_state always receives infinity; confirm that is intended.
    best_loss = torch.tensor(np.inf)

    # Resume from the epoch returned by load_state instead of restarting at 0.
    for epoch in range(start_epoch, args.num_epochs):
        print("Epoch {}:".format(epoch))
        train(model, opt, train_loader, args, discriminators, writer)

        val_loss_dict, z = train_util.test(get_losses, model, val_loader,
                                           args, discriminators)

        train_util.log_losses("val", val_loss_dict, epoch + 1, writer)
        train_util.log_latent_metrics("val", z, epoch + 1, writer)
        train_util.save_recons_img_grid("val", recons_input_img, model,
                                        epoch + 1, args)
        train_util.save_interp_img_grid("val", recons_input_img, model,
                                        epoch + 1, args)

        train_util.save_state(model, opt, discriminators,
                              val_loss_dict["recons_loss"], best_loss,
                              args.recons_loss, epoch, save_filename)
def main(args):
    """Build a VQ-VAE with an optional reconstruction discriminator and train it.

    After every epoch the model is evaluated on the validation loader, image
    grids and metrics are logged, the state is saved and the plateau
    schedulers are stepped on the validation losses.

    Args:
        args: run configuration. Reads ``output_folder``, ``hidden_size``,
            ``k``, ``enc_type``, ``dec_type``, ``device``, ``recons_loss``,
            ``img_res``, ``input_type``, ``disc_lr``, ``lr``, ``lr_patience``,
            ``threshold``, ``weights``, ``num_epochs`` and ``batch_size``.

    Raises:
        ValueError: if ``args.recons_loss`` is not ``"mse"`` and matches none
            of the supported reconstruction-discriminator kinds.
        SystemExit: with status 1 when training raises ``RuntimeError``.
    """
    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_filename = './models/{0}'.format(args.output_folder)

    train_loader, valid_loader, test_loader = train_util.get_dataloaders(args)

    input_dim = 3
    model = VectorQuantizedVAE(input_dim, args.hidden_size, args.k,
                               args.enc_type, args.dec_type)
    model.to(args.device)

    discriminators = {}
    recons_disc_lr_scheduler = None
    if args.recons_loss != "mse":
        if args.recons_loss == "gan":
            recons_disc = Discriminator(input_dim, args.img_res,
                                        args.input_type)
        elif args.recons_loss == "comp":
            recons_disc = AnchorComparator(input_dim * 2, args.img_res,
                                           args.input_type)
        elif "comp_2" in args.recons_loss:
            recons_disc = ClubbedPermutationComparator(
                input_dim * 2, args.img_res, args.input_type)
        elif "comp_6" in args.recons_loss:
            recons_disc = FullPermutationComparator(
                input_dim * 2, args.img_res, args.input_type)
        else:
            # Without this branch an unknown value would surface as an
            # unbound-local error on recons_disc a few lines below.
            raise ValueError(
                f"unsupported recons_loss: {args.recons_loss!r}")
        recons_disc = recons_disc.to(args.device)
        recons_disc_opt = torch.optim.Adam(recons_disc.parameters(),
                                           lr=args.disc_lr, amsgrad=True)
        recons_disc_lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            recons_disc_opt, "min", patience=args.lr_patience, factor=0.5,
            threshold=args.threshold, threshold_mode="abs", min_lr=1e-7)
        discriminators["recons_disc"] = [recons_disc, recons_disc_opt]

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    ae_lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", patience=args.lr_patience, factor=0.5,
        threshold=args.threshold, threshold_mode="abs", min_lr=1e-7)

    if torch.cuda.device_count() > 1:
        model = train_util.ae_data_parallel(model)
        for name in discriminators:
            discriminators[name][0] = torch.nn.DataParallel(
                discriminators[name][0])

    model.to(args.device)
    for name in discriminators:
        discriminators[name][0].to(args.device)

    # Fixed images for Tensorboard; the step-0 grids are generated once, with
    # the model in the form it will be trained in.
    recons_input_img = train_util.log_input_img_grid(test_loader, writer)
    train_util.log_recons_img_grid(recons_input_img, model, 0, args.device,
                                   writer)

    if args.weights == "load":
        start_epoch = train_util.load_state(save_filename, model, optimizer,
                                            discriminators)
    else:
        start_epoch = 0

    # TODO: early stopping is not implemented; args.stop_patience is unused.
    # NOTE(review): best_loss is never reassigned in this function, so
    # save_state always receives infinity; confirm that is intended.
    best_loss = torch.tensor(np.inf)

    for epoch in tqdm(range(start_epoch, args.num_epochs), file=sys.stdout):
        try:
            train(epoch, train_loader, model, optimizer, args, writer,
                  discriminators)
        except RuntimeError as err:
            print("".join(
                traceback.TracebackException.from_exception(err).format()),
                file=sys.stderr)
            print("*******", file=sys.stderr)
            print(err, file=sys.stderr)
            print(f"batch_size:{args.batch_size}", file=sys.stderr)
            # Report the failure to the calling process with a non-zero status.
            sys.exit(1)

        val_loss_dict, z = train_util.test(get_losses, model, valid_loader,
                                           args, discriminators)

        train_util.log_recons_img_grid(recons_input_img, model, epoch + 1,
                                       args.device, writer)
        train_util.log_interp_img_grid(recons_input_img, model, epoch + 1,
                                       args.device, writer)
        train_util.log_losses("val", val_loss_dict, epoch + 1, writer)
        train_util.log_latent_metrics("val", z, epoch + 1, writer)
        train_util.save_state(model, optimizer, discriminators,
                              val_loss_dict["recons_loss"], best_loss,
                              args.recons_loss, epoch, save_filename)

        ae_lr_scheduler.step(val_loss_dict["recons_loss"])
        if recons_disc_lr_scheduler is not None:
            recons_disc_lr_scheduler.step(val_loss_dict["recons_disc_loss"])
def _nvidia_smi_report():
    """Return the text printed by ``nvidia-smi``, or a note if it cannot run."""
    try:
        result = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
    except OSError as err:
        # Raised when the executable is missing, e.g. on a CPU-only host.
        return f"nvidia-smi unavailable: {err}"
    return result.stdout


def train(epoch, data_loader, model, optimizer, args, writer, discriminators):
    """Run one pass over ``data_loader``, updating the model and discriminators.

    Each batch is handed to ``get_losses`` as-is (no device transfer happens
    here). The model is stepped on ``loss_dict["recons_loss"]`` and every
    discriminator on ``loss_dict[f"{name}_loss"]``.

    Args:
        epoch: current epoch index (accepted for the caller; not used here).
        data_loader: iterable of batches.
        model: module being trained; put into training mode.
        optimizer: optimizer that steps ``model``.
        args: run configuration. Reads ``print_interval``, ``num_epochs`` and
            ``device``; reads and increments ``steps``.
        writer: summary writer forwarded to the ``train_util`` log helpers.
        discriminators: dict mapping a name to ``[module, optimizer]``.

    Raises:
        ValueError: if a discriminator entry lacks its module or optimizer.
    """
    model.train()
    for name, entry in discriminators.items():
        # Fail up front instead of printing "exiting" and crashing later on
        # ``None.zero_grad()``.
        if entry[0] is None:
            raise ValueError(f"no module defined for discriminator:{name}")
        if entry[1] is None:
            raise ValueError(f"define an optimizer for discriminator:{name}")
        entry[0].train()

    last_disc_idx = len(discriminators) - 1
    for idx, data in enumerate(data_loader):
        loss_dict, z = get_losses(model, data, args, discriminators)

        # Retain the graph only when a discriminator backward pass follows;
        # with no discriminators there is nothing left to back-propagate.
        optimizer.zero_grad()
        loss_dict["recons_loss"].backward(retain_graph=bool(discriminators))
        optimizer.step()

        # Discriminator updates; only the final backward may free the graph.
        for disc_idx, name in enumerate(discriminators):
            disc_opt = discriminators[name][1]
            disc_opt.zero_grad()
            loss_dict[f"{name}_loss"].backward(
                retain_graph=disc_idx < last_disc_idx)
            disc_opt.step()

        if idx % args.print_interval == 0:
            print("iter:",
                  str(args.steps) + "/" +
                  str(args.num_epochs * (len(data_loader))),
                  "iter loss:", loss_dict["recons_loss"].item(),
                  file=sys.stderr)
            # Print the captured tool output rather than the repr of the
            # CompletedProcess object.
            print(f"memory:\n{_nvidia_smi_report()}")
            for name in discriminators:
                disc_loss = loss_dict[f"{name}_loss"]
                print(f"{name} loss:{disc_loss}", file=sys.stderr)
            sys.stderr.flush()

        # NOTE(review): summaries are written on every iteration; confirm this
        # is the intended logging cadence.
        train_util.log_losses("train", loss_dict, args.steps, writer)
        train_util.log_latent_metrics("train", z, args.steps, writer)

        # Peak-memory report after the first batch, only for CUDA devices.
        if (idx == 0 and torch.cuda.is_available()
                and torch.device(args.device).type == "cuda"):
            print(torch.cuda.max_memory_allocated(torch.device(args.device)),
                  file=sys.stderr)
            sys.stderr.flush()

        args.steps += 1
def main(args):
    """Build a VAE with an optional reconstruction discriminator and train it.

    After every epoch the model is evaluated on the validation loader, image
    grids are saved, losses and latent metrics are logged and the state is
    saved.

    Args:
        args: run configuration. Reads ``hidden_size``, ``enc_type``,
            ``dec_type``, ``lr``, ``recons_loss``, ``img_res``,
            ``input_type``, ``device``, ``disc_lr``, ``dataset``, ``weights``
            and ``num_epochs``; reads ``output_folder`` and fills it in when
            it is ``None``.

    Raises:
        ValueError: if ``args.recons_loss`` is not ``"mse"`` and matches none
            of the supported reconstruction-discriminator kinds.
        SystemExit: with status 1 when training raises ``RuntimeError``.
    """
    input_dim = 3
    model = VAE(input_dim, args.hidden_size, args.enc_type, args.dec_type)
    # Move the model to its device before the optimizer is constructed.
    model.to(args.device)
    opt = torch.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    discriminators = {}
    if args.recons_loss != "mse":
        if args.recons_loss == "gan":
            recons_disc = Discriminator(input_dim, args.img_res,
                                        args.input_type)
        elif args.recons_loss == "comp":
            recons_disc = AnchorComparator(input_dim * 2, args.img_res,
                                           args.input_type)
        elif "comp_2" in args.recons_loss:
            recons_disc = ClubbedPermutationComparator(
                input_dim * 2, args.img_res, args.input_type)
        elif "comp_6" in args.recons_loss:
            recons_disc = FullPermutationComparator(
                input_dim * 2, args.img_res, args.input_type)
        else:
            # Without this branch an unknown value would surface as an
            # unbound-local error on recons_disc a few lines below.
            raise ValueError(
                f"unsupported recons_loss: {args.recons_loss!r}")
        recons_disc = recons_disc.to(args.device)
        recons_disc_opt = torch.optim.Adam(recons_disc.parameters(),
                                           lr=args.disc_lr, amsgrad=True)
        discriminators["recons_disc"] = [recons_disc, recons_disc_opt]

    if torch.cuda.device_count() > 1:
        model = train_util.ae_data_parallel(model)
        for name in discriminators:
            discriminators[name][0] = torch.nn.DataParallel(
                discriminators[name][0])

    model.to(args.device)
    for name in discriminators:
        discriminators[name][0].to(args.device)
    print("model built", file=sys.stderr)

    train_loader, val_loader, test_loader = train_util.get_dataloaders(args)
    print("loaders acquired", file=sys.stderr)

    model_name = f"vae_{args.recons_loss}"
    if args.output_folder is None:
        args.output_folder = os.path.join(
            model_name, args.dataset,
            f"depth_{args.enc_type}_{args.dec_type}_hs_{args.img_res}_{args.hidden_size}"
        )

    log_save_path = os.path.join("./logs", args.output_folder)
    model_save_path = os.path.join("./models", args.output_folder)
    os.makedirs(log_save_path, exist_ok=True)
    print(f"log:{log_save_path}", file=sys.stderr)
    sys.stderr.flush()
    os.makedirs(model_save_path, exist_ok=True)

    writer = SummaryWriter(log_save_path)

    print(f"train loader length:{len(train_loader)}", file=sys.stderr)

    # NOTE(review): best_loss is never reassigned in this function, so
    # save_state always receives infinity; confirm that is intended.
    best_loss = torch.tensor(np.inf)

    if args.weights == "load":
        start_epoch = train_util.load_state(model_save_path, model, opt,
                                            discriminators)
    else:
        start_epoch = 0

    recons_input_img = train_util.log_input_img_grid(test_loader, writer)
    train_util.log_recons_img_grid(recons_input_img, model, 0, args.device,
                                   writer)

    for epoch in range(start_epoch, args.num_epochs):
        try:
            # Passed by keyword: the earlier positional order did not match
            # train(epoch, data_loader, model, optimizer, args, writer,
            # discriminators).
            train(epoch=epoch, data_loader=train_loader, model=model,
                  optimizer=opt, args=args, writer=writer,
                  discriminators=discriminators)
        except RuntimeError as err:
            print("".join(
                traceback.TracebackException.from_exception(err).format()),
                file=sys.stderr)
            print("*******", file=sys.stderr)
            print(err, file=sys.stderr)
            # Report the failure to the calling process with a non-zero status.
            sys.exit(1)

        val_loss_dict, z = train_util.test(get_losses, model, val_loader,
                                           args, discriminators)
        print(f"epoch loss:{val_loss_dict['recons_loss'].item()}")

        train_util.save_recons_img_grid("test", recons_input_img, model,
                                        epoch + 1, args)
        train_util.save_interp_img_grid("test", recons_input_img, model,
                                        epoch + 1, args)

        train_util.log_losses("val", val_loss_dict, epoch + 1, writer)
        train_util.log_latent_metrics("val", z, epoch + 1, writer)
        train_util.save_state(model, opt, discriminators,
                              val_loss_dict["recons_loss"], best_loss,
                              args.recons_loss, epoch, model_save_path)