def main(args):
    """Build (or restore) the model, prepare data and metric containers, and run evaluation."""
    global step, epoch, result_dict

    if args.checkpoint is not None:
        print("############# Get Old Args ##############")
        model, epoch, step, ckpt_args = Checkpoint.restore(args.checkpoint)
        # Carry every model-defining option over from the checkpoint so the
        # restored weights are evaluated with a matching configuration.
        for attr in ("model", "model_input", "viewpoint", "rot_rep", "feat_dim",
                     "voxel_dim", "small_decoder", "reconstruction",
                     "unet_output", "depth_sculpt", "best_loss"):
            setattr(args, attr, getattr(ckpt_args, attr))
    else:
        model = get_model(args.model, args.rot_rep, args.model_input,
                          args.unet_output, args.pretrained, args.no_refinement)

    print(args)

    loader = get_loaders(
        name=args.dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        split=args.split,
        rot_rep=args.rot_rep,
        n_views=args.n_views,
        corrupt_vp=args.corrupt_vp,
    )
    # Touch one sample up front (same as the original explicit __getitem__(0) call);
    # presumably triggers dataset-side lazy initialization — TODO confirm.
    loader.dataset[0]

    # initialize result dictionaries
    loggable, printable = get_metrics(
        args.reconstruction,
        args.viewpoint,
        args.realism_check,
        args.unet_output,
    )
    result_dict = ResultDict(loader.dataset, loggable, printable)

    # Train on GPU
    model.cuda()

    print("############# Start Evaluation ##############")
    eval_step(model, loader, args.split)
def main(eval_args):
    """Load an NVAE checkpoint and either evaluate its likelihood or draw samples."""
    logging = utils.Logger(eval_args.local_rank, eval_args.save)

    # load a checkpoint
    logging.info('loading the model at:')
    logging.info(eval_args.checkpoint)
    ckpt = torch.load(eval_args.checkpoint, map_location='cpu')
    args = ckpt['args']

    # Older checkpoints predate some config fields; patch in their defaults.
    legacy_defaults = (
        ('ada_groups', False, 'old model, no ada groups was found.'),
        ('min_groups_per_scale', 1, 'old model, no min_groups_per_scale was found.'),
        ('num_mixture_dec', 10, 'old model, no num_mixture_dec was found.'),
    )
    for field, default, message in legacy_defaults:
        if not hasattr(args, field):
            logging.info(message)
            setattr(args, field, default)

    logging.info('loaded the model at epoch %d', ckpt['epoch'])
    model = AutoEncoder(args, None, utils.get_arch_cells(args.arch_instance))
    # Loading is not strict because of self.weight_normalized in Conv2D class in
    # neural_operations. This variable is only used for computing the spectral
    # normalization and it is safe not to load it. Some of our earlier models
    # did not have this variable.
    model.load_state_dict(ckpt['state_dict'], strict=False)
    model = model.cuda()

    logging.info('args = %s', args)
    logging.info('num conv layers: %d', len(model.all_conv_layers))
    logging.info('param size = %fM ', utils.count_parameters_in_M(model))

    if eval_args.eval_mode == 'evaluate':
        # load train valid queue
        args.data = eval_args.data
        train_queue, valid_queue, num_classes = datasets.get_loaders(args)

        if eval_args.eval_on_train:
            logging.info('Using the training data for eval.')
            valid_queue = train_queue

        # get number of bits: nats -> bits-per-dimension conversion factor
        bpd_coeff = 1. / np.log(2.) / utils.num_output(args.dataset)

        valid_neg_log_p, valid_nelbo = test(
            valid_queue, model, num_samples=eval_args.num_iw_samples,
            args=args, logging=logging)
        logging.info('final valid nelbo %f', valid_nelbo)
        logging.info('final valid neg log p %f', valid_neg_log_p)
        logging.info('final valid nelbo in bpd %f', valid_nelbo * bpd_coeff)
        logging.info('final valid neg log p in bpd %f', valid_neg_log_p * bpd_coeff)
    else:
        bn_eval_mode = not eval_args.readjust_bn
        num_samples = 16
        with torch.no_grad():
            n = int(np.floor(np.sqrt(num_samples)))
            set_bn(model, bn_eval_mode, num_samples=36, t=eval_args.temp, iter=500)
            for _rep in range(10):  # sampling is repeated.
                torch.cuda.synchronize()
                t_start = time()
                with autocast():
                    logits = model.sample(num_samples, eval_args.temp)
                    output = model.decoder_output(logits)
                    # Bernoulli decoders: show the mean; otherwise draw a sample.
                    if isinstance(output, torch.distributions.bernoulli.Bernoulli):
                        output_img = output.mean
                    else:
                        output_img = output.sample()
                torch.cuda.synchronize()
                t_end = time()

                grid = utils.tile_image(output_img, n).cpu().numpy().transpose(1, 2, 0)
                logging.info('sampling time per batch: %0.3f sec', (t_end - t_start))
                grid = np.squeeze(np.asarray(grid * 255, dtype=np.uint8))
                plt.imshow(grid)
                plt.show()
def main(args):
    """Train an NVAE autoencoder end-to-end and run a final large-sample validation.

    Seeds all RNGs, builds the model/optimizer/scheduler, optionally resumes
    from ``<args.save>/checkpoint.pt``, then runs the epoch loop with periodic
    sampling, validation and checkpointing. Logs scalars/images to tensorboard.
    """
    # ensures that weight initializations are all the same
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    logging = utils.Logger(args.global_rank, args.save)
    writer = utils.Writer(args.global_rank, args.save)

    # Get data loaders.
    train_queue, valid_queue, num_classes, _ = datasets.get_loaders(args)
    args.num_total_iter = len(train_queue) * args.epochs
    warmup_iters = len(train_queue) * args.warmup_epochs
    # NOTE(review): `swa_start` is computed but never used below — confirm
    # whether stochastic weight averaging was removed or is handled elsewhere.
    swa_start = len(train_queue) * (args.epochs - 1)

    arch_instance = utils.get_arch_cells(args.arch_instance)
    model = AutoEncoder(args, writer, arch_instance)
    model = model.cuda()

    logging.info('args = %s', args)
    logging.info('param size = %fM ', utils.count_parameters_in_M(model))
    logging.info('groups per scale: %s, total_groups: %d',
                 model.groups_per_scale, sum(model.groups_per_scale))

    if args.fast_adamax:
        # Fast adamax has the same functionality as torch.optim.Adamax, except it is faster.
        cnn_optimizer = Adamax(model.parameters(), args.learning_rate,
                               weight_decay=args.weight_decay, eps=1e-3)
    else:
        cnn_optimizer = torch.optim.Adamax(model.parameters(), args.learning_rate,
                                           weight_decay=args.weight_decay, eps=1e-3)

    # Cosine anneal only over the post-warmup epochs (stepped manually below).
    cnn_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        cnn_optimizer, float(args.epochs - args.warmup_epochs - 1), eta_min=args.learning_rate_min)
    grad_scalar = GradScaler(2**10)

    num_output = utils.num_output(args.dataset, args)
    # nats -> bits-per-dimension conversion factor
    bpd_coeff = 1. / np.log(2.) / num_output

    # if load: resume full training state (model/optimizer/scaler/scheduler/step)
    checkpoint_file = os.path.join(args.save, 'checkpoint.pt')
    if args.cont_training:
        logging.info('loading the model.')
        checkpoint = torch.load(checkpoint_file, map_location='cpu')
        init_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        model = model.cuda()
        cnn_optimizer.load_state_dict(checkpoint['optimizer'])
        grad_scalar.load_state_dict(checkpoint['grad_scalar'])
        cnn_scheduler.load_state_dict(checkpoint['scheduler'])
        global_step = checkpoint['global_step']
    else:
        global_step, init_epoch = 0, 0

    for epoch in range(init_epoch, args.epochs):
        # update lrs.
        if args.distributed:
            # Reseed the distributed samplers so each epoch shuffles differently.
            train_queue.sampler.set_epoch(global_step + args.seed)
            valid_queue.sampler.set_epoch(0)

        if epoch > args.warmup_epochs:
            cnn_scheduler.step()

        # Logging.
        logging.info('epoch %d', epoch)

        # Training.
        train_nelbo, global_step = train(train_queue, model, cnn_optimizer,
                                         grad_scalar, global_step, warmup_iters,
                                         writer, logging)
        logging.info('train_nelbo %f', train_nelbo)
        writer.add_scalar('train/nelbo', train_nelbo, global_step)

        model.eval()
        # generate samples less frequently
        eval_freq = 1 if args.epochs <= 50 else 20
        if epoch % eval_freq == 0 or epoch == (args.epochs - 1):
            with torch.no_grad():
                num_samples = 16
                n = int(np.floor(np.sqrt(num_samples)))
                # Sample a 4x4 grid at several temperatures for visual inspection.
                for t in [0.7, 0.8, 0.9, 1.0]:
                    logits = model.sample(num_samples, t)
                    output = model.decoder_output(logits)
                    # Bernoulli decoders: show the mean; otherwise sample
                    # (presumably `sample(t)` applies the temperature — TODO confirm).
                    output_img = output.mean if isinstance(
                        output, torch.distributions.bernoulli.Bernoulli
                    ) else output.sample(t)
                    output_tiled = utils.tile_image(output_img, n)
                    writer.add_image('generated_%0.1f' % t, output_tiled, global_step)

            valid_neg_log_p, valid_nelbo = test(valid_queue, model, num_samples=10,
                                                args=args, logging=logging)
            logging.info('valid_nelbo %f', valid_nelbo)
            logging.info('valid neg log p %f', valid_neg_log_p)
            logging.info('valid bpd elbo %f', valid_nelbo * bpd_coeff)
            logging.info('valid bpd log p %f', valid_neg_log_p * bpd_coeff)
            writer.add_scalar('val/neg_log_p', valid_neg_log_p, epoch)
            writer.add_scalar('val/nelbo', valid_nelbo, epoch)
            writer.add_scalar('val/bpd_log_p', valid_neg_log_p * bpd_coeff, epoch)
            writer.add_scalar('val/bpd_elbo', valid_nelbo * bpd_coeff, epoch)

        save_freq = int(np.ceil(args.epochs / 100))
        if epoch % save_freq == 0 or epoch == (args.epochs - 1):
            # Only rank 0 writes checkpoints.
            if args.global_rank == 0:
                logging.info('saving the model.')
                torch.save(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': cnn_optimizer.state_dict(),
                        'global_step': global_step,
                        'args': args,
                        'arch_instance': arch_instance,
                        'scheduler': cnn_scheduler.state_dict(),
                        'grad_scalar': grad_scalar.state_dict()
                    }, checkpoint_file)

    # Final validation with many more importance samples for a tighter bound.
    valid_neg_log_p, valid_nelbo = test(valid_queue, model, num_samples=1000,
                                        args=args, logging=logging)
    logging.info('final valid nelbo %f', valid_nelbo)
    logging.info('final valid neg log p %f', valid_neg_log_p)
    writer.add_scalar('val/neg_log_p', valid_neg_log_p, epoch + 1)
    writer.add_scalar('val/nelbo', valid_nelbo, epoch + 1)
    writer.add_scalar('val/bpd_log_p', valid_neg_log_p * bpd_coeff, epoch + 1)
    writer.add_scalar('val/bpd_elbo', valid_nelbo * bpd_coeff, epoch + 1)
    writer.close()
def main():
    """Parse CLI arguments, build a small-image classifier, and train or evaluate it.

    Modes:
        train: runs `train()` for `--n_epochs`, checkpointing every `--save_every` epochs.
        test:  runs `test()` once on the test split.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", type=str, default="train")
    parser.add_argument("--model", type=str, default="mobilenet_v2")
    parser.add_argument("--dataset", type=str, default="cifar10")
    parser.add_argument("--dataroot", type=str, default="/tmp/data")
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--n_epochs", type=int, default=100)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--n_gpus", type=int, default=1)
    parser.add_argument("--checkpoint", type=str, default="/tmp/chkpt.pth.tar")
    parser.add_argument("--save_every", type=int, default=10)
    parser.add_argument("--pretrained", type=str, default=None)
    args = parser.parse_args()
    print(args)

    if torch.cuda.is_available():
        print("cuda is available, use cuda")
        device = torch.device("cuda")
    else:
        print("cuda is not available, use cpu")
        device = torch.device("cpu")

    print("download dataset: {}".format(args.dataset))
    train_loader, test_loader, n_classes = get_loaders(
        dataset=args.dataset, root=args.dataroot, batch_size=args.batch_size)

    print("build model: {}".format(args.model))
    if args.model == "mobilenet":
        from models import MobileNet
        model = MobileNet(n_classes=n_classes)
    elif args.model == "mobilenet_v2":
        from models import MobileNet_v2
        model = MobileNet_v2(n_classes=n_classes)
    elif args.model == "shufflenet":
        from models import ShuffleNet
        model = ShuffleNet(n_classes=n_classes)
    elif args.model == "shufflenet_v2":
        from models import ShuffleNet_v2
        model = ShuffleNet_v2(n_classes=n_classes)
    elif args.model == "squeezenet":
        from models import SqueezeNet
        model = SqueezeNet(n_classes=n_classes)
    else:
        raise NotImplementedError
    model = model.to(device)

    if args.pretrained:
        # NOTE(review): weights are loaded from --checkpoint, not from the
        # --pretrained path itself; --pretrained acts only as an on/off flag.
        # Confirm this is intended before changing it.
        model.load_state_dict(torch.load(args.checkpoint))

    if args.n_gpus > 1:
        model = nn.DataParallel(model, device_ids=list(range(args.n_gpus)))

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss()

    if args.mode == "train":
        for epoch in range(args.n_epochs):
            train(epoch, model, optimizer, criterion, train_loader, device)
            if (epoch + 1) % args.save_every == 0:
                print("saving model...")
                # Fixed: previously saved `the_model`, an undefined name that
                # raised NameError at the first checkpoint.
                torch.save(model.state_dict(), args.checkpoint)
    elif args.mode == "test":
        test(model, criterion, test_loader, device)
    else:
        raise NotImplementedError
def _run_stage(tb, stage, n_epochs, model, optimizer, criterion,
               train_loader, valid_loader, device, logdir):
    """Run one training stage: train/validate per epoch, log to tensorboard,
    and keep the 3 best checkpoints by validation accuracy."""
    checkpointer = CheckpointManager(
        logdir=logdir / stage,
        metric="accuracy",
        metric_minimization=False,
        save_n_best=3,
    )
    for ep in range(1, n_epochs + 1):
        print(f"[Epoch {ep}/{n_epochs}]")
        train_loss, train_acc = train_fn(
            model, train_loader, device, criterion, optimizer
        )
        valid_loss, valid_acc = valid_fn(model, valid_loader, device, criterion)

        # log metrics
        tb.metric(f"{stage}/loss", {"train": train_loss, "valid": valid_loss}, ep)
        tb.metric(
            f"{stage}/accuracy",
            {"train": train_acc, "valid": valid_acc},
            ep,
        )

        epoch_metrics = {
            "train_loss": train_loss,
            "train_accuracy": train_acc,
            "valid_loss": valid_loss,
            "valid_accuracy": valid_acc,
        }

        # store checkpoints
        checkpointer.process(
            score=valid_acc,
            epoch=ep,
            checkpoint=make_checkpoint(
                stage,
                ep,
                model,
                optimizer,
                metrics=epoch_metrics,
            ),
        )

        print()
        print(f" train loss - {train_loss:.5f}")
        print(f"train dataset accuracy - {train_acc:.5f}")
        print(f" valid loss - {valid_loss:.5f}")
        print(f"valid dataset accuracy - {valid_acc:.5f}")
        print()


def experiment(logdir: str, device: str):
    """Two-stage training experiment: train SimpleNet from scratch (stage0),
    then fine-tune the best stage0 checkpoint at a lower learning rate (stage1).

    NOTE(review): despite the `str` annotation, `logdir` is used with the `/`
    operator and therefore must be a pathlib.Path — confirm and fix the hint.

    The duplicated per-stage loop has been factored into `_run_stage`;
    per-epoch behavior and console/tensorboard output are unchanged.
    """
    tb_logdir = logdir / "tensorboard"

    seed_all()
    model = SimpleNet().to(device)
    optimizer = optim.AdamW(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    train_loader, valid_loader = get_loaders("")

    with TensorboardLogger(tb_logdir) as tb:
        # stage0: train from scratch with AdamW
        _run_stage(tb, "stage0", 10, model, optimizer, criterion,
                   train_loader, valid_loader, device, logdir)

        # do a next training stage: fine-tune best stage0 weights at lower lr
        stage = "stage1"
        print(f"\n\nStage - {stage}")
        load_checkpoint(logdir / "stage0" / "best.pth", model)
        optimizer = optim.Adam(model.parameters(), lr=1e-4 / 2)
        _run_stage(tb, stage, 10, model, optimizer, criterion,
                   train_loader, valid_loader, device, logdir)

    # leave the model holding the best stage1 weights
    load_checkpoint(logdir / "stage1" / "best.pth", model)
def experiment(rank, world_size, logdir):
    """Experiment flow.

    Two-stage distributed (DDP) training of SimpleNet with SyncBatchNorm:
    stage0 trains from scratch, stage1 fine-tunes the best stage0 checkpoint
    at a lower learning rate. Only rank 0 prints and writes tensorboard data.

    Args:
        rank (int): process rank
        world_size (int): world size
        logdir (pathlib.Path): directory with logs
    """
    # preparations
    torch.cuda.set_device(rank)
    setup(rank, world_size)

    logdir = Path(logdir) if isinstance(logdir, str) else logdir
    tb_logdir = logdir / "tensorboard"

    main_metric = "accuracy"
    minimize_metric = False

    # rank-0-only console logging helper
    def log(text):
        if rank == 0:
            print(text)

    train_loader, valid_loader = get_loaders("", rank, world_size)
    world_setup = (rank, world_size)

    # running batch counters carried across stages so tensorboard iteration
    # indices are globally monotonic
    train_batch_cnt = 0
    valid_batch_cnt = 0
    with TensorboardLogger(str(tb_logdir), write_to_disk=(rank == 0)) as tb:
        stage = "stage0"
        n_epochs = 2
        log(f"Stage - {stage}")

        seed_all()
        model = SimpleNet()
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        log("Used sync batchnorm")
        model = model.to(rank)
        model = nn.parallel.DistributedDataParallel(model, device_ids=[rank])

        optimizer = optim.AdamW(model.parameters(), lr=1e-3)
        criterion = nn.CrossEntropyLoss()

        # keeps the 3 best checkpoints by validation accuracy
        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=3,
        )

        for ep in range(1, n_epochs + 1):
            log(f"[Epoch {ep}/{n_epochs}]")
            train_metrics = train_fn(
                model,
                train_loader,
                world_setup,
                criterion,
                optimizer,
                tb_logger=tb,
                last_iteration_index=train_batch_cnt,
            )
            if rank == 0:
                tb.add_scalars(f"{stage}/train", train_metrics, ep)
            train_batch_cnt += len(train_loader)
            valid_metrics = valid_fn(
                model,
                valid_loader,
                world_setup,
                criterion,
                tb_logger=tb,
                last_iteration_index=valid_batch_cnt,
            )
            valid_batch_cnt += len(valid_loader)
            if rank == 0:
                tb.add_scalars(f"{stage}/valid", valid_metrics, ep)

            # store checkpoints
            checkpointer.process(
                score=valid_metrics[main_metric],
                epoch=ep,
                checkpoint=make_checkpoint(
                    stage,
                    ep,
                    model,
                    optimizer,
                    metrics={
                        "train": train_metrics,
                        "valid": valid_metrics
                    },
                ),
            )

            log("[{}/{}] train: loss - {}, accuracy - {}".format(
                ep, n_epochs, train_metrics["loss"], train_metrics["accuracy"]))
            log("[{}/{}] valid: loss - {}, accuracy - {}".format(
                ep, n_epochs, valid_metrics["loss"], valid_metrics["accuracy"]))

        # do a next training stage
        stage = "stage1"
        n_epochs = 3
        log("*" * 100)
        log(f"Stage - {stage}")

        # wait other processes (ensure stage0 checkpoints are fully written
        # before every rank loads "best.pth")
        dist.barrier()

        model = SimpleNet()
        load_checkpoint(logdir / "stage0" / "best.pth", model, verbose=True)
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        model = model.to(rank)
        model = nn.parallel.DistributedDataParallel(model, device_ids=[rank])

        # fine-tune at a lower learning rate
        optimizer = optim.Adam(model.parameters(), lr=1e-4 / 2)

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=3,
        )

        for ep in range(1, n_epochs + 1):
            log(f"[Epoch {ep}/{n_epochs}]")
            train_metrics = train_fn(
                model,
                train_loader,
                world_setup,
                criterion,
                optimizer,
                tb_logger=tb,
                last_iteration_index=train_batch_cnt,
            )
            if rank == 0:
                tb.add_scalars(f"{stage}/train", train_metrics, ep)
            train_batch_cnt += len(train_loader)
            valid_metrics = valid_fn(
                model,
                valid_loader,
                world_setup,
                criterion,
                tb_logger=tb,
                last_iteration_index=valid_batch_cnt,
            )
            valid_batch_cnt += len(valid_loader)
            if rank == 0:
                tb.add_scalars(f"{stage}/valid", valid_metrics, ep)

            # store checkpoints
            checkpointer.process(
                score=valid_metrics[main_metric],
                epoch=ep,
                checkpoint=make_checkpoint(
                    stage,
                    ep,
                    model,
                    optimizer,
                    metrics={
                        "train": train_metrics,
                        "valid": valid_metrics
                    },
                ),
            )

            log("[{}/{}] train: loss - {}, accuracy - {}".format(
                ep, n_epochs, train_metrics["loss"], train_metrics["accuracy"]))
            log("[{}/{}] valid: loss - {}, accuracy - {}".format(
                ep, n_epochs, valid_metrics["loss"], valid_metrics["accuracy"]))

    cleanup()
def main(eval_args):
    """Load an NVAE checkpoint and either evaluate its likelihood ('evaluate'
    mode) or repeatedly draw samples and save them as individual PNG files."""
    logging = utils.Logger(eval_args.local_rank, eval_args.save)

    # load a checkpoint
    logging.info('loading the model at:')
    logging.info(eval_args.checkpoint)
    checkpoint = torch.load(eval_args.checkpoint, map_location='cpu')
    args = checkpoint['args']

    # Older checkpoints predate some config fields; patch in their defaults.
    if not hasattr(args, 'ada_groups'):
        logging.info('old model, no ada groups was found.')
        args.ada_groups = False

    if not hasattr(args, 'min_groups_per_scale'):
        logging.info('old model, no min_groups_per_scale was found.')
        args.min_groups_per_scale = 1

    if not hasattr(args, 'num_mixture_dec'):
        logging.info('old model, no num_mixture_dec was found.')
        args.num_mixture_dec = 10

    logging.info('loaded the model at epoch %d', checkpoint['epoch'])
    arch_instance = utils.get_arch_cells(args.arch_instance)
    model = AutoEncoder(args, None, arch_instance)
    # Loading is not strict because of self.weight_normalized in Conv2D class in neural_operations. This variable
    # is only used for computing the spectral normalization and it is safe not to load it. Some of our earlier models
    # did not have this variable.
    model.load_state_dict(checkpoint['state_dict'], strict=False)
    model = model.cuda()

    logging.info('args = %s', args)
    logging.info('num conv layers: %d', len(model.all_conv_layers))
    logging.info('param size = %fM ', utils.count_parameters_in_M(model))

    if eval_args.eval_mode == 'evaluate':
        # load train valid queue
        args.data = eval_args.data
        train_queue, valid_queue, num_classes = datasets.get_loaders(args)

        if eval_args.eval_on_train:
            logging.info('Using the training data for eval.')
            valid_queue = train_queue

        # get number of bits: nats -> bits-per-dimension conversion factor
        num_output = utils.num_output(args.dataset)
        bpd_coeff = 1. / np.log(2.) / num_output

        valid_neg_log_p, valid_nelbo = test(
            valid_queue, model, num_samples=eval_args.num_iw_samples,
            args=args, logging=logging)
        logging.info('final valid nelbo %f', valid_nelbo)
        logging.info('final valid neg log p %f', valid_neg_log_p)
        logging.info('final valid nelbo in bpd %f', valid_nelbo * bpd_coeff)
        logging.info('final valid neg log p in bpd %f', valid_neg_log_p * bpd_coeff)
    else:
        bn_eval_mode = not eval_args.readjust_bn
        num_samples = 16
        with torch.no_grad():
            n = int(np.floor(np.sqrt(num_samples)))
            set_bn(model, bn_eval_mode, num_samples=36, t=eval_args.temp, iter=500)
            for ind in range(10):  # sampling is repeated.
                torch.cuda.synchronize()
                start = time()
                with autocast():
                    logits = model.sample(num_samples, eval_args.temp)
                    output = model.decoder_output(logits)
                    # Bernoulli decoders: use the mean; otherwise draw a sample.
                    output_img = output.mean if isinstance(output, torch.distributions.bernoulli.Bernoulli) \
                        else output.sample()
                torch.cuda.synchronize()
                end = time()
                # Save each image of the batch as its own PNG, numbered by
                # sampling repetition (batch) and position within the batch.
                # NOTE(review): the output directory is a hard-coded Google
                # Drive path and `end - start` timing is computed but never
                # logged here — consider parameterizing/cleaning up.
                path_to_images = '/content/gdrive/MyDrive/pipeline_results/NVAE/results/eval-1/images'
                if not os.path.exists(path_to_images):
                    os.makedirs(path_to_images)
                for i in range(output_img.size(0)):
                    vutils.save_image(output_img[i, :, :, :],
                                      '%s/sample_batch%03d_img%03d.png' % (path_to_images, ind + 1, i + 1),
                                      normalize=True)
def main(config):
    """Train the mask-guided image-composition GAN.

    Per epoch: trains the resnet encoder + generator (GAN/hair/face losses),
    trains the PatchGAN discriminator while ``epoch < config.gan_epochs``,
    runs one validation batch, and periodically saves sample images and
    model weights.
    """
    model = load_model(config)
    train_loader, val_loader = get_loaders(model, config)

    # Make dirs
    if not os.path.exists(config.checkpoints):
        os.makedirs(config.checkpoints, exist_ok=True)
    if not os.path.exists(config.save_path):
        os.makedirs(config.save_path, exist_ok=True)

    # Loss Functions
    criterion_GAN = mse_loss

    # Calculate output of image discriminator (PatchGAN)
    patch = (1, config.image_size // 2**4, config.image_size // 2**4)

    # Initialize
    vgg = Vgg16().to(config.device)
    resnet = ResNet18(requires_grad=True, pretrained=True).to(config.device)
    generator = GeneratorUNet().to(config.device)
    discriminator = Discriminator().to(config.device)

    if config.epoch != 0:
        # Load pretrained models (resume from the previous epoch's weights)
        resnet.load_state_dict(
            torch.load(
                os.path.join(config.checkpoints,
                             'epoch_%d_%s.pth' % (config.epoch - 1, 'resnet'))))
        generator.load_state_dict(
            torch.load(
                os.path.join(
                    config.checkpoints,
                    'epoch_%d_%s.pth' % (config.epoch - 1, 'generator'))))
        discriminator.load_state_dict(
            torch.load(
                os.path.join(
                    config.checkpoints,
                    'epoch_%d_%s.pth' % (config.epoch - 1, 'discriminator'))))
    else:
        # Initialize weights
        # resnet.apply(weights_init_normal)
        generator.apply(weights_init_normal)
        discriminator.apply(weights_init_normal)

    # Optimizers
    optimizer_resnet = torch.optim.Adam(resnet.parameters(),
                                        lr=config.lr,
                                        betas=(config.b1, config.b2))
    optimizer_G = torch.optim.Adam(generator.parameters(),
                                   lr=config.lr,
                                   betas=(config.b1, config.b2))
    optimizer_D = torch.optim.Adam(discriminator.parameters(),
                                   lr=config.lr,
                                   betas=(config.b1, config.b2))

    # ----------
    #  Training
    # ----------
    resnet.train()
    generator.train()
    discriminator.train()
    for epoch in range(config.epoch, config.n_epochs):
        for i, (im1, m1, im2, m2) in enumerate(train_loader):
            assert im1.size(0) == im2.size(0)
            valid = Variable(torch.Tensor(np.ones(
                (im1.size(0), *patch))).to(config.device),
                             requires_grad=False)
            # Fixed: fake targets must be ZEROS. They were built with np.ones
            # (identical to `valid`), which trained the discriminator to score
            # generated patches as real.
            fake = Variable(torch.Tensor(np.zeros(
                (im1.size(0), *patch))).to(config.device),
                            requires_grad=False)

            # ------------------
            #  Train Generators
            # ------------------
            optimizer_resnet.zero_grad()
            optimizer_G.zero_grad()

            # GAN loss
            z = resnet(im2 * m2)
            if epoch < config.gan_epochs:
                fake_im = generator(im1 * (1 - m1), im2 * m2, z)
            else:
                fake_im = generator(im1, im2, z)
            if epoch < config.gan_epochs:
                pred_fake = discriminator(fake_im, im2)
                gan_loss = config.lambda_gan * criterion_GAN(pred_fake, valid)
            else:
                gan_loss = torch.Tensor([0]).to(config.device)

            # Hair, Face loss: segment the generated image, then use its mask
            # if it is plausibly sized (within 50%-150% of the reference mask).
            fake_m2 = torch.argmax(model(fake_im),
                                   1).unsqueeze(1).type(torch.uint8).repeat(
                                       1, 3, 1, 1).to(config.device)
            if 0.5 * torch.sum(m1) <= torch.sum(
                    fake_m2) <= 1.5 * torch.sum(m1):
                hair_loss = config.lambda_style * calc_style_loss(
                    fake_im * fake_m2, im2 * m2, vgg) + calc_content_loss(
                        fake_im * fake_m2, im2 * m2, vgg)
                face_loss = calc_content_loss(fake_im, im1, vgg)
            else:
                hair_loss = config.lambda_style * calc_style_loss(
                    fake_im * m1, im2 * m2, vgg) + calc_content_loss(
                        fake_im * m1, im2 * m2, vgg)
                face_loss = calc_content_loss(fake_im, im1, vgg)
            hair_loss *= config.lambda_hair
            face_loss *= config.lambda_face

            # Total loss
            loss = gan_loss + hair_loss + face_loss
            loss.backward()
            optimizer_resnet.step()
            optimizer_G.step()

            # ---------------------
            #  Train Discriminator
            # ---------------------
            if epoch < config.gan_epochs:
                optimizer_D.zero_grad()
                # Real loss
                pred_real = discriminator(im1 * (1 - m1) + im2 * m2, im2)
                loss_real = criterion_GAN(pred_real, valid)
                # Fake loss
                pred_fake = discriminator(fake_im.detach(), im2)
                loss_fake = criterion_GAN(pred_fake, fake)
                # Total loss
                loss_D = 0.5 * (loss_real + loss_fake)
                loss_D.backward()
                optimizer_D.step()

            if i % config.sample_interval == 0:
                msg = "Train || Gan loss: %.6f, hair loss: %.6f, face loss: %.6f, loss: %.6f\n" % \
                    (gan_loss.item(), hair_loss.item(), face_loss.item(), loss.item())
                sys.stdout.write("Epoch: %d || Batch: %d\n" % (epoch, i))
                sys.stdout.write(msg)
                fname = os.path.join(
                    config.save_path,
                    "Train_Epoch:%d_Batch:%d.png" % (epoch, i))
                sample_images([im1[0], im2[0], fake_im[0]],
                              ["img1", "img2", "img1+img2"], fname)

        # One validation batch per epoch (the loop breaks after the first batch).
        for j, (im1, m1, im2, m2) in enumerate(val_loader):
            with torch.no_grad():
                valid = Variable(torch.Tensor(
                    np.ones((im1.size(0), *patch))).to(config.device),
                                 requires_grad=False)
                # (the unused fake-label tensor was removed here; the
                # discriminator is not updated during validation)

                # GAN loss
                z = resnet(im2 * m2)
                if epoch < config.gan_epochs:
                    fake_im = generator(im1 * (1 - m1), im2 * m2, z)
                else:
                    fake_im = generator(im1, im2, z)
                if epoch < config.gan_epochs:
                    pred_fake = discriminator(fake_im, im2)
                    gan_loss = config.lambda_gan * criterion_GAN(
                        pred_fake, valid)
                else:
                    gan_loss = torch.Tensor([0]).to(config.device)

                # Hair, Face loss
                fake_m2 = torch.argmax(
                    model(fake_im), 1).unsqueeze(1).type(torch.uint8).repeat(
                        1, 3, 1, 1).to(config.device)
                if 0.5 * torch.sum(m1) <= torch.sum(
                        fake_m2) <= 1.5 * torch.sum(m1):
                    hair_loss = config.lambda_style * calc_style_loss(
                        fake_im * fake_m2, im2 * m2, vgg) + calc_content_loss(
                            fake_im * fake_m2, im2 * m2, vgg)
                    face_loss = calc_content_loss(fake_im, im1, vgg)
                else:
                    hair_loss = config.lambda_style * calc_style_loss(
                        fake_im * m1, im2 * m2, vgg) + calc_content_loss(
                            fake_im * m1, im2 * m2, vgg)
                    face_loss = calc_content_loss(fake_im, im1, vgg)
                hair_loss *= config.lambda_hair
                face_loss *= config.lambda_face

                # Total loss
                loss = gan_loss + hair_loss + face_loss

                msg = "Validation || Gan loss: %.6f, hair loss: %.6f, face loss: %.6f, loss: %.6f\n" % \
                    (gan_loss.item(), hair_loss.item(), face_loss.item(), loss.item())
                sys.stdout.write(msg)
                # Fixed: the filename previously used the stale training batch
                # index `i`; use the validation loop index `j`.
                fname = os.path.join(
                    config.save_path,
                    "Validation_Epoch:%d_Batch:%d.png" % (epoch, j))
                sample_images([im1[0], im2[0], fake_im[0]],
                              ["img1", "img2", "img1+img2"], fname)
            break

        if epoch % config.checkpoint_interval == 0:
            # After gan_epochs the discriminator is no longer trained, so it
            # is not checkpointed.
            if epoch < config.gan_epochs:
                models = [resnet, generator, discriminator]
                fnames = ['resnet', 'generator', 'discriminator']
            else:
                models = [resnet, generator]
                fnames = ['resnet', 'generator']
            fnames = [
                os.path.join(config.checkpoints,
                             'epoch_%d_%s.pth' % (epoch, s)) for s in fnames
            ]
            save_weights(models, fnames)
} if not args.use_cpu else {} ckpt_dir = os.path.join(args.ckpt_dir, args.run_name) os.makedirs(ckpt_dir, exist_ok=True) log_dir = os.path.join(args.log_dir, args.run_name) os.makedirs(log_dir, exist_ok=True) logger = SummaryWriter(log_dir) jitter_size = args.resize + 30 # random jitter from pix2pix tf = transforms.Compose([ transforms.Resize(jitter_size, Image.ANTIALIAS), transforms.RandomCrop((args.resize, args.resize)), transforms.RandomHorizontalFlip(), transforms.ToTensor() ]) train_loader, test_loader = get_loaders(args, tf) G = Generator(in_channels=3, out_channels=3, n_blocks=9).to(device) # A to B F = Generator(in_channels=3, out_channels=3, n_blocks=9).to(device) # B to A D_A = Discriminator(in_channels=3).to(device) D_B = Discriminator(in_channels=3).to(device) nets = [G, F, D_A, D_B] for net in nets: net.apply(init_weights_gaussian) G_opt = optim.Adam(G.parameters(), lr=args.lr) F_opt = optim.Adam(F.parameters(), lr=args.lr) D_A_opt = optim.Adam(D_A.parameters(), lr=args.lr) D_B_opt = optim.Adam(D_B.parameters(), lr=args.lr)
metavar='str', help='dir to save checkpoints (default: ./checkpoints)') parser.add_argument( '--vis_dir', type=str, default=r'./val_out', metavar='str', help='dir to save results during training (default: ./val_out)') parser.add_argument('--lr', type=float, default=2e-4, help='learning rate (default: 0.0002)') parser.add_argument('--max_num_epochs', type=int, default=100, metavar='N', help='max number of training epochs (default 200)') parser.add_argument( '--scheduler_step_size', type=int, default=50, metavar='N', help='after m epochs then reduce lr to 0.1*lr (default 500)') args = parser.parse_args() if __name__ == '__main__': dataloaders = datasets.get_loaders(args) nn_classifier = Classifier(args=args, dataloaders=dataloaders) nn_classifier.train_models()
logger['args'] = args logger['checkpoint'] = os.path.join('models/', logger.index + '.pth') logger['checkpoint_step'] = os.path.join('models/', logger.index + '_{}.pth') print("[Logging in {}]".format(logger.index)) use_cuda = not args.no_cuda and torch.cuda.is_available() args.device = torch.device("cuda" if use_cuda else "cpu") os.makedirs('checkpoints', exist_ok=True) train_loader, valid_loader, test_loader = datasets.get_loaders( args.dataset, class_to_replace=args.forget_class, num_indexes_to_replace=args.num_to_forget, batch_size=args.batch_size, seed=args.seed, augment=args.augment) num_classes = max(train_loader.dataset.targets ) + 1 if args.num_classes is None else args.num_classes args.num_classes = num_classes print(f"Number of Classes: {num_classes}") model = models.get_model(args.model, num_classes=num_classes, filters_percentage=args.filters).to(args.device) if args.model == 'allcnn': classifier_name = 'classifier.' elif 'resnet' in args.model: classifier_name = 'linear.'
def main(eval_args):
    """Load an NVAE checkpoint and either evaluate its likelihood ('evaluate'
    mode) or repeatedly sample, pickling the raw batches and saving image grids."""
    logging = utils.Logger(eval_args.local_rank, eval_args.save)

    # load a checkpoint
    logging.info('loading the model at:')
    logging.info(eval_args.checkpoint)
    checkpoint = torch.load(eval_args.checkpoint, map_location='cpu')
    args = checkpoint['args']
    logging.info('loaded the model at epoch %d', checkpoint['epoch'])
    arch_instance = utils.get_arch_cells(args.arch_instance)
    model = AutoEncoder(args, None, arch_instance)
    model.load_state_dict(checkpoint['state_dict'])
    model = model.cuda()

    logging.info('args = %s', args)
    logging.info('num conv layers: %d', len(model.all_conv_layers))
    logging.info('param size = %fM ', utils.count_parameters_in_M(model))

    if eval_args.eval_mode == 'evaluate':
        # load train valid queue
        args.data = eval_args.data
        train_queue, valid_queue, num_classes, test_queue = datasets.get_loaders(args)

        if eval_args.eval_on_train:
            logging.info('Using the training data for eval.')
            valid_queue = train_queue
        if eval_args.eval_on_test:
            logging.info('Using the test data for eval.')
            valid_queue = test_queue

        # get number of bits: nats -> bits-per-dimension conversion factor
        num_output = utils.num_output(args.dataset, args)
        bpd_coeff = 1. / np.log(2.) / num_output

        valid_neg_log_p, valid_nelbo = test(valid_queue, model,
                                            num_samples=eval_args.num_iw_samples,
                                            args=args, logging=logging)
        logging.info('final valid nelbo %f', valid_nelbo)
        logging.info('final valid neg log p %f', valid_neg_log_p)
        logging.info('final valid nelbo in bpd %f', valid_nelbo * bpd_coeff)
        logging.info('final valid neg log p in bpd %f', valid_neg_log_p * bpd_coeff)
    else:
        bn_eval_mode = not eval_args.readjust_bn
        num_samples = 16
        with torch.no_grad():
            n = int(np.floor(np.sqrt(num_samples)))
            set_bn(model, bn_eval_mode, num_samples=36, t=eval_args.temp, iter=500)
            for ind in range(eval_args.repetition):  # sampling is repeated.
                torch.cuda.synchronize()
                start = time()
                with autocast():
                    logits = model.sample(num_samples, eval_args.temp)
                    output = model.decoder_output(logits)
                    # Bernoulli decoders: use the mean; otherwise draw a sample.
                    output_img = output.mean if isinstance(output, torch.distributions.bernoulli.Bernoulli) \
                        else output.sample()
                torch.cuda.synchronize()
                end = time()

                # save the raw sampled batch to file
                total_name = "{}/data_to_save_{}_{}.pickle".format(
                    eval_args.save, eval_args.name_to_save, ind)
                with open(total_name, 'wb') as handle:
                    # Fixed: was `output_img.deatach().numpy()` — a misspelled
                    # `detach` (AttributeError), and the tensor lives on the GPU,
                    # so it must be moved to CPU before NumPy conversion.
                    pickle.dump(output_img.detach().cpu().numpy(), handle,
                                protocol=pickle.HIGHEST_PROTOCOL)

                output_tiled = utils.tile_image(output_img, n).cpu().numpy().transpose(1, 2, 0)
                logging.info('sampling time per batch: %0.3f sec', (end - start))
                output_tiled = np.asarray(output_tiled * 255, dtype=np.uint8)
                output_tiled = np.squeeze(output_tiled)
                plt.imshow(output_tiled)
                plt.savefig("{}/generation_{}_{}".format(eval_args.save, eval_args.name_to_save, ind))