def validate(self, net, samples, e): transforms = generator.TransformationsGenerator([]) dataset = datasets.ImageDataset(samples, settings.train, transforms) dataloader = DataLoader( dataset, num_workers=10, batch_size=16 ) average_meter_val = meters.AverageMeter() with tqdm(total=len(dataloader), leave=True, ascii=True) as pbar, torch.no_grad(): net.eval() for images, masks_targets in dataloader: masks_targets = masks_targets.to(gpu) masks_predictions = self.predict(net, images) self.update_pbar( masks_predictions, masks_targets, pbar, average_meter_val, 'Validation epoch {}'.format(e) ) val_stats = {'val_' + k: v for k, v in average_meter_val.get_all().items()} return val_stats
def test(self, samples_test, dir_test=settings.test, predict=None): if predict is None: predict = self.predict net = DataParallel(self.net).cuda() transforms = generator.TransformationsGenerator([]) test_dataset = datasets.ImageDataset(samples_test, dir_test, transforms, test=True) test_dataloader = DataLoader(test_dataset, num_workers=10, batch_size=32) with tqdm(total=len(test_dataloader), leave=True, ascii=True) as pbar, torch.no_grad(): net.eval() for images, ids in test_dataloader: masks_predictions = predict(net, images) pbar.set_description('Creating test predictions...') pbar.update() masks_predictions = masks_predictions.cpu().squeeze().numpy() for p, id in zip(masks_predictions, ids): yield p, id
def train(self, net, samples, optimizer, e): alpha = 2 * max(0, ((50 - e) / 50)) criterion = losses.ELULovaszFocalWithLogitsLoss(alpha, 2 - alpha) transforms = generator.TransformationsGenerator([ random.RandomFlipLr(), random.RandomAffine( image_size=101, translation=lambda rs: (rs.randint(-10, 10), rs.randint(-10, 10)), scale=lambda rs: (rs.uniform(0.85, 1.15), 1), rotation=lambda rs: rs.randint(-5, 5), **utils.transformations_options ), transformations.Padding(((13, 14), (13, 14), (0, 0))) ]) transforms_image = generator.TransformationsGenerator([ random.RandomColorPerturbation(std=1) ]) dataset = datasets.ImageDataset(samples, settings.train, transforms, transforms_image=transforms_image) dataloader = DataLoader( dataset, num_workers=10, batch_size=16, shuffle=True ) average_meter_train = meters.AverageMeter() with tqdm(total=len(dataloader), leave=False) as pbar, torch.enable_grad(): net.train() for images, masks_targets in dataloader: masks_targets = masks_targets.to(gpu) masks_predictions, aux_pam, aux_cam = net(images) loss_pam = criterion(F.interpolate(aux_pam, size=128, mode='bilinear'), masks_targets) loss_cam = criterion(F.interpolate(aux_cam, size=128, mode='bilinear'), masks_targets) loss_segmentation = criterion(masks_predictions, masks_targets) loss = loss_segmentation + loss_pam + loss_cam loss.backward() optimizer.step() optimizer.zero_grad() average_meter_train.add('loss', loss.item()) self.update_pbar( torch.sigmoid(masks_predictions), masks_targets, pbar, average_meter_train, 'Training epoch {}'.format(e) ) train_stats = {'train_' + k: v for k, v in average_meter_train.get_all().items()} return train_stats
def train(self, net, samples, optimizer, e): alpha = 2 * max(0, ((100 - e) / 100)) criterion = losses.ELULovaszFocalWithLogitsLoss(alpha, 2 - alpha) transforms = generator.TransformationsGenerator([ random.RandomFlipLr(), random.RandomAffine(image_size=101, translation=lambda rs: (rs.randint(-20, 20), rs.randint(-20, 20)), scale=lambda rs: (rs.uniform(0.85, 1.15), 1), **utils.transformations_options), transformations.Padding(((13, 14), (13, 14), (0, 0))) ]) pseudo_dataset = datasets.SemiSupervisedImageDataset( samples_test, settings.test, transforms, size=len(samples_test), test_predictions=self.test_predictions, momentum=0.0) dataset = datasets.ImageDataset(samples, settings.train, transforms) weights = [len(pseudo_dataset) / len(dataset) * 2 ] * len(dataset) + [1] * len(pseudo_dataset) dataloader = DataLoader(ConcatDataset([dataset, pseudo_dataset]), num_workers=10, batch_size=16, sampler=WeightedRandomSampler( weights=weights, num_samples=3200)) average_meter_train = meters.AverageMeter() with tqdm(total=len(dataloader), leave=False) as pbar, torch.enable_grad(): net.train() for images, masks_targets in dataloader: masks_targets = masks_targets.to(gpu) masks_predictions = net(images) loss = criterion(masks_predictions, masks_targets) loss.backward() optimizer.step() optimizer.zero_grad() average_meter_train.add('loss', loss.item()) self.update_pbar(torch.sigmoid(masks_predictions), masks_targets, pbar, average_meter_train, 'Training epoch {}'.format(e)) train_stats = { 'train_' + k: v for k, v in average_meter_train.get_all().items() } return train_stats
def train(self, net, samples, optimizer, e): alpha = 2 * max(0, ((50 - e) / 50)) criterion = losses.ELULovaszFocalWithLogitsLoss(alpha, 2 - alpha) transforms = generator.TransformationsGenerator([ random.RandomFlipLr(), random.RandomAffine( image_size=101, translation=lambda rs: (rs.randint(-20, 20), rs.randint(-20, 20)), scale=lambda rs: (rs.uniform(0.85, 1.15), 1), **utils.transformations_options ) ]) dataset = datasets.ImageDataset(samples, settings.train, transforms) dataloader = DataLoader( dataset, num_workers=10, batch_size=16, shuffle=True ) average_meter_train = meters.AverageMeter() with tqdm(total=len(dataloader), leave=False, ascii=True) as pbar, torch.enable_grad(): net.train() padding = tta.Pad((13, 14, 13, 14)) for images, masks_targets in dataloader: masks_targets = masks_targets.to(gpu) masks_predictions = padding.transform_backward(net(padding.transform_forward(images))).contiguous() loss = criterion(masks_predictions, masks_targets) loss.backward() optimizer.step() optimizer.zero_grad() average_meter_train.add('loss', loss.item()) self.update_pbar( torch.sigmoid(masks_predictions), masks_targets, pbar, average_meter_train, 'Training epoch {}'.format(e) ) train_stats = {'train_' + k: v for k, v in average_meter_train.get_all().items()} return train_stats
def validate(args, fixed_z, gen_net: nn.Module, writer_dict): dataset = datasets.ImageDataset(args) train_loader = dataset.train gen_net = gen_net.eval() global_steps = writer_dict['valid_global_steps'] eval_iter = args.num_eval_imgs // args.eval_batch_size fid_buffer_dir = os.path.join(args.path_helper['sample_path'], 'fid_buffer') os.makedirs(fid_buffer_dir) img_list = list() for iter_idx in tqdm(range(eval_iter), desc='sample images'): z = torch.cuda.FloatTensor( np.random.normal(0, 1, (args.eval_batch_size, args.latent_dim))) # Generate a batch of images gen_imgs = gen_net(z).mul_(127.5).add_(127.5).clamp_( 0.0, 255.0).permute(0, 2, 3, 1).to('cpu', torch.uint8).numpy() for img_idx, img in enumerate(gen_imgs): file_name = os.path.join(fid_buffer_dir, f'iter{iter_idx}_b{img_idx}.png') imsave(file_name, img) img_list.extend(list(gen_imgs)) # compute IS inception_score, std = get_inception_score(img_list) print('------------------------ Inception Score ------------------------') print(inception_score) print('------------------------ FID pytorch ------------------------') print(fid_score) # Generate a batch of images sample_dir = os.path.join(args.path_helper['sample_path'], 'sample_dir') Path(sample_dir).mkdir(exist_ok=True) sample_imgs = gen_net(fixed_z).mul_(127.5).add_(127.5).clamp_(0.0, 255.0) img_grid = make_grid(sample_imgs, nrow=5).to('cpu', torch.uint8).numpy() file_name = os.path.join( sample_dir, f'final_fid_{fid}_inception_score{inception_score}.png') imsave(file_name, img_grid.swapaxes(0, 1).swapaxes(1, 2)) writer_dict['valid_global_steps'] = global_steps + 1 return inception_score, fid
def main(): args = cfg.parse_args() torch.cuda.manual_seed(args.random_seed) # set tf env _init_inception(MODEL_DIR) inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform(m.weight.data, 1.) else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net, dis_net, gen_optimizer, dis_optimizer = create_shared_gan( args, weights_init) # set grow controller grow_ctrler = GrowCtrler(args.grow_step1, args.grow_step2) # initial start_search_iter = 0 # set writer if args.load_path: print(f'=> resuming from {args.load_path}') assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path, 'Model', 'checkpoint.pth') assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file, map_location={'cuda:0': 'cpu'}) # set controller && its optimizer cur_stage = checkpoint['cur_stage'] controller, ctrl_optimizer = create_ctrler(args, cur_stage, weights_init) start_search_iter = checkpoint['search_iter'] gen_net.load_state_dict(checkpoint['gen_state_dict']) dis_net.load_state_dict(checkpoint['dis_state_dict']) controller.load_state_dict(checkpoint['ctrl_state_dict']) gen_optimizer.load_state_dict(checkpoint['gen_optimizer']) dis_optimizer.load_state_dict(checkpoint['dis_optimizer']) ctrl_optimizer.load_state_dict(checkpoint['ctrl_optimizer']) prev_archs = checkpoint['prev_archs'] prev_hiddens = checkpoint['prev_hiddens'] args.path_helper = checkpoint['path_helper'] logger = create_logger(args.path_helper['log_path']) logger.info( f'=> loaded checkpoint {checkpoint_file} (search iteration {start_search_iter})' ) else: # create new log dir assert args.exp_name args.path_helper = set_log_dir('logs', args.exp_name) logger = create_logger(args.path_helper['log_path']) prev_archs = None prev_hiddens = None # set controller && its optimizer cur_stage = 0 controller, ctrl_optimizer = create_ctrler(args, cur_stage, weights_init) # set up data_loader dataset = datasets.ImageDataset(args, 2**(cur_stage + 3), args.dis_batch_size, args.num_workers) train_loader = dataset.train logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'controller_steps': start_search_iter * args.ctrl_step } g_loss_history = RunningStats(args.dynamic_reset_window) d_loss_history = RunningStats(args.dynamic_reset_window) # train loop for search_iter in tqdm(range(int(start_search_iter), int(args.max_search_iter)), desc='search progress'): logger.info(f"<start search iteration {search_iter}>") if search_iter == args.grow_step1 or search_iter == args.grow_step2: # save cur_stage = grow_ctrler.cur_stage(search_iter) logger.info(f'=> grow to stage {cur_stage}') prev_archs, prev_hiddens = get_topk_arch_hidden( args, controller, gen_net, prev_archs, prev_hiddens) # grow section del controller del ctrl_optimizer controller, ctrl_optimizer = create_ctrler(args, cur_stage, weights_init) dataset = datasets.ImageDataset(args, 2**(cur_stage + 3), args.dis_batch_size, args.num_workers) train_loader = dataset.train dynamic_reset = train_shared(args, gen_net, dis_net, g_loss_history, d_loss_history, 
controller, gen_optimizer, dis_optimizer, train_loader, prev_hiddens=prev_hiddens, prev_archs=prev_archs) train_controller(args, controller, ctrl_optimizer, gen_net, prev_hiddens, prev_archs, writer_dict) if dynamic_reset: logger.info('re-initialize share GAN') del gen_net, dis_net, gen_optimizer, dis_optimizer gen_net, dis_net, gen_optimizer, dis_optimizer = create_shared_gan( args, weights_init) save_checkpoint( { 'cur_stage': cur_stage, 'search_iter': search_iter + 1, 'gen_model': args.gen_model, 'dis_model': args.dis_model, 'controller': args.controller, 'gen_state_dict': gen_net.state_dict(), 'dis_state_dict': dis_net.state_dict(), 'ctrl_state_dict': controller.state_dict(), 'gen_optimizer': gen_optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict(), 'ctrl_optimizer': ctrl_optimizer.state_dict(), 'prev_archs': prev_archs, 'prev_hiddens': prev_hiddens, 'path_helper': args.path_helper }, False, args.path_helper['ckpt_path']) final_archs, _ = get_topk_arch_hidden(args, controller, gen_net, prev_archs, prev_hiddens) logger.info(f"discovered archs: {final_archs}")
json.dump(pred, f) if __name__ == "__main__": args = get_args() if args.log_time_and_gpu: gpu_log_interval = 10 # every k seconds start_time = time.time() gpu_check_thread = threading.Thread( target=log_gpu_util, args=[gpu_log_interval, (args.gpuid_start, args.gpu)]) gpu_check_thread.daemon = True gpu_check_thread.start() img_dataset = datasets.ImageDataset(args, "test", args.img_lst) dataset_loader = enqueuer_thread.DatasetEnqueuer( img_dataset, prefetch=args.prefetch, shuffle=False, is_multi_gpu=False, last_full_batch=True, start=True, num_workers=args.num_cpu_worker) get_batches = dataset_loader.get() # iterator to get batch if args.out_dir is not None: if not os.path.exists(args.out_dir): os.makedirs(args.out_dir) # 1. load the object detection model
def main(): args = cfg.parse_args() torch.cuda.manual_seed(args.random_seed) # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # import network gen_net = eval('models.' + args.model + '.Generator')(args=args).cuda() dis_net = eval('models.' + args.model + '.Discriminator')(args=args).cuda() # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform(m.weight.data, 1.) else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net.apply(weights_init) dis_net.apply(weights_init) # set optimizer gen_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2)) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, args.max_iter * args.n_critic) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, args.max_iter * args.n_critic) # set up data_loader dataset = datasets.ImageDataset(args) train_loader = dataset.train # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' elif args.dataset.lower() == 'stl10': fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz' else: raise NotImplementedError(f'no fid stat for {args.dataset.lower()}') assert os.path.exists(fid_stat) # epoch number for dis_net args.max_epoch = args.max_epoch * args.n_critic if args.max_iter: args.max_epoch = np.ceil(args.max_iter * args.n_critic / len(train_loader)) # initial fixed_z = torch.cuda.FloatTensor( np.random.normal(0, 1, (25, args.latent_dim))) gen_avg_param = copy_params(gen_net) start_epoch = 0 best_fid = 1e4 # set writer if args.load_path: print(f'=> resuming from {args.load_path}') assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path, 'Model', 'checkpoint.pth') assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) start_epoch = checkpoint['epoch'] best_fid = checkpoint['best_fid'] gen_net.load_state_dict(checkpoint['gen_state_dict']) dis_net.load_state_dict(checkpoint['dis_state_dict']) gen_optimizer.load_state_dict(checkpoint['gen_optimizer']) dis_optimizer.load_state_dict(checkpoint['dis_optimizer']) avg_gen_net = deepcopy(gen_net) avg_gen_net.load_state_dict(checkpoint['avg_gen_state_dict']) gen_avg_param = copy_params(avg_gen_net) del avg_gen_net args.path_helper = checkpoint['path_helper'] logger = create_logger(args.path_helper['log_path']) logger.info( f'=> loaded checkpoint {checkpoint_file} (epoch {start_epoch})') else: # create new log dir assert args.exp_name args.path_helper = set_log_dir('logs', args.exp_name) logger = create_logger(args.path_helper['log_path']) logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'train_global_steps': start_epoch * len(train_loader), 'valid_global_steps': start_epoch // args.val_freq, } # train loop for epoch in tqdm(range(int(start_epoch), int(args.max_epoch)), desc='total progress'): lr_schedulers = (gen_scheduler, dis_scheduler) if args.lr_decay else None 
train(args, gen_net, dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, lr_schedulers) if epoch and epoch % args.val_freq == 0 or epoch == int( args.max_epoch) - 1: backup_param = copy_params(gen_net) load_params(gen_net, gen_avg_param) inception_score, fid_score = validate(args, fixed_z, fid_stat, gen_net, writer_dict) logger.info( f'Inception score: {inception_score}, FID score: {fid_score} || @ epoch {epoch}.' ) load_params(gen_net, backup_param) if fid_score < best_fid: best_fid = fid_score is_best = True else: is_best = False else: is_best = False avg_gen_net = deepcopy(gen_net) load_params(avg_gen_net, gen_avg_param) save_checkpoint( { 'epoch': epoch + 1, 'model': args.model, 'gen_state_dict': gen_net.state_dict(), 'dis_state_dict': dis_net.state_dict(), 'avg_gen_state_dict': avg_gen_net.state_dict(), 'gen_optimizer': gen_optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict(), 'best_fid': best_fid, 'path_helper': args.path_helper }, is_best, args.path_helper['ckpt_path']) del avg_gen_net
def main(): args = cfg.parse_args() random.seed(args.random_seed) torch.manual_seed(args.random_seed) torch.cuda.manual_seed(args.random_seed) np.random.seed(args.random_seed) # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # import netwo # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform(m.weight.data, 1.) else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net = eval('models.' + args.model + '.Generator')(args=args).cuda() dis_net = eval('models.' + args.model + '.Discriminator')(args=args).cuda() gen_net.apply(weights_init) dis_net.apply(weights_init) avg_gen_net = deepcopy(gen_net) initial_gen_net_weight = torch.load(os.path.join(args.init_path, 'initial_gen_net.pth'), map_location="cpu") initial_dis_net_weight = torch.load(os.path.join(args.init_path, 'initial_dis_net.pth'), map_location="cpu") assert id(initial_dis_net_weight) != id(dis_net.state_dict()) assert id(initial_gen_net_weight) != id(gen_net.state_dict()) # set optimizer gen_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2)) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, args.max_iter * args.n_critic) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, args.max_iter * args.n_critic) # set up data_loader dataset = datasets.ImageDataset(args) train_loader = dataset.train # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' elif args.dataset.lower() == 'stl10': fid_stat = 'fid_stat/fid_stats_stl10_train.npz' else: raise NotImplementedError('no fid stat for %s' % args.dataset.lower()) assert os.path.exists(fid_stat) # epoch number for dis_net args.max_epoch = args.max_epoch * args.n_critic if args.max_iter: args.max_epoch = np.ceil(args.max_iter * args.n_critic / len(train_loader)) # initial fixed_z = torch.cuda.FloatTensor( np.random.normal(0, 1, (25, args.latent_dim))) start_epoch = 0 best_fid = 1e4 print('=> resuming from %s' % args.load_path) assert os.path.exists(args.load_path) checkpoint_file = args.load_path assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) pruning_generate(gen_net, checkpoint['gen_state_dict']) dis_net.load_state_dict(checkpoint['dis_state_dict']) total = 0 total_nonzero = 0 for m in dis_net.modules(): if isinstance(m, nn.Conv2d): total += m.weight_orig.data.numel() mask = m.weight_orig.data.abs().clone().gt(0).float().cuda() total_nonzero += torch.sum(mask) conv_weights = torch.zeros(total) index = 0 for m in dis_net.modules(): if isinstance(m, nn.Conv2d): size = m.weight_orig.data.numel() conv_weights[index:( index + size)] = m.weight_orig.data.view(-1).abs().clone() index += size y, i = torch.sort(conv_weights) # thre_index = int(total * args.percent) # only care about the non zero weights # e.g: total = 100, total_nonzero = 80, percent = 0.2, thre_index = 36, that means keep 64 thre_index = total - total_nonzero thre = 
y[int(thre_index)] pruned = 0 print('Pruning threshold: {}'.format(thre)) zero_flag = False masks = OrderedDict() for k, m in enumerate(dis_net.modules()): if isinstance(m, nn.Conv2d): weight_copy = m.weight_orig.data.abs().clone() mask = weight_copy.gt(thre).float() masks[k] = mask pruned = pruned + mask.numel() - torch.sum(mask) m.weight_orig.data.mul_(mask) if int(torch.sum(mask)) == 0: zero_flag = True print( 'layer index: {:d} \t total params: {:d} \t remaining params: {:d}' .format(k, mask.numel(), int(torch.sum(mask)))) print('Total conv params: {}, Pruned conv params: {}, Pruned ratio: {}'. format(total, pruned, pruned / total)) pruning_generate(avg_gen_net, checkpoint['gen_state_dict']) see_remain_rate(gen_net) if not args.finetune_G: gen_weight = gen_net.state_dict() gen_orig_weight = rewind_weight(initial_gen_net_weight, gen_weight.keys()) gen_weight.update(gen_orig_weight) gen_net.load_state_dict(gen_weight) gen_avg_param = copy_params(gen_net) if args.finetune_D: dis_net.load_state_dict(checkpoint['dis_state_dict']) else: dis_net.load_state_dict(initial_dis_net_weight) for k, m in enumerate(dis_net.modules()): if isinstance(m, nn.Conv2d): m.weight_orig.data.mul_(masks[k]) orig_dis_net = eval('models.' + args.model + '.Discriminator')(args=args).cuda() orig_dis_net.load_state_dict(checkpoint['dis_state_dict']) orig_dis_net.eval() args.path_helper = set_log_dir('logs', args.exp_name + "_{}".format(args.percent)) logger = create_logger(args.path_helper['log_path']) #logger.info('=> loaded checkpoint %s (epoch %d)' % (checkpoint_file, start_epoch)) logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'train_global_steps': start_epoch * len(train_loader), 'valid_global_steps': start_epoch // args.val_freq, } # train loop for epoch in tqdm(range(int(start_epoch), int(args.max_epoch)), desc='total progress'): lr_schedulers = (gen_scheduler, dis_scheduler) if args.lr_decay else None see_remain_rate(gen_net) see_remain_rate_orig(dis_net) if not args.use_kd_D: train_with_mask(args, gen_net, dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, masks, lr_schedulers) else: train_with_mask_kd(args, gen_net, dis_net, orig_dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, masks, lr_schedulers) if epoch and epoch % args.val_freq == 0 or epoch == int( args.max_epoch) - 1: backup_param = copy_params(gen_net) load_params(gen_net, gen_avg_param) inception_score, fid_score = validate(args, fixed_z, fid_stat, gen_net, writer_dict, epoch) logger.info( 'Inception score: %.4f, FID score: %.4f || @ epoch %d.' % (inception_score, fid_score, epoch)) load_params(gen_net, backup_param) if fid_score < best_fid: best_fid = fid_score is_best = True else: is_best = False else: is_best = False avg_gen_net.load_state_dict(gen_net.state_dict()) load_params(avg_gen_net, gen_avg_param) save_checkpoint( { 'epoch': epoch + 1, 'model': args.model, 'gen_state_dict': gen_net.state_dict(), 'dis_state_dict': dis_net.state_dict(), 'avg_gen_state_dict': avg_gen_net.state_dict(), 'gen_optimizer': gen_optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict(), 'best_fid': best_fid, 'path_helper': args.path_helper }, is_best, args.path_helper['ckpt_path'])
def train(self, net, samples, optimizer, e): alpha = 2 * max(0, ((50 - e) / 50)) criterion = losses.ELULovaszFocalWithLogitsLoss(alpha, 2 - alpha) transforms = generator.TransformationsGenerator([ random.RandomFlipLr(), random.RandomAffine( image_size=101, translation=lambda rs: (rs.randint(-20, 20), rs.randint(-20, 20)), scale=lambda rs: (rs.uniform(0.85, 1.15), 1), **utils.transformations_options ), transformations.Padding(((13, 14), (13, 14), (0, 0))) ]) samples_aux = list(set(samples).intersection(set(utils.get_aux_samples()))) dataset_aux = datasets.ImageDataset(samples_aux, settings.train, transforms) transforms_mosaic = generator.TransformationsGenerator([ random.RandomCrop(128) ]) pairs_mosaic = utils.get_mosaic_pairs() samples_mosaic = utils.get_mosaic_samples() samples_mosaic = [sample for sample, pair in zip(samples_mosaic, pairs_mosaic) if pair[0] in samples and pair[1] in samples] dataset_mosaic = datasets.ImageDataset(samples_mosaic, './data/mosaic_pairs', transforms_mosaic) dataset_pseudo = datasets.SemiSupervisedImageDataset( samples_test, settings.test, transforms, size=len(samples_test), test_predictions=self.test_predictions, momentum=0.0 ) dataset = datasets.ImageDataset(samples, settings.train, transforms) weight_train = len(dataset_pseudo) / len(dataset) * 2 weight_mosaic = weight_train weight_aux = weight_train / 2 weights = [weight_train] * len(dataset) + [weight_mosaic] * len(dataset_mosaic) + [weight_aux] * len(dataset_aux) + [1] * len(dataset_pseudo) dataloader = DataLoader( ConcatDataset([dataset, dataset_aux, dataset_mosaic, dataset_pseudo]), num_workers=10, batch_size=16, sampler=WeightedRandomSampler(weights=weights, num_samples=3200) ) average_meter_train = meters.AverageMeter() with tqdm(total=len(dataloader), leave=False, ascii=True) as pbar, torch.enable_grad(): net.train() padding = tta.Pad((13, 14, 13, 14)) for images, masks_targets in dataloader: masks_targets = masks_targets.to(gpu) masks_targets = padding.transform_backward(masks_targets).contiguous() masks_predictions = padding.transform_backward(net(images)).contiguous() loss = criterion(masks_predictions, masks_targets) loss.backward() optimizer.step() optimizer.zero_grad() average_meter_train.add('loss', loss.item()) self.update_pbar( torch.sigmoid(masks_predictions), masks_targets, pbar, average_meter_train, 'Training epoch {}'.format(e) ) train_stats = {'train_' + k: v for k, v in average_meter_train.get_all().items()} return train_stats
def main(): args = cfg.parse_args() torch.cuda.manual_seed(args.random_seed) # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find("Conv2d") != -1: if args.init_type == "normal": nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == "orth": nn.init.orthogonal_(m.weight.data) elif args.init_type == "xavier_uniform": nn.init.xavier_uniform(m.weight.data, 1.0) else: raise NotImplementedError("{} unknown inital type".format( args.init_type)) elif classname.find("BatchNorm2d") != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net, dis_net, gen_optimizer, dis_optimizer = create_shared_gan( args, weights_init) # set grow controller grow_ctrler = GrowCtrler(args.grow_step1, args.grow_step2) # initial start_search_iter = 0 # set writer if args.load_path: print(f"=> resuming from {args.load_path}") assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path, "Model", "checkpoint.pth") assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) # set controller && its optimizer cur_stage = checkpoint["cur_stage"] controller, ctrl_optimizer = create_ctrler(args, cur_stage, weights_init) start_search_iter = checkpoint["search_iter"] gen_net.load_state_dict(checkpoint["gen_state_dict"]) dis_net.load_state_dict(checkpoint["dis_state_dict"]) controller.load_state_dict(checkpoint["ctrl_state_dict"]) gen_optimizer.load_state_dict(checkpoint["gen_optimizer"]) dis_optimizer.load_state_dict(checkpoint["dis_optimizer"]) ctrl_optimizer.load_state_dict(checkpoint["ctrl_optimizer"]) prev_archs = checkpoint["prev_archs"] prev_hiddens = checkpoint["prev_hiddens"] args.path_helper = checkpoint["path_helper"] logger = create_logger(args.path_helper["log_path"]) logger.info( f"=> loaded checkpoint {checkpoint_file} (search iteration {start_search_iter})" ) else: # create new log dir assert args.exp_name args.path_helper = set_log_dir("logs", args.exp_name) logger = create_logger(args.path_helper["log_path"]) prev_archs = None prev_hiddens = None # set controller && its optimizer cur_stage = 0 controller, ctrl_optimizer = create_ctrler(args, cur_stage, weights_init) # set up data_loader dataset = datasets.ImageDataset(args, 2**(cur_stage + 3)) train_loader = dataset.train logger.info(args) writer_dict = { "writer": SummaryWriter(args.path_helper["log_path"]), "controller_steps": start_search_iter * args.ctrl_step, } g_loss_history = RunningStats(args.dynamic_reset_window) d_loss_history = RunningStats(args.dynamic_reset_window) # train loop for search_iter in tqdm(range(int(start_search_iter), int(args.max_search_iter)), desc="search progress"): logger.info(f"<start search iteration {search_iter}>") if search_iter == args.grow_step1 or search_iter == args.grow_step2: # save cur_stage = grow_ctrler.cur_stage(search_iter) logger.info(f"=> grow to stage {cur_stage}") prev_archs, prev_hiddens = get_topk_arch_hidden( args, controller, gen_net, prev_archs, prev_hiddens) # grow section del controller del ctrl_optimizer controller, ctrl_optimizer = create_ctrler(args, cur_stage, weights_init) dataset = datasets.ImageDataset(args, 2**(cur_stage + 3)) train_loader = dataset.train dynamic_reset = train_shared( args, gen_net, dis_net, g_loss_history, d_loss_history, controller, gen_optimizer, dis_optimizer, train_loader, prev_hiddens=prev_hiddens, prev_archs=prev_archs, ) train_controller( 
args, controller, ctrl_optimizer, gen_net, prev_hiddens, prev_archs, writer_dict, ) if dynamic_reset: logger.info("re-initialize share GAN") del gen_net, dis_net, gen_optimizer, dis_optimizer gen_net, dis_net, gen_optimizer, dis_optimizer = create_shared_gan( args, weights_init) save_checkpoint( { "cur_stage": cur_stage, "search_iter": search_iter + 1, "gen_model": args.gen_model, "dis_model": args.dis_model, "controller": args.controller, "gen_state_dict": gen_net.state_dict(), "dis_state_dict": dis_net.state_dict(), "ctrl_state_dict": controller.state_dict(), "gen_optimizer": gen_optimizer.state_dict(), "dis_optimizer": dis_optimizer.state_dict(), "ctrl_optimizer": ctrl_optimizer.state_dict(), "prev_archs": prev_archs, "prev_hiddens": prev_hiddens, "path_helper": args.path_helper, }, False, args.path_helper["ckpt_path"], ) final_archs, _ = get_topk_arch_hidden(args, controller, gen_net, prev_archs, prev_hiddens) logger.info(f"discovered archs: {final_archs}")
def main(index, args): device = xm.xla_device() gen_net = Generator(args).to(device) dis_net = Discriminator(args).to(device) enc_net = Encoder(args).to(device) def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform(m.weight.data, 1.) else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net.apply(weights_init) dis_net.apply(weights_init) enc_net.apply(weights_init) ae_recon_optimizer = torch.optim.Adam( itertools.chain(enc_net.parameters(), gen_net.parameters()), args.ae_recon_lr, (args.beta1, args.beta2)) ae_reg_optimizer = torch.optim.Adam( itertools.chain(enc_net.parameters(), gen_net.parameters()), args.ae_reg_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam(dis_net.parameters(), args.d_lr, (args.beta1, args.beta2)) gen_optimizer = torch.optim.Adam(gen_net.parameters(), args.g_lr, (args.beta1, args.beta2)) dataset = datasets.ImageDataset(args) train_loader = dataset.train valid_loader = dataset.valid para_loader = pl.ParallelLoader(train_loader, [device]) fid_stat = str(pathlib.Path( __file__).parent.absolute()) + '/fid_stat/fid_stat_cifar10_test.npz' if not os.path.exists(fid_stat): download_stat_cifar10_test() is_best = True args.num_epochs = np.ceil(args.num_iter / len(train_loader)) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0, args.num_iter / 2, args.num_iter) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0, args.num_iter / 2, args.num_iter) ae_recon_scheduler = LinearLrDecay(ae_recon_optimizer, args.ae_recon_lr, 0, args.num_iter / 2, args.num_iter) ae_reg_scheduler = LinearLrDecay(ae_reg_optimizer, args.ae_reg_lr, 0, args.num_iter / 2, args.num_iter) # initial start_epoch = 0 best_fid = 1e4 # set writer if args.load_path: print(f'=> resuming from {args.load_path}') assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path, 'Model', 'checkpoint.pth') assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) start_epoch = checkpoint['epoch'] best_fid = checkpoint['best_fid'] gen_net.load_state_dict(checkpoint['gen_state_dict']) enc_net.load_state_dict(checkpoint['enc_state_dict']) dis_net.load_state_dict(checkpoint['dis_state_dict']) gen_optimizer.load_state_dict(checkpoint['gen_optimizer']) dis_optimizer.load_state_dict(checkpoint['dis_optimizer']) ae_recon_optimizer.load_state_dict(checkpoint['ae_recon_optimizer']) ae_reg_optimizer.load_state_dict(checkpoint['ae_reg_optimizer']) args.path_helper = checkpoint['path_helper'] logger = create_logger(args.path_helper['log_path']) logger.info( f'=> loaded checkpoint {checkpoint_file} (epoch {start_epoch})') else: # create new log dir assert args.exp_name logs_dir = str(pathlib.Path(__file__).parent.parent) + '/logs' args.path_helper = set_log_dir(logs_dir, args.exp_name) logger = create_logger(args.path_helper['log_path']) logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'train_global_steps': start_epoch * len(train_loader), 'valid_global_steps': start_epoch // args.val_freq, } # train loop for epoch in tqdm(range(int(start_epoch), int(args.num_epochs)), desc='total progress'): lr_schedulers = (gen_scheduler, dis_scheduler, 
ae_recon_scheduler, ae_reg_scheduler) train(device, args, gen_net, dis_net, enc_net, gen_optimizer, dis_optimizer, ae_recon_optimizer, ae_reg_optimizer, para_loader, epoch, writer_dict, lr_schedulers) if epoch and epoch % args.val_freq == 0 or epoch == args.num_epochs - 1: fid_score = validate(args, fid_stat, gen_net, writer_dict, valid_loader) logger.info(f'FID score: {fid_score} || @ epoch {epoch}.') if fid_score < best_fid: best_fid = fid_score is_best = True else: is_best = False else: is_best = False save_checkpoint( { 'epoch': epoch + 1, 'gen_state_dict': gen_net.state_dict(), 'dis_state_dict': dis_net.state_dict(), 'enc_state_dict': enc_net.state_dict(), 'gen_optimizer': gen_optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict(), 'ae_recon_optimizer': ae_recon_optimizer.state_dict(), 'ae_reg_optimizer': ae_reg_optimizer.state_dict(), 'best_fid': best_fid, 'path_helper': args.path_helper }, is_best, args.path_helper['ckpt_path'])
def main(): args = cfg.parse_args() torch.cuda.manual_seed(args.random_seed) # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # import network gen_net = eval("models_search." + args.gen_model + ".Generator")(args=args).cuda() dis_net = eval("models_search." + args.dis_model + ".Discriminator")(args=args).cuda() gen_net.set_arch(args.arch, cur_stage=2) dis_net.cur_stage = 2 # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find("Conv2d") != -1: if args.init_type == "normal": nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == "orth": nn.init.orthogonal_(m.weight.data) elif args.init_type == "xavier_uniform": nn.init.xavier_uniform(m.weight.data, 1.0) else: raise NotImplementedError("{} unknown inital type".format( args.init_type)) elif classname.find("BatchNorm2d") != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net.apply(weights_init) dis_net.apply(weights_init) # set optimizer gen_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2), ) dis_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2), ) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, args.max_iter * args.n_critic) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, args.max_iter * args.n_critic) # set up data_loader dataset = datasets.ImageDataset(args) train_loader = dataset.train # fid stat if args.dataset.lower() == "cifar10": fid_stat = "fid_stat/fid_stats_cifar10_train.npz" elif args.dataset.lower() == "stl10": fid_stat = "fid_stat/stl10_train_unlabeled_fid_stats_48.npz" else: raise NotImplementedError(f"no fid stat for {args.dataset.lower()}") assert os.path.exists(fid_stat) # epoch number for dis_net args.max_epoch = args.max_epoch * args.n_critic if args.max_iter: args.max_epoch = np.ceil(args.max_iter * args.n_critic / len(train_loader)) # initial fixed_z = torch.cuda.FloatTensor( np.random.normal(0, 1, (25, args.latent_dim))) gen_avg_param = copy_params(gen_net) start_epoch = 0 best_fid = 1e4 # set writer if args.load_path: print(f"=> resuming from {args.load_path}") assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path, "Model", "checkpoint.pth") assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) start_epoch = checkpoint["epoch"] best_fid = checkpoint["best_fid"] gen_net.load_state_dict(checkpoint["gen_state_dict"]) dis_net.load_state_dict(checkpoint["dis_state_dict"]) gen_optimizer.load_state_dict(checkpoint["gen_optimizer"]) dis_optimizer.load_state_dict(checkpoint["dis_optimizer"]) avg_gen_net = deepcopy(gen_net) avg_gen_net.load_state_dict(checkpoint["avg_gen_state_dict"]) gen_avg_param = copy_params(avg_gen_net) del avg_gen_net args.path_helper = checkpoint["path_helper"] logger = create_logger(args.path_helper["log_path"]) logger.info( f"=> loaded checkpoint {checkpoint_file} (epoch {start_epoch})") else: # create new log dir assert args.exp_name args.path_helper = set_log_dir("logs", args.exp_name) logger = create_logger(args.path_helper["log_path"]) logger.info(args) writer_dict = { "writer": SummaryWriter(args.path_helper["log_path"]), "train_global_steps": start_epoch * len(train_loader), "valid_global_steps": start_epoch // args.val_freq, } # train loop for epoch in tqdm(range(int(start_epoch), int(args.max_epoch)), desc="total 
progress"): lr_schedulers = (gen_scheduler, dis_scheduler) if args.lr_decay else None train( args, gen_net, dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, lr_schedulers, ) if epoch and epoch % args.val_freq == 0 or epoch == int( args.max_epoch) - 1: backup_param = copy_params(gen_net) load_params(gen_net, gen_avg_param) inception_score, fid_score = validate(args, fixed_z, fid_stat, gen_net, writer_dict) logger.info( f"Inception score: {inception_score}, FID score: {fid_score} || @ epoch {epoch}." ) load_params(gen_net, backup_param) if fid_score < best_fid: best_fid = fid_score is_best = True else: is_best = False else: is_best = False avg_gen_net = deepcopy(gen_net) load_params(avg_gen_net, gen_avg_param) save_checkpoint( { "epoch": epoch + 1, "gen_model": args.gen_model, "dis_model": args.dis_model, "gen_state_dict": gen_net.state_dict(), "dis_state_dict": dis_net.state_dict(), "avg_gen_state_dict": avg_gen_net.state_dict(), "gen_optimizer": gen_optimizer.state_dict(), "dis_optimizer": dis_optimizer.state_dict(), "best_fid": best_fid, "path_helper": args.path_helper, }, is_best, args.path_helper["ckpt_path"], ) del avg_gen_net
def main(): args = cfg.parse_args() torch.cuda.manual_seed(args.random_seed) # set visible GPU ids if len(args.gpu_ids) > 0: os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_ids # set TensorFlow environment for evaluation (calculate IS and FID) _init_inception() inception_path = check_or_download_inception('./tmp/imagenet/') create_inception_graph(inception_path) # the first GPU in visible GPUs is dedicated for evaluation (running Inception model) str_ids = args.gpu_ids.split(',') args.gpu_ids = [] for id in range(len(str_ids)): if id >= 0: args.gpu_ids.append(id) if len(args.gpu_ids) > 1: args.gpu_ids = args.gpu_ids[1:] else: args.gpu_ids = args.gpu_ids # genotype G genotypes_root = os.path.join('exps', args.genotypes_exp, 'Genotypes') genotype_G = np.load(os.path.join(genotypes_root, 'latest_G.npy')) # import network from genotype basemodel_gen = eval('archs.' + args.arch + '.Generator')(args, genotype_G) gen_net = torch.nn.DataParallel( basemodel_gen, device_ids=args.gpu_ids).cuda(args.gpu_ids[0]) basemodel_dis = eval('archs.' + args.arch + '.Discriminator')(args) dis_net = torch.nn.DataParallel( basemodel_dis, device_ids=args.gpu_ids).cuda(args.gpu_ids[0]) # basemodel_gen = eval('archs.' + args.arch + '.Generator')(args=args) # gen_net = torch.nn.DataParallel(basemodel_gen, device_ids=args.gpu_ids).cuda(args.gpu_ids[0]) # basemodel_dis = eval('archs.' + args.arch + '.Discriminator')(args=args) # dis_net = torch.nn.DataParallel(basemodel_dis, device_ids=args.gpu_ids).cuda(args.gpu_ids[0]) # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform(m.weight.data, 1.) 
else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net.apply(weights_init) dis_net.apply(weights_init) # set up data_loader dataset = datasets.ImageDataset(args) train_loader = dataset.train # epoch number for dis_net args.max_epoch_D = args.max_epoch_G * args.n_critic if args.max_iter_G: args.max_epoch_D = np.ceil(args.max_iter_G * args.n_critic / len(train_loader)) max_iter_D = args.max_epoch_D * len(train_loader) # set optimizer gen_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2)) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, max_iter_D) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, max_iter_D) # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' elif args.dataset.lower() == 'stl10': fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz' else: raise NotImplementedError(f'no fid stat for {args.dataset.lower()}') assert os.path.exists(fid_stat) # initial gen_avg_param = copy_params(gen_net) start_epoch = 0 best_fid = 1e4 # set writer if args.checkpoint: # resuming print(f'=> resuming from {args.checkpoint}') assert os.path.exists(os.path.join('exps', args.checkpoint)) checkpoint_file = os.path.join('exps', args.checkpoint, 'Model', 'checkpoint_best.pth') assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) start_epoch = checkpoint['epoch'] best_fid = checkpoint['best_fid'] gen_net.load_state_dict(checkpoint['gen_state_dict']) dis_net.load_state_dict(checkpoint['dis_state_dict']) gen_optimizer.load_state_dict(checkpoint['gen_optimizer']) dis_optimizer.load_state_dict(checkpoint['dis_optimizer']) avg_gen_net = deepcopy(gen_net) avg_gen_net.load_state_dict(checkpoint['avg_gen_state_dict']) gen_avg_param = copy_params(avg_gen_net) del avg_gen_net args.path_helper = checkpoint['path_helper'] logger = create_logger(args.path_helper['log_path']) logger.info( f'=> loaded checkpoint {checkpoint_file} (epoch {start_epoch})') else: # create new log dir assert args.exp_name args.path_helper = set_log_dir('exps', args.exp_name) logger = create_logger(args.path_helper['log_path']) logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'train_global_steps': start_epoch * len(train_loader), 'valid_global_steps': start_epoch // args.val_freq, } # model size logger.info('Param size of G = %fMB', count_parameters_in_MB(gen_net)) logger.info('Param size of D = %fMB', count_parameters_in_MB(dis_net)) print_FLOPs(basemodel_gen, (1, args.latent_dim), logger) print_FLOPs(basemodel_dis, (1, 3, args.img_size, args.img_size), logger) # for visualization if args.draw_arch: from utils.genotype import draw_graph_G draw_graph_G(genotype_G, save=True, file_path=os.path.join(args.path_helper['graph_vis_path'], 'latest_G')) fixed_z = torch.cuda.FloatTensor( np.random.normal(0, 1, (100, args.latent_dim))) # train loop for epoch in tqdm(range(int(start_epoch), int(args.max_epoch_D)), desc='total progress'): lr_schedulers = (gen_scheduler, dis_scheduler) if args.lr_decay else None train(args, gen_net, dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, lr_schedulers) if epoch % args.val_freq 
== 0 or epoch == int(args.max_epoch_D) - 1: backup_param = copy_params(gen_net) load_params(gen_net, gen_avg_param) inception_score, std, fid_score = validate(args, fixed_z, fid_stat, gen_net, writer_dict) logger.info( f'Inception score mean: {inception_score}, Inception score std: {std}, ' f'FID score: {fid_score} || @ epoch {epoch}.') load_params(gen_net, backup_param) if fid_score < best_fid: best_fid = fid_score is_best = True else: is_best = False else: is_best = False # save model avg_gen_net = deepcopy(gen_net) load_params(avg_gen_net, gen_avg_param) save_checkpoint( { 'epoch': epoch + 1, 'model': args.arch, 'gen_state_dict': gen_net.state_dict(), 'dis_state_dict': dis_net.state_dict(), 'avg_gen_state_dict': avg_gen_net.state_dict(), 'gen_optimizer': gen_optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict(), 'best_fid': best_fid, 'path_helper': args.path_helper }, is_best, args.path_helper['ckpt_path']) del avg_gen_net
def main(): args = cfg.parse_args() torch.cuda.manual_seed(args.random_seed) print(args) # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform(m.weight.data, 1.) else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net, dis_net, gen_optimizer, dis_optimizer = create_shared_gan( args, weights_init) # initial start_search_iter = 0 # set writer if args.load_path: print(f'=> resuming from {args.load_path}') assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path, 'Model', 'checkpoint.pth') assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) cur_stage = checkpoint['cur_stage'] start_search_iter = checkpoint['search_iter'] gen_net.load_state_dict(checkpoint['gen_state_dict']) dis_net.load_state_dict(checkpoint['dis_state_dict']) gen_optimizer.load_state_dict(checkpoint['gen_optimizer']) dis_optimizer.load_state_dict(checkpoint['dis_optimizer']) prev_archs = checkpoint['prev_archs'] prev_hiddens = checkpoint['prev_hiddens'] args.path_helper = checkpoint['path_helper'] logger = create_logger(args.path_helper['log_path']) logger.info( f'=> loaded checkpoint {checkpoint_file} (search iteration {start_search_iter})' ) else: # create new log dir assert args.exp_name args.path_helper = set_log_dir('logs', args.exp_name) logger = create_logger(args.path_helper['log_path']) prev_archs = None prev_hiddens = None # set controller && its optimizer cur_stage = 0 # set up data_loader dataset = datasets.ImageDataset(args, 2**(cur_stage + 3)) train_loader = dataset.train print(args.rl_num_eval_img, "##############################") logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'controller_steps': start_search_iter * args.ctrl_step } g_loss_history = RunningStats(args.dynamic_reset_window) d_loss_history = RunningStats(args.dynamic_reset_window) # train loop Agent = SAC(131) print(Agent.alpha) memory = ReplayMemory(2560000) updates = 0 outinfo = { 'rewards': [], 'a_loss': [], 'critic_error': [], } Best = False Z_NUMPY = None WARMUP = True update_time = 1 for search_iter in tqdm(range(int(start_search_iter), 100), desc='search progress'): logger.info(f"<start search iteration {search_iter}>") if search_iter >= 1: WARMUP = False ### Define number of layers, currently only support 1->3 total_layer_num = 3 ### Different image size for different layers ds = [ datasets.ImageDataset(args, 2**(k + 3)) for k in range(total_layer_num) ] train_loaders = [d.train for d in ds] last_R = 0. # Initial reward last_fid = 10000 # Inital reward last_arch = [] # Set exploration if search_iter > 69: update_time = 10 Best = True else: Best = False gen_net.set_stage(-1) last_R, last_fid, last_state = get_is(args, gen_net, args.rl_num_eval_img, get_is_score=True) for layer in range(total_layer_num): cur_stage = layer # This defines which layer to use as output, for example, if cur_stage==0, then the output will be the first layer output. Set it to 2 if you want the output of the last layer. 
action = Agent.select_action([layer, last_R, 0.01 * last_fid] + last_state, Best) arch = [ action[0][0], action[0][1], action[1][0], action[1][1], action[1][2], action[2][0], action[2][1], action[2][2], action[3][0], action[3][1], action[4][0], action[4][1], action[5][0], action[5][1] ] # print(arch) # argmax to get int description of arch cur_arch = [np.argmax(k) for k in action] # Pad the skip option 0=False (for only layer 1 and layer2, not layer0, see builing_blocks.py for why) if layer == 0: cur_arch = cur_arch[0:4] elif layer == 1: cur_arch = cur_arch[0:5] elif layer == 2: if cur_arch[4] + cur_arch[5] == 2: cur_arch = cur_arch[0:4] + [3] elif cur_arch[4] + cur_arch[5] == 0: cur_arch = cur_arch[0:4] + [0] elif cur_arch[4] == 1 and cur_arch[5] == 0: cur_arch = cur_arch[0:4] + [1] else: cur_arch = cur_arch[0:4] + [2] # Get the network arch with the new architecture attached. last_arch += cur_arch gen_net.set_arch(last_arch, layer) # Set the network, given cur_stage # Train network dynamic_reset = train_qin(args, gen_net, dis_net, g_loss_history, d_loss_history, gen_optimizer, dis_optimizer, train_loaders[layer], cur_stage, smooth=False, WARMUP=WARMUP) # Get reward, use the jth layer output for generation. (layer 0:j), and the proposed progressive state R, fid, state = get_is(args, gen_net, args.rl_num_eval_img, z_numpy=Z_NUMPY) # Print exploitation mark, for better readability of the log. if Best: print("arch:", cur_arch, "Exploitation:", Best) else: print("arch:", cur_arch, "Exploring...") # Proxy reward of the up-to-now (0:j) architecture. print("update times:", updates, "step:", layer + 1, "IS:", R, "FID:", fid) mask = 0 if layer == total_layer_num - 1 else 1 if search_iter >= 0: # warm up memory.push([layer, last_R, 0.01 * last_fid] + last_state, arch, R - last_R + 0.01 * (last_fid - fid), [layer + 1, R, 0.01 * fid] + state, mask) # Append transition to memory if len(memory) >= 64: # Number of updates per step in environment for i in range(update_time): # Update parameters of all the networks critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = Agent.update_parameters( memory, min(len(memory), 256), updates) updates += 1 outinfo['critic_error'] = min(critic_1_loss, critic_2_loss) outinfo['entropy'] = ent_loss outinfo['a_loss'] = policy_loss print("full batch", outinfo, alpha) last_R = R # next step last_fid = fid last_state = state outinfo['rewards'] = R critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = Agent.update_parameters( memory, len(memory), updates) updates += 1 outinfo['critic_error'] = min(critic_1_loss, critic_2_loss) outinfo['entropy'] = ent_loss outinfo['a_loss'] = policy_loss print("full batch", outinfo, alpha) # Clean up and start a new trajectory from scratch del gen_net, dis_net, gen_optimizer, dis_optimizer gen_net, dis_net, gen_optimizer, dis_optimizer = create_shared_gan( args, weights_init) print(outinfo, len(memory)) Agent.save_model("test") WARMUP = False
def main(): args = cfg_train.parse_args() torch.cuda.manual_seed(args.random_seed) # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # import network # gen_net = eval('models.' + args.gen_model + '.' + args.gen)(args=args).cuda() genotype_gen = eval('genotypes.%s' % args.arch_gen) gen_net = eval('models.' + args.gen_model + '.' + args.gen)( args, genotype_gen).cuda() # gen_net = eval('models.' + args.gen_model + '.' + args.gen)(args = args).cuda() if 'Discriminator' not in args.dis: genotype_dis = eval('genotypes.%s' % args.arch_dis) dis_net = eval('models.' + args.dis_model + '.' + args.dis)( args, genotype_dis).cuda() else: dis_net = eval('models.' + args.dis_model + '.' + args.dis)(args=args).cuda() # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform(m.weight.data, 1.) else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net.apply(weights_init) dis_net.apply(weights_init) # set up data_loader dataset = datasets.ImageDataset(args) train_loader = dataset.train val_loader = dataset.valid # set optimizer gen_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2)) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, args.g_lr * 0.01, 260 * len(train_loader), args.max_iter * args.n_critic) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, args.d_lr * 0.01, 260 * len(train_loader), args.max_iter * args.n_critic) # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' elif args.dataset.lower() == 'stl10': fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz' elif args.dataset.lower() == 'mnist': fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz' else: raise NotImplementedError(f'no fid stat for {args.dataset.lower()}') assert os.path.exists(fid_stat) # epoch number for dis_net args.max_epoch = args.max_epoch * args.n_critic if args.max_iter: args.max_epoch = np.ceil(args.max_iter * args.n_critic / len(train_loader)) # initial fixed_z = torch.cuda.FloatTensor( np.random.normal(0, 1, (25, args.latent_dim))) fixed_z_sample = torch.cuda.FloatTensor( np.random.normal(0, 1, (args.eval_batch_size, args.latent_dim))) gen_avg_param = copy_params(gen_net) start_epoch = 0 best_fid = 1e4 best_fid_epoch = 0 is_with_fid = 0 std_with_fid = 0. 
best_is = 0 best_is_epoch = 0 fid_with_is = 0 best_dts = 0 # set writer if args.load_path: print(f'=> resuming from {args.load_path}') assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path, 'Model', 'checkpoint.pth') assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) start_epoch = checkpoint['epoch'] best_fid = checkpoint['best_fid'] gen_net.load_state_dict(checkpoint['gen_state_dict']) dis_net.load_state_dict(checkpoint['dis_state_dict']) gen_optimizer.load_state_dict(checkpoint['gen_optimizer']) dis_optimizer.load_state_dict(checkpoint['dis_optimizer']) avg_gen_net = deepcopy(gen_net) avg_gen_net.load_state_dict(checkpoint['avg_gen_state_dict']) gen_avg_param = copy_params(avg_gen_net) del avg_gen_net args.path_helper = checkpoint['path_helper'] logger = create_logger(args.path_helper['log_path']) logger.info( f'=> loaded checkpoint {checkpoint_file} (epoch {start_epoch})') else: # create new log dir assert args.exp_name args.path_helper = set_log_dir('logs', args.exp_name) logger = create_logger(args.path_helper['log_path']) logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'train_global_steps': start_epoch * len(train_loader), 'valid_global_steps': start_epoch // args.val_freq, } # calculate the FLOPs and param count of G input = torch.randn(args.gen_batch_size, args.latent_dim).cuda() flops, params = profile(gen_net, inputs=(input, )) flops, params = clever_format([flops, params], "%.3f") logger.info('FLOPs is {}, param count is {}'.format(flops, params)) # train loop dg_list = [] worst_lr = 1e-5 for epoch in tqdm(range(int(start_epoch), int(args.max_epoch)), desc='total progress'): lr_schedulers = (gen_scheduler, dis_scheduler) if args.lr_decay else None train(args, gen_net, dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, args.consistent, lr_schedulers) if epoch and epoch % args.val_freq == 0 or epoch == int( args.max_epoch) - 1: backup_param = copy_params(gen_net) load_params(gen_net, gen_avg_param) inception_score, std, fid_score = validate(args, fixed_z, fid_stat, gen_net, writer_dict, args.path_helper, search=False) logger.info( f'Inception score: {inception_score}, FID score: {fid_score}+-{std} || @ epoch {epoch}.' ) load_params(gen_net, backup_param) if fid_score < best_fid: best_fid = fid_score best_fid_epoch = epoch is_with_fid = inception_score std_with_fid = std is_best = True else: is_best = False if inception_score > best_is: best_is = inception_score best_std = std fid_with_is = fid_score best_is_epoch = epoch else: is_best = False # save generated images if epoch % args.image_every == 0: gen_noise = torch.cuda.FloatTensor( np.random.normal(0, 1, (args.eval_batch_size, args.latent_dim))) # gen_images = gen_net(fixed_z_sample) # gen_images = gen_images.reshape(args.eval_batch_size, 32, 32, 3) # gen_images = gen_images.cpu().detach() gen_images = gen_net(fixed_z_sample).mul_(127.5).add_( 127.5).clamp_(0.0, 255.0).permute(0, 2, 3, 1).to('cpu', torch.uint8).numpy() fig = plt.figure() grid = ImageGrid(fig, 111, nrows_ncols=(10, 10), axes_pad=0) for x in range(args.eval_batch_size): grid[x].imshow(gen_images[x]) # cmap="gray") grid[x].set_xticks([]) grid[x].set_yticks([]) plt.savefig( os.path.join(args.path_helper['sample_path'], "epoch_{}.png".format(epoch))) plt.close() avg_gen_net = deepcopy(gen_net) # avg_gen_net = eval('models.'+args.gen_model+'.' + args.gen)(args, genotype_gen).cuda() # avg_gen_net = eval('models.' 
+ args.gen_model + '.' + args.gen)(args=args).cuda() load_params(avg_gen_net, gen_avg_param) save_checkpoint( { 'epoch': epoch + 1, 'gen_model': args.gen_model, 'dis_model': args.dis_model, 'gen_state_dict': gen_net.state_dict(), 'dis_state_dict': dis_net.state_dict(), 'avg_gen_state_dict': avg_gen_net.state_dict(), 'gen_optimizer': gen_optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict(), 'best_fid': best_fid, 'path_helper': args.path_helper }, is_best, args.path_helper['ckpt_path']) del avg_gen_net logger.info( 'best_is is {}+-{}@{} epoch, fid is {}, best_fid is {}@{}, is is {}+-{}' .format(best_is, best_std, best_is_epoch, fid_with_is, best_fid, best_fid_epoch, is_with_fid, std_with_fid))
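# NOTE (sketch): `copy_params` and `load_params` are imported from the project's utils and are
# not shown in this excerpt. A minimal sketch of how such averaged-generator helpers are
# commonly written, assuming they simply snapshot and restore parameter tensors:
from copy import deepcopy


def copy_params(model):
    # snapshot a detached copy of every parameter tensor
    return deepcopy([p.data for p in model.parameters()])


def load_params(model, new_params):
    # copy the saved tensors back into the live parameters, in order
    for p, new_p in zip(model.parameters(), new_params):
        p.data.copy_(new_p)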
def main(): args = cfg.parse_args() random.seed(args.random_seed) torch.manual_seed(args.random_seed) torch.cuda.manual_seed(args.random_seed) np.random.seed(args.random_seed) torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = True os.environ['PYTHONHASHSEED'] = str(args.random_seed) # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # import network gen_net = eval('models.'+args.model+'.Generator')(args=args) dis_net = eval('models.'+args.model+'.Discriminator')(args=args) initial_gen_net_weight = torch.load(os.path.join(args.init_path, 'initial_gen_net.pth'), map_location="cpu") initial_dis_net_weight = torch.load(os.path.join(args.init_path, 'initial_dis_net.pth'), map_location="cpu") gen_net = gen_net.cuda() dis_net = dis_net.cuda() gen_net.load_state_dict(initial_gen_net_weight) dis_net.load_state_dict(initial_dis_net_weight) # set optimizer gen_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2)) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, args.max_iter * args.n_critic) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, args.max_iter * args.n_critic) # set up data_loader dataset = datasets.ImageDataset(args) train_loader = dataset.train # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' elif args.dataset.lower() == 'stl10': fid_stat = 'fid_stat/fid_stats_stl10_train.npz' else: raise NotImplementedError('no fid stat for %s' % args.dataset.lower()) assert os.path.exists(fid_stat) # epoch number for dis_net args.max_epoch = args.max_epoch * args.n_critic if args.max_iter: args.max_epoch = np.ceil(args.max_iter * args.n_critic / len(train_loader)) # initial fixed_z = torch.cuda.FloatTensor(np.random.normal(0, 1, (25, args.latent_dim))) gen_avg_param = copy_params(gen_net) start_epoch = 0 best_fid = 1e4 # set writer if args.load_path: print('=> resuming from %s' % args.load_path) assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path, 'Model', 'checkpoint.pth') assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) start_epoch = checkpoint['epoch'] best_fid = checkpoint['best_fid'] gen_net.load_state_dict(checkpoint['gen_state_dict']) dis_net.load_state_dict(checkpoint['dis_state_dict']) gen_optimizer.load_state_dict(checkpoint['gen_optimizer']) dis_optimizer.load_state_dict(checkpoint['dis_optimizer']) avg_gen_net = deepcopy(gen_net) avg_gen_net.load_state_dict(checkpoint['avg_gen_state_dict']) gen_avg_param = copy_params(avg_gen_net) del avg_gen_net args.path_helper = checkpoint['path_helper'] logger = create_logger(args.path_helper['log_path']) logger.info('=> loaded checkpoint %s (epoch %d)' % (checkpoint_file, start_epoch)) else: # create new log dir assert args.exp_name args.path_helper = set_log_dir('logs', args.exp_name) logger = create_logger(args.path_helper['log_path']) logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'train_global_steps': start_epoch * len(train_loader), 'valid_global_steps': start_epoch // args.val_freq, } # train loop switch = False for epoch in range(int(start_epoch), int(args.max_epoch)): lr_schedulers = (gen_scheduler, dis_scheduler) if args.lr_decay else None train(args, gen_net, dis_net, 
gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, lr_schedulers) if epoch and epoch % args.val_freq == 0 or epoch == int(args.max_epoch)-1: backup_param = copy_params(gen_net) load_params(gen_net, gen_avg_param) inception_score, fid_score = validate(args, fixed_z, fid_stat, gen_net, writer_dict, epoch) logger.info('Inception score: %.4f, FID score: %.4f || @ epoch %d.' % (inception_score, fid_score, epoch)) load_params(gen_net, backup_param) if fid_score < best_fid: best_fid = fid_score is_best = True else: is_best = False else: is_best = False avg_gen_net = deepcopy(gen_net) load_params(avg_gen_net, gen_avg_param) save_checkpoint({ 'epoch': epoch + 1, 'model': args.model, 'gen_state_dict': gen_net.state_dict(), 'dis_state_dict': dis_net.state_dict(), 'avg_gen_state_dict': avg_gen_net.state_dict(), 'gen_optimizer': gen_optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict(), 'best_fid': best_fid, 'path_helper': args.path_helper, 'seed': args.random_seed }, is_best, args.path_helper['ckpt_path']) del avg_gen_net
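# NOTE (sketch): `LinearLrDecay` is constructed with (optimizer, start_lr, end_lr,
# decay_start_step, decay_end_step) but its definition is not included here. A plausible
# sketch, assuming it holds the LR flat and then interpolates linearly between the two steps:
class LinearLrDecay:
    def __init__(self, optimizer, start_lr, end_lr, decay_start_step, decay_end_step):
        assert start_lr >= end_lr
        self.optimizer = optimizer
        self.start_lr, self.end_lr = start_lr, end_lr
        self.decay_start_step, self.decay_end_step = decay_start_step, decay_end_step
        self.delta = (start_lr - end_lr) / max(decay_end_step - decay_start_step, 1)

    def step(self, current_step):
        # constant LR before decay_start_step, linear decay to end_lr afterwards
        if current_step <= self.decay_start_step:
            lr = self.start_lr
        elif current_step >= self.decay_end_step:
            lr = self.end_lr
        else:
            lr = self.start_lr - self.delta * (current_step - self.decay_start_step)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        return lr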
def validate(args, fixed_z, fid_stat, gen_net: nn.Module, writer_dict): dataset = datasets.ImageDataset(args) train_loader = dataset.train gen_net = gen_net.eval() global_steps = writer_dict['valid_global_steps'] eval_iter = args.num_eval_imgs // args.eval_batch_size # compute IS IS_buffer_dir = os.path.join(args.path_helper['sample_path'], 'fid_buffer') os.makedirs(IS_buffer_dir, exist_ok=True) img_list = list() for iter_idx in tqdm(range(eval_iter), desc='sample images'): z = torch.cuda.FloatTensor( np.random.normal(0, 1, (args.eval_batch_size, args.latent_dim))) # Generate a batch of images gen_imgs = gen_net(z).mul_(127.5).add_(127.5).clamp_( 0.0, 255.0).permute(0, 2, 3, 1).to('cpu', torch.uint8).numpy() for img_idx, img in enumerate(gen_imgs): file_name = os.path.join(IS_buffer_dir, f'iter{iter_idx}_b{img_idx}.png') imsave(file_name, img) img_list.extend(list(gen_imgs)) inception_score, std = get_inception_score(img_list) print('------------------------Inception Score------------------------') print(inception_score) # compute FID sample_list = [] for i in range(eval_iter): z = torch.cuda.FloatTensor( np.random.normal(0, 1, (args.eval_batch_size, args.latent_dim))) samples = gen_net(z) sample_list.append(samples.data.cpu().numpy()) new_sample_list = list(chain.from_iterable(sample_list)) fake_image_np = np.concatenate([img[None] for img in new_sample_list], 0) real_image_np = [] for i, (images, _) in enumerate(train_loader): real_image_np += [images.data.numpy()] batch_size = real_image_np[0].shape[0] if len(real_image_np) * batch_size >= fake_image_np.shape[0]: break real_image_np = np.concatenate(real_image_np, 0)[:fake_image_np.shape[0]] fid_score = calculate_fid(real_image_np, fake_image_np, batch_size=300) var_fid = fid_score[0][2] fid = round(fid_score[0][1], 3) print('------------------------fid_score------------------------') print(fid_score) # Generate a batch of images sample_dir = os.path.join(args.path_helper['sample_path'], 'sample_dir') Path(sample_dir).mkdir(exist_ok=True) sample_imgs = gen_net(fixed_z).mul_(127.5).add_(127.5).clamp_(0.0, 255.0) img_grid = make_grid(sample_imgs, nrow=5).to('cpu', torch.uint8).numpy() file_name = os.path.join( sample_dir, f'final_fid_{fid}_inception_score{inception_score}.png') imsave(file_name, img_grid.swapaxes(0, 1).swapaxes(1, 2)) writer_dict['valid_global_steps'] = global_steps + 1 return inception_score, fid
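# NOTE (sketch): the fid_stat .npz files referenced throughout hold precomputed Inception
# statistics of the real data. Assuming the common TTUR/FID layout (the 'mu' and 'sigma' keys
# are an assumption here), such a file can be inspected like this:
import numpy as np

stats = np.load('fid_stat/fid_stats_cifar10_train.npz')
mu_real, sigma_real = stats['mu'], stats['sigma']
print(mu_real.shape, sigma_real.shape)  # typically (2048,) and (2048, 2048)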
milestones=[int(args.max_epoch / 2), int(args.max_epoch * 0.75)]) # set optimizer W_optimizer = torch.optim.Adam(W_lst, args.lr_w) W_scheduler = torch.optim.lr_scheduler.LambdaLR(W_optimizer, lr_lambda=LambdaLR( args.max_epoch, 0, args.decay_epoch).step) gamma_optimizer = torch.optim.SGD(gamma_lst, args.lr_gamma, momentum=0.5) gamma_scheduler = torch.optim.lr_scheduler.MultiStepLR( gamma_optimizer, milestones=[int(args.max_epoch / 2), int(args.max_epoch * 0.75)]) # set up data_loader dataset = datasets.ImageDataset(args) train_loader = dataset.train # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' else: raise NotImplementedError('no fid stat for %s' % args.dataset.lower()) assert os.path.exists(fid_stat) # initial fixed_z = torch.cuda.FloatTensor(np.random.normal(0, 1, (25, args.latent_dim))) start_epoch = 0 best_fid = 1e4 N = len(train_loader)
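# NOTE (sketch): the fragment above wires torch.optim.lr_scheduler.LambdaLR to a custom
# LambdaLR(...).step multiplier that is not defined in this excerpt. A common formulation
# (assumed here) keeps the factor at 1.0 until decay_epoch and then decays it linearly to 0:
class LambdaLR:
    def __init__(self, n_epochs, offset, decay_start_epoch):
        assert (n_epochs - decay_start_epoch) > 0, 'decay must start before training ends'
        self.n_epochs = n_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        # multiplier applied to the base LR: 1.0 until decay starts, then linear to 0
        return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (
            self.n_epochs - self.decay_start_epoch)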
def main(): args = cfg.parse_args() random.seed(args.random_seed) torch.manual_seed(args.random_seed) torch.cuda.manual_seed(args.random_seed) # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform(m.weight.data, 1.) else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net = Generator(bottom_width=args.bottom_width, gf_dim=args.gf_dim, latent_dim=args.latent_dim).cuda() dis_net = eval('models.' + args.model + '.Discriminator')(args=args).cuda() gen_net.apply(weights_init) dis_net.apply(weights_init) initial_gen_net_weight = torch.load(os.path.join(args.init_path, 'initial_gen_net.pth'), map_location="cpu") initial_dis_net_weight = torch.load(os.path.join(args.init_path, 'initial_dis_net.pth'), map_location="cpu") os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu exp_str = args.dir args.load_path = os.path.join('output', exp_str, 'pth', 'epoch{}.pth'.format(args.load_epoch)) # state dict: assert os.path.exists(args.load_path) checkpoint = torch.load(args.load_path) print('=> loaded checkpoint %s' % args.load_path) state_dict = checkpoint['generator'] gen_net = load_subnet(args, state_dict, initial_gen_net_weight).cuda() avg_gen_net = deepcopy(gen_net) # set optimizer gen_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2)) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, args.max_iter * args.n_critic) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, args.max_iter * args.n_critic) # set up data_loader dataset = datasets.ImageDataset(args) train_loader = dataset.train # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' else: raise NotImplementedError('no fid stat for %s' % args.dataset.lower()) assert os.path.exists(fid_stat) # epoch number for dis_net args.max_epoch = args.max_epoch * args.n_critic if args.max_iter: args.max_epoch = np.ceil(args.max_iter * args.n_critic / len(train_loader)) # initial np.random.seed(args.random_seed) fixed_z = torch.cuda.FloatTensor( np.random.normal(0, 1, (25, args.latent_dim))) start_epoch = 0 best_fid = 1e4 args.path_helper = set_log_dir('logs', args.exp_name) logger = create_logger(args.path_helper['log_path']) #logger.info('=> loaded checkpoint %s (epoch %d)' % (checkpoint_file, start_epoch)) logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'train_global_steps': start_epoch * len(train_loader), 'valid_global_steps': start_epoch // args.val_freq, } gen_avg_param = copy_params(gen_net) # train loop for epoch in tqdm(range(int(start_epoch), int(args.max_epoch)), desc='total progress'): lr_schedulers = (gen_scheduler, dis_scheduler) if args.lr_decay else None train(args, gen_net, dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, lr_schedulers) if epoch and epoch % args.val_freq == 0 or epoch == int( 
args.max_epoch) - 1: backup_param = copy_params(gen_net) load_params(gen_net, gen_avg_param) inception_score, fid_score = validate(args, fixed_z, fid_stat, gen_net, writer_dict) logger.info( 'Inception score: %.4f, FID score: %.4f || @ epoch %d.' % (inception_score, fid_score, epoch)) load_params(gen_net, backup_param) if fid_score < best_fid: best_fid = fid_score is_best = True else: is_best = False else: is_best = False avg_gen_net.load_state_dict(gen_net.state_dict()) load_params(avg_gen_net, gen_avg_param) save_checkpoint( { 'epoch': epoch + 1, 'model': args.model, 'gen_state_dict': gen_net.state_dict(), 'dis_state_dict': dis_net.state_dict(), 'avg_gen_state_dict': avg_gen_net.state_dict(), 'gen_optimizer': gen_optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict(), 'best_fid': best_fid, 'path_helper': args.path_helper }, is_best, args.path_helper['ckpt_path'])
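# NOTE (sketch): `save_checkpoint(states, is_best, output_dir)` is used by every training loop
# in this section but defined elsewhere. A minimal sketch of the usual behavior (write the
# latest state, copy it to a "best" file when is_best is set); the file names are assumptions:
import os
import shutil
import torch


def save_checkpoint(states, is_best, output_dir, filename='checkpoint.pth'):
    ckpt_path = os.path.join(output_dir, filename)
    torch.save(states, ckpt_path)
    if is_best:
        shutil.copyfile(ckpt_path, os.path.join(output_dir, 'checkpoint_best.pth'))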
def main(): args = cfg.parse_args() random.seed(args.random_seed) torch.manual_seed(args.random_seed) torch.cuda.manual_seed(args.random_seed) # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # weight init gen_net = eval('models.' + args.model + '.Generator')(args=args) dis_net = eval('models.' + args.model + '.Discriminator')(args=args) # weight init def weights_init(m): if isinstance(m, nn.Conv2d): if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform(m.weight.data, 1.) else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif isinstance(m, nn.BatchNorm2d): nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net.apply(weights_init) dis_net.apply(weights_init) gen_net = gen_net.cuda() dis_net = dis_net.cuda() avg_gen_net = deepcopy(gen_net) initial_gen_net_weight = deepcopy(gen_net.state_dict()) initial_dis_net_weight = deepcopy(dis_net.state_dict()) assert id(initial_dis_net_weight) != id(dis_net.state_dict()) assert id(initial_gen_net_weight) != id(gen_net.state_dict()) # set optimizer gen_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2)) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, args.max_iter * args.n_critic) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, args.max_iter * args.n_critic) # set up data_loader dataset = datasets.ImageDataset(args) train_loader = dataset.train # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' elif args.dataset.lower() == 'stl10': fid_stat = 'fid_stat/fid_stats_stl10_train.npz' else: raise NotImplementedError('no fid stat for %s' % args.dataset.lower()) assert os.path.exists(fid_stat) # epoch number for dis_net args.max_epoch = args.max_epoch * args.n_critic if args.max_iter: args.max_epoch = np.ceil(args.max_iter * args.n_critic / len(train_loader)) # initial np.random.seed(args.random_seed) fixed_z = torch.cuda.FloatTensor( np.random.normal(0, 1, (25, args.latent_dim))) start_epoch = 0 best_fid = 1e4 args.path_helper = set_log_dir('logs', args.exp_name + "_{}".format(args.percent)) logger = create_logger(args.path_helper['log_path']) # logger.info('=> loaded checkpoint %s (epoch %d)' % (checkpoint_file, start_epoch)) logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'train_global_steps': start_epoch * len(train_loader), 'valid_global_steps': start_epoch // args.val_freq, } print('=> resuming from %s' % args.load_path) assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path, 'Model', 'checkpoint.pth') assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) gen_net.load_state_dict(checkpoint['gen_state_dict']) torch.manual_seed(args.random_seed) pruning_generate(gen_net, (1 - args.percent), args.pruning_method) torch.manual_seed(args.random_seed) pruning_generate(avg_gen_net, (1 - args.percent), args.pruning_method) see_remain_rate(gen_net) if args.second_seed: dis_net.apply(weights_init) if args.finetune_D: dis_net.load_state_dict(checkpoint['dis_state_dict']) else: 
dis_net.load_state_dict(initial_dis_net_weight) gen_weight = gen_net.state_dict() gen_orig_weight = rewind_weight(initial_gen_net_weight, gen_weight.keys()) assert id(gen_weight) != id(gen_orig_weight) gen_weight.update(gen_orig_weight) gen_net.load_state_dict(gen_weight) gen_avg_param = copy_params(gen_net) if args.use_kd_D: orig_dis_net = eval('models.' + args.model + '.Discriminator')(args=args).cuda() orig_dis_net.load_state_dict(checkpoint['dis_state_dict']) orig_dis_net.eval() # train loop for epoch in tqdm(range(int(start_epoch), int(args.max_epoch)), desc='total progress'): lr_schedulers = (gen_scheduler, dis_scheduler) if args.lr_decay else None see_remain_rate(gen_net) if not args.use_kd_D: train(args, gen_net, dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, lr_schedulers) else: train_kd(args, gen_net, dis_net, orig_dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, lr_schedulers) if epoch and epoch % args.val_freq == 0 or epoch == int( args.max_epoch) - 1: backup_param = copy_params(gen_net) load_params(gen_net, gen_avg_param) inception_score, fid_score = validate(args, fixed_z, fid_stat, gen_net, writer_dict, epoch) logger.info( 'Inception score: %.4f, FID score: %.4f || @ epoch %d.' % (inception_score, fid_score, epoch)) load_params(gen_net, backup_param) if fid_score < best_fid: best_fid = fid_score is_best = True else: is_best = False else: is_best = False avg_gen_net.load_state_dict(gen_net.state_dict()) load_params(avg_gen_net, gen_avg_param) save_checkpoint( { 'epoch': epoch + 1, 'model': args.model, 'gen_state_dict': gen_net.state_dict(), 'dis_state_dict': dis_net.state_dict(), 'avg_gen_state_dict': avg_gen_net.state_dict(), 'gen_optimizer': gen_optimizer.state_dict(), 'dis_optimizer': dis_optimizer.state_dict(), 'best_fid': best_fid, 'path_helper': args.path_helper }, is_best, args.path_helper['ckpt_path'])
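# NOTE (sketch): `pruning_generate`, `see_remain_rate`, and `rewind_weight` implement the
# lottery-ticket workflow used above but are imported from elsewhere. A rough sketch of what
# magnitude pruning plus a sparsity report could look like with torch.nn.utils.prune; the
# function names and the restriction to Conv2d layers are assumptions:
import torch.nn as nn
import torch.nn.utils.prune as prune


def pruning_generate(model, amount, method='l1'):
    # attach a pruning mask that zeroes `amount` of each Conv2d weight tensor
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            if method == 'l1':
                prune.l1_unstructured(m, name='weight', amount=amount)
            else:
                prune.random_unstructured(m, name='weight', amount=amount)


def see_remain_rate(model):
    # report the fraction of convolution weights that survived pruning
    total = nonzero = 0
    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            total += m.weight.nelement()
            nonzero += (m.weight != 0).sum().item()
    print('remaining weights: {:.2f}%'.format(100.0 * nonzero / max(total, 1)))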
def main(): args = cfg.parse_args() torch.cuda.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) np.random.seed(args.random_seed) random.seed(args.random_seed) torch.backends.cudnn.deterministic = True # set tf env _init_inception() inception_path = check_or_download_inception(None) create_inception_graph(inception_path) # epoch number for dis_net dataset = datasets.ImageDataset(args, cur_img_size=8) train_loader = dataset.train if args.max_iter: args.max_epoch = np.ceil(args.max_iter / len(train_loader)) else: args.max_iter = args.max_epoch * len(train_loader) args.max_epoch = args.max_epoch * args.n_critic # import network gen_net = eval('models.' + args.gen_model + '.Generator')(args=args).cuda() dis_net = eval('models.' + args.dis_model + '.Discriminator')(args=args).cuda() gen_net.set_arch(args.arch, cur_stage=2) # weight init def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform_(m.weight.data, 1.) else: raise NotImplementedError('{} unknown inital type'.format( args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net.apply(weights_init) dis_net.apply(weights_init) gpu_ids = [i for i in range(int(torch.cuda.device_count()))] gen_net = torch.nn.DataParallel(gen_net.to("cuda:0"), device_ids=gpu_ids) dis_net = torch.nn.DataParallel(dis_net.to("cuda:0"), device_ids=gpu_ids) gen_net.module.cur_stage = 0 dis_net.module.cur_stage = 0 gen_net.module.alpha = 1. dis_net.module.alpha = 1. # set optimizer if args.optimizer == "adam": gen_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam( filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2)) elif args.optimizer == "adamw": gen_optimizer = AdamW(filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, weight_decay=args.wd) dis_optimizer = AdamW(filter(lambda p: p.requires_grad, dis_net.parameters()), args.g_lr, weight_decay=args.wd) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, args.max_iter * args.n_critic) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, args.max_iter * args.n_critic) # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' elif args.dataset.lower() == 'stl10': fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz' elif args.fid_stat is not None: fid_stat = args.fid_stat else: raise NotImplementedError(f'no fid stat for {args.dataset.lower()}') assert os.path.exists(fid_stat) # initial fixed_z = torch.cuda.FloatTensor( np.random.normal(0, 1, (64, args.latent_dim))) gen_avg_param = copy_params(gen_net) start_epoch = 0 best_fid = 1e4 # set writer if args.load_path: print(f'=> resuming from {args.load_path}') assert os.path.exists(args.load_path) checkpoint_file = os.path.join(args.load_path) assert os.path.exists(checkpoint_file) checkpoint = torch.load(checkpoint_file) start_epoch = checkpoint['epoch'] best_fid = checkpoint['best_fid'] gen_net.load_state_dict(checkpoint['gen_state_dict']) dis_net.load_state_dict(checkpoint['dis_state_dict']) gen_optimizer.load_state_dict(checkpoint['gen_optimizer']) 
dis_optimizer.load_state_dict(checkpoint['dis_optimizer']) # avg_gen_net = deepcopy(gen_net) # avg_gen_net.load_state_dict(checkpoint['avg_gen_state_dict']) gen_avg_param = checkpoint['gen_avg_param'] # del avg_gen_net cur_stage = cur_stages(start_epoch, args) gen_net.module.cur_stage = cur_stage dis_net.module.cur_stage = cur_stage gen_net.module.alpha = 1. dis_net.module.alpha = 1. args.path_helper = checkpoint['path_helper'] else: # create new log dir assert args.exp_name args.path_helper = set_log_dir('logs', args.exp_name) logger = create_logger(args.path_helper['log_path']) logger.info(args) writer_dict = { 'writer': SummaryWriter(args.path_helper['log_path']), 'train_global_steps': start_epoch * len(train_loader), 'valid_global_steps': start_epoch // args.val_freq, } def return_states(): states = {} states['epoch'] = epoch states['best_fid'] = best_fid states['gen_state_dict'] = gen_net.state_dict() states['dis_state_dict'] = dis_net.state_dict() states['gen_optimizer'] = gen_optimizer.state_dict() states['dis_optimizer'] = dis_optimizer.state_dict() states['gen_avg_param'] = gen_avg_param states['path_helper'] = args.path_helper return states # train loop for epoch in range(start_epoch + 1, int(args.max_epoch)): train( args, gen_net, dis_net, gen_optimizer, dis_optimizer, gen_avg_param, train_loader, epoch, writer_dict, fixed_z, ) backup_param = copy_params(gen_net) load_params(gen_net, gen_avg_param) fid_score = validate( args, fixed_z, fid_stat, epoch, gen_net, writer_dict, ) logger.info(f'FID score: {fid_score} || @ epoch {epoch}.') load_params(gen_net, backup_param) is_best = False if epoch == 1 or fid_score < best_fid: best_fid = fid_score is_best = True if is_best or epoch % 1 == 0: states = return_states() save_checkpoint(states, is_best, args.path_helper['ckpt_path'], filename=f'checkpoint_epoch_{epoch}.pth')
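# NOTE (sketch): when resuming, `cur_stages(start_epoch, args)` recovers the progressive-growing
# stage the run had reached. Its definition is not included; one plausible sketch, where
# `args.grow_steps` (a sorted list of stage-boundary epochs) is a hypothetical attribute:
def cur_stages(epoch, args):
    # count how many growth boundaries the given epoch has already passed
    stage = 0
    for boundary in getattr(args, 'grow_steps', []):
        if epoch >= boundary:
            stage += 1
    return stage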
def main(): args = cfg.parse_args() torch.cuda.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) np.random.seed(args.random_seed) random.seed(args.random_seed) torch.backends.cudnn.deterministic = True # import network # args.gen_model is TransGAN_8_8_1 for example gen_net = eval('models.'+args.gen_model+'.Generator')(args=args).cuda() dis_net = eval('models.'+args.dis_model+'.Discriminator')(args=args).cuda() gen_net.set_arch(args.arch, cur_stage=2) # weight init: Xavier Uniform def weights_init(m): classname = m.__class__.__name__ if classname.find('Conv2d') != -1: if args.init_type == 'normal': nn.init.normal_(m.weight.data, 0.0, 0.02) elif args.init_type == 'orth': nn.init.orthogonal_(m.weight.data) elif args.init_type == 'xavier_uniform': nn.init.xavier_uniform_(m.weight.data, 1.) else: raise NotImplementedError('{} unknown initial type'.format(args.init_type)) elif classname.find('BatchNorm2d') != -1: nn.init.normal_(m.weight.data, 1.0, 0.02) nn.init.constant_(m.bias.data, 0.0) gen_net.apply(weights_init) dis_net.apply(weights_init) gpu_ids = [i for i in range(int(torch.cuda.device_count()))] gen_net = torch.nn.DataParallel(gen_net.to("cuda:0"), device_ids=gpu_ids) dis_net = torch.nn.DataParallel(dis_net.to("cuda:0"), device_ids=gpu_ids) # print(gen_net.module.cur_stage) if args.optimizer == "adam": gen_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, (args.beta1, args.beta2)) dis_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, dis_net.parameters()), args.d_lr, (args.beta1, args.beta2)) elif args.optimizer == "adamw": gen_optimizer = AdamW(filter(lambda p: p.requires_grad, gen_net.parameters()), args.g_lr, weight_decay=args.wd) dis_optimizer = AdamW(filter(lambda p: p.requires_grad, dis_net.parameters()), args.g_lr, weight_decay=args.wd) gen_scheduler = LinearLrDecay(gen_optimizer, args.g_lr, 0.0, 0, args.max_iter * args.n_critic) dis_scheduler = LinearLrDecay(dis_optimizer, args.d_lr, 0.0, 0, args.max_iter * args.n_critic) # fid stat if args.dataset.lower() == 'cifar10': fid_stat = 'fid_stat/fid_stats_cifar10_train.npz' elif args.dataset.lower() == 'stl10': fid_stat = 'fid_stat/stl10_train_unlabeled_fid_stats_48.npz' elif args.fid_stat is not None: fid_stat = args.fid_stat else: raise NotImplementedError(f'no fid stat for {args.dataset.lower()}') assert os.path.exists(fid_stat) dataset = datasets.ImageDataset(args, cur_img_size=8) train_loader = dataset.train writer=SummaryWriter() writer_dict = {'writer':writer} writer_dict["train_global_steps"]=0 writer_dict["valid_global_steps"]=0 best = 1e4 for epoch in range(args.max_epoch): train(args, gen_net = gen_net, dis_net = dis_net, gen_optimizer = gen_optimizer, dis_optimizer = dis_optimizer, gen_avg_param = None, train_loader = train_loader, epoch = epoch, writer_dict = writer_dict, fixed_z = None, schedulers=[gen_scheduler, dis_scheduler]) checkpoint = {'epoch':epoch, 'best_fid':best} checkpoint['gen_state_dict'] = gen_net.state_dict() checkpoint['dis_state_dict'] = dis_net.state_dict() score = validate(args, None, fid_stat, epoch, gen_net, writer_dict, clean_dir=True) # log the latest scores print(f'FID score: {score} - best FID score: {best} || @ epoch {epoch}.') if epoch == 0 or epoch > 30: if score < best: save_checkpoint(checkpoint, is_best=(score<best), output_dir=args.output_dir) print("Saved Latest Model!") best = score checkpoint = {'epoch':epoch, 'best_fid':best} checkpoint['gen_state_dict'] =
gen_net.state_dict() checkpoint['dis_state_dict'] = dis_net.state_dict() score = validate(args, None, fid_stat, epoch, gen_net, writer_dict, clean_dir=True) save_checkpoint(checkpoint, is_best=(score<best), output_dir=args.output_dir)