def __init__(self, encoder, decoder, start_iter):
    super(Net, self).__init__()
    vgg = encoder
    # self.enc_0 = nn.Sequential(*list(vgg.children())[:1])
    # enc_layers = list(encoder.children())
    self.enc_1 = nn.Sequential(*list(vgg.children())[:4])      # input -> relu1_1
    self.enc_2 = nn.Sequential(*list(vgg.children())[4:11])    # relu1_1 -> relu2_1
    self.enc_3 = nn.Sequential(*list(vgg.children())[11:18])   # relu2_1 -> relu3_1
    self.enc_4 = nn.Sequential(*list(vgg.children())[18:31])   # relu3_1 -> relu4_1
    self.enc_5 = nn.Sequential(*list(vgg.children())[31:44])   # relu4_1 -> relu5_1
    # transform
    self.transform = Transform(in_planes=512)
    self.decoder = decoder
    if start_iter > 0:
        self.transform.load_state_dict(torch.load('weight/transformer_iter_' + str(start_iter) + '.pth'))
        self.decoder.load_state_dict(torch.load('weight/decoder_iter_' + str(start_iter) + '.pth'))
    self.mse_loss = nn.MSELoss()
    self.variation_loss = nn.L1Loss()
    # fix the encoder
    for name in ['enc_1', 'enc_2', 'enc_3', 'enc_4', 'enc_5']:
        for param in getattr(self, name).parameters():
            param.requires_grad = False
    self.dx_bias = np.zeros([256, 256])
    self.dy_bias = np.zeros([256, 256])
    for i in range(256):
        self.dx_bias[:, i] = i  # column index as the x-direction bias
        self.dy_bias[i, :] = i  # row index as the y-direction bias
parser.add_argument('--vgg', type=str, default='weight/vgg_normalised.pth')
parser.add_argument('--decoder', type=str, default='experiments4/decoder_iter_600000.pth')
parser.add_argument('--transform', type=str, default='experiments4/transformer_iter_600000.pth')

# Additional options
parser.add_argument('--save_ext', default='output+',
                    help='The extension name of the output video')
parser.add_argument('--output', type=str, default='output',
                    help='Directory to save the output image(s)')

# Advanced options
args = parser.parse_args('')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if not os.path.exists(args.output):
    os.mkdir(args.output)

decoder = Decoder('Decoder')
transform = Transform(in_planes=512)
vgg = VGG('VGG19')

decoder.eval()
transform.eval()
vgg.eval()

# decoder.features.load_state_dict(torch.load(args.decoder))
decoder.load_state_dict(torch.load(args.decoder))
transform.load_state_dict(torch.load(args.transform))
vgg.features.load_state_dict(torch.load(args.vgg))

enc_1 = nn.Sequential(*list(vgg.features.children())[:4])      # input -> relu1_1
enc_2 = nn.Sequential(*list(vgg.features.children())[4:11])    # relu1_1 -> relu2_1
enc_3 = nn.Sequential(*list(vgg.features.children())[11:18])   # relu2_1 -> relu3_1
enc_4 = nn.Sequential(*list(vgg.features.children())[18:31])   # relu3_1 -> relu4_1
class Net(nn.Module):
    def __init__(self, encoder, decoder, start_iter):
        super(Net, self).__init__()
        vgg = encoder
        # self.enc_0 = nn.Sequential(*list(vgg.children())[:1])
        # enc_layers = list(encoder.children())
        self.enc_1 = nn.Sequential(*list(vgg.children())[:4])      # input -> relu1_1
        self.enc_2 = nn.Sequential(*list(vgg.children())[4:11])    # relu1_1 -> relu2_1
        self.enc_3 = nn.Sequential(*list(vgg.children())[11:18])   # relu2_1 -> relu3_1
        self.enc_4 = nn.Sequential(*list(vgg.children())[18:31])   # relu3_1 -> relu4_1
        self.enc_5 = nn.Sequential(*list(vgg.children())[31:44])   # relu4_1 -> relu5_1
        # transform
        self.transform = Transform(in_planes=512)
        self.GNN = CoattentionModel(all_channel=512)
        self.GNN_2 = CoattentionModel(all_channel=512)
        self.decoder = decoder
        if start_iter > 0:
            self.transform.load_state_dict(torch.load(
                '/home/lwq/sdb1/xiaoxin/code/SANT_weight/transformer_iter_' + str(start_iter) + '.pth'))
            self.decoder.load_state_dict(torch.load(
                '/home/lwq/sdb1/xiaoxin/code/SANT_weight/decoder_iter_' + str(start_iter) + '.pth'))
        self.mse_loss = nn.MSELoss()
        self.variation_loss = nn.L1Loss()
        # fix the encoder
        for name in ['enc_1', 'enc_2', 'enc_3', 'enc_4', 'enc_5']:
            for param in getattr(self, name).parameters():
                param.requires_grad = False
        self.dx_bias = np.zeros([256, 256])
        self.dy_bias = np.zeros([256, 256])
        for i in range(256):
            self.dx_bias[:, i] = i  # column index as the x-direction bias
            self.dy_bias[i, :] = i  # row index as the y-direction bias

    # extract relu1_1, relu2_1, relu3_1, relu4_1, relu5_1 from input image
    def encode_with_intermediate(self, input):
        results = [input]
        for i in range(5):
            func = getattr(self, 'enc_{:d}'.format(i + 1))
            results.append(func(results[-1]))
        return results[1:]

    def calc_content_loss(self, input, target, norm=False):
        if not norm:
            return self.mse_loss(input, target)
        return self.mse_loss(mean_variance_norm(input), mean_variance_norm(target))

    def calc_style_loss(self, input, target):
        input_mean, input_std = calc_mean_std(input)
        target_mean, target_std = calc_mean_std(target)
        return self.mse_loss(input_mean, target_mean) + \
            self.mse_loss(input_std, target_std)

    def calc_temporal_loss(self, x1, x2):
        h = x1.shape[2]
        w = x1.shape[3]
        D = h * w  # kept from the original; not used in the returned loss
        return self.mse_loss(x1, x2)

    def compute_total_variation_loss_l1(self, inputs):
        h = inputs.shape[2]
        w = inputs.shape[3]
        h1 = inputs[:, :, 0:h - 1, :]
        h2 = inputs[:, :, 1:h, :]
        w1 = inputs[:, :, :, 0:w - 1]
        w2 = inputs[:, :, :, 1:w]
        return self.variation_loss(h1, h2) + self.variation_loss(w1, w2)

    def forward(self, content1, content2, content3, style):
        # feature extraction
        style_feats = self.encode_with_intermediate(style)
        content1_feats = self.encode_with_intermediate(content1)
        content2_feats = self.encode_with_intermediate(content2)
        content3_feats = self.encode_with_intermediate(content3)
        gcontent1_feats, gcontent2_feats, gcontent3_feats = self.GNN(
            content1_feats[3], content2_feats[3], content3_feats[3])
        ggcontent1_feats, ggcontent2_feats, ggcontent3_feats = self.GNN_2(
            content1_feats[4], content2_feats[4], content3_feats[4])
        # feature fusion & propagation
        stylized_1 = self.transform(gcontent1_feats, style_feats[3], ggcontent1_feats, style_feats[4])
        stylized_2 = self.transform(gcontent2_feats, style_feats[3], ggcontent2_feats, style_feats[4])
        stylized_3 = self.transform(gcontent3_feats, style_feats[3], ggcontent3_feats, style_feats[4])
        stylized = torch.cat((stylized_1, stylized_2, stylized_3), 0)
        content_feats_l3 = torch.cat((content1_feats[3], content2_feats[3], content3_feats[3]), 0)
        content_feats_l4 = torch.cat((content1_feats[4], content2_feats[4], content3_feats[4]), 0)
        # decoder
        g_t = self.decoder(stylized)
        # compute losses
        g_t_feats = self.encode_with_intermediate(g_t)
        loss_c = self.calc_content_loss(g_t_feats[3], content_feats_l3, norm=True) + \
            self.calc_content_loss(g_t_feats[4], content_feats_l4, norm=True)
        style_feats[0] = torch.cat((style_feats[0], style_feats[0], style_feats[0]), 0)
        loss_s = self.calc_style_loss(g_t_feats[0], style_feats[0])
        for i in range(1, 5):
            style_feats[i] = torch.cat((style_feats[i], style_feats[i], style_feats[i]), 0)
            loss_s += self.calc_style_loss(g_t_feats[i], style_feats[i])
        return loss_c, loss_s
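# --- Illustrative training-step sketch (not part of the original code). It assumes the
# three-frame Net defined above, with `vgg` and `decoder` built as in the other snippets;
# the dummy input tensors, loss weights, and learning rate below are placeholder assumptions.
import torch

def example_multi_frame_step(net, device='cpu'):
    # three consecutive content frames plus one style image, batch size 1, 256x256 RGB
    content1 = torch.rand(1, 3, 256, 256, device=device)
    content2 = torch.rand(1, 3, 256, 256, device=device)
    content3 = torch.rand(1, 3, 256, 256, device=device)
    style = torch.rand(1, 3, 256, 256, device=device)

    # only the trainable parts (decoder, transform, co-attention GNNs) are optimized;
    # the VGG encoder is frozen in Net.__init__
    trainable = [p for p in net.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=1e-4)

    loss_c, loss_s = net(content1, content2, content3, style)
    loss = 1.0 * loss_c + 3.0 * loss_s  # placeholder weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()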
class Net(nn.Module):
    def __init__(self, encoder, decoder, start_iter):
        super(Net, self).__init__()
        vgg = encoder
        # self.enc_0 = nn.Sequential(*list(vgg.children())[:1])
        # enc_layers = list(encoder.children())
        self.enc_1 = nn.Sequential(*list(vgg.children())[:4])      # input -> relu1_1
        self.enc_2 = nn.Sequential(*list(vgg.children())[4:11])    # relu1_1 -> relu2_1
        self.enc_3 = nn.Sequential(*list(vgg.children())[11:18])   # relu2_1 -> relu3_1
        self.enc_4 = nn.Sequential(*list(vgg.children())[18:31])   # relu3_1 -> relu4_1
        self.enc_5 = nn.Sequential(*list(vgg.children())[31:44])   # relu4_1 -> relu5_1
        # transform
        self.transform = Transform(in_planes=512)
        self.decoder = decoder
        if start_iter > 0:
            self.transform.load_state_dict(torch.load('weight/transformer_iter_' + str(start_iter) + '.pth'))
            self.decoder.load_state_dict(torch.load('weight/decoder_iter_' + str(start_iter) + '.pth'))
        self.mse_loss = nn.MSELoss()
        self.variation_loss = nn.L1Loss()
        # fix the encoder
        for name in ['enc_1', 'enc_2', 'enc_3', 'enc_4', 'enc_5']:
            for param in getattr(self, name).parameters():
                param.requires_grad = False
        self.dx_bias = np.zeros([256, 256])
        self.dy_bias = np.zeros([256, 256])
        for i in range(256):
            self.dx_bias[:, i] = i  # column index as the x-direction bias
            self.dy_bias[i, :] = i  # row index as the y-direction bias

    # extract relu1_1, relu2_1, relu3_1, relu4_1, relu5_1 from input image
    def encode_with_intermediate(self, input):
        results = [input]
        for i in range(5):
            func = getattr(self, 'enc_{:d}'.format(i + 1))
            results.append(func(results[-1]))
        return results[1:]

    def calc_content_loss(self, input, target, norm=False):
        if not norm:
            return self.mse_loss(input, target)
        return self.mse_loss(mean_variance_norm(input), mean_variance_norm(target))

    def calc_style_loss(self, input, target):
        input_mean, input_std = calc_mean_std(input)
        target_mean, target_std = calc_mean_std(target)
        return self.mse_loss(input_mean, target_mean) + \
            self.mse_loss(input_std, target_std)

    def calc_temporal_loss(self, x1, x2):
        h = x1.shape[2]
        w = x1.shape[3]
        D = h * w  # kept from the original; not used in the returned loss
        return self.mse_loss(x1, x2)

    def compute_total_variation_loss_l1(self, inputs):
        h = inputs.shape[2]
        w = inputs.shape[3]
        h1 = inputs[:, :, 0:h - 1, :]
        h2 = inputs[:, :, 1:h, :]
        w1 = inputs[:, :, :, 0:w - 1]
        w2 = inputs[:, :, :, 1:w]
        return self.variation_loss(h1, h2) + self.variation_loss(w1, w2)

    def forward(self, content, style, content2=None, video=False):
        style_feats = self.encode_with_intermediate(style)
        content_feats = self.encode_with_intermediate(content)
        stylized = self.transform(content_feats[3], style_feats[3], content_feats[4], style_feats[4])
        g_t = self.decoder(stylized)
        loss_v = self.compute_total_variation_loss_l1(g_t)
        g_t_feats = self.encode_with_intermediate(g_t)
        loss_c = self.calc_content_loss(g_t_feats[3], content_feats[3], norm=True) + \
            self.calc_content_loss(g_t_feats[4], content_feats[4], norm=True)
        loss_s = self.calc_style_loss(g_t_feats[0], style_feats[0])
        for i in range(1, 5):
            loss_s += self.calc_style_loss(g_t_feats[i], style_feats[i])

        # identity losses
        Icc = self.decoder(self.transform(content_feats[3], content_feats[3], content_feats[4], content_feats[4]))
        Iss = self.decoder(self.transform(style_feats[3], style_feats[3], style_feats[4], style_feats[4]))
        l_identity1 = self.calc_content_loss(Icc, content) + self.calc_content_loss(Iss, style)
        Fcc = self.encode_with_intermediate(Icc)
        Fss = self.encode_with_intermediate(Iss)
        l_identity2 = self.calc_content_loss(Fcc[0], content_feats[0]) + self.calc_content_loss(Fss[0], style_feats[0])
        for i in range(1, 5):
            l_identity2 += self.calc_content_loss(Fcc[i], content_feats[i]) + \
                self.calc_content_loss(Fss[i], style_feats[i])

        if not video:
            return loss_c, loss_s, l_identity1, l_identity2, loss_v

        # second content frame for the temporal loss
        content_feats2 = self.encode_with_intermediate(content2)
        stylized2 = self.transform(content_feats2[3], style_feats[3], content_feats2[4], style_feats[4])
        g_t2 = self.decoder(stylized2)
        g_t2_feats = self.encode_with_intermediate(g_t2)
        temporal_loss = self.calc_temporal_loss(g_t_feats[0], g_t2_feats[0])
        return loss_c, loss_s, l_identity1, l_identity2, temporal_loss, loss_v
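# --- Illustrative sketch (not part of the original code) of how the losses returned by this
# Net's forward pass might be combined when training with video=True. The term weights and the
# caller-supplied `optimizer`, `content`, `style`, and `content_next` tensors are assumptions.
def example_video_step(net, optimizer, content, style, content_next):
    # forward with a second frame so the temporal loss is computed
    loss_c, loss_s, l_id1, l_id2, loss_temporal, loss_tv = net(
        content, style, content2=content_next, video=True)
    # placeholder weights for each term
    loss = (1.0 * loss_c + 3.0 * loss_s + 50.0 * l_id1 + 1.0 * l_id2
            + 10.0 * loss_temporal + 1.0 * loss_tv)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss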
def train(args):
    # get context
    ctx = get_extension_context(args.context)
    comm = C.MultiProcessDataParalellCommunicator(ctx)
    comm.init()
    n_devices = comm.size
    mpi_rank = comm.rank
    device_id = mpi_rank
    ctx.device_id = str(device_id)
    nn.set_default_context(ctx)

    config = read_yaml(args.config)

    if args.info:
        config.monitor_params.info = args.info

    if comm.size == 1:
        comm = None
    else:
        # disable outputs from logger except its rank = 0
        if comm.rank > 0:
            import logging
            logger.setLevel(logging.ERROR)

    test = False
    train_params = config.train_params
    dataset_params = config.dataset_params
    model_params = config.model_params

    loss_flags = get_loss_flags(train_params)

    start_epoch = 0
    rng = np.random.RandomState(device_id)

    data_iterator = frame_data_iterator(
        root_dir=dataset_params.root_dir,
        frame_shape=dataset_params.frame_shape,
        id_sampling=dataset_params.id_sampling,
        is_train=True,
        random_seed=rng,
        augmentation_params=dataset_params.augmentation_params,
        batch_size=train_params['batch_size'],
        shuffle=True,
        with_memory_cache=False,
        with_file_cache=False)

    if n_devices > 1:
        data_iterator = data_iterator.slice(rng=rng, num_of_slices=comm.size, slice_pos=comm.rank)
        # workaround not to use memory cache
        data_iterator._data_source._on_memory = False
        logger.info("Disabled on memory data cache.")

    bs, h, w, c = [train_params.batch_size] + dataset_params.frame_shape
    source = nn.Variable((bs, c, h, w))
    driving = nn.Variable((bs, c, h, w))

    with nn.parameter_scope("kp_detector"):
        # kp_X = {"value": Variable((bs, 10, 2)), "jacobian": Variable((bs, 10, 2, 2))}
        kp_source = detect_keypoint(source,
                                    **model_params.kp_detector_params,
                                    **model_params.common_params,
                                    test=test, comm=comm)
        persistent_all(kp_source)

        kp_driving = detect_keypoint(driving,
                                     **model_params.kp_detector_params,
                                     **model_params.common_params,
                                     test=test, comm=comm)
        persistent_all(kp_driving)

    with nn.parameter_scope("generator"):
        generated = occlusion_aware_generator(source,
                                              kp_source=kp_source,
                                              kp_driving=kp_driving,
                                              **model_params.generator_params,
                                              **model_params.common_params,
                                              test=test, comm=comm)
    # generated is a dictionary containing:
    #   'mask': Variable((bs, num_kp+1, h/4, w/4)) when scale_factor=0.25
    #   'sparse_deformed': Variable((bs, num_kp+1, num_channel, h/4, w/4))
    #   'occlusion_map': Variable((bs, 1, h/4, w/4))
    #   'deformed': Variable((bs, c, h, w))
    #   'prediction': Variable((bs, c, h, w))  Only this is fed to the discriminator.
    generated["prediction"].persistent = True

    pyramide_real = get_image_pyramid(driving, train_params.scales,
                                      generated["prediction"].shape[1])
    persistent_all(pyramide_real)

    pyramide_fake = get_image_pyramid(generated['prediction'],
                                      train_params.scales,
                                      generated["prediction"].shape[1])
    persistent_all(pyramide_fake)

    total_loss_G = None  # dummy; defined temporarily
    loss_var_dict = {}

    # perceptual loss using VGG19 (always applied)
    if loss_flags.use_perceptual_loss:
        logger.info("Use Perceptual Loss.")
        scales = train_params.scales
        weights = train_params.loss_weights.perceptual
        vgg_param_path = train_params.vgg_param_path
        percep_loss = perceptual_loss(pyramide_real, pyramide_fake,
                                      scales, weights, vgg_param_path)
        percep_loss.persistent = True
        loss_var_dict['perceptual_loss'] = percep_loss
        total_loss_G = percep_loss

    # (LS)GAN loss and feature matching loss
    if loss_flags.use_gan_loss:
        logger.info("Use GAN Loss.")
        with nn.parameter_scope("discriminator"):
            discriminator_maps_generated = multiscale_discriminator(
                pyramide_fake,
                kp=unlink_all(kp_driving),
                **model_params.discriminator_params,
                **model_params.common_params,
                test=test, comm=comm)

            discriminator_maps_real = multiscale_discriminator(
                pyramide_real,
                kp=unlink_all(kp_driving),
                **model_params.discriminator_params,
                **model_params.common_params,
                test=test, comm=comm)

            for v in discriminator_maps_generated["feature_maps_1"]:
                v.persistent = True
            discriminator_maps_generated["prediction_map_1"].persistent = True

            for v in discriminator_maps_real["feature_maps_1"]:
                v.persistent = True
            discriminator_maps_real["prediction_map_1"].persistent = True

        for i, scale in enumerate(model_params.discriminator_params.scales):
            key = f'prediction_map_{scale}'.replace('.', '-')
            lsgan_loss_weight = train_params.loss_weights.generator_gan

            # LSGAN loss for Generator
            if i == 0:
                gan_loss_gen = lsgan_loss(discriminator_maps_generated[key],
                                          lsgan_loss_weight)
            else:
                gan_loss_gen += lsgan_loss(discriminator_maps_generated[key],
                                           lsgan_loss_weight)

            # LSGAN loss for Discriminator
            if i == 0:
                gan_loss_dis = lsgan_loss(discriminator_maps_real[key],
                                          lsgan_loss_weight,
                                          discriminator_maps_generated[key])
            else:
                gan_loss_dis += lsgan_loss(discriminator_maps_real[key],
                                           lsgan_loss_weight,
                                           discriminator_maps_generated[key])

        gan_loss_dis.persistent = True
        loss_var_dict['gan_loss_dis'] = gan_loss_dis
        total_loss_D = gan_loss_dis
        total_loss_D.persistent = True

        gan_loss_gen.persistent = True
        loss_var_dict['gan_loss_gen'] = gan_loss_gen
        total_loss_G += gan_loss_gen

        if loss_flags.use_feature_matching_loss:
            logger.info("Use Feature Matching Loss.")
            fm_weights = train_params.loss_weights.feature_matching
            fm_loss = feature_matching_loss(discriminator_maps_real,
                                            discriminator_maps_generated,
                                            model_params, fm_weights)
            fm_loss.persistent = True
            loss_var_dict['feature_matching_loss'] = fm_loss
            total_loss_G += fm_loss

    # transform loss
    if loss_flags.use_equivariance_value_loss or loss_flags.use_equivariance_jacobian_loss:
        transform = Transform(bs, **config.train_params.transform_params)
        transformed_frame = transform.transform_frame(driving)

        with nn.parameter_scope("kp_detector"):
            transformed_kp = detect_keypoint(transformed_frame,
                                             **model_params.kp_detector_params,
                                             **model_params.common_params,
                                             test=test, comm=comm)
        persistent_all(transformed_kp)

        # Value loss part
        if loss_flags.use_equivariance_value_loss:
            logger.info("Use Equivariance Value Loss.")
            warped_kp_value = transform.warp_coordinates(transformed_kp['value'])
            eq_value_weight = train_params.loss_weights.equivariance_value
            eq_value_loss = equivariance_value_loss(kp_driving['value'],
                                                    warped_kp_value,
                                                    eq_value_weight)
            eq_value_loss.persistent = True
            loss_var_dict['equivariance_value_loss'] = eq_value_loss
            total_loss_G += eq_value_loss

        # jacobian loss part
        if loss_flags.use_equivariance_jacobian_loss:
            logger.info("Use Equivariance Jacobian Loss.")
            arithmetic_jacobian = transform.jacobian(transformed_kp['value'])
            eq_jac_weight = train_params.loss_weights.equivariance_jacobian
            eq_jac_loss = equivariance_jacobian_loss(kp_driving['jacobian'],
                                                     arithmetic_jacobian,
                                                     transformed_kp['jacobian'],
                                                     eq_jac_weight)
            eq_jac_loss.persistent = True
            loss_var_dict['equivariance_jacobian_loss'] = eq_jac_loss
            total_loss_G += eq_jac_loss

    assert total_loss_G is not None
    total_loss_G.persistent = True
    loss_var_dict['total_loss_gen'] = total_loss_G

    # -------------------- Create Monitors --------------------
    monitors_gen, monitors_dis, monitor_time, monitor_vis, log_dir = get_monitors(
        config, loss_flags, loss_var_dict)

    if device_id == 0:
        # Dump training info .yaml
        _ = shutil.copy(args.config, log_dir)  # copy the config yaml
        training_info_yaml = os.path.join(log_dir, "training_info.yaml")
        os.rename(os.path.join(log_dir, os.path.basename(args.config)),
                  training_info_yaml)
        # then add additional information
        with open(training_info_yaml, "a", encoding="utf-8") as f:
            f.write(f"\nlog_dir: {log_dir}\nsaved_parameter: None")

    # -------------------- Solver Setup --------------------
    solvers = setup_solvers(train_params)
    solver_generator = solvers["generator"]
    solver_discriminator = solvers["discriminator"]
    solver_kp_detector = solvers["kp_detector"]

    # max epochs
    num_epochs = train_params['num_epochs']

    # iterations per epoch; will be increased by num_repeats
    num_iter_per_epoch = data_iterator.size // bs
    if 'num_repeats' in train_params or train_params['num_repeats'] != 1:
        num_iter_per_epoch *= config.train_params.num_repeats

    # decay the learning rate when the current epoch reaches one of the epochs in epoch_milestones
    lr_decay_at_epochs = train_params['epoch_milestones']  # ex. [60, 90]
    gamma = 0.1  # decay rate

    # -------------------- For finetuning ---------------------
    if args.ft_params:
        assert os.path.isfile(args.ft_params)
        logger.info(f"load {args.ft_params} for finetuning.")
        nn.load_parameters(args.ft_params)
        start_epoch = int(os.path.splitext(
            os.path.basename(args.ft_params))[0].split("epoch_")[1])

        # set solver's state
        for name, solver in solvers.items():
            saved_states = os.path.join(
                os.path.dirname(args.ft_params),
                f"state_{name}_at_epoch_{start_epoch}.h5")
            solver.load_states(saved_states)

        start_epoch += 1
        logger.info(f"Resuming from epoch {start_epoch}.")

    logger.info(
        f"Start training. Total epoch: {num_epochs - start_epoch}, {num_iter_per_epoch * n_devices} iter/epoch."
    )

    for e in range(start_epoch, num_epochs):
        logger.info(f"Epoch: {e} / {num_epochs}.")
        data_iterator._reset()  # rewind the iterator at the beginning

        # learning rate scheduler
        if e in lr_decay_at_epochs:
            logger.info("Learning rate decayed.")
            learning_rate_decay(solvers, gamma=gamma)

        for i in range(num_iter_per_epoch):
            _driving, _source = data_iterator.next()
            source.d = _source
            driving.d = _driving

            # update generator and keypoint detector
            total_loss_G.forward()

            if device_id == 0:
                monitors_gen.add((e * num_iter_per_epoch + i) * n_devices)

            solver_generator.zero_grad()
            solver_kp_detector.zero_grad()

            callback = None
            if n_devices > 1:
                params = [x.grad for x in solver_generator.get_parameters().values()] + \
                         [x.grad for x in solver_kp_detector.get_parameters().values()]
                callback = comm.all_reduce_callback(params, 2 << 20)
            total_loss_G.backward(clear_buffer=True,
                                  communicator_callbacks=callback)

            solver_generator.update()
            solver_kp_detector.update()

            if loss_flags.use_gan_loss:
                # update discriminator
                total_loss_D.forward(clear_no_need_grad=True)

                if device_id == 0:
                    monitors_dis.add((e * num_iter_per_epoch + i) * n_devices)

                solver_discriminator.zero_grad()

                callback = None
                if n_devices > 1:
                    params = [x.grad for x in solver_discriminator.get_parameters().values()]
                    callback = comm.all_reduce_callback(params, 2 << 20)
                total_loss_D.backward(clear_buffer=True,
                                      communicator_callbacks=callback)

                solver_discriminator.update()

            if device_id == 0:
                monitor_time.add((e * num_iter_per_epoch + i) * n_devices)

            if device_id == 0 and ((e * num_iter_per_epoch + i) * n_devices) % config.monitor_params.visualize_freq == 0:
                images_to_visualize = [source.d, driving.d, generated["prediction"].d]
                visuals = combine_images(images_to_visualize)
                monitor_vis.add((e * num_iter_per_epoch + i) * n_devices, visuals)

        if device_id == 0:
            if e % train_params.checkpoint_freq == 0 or e == num_epochs - 1:
                save_parameters(e, log_dir, solvers)

    return
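# --- Illustrative entry point (not part of the original script): a minimal argument parser
# covering only the attributes train() actually reads (args.context, args.config, args.info,
# args.ft_params). Option names, defaults, and help strings are assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--context', type=str, default='cudnn',
                        help='extension context, e.g. cudnn or cpu')
    parser.add_argument('--config', type=str, required=True,
                        help='path to the training config .yaml')
    parser.add_argument('--info', type=str, default=None,
                        help='optional string stored in monitor_params.info')
    parser.add_argument('--ft-params', type=str, default=None,
                        help='parameter .h5 file to resume/finetune from')
    train(parser.parse_args())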