def build_model(config, from_style, to_style):
    """Create the CycleGAN networks and restore the latest checkpoint if one exists.

    Args:
        config: Namespace providing ``image_size``, ``num_residual_blocks``,
            ``checkpoint_dir`` and ``epoch``.
        from_style (str): Name of the source style domain.
        to_style (str): Name of the target style domain.

    Returns:
        tuple: ``(generator_ab, generator_ba, discriminator_a, discriminator_b)``,
        all moved to the selected device.

    Raises:
        FileNotFoundError: The checkpoint directory is non-empty but the
            expected per-model file for ``config.epoch - 1`` is missing.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    generator_ab = ResidualGenerator(config.image_size,
                                     config.num_residual_blocks).to(device)
    generator_ba = ResidualGenerator(config.image_size,
                                     config.num_residual_blocks).to(device)
    discriminator_a = Discriminator(config.image_size).to(device)
    discriminator_b = Discriminator(config.image_size).to(device)

    # All checkpoints for this style pair live in one sub-directory; build the
    # path once instead of re-joining it for every model.
    style_dir = os.path.join(config.checkpoint_dir, f"{from_style}2{to_style}")
    # exist_ok avoids the exists()/makedirs() race of the original.
    os.makedirs(style_dir, exist_ok=True)

    print(f"[*] Load checkpoint in {config.checkpoint_dir}")

    if not os.listdir(style_dir):
        # Fresh run: no checkpoint files at all, start from random weights.
        print(f"[!] No checkpoint in {config.checkpoint_dir}")
        generator_ab.apply(weights_init)
        generator_ba.apply(weights_init)
        discriminator_a.apply(weights_init)
        discriminator_b.apply(weights_init)
    else:
        def _load_latest(model, prefix):
            # glob() may match several files; keep the original "use the last
            # match" behaviour, but fail with a clear error instead of an
            # IndexError when the expected epoch file is missing.
            matches = glob(
                os.path.join(style_dir, f"{prefix}_{config.epoch-1}.pth"))
            if not matches:
                raise FileNotFoundError(
                    f"No checkpoint matching {prefix}_{config.epoch-1}.pth "
                    f"in {style_dir}")
            model.load_state_dict(torch.load(matches[-1], map_location=device))

        _load_latest(generator_ab, "generator_ab")
        _load_latest(generator_ba, "generator_ba")
        _load_latest(discriminator_a, "discriminator_a")
        _load_latest(discriminator_b, "discriminator_b")

    return generator_ab, generator_ba, discriminator_a, discriminator_b
def train(args,
          generator: Generator,
          discriminator: Discriminator,
          feature_extractor: FeatureExtractor,
          photo_dataloader,
          edge_smooth_dataloader,
          animation_dataloader,
          checkpoint_dir=None):
    """Two-phase CartoonGAN-style training.

    Phase 1 (initialization) pretrains the generator with a VGG-feature
    content loss only; phase 2 alternates discriminator and generator
    adversarial updates. Progress is checkpointed via ``save`` and resumed
    (including skipping already-seen batches) when ``checkpoint_dir`` holds
    a valid checkpoint.

    Args:
        args: Namespace with device, lr, adam_beta, n_epochs, n_init_epoch,
            content_loss_weight, batch_size, logging_steps, save_steps.
        generator / discriminator: Networks being trained (mutated in place).
        feature_extractor: Frozen feature network for the content loss.
        photo_dataloader / edge_smooth_dataloader / animation_dataloader:
            Dataloaders yielding ``(image, label)`` batches.
        checkpoint_dir: Optional path used both to resume and to save.
    """
    tb_writer = SummaryWriter()

    gen_criterion = nn.BCELoss().to(args.device)
    disc_criterion = nn.BCELoss().to(args.device)
    content_criterion = nn.L1Loss().to(args.device)

    gen_optimizer = torch.optim.Adam(generator.parameters(),
                                     lr=args.lr,
                                     betas=(args.adam_beta, 0.999))
    disc_optimizer = torch.optim.Adam(discriminator.parameters(),
                                      lr=args.lr,
                                      betas=(args.adam_beta, 0.999))

    global_step = 0
    global_init_step = 0
    # The number of steps to skip when loading a checkpoint
    skipped_step = 0
    skipped_init_step = 0
    cur_epoch = 0
    cur_init_epoch = 0
    # One "epoch" of the main phase consumes the three loaders in lockstep,
    # so it is bounded by the shortest one.
    data_len = min(len(photo_dataloader), len(edge_smooth_dataloader),
                   len(animation_dataloader))

    if checkpoint_dir:
        try:
            checkpoint_dict = load(checkpoint_dir)
            generator.load_state_dict(checkpoint_dict['generator'])
            discriminator.load_state_dict(checkpoint_dict['discriminator'])
            gen_optimizer.load_state_dict(checkpoint_dict['gen_optimizer'])
            disc_optimizer.load_state_dict(checkpoint_dict['disc_optimizer'])
            global_step = checkpoint_dict['global_step']
            global_init_step = checkpoint_dict['global_init_step']
            cur_epoch = global_step // data_len
            cur_init_epoch = global_init_step // len(photo_dataloader)
            skipped_step = global_step % data_len
            skipped_init_step = global_init_step % len(photo_dataloader)
            logger.info("Start training with,")
            logger.info("In initialization step, epoch: %d, step: %d",
                        cur_init_epoch, skipped_init_step)
            logger.info("In main train step, epoch: %d, step: %d", cur_epoch,
                        skipped_step)
        except Exception:
            # FIX: was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit during checkpoint loading.
            logger.info("Wrong checkpoint path")

    t_total = data_len * args.n_epochs
    t_init_total = len(photo_dataloader) * args.n_init_epoch

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num photo examples = %d", len(photo_dataloader))
    logger.info("  Num edge_smooth examples = %d", len(edge_smooth_dataloader))
    logger.info("  Num animation examples = %d", len(animation_dataloader))
    logger.info("  Num Epochs = %d", args.n_epochs)
    logger.info("  Total train batch size = %d", args.batch_size)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  Num Init Epochs = %d", args.n_init_epoch)
    logger.info("  Total Init optimization steps = %d", t_init_total)
    logger.info("  Logging steps = %d", args.logging_steps)
    logger.info("  Save steps = %d", args.save_steps)

    init_phase = True
    # Pre-bind so the KeyboardInterrupt handler below never sees an unbound
    # name when the interrupt arrives before the first loop iteration.
    init_epoch = cur_init_epoch
    epoch = cur_epoch
    try:
        generator.train()
        discriminator.train()

        global_init_loss = 0
        # --- Initialization Content loss
        mb = master_bar(range(cur_init_epoch, args.n_init_epoch))
        for init_epoch in mb:
            epoch_iter = progress_bar(photo_dataloader, parent=mb)
            for step, (photo, _) in enumerate(epoch_iter):
                if skipped_init_step > 0:
                    # FIX: was ``skipped_init_step = -1``, which skipped only
                    # a single batch instead of the remaining checkpointed ones.
                    skipped_init_step -= 1
                    continue
                photo = photo.to(args.device)
                gen_optimizer.zero_grad()

                # Inputs are in [-1, 1]; the feature extractor expects [0, 1].
                x_features = feature_extractor((photo + 1) / 2).detach()
                Gx = generator(photo)
                Gx_features = feature_extractor((Gx + 1) / 2)

                content_loss = args.content_loss_weight * content_criterion(
                    Gx_features, x_features)
                content_loss.backward()
                gen_optimizer.step()

                global_init_loss += content_loss.item()
                global_init_step += 1

                if args.save_steps > 0 and global_init_step % args.save_steps == 0:
                    logger.info(
                        "Save Initialization Phase, init_epoch: %d, init_step: %d",
                        init_epoch, global_init_step)
                    save(checkpoint_dir, global_step, global_init_step,
                         generator, discriminator, gen_optimizer,
                         disc_optimizer)
                if args.logging_steps > 0 and global_init_step % args.logging_steps == 0:
                    tb_writer.add_scalar('Initialization Phase/Content Loss',
                                         content_loss.item(), global_init_step)
                    tb_writer.add_scalar(
                        'Initialization Phase/Global Generator Loss',
                        global_init_loss / global_init_step, global_init_step)
                    logger.info(
                        "Initialization Phase, Epoch: %d, Global Step: %d, Content Loss: %.4f",
                        init_epoch, global_init_step,
                        global_init_loss / global_init_step)
        # -----------------------------------------------------
        logger.info("Finish Initialization Phase, save model...")
        save(checkpoint_dir, global_step, global_init_step, generator,
             discriminator, gen_optimizer, disc_optimizer)

        init_phase = False
        global_loss_D = 0
        global_loss_G = 0
        global_loss_content = 0
        mb = master_bar(range(cur_epoch, args.n_epochs))
        for epoch in mb:
            epoch_iter = progress_bar(list(
                zip(animation_dataloader, edge_smooth_dataloader,
                    photo_dataloader)),
                                      parent=mb)
            for step, ((animation, _), (edge_smoothed, _),
                       (photo, _)) in enumerate(epoch_iter):
                if skipped_step > 0:
                    # FIX: was ``skipped_step = -1`` (same off-by-N as above).
                    skipped_step -= 1
                    continue
                animation = animation.to(args.device)
                edge_smoothed = edge_smoothed.to(args.device)
                photo = photo.to(args.device)

                disc_optimizer.zero_grad()
                # --- Train discriminator
                # ------ Train Discriminator with animation image (real -> 1)
                animation_disc = discriminator(animation)
                animation_target = torch.ones_like(animation_disc)
                loss_animation_disc = disc_criterion(animation_disc,
                                                     animation_target)
                # ------ Train Discriminator with edge image (fake -> 0)
                edge_smoothed_disc = discriminator(edge_smoothed)
                edge_smoothed_target = torch.zeros_like(edge_smoothed_disc)
                loss_edge_disc = disc_criterion(edge_smoothed_disc,
                                                edge_smoothed_target)
                # ------ Train Discriminator with generated image (fake -> 0);
                # detach so generator gradients are not computed here.
                generated_image = generator(photo).detach()
                generated_image_disc = discriminator(generated_image)
                generated_image_target = torch.zeros_like(generated_image_disc)
                loss_generated_disc = disc_criterion(generated_image_disc,
                                                     generated_image_target)

                loss_disc = loss_animation_disc + loss_edge_disc + loss_generated_disc
                loss_disc.backward()
                disc_optimizer.step()
                global_loss_D += loss_disc.item()

                # --- Train Generator
                gen_optimizer.zero_grad()
                generated_image = generator(photo)
                generated_image_disc = discriminator(generated_image)
                generated_image_target = torch.ones_like(generated_image_disc)
                loss_adv = gen_criterion(generated_image_disc,
                                         generated_image_target)
                # ------ Train Generator with content loss
                x_features = feature_extractor((photo + 1) / 2).detach()
                Gx_features = feature_extractor((generated_image + 1) / 2)
                loss_content = args.content_loss_weight * content_criterion(
                    Gx_features, x_features)

                loss_gen = loss_adv + loss_content
                loss_gen.backward()
                gen_optimizer.step()

                global_loss_G += loss_adv.item()
                global_loss_content += loss_content.item()
                global_step += 1

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    logger.info("Save Training Phase, epoch: %d, step: %d",
                                epoch, global_step)
                    save(checkpoint_dir, global_step, global_init_step,
                         generator, discriminator, gen_optimizer,
                         disc_optimizer)
                # FIX: was ``global_init_step % args.logging_steps`` — the
                # main phase must log on its own step counter.
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar('Train Phase/Generator Loss',
                                         loss_adv.item(), global_step)
                    tb_writer.add_scalar('Train Phase/Discriminator Loss',
                                         loss_disc.item(), global_step)
                    tb_writer.add_scalar('Train Phase/Content Loss',
                                         loss_content.item(), global_step)
                    tb_writer.add_scalar('Train Phase/Global Generator Loss',
                                         global_loss_G / global_step,
                                         global_step)
                    tb_writer.add_scalar(
                        'Train Phase/Global Discriminator Loss',
                        global_loss_D / global_step, global_step)
                    tb_writer.add_scalar('Train Phase/Global Content Loss',
                                         global_loss_content / global_step,
                                         global_step)
                    logger.info(
                        "Training Phase, Epoch: %d, Global Step: %d, Disc Loss %.4f, Gen Loss %.4f, Content Loss: %.4f",
                        epoch, global_step, global_loss_D / global_step,
                        global_loss_G / global_step,
                        global_loss_content / global_step)
    except KeyboardInterrupt:
        if init_phase:
            logger.info("KeyboardInterrupt in Initialization Phase!")
            logger.info("Save models, init_epoch: %d, init_step: %d",
                        init_epoch, global_init_step)
        else:
            logger.info("KeyboardInterrupt in Training Phase!")
            logger.info("Save models, epoch: %d, step: %d", epoch, global_step)
        save(checkpoint_dir, global_step, global_init_step, generator,
             discriminator, gen_optimizer, disc_optimizer)
class MaskCycleGANVCTrainer(object):
    """Trainer for MaskCycleGAN-VC.

    Owns two generators (A->B, B->A) and four discriminators (one pair per
    domain plus a second pair for the two-step adversarial loss), their Adam
    optimizers, and the speaker datasets/normalization statistics. Also
    provides inference and validation loops built on a WORLD-vocoder
    analysis/synthesis pipeline (``preprocess`` module).
    """

    def __init__(self, args):
        """
        Args:
            args (Namespace): Program arguments from argparser
        """
        # Store args
        self.num_epochs = args.num_epochs
        self.start_epoch = args.start_epoch
        self.generator_lr = args.generator_lr
        self.discriminator_lr = args.discriminator_lr
        self.decay_after = args.decay_after  # iteration after which lr decays linearly
        self.mini_batch_size = args.batch_size
        self.cycle_loss_lambda = args.cycle_loss_lambda
        self.identity_loss_lambda = args.identity_loss_lambda
        self.device = args.device
        self.epochs_per_save = args.epochs_per_save
        self.sample_rate = args.sample_rate
        self.validation_A_dir = os.path.join(args.origin_data_dir,
                                             args.speaker_A_id)
        self.output_A_dir = os.path.join(args.output_data_dir,
                                         args.speaker_A_id)
        self.validation_B_dir = os.path.join(args.origin_data_dir,
                                             args.speaker_B_id)
        self.output_B_dir = os.path.join(args.output_data_dir,
                                         args.speaker_B_id)
        self.infer_data_dir = args.infer_data_dir
        self.pretrain_models = args.pretrain_models

        # Initialize speakerA's dataset: normalized mel/mcep frames plus the
        # mean/std used to (de)normalize them.
        self.dataset_A = self.loadPickleFile(
            os.path.join(
                args.preprocessed_data_dir,
                args.speaker_A_id,
                f"{args.speaker_A_id}_normalized.pickle",
            ))
        dataset_A_norm_stats = np.load(
            os.path.join(
                args.preprocessed_data_dir,
                args.speaker_A_id,
                f"{args.speaker_A_id}_norm_stat.npz",
            ))
        self.dataset_A_mean = dataset_A_norm_stats["mean"]
        self.dataset_A_std = dataset_A_norm_stats["std"]

        # Initialize speakerB's dataset
        self.dataset_B = self.loadPickleFile(
            os.path.join(
                args.preprocessed_data_dir,
                args.speaker_B_id,
                f"{args.speaker_B_id}_normalized.pickle",
            ))
        dataset_B_norm_stats = np.load(
            os.path.join(
                args.preprocessed_data_dir,
                args.speaker_B_id,
                f"{args.speaker_B_id}_norm_stat.npz",
            ))
        self.dataset_B_mean = dataset_B_norm_stats["mean"]
        self.dataset_B_std = dataset_B_norm_stats["std"]

        # Compute lr decay rate: linear decay to 0 over the total number of
        # optimizer steps (epochs * batches-per-epoch, sized by dataset A).
        self.n_samples = len(self.dataset_A)
        print(f"n_samples = {self.n_samples}")
        self.generator_lr_decay = self.generator_lr / float(
            self.num_epochs * (self.n_samples // self.mini_batch_size))
        self.discriminator_lr_decay = self.discriminator_lr / float(
            self.num_epochs * (self.n_samples // self.mini_batch_size))
        print(f"generator_lr_decay = {self.generator_lr_decay}")
        print(f"discriminator_lr_decay = {self.discriminator_lr_decay}")

        # Initialize Train Dataloader
        self.num_frames = args.num_frames
        self.dataset = VCDataset(
            datasetA=self.dataset_A,
            datasetB=self.dataset_B,
            n_frames=args.num_frames,
            max_mask_len=args.max_mask_len,
        )
        self.train_dataloader = flow.utils.data.DataLoader(
            dataset=self.dataset,
            batch_size=self.mini_batch_size,
            shuffle=True,
            drop_last=False,
        )

        # Initialize Generators and Discriminators
        self.generator_A2B = Generator().to(self.device)
        self.generator_B2A = Generator().to(self.device)
        self.discriminator_A = Discriminator().to(self.device)
        self.discriminator_B = Discriminator().to(self.device)
        # Discriminator to compute 2 step adversarial loss
        self.discriminator_A2 = Discriminator().to(self.device)
        # Discriminator to compute 2 step adversarial loss
        self.discriminator_B2 = Discriminator().to(self.device)

        # Initialize Optimizers: one optimizer over both generators, one over
        # all four discriminators.
        g_params = list(self.generator_A2B.parameters()) + list(
            self.generator_B2A.parameters())
        d_params = (list(self.discriminator_A.parameters()) +
                    list(self.discriminator_B.parameters()) +
                    list(self.discriminator_A2.parameters()) +
                    list(self.discriminator_B2.parameters()))
        self.generator_optimizer = flow.optim.Adam(g_params,
                                                   lr=self.generator_lr,
                                                   betas=(0.5, 0.999))
        self.discriminator_optimizer = flow.optim.Adam(
            d_params, lr=self.discriminator_lr, betas=(0.5, 0.999))

    def adjust_lr_rate(self, optimizer, generator):
        """Decays learning rate linearly (clamped at 0) and writes it into
        every param group of the given optimizer.

        Args:
            optimizer (torch.optim): torch optimizer
            generator (bool): Whether to adjust generator lr.
        """
        if generator:
            self.generator_lr = max(
                0.0, self.generator_lr - self.generator_lr_decay)
            for param_groups in optimizer.param_groups:
                param_groups["lr"] = self.generator_lr
        else:
            self.discriminator_lr = max(
                0.0, self.discriminator_lr - self.discriminator_lr_decay)
            for param_groups in optimizer.param_groups:
                param_groups["lr"] = self.discriminator_lr

    def reset_grad(self):
        """Sets gradients of the generators and discriminators to zero before
        backpropagation.
        """
        self.generator_optimizer.zero_grad()
        self.discriminator_optimizer.zero_grad()

    def loadPickleFile(self, fileName):
        """Loads a Pickle file.

        Args:
            fileName (str): pickle file path

        Returns:
            file object: The loaded pickle file object
        """
        # NOTE(review): pickle.load on untrusted files is unsafe — these are
        # expected to be locally preprocessed datasets.
        with open(fileName, "rb") as f:
            return pickle.load(f)

    def train(self):
        """Implements the training loop for MaskCycleGAN-VC
        """
        for epoch in range(self.start_epoch, self.num_epochs + 1):
            for i, (real_A, mask_A, real_B,
                    mask_B) in enumerate(self.train_dataloader):
                num_iterations = (self.n_samples //
                                  self.mini_batch_size) * epoch + i
                # After 10k iterations the identity loss is switched off
                # (standard CycleGAN-VC schedule).
                if num_iterations > 10000:
                    self.identity_loss_lambda = 0
                if num_iterations > self.decay_after:
                    self.adjust_lr_rate(self.generator_optimizer,
                                        generator=True)
                    # NOTE(review): this passes generator_optimizer for the
                    # discriminator decay as well — looks like a copy-paste
                    # bug (should presumably be self.discriminator_optimizer);
                    # confirm before changing, as it alters training dynamics.
                    self.adjust_lr_rate(self.generator_optimizer,
                                        generator=False)

                real_A = real_A.to(self.device, dtype=flow.float)
                mask_A = mask_A.to(self.device, dtype=flow.float)
                real_B = real_B.to(self.device, dtype=flow.float)
                mask_B = mask_B.to(self.device, dtype=flow.float)

                # Train Generator (discriminators frozen in eval mode)
                self.generator_A2B.train()
                self.generator_B2A.train()
                self.discriminator_A.eval()
                self.discriminator_B.eval()
                self.discriminator_A2.eval()
                self.discriminator_B2.eval()

                # Generator Feed Forward: masks are applied only on the first
                # pass; subsequent passes use an all-ones mask.
                fake_B = self.generator_A2B(real_A, mask_A)
                cycle_A = self.generator_B2A(fake_B, flow.ones_like(fake_B))
                fake_A = self.generator_B2A(real_B, mask_B)
                cycle_B = self.generator_A2B(fake_A, flow.ones_like(fake_A))
                identity_A = self.generator_B2A(real_A,
                                                flow.ones_like(real_A))
                identity_B = self.generator_A2B(real_B,
                                                flow.ones_like(real_B))
                d_fake_A = self.discriminator_A(fake_A)
                d_fake_B = self.discriminator_B(fake_B)

                # For Two Step Adverserial Loss
                d_fake_cycle_A = self.discriminator_A2(cycle_A)
                d_fake_cycle_B = self.discriminator_B2(cycle_B)

                # Generator Cycle Loss (L1 reconstruction both ways)
                cycleLoss = flow.mean(flow.abs(real_A - cycle_A)) + flow.mean(
                    flow.abs(real_B - cycle_B))

                # Generator Identity Loss
                identityLoss = flow.mean(
                    flow.abs(real_A - identity_A)) + flow.mean(
                        flow.abs(real_B - identity_B))

                # Generator Loss (LSGAN: push D(fake) toward 1)
                g_loss_A2B = flow.mean((1 - d_fake_B)**2)
                g_loss_B2A = flow.mean((1 - d_fake_A)**2)

                # Generator Two Step Adverserial Loss
                generator_loss_A2B_2nd = flow.mean((1 - d_fake_cycle_B)**2)
                generator_loss_B2A_2nd = flow.mean((1 - d_fake_cycle_A)**2)

                # Total Generator Loss
                g_loss = (g_loss_A2B + g_loss_B2A + generator_loss_A2B_2nd +
                          generator_loss_B2A_2nd +
                          self.cycle_loss_lambda * cycleLoss +
                          self.identity_loss_lambda * identityLoss)

                # Backprop for Generator
                self.reset_grad()
                g_loss.backward()
                self.generator_optimizer.step()

                # Train Discriminator (generators frozen in eval mode)
                self.generator_A2B.eval()
                self.generator_B2A.eval()
                self.discriminator_A.train()
                self.discriminator_B.train()
                self.discriminator_A2.train()
                self.discriminator_B2.train()

                # Discriminator Feed Forward
                d_real_A = self.discriminator_A(real_A)
                d_real_B = self.discriminator_B(real_B)
                d_real_A2 = self.discriminator_A2(real_A)
                d_real_B2 = self.discriminator_B2(real_B)
                generated_A = self.generator_B2A(real_B, mask_B)
                d_fake_A = self.discriminator_A(generated_A)

                # For Two Step Adverserial Loss A->B
                cycled_B = self.generator_A2B(generated_A,
                                              flow.ones_like(generated_A))
                d_cycled_B = self.discriminator_B2(cycled_B)

                generated_B = self.generator_A2B(real_A, mask_A)
                d_fake_B = self.discriminator_B(generated_B)

                # For Two Step Adverserial Loss B->A
                cycled_A = self.generator_B2A(generated_B,
                                              flow.ones_like(generated_B))
                d_cycled_A = self.discriminator_A2(cycled_A)

                # Loss Functions (LSGAN: real -> 1, fake -> 0)
                d_loss_A_real = flow.mean((1 - d_real_A)**2)
                d_loss_A_fake = flow.mean((0 - d_fake_A)**2)
                d_loss_A = (d_loss_A_real + d_loss_A_fake) / 2.0

                d_loss_B_real = flow.mean((1 - d_real_B)**2)
                d_loss_B_fake = flow.mean((0 - d_fake_B)**2)
                d_loss_B = (d_loss_B_real + d_loss_B_fake) / 2.0

                # Two Step Adverserial Loss
                d_loss_A_cycled = flow.mean((0 - d_cycled_A)**2)
                d_loss_B_cycled = flow.mean((0 - d_cycled_B)**2)
                d_loss_A2_real = flow.mean((1 - d_real_A2)**2)
                d_loss_B2_real = flow.mean((1 - d_real_B2)**2)
                d_loss_A_2nd = (d_loss_A2_real + d_loss_A_cycled) / 2.0
                d_loss_B_2nd = (d_loss_B2_real + d_loss_B_cycled) / 2.0

                # Final Loss for discriminator with the Two Step Adverserial Loss
                d_loss = (d_loss_A + d_loss_B) / 2.0 + (d_loss_A_2nd +
                                                        d_loss_B_2nd) / 2.0

                # Backprop for Discriminator
                self.reset_grad()
                d_loss.backward()
                self.discriminator_optimizer.step()

                if (i + 1) % 2 == 0:
                    print(
                        "Iter:{} Generator Loss:{:.4f} Discrimator Loss:{:.4f} GA2B:{:.4f} GB2A:{:.4f} G_id:{:.4f} G_cyc:{:.4f} D_A:{:.4f} D_B:{:.4f}"
                        .format(
                            num_iterations,
                            g_loss.item(),
                            d_loss.item(),
                            g_loss_A2B,
                            g_loss_B2A,
                            identityLoss,
                            cycleLoss,
                            d_loss_A,
                            d_loss_B,
                        ))

            # Save each model checkpoint and validation
            if epoch % self.epochs_per_save == 0 and epoch != 0:
                self.saveModelCheckPoint(epoch, PATH="model_checkpoint")
                self.validation_for_A_dir()
                self.validation_for_B_dir()

    def infer(self):
        """Implements the infering loop for MaskCycleGAN-VC
        """
        # load pretrain models
        self.loadModel(self.pretrain_models)
        num_mcep = 80
        sampling_rate = self.sample_rate
        frame_period = 5.0
        infer_A_dir = self.infer_data_dir
        print("Generating Validation Data B from A...")
        # WORLD pipeline: decompose -> convert pitch + spectral envelope with
        # the A2B generator -> resynthesize. Converted files are written next
        # to the inputs with a "convert_" prefix.
        for file in os.listdir(infer_A_dir):
            filePath = os.path.join(infer_A_dir, file)
            wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
            wav = preprocess.wav_padding(wav=wav,
                                         sr=sampling_rate,
                                         frame_period=frame_period,
                                         multiple=4)
            f0, timeaxis, sp, ap = preprocess.world_decompose(
                wav=wav, fs=sampling_rate, frame_period=frame_period)
            f0_converted = preprocess.pitch_conversion(
                f0=f0,
                mean_log_src=self.dataset_A_mean,
                std_log_src=self.dataset_A_std,
                mean_log_target=self.dataset_B_mean,
                std_log_target=self.dataset_B_std,
            )
            coded_sp = preprocess.world_encode_spectral_envelop(
                sp=sp, fs=sampling_rate, dim=num_mcep)
            coded_sp_transposed = coded_sp.T
            # Normalize with source stats, add a batch dim for the generator.
            coded_sp_norm = (coded_sp_transposed -
                             self.dataset_A_mean) / self.dataset_A_std
            coded_sp_norm = np.array([coded_sp_norm])
            if flow.cuda.is_available():
                coded_sp_norm = flow.tensor(coded_sp_norm).cuda().float()
            else:
                coded_sp_norm = flow.tensor(coded_sp_norm).float()
            coded_sp_converted_norm = self.generator_A2B(
                coded_sp_norm, flow.ones_like(coded_sp_norm))
            coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach(
            ).numpy()
            coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
            # De-normalize with target stats.
            coded_sp_converted = (
                coded_sp_converted_norm * self.dataset_B_std +
                self.dataset_B_mean)
            coded_sp_converted = coded_sp_converted.T
            coded_sp_converted = np.ascontiguousarray(
                coded_sp_converted).astype(np.double)
            decoded_sp_converted = preprocess.world_decode_spectral_envelop(
                coded_sp=coded_sp_converted, fs=sampling_rate)
            wav_transformed = preprocess.world_speech_synthesis(
                f0=f0_converted[0],
                decoded_sp=decoded_sp_converted,
                ap=ap,
                fs=sampling_rate,
                frame_period=frame_period,
            )
            sf.write(
                os.path.join(infer_A_dir, "convert_" + os.path.basename(file)),
                wav_transformed,
                sampling_rate,
            )

    def validation_for_A_dir(self):
        """Converts every file in validation_A_dir from speaker A to B and
        writes the results to output_A_dir (same WORLD pipeline as infer()).
        """
        num_mcep = 80
        # NOTE(review): hardcoded 22050 here while self.sample_rate exists —
        # confirm this matches the training data's sample rate.
        sampling_rate = 22050
        frame_period = 5.0
        validation_A_dir = self.validation_A_dir
        output_A_dir = self.output_A_dir
        os.makedirs(output_A_dir, exist_ok=True)
        print("Generating Validation Data B from A...")
        for file in os.listdir(validation_A_dir):
            filePath = os.path.join(validation_A_dir, file)
            wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
            wav = preprocess.wav_padding(wav=wav,
                                         sr=sampling_rate,
                                         frame_period=frame_period,
                                         multiple=4)
            f0, timeaxis, sp, ap = preprocess.world_decompose(
                wav=wav, fs=sampling_rate, frame_period=frame_period)
            f0_converted = preprocess.pitch_conversion(
                f0=f0,
                mean_log_src=self.dataset_A_mean,
                std_log_src=self.dataset_A_std,
                mean_log_target=self.dataset_B_mean,
                std_log_target=self.dataset_B_std,
            )
            coded_sp = preprocess.world_encode_spectral_envelop(
                sp=sp, fs=sampling_rate, dim=num_mcep)
            coded_sp_transposed = coded_sp.T
            coded_sp_norm = (coded_sp_transposed -
                             self.dataset_A_mean) / self.dataset_A_std
            coded_sp_norm = np.array([coded_sp_norm])
            if flow.cuda.is_available():
                coded_sp_norm = flow.tensor(coded_sp_norm).cuda().float()
            else:
                coded_sp_norm = flow.tensor(coded_sp_norm).float()
            coded_sp_converted_norm = self.generator_A2B(
                coded_sp_norm, flow.ones_like(coded_sp_norm))
            coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach(
            ).numpy()
            coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
            coded_sp_converted = (
                coded_sp_converted_norm * self.dataset_B_std +
                self.dataset_B_mean)
            coded_sp_converted = coded_sp_converted.T
            coded_sp_converted = np.ascontiguousarray(
                coded_sp_converted).astype(np.double)
            decoded_sp_converted = preprocess.world_decode_spectral_envelop(
                coded_sp=coded_sp_converted, fs=sampling_rate)
            wav_transformed = preprocess.world_speech_synthesis(
                f0=f0_converted[0],
                decoded_sp=decoded_sp_converted,
                ap=ap,
                fs=sampling_rate,
                frame_period=frame_period,
            )
            sf.write(
                os.path.join(output_A_dir,
                             "convert_" + os.path.basename(file)),
                wav_transformed,
                sampling_rate,
            )

    def validation_for_B_dir(self):
        """Converts every file in validation_B_dir from speaker B to A and
        writes the results to output_B_dir (mirror of validation_for_A_dir,
        using B stats as source and A stats as target).
        """
        num_mcep = 80
        # NOTE(review): hardcoded 22050 (see validation_for_A_dir).
        sampling_rate = 22050
        frame_period = 5.0
        validation_B_dir = self.validation_B_dir
        output_B_dir = self.output_B_dir
        os.makedirs(output_B_dir, exist_ok=True)
        print("Generating Validation Data A from B...")
        for file in os.listdir(validation_B_dir):
            filePath = os.path.join(validation_B_dir, file)
            wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
            wav = preprocess.wav_padding(wav=wav,
                                         sr=sampling_rate,
                                         frame_period=frame_period,
                                         multiple=4)
            f0, timeaxis, sp, ap = preprocess.world_decompose(
                wav=wav, fs=sampling_rate, frame_period=frame_period)
            f0_converted = preprocess.pitch_conversion(
                f0=f0,
                mean_log_src=self.dataset_B_mean,
                std_log_src=self.dataset_B_std,
                mean_log_target=self.dataset_A_mean,
                std_log_target=self.dataset_A_std,
            )
            coded_sp = preprocess.world_encode_spectral_envelop(
                sp=sp, fs=sampling_rate, dim=num_mcep)
            coded_sp_transposed = coded_sp.T
            coded_sp_norm = (coded_sp_transposed -
                             self.dataset_B_mean) / self.dataset_B_std
            coded_sp_norm = np.array([coded_sp_norm])
            if flow.cuda.is_available():
                coded_sp_norm = flow.tensor(coded_sp_norm).cuda().float()
            else:
                coded_sp_norm = flow.tensor(coded_sp_norm).float()
            coded_sp_converted_norm = self.generator_B2A(
                coded_sp_norm, flow.ones_like(coded_sp_norm))
            coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach(
            ).numpy()
            coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
            coded_sp_converted = (
                coded_sp_converted_norm * self.dataset_A_std +
                self.dataset_A_mean)
            coded_sp_converted = coded_sp_converted.T
            coded_sp_converted = np.ascontiguousarray(
                coded_sp_converted).astype(np.double)
            decoded_sp_converted = preprocess.world_decode_spectral_envelop(
                coded_sp=coded_sp_converted, fs=sampling_rate)
            wav_transformed = preprocess.world_speech_synthesis(
                f0=f0_converted[0],
                decoded_sp=decoded_sp_converted,
                ap=ap,
                fs=sampling_rate,
                frame_period=frame_period,
            )
            sf.write(
                os.path.join(output_B_dir,
                             "convert_" + os.path.basename(file)),
                wav_transformed,
                sampling_rate,
            )

    def saveModelCheckPoint(self, epoch, PATH):
        """Saves per-network state dicts under PATH, suffixed with the epoch.

        NOTE(review): only generators and the first discriminator pair are
        saved — discriminator_A2/B2 and optimizer states are not persisted.
        """
        flow.save(
            self.generator_A2B.state_dict(),
            os.path.join(PATH, "generator_A2B_%d" % epoch),
        )
        flow.save(
            self.generator_B2A.state_dict(),
            os.path.join(PATH, "generator_B2A_%d" % epoch),
        )
        flow.save(
            self.discriminator_A.state_dict(),
            os.path.join(PATH, "discriminator_A_%d" % epoch),
        )
        flow.save(
            self.discriminator_B.state_dict(),
            os.path.join(PATH, "discriminator_B_%d" % epoch),
        )

    def loadModel(self, PATH):
        """Loads generator/discriminator state dicts from un-suffixed files
        under PATH (as produced externally, not by saveModelCheckPoint's
        epoch-suffixed names).
        """
        self.generator_A2B.load_state_dict(
            flow.load(os.path.join(PATH, "generator_A2B")))
        self.generator_B2A.load_state_dict(
            flow.load(os.path.join(PATH, "generator_B2A")))
        self.discriminator_A.load_state_dict(
            flow.load(os.path.join(PATH, "discriminator_A")))
        self.discriminator_B.load_state_dict(
            flow.load(os.path.join(PATH, "discriminator_B")))
class CycleGANTrainr(object): def __init__( self, logf0s_normalization, mcep_normalization, coded_sps_A_norm, coded_sps_B_norm, model_checkpoint, validation_A_dir, output_A_dir, validation_B_dir, output_B_dir, restart_training_at=None, ): self.start_epoch = 0 self.num_epochs = 200000 self.mini_batch_size = 10 self.dataset_A = self.loadPickleFile(coded_sps_A_norm) self.dataset_B = self.loadPickleFile(coded_sps_B_norm) self.device = flow.device( "cuda" if flow.cuda.is_available() else "cpu") # Speech Parameters logf0s_normalization = np.load(logf0s_normalization) self.log_f0s_mean_A = logf0s_normalization["mean_A"] self.log_f0s_std_A = logf0s_normalization["std_A"] self.log_f0s_mean_B = logf0s_normalization["mean_B"] self.log_f0s_std_B = logf0s_normalization["std_B"] mcep_normalization = np.load(mcep_normalization) self.coded_sps_A_mean = mcep_normalization["mean_A"] self.coded_sps_A_std = mcep_normalization["std_A"] self.coded_sps_B_mean = mcep_normalization["mean_B"] self.coded_sps_B_std = mcep_normalization["std_B"] # Generator and Discriminator self.generator_A2B = Generator().to(self.device) self.generator_B2A = Generator().to(self.device) self.discriminator_A = Discriminator().to(self.device) self.discriminator_B = Discriminator().to(self.device) # Loss Functions criterion_mse = flow.nn.MSELoss() # Optimizer g_params = list(self.generator_A2B.parameters()) + list( self.generator_B2A.parameters()) d_params = list(self.discriminator_A.parameters()) + list( self.discriminator_B.parameters()) # Initial learning rates self.generator_lr = 2e-4 self.discriminator_lr = 1e-4 # Learning rate decay self.generator_lr_decay = self.generator_lr / 200000 self.discriminator_lr_decay = self.discriminator_lr / 200000 # Starts learning rate decay from after this many iterations have passed self.start_decay = 10000 self.generator_optimizer = flow.optim.Adam(g_params, lr=self.generator_lr, betas=(0.5, 0.999)) self.discriminator_optimizer = flow.optim.Adam( d_params, 
lr=self.discriminator_lr, betas=(0.5, 0.999)) # To Load save previously saved models self.modelCheckpoint = model_checkpoint os.makedirs(self.modelCheckpoint, exist_ok=True) # Validation set Parameters self.validation_A_dir = validation_A_dir self.output_A_dir = output_A_dir os.makedirs(self.output_A_dir, exist_ok=True) self.validation_B_dir = validation_B_dir self.output_B_dir = output_B_dir os.makedirs(self.output_B_dir, exist_ok=True) # Storing Discriminatior and Generator Loss self.generator_loss_store = [] self.discriminator_loss_store = [] self.file_name = "log_store_non_sigmoid.txt" def adjust_lr_rate(self, optimizer, name="generator"): if name == "generator": self.generator_lr = max( 0.0, self.generator_lr - self.generator_lr_decay) for param_groups in optimizer.param_groups: param_groups["lr"] = self.generator_lr else: self.discriminator_lr = max( 0.0, self.discriminator_lr - self.discriminator_lr_decay) for param_groups in optimizer.param_groups: param_groups["lr"] = self.discriminator_lr def reset_grad(self): self.generator_optimizer.zero_grad() self.discriminator_optimizer.zero_grad() def train(self): # Training Begins for epoch in range(self.start_epoch, self.num_epochs): start_time_epoch = time.time() # Constants cycle_loss_lambda = 10 identity_loss_lambda = 5 # Preparing Dataset n_samples = len(self.dataset_A) dataset = trainingDataset(datasetA=self.dataset_A, datasetB=self.dataset_B, n_frames=128) train_loader = flow.utils.data.DataLoader( dataset=dataset, batch_size=self.mini_batch_size, shuffle=True, drop_last=False, ) pbar = tqdm(enumerate(train_loader)) for i, (real_A, real_B) in enumerate(train_loader): num_iterations = (n_samples // self.mini_batch_size) * epoch + i if num_iterations > 10000: identity_loss_lambda = 0 if num_iterations > self.start_decay: self.adjust_lr_rate(self.generator_optimizer, name="generator") self.adjust_lr_rate(self.generator_optimizer, name="discriminator") real_A = real_A.to(self.device).float() real_B = 
# NOTE(review): source was whitespace-mangled; indentation below is reconstructed.
# The first section is the TAIL of a training-loop method whose `def` line is
# above this chunk — epoch-level blocks are assumed to sit outside the batch
# loop (confirm against upstream CycleGAN-VC2 code).
                real_B.to(self.device).float()

                # Generator Loss function
                # Forward A->B and B->A, plus cycle (A->B->A) and identity passes.
                fake_B = self.generator_A2B(real_A)
                cycle_A = self.generator_B2A(fake_B)
                fake_A = self.generator_B2A(real_B)
                cycle_B = self.generator_A2B(fake_A)
                identity_A = self.generator_B2A(real_A)
                identity_B = self.generator_A2B(real_B)
                d_fake_A = self.discriminator_A(fake_A)
                d_fake_B = self.discriminator_B(fake_B)

                # for the second step adverserial loss
                # (discriminator scores on the cycle-reconstructed samples)
                d_fake_cycle_A = self.discriminator_A(cycle_A)
                d_fake_cycle_B = self.discriminator_B(cycle_B)

                # Generator Cycle loss: L1 between originals and cycle reconstructions.
                cycleLoss = flow.mean(flow.abs(real_A - cycle_A)) + flow.mean(
                    flow.abs(real_B - cycle_B))

                # Generator Identity Loss: L1 between originals and same-domain mappings.
                identiyLoss = flow.mean(
                    flow.abs(real_A - identity_A)) + flow.mean(
                        flow.abs(real_B - identity_B))

                # Generator Loss (LSGAN form: push D's fake scores toward 1).
                generator_loss_A2B = flow.mean((1 - d_fake_B) ** 2)
                generator_loss_B2A = flow.mean((1 - d_fake_A) ** 2)

                # Total Generator Loss
                generator_loss = (generator_loss_A2B + generator_loss_B2A +
                                  cycle_loss_lambda * cycleLoss +
                                  identity_loss_lambda * identiyLoss)
                self.generator_loss_store.append(generator_loss.item())

                # Backprop for Generator
                self.reset_grad()
                generator_loss.backward()
                self.generator_optimizer.step()

                # Discriminator Feed Forward
                # Fakes are regenerated here (post generator update) rather than
                # reusing the tensors from the generator step.
                d_real_A = self.discriminator_A(real_A)
                d_real_B = self.discriminator_B(real_B)
                generated_A = self.generator_B2A(real_B)
                d_fake_A = self.discriminator_A(generated_A)

                # for the second step adverserial loss
                cycled_B = self.generator_A2B(generated_A)
                d_cycled_B = self.discriminator_B(cycled_B)
                generated_B = self.generator_A2B(real_A)
                d_fake_B = self.discriminator_B(generated_B)

                # for the second step adverserial loss
                cycled_A = self.generator_B2A(generated_B)
                d_cycled_A = self.discriminator_A(cycled_A)

                # Loss Functions (LSGAN: real -> 1, fake -> 0)
                d_loss_A_real = flow.mean((1 - d_real_A) ** 2)
                d_loss_A_fake = flow.mean((0 - d_fake_A) ** 2)
                d_loss_A = (d_loss_A_real + d_loss_A_fake) / 2.0
                d_loss_B_real = flow.mean((1 - d_real_B) ** 2)
                d_loss_B_fake = flow.mean((0 - d_fake_B) ** 2)
                d_loss_B = (d_loss_B_real + d_loss_B_fake) / 2.0

                # the second step adverserial loss
                # (CycleGAN-VC2's two-step adversarial loss on cycled samples)
                d_loss_A_cycled = flow.mean((0 - d_cycled_A) ** 2)
                d_loss_B_cycled = flow.mean((0 - d_cycled_B) ** 2)
                d_loss_A_2nd = (d_loss_A_real + d_loss_A_cycled) / 2.0
                d_loss_B_2nd = (d_loss_B_real + d_loss_B_cycled) / 2.0

                # Final Loss for discriminator with the second step adverserial loss
                d_loss = (d_loss_A + d_loss_B) / 2.0 + (d_loss_A_2nd +
                                                        d_loss_B_2nd) / 2.0
                self.discriminator_loss_store.append(d_loss.item())

                # Backprop for Discriminator
                self.reset_grad()
                d_loss.backward()
                self.discriminator_optimizer.step()

                # Refresh the progress-bar text every other batch.
                if (i + 1) % 2 == 0:
                    pbar.set_description(
                        "Iter:{} Generator Loss:{:.4f} Discrimator Loss:{:.4f} GA2B:{:.4f} GB2A:{:.4f} G_id:{:.4f} G_cyc:{:.4f} D_A:{:.4f} D_B:{:.4f}"
                        .format(
                            num_iterations,
                            generator_loss.item(),
                            d_loss.item(),
                            generator_loss_A2B,
                            generator_loss_B2A,
                            identiyLoss,
                            cycleLoss,
                            d_loss_A,
                            d_loss_B,
                        ))

            # Periodic checkpoint: log losses to file/console and save all four
            # sub-models every 2000 epochs (skipping epoch 0).
            if epoch % 2000 == 0 and epoch != 0:
                end_time = time.time()
                store_to_file = "Epoch: {} Generator Loss: {:.4f} Discriminator Loss: {}, Time: {:.2f}\n\n".format(
                    epoch,
                    generator_loss.item(),
                    d_loss.item(),
                    end_time - start_time_epoch,
                )
                self.store_to_file(store_to_file)
                print(
                    "Epoch: {} Generator Loss: {:.4f} Discriminator Loss: {}, Time: {:.2f}\n\n"
                    .format(
                        epoch,
                        generator_loss.item(),
                        d_loss.item(),
                        end_time - start_time_epoch,
                    ))
                # Save the Entire model
                print("Saving model Checkpoint ......")
                store_to_file = "Saving model Checkpoint ......"
                self.store_to_file(store_to_file)
                self.saveModelCheckPoint(epoch, self.modelCheckpoint)
                print("Model Saved!")

            # Periodic validation pass over both held-out directories.
            if epoch % 2000 == 0 and epoch != 0:
                # Validation Set
                validation_start_time = time.time()
                self.validation_for_A_dir()
                self.validation_for_B_dir()
                validation_end_time = time.time()
                store_to_file = "Time taken for validation Set: {}".format(
                    validation_end_time - validation_start_time)
                self.store_to_file(store_to_file)
                print("Time taken for validation Set: {}".format(
                    validation_end_time - validation_start_time))

    def infer(self, PATH="sample"):
        """Convert every wav in directory *PATH* from domain A to domain B.

        Uses WORLD analysis/synthesis: decompose each file into f0/sp/ap,
        pitch-shift f0 with the stored A/B log-f0 statistics, run the
        normalized mel-cepstrum through generator_A2B, then resynthesize.
        Output files are written next to the inputs as ``convert_<name>``.
        """
        num_mcep = 36          # mel-cepstral dimension for WORLD encoding
        sampling_rate = 16000
        frame_period = 5.0
        n_frames = 128         # NOTE(review): unused in this method
        infer_A_dir = PATH
        output_A_dir = PATH
        for file in os.listdir(infer_A_dir):
            filePath = os.path.join(infer_A_dir, file)
            wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
            # Pad so the frame count is a multiple of 4 (generator requirement,
            # presumably due to downsampling layers — confirm against model).
            wav = preprocess.wav_padding(wav=wav,
                                         sr=sampling_rate,
                                         frame_period=frame_period,
                                         multiple=4)
            f0, timeaxis, sp, ap = preprocess.world_decompose(
                wav=wav, fs=sampling_rate, frame_period=frame_period)
            # Log-Gaussian normalized f0 transformation A -> B.
            f0_converted = preprocess.pitch_conversion(
                f0=f0,
                mean_log_src=self.log_f0s_mean_A,
                std_log_src=self.log_f0s_std_A,
                mean_log_target=self.log_f0s_mean_B,
                std_log_target=self.log_f0s_std_B,
            )
            coded_sp = preprocess.world_encode_spectral_envelop(
                sp=sp, fs=sampling_rate, dim=num_mcep)
            coded_sp_transposed = coded_sp.T
            # Normalize with source-domain (A) statistics; add batch dim.
            coded_sp_norm = (coded_sp_transposed -
                             self.coded_sps_A_mean) / self.coded_sps_A_std
            coded_sp_norm = np.array([coded_sp_norm])
            if flow.cuda.is_available():
                coded_sp_norm = flow.tensor(coded_sp_norm).cuda().float()
            else:
                coded_sp_norm = flow.tensor(coded_sp_norm).float()
            coded_sp_converted_norm = self.generator_A2B(coded_sp_norm)
            coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach(
            ).numpy()
            coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
            # De-normalize with target-domain (B) statistics.
            coded_sp_converted = (
                coded_sp_converted_norm * self.coded_sps_B_std +
                self.coded_sps_B_mean)
            coded_sp_converted = coded_sp_converted.T
            # WORLD expects a C-contiguous array.
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            decoded_sp_converted = preprocess.world_decode_spectral_envelop(
                coded_sp=coded_sp_converted, fs=sampling_rate)
            wav_transformed = preprocess.world_speech_synthesis(
                f0=f0_converted,
                decoded_sp=decoded_sp_converted,
                ap=ap,
                fs=sampling_rate,
                frame_period=frame_period,
            )
            sf.write(
                os.path.join(output_A_dir,
                             "convert_" + os.path.basename(file)),
                wav_transformed,
                sampling_rate,
            )

    def validation_for_A_dir(self):
        """Convert every wav in the A validation directory to domain B.

        Same WORLD pipeline as :meth:`infer`, but reads from
        ``self.validation_A_dir`` and writes results (same basename, no
        prefix) into ``self.output_A_dir``.
        """
        num_mcep = 36
        sampling_rate = 16000
        frame_period = 5.0
        n_frames = 128         # NOTE(review): unused in this method
        validation_A_dir = self.validation_A_dir
        output_A_dir = self.output_A_dir
        print("Generating Validation Data B from A...")
        for file in os.listdir(validation_A_dir):
            filePath = os.path.join(validation_A_dir, file)
            wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
            wav = preprocess.wav_padding(wav=wav,
                                        sr=sampling_rate,
                                        frame_period=frame_period,
                                        multiple=4)
            f0, timeaxis, sp, ap = preprocess.world_decompose(
                wav=wav, fs=sampling_rate, frame_period=frame_period)
            f0_converted = preprocess.pitch_conversion(
                f0=f0,
                mean_log_src=self.log_f0s_mean_A,
                std_log_src=self.log_f0s_std_A,
                mean_log_target=self.log_f0s_mean_B,
                std_log_target=self.log_f0s_std_B,
            )
            coded_sp = preprocess.world_encode_spectral_envelop(
                sp=sp, fs=sampling_rate, dim=num_mcep)
            coded_sp_transposed = coded_sp.T
            coded_sp_norm = (coded_sp_transposed -
                             self.coded_sps_A_mean) / self.coded_sps_A_std
            coded_sp_norm = np.array([coded_sp_norm])
            if flow.cuda.is_available():
                coded_sp_norm = flow.tensor(coded_sp_norm).cuda().float()
            else:
                coded_sp_norm = flow.tensor(coded_sp_norm).float()
            coded_sp_converted_norm = self.generator_A2B(coded_sp_norm)
            coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach(
            ).numpy()
            coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
            coded_sp_converted = (
                coded_sp_converted_norm * self.coded_sps_B_std +
                self.coded_sps_B_mean)
            coded_sp_converted = coded_sp_converted.T
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            decoded_sp_converted = preprocess.world_decode_spectral_envelop(
                coded_sp=coded_sp_converted, fs=sampling_rate)
            wav_transformed = preprocess.world_speech_synthesis(
                f0=f0_converted,
                decoded_sp=decoded_sp_converted,
                ap=ap,
                fs=sampling_rate,
                frame_period=frame_period,
            )
            sf.write(
                os.path.join(output_A_dir, os.path.basename(file)),
                wav_transformed,
                sampling_rate,
            )

    def validation_for_B_dir(self):
        """Convert every wav in the B validation directory to domain A.

        Mirror image of :meth:`validation_for_A_dir`: B statistics are used
        for normalization, generator_B2A does the mapping, and A statistics
        de-normalize the result.
        """
        num_mcep = 36
        sampling_rate = 16000
        frame_period = 5.0
        n_frames = 128         # NOTE(review): unused in this method
        validation_B_dir = self.validation_B_dir
        output_B_dir = self.output_B_dir
        print("Generating Validation Data A from B...")
        for file in os.listdir(validation_B_dir):
            filePath = os.path.join(validation_B_dir, file)
            wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
            wav = preprocess.wav_padding(wav=wav,
                                        sr=sampling_rate,
                                        frame_period=frame_period,
                                        multiple=4)
            f0, timeaxis, sp, ap = preprocess.world_decompose(
                wav=wav, fs=sampling_rate, frame_period=frame_period)
            f0_converted = preprocess.pitch_conversion(
                f0=f0,
                mean_log_src=self.log_f0s_mean_B,
                std_log_src=self.log_f0s_std_B,
                mean_log_target=self.log_f0s_mean_A,
                std_log_target=self.log_f0s_std_A,
            )
            coded_sp = preprocess.world_encode_spectral_envelop(
                sp=sp, fs=sampling_rate, dim=num_mcep)
            coded_sp_transposed = coded_sp.T
            coded_sp_norm = (coded_sp_transposed -
                             self.coded_sps_B_mean) / self.coded_sps_B_std
            coded_sp_norm = np.array([coded_sp_norm])
            if flow.cuda.is_available():
                coded_sp_norm = flow.tensor(coded_sp_norm).cuda().float()
            else:
                coded_sp_norm = flow.tensor(coded_sp_norm).float()
            coded_sp_converted_norm = self.generator_B2A(coded_sp_norm)
            coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach(
            ).numpy()
            coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
            coded_sp_converted = (
                coded_sp_converted_norm * self.coded_sps_A_std +
                self.coded_sps_A_mean)
            coded_sp_converted = coded_sp_converted.T
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            decoded_sp_converted = preprocess.world_decode_spectral_envelop(
                coded_sp=coded_sp_converted, fs=sampling_rate)
            wav_transformed = preprocess.world_speech_synthesis(
                f0=f0_converted,
                decoded_sp=decoded_sp_converted,
                ap=ap,
                fs=sampling_rate,
                frame_period=frame_period,
            )
            sf.write(
                os.path.join(output_B_dir, os.path.basename(file)),
                wav_transformed,
                sampling_rate,
            )

    def savePickle(self, variable, fileName):
        """Serialize *variable* to *fileName* with pickle."""
        with open(fileName, "wb") as f:
            pickle.dump(variable, f)

    def loadPickleFile(self, fileName):
        """Load and return the pickled object stored in *fileName*."""
        with open(fileName, "rb") as f:
            return pickle.load(f)

    def store_to_file(self, doc):
        """Append *doc* (plus a newline) to the run's log file."""
        doc = doc + "\n"
        with open(self.file_name, "a") as myfile:
            myfile.write(doc)

    def saveModelCheckPoint(self, epoch, PATH):
        """Save all four sub-model state dicts under *PATH*, tagged with *epoch*."""
        flow.save(
            self.generator_A2B.state_dict(),
            os.path.join(PATH, "generator_A2B_%d" % epoch),
        )
        flow.save(
            self.generator_B2A.state_dict(),
            os.path.join(PATH, "generator_B2A_%d" % epoch),
        )
        flow.save(
            self.discriminator_A.state_dict(),
            os.path.join(PATH, "discriminator_A_%d" % epoch),
        )
        flow.save(
            self.discriminator_B.state_dict(),
            os.path.join(PATH, "discriminator_B_%d" % epoch),
        )

    def loadModel(self, PATH):
        """Load all four sub-model state dicts from un-suffixed names under *PATH*.

        NOTE(review): :meth:`saveModelCheckPoint` writes epoch-suffixed names
        (e.g. ``generator_A2B_100``) while this loads bare names — callers
        presumably rename/symlink checkpoints first; confirm.
        """
        self.generator_A2B.load_state_dict(
            flow.load(os.path.join(PATH, "generator_A2B")))
        self.generator_B2A.load_state_dict(
            flow.load(os.path.join(PATH, "generator_B2A")))
        self.discriminator_A.load_state_dict(
            flow.load(os.path.join(PATH, "discriminator_A")))
        self.discriminator_B.load_state_dict(
            flow.load(os.path.join(PATH, "discriminator_B")))
class Solver(object):
    """StarGAN-VC training/testing driver (OneFlow).

    Owns a generator G, discriminator D, and domain classifier C, and runs
    the adversarial + classification + cycle/identity training loop over
    multi-speaker audio features.
    """

    def __init__(self, data_loader, config):
        """Store configuration, pick a device, and build the networks.

        Args:
            data_loader: iterable yielding (x_real, speaker_idx, one_hot_label)
                batches (schema inferred from train() — confirm against loader).
            config: namespace of hyperparameters and paths (see attributes below).
        """
        self.config = config
        self.data_loader = data_loader
        # Model configurations (loss weights).
        self.lambda_cycle = config.lambda_cycle
        self.lambda_cls = config.lambda_cls
        self.lambda_identity = config.lambda_identity
        # Training configurations.
        self.data_dir = config.data_dir
        self.test_dir = config.test_dir
        self.batch_size = config.batch_size
        self.num_iters = config.num_iters
        self.num_iters_decay = config.num_iters_decay
        self.g_lr = config.g_lr
        self.d_lr = config.d_lr
        self.c_lr = config.c_lr
        self.n_critic = config.n_critic          # D updates per G update
        self.beta1 = config.beta1
        self.beta2 = config.beta2
        self.resume_iters = config.resume_iters
        # Test configurations.
        self.pretrain_models = config.pretrain_models
        self.sample_dir = config.sample_dir
        self.trg_speaker = ast.literal_eval(config.trg_speaker)
        self.src_speaker = config.src_speaker
        # Miscellaneous.
        self.device = flow.device(
            "cuda:0" if flow.cuda.is_available() else "cpu")
        # One-hot encoder over the module-level `speakers` list.
        self.spk_enc = LabelBinarizer().fit(speakers)
        # Directories.
        self.model_save_dir = config.model_save_dir
        self.result_dir = config.result_dir
        self.use_gradient_penalty = config.use_gradient_penalty
        # Step size.
        self.log_step = config.log_step
        self.sample_step = config.sample_step
        self.model_save_step = config.model_save_step
        self.lr_update_step = config.lr_update_step
        # Build the model.
        self.build_model()

    def build_model(self):
        """Instantiate G/D/C, their Adam optimizers, and move them to the device."""
        self.G = Generator()
        self.D = Discriminator()
        self.C = DomainClassifier()
        self.g_optimizer = flow.optim.Adam(self.G.parameters(), self.g_lr,
                                           [self.beta1, self.beta2])
        self.d_optimizer = flow.optim.Adam(self.D.parameters(), self.d_lr,
                                           [self.beta1, self.beta2])
        self.c_optimizer = flow.optim.Adam(self.C.parameters(), self.c_lr,
                                           [self.beta1, self.beta2])
        self.print_network(self.G, "G")
        self.print_network(self.D, "D")
        self.print_network(self.C, "C")
        self.G.to(self.device)
        self.D.to(self.device)
        self.C.to(self.device)

    def print_network(self, model, name):
        """Print out the network information."""
        num_params = 0
        for p in model.parameters():
            num_params += p.numel()
        print(model)
        print(name)
        print("The number of parameters: {}".format(num_params))

    def update_lr(self, g_lr, d_lr, c_lr):
        """Decay learning rates of the generator and discriminator and classifier."""
        for param_group in self.g_optimizer.param_groups:
            param_group["lr"] = g_lr
        for param_group in self.d_optimizer.param_groups:
            param_group["lr"] = d_lr
        for param_group in self.c_optimizer.param_groups:
            param_group["lr"] = c_lr

    def train(self):
        """Run the StarGAN-VC training loop for ``self.num_iters`` iterations.

        Per iteration: update classifier C on real data, update discriminator
        D (adversarial + domain-classification loss, optional gradient
        penalty), and every ``n_critic`` iterations update generator G
        (adversarial + cycle + classification + identity losses). Also logs,
        writes audio samples, saves checkpoints, and decays learning rates on
        their configured schedules.
        """
        # Learning rate cache for decaying.
        g_lr = self.g_lr
        d_lr = self.d_lr
        c_lr = self.c_lr
        start_iters = 0
        if self.resume_iters:
            # NOTE(review): resume is not implemented — start_iters stays 0.
            pass
        norm = Normalizer()
        data_iter = iter(self.data_loader)
        print("Start training......")
        start_time = datetime.now()
        for i in range(start_iters, self.num_iters):
            # Preprocess input data
            # Fetch real images and labels.
            try:
                x_real, speaker_idx_org, label_org = next(data_iter)
            except:
                # NOTE(review): bare except — intended to catch StopIteration
                # and restart the epoch, but it also hides loader errors.
                data_iter = iter(self.data_loader)
                x_real, speaker_idx_org, label_org = next(data_iter)

            # Generate target domain labels randomly (permute the batch's own labels).
            rand_idx = flow.randperm(label_org.size(0))
            label_trg = label_org[rand_idx]
            speaker_idx_trg = speaker_idx_org[rand_idx]

            x_real = x_real.to(self.device)
            # Original domain one-hot labels.
            label_org = label_org.to(self.device)
            # Target domain one-hot labels.
            label_trg = label_trg.to(self.device)
            speaker_idx_org = speaker_idx_org.to(self.device)
            speaker_idx_trg = speaker_idx_trg.to(self.device)

            # Train the discriminator
            # Compute loss with real audio frame.
            CELoss = nn.CrossEntropyLoss()
            cls_real = self.C(x_real)
            cls_loss_real = CELoss(input=cls_real, target=speaker_idx_org)
            self.reset_grad()
            cls_loss_real.backward()
            self.c_optimizer.step()

            # Logging.
            loss = {}
            loss["C/C_loss"] = cls_loss_real.item()

            out_r = self.D(x_real, label_org)
            # Compute loss with fake audio frame. x_fake is detached so only D trains here.
            x_fake = self.G(x_real, label_trg)
            out_f = self.D(x_fake.detach(), label_trg)
            d_loss_t = nn.BCEWithLogitsLoss()(
                input=out_f, target=flow.zeros_like(
                    out_f).float()) + nn.BCEWithLogitsLoss()(
                        input=out_r, target=flow.ones_like(out_r).float())
            out_cls = self.C(x_fake)
            d_loss_cls = CELoss(input=out_cls, target=speaker_idx_trg)

            # Compute loss for gradient penalty.
            alpha = flow.rand(x_real.size(0), 1, 1, 1).to(self.device)
            x_hat = ((alpha * x_real +
                      (1 - alpha) * x_fake).detach().requires_grad_(True))
            out_src = self.D(x_hat, label_trg)
            # TODO: Second-order derivation is not currently supported in oneflow, so gradient penalty cannot be used temporarily.
            if self.use_gradient_penalty:
                d_loss_gp = self.gradient_penalty(out_src, x_hat)
                d_loss = d_loss_t + self.lambda_cls * d_loss_cls + 5 * d_loss_gp
            else:
                d_loss = d_loss_t + self.lambda_cls * d_loss_cls
            self.reset_grad()
            d_loss.backward()
            self.d_optimizer.step()
            loss["D/D_loss"] = d_loss.item()

            # Train the generator (only every n_critic-th iteration).
            if (i + 1) % self.n_critic == 0:
                # Original-to-target domain.
                x_fake = self.G(x_real, label_trg)
                g_out_src = self.D(x_fake, label_trg)
                g_loss_fake = nn.BCEWithLogitsLoss()(
                    input=g_out_src, target=flow.ones_like(g_out_src).float())
                out_cls = self.C(x_real)
                g_loss_cls = CELoss(input=out_cls, target=speaker_idx_org)
                # Target-to-original domain.
                x_reconst = self.G(x_fake, label_org)
                g_loss_rec = nn.L1Loss()(x_reconst, x_real)
                # Original-to-Original domain(identity).
                x_fake_iden = self.G(x_real, label_org)
                id_loss = nn.L1Loss()(x_fake_iden, x_real)
                # Backward and optimize.
                g_loss = (g_loss_fake + self.lambda_cycle * g_loss_rec +
                          self.lambda_cls * g_loss_cls +
                          self.lambda_identity * id_loss)
                self.reset_grad()
                g_loss.backward()
                self.g_optimizer.step()
                # Logging.
                loss["G/loss_fake"] = g_loss_fake.item()
                loss["G/loss_rec"] = g_loss_rec.item()
                loss["G/loss_cls"] = g_loss_cls.item()
                loss["G/loss_id"] = id_loss.item()
                loss["G/g_loss"] = g_loss.item()

            # Miscellaneous
            # Print out training information.
            if (i + 1) % self.log_step == 0:
                et = datetime.now() - start_time
                et = str(et)[:-7]  # drop microseconds from the elapsed time
                log = "Elapsed [{}], Iteration [{}/{}]".format(
                    et, i + 1, self.num_iters)
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)

            # Translate fixed images for debugging.
            if (i + 1) % self.sample_step == 0:
                with flow.no_grad():
                    d, speaker = TestSet(self.test_dir).test_data()
                    # Random target speaker different from the source.
                    target = random.choice(
                        [x for x in speakers if x != speaker])
                    label_t = self.spk_enc.transform([target])[0]
                    label_t = np.asarray([label_t])
                    for filename, content in d.items():
                        f0 = content["f0"]
                        ap = content["ap"]
                        sp_norm_pad = self.pad_coded_sp(
                            content["coded_sp_norm"])
                        convert_result = []
                        # Convert in fixed-size FRAMES windows, then stitch.
                        for start_idx in range(
                                0, sp_norm_pad.shape[1] - FRAMES + 1, FRAMES):
                            one_seg = sp_norm_pad[:, start_idx:start_idx +
                                                  FRAMES]
                            one_seg = flow.Tensor(one_seg).to(self.device)
                            one_seg = one_seg.view(1, 1, one_seg.size(0),
                                                   one_seg.size(1))
                            l = flow.Tensor(label_t)
                            one_seg = one_seg.to(self.device)
                            l = l.to(self.device)
                            one_set_return = self.G(one_seg,
                                                    l).detach().cpu().numpy()
                            one_set_return = np.squeeze(one_set_return)
                            one_set_return = norm.backward_process(
                                one_set_return, target)
                            convert_result.append(one_set_return)
                        convert_con = np.concatenate(convert_result, axis=1)
                        # Trim the zero-padding back to the original length.
                        convert_con = convert_con[:, 0:content["coded_sp_norm"].
                                                  shape[1]]
                        contigu = np.ascontiguousarray(convert_con.T,
                                                       dtype=np.float64)
                        decoded_sp = decode_spectral_envelope(contigu,
                                                              SAMPLE_RATE,
                                                              fft_size=FFTSIZE)
                        f0_converted = norm.pitch_conversion(
                            f0, speaker, target)
                        wav = synthesize(f0_converted, decoded_sp, ap,
                                         SAMPLE_RATE)
                        # NOTE(review): "(unknown)" looks like a scrubbed
                        # "{filename}" placeholder — confirm against upstream.
                        name = f"{speaker}-{target}_iter{i+1}_(unknown)"
                        path = os.path.join(self.sample_dir, name)
                        print(f"[save]:{path}")
                        sf.write(path, wav, SAMPLE_RATE)

            # Save model checkpoints.
            if (i + 1) % self.model_save_step == 0:
                G_path = os.path.join(self.model_save_dir,
                                      "{}-G".format(i + 1))
                D_path = os.path.join(self.model_save_dir,
                                      "{}-D".format(i + 1))
                C_path = os.path.join(self.model_save_dir,
                                      "{}-C".format(i + 1))
                flow.save(self.G.state_dict(), G_path)
                flow.save(self.D.state_dict(), D_path)
                flow.save(self.C.state_dict(), C_path)
                print("Saved model checkpoints into {}...".format(
                    self.model_save_dir))

            # Decay learning rates (linearly over the final num_iters_decay iterations).
            if (i + 1) % self.lr_update_step == 0 and (i + 1) > (
                    self.num_iters - self.num_iters_decay):
                g_lr -= self.g_lr / float(self.num_iters_decay)
                d_lr -= self.d_lr / float(self.num_iters_decay)
                c_lr -= self.c_lr / float(self.num_iters_decay)
                self.update_lr(g_lr, d_lr, c_lr)
                print("Decayed learning rates, g_lr: {}, d_lr: {}.".format(
                    g_lr, d_lr))

    def gradient_penalty(self, y, x):
        """Compute gradient penalty: (L2_norm(dy/dx) - 1)**2."""
        weight = flow.ones(y.size()).to(self.device)
        dydx = flow.autograd.grad(outputs=y,
                                  inputs=x,
                                  out_grads=weight,
                                  retain_graph=True,
                                  create_graph=True)[0]
        dydx = dydx.view(dydx.size(0), -1)
        dydx_l2norm = flow.sqrt(flow.sum(dydx**2, dim=1))
        return flow.mean((dydx_l2norm - 1)**2)

    def reset_grad(self):
        """Reset the gradient buffers."""
        self.g_optimizer.zero_grad()
        self.d_optimizer.zero_grad()
        self.c_optimizer.zero_grad()

    def restore_model(self, model_save_dir):
        """Restore the trained generator and discriminator.

        Loads the fixed 200000-iteration checkpoints from *model_save_dir*.
        """
        print("Loading the pretrain models...")
        G_path = os.path.join(model_save_dir, "200000-G")
        D_path = os.path.join(model_save_dir, "200000-D")
        C_path = os.path.join(model_save_dir, "200000-C")
        self.G.load_state_dict(flow.load(G_path))
        self.D.load_state_dict(flow.load(D_path))
        self.C.load_state_dict(flow.load(C_path))

    @staticmethod
    def pad_coded_sp(coded_sp_norm):
        """Right-pad a (dim, frames) array with zeros to a multiple of FRAMES.

        NOTE(review): when the frame count is already an exact multiple of
        FRAMES, this pads a full extra FRAMES block (pad_length == FRAMES);
        the surplus is trimmed after conversion, so it may be intentional —
        confirm before changing.
        """
        f_len = coded_sp_norm.shape[1]
        if f_len >= FRAMES:
            pad_length = FRAMES - (f_len - (f_len // FRAMES) * FRAMES)
        elif f_len < FRAMES:
            pad_length = FRAMES - f_len
        sp_norm_pad = np.hstack(
            (coded_sp_norm, np.zeros((coded_sp_norm.shape[0], pad_length))))
        return sp_norm_pad

    def test(self):
        """Translate speech using StarGAN .

        Restores the pretrained models, then converts the source speaker's
        test data to every configured target speaker, writing wavs into
        ``self.result_dir``.
        """
        # Load the trained generator.
        self.restore_model(self.pretrain_models)
        norm = Normalizer()
        # Set data loader.
        d, speaker = TestSet(self.test_dir).test_data(self.src_speaker)
        targets = self.trg_speaker
        for target in targets:
            print(target)
            assert target in speakers
            label_t = self.spk_enc.transform([target])[0]
            label_t = np.asarray([label_t])
            with flow.no_grad():
                for filename, content in d.items():
                    f0 = content["f0"]
                    ap = content["ap"]
                    sp_norm_pad = self.pad_coded_sp(content["coded_sp_norm"])
                    convert_result = []
                    # Same windowed conversion as the sampling branch in train().
                    for start_idx in range(0, sp_norm_pad.shape[1] - FRAMES + 1,
                                           FRAMES):
                        one_seg = sp_norm_pad[:, start_idx:start_idx + FRAMES]
                        one_seg = flow.Tensor(one_seg).to(self.device)
                        one_seg = one_seg.view(1, 1, one_seg.size(0),
                                               one_seg.size(1))
                        l = flow.Tensor(label_t)
                        one_seg = one_seg.to(self.device)
                        l = l.to(self.device)
                        one_set_return = self.G(one_seg,
                                                l).detach().cpu().numpy()
                        one_set_return = np.squeeze(one_set_return)
                        one_set_return = norm.backward_process(
                            one_set_return, target)
                        convert_result.append(one_set_return)
                    convert_con = np.concatenate(convert_result, axis=1)
                    convert_con = convert_con[:, 0:content["coded_sp_norm"].
                                              shape[1]]
                    contigu = np.ascontiguousarray(convert_con.T,
                                                   dtype=np.float64)
                    decoded_sp = decode_spectral_envelope(contigu,
                                                          SAMPLE_RATE,
                                                          fft_size=FFTSIZE)
                    f0_converted = norm.pitch_conversion(f0, speaker, target)
                    wav = synthesize(f0_converted, decoded_sp, ap, SAMPLE_RATE)
                    # NOTE(review): "(unknown)" looks like a scrubbed
                    # "{filename}" placeholder — confirm against upstream.
                    name = f"{speaker}-{target}_(unknown)"
                    path = os.path.join(self.result_dir, name)
                    print(f"[save]:{path}")
                    sf.write(path, wav, SAMPLE_RATE)