# NOTE(review): this chunk is a whitespace-mangled single-line paste of a
# CycleGAN2 training script (hyperparameters, cached-feature loading, start of
# the training loop).  It is truncated mid-statement -- the trailing
# `for i in range(n_samples // mini_batch_size):` has no body here -- and,
# because the physical line begins with '#', the whole thing is currently one
# comment.  Left byte-identical pending recovery of the original formatting
# and the missing loop body.  It also references names not defined in this
# chunk (exp_A_dir, exp_B_dir, log_dir, load_pickle, CycleGAN2,
# sample_train_data) -- presumably defined elsewhere in the original file.
# Data parameters sampling_rate = 22050 num_mcep = 36 frame_period = 5.0 n_frames = 128 # Training parameters num_iterations = 200000 mini_batch_size = 1 generator_learning_rate = 0.0002 discriminator_learning_rate = 0.0001 lambda_cycle = 10 lambda_identity = 5 print('Loading cached data...') coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, log_f0s_mean_A, log_f0s_std_A = load_pickle( os.path.join(exp_A_dir, 'cache{}.p'.format(num_mcep))) coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std, log_f0s_mean_B, log_f0s_std_B = load_pickle( os.path.join(exp_B_dir, 'cache{}.p'.format(num_mcep))) model = CycleGAN2(num_features=num_mcep, batch_size=mini_batch_size, log_dir=log_dir) iteration = 1 while iteration <= num_iterations: dataset_A, dataset_B = sample_train_data(dataset_A=coded_sps_A_norm, dataset_B=coded_sps_B_norm, n_frames=n_frames) n_samples = dataset_A.shape[0] for i in range(n_samples // mini_batch_size):
# NOTE(review): whitespace-mangled paste of a second training script
# (CycleGAN2 "two_step" over all ordered pairs of VCC2018 speaker dirs).
# Truncated mid-loop: `while iteration <= num_iterations:` shows only its
# first statement (`train_ab_index = ...`), so it cannot be safely
# reconstructed from this view.  Left byte-identical.
# NOTE(review): `if start_at is not 0:` compares identity against an int
# literal -- a SyntaxWarning on modern CPython and incorrect in principle;
# should be `if start_at != 0:` once the fragment is recovered.
# NOTE(review): depends on names not defined here (data_dir, exp_dir,
# num_mcep, mini_batch_size, log_dir, start_at, dataset, model_name,
# iteration, num_iterations, load_pickle, CycleGAN2, permutations).
train_basic_dir = os.path.join(data_dir, 'vcc2018_training') dirs = glob.glob(os.path.join(train_basic_dir, 'VCC*')) data_a = list() data_b = list() for i in permutations(dirs, 2): train_A_dir = i[0] train_B_dir = i[1] exp_A_dir = os.path.join(exp_dir, os.path.basename(i[0])) exp_B_dir = os.path.join(exp_dir, os.path.basename(i[1])) data_a.append( load_pickle(os.path.join(exp_A_dir, 'cache{}.p'.format(num_mcep)))[0]) data_b.append( load_pickle(os.path.join(exp_B_dir, 'cache{}.p'.format(num_mcep)))[0]) model = CycleGAN2(num_features=num_mcep, batch_size=mini_batch_size, log_dir=log_dir) if start_at is not 0: model.load( os.path.join('experiments', dataset, model_name, 'checkpoints', 'cyclegan_vc2_two_step_' + str(start_at) + '.ckpt')) while iteration <= num_iterations: train_ab_index = random.randint(0, len(data_a) - 1)
def main():
    """Train a RelGAN voice-conversion model over every domain in the dataset dir.

    Loads per-domain cached WORLD features, resumes from the newest checkpoint
    when one exists, then runs the adversarial training loop with periodic
    console logging, checkpointing, and (when ``val_flag`` is set) a
    validation pass that writes converted WAV files.

    NOTE(review): depends on module-level configuration and helpers not
    visible in this chunk (argv, num_mcep, mini_batch_size, n_frames,
    num_iterations, lambda_triangle, lambda_backward, lambda_cycle,
    lambda_identity, the learning rates, val_flag, sampling_rate,
    frame_period, load_pickle, sample_train_data, RelGAN, tf, librosa and the
    world_* helpers).  The in-place `*=` updates of lambda_triangle /
    lambda_backward and the learning rates require those names to be
    assignable here (e.g. declared ``global`` in the full file) -- confirm.
    """
    log_dir = os.path.join(argv.output_dir, 'log', argv.model_name)
    os.makedirs(log_dir, exist_ok=True)

    # One subdirectory per emotion/domain under the dataset root.
    exp_dirs = []
    for f in os.listdir(argv.dataset_dir):
        exp_dirs.append(os.path.join(argv.dataset_dir, f))
    print(exp_dirs)

    print('Loading cached data...')
    coded_sps_norms = []
    coded_sps_means = []
    coded_sps_stds = []
    log_f0s_means = []
    log_f0s_stds = []
    for f in exp_dirs:
        coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, log_f0s_mean_A, log_f0s_std_A = load_pickle(
            os.path.join(f, 'cache{}.p'.format(num_mcep)))
        coded_sps_norms.append(coded_sps_A_norm)
        coded_sps_means.append(coded_sps_A_mean)
        coded_sps_stds.append(coded_sps_A_std)
        log_f0s_means.append(log_f0s_mean_A)
        log_f0s_stds.append(log_f0s_std_A)
    num_domains = len(coded_sps_norms)

    model = RelGAN(num_features=num_mcep, num_domains=num_domains,
                   batch_size=mini_batch_size, log_dir=log_dir)

    # Use ONE checkpoint directory for create/scan/save.  BUG FIX: the
    # original saved to 'experiments/<model>/checkpoints' while scanning
    # '<output_dir>/experiment/<model>/checkpoints' at startup, so a restarted
    # run could never resume from its own checkpoints.
    checkpoint_dir = os.path.join(argv.output_dir, 'experiment',
                                  argv.model_name, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt:
        last_model = ckpt.model_checkpoint_path
        print("loading {}".format(last_model))
        model.load(filepath=last_model)
    else:
        print("checkpoints are not found")

    iteration = 1
    while iteration <= num_iterations:
        # Decay the triangle/backward loss weights every 10k iterations; the
        # learning rates decay every iteration.  NOTE(review): placement was
        # reconstructed from a whitespace-mangled source -- confirm against
        # the upstream RelGAN implementation.
        if iteration % 10000 == 0:
            lambda_triangle *= 0.9
            lambda_backward *= 0.9
        generator_learning_rate *= 0.99999
        discriminator_learning_rate *= 0.99999

        x, x2, x_atr, y, y_atr, z, z_atr = sample_train_data(
            dataset_A=coded_sps_norms, nBatch=mini_batch_size,
            num_mcep=num_mcep, n_frames=n_frames)

        # One-hot domain labels for the three sampled domains.
        x_labels = np.zeros([mini_batch_size, num_domains])
        y_labels = np.zeros([mini_batch_size, num_domains])
        z_labels = np.zeros([mini_batch_size, num_domains])
        for b in range(mini_batch_size):
            x_labels[b] = np.identity(num_domains)[x_atr[b]]
            y_labels[b] = np.identity(num_domains)[y_atr[b]]
            z_labels[b] = np.identity(num_domains)[z_atr[b]]

        # Interpolation coefficient: draw from [0, 0.5) or [0.5, 1.0) with
        # equal probability; `rnd` tells the model which half was used.
        rnd = np.random.randint(2)
        alp = np.random.uniform(0, 0.5, size=mini_batch_size) if rnd == 0 \
            else np.random.uniform(0.5, 1.0, size=mini_batch_size)

        generator_loss, discriminator_loss, gen_adv_loss, gen_cond_loss, \
            gen_int_loss, gen_rec_loss, gen_self_loss, dis_adv_loss, \
            dis_cond_loss, dis_int_loss, lossb, lossm, losst = model.train(
                input_A=x, input_A2=x2, input_B=y, input_C=z,
                label_A=x_labels, label_B=y_labels, label_C=z_labels,
                alpha=alp, rand=rnd,
                lambda_cycle=lambda_cycle, lambda_identity=lambda_identity,
                lambda_triangle=lambda_triangle,
                lambda_backward=lambda_backward,
                generator_learning_rate=generator_learning_rate,
                discriminator_learning_rate=discriminator_learning_rate)

        if iteration % 10 == 0:
            print('Iteration: {:07d}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                  .format(iteration, generator_loss, discriminator_loss))
            print("d_a=%.3f, d_c=%.3f, d_i=%.3f" %
                  (dis_adv_loss, dis_cond_loss, dis_int_loss))
            print("g_a=%.3f, g_c=%.3f, g_i=%.3f, g_r=%.3f, g_s=%.3f, g_b=%.3f, g_m=%.3f, g_t=%.3f" %
                  (gen_adv_loss, gen_cond_loss, gen_int_loss, gen_rec_loss,
                   gen_self_loss, lossb, lossm, losst))

        if iteration % 5000 == 0:
            print('Checkpointing...')
            model.save(directory=checkpoint_dir,
                       filename='{}_{}.ckpt'.format(argv.model_name, iteration))

        if val_flag and iteration % 1000 == 0:
            # q == 0 forces alpha == 1 (full conversion); q > 0 samples alpha.
            for q in range(3):
                eval_dirs = os.listdir('datasets_val')
                assert len(eval_dirs) == num_domains
                x, x2, x_atr, y, y_atr, z, z_atr = sample_train_data(
                    dataset_A=coded_sps_norms, nBatch=1,
                    num_mcep=num_mcep, n_frames=n_frames)
                x_labels = np.zeros([1, num_domains])
                y_labels = np.zeros([1, num_domains])
                for b in range(1):
                    x_labels[b] = np.identity(num_domains)[x_atr[b]]
                    y_labels[b] = np.identity(num_domains)[y_atr[b]]
                x_atr = x_atr[0]
                y_atr = y_atr[0]
                eval_A_dir = os.path.join('datasets_val', eval_dirs[x_atr])
                print(eval_A_dir)
                for wav_file in glob.glob(eval_A_dir + '/*.wav'):
                    alpha = np.random.uniform(0, 1, size=1) if q != 0 else np.ones(1)
                    wav, _ = librosa.load(wav_file, sr=sampling_rate, mono=True)
                    # Peak-normalize; the 0.01 floor avoids dividing by ~0 on silence.
                    wav *= 1. / max(0.01, np.max(np.abs(wav)))
                    wav = wav_padding(wav=wav, sr=sampling_rate,
                                      frame_period=frame_period, multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)

                    # Interpolate source/target F0 statistics in the linear
                    # domain, then go back to log space for pitch conversion.
                    f0s_mean_A = np.exp(log_f0s_means[x_atr])
                    f0s_mean_B = np.exp(log_f0s_means[y_atr])
                    f0s_mean_AB = alpha * f0s_mean_B + (1 - alpha) * f0s_mean_A
                    log_f0s_mean_AB = np.log(f0s_mean_AB)
                    f0s_std_A = np.exp(log_f0s_stds[x_atr])
                    f0s_std_B = np.exp(log_f0s_stds[y_atr])
                    f0s_std_AB = alpha * f0s_std_B + (1 - alpha) * f0s_std_A
                    log_f0s_std_AB = np.log(f0s_std_AB)
                    f0_converted = pitch_conversion(
                        f0=f0, mean_log_src=log_f0s_means[x_atr],
                        std_log_src=log_f0s_stds[x_atr],
                        mean_log_target=log_f0s_mean_AB,
                        std_log_target=log_f0s_std_AB)

                    coded_sp = world_encode_spectral_envelop(
                        sp=sp, fs=sampling_rate, dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed -
                                     coded_sps_means[x_atr]) / coded_sps_stds[x_atr]
                    coded_sp_converted_norm = model.test(
                        inputs=np.array([coded_sp_norm]),
                        label_A=x_labels, label_B=y_labels, alpha=alpha)[0]
                    # The model may pad by one frame; trim back to the F0 length.
                    if coded_sp_converted_norm.shape[1] > len(f0):
                        coded_sp_converted_norm = coded_sp_converted_norm[:, :-1]

                    # De-normalize with statistics interpolated between domains.
                    coded_sps_AB_mean = (1 - alpha) * coded_sps_means[x_atr] + \
                        alpha * coded_sps_means[y_atr]
                    coded_sps_AB_std = (1 - alpha) * coded_sps_stds[x_atr] + \
                        alpha * coded_sps_stds[y_atr]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_AB_std + coded_sps_AB_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted, decoded_sp=decoded_sp_converted,
                        ap=ap, fs=sampling_rate, frame_period=frame_period)
                    wav_transformed *= 1. / max(0.01, np.max(np.abs(wav_transformed)))

                    validation_A_output_dir = 'test'
                    os.makedirs(validation_A_output_dir, exist_ok=True)
                    # NOTE(review): librosa.output.write_wav was removed in
                    # librosa 0.8 -- this requires librosa < 0.8 (or a port
                    # to soundfile.write).
                    librosa.output.write_wav(
                        os.path.join(
                            validation_A_output_dir,
                            "{:06d}_{}_to_{}_{:.3f}_{}".format(
                                iteration, x_atr, y_atr, alpha[0],
                                os.path.basename(wav_file))),
                        wav_transformed, sampling_rate)
        iteration += 1
def train(self, batch_size, mode='train', model_iter='0'):
    """Train the disentanglement-VAE voice-conversion model on VCC2018 speakers.

    Iterates over every ordered speaker pair, samples fresh training chunks
    for each pair each epoch, and interleaves auxiliary classifier / ASR /
    attribute-classifier updates with VAE (and optionally WGAN-style
    discriminator) updates.

    Args:
        batch_size: mini-batch size used to slice the sampled datasets.
        mode: 'DisentanglementANDConvPath_VAE' runs the pure VAE schedule;
            'DisentanglementANDConvPath_VAE_with_GAN' adds a weight-clipped
            patch discriminator.  Any other value does nothing.
        model_iter: suffix appended to the model save directory name.

    NOTE(review): reconstructed from a whitespace-mangled source; statement
    grouping (especially what runs per-epoch vs. per-pair) was inferred from
    which variables each loss print reads -- confirm against the original.
    """
    speaker_list = ['VCC2SF1', 'VCC2SF2', 'VCC2SM1', 'VCC2SM2']
    num_mcep = 36
    frame_period = 5.0  # NOTE(review): unused in this method; kept from original
    n_frames = 128
    batch_num = batch_size
    exp_dir = os.path.join('processed')
    device = torch.device("cuda")
    print('Loading cached data...')
    ac_lr = 0.0001  # NOTE(review): unused in this method
    lr = 0.001      # NOTE(review): decayed in GAN mode but never read afterwards
    random.seed()

    def _load_pair(src_idx, trg_idx):
        # Load cached normalized features for both speakers of the pair,
        # sample aligned training chunks, add a channel axis, and move to GPU.
        # (Both training modes used this exact sequence; de-duplicated here.)
        exp_A_dir = os.path.join(exp_dir, speaker_list[src_idx])
        exp_B_dir = os.path.join(exp_dir, speaker_list[trg_idx])
        coded_sps_A_norm = load_pickle(
            os.path.join(exp_A_dir, 'cache{}.p'.format(num_mcep)))[0]
        coded_sps_B_norm = load_pickle(
            os.path.join(exp_B_dir, 'cache{}.p'.format(num_mcep)))[0]
        dataset_A, dataset_B = sample_train_data(
            dataset_A=coded_sps_A_norm, dataset_B=coded_sps_B_norm,
            n_frames=n_frames)
        dataset_A = np.expand_dims(dataset_A, axis=1)
        dataset_B = np.expand_dims(dataset_B, axis=1)
        return (torch.from_numpy(dataset_A).to(device, dtype=torch.float),
                torch.from_numpy(dataset_B).to(device, dtype=torch.float))

    if mode == 'DisentanglementANDConvPath_VAE':
        for ep in range(100):
            for i in range(4):
                for j in range(4):
                    dataset_A, dataset_B = _load_pair(i, j)
                    for iteration in range(4):
                        start = iteration * batch_num
                        end = (iteration + 1) * batch_num
                        if ((iteration + 1) % 4) != 0:
                            # Three of four steps: train the speaker classifier only.
                            self.grad_reset()
                            clf_loss_A = self.clf_step(dataset_A[start:end], i, batch_num)
                            clf_loss_B = self.clf_step(dataset_B[start:end], j, batch_num)
                            Clf_loss = clf_loss_A + clf_loss_B
                            loss = Clf_loss
                            loss.backward()
                            self.cls_optimizer.step()
                        else:
                            # Every fourth step ((iteration + 1) % 4 == 0):
                            # ASR, attribute classifier, then the full VAE update.
                            self.grad_reset()
                            asr_loss_A = self.asr_step(dataset_A[start:end], i, batch_num)
                            asr_loss_B = self.asr_step(dataset_B[start:end], j, batch_num)
                            asr_loss = asr_loss_A + asr_loss_B
                            loss = asr_loss
                            loss.backward()
                            self.asr_optimizer.step()

                            self.grad_reset()
                            AC_source, AC_target = self.AC_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            AC_t_loss = AC_source + AC_target
                            AC_t_loss.backward()
                            self.ac_optimizer.step()

                            self.grad_reset()
                            # VAE step (same-speaker reconstruction).
                            src_KLD, src_same_loss_rec, _ = self.vae_step(
                                dataset_A[start:end], dataset_B[start:end], i, i, batch_num)
                            trg_KLD, trg_same_loss_rec, _ = self.vae_step(
                                dataset_B[start:end], dataset_A[start:end], j, j, batch_num)
                            # AC F step (attribute classifier on generated data).
                            AC_real_src, AC_cross_src = self.AC_F_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            AC_real_trg, AC_cross_trg = self.AC_F_step(
                                dataset_B[start:end], dataset_A[start:end], j, i, batch_num)
                            # clf/asr step (adversarial signals for the VAE).
                            clf_loss_A, asr_loss_A = self.clf_asr_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            clf_loss_B, asr_loss_B = self.clf_asr_step(
                                dataset_B[start:end], dataset_A[start:end], j, i, batch_num)
                            Clf_loss = (clf_loss_A + clf_loss_B) / 2.0
                            ASR_loss = (asr_loss_A + asr_loss_B) / 2.0
                            # Cycle step.
                            src_cyc_KLD, src_cyc_loss_rec = self.cycle_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            trg_cyc_KLD, trg_cyc_loss_rec = self.cycle_step(
                                dataset_B[start:end], dataset_A[start:end], j, i, batch_num)
                            # Semantic step.
                            src_semloss = self.sem_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            trg_semloss = self.sem_step(
                                dataset_B[start:end], dataset_A[start:end], j, i, batch_num)

                            AC_f_loss = (AC_real_src + AC_real_trg +
                                         AC_cross_src + AC_cross_trg) / 4.0
                            Sem_loss = (src_semloss + trg_semloss) / 2.0
                            Cycle_KLD_loss = (src_cyc_KLD + trg_cyc_KLD) / 2.0
                            Cycle_rec_loss = (src_cyc_loss_rec + trg_cyc_loss_rec) / 2.0
                            KLD_loss = (src_KLD + trg_KLD) / 2.0
                            Rec_loss = (src_same_loss_rec + trg_same_loss_rec) / 2.0
                            # The classifier loss is *subtracted*: the VAE is
                            # trained adversarially against the speaker classifier.
                            loss = Rec_loss + KLD_loss + Cycle_KLD_loss + Cycle_rec_loss \
                                + AC_f_loss + Sem_loss - Clf_loss + ASR_loss
                            loss.backward()
                            self.vae_optimizer.step()
            # Per-epoch logging; values come from the last pair's last VAE step.
            # (Original guarded this with `(ep + 1) % 1 == 0`, which is always true.)
            print("Epoch : {}, Recon : {:.3f}, KLD : {:.3f}, AC t Loss : {:.3f}, AC f Loss : {:.3f}, Sem Loss : {:.3f}, Clf : {:.3f}, Asr Loss : {:.3f}"
                  .format(ep + 1, Rec_loss, KLD_loss, AC_t_loss, AC_cross_trg,
                          Sem_loss, Clf_loss, ASR_loss))
            os.makedirs("./VAE_all" + model_iter, exist_ok=True)
            if (ep + 1) % 50 == 0:
                print("Model Save Epoch {}".format(ep + 1))
                self.save_model("VAE_all" + model_iter, ep + 1)

    if mode == 'DisentanglementANDConvPath_VAE_with_GAN':
        os.makedirs("./GAN_all" + model_iter, exist_ok=True)
        for ep in range(200):
            if ep > 100:
                lr = lr * 0.9  # NOTE(review): never applied to an optimizer here
            for i in range(4):
                for j in range(4):
                    dataset_A, dataset_B = _load_pair(i, j)
                    # NOTE(review): needs 81 // batch_num >= 5 (batch_num <= 16)
                    # so at least one generator step runs per pair; otherwise the
                    # ep > 10 epoch print would hit undefined loss variables.
                    for iteration in range(81 // batch_num):
                        start = iteration * batch_num
                        end = (iteration + 1) * batch_num
                        if ((iteration + 1) % 5) != 0:
                            # Critic phase: classifier update, then a WGAN-style
                            # discriminator step with weight clipping.
                            self.grad_reset()
                            clf_loss_A = self.clf_step(dataset_A[start:end], i, batch_num)
                            clf_loss_B = self.clf_step(dataset_B[start:end], j, batch_num)
                            Clf_loss = clf_loss_A + clf_loss_B
                            loss = Clf_loss
                            loss.backward()
                            self.cls_optimizer.step()

                            self.grad_reset()
                            convert_KLD, convert_rec, src_to_trg_x_tilde = self.vae_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            trg_w_dis = self.patch_step(
                                dataset_B[start:end], src_to_trg_x_tilde, j, is_dis=True)
                            trg_adv_loss = trg_w_dis
                            adv_loss = trg_adv_loss
                            adv_loss.backward()
                            self.dis_optimizer.step()
                            # WGAN weight clipping on the critic.
                            for p in self.Discriminator.parameters():
                                p.data.clamp_(-0.01, 0.01)
                        elif ((iteration + 1) % 5) == 0 and ep > 10:
                            # Generator phase, only after the critic warm-up epochs.
                            self.grad_reset()
                            asr_loss_A = self.asr_step(dataset_A[start:end], i, batch_num)
                            asr_loss_B = self.asr_step(dataset_B[start:end], j, batch_num)
                            asr_loss = asr_loss_A + asr_loss_B
                            loss = asr_loss
                            loss.backward()
                            self.asr_optimizer.step()

                            self.grad_reset()
                            AC_source, AC_target = self.AC_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            AC_t_loss = AC_source + AC_target
                            AC_t_loss.backward()
                            self.ac_optimizer.step()

                            self.grad_reset()
                            # VAE step (same-speaker reconstruction).
                            src_KLD, src_same_loss_rec, _ = self.vae_step(
                                dataset_A[start:end], dataset_B[start:end], i, i, batch_num)
                            trg_KLD, trg_same_loss_rec, _ = self.vae_step(
                                dataset_B[start:end], dataset_A[start:end], j, j, batch_num)
                            # AC F step.
                            AC_real_src, AC_cross_src = self.AC_F_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            AC_real_trg, AC_cross_trg = self.AC_F_step(
                                dataset_B[start:end], dataset_A[start:end], j, i, batch_num)
                            # clf/asr step.
                            clf_loss_A, asr_loss_A = self.clf_asr_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            clf_loss_B, asr_loss_B = self.clf_asr_step(
                                dataset_B[start:end], dataset_A[start:end], j, i, batch_num)
                            Clf_loss = (clf_loss_A + clf_loss_B) / 2.0
                            ASR_loss = (asr_loss_A + asr_loss_B) / 2.0
                            # Cycle step.
                            src_cyc_KLD, src_cyc_loss_rec = self.cycle_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            trg_cyc_KLD, trg_cyc_loss_rec = self.cycle_step(
                                dataset_B[start:end], dataset_A[start:end], j, i, batch_num)
                            # Semantic step.
                            src_semloss = self.sem_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            trg_semloss = self.sem_step(
                                dataset_B[start:end], dataset_A[start:end], j, i, batch_num)
                            # Adversarial (generator) term from the patch critic.
                            convert_KLD, convert_rec, src_to_trg_x_tilde = self.vae_step(
                                dataset_A[start:end], dataset_B[start:end], i, j, batch_num)
                            trg_loss_adv = self.patch_step(
                                dataset_B[start:end], src_to_trg_x_tilde, j, is_dis=False)

                            AC_f_loss = (AC_real_src + AC_real_trg +
                                         AC_cross_src + AC_cross_trg) / 4.0
                            Sem_loss = (src_semloss + trg_semloss) / 2.0
                            Cycle_KLD_loss = (src_cyc_KLD + trg_cyc_KLD) / 2.0
                            Cycle_rec_loss = (src_cyc_loss_rec + trg_cyc_loss_rec) / 2.0
                            KLD_loss = (src_KLD + trg_KLD) / 2.0
                            Rec_loss = (src_same_loss_rec + trg_same_loss_rec) / 2.0
                            loss = 2 * (Rec_loss + KLD_loss) + Cycle_rec_loss + Cycle_KLD_loss \
                                + AC_f_loss + Sem_loss + trg_loss_adv - Clf_loss + ASR_loss
                            loss.backward()
                            self.vae_optimizer.step()
            # Per-epoch logging; generator losses exist only after warm-up.
            if ep > 10:
                print("Epoch : {}, Recon Loss : {:.3f}, KLD Loss : {:.3f}, Dis Loss : {:.3f}, GEN Loss : {:.3f}, AC t Loss : {:.3f}, AC f Loss : {:.3f}"
                      .format(ep, Rec_loss, KLD_loss, adv_loss, trg_loss_adv,
                              AC_t_loss, AC_cross_trg))
            else:
                print("Epoch : {} Dis Loss : {}".format(ep, adv_loss))
            if (ep + 1) % 50 == 0:
                print("Model Save Epoch {}".format(ep + 1))
                self.save_model("GAN_all" + model_iter, ep + 1)