# Data parameters
sampling_rate = 22050
num_mcep = 36
frame_period = 5.0
n_frames = 128

# Training parameters
num_iterations = 200000
mini_batch_size = 1
generator_learning_rate = 0.0002
discriminator_learning_rate = 0.0001
lambda_cycle = 10
lambda_identity = 5

print('Loading cached data...')
coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, log_f0s_mean_A, log_f0s_std_A = load_pickle(
    os.path.join(exp_A_dir, 'cache{}.p'.format(num_mcep)))
coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std, log_f0s_mean_B, log_f0s_std_B = load_pickle(
    os.path.join(exp_B_dir, 'cache{}.p'.format(num_mcep)))

model = CycleGAN2(num_features=num_mcep,
                  batch_size=mini_batch_size,
                  log_dir=log_dir)

iteration = 1
while iteration <= num_iterations:
    dataset_A, dataset_B = sample_train_data(dataset_A=coded_sps_A_norm,
                                             dataset_B=coded_sps_B_norm,
                                             n_frames=n_frames)
    n_samples = dataset_A.shape[0]

    for i in range(n_samples // mini_batch_size):
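        # The body of this training step is cut off in the original snippet.
        # A minimal, hypothetical completion of one CycleGAN-VC2 mini-batch
        # update is sketched below; the keyword arguments of CycleGAN2.train()
        # are assumed by analogy with the RelGAN example later in this listing.
        start = i * mini_batch_size
        end = (i + 1) * mini_batch_size

        generator_loss, discriminator_loss = model.train(
            input_A=dataset_A[start:end],
            input_B=dataset_B[start:end],
            lambda_cycle=lambda_cycle,
            lambda_identity=lambda_identity,
            generator_learning_rate=generator_learning_rate,
            discriminator_learning_rate=discriminator_learning_rate)

        if iteration % 50 == 0:
            print('Iteration: {:07d}, Generator Loss: {:.3f}, '
                  'Discriminator Loss: {:.3f}'.format(iteration, generator_loss,
                                                      discriminator_loss))

        iteration += 1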

# Example 2: CycleGAN-VC2 trained over all VCC2018 speaker-pair permutations

train_basic_dir = os.path.join(data_dir, 'vcc2018_training')

dirs = glob.glob(os.path.join(train_basic_dir, 'VCC*'))

data_a = list()
data_b = list()

for i in permutations(dirs, 2):
    train_A_dir = i[0]
    train_B_dir = i[1]

    exp_A_dir = os.path.join(exp_dir, os.path.basename(i[0]))
    exp_B_dir = os.path.join(exp_dir, os.path.basename(i[1]))

    data_a.append(
        load_pickle(os.path.join(exp_A_dir, 'cache{}.p'.format(num_mcep)))[0])
    data_b.append(
        load_pickle(os.path.join(exp_B_dir, 'cache{}.p'.format(num_mcep)))[0])

model = CycleGAN2(num_features=num_mcep,
                  batch_size=mini_batch_size,
                  log_dir=log_dir)

if start_at != 0:
    model.load(
        os.path.join('experiments', dataset, model_name, 'checkpoints',
                     'cyclegan_vc2_two_step_' + str(start_at) + '.ckpt'))

while iteration <= num_iterations:
    train_ab_index = random.randint(0, len(data_a) - 1)
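    # The rest of this loop is cut off in the original snippet. A hypothetical
    # completion (assuming `iteration` is initialised from start_at before the
    # loop and that the hyperparameters and model.train() behave as in Example 1):
    dataset_A, dataset_B = sample_train_data(dataset_A=data_a[train_ab_index],
                                             dataset_B=data_b[train_ab_index],
                                             n_frames=n_frames)

    for i in range(dataset_A.shape[0] // mini_batch_size):
        start = i * mini_batch_size
        end = (i + 1) * mini_batch_size
        generator_loss, discriminator_loss = model.train(
            input_A=dataset_A[start:end],
            input_B=dataset_B[start:end],
            lambda_cycle=lambda_cycle,
            lambda_identity=lambda_identity,
            generator_learning_rate=generator_learning_rate,
            discriminator_learning_rate=discriminator_learning_rate)

        if iteration % 5000 == 0:
            model.save(directory=os.path.join('experiments', dataset, model_name,
                                              'checkpoints'),
                       filename='cyclegan_vc2_two_step_{}.ckpt'.format(iteration))
        iteration += 1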

# Example 3: multi-domain RelGAN training with interpolated attributes

def main():
    log_dir = os.path.join(argv.output_dir, 'log', argv.model_name)
    os.makedirs(log_dir, exist_ok=True)

    exp_dirs = []
    for f in os.listdir(argv.dataset_dir):
        # Each entry is <dataset root>/<emotion domain>.
        exp_dirs.append(os.path.join(argv.dataset_dir, f))
    print(exp_dirs)

    print('Loading cached data...')
    coded_sps_norms = []
    coded_sps_means = []
    coded_sps_stds = []
    log_f0s_means = []
    log_f0s_stds = []
    for f in exp_dirs:
        coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, log_f0s_mean_A, log_f0s_std_A = load_pickle(
            os.path.join(f, 'cache{}.p'.format(num_mcep)))
        coded_sps_norms.append(coded_sps_A_norm)
        coded_sps_means.append(coded_sps_A_mean)
        coded_sps_stds.append(coded_sps_A_std)
        log_f0s_means.append(log_f0s_mean_A)
        log_f0s_stds.append(log_f0s_std_A)

    num_domains = len(coded_sps_norms)
    model = RelGAN(num_features=num_mcep,
                   num_domains=num_domains,
                   batch_size=mini_batch_size,
                   log_dir=log_dir)

    os.makedirs(os.path.join(argv.output_dir, 'experiment', argv.model_name,
                             'checkpoints'),
                exist_ok=True)
    ckpt = tf.train.get_checkpoint_state(
        os.path.join(argv.output_dir, 'experiment', argv.model_name,
                     'checkpoints'))

    if ckpt:
        # last_model = ckpt.all_model_checkpoint_paths[1]
        last_model = ckpt.model_checkpoint_path
        print("loading {}".format(last_model))
        model.load(filepath=last_model)
    else:
        print("checkpoints are not found")

    iteration = 1
    while iteration <= num_iterations:
        # Decay the triangle/backward loss weights every 10k iterations and the
        # learning rates a little every iteration.
        if iteration % 10000 == 0:
            lambda_triangle *= 0.9
            lambda_backward *= 0.9
        generator_learning_rate *= 0.99999
        discriminator_learning_rate *= 0.99999
        x, x2, x_atr, y, y_atr, z, z_atr = sample_train_data(
            dataset_A=coded_sps_norms,
            nBatch=mini_batch_size,
            num_mcep=num_mcep,
            n_frames=n_frames)
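        # x_atr, y_atr and z_atr are the integer domain indices of the sampled
        # batches x, y and z; they are expanded into one-hot label vectors below.
        # x2 is a second batch fed to the model as input_A2.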

        x_labels = np.zeros([mini_batch_size, num_domains])
        y_labels = np.zeros([mini_batch_size, num_domains])
        z_labels = np.zeros([mini_batch_size, num_domains])
        for b in range(mini_batch_size):
            x_labels[b] = np.identity(num_domains)[x_atr[b]]
            y_labels[b] = np.identity(num_domains)[y_atr[b]]
            z_labels[b] = np.identity(num_domains)[z_atr[b]]

        # Draw the interpolation rate alpha from [0, 0.5) half of the time and
        # from [0.5, 1.0) otherwise; rnd records which interval was used.
        rnd = np.random.randint(2)
        if rnd == 0:
            alp = np.random.uniform(0, 0.5, size=mini_batch_size)
        else:
            alp = np.random.uniform(0.5, 1.0, size=mini_batch_size)

        (generator_loss, discriminator_loss, gen_adv_loss, gen_cond_loss,
         gen_int_loss, gen_rec_loss, gen_self_loss, dis_adv_loss, dis_cond_loss,
         dis_int_loss, lossb, lossm, losst) = model.train(
            input_A=x,
            input_A2=x2,
            input_B=y,
            input_C=z,
            label_A=x_labels,
            label_B=y_labels,
            label_C=z_labels,
            alpha=alp,
            rand=rnd,
            lambda_cycle=lambda_cycle,
            lambda_identity=lambda_identity,
            lambda_triangle=lambda_triangle,
            lambda_backward=lambda_backward,
            generator_learning_rate=generator_learning_rate,
            discriminator_learning_rate=discriminator_learning_rate)

        if iteration % 10 == 0:
            print(
                'Iteration: {:07d}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                .format(iteration, generator_loss, discriminator_loss))
            print("d_a=%.3f, d_c=%.3f, d_i=%.3f" %
                  (dis_adv_loss, dis_cond_loss, dis_int_loss))
            print(
                "g_a=%.3f, g_c=%.3f, g_i=%.3f, g_r=%.3f, g_s=%.3f, g_b=%.3f, g_m=%.3f, g_t=%.3f"
                % (gen_adv_loss, gen_cond_loss, gen_int_loss, gen_rec_loss,
                   gen_self_loss, lossb, lossm, losst))
        if iteration % 5000 == 0:
            print('Checkpointing...')
            model.save(directory=os.path.join(argv.output_dir, 'experiment',
                                              argv.model_name, 'checkpoints'),
                       filename='{}_{}.ckpt'.format(argv.model_name, iteration))

        if val_flag and iteration % 1000 == 0:
            for q in range(3):
                eval_dirs = os.listdir('datasets_val')
                assert len(eval_dirs) == num_domains
                x, x2, x_atr, y, y_atr, z, z_atr = sample_train_data(
                    dataset_A=coded_sps_norms,
                    nBatch=1,
                    num_mcep=num_mcep,
                    n_frames=n_frames)
                x_labels = np.zeros([1, num_domains])
                y_labels = np.zeros([1, num_domains])
                for b in range(1):
                    x_labels[b] = np.identity(num_domains)[x_atr[b]]
                    y_labels[b] = np.identity(num_domains)[y_atr[b]]
                x_atr = x_atr[0]
                y_atr = y_atr[0]
                eval_A_dir = os.path.join('datasets_val', eval_dirs[x_atr])
                print(eval_A_dir)
                for file in glob.glob(eval_A_dir + '/*.wav'):
                    alpha = np.random.uniform(0, 1,
                                              size=1) if q != 0 else np.ones(1)
                    wav, _ = librosa.load(file, sr=sampling_rate, mono=True)
                    wav *= 1. / max(0.01, np.max(np.abs(wav)))
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
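                    # Build alpha-interpolated target F0 statistics: the source
                    # and target log-F0 mean/std are mixed in the linear domain
                    # and converted back to log scale for pitch_conversion().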
                    f0s_mean_A = np.exp(log_f0s_means[x_atr])
                    f0s_mean_B = np.exp(log_f0s_means[y_atr])
                    f0s_mean_AB = alpha * f0s_mean_B + (1 - alpha) * f0s_mean_A
                    log_f0s_mean_AB = np.log(f0s_mean_AB)
                    f0s_std_A = np.exp(log_f0s_stds[x_atr])
                    f0s_std_B = np.exp(log_f0s_stds[y_atr])
                    f0s_std_AB = alpha * f0s_std_B + (1 - alpha) * f0s_std_A
                    log_f0s_std_AB = np.log(f0s_std_AB)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_means[x_atr],
                        std_log_src=log_f0s_stds[x_atr],
                        mean_log_target=log_f0s_mean_AB,
                        std_log_target=log_f0s_std_AB)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (
                        coded_sp_transposed -
                        coded_sps_means[x_atr]) / coded_sps_stds[x_atr]
                    coded_sp_converted_norm = model.test(
                        inputs=np.array([coded_sp_norm]),
                        label_A=x_labels,
                        label_B=y_labels,
                        alpha=alpha)[0]
                    if coded_sp_converted_norm.shape[1] > len(f0):
                        coded_sp_converted_norm = coded_sp_converted_norm[:, :-1]
                    coded_sps_AB_mean = (1 - alpha) * coded_sps_means[
                        x_atr] + alpha * coded_sps_means[y_atr]
                    coded_sps_AB_std = (1 - alpha) * coded_sps_stds[
                        x_atr] + alpha * coded_sps_stds[y_atr]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_AB_std + coded_sps_AB_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    wav_transformed *= 1. / max(
                        0.01, np.max(np.abs(wav_transformed)))
                    validation_A_output_dir = 'test'
                    os.makedirs(validation_A_output_dir, exist_ok=True)
                    librosa.output.write_wav(
                        os.path.join(
                            validation_A_output_dir,
                            "{:06d}_{}_to_{}_{:.3f}_{}".format(
                                iteration, x_atr, y_atr, alpha[0],
                                os.path.basename(file))), wav_transformed,
                        sampling_rate)

        iteration += 1
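
# Hypothetical entry point (not shown in the original snippet). The flag names
# below are only a guess based on the `argv` attributes used in main(); other
# hyperparameters referenced above (num_iterations, lambda_triangle,
# lambda_backward, val_flag, the learning rates) are assumed to be defined
# elsewhere in the original file.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Train a RelGAN-based voice-conversion model.')
    parser.add_argument('--dataset_dir', type=str, required=True,
                        help='Root directory with one cached-feature folder per domain.')
    parser.add_argument('--output_dir', type=str, default='experiments')
    parser.add_argument('--model_name', type=str, default='relgan_vc')
    argv = parser.parse_args()

    main()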

# Example 4: PyTorch VAE/GAN trainer with disentanglement and conversion paths

    def train(self, batch_size, mode='train', model_iter='0'):
        speaker_list = ['VCC2SF1', 'VCC2SF2', 'VCC2SM1', 'VCC2SM2']
        num_mcep = 36
        frame_period = 5.0
        n_frames = 128
        batch_num = batch_size

        exp_dir = os.path.join('processed')
        device = torch.device("cuda")

        print('Loading cached data...')

        ac_lr = 0.0001
        lr = 0.001
        random.seed()

        if mode == 'DisentanglementANDConvPath_VAE':
            for ep in range(100):

                for i in range(4):
                    for j in range(4):

                        src_speaker = speaker_list[i]

                        trg_speaker = speaker_list[j]

                        exp_A_dir = os.path.join(exp_dir, src_speaker)
                        exp_B_dir = os.path.join(exp_dir, trg_speaker)

                        coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, log_f0s_mean_A, log_f0s_std_A = load_pickle(
                            os.path.join(exp_A_dir,
                                         'cache{}.p'.format(num_mcep)))
                        coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std, log_f0s_mean_B, log_f0s_std_B = load_pickle(
                            os.path.join(exp_B_dir,
                                         'cache{}.p'.format(num_mcep)))
                        dataset_A, dataset_B = sample_train_data(
                            dataset_A=coded_sps_A_norm,
                            dataset_B=coded_sps_B_norm,
                            n_frames=n_frames)

                        dataset_A = np.expand_dims(dataset_A, axis=1)
                        dataset_A = torch.from_numpy(dataset_A).to(
                            device, dtype=torch.float)
                        dataset_B = np.expand_dims(dataset_B, axis=1)
                        dataset_B = torch.from_numpy(dataset_B).to(
                            device, dtype=torch.float)
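                        # For each (source, target) pair, the first three
                        # mini-batches update only the speaker classifier; every
                        # fourth mini-batch updates the ASR, auxiliary-classifier
                        # and VAE objectives.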
                        for iteration in range(4):
                            start = iteration * batch_num
                            end = (iteration + 1) * batch_num
                            if ((iteration + 1) % 4) != 0:
                                self.grad_reset()
                                clf_loss_A = self.clf_step(
                                    dataset_A[start:end], i, batch_num)
                                clf_loss_B = self.clf_step(
                                    dataset_B[start:end], j, batch_num)
                                Clf_loss = clf_loss_A + clf_loss_B
                                loss = Clf_loss
                                loss.backward()
                                self.cls_optimizer.step()

                            elif ((iteration + 1) % 4) == 0:
                                self.grad_reset()
                                asr_loss_A = self.asr_step(
                                    dataset_A[start:end], i, batch_num)
                                asr_loss_B = self.asr_step(
                                    dataset_B[start:end], j, batch_num)
                                asr_loss = asr_loss_A + asr_loss_B
                                loss = asr_loss

                                loss.backward()
                                self.asr_optimizer.step()

                                self.grad_reset()
                                AC_source,AC_target = \
                                        self.AC_step(dataset_A[start:end],dataset_B[start:end],i,j,batch_num)
                                AC_t_loss = AC_source + AC_target

                                AC_t_loss.backward()

                                self.ac_optimizer.step()

                                self.grad_reset()

                                ###VAE step
                                src_KLD, src_same_loss_rec, _ = self.vae_step(
                                    dataset_A[start:end], dataset_B[start:end],
                                    i, i, batch_num)
                                trg_KLD, trg_same_loss_rec, _ = self.vae_step(
                                    dataset_B[start:end], dataset_A[start:end],
                                    j, j, batch_num)

                                ###AC F step
                                AC_real_src,AC_cross_src = \
                                        self.AC_F_step(dataset_A[start:end],dataset_B[start:end],i,j,batch_num)
                                AC_real_trg,AC_cross_trg = \
                                        self.AC_F_step(dataset_B[start:end],dataset_A[start:end],j,i,batch_num)

                                ###clf asr step
                                clf_loss_A,asr_loss_A = \
                                    self.clf_asr_step(dataset_A[start:end],dataset_B[start:end],i,j,batch_num)
                                clf_loss_B,asr_loss_B = \
                                    self.clf_asr_step(dataset_B[start:end],dataset_A[start:end],j,i,batch_num)
                                Clf_loss = (clf_loss_A + clf_loss_B) / 2.0

                                ASR_loss = (asr_loss_A + asr_loss_B) / 2.0

                                ###Cycle step
                                src_cyc_KLD, src_cyc_loss_rec = self.cycle_step(
                                    dataset_A[start:end], dataset_B[start:end],
                                    i, j, batch_num)
                                trg_cyc_KLD, trg_cyc_loss_rec = self.cycle_step(
                                    dataset_B[start:end], dataset_A[start:end],
                                    j, i, batch_num)

                                ###Semantic step
                                src_semloss = self.sem_step(
                                    dataset_A[start:end], dataset_B[start:end],
                                    i, j, batch_num)
                                trg_semloss = self.sem_step(
                                    dataset_B[start:end], dataset_A[start:end],
                                    j, i, batch_num)

                                AC_f_loss = (AC_real_src + AC_real_trg +
                                             AC_cross_src + AC_cross_trg) / 4.0
                                Sem_loss = (src_semloss + trg_semloss) / 2.0
                                Cycle_KLD_loss = (src_cyc_KLD +
                                                  trg_cyc_KLD) / 2.0
                                Cycle_rec_loss = (src_cyc_loss_rec +
                                                  trg_cyc_loss_rec) / 2.0
                                KLD_loss = (src_KLD + trg_KLD) / 2.0
                                Rec_loss = (src_same_loss_rec +
                                            trg_same_loss_rec) / 2.0
                                loss = Rec_loss + KLD_loss + Cycle_KLD_loss + Cycle_rec_loss + AC_f_loss + Sem_loss - Clf_loss + ASR_loss
                                loss.backward()
                                self.vae_optimizer.step()

                if (ep + 1) % 1 == 0:
                    print(
                        "Epoch : {}, Recon : {:.3f}, KLD : {:.3f}, AC t Loss : {:.3f}, "
                        "AC f Loss : {:.3f}, Sem Loss : {:.3f}, Clf : {:.3f}, Asr Loss : {:.3f}"
                        .format(ep + 1, Rec_loss, KLD_loss, AC_t_loss, AC_cross_trg,
                                Sem_loss, Clf_loss, ASR_loss))
                os.makedirs("./VAE_all" + model_iter, exist_ok=True)
                if (ep + 1) % 50 == 0:
                    print("Model Save Epoch {}".format(ep + 1))
                    self.save_model("VAE_all" + model_iter, ep + 1)

        if mode == 'DisentanglementANDConvPath_VAE_with_GAN':
            os.makedirs("./GAN_all" + model_iter, exist_ok=True)
            for ep in range(200):

                if ep > 100:

                    lr = lr * 0.9

                for i in range(4):
                    for j in range(4):

                        src_speaker = speaker_list[i]

                        trg_speaker = speaker_list[j]

                        exp_A_dir = os.path.join(exp_dir, src_speaker)
                        exp_B_dir = os.path.join(exp_dir, trg_speaker)

                        coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, log_f0s_mean_A, log_f0s_std_A = load_pickle(
                            os.path.join(exp_A_dir,
                                         'cache{}.p'.format(num_mcep)))
                        coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std, log_f0s_mean_B, log_f0s_std_B = load_pickle(
                            os.path.join(exp_B_dir,
                                         'cache{}.p'.format(num_mcep)))
                        dataset_A, dataset_B = sample_train_data(
                            dataset_A=coded_sps_A_norm,
                            dataset_B=coded_sps_B_norm,
                            n_frames=n_frames)

                        dataset_A = np.expand_dims(dataset_A, axis=1)
                        dataset_A = torch.from_numpy(dataset_A).to(
                            device, dtype=torch.float)
                        dataset_B = np.expand_dims(dataset_B, axis=1)
                        dataset_B = torch.from_numpy(dataset_B).to(
                            device, dtype=torch.float)
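                        # Four out of every five mini-batches train the speaker
                        # classifier and the weight-clipped discriminator; every
                        # fifth mini-batch (once ep > 10) trains the ASR/AC/VAE
                        # objectives together with the generator adversarial loss.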
                        for iteration in range(81 // batch_num):
                            start = iteration * batch_num
                            end = (iteration + 1) * batch_num

                            if ((iteration + 1) % 5) != 0:

                                self.grad_reset()
                                clf_loss_A = self.clf_step(
                                    dataset_A[start:end], i, batch_num)
                                clf_loss_B = self.clf_step(
                                    dataset_B[start:end], j, batch_num)
                                Clf_loss = clf_loss_A + clf_loss_B
                                loss = Clf_loss

                                loss.backward()

                                self.cls_optimizer.step()

                                self.grad_reset()
                                convert_KLD, convert_rec, src_to_trg_x_tilde = self.vae_step(
                                    dataset_A[start:end], dataset_B[start:end],
                                    i, j, batch_num)

                                trg_w_dis = self.patch_step(
                                    dataset_B[start:end],
                                    src_to_trg_x_tilde,
                                    j,
                                    is_dis=True)

                                trg_adv_loss = trg_w_dis
                                adv_loss = (trg_adv_loss)

                                adv_loss.backward()

                                self.dis_optimizer.step()
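                                # WGAN-style weight clipping of the discriminator
                                # parameters to [-0.01, 0.01].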
                                for p in self.Discriminator.parameters():
                                    p.data.clamp_(-0.01, 0.01)

                            elif ((iteration + 1) % 5) == 0 and ep > 10:
                                self.grad_reset()
                                asr_loss_A = self.asr_step(
                                    dataset_A[start:end], i, batch_num)
                                asr_loss_B = self.asr_step(
                                    dataset_B[start:end], j, batch_num)
                                asr_loss = asr_loss_A + asr_loss_B
                                loss = asr_loss

                                loss.backward()
                                self.asr_optimizer.step()

                                self.grad_reset()
                                AC_source,AC_target = \
                                        self.AC_step(dataset_A[start:end],dataset_B[start:end],i,j,batch_num)
                                AC_t_loss = AC_source + AC_target

                                AC_t_loss.backward()

                                self.ac_optimizer.step()

                                self.grad_reset()

                                ###VAE step
                                src_KLD, src_same_loss_rec, _ = self.vae_step(
                                    dataset_A[start:end], dataset_B[start:end],
                                    i, i, batch_num)
                                trg_KLD, trg_same_loss_rec, _ = self.vae_step(
                                    dataset_B[start:end], dataset_A[start:end],
                                    j, j, batch_num)

                                ###AC F step
                                AC_real_src,AC_cross_src = \
                                        self.AC_F_step(dataset_A[start:end],dataset_B[start:end],i,j,batch_num)
                                AC_real_trg,AC_cross_trg = \
                                        self.AC_F_step(dataset_B[start:end],dataset_A[start:end],j,i,batch_num)

                                ###clf asr step
                                clf_loss_A,asr_loss_A = \
                                    self.clf_asr_step(dataset_A[start:end],dataset_B[start:end],i,j,batch_num)
                                clf_loss_B,asr_loss_B = \
                                    self.clf_asr_step(dataset_B[start:end],dataset_A[start:end],j,i,batch_num)
                                Clf_loss = (clf_loss_A + clf_loss_B) / 2.0

                                ASR_loss = (asr_loss_A + asr_loss_B) / 2.0

                                ###Cycle step
                                src_cyc_KLD, src_cyc_loss_rec = self.cycle_step(
                                    dataset_A[start:end], dataset_B[start:end],
                                    i, j, batch_num)
                                trg_cyc_KLD, trg_cyc_loss_rec = self.cycle_step(
                                    dataset_B[start:end], dataset_A[start:end],
                                    j, i, batch_num)

                                ###Semantic step
                                src_semloss = self.sem_step(
                                    dataset_A[start:end], dataset_B[start:end],
                                    i, j, batch_num)
                                trg_semloss = self.sem_step(
                                    dataset_B[start:end], dataset_A[start:end],
                                    j, i, batch_num)

                                convert_KLD, convert_rec, src_to_trg_x_tilde = self.vae_step(
                                    dataset_A[start:end], dataset_B[start:end],
                                    i, j, batch_num)

                                trg_loss_adv = self.patch_step(
                                    dataset_B[start:end],
                                    src_to_trg_x_tilde,
                                    j,
                                    is_dis=False)

                                AC_f_loss = (AC_real_src + AC_real_trg +
                                             AC_cross_src + AC_cross_trg) / 4.0

                                Sem_loss = (src_semloss + trg_semloss) / 2.0
                                Cycle_KLD_loss = (src_cyc_KLD +
                                                  trg_cyc_KLD) / 2.0
                                Cycle_rec_loss = (src_cyc_loss_rec +
                                                  trg_cyc_loss_rec) / 2.0
                                KLD_loss = (src_KLD + trg_KLD) / 2.0

                                Rec_loss = (src_same_loss_rec +
                                            trg_same_loss_rec) / 2.0

                                loss = 2 * (
                                    Rec_loss + KLD_loss
                                ) + Cycle_rec_loss + Cycle_KLD_loss + AC_f_loss + Sem_loss + trg_loss_adv - Clf_loss + ASR_loss

                                loss.backward()

                                self.vae_optimizer.step()

                if ep > 10:
                    print(
                        "Epoch : {}, Recon Loss : {:.3f},  KLD Loss : {:.3f}, Dis Loss : {:.3f},  GEN Loss : {:.3f}, AC t Loss : {:.3f}, AC f Loss : {:.3f}"
                        .format(ep, Rec_loss, KLD_loss, adv_loss, trg_loss_adv,
                                AC_t_loss, AC_cross_trg))
                else:
                    print("Epoch : {} Dis Loss : {}".format(ep, adv_loss))

                if (ep + 1) % 50 == 0:
                    print("Model Save Epoch {}".format(ep + 1))
                    self.save_model("GAN_all" + model_iter, ep + 1)