Example #1
def conversion(model_filepath, img_dir, conversion_direction, output_dir):

    input_size = [256, 256, 3]
    num_filters = 64

    model = CycleGAN(input_size=input_size,
                     num_filters=num_filters,
                     mode='test')

    model.load(filepath=model_filepath)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for file in os.listdir(img_dir):
        filepath = os.path.join(img_dir, file)
        img = cv2.imread(filepath)
        img_height, img_width, img_channel = img.shape
        img = cv2.resize(img, (input_size[1], input_size[0]))
        img = image_scaling(imgs=img)
        img_converted = model.test(inputs=np.array([img]),
                                   direction=conversion_direction)[0]
        img_converted = image_scaling_inverse(imgs=img_converted)
        img_converted = cv2.resize(img_converted, (img_width, img_height))
        cv2.imwrite(os.path.join(output_dir, os.path.basename(file)),
                    img_converted)
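image_scaling / image_scaling_inverse are project helpers that are not shown in this example. A plausible sketch, assuming the usual CycleGAN convention of mapping uint8 pixels to [-1, 1] for the generator and back for cv2.imwrite (the helper bodies below are an assumption, not the project's code):

import numpy as np

def image_scaling(imgs):
    # assumed: uint8 [0, 255] -> float [-1, 1]
    return imgs.astype(np.float32) / 127.5 - 1.0

def image_scaling_inverse(imgs):
    # assumed: float [-1, 1] -> uint8 [0, 255]
    return np.clip((imgs + 1.0) * 127.5, 0, 255).astype(np.uint8)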
Example #2
def conversion(model_dir, model_name, data_dir, conversion_direction, output_dir):

    num_features = 24
    sampling_rate = 16000
    frame_period = 5.0

    model = CycleGAN(num_features = num_features, mode = 'test')

    model.load(filepath = os.path.join(model_dir, model_name))

    mcep_normalization_params = np.load(os.path.join(model_dir, 'mcep_normalization.npz'))
    mcep_mean_A = mcep_normalization_params['mean_A']
    mcep_std_A = mcep_normalization_params['std_A']
    mcep_mean_B = mcep_normalization_params['mean_B']
    mcep_std_B = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(os.path.join(model_dir, 'logf0s_normalization.npz'))
    logf0s_mean_A = logf0s_normalization_params['mean_A']
    logf0s_std_A = logf0s_normalization_params['std_A']
    logf0s_mean_B = logf0s_normalization_params['mean_B']
    logf0s_std_B = logf0s_normalization_params['std_B']

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for file in os.listdir(data_dir):

        filepath = os.path.join(data_dir, file)
        wav, _ = librosa.load(filepath, sr = sampling_rate, mono = True)
        wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
        f0, timeaxis, sp, ap = world_decompose(wav = wav, fs = sampling_rate, frame_period = frame_period)
        coded_sp = world_encode_spectral_envelop(sp = sp, fs = sampling_rate, dim = num_features)
        coded_sp_transposed = coded_sp.T

        if conversion_direction == 'A2B':
            f0_converted = pitch_conversion(f0 = f0, mean_log_src = logf0s_mean_A, std_log_src = logf0s_std_A, mean_log_target = logf0s_mean_B, std_log_target = logf0s_std_B)
            #f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_A) / mcep_std_A
            coded_sp_converted_norm = model.test(inputs = np.array([coded_sp_norm]), direction = conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_B + mcep_mean_B
        else:
            f0_converted = pitch_conversion(f0 = f0, mean_log_src = logf0s_mean_B, std_log_src = logf0s_std_B, mean_log_target = logf0s_mean_A, std_log_target = logf0s_std_A)
            #f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_B) / mcep_std_B
            coded_sp_converted_norm = model.test(inputs = np.array([coded_sp_norm]), direction = conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_A + mcep_mean_A

        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp = coded_sp_converted, fs = sampling_rate)
        wav_transformed = world_speech_synthesis(f0 = f0_converted, decoded_sp = decoded_sp_converted, ap = ap, fs = sampling_rate, frame_period = frame_period)
        librosa.output.write_wav(os.path.join(output_dir, os.path.basename(file)), wav_transformed, sampling_rate)
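The mapping inside the loop above is a per-coefficient z-score normalization with the source speaker's MCEP statistics, followed by denormalization with the target speaker's statistics. A minimal NumPy restatement with fake statistics and an identity function standing in for model.test (shapes follow the (num_features, frames) layout used above; everything here is illustrative):

import numpy as np

num_features, frames = 24, 100
mean_A, std_A = np.zeros((num_features, 1)), np.ones((num_features, 1))
mean_B, std_B = np.full((num_features, 1), 0.5), np.full((num_features, 1), 2.0)

coded_sp = np.random.randn(num_features, frames)
coded_sp_norm = (coded_sp - mean_A) / std_A           # into A's normalized space
converted_norm = coded_sp_norm                        # model.test(...) goes here
coded_sp_converted = converted_norm * std_B + mean_B  # out via B's statistics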
Example #4
def conversion(model_dir, model_name, data_dir, conversion_direction,
               output_dir, pc, generation_model):

    num_features = 32
    sampling_rate = 44000
    frame_period = 5.0

    model = CycleGAN(num_features=num_features,
                     mode='test',
                     gen_model=generation_model)

    model.load(filepath=os.path.join(model_dir, model_name))

    mcep_normalization_params = np.load(
        os.path.join(model_dir, 'mcep_normalization.npz'))
    mcep_mean_A = mcep_normalization_params['mean_A']
    mcep_std_A = mcep_normalization_params['std_A']
    mcep_mean_B = mcep_normalization_params['mean_B']
    mcep_std_B = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(
        os.path.join(model_dir, 'logf0s_normalization.npz'))
    logf0s_mean_A = logf0s_normalization_params['mean_A']
    logf0s_std_A = logf0s_normalization_params['std_A']
    logf0s_mean_B = logf0s_normalization_params['mean_B']
    logf0s_std_B = logf0s_normalization_params['std_B']

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for file in os.listdir(data_dir):

        filepath = os.path.join(data_dir, file)
        wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
        # wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
        f0, timeaxis, sp, ap = world_decompose(wav=wav,
                                               fs=sampling_rate,
                                               frame_period=frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp,
                                                 fs=sampling_rate,
                                                 dim=num_features)
        coded_sp_transposed = coded_sp.T

        frame_size = 128
        if conversion_direction == 'A2B':
            # pitch
            print("AtoB")
            if pc:
                print("pitch convert")
                f0_converted = pitch_conversion(f0=f0,
                                                mean_log_src=logf0s_mean_A,
                                                std_log_src=logf0s_std_A,
                                                mean_log_target=logf0s_mean_B,
                                                std_log_target=logf0s_std_B)
            else:
                print("pitch same")
                f0_converted = f0

            # normalization A Domain
            coded_sp_norm = (coded_sp_transposed - mcep_mean_A) / mcep_std_A

            # padding: extend to a whole number of frame_size frames with zeros
            padd = False
            remain = frame_size - coded_sp_norm.shape[1] % frame_size
            if coded_sp_norm.shape[1] % frame_size != 0:
                coded_sp_norm = np.concatenate(
                    (coded_sp_norm, np.zeros((num_features, remain))), axis=1)
                padd = True

            # inference for segmentation
            coded_sp_converted_norm = model.test(
                inputs=np.array([coded_sp_norm[:, 0:frame_size]]),
                direction=conversion_direction)[0]
            for i in range(1, coded_sp_norm.shape[1] // frame_size):
                ccat = model.test(inputs=np.array(
                    [coded_sp_norm[:, i * frame_size:(i + 1) * frame_size]]),
                                  direction=conversion_direction)[0]
                coded_sp_converted_norm = np.concatenate(
                    (coded_sp_converted_norm, ccat), axis=1)

            if padd:
                coded_sp_converted_norm = coded_sp_converted_norm[:, :-remain]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_B + mcep_mean_B
        else:
            print("BtoA")
            if pc:
                print("pitch convert")
                # bug fix: B2A must map from speaker B's statistics to A's
                f0_converted = pitch_conversion(f0=f0,
                                                mean_log_src=logf0s_mean_B,
                                                std_log_src=logf0s_std_B,
                                                mean_log_target=logf0s_mean_A,
                                                std_log_target=logf0s_std_A)
            else:
                print("pitch same")
                f0_converted = f0

            # normalization B Domain
            coded_sp_norm = (coded_sp_transposed - mcep_mean_B) / mcep_std_B

            # padding: extend to a whole number of frame_size frames with zeros
            padd = False
            remain = frame_size - coded_sp_norm.shape[1] % frame_size
            if coded_sp_norm.shape[1] % frame_size != 0:
                coded_sp_norm = np.concatenate(
                    (coded_sp_norm, np.zeros((num_features, remain))), axis=1)
                padd = True

            # inference for segmentation
            coded_sp_converted_norm = model.test(
                inputs=np.array([coded_sp_norm[:, 0:frame_size]]),
                direction=conversion_direction)[0]
            for i in range(1, coded_sp_norm.shape[1] // frame_size):
                ccat = model.test(inputs=np.array(
                    [coded_sp_norm[:, i * frame_size:(i + 1) * frame_size]]),
                                  direction=conversion_direction)[0]
                coded_sp_converted_norm = np.concatenate(
                    (coded_sp_converted_norm, ccat), axis=1)

            if padd:
                coded_sp_converted_norm = coded_sp_converted_norm[:, :-remain]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_A + mcep_mean_A

        # output translation value processing
        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(
            coded_sp=coded_sp_converted, fs=sampling_rate)

        # World vocoder synthesis
        wav_transformed = world_speech_synthesis(
            f0=f0_converted,
            decoded_sp=decoded_sp_converted,
            ap=ap,
            fs=sampling_rate,
            frame_period=frame_period)
        librosa.output.write_wav(
            os.path.join(output_dir, os.path.basename(file)), wav_transformed,
            sampling_rate)
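The segment-wise inference above pads the normalized MCEP sequence to a multiple of frame_size, converts it chunk by chunk, and trims the padding off afterwards. A standalone sketch of that logic, with a dummy identity function in place of model.test (all names here are illustrative):

import numpy as np

def convert_in_chunks(features, frame_size, convert_fn):
    # features: (num_features, T); pad T up to a multiple of frame_size,
    # convert each chunk, then trim back to the original length
    length = features.shape[1]
    remain = -length % frame_size  # 0 when already aligned
    padded = np.pad(features, ((0, 0), (0, remain)))
    chunks = [convert_fn(padded[:, i:i + frame_size])
              for i in range(0, padded.shape[1], frame_size)]
    return np.concatenate(chunks, axis=1)[:, :length]

out = convert_in_chunks(np.random.randn(32, 300), 128, lambda x: x)
assert out.shape == (32, 300)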
Example #5
class Converter():
    def __init__(self, model_dir, model_name):
        self.num_features = 24
        self.sampling_rate = 16000
        self.frame_period = 5.0

        self.model = CycleGAN(num_features=self.num_features, mode='test')

        self.model.load(filepath=os.path.join(model_dir, model_name))

        self.mcep_normalization_params = np.load(
            os.path.join(model_dir, 'mcep_normalization.npz'))
        self.mcep_mean_A = self.mcep_normalization_params['mean_A']
        self.mcep_std_A = self.mcep_normalization_params['std_A']
        self.mcep_mean_B = self.mcep_normalization_params['mean_B']
        self.mcep_std_B = self.mcep_normalization_params['std_B']

        self.logf0s_normalization_params = np.load(
            os.path.join(model_dir, 'logf0s_normalization.npz'))
        self.logf0s_mean_A = self.logf0s_normalization_params['mean_A']
        self.logf0s_std_A = self.logf0s_normalization_params['std_A']
        self.logf0s_mean_B = self.logf0s_normalization_params['mean_B']
        self.logf0s_std_B = self.logf0s_normalization_params['std_B']

    def convert_to_pcm_data(self, wav, conversion_direction='A2B'):
        wav = wav_padding(wav=wav,
                          sr=self.sampling_rate,
                          frame_period=self.frame_period,
                          multiple=4)
        f0, timeaxis, sp, ap = world_decompose(wav=wav,
                                               fs=self.sampling_rate,
                                               frame_period=self.frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp,
                                                 fs=self.sampling_rate,
                                                 dim=self.num_features)
        coded_sp_transposed = coded_sp.T

        if conversion_direction == 'A2B':
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=self.logf0s_mean_A,
                                            std_log_src=self.logf0s_std_A,
                                            mean_log_target=self.logf0s_mean_B,
                                            std_log_target=self.logf0s_std_B)
            coded_sp_norm = (coded_sp_transposed -
                             self.mcep_mean_A) / self.mcep_std_A
            coded_sp_converted_norm = self.model.test(
                inputs=np.array([coded_sp_norm]),
                direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * self.mcep_std_B + self.mcep_mean_B
        else:
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=self.logf0s_mean_B,
                                            std_log_src=self.logf0s_std_B,
                                            mean_log_target=self.logf0s_mean_A,
                                            std_log_target=self.logf0s_std_A)
            coded_sp_norm = (coded_sp_transposed -
                             self.mcep_mean_B) / self.mcep_std_B
            coded_sp_converted_norm = self.model.test(
                inputs=np.array([coded_sp_norm]),
                direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * self.mcep_std_A + self.mcep_mean_A

        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(
            coded_sp=coded_sp_converted, fs=self.sampling_rate)
        wav_transformed = world_speech_synthesis(
            f0=f0_converted,
            decoded_sp=decoded_sp_converted,
            ap=ap,
            fs=self.sampling_rate,
            frame_period=self.frame_period)

        # For debugging model output, uncomment the following line:
        # librosa.output.write_wav('model_output.wav', wav_transformed, self.sampling_rate)

        # TODO: Perhaps ditch this. It's probably unnecessary work.
        upsampled = librosa.resample(wav_transformed, self.sampling_rate,
                                     48000)
        pcm_data = upsampled.astype(np.float64)
        stereo_pcm_data = np.tile(pcm_data, (2, 1)).T
        return stereo_pcm_data

    def convert_pcm_to_wav(self, stereo_pcm_data):
        buf = io.BytesIO()
        scipy.io.wavfile.write(buf, 48000, stereo_pcm_data.astype(np.float32))
        return buf

    def convert(self, wav, conversion_direction='A2B'):
        stereo_pcm_data = self.convert_to_pcm_data(
            wav, conversion_direction=conversion_direction)
        return self.convert_pcm_to_wav(stereo_pcm_data)
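A hedged usage sketch for the Converter above; the file names and model paths are hypothetical placeholders:

import librosa

converter = Converter(model_dir='model', model_name='cyclegan.ckpt')
wav, _ = librosa.load('input.wav', sr=16000, mono=True)  # must match sampling_rate
buf = converter.convert(wav, conversion_direction='A2B')
with open('converted.wav', 'wb') as out_file:
    out_file.write(buf.getvalue())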
Example #6
def conversion(training_data_dir, model_dir, model_name, data_dir,
               conversion_direction, output_dir):

    num_features = 24
    sampling_rate = 16000
    frame_period = 5.0

    model = CycleGAN(num_features=num_features, mode='test')

    if os.path.exists(os.path.join(model_dir, "checkpoint")):
        # the last line of TensorFlow's "checkpoint" file names the newest checkpoint
        with open(os.path.join(model_dir, "checkpoint"), "r") as f:
            all_ckpt = f.readlines()
        pretrain_ckpt = all_ckpt[-1].split("\n")[0].split("\"")[1]
        assert os.path.exists(
            os.path.join(model_dir, pretrain_ckpt + ".index")
        ), "The checkpoint does not exist."
        model.load(filepath=os.path.join(model_dir, pretrain_ckpt))
        print("Loading pretrained model {}".format(pretrain_ckpt))

    mcep_normalization_params = np.load(
        os.path.join(training_data_dir, 'mcep_normalization.npz'))
    mcep_mean_A = mcep_normalization_params['mean_A']
    mcep_std_A = mcep_normalization_params['std_A']
    mcep_mean_B = mcep_normalization_params['mean_B']
    mcep_std_B = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(
        os.path.join(training_data_dir, 'logf0s_normalization.npz'))
    logf0s_mean_A = logf0s_normalization_params['mean_A']
    logf0s_std_A = logf0s_normalization_params['std_A']
    logf0s_mean_B = logf0s_normalization_params['mean_B']
    logf0s_std_B = logf0s_normalization_params['std_B']

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    files = os.listdir(data_dir)
    for i in trange(len(files)):
        file = files[i]
        filepath = os.path.join(data_dir, file)
        wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
        wav = wav_padding(wav=wav,
                          sr=sampling_rate,
                          frame_period=frame_period,
                          multiple=4)
        f0, timeaxis, sp, ap = world_decompose(wav=wav,
                                               fs=sampling_rate,
                                               frame_period=frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp,
                                                 fs=sampling_rate,
                                                 dim=num_features)
        coded_sp_transposed = coded_sp.T

        if conversion_direction == 'A2B':
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=logf0s_mean_A,
                                            std_log_src=logf0s_std_A,
                                            mean_log_target=logf0s_mean_B,
                                            std_log_target=logf0s_std_B)
            #f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_A) / mcep_std_A
            coded_sp_converted_norm = model.test(
                inputs=np.array([coded_sp_norm]),
                direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_B + mcep_mean_B
        else:
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=logf0s_mean_B,
                                            std_log_src=logf0s_std_B,
                                            mean_log_target=logf0s_mean_A,
                                            std_log_target=logf0s_std_A)
            #f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_B) / mcep_std_B
            coded_sp_converted_norm = model.test(
                inputs=np.array([coded_sp_norm]),
                direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_A + mcep_mean_A

        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(
            coded_sp=coded_sp_converted, fs=sampling_rate)
        wav_transformed = world_speech_synthesis(
            f0=f0_converted,
            decoded_sp=decoded_sp_converted,
            ap=ap,
            fs=sampling_rate,
            frame_period=frame_period)
        librosa.output.write_wav(
            os.path.join(output_dir, os.path.basename(file)), wav_transformed,
            sampling_rate)
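The manual parsing of the "checkpoint" file above reads the same metadata that TensorFlow 1.x exposes through tf.train.latest_checkpoint, so the block can usually be collapsed to the following sketch (assuming the standard checkpoint layout):

import tensorflow as tf

latest = tf.train.latest_checkpoint(model_dir)  # e.g. 'model/my_model.ckpt-500'
if latest is not None:
    model.load(filepath=latest)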
Example #7
def train(train_A_dir, train_B_dir, model_dir, model_name, random_seed,
          validation_A_dir, validation_B_dir, output_dir, tensorboard_log_dir,
          gen_model, MCEPs_dim, lambda_list, processed_data_dir):

    gen_loss_thres = 100.0
    np.random.seed(random_seed)
    num_epochs = 5000
    mini_batch_size = 1
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 44000
    num_mcep = MCEPs_dim
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = lambda_list[0]
    lambda_identity = lambda_list[1]

    Speaker_A_features = os.path.join(processed_data_dir, 'wav_A.npz')
    Speaker_B_features = os.path.join(processed_data_dir, 'wav_B.npz')
    start_time = time.time()
    print('looking for preprocessed data in: {}'.format(processed_data_dir))
    if os.path.exists(Speaker_A_features) and os.path.exists(
            Speaker_B_features):
        print('#### loading processed data #######')
        f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = load_speaker_features(
            Speaker_A_features)
        f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = load_speaker_features(
            Speaker_B_features)
    else:
        print('Preprocessing Data...')
        if not os.path.exists(processed_data_dir):
            os.makedirs(processed_data_dir)

        wavs_A = load_wavs(wav_dir=train_A_dir, sr=sampling_rate)

        f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = world_encode_data(
            wavs=wavs_A,
            fs=sampling_rate,
            frame_period=frame_period,
            coded_dim=num_mcep)
        np.savez(Speaker_A_features,
                 f0s=f0s_A,
                 timeaxes=timeaxes_A,
                 sps=sps_A,
                 aps=aps_A,
                 coded_sps=coded_sps_A)

        del wavs_A

        wavs_B = load_wavs(wav_dir=train_B_dir, sr=sampling_rate)

        f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = world_encode_data(
            wavs=wavs_B,
            fs=sampling_rate,
            frame_period=frame_period,
            coded_dim=num_mcep)
        np.savez(Speaker_B_features,
                 f0s=f0s_B,
                 timeaxes=timeaxes_B,
                 sps=sps_B,
                 aps=aps_B,
                 coded_sps=coded_sps_B)

        del wavs_B

        print('Data preprocessing finished !')
        return

    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)

    print('Log Pitch A')
    print('Mean: %f, Std: %f' % (log_f0s_mean_A, log_f0s_std_A))
    print('Log Pitch B')
    print('Mean: %f, Std: %f' % (log_f0s_mean_B, log_f0s_std_B))

    coded_sps_A, f0s_A = remove_radical_pitch_samples(f0s_A, coded_sps_A,
                                                      log_f0s_mean_A,
                                                      log_f0s_std_A)
    coded_sps_B, f0s_B = remove_radical_pitch_samples(f0s_B, coded_sps_B,
                                                      log_f0s_mean_B,
                                                      log_f0s_std_B)

    print('Recalculating mean and std of f0s after removing radical pitch samples')
    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)

    coded_sps_A_transposed = transpose_in_list(lst=coded_sps_A)
    coded_sps_B_transposed = transpose_in_list(lst=coded_sps_B)

    print("Input data fixed.")
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std = coded_sps_normalization_fit_transoform(
        coded_sps=coded_sps_A_transposed)
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std = coded_sps_normalization_fit_transoform(
        coded_sps=coded_sps_B_transposed)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    np.savez(os.path.join(model_dir, 'logf0s_normalization.npz'),
             mean_A=log_f0s_mean_A,
             std_A=log_f0s_std_A,
             mean_B=log_f0s_mean_B,
             std_B=log_f0s_std_B)
    np.savez(os.path.join(model_dir, 'mcep_normalization.npz'),
             mean_A=coded_sps_A_mean,
             std_A=coded_sps_A_std,
             mean_B=coded_sps_B_mean,
             std_B=coded_sps_B_std)

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)

    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    end_time = time.time()
    time_elapsed = end_time - start_time
    print('Preprocessing Done.')
    print('Time Elapsed for Data Preprocessing: %02d:%02d:%02d' %
          (time_elapsed // 3600, (time_elapsed % 3600 // 60),
           (time_elapsed % 60 // 1)))
    # ---------------------------------------------- Data preprocessing ---------------------------------------------- #

    # Model define
    model = CycleGAN(num_features=num_mcep,
                     log_dir=tensorboard_log_dir,
                     model_name=model_name,
                     gen_model=gen_model)
    # load model
    if os.path.exists(os.path.join(model_dir,
                                   (model_name + ".index"))) == True:
        model.load(filepath=os.path.join(model_dir, model_name))

    # =================================================== Training =================================================== #
    for epoch in range(num_epochs):
        print('Epoch: %d' % epoch)

        start_time_epoch = time.time()

        dataset_A, dataset_B = sample_train_data(dataset_A=coded_sps_A_norm,
                                                 dataset_B=coded_sps_B_norm,
                                                 n_frames=n_frames)

        n_samples = dataset_A.shape[0]
        # -------------------------------------------- one epoch learning -------------------------------------------- #
        for i in tqdm.tqdm(range(n_samples // mini_batch_size)):

            num_iterations = n_samples // mini_batch_size * epoch + i

            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(
                    0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(
                    0, discriminator_learning_rate -
                    discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss, generator_loss_A2B = model.train(
                input_A=dataset_A[start:end], input_B=dataset_B[start:end],
                lambda_cycle=lambda_cycle, lambda_identity=lambda_identity,
                generator_learning_rate=generator_learning_rate,
                discriminator_learning_rate=discriminator_learning_rate)
            # issue #4,
            #            model.summary()

            # Minimum AtoB loss model save
            # if gen_loss_thres > generator_loss_A2B:
            #     gen_loss_thres = generator_loss_A2B
            #     best_model_name = 'Bestmodel' + model_name
            #     model.save(directory=model_dir, filename=best_model_name)
            #     print("generator loss / generator A2B loss ", generator_loss, generator_loss_A2B)

            if i % 50 == 0:
                print(
                    'Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                    .format(num_iterations, generator_learning_rate,
                            discriminator_learning_rate, generator_loss,
                            discriminator_loss))

        # Last model save
        if epoch % 10 == 0:
            model.save(directory=model_dir, filename=model_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))
        # -------------------------------------------- one epoch learning -------------------------------------------- #
        # ------------------------------------------- validation inference ------------------------------------------- #
        if validation_A_dir is not None:
            # if epoch % 50 == 0:
            if epoch % 10 == 0:
                print('Generating Validation Data B from A...')
                for file in os.listdir(validation_A_dir):
                    filepath = os.path.join(validation_A_dir, file)
                    wav, _ = librosa.load(filepath,
                                          sr=sampling_rate,
                                          mono=True)
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_A,
                        std_log_src=log_f0s_std_A,
                        mean_log_target=log_f0s_mean_B,
                        std_log_target=log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed -
                                     coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm = model.test(inputs=np.array(
                        [coded_sp_norm]),
                                                         direction='A2B')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_A_output_dir,
                                     os.path.basename(file)), wav_transformed,
                        sampling_rate)
                    # break

        if validation_B_dir is not None:
            # if epoch % 50 == 0:
            if epoch % 10 == 0:
                print('Generating Validation Data A from B...')
                for file in os.listdir(validation_B_dir):
                    filepath = os.path.join(validation_B_dir, file)
                    wav, _ = librosa.load(filepath,
                                          sr=sampling_rate,
                                          mono=True)
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_B,
                        std_log_src=log_f0s_std_B,
                        mean_log_target=log_f0s_mean_A,
                        std_log_target=log_f0s_std_A)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed -
                                     coded_sps_B_mean) / coded_sps_B_std
                    coded_sp_converted_norm = model.test(inputs=np.array(
                        [coded_sp_norm]),
                                                         direction='B2A')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_A_std + coded_sps_A_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_B_output_dir,
                                     os.path.basename(file)), wav_transformed,
                        sampling_rate)
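The learning-rate handling above holds each rate constant for the first 200,000 iterations and then subtracts a fixed decrement per iteration, which amounts to a linear decay to zero over the next 200,000 iterations (consistent with the CycleGAN-VC training recipe). A pure-function restatement of that schedule, written here as a sketch:

def scheduled_lr(base_lr, iteration, hold=200000, decay_steps=200000):
    # constant for `hold` iterations, then linear decay to 0 over `decay_steps`
    if iteration <= hold:
        return base_lr
    return max(0.0, base_lr - (base_lr / decay_steps) * (iteration - hold))

assert scheduled_lr(0.0002, 100000) == 0.0002
assert scheduled_lr(0.0002, 600000) == 0.0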
Example #8
def train(train_A_dir, train_B_dir, training_data_dir, model_dir, model_name,
          random_seed, validation_A_dir, validation_B_dir, output_dir):

    np.random.seed(random_seed)

    num_epochs = 2000
    mini_batch_size = 1
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 16000
    num_mcep = 24
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = 10
    lambda_identity = 5

    # ****************************************************************
    # *************************Loading DATA***************************
    # ****************************************************************
    with open(os.path.join(training_data_dir, 'A_coded_norm.pk'), "rb") as fa:
        coded_sps_A_norm = pickle.load(fa)

    with open(os.path.join(training_data_dir, 'B_coded_norm.pk'), "rb") as fb:
        coded_sps_B_norm = pickle.load(fb)

    mcep_normalization_params = np.load(
        os.path.join(training_data_dir, 'mcep_normalization.npz'))
    coded_sps_A_mean = mcep_normalization_params['mean_A']
    coded_sps_A_std = mcep_normalization_params['std_A']
    coded_sps_B_mean = mcep_normalization_params['mean_B']
    coded_sps_B_std = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(
        os.path.join(training_data_dir, 'logf0s_normalization.npz'))
    log_f0s_mean_A = logf0s_normalization_params['mean_A']
    log_f0s_std_A = logf0s_normalization_params['std_A']
    log_f0s_mean_B = logf0s_normalization_params['mean_B']
    log_f0s_std_B = logf0s_normalization_params['std_B']

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)

    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    print("****************************************************************")
    print("*************************Start Training*************************")
    print("****************************************************************")

    model = CycleGAN(num_features=num_mcep)

    epoch = 0
    # load model
    if os.path.exists(os.path.join(model_dir, "checkpoint")):
        with open(os.path.join(model_dir, "checkpoint"), "r") as f:
            all_ckpt = f.readlines()
        pretrain_ckpt = all_ckpt[-1].split("\n")[0].split("\"")[1]
        # checkpoints are named '<model_name>-<epoch>.ckpt'; recover the epoch
        epoch = int(pretrain_ckpt.split("-")[1].split(".")[0])
        if os.path.exists(os.path.join(model_dir, pretrain_ckpt + ".index")):
            model.load(filepath=os.path.join(model_dir, pretrain_ckpt))
            print("Loading pretrained model {}".format(pretrain_ckpt))
    else:
        print("Training model from 0 epoch")

    for k in range(epoch + 1, num_epochs):
        print('Epoch: %d' % k)

        start_time_epoch = time.time()
        pool_A, pool_B = list(coded_sps_A_norm), list(coded_sps_B_norm)
        dataset_A, dataset_B = sample_train_data(dataset_A=pool_A,
                                                 dataset_B=pool_B,
                                                 n_frames=n_frames)
        print('dataset_A', np.shape(dataset_A), 'dataset_B',
              np.shape(dataset_B))
        n_samples = dataset_A.shape[0]

        for i in trange(n_samples // mini_batch_size):
            # use the loop variable k here; 'epoch' is the fixed resume epoch
            num_iterations = n_samples // mini_batch_size * k + i
            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(
                    0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(
                    0, discriminator_learning_rate -
                    discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss = model.train(
                input_A=dataset_A[start:end],
                input_B=dataset_B[start:end],
                lambda_cycle=lambda_cycle,
                lambda_identity=lambda_identity,
                generator_learning_rate=generator_learning_rate,
                discriminator_learning_rate=discriminator_learning_rate)

            if i % (n_samples // 2) == 0:
                print(
                    'Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                    .format(num_iterations, generator_learning_rate,
                            discriminator_learning_rate, generator_loss,
                            discriminator_loss))

        if k == 1 or k % 100 == 0:
            print("Saving Epoch {}".format(k))
            ckpt_name = model_name + "-" + str(k) + ".ckpt"
            model.save(directory=model_dir, filename=ckpt_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))

        if validation_A_dir is not None:
            if k % 300 == 0:
                print('Generating Validation Data B from A...')
                files_A = os.listdir(validation_A_dir)
                for i in trange(len(files_A)):
                    file = files_A[i]
                    filepath = os.path.join(validation_A_dir, file)
                    wav, _ = librosa.load(filepath,
                                          sr=sampling_rate,
                                          mono=True)
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_A,
                        std_log_src=log_f0s_std_A,
                        mean_log_target=log_f0s_mean_B,
                        std_log_target=log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed -
                                     coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm = model.test(inputs=np.array(
                        [coded_sp_norm]),
                                                         direction='A2B')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_A_output_dir,
                                     os.path.basename(file)), wav_transformed,
                        sampling_rate)

        if validation_B_dir is not None:
            if k % 300 == 0:
                print('Generating Validation Data A from B...')
                files_B = os.listdir(validation_B_dir)
                for i in trange(len(files_B)):
                    file = files_B[i]
                    filepath = os.path.join(validation_B_dir, file)
                    wav, _ = librosa.load(filepath,
                                          sr=sampling_rate,
                                          mono=True)
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_B,
                        std_log_src=log_f0s_std_B,
                        mean_log_target=log_f0s_mean_A,
                        std_log_target=log_f0s_std_A)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed -
                                     coded_sps_B_mean) / coded_sps_B_std
                    coded_sp_converted_norm = model.test(inputs=np.array(
                        [coded_sp_norm]),
                                                         direction='B2A')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_A_std + coded_sps_A_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_B_output_dir,
                                     os.path.basename(file)), wav_transformed,
                        sampling_rate)
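The resume logic above assumes checkpoints are saved as '<model_name>-<epoch>.ckpt' and recovers the epoch with chained split() calls. A slightly more defensive sketch of the same parse (the naming convention is taken from the save call in this example):

import re

def epoch_from_checkpoint(ckpt_name):
    # 'mymodel-300.ckpt' -> 300; returns 0 when the name does not match
    match = re.search(r'-(\d+)\.ckpt$', ckpt_name)
    return int(match.group(1)) if match else 0

assert epoch_from_checkpoint('mymodel-300.ckpt') == 300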
Example #9
File: train.py Project: cshbli/AILib
def train(img_A_dir, img_B_dir, model_dir, model_name, random_seed,
          batch_size_maximum, validation_A_dir, validation_B_dir, output_dir,
          lambda_cycle, loss_function, tensorboard_log_dir):

    np.random.seed(random_seed)

    num_epochs = argv.epochs
    mini_batch_size = 1  # mini_batch_size = 1 is better
    learning_rate = 0.0002
    input_size = [argv.fine_size_h, argv.fine_size_w, 3]
    #num_filters = 64 # Tried num_filters = 8 still not good for 200 epochs
    num_filters = argv.filter_number

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)

    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    model = CycleGAN(input_size=input_size,
                     num_filters=num_filters,
                     mode='train',
                     lambda_cycle=lambda_cycle,
                     loss_function=loss_function,
                     log_dir=tensorboard_log_dir)

    dataset_A_raw = load_data(img_dir=img_A_dir,
                              load_size_w=argv.load_size_w,
                              load_size_h=argv.load_size_h)
    dataset_B_raw = load_data(img_dir=img_B_dir,
                              load_size_w=argv.load_size_w,
                              load_size_h=argv.load_size_h)

    if argv.checkpoint is not None:
        print('loading model from checkpoint')
        model.load(argv.checkpoint)

    for epoch in range(num_epochs):
        print('Epoch: %d' % epoch)

        start_time_epoch = time.time()

        #dataset_A, dataset_B = sample_train_data(dataset_A_raw, dataset_B_raw, load_size = 286, output_size = 256, batch_size_maximum = batch_size_maximum)
        dataset_A, dataset_B = sample_train_data(
            dataset_A_raw,
            dataset_B_raw,
            load_size_w=argv.load_size_w,
            load_size_h=argv.load_size_h,
            output_size_w=argv.fine_size_w,
            output_size_h=argv.fine_size_h,
            batch_size_maximum=batch_size_maximum)

        n_samples = dataset_A.shape[0]
        for i in range(n_samples // mini_batch_size):

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss = model.train(
                input_A=dataset_A[start:end],
                input_B=dataset_B[start:end],
                learning_rate=learning_rate)

            if i % 50 == 0:
                print(
                    'Minibatch: %d, Generator Loss : %f, Discriminator Loss : %f'
                    % (i, generator_loss, discriminator_loss))

        #model.save(directory = model_dir, filename = model_name)
        model.save(directory=model_dir, filename=model_name + '_' + str(epoch))

        if validation_A_dir is not None:
            final_output_dir = os.path.join(validation_A_output_dir,
                                            str(epoch))
            if not os.path.exists(final_output_dir):
                os.makedirs(final_output_dir)

            for file in os.listdir(validation_A_dir):
                filepath = os.path.join(validation_A_dir, file)
                img = cv2.imread(filepath)
                img_height, img_width, img_channel = img.shape
                img = cv2.resize(img, (input_size[1], input_size[0]))
                img = image_scaling(imgs=img)
                img_converted = model.test(inputs=np.array([img]),
                                           direction='A2B')[0]
                img_converted = image_scaling_inverse(imgs=img_converted)
                img_converted = cv2.resize(img_converted,
                                           (img_width, img_height))
                cv2.imwrite(
                    os.path.join(final_output_dir, os.path.basename(file)),
                    img_converted)

        if validation_B_dir is not None:
            final_output_dir = os.path.join(validation_B_output_dir,
                                            str(epoch))
            if not os.path.exists(final_output_dir):
                os.makedirs(final_output_dir)

            for file in os.listdir(validation_B_dir):
                filepath = os.path.join(validation_B_dir, file)
                img = cv2.imread(filepath)
                img_height, img_width, img_channel = img.shape
                img = cv2.resize(img, (input_size[1], input_size[0]))
                img = image_scaling(imgs=img)
                img_converted = model.test(inputs=np.array([img]),
                                           direction='B2A')[0]
                img_converted = image_scaling_inverse(imgs=img_converted)
                img_converted = cv2.resize(img_converted,
                                           (img_width, img_height))
                cv2.imwrite(
                    os.path.join(final_output_dir, os.path.basename(file)),
                    img_converted)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))
Example #10
class Converter():
    def __init__(self, model_dir, model_name):
        self.num_features = 24
        self.sampling_rate = 16000
        self.frame_period = 5.0

        self.model = CycleGAN(num_features = self.num_features, mode = 'test')

        self.model.load(filepath = os.path.join(model_dir, model_name))

        # NB: Save the graph
        definition = self.model.sess.graph_def
        directory = 'saved_model_2'
        tf.train.write_graph(definition, directory, 'saved_model_2.pb', as_text=True)

        # https://github.com/tensorflow/models/issues/3530#issuecomment-395968881
        output_dir = './saved_model/'
        builder = tf.saved_model.builder.SavedModelBuilder(output_dir)

        builder.add_meta_graph_and_variables(
            self.model.sess,
            [tf.saved_model.tag_constants.SERVING],
            main_op=tf.tables_initializer(),
        )

        builder.save()

        """
        builder.add_meta_graph_and_variables(
            self.model.sess,
            [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                'predict_images':
                    prediction_signature,
                signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                    classification_signature,
            },
            main_op=tf.tables_initializer())
        """

        self.mcep_normalization_params = np.load(os.path.join(model_dir, 'mcep_normalization.npz'))
        self.mcep_mean_A = self.mcep_normalization_params['mean_A']
        self.mcep_std_A = self.mcep_normalization_params['std_A']
        self.mcep_mean_B = self.mcep_normalization_params['mean_B']
        self.mcep_std_B = self.mcep_normalization_params['std_B']

        self.logf0s_normalization_params = np.load(os.path.join(model_dir, 'logf0s_normalization.npz'))
        self.logf0s_mean_A = self.logf0s_normalization_params['mean_A']
        self.logf0s_std_A = self.logf0s_normalization_params['std_A']
        self.logf0s_mean_B = self.logf0s_normalization_params['mean_B']
        self.logf0s_std_B = self.logf0s_normalization_params['std_B']

    def convert(self, wav, conversion_direction='A2B'):
        wav = wav_padding(wav = wav, sr = self.sampling_rate, frame_period = self.frame_period, multiple = 4)
        f0, timeaxis, sp, ap = world_decompose(wav = wav, fs = self.sampling_rate, frame_period = self.frame_period)
        coded_sp = world_encode_spectral_envelop(sp = sp, fs = self.sampling_rate, dim = self.num_features)
        coded_sp_transposed = coded_sp.T

        if conversion_direction == 'A2B':
            f0_converted = pitch_conversion(f0 = f0, mean_log_src = self.logf0s_mean_A, std_log_src = self.logf0s_std_A, mean_log_target = self.logf0s_mean_B, std_log_target = self.logf0s_std_B)
            coded_sp_norm = (coded_sp_transposed - self.mcep_mean_A) / self.mcep_std_A
            coded_sp_converted_norm = self.model.test(inputs = np.array([coded_sp_norm]), direction = conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * self.mcep_std_B + self.mcep_mean_B
        else:
            f0_converted = pitch_conversion(f0 = f0, mean_log_src = self.logf0s_mean_B, std_log_src = self.logf0s_std_B, mean_log_target = self.logf0s_mean_A, std_log_target = self.logf0s_std_A)
            coded_sp_norm = (coded_sp_transposed - self.mcep_mean_B) / self.mcep_std_B
            coded_sp_converted_norm = self.model.test(inputs = np.array([coded_sp_norm]), direction = conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * self.mcep_std_A + self.mcep_mean_A

        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp = coded_sp_converted, fs = self.sampling_rate)
        wav_transformed = world_speech_synthesis(f0 = f0_converted, decoded_sp = decoded_sp_converted, ap = ap, fs = self.sampling_rate, frame_period = self.frame_period)

        # For debugging model output, uncomment the following line:
        # librosa.output.write_wav('model_output.wav', wav_transformed, self.sampling_rate)

        # TODO: Perhaps ditch this. It's probably unnecessary work.
        upsampled = librosa.resample(wav_transformed, self.sampling_rate, 48000)
        pcm_data = upsampled.astype(np.float64)
        stereo_pcm_data = np.tile(pcm_data, (2,1)).T

        buf = io.BytesIO()
        scipy.io.wavfile.write(buf, 48000, stereo_pcm_data.astype(np.float32))
        return buf
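A hedged sketch of reading back the SavedModel exported in __init__ above, using the matching TensorFlow 1.x loader API (the export directory mirrors output_dir):

import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    tf.saved_model.loader.load(sess,
                               [tf.saved_model.tag_constants.SERVING],
                               './saved_model/')
    # the graph and variables are now restored into sess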