# Example #1 — score: 0
    def validation_for_A_dir(self,
                             num_mcep=24,
                             sampling_rate=16000,
                             frame_period=5.0):
        """Convert every wav in ``self.validation_A_dir`` from speaker A to
        speaker B with the trained A->B generator and write the converted
        audio to ``self.output_A_dir``.

        Args:
            num_mcep: number of MCEP coefficients used to encode the
                spectral envelope (default 24, as in training).
            sampling_rate: sample rate in Hz for loading and synthesis.
            frame_period: WORLD analysis frame period in milliseconds.
        """
        validation_A_dir = self.validation_A_dir
        output_A_dir = self.output_A_dir
        # Hoisted out of the loop: availability does not change per file.
        use_cuda = torch.cuda.is_available()

        print("Generating Test Data B from A...")
        for file in os.listdir(validation_A_dir):
            file_path = os.path.join(validation_A_dir, file)
            wav, _ = librosa.load(file_path, sr=sampling_rate, mono=True)
            # Pad so the frame count is a multiple of 4 — presumably the
            # generator's temporal downsampling factor; TODO confirm.
            wav = preprocess.wav_padding(wav=wav,
                                         sr=sampling_rate,
                                         frame_period=frame_period,
                                         multiple=4)
            # WORLD decomposition: F0 contour, spectral envelope (sp),
            # aperiodicity (ap).
            f0, timeaxis, sp, ap = preprocess.world_decompose(
                wav=wav, fs=sampling_rate, frame_period=frame_period)
            # Map the F0 contour from speaker A's to speaker B's log-F0
            # statistics.
            f0_converted = preprocess.pitch_conversion(
                f0=f0,
                mean_log_src=self.log_f0s_mean_A,
                std_log_src=self.log_f0s_std_A,
                mean_log_target=self.log_f0s_mean_B,
                std_log_target=self.log_f0s_std_B)
            coded_sp = preprocess.world_encode_spectral_envelop(
                sp=sp, fs=sampling_rate, dim=num_mcep)
            # Normalize with speaker-A statistics; leading axis is a batch
            # dimension of 1 for the generator.
            coded_sp_norm = (coded_sp.T -
                             self.coded_sps_A_mean) / self.coded_sps_A_std
            coded_sp_norm = np.array([coded_sp_norm])

            coded_sp_norm = torch.from_numpy(coded_sp_norm).float()
            if use_cuda:
                coded_sp_norm = coded_sp_norm.cuda()

            coded_sp_converted_norm = self.generator_A2B(coded_sp_norm)
            coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach(
            ).numpy()
            coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
            # Denormalize with speaker-B statistics and restore WORLD's
            # contiguous (frames, dim) layout.
            coded_sp_converted = (coded_sp_converted_norm *
                                  self.coded_sps_B_std + self.coded_sps_B_mean)
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted.T)
            decoded_sp_converted = preprocess.world_decode_spectral_envelop(
                coded_sp=coded_sp_converted, fs=sampling_rate)
            wav_transformed = preprocess.world_speech_synthesis(
                f0=f0_converted,
                decoded_sp=decoded_sp_converted,
                ap=ap,
                fs=sampling_rate,
                frame_period=frame_period)
            # NOTE(review): librosa.output.write_wav was removed in
            # librosa 0.8 — switch to soundfile.write when upgrading.
            librosa.output.write_wav(path=os.path.join(output_A_dir,
                                                       os.path.basename(file)),
                                     y=wav_transformed,
                                     sr=sampling_rate)
        print("finish!")
def test(filename):
    """Convert *filename* from speaker A to speaker B segment by segment
    and return the synthesized waveform as a (1, samples, 1) array."""
    wav, _ = librosa.load(filename, sr=hp.rate)
    f0, timeaxis, sp, ap = world_decompose(wav, hp.rate)
    f0_converted = pitch_conversion(f0, log_f0s_mean_A, log_f0s_std_A,
                                    log_f0s_mean_B, log_f0s_std_B)
    # Encode, normalize with speaker-A statistics, and cut into
    # fixed-length padded segments for the network.
    coded_sp = world_encode_spectral_envelop(sp, hp.rate, hp.num_mceps)
    coded_sp_norm = (coded_sp.T - coded_sps_A_mean) / coded_sps_A_std
    coded_sp_norm = seg_and_pad(coded_sp_norm, hp.n_frames)

    total_frames = len(f0)
    pieces = []
    for idx, segment_norm in enumerate(coded_sp_norm):
        segment_norm = np.expand_dims(segment_norm, axis=-1)
        converted_norm = infer(segment_norm)
        # Denormalize with speaker-B statistics; WORLD wants a contiguous
        # float64 (frames, dim) array.
        converted = converted_norm * coded_sps_B_std + coded_sps_B_mean
        converted = np.array(converted, dtype=np.float64).T
        converted = np.ascontiguousarray(converted)
        decoded = world_decode_spectral_envelop(converted, hp.rate)

        start = idx * hp.output_size
        is_last = total_frames < start + hp.output_size
        if is_last:
            # Final segment was padded; keep only the real frames.
            tail = total_frames % hp.output_size
            decoded = decoded[:tail]
            stop = start + tail
        else:
            stop = start + hp.output_size

        pieces.append(
            world_speech_synthesis(f0_converted[start:stop], decoded,
                                   ap[start:stop], hp.rate, hp.duration))
        if is_last:
            break

    waveform = np.concatenate(pieces)
    # Shape (1, samples, 1): batch and channel axes.
    return np.expand_dims(np.expand_dims(waveform, axis=-1), axis=0)
# Example #3 — score: 0
# Build the CycleGAN model and restore the most recent training checkpoint.
model = CycleGAN2()
latest = tf.train.latest_checkpoint(hp.weights_dir)
model.load_weights(latest)

print('Loading cached data...')
# Source-speaker (A) cache: normalized MCEPs plus the mean/std statistics
# used for spectral-envelope and log-F0 normalization.
# NOTE(review): pickle.load can execute arbitrary code — only load .p files
# produced by this project's own preprocessing step.
with open('./datasets/JSUT/jsut.p', 'rb') as f:
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, log_f0s_mean_A, log_f0s_std_A = pickle.load(
        f)

# Target-speaker (B) cache, same layout as the source cache above.
with open('./datasets/target_voice/target_voice.p', 'rb') as f:
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std, log_f0s_mean_B, log_f0s_std_B = pickle.load(
        f)

# WORLD decomposition of the input wav: F0 contour, spectral envelope (sp),
# aperiodicity (ap).
wav, _ = librosa.load('./outputs/100002.wav', sr=hp.rate)
f0, timeaxis, sp, ap = world_decompose(wav, hp.rate)
# Map the F0 contour from speaker A's onto speaker B's log-F0 statistics.
f0_converted = pitch_conversion(f0, log_f0s_mean_A, log_f0s_std_A,
                                log_f0s_mean_B, log_f0s_std_B)
# Encode to MCEPs, normalize with the speaker-A statistics, and segment
# into fixed-length padded chunks for the network.
coded_sp = world_encode_spectral_envelop(sp, hp.rate, hp.num_mceps)
coded_sp_transposed = coded_sp.T
coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
coded_sp_norm = seg_and_pad(coded_sp_norm, hp.n_frames)

# Converted waveform pieces, one per segment; filled by the loop below.
wav_forms = []
for i, sp_norm in enumerate(coded_sp_norm):
    sp_norm = np.expand_dims(sp_norm, axis=-1)
    coded_sp_converted_norm = model([sp_norm, sp_norm])[1][0]
    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
    coded_sp_converted = np.array(coded_sp_converted, dtype=np.float64).T
    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
    decode_sp_converted = world_decode_spectral_envelop(
        coded_sp_converted, hp.rate)
    if len(f0) < (i + 1) * hp.output_size: