def validation_for_A_dir(self):
    num_mcep = 24
    sampling_rate = 16000
    frame_period = 5.0
    n_frames = 128
    validation_A_dir = self.validation_A_dir
    output_A_dir = self.output_A_dir

    print("Generating Test Data B from A...")
    for file in os.listdir(validation_A_dir):
        filePath = os.path.join(validation_A_dir, file)
        wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
        # Pad the waveform so the number of WORLD frames is a multiple of 4,
        # which the generator's downsampling layers require.
        wav = preprocess.wav_padding(wav=wav,
                                     sr=sampling_rate,
                                     frame_period=frame_period,
                                     multiple=4)
        # WORLD analysis: F0 contour, spectral envelope, and aperiodicity.
        f0, timeaxis, sp, ap = preprocess.world_decompose(
            wav=wav, fs=sampling_rate, frame_period=frame_period)
        # Map speaker A's F0 statistics onto speaker B's.
        f0_converted = preprocess.pitch_conversion(
            f0=f0,
            mean_log_src=self.log_f0s_mean_A,
            std_log_src=self.log_f0s_std_A,
            mean_log_target=self.log_f0s_mean_B,
            std_log_target=self.log_f0s_std_B)
        coded_sp = preprocess.world_encode_spectral_envelop(
            sp=sp, fs=sampling_rate, dim=num_mcep)
        coded_sp_transposed = coded_sp.T
        # Normalize with speaker A's statistics and add a batch dimension.
        coded_sp_norm = (coded_sp_transposed - self.coded_sps_A_mean) / self.coded_sps_A_std
        coded_sp_norm = np.array([coded_sp_norm])
        if torch.cuda.is_available():
            coded_sp_norm = torch.from_numpy(coded_sp_norm).cuda().float()
        else:
            coded_sp_norm = torch.from_numpy(coded_sp_norm).float()
        coded_sp_converted_norm = self.generator_A2B(coded_sp_norm)
        coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach().numpy()
        coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
        # Denormalize with speaker B's statistics.
        coded_sp_converted = coded_sp_converted_norm * self.coded_sps_B_std \
            + self.coded_sps_B_mean
        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = preprocess.world_decode_spectral_envelop(
            coded_sp=coded_sp_converted, fs=sampling_rate)
        # Resynthesize with the converted pitch and spectral envelope.
        wav_transformed = preprocess.world_speech_synthesis(
            f0=f0_converted,
            decoded_sp=decoded_sp_converted,
            ap=ap,
            fs=sampling_rate,
            frame_period=frame_period)
        # Note: librosa.output.write_wav was removed in librosa 0.8; on newer
        # versions use soundfile.write instead.
        librosa.output.write_wav(path=os.path.join(output_A_dir, os.path.basename(file)),
                                 y=wav_transformed,
                                 sr=sampling_rate)
    print("finish!")
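The pitch_conversion call above is the log-Gaussian normalized F0 transformation that is standard in CycleGAN-VC pipelines: normalize log(F0) with the source speaker's statistics, then rescale with the target's. For reference, here is a minimal sketch assuming the same signature as the call above; the actual preprocess implementation may differ.

import numpy as np

def pitch_conversion(f0, mean_log_src, std_log_src,
                     mean_log_target, std_log_target):
    # Unvoiced frames (f0 == 0) pass through log/exp as -inf -> 0,
    # so they remain unvoiced; errstate silences the log(0) warning.
    with np.errstate(divide='ignore'):
        f0_converted = np.exp((np.log(f0) - mean_log_src) / std_log_src
                              * std_log_target + mean_log_target)
    return f0_converted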
def test(filename):
    wav, _ = librosa.load(filename, sr=hp.rate)
    # WORLD analysis: F0 contour, spectral envelope, and aperiodicity.
    f0, timeaxis, sp, ap = world_decompose(wav, hp.rate)
    f0_converted = pitch_conversion(f0, log_f0s_mean_A, log_f0s_std_A,
                                    log_f0s_mean_B, log_f0s_std_B)
    coded_sp = world_encode_spectral_envelop(sp, hp.rate, hp.num_mceps)
    coded_sp_transposed = coded_sp.T
    coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
    # Split the normalized features into fixed-length segments of hp.n_frames.
    coded_sp_norm = seg_and_pad(coded_sp_norm, hp.n_frames)

    wav_forms = []
    for i, sp_norm in enumerate(coded_sp_norm):
        sp_norm = np.expand_dims(sp_norm, axis=-1)
        coded_sp_converted_norm = infer(sp_norm)
        # Denormalize with speaker B's statistics.
        coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
        coded_sp_converted = np.array(coded_sp_converted, dtype=np.float64).T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decode_sp_converted = world_decode_spectral_envelop(
            coded_sp_converted, hp.rate)
        if len(f0) < (i + 1) * hp.output_size:
            # Final segment: drop the zero-padded frames so F0, envelope,
            # and aperiodicity stay aligned.
            remainder = len(f0) % hp.output_size
            decode_sp_converted = decode_sp_converted[:remainder]
            f0_piece = f0_converted[i * hp.output_size:i * hp.output_size + remainder]
            ap_piece = ap[i * hp.output_size:i * hp.output_size + remainder]
            wav_transformed = world_speech_synthesis(f0_piece, decode_sp_converted,
                                                     ap_piece, hp.rate, hp.duration)
            wav_forms.append(wav_transformed)
            break
        else:
            f0_piece = f0_converted[i * hp.output_size:(i + 1) * hp.output_size]
            ap_piece = ap[i * hp.output_size:(i + 1) * hp.output_size]
            wav_transformed = world_speech_synthesis(f0_piece, decode_sp_converted,
                                                     ap_piece, hp.rate, hp.duration)
            wav_forms.append(wav_transformed)

    # Concatenate the per-segment waveforms and restore a (batch, samples, 1) shape.
    wav_forms = np.concatenate(wav_forms)
    wav_forms = np.expand_dims(wav_forms, axis=-1)
    wav_forms = np.expand_dims(wav_forms, axis=0)
    return wav_forms
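test() relies on a seg_and_pad helper that is not shown in this listing. Below is a plausible sketch, assuming the features arrive as a (num_mceps, T) array and the model consumes fixed-length (num_mceps, n_frames) segments; the real helper may choose a different segment layout or padding value.

def seg_and_pad(x, n_frames):
    # x: (num_mceps, T) normalized features.
    # Returns (num_segments, num_mceps, n_frames); the last segment is
    # zero-padded on the time axis so every segment has n_frames frames.
    num_mceps, t = x.shape
    num_segments = int(np.ceil(t / n_frames))
    padded = np.zeros((num_mceps, num_segments * n_frames), dtype=x.dtype)
    padded[:, :t] = x
    # Split the time axis into contiguous chunks and move segments first.
    return padded.reshape(num_mceps, num_segments, n_frames).transpose(1, 0, 2)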
model = CycleGAN2()
# Restore the most recent checkpoint from the weights directory.
latest = tf.train.latest_checkpoint(hp.weights_dir)
model.load_weights(latest)

print('Loading cached data...')
# Each pickle holds the normalized MCEPs plus the normalization and F0
# statistics computed during preprocessing.
with open('./datasets/JSUT/jsut.p', 'rb') as f:
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, \
        log_f0s_mean_A, log_f0s_std_A = pickle.load(f)
with open('./datasets/target_voice/target_voice.p', 'rb') as f:
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std, \
        log_f0s_mean_B, log_f0s_std_B = pickle.load(f)

wav, _ = librosa.load('./outputs/100002.wav', sr=hp.rate)
f0, timeaxis, sp, ap = world_decompose(wav, hp.rate)
f0_converted = pitch_conversion(f0, log_f0s_mean_A, log_f0s_std_A,
                                log_f0s_mean_B, log_f0s_std_B)
coded_sp = world_encode_spectral_envelop(sp, hp.rate, hp.num_mceps)
coded_sp_transposed = coded_sp.T
coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
coded_sp_norm = seg_and_pad(coded_sp_norm, hp.n_frames)

wav_forms = []
for i, sp_norm in enumerate(coded_sp_norm):
    sp_norm = np.expand_dims(sp_norm, axis=-1)
    # The model takes (input_A, input_B); indexing [1][0] presumably selects
    # the A-to-B conversion for this segment.
    coded_sp_converted_norm = model([sp_norm, sp_norm])[1][0]
    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
    coded_sp_converted = np.array(coded_sp_converted, dtype=np.float64).T
    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
    decode_sp_converted = world_decode_spectral_envelop(
        coded_sp_converted, hp.rate)
    if len(f0) < (i + 1) * hp.output_size:
        # The original listing breaks off here; the rest of the loop follows
        # the same trim-and-synthesize logic as test() above.
        remainder = len(f0) % hp.output_size
        decode_sp_converted = decode_sp_converted[:remainder]
        f0_piece = f0_converted[i * hp.output_size:i * hp.output_size + remainder]
        ap_piece = ap[i * hp.output_size:i * hp.output_size + remainder]
        wav_transformed = world_speech_synthesis(f0_piece, decode_sp_converted,
                                                 ap_piece, hp.rate, hp.duration)
        wav_forms.append(wav_transformed)
        break
    else:
        f0_piece = f0_converted[i * hp.output_size:(i + 1) * hp.output_size]
        ap_piece = ap[i * hp.output_size:(i + 1) * hp.output_size]
        wav_transformed = world_speech_synthesis(f0_piece, decode_sp_converted,
                                                 ap_piece, hp.rate, hp.duration)
        wav_forms.append(wav_transformed)

wav_forms = np.concatenate(wav_forms)
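Since librosa.output.write_wav was removed in librosa 0.8, writing the converted waveform to disk is cleanest with the soundfile package. A short usage sketch for the script above; the output path is only an example.

import soundfile as sf

# wav_forms is the concatenated 1-D waveform sampled at hp.rate.
sf.write('./outputs/100002_converted.wav', wav_forms, hp.rate)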