def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, coef=1.4,
                 fs=16000, mge_training=True):
    """Synthesize a waveform from predicted acoustic features with WORLD.

    Args:
        y_predicted: Predicted acoustic features, forwarded to
            ``gen_parameters``.
        Y_mean: Forwarded to ``gen_parameters``.
        Y_std: Forwarded to ``gen_parameters``.
        post_filter: If True, apply ``merlin_post_filter`` to the mel-cepstrum
            before it is converted to a spectral envelope.
        coef: Coefficient forwarded to ``merlin_post_filter``.
        fs: Sampling rate used for the all-pass constant, FFT size,
            aperiodicity decoding and synthesis.
        mge_training: Forwarded to ``gen_parameters``.

    Returns:
        tuple: ``(generated_waveform, mgc, lf0, vuv, bap)``. The waveform is
        scaled so that its peak magnitude is 32767; it is not cast to an
        integer dtype here. A silent or empty waveform is returned unscaled.
        The features are returned as well so that natural and generated
        parameters can be compared later.
    """
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std,
                                        mge_training)
    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)

    # Zero out unvoiced frames, then exponentiate the remaining log-F0 values.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)

    # Convert range to int16. Skip the scaling when there is no signal:
    # dividing by a zero peak would fill the output with NaN, and np.max
    # raises on an empty array.
    peak = np.max(np.abs(generated_waveform)) if generated_waveform.size else 0.0
    if peak > 0:
        generated_waveform = generated_waveform / peak * 32767

    return generated_waveform, mgc, lf0, vuv, bap
def decode_sp(coded_sp: np.array, sp_type: str = "mcep", fs: int = None,
              alpha: float = None, mgc_gamma: float = None,
              n_fft: int = None, post_filtering: bool = False):
    """Decode a coded spectral representation to an amplitude spectrum.

    Args:
        coded_sp: Coded spectral features.
        sp_type: One of ``"mcep"``, ``"mgc"``, ``"mfbanks"`` or ``"amp_sp"``.
        fs: Sampling rate forwarded to the decoding routine.
        alpha: Frequency warping coefficient forwarded to the cepstral
            decoders. When given, it is also used for post-filtering;
            otherwise the post-filter falls back to
            ``AudioProcessing.fs_to_mgc_alpha(fs)``.
        mgc_gamma: Gamma forwarded to ``AudioProcessing.mgc_to_amp_sp``.
        n_fft: FFT size forwarded to the ``"mgc"`` and ``"mfbanks"`` decoders.
        post_filtering: If True, apply ``merlin_post_filter`` before decoding.
            Only applied for ``"mcep"`` and ``"mgc"``; for other types a
            warning is logged and the features are left untouched.

    Returns:
        The result of the matching ``AudioProcessing.*_to_amp_sp`` call, or
        ``coded_sp`` itself when ``sp_type`` is ``"amp_sp"``.

    Raises:
        NotImplementedError: If ``sp_type`` is not one of the known types.
    """
    if post_filtering:
        if sp_type in ("mcep", "mgc"):
            # Post-filter with the same warping coefficient the decoder below
            # receives, so both stages agree on the frequency scale.
            if alpha is not None:
                post_filter_alpha = alpha
            else:
                post_filter_alpha = AudioProcessing.fs_to_mgc_alpha(fs)
            coded_sp = merlin_post_filter(coded_sp, post_filter_alpha)
        else:
            logging.warning(
                "Post-filtering only implemented for cepstrum features.")

    if sp_type == "mcep":
        return AudioProcessing.mcep_to_amp_sp(coded_sp, fs, alpha)
    if sp_type == "mgc":
        return AudioProcessing.mgc_to_amp_sp(coded_sp, fs, alpha, mgc_gamma,
                                             n_fft)
    if sp_type == "mfbanks":
        return AudioProcessing.mfbanks_to_amp_sp(coded_sp, fs, n_fft)
    if sp_type == "amp_sp":
        return coded_sp

    raise NotImplementedError("Unknown feature type {}. No decoding "
                              "method available.".format(sp_type))
def generate(self, parm_var, do_postfilter=True):
    """Synthesize every entry of ``self.paths`` and write one WAV file each.

    Args:
        parm_var: Forwarded to ``self._generate_parameters`` together with
            the input path.
        do_postfilter: If True, apply ``merlin_post_filter`` to the
            mel-cepstrum using ``self.analysis_config.alpha``.

    Output files are written to ``self.out_dir`` and named after the input
    file's base name with a ``.wav`` extension. Progress is printed to
    stdout.
    """
    cfg = self.analysis_config

    for src_path in self.paths:
        utt_id = splitext(basename(src_path))[0]
        print('Synthesizing %s ... ' % (utt_id), end='')

        mgc, lf0, vuv, bap = self._generate_parameters(src_path, parm_var)
        if do_postfilter:
            mgc = merlin_post_filter(mgc, cfg.alpha)

        # Convert the parameter streams into what WORLD synthesis expects.
        envelope = pysptk.mc2sp(mgc, fftlen=cfg.fft_length, alpha=cfg.alpha)
        aperiodicity = pyworld.decode_aperiodicity(
            bap.astype(np.float64), cfg.sampling_rate, cfg.fft_length)
        f0 = self._lf0_to_f0(lf0, vuv)

        waveform = pyworld.synthesize(
            f0.flatten().astype(np.float64),
            envelope.astype(np.float64),
            aperiodicity.astype(np.float64),
            cfg.sampling_rate,
            cfg.frame_period,
        )

        wav_path = join(self.out_dir, utt_id + '.wav')
        with open(wav_path, 'wb') as wav_file:
            wav_file.write(Audio(waveform, rate=cfg.sampling_rate).data)
        print('done!')
def gen_waveform(labels, acoustic_features, acoustic_out_scaler, binary_dict,
                 continuous_dict, stream_sizes, has_dynamic_features,
                 subphone_features="coarse_coding", log_f0_conditioning=True,
                 pitch_idx=None, num_windows=3, post_filter=True,
                 sample_rate=48000, frame_period=5, relative_f0=True):
    """Generate a waveform from acoustic features with the WORLD vocoder.

    Args:
        labels: Labels used to extract the score pitch when ``relative_f0``
            is True.
        acoustic_features: Multi-stream acoustic features.
        acoustic_out_scaler: Scaler whose ``var_`` is passed to MLPG.
        binary_dict: Question dictionary for ``fe.linguistic_features``.
        continuous_dict: Question dictionary for ``fe.linguistic_features``.
        stream_sizes: Sizes of the feature streams.
        has_dynamic_features: Per-stream flags; MLPG runs if any is set.
        subphone_features: Forwarded to ``fe.linguistic_features``.
        log_f0_conditioning: Not used by this function; kept so existing
            callers keep working.
        pitch_idx: Forwarded to ``_midi_to_hz``.
        num_windows: Number of windows requested from ``get_windows``.
        post_filter: If True, apply ``merlin_post_filter`` to the mel-cepstrum.
        sample_rate: Sampling rate used for analysis constants and synthesis.
        frame_period: Frame period forwarded to ``pyworld.synthesize``.
        relative_f0: If True, the F0 stream is treated as a log-F0 offset
            from the score pitch; otherwise it is treated as log-F0 itself.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows,
            stream_sizes, has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features,
                                             static_stream_sizes)

    # Generate the waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               sample_rate, fftlen)

    # F0
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(
            labels, binary_dict, continuous_dict,
            add_frame_features=True, subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")
        f0 = diff_lf0 + lf0_score
    else:
        # Copy so the conversion below does not write into the caller's
        # feature array.
        f0 = target_f0.copy()

    # Both branches hold log-F0 at this point: silence unvoiced frames and
    # convert the rest to linear F0 before handing it to WORLD.
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    return generated_waveform
def run_r9y9wavenet_mulaw_world_feats_synth(synth_output, hparams):
    """Run the WaveNet vocoder on WORLD features with temporary overrides.

    ``hparams.frame_rate_output_Hz`` is forced to 16000 and
    ``hparams.bit_depth`` to 16 for the duration of the call. Both are set
    back to their previous values afterwards (``None`` if they did not exist
    before), including when post-filtering or the vocoder raises.

    Args:
        synth_output: Mapping from id name to network output. A shallow copy
            is made, so post-filtered entries do not replace the caller's.
        hparams: Hyper-parameter container. ``synth_vocoder_path`` and ``mu``
            are filled with defaults when missing or None.
    """
    # If no path is given, use pre-trained model.
    if (not hasattr(hparams, "synth_vocoder_path")
            or hparams.synth_vocoder_path is None):
        parent_dirs = os.path.realpath(__file__).split(os.sep)
        dir_root = os.sep.join(
            parent_dirs[:parent_dirs.index("IdiapTTS") + 1])
        hparams.synth_vocoder_path = os.path.join(
            dir_root, "idiaptts", "misc", "pretrained",
            "r9y9wavenet_quantized_16k_world_feats_English.nn")

    # Default quantization is with mu=255.
    if not hasattr(hparams, "mu") or hparams.mu is None:
        hparams.add_hparam("mu", 255)

    if hasattr(hparams, 'frame_rate_output_Hz'):
        org_frame_rate_output_Hz = hparams.frame_rate_output_Hz
        hparams.frame_rate_output_Hz = 16000
    else:
        org_frame_rate_output_Hz = None
        hparams.add_hparam("frame_rate_output_Hz", 16000)

    if hasattr(hparams, 'bit_depth'):
        org_bit_depth = hparams.bit_depth
        hparams.bit_depth = 16
    else:
        org_bit_depth = None
        hparams.add_hparam("bit_depth", 16)

    try:
        synth_output = copy.copy(synth_output)

        if hparams.do_post_filtering:
            alpha = WorldFeatLabelGen.fs_to_mgc_alpha(hparams.synth_fs)
            for id_name, output in synth_output.items():
                coded_sp, lf0, vuv, bap = \
                    WorldFeatLabelGen.convert_to_world_features(
                        output,
                        contains_deltas=False,
                        num_coded_sps=hparams.num_coded_sps)
                coded_sp = merlin_post_filter(coded_sp, alpha)
                synth_output[id_name] = \
                    WorldFeatLabelGen.convert_from_world_features(
                        coded_sp, lf0, vuv, bap)

        Synthesiser.run_wavenet_vocoder(synth_output, hparams)
    finally:
        # Restore identifier. Done in `finally` so a failure above does not
        # leave the caller's hparams with the temporary overrides.
        hparams.setattr_no_type_check(
            "bit_depth", org_bit_depth)  # Can be None, thus no type check.
        hparams.setattr_no_type_check(
            "frame_rate_output_Hz", org_frame_rate_output_Hz)  # Can be None.
def test_merlin_post_filter():
    """Check the Merlin post-filter stage by stage against reference files.

    Each intermediate quantity is recomputed with pysptk and compared with
    the matching file under ``DATA_DIR/merlin_post_filter``; finally the
    output of ``merlin_post_filter`` is compared with the stored result.
    """
    root = join(DATA_DIR, "merlin_post_filter")

    def load(filename):
        # All reference data is stored as raw float32.
        return np.fromfile(join(root, filename), dtype=np.float32)

    mgc = load("arctic_b0539.mgc").reshape(-1, 60)
    weight = load("weight")
    alpha = 0.58
    minimum_phase_order = 511
    fftlen = 1024
    coef = 1.4

    weighted_mgc = mgc * weight

    # Step 1
    expected_r0 = load("arctic_b0539.mgc_r0")
    r0 = pysptk.c2acr(
        pysptk.freqt(mgc, minimum_phase_order, alpha=-alpha),
        0, fftlen).flatten()
    assert np.allclose(expected_r0, r0)

    # Step 2
    expected_p_r0 = load("arctic_b0539.mgc_p_r0")
    p_r0 = pysptk.c2acr(
        pysptk.freqt(weighted_mgc, minimum_phase_order, -alpha),
        0, fftlen).flatten()
    assert np.allclose(expected_p_r0, p_r0)

    # Step 3
    expected_b0 = load("arctic_b0539.mgc_b0")
    weighted_b = pysptk.mc2b(weighted_mgc, alpha)
    b0 = weighted_b[:, 0]
    assert np.allclose(expected_b0, b0)

    # Step 4
    expected_p_b0 = load("arctic_b0539.mgc_p_b0")
    p_b0 = np.log(r0 / p_r0) / 2 + b0
    assert np.allclose(expected_p_b0, p_b0)

    # Final step
    expected_p_mgc = load("arctic_b0539.mgc_p_mgc").reshape(-1, 60)
    p_mgc = pysptk.b2mc(
        np.hstack((p_b0[:, None], weighted_b[:, 1:])), alpha)
    assert np.allclose(expected_p_mgc, p_mgc)

    filtered_mgc = merlin_post_filter(
        mgc, alpha,
        coef=coef,
        weight=weight,
        minimum_phase_order=minimum_phase_order,
        fftlen=fftlen)
    assert np.allclose(filtered_mgc, expected_p_mgc, atol=1e-6)
def gen_waveform(y_predicted, do_postfilter=False):
    """Synthesize a waveform from predicted acoustic features with WORLD.

    ``alpha``, ``fftlen``, ``fs`` and ``frame_period`` are not parameters;
    they are read from the enclosing scope.

    Args:
        y_predicted: Predicted features; passed through ``trim_zeros_frames``
            and then ``gen_parameters``.
        do_postfilter: If True, apply ``merlin_post_filter`` to the
            mel-cepstrum.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    trimmed = trim_zeros_frames(y_predicted)

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(trimmed)
    if do_postfilter:
        mgc = merlin_post_filter(mgc, alpha)

    envelope = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    ap = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)

    # Zero out unvoiced frames, then exponentiate the remaining log-F0 values.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    return pyworld.synthesize(
        f0.flatten().astype(np.float64),
        envelope.astype(np.float64),
        ap.astype(np.float64),
        fs,
        frame_period,
    )
def world2wav(
        clf0, vuv, cap, fs, fbin,
        mcep=None, sp=None, frame_period=None, mcep_postfilter=False):
    """Synthesize a waveform from WORLD-style features.

    Args:
        clf0: Log-F0 contour; exponentiated here to obtain F0.
        vuv: Voiced/unvoiced values; frames with ``vuv > 0.5`` are voiced.
        cap: Coded aperiodicity, decoded with ``pyworld.decode_aperiodicity``.
            A non-2D input gets a trailing axis added.
        fs: Sampling rate.
        fbin: Number of spectral bins; the FFT length is ``fbin * 2 - 2``.
        mcep: Mel-cepstrum. Used only when ``sp`` is None.
        sp: Spectral envelope. Takes precedence over ``mcep`` when given.
        frame_period: Frame period for synthesis. Defaults to
            ``pyworld.default_frame_period``.
        mcep_postfilter: If True, apply ``merlin_post_filter`` to ``mcep``.

    Returns:
        The synthesized waveform, rescaled to a peak of 0.99 when its peak
        magnitude exceeds 0.99.

    Raises:
        ValueError: If neither ``mcep`` nor ``sp`` is given.
    """
    # Fail fast, before any decoding work is done.
    if sp is None and mcep is None:
        raise ValueError("Either `mcep` or `sp` must be provided.")

    # setup
    if frame_period is None:
        frame_period = pyworld.default_frame_period

    clf0 = np.ascontiguousarray(clf0.astype('float64'))
    vuv = np.ascontiguousarray(vuv > 0.5).astype('int')
    cap = np.ascontiguousarray(cap.astype('float64'))
    fft_len = fbin * 2 - 2
    alpha = pysptk.util.mcepalpha(fs)

    # clf0 -> f0 (unvoiced frames are multiplied by zero)
    f0 = np.squeeze(np.exp(clf0)) * np.squeeze(vuv)

    # cap -> ap
    if cap.ndim != 2:
        cap = np.expand_dims(cap, 1)
    ap = pyworld.decode_aperiodicity(cap, fs, fft_len)

    # mcep -> sp
    if sp is None:
        mcep = np.ascontiguousarray(mcep.astype('float64'))
        if mcep_postfilter:
            mcep = merlin_post_filter(mcep, alpha)
        sp = pysptk.mgc2sp(mcep, alpha=alpha, fftlen=fft_len)
        sp = np.abs(np.exp(sp)) ** 2
    else:
        sp = np.ascontiguousarray(sp)

    wave = pyworld.synthesize(f0, sp, ap, fs, frame_period=frame_period)

    # Only attenuate; quieter signals are left at their original level.
    scale = np.abs(wave).max()
    if scale > 0.99:
        wave = wave / scale * 0.99

    return wave
def svs(
    self,
    labels,
    vocoder_type="world",
    post_filter_type="merlin",
    trajectory_smoothing=True,
    trajectory_smoothing_cutoff=50,
    vuv_threshold=0.1,
    vibrato_scale=1.0,
    return_states=False,
    force_fix_vuv=True,
    post_filter=None,
):
    """Synthesize waveform given HTS-style labels

    Args:
        labels (nnmnkwii.io.HTSLabelFile): HTS-style labels
        vocoder_type (str): Vocoder type. world or pwg
        post_filter_type (str): Post-filter type. One of merlin, nnsvs, gv
            or none.
        trajectory_smoothing (bool): If True, low-pass filter every mgc and
            bap dimension along time.
        trajectory_smoothing_cutoff: Cutoff passed to ``lowpass_filter``.
        vuv_threshold (float): V/UV threshold forwarded to the parameter
            generation helpers and used to binarize V/UV for the pwg vocoder.
        vibrato_scale (float): Forwarded to ``gen_spsvs_static_features``.
        return_states (bool): If True, also return a dict of intermediate
            features.
        force_fix_vuv (bool): Forwarded to ``gen_spsvs_static_features``.
        post_filter (bool): Deprecated. When not None it overrides
            ``post_filter_type`` with merlin (True) or none (False).

    Returns:
        tuple: (synthesized waveform, sampling rate), or
        (synthesized waveform, sampling rate, states) if ``return_states``
        is True.

    Raises:
        ValueError: If ``vocoder_type`` or ``post_filter_type`` is unknown,
            or if pwg is requested while ``self.vocoder`` is None.
    """
    vocoder_type = vocoder_type.lower()
    if vocoder_type not in ["world", "pwg"]:
        raise ValueError(f"Unknown vocoder type: {vocoder_type}")
    if post_filter_type not in ["merlin", "nnsvs", "gv", "none"]:
        raise ValueError(f"Unknown post-filter type: {post_filter_type}")

    if vocoder_type == "pwg" and self.vocoder is None:
        raise ValueError(
            "Pre-trained vocodr model is not found. "
            "WORLD is only supported for waveform generation")

    if post_filter is not None:
        warn("post_filter is deprecated. Use post_filter_type instead.")
        post_filter_type = "merlin" if post_filter else "none"

    # Time-lag
    lag = predict_timelag(
        self.device,
        labels,
        self.timelag_model,
        self.timelag_config,
        self.timelag_in_scaler,
        self.timelag_out_scaler,
        self.binary_dict,
        self.numeric_dict,
        self.pitch_indices,
        self.config.log_f0_conditioning,
        self.config.timelag.allowed_range,
        self.config.timelag.allowed_range_rest,
        self.config.timelag.force_clip_input_features,
    )

    # Duration predictions
    durations = predict_duration(
        self.device,
        labels,
        self.duration_model,
        self.duration_config,
        self.duration_in_scaler,
        self.duration_out_scaler,
        self.binary_dict,
        self.numeric_dict,
        self.pitch_indices,
        self.config.log_f0_conditioning,
        self.config.duration.force_clip_input_features,
    )

    # Normalize phoneme durations
    duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        self.device,
        duration_modified_labels,
        self.acoustic_model,
        self.acoustic_config,
        self.acoustic_in_scaler,
        self.acoustic_out_scaler,
        self.binary_dict,
        self.numeric_dict,
        self.config.acoustic.subphone_features,
        self.pitch_indices,
        self.config.log_f0_conditioning,
        self.config.acoustic.force_clip_input_features,
    )

    # Apply GV post-filtering
    if post_filter_type in ["nnsvs", "gv"]:
        static_stream_sizes = get_static_stream_sizes(
            self.acoustic_config.stream_sizes,
            self.acoustic_config.has_dynamic_features,
            self.acoustic_config.num_windows,
        )
        # mgc
        mgc_end_dim = static_stream_sizes[0]
        acoustic_features[:, :mgc_end_dim] = variance_scaling(
            self.postfilter_out_scaler.var_.reshape(-1)[:mgc_end_dim],
            acoustic_features[:, :mgc_end_dim],
            offset=2,
        )
        # bap
        bap_start_dim = sum(static_stream_sizes[:3])
        bap_end_dim = sum(static_stream_sizes[:4])
        acoustic_features[:, bap_start_dim:bap_end_dim] = variance_scaling(
            self.postfilter_out_scaler.var_.reshape(-1)
            [bap_start_dim:bap_end_dim],
            acoustic_features[:, bap_start_dim:bap_end_dim],
            offset=0,
        )

    # Learned post-filter using nnsvs
    if post_filter_type == "nnsvs" and self.postfilter_model is not None:
        in_feats = torch.from_numpy(acoustic_features).float().unsqueeze(0)
        in_feats = (
            self.postfilter_out_scaler.transform(in_feats).float().to(
                self.device))
        out_feats = self.postfilter_model.inference(
            in_feats, [in_feats.shape[1]])
        acoustic_features = (self.postfilter_out_scaler.inverse_transform(
            out_feats.cpu()).squeeze(0).numpy())

    # Generate WORLD parameters
    mgc, lf0, vuv, bap = gen_spsvs_static_features(
        duration_modified_labels,
        acoustic_features,
        self.binary_dict,
        self.numeric_dict,
        self.acoustic_config.stream_sizes,
        self.acoustic_config.has_dynamic_features,
        self.config.acoustic.subphone_features,
        self.pitch_idx,
        self.acoustic_config.num_windows,
        self.config.frame_period,
        self.config.acoustic.relative_f0,
        vibrato_scale=vibrato_scale,
        vuv_threshold=vuv_threshold,
        force_fix_vuv=force_fix_vuv,
    )

    # NOTE: spectral enhancement based on the Merlin's post-filter
    # implementation
    if post_filter_type == "merlin":
        alpha = pysptk.util.mcepalpha(self.config.sample_rate)
        mgc = merlin_post_filter(mgc, alpha)

    # Remove high-frequency components of mgc/bap
    # NOTE: It seems to be effective to suppress artifacts of GAN-based
    # post-filtering
    if trajectory_smoothing:
        # Frame rate of the parameter trajectories in Hz. The frame period
        # is the same value handed to pyworld.synthesize below, which takes
        # milliseconds, so the two stay consistent for non-5 ms configs.
        modfs = int(round(1000 / self.config.frame_period))
        for d in range(mgc.shape[1]):
            mgc[:, d] = lowpass_filter(mgc[:, d], modfs,
                                       cutoff=trajectory_smoothing_cutoff)
        for d in range(bap.shape[1]):
            bap[:, d] = lowpass_filter(bap[:, d], modfs,
                                       cutoff=trajectory_smoothing_cutoff)

    # Waveform generation by (1) WORLD or (2) neural vocoder
    if vocoder_type == "world":
        f0, spectrogram, aperiodicity = gen_world_params(
            mgc, lf0, vuv, bap, self.config.sample_rate,
            vuv_threshold=vuv_threshold)
        wav = pyworld.synthesize(
            f0,
            spectrogram,
            aperiodicity,
            self.config.sample_rate,
            self.config.frame_period,
        )
    elif vocoder_type == "pwg":
        # NOTE: So far vocoder models are trained on binary V/UV features
        vuv = (vuv > vuv_threshold).astype(np.float32)
        voc_inp = (torch.from_numpy(
            self.vocoder_in_scaler.transform(
                np.concatenate([mgc, lf0, vuv, bap],
                               axis=-1))).float().to(self.device))
        wav = self.vocoder.inference(voc_inp).view(-1).to("cpu").numpy()

    wav = self.post_process(wav)

    if return_states:
        states = {
            "mgc": mgc,
            "lf0": lf0,
            "vuv": vuv,
            "bap": bap,
        }
        if vocoder_type == "world":
            states.update({
                "f0": f0,
                "spectrogram": spectrogram,
                "aperiodicity": aperiodicity,
            })
        return wav, self.config.sample_rate, states

    return wav, self.config.sample_rate
def gen_waveform(labels, acoustic_features, binary_dict, continuous_dict,
                 stream_sizes, has_dynamic_features,
                 subphone_features="coarse_coding", log_f0_conditioning=True,
                 pitch_idx=None, num_windows=3, post_filter=True,
                 sample_rate=48000, frame_period=5, relative_f0=True):
    """Generate WORLD parameters and a waveform from acoustic features.

    Args:
        labels: Labels used to extract the score pitch when ``relative_f0``
            is True.
        acoustic_features: Multi-stream acoustic features. No MLPG is run
            here; the features are split using static stream sizes.
        binary_dict: Question dictionary for ``fe.linguistic_features``.
        continuous_dict: Question dictionary for ``fe.linguistic_features``.
        stream_sizes: Sizes of the feature streams.
        has_dynamic_features: Per-stream flags; if any is set, static stream
            sizes are derived with ``get_static_stream_sizes``.
        subphone_features: Forwarded to ``fe.linguistic_features``.
        log_f0_conditioning: Not referenced in this function.
        pitch_idx: Forwarded to ``_midi_to_hz``.
        num_windows: Number of windows requested from ``get_windows``.
        post_filter: If True, apply ``merlin_post_filter`` to the mel-cepstrum.
        sample_rate: Sampling rate used for analysis constants and synthesis.
        frame_period: Frame period forwarded to ``pyworld.synthesize``.
        relative_f0: If True, the F0 stream is a log-F0 offset from the score
            pitch; otherwise it is log-F0 itself.

    Returns:
        tuple: ``(f0, sp, bap, generated_waveform)`` where ``sp`` is the
        scaled spectrogram re-coded with ``pyworld.code_spectral_envelope``.
        The scaling is applied after synthesis, so it does not affect
        ``generated_waveform``.
    """
    windows = get_windows(num_windows)

    # Work out the static stream sizes (MLPG itself is not applied here)
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features,
                                             static_stream_sizes)

    # Generate the waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               sample_rate, fftlen)

    # fill aperiodicity with ones for unvoiced regions
    unvoiced_frames = vuv.reshape(-1) < 0.5
    aperiodicity[unvoiced_frames, :] = 1.0
    # WORLD fails catastrophically for out of range aperiodicity
    aperiodicity = np.clip(aperiodicity, 0.0, 1.0)

    # F0: obtain log-F0 first, then convert it the same way in both modes.
    if relative_f0:
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(
            labels, binary_dict, continuous_dict,
            add_frame_features=True, subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        score_voiced = np.nonzero(lf0_score)
        lf0_score[score_voiced] = np.log(f0_score[score_voiced])
        lf0_score = interp1d(lf0_score, kind="slinear")
        f0 = target_f0 + lf0_score
    else:
        f0 = target_f0

    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    # Reduce the volume (to prevent clipping).
    # TODO: tune the multiplier constant used here.
    spectrogram *= 0.000000001
    sp = pyworld.code_spectral_envelope(spectrogram, sample_rate, 60)

    return f0, sp, bap, generated_waveform
def synthesis(
    config,
    device,
    label_path,
    question_path,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    duration_model,
    duration_config,
    duration_in_scaler,
    duration_out_scaler,
    acoustic_model,
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
):
    """Run the time-lag, duration and acoustic models and synthesize audio.

    Labels are loaded from ``label_path`` and the question set from
    ``question_path``. When ``config.ground_truth_duration`` is set, the
    loaded labels are used as-is; otherwise time-lag and duration are
    predicted and applied to the labels. The acoustic model output is then
    converted to WORLD parameters and synthesized with ``pyworld``.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """

    def clip_setting(section):
        # setting True by default for backward compatibility
        if "force_clip_input_features" in section:
            return section.force_clip_input_features
        return True

    # load labels and question
    labels = hts.load(label_path).round_()
    binary_dict, numeric_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    # TODO: configurable
    num_binary = len(binary_dict)
    pitch_idx = num_binary + 1
    pitch_indices = np.arange(num_binary, num_binary + 3)

    log_f0_conditioning = config.log_f0_conditioning

    # Clipping settings
    timelag_clip_input_features = clip_setting(config.timelag)
    duration_clip_input_features = clip_setting(config.duration)
    acoustic_clip_input_features = clip_setting(config.acoustic)

    if config.ground_truth_duration:
        # Use provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag
        lag = predict_timelag(
            device,
            labels,
            timelag_model,
            timelag_config,
            timelag_in_scaler,
            timelag_out_scaler,
            binary_dict,
            numeric_dict,
            pitch_indices,
            log_f0_conditioning,
            config.timelag.allowed_range,
            config.timelag.allowed_range_rest,
            timelag_clip_input_features,
        )

        # Duration predictions
        durations = predict_duration(
            device,
            labels,
            duration_model,
            duration_config,
            duration_in_scaler,
            duration_out_scaler,
            binary_dict,
            numeric_dict,
            pitch_indices,
            log_f0_conditioning,
            duration_clip_input_features,
        )

        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device,
        duration_modified_labels,
        acoustic_model,
        acoustic_config,
        acoustic_in_scaler,
        acoustic_out_scaler,
        binary_dict,
        numeric_dict,
        config.acoustic.subphone_features,
        pitch_indices,
        log_f0_conditioning,
        acoustic_clip_input_features,
    )

    # Generate WORLD parameters
    mgc, lf0, vuv, bap = gen_spsvs_static_features(
        duration_modified_labels,
        acoustic_features,
        binary_dict,
        numeric_dict,
        acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features,
        pitch_idx,
        acoustic_config.num_windows,
        config.frame_period,
        config.acoustic.relative_f0,
        config.vibrato_scale,
    )

    if config.acoustic.post_filter:
        alpha = pysptk.util.mcepalpha(config.sample_rate)
        mgc = merlin_post_filter(mgc, alpha)

    f0, spectrogram, aperiodicity = gen_world_params(
        mgc, lf0, vuv, bap, config.sample_rate)

    return pyworld.synthesize(f0, spectrogram, aperiodicity,
                              config.sample_rate, config.frame_period)