def test_labels_number_of_frames():
    """Frame counts must agree between labels and extracted features.

    Regression test for https://github.com/r9y9/nnmnkwii/issues/85
    """
    binary_dict, continuous_dict = hts.load_question_set(join(DATA_DIR, "jp.hed"))
    hts_labels = hts.load(join(DATA_DIR, "BASIC5000_0619.lab"))
    feats = fe.linguistic_features(
        hts_labels, binary_dict, continuous_dict, add_frame_features=True
    )
    assert hts_labels.num_frames() == feats.shape[0]
def gen_waveform(labels, acoustic_features, acoustic_out_scaler, binary_dict, continuous_dict, stream_sizes, has_dynamic_features, subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None, num_windows=3, post_filter=True, sample_rate=48000, frame_period=5, relative_f0=True): windows = get_windows(num_windows) # Apply MLPG if necessary if np.any(has_dynamic_features): acoustic_features = multi_stream_mlpg( acoustic_features, acoustic_out_scaler.var_, windows, stream_sizes, has_dynamic_features) static_stream_sizes = get_static_stream_sizes( stream_sizes, has_dynamic_features, len(windows)) else: static_stream_sizes = stream_sizes # Split multi-stream features mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes) # Gen waveform by the WORLD vocodoer fftlen = pyworld.get_cheaptrick_fft_size(sample_rate) alpha = pysptk.util.mcepalpha(sample_rate) if post_filter: mgc = merlin_post_filter(mgc, alpha) spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha) aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen) ### F0 ### if relative_f0: diff_lf0 = target_f0 # need to extract pitch sequence from the musical score linguistic_features = fe.linguistic_features(labels, binary_dict, continuous_dict, add_frame_features=True, subphone_features=subphone_features) f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None] lf0_score = f0_score.copy() nonzero_indices = np.nonzero(lf0_score) lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices]) lf0_score = interp1d(lf0_score, kind="slinear") f0 = diff_lf0 + lf0_score f0[vuv < 0.5] = 0 f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)]) else: f0 = target_f0 generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64), spectrogram.astype(np.float64), aperiodicity.astype(np.float64), sample_rate, frame_period) return generated_waveform
def collect_features(self, path):
    """Load an HTS label file and return its linguistic features as float32."""
    hts_labels = hts.load(path)
    feats = fe.linguistic_features(
        hts_labels,
        self.binary_dict,
        self.continuous_dict,
        add_frame_features=self.add_frame_features,
        subphone_features=self.subphone_features,
    )
    if self.log_f0_conditioning:
        # Replace score MIDI-note columns with interpolated log-F0 in place.
        for pitch_feature_idx in self.pitch_idx:
            feats[:, pitch_feature_idx] = interp1d(
                _midi_to_hz(feats, pitch_feature_idx, True), kind="slinear"
            )
    return feats.astype(np.float32)
def test_correct_vuv_by_phone():
    """correct_vuv_by_phone should force V/UV to agree with the phone class.

    Rest and unvoiced phones must be marked unvoiced (< 0.5) and voiced
    phones voiced (> 0.5) after correction.
    """
    wav_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.wav"
    lab_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.lab"
    binary_dict, numeric_dict = hts.load_question_set(
        Path(__file__).parent / "data" / "jp_test.hed"
    )
    labels = hts.load(lab_path)
    sr, wav = wavfile.read(wav_path)
    wav = wav.astype(np.float64)
    assert sr == 48000

    out_feats, stream_sizes = _extract_static_feats(wav, sr)
    has_dynamic_features = [False] * len(stream_sizes)
    # Score pitch (MIDI note) feature sits right after the binary features.
    pitch_idx = len(binary_dict) + 1

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features="coarse_coding",
    )

    params = {
        "labels": labels,
        "acoustic_features": out_feats,
        "binary_dict": binary_dict,
        "numeric_dict": numeric_dict,
        "stream_sizes": stream_sizes,
        "has_dynamic_features": has_dynamic_features,
        "pitch_idx": pitch_idx,
        "relative_f0": False,
        "frame_period": 5,
    }

    # Column of the V/UV stream inside the static feature matrix.
    out_vuv_idx = 61
    vuv = out_feats[:, out_vuv_idx : out_vuv_idx + 1]
    vuv_corrected = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)

    # by correcting VUV should make a difference
    # NOTE(review): vuv_fixed is never asserted against — confirm intent.
    _, _, vuv_fixed, _ = gen_spsvs_static_features(**{**params, "force_fix_vuv": True})
    assert np.any(vuv_corrected != vuv)

    # 0: Rest 1: Voiced 2: Unvoiced
    rest_idx = 0
    voiced_idx = 1
    unvoiced_idx = 2
    assert np.all(vuv_corrected[linguistic_features[:, rest_idx] > 0] < 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, voiced_idx] > 0] > 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, unvoiced_idx] > 0] < 0.5)
def predict_timelag(device, labels, timelag_model, timelag_in_scaler,
                    timelag_out_scaler, binary_dict, continuous_dict,
                    pitch_indices=None, log_f0_conditioning=True,
                    allowed_range=(-30, 30)):
    """Predict note-level time-lag from musical/linguistic context.

    Fix: the default for ``allowed_range`` was a mutable list; an immutable
    tuple is used instead (it is only ever indexed, so behavior is the same).

    Args:
        device: torch device to run the model on.
        labels: HTS full-context labels (modified in place by ``round_``).
        timelag_model: trained model taking (x, lengths).
        timelag_in_scaler / timelag_out_scaler: input/output scalers.
        binary_dict, continuous_dict: question-set dictionaries.
        pitch_indices: indices of score pitch features; required when
            ``log_f0_conditioning`` is True.
        log_f0_conditioning (bool): convert MIDI notes to continuous log-F0.
        allowed_range (sequence): (min, max) clip range in frames.

    Returns:
        ndarray: predicted time-lag in 100 ns units.

    Raises:
        ValueError: if log-F0 conditioning is requested without
            ``pitch_indices``.
    """
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict, continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)
    y = timelag_model(x, [x.shape[1]]).squeeze(0).cpu()

    # De-normalization and rounding
    lag = np.round(timelag_out_scaler.inverse_transform(y.data.numpy()))

    # Clip to the allowed range
    lag = np.clip(lag, allowed_range[0], allowed_range[1])

    # frames -> 100 ns
    lag *= 50000

    return lag
def collect_features(self, path):
    """Extract linguistic features from a label file, silence removed.

    Fix: ``np.int`` (a deprecated alias of the builtin ``int``, removed in
    NumPy 1.24) is replaced with ``int``; the resulting index dtype is
    identical.

    Args:
        path: path to an HTS label file.

    Returns:
        ndarray: float32 feature matrix with silence rows deleted.
    """
    labels = hts.load(path)
    features = fe.linguistic_features(
        labels, self.binary_dict, self.continuous_dict,
        add_frame_features=self.add_frame_features,
        subphone_features=self.subphone_features)
    if self.add_frame_features:
        # Frame-level features: drop silence *frames*.
        indices = labels.silence_frame_indices().astype(int)
    else:
        # Phone-level features: drop silence *phones*.
        indices = labels.silence_phone_indices()
    features = np.delete(features, indices, axis=0)
    return features.astype(np.float32)
def tts_from_label(models, label_path, X_min, X_max, Y_mean, Y_std,
                   post_filter=False, apply_duration_model=True, fs=16000):
    """Run the duration + acoustic TTS pipeline on a single label file.

    Fix: the bare ``except:`` around the feed-forward model call also
    swallowed ``KeyboardInterrupt``/``SystemExit``; narrowed to
    ``except Exception``.

    Args:
        models (dict): {"duration": ..., "acoustic": ...} trained models.
        label_path: path to an HTS label file.
        X_min, X_max: per-type input min/max statistics.
        Y_mean, Y_std: per-type output statistics for de-normalization.
        post_filter (bool): forwarded to gen_waveform.
        apply_duration_model (bool): predict durations; otherwise use the
            durations already present in the label file.
        fs (int): sampling rate.

    Returns:
        The waveform produced by gen_waveform.
    """
    duration_model, acoustic_model = models["duration"], models["acoustic"]

    # Predict durations
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(label_path, duration_model,
                                                    X_min, X_max, Y_mean, Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)

    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(linguistic_features,
                                         X_min[ty], X_max[ty],
                                         feature_range=(0.01, 0.99))

    # Predict acoustic features
    acoustic_model = acoustic_model.cpu()
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    try:
        # Feed-forward models take (T, D) input directly.
        acoustic_predicted = acoustic_model(x).data.numpy()
    except Exception:
        # Recurrent models need (batch, time, dim) input plus lengths.
        xl = len(x)
        x = x.view(1, -1, x.size(-1))
        acoustic_predicted = acoustic_model(x, [xl]).data.numpy()
    acoustic_predicted = acoustic_predicted.reshape(
        -1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted, Y_mean, Y_std, post_filter, fs=fs)
def lab2wav(args, device, label_path, binary_dict, continuous_dict,
            X_min, X_max, Y_mean, Y_var, Y_scale,
            duration_model, acoustic_model, post_filter=False):
    """Synthesize a waveform from an HTS label file.

    Pipeline: duration prediction -> frame-level linguistic features ->
    silence trimming -> min-max normalization -> acoustic model ->
    de-normalization -> waveform generation.

    Args:
        args: CLI args; ``args.label`` selects the subphone feature type.
        device: torch device (forwarded to gen_duration).
        label_path: path to the HTS label file.
        binary_dict, continuous_dict: question-set dictionaries.
        X_min, X_max: per-type input min/max statistics.
        Y_mean, Y_var, Y_scale: per-type output statistics.
        duration_model, acoustic_model: trained models.
        post_filter (bool): forwarded to gen_waveform.

    Returns:
        The waveform produced by gen_waveform.
    """
    # Predict durations
    duration_modified_hts_labels = gen_duration(device, label_path,
                                                binary_dict, continuous_dict,
                                                X_min, X_max, Y_mean, Y_scale,
                                                duration_model)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=True,
        subphone_features="full" if args.label == 'state_align' else "coarse_coding")

    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = minmax_scale(linguistic_features,
                                       X_min[ty], X_max[ty],
                                       feature_range=(0.01, 0.99))

    # Predict acoustic features
    # acoustic_model = acoustic_model.cpu()
    acoustic_model.eval()
    x = torch.FloatTensor(linguistic_features)
    acoustic_predicted = acoustic_model(x.unsqueeze(0)).data.numpy()
    print("acoustic_predicted shape: {}".format(acoustic_predicted.shape))

    # Apply denormalization
    acoustic_predicted = acoustic_predicted * Y_scale[ty] + Y_mean[ty]

    return gen_waveform(acoustic_predicted.squeeze(0), Y_var, post_filter)
def test_phone_alignment_label():
    """Phone-alignment labels should yield finite features in every mode."""
    qs_path = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_path)
    labels = hts.load(join(DATA_DIR, "label_phone_align", "arctic_a0001.lab"))

    # Phone-level features (no frame expansion, no subphone features)
    feats = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                   add_frame_features=False,
                                   subphone_features=None)
    assert not labels.is_state_alignment_label()
    assert np.all(np.isfinite(feats))

    # Frame-level features with each supported subphone feature type
    for subphone_features in ("coarse_coding", "minimal_phoneme"):
        feats = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                       add_frame_features=True,
                                       subphone_features=subphone_features)
        assert np.all(np.isfinite(feats))

    durations = fe.duration_features(labels)
    assert np.all(np.isfinite(durations))
def predict_duration(device, labels, duration_model, duration_in_scaler,
                     duration_out_scaler, lag, binary_dict, continuous_dict,
                     pitch_indices=None, log_f0_conditioning=True):
    """Predict phone durations from musical/linguistic features.

    Args:
        device: torch device to run the model on.
        labels: HTS full-context labels.
        duration_model: trained model taking (x, lengths).
        duration_in_scaler / duration_out_scaler: input/output scalers.
        lag: predicted time-lag; NOTE(review): accepted but not used in
            this body — confirm intent.
        binary_dict, continuous_dict: question-set dictionaries.
        pitch_indices: indices of score pitch features to convert to log-F0.
        log_f0_conditioning (bool): convert MIDI notes to continuous log-F0.

    Returns:
        ndarray: rounded predicted durations, clamped to >= 1 frame.
    """
    # Get note indices
    note_indices = get_note_indices(labels)
    # append the end of note
    note_indices.append(len(labels))
    # NOTE(review): note_indices is not referenced below — confirm whether
    # note-level segmentation was intended here.

    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict, continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        # Replace score MIDI-note columns with interpolated log-F0.
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    pred_durations = duration_model(
        x, [x.shape[1]]).squeeze(0).cpu().data.numpy()

    # Apply denormalization
    pred_durations = duration_out_scaler.inverse_transform(pred_durations)
    # Clamp to at least one frame, then round to integer frame counts.
    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
def test_linguistic_features_for_acoustic_model():
    """Frame-level linguistic features must match the precomputed binary dump."""
    qs_path = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_path)

    # Acoustic models are trained on frame-level linguistic features,
    # so state-alignment labels are expanded with full subphone features.
    labels = hts.load(join(DATA_DIR, "label_state_align", "arctic_a0001.lab"))
    assert labels.is_state_alignment_label()

    extracted = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                       add_frame_features=True,
                                       subphone_features="full")
    expected = np.fromfile(
        join(DATA_DIR, "binary_label_425", "arctic_a0001.lab"),
        dtype=np.float32,
    ).reshape(-1, extracted.shape[-1])
    assert np.allclose(extracted, expected)
def tts_from_label(models, label_path, X_min, X_max, Y_mean, Y_std,
                   post_filter=False, apply_duration_model=True, coef=1.4,
                   fs=16000, mge_training=True):
    """Run the duration + acoustic TTS pipeline (CUDA-aware variant).

    Args:
        models (dict): {"duration": ..., "acoustic": ...} trained models.
        label_path: path to an HTS label file.
        X_min, X_max: per-type input min/max statistics.
        Y_mean, Y_std: per-type output statistics.
        post_filter (bool): forwarded to gen_waveform.
        apply_duration_model (bool): predict durations; otherwise use the
            durations in the label file.
        coef (float): forwarded to gen_waveform.
        fs (int): sampling rate.
        mge_training (bool): forwarded to gen_waveform.

    Returns:
        The waveform produced by gen_waveform.
    """
    duration_model, acoustic_model = models["duration"], models["acoustic"]
    if use_cuda:
        duration_model = duration_model.cuda()
        acoustic_model = acoustic_model.cuda()

    # Predict durations
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(
            label_path, duration_model, X_min, X_max, Y_mean, Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)

    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(
        linguistic_features, X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Predict acoustic features
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    # NOTE(review): hp_duration is passed here even though this is the
    # acoustic stage — confirm this is intentional.
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    acoustic_predicted = acoustic_model(x, [xl]).data.cpu().numpy()
    acoustic_predicted = acoustic_predicted.reshape(-1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted, Y_mean, Y_std, post_filter,
                        coef=coef, fs=fs, mge_training=mge_training)
def predict_acoustic(device, labels, acoustic_model, acoustic_in_scaler,
                     acoustic_out_scaler, binary_dict, continuous_dict,
                     subphone_features="coarse_coding",
                     pitch_indices=None, log_f0_conditioning=True):
    """Predict frame-level acoustic features from HTS labels.

    Args:
        device: torch device to run the model on.
        labels: HTS full-context labels.
        acoustic_model: trained model taking (x, lengths).
        acoustic_in_scaler / acoustic_out_scaler: input/output scalers.
        binary_dict, continuous_dict: question-set dictionaries.
        subphone_features (str): subphone feature type.
        pitch_indices: indices of score pitch features to convert to log-F0.
        log_f0_conditioning (bool): convert MIDI notes to continuous log-F0.

    Returns:
        ndarray: de-normalized acoustic feature matrix (T, D).
    """
    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict, continuous_dict,
        add_frame_features=True,
        subphone_features=subphone_features)

    if log_f0_conditioning:
        # Replace score MIDI-note columns with interpolated log-F0.
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(_midi_to_hz(
                linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range
        linguistic_features = np.clip(linguistic_features,
                                      acoustic_in_scaler.feature_range[0],
                                      acoustic_in_scaler.feature_range[1])

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    pred_acoustic = acoustic_model(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()

    # Apply denormalization
    pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)

    return pred_acoustic
def get_acoustic_parameter(self, label):
    """Run the acoustic model on a label and return de-scaled predictions."""
    self.acoustic_model.eval()
    self.acoustic_model.to(self.device)

    silence_idx = label.silence_frame_indices()
    feats = linguistic_features(
        label,
        self.bin_dict,
        self.con_dict,
        add_frame_features=True,
        subphone_features=self.config.subphone_feature,
    )
    # Drop silence frames, then scale inputs to the model's range.
    feats = np.delete(feats, silence_idx, axis=0)
    feats = self._get_x_scaled(self.acoustic_dataset, feats)

    prediction = self.get_predicted(self.acoustic_model, feats)
    prediction = self._get_t_scaled(self.acoustic_dataset, prediction)
    return prediction.reshape(-1, prediction.shape[-1])
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    """Predict durations for a label file and return labels with them set.

    Fixes: the bare ``except:`` also swallowed ``KeyboardInterrupt``/
    ``SystemExit`` (narrowed to ``except Exception``), and the garbled
    "normali--post-filterzation" comment is repaired.

    Args:
        label_path: path to an HTS label file.
        duration_model: trained duration model (run on CPU here).
        X_min, X_max: per-type input min/max statistics.
        Y_mean, Y_std: per-type output statistics.

    Returns:
        HTS labels with predicted durations applied.
    """
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply normalization
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(duration_linguistic_features,
                                                  X_min[ty], X_max[ty],
                                                  feature_range=(0.01, 0.99))

    # Apply model (on CPU)
    duration_model = duration_model.cpu()
    duration_model.eval()

    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    try:
        # Feed-forward models take (T, D) input directly.
        duration_predicted = duration_model(x).data.numpy()
    except Exception:
        # Recurrent models need (batch, time, dim) input plus lengths.
        xl = len(x)
        x = x.view(1, -1, x.size(-1))
        duration_predicted = duration_model(x, [xl]).data.numpy()
    # No-op for 2D output; flattens any extra batch dimension.
    duration_predicted = duration_predicted.reshape(
        -1, duration_predicted.shape[-1])

    # Apply denormalization
    duration_predicted = duration_predicted * Y_std[ty] + Y_mean[ty]
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
def gen_parameters(self, utt_id, labels):
    """Predict static acoustic features for one utterance via MLPG.

    Fix: the input normalization referenced a bare ``scaler`` name while the
    de-normalization below uses ``self.scaler`` — the missing ``self.`` is
    restored for consistency.

    Args:
        utt_id: utterance id used to look up the speaker code.
        labels: HTS full-context labels.

    Returns:
        ndarray: concatenated [mgc | lf0 | vuv | bap] static features.
    """
    feature = fe.linguistic_features(
        labels, self.binary_dict, self.continuous_dict,
        add_frame_features=True,
        subphone_features='coarse_coding').astype(np.float32)
    # normalize (FIX: was `scaler[...]`, inconsistent with self.scaler below)
    feature = self.scaler['X']['acoustic'].transform(feature)
    # add speaker information
    feature = self.add_speaker_code(utt_id, feature)
    # predict acoustic features
    # NOTE(review): `device` appears to be a module-level global — confirm.
    feature = torch.from_numpy(feature).to(device)
    pred = self.acoustic_model.predict(feature)
    pred_mean = pred['mean'].data.cpu().numpy()
    pred_var = pred['var'].data.cpu().numpy()
    # denormalize (variances scale with the square of the feature scale)
    scale = self.scaler['Y']['acoustic'].scale_
    pred_mean = self.scaler['Y']['acoustic'].inverse_transform(pred_mean)
    pred_var *= scale ** 2
    # split acoustic features
    mgc = pred_mean[:, :self.lf0_start_idx]
    lf0 = pred_mean[:, self.lf0_start_idx:self.vuv_start_idx]
    vuv = pred_mean[:, self.vuv_start_idx]
    bap = pred_mean[:, self.bap_start_idx:]
    # make variances for Maximum Likelihood Parameter Generation (MLPG)
    mgc_variances = pred_var[:, :self.lf0_start_idx]
    lf0_variances = pred_var[:, self.lf0_start_idx:self.vuv_start_idx]
    bap_variances = pred_var[:, self.bap_start_idx:]
    # perform MLPG to calculate static features
    mgc = mlpg(mgc, mgc_variances, self.windows)
    lf0 = mlpg(lf0, lf0_variances, self.windows)
    bap = mlpg(bap, bap_variances, self.windows)
    feature = np.hstack([mgc, lf0, vuv.reshape(-1, 1), bap])
    return feature
def test_singing_voice_question():
    """SVS-style question sets (QS + CQS) are parsed and applied correctly.

    QS "L-Phone_Yuusei_Boin" {*^a-*,*^i-*,*^u-*,*^e-*,*^o-*}
    CQS "e1" {/E:(\\NOTE)]}
    """
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_jp_svs.hed"), append_hat_for_LL=False)
    labels = hts.load(join(DATA_DIR, "song070_f00001_063.lab"))

    feats = fe.linguistic_features(labels, binary_dict, continuous_dict)
    assert feats.shape == (74, 2)

    # CQS e1: the continuous column should hold the current MIDI note number.
    note_regex = continuous_dict[0]
    for frame_idx, lab in enumerate(labels):
        context = lab[-1]
        if note_regex.search(context) is not None:
            from nnmnkwii.frontend import NOTE_MAPPING
            assert NOTE_MAPPING[note_regex.findall(context)[0]] == feats[frame_idx, 1]
def get_duration_label(self, path):
    """Predict phone durations for a label file and return the updated label."""
    label = hts.load(path)
    self.duration_model.eval()

    feats = linguistic_features(
        label,
        self.bin_dict,
        self.con_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)
    feats = self._get_x_scaled(self.duration_dataset, feats)

    self.duration_model.to(self.device)
    prediction = self.get_predicted(self.duration_model, feats)
    prediction = self._get_t_scaled(self.duration_dataset, prediction)

    # Round to integer frame counts and clamp to at least one frame.
    prediction = np.round(prediction)
    prediction[prediction <= 0] = 1
    label.set_durations(prediction)
    return label
def collect_features(self, wav_path, label_path): d,fs=librosa.load(wav_path,sr=sample_rate) #audio, _ = librosa.effects.trim( # audio,top_db=config["trim_threshold_in_db"],frame_length=config["trim_frame_size"],hop_length=config["trim_hop_size"] ) D = librosa.stft( d, n_fft=fft_len, hop_length=hop_size, win_length=None,window=window, pad_mode="reflect" ) S, _ = librosa.magphase(D) mel_basis = librosa.filters.mel(sr=fs,n_fft=fft_len,n_mels=mel_dim, fmin=fmin, fmax=fmax ) #mel_basis=librosa.effects.feature.melspectrogram(d,sr=fs,n_fft=fft_len,hop_length=hop_size,n_mels=mel_dim,fmin=0,htk=True) mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T #features=features[None,:,:] _f0, t = pyworld.dio(d.astype(np.double), fs=sample_rate, f0_ceil=fmax, frame_period=frame_period ) f0 = pyworld.stonemask(d.astype(np.double), _f0, t, sample_rate) # extract energy labels = _hts.load(label_path) features = fe.linguistic_features(labels, self.binary_dict, self.continuous_dict,add_frame_features=True,subphone_features='coarse_coding',frame_shift_in_micro_sec=frame_shift_in_micro_sec) num_frames=labels.num_frames(frame_shift_in_micro_sec=frame_shift_in_micro_sec) indices = labels.silence_frame_indices(frame_shift_in_micro_sec=frame_shift_in_micro_sec) #print(fs, wav_path, mel.shape[0],labels.num_frames()) mel = mel[:num_frames] if len(f0) >= len(mel): f0 = f0[: len(mel)] else: f0 = np.pad(f0, (0, len(mel) - len(f0))) energy = np.sqrt(np.sum(S ** 2, axis=0)) energy=energy[: len(mel)] assert (len(mel) == len(f0) == len(energy)),"error:%s,%s,%s,%s" %(wav_path,len(mel), len(f0), len(energy)) f0 = remove_outlier(f0) energy = remove_outlier(energy) if len(indices)>0: features = np.delete(features, indices, axis=0) mel = np.delete(mel, indices, axis=0) f0=np.delete(f0,indices,axis=0) energy = np.delete(energy, indices, axis=0) #print(features.shape) # print(wav_path, mel.shape[0],f0.shape[0], energy.shape[0],features.shape[0],num_frames,len(indices)) return mel,f0,energy,features
def test_silence_frame_removal_given_hts_labels():
    """After removing silence frames, features must match the reference dump."""
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "questions-radio_dnn_416.hed"))
    labels = hts.load(join(DATA_DIR, "label_state_align", "arctic_a0001.lab"))

    feats = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                   add_frame_features=True,
                                   subphone_features="full")

    # Remove silence frames
    feats = np.delete(feats, labels.silence_frame_indices(), axis=0)

    expected = np.fromfile(
        join(DATA_DIR, "nn_no_silence_lab_425", "arctic_a0001.lab"),
        dtype=np.float32,
    ).reshape(-1, feats.shape[-1])
    assert feats.shape == expected.shape
    assert np.allclose(feats, expected)
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    """Predict durations for a label file and return labels with them set.

    Relies on module-level globals: binary_dict, continuous_dict,
    hp_duration, use_cuda, P, _generator_input.

    Args:
        label_path: path to an HTS label file.
        duration_model: trained recurrent duration model.
        X_min, X_max: per-type input min/max statistics.
        Y_mean, Y_std: per-type output statistics.

    Returns:
        HTS labels with predicted durations applied.
    """
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply normalization
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(
        duration_linguistic_features, X_min[ty], X_max[ty],
        feature_range=(0.01, 0.99))

    # Apply models
    duration_model.eval()

    # Apply model
    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    duration_predicted = duration_model(x, [xl]).data.cpu().numpy()
    duration_predicted = duration_predicted.reshape(-1, duration_predicted.shape[-1])

    # Apply denormalization
    duration_predicted = P.inv_scale(duration_predicted, Y_mean[ty], Y_std[ty])
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
def predict_timelag(device, labels, timelag_model, timelag_config,
                    timelag_in_scaler, timelag_out_scaler, binary_dict,
                    continuous_dict, pitch_indices=None,
                    log_f0_conditioning=True,
                    allowed_range=[-20, 20],
                    allowed_range_rest=[-40, 40]):
    """Predict note-level time-lag, supporting deterministic and MDN models.

    Args:
        device: torch device to run the model on.
        labels: HTS full-context labels (modified in place by ``round_``).
        timelag_model: trained model; may be probabilistic (MDN).
        timelag_config: config with has_dynamic_features / num_windows /
            stream_sizes for MLPG.
        timelag_in_scaler / timelag_out_scaler: input/output scalers.
        binary_dict, continuous_dict: question-set dictionaries.
        pitch_indices: indices of score pitch features; required when
            ``log_f0_conditioning`` is True.
        log_f0_conditioning (bool): convert MIDI notes to continuous log-F0.
        allowed_range (list): clip range in frames for non-rest notes.
            NOTE: mutable default — benign here since it is only indexed.
        allowed_range_rest (list): clip range in frames for rest notes.

    Returns:
        ndarray: predicted time-lag in 100 ns units.

    Raises:
        ValueError: if log-F0 conditioning is requested without
            ``pitch_indices``.
    """
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict, continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)
    if isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range
        timelag_linguistic_features = np.clip(
            timelag_linguistic_features,
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1])

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # MDN: pick the most probable mixture component.
        # (B, T, D_out)
        log_pi, log_sigma, mu = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * timelag_out_scaler.var_
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # Deterministic model: direct inference.
        # (T, D_out)
        pred_timelag = timelag_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag, timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range (wider range permitted for rests)
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range_rest[0],
                                        allowed_range_rest[1])
        else:
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range[0],
                                        allowed_range[1])

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
def gen_waveform(labels, acoustic_features, binary_dict, continuous_dict, stream_sizes, has_dynamic_features, subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None, num_windows=3, post_filter=True, sample_rate=48000, frame_period=5, relative_f0=True): windows = get_windows(num_windows) # Apply MLPG if necessary if np.any(has_dynamic_features): static_stream_sizes = get_static_stream_sizes(stream_sizes, has_dynamic_features, len(windows)) else: static_stream_sizes = stream_sizes # Split multi-stream features mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes) # Gen waveform by the WORLD vocodoer fftlen = pyworld.get_cheaptrick_fft_size(sample_rate) alpha = pysptk.util.mcepalpha(sample_rate) if post_filter: mgc = merlin_post_filter(mgc, alpha) spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha) aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen) # fill aperiodicity with ones for unvoiced regions aperiodicity[vuv.reshape(-1) < 0.5, :] = 1.0 # WORLD fails catastrophically for out of range aperiodicity aperiodicity = np.clip(aperiodicity, 0.0, 1.0) ### F0 ### if relative_f0: diff_lf0 = target_f0 # need to extract pitch sequence from the musical score linguistic_features = fe.linguistic_features( labels, binary_dict, continuous_dict, add_frame_features=True, subphone_features=subphone_features) f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None] lf0_score = f0_score.copy() nonzero_indices = np.nonzero(lf0_score) lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices]) lf0_score = interp1d(lf0_score, kind="slinear") f0 = diff_lf0 + lf0_score f0[vuv < 0.5] = 0 f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)]) else: f0 = target_f0 f0[vuv < 0.5] = 0 f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)]) generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64), spectrogram.astype(np.float64), aperiodicity.astype(np.float64), sample_rate, frame_period) 
# 音量を小さくする(音割れ防止) # TODO: ここのかける定数をいい感じにする spectrogram *= 0.000000001 sp = pyworld.code_spectral_envelope(spectrogram, sample_rate, 60) return f0, sp, bap, generated_waveform
def gen_spsvs_static_features(
    labels,
    acoustic_features,
    binary_dict,
    numeric_dict,
    stream_sizes,
    has_dynamic_features,
    subphone_features="coarse_coding",
    pitch_idx=None,
    num_windows=3,
    frame_period=5,
    relative_f0=True,
    vibrato_scale=1.0,
    vuv_threshold=0.3,
    force_fix_vuv=True,
):
    """Generate static features from predicted acoustic features

    Args:
        labels (HTSLabelFile): HTS labels
        acoustic_features (ndarray): predicted acoustic features
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        stream_sizes (list): stream sizes
        has_dynamic_features (list): whether each stream has dynamic features
        subphone_features (str): subphone feature type
        pitch_idx (int): index of pitch features
        num_windows (int): number of windows
        frame_period (float): frame period
        relative_f0 (bool): whether to use relative f0
        vibrato_scale (float): vibrato scale
        vuv_threshold (float): vuv threshold
        force_fix_vuv (bool): whether to use post-processing to fix VUV.

    Returns:
        tuple: tuple of mgc, lf0, vuv and bap.
    """
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, num_windows
        )
    else:
        static_stream_sizes = stream_sizes

    # Copy here to avoid inplace operations on input acoustic features
    acoustic_features = acoustic_features.copy()

    # Split multi-stream features; the number of streams determines the
    # vibrato parameterization (none / diff-based / sine-based).
    streams = split_streams(acoustic_features, static_stream_sizes)
    if len(streams) == 4:
        mgc, target_f0, vuv, bap = streams
        vib, vib_flags = None, None
    elif len(streams) == 5:
        # Assuming diff-based vibrato parameters
        mgc, target_f0, vuv, bap, vib = streams
        vib_flags = None
    elif len(streams) == 6:
        # Assuming sine-based vibrato parameters
        mgc, target_f0, vuv, bap, vib, vib_flags = streams
    else:
        raise RuntimeError("Not supported streams")

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    # Correct V/UV based on special phone flags
    if force_fix_vuv:
        vuv = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)

    # F0
    if relative_f0:
        # The F0 stream is a log-F0 difference from the musical score.
        diff_lf0 = target_f0
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < vuv_threshold] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0
        f0[vuv < vuv_threshold] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    if vib is not None:
        if vib_flags is not None:
            # Generate sine-based vibrato
            vib_flags = vib_flags.flatten()
            m_a, m_f = vib[:, 0], vib[:, 1]

            # Fill zeros for non-vibrato frames
            m_a[vib_flags < 0.5] = 0
            m_f[vib_flags < 0.5] = 0

            # Gen vibrato
            sr_f0 = int(1 / (frame_period * 0.001))
            f0 = gen_sine_vibrato(f0.flatten(), sr_f0, m_a, m_f, vibrato_scale)
        else:
            # Generate diff-based vibrato
            f0 = f0.flatten() + vibrato_scale * vib.flatten()

    # NOTE: Back to log-domain for convenience
    lf0 = f0.copy()
    lf0[np.nonzero(lf0)] = np.log(f0[np.nonzero(lf0)])
    # NOTE: interpolation is necessary
    lf0 = interp1d(lf0, kind="slinear")

    # Ensure column-vector shape for downstream consumers.
    lf0 = lf0[:, None] if len(lf0.shape) == 1 else lf0
    vuv = vuv[:, None] if len(vuv.shape) == 1 else vuv

    return mgc, lf0, vuv, bap
def collect_features(self, wav_path, label_path):
    """Extract acoustic features and an aligned waveform for one utterance.

    Runs WORLD analysis (F0, spectral envelope, aperiodicity) on the wav,
    optionally corrects V/UV with the musical score, extracts vibrato
    parameters, applies delta windows, and returns the stacked feature
    matrix together with the length-aligned waveform.

    Args:
        wav_path: path to the input waveform (must match ``self.sample_rate``).
        label_path: path to the HTS-style full-context label file.

    Returns:
        tuple: ``(features, wave)`` where ``features`` is a float32 array of
        frame-level acoustic features (mgc, f0 target, vuv, bap and, depending
        on ``self.vibrato_mode``, vibrato params/flags) and ``wave`` is the
        float32 waveform trimmed/padded to the feature length.

    Raises:
        RuntimeError: on sample-rate mismatch or unknown vibrato mode.
    """
    labels = hts.load(label_path)
    l_features = fe.linguistic_features(
        labels,
        self.binary_dict,
        self.continuous_dict,
        add_frame_features=True,
        subphone_features="coarse_coding",
    )
    # Frame-level note pitch (Hz) derived from the musical score.
    f0_score = _midi_to_hz(l_features, self.pitch_idx, False)
    notes = l_features[:, self.pitch_idx]
    notes = notes[notes > 0]
    # allow 200 cent upper/lower to properly handle F0 estimation of
    # preparation, vibrato and overshoot.
    # NOTE: set the minimum f0 to 63.5 Hz (125 - 3*20.5)
    # https://acoustics.jp/qanda/answer/50.html
    # NOTE: sinsy allows 30-150 cent frequency range for vibrato (as of 2010)
    # https://staff.aist.go.jp/m.goto/PAPER/SIGMUS201007oura.pdf
    min_f0 = max(63.5, librosa.midi_to_hz(min(notes) - 2))
    max_f0 = librosa.midi_to_hz(max(notes) + 2)
    assert max_f0 > min_f0
    # Workaround segfault issues of WORLD's CheapTrick
    min_f0 = min(min_f0, 500)

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if fs != self.sample_rate:
        raise RuntimeError(
            "Sample rate mismatch! {} != {}".format(fs, self.sample_rate)
        )

    # F0 extraction: harvest, or dio refined by stonemask.
    if self.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=self.frame_period, f0_floor=min_f0, f0_ceil=max_f0
        )
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=self.frame_period, f0_floor=min_f0, f0_ceil=max_f0
        )
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    # Workaround for https://github.com/r9y9/nnsvs/issues/7
    f0 = np.maximum(f0, 0)

    # Correct V/UV (and F0) based on the musical score information
    # treat frames where musical notes are not assigned as unvoiced
    if self.correct_vuv:
        # Use smoothed mask so that we don't mask out overshoot or something
        # that could happen at the start/end of notes
        # 0.5 sec. window (could be tuned for better results)
        win_length = int(0.5 / (self.frame_period * 0.001))
        mask = np.convolve(f0_score, np.ones(win_length) / win_length, "same")
        # Align mask length to the extracted F0 length before applying it.
        if len(f0) > len(mask):
            mask = np.pad(mask, (0, len(f0) - len(mask)), "constant")
        elif len(f0) < len(mask):
            mask = mask[: len(f0)]
        # Zero out F0 where no note is assigned (mask is 0 there).
        f0 = f0 * np.sign(mask)

    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=min_f0)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs, threshold=self.d4c_threshold)

    # Log-F0 (zeros kept for unvoiced frames until interpolation below).
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if self.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)

    # F0 -> continuous F0
    lf0 = interp1d(lf0, kind="slinear")

    # Vibrato parameter extraction
    sr_f0 = int(1 / (self.frame_period * 0.001))
    if self.vibrato_mode == "sine":
        win_length = 64
        n_fft = 256
        threshold = 0.12
        if self.use_harvest:
            # NOTE: harvest is not supported here since the current implemented algorithm
            # relies on v/uv flags to find vibrato sections.
            # We use DIO since it provides more accurate v/uv detection in my experience.
            _f0, _timeaxis = pyworld.dio(
                x,
                fs,
                frame_period=self.frame_period,
                f0_floor=min_f0,
                f0_ceil=max_f0,
            )
            _f0 = pyworld.stonemask(x, _f0, _timeaxis, fs)
            f0_smooth = extract_smoothed_f0(_f0, sr_f0, cutoff=8)
        else:
            f0_smooth = extract_smoothed_f0(f0, sr_f0, cutoff=8)
        f0_smooth_cent = hz_to_cent_based_c4(f0_smooth)
        vibrato_likelihood = extract_vibrato_likelihood(
            f0_smooth_cent, sr_f0, win_length=win_length, n_fft=n_fft
        )
        vib_flags, m_a, m_f = extract_vibrato_parameters(
            f0_smooth_cent, vibrato_likelihood, sr_f0, threshold=threshold
        )
        # Interpolate amplitude/frequency over non-vibrato frames so the
        # network sees continuous trajectories.
        m_a = interp1d(m_a, kind="linear")
        m_f = interp1d(m_f, kind="linear")
        vib = np.stack([m_a, m_f], axis=1)
        vib_flags = vib_flags[:, np.newaxis]
    elif self.vibrato_mode == "diff":
        # NOTE: vibrato is known to have 3 ~ 8 Hz range (in general)
        # remove higher frequency than 3 to separate vibrato from the original F0
        f0_smooth = extract_smoothed_f0(f0, sr_f0, cutoff=3)
        vib = (f0 - f0_smooth)[:, np.newaxis]
        vib_flags = None
    elif self.vibrato_mode == "none":
        vib, vib_flags = None, None
    else:
        raise RuntimeError("Unknown vibrato mode: {}".format(self.vibrato_mode))

    mgc = pysptk.sp2mc(
        spectrogram, order=self.mgc_order, alpha=pysptk.util.mcepalpha(fs)
    )

    # Post-processing for aperiodicy
    # ref: https://github.com/MTG/WGANSing/blob/mtg/vocoder.py
    if self.interp_unvoiced_aperiodicity:
        is_voiced = (vuv > 0).reshape(-1)
        if not np.any(is_voiced):
            pass  # all unvoiced, do nothing
        else:
            # Replace unvoiced-frame aperiodicity by linear interpolation
            # between the surrounding voiced frames, per band.
            for k in range(aperiodicity.shape[1]):
                aperiodicity[~is_voiced, k] = np.interp(
                    np.where(~is_voiced)[0],
                    np.where(is_voiced)[0],
                    aperiodicity[is_voiced, k],
                )
    bap = pyworld.code_aperiodicity(aperiodicity, fs)

    # Parameter trajectory smoothing
    if self.trajectory_smoothing:
        modfs = int(1 / 0.005)
        for d in range(mgc.shape[1]):
            mgc[:, d] = lowpass_filter(
                mgc[:, d], modfs, cutoff=self.trajectory_smoothing_cutoff
            )
        for d in range(bap.shape[1]):
            bap[:, d] = lowpass_filter(
                bap[:, d], modfs, cutoff=self.trajectory_smoothing_cutoff
            )

    # Adjust lengths to the label-defined number of frames.
    mgc = mgc[: labels.num_frames()]
    lf0 = lf0[: labels.num_frames()]
    vuv = vuv[: labels.num_frames()]
    bap = bap[: labels.num_frames()]
    vib = vib[: labels.num_frames()] if vib is not None else None
    vib_flags = vib_flags[: labels.num_frames()] if vib_flags is not None else None

    if self.relative_f0:
        # # F0 derived from the musical score
        f0_score = f0_score[:, None]
        if len(f0_score) > len(f0):
            print(
                "Warning! likely to have mistakes in alignment in {}".format(
                    label_path
                )
            )
            print(f0_score.shape, f0.shape)
            f0_score = f0_score[: len(f0)]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(f0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")
        # relative f0: difference between observed and score log-F0,
        # clipped to +/- one octave.
        diff_lf0 = lf0 - lf0_score
        diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))
        f0_target = diff_lf0
    else:
        f0_target = lf0

    mgc = apply_delta_windows(mgc, self.windows)
    f0_target = apply_delta_windows(f0_target, self.windows)
    bap = apply_delta_windows(bap, self.windows)
    vib = apply_delta_windows(vib, self.windows) if vib is not None else None

    if vib is None and vib_flags is None:
        features = np.hstack((mgc, f0_target, vuv, bap)).astype(np.float32)
    elif vib is not None and vib_flags is None:
        features = np.hstack((mgc, f0_target, vuv, bap, vib)).astype(np.float32)
    elif vib is not None and vib_flags is not None:
        features = np.hstack((mgc, f0_target, vuv, bap, vib, vib_flags)).astype(
            np.float32
        )
    else:
        raise RuntimeError("Unknown combination of features")

    # Align waveform and features
    wave = x.astype(np.float32) / 2 ** 15
    T = int(features.shape[0] * (fs * self.frame_period / 1000))
    if len(wave) < T:
        # Tolerate up to 5 ms of padding; anything larger likely indicates
        # a label/audio mismatch, so drop into the debugger.
        if T - len(wave) > int(fs * 0.005):
            print("Warn!!", T, len(wave), T - len(wave))
            print("you have unepxcted input. Please debug though ipdb")
            import ipdb

            ipdb.set_trace()
        else:
            pass
        wave = np.pad(wave, (0, T - len(wave)))
    assert wave.shape[0] >= T
    wave = wave[:T]

    return features, wave
def predict_acoustic(
    device,
    labels,
    acoustic_model,
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
    binary_dict,
    numeric_dict,
    subphone_features="coarse_coding",
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict acoustic features from HTS labels

    MLPG is applied to the predicted features if the output features have
    dynamic features.

    Args:
        device (torch.device): device to use
        labels (HTSLabelFile): HTS labels
        acoustic_model (nn.Module): acoustic model
        acoustic_config (AcousticConfig): acoustic configuration
        acoustic_in_scaler (sklearn.preprocessing.StandardScaler): input scaler
        acoustic_out_scaler (sklearn.preprocessing.StandardScaler): output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        subphone_features (str): subphone feature type
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to use log f0 conditioning
        force_clip_input_features (bool): whether to force clip input features

    Returns:
        ndarray: predicted acoustic features

    Raises:
        ValueError: if ``log_f0_conditioning`` is enabled but ``pitch_indices``
            is None (consistent with ``predict_timelag``).
    """
    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    if log_f0_conditioning:
        # Fail fast with a clear message instead of a TypeError when
        # iterating over None (matches predict_timelag's behavior).
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if force_clip_input_features and isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        # NOTE: treat pitch_indices=None as "no pitch features" so clipping
        # does not crash on `idx not in None`.
        pitch_idx_set = set(pitch_indices) if pitch_indices is not None else set()
        non_pitch_indices = [
            idx
            for idx in range(linguistic_features.shape[1])
            if idx not in pitch_idx_set
        ]
        linguistic_features[:, non_pitch_indices] = np.clip(
            linguistic_features[:, non_pitch_indices],
            acoustic_in_scaler.feature_range[0],
            acoustic_in_scaler.feature_range[1],
        )

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2
                * acoustic_out_scaler.var_
            )
            # Guard against numerically-zero variances before MLPG.
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_acoustic = (
            acoustic_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic,
                acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )

    return pred_acoustic
def predict_timelag(
    device,
    labels,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    allowed_range=None,
    allowed_range_rest=None,
    force_clip_input_features=False,
):
    """Predict time-lag from HTS labels

    Args:
        device (torch.device): device
        labels (nnmnkwii.io.hts.HTSLabelFile): HTS-style labels
        timelag_model (nn.Module): time-lag model
        timelag_config (dict): time-lag model config
        timelag_in_scaler (sklearn.preprocessing.MinMaxScaler): input scaler
        timelag_out_scaler (sklearn.preprocessing.MinMaxScaler): output scaler
        binary_dict (dict): binary feature dict
        numeric_dict (dict): numeric feature dict
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to condition on log f0
        allowed_range (list): allowed range of time-lag (in frames)
        allowed_range_rest (list): allowed range of time-lag for rest (in frames)
        force_clip_input_features (bool): whether to clip input features

    Returns:
        ndarray: time-lag predictions in 100-ns units, one row per note

    Raises:
        ValueError: if log-f0 conditioning is enabled without pitch indices
    """
    if allowed_range is None:
        allowed_range = [-20, 20]
    if allowed_range_rest is None:
        allowed_range_rest = [-40, 40]

    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels (time-lag is modeled per note, not per phone)
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features
    )
    if force_clip_input_features and isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(timelag_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        timelag_linguistic_features[:, non_pitch_indices] = np.clip(
            timelag_linguistic_features[:, non_pitch_indices],
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1],
        )

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * timelag_out_scaler.var_
            )
            # Floor variances to avoid numerical issues in MLPG.
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_timelag = (
            timelag_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag,
                timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )

    # Rounding to integer frame offsets
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range (rests get the wider range)
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range_rest[0], allowed_range_rest[1]
            )
        else:
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range[0], allowed_range[1]
            )

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
def predict_duration(device, labels, duration_model, duration_config,
                     duration_in_scaler, duration_out_scaler, lag, binary_dict,
                     continuous_dict, pitch_indices=None,
                     log_f0_conditioning=True):
    """Predict phoneme durations (in frames) from HTS labels.

    Extracts phone-level linguistic features, normalizes them, runs the
    duration model (MDN or deterministic), denormalizes, applies MLPG when
    dynamic features are present, and returns strictly-positive rounded
    durations.

    Args:
        device: torch device to run the model on.
        labels: HTS-style label file.
        duration_model: duration model (nn.Module).
        duration_config: duration model config (stream sizes, windows, ...).
        duration_in_scaler: input feature scaler.
        duration_out_scaler: output feature scaler.
        lag: time-lag predictions.
            NOTE(review): `lag` is not referenced in this body — presumably
            kept for API compatibility; confirm against callers.
        binary_dict: binary feature dictionary.
        continuous_dict: continuous feature dictionary.
        pitch_indices: indices of pitch features (required when
            ``log_f0_conditioning`` is True).
        log_f0_conditioning: whether to use log-f0 conditioning.

    Returns:
        ndarray: rounded per-phone durations, each at least 1 frame.
    """
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=False, subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        # Replace note numbers by interpolated log-F0 contours.
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)
    if isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range
        duration_linguistic_features = np.clip(
            duration_linguistic_features,
            duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1])

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # MDN output: mixture weights, log-scales, means. (B, T, D_out)
        log_pi, log_sigma, mu = duration_model.inference(x, [x.shape[1]])

        if np.any(duration_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * duration_out_scaler.var_
            max_mu = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_durations = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_durations = duration_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)

        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations, duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)

    # Ensure every phone lasts at least one frame, then round.
    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
def collect_features(self, wav_path, label_path):
    """Extract acoustic features and an aligned waveform for one utterance.

    Runs WORLD analysis on the wav, builds the F0 target (absolute or
    score-relative log-F0), applies delta windows, and returns the stacked
    feature matrix together with the length-aligned waveform.

    Args:
        wav_path: path to the input waveform.
        label_path: path to the HTS-style full-context label file.

    Returns:
        tuple: ``(features, wave)`` — float32 feature matrix
        (mgc, f0 target, vuv, bap) and the float32 waveform trimmed/padded
        to the feature length.
    """
    labels = hts.load(label_path)
    l_features = fe.linguistic_features(
        labels, self.binary_dict, self.continuous_dict,
        add_frame_features=True,
        subphone_features="coarse_coding")
    # Frame-level note pitch (Hz) derived from the musical score.
    f0_score = _midi_to_hz(l_features, self.pitch_idx, False)
    notes = l_features[:, self.pitch_idx]
    notes = notes[notes > 0]
    # allow 1-tone upper/lower
    min_f0 = librosa.midi_to_hz(min(notes) - 2)
    max_f0 = librosa.midi_to_hz(max(notes) + 2)
    assert max_f0 > min_f0

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)

    if self.use_harvest:
        f0, timeaxis = pyworld.harvest(x, fs, frame_period=self.frame_period,
                                       f0_floor=min_f0, f0_ceil=max_f0)
    else:
        # BUG FIX: `frame_period` was an undefined bare name here
        # (NameError whenever use_harvest is False); use the instance
        # attribute as the harvest branch does.
        f0, timeaxis = pyworld.dio(x, fs, frame_period=self.frame_period,
                                   f0_floor=min_f0, f0_ceil=max_f0)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs,
                                     f0_floor=self.f0_floor)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=self.mgc_order,
                       alpha=pysptk.util.mcepalpha(fs))

    # F0 of speech (log domain; zeros kept for unvoiced frames)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if self.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    # F0 -> continuous F0
    lf0 = interp1d(lf0, kind="slinear")

    # Adjust lengths to the label-defined number of frames
    mgc = mgc[:labels.num_frames()]
    lf0 = lf0[:labels.num_frames()]
    vuv = vuv[:labels.num_frames()]
    bap = bap[:labels.num_frames()]

    if self.relative_f0:
        # # F0 derived from the musical score
        f0_score = f0_score[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(f0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")
        # relative f0, clipped to +/- one octave
        diff_lf0 = lf0 - lf0_score
        diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))
        f0_target = diff_lf0
    else:
        f0_target = lf0

    mgc = apply_delta_windows(mgc, self.windows)
    f0_target = apply_delta_windows(f0_target, self.windows)
    bap = apply_delta_windows(bap, self.windows)

    features = np.hstack((mgc, f0_target, vuv, bap)).astype(np.float32)

    # Align waveform and features
    wave = x.astype(np.float32) / 2**15
    T = int(features.shape[0] * (fs * self.frame_period / 1000))
    if len(wave) < T:
        # Tolerate a small trailing gap; larger gaps likely indicate a
        # label/audio mismatch, so drop into the debugger.
        if T - len(wave) > 100:
            print("Warn!!", T, len(wave), T-len(wave))
            print("you have unepxcted input. Please debug though ipdb")
            import ipdb; ipdb.set_trace()
        else:
            pass
        wave = np.pad(wave, (0, T-len(wave)))
    assert wave.shape[0] >= T
    wave = wave[:T]

    return features, wave
def predict_acoustic(device, labels, acoustic_model, acoustic_config,
                     acoustic_in_scaler, acoustic_out_scaler, binary_dict,
                     continuous_dict, subphone_features="coarse_coding",
                     pitch_indices=None, log_f0_conditioning=True):
    """Predict acoustic features from HTS labels.

    Builds frame-level linguistic features, optionally replaces note numbers
    with interpolated (log-)F0 contours, normalizes, runs the acoustic model
    (MDN or deterministic), denormalizes, and applies MLPG when the output
    contains dynamic features.

    Returns:
        ndarray: predicted static acoustic features, (T, static_dim).
    """
    # Musical/linguistic features
    feats = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=True,
        subphone_features=subphone_features)

    if log_f0_conditioning:
        # Swap raw note numbers for continuous (log-)F0 trajectories.
        for pitch_idx in pitch_indices:
            contour = _midi_to_hz(feats, pitch_idx, log_f0_conditioning)
            feats[:, pitch_idx] = interp1d(contour, kind="slinear")

    # Normalize, clipping to the scaler's range for min-max scalers.
    feats = acoustic_in_scaler.transform(feats)
    if isinstance(acoustic_in_scaler, MinMaxScaler):
        lo, hi = acoustic_in_scaler.feature_range
        feats = np.clip(feats, lo, hi)

    # (1, T, D_in) batch for the network.
    x = torch.from_numpy(feats).float().to(device)
    x = x.view(1, -1, x.size(-1))

    needs_mlpg = np.any(acoustic_config.has_dynamic_features)

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        # MDN output: mixture weights, log-scales, means
        log_pi, log_sigma, mu = acoustic_model.inference(x, [x.shape[1]])
        if needs_mlpg:
            sigma_max, mu_max = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Denormalize: (B, T, D_out) -> (T, D_out)
            sigma_sq = sigma_max.squeeze(
                0).cpu().data.numpy()**2 * acoustic_out_scaler.var_
            mean = acoustic_out_scaler.inverse_transform(
                mu_max.squeeze(0).cpu().data.numpy())
            # MLPG: (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                mean, sigma_sq, get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)
        else:
            _, mu_max = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Denormalize only
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                mu_max.squeeze(0).cpu().data.numpy())
    else:
        # Deterministic model: (T, D_out)
        raw = acoustic_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        pred_acoustic = acoustic_out_scaler.inverse_transform(raw)
        if needs_mlpg:
            # MLPG: (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic, acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)

    return pred_acoustic
def collect_features(self, wav_path, label_path):
    """Extract acoustic features (mgc, diff-lf0, vuv, bap) for one utterance.

    Runs WORLD analysis on the wav and returns delta-windowed features where
    the F0 stream is the score-relative log-F0 difference, clipped to
    +/- one octave.

    Args:
        wav_path: path to the input waveform.
        label_path: path to the HTS-style full-context label file.

    Returns:
        ndarray: float32 feature matrix, (num_frames, D).
    """
    labels = hts.load(label_path)
    l_features = fe.linguistic_features(labels,
                                        self.binary_dict,
                                        self.continuous_dict,
                                        add_frame_features=True,
                                        subphone_features="coarse_coding")
    # NOTE(review): other variants of this method call `_midi_to_hz` with the
    # same 3-arg signature; confirm `midi_to_hz` here is the intended helper
    # (it is not librosa.midi_to_hz, which takes a single argument).
    f0_score = midi_to_hz(l_features, self.pitch_idx, False)
    # TODO: better to set the margin carefully
    max_f0 = int(max(f0_score)) + 100
    min_f0 = int(max(self.f0_floor, min(f0_score[f0_score > 0]) - 20))
    assert max_f0 > min_f0

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)

    if self.use_harvest:
        f0, timeaxis = pyworld.harvest(x, fs, frame_period=self.frame_period,
                                       f0_floor=min_f0, f0_ceil=max_f0)
    else:
        # BUG FIX: `frame_period` was an undefined bare name here
        # (NameError whenever use_harvest is False); use the instance
        # attribute as the harvest branch does.
        f0, timeaxis = pyworld.dio(x, fs, frame_period=self.frame_period,
                                   f0_floor=min_f0, f0_ceil=max_f0)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs,
                                     f0_floor=self.f0_floor)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=self.mgc_order,
                       alpha=pysptk.util.mcepalpha(fs))

    # F0 of speech (log domain; zeros kept for unvoiced frames)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if self.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    # F0 -> continuous F0
    lf0 = interp1d(lf0, kind="slinear")

    # # F0 derived from the musical score
    f0_score = f0_score[:, None]
    lf0_score = f0_score.copy()
    nonzero_indices = np.nonzero(f0_score)
    lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
    lf0_score = interp1d(lf0_score, kind="slinear")

    # Adjust lengths to the label-defined number of frames
    mgc = mgc[:labels.num_frames()]
    lf0 = lf0[:labels.num_frames()]
    vuv = vuv[:labels.num_frames()]
    bap = bap[:labels.num_frames()]

    # relative f0, clipped to +/- one octave
    diff_lf0 = lf0 - lf0_score
    diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))

    mgc = apply_delta_windows(mgc, self.windows)
    diff_lf0 = apply_delta_windows(diff_lf0, self.windows)
    bap = apply_delta_windows(bap, self.windows)

    features = np.hstack((mgc, diff_lf0, vuv, bap))

    return features.astype(np.float32)
def predict_duration(
    device,
    labels,
    duration_model,
    duration_config,
    duration_in_scaler,
    duration_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict phoneme durations from HTS labels

    Args:
        device (torch.device): device to run the model on
        labels (nnmnkwii.io.hts.HTSLabelFile): labels
        duration_model (nn.Module): duration model
        duration_config (dict): duration config
        duration_in_scaler (sklearn.preprocessing.MinMaxScaler): duration input scaler
        duration_out_scaler (sklearn.preprocessing.MinMaxScaler): duration output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to use log-f0 conditioning
        force_clip_input_features (bool): whether to clip input features

    Returns:
        np.ndarray: predicted durations (rounded, >= 1 frame) for
            deterministic models. For probabilistic models, a tuple
            ``(max_mu, max_sigma_sq)`` of denormalized means and variances
            is returned instead.

    Raises:
        RuntimeError: if a probabilistic model is configured with dynamic
            features (unsupported for duration modeling).
    """
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    if log_f0_conditioning:
        # Replace note numbers by interpolated log-F0 contours.
        # NOTE(review): no None-guard here unlike predict_timelag —
        # pitch_indices must be provided when log_f0_conditioning is True.
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features
    )
    if force_clip_input_features and isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(duration_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        duration_linguistic_features[:, non_pitch_indices] = np.clip(
            duration_linguistic_features[:, non_pitch_indices],
            duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1],
        )

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            raise RuntimeError(
                "Dynamic features are not supported for duration modeling"
            )
        # Apply denormalization
        max_sigma_sq = (
            max_sigma.squeeze(0).cpu().data.numpy() ** 2 * duration_out_scaler.var_
        )
        # Floor variances to avoid numerically-zero values downstream.
        max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
        max_mu = duration_out_scaler.inverse_transform(
            max_mu.squeeze(0).cpu().data.numpy()
        )

        return max_mu, max_sigma_sq
    else:
        # (T, D_out)
        pred_durations = (
            duration_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations,
                duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features,
            )

    # Ensure every phone lasts at least one frame, then round.
    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations