def postprocess(self, feats, curr_sample_rate):
    """Downmix, resample and optionally normalize a waveform.

    Args:
        feats: Waveform to process. The code calls ``.dim()`` on it, so it is
            expected to be a ``torch.Tensor``; a 2-D input is averaged over
            its last axis (presumably time-major with channels last --
            confirm against the caller).
        curr_sample_rate: Sample rate of ``feats``.

    Returns:
        A 1-D tensor at ``self.sample_rate``; layer-normalized over its full
        length when ``self.normalize`` is truthy.

    Raises:
        AssertionError: If the waveform is not 1-D after downmixing.
    """
    # Collapse the last axis first: Resample works along the last dimension,
    # so it must see the time axis there, not the channel axis.
    if feats.dim() == 2:
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()

    if self.sample_rate != curr_sample_rate:
        # Stay in torch: converting to NumPy here would break the
        # tensor-only layer_norm call below.
        feats = Resample(curr_sample_rate, self.sample_rate)(feats.detach())

    if self.normalize:
        with torch.no_grad():
            feats = F.layer_norm(feats, feats.shape)
    return feats
def process_utterance(in_dir, out_dir, spker, basename):
    """Extract and save alignment, f0, energy and mel features for one utterance.

    Reads ``<in_dir>/wav48/<spker>/<basename>.wav`` and the matching
    ``<out_dir>/TextGrid/<spker>/<basename>.TextGrid``, trims the audio to the
    aligned span, computes the features and writes them as ``.npy`` files
    under ``out_dir`` (``alignment``, ``f0``, ``energy`` and ``mel``
    sub-directories).

    Args:
        in_dir: Root directory holding the ``wav48`` audio tree.
        out_dir: Root directory holding ``TextGrid`` inputs and feature outputs.
        spker: Speaker sub-directory name.
        basename: Utterance file name without extension.

    Returns:
        ``None`` when the utterance is skipped (missing TextGrid, empty
        aligned span, too many frames, mismatched feature lengths, or
        statistics that cannot be computed). Otherwise a tuple
        ``("<basename>|<text>", f0_max, f0_min_nonzero, energy_max,
        energy_min, n_mel_frames)``.
    """
    wav_path = os.path.join(in_dir, 'wav48', spker, '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', spker,
                           '{}.TextGrid'.format(basename))
    if not os.path.exists(tg_path):
        return None

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    # '{A}{B}{$}{C}', $ represents silent phones
    text = '{' + '}{'.join(phone) + '}'
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'

    if start >= end:
        return None

    # Total frame count implied by the alignment; every feature is cut to it.
    n_frames = sum(duration)

    # Read and trim wav files
    sr, wav = read(wav_path)
    wav = torch.tensor(wav.astype(np.float32))
    if sr != hp.sampling_rate:
        wav = Resample(orig_freq=sr, new_freq=hp.sampling_rate)(wav)
    # start/end are scaled by the sampling rate to obtain sample indices.
    first_sample = int(hp.sampling_rate * start)
    last_sample = int(hp.sampling_rate * end)
    wav = wav[first_sample:last_sample]

    # Compute fundamental frequency
    frame_period_ms = hp.hop_length / hp.sampling_rate * 1000
    f0, _ = pw.dio(wav.numpy().astype(np.float64), hp.sampling_rate,
                   frame_period=frame_period_ms)
    f0 = f0[:n_frames]

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(wav)
    mel_spectrogram = mel_spectrogram.cpu().numpy().astype(
        np.float32)[:, :n_frames]
    energy = energy.numpy().astype(np.float32)[:n_frames]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # If the shapes disagree, check the get_alignment function. This is an
    # explicit check rather than an assert so it still runs under ``python -O``.
    if not (f0.shape[0] == energy.shape[0] == mel_spectrogram.shape[1]):
        print("duration problem: {}".format(wav_path))
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration,
            allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy,
            allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Reductions over an empty array (no frames, or no non-zero f0 value)
    # raise ValueError; such utterances are reported and skipped.
    try:
        nonzero_f0 = f0[f0 != 0]
        f0_max = f0.max()
        f0_min = nonzero_f0.min()
        energy_max = energy.max()
        energy_min = energy.min()
    except ValueError:
        print(basename)
        return None

    return ('|'.join([basename, text]), f0_max, f0_min, energy_max,
            energy_min, mel_spectrogram.shape[1])