def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
    '''Adjust time resolution between audio and local condition.'''
    if local_condition:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            self._assert_ready_for_upsample(x, c)
            if max_time_steps is not None:
                max_steps = _ensure_divisible(
                    max_time_steps, audio.get_hop_size(self._hparams), True)
                if len(x) > max_time_steps:
                    max_time_frames = max_steps // audio.get_hop_size(self._hparams)
                    start = np.random.randint(0, len(c) - max_time_frames)
                    time_start = start * audio.get_hop_size(self._hparams)
                    x = x[time_start:time_start +
                          max_time_frames * audio.get_hop_size(self._hparams)]
                    c = c[start:start + max_time_frames, :]
                    self._assert_ready_for_upsample(x, c)
            new_batch.append((x, c, g, l))
        return new_batch
    else:
        new_batch = []
        for b in batch:
            x, c, g, l = b
            x = audio.trim_silence(x, self._hparams)
            if max_time_steps is not None and len(x) > max_time_steps:
                # Crop the raw audio itself; without local conditioning there
                # is no frame-level feature to index, so sample the start
                # offset from x rather than c
                start = np.random.randint(0, len(x) - max_time_steps)
                x = x[start:start + max_time_steps]
            new_batch.append((x, c, g, l))
        return new_batch
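
# `_ensure_divisible` is referenced above but not defined in this section.
# A minimal sketch, assuming the third argument selects rounding down (so the
# cropped audio never exceeds max_time_steps):
def _ensure_divisible(length, divisible_by=256, lower=True):
    if length % divisible_by == 0:
        return length
    if lower:
        # round down to the nearest multiple of divisible_by (the hop size)
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)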
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the LPCNet features to disk and returns a tuple to write to
    the train.txt file.

    Args:
        out_dir: The directory to write the features into
        index: The numeric index to use in the feature filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (feature_filename, n_frames, pinyin) tuple to write to train.txt
    '''
    # Text to pinyin (strip prosody markers first)
    text = text.replace("#1", "").replace("#2", "").replace("#3", "").replace("#4", "")
    pinyin = " ".join(get_pinyin(text))

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav = wav / np.max(np.abs(wav)) * 0.9  # normalize

    # Denoise, using both ends of the clip to build the noise profile
    if hparams.mmse_denoise_by_bothEndOfAudio and len(
            wav) > hparams.sample_rate * (hparams.length_as_noise * 2 + 0.1):
        noise_wav = np.concatenate([
            wav[:int(hparams.sample_rate * hparams.length_as_noise)],
            wav[-int(hparams.sample_rate * hparams.length_as_noise):]
        ])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Trim silence
    wav = audio.trim_silence(
        wav, hparams.trim_top_db)  # top_db=30 for aishell, 60 for BZNSYP
    # audio.save_wav(wav, wav_path.replace(".wav", "_trimed.wav"))

    # Convert wav to 16-bit int
    wav *= 32768
    wav = wav.astype(np.int16)

    # Extract LPCNet features
    extractor = lpcnet.FeatureExtractor()
    feat = extractor.compute_feature(wav)
    n_frames = feat.shape[0]

    # Write the LPC features to disk
    feature_filename = 'biaobei-lpc-feat-%05d.npy' % index
    np.save(os.path.join(out_dir, feature_filename), feat, allow_pickle=False)

    # Return a tuple describing this training example:
    return (feature_filename, n_frames, pinyin)
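
# The tuples returned above are typically collected into train.txt by a driver
# that fans utterances out over a process pool. A minimal sketch; the
# build_from_path name, the metadata.csv layout, and the wavs/ directory are
# assumptions modeled on similar preprocessors, not code from this repo.
import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def build_from_path(in_dir, out_dir, num_workers=4):
    os.makedirs(out_dir, exist_ok=True)
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            wav_name, text = line.strip().split('|')[:2]
            wav_path = os.path.join(in_dir, 'wavs', wav_name + '.wav')
            futures.append(executor.submit(
                partial(_process_utterance, out_dir, index, wav_path, text)))
    results = [future.result() for future in futures]
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        # Keep only utterances that were not skipped (None results)
        for row in results:
            if row is not None:
                f.write('|'.join(str(x) for x in row) + '\n')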
def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        pinyin: The pinyin of the Chinese spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to
        write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Rescale wav for a unified measure across all clips
    wav = wav / np.abs(wav).max() * 0.999

    # Trim silence
    wav = audio.trim_silence(wav)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    if n_frames > hp.max_frame_num:
        return None

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'thchs30-spec-%05d.npy' % index
    mel_filename = 'thchs30-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, pinyin)
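
# Note that both spectrograms are saved transposed (.T), so downstream loaders
# see time-major arrays. A quick sanity check (the path is illustrative):
#
#     mel = np.load('training_data/thchs30-mel-00001.npy')
#     print(mel.shape)  # (n_frames, num_mels): frames on axis 0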
def _process_utterance(out_dir, name, wav_path, text, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        name: The name to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        hparams: The hyperparameters to use for audio processing

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to
        write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, hparams)

    # Trim silences here
    wav = audio.trim_silence(wav, hparams)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav, hparams).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'mailabs-spec-{}.npy'.format(name)
    mel_filename = 'mailabs-mel-{}.npy'.format(name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Preprocesses a single BIAOBEI utterance: writes the linear and mel
    spectrograms to disk and returns a tuple to write to train.txt.'''
    # Load the audio
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * 0.999

    # Trim silence
    wav = audio.trim_silence(wav)

    # Compute the linear-scale and mel-scale spectrograms from the wav
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Save both spectrograms
    spectrogram_filename = 'biaobei-spec-%05d.npy' % index
    mel_filename = 'biaobei-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T, allow_pickle=False)

    return (spectrogram_filename, mel_filename, n_frames, pinyin)
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    """Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to
    write to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filenames
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename,
          time_steps, mel_frames, text)
    """
    try:
        # Load the audio as a numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. '
              'skipping!'.format(wav_path))
        return None

    # M-AILABS extra silence specific
    wav = audio.trim_silence(wav)

    # Pre-emphasize
    wav = audio.preemphasis(wav)

    # Rescale wav
    # wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        # raise RuntimeError('wav has invalid value: {}'.format(wav))
        print('file {} has invalid value. skipping!'.format(wav_path))
        return None

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_frame_num:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.spectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check: both spectrograms must cover the same number of frames
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad, hop_size = audio.librosa_pad_lr(wav)

    # Pad the audio signal (just like it's done in librosa, to avoid frame
    # inconsistency)
    out = np.pad(out, (l_pad, r_pad), mode='constant',
                 constant_values=constant_values)
    assert len(out) >= mel_frames * hop_size

    # Time resolution adjustment: ensure the length of the raw audio is a
    # multiple of the hop size so that we can use transposed convolution to
    # upsample
    out = out[:mel_frames * hop_size]
    assert len(out) % hop_size == 0
    time_steps = len(out)

    # Write the spectrograms and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
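
# `audio.librosa_pad_lr` is not shown in this section. A plausible sketch,
# assuming it pads on the right so the audio covers exactly the frames a
# centered STFT produces (1 + len(wav) // hop_size frames); the hop_size
# default is an assumption, not a value taken from this repo.
def librosa_pad_lr(wav, hop_size=275):
    # right-pad until the signal length is the next multiple of hop_size
    r_pad = (len(wav) // hop_size + 1) * hop_size - len(wav)
    return 0, r_pad, hop_size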
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
    """Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to
    write to the train.txt file. Audio settings come from the module-level
    hparams.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filenames
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename,
          time_steps, mel_frames, text)
    """
    try:
        # Load the audio as a numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. '
              'skipping!'.format(wav_path))
        return None

    # Rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check: both spectrograms must cover the same number of frames
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size())

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size()

    # Time resolution adjustment: ensure the length of the raw audio is a
    # multiple of the hop size so that we can use transposed convolution to
    # upsample
    out = out[:mel_frames * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    time_steps = len(out)

    # Write the spectrograms and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
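
# The mu-law helpers used above (mulaw, mulaw_quantize, is_mulaw, ...) come
# from the WaveNet vocoder utilities. A minimal sketch of the companding math;
# the mu convention varies between implementations (quantize_channels vs.
# quantize_channels - 1), so treat this as illustrative, not the library code.
import numpy as np

def mulaw(x, mu=255):
    # mu-law companding: maps [-1, 1] onto [-1, 1], compressing large amplitudes
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize(x, mu=255):
    # compand, then map [-1, 1] onto the integer bins [0, mu] (mu + 1 levels)
    y = mulaw(x, mu)
    return np.rint((y + 1) / 2 * mu).astype(np.int16)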