def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, speaker_num, lan_num, hparams):
    """
    Preprocesses a single utterance wav/text pair

    Loads the wav, optionally rescales and silence-trims it, converts it to
    spectrograms with audio.wav2spectrograms, writes the processed audio and both
    spectrograms to disk as .npy files and returns a tuple to write to the
    train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - speaker_num: speaker identifier, passed through unchanged to the returned tuple
        - lan_num: passed through unchanged to the returned tuple
          (presumably a language identifier -- confirm against the caller)
        - hparams: hyper parameters (reads sample_rate, rescale, rescaling_max, trim_silence)

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames,
          text, speaker_num, lan_num)
        - None if the wav file does not exist or audio.wav2spectrograms returns None
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:
        # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        peak = np.abs(wav).max()
        # An all-zero (silent) clip has a peak of 0; dividing by it would turn every
        # sample into NaN and poison the saved features, so leave such a clip untouched.
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Get spectrogram from wav
    ret = audio.wav2spectrograms(wav, hparams)
    if ret is None:
        return None
    out, mel_spectrogram, linear_spectrogram, time_steps, mel_frames = ret

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(np.float32), allow_pickle=False)
    # Spectrograms are stored transposed relative to what wav2spectrograms returns
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, speaker_num, lan_num)
def audio_process_utterance(mel_dir, linear_dir, wav_dir, duration_dir, score_dir, index, wav, durations, scores, hparams):
    """
    Preprocesses a single, already loaded utterance together with its durations and scores

    Optionally rescales the wav, converts it to spectrograms with
    audio.wav2spectrograms, writes the processed audio, both spectrograms, the
    durations and the scores to disk as .npy files and returns a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - duration_dir: the directory to write the durations into
        - score_dir: the directory to write the scores into
        - index: the numeric index to use in the output filenames
        - wav: the audio samples of the utterance, already loaded as an array
        - durations: array saved as-is to duration_dir
        - scores: array saved as-is to score_dir
        - hparams: hyper parameters (reads rescale, rescaling_max)

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, duration_filename,
          score_filename, time_steps, mel_frames)
        - None if audio.wav2spectrograms returns None
    """
    # rescale wav
    if hparams.rescale:
        peak = np.abs(wav).max()
        # An all-zero (silent) clip has a peak of 0; dividing by it would turn every
        # sample into NaN and poison the saved features, so leave such a clip untouched.
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max

    # Get spectrogram from wav
    ret = audio.wav2spectrograms(wav, hparams)
    if ret is None:
        return None
    out, mel_spectrogram, linear_spectrogram, time_steps, mel_frames = ret

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    duration_filename = 'duration-{}.npy'.format(index)
    score_filename = 'score-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(np.float32), allow_pickle=False)
    # Spectrograms are stored transposed relative to what wav2spectrograms returns
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(duration_dir, duration_filename), durations, allow_pickle=False)
    np.save(os.path.join(score_dir, score_filename), scores, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, duration_filename, score_filename, time_steps, mel_frames)