def _process_utterance(out_dir, index, wav_path, text):
    """Preprocesses a single utterance wav/text pair.

    This writes the mel-scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into.
        index: The numeric index to use in the spectrogram filename.
        wav_path: Path to the audio file containing the speech input.
        text: The text spoken in the input audio file.

    Returns:
        A (mel_filename, n_frames, text) tuple to write to train.txt.
    """
    # Load the audio as a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav to determine n_frames:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute the mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrogram to disk:
    mel_filename = 'ljspeech-mel-{:05d}.npy'.format(index)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (mel_filename, n_frames, text)
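# A minimal sketch of how _process_utterance above might be driven, assuming an
# LJSpeech-style metadata.csv ('id|transcription|normalized_text'); the
# build_from_path/write_metadata names and the worker count are illustrative,
# not part of the original code.
from concurrent.futures import ProcessPoolExecutor

def build_from_path(in_dir, out_dir, num_workers=4):
    os.makedirs(out_dir, exist_ok=True)
    futures = []
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
            for index, line in enumerate(f, start=1):
                wav_name, _, norm_text = line.strip().split('|')
                wav_path = os.path.join(in_dir, 'wavs', wav_name + '.wav')
                futures.append(executor.submit(_process_utterance, out_dir, index, wav_path, norm_text))
    return [future.result() for future in futures]

def write_metadata(metadata, out_dir):
    # One '|'-separated line per training example: mel_filename|n_frames|text.
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join(str(x) for x in m) + '\n')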
def load_spectrogram(self, audio_path, spectrogram_path, normalize, is_mel):
    """Load a mel or linear spectrogram from file, or compute it from scratch if needed.

    Arguments:
        audio_path (string): Path to the audio from which the spectrogram will
            (possibly) be computed.
        spectrogram_path (string): Path to the spectrogram file which will
            (possibly) be loaded.
        normalize (boolean): If True, the spectrogram is normalized
            (per channel: subtract the mean and divide by the std).
        is_mel (boolean): If True, a mel spectrogram is loaded or computed,
            otherwise a linear spectrogram is returned.
    """
    # Load or compute the spectrogram.
    if hp.cache_spectrograms:
        full_spec_path = os.path.join(self.root_dir, spectrogram_path)
        spectrogram = np.load(full_spec_path)
    else:
        full_audio_path = os.path.join(self.root_dir, audio_path)
        audio_data = audio.load(full_audio_path)
        spectrogram = audio.spectrogram(audio_data, is_mel)

    # Check the spectrogram dimensions.
    expected_dimension = hp.num_mels if is_mel else hp.num_fft // 2 + 1
    assert np.shape(spectrogram)[0] == expected_dimension, (
        f'Spectrogram dimensions mismatch: given {np.shape(spectrogram)[0]}, '
        f'expected {expected_dimension}')

    # Normalize if desired.
    if normalize:
        spectrogram = audio.normalize_spectrogram(spectrogram, is_mel)

    return spectrogram
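# Illustrative usage only: load_spectrogram above reads like a method of a
# dataset class exposing root_dir; the SpectrogramDataset class and its item
# layout below are assumptions, not taken from the original code.
class SpectrogramDataset:
    def __init__(self, root_dir, items):
        self.root_dir = root_dir
        self.items = items  # e.g. (text, audio_path, mel_path) tuples

    def __getitem__(self, idx):
        text, audio_path, mel_path = self.items[idx]
        # Call the module-level function above with an explicit self.
        mel = load_spectrogram(self, audio_path, mel_path, normalize=True, is_mel=True)
        return text, mel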
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into.
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input.
        text: The text spoken in the input audio file.

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt.
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'meta_spec_%05d.npy' % index
    mel_filename = 'meta_mel_%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
metadata = []
for d, fs in files_to_solve:
    with open(os.path.join(d, fs), 'r', encoding='utf-8') as f:
        metadata.append((d, fs, [line.rstrip().split('|') for line in f]))

print('Please wait, this may take a very long time.')
for d, fs, m in metadata:
    print(f'Creating spectrograms for: {fs}')
    with open(os.path.join(d, fs), 'w', encoding='utf-8') as f:
        for i in m:
            idx, s, l, a, _, _, raw_text, ph = i
            spec_name = idx + '.npy'
            audio_path = os.path.join(d, a)
            audio_data = audio.load(audio_path)
            mel_path = os.path.join(spectrogram_dirs[0], spec_name)
            lin_path = os.path.join(spectrogram_dirs[1], spec_name)
            # Only compute spectrograms that are not already cached on disk.
            if not os.path.exists(mel_path):
                np.save(mel_path, audio.spectrogram(audio_data, True))
            if not os.path.exists(lin_path):
                np.save(lin_path, audio.spectrogram(audio_data, False))
            print(f'{idx}|{s}|{l}|{a}|{mel_path}|{lin_path}|{raw_text}|{ph}', file=f)
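# The loop above assumes files_to_solve and spectrogram_dirs were prepared
# beforehand; a plausible setup (illustrative paths only) would be:
dataset_dir = 'datasets/my_dataset'
files_to_solve = [(dataset_dir, 'train.txt'), (dataset_dir, 'val.txt')]
spectrogram_dirs = [os.path.join(dataset_dir, 'spectrograms'),
                    os.path.join(dataset_dir, 'linear_spectrograms')]
for x in spectrogram_dirs:
    os.makedirs(x, exist_ok=True)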
def create_meta_file(dataset_name, dataset_root_dir, output_metafile_name, audio_sample_rate, num_fft_freqs,
                     spectrograms=True, phonemes=True):
    """Create the meta-file and, optionally, spectrograms (mel and linear) and phonemized utterances.

    Format details:
        Every line of the metadata file contains info about one dataset item.
        The line has the following format:
            'id|speaker|language|audio_file_path|mel_spectrogram_path|linear_spectrogram_path|text|phonemized_text'
        And the following must hold:
            'audio_file_path' can be empty if loading just spectrograms
            'text' should be carefully normalized and should contain punctuation
            'phonemized_text' can be empty if loading just raw text

    Arguments:
        dataset_name (string): Name of the dataset; loaders.py should contain a loading
            function with a corresponding name.
        dataset_root_dir (string): Root directory from which the dataset is built and to
            which the spectrograms and the meta-file are saved.
        output_metafile_name (string): Name of the output meta-file.
        audio_sample_rate (int): Sample rate of the audio files, used if spectrograms is True.
        num_fft_freqs (int): Number of frequency bands used during spectrogram computation,
            used if spectrograms is True.

    Keyword arguments:
        spectrograms (boolean, default True): If True, spectrograms (both mel and linear)
            are computed and saved.
        phonemes (boolean, default True): If True, phonemized variants of the utterances
            are computed and saved.
    """

    # Save the current sample rate and FFT freqs hyperparameters, as we may process
    # the dataset with a different sample rate.
    if spectrograms:
        old_sample_rate = hp.sample_rate
        hp.sample_rate = audio_sample_rate
        old_fft_freqs = hp.num_fft
        hp.num_fft = num_fft_freqs

    # Load metafiles; an item is a list like: [text, audio_path, speaker_id, language_code]
    items = loaders.get_loader_by_name(dataset_name)(dataset_root_dir)

    # Build dictionaries for translation to IPA from source languages, see utils.text for details.
    if phonemes:
        text_lang_pairs = [(i[0], hp.languages[0] if i[3] == "" else i[3]) for i in items]
        phoneme_dicts = text.build_phoneme_dicts(text_lang_pairs)

    # Prepare the directories which will store the spectrograms.
    if spectrograms:
        spectrogram_dirs = [os.path.join(dataset_root_dir, 'spectrograms'),
                            os.path.join(dataset_root_dir, 'linear_spectrograms')]
        for x in spectrogram_dirs:
            if not os.path.exists(x):
                os.makedirs(x)

    # Iterate through the items and build the meta-file.
    metafile_path = os.path.join(dataset_root_dir, output_metafile_name)
    with open(metafile_path, 'w', encoding='utf-8') as f:
        Logger.progress(0, prefix='Building metafile:')
        for i in range(len(items)):
            raw_text, audio_path, speaker, language = items[i]
            if language == "":
                language = hp.languages[0]
            phonemized_text = text.to_phoneme(raw_text, False, language, phoneme_dicts[language]) if phonemes else ""
            spectrogram_paths = "|"
            if spectrograms:
                spec_name = f'{str(i).zfill(6)}.npy'
                audio_data = audio.load(os.path.join(dataset_root_dir, audio_path))
                np.save(os.path.join(spectrogram_dirs[0], spec_name), audio.spectrogram(audio_data, True))
                np.save(os.path.join(spectrogram_dirs[1], spec_name), audio.spectrogram(audio_data, False))
                spectrogram_paths = os.path.join('spectrograms', spec_name) + '|' + os.path.join('linear_spectrograms', spec_name)
            print(f'{str(i).zfill(6)}|{speaker}|{language}|{audio_path}|{spectrogram_paths}|{raw_text}|{phonemized_text}', file=f)
            Logger.progress((i + 1) / len(items), prefix='Building metafile:')

    # Restore the original sample rate and FFT freq values.
    if spectrograms:
        hp.sample_rate = old_sample_rate
        hp.num_fft = old_fft_freqs
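# Illustrative call only: assumes loaders.py provides an 'ljspeech' loader and
# that 22050 Hz / 1102 FFT bins match the target hyperparameters; the paths
# and values here are assumptions, not taken from the original code.
create_meta_file('ljspeech', 'data/LJSpeech-1.1', 'metadata.txt',
                 audio_sample_rate=22050, num_fft_freqs=1102,
                 spectrograms=True, phonemes=True)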
ctr = 0
with open(tdd_file, encoding='utf-8') as f:
    for line in f:
        if len(line) > 2:
            ctr += 1
            line = line.split('\n')[0]
            fname = line.split()[0]
            phones = ' '.join(k for k in line.split()[1:])
            if generate_feats_flag:
                wav_fname = wav_dir + '/' + fname + '.wav'
                wav = audio.load_wav(wav_fname)
                max_samples = _max_out_length * 5 / 1000 * 16000  # computed but not used below
                spectrogram = audio.spectrogram(wav).astype(np.float32)
                n_frames = spectrogram.shape[1]
                mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
                lspec_fname = lspec_dir + '/' + fname + '_lspec.npy'
                mspec_fname = mspec_dir + '/' + fname + '_mspec.npy'
                np.save(lspec_fname, spectrogram.T, allow_pickle=False)
                np.save(mspec_fname, mel_spectrogram.T, allow_pickle=False)
                with open(data_file, 'a') as g:
                    g.write(lspec_fname + '|' + mspec_fname + '|' + str(n_frames) + '| ' + phones + '\n')
                with open(feats_dir + '/' + fname + '.feats', 'w') as g:
                    for phone in phones.split():
                        g.write(phone + '\n')
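# For reference, the parsing above implies a tdd_file line of the form
# '<fname> <phone> <phone> ...', e.g. (hypothetical phone labels):
#   arctic_a0001 pau hh ax l ow pau
# which, with generate_feats_flag set, appends a data_file entry like:
#   <lspec_dir>/arctic_a0001_lspec.npy|<mspec_dir>/arctic_a0001_mspec.npy|<n_frames>| pau hh ax l ow pau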
import os

import numpy as np

import audio
from griffin_lim import inv_spectrogram, tf
from hparams import hparams  # assuming the project's usual hparams module

if __name__ == '__main__':
    data_folder = "data"
    wavs = [
        os.path.join(data_folder, file[:-4]) for file in os.listdir(data_folder)
        if file.endswith(".wav")
    ]
    outputs_py = [file + ".py.gen.wav" for file in wavs]
    outputs_tf = [file + ".tf.gen.wav" for file in wavs]
    wavs = [
        audio.load_wav(wav_path + ".wav", hparams.sample_rate) for wav_path in wavs
    ]

    spectrograms = [audio.spectrogram(wav).astype(np.float32) for wav in wavs]
    print("Linear spectrogram dim:")
    print(spectrograms[0].shape)

    # --------------------------------- librosa version ---------------------------------
    # Convert the spectrograms back to audio:
    gens = [audio.inv_spectrogram(s) for s in spectrograms]
    for gen, output in zip(gens, outputs_py):
        audio.save_wav(gen, output)

    # ------------------------------- TensorFlow version -------------------------------
    samples = [inv_spectrogram(spec) for spec in spectrograms]
    with tf.Session() as sess:
        samples = [sess.run(sample) for sample in samples]
    # Save the TensorFlow-generated audio as well (assumed to mirror the
    # librosa path above; outputs_tf is otherwise unused):
    for gen, output in zip(samples, outputs_tf):
        audio.save_wav(gen, output)
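# For context, a bare-bones Griffin-Lim inversion using librosa's public API.
# This is a sketch only: it assumes a plain magnitude spectrogram, whereas
# inv_spectrogram implementations typically also undo log compression and
# pre-emphasis first; the hop/win values below are assumptions.
import librosa

def griffin_lim_sketch(magnitude, hop_length=275, win_length=1100, n_iter=60):
    # Iteratively estimate phase for the magnitude spectrogram and return
    # a time-domain waveform.
    return librosa.griffinlim(magnitude, n_iter=n_iter,
                              hop_length=hop_length, win_length=win_length)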
def main():
    argv0: str = sys.argv[0]
    if argv0:
        workdir: str = os.path.dirname(argv0)
        if workdir:
            os.chdir(workdir)
    os.chdir("data")

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", type=str, default="1a",
                        help="Params dataset for Training Data.")
    args = parser.parse_args()

    Params.load(f"../params/{args.dataset}.json")
    audio.hp = Params

    hop_frames: int = audio.ms_to_frames(audio.hp.stft_shift_ms)
    win_frames: int = audio.ms_to_frames(audio.hp.stft_window_ms)
    print(f"mel parameters: hop = {hop_frames:,}, win = {win_frames:,}")

    dataset_path: str = os.path.join("datasets", args.dataset)

    # As this code *alters* the train and val files, always regenerate them first!
    cmd: List[str] = ["python", os.path.join(dataset_path, "create_training_files.py")]
    subprocess.run(cmd, check=True, bufsize=0)

    files_to_solve = [
        (dataset_path, "train.txt"),
        (dataset_path, "val.txt"),
    ]

    mel_dir: str = os.path.join(dataset_path, 'mel_spectrograms')
    os.makedirs(mel_dir, exist_ok=True)

    mp3_path: str = os.path.join(dataset_path, "reference-audio")
    shutil.rmtree(mp3_path, ignore_errors=True)
    os.mkdir(mp3_path)

    mp3_bad_path: str = os.path.join(dataset_path, "reference-audio-bad")
    shutil.rmtree(mp3_bad_path, ignore_errors=True)
    os.mkdir(mp3_bad_path)

    mp3_fixed_path: str = os.path.join(dataset_path, "reference-audio-fixed")
    shutil.rmtree(mp3_fixed_path, ignore_errors=True)
    os.mkdir(mp3_fixed_path)

    metadata = []
    for d, fs in files_to_solve:
        with open(os.path.join(d, fs), 'r', encoding='utf-8') as f:
            metadata.append((d, fs, [line.rstrip().split('|') for line in f]))

    bad_silence_count: int = 0
    file_bad_entries: str = os.path.join(dataset_path, "entries-bad.txt")
    with open(file_bad_entries, "w"):
        pass

    fix_silence_count: int = 0
    file_fixed_entries: str = os.path.join(dataset_path, "entries-fixed.txt")
    with open(file_fixed_entries, "w"):
        pass

    skipped_too_short: List[str] = list()
    skipped_too_long: List[str] = list()

    spec_id: int = 0
    print('Please wait, this may take a very long time.')
    for d, fs, m in metadata:
        print(f'Creating spectrograms for: {fs}')
        bar: progressbar.ProgressBar = progressbar.ProgressBar(maxval=len(m))
        bar.start()
        with open(os.path.join(d, fs + "-tmp"), 'w', encoding='utf-8') as f:
            for i in m:
                idx, speaker, lang, wav, _, _, raw_text, phonemes = i
                if lang not in Params.languages:
                    continue
                raw_text = ud.normalize("NFC", raw_text)
                phonemes = ud.normalize("NFC", phonemes)
                spec_id += 1
                spec_name = f"{lang}_{speaker}-{spec_id:06d}.npy"
                mel_path_partial = os.path.join("mel_spectrograms", spec_name)
                mel_path = os.path.join(dataset_path, mel_path_partial)
                entry: str = f'{idx}|{speaker}|{lang}|{wav}|{mel_path_partial}||{raw_text}|{phonemes}'

                audio_path = os.path.join(d, wav)
                py_audio: AudioSegment = AudioSegment.from_file(audio_path)
                py_audio = py_audio.set_channels(1).set_frame_rate(Params.sample_rate)
                py_audio = effects.normalize(py_audio)
                py_audio = trim_silence(py_audio)

                # Output altered audio (compressed) for manual review.
                mp3_name = f"{lang}_{speaker}-{spec_id:06d}.mp3"
                ref_audio_mp3: str = os.path.join(mp3_path, mp3_name)

                if Params.fix_silence:
                    fix_silence: int = Params.fix_silence_len
                    segments = silence.split_on_silence(py_audio,
                                                        min_silence_len=fix_silence,
                                                        silence_thresh=-50,
                                                        keep_silence=fix_silence / 2)
                    if len(segments) > 1:
                        new_py_audio = AudioSegment.empty()
                        for segment in segments:
                            new_py_audio = new_py_audio.append(segment, crossfade=0)
                        assert len(new_py_audio), "Empty fixed audio after recombining?"
                        py_audio = new_py_audio.set_channels(1).set_frame_rate(py_audio.frame_rate)
                        with open(file_fixed_entries, "a") as w:
                            print(entry, file=w)
                        fix_audio_mp3: str = os.path.join(mp3_fixed_path, f"fix-{mp3_name}")
                        py_audio.export(fix_audio_mp3, format="mp3", parameters=["-qscale:a", "3"])
                        fix_silence_count += 1

                if Params.skip_silence:
                    max_silence: int = Params.max_silence_len
                    if silence.detect_silence(py_audio,
                                              min_silence_len=max_silence,
                                              silence_thresh=-50):
                        with open(file_bad_entries, "a") as w:
                            print(entry, file=w)
                        bad_audio_mp3: str = os.path.join(mp3_bad_path, f"bad-{mp3_name}")
                        py_audio.export(bad_audio_mp3, format="mp3", parameters=["-qscale:a", "3"])
                        bad_silence_count += 1
                        continue

                if len(py_audio) < Params.audio_min_length:
                    skipped_too_short.append(entry)
                    bad_audio_mp3 = os.path.join(mp3_bad_path, f"too-short-{mp3_name}")
                    py_audio.export(bad_audio_mp3, format="mp3", parameters=["-qscale:a", "3"])
                    continue

                if len(py_audio) > Params.audio_max_length:
                    skipped_too_long.append(entry)
                    bad_audio_mp3 = os.path.join(mp3_bad_path, f"too-long-{mp3_name}")
                    py_audio.export(bad_audio_mp3, format="mp3", parameters=["-qscale:a", "3"])
                    continue

                if Params.lead_in_silence > 0:
                    # Add lead_in_silence ms of silence at the beginning.
                    py_audio = AudioSegment.silent(Params.lead_in_silence) + py_audio
                if Params.lead_out_silence > 0:
                    # Add lead_out_silence ms of silence at the end.
                    py_audio = py_audio + AudioSegment.silent(Params.lead_out_silence)

                if not os.path.exists(ref_audio_mp3):
                    py_audio.export(ref_audio_mp3, format="mp3", parameters=["-qscale:a", "3"])

                # Convert the 16-bit samples to float32 in [-1, 1].
                py_audio_samples: np.ndarray = np.array(py_audio.get_array_of_samples()).astype(np.float32)
                py_audio_samples = py_audio_samples / (1 << (8 * 2 - 1))

                if not os.path.exists(mel_path):
                    np.save(mel_path, audio.spectrogram(py_audio_samples, True))

                print(entry, file=f)
                bar.update(bar.currval + 1)

        print(f"Records skipped (>{Params.audio_max_length / 1000:.02f}): {len(skipped_too_long):,}")
        with open(os.path.join(d, "too-long-" + fs), "w") as w:
            for entry in skipped_too_long:
                print(entry, file=w)

        print(f"Records skipped (<{Params.audio_min_length / 1000:.02f}): {len(skipped_too_short):,}")
        with open(os.path.join(d, "too-short-" + fs), "w") as w:
            for entry in skipped_too_short:
                print(entry, file=w)

        bar.finish()

    if bad_silence_count:
        print(f"Records skipped because of excessive silence: {bad_silence_count:,}")
    if fix_silence_count:
        print(f"Records altered because of excessive silence: {fix_silence_count:,}")

    # Swap the rewritten files into place, keeping a backup of the originals.
    for d, fs in files_to_solve:
        tmp = os.path.join(d, fs + "-tmp")
        dst = os.path.join(d, fs)
        bkup = os.path.join(d, fs + "-bkup")
        if os.path.exists(bkup):
            os.remove(bkup)
        os.rename(dst, bkup)
        os.rename(tmp, dst)

    sys.exit()
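# trim_silence is used above but not shown; a plausible pydub-based helper
# (the threshold is an assumption, not taken from the original code):
from pydub.silence import detect_leading_silence

def trim_silence(sound: AudioSegment, silence_threshold: float = -50.0) -> AudioSegment:
    # Drop leading silence, then do the same on the reversed audio to drop
    # trailing silence.
    start_trim = detect_leading_silence(sound, silence_threshold=silence_threshold)
    end_trim = detect_leading_silence(sound.reverse(), silence_threshold=silence_threshold)
    return sound[start_trim:len(sound) - end_trim]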