def __process_data(dataset_path, stat_path, min_duration, max_duration, val_size, test_size, seed_for_ds_split):
    """Build train/val/test JSON-lines manifests for the German dataset.

    Reads the pipe-separated stats file at ``stat_path``, normalizes each
    transcript with a German NeMo ``Normalizer``, keeps entries whose duration
    lies in ``[min_duration, max_duration]``, shuffles them deterministically,
    and writes ``train_manifest.json`` / ``val_manifest.json`` /
    ``test_manifest.json`` under ``dataset_path``.

    Args:
        dataset_path: Root ``Path`` of the dataset; manifests are written here.
        stat_path: Path to the '|'-separated stats file (first line is a header).
        min_duration: Minimum clip duration (seconds) to keep, inclusive.
        max_duration: Maximum clip duration (seconds) to keep, inclusive.
        val_size: Number of entries reserved for the validation split.
        test_size: Number of entries reserved for the test split.
        seed_for_ds_split: Seed for the reproducible shuffle before splitting.

    Raises:
        ValueError: If fewer entries remain than ``val_size + test_size``.
    """
    # Create the German text normalizer once; caching avoids re-building WFSTs.
    text_normalizer = Normalizer(
        lang="de",
        input_case="cased",
        overwrite_cache=True,
        cache_dir=str(dataset_path / "cache_dir"),
    )
    text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}

    # Named function instead of a lambda assignment (PEP 8 E731).
    def normalizer_call(text):
        return text_normalizer.normalize(text, **text_normalizer_call_kwargs)

    entries = []
    with open(stat_path) as f:
        f.readline()  # skip the header row
        for line in tqdm(f):
            file_stem, duration, *_, text = line.strip().split("|")
            duration = float(duration)
            # file_stem -> dir_name (e.g. maerchen_01_f000051 -> maerchen,
            # ber_psychoanalyse_01_f000046 -> ber_psychoanalyse)
            dir_name = "_".join(file_stem.split("_")[:-2])
            audio_path = dataset_path / dir_name / "wavs" / f"{file_stem}.wav"

            if min_duration <= duration <= max_duration:
                entries.append(
                    {
                        'audio_filepath': str(audio_path),
                        'duration': duration,
                        'text': text,
                        'normalized_text': normalizer_call(text),
                    }
                )

    # Deterministic shuffle so the split is reproducible for a given seed.
    random.Random(seed_for_ds_split).shuffle(entries)

    train_size = len(entries) - val_size - test_size
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    if train_size <= 0:
        raise ValueError("Not enough data for train, val and test")

    def save(p, data):
        # One JSON object per line (JSON-lines manifest format).
        with open(p, 'w') as f:
            for d in data:
                f.write(json.dumps(d) + '\n')

    save(dataset_path / "train_manifest.json", entries[:train_size])
    save(dataset_path / "val_manifest.json", entries[train_size:train_size + val_size])
    save(dataset_path / "test_manifest.json", entries[train_size + val_size:])
def __process_data(data_root, whitelist_path):
    """Create LJSpeech train/val/test manifests from NVIDIA's predefined split.

    Downloads the LJSpeech whitelist if none is supplied, builds an English
    NeMo ``Normalizer``, then for each split downloads the filelist if missing
    and writes ``{split}_manifest.json`` with one JSON entry per utterance.

    Args:
        data_root: Root ``Path`` of the LJSpeech dataset (contains ``wavs/``).
        whitelist_path: Path to a normalization whitelist TSV, or ``None`` to
            download NVIDIA's LJSpeech whitelist into ``data_root``.

    Raises:
        FileNotFoundError: If a wav file referenced by a filelist is missing.
    """
    if whitelist_path is None:
        wget.download(
            "https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv",
            out=str(data_root),
        )
        whitelist_path = data_root / "whitelist_lj_speech.tsv"

    text_normalizer = Normalizer(
        lang="en",
        input_case="cased",
        whitelist=whitelist_path,
        overwrite_cache=True,
        cache_dir=data_root / "cache_dir",
    )
    text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}

    # Named function instead of a lambda assignment (PEP 8 E731).
    def normalizer_call(text):
        return text_normalizer.normalize(text, **text_normalizer_call_kwargs)

    # Create manifests (based on predefined NVIDIA's split)
    filelists = ['train', 'val', 'test']
    for split in tqdm(filelists):
        # Download file list if necessary
        filelist_path = data_root / f"ljs_audio_text_{split}_filelist.txt"
        if not filelist_path.exists():
            wget.download(f"{FILELIST_BASE}/ljs_audio_text_{split}_filelist.txt", out=str(data_root))

        manifest_target = data_root / f"{split}_manifest.json"
        # Single `with` for both files instead of nested blocks.
        with open(manifest_target, 'w') as f_out, open(filelist_path, 'r') as filelist:
            print(f"\nCreating {manifest_target}...")
            for line in tqdm(filelist):
                # Fixed-position slicing: assumes a 6-char path prefix followed
                # by a 10-char basename, then '|' and the transcript — matches
                # the tacotron2 filelist layout. TODO confirm against the files.
                basename = line[6:16]
                text = line[21:].strip()
                norm_text = normalizer_call(text)

                # Make sure corresponding wavfile exists; explicit raise
                # instead of `assert`, which vanishes under `python -O`.
                wav_path = data_root / 'wavs' / f"{basename}.wav"
                if not wav_path.exists():
                    raise FileNotFoundError(f"{wav_path} does not exist!")

                entry = {
                    'audio_filepath': str(wav_path),
                    'duration': sox.file_info.duration(wav_path),
                    'text': text,
                    'normalized_text': norm_text,
                }
                f_out.write(json.dumps(entry) + '\n')
def main():
    """Entry point: optionally download LJSpeech, build a text normalizer,
    then write per-split JSON manifests (and optional ``.txt`` transcripts).

    Behavior is driven entirely by CLI arguments from ``get_args()``:
    ``download_ljspeech``, ``normalizer_class`` ("ENCharParser" or
    "Normalizer"), ``whitelist_path``, ``save_transcripts_in_txt``, and
    ``manifest_text_var_is_normalized``.

    Raises:
        ValueError: If ``normalizer_class`` is not a supported value.
        FileNotFoundError: If a wav referenced by a filelist is missing.
    """
    args = get_args()
    ljspeech_dir = args.ljspeech_dir

    # Download LJSpeech dataset if needed
    if args.download_ljspeech:
        get_lj_speech(args.ljspeech_dir)
        ljspeech_dir = os.path.join(args.ljspeech_dir, "LJSpeech-1.1")

    # Create normalizer
    if args.normalizer_class == "ENCharParser":
        normalizer_call = parsers.make_parser(name='en')._normalize
    elif args.normalizer_class == "Normalizer":
        whitelist_path = args.whitelist_path
        if whitelist_path is None:
            wget.download(
                "https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv",
                out=ljspeech_dir,
            )
            whitelist_path = os.path.join(ljspeech_dir, "whitelist_lj_speech.tsv")

        text_normalizer = Normalizer(
            lang="en",
            input_case="cased",
            whitelist=whitelist_path,
            overwrite_cache=True,
            cache_dir=os.path.join(ljspeech_dir, "cache_dir"),
        )
        text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}

        # Named function instead of a lambda assignment (PEP 8 E731).
        def normalizer_call(x):
            return text_normalizer.normalize(x, **text_normalizer_call_kwargs)
    else:
        raise ValueError("normalizer_class must be ENCharParser or Normalizer")

    # Create manifests (based on predefined NVIDIA's split) and optionally
    # save transcripts in .txt files
    filelist_base = 'https://raw.githubusercontent.com/NVIDIA/tacotron2/master/filelists'
    filelists = ['train', 'val', 'test']
    for split in filelists:
        # Download file list if necessary
        filelist_path = os.path.join(ljspeech_dir, f"ljs_audio_text_{split}_filelist.txt")
        if not os.path.exists(filelist_path):
            wget.download(f"{filelist_base}/ljs_audio_text_{split}_filelist.txt", out=ljspeech_dir)

        manifest_target = os.path.join(ljspeech_dir, f"ljspeech_{split}.json")
        with open(manifest_target, 'w') as f_out, open(filelist_path, 'r') as filelist:
            print(f"\nCreating {manifest_target}...")
            for line in filelist:
                # Fixed-position slicing assumes the tacotron2 filelist layout:
                # 6-char path prefix, 10-char basename, '|', transcript.
                # TODO confirm against the downloaded filelists.
                basename = line[6:16]
                text = line[21:].strip()
                norm_text = normalizer_call(text)

                # Make sure corresponding wavfile exists; explicit raise with a
                # message instead of a bare `assert` (stripped under -O).
                wav_path = os.path.join(ljspeech_dir, 'wavs', basename + '.wav')
                if not os.path.exists(wav_path):
                    raise FileNotFoundError(f"{wav_path} does not exist!")

                if args.save_transcripts_in_txt:
                    txt_path = os.path.join(ljspeech_dir, 'wavs', basename + '.txt')
                    with open(txt_path, 'w') as f_txt:
                        f_txt.write(norm_text)

                # Write manifest entry
                entry = {
                    'audio_filepath': wav_path,
                    'duration': sox.file_info.duration(wav_path),
                    'text': norm_text if args.manifest_text_var_is_normalized else text,
                    'normalized_text': norm_text,
                }
                f_out.write(json.dumps(entry) + '\n')