예제 #1
0
파일: get_data.py 프로젝트: quuhua911/NeMo
def __process_data(dataset_path, stat_path, min_duration, max_duration,
                   val_size, test_size, seed_for_ds_split):
    """Build train/val/test JSON-lines manifests for a German TTS dataset.

    Reads a pipe-separated stats file, normalizes each transcript, keeps
    entries within the duration bounds, shuffles deterministically, and
    writes three manifest files under ``dataset_path``.

    Args:
        dataset_path: pathlib.Path to the dataset root directory.
        stat_path: path to the pipe-separated stats file (first line is a
            header and is skipped).
        min_duration: minimum clip duration in seconds (inclusive).
        max_duration: maximum clip duration in seconds (inclusive).
        val_size: number of entries reserved for the validation split.
        test_size: number of entries reserved for the test split.
        seed_for_ds_split: seed making the shuffle (and split) reproducible.

    Raises:
        ValueError: if no entries remain for the train split.
    """
    # Create the German text normalizer; results are cached on disk so
    # repeated runs do not rebuild the grammar.
    text_normalizer = Normalizer(
        lang="de",
        input_case="cased",
        overwrite_cache=True,
        cache_dir=str(dataset_path / "cache_dir"),
    )
    text_normalizer_call_kwargs = {
        "punct_pre_process": True,
        "punct_post_process": True
    }

    # Named function instead of a lambda assignment (PEP 8 E731).
    def normalizer_call(text):
        return text_normalizer.normalize(text, **text_normalizer_call_kwargs)

    entries = []
    with open(stat_path, encoding="utf-8") as f:
        # Skip the header line.
        f.readline()
        for line in tqdm(f):
            file_stem, duration, *_, text = line.strip().split("|")
            duration = float(duration)

            # file_stem -> dir_name (e.g. maerchen_01_f000051 -> maerchen,
            # ber_psychoanalyse_01_f000046 -> ber_psychoanalyse)
            dir_name = "_".join(file_stem.split("_")[:-2])
            audio_path = dataset_path / dir_name / "wavs" / f"{file_stem}.wav"

            if min_duration <= duration <= max_duration:
                entries.append({
                    'audio_filepath': str(audio_path),
                    'duration': duration,
                    'text': text,
                    'normalized_text': normalizer_call(text),
                })

    # Deterministic shuffle so the split is reproducible across runs.
    random.Random(seed_for_ds_split).shuffle(entries)
    train_size = len(entries) - val_size - test_size

    # Raise instead of assert: asserts are stripped under `python -O`.
    if train_size <= 0:
        raise ValueError("Not enough data for train, val and test")

    def save(p, data):
        # One JSON object per line (NeMo manifest format).
        with open(p, 'w', encoding="utf-8") as f:
            for d in data:
                f.write(json.dumps(d) + '\n')

    save(dataset_path / "train_manifest.json", entries[:train_size])
    save(dataset_path / "val_manifest.json",
         entries[train_size:train_size + val_size])
    save(dataset_path / "test_manifest.json", entries[train_size + val_size:])
예제 #2
0
파일: get_data.py 프로젝트: quuhua911/NeMo
def __process_data(data_root, whitelist_path):
    """Create LJSpeech train/val/test manifests from NVIDIA's filelists.

    Downloads the normalization whitelist and the per-split filelists when
    they are missing, normalizes every transcript, and writes one
    JSON-lines manifest per split under ``data_root``.

    Args:
        data_root: pathlib.Path to the extracted LJSpeech directory.
        whitelist_path: path to a normalization whitelist TSV, or None to
            download NVIDIA's default LJSpeech whitelist.

    Raises:
        FileNotFoundError: if a wav referenced by a filelist is missing.
    """
    if whitelist_path is None:
        # Fetch NVIDIA's default whitelist next to the dataset.
        wget.download(
            "https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv",
            out=str(data_root),
        )
        whitelist_path = data_root / "whitelist_lj_speech.tsv"

    text_normalizer = Normalizer(
        lang="en",
        input_case="cased",
        whitelist=whitelist_path,
        overwrite_cache=True,
        cache_dir=data_root / "cache_dir",
    )
    text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}

    # Named function instead of a lambda assignment (PEP 8 E731).
    def normalizer_call(text):
        return text_normalizer.normalize(text, **text_normalizer_call_kwargs)

    # Create manifests (based on predefined NVIDIA's split)
    filelists = ['train', 'val', 'test']
    for split in tqdm(filelists):
        # Download the filelist for this split if it is not cached locally.
        filelist_path = data_root / f"ljs_audio_text_{split}_filelist.txt"
        if not filelist_path.exists():
            wget.download(f"{FILELIST_BASE}/ljs_audio_text_{split}_filelist.txt", out=str(data_root))

        manifest_target = data_root / f"{split}_manifest.json"
        with open(manifest_target, 'w', encoding="utf-8") as f_out, \
                open(filelist_path, 'r', encoding="utf-8") as filelist:
            print(f"\nCreating {manifest_target}...")
            for line in tqdm(filelist):
                # Filelist rows appear to be "<6-char prefix><10-char
                # basename>.wav|<text>"; the fixed slices below pick out
                # the basename and the transcript — TODO confirm format.
                basename = line[6:16]
                text = line[21:].strip()
                norm_text = normalizer_call(text)

                # Fail loudly if the referenced wav is absent; a plain
                # assert would be stripped under `python -O`.
                wav_path = data_root / 'wavs' / f"{basename}.wav"
                if not wav_path.exists():
                    raise FileNotFoundError(f"{wav_path} does not exist!")

                entry = {
                    'audio_filepath': str(wav_path),
                    'duration': sox.file_info.duration(wav_path),
                    'text': text,
                    'normalized_text': norm_text,
                }
                f_out.write(json.dumps(entry) + '\n')
예제 #3
0
def main():
    """Prepare LJSpeech manifests.

    Optionally downloads the dataset, builds the requested text
    normalizer, then writes one JSON-lines manifest per predefined
    NVIDIA split (train/val/test), optionally saving normalized
    transcripts as .txt files next to the wavs.

    Raises:
        ValueError: if ``--normalizer_class`` is not a supported value.
        FileNotFoundError: if a wav referenced by a filelist is missing.
    """
    args = get_args()
    ljspeech_dir = args.ljspeech_dir

    # Download LJSpeech dataset if needed; the archive unpacks into a
    # "LJSpeech-1.1" subdirectory.
    if args.download_ljspeech:
        get_lj_speech(args.ljspeech_dir)
        ljspeech_dir = os.path.join(args.ljspeech_dir, "LJSpeech-1.1")

    # Build the text-normalization callable.
    if args.normalizer_class == "ENCharParser":
        normalizer_call = parsers.make_parser(name='en')._normalize
    elif args.normalizer_class == "Normalizer":
        whitelist_path = args.whitelist_path

        if whitelist_path is None:
            # Fetch NVIDIA's default LJSpeech whitelist.
            wget.download(
                "https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/text_normalization/en/data/whitelist_lj_speech.tsv",
                out=ljspeech_dir,
            )
            whitelist_path = os.path.join(ljspeech_dir,
                                          "whitelist_lj_speech.tsv")

        text_normalizer = Normalizer(
            lang="en",
            input_case="cased",
            whitelist=whitelist_path,
            overwrite_cache=True,
            cache_dir=os.path.join(ljspeech_dir, "cache_dir"),
        )
        text_normalizer_call_kwargs = {
            "punct_pre_process": True,
            "punct_post_process": True
        }

        # Named function instead of a lambda assignment (PEP 8 E731).
        def normalizer_call(text):
            return text_normalizer.normalize(text,
                                             **text_normalizer_call_kwargs)
    else:
        raise ValueError("normalizer_class must be ENCharParser or Normalizer")

    # Create manifests (based on predefined NVIDIA's split) and optionally
    # save transcripts in .txt files.
    filelist_base = 'https://raw.githubusercontent.com/NVIDIA/tacotron2/master/filelists'
    for split in ['train', 'val', 'test']:
        # Download the filelist for this split if necessary.
        filelist_path = os.path.join(ljspeech_dir,
                                     f"ljs_audio_text_{split}_filelist.txt")
        if not os.path.exists(filelist_path):
            wget.download(
                f"{filelist_base}/ljs_audio_text_{split}_filelist.txt",
                out=ljspeech_dir)

        manifest_target = os.path.join(ljspeech_dir, f"ljspeech_{split}.json")
        with open(manifest_target, 'w', encoding='utf-8') as f_out, \
                open(filelist_path, 'r', encoding='utf-8') as filelist:
            print(f"\nCreating {manifest_target}...")
            for line in filelist:
                # Filelist rows appear to be "<6-char prefix><10-char
                # basename>.wav|<text>"; the fixed slices below pick out
                # the basename and the transcript — TODO confirm format.
                basename = line[6:16]
                text = line[21:].strip()
                norm_text = normalizer_call(text)

                # Fail loudly if the referenced wav is missing; a bare
                # assert would be stripped under `python -O`.
                wav_path = os.path.join(ljspeech_dir, 'wavs',
                                        basename + '.wav')
                if not os.path.exists(wav_path):
                    raise FileNotFoundError(f"{wav_path} does not exist")

                if args.save_transcripts_in_txt:
                    txt_path = os.path.join(ljspeech_dir, 'wavs',
                                            basename + '.txt')
                    with open(txt_path, 'w', encoding='utf-8') as f_txt:
                        f_txt.write(norm_text)

                # Write manifest entry
                entry = {
                    'audio_filepath': wav_path,
                    'duration': sox.file_info.duration(wav_path),
                    'text': (norm_text
                             if args.manifest_text_var_is_normalized
                             else text),
                    'normalized_text': norm_text,
                }
                f_out.write(json.dumps(entry) + '\n')