Example #1
def process(args):
    root = op.join(args.data_root, args.src_lang)
    os.makedirs(root, exist_ok=True)
    # Extract features
    feature_root = op.join(root, 'fbank80')
    os.makedirs(feature_root, exist_ok=True)
    for split in CoVoST.SPLITS:
        print(f'Fetching split {split}...')
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang,
                         download=True)
        print('Extracting log mel filter bank features...')
        for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
            extract_fbank_features(waveform, sample_rate,
                                   op.join(feature_root, f'{utt_id}.npy'))
    # Pack features into ZIP
    zip_filename = 'fbank80.zip'
    zip_path = op.join(root, zip_filename)
    print('ZIPing features...')
    create_zip(feature_root, zip_path)
    print('Fetching ZIP manifest...')
    zip_manifest = get_zip_manifest(args.data_root,
                                    f'{args.src_lang}/{zip_filename}')
    # Generate TSV manifest
    print('Generating manifest...')
    train_text = []
    task = f'asr_{args.src_lang}'
    if args.tgt_lang is not None:
        task = f'st_{args.src_lang}_{args.tgt_lang}'
    for split in CoVoST.SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
            manifest['id'].append(utt_id)
            manifest['audio'].append(zip_manifest[utt_id])
            duration_ms = int(wav.size(1) / sr * 1000)
            manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
            manifest['tgt_text'].append(
                src_utt if args.tgt_lang is None else tgt_utt
            )
            manifest['speaker'].append(speaker_id)
        is_train_split = split.startswith('train')
        if is_train_split:
            train_text.extend(manifest['tgt_text'])
        df = pd.DataFrame.from_dict(manifest)
        df = filter_manifest_df(df, is_train_split=is_train_split)
        save_df_to_tsv(df, op.join(root, f'{split}_{task}.tsv'))
    # Generate vocab
    vocab_size_str = '' if args.vocab_type == 'char' else str(args.vocab_size)
    spm_filename_prefix = f'spm_{args.vocab_type}{vocab_size_str}_{task}'
    with NamedTemporaryFile(mode='w') as f:
        for t in train_text:
            f.write(t + '\n')
        gen_vocab(f.name, op.join(root, spm_filename_prefix),
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(root, spm_filename_prefix + '.model',
                    yaml_filename=f'config_{task}.yaml',
                    specaugment_policy='lb')
    # Clean up
    shutil.rmtree(feature_root)
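
Note: the expression int(1 + (duration_ms - 25) / 10), used here and in several later examples, estimates the number of filter bank frames assuming extract_fbank_features uses the common 25 ms window with a 10 ms shift. A minimal sketch of that formula under those assumptions (names are hypothetical):

def estimate_n_frames(duration_ms, win_ms=25, shift_ms=10):
    # One frame for the first full window, plus one per additional 10 ms shift.
    return int(1 + (duration_ms - win_ms) / shift_ms)

assert estimate_n_frames(1025) == 101  # a 1.025 s clip -> 101 frames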
Example #2
def process(args):
    root = Path(args.data_root).absolute() / args.src_lang
    if not root.is_dir():
        raise NotADirectoryError(f"{root} does not exist")
    # Extract features
    feature_root = root / "fbank80"
    feature_root.mkdir(exist_ok=True)
    for split in CoVoST.SPLITS:
        print(f"Fetching split {split}...")
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        print("Extracting log mel filter bank features...")
        for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
            extract_fbank_features(waveform, sample_rate,
                                   feature_root / f"{utt_id}.npy")
    # Pack features into ZIP
    zip_path = root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    zip_manifest = get_zip_manifest(zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    task = f"asr_{args.src_lang}"
    if args.tgt_lang is not None:
        task = f"st_{args.src_lang}_{args.tgt_lang}"
    for split in CoVoST.SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
            manifest["id"].append(utt_id)
            manifest["audio"].append(zip_manifest[utt_id])
            duration_ms = int(wav.size(1) / sr * 1000)
            manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
            manifest["tgt_text"].append(
                src_utt if args.tgt_lang is None else tgt_utt)
            manifest["speaker"].append(speaker_id)
        is_train_split = split.startswith("train")
        if is_train_split:
            train_text.extend(manifest["tgt_text"])
        df = pd.DataFrame.from_dict(manifest)
        df = filter_manifest_df(df, is_train_split=is_train_split)
        save_df_to_tsv(df, root / f"{split}_{task}.tsv")
    # Generate vocab
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{task}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        gen_vocab(Path(f.name), root / spm_filename_prefix, args.vocab_type,
                  args.vocab_size)
    # Generate config YAML
    gen_config_yaml(
        root,
        spm_filename_prefix + ".model",
        yaml_filename=f"config_{task}.yaml",
        specaugment_policy="lb",
    )
    # Clean up
    shutil.rmtree(feature_root)
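
Both examples above fill a dict keyed by MANIFEST_COLUMNS, which is defined elsewhere in these scripts. A definition consistent with the columns the loops append to would be:

MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"]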
Example #3
def process(args):
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)
    # Extract features
    feature_root = out_root / "fbank80"
    feature_root.mkdir(exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            extract_fbank_features(
                wav, sample_rate, feature_root / f"{sample_id}.npy"
            )
    # Pack features into ZIP
    zip_path = out_root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(out_root.as_posix(), url=split)
        for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(audio_paths[sample_id])
            manifest["n_frames"].append(audio_lengths[sample_id])
            manifest["tgt_text"].append(utt.lower())
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv"
        )
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        gen_vocab(
            Path(f.name),
            out_root / spm_filename_prefix,
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(
        out_root,
        spm_filename=spm_filename_prefix + ".model",
        specaugment_policy="ld"
    )
    # Clean up
    shutil.rmtree(feature_root)
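
Unlike Examples #1 and #2, this variant never recomputes durations: its newer get_zip_manifest returns two dicts that the manifest reuses directly. The assumed contents, per sample_id:

# audio_paths[sample_id]   -> location string into the ZIP,
#                             e.g. "fbank80.zip:<offset>:<length>" (assumed format)
# audio_lengths[sample_id] -> length of the stored features, in frames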
Example #4
def process(args):
    os.makedirs(args.output_root, exist_ok=True)
    # Extract features
    feature_root = op.join(args.output_root, "fbank80")
    os.makedirs(feature_root, exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(args.output_root, url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
            extract_fbank_features(wav, sample_rate,
                                   op.join(feature_root, f"{sample_id}.npy"))
    # Pack features into ZIP
    zip_filename = "fbank80.zip"
    zip_path = op.join(args.output_root, zip_filename)
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(args.output_root, url=split)
        for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(zip_manifest[sample_id])
            duration_ms = int(wav.size(1) / sample_rate * 1000)
            manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
            manifest["tgt_text"].append(utt)
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                       op.join(args.output_root, f"{split}.tsv"))
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        gen_vocab(
            f.name,
            op.join(args.output_root, spm_filename_prefix),
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(args.output_root,
                    spm_filename_prefix + ".model",
                    specaugment_policy="ld")
    # Clean up
    shutil.rmtree(feature_root)
Example #5
def manifest_preparation(manifest, track, data, tgt_text, track_path):
    waveform, sample_rate = torchaudio.load(track_path)
    utt_id = data[1].removesuffix(".flac")
    # `feature_root` is assumed to be a module-level Path to the feature dir.
    extract_fbank_features(waveform, sample_rate,
                           feature_root / f"{utt_id}.npy")
    manifest["id"].append(utt_id)
    manifest["audio"].append((feature_root / f"{utt_id}.npy").as_posix())
    duration_ms = track.duration_seconds * 1000  # seconds -> milliseconds
    manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
    manifest["tgt_text"].append(tgt_text)
    manifest["speaker"].append(data[0])
Example #6
def process(args):
    os.makedirs(args.output_root, exist_ok=True)
    # Extract features
    feature_root = op.join(args.output_root, 'fbank80')
    os.makedirs(feature_root, exist_ok=True)
    for split in SPLITS:
        print(f'Fetching split {split}...')
        dataset = LIBRISPEECH(args.output_root, url=split, download=True)
        print('Extracting log mel filter bank features...')
        for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f'{spk_id}-{chapter_id}-{utt_id}'
            extract_fbank_features(wav, sample_rate,
                                   op.join(feature_root, f'{sample_id}.npy'))
    # Pack features into ZIP
    zip_filename = 'fbank80.zip'
    zip_path = op.join(args.output_root, zip_filename)
    print('ZIPing features...')
    create_zip(feature_root, zip_path)
    print('Fetching ZIP manifest...')
    zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print('Generating manifest...')
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(args.output_root, url=split)
        for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f'{spk_id}-{chapter_id}-{utt_id}'
            manifest['id'].append(sample_id)
            manifest['audio'].append(zip_manifest[sample_id])
            duration_ms = int(wav.size(1) / sample_rate * 1000)
            manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
            manifest['tgt_text'].append(utt)
            manifest['speaker'].append(spk_id)
        save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                       op.join(args.output_root, f'{split}.tsv'))
        if split.startswith('train'):
            train_text.extend(manifest['tgt_text'])
    # Generate vocab
    vocab_size = '' if args.vocab_type == 'char' else str(args.vocab_size)
    spm_filename_prefix = f'spm_{args.vocab_type}{vocab_size}'
    with NamedTemporaryFile(mode='w') as f:
        for t in train_text:
            f.write(t + '\n')
        gen_vocab(f.name, op.join(args.output_root, spm_filename_prefix),
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(args.output_root,
                    spm_filename_prefix + '.model',
                    specaugment_policy='ld')
    # Clean up
    shutil.rmtree(feature_root)
Example #7
def add_data_to_manifest(manifest, track_path_segment, data, counter,
                         track_segment, tr):
    waveform, sample_rate = torchaudio.load(track_path_segment)
    feature_root = Path(
        "../speech_translation/data/sound").absolute() / "fbank"
    utt_id = data[0] + "_" + str(counter)
    extract_fbank_features(waveform, sample_rate,
                           feature_root / f"{utt_id}.npy")
    manifest["id"].append(utt_id)
    manifest["audio"].append(feature_root / f"{utt_id}.npy")
    duration_ms = track_segment.duration_seconds * ms
    manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
    manifest["tgt_text"].append(tr)
    manifest["speaker"].append(21918)
Example #8
def process(args):
    for lang in MUSTC.LANGUAGES:
        cur_root = op.join(args.data_root, f"en-{lang}")
        if not op.isdir(cur_root):
            print(f"{cur_root} does not exist. Skipped.")
            continue
        # Extract features
        feature_root = op.join(cur_root, "fbank80")
        os.makedirs(feature_root, exist_ok=True)
        for split in MUSTC.SPLITS:
            print(f"Fetching split {split}...")
            dataset = MUSTC(args.data_root, lang, split)
            print("Extracting log mel filter bank features...")
            for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                extract_fbank_features(waveform, sample_rate,
                                       op.join(feature_root, f"{utt_id}.npy"))
        # Pack features into ZIP
        zip_filename = "fbank80.zip"
        zip_path = op.join(cur_root, zip_filename)
        print("ZIPing features...")
        create_zip(feature_root, zip_path)
        print("Fetching ZIP manifest...")
        zip_manifest = get_zip_manifest(args.data_root,
                                        f"en-{lang}/{zip_filename}")
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            dataset = MUSTC(args.data_root, lang, split)
            for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest["id"].append(utt_id)
                manifest["audio"].append(zip_manifest[utt_id])
                duration_ms = int(wav.size(1) / sr * 1000)
                manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
                manifest["tgt_text"].append(src_utt if args.task ==
                                            "asr" else tgt_utt)
                manifest["speaker"].append(speaker_id)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, op.join(cur_root, f"{split}_{args.task}.tsv"))
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            gen_vocab(
                f.name,
                op.join(cur_root, spm_filename_prefix),
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        gen_config_yaml(
            cur_root,
            spm_filename_prefix + ".model",
            yaml_filename=f"config_{args.task}.yaml",
            specaugment_policy="lb",
        )
        # Clean up
        shutil.rmtree(feature_root)
Example #9
def process(args):
    root = Path(args.data_root).absolute()
    for lang in MUSTC.LANGUAGES:
        cur_root = root / f"en-{lang}"
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)

        for split in MUSTC.SPLITS:
            print(f"Fetching split {split}...")
            dataset = MUSTC(root.as_posix(), lang, split)
            if args.use_audio_input:
                print("Converting audios...")
                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _wavform, _ = convert_waveform(
                        waveform,
                        sample_rate,
                        to_mono=True,
                        to_sample_rate=tgt_sample_rate)
                    sf.write((audio_root / f"{utt_id}.flac").as_posix(),
                             _wavform.T.numpy(), tgt_sample_rate)
            else:
                print("Extracting log mel filter bank features...")
                gcmvn_feature_list = []
                if split == 'train' and args.cmvn_type == "global":
                    print("And estimating cepstral mean and variance stats...")

                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    features = extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy")
                    if split == 'train' and args.cmvn_type == "global":
                        if len(gcmvn_feature_list) < args.gcmvn_max_num:
                            gcmvn_feature_list.append(features)

                if split == 'train' and args.cmvn_type == "global":
                    # Estimate and save cmv
                    stats = cal_gcmvn_stats(gcmvn_feature_list)
                    with open(cur_root / "gcmvn.npz", "wb") as f:
                        np.savez(f, mean=stats["mean"], std=stats["std"])

        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        audio_paths, audio_lengths = get_zip_manifest(
            zip_path,
            is_audio=args.use_audio_input,
        )
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            dataset = MUSTC(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                manifest["tgt_text"].append(src_utt if args.task ==
                                            "asr" else tgt_utt)
                manifest["speaker"].append(speaker_id)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            gen_vocab(
                Path(f.name),
                cur_root / spm_filename_prefix,
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        if args.use_audio_input:
            gen_config_yaml(cur_root,
                            spm_filename=spm_filename_prefix + ".model",
                            yaml_filename=f"config_{args.task}.yaml",
                            specaugment_policy=None,
                            extra={"use_audio_input": True})
        else:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy="lb",
                cmvn_type=args.cmvn_type,
                gcmvn_path=(cur_root / "gcmvn.npz"
                            if args.cmvn_type == "global" else None),
            )
        # Clean up
        shutil.rmtree(audio_root)
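
With --cmvn-type global, this variant pools training features and saves per-dimension statistics to gcmvn.npz. A minimal sketch of what cal_gcmvn_stats is assumed to compute (the real helper may subsample or stream):

import numpy as np

def cal_gcmvn_stats_sketch(feature_list):
    # Stack the per-utterance (n_frames, n_mels) matrices and reduce over frames.
    features = np.concatenate(feature_list)
    return {"mean": features.mean(axis=0), "std": features.std(axis=0)}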
Example #10
def process(args):
    root = Path(args.data_root).absolute()
    for lang in mTEDx.LANGPAIRS:
        cur_root = root / f"{lang}"
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)
        for split in mTEDx.SPLITS:
            print(f"Fetching split {split}...")
            dataset = mTEDx(root.as_posix(), lang, split)
            if args.use_audio_input:
                print("Converting audios...")
                for waveform, sample_rate, _, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _wavform, _ = convert_waveform(
                        waveform, sample_rate, to_mono=True,
                        to_sample_rate=tgt_sample_rate
                    )
                    sf.write(
                        (audio_root / f"{utt_id}.flac").as_posix(),
                        _wavform.T.numpy(), tgt_sample_rate
                    )
            else:
                print("Extracting log mel filter bank features...")
                for waveform, sample_rate, _, _, _, _, utt_id in tqdm(dataset):
                    extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy"
                    )
        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        audio_paths, audio_lengths = get_zip_manifest(zip_path)
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in mTEDx.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            ds = mTEDx(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, spk_id, tgt_lang, utt_id in tqdm(ds):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                manifest["tgt_text"].append(
                    src_utt if args.task == "asr" else tgt_utt
                )
                manifest["speaker"].append(spk_id)
                manifest["tgt_lang"].append(tgt_lang)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            gen_vocab(
                Path(f.name),
                cur_root / spm_filename_prefix,
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        if args.use_audio_input:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy=None,
                extra={"use_audio_input": True}
            )
        else:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy="lb",
            )
        # Clean up
        shutil.rmtree(audio_root)
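
This multilingual variant also records a tgt_lang field per utterance, so its MANIFEST_COLUMNS must carry one extra column relative to the earlier examples, presumably:

MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker", "tgt_lang"]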
Example #11
def process(args):
    for lang in MUSTC.LANGUAGES:
        cur_root = op.join(args.data_root, f'en-{lang}')
        if not op.isdir(cur_root):
            print(f'{cur_root} does not exist. Skipped.')
            continue
        # Extract features
        feature_root = op.join(cur_root, 'fbank80')
        os.makedirs(feature_root, exist_ok=True)
        for split in MUSTC.SPLITS:
            print(f'Fetching split {split}...')
            dataset = MUSTC(args.data_root, lang, split)
            print('Extracting log mel filter bank features...')
            for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                extract_fbank_features(waveform, sample_rate,
                                       op.join(feature_root, f'{utt_id}.npy'))
        # Pack features into ZIP
        zip_filename = 'fbank80.zip'
        zip_path = op.join(cur_root, zip_filename)
        print('ZIPing features...')
        create_zip(feature_root, zip_path)
        print('Fetching ZIP manifest...')
        zip_manifest = get_zip_manifest(args.data_root,
                                        f'en-{lang}/{zip_filename}')
        # Generate TSV manifest
        print('Generating manifest...')
        train_text = {task: [] for task in TASKS}
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith('train')
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            text = {task: [] for task in TASKS}
            dataset = MUSTC(args.data_root, lang, split)
            for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest['id'].append(utt_id)
                manifest['audio'].append(zip_manifest[utt_id])
                duration_ms = int(wav.size(1) / sr * 1000)
                manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
                text['asr'].append(src_utt)
                text['st'].append(tgt_utt)
                manifest['speaker'].append(speaker_id)
            if is_train_split:
                for task in TASKS:
                    train_text[task].extend(text[task])
            for task in TASKS:
                manifest['tgt_text'] = text[task]
                df = pd.DataFrame.from_dict(manifest)
                df = filter_manifest_df(df, is_train_split=is_train_split)
                save_df_to_tsv(df, op.join(cur_root, f'{split}_{task}.tsv'))
        # Generate vocab
        for task in TASKS:
            vocab_type, vocab_size = args.asr_vocab_type, args.asr_vocab_size
            if task == 'st':
                vocab_type, vocab_size = args.st_vocab_type, args.st_vocab_size
            vocab_size_str = '' if vocab_type == 'char' else str(vocab_size)
            spm_filename_prefix = f'spm_{vocab_type}{vocab_size_str}_{task}'
            with NamedTemporaryFile(mode='w') as f:
                for t in train_text[task]:
                    f.write(t + '\n')
                gen_vocab(f.name, op.join(cur_root, spm_filename_prefix),
                          vocab_type, vocab_size)
            # Generate config YAML
            gen_config_yaml(cur_root,
                            spm_filename_prefix + '.model',
                            yaml_filename=f'config_{task}.yaml',
                            specaugment_policy='lb')
        # Clean up
        shutil.rmtree(feature_root)
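
This last variant extracts features once and then emits paired ASR and ST manifests and vocabularies, iterating over a TASKS constant that, given the keys used above, is presumably:

TASKS = ["asr", "st"]

Only tgt_text differs between the two manifests per split: the shared manifest dict is reused, swapping the source transcript for the translation.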