Example #1
def main():
    # NOTE: `num_jobs` is undefined in this excerpt; a plausible default
    # (assumption) so the example runs as written:
    num_jobs = min(15, os.cpu_count())
    corpus_dirs = [Path('/mnt/cfs2/asr/database/AM/aishell')]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Aishell data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('aishell manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir,
                                        output_dir=output_dir)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir='/export/corpora5/JHU/musan',
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(recordings=combine(
                part['recordings']
                for part in musan_manifests.values())).cut_into_windows(
                    10.0).filter(
                        lambda c: c.duration > 5).compute_and_store_features(
                            extractor=Fbank(),
                            storage_path=f'{output_dir}/feats_musan',
                            num_jobs=num_jobs if ex is None else 80,
                            executor=ex,
                            storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
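
Examples #2, #5, #6, and #8 call a `locate_corpus` helper that wraps the directory probing Example #1 writes out by hand. The helper itself is not shown in these excerpts; a minimal sketch, assuming it only needs to return the first existing path (and accepting both the varargs and single-tuple call styles that appear below), could look like:

import os
import sys

def locate_corpus(*corpus_dirs, msg='Please specify the corpus directory'):
    # Accept locate_corpus(p1, p2, ...) as well as locate_corpus((p1, p2), msg=...).
    if len(corpus_dirs) == 1 and isinstance(corpus_dirs[0], (tuple, list)):
        corpus_dirs = corpus_dirs[0]
    for d in corpus_dirs:
        if os.path.exists(d):
            return d
    print(msg)
    sys.exit(1)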
Example #2
def main():
    args = get_parser().parse_args()
    dataset_parts = ('dev', 'test', 'train')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(Path('/mnt/corpora/MLS_French'))
    musan_dir = locate_corpus(Path('/mnt/corpora/musan'))

    output_dir = Path('exp/data')
    print('mls manifest preparation:')
    mls_manifests = prepare_mls(corpus_dir=corpus_dir,
                                output_dir=output_dir,
                                opus=False,
                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in mls_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            mls_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(recordings=combine(
                part['recordings']
                for part in musan_manifests.values())).cut_into_windows(
                    10.0).filter(
                        lambda c: c.duration > 5).compute_and_store_features(
                            extractor=extractor,
                            storage_path=f'{output_dir}/feats_musan',
                            num_jobs=args.num_jobs if ex is None else 80,
                            executor=ex,
                            storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
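
Every example wraps feature extraction in `with get_executor() as ex`, another helper not shown in these excerpts. In the snowfall recipes it typically tries to set up a cluster-backed executor and yields None when none is available, which is why each call guards with `num_jobs=... if ex is None else 80`. A purely local stand-in, as a sketch:

import os
from concurrent.futures import ProcessPoolExecutor
from contextlib import contextmanager

@contextmanager
def get_executor():
    # Stand-in (assumption): yield a local process pool for parallel
    # feature extraction. The real helper may instead yield None, in which
    # case compute_and_store_features spins up its own local pool.
    with ProcessPoolExecutor(max_workers=os.cpu_count()) as ex:
        yield ex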
Example #3
def test_lazy_cuts_combine_split_issue():
    # Regression test: this sequence of operations should complete without raising.
    cuts = DummyManifest(CutSet, begin_id=0, end_id=1000)
    with TemporaryDirectory() as d, NamedTemporaryFile(suffix=".jsonl.gz") as f:
        cuts.to_file(f.name)
        f.flush()

        cuts_lazy = load_manifest_lazy(f.name)
        cuts_lazy = combine(cuts_lazy, cuts_lazy.perturb_speed(0.9))
        cuts_lazy.split_lazy(d, chunk_size=100)
Example #4
def test_combine_lazy(manifest_type):
    expected = DummyManifest(manifest_type, begin_id=0, end_id=200)
    with as_lazy(DummyManifest(
            manifest_type, begin_id=0, end_id=68)) as part1, as_lazy(
                DummyManifest(manifest_type, begin_id=68,
                              end_id=136)) as part2, as_lazy(
                                  DummyManifest(manifest_type,
                                                begin_id=136,
                                                end_id=200)) as part3:
        combined = combine(part1, part2, part3)
        # Equivalent under iteration
        assert list(combined) == list(expected)
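
For reference, `combine` concatenates manifests of the same type and also accepts a generator, which is how the Musan recipes above merge the 'music'/'speech'/'noise' recording sets. A minimal eager-mode sketch, assuming lhotse and its test helpers are installed:

from lhotse import CutSet, combine
from lhotse.testing.dummies import DummyManifest

a = DummyManifest(CutSet, begin_id=0, end_id=10)
b = DummyManifest(CutSet, begin_id=10, end_id=20)
both = combine(a, b)  # combine(manifest_generator) works as well
assert [c.id for c in both] == [c.id for c in a] + [c.id for c in b]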
Example #5
File: prepare.py, Project: zhu-han/snowfall
def main():
    # NOTE: `num_jobs` is undefined in this excerpt; a plausible default
    # (assumption) so the example runs as written:
    num_jobs = min(15, os.cpu_count())
    corpus_dir = locate_corpus(
        (Path('/mnt/cfs2/asr/database/AM/aishell'),
         Path('/root/fangjun/data/aishell'),
         Path(
             '/home/storage04/zhuangweiji/data/open-source-data/SLR33-aishell/data'
         )),
        msg='Please specify the directory to the AIShell dataset')

    musan_dir = locate_corpus(
        (Path('/export/corpora5/JHU/musan'),
         Path('/export/common/data/corpora/MUSAN/musan'),
         Path('/root/fangjun/data/musan')),
        msg='Please specify the directory to the MUSAN dataset')

    output_dir = Path('exp/data')
    print('aishell manifest preparation:')
    aishell_manifests = prepare_aishell(
        corpus_dir=corpus_dir,
        output_dir=output_dir
    )

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(
        corpus_dir=musan_dir,
        output_dir=output_dir,
        parts=('music', 'speech', 'noise')
    )

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions']
            )
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer
            )
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(part['recordings'] for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(lambda c: c.duration > 5).compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer
            )
            musan_cuts.to_json(musan_cuts_path)
Example #6
File: prepare.py, Project: zhu-han/snowfall
def main():
    args = get_parser().parse_args()
    if args.full_libri:
        dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                         'train-clean-100', 'train-clean-360',
                         'train-other-500')
    else:
        dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                         'train-clean-100')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path('/export/corpora5/LibriSpeech'),
        Path(
            '/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'
        ), Path('/root/fangjun/data/librispeech/LibriSpeech'),
        Path('/export/common/data/corpora/ASR/openslr/SLR12/LibriSpeech'))
    musan_dir = locate_corpus(
        Path('/export/corpora5/JHU/musan'),
        Path('/export/common/data/corpora/MUSAN/musan'),
        Path('/root/fangjun/data/musan'),
    )

    output_dir = Path('exp/data')
    print('LibriSpeech manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(recordings=combine(
                part['recordings']
                for part in musan_manifests.values())).cut_into_windows(
                    10.0).filter(
                        lambda c: c.duration > 5).compute_and_store_features(
                            extractor=extractor,
                            storage_path=f'{output_dir}/feats_musan',
                            num_jobs=args.num_jobs if ex is None else 80,
                            executor=ex,
                            storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
Example #7
def main():
    # NOTE: `full_libri` and `num_jobs` are undefined in this excerpt;
    # plausible defaults (assumptions) so the example runs as written:
    full_libri = False
    num_jobs = min(15, os.cpu_count())
    if full_libri:
        dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100',
                         'train-clean-360', 'train-other-500')
    else:
        dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dirs = [
        Path('/export/corpora5/LibriSpeech'),
        Path(
            '/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'
        )
    ]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Librispeech data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('LibriSpeech manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir='/export/corpora5/JHU/musan',
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(recordings=combine(
                part['recordings']
                for part in musan_manifests.values())).cut_into_windows(
                    10.0).filter(
                        lambda c: c.duration > 5).compute_and_store_features(
                            extractor=Fbank(),
                            storage_path=f'{output_dir}/feats_musan',
                            num_jobs=num_jobs if ex is None else 80,
                            executor=ex,
                            storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
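
Once a prepare script like this has run, the stored manifests reference the lilcom-compressed feature archives and can be read back directly. A usage sketch (the path below is one this example would produce, assuming it completed):

from lhotse import CutSet

cuts = CutSet.from_json('exp/data/cuts_train-clean-100.json.gz')
cut = next(iter(cuts))
feats = cut.load_features()  # numpy array of shape (num_frames, num_features)
print(cut.id, feats.shape)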
Example #8
def main():
    args = get_parser().parse_args()
    dataset_parts = [args.subset, "DEV", "TEST"]

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path("/export/corpora5/gigaspeech"),
        Path("/exp/pzelasko/gigaspeech"),
    )
    musan_dir = locate_corpus(
        Path("/export/corpora5/JHU/musan"),
        Path("/export/common/data/corpora/MUSAN/musan"),
        Path("/root/fangjun/data/musan"),
    )

    output_dir = Path("exp/data")
    print("GigaSpeech manifest preparation:")
    gigaspeech_manifests = prepare_gigaspeech(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=args.num_jobs,
    )

    print("Musan manifest preparation:")
    musan_cuts_path = output_dir / "cuts_musan.json.gz"
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=("music", "speech", "noise"))

    ctx_suffix = get_context_suffix(args)

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in gigaspeech_manifests.items():
            raw_cuts_path = output_dir / f"gigaspeech_cuts_{partition}_raw.jsonl.gz"
            cuts_path = (output_dir /
                         f"gigaspeech_cuts_{partition}{ctx_suffix}.jsonl.gz")

            if raw_cuts_path.is_file():
                print(
                    f"{partition} already exists - skipping feature extraction."
                )
            else:
                # Note: this step makes the recipe different from LibriSpeech:
                # we must filter out some utterances and remove punctuation to stay consistent with Kaldi.
                print("Filtering OOV utterances from supervisions")
                manifests["supervisions"] = manifests["supervisions"].filter(
                    has_no_oov)
                print("Normalizing text in", partition)
                for sup in manifests["supervisions"]:
                    sup.text = normalize_text(sup.text)

                # Create long-recording cut manifests.
                print("Processing", partition)
                cut_set = CutSet.from_manifests(
                    recordings=manifests["recordings"],
                    supervisions=manifests["supervisions"],
                )

                # Run data augmentation that needs to be done in the time domain.
                if partition not in ["DEV", "TEST"]:
                    cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                               cut_set.perturb_speed(1.1))

                cut_set.to_file(raw_cuts_path)

            if cuts_path.is_file():
                print(
                    f"{partition} already exists - skipping cutting into sub-segments."
                )
            else:
                try:
                    # If we skipped initializing `cut_set` because it exists on disk, we'll load it.
                    # This helps us avoid re-computing the features for different variants of
                    # context windows.
                    cut_set
                except NameError:
                    print(f"Reading {partition} raw cuts from disk.")
                    cut_set = CutSet.from_file(raw_cuts_path)
                # Note: this step makes the recipe different from LibriSpeech:
                # since the recordings are long, the initial CutSet has very long cuts with plenty of supervisions.
                # We cut these into smaller chunks centered around each supervision, possibly adding acoustic
                # context.
                print(
                    f"About to split {partition} raw cuts into smaller chunks."
                )
                cut_set = cut_set.trim_to_supervisions(
                    keep_overlapping=False,
                    min_duration=None
                    if args.context_window <= 0.0 else args.context_window,
                    context_direction=args.context_direction,
                )
                if partition in ["L", "XL"]:
                    # Before storing the manifests, we want to pre-shuffle them,
                    # as the sampler won't be able to do it later in an efficient manner.
                    cut_set = cut_set.shuffle()

                if args.precomputed_features:
                    # Extract the features after cutting large recordings into smaller cuts.
                    # Note: we support very efficient "chunked" feature reads with the argument
                    #       `storage_type=ChunkedLilcomHdf5Writer`, but we don't support efficient
                    #       data augmentation and feature computation for long recordings yet.
                    #       Therefore, we sacrifice some storage for the ability to precompute
                    #       features on shorter chunks, without memory blow-ups.
                    cut_set = cut_set.compute_and_store_features(
                        extractor=extractor,
                        storage_path=
                        f"{output_dir}/feats_gigaspeech_{partition}",
                        # when an executor is specified, make more partitions
                        num_jobs=args.num_jobs if ex is None else 80,
                        executor=ex,
                    )

                cut_set.to_file(cuts_path)

                # Remove cut_set so the next iteration can correctly infer whether it needs to
                # load the raw cuts from disk or not.
                del cut_set

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print("Extracting features for Musan")
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (CutSet.from_manifests(recordings=combine(
                part["recordings"]
                for part in musan_manifests.values())).cut_into_windows(
                    10.0).filter(
                        lambda c: c.duration > 5).compute_and_store_features(
                            extractor=extractor,
                            storage_path=f"{output_dir}/feats_musan",
                            num_jobs=args.num_jobs if ex is None else 80,
                            executor=ex,
                            storage_type=LilcomHdf5Writer,
                        ))
            musan_cuts.to_file(musan_cuts_path)
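
Unlike the earlier recipes, this one writes `.jsonl.gz` manifests, which lhotse can reopen lazily, so even the XL subset streams from disk instead of being loaded into memory. A usage sketch (the filename is hypothetical, following the raw-cuts pattern above):

from lhotse import load_manifest_lazy

cuts = load_manifest_lazy('exp/data/gigaspeech_cuts_DEV_raw.jsonl.gz')
total = sum(cut.duration for cut in cuts)  # streams one cut at a time
print(f'{total / 3600:.1f} hours')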