Example No. 1
def prepare_spgispeech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    normalize_text: bool = True,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text (similar to ESPNet recipe).
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case to lower case.
        This includes removing possibly important punctuation such as dashes and apostrophes.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    audio_dir = (
        corpus_dir if (corpus_dir / "train").is_dir() else corpus_dir / "spgispeech"
    )

    dataset_parts = ["train", "val"]

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe the manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        prefix="spgispeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in dataset_parts:
        logging.info(f"Processing SPGISpeech subset: {part}")
        if manifests_exist(part=part,
                           output_dir=output_dir,
                           prefix="spgispeech",
                           suffix="jsonl.gz"):
            logging.info(
                f"SPGISpeech subset: {part} already prepared - skipping.")
            continue

        # Read the recordings and write them into manifest. We additionally store the
        # duration of the recordings in a dict which will be used later to create the
        # supervisions.
        # Declared global so the nested function is picklable and can be
        # dispatched to worker processes by parallel_map.
        global audio_read_worker
        durations = {}

        def audio_read_worker(p: Path) -> Recording:
            r = Recording.from_file(p,
                                    recording_id=f"{p.parent.stem}_{p.stem}")
            durations[r.id] = r.duration
            return r

        with RecordingSet.open_writer(
            output_dir / f"spgispeech_recordings_{part}.jsonl.gz"
        ) as rec_writer:
            for recording in tqdm(
                    parallel_map(
                        audio_read_worker,
                        (audio_dir / part).rglob("*.wav"),
                        num_jobs=num_jobs,
                    ),
                    desc="Processing SPGISpeech recordings",
            ):
                rec_writer.write(recording)

        # Read supervisions and write them to manifest
        with SupervisionSet.open_writer(
                output_dir / f"spgispeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, open(corpus_dir / f"{part}.csv", "r") as f:
            # Skip the header
            next(f)
            for line in tqdm(f, desc="Processing utterances"):
                parts = line.strip().split("|")
                # 07a785e9237c389c1354bb60abca42d5/1.wav -> 07a785e9237c389c1354bb60abca42d5_1
                recording_id = parts[0].replace("/", "_").replace(".wav", "")
                text = parts[2]
                if normalize_text:
                    text = normalize(text)
                spkid = recording_id.split("_")[0]
                segment = SupervisionSegment(
                    id=recording_id,
                    recording_id=recording_id,
                    text=text,
                    speaker=spkid,
                    start=0,
                    duration=durations[recording_id],
                    language="English",
                )
                sup_writer.write(segment)

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
        }

    return manifests
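
For context, a minimal invocation sketch; the paths below are placeholders and the import path is assumed, not taken from the original:

from lhotse.recipes import prepare_spgispeech  # assumed import path

manifests = prepare_spgispeech(
    corpus_dir="/data/spgispeech",  # placeholder path
    output_dir="manifests",         # required: manifests are written here during processing
    num_jobs=4,
)
# The returned manifests are lazy, so they can be iterated without
# loading the full (large) dataset into memory.
for recording in manifests["train"]["recordings"]:
    print(recording.id, recording.duration)
    break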
Example No. 2
def test_parallel_map_two_iterables():
    squares = list(map(mul, range(100), range(100)))
    squares_parallel = list(
        parallel_map(mul, range(100), range(100), num_jobs=2))
    assert squares == squares_parallel
Example No. 3
def test_parallel_map_num_jobs(num_jobs):
    squares = list(map(pow2, range(100)))
    squares_parallel = list(parallel_map(pow2, range(100), num_jobs=num_jobs))
    assert squares == squares_parallel
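
Example No. 3 takes num_jobs as a test argument; in pytest this normally comes from a parametrize decorator that the snippet omits. A minimal sketch, assuming typical job counts:

import pytest

# The values below are an assumption; the original decorator is not shown.
@pytest.mark.parametrize("num_jobs", [1, 2, 4])
def test_parallel_map_num_jobs(num_jobs):
    ...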
Example No. 4
def test_parallel_map_threads():
    squares = list(map(pow2, range(100)))
    squares_parallel = list(
        parallel_map(pow2, range(100), num_jobs=2, threads=True))
    assert squares == squares_parallel
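
These tests rely on two helpers that the snippets do not define. mul is most likely operator.mul, and pow2 is presumably a small module-level function; it has to live at module level so it can be pickled when parallel_map runs with process-based workers. A minimal sketch under those assumptions:

from operator import mul  # mul(a, b) == a * b

def pow2(x: int) -> int:
    # Assumed helper: defined at module level so it is picklable
    # for process-based parallel_map workers.
    return x ** 2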
Example No. 5
def prepare_gigaspeech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    if is_module_available("speechcolab"):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab"
        )

    subsets = ("XL", "DEV", "TEST") if dataset_parts == "auto" else dataset_parts
    if isinstance(subsets, str):
        subsets = [subsets]
    corpus_dir = Path(corpus_dir)
    gigaspeech = GigaSpeech(corpus_dir)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe some manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=subsets,
        output_dir=output_dir,
        prefix="gigaspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in subsets:
        logging.info(f"Processing GigaSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="gigaspeech", suffix="jsonl.gz"
        ):
            logging.info(f"GigaSpeech subset: {part} already prepared - skipping.")
            continue

        with RecordingSet.open_writer(
            output_dir / f"gigaspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"gigaspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"gigaspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segments in tqdm(
                parallel_map(
                    parse_utterance,
                    gigaspeech.audios("{" + part + "}"),
                    repeat(gigaspeech.gigaspeech_dataset_dir),
                    num_jobs=num_jobs,
                ),
                desc="Processing GigaSpeech JSON entries",
            ):
                # Fix and validate the recording + supervisions
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments(segments),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )
                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )
                # Write the manifests
                rec_writer.write(recordings[0])
                for s in segments:
                    sup_writer.write(s)
                cut_writer.write(cuts[0])

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)
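
As with the SPGISpeech recipe, a minimal invocation sketch; the paths and subset choice are placeholders and the import path is assumed:

from lhotse.recipes import prepare_gigaspeech  # assumed import path

manifests = prepare_gigaspeech(
    corpus_dir="/data/gigaspeech",  # placeholder path
    output_dir="manifests",
    dataset_parts=["DEV", "TEST"],  # skip the large XL subset for a quick run
    num_jobs=8,
)
# Unlike the SPGISpeech recipe, this one also prepares a CutSet per subset.
cuts = manifests["DEV"]["cuts"]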