Example #1
def prepare_single_commonvoice_tsv(
    lang: str,
    part: str,
    output_dir: Pathlike,
    lang_path: Pathlike,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Prepares part of CommonVoice data from a single TSV file.

    :param lang: string language code (e.g., "en").
    :param part: which split to prepare (e.g., "train", "validated", etc.).
    :param output_dir: path to directory where we will store the manifests.
    :param lang_path: path to a CommonVoice directory for a specific language
        (e.g., "/path/to/cv-corpus-7.0-2021-07-21/pl").
    :return: a tuple of (RecordingSet, SupervisionSet) objects opened in lazy mode,
        as CommonVoice manifests may be fairly large in memory.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first.")
    import pandas as pd

    lang_path = Path(lang_path)
    output_dir = Path(output_dir)
    tsv_path = lang_path / f"{part}.tsv"

    # Read the metadata
    df = pd.read_csv(tsv_path, sep="\t")
    # Scan all the audio files
    with RecordingSet.open_writer(
            output_dir / f"cv_recordings_{lang}_{part}.jsonl.gz",
            overwrite=False,
    ) as recs_writer, SupervisionSet.open_writer(
            output_dir / f"cv_supervisions_{lang}_{part}.jsonl.gz",
            overwrite=False,
    ) as sups_writer:
        for idx, row in tqdm(
                df.iterrows(),
                desc="Processing audio files",
                total=len(df),
        ):
            try:
                result = parse_utterance(row, lang_path, lang)
                if result is None:
                    continue
                recording, segment = result
                validate_recordings_and_supervisions(recording, segment)
                recs_writer.write(recording)
                sups_writer.write(segment)
            except Exception as e:
                logging.error(
                    f"Error when processing TSV file: line no. {idx}: '{row}'.\n"
                    f"Original error type: '{type(e)}' and message: {e}")
                continue
    recordings = RecordingSet.from_jsonl_lazy(recs_writer.path)
    supervisions = SupervisionSet.from_jsonl_lazy(sups_writer.path)
    return recordings, supervisions
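
A minimal usage sketch for the function above; the language code and paths are placeholders, and the CommonVoice directory is assumed to follow the layout described in the docstring:

    recordings, supervisions = prepare_single_commonvoice_tsv(
        lang="pl",
        part="train",
        output_dir="data/manifests",
        lang_path="/path/to/cv-corpus-7.0-2021-07-21/pl",
    )
    # Both manifests are lazy: iterating them streams entries from the JSONL files.
    first = next(iter(supervisions))
    print(first.recording_id, first.text)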
Example #2
def prepare_fisher_english(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    absolute_paths: bool = False,
    num_jobs: int = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that audio_dirs and transcript_dirs are in the corpus_path.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param corpus_path: Path to Fisher corpus
    :param audio_dirs: List of dirs of audio corpora.
    :param transcripts_dirs: List of dirs of transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """

    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for workdir in audio_dirs + transcript_dirs:
        workdir_path = corpus_dir / workdir
        if not workdir_path.is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_dir}'.")

    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        audio_dir_path = corpus_dir / audio_dir
        for audio_partition_dir in audio_dir_path.iterdir():
            audio_partition_dir_path = audio_dir_path / audio_partition_dir / "audio"
            audio_subdir_paths += [
                audio_partition_dir_path / audio_subdir
                for audio_subdir in audio_partition_dir_path.iterdir()
            ]

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_dir_path = corpus_dir / transcript_dir / "data" / "trans"
        transcript_subdir_paths += [
            transcript_dir_path / transcript_subdir
            for transcript_subdir in transcript_dir_path.iterdir()
        ]

    audio_paths = walk_dirs_parallel(audio_subdir_paths, "*.sph",
                                     "Parsing audio sub-dirs")
    transcript_paths = walk_dirs_parallel(transcript_subdir_paths, "*.txt",
                                          "Parsing transcript sub-dirs")

    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_dir / transcript_dir / "doc", "*_calldata.tbl")[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            tmp_sessions = [
                l.rstrip("\n").split(",") for l in sessions_data_f.readlines()
            ][1:]
            sessions.update(
                {l[0]: {
                    "A": l[5],
                    "B": l[10]
                }
                 for l in tmp_sessions})

    assert len(transcript_paths) == len(
        audio_paths), f"{len(transcript_paths)} == {len(audio_paths)}"
    if len(transcript_paths) != len(sessions):
        warnings.warn(
            f"Fisher's *_calldata.tbl files indicate there should be {len(sessions)} sessions, "
            f"but our scanning of audio and transcript files indicates there are only {len(transcript_paths)}."
        )

    recs_path = output_dir / "recordings_notfixed.jsonl.gz"
    if recs_path.is_file():
        logging.info(f"Using existing recording manifest at {recs_path}")
        recordings = RecordingSet.from_jsonl_lazy(recs_path)
    else:
        logging.info(f"Building fresh recording manifest")
        create_recordings_input = [(p, None if absolute_paths else 5)
                                   for p in audio_paths]
        err_recos = 0
        with ProcessPoolExecutor(
                num_jobs) as executor, RecordingSet.open_writer(
                    recs_path) as writer:
            with tqdm(total=len(create_recordings_input),
                      desc="Collect recordings") as pbar:
                for reco in executor.map(create_recording,
                                         create_recordings_input):
                    if reco is not None:
                        writer.write(reco, flush=True)
                    else:
                        err_recos += 1
                    pbar.update()
        if err_recos:
            warnings.warn(f"Out of {len(create_recordings_input)} recordings, "
                          f"{err_recos} had errors and were omitted.")
        recordings = writer.open_manifest()

    sups_path = output_dir / "supervisions_notfixed.jsonl.gz"
    if sups_path.is_file():
        logging.info(f"Using existing supervision manifest at {recs_path}")
        supervisions = SupervisionSet.from_jsonl_lazy(sups_path)
    else:
        logging.info(f"Building fresh supervision manifest")
        create_supervisions_input = [(sessions, p) for p in transcript_paths]
        err_sups = 0
        with ThreadPoolExecutor(os.cpu_count() *
                                4) as executor, SupervisionSet.open_writer(
                                    sups_path) as writer:
            with tqdm(total=len(create_supervisions_input),
                      desc="Create supervisions") as pbar:
                for tmp_supervisions in executor.map(
                        create_supervision, create_supervisions_input):
                    if not tmp_supervisions:
                        err_sups += 1
                    for s in tmp_supervisions:
                        writer.write(s)
                    pbar.update()
        supervisions = writer.open_manifest()
        if err_sups:
            warnings.warn(
                f"Out of {len(create_supervisions_input)} transcript files, "
                f"{err_sups} had errors and were omitted.")

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Write the fixed and validated version to files with standard names.
    recordings.to_file(recs_path.parent / "recordings.jsonl.gz")
    supervisions.to_file(sups_path.parent / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Example #3
def prepare_peoples_speech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare :class:`~lhotse.RecordingSet` and :class:`~lhotse.SupervisionSet` manifests
    for The People's Speech.

    The metadata is read lazily and written to manifests in a stream to minimize
    the CPU RAM usage. If you want to convert this data to a :class:`~lhotse.CutSet`
    without using excessive memory, we suggest to call it like::

        >>> peoples_speech = prepare_peoples_speech(corpus_dir=..., output_dir=...)
        >>> cuts = CutSet.from_manifests(
        ...     recordings=peoples_speech["recordings"],
        ...     supervisions=peoples_speech["supervisions"],
        ...     output_path=...,
        ...     lazy=True,
        ... )

    :param corpus_dir: Pathlike, the path of the main data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "recordings" and "supervisions" with lazily opened manifests.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    recs_path = output_dir / "peoples-speech_recordings_all.jsonl.gz"
    sups_path = output_dir / "peoples-speech_supervisions_all.jsonl.gz"

    if recs_path.is_file() and sups_path.is_file():
        # Nothing to do: just open the manifests in lazy mode.
        return {
            "recordings": RecordingSet.from_jsonl_lazy(recs_path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sups_path),
        }

    exist = 0
    tot = 0
    err = 0
    with RecordingSet.open_writer(
            recs_path, ) as rec_writer, SupervisionSet.open_writer(
                sups_path, ) as sup_writer:
        for item in tqdm(
                # Note: People's Speech manifest.json is really a JSONL.
                load_jsonl(corpus_dir / "manifest.json"),
                desc=
                "Converting People's Speech manifest.json to Lhotse manifests",
        ):
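            # "training_data" is assumed to hold parallel lists of durations (in ms),
            # transcripts, and relative audio paths, which we zip back together below.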
            for duration_ms, text, audio_path in zip(
                    *item["training_data"].values()):
                full_path = corpus_dir / audio_path

                tot += 1
                if not full_path.exists():
                    # If we can't find some data, we'll just continue; those items
                    # will simply be missing from the manifests.
                    continue
                exist += 1

                try:
                    audio_info = info(full_path)
                    duration = duration_ms / 1000
                    r = Recording(
                        id=full_path.stem,
                        sampling_rate=audio_info.samplerate,
                        num_samples=compute_num_samples(
                            duration, audio_info.samplerate),
                        duration=duration,
                        sources=[
                            AudioSource(
                                type="file",
                                channels=[0],
                                source=str(full_path),
                            )
                        ],
                    )
                    s = SupervisionSegment(
                        id=r.id,
                        recording_id=r.id,
                        start=0,
                        duration=r.duration,
                        channel=0,
                        text=text,
                        language="English",
                        custom={"session_id": item["identifier"]},
                    )

                    validate_recordings_and_supervisions(recordings=r,
                                                         supervisions=s)

                    rec_writer.write(r)
                    sup_writer.write(s)

                except Exception as e:
                    # If some files can't be processed (e.g. somebody is working with
                    # a subset of the full 30,000 hours), we won't interrupt processing;
                    # we only re-raise violated assertions.
                    if isinstance(e, AssertionError):
                        raise
                    err += 1
                    continue

    if exist < tot or err > 0:
        warnings.warn(
            f"We finished preparing The People's Speech Lhotse manifests. "
            f"Out of {tot} entries in the original manifest, we found {exist} "
            f"audio files existed, out of which {err} had errors during processing."
        )

    return {
        "recordings": rec_writer.open_manifest(),
        "supervisions": sup_writer.open_manifest(),
    }
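
A quick sanity-check sketch after preparation; the paths are placeholders, and because the returned manifests are lazy, this only streams a few entries from disk:

    from itertools import islice

    manifests = prepare_peoples_speech(
        corpus_dir="/data/peoples-speech",
        output_dir="data/manifests/peoples-speech",
    )
    for sup in islice(manifests["supervisions"], 3):
        print(sup.recording_id, sup.duration, sup.text)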
Example #4
def prepare_spgispeech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    normalize_text: bool = True,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text (similar to ESPNet recipe).
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case to lower case.
        This includes removing possibly important punctuations such as dashes and apostrophes.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    audio_dir = (
        corpus_dir if (corpus_dir / "train").is_dir() else corpus_dir / "spgispeech"
    )

    dataset_parts = ["train", "val"]
    manifests = {}

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe the manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        prefix="spgispeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in dataset_parts:
        logging.info(f"Processing SPGISpeech subset: {part}")
        if manifests_exist(part=part,
                           output_dir=output_dir,
                           prefix="spgispeech",
                           suffix="jsonl.gz"):
            logging.info(
                f"SPGISpeech subset: {part} already prepared - skipping.")
            continue

        # Read the recordings and write them to the manifest. We additionally store
        # the durations of the recordings in a dict that will be used later to create
        # the supervisions.
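        # Declared "global" so that the nested function is bound at module level and
        # can be pickled by reference when parallel_map runs it in worker processes.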
        global audio_read_worker
        durations = {}

        def audio_read_worker(p: Path) -> Recording:
            r = Recording.from_file(p,
                                    recording_id=f"{p.parent.stem}_{p.stem}")
            durations[r.id] = r.duration
            return r

        with RecordingSet.open_writer(
                output_dir /
                f"spgispeech_recordings_{part}.jsonl.gz") as rec_writer:
            for recording in tqdm(
                    parallel_map(
                        audio_read_worker,
                        (audio_dir / part).rglob("*.wav"),
                        num_jobs=num_jobs,
                    ),
                    desc="Processing SPGISpeech recordings",
            ):
                rec_writer.write(recording)

        # Read supervisions and write them to manifest
        with SupervisionSet.open_writer(
                output_dir / f"spgispeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, open(corpus_dir / f"{part}.csv", "r") as f:
            # Skip the header
            next(f)
            for line in tqdm(f, desc="Processing utterances"):
                parts = line.strip().split("|")
                # 07a785e9237c389c1354bb60abca42d5/1.wav -> 07a785e9237c389c1354bb60abca42d5_1
                recording_id = parts[0].replace("/", "_").replace(".wav", "")
                text = parts[2]
                if normalize_text:
                    text = normalize(text)
                spkid = recording_id.split("_")[0]
                segment = SupervisionSegment(
                    id=recording_id,
                    recording_id=recording_id,
                    text=text,
                    speaker=spkid,
                    start=0,
                    duration=durations[recording_id],
                    language="English",
                )
                sup_writer.write(segment)

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
        }

    return manifests
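
A possible invocation; the paths are placeholders, and the SPGISpeech archives are assumed to be extracted already:

    manifests = prepare_spgispeech(
        corpus_dir="/data/corpora/spgispeech",
        output_dir="data/manifests/spgispeech",
        normalize_text=True,
        num_jobs=8,
    )
    train_recordings = manifests["train"]["recordings"]
    train_supervisions = manifests["train"]["supervisions"]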
Example #5
def prepare_gigaspeech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet, CutSet]]]:
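    """
    Prepare manifests for the GigaSpeech corpus.

    Requires the optional ``speechcolab`` package to read the corpus metadata.
    The manifests are written to ``output_dir`` while processing, which keeps the
    memory usage low even for the large "XL" subset.

    :param corpus_dir: Path to the downloaded GigaSpeech corpus.
    :param output_dir: Path where the manifests are written.
    :param dataset_parts: Which subsets to prepare (e.g., "XL", "DEV", "TEST"),
        or "auto" to prepare all three.
    :param num_jobs: Number of parallel jobs used to parse the corpus entries.
    :return: a dict of ``{part: {"recordings": ..., "supervisions": ..., "cuts": ...}}``
        with lazily opened manifests.
    """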
    if is_module_available("speechcolab"):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab"
        )

    subsets = ("XL", "DEV", "TEST") if dataset_parts == "auto" else dataset_parts
    if isinstance(subsets, str):
        subsets = [subsets]
    corpus_dir = Path(corpus_dir)
    gigaspeech = GigaSpeech(corpus_dir)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe some manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=subsets,
        output_dir=output_dir,
        prefix="gigaspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in subsets:
        logging.info(f"Processing GigaSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="gigaspeech", suffix="jsonl.gz"
        ):
            logging.info(f"GigaSpeech subset: {part} already prepared - skipping.")
            continue

        with RecordingSet.open_writer(
            output_dir / f"gigaspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"gigaspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"gigaspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segments in tqdm(
                parallel_map(
                    parse_utterance,
                    gigaspeech.audios("{" + part + "}"),
                    repeat(gigaspeech.gigaspeech_dataset_dir),
                    num_jobs=num_jobs,
                ),
                desc="Processing GigaSpeech JSON entries",
            ):
                # Fix and validate the recording + supervisions
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments(segments),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )
                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )
                # Write the manifests
                rec_writer.write(recordings[0])
                for s in segments:
                    sup_writer.write(s)
                cut_writer.write(cuts[0])

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)
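
A usage sketch; the corpus path is a placeholder, and the optional speechcolab dependency plus a downloaded GigaSpeech corpus are assumed to be available:

    manifests = prepare_gigaspeech(
        corpus_dir="/data/corpora/gigaspeech",
        output_dir="data/manifests/gigaspeech",
        dataset_parts=["DEV", "TEST"],
        num_jobs=4,
    )
    dev_cuts = manifests["DEV"]["cuts"]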