Example #1
def prepare_callhome_english(
        audio_dir: Pathlike,
        rttm_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English corpus (``LDC2001S97``).
    We create two manifests: one with recordings, and the other one with speaker
    supervisions read from the RTTM file.

    :param audio_dir: Path to the ``LDC2001S97`` package.
    :param rttm_dir: Path to the directory containing the ``fullref.rttm`` file.
        If not provided, the RTTM metadata will be downloaded.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    rttm_path = rttm_dir / 'fullref.rttm'
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, '*.sph')
    recordings = RecordingSet.from_recordings(
        make_recording_callhome(p, sph2pipe_path=sph2pipe_path) for p in tqdm(audio_paths)
    )

    recordings, supervisions = remove_missing_recordings_and_supervisions(recordings, supervisions)
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {
        'recordings': recordings,
        'supervisions': supervisions
    }
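A minimal usage sketch for the function above; the corpus path and output directory are hypothetical and assume a locally unpacked LDC2001S97 package.

manifests = prepare_callhome_english(
    audio_dir="/data/corpora/LDC2001S97",
    output_dir="manifests/callhome-english",
)
# The RTTM metadata is downloaded automatically since rttm_dir is not given.
print(manifests["recordings"])
print(manifests["supervisions"])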
Example #2
def prepare_callhome_english_sre(
    audio_dir: Pathlike,
    rttm_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English corpus.
    We create two manifests: one with recordings, and the other one with
    speaker supervisions read from the RTTM file.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the directory containing the ``fullref.rttm`` file.
        If not provided, the RTTM metadata will be downloaded.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests.
        The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    rttm_path = rttm_dir / "fullref.rttm"
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, "*.sph")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p,
                            relative_path_depth=None if absolute_paths else 4)
        for p in tqdm(audio_paths))

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")
    return {"recordings": recordings, "supervisions": supervisions}
Example #3
def prepare_voxceleb(
    voxceleb1_root: Optional[Pathlike] = None,
    voxceleb2_root: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb v1 and v2 corpora.

    The manifests are created in a dict with up to two splits: "train" and "test"
    (combining both versions; see the NOTE below).
    Each split contains a RecordingSet and a SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param voxceleb1_root: Path to the VoxCeleb v1 dataset.
    :param voxceleb2_root: Path to the VoxCeleb v2 dataset.
    :param output_dir: Path to the output directory.
    :param num_jobs: Number of parallel jobs to run.
    :return: A dict with standard corpus splits ("train" and "test") containing the manifests.

    NOTE: We prepare the data using the Kaldi style split, i.e., the whole VoxCeleb2
    ("dev" and "test") and the training portion ("dev") of VoxCeleb1 are put into the
    "train" split. The "test" split contains the "test" portion of VoxCeleb1. So if
    VoxCeleb1 is not provided, no "test" split is created in the output manifests.

    Example usage:

    .. code-block:: python

        >>> from lhotse.recipes.voxceleb import prepare_voxceleb
        >>> manifests = prepare_voxceleb(voxceleb1_root='/path/to/voxceleb1',
        ...                              voxceleb2_root='/path/to/voxceleb2',
        ...                              output_dir='/path/to/output',
        ...                              num_jobs=4)

    NOTE: If VoxCeleb1 is provided, we also prepare the trials file using the list provided
    in http://www.openslr.org/resources/49/voxceleb1_test_v2.txt. This file is used in the
    Kaldi recipes for VoxCeleb speaker verification. The trials are prepared as two tuples
    of the form (CutSet, CutSet) with matching ids, one tuple for positive pairs and one
    for negative pairs. They are stored in the dict under the keys 'pos_trials' and
    'neg_trials', respectively. For evaluation purposes, the
    :class:`lhotse.dataset.sampling.CutPairsSampler` can be used to sample from these tuples.
    """
    voxceleb1_root = Path(voxceleb1_root) if voxceleb1_root else None
    voxceleb2_root = Path(voxceleb2_root) if voxceleb2_root else None
    if not (voxceleb1_root or voxceleb2_root):
        raise ValueError("Either VoxCeleb1 or VoxCeleb2 path must be provided.")

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    if voxceleb1_root:
        logging.info("Preparing VoxCeleb1...")
        manifests.update(_prepare_voxceleb_v1(voxceleb1_root, num_jobs))
        manifests.update(_prepare_voxceleb_trials(manifests["test"]))
    else:
        logging.info(
            "VoxCeleb1 not provided, no test split or trials file will be created..."
        )
    if voxceleb2_root:
        logging.info("Preparing VoxCeleb2...")
        v2_manifests = _prepare_voxceleb_v2(voxceleb2_root, num_jobs)
        if "train" in manifests:
            manifests["train"]["recordings"] = combine(
                manifests["train"]["recordings"], v2_manifests["recordings"]
            )
            manifests["train"]["supervisions"] = combine(
                manifests["train"]["supervisions"], v2_manifests["supervisions"]
            )
        else:
            manifests["train"] = v2_manifests

    for split in ("train", "test"):
        recordings = manifests[split]["recordings"]
        supervisions = manifests[split]["supervisions"]
        validate_recordings_and_supervisions(recordings, supervisions)
        if output_dir is not None:
            recordings.to_file(output_dir / f"recordings_voxceleb_{split}.jsonl.gz")
            supervisions.to_file(output_dir / f"supervisions_voxceleb_{split}.jsonl.gz")

    # Write the trials cut sets to the output directory
    if output_dir is not None:
        if "pos_trials" in manifests:
            for i, cuts in enumerate(manifests["pos_trials"]):
                cuts.to_file(output_dir / f"pos_trials_voxceleb_utt{i+1}.jsonl.gz")
        if "neg_trials" in manifests:
            for i, cuts in enumerate(manifests["neg_trials"]):
                cuts.to_file(output_dir / f"neg_trials_voxceleb_utt{i+1}.jsonl.gz")

    return manifests
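A usage sketch under hypothetical paths: with only VoxCeleb2 provided, only the "train" split is produced, and it can be turned into a CutSet with the standard CutSet.from_manifests entry point.

from lhotse import CutSet

manifests = prepare_voxceleb(
    voxceleb2_root="/data/corpora/voxceleb2",
    output_dir="manifests/voxceleb",
    num_jobs=4,
)
train_cuts = CutSet.from_manifests(
    recordings=manifests["train"]["recordings"],
    supervisions=manifests["train"]["supervisions"],
)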
Example #4
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the CallHome Egyptian Arabic corpus.
    We create two manifests per split: one with recordings, and the other one
    with text supervisions.

    :param audio_dir: Path to the ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content.
    :param output_dir: Directory where the manifests should be written. Can be omitted
        to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir)
        paths for recordings.
    :return: A dict with one ``{'recordings', 'supervisions'}`` pair per split.
        The keys are: ``{'train', 'devtest', 'evaltest'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["train", "devtest", "evaltest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "callhome/arabic" /
            split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{idx}",
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
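A usage sketch (paths hypothetical): the function returns one {"recordings", "supervisions"} pair per split, so downstream code typically iterates over the splits.

manifests = prepare_callhome_egyptian(
    audio_dir="/data/corpora/LDC97S45",
    transcript_dir="/data/corpora/LDC97T19",
    output_dir="manifests/callhome-egyptian",
)
for split, data in manifests.items():
    # Prints the split name and the manifest sizes, e.g. "train 80 24000".
    print(split, len(data["recordings"]), len(data["supervisions"]))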
Example #5
def prepare_fisher_english(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    absolute_paths: bool = False,
    num_jobs: int = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that audio_dirs and transcript_dirs are in the corpus_path.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param corpus_dir: Path to the Fisher corpus.
    :param audio_dirs: List of dirs of audio corpora.
    :param transcript_dirs: List of dirs of transcript corpora.
    :param output_dir: Directory where the manifests will be written. It is also used
        to cache the intermediate (not fixed) manifests between runs.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :param num_jobs: Number of parallel jobs used to scan the audio files.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """

    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for workdir in audio_dirs + transcript_dirs:
        workdir_path = corpus_dir / workdir
        if not workdir_path.is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_dir}'.")

    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        audio_dir_path = corpus_dir / audio_dir
        for audio_partition_dir in audio_dir_path.iterdir():
            audio_partition_dir_path = audio_dir_path / audio_partition_dir / "audio"
            audio_subdir_paths += [
                audio_partition_dir_path / audio_subdir
                for audio_subdir in audio_partition_dir_path.iterdir()
            ]

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_dir_path = corpus_dir / transcript_dir / "data" / "trans"
        transcript_subdir_paths += [
            transcript_dir_path / transcript_subdir
            for transcript_subdir in transcript_dir_path.iterdir()
        ]

    audio_paths = walk_dirs_parallel(audio_subdir_paths, "*.sph",
                                     "Parsing audio sub-dirs")
    transcript_paths = walk_dirs_parallel(transcript_subdir_paths, "*.txt",
                                          "Parsing transcript sub-dirs")

    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_dir / transcript_dir / "doc", "*_calldata.tbl")[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            tmp_sessions = [
                l.rstrip("\n").split(",") for l in sessions_data_f.readlines()
            ][1:]
            sessions.update(
                {l[0]: {
                    "A": l[5],
                    "B": l[10]
                }
                 for l in tmp_sessions})

    assert len(transcript_paths) == len(
        audio_paths), f"{len(transcript_paths)} != {len(audio_paths)}"
    if len(transcript_paths) != len(sessions):
        warnings.warn(
            f"Fisher's *_calldata.tbl files indicate there should be {len(sessions)} sessions, "
            f"but our scanning of audio and transcript files indicates there are only {len(transcript_paths)}."
        )

    recs_path = output_dir / "recordings_notfixed.jsonl.gz"
    if recs_path.is_file():
        logging.info(f"Using existing recording manifest at {recs_path}")
        recordings = RecordingSet.from_jsonl_lazy(recs_path)
    else:
        logging.info(f"Building fresh recording manifest")
        create_recordings_input = [(p, None if absolute_paths else 5)
                                   for p in audio_paths]
        err_recos = 0
        with ProcessPoolExecutor(
                num_jobs) as executor, RecordingSet.open_writer(
                    recs_path) as writer:
            with tqdm(total=len(create_recordings_input),
                      desc="Collect recordings") as pbar:
                for reco in executor.map(create_recording,
                                         create_recordings_input):
                    if reco is not None:
                        writer.write(reco, flush=True)
                    else:
                        err_recos += 1
                    pbar.update()
        if err_recos:
            warnings.warn(f"Out of {len(create_recordings_input)} recordings, "
                          f"{err_recos} had errors and were omitted.")
        recordings = writer.open_manifest()

    sups_path = output_dir / "supervisions_notfixed.jsonl.gz"
    if sups_path.is_file():
        logging.info(f"Using existing supervision manifest at {recs_path}")
        supervisions = SupervisionSet.from_jsonl_lazy(sups_path)
    else:
        logging.info(f"Building fresh supervision manifest")
        create_supervisions_input = [(sessions, p) for p in transcript_paths]
        err_sups = 0
        with ThreadPoolExecutor(os.cpu_count() *
                                4) as executor, SupervisionSet.open_writer(
                                    sups_path) as writer:
            with tqdm(total=len(create_supervisions_input),
                      desc="Create supervisions") as pbar:
                for tmp_supervisions in executor.map(
                        create_supervision, create_supervisions_input):
                    if not tmp_supervisions:
                        err_sups += 1
                    for s in tmp_supervisions:
                        writer.write(s)
                    pbar.update()
        supervisions = writer.open_manifest()
        if err_sups:
            warnings.warn(
                f"Out of {len(create_supervisions_input)} transcript files, "
                f"{err_sups} had errors and were omitted.")

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Write the fixed and validated version to files with standard names.
    recordings.to_file(recs_path.parent / "recordings.jsonl.gz")
    supervisions.to_file(sups_path.parent / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Example #6
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["evaltest", "train", "devtest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "data" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            postprocessed_lines = list()
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                if line.startswith("#"):
                    continue
                try:
                    start, end, spk, text = line.split(maxsplit=3)
                    duration = float(Decimal(end) - Decimal(start))
                    if duration <= 0:
                        continue
                    postprocessed_lines.append(line)
                except (InvalidOperation, ValueError):
                    # A parse failure means this is a continuation of the
                    # previous (wrapped) transcript line, so merge it back.
                    postprocessed_lines[-1] = postprocessed_lines[-1] + " " + line

            for line in postprocessed_lines:
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        channel=ord(spk[0]) - ord("A"),
                        speaker=f"{recording_id}_{spk:0>2s}",
                        id=f"{recording_id}_{spk:0>2s}_{idx:0>5d}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(output_dir /
                               f"callhome-english_recordings_{split}.jsonl.gz")
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
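A self-contained sketch of the line-merging step in the loop above: transcript lines that do not parse as a "start end speaker: text" record are treated as continuations of the previous (wrapped) line. The sample lines are made up for illustration.

from decimal import Decimal, InvalidOperation

raw_lines = [
    "19.33 21.18 B: %ah Tayyib",
    "21.90 25.02 A: so what did you",
    "do about it",  # a wrapped continuation of the previous line
]
merged = []
for line in raw_lines:
    try:
        start, end, spk, text = line.split(maxsplit=3)
        Decimal(end) - Decimal(start)  # raises InvalidOperation if not numeric
        merged.append(line)
    except (InvalidOperation, ValueError):
        merged[-1] += " " + line
print(merged)
# ['19.33 21.18 B: %ah Tayyib', '21.90 25.02 A: so what did you do about it']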
Example #7
def prepare_peoples_speech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare :class:`~lhotse.RecordingSet` and :class:`~lhotse.SupervisionSet` manifests
    for The People's Speech.

    The metadata is read lazily and written to manifests in a stream to minimize
    the CPU RAM usage. If you want to convert this data to a :class:`~lhotse.CutSet`
    without using excessive memory, we suggest calling it like::

        >>> peoples_speech = prepare_peoples_speech(corpus_dir=..., output_dir=...)
        >>> cuts = CutSet.from_manifests(
        ...     recordings=peoples_speech["recordings"],
        ...     supervisions=peoples_speech["supervisions"],
        ...     output_path=...,
        ...     lazy=True,
        ... )

    :param corpus_dir: Pathlike, the path of the main data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "recordings" and "supervisions" with lazily opened manifests.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    recs_path = output_dir / "peoples-speech_recordings_all.jsonl.gz"
    sups_path = output_dir / "peoples-speech_supervisions_all.jsonl.gz"

    if recs_path.is_file() and sups_path.is_file():
        # Nothing to do: just open the manifests in lazy mode.
        return {
            "recordings": RecordingSet.from_jsonl_lazy(recs_path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sups_path),
        }

    exist = 0
    tot = 0
    err = 0
    with RecordingSet.open_writer(recs_path) as rec_writer, \
         SupervisionSet.open_writer(sups_path) as sup_writer:
        for item in tqdm(
                # Note: People's Speech manifest.json is really a JSONL.
                load_jsonl(corpus_dir / "manifest.json"),
                desc="Converting People's Speech manifest.json to Lhotse manifests",
        ):
            for duration_ms, text, audio_path in zip(
                    *item["training_data"].values()):
                full_path = corpus_dir / audio_path

                tot += 1
                if not full_path.exists():
                    # If we can't find some data, we just skip it; the summary
                    # warning at the end reports how many items were missing.
                    continue
                exist += 1

                try:
                    audio_info = info(full_path)
                    duration = duration_ms / 1000
                    r = Recording(
                        id=full_path.stem,
                        sampling_rate=audio_info.samplerate,
                        num_samples=compute_num_samples(
                            duration, audio_info.samplerate),
                        duration=duration,
                        sources=[
                            AudioSource(
                                type="file",
                                channels=[0],
                                source=str(full_path),
                            )
                        ],
                    )
                    s = SupervisionSegment(
                        id=r.id,
                        recording_id=r.id,
                        start=0,
                        duration=r.duration,
                        channel=0,
                        text=text,
                        language="English",
                        custom={"session_id": item["identifier"]},
                    )

                    validate_recordings_and_supervisions(recordings=r,
                                                         supervisions=s)

                    rec_writer.write(r)
                    sup_writer.write(s)

                except Exception as e:
                    # If some files fail to process (e.g. somebody is working
                    # with a subset of the full 30,000 hours), we won't
                    # interrupt the processing; we only do so for violated
                    # assertions.
                    if isinstance(e, AssertionError):
                        raise
                    err += 1
                    continue

    if exist < tot or err > 0:
        warnings.warn(
            f"We finished preparing The People's Speech Lhotse manifests. "
            f"Out of {tot} entries in the original manifest, {exist} audio files "
            f"were found, out of which {err} had errors during processing."
        )

    return {
        "recordings": rec_writer.open_manifest(),
        "supervisions": sup_writer.open_manifest(),
    }
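A minimal sketch of the streaming-writer pattern used above: manifests are written one item at a time, so the full set never has to sit in memory. The recording below is synthetic and for illustration only.

from lhotse import Recording, RecordingSet
from lhotse.audio import AudioSource

rec = Recording(
    id="demo",
    sampling_rate=16000,
    num_samples=16000,
    duration=1.0,
    sources=[AudioSource(type="file", channels=[0], source="demo.wav")],
)
with RecordingSet.open_writer("recordings_demo.jsonl.gz") as writer:
    writer.write(rec)  # each item is serialized to JSONL immediately
recordings = writer.open_manifest()  # lazily reopen what was written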
Example #8
def prepare_fisher_spanish(
    audio_dir_path: Pathlike,
    transcript_dir_path: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher Spanish.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param audio_dir_path: Path to audio directory (usually LDC2010S01).
    :param transcript_dir_path: Path to transcript directory (usually LDC2010T04).
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """

    audio_dir_path, transcript_dir_path = Path(audio_dir_path), Path(
        transcript_dir_path
    )

    audio_paths = check_and_rglob(audio_dir_path, "*.sph")
    transcript_paths = check_and_rglob(transcript_dir_path, "*.tdf")

    sessions_data_path = check_and_rglob(transcript_dir_path, "*_call.tbl")[0]
    with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
        session_lines = [
            l.rstrip("\n").split(",") for l in sessions_data_f.readlines()
        ][1:]
        sessions = {l[0]: {0: l[2], 1: l[8]} for l in session_lines}

    assert len(transcript_paths) == len(sessions) == len(audio_paths)

    create_recordings_input = [(p, None if absolute_paths else 4) for p in audio_paths]
    recordings = [None] * len(audio_paths)
    with ThreadPoolExecutor(os.cpu_count() * 4) as executor:
        with tqdm(total=len(audio_paths), desc="Collect recordings") as pbar:
            for i, reco in enumerate(
                executor.map(create_recording, create_recordings_input)
            ):
                recordings[i] = reco
                pbar.update()
    recordings = RecordingSet.from_recordings(recordings)

    create_supervisions_input = [(sessions, p) for p in transcript_paths]
    supervisions = [None] * len(create_supervisions_input)
    with ThreadPoolExecutor(os.cpu_count() * 4) as executor:
        with tqdm(
            total=len(create_supervisions_input), desc="Create supervisions"
        ) as pbar:
            for i, tmp_supervisions in enumerate(
                executor.map(create_supervision, create_supervisions_input)
            ):
                supervisions[i] = tmp_supervisions
                pbar.update()
    supervisions = list(it.chain.from_iterable(supervisions))
    supervisions = SupervisionSet.from_segments(supervisions).filter(
        lambda s: s.duration > 0.0
    )

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example #9
def prepare_fisher_english(
    corpus_path: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that audio_dirs and transcript_dirs are in the corpus_path.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param corpus_path: Path to the Fisher corpus.
    :param audio_dirs: List of dirs of audio corpora.
    :param transcript_dirs: List of dirs of transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """

    corpus_path = Path(corpus_path)

    for workdir in audio_dirs + transcript_dirs:
        workdir_path = corpus_path / workdir
        if not workdir_path.is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_path}'."
            )

    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        audio_dir_path = corpus_path / audio_dir
        for audio_partition_dir in audio_dir_path.iterdir():
            audio_partition_dir_path = audio_dir_path / audio_partition_dir / "audio"
            audio_subdir_paths += [
                audio_partition_dir_path / audio_subdir
                for audio_subdir in audio_partition_dir_path.iterdir()
            ]

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_dir_path = corpus_path / transcript_dir / "data" / "trans"
        transcript_subdir_paths += [
            transcript_dir_path / transcript_subdir
            for transcript_subdir in transcript_dir_path.iterdir()
        ]

    audio_paths = walk_dirs_parallel(audio_subdir_paths, "*.sph",
                                     "Parsing audio sub-dirs")
    transcript_paths = walk_dirs_parallel(transcript_subdir_paths, "*.txt",
                                          "Parsing transcript sub-dirs")

    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_path / transcript_dir / "doc", "*_calldata.tbl")[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            tmp_sessions = [
                l.rstrip("\n").split(",") for l in sessions_data_f.readlines()
            ][1:]
            sessions.update(
                {l[0]: {
                    "A": l[5],
                    "B": l[10]
                }
                 for l in tmp_sessions})

    assert len(transcript_paths) == len(sessions) == len(audio_paths)

    create_recordings_input = [(p, None if absolute_paths else 5)
                               for p in audio_paths]
    recordings = [None] * len(audio_paths)
    with ThreadPoolExecutor(os.cpu_count() * 4) as executor:
        with tqdm(total=len(create_recordings_input),
                  desc="Collect recordings") as pbar:
            for i, reco in enumerate(
                    executor.map(create_recording, create_recordings_input)):
                recordings[i] = reco
                pbar.update()

    recordings = RecordingSet.from_recordings(recordings)

    create_supervisions_input = [(sessions, p) for p in transcript_paths]
    supervisions = [None] * len(create_supervisions_input)
    with ThreadPoolExecutor(os.cpu_count() * 4) as executor:
        with tqdm(total=len(create_supervisions_input),
                  desc="Create supervisions") as pbar:
            for i, tmp_supervisions in enumerate(
                    executor.map(create_supervision,
                                 create_supervisions_input)):
                supervisions[i] = tmp_supervisions
                pbar.update()
    supervisions = list(it.chain.from_iterable(supervisions))
    supervisions = SupervisionSet.from_segments(supervisions)

    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "recordings.jsonl.gz")
        supervisions.to_file(output_dir / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Example #10
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the CallHome Egyptian Arabic corpus.
    We create two manifests per split: one with recordings, and the other one with text supervisions.

    :param audio_dir: Path to the ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict with one ``{'recordings', 'supervisions'}`` pair per split.
        The keys are: ``{'train', 'devtest', 'evaltest'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ['train', 'devtest', 'evaltest']:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / 'callhome/arabic' /
            split.replace('evaltest', 'evltest'),
            '*.sph')
        recordings = RecordingSet.from_recordings(
            make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f'callhome_arabic_trans_970711/transcrp/{split}/roman', '*.txt')

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(id=f'{recording_id}_{idx}',
                                       recording_id=recording_id,
                                       start=start,
                                       duration=duration,
                                       speaker=f'{recording_id}_{spk}',
                                       text=text))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = remove_missing_recordings_and_supervisions(
            recordings, supervisions)
        supervisions = trim_supervisions_to_recordings(recordings,
                                                       supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f'recordings_{split}.json')
            supervisions.to_json(output_dir / f'supervisions_{split}.json')

        manifests[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

    return manifests