Example #1
def parse_utterance(row: Any, lang_path: Path,
                    language: str) -> Tuple[Recording, SupervisionSegment]:
    # Create the Recording first
    audio_path = lang_path / "clips" / row.path
    if not audio_path.is_file():
        raise ValueError(f"No such file: {audio_path}")
    recording_id = Path(row.path).stem
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    # Then, create the corresponding supervisions
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        # Look up language code => language name mapping (it is empty at the time of writing this comment)
        # if the language code is unknown, fall back to using the language code.
        language=COMMONVOICE_CODE2LANG.get(language, language),
        speaker=row.client_id,
        text=row.sentence.strip(),
        gender=row.gender if row.gender != "nan" else None,
        custom={
            "age": row.age if row.age != "nan" else None,
            "accent": row.accent if row.accent != "nan" else None,
        },
    )
    return recording, segment
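A minimal driver sketch for the parser above, assuming parse_utterance is in scope; the TSV filename, the pandas-based loading, and the manifest assembly are illustrative assumptions, not part of the original example.

from pathlib import Path

import pandas as pd

from lhotse import RecordingSet, SupervisionSet


def parse_commonvoice_split(lang_path: Path, language: str, split: str = "validated"):
    # Each TSV row is expected to expose the columns used by parse_utterance:
    # path, client_id, sentence, gender, age, accent.
    df = pd.read_csv(lang_path / f"{split}.tsv", sep="\t")
    recordings, supervisions = [], []
    for row in df.itertuples():
        recording, segment = parse_utterance(row, lang_path, language)
        recordings.append(recording)
        supervisions.append(segment)
    return (
        RecordingSet.from_recordings(recordings),
        SupervisionSet.from_segments(supervisions),
    )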
Example #2
def parse_utterance(
    dataset_split_path: Path,
    line: str,
    alignments: Dict[str, List[AlignmentItem]],
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    recording_id, text = line.strip().split(maxsplit=1)
    # Create the Recording first
    audio_path = (dataset_split_path /
                  Path(recording_id.replace("-", "/")).parent /
                  f"{recording_id}.flac")
    if not audio_path.is_file():
        logging.warning(f"No such file: {audio_path}")
        return None
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    # Then, create the corresponding supervisions
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        speaker=re.sub(r"-.*", r"", recording.id),
        text=text.strip(),
        alignment={"word": alignments[recording_id]}
        if recording_id in alignments else None,
    )
    return recording, segment
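A hedged sketch of how this parser could be applied to one LibriSpeech-style transcript file, assuming parse_utterance is in scope; the transcript path handling and the default empty alignments dict are assumptions for illustration.

from pathlib import Path

from lhotse import RecordingSet, SupervisionSet


def parse_trans_file(dataset_split_path: Path, trans_path: Path, alignments=None):
    alignments = alignments or {}
    recordings, supervisions = [], []
    with open(trans_path) as f:
        for line in f:
            result = parse_utterance(dataset_split_path, line, alignments)
            if result is None:
                # Skip utterances whose audio file is missing.
                continue
            recording, segment = result
            recordings.append(recording)
            supervisions.append(segment)
    return (
        RecordingSet.from_recordings(recordings),
        SupervisionSet.from_segments(supervisions),
    )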
Example #3
def test_recording_from_bytes():
    path = "test/fixtures/mono_c0.wav"
    recording = Recording.from_file(path)
    memory_recording = Recording.from_bytes(
        data=open(path, "rb").read(),
        recording_id=recording.id,
    )
    np.testing.assert_equal(memory_recording.load_audio(),
                            recording.load_audio())
Example #4
def test_opus_stereo_recording_from_file_force_sampling_rate():
    path = "test/fixtures/stereo.opus"
    recording = Recording.from_file(path, force_opus_sampling_rate=8000)
    assert recording.sampling_rate == 8000
    assert isclose(recording.duration, 1.0055)
    samples = recording.load_audio()
    num_channels, num_samples = samples.shape
    assert num_channels == recording.num_channels
    assert num_samples == recording.num_samples
    assert num_samples == 8044
Example #5
def test_save_audio(libri_cut, ext):
    with NamedTemporaryFile(suffix=ext) as f:
        stored_cut = libri_cut.save_audio(f.name)
        samples1 = libri_cut.load_audio()
        rec = Recording.from_file(f.name)
        samples2 = rec.load_audio()
        assert np.array_equal(samples1, samples2)
        assert rec.duration == libri_cut.duration
        assert rec.duration == stored_cut.duration
        assert libri_cut.duration == stored_cut.duration
Example #6
def test_store_audio(libri_cut):
    with NamedTemporaryFile() as f:
        stored_cut = libri_cut.compute_and_store_recording(f.name)
        samples1 = libri_cut.load_audio()
        rec = Recording.from_file(f.name)
        samples2 = rec.load_audio()
        assert np.array_equal(samples1, samples2)
        assert rec.duration == libri_cut.duration
        assert rec.duration == stored_cut.duration
        assert libri_cut.duration == stored_cut.duration
Example #7
def prepare_broadcast_news(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the 1997 English Broadcast News corpus.
    We create three manifests: one with recordings, one with segment supervisions,
    and one with section supervisions. The latter can be used e.g. for topic segmentation.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    audio_paths = check_and_rglob(audio_dir, "*.sph")
    sgml_paths = check_and_rglob(transcripts_dir, "*.sgml")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths
    )

    # BeautifulSoup has quite inefficient tag-to-string rendering that uses a recursive implementation;
    # on some systems the recursion limit needs to be raised for this to work.
    with recursion_limit(5000):
        supervisions_list = [
            make_supervisions(p, r) for p, r in zip(sgml_paths, recordings)
        ]
    section_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups["sections"] for sups in supervisions_list)
    )
    segment_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups["segments"] for sups in supervisions_list)
    )

    validate_recordings_and_supervisions(recordings, segment_supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "broadcast-news_recordings_all.jsonl.gz")
        section_supervisions.to_file(
            output_dir / "broadcast-news_sections_all.jsonl.gz"
        )
        segment_supervisions.to_file(
            output_dir / "broadcast-news_segments_all.jsonl.gz"
        )

    return {
        "recordings": recordings,
        "sections": section_supervisions,
        "segments": segment_supervisions,
    }
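A short usage sketch for the function above, assuming it is in scope; the LDC package locations below are placeholders, not paths taken from the example.

manifests = prepare_broadcast_news(
    audio_dir="/data/corpora/LDC98S71",
    transcripts_dir="/data/corpora/LDC98T28",
    output_dir="data/manifests/broadcast_news",
    absolute_paths=False,
)
recordings = manifests["recordings"]
section_supervisions = manifests["sections"]
segment_supervisions = manifests["segments"]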
Example #8
def create_recording(
    audio_path_and_rel_path_depth: Tuple[Pathlike, Union[int, None]]
) -> Optional[Recording]:
    audio_path, rel_path_depth = audio_path_and_rel_path_depth
    try:
        return Recording.from_file(
            audio_path,
            relative_path_depth=rel_path_depth,
        )
    except CalledProcessError:
        return None
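Because this variant returns None when an external decoding subprocess fails (CalledProcessError), callers need to filter out failed reads. A minimal sketch under that assumption; the pool-based parallelism and the path list are illustrative only.

from multiprocessing import Pool
from pathlib import Path
from typing import Iterable, Optional

from lhotse import RecordingSet


def build_recording_set(
    audio_paths: Iterable[Path],
    rel_path_depth: Optional[int] = None,
    num_jobs: int = 4,
) -> RecordingSet:
    args = [(p, rel_path_depth) for p in audio_paths]
    with Pool(num_jobs) as pool:
        maybe_recordings = pool.map(create_recording, args)
    # Drop entries whose audio could not be decoded.
    return RecordingSet.from_recordings(r for r in maybe_recordings if r is not None)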
Example #9
def test_opus_recording_from_file():
    path = "test/fixtures/mono_c0.opus"
    recording = Recording.from_file(path)
    # OPUS always overrides the sampling rate to 48000
    assert recording.sampling_rate == 48000
    # OPUS may create extra audio frames / samples...
    assert isclose(recording.duration, 0.5054166666666666)
    samples = recording.load_audio()
    num_channels, num_samples = samples.shape
    assert num_channels == recording.num_channels
    assert num_samples == recording.num_samples
    assert num_samples == 24260
Example #10
def prepare_ljspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: The RecordingSet and SupervisionSet with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"
    recordings = []
    supervisions = []
    with open(metadata_csv_path) as f:
        for line in f:
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")

    return {"recordings": recording_set, "supervisions": supervision_set}
Example #11
def test_opus_stereo_recording_from_file_force_sampling_rate_read_chunk():
    path = "test/fixtures/stereo.opus"
    recording = Recording.from_file(path, force_opus_sampling_rate=8000)
    assert recording.sampling_rate == 8000
    assert isclose(recording.duration, 1.0055)
    all_samples = recording.load_audio()
    samples = recording.load_audio(offset=0.5, duration=0.25)
    num_channels, num_samples = samples.shape
    assert num_channels == recording.num_channels
    assert num_samples == 2000
    np.testing.assert_almost_equal(samples,
                                   all_samples[:, 4000:6000],
                                   decimal=5)
Example #12
def test_recording_from_sphere(relative_path_depth, expected_source_path):
    rec = Recording.from_file("test/fixtures/stereo.sph",
                              relative_path_depth=relative_path_depth)
    assert rec == Recording(
        id="stereo",
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
        sources=[
            AudioSource(type="file",
                        channels=[0, 1],
                        source=expected_source_path)
        ],
    )
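The parametrized test above suggests that relative_path_depth controls how many trailing components of the source path are kept in the stored AudioSource. A hedged illustration of that reading; the commented-out results are assumptions, not values taken from the test's parameters.

from lhotse import Recording

# Store the path exactly as given.
rec_full = Recording.from_file("test/fixtures/stereo.sph",
                               relative_path_depth=None)

# Presumably keeps only the last two path components in the manifest,
# e.g. something like "fixtures/stereo.sph".
rec_rel = Recording.from_file("test/fixtures/stereo.sph",
                              relative_path_depth=2)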
Example #13
def parse_utterance(
    dataset_split_path: Path,
    line: str,
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    recording_id, text = line.strip().split(maxsplit=1)
    # Create the Recording first
    audio_path = dataset_split_path / Path(recording_id.replace(
        '-', '/')).parent / f'{recording_id}.flac'
    if not audio_path.is_file():
        logging.warning(f'No such file: {audio_path}')
        return None
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    # Then, create the corresponding supervisions
    segment = SupervisionSegment(id=recording_id,
                                 recording_id=recording_id,
                                 start=0.0,
                                 duration=recording.duration,
                                 channel=0,
                                 language='English',
                                 speaker=re.sub(r'-.*', r'', recording.id),
                                 text=text.strip())
    return recording, segment
Example #14
def _prepare_dataset(
    dataset: List[Pathlike],
) -> Tuple[List[Recording], List[SupervisionSegment]]:
    """Build a list of Recording and SupervisionSegment from a list
    of sound filenames.

    :param dataset: List[Pathlike], a list of sound filenames
    :return: a tuple containing a list of Recording and a list
        of SupervisionSegment
    """
    word_map = {"0": "NO", "1": "YES"}

    recordings = []
    supervisions = []
    for audio_path in dataset:
        words = audio_path.stem.split("_")
        assert len(words) == 8
        assert set(words).union({"0", "1"}) == {"0", "1"}, f"words is: {words}"

        words = [word_map[w] for w in words]
        text = " ".join(words)

        recording = Recording.from_file(audio_path)
        recordings.append(recording)

        segment = SupervisionSegment(
            id=audio_path.stem,
            recording_id=audio_path.stem,
            start=0.0,
            duration=recording.duration,
            channel=0,
            language="Hebrew",
            text=text,
        )
        supervisions.append(segment)

    return recordings, supervisions
Example #15
def prepare_gale_mandarin(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    segment_words: Optional[bool] = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the GALE Mandarin Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param segment_words: Use `jieba` package to perform word segmentation (default = False)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we make a dict to avoid adding them
    # twice.
    audio_paths = defaultdict(
        Path,
        {
            p.stem: p
            for p in chain.from_iterable(
                [
                    check_and_rglob(dir, ext, strict=False)
                    for dir in audio_dirs
                    for ext in ["*.wav", "*.flac"]
                ]
            )
        },
    )
    transcript_paths = chain.from_iterable(
        [check_and_rglob(dir, "*.tdf") for dir in transcript_dirs]
    )

    logging.info("Preparing recordings manifest")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths, segment_words=segment_words)
    ).filter(lambda s: s.recording_id in audio_paths)

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    TEST = [
        line.decode("utf-8").strip() for url in TEST_FILE_URLS for line in urlopen(url)
    ]

    manifests = defaultdict(dict)
    manifests["dev"] = {
        "recordings": recordings.filter(lambda r: r.id in TEST),
        "supervisions": supervisions.filter(lambda s: s.recording_id in TEST),
    }
    manifests["train"] = {
        "recordings": recordings.filter(lambda r: r.id not in TEST),
        "supervisions": supervisions.filter(lambda s: s.recording_id not in TEST),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSONL files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "dev"]:
            manifests[part]["recordings"].to_file(
                output_dir / f"gale-mandarin_recordings_{part}.jsonl.gz"
            )
            manifests[part]["supervisions"].to_file(
                output_dir / f"gale-mandarin_supervisions_{part}.jsonl.gz"
            )

    return manifests
Example #16
def prepare_cslu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    normalize_text: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CSLU Kids corpus. The supervision contains either the
    prompted text, or a transcription of the spontaneous speech, depending on
    whether the utterance was scripted or spontaneous.

    Additionally, the following information is present in the `custom` tag:
    scripted/spontaneous utterance, and verification label (rating between 1 and 4)
    for scripted utterances (see https://catalog.ldc.upenn.edu/docs/LDC2007S18/verification-note.txt
    or top documentation in this script for more information).

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param normalize_text: remove noise tags (<bn>, <bs>) from spontaneous speech transcripts (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir) if isinstance(corpus_dir, str) else corpus_dir

    # Get list of all recordings
    audio_paths = check_and_rglob(corpus_dir, "*.wav")

    # Read verification labels
    verification = {}
    for file in check_and_rglob(corpus_dir, "*-verified.txt"):
        with open(file, "r") as f:
            for line in f:
                path, label = line.strip().split()
                utt = Path(path).stem
                verification[utt] = int(label)

    # Read prompted transcriptions
    prompts = {}
    with open(corpus_dir / "docs" / "all.map", "r") as f:
        for line in f:
            if line.strip() != "":
                prompt, text = line.strip().split(maxsplit=1)
                prompts[prompt] = text[1:-1]  # remove " " around the text

    recordings = []
    supervisions = []
    for p in tqdm(audio_paths, desc="Preparing manifests"):

        # /data/corpora/LDC2007S18/speech/scripted/00/0/ks001/ks001000.wav
        uttid = p.stem  # ks001000
        spk = p.parent.stem  # ks001
        cat = p.parent.parent.stem  # 0
        prompt = p.parent.parent.parent.stem  # 00
        type = p.parent.parent.parent.parent.stem  # scripted

        recording = Recording.from_file(
            p, relative_path_depth=None if absolute_paths else 3
        )
        recordings.append(recording)

        if type == "scripted":
            text = prompts[prompt]
            verification_label = verification[uttid] if uttid in verification else None
            custom = {"type": type, "verification_label": verification_label}
        elif type == "spontaneous":
            text = read_text(
                corpus_dir / "trans" / type / prompt / cat / spk / f"{uttid}.txt",
                normalize=normalize_text,
            )
            custom = {"type": type}
        supervisions.append(
            SupervisionSegment(
                id=uttid,
                recording_id=uttid,
                start=0,
                duration=recording.duration,
                speaker=spk,
                language="English",
                text=text,
                custom=custom,
            )
        )

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        "recordings": recordings,
        "supervisions": supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / "recordings.json")
        manifests["supervisions"].to_json(output_dir / "supervisions.json")

    return manifests
Example #17
def prepare_aishell4(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AISHELL-4 data, please 'pip install textgrid' first.")
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    global_spk_id = {}
    for part in ["train_L", "train_M", "train_S", "test"]:
        recordings = []
        supervisions = []
        wav_path = corpus_dir / part / "wav"
        for audio_path in wav_path.rglob("*.flac"):
            idx = audio_path.stem

            try:
                tg = textgrid.TextGrid.fromFile(
                    f"{corpus_dir}/{part}/TextGrid/{idx}.TextGrid")
            except ValueError:
                logging.warning(
                    f"{idx} has annotation issues. Skipping this recording.")
                continue

            recording = Recording.from_file(audio_path)
            recordings.append(recording)

            for tier in tg.tiers:
                local_spk_id = tier.name
                key = (idx, local_spk_id)
                if key not in global_spk_id:
                    global_spk_id[key] = f"SPK{len(global_spk_id)+1:04d}"
                spk_id = global_spk_id[key]
                for j, interval in enumerate(tier.intervals):
                    if interval.mark != "":
                        start = interval.minTime
                        end = interval.maxTime
                        text = interval.mark
                        segment = SupervisionSegment(
                            id=f"{idx}-{spk_id}-{j}",
                            recording_id=idx,
                            start=start,
                            duration=round(end - start, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            text=text.strip(),
                        )
                        supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"supervisions_{part}.jsonl")
            recording_set.to_file(output_dir / f"recordings_{part}.jsonl")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
Example #18
def test_recording_from_file_using_audioread():
    path = 'test/fixtures/mono_c0.opus'
    recording = Recording.from_file(path)
    recording.load_audio()
Example #19
def recording():
    return Recording.from_file(
        "test/fixtures/libri/libri-1088-134315-0000.wav")
Example #20
def prepare_aidatatang_200zh(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    d = corpus_dir / "aidatatang_200zh"
    assert d.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    transcript_path = d / "transcript/aidatatang_200_zh_transcript.txt"
    assert transcript_path.is_file(), f"No such file: {transcript_path}"

    transcript_dict = {}
    with open(transcript_path, "r", encoding="utf-8") as f:
        for line in f.readlines():
            idx_transcript = line.split()
            transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:])
    manifests = defaultdict(dict)
    dataset_parts = ["dev", "test", "train"]

    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        logging.info(f"Processing {part}")
        recordings = []
        supervisions = []
        wav_path = d / "corpus" / part
        for audio_path in wav_path.rglob("**/*.wav"):
            idx = audio_path.stem
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f"No transcript: {idx}")
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="Chinese",
                speaker=speaker,
                text=text.strip(),
            )
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
Example #21
def prepare_ali_meeting(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "far",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, "near" or "far", specifies whether to prepare the near-field or far-field data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AliMeeting data, please 'pip install textgrid' first."
        )
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["Train", "Eval", "Test"]:
        recordings = []
        supervisions = []
        # Eval and Test may further be inside another folder (since the "far" and "near" are grouped together)
        if part == "Eval" or part == "Test":
            corpus_dir = (
                corpus_dir / f"{part}_Ali"
                if (corpus_dir / f"{part}_Ali").is_dir()
                else corpus_dir
            )
        wav_paths = corpus_dir / f"{part}_Ali_{mic}" / "audio_dir"
        text_paths = corpus_dir / f"{part}_Ali_{mic}" / "textgrid_dir"

        # For 'near' setting:
        #  - wav files have names like R0003_M0046_F_SPK0093.wav
        #  - textgrid files have names like R0003_M0046_F_SPK0093.TextGrid
        # Speaker ID information is present in the file name itself

        # For 'far' setting:
        #  - wav files have names like R0015_M0151_MS002.wav
        #  - textgrid files have names like R0015_M015.TextGrid
        # Speaker ID information is present inside the TextGrid file

        for text_path in tqdm(
            list(text_paths.rglob("*.TextGrid")), desc=f"Preparing {part}"
        ):
            session_id = text_path.stem

            if mic == "near":
                _, _, gender, spk_id = session_id.split("_")
                spk_id = spk_id[3:]  # SPK1953 -> 1953

            try:
                tg = textgrid.TextGrid.fromFile(str(text_path))
            except ValueError:
                logging.warning(
                    f"{session_id} has annotation issues. Skipping this recording."
                )
                continue

            wav_path = list(wav_paths.rglob(f"{session_id}*.wav"))[0]

            recording = Recording.from_file(wav_path, recording_id=session_id)
            recordings.append(recording)

            for tier in tg.tiers:
                if mic == "far":
                    parts = tier.name.split("_")
                    if len(parts) == 4:
                        _, _, gender, spk_id = parts
                    elif len(parts) == 2:
                        gender, spk_id = parts
                    spk_id = spk_id[3:]  # SPK1953 -> 1953

                for i, interval in enumerate(tier.intervals):
                    if interval.mark != "":
                        start = interval.minTime
                        end = interval.maxTime
                        text = interval.mark
                        segment = SupervisionSegment(
                            id=f"{session_id}-{spk_id}-{i}",
                            recording_id=recording.id,
                            start=start,
                            duration=round(end - start, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            gender=gender,
                            text=text.strip(),
                        )
                        supervisions.append(segment)

        # Fix and validate the manifests
        recording_set, supervision_set = fix_manifests(
            RecordingSet.from_recordings(recordings),
            SupervisionSet.from_segments(supervisions),
        )
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"supervisions_{part.lower()}.jsonl")
            recording_set.to_file(output_dir / f"recordings_{part.lower()}.jsonl")

        manifests[part.lower()] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
Example #22
def prepare_aishell(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = corpus_dir / 'data_aishell/transcript/aishell_transcript_v0.8.txt'
    transcript_dict = {}
    with open(transcript_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            idx_transcript = line.split()
            transcript_dict[idx_transcript[0]] = ' '.join(idx_transcript[1:])
    manifests = defaultdict(dict)
    dataset_parts = ['train', 'dev', 'test']
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        wav_path = corpus_dir / 'data_aishell' / 'wav' / f'{part}'
        for audio_path in wav_path.rglob('**/*.wav'):
            idx = audio_path.stem
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f'No transcript: {idx}')
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f'No such file: {audio_path}')
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(id=idx,
                                         recording_id=idx,
                                         start=0.0,
                                         duration=recording.duration,
                                         channel=0,
                                         language='Chinese',
                                         speaker=speaker,
                                         text=text.strip())
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set
        }

    return manifests
Example #23
def create_recording(
    audio_path_and_rel_path_depth: Tuple[Pathlike, Union[int, None]]
) -> Recording:
    audio_path, rel_path_depth = audio_path_and_rel_path_depth
    return Recording.from_file(audio_path, relative_path_depth=rel_path_depth)
Example #24
def prepare_switchboard(
    audio_dir: Pathlike,
    transcripts_dir: Optional[Pathlike] = None,
    sentiment_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    omit_silence: bool = True,
    absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    We create two manifests: one with recordings, and the other one with text supervisions.
    When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    text_paths = check_and_rglob(transcripts_dir, '*trans.text')

    groups = []
    name_to_text = {p.stem.split('-')[0]: p for p in text_paths}
    for ap in audio_paths:
        name = ap.stem.replace('sw0', 'sw')
        groups.append({
            'audio': ap,
            'text-0': name_to_text[f'{name}A'],
            'text-1': name_to_text[f'{name}B']
        })

    recordings = RecordingSet.from_recordings(
        Recording.from_file(group['audio'],
                            relative_path_depth=None if absolute_paths else 3)
        for group in groups)
    supervisions = SupervisionSet.from_segments(
        chain.from_iterable(
            make_segments(transcript_path=group[f'text-{channel}'],
                          recording=recording,
                          channel=channel,
                          omit_silence=omit_silence)
            for group, recording in zip(groups, recordings)
            for channel in [0, 1]))

    validate_recordings_and_supervisions(recordings, supervisions)

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {'recordings': recordings, 'supervisions': supervisions}
Example #25
def audio_read_worker(p: Path) -> Recording:
    # Nested helper: `durations` is a dict defined in the enclosing scope.
    r = Recording.from_file(p,
                            recording_id=f"{p.parent.stem}_{p.stem}")
    durations[r.id] = r.duration
    return r
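The helper above is a nested worker: it writes into a durations dict owned by an enclosing function that is not shown. A self-contained sketch of how such a worker might be used; the directory layout and the sequential loop are assumptions for illustration.

from pathlib import Path
from typing import Dict

from lhotse import Recording, RecordingSet


def build_recordings_with_durations(audio_dir: Path) -> RecordingSet:
    durations: Dict[str, float] = {}

    def audio_read_worker(p: Path) -> Recording:
        # Build a recording id from the parent directory name and the file stem.
        r = Recording.from_file(p, recording_id=f"{p.parent.stem}_{p.stem}")
        durations[r.id] = r.duration
        return r

    recordings = RecordingSet.from_recordings(
        audio_read_worker(p) for p in sorted(audio_dir.rglob("*.wav"))
    )
    # `durations` now maps each recording id to its length in seconds.
    return recordings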
Example #26
def prepare_earnings21(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    normalize_text: bool = False,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. The structure is
        expected to mimic the structure in the github repository, notably
        the mp3 files will be searched for in [corpus_dir]/media and transcriptions
        in the directory [corpus_dir]/transcripts/nlp_references
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text.
    :return: (recordings, supervisions) pair

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case
        to lower case. This includes removing possibly important punctuations such as
        dashes and apostrophes.
    """

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    media_dir = corpus_dir / "media"
    audio_files = list(media_dir.glob("*.mp3"))
    assert len(audio_files) == 44

    audio_files.sort()
    recording_set = RecordingSet.from_recordings(
        Recording.from_file(p) for p in audio_files)

    nlp_dir = corpus_dir / "transcripts" / "nlp_references"
    nlp_files = list(nlp_dir.glob("*.nlp"))
    assert len(nlp_files) == 44

    supervision_segments = list()
    for nlp_file in nlp_files:
        id = nlp_file.stem
        text = " ".join(parse_nlp_file(nlp_file))
        if normalize_text:
            text = normalize(text)

        s = SupervisionSegment(
            id=id,
            recording_id=id,
            start=0.0,
            duration=recording_set[id].duration,
            channel=0,
            language="English",
            text=text,
        )
        supervision_segments.append(s)
    supervision_set = SupervisionSet.from_segments(supervision_segments)

    validate_recordings_and_supervisions(recording_set, supervision_set)
    if output_dir is not None:
        supervision_set.to_file(output_dir /
                                "earnings21_supervisions_all.jsonl.gz")
        recording_set.to_file(output_dir /
                              "earnings21_recordings_all.jsonl.gz")

    return recording_set, supervision_set
Example #27
def prepare_libricss(
    corpus_dir: Pathlike,
    output_dir: Pathlike = None,
    type: str = "mdm",
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    NOTE: The recordings contain all 7 channels. If you want to use only one channel, you can
    use either ``recording.load_audio(channel=0)`` or ``MonoCut(id=...,recording=recording,channel=0)``
    while creating the CutSet.

    :param corpus_dir: Pathlike, the path to the extracted corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param type: str, the type of data to prepare ('mdm', 'ihm-mix', or 'ihm'). These settings
        are similar to the ones in AMI and ICSI recipes.
    :return: a dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.

    """
    assert type in ["mdm", "ihm-mix", "ihm"]

    manifests = {}

    corpus_dir = Path(corpus_dir)
    corpus_dir = (corpus_dir / "for_release"
                  if corpus_dir.stem != "for_release" else corpus_dir)

    recordings = []
    segments = []

    for ov in OVERLAP_RATIOS:
        for session in (corpus_dir / ov).iterdir():
            _, _, _, _, _, name, actual_ov = session.name.split("_")
            actual_ov = float(actual_ov.split("actual")[1])
            recording_id = f"{ov}_{name}"
            if type == "ihm-mix":
                audio_path = session / "clean" / "mix.wav"
            elif type == "ihm":
                audio_path = session / "clean" / "each_spk.wav"
            else:
                audio_path = session / "record" / "raw_recording.wav"
            recordings.append(
                Recording.from_file(audio_path, recording_id=recording_id))
            for idx, seg in enumerate(
                    parse_transcript(session / "transcription" /
                                     "meeting_info.txt")):
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=seg[0],
                        duration=seg[1] - seg[0],
                        text=seg[4],
                        language="English",
                        speaker=seg[2],
                        channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
                        if type == "ihm" else 0,
                    ))

    supervisions = SupervisionSet.from_segments(segments)
    recordings = RecordingSet.from_recordings(recordings)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        recordings.to_jsonl(output_dir / "recordings.jsonl")
        supervisions.to_jsonl(output_dir / "supervisions.jsonl")

    return {"recordings": recordings, "supervisions": supervisions}
Example #28
def prepare_timit(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_phones: int = 48,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write and save the manifests.
    :param num_phones: int, the number of phones (60, 48, or 39) to use for modeling; 48 is the default.
    :param num_jobs: int, the number of parallel jobs (default = 1).
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["TRAIN", "DEV", "TEST"]

    phones_dict = {}

    if num_phones in [60, 48, 39]:
        phones_dict = get_phonemes(num_phones)
    else:
        raise ValueError("The value of num_phones must be in [60, 48, 39].")

    dev_spks, test_spks = get_speakers()

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in dataset_parts:
            wav_files = []

            if part == "TRAIN":
                print("starting....")
                wav_files = glob.glob(str(corpus_dir) + "/TRAIN/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
            elif part == "DEV":
                wav_files = glob.glob(str(corpus_dir) + "/TEST/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
                wav_files = list(
                    filter(lambda x: x.split("/")[-2].lower() in dev_spks,
                           wav_files))
            else:
                wav_files = glob.glob(str(corpus_dir) + "/TEST/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
                wav_files = list(
                    filter(lambda x: x.split("/")[-2].lower() in test_spks,
                           wav_files))

            logging.debug(f"{part} dataset manifest generation.")
            recordings = []
            supervisions = []

            for wav_file in tqdm(wav_files):
                items = str(wav_file).strip().split("/")
                idx = items[-2] + "-" + items[-1][:-4]
                speaker = items[-2]
                transcript_file = Path(wav_file).with_suffix(".PHN")
                if not Path(wav_file).is_file():
                    logging.warning(f"No such file: {wav_file}")
                    continue
                if not Path(transcript_file).is_file():
                    logging.warning(f"No transcript: {transcript_file}")
                    continue
                text = []
                with open(transcript_file, "r") as f:
                    lines = f.readlines()
                    for line in lines:
                        phone = line.rstrip("\n").split(" ")[-1]
                        if num_phones != 60:
                            phone = phones_dict[str(phone)]
                        text.append(phone)

                    text = " ".join(text).replace("h#", "sil")

                recording = Recording.from_file(path=wav_file,
                                                recording_id=idx)
                recordings.append(recording)
                segment = SupervisionSegment(
                    id=idx,
                    recording_id=idx,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language="English",
                    speaker=speaker,
                    text=text.strip(),
                )

                supervisions.append(segment)

            # Build, validate, and (optionally) write the manifests once per part,
            # after all WAV files have been processed.
            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)
            validate_recordings_and_supervisions(recording_set,
                                                 supervision_set)

            if output_dir is not None:
                supervision_set.to_file(
                    output_dir / f"timit_supervisions_{part}.jsonl.gz")
                recording_set.to_file(output_dir /
                                      f"timit_recordings_{part}.jsonl.gz")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
Example #29
def prepare_cmu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CMU Kids corpus. The prepared supervisions contain the
    prompt text as the `text`. Additionally, in the `custom` tag, we provide the
    following data: speaker grade/age, population where the speaker came from
    (SUM95/FP), spoken transcript, and transcription bin (1/2).

    Here, bin `1` means utterances where the speaker followed the prompt and no
    noise/mispronunciation is present, and `2` refers to noisy utterances.

    The tag `spoken_transcript` is the transcription that was actually spoken. It
    contains noise tags and phone transcription in case the pronunciation differed
    from that in CMU Dict.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir) if isinstance(corpus_dir,
                                                str) else corpus_dir
    corpus_dir = corpus_dir.parent if corpus_dir.stem == "cmu_kids" else corpus_dir

    recordings = []
    supervisions = []

    # Get transcripts for all utterances
    utterances = {}
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'sentence.tbl', 'r') as f:
        for line in f:
            utt, count, text = line.strip().split('\t')
            utterances[utt] = text

    # Get speaker metadata
    speaker_info = {}
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'speaker.tbl', 'r') as f:
        for _ in range(2):
            next(f)
        for line in f:
            # ID    LOC     GR/AGE  TOT     BIN2
            # fabm    SUM95   3/9     100     62
            # facs    SUM95   2/8     90      55
            spk, pop, gr_age, _, _ = line.strip().split('\t')
            grade, age = gr_age.split('/')
            speaker_info[spk] = (pop, grade, age)

    # Iterate through all transcriptions and add to supervisions
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'transcrp.tbl', 'r') as f:
        for line in f:
            trn_id, transcript = line.strip().split(maxsplit=1)
            spk = trn_id[0:4]
            utt = trn_id[4:7]
            bin = int(trn_id[7])
            pop, grade, age = speaker_info[spk]

            audio_path = (corpus_dir / 'cmu_kids' / 'kids' / spk / 'signal' /
                          f'{trn_id}.sph')
            recording = Recording.from_file(
                audio_path, relative_path_depth=None if absolute_paths else 3)
            recordings.append(recording)

            supervisions.append(
                SupervisionSegment(
                    id=trn_id,
                    recording_id=trn_id,
                    start=0,
                    duration=recording.duration,
                    speaker=spk,
                    gender="Male" if spk[0] == 'm' else "Female",
                    language='English',
                    text=utterances[utt],
                    custom={
                        'speaker_grade': grade if grade != "NA" else None,
                        'speaker_age': int(age) if age != "NA" else None,
                        'speaker_population': pop,
                        'bin': bin,
                        'spoken_transcript': transcript,
                    },
                ))

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        'recordings': recordings,
        'supervisions': supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / 'recordings.json')
        manifests["supervisions"].to_json(output_dir / 'supervisions.json')

    return manifests
Example #30
def prepare_gale_arabic(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the GALE Arabic Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we make a dict to avoid adding them
    # twice.
    audio_paths = defaultdict(
        Path,
        {
            p.stem: p
            for p in chain.from_iterable([
                check_and_rglob(dir, ext, strict=False) for dir in audio_dirs
                for ext in ['*.wav', '*.flac']
            ])
        },
    )
    transcript_paths = chain.from_iterable(
        [check_and_rglob(dir, '*.tdf') for dir in transcript_dirs])
    transcript_paths = [p for p in transcript_paths]

    logging.info("Preparing recordings manifest")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(p,
                            relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths.values())

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths))

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = defaultdict(dict)
    manifests['test'] = {
        'recordings': recordings.filter(lambda r: r.id in TEST),
        'supervisions': supervisions.filter(lambda s: s.recording_id in TEST),
    }
    manifests['train'] = {
        'recordings': recordings.filter(lambda r: r.id not in TEST),
        'supervisions':
        supervisions.filter(lambda s: s.recording_id not in TEST),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "test"]:
            manifests[part]["recordings"].to_json(output_dir /
                                                  f'recordings_{part}.json')
            manifests[part]["supervisions"].to_json(
                output_dir / f'supervisions_{part}.json')

    return manifests