Exemplo n.º 1
0
def prepare_mobvoihotwords(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    For each of the 'train', 'dev' and 'test' parts, the ``p_<part>.json`` and
    ``n_<part>.json`` lists found in ``mobvoi_hotword_dataset_resources`` are read.
    Every listed utterance whose wav file exists in ``mobvoi_hotword_dataset``
    yields one recording and one supervision spanning the whole recording.
    Utterances whose wav file is missing are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    dataset_parts = ['train', 'dev', 'test']

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir)

    for part in dataset_parts:
        logging.info(f'Preparing MobvoiHotwords subset: {part}')
        if manifests_exist(part=part, output_dir=output_dir):
            logging.info(
                f'MobvoiHotwords subset: {part} already prepared - skipping.')
            continue
        recordings = []
        supervisions = []
        for prefix in ['p_', 'n_']:
            prefixed_part = prefix + part
            json_path = corpus_dir / 'mobvoi_hotword_dataset_resources' / f'{prefixed_part}.json'
            # Load the whole list first so the file handle is released before
            # the (potentially long) per-utterance audio probing starts.
            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
            for entry in json_data:
                idx = entry['utt_id']
                # Fall back to the utterance id when no speaker id is given.
                speaker = idx if entry['speaker_id'] is None else entry[
                    'speaker_id']
                audio_path = corpus_dir / 'mobvoi_hotword_dataset' / f'{idx}.wav'
                # keyword_id 0 and 1 are the two hotwords; -1 is anything else.
                text = 'FREETEXT'
                if entry['keyword_id'] == 0:
                    text = 'HiXiaowen'
                elif entry['keyword_id'] == 1:
                    text = 'NihaoWenwen'
                else:
                    assert entry['keyword_id'] == -1
                if not audio_path.is_file():
                    logging.warning(f'No such file: {audio_path}')
                    continue
                recording = Recording.from_file(audio_path)
                recordings.append(recording)
                segment = SupervisionSegment(id=idx,
                                             recording_id=idx,
                                             start=0.0,
                                             duration=recording.duration,
                                             channel=0,
                                             language='Chinese',
                                             speaker=speaker,
                                             text=text.strip())
                supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set
        }

    return manifests
Exemplo n.º 2
0
def prepare_ali_meeting(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "far",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, "near" or "far", specifies whether to prepare the near-field or far-field data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    :raises ValueError: when the ``textgrid`` package is not installed.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AliMeeting data, please 'pip install textgrid' first.")
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["Train", "Eval", "Test"]:
        recordings = []
        supervisions = []
        # Eval and Test may further be inside another folder (since the "far" and "near" are grouped together).
        # The location is resolved per part WITHOUT overwriting ``corpus_dir``:
        # otherwise the "Eval" iteration would redirect the "Test" lookup into
        # the Eval folder and the Test data would never be found.
        part_dir = corpus_dir
        if part in ("Eval", "Test") and (corpus_dir / f"{part}_Ali").is_dir():
            part_dir = corpus_dir / f"{part}_Ali"
        wav_paths = part_dir / f"{part}_Ali_{mic}" / "audio_dir"
        text_paths = part_dir / f"{part}_Ali_{mic}" / "textgrid_dir"

        # For 'near' setting:
        #  - wav files have names like R0003_M0046_F_SPK0093.wav
        #  - textgrid files have names like R0003_M0046_F_SPK0093.TextGrid
        # Speaker ID information is present in the file name itself

        # For 'far' setting:
        #  - wav files have names like R0015_M0151_MS002.wav
        #  - textgrid files have names like R0015_M015.TextGrid
        # Speaker ID information is present inside the TextGrid file

        for text_path in tqdm(list(text_paths.rglob("*.TextGrid")),
                              desc=f"Preparing {part}"):
            session_id = text_path.stem

            if mic == "near":
                _, _, gender, spk_id = session_id.split("_")
                spk_id = spk_id[3:]  # SPK1953 -> 1953

            try:
                tg = textgrid.TextGrid.fromFile(str(text_path))
            except ValueError:
                logging.warning(
                    f"{session_id} has annotation issues. Skipping this recording."
                )
                continue

            wav_path = list(wav_paths.rglob(f"{session_id}*.wav"))[0]

            recording = Recording.from_file(wav_path, recording_id=session_id)
            recordings.append(recording)

            for tier in tg.tiers:
                if mic == "far":
                    name_fields = tier.name.split("_")
                    if len(name_fields) == 4:
                        _, _, gender, spk_id = name_fields
                    elif len(name_fields) == 2:
                        gender, spk_id = name_fields
                    else:
                        # Without this guard the speaker info of the previous
                        # tier would be silently reused (and stripped twice).
                        logging.warning(
                            f"{session_id}: unexpected tier name '{tier.name}'. Skipping this tier."
                        )
                        continue
                    spk_id = spk_id[3:]  # SPK1953 -> 1953

                for i, interval in enumerate(tier.intervals):
                    # Intervals with an empty mark carry no transcript.
                    if interval.mark != "":
                        start = interval.minTime
                        end = interval.maxTime
                        text = interval.mark
                        segment = SupervisionSegment(
                            id=f"{session_id}-{spk_id}-{i}",
                            recording_id=recording.id,
                            start=start,
                            duration=round(end - start, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            gender=gender,
                            text=text.strip(),
                        )
                        supervisions.append(segment)

        # Fix manifests, then validate the result
        recording_set, supervision_set = fix_manifests(
            RecordingSet.from_recordings(recordings),
            SupervisionSet.from_segments(supervisions),
        )
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir /
                                    f"supervisions_{part.lower()}.jsonl")
            recording_set.to_file(output_dir /
                                  f"recordings_{part.lower()}.jsonl")

        manifests[part.lower()] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
Exemplo n.º 3
0
def prepare_librimix(
    librimix_csv: Pathlike,
    output_dir: Optional[Pathlike] = None,
    with_precomputed_mixtures: bool = False,
    sampling_rate: int = 16000,
    min_segment_seconds: Seconds = 3.0
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build recording and supervision manifests from a LibriMix metadata CSV.

    Rows whose ``length / sampling_rate`` does not exceed ``min_segment_seconds``
    are left out of every manifest.

    :param librimix_csv: Pathlike, the path of the LibriMix metadata CSV file.
    :param output_dir: Pathlike, the path where to write the manifests (nothing is written when None).
    :param with_precomputed_mixtures: when True, also create the 'premixed' manifests
        that point at the ``mixture_path`` column.
    :param sampling_rate: sampling rate used to convert the ``length`` column into a duration.
    :param min_segment_seconds: only rows strictly longer than this are kept.
    :return: a Dict with the key 'sources', plus 'premixed' (when requested) and 'noise'
        (when the CSV has a ``noise_path`` column); each value is a Dict with
        the keys 'recordings' and 'supervisions'.
    """
    import pandas as pd
    assert Path(librimix_csv).is_file(), f'No such file: {librimix_csv}'
    df = pd.read_csv(librimix_csv)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)

    def build_part(path_columns, file_tag):
        # One Recording per sufficiently long CSV row; the i-th path column
        # becomes the audio source assigned to channel i.
        recording_set = RecordingSet.from_recordings(
            Recording(
                id=row['mixture_ID'],
                sources=[
                    AudioSource(
                        type='file', channels=[channel], source=row[column])
                    for channel, column in enumerate(path_columns)
                ],
                sampling_rate=sampling_rate,
                num_samples=int(row['length']),
                duration=row['length'] / sampling_rate)
            for _, row in df.iterrows()
            if row['length'] / sampling_rate > min_segment_seconds)
        supervision_set = make_corresponding_supervisions(recording_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)
        if output_dir is not None:
            recording_set.to_json(output_dir / f'recordings_{file_tag}.json')
            supervision_set.to_json(
                output_dir / f'supervisions_{file_tag}.json')
        return {
            'recordings': recording_set,
            'supervisions': supervision_set
        }

    # The pairs of source recordings that are meant to be mixed together.
    manifests['sources'] = build_part(['source_1_path', 'source_2_path'],
                                      'sources')

    # Optional manifest for the pre-computed mixtures; the alternative is
    # Lhotse's on-the-fly overlaying of audio Cuts.
    if with_precomputed_mixtures:
        manifests['premixed'] = build_part(['mixture_path'], 'mix')

    # Noises listed in the CSV get their own RecordingSet, so that their
    # features can be extracted and overlaid as Cuts later.
    if 'noise_path' in df:
        manifests['noise'] = build_part(['noise_path'], 'noise')

    return manifests
Exemplo n.º 4
0
def prepare_gale_arabic(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = True,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for GALE Arabic Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: When False, recordings are created with ``relative_path_depth=3``
        instead of ``relative_path_depth=None``.
    :return: A dict with the keys ``{'train', 'test'}``; each value is a dict with manifests
        whose keys are ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we key them by file stem to avoid
    # adding them twice (the last path seen for a stem wins).
    audio_paths = {
        p.stem: p
        for p in chain.from_iterable(
            check_and_rglob(audio_dir, ext, strict=False)
            for audio_dir in audio_dirs
            for ext in ["*.wav", "*.flac"])
    }
    transcript_paths = list(
        chain.from_iterable(
            check_and_rglob(transcript_dir, "*.tdf")
            for transcript_dir in transcript_dirs))

    logging.info("Preparing recordings manifest")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(p,
                            relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths.values())

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths))

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Recordings whose id is listed in TEST form the test split; everything
    # else goes to train.
    manifests = defaultdict(dict)
    manifests["test"] = {
        "recordings": recordings.filter(lambda r: r.id in TEST),
        "supervisions": supervisions.filter(lambda s: s.recording_id in TEST),
    }
    manifests["train"] = {
        "recordings": recordings.filter(lambda r: r.id not in TEST),
        "supervisions":
        supervisions.filter(lambda s: s.recording_id not in TEST),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "test"]:
            manifests[part]["recordings"].to_json(output_dir /
                                                  f"recordings_{part}.json")
            manifests[part]["supervisions"].to_json(
                output_dir / f"supervisions_{part}.json")

    return manifests
Exemplo n.º 5
0
def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f'No such directory: {speech_dir}'
    assert transcript_dir.is_dir(), f'No such directory: {transcript_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames
    # (raw strings, so that regex escapes are not parsed as string escapes)
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile('Answers_Spanish')
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+")
    usma_native_path_pattern = re.compile('usma/native')
    usma_native_prompt_id_pattern = re.compile(r's\d+')
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r'nonnative.+\.wav')

    # Mapping: transcript id -> text.
    # NOTE(review): this is a defaultdict(dict), so looking up an id that was
    # never stored yields an empty dict instead of raising KeyError.
    transcripts = defaultdict(dict)
    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split('/')
            utt_id = '-'.join(['answers', spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir,
                                          heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['heroico-recitations', idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['usma', idx])
            transcripts[utt_id] = text

    # Collect the audio files once; the list is reused for every fold below.
    audio_files = list(speech_dir.rglob('*.wav'))

    # store utterance info: str(wav path) -> UttInfo, or None when the file
    # has to be skipped
    uttdata = {}
    for wav_path in audio_files:
        wav_key = str(wav_path)
        pid = wav_path.stem
        # the speaker is the name of the directory holding the wav file
        spk = wav_path.parts[-2]
        if answers_path_pattern.search(wav_key):
            # store utterance info for Heroico Answers
            utt_id = '-'.join(['answers', spk, pid])
            if utt_id not in transcripts:
                uttdata[wav_key] = None
                continue
            uttdata[wav_key] = UttInfo(fold='train',
                                       speaker=spk,
                                       prompt_id=pid,
                                       subcorpus='answers',
                                       utterance_id=utt_id,
                                       transcript=transcripts[utt_id])
        elif usma_native_path_pattern.search(wav_key):
            # store utterance info for usma native data
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            # NOTE(review): there is no `continue` after this check, so its
            # assignment is always overwritten below and the check has no
            # effect. The pattern starts with 'usma/' while `spk` is a single
            # path component, so it presumably never matches; adding a
            # `continue` here would then drop all native data. Confirm the
            # intent before changing.
            if not usma_native_demo_pattern.match(spk):
                uttdata[wav_key] = None
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[wav_key] = None
                continue
            uttdata[wav_key] = UttInfo(fold='test',
                                       speaker=spk,
                                       prompt_id=pid,
                                       subcorpus='usma',
                                       utterance_id=utt_id,
                                       transcript=transcripts[trans_id])
        elif usma_nonnative_path_pattern.search(wav_key):
            # store utterance data for usma nonnative data
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[wav_key] = None
                continue
            uttdata[wav_key] = UttInfo(fold='test',
                                       speaker=spk,
                                       prompt_id=pid,
                                       subcorpus='usma',
                                       utterance_id=utt_id,
                                       transcript=transcripts[trans_id])
        else:
            # Everything else is treated as a Heroico recitation whose file
            # stem is a numeric prompt id (int() raises ValueError otherwise).
            prompt_number = int(pid)
            if prompt_number <= 354 or prompt_number >= 562:
                # store utterance info for heroico recitations for train dataset
                utt_id = '-'.join(['heroico-recitations', spk, pid])
                trans_id = '-'.join(['heroico-recitations', pid])
                uttdata[wav_key] = UttInfo(fold='train',
                                           speaker=spk,
                                           prompt_id=pid,
                                           subcorpus='heroico-recitations',
                                           utterance_id=utt_id,
                                           transcript=transcripts[trans_id])
            else:
                # prompt ids 355..561 go to the devtest fold
                # NOTE(review): none of the transcript loops above stores ids
                # with the 'heroico-recitations-repeats' prefix, so this
                # lookup appears to always yield the empty default - confirm.
                utt_id = '-'.join(['heroico-recitations-repeats', spk, pid])
                trans_id = '-'.join(['heroico-recitations-repeats', pid])
                uttdata[wav_key] = UttInfo(
                    fold='devtest',
                    speaker=spk,
                    prompt_id=pid,
                    subcorpus='heroico-recitations-repeats',
                    utterance_id=utt_id,
                    transcript=transcripts[trans_id])

    for fld in folds:
        metadata = {}
        for wav_path in audio_files:
            utt_info = uttdata[str(wav_path)]
            # skip files with no record
            if not utt_info:
                continue
            # only process the current fold
            if utt_info.fold != fld:
                continue
            prompt_id = wav_path.stem
            # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
            # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
            info = torchaudio.info(str(wav_path))
            spk = wav_path.parts[-2]
            utt_id = '-'.join([utt_info.subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(audio_path=wav_path,
                                               audio_info=info[0],
                                               text=utt_info.transcript)

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(id=idx,
                      sources=[
                          AudioSource(type='file',
                                      channels=[0],
                                      source=str(metadata[idx].audio_path))
                      ],
                      sampling_rate=int(metadata[idx].audio_info.rate),
                      num_samples=metadata[idx].audio_info.length,
                      duration=(metadata[idx].audio_info.length /
                                metadata[idx].audio_info.rate))
            for idx in metadata)

        # Supervision: one segment covering each whole recording
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(id=idx,
                               recording_id=idx,
                               start=0.0,
                               duration=audio.recordings[idx].duration,
                               channel=0,
                               language='Spanish',
                               speaker=idx.split('-')[-2],
                               text=metadata[idx].text)
            for idx in audio.recordings)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{fld}.json')
            audio.to_json(output_dir / f'recordings_{fld}.json')

        manifests[fld] = {'recordings': audio, 'supervisions': supervision}

    return manifests
Exemplo n.º 6
0
def prepare_switchboard(
    audio_dir: Pathlike,
    transcripts_dir: Optional[Pathlike] = None,
    sentiment_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    omit_silence: bool = True,
    absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build the Switchboard manifests: one for the recordings and one for the
    text supervisions. If ``sentiment_dir`` is given, the sentiment annotations
    found there are parsed and added to the supervisions.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :param absolute_paths: When False, recordings are created with ``relative_path_depth=3``
        instead of ``relative_path_depth=None``.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    text_paths = check_and_rglob(transcripts_dir, '*trans.text')

    # Transcript files are looked up by the part of their stem before the first '-'.
    name_to_text = {}
    for text_path in text_paths:
        name_to_text[text_path.stem.split('-')[0]] = text_path

    def make_group(audio_path):
        # Audio stems use the 'sw0' prefix where transcript names use 'sw';
        # sides 'A' and 'B' become the transcripts of channel 0 and 1.
        name = audio_path.stem.replace('sw0', 'sw')
        return {
            'audio': audio_path,
            'text-0': name_to_text[f'{name}A'],
            'text-1': name_to_text[f'{name}B']
        }

    groups = [make_group(audio_path) for audio_path in audio_paths]

    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(group['audio'], relative_path_depth=path_depth)
        for group in groups)

    def segment_batches():
        # One batch of segments per (recording, channel) pair.
        for group, recording in zip(groups, recordings):
            for channel in [0, 1]:
                yield make_segments(transcript_path=group[f'text-{channel}'],
                                    recording=recording,
                                    channel=channel,
                                    omit_silence=omit_silence)

    supervisions = SupervisionSet.from_segments(
        chain.from_iterable(segment_batches()))

    validate_recordings_and_supervisions(recordings, supervisions)

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')

    return {'recordings': recordings, 'supervisions': supervisions}
Exemplo n.º 7
0
def prepare_aishell(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    Transcripts are read from ``data_aishell/transcript/aishell_transcript_v0.8.txt``
    and the audio of each part from ``data_aishell/wav/<part>``. Audio files
    without a transcript are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = corpus_dir / 'data_aishell/transcript/aishell_transcript_v0.8.txt'
    # utterance id -> transcript, with the whitespace between tokens normalised
    transcript_dict = {}
    with open(transcript_path, 'r', encoding='utf-8') as f:
        for line in f:
            idx_transcript = line.split()
            if not idx_transcript:
                # A blank line has no utterance id to index by.
                continue
            transcript_dict[idx_transcript[0]] = ' '.join(idx_transcript[1:])
    manifests = defaultdict(dict)
    dataset_parts = ['train', 'dev', 'test']
    for part in dataset_parts:
        recordings = []
        supervisions = []
        wav_path = corpus_dir / 'data_aishell' / 'wav' / f'{part}'
        # rglob() is already recursive, so no leading '**/' is needed.
        for audio_path in wav_path.rglob('*.wav'):
            idx = audio_path.stem
            # the speaker is the name of the directory holding the wav file
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f'No transcript: {idx}')
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f'No such file: {audio_path}')
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(id=idx,
                                         recording_id=idx,
                                         start=0.0,
                                         duration=recording.duration,
                                         channel=0,
                                         language='Chinese',
                                         speaker=speaker,
                                         text=text.strip())
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set
        }

    return manifests
Exemplo n.º 8
0
def prepare_ami(
    data_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    Every ``*.wav`` file found (recursively) under ``<data_dir>/wav_db`` is assigned
    to a dataset part by matching its file name, cut at the first dot, against the
    module-level ``dataset_parts`` mapping. Files without an entry in the parsed
    annotations are skipped with a warning. Parts that end up with no recordings
    are left out of the result.

    :param data_dir: Pathlike, the path of the data dir; ``annotations.gzip`` and
        the ``wav_db`` directory are read from it.
    :param output_dir: Pathlike, the path where to write the manifests.
        When None, nothing is written to disk.
    :return: a Dict whose key is ('train', 'dev', 'eval'), and the value is Dicts with keys 'audio' and 'supervisions'.
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f'No such directory: {data_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    anotation_lists = parse_ami_annotations(data_dir / 'annotations.gzip')
    wav_dir = data_dir / 'wav_db'
    # Materialize once: the same listing is scanned again for every dataset part.
    audio_paths = list(wav_dir.rglob('*.wav'))

    manifests = defaultdict(dict)

    for part in dataset_parts:
        # Audio
        recordings = []
        for audio_path in audio_paths:
            audio_idx = audio_path.name
            # The part membership test uses the file name up to its first dot.
            if re.sub(r'\..*', '', audio_idx) not in dataset_parts[part]:
                continue
            if audio_idx not in anotation_lists:
                logging.warning(f'No annotation found for {audio_idx}')
                continue
            audio_info = torchaudio.info(str(audio_path))[0]

            recordings.append(
                Recording(
                    id=audio_idx,
                    sources=[
                        AudioSource(type='file',
                                    channels=[0],
                                    source=str(audio_path))
                    ],
                    sampling_rate=int(audio_info.rate),
                    num_samples=audio_info.length,
                    # Keep the fractional part: truncating to whole seconds makes the
                    # duration disagree with num_samples and lets supervisions overrun it.
                    duration=audio_info.length / audio_info.rate,
                ))
        if not recordings:
            continue
        audio = RecordingSet.from_recordings(recordings)

        # Supervisions
        segments_by_pause = []
        for idx in audio.recordings:
            anotation = anotation_lists[idx]
            for seg_idx, seg_info in enumerate(anotation):
                for subseg_idx, subseg_info in enumerate(seg_info):
                    duration = subseg_info.end_time - subseg_info.begin_time
                    # Zero-length and negative-length sub-segments are dropped.
                    if duration > 0:
                        segments_by_pause.append(
                            SupervisionSegment(
                                id=f'{idx}-{seg_idx}-{subseg_idx}',
                                recording_id=idx,
                                start=subseg_info.begin_time,
                                duration=duration,
                                channel=0,
                                language='English',
                                # Speaker label: the recording id up to its first dash.
                                speaker=re.sub(r'-.*', r'', idx),
                                text=subseg_info.text))
        supervision = SupervisionSet.from_segments(segments_by_pause)
        if output_dir is not None:
            audio.to_json(output_dir / f'audio_{part}.json')
            supervision.to_json(output_dir / f'supervisions_{part}.json')

        manifests[part] = {'audio': audio, 'supervisions': supervision}

    return manifests
Exemplo n.º 9
0
def prepare_aishell4(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build the AISHELL-4 recording and supervision manifests.

    Each ``*.flac`` file found under ``<corpus_dir>/<part>/wav`` is paired with the
    TextGrid of the same stem in ``<corpus_dir>/<part>/TextGrid``. Every interval
    with a non-empty mark, in every tier, becomes one supervision. A tier is mapped
    to a speaker label that is unique per (recording id, tier name) pair across
    all parts.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
        When None, the manifests are only returned.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    :raises ValueError: when the ``textgrid`` package is not available.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AISHELL-4 data, please 'pip install textgrid' first.")
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # (recording id, tier name) -> speaker label; shared by all parts so that
    # the numbering keeps growing across them.
    speaker_labels = {}
    for part in ["train_L", "train_M", "train_S", "test"]:
        part_recordings = []
        part_supervisions = []
        for flac_path in (corpus_dir / part / "wav").rglob("*.flac"):
            idx = flac_path.stem

            try:
                tg = textgrid.TextGrid.fromFile(
                    f"{corpus_dir}/{part}/TextGrid/{idx}.TextGrid")
            except ValueError:
                logging.warning(
                    f"{idx} has annotation issues. Skipping this recording.")
                continue

            # The recording is only registered once its annotation parsed.
            part_recordings.append(Recording.from_file(flac_path))

            for tier in tg.tiers:
                spk_id = speaker_labels.setdefault(
                    (idx, tier.name), f"SPK{len(speaker_labels) + 1:04d}")
                for j, interval in enumerate(tier.intervals):
                    if interval.mark == "":
                        continue
                    begin = interval.minTime
                    finish = interval.maxTime
                    part_supervisions.append(
                        SupervisionSegment(
                            id=f"{idx}-{spk_id}-{j}",
                            recording_id=idx,
                            start=begin,
                            duration=round(finish - begin, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            text=interval.mark.strip(),
                        ))

        recording_set = RecordingSet.from_recordings(part_recordings)
        supervision_set = SupervisionSet.from_segments(part_supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"aishell4_supervisions_{part}.jsonl.gz")
            recording_set.to_file(
                output_dir / f"aishell4_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
Exemplo n.º 10
0
def prepare_aspire(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: str = "single"
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    For ``mic="single"`` the recordings are collected directly from the ``*.wav``
    files of each part. For ``mic="multi"`` the wav files are grouped by their stem
    without the last ``_``-separated field, and each group becomes one recording
    with one audio source per file.

    :param corpus_dir: Pathlike, the path of the corpus dir (LDC2017S21).
    :param output_dir: Pathlike, the path where to write the manifests.
        When None, the manifests are only returned.
    :param mic: str, the microphone type, either "single" or "multi".
    :return: a Dict whose key is the dataset part ('dev' and 'dev_test'), and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    assert mic in [
        "single",
        "multi",
    ], f"mic must be either 'single' or 'multi', got {mic}"
    corpus_dir = corpus_dir / "IARPA-ASpIRE-Dev-Sets-v2.0" / "data"
    audio_dir = corpus_dir / "dev_and_dev_test_audio"
    stm_dir = corpus_dir / "dev_and_dev_test_STM_files"

    if mic == "single":
        audio_paths = {
            "dev": audio_dir / "ASpIRE_single_dev",
            "dev_test": audio_dir / "ASpIRE_single_dev_test",
        }
        stm_file = {
            "dev": stm_dir / "dev.stm",
            "dev_test": stm_dir / "dev_test.stm",
        }
    else:
        audio_paths = {
            "dev": audio_dir / "ASpIRE_multi_dev",
            "dev_test": audio_dir / "ASpIRE_multi_dev_test",
        }
        stm_file = {
            "dev": stm_dir / "multi_dev.stm",
            "dev_test": stm_dir / "multi_dev_test.stm",
        }
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["dev", "dev_test"]:
        # Prepare the recordings
        if mic == "single":
            recording_set = RecordingSet.from_dir(audio_paths[part], "*.wav")
        else:
            import soundfile as sf

            recordings = []
            audio_groups = {
                k: list(v)
                for k, v in itertools.groupby(
                    sorted(audio_paths[part].glob("*.wav")),
                    key=lambda x: "_".join(x.stem.split("_")[:-1]),
                )
            }  # group audios so that each entry is a session containing all channels
            for session_name, audios in audio_groups.items():
                # Only the first file of the session is probed for the sampling rate
                # and length; the handle is closed right away instead of being
                # leaked once per session.
                with sf.SoundFile(str(audios[0])) as audio_sf:
                    sampling_rate = audio_sf.samplerate
                    num_samples = audio_sf.frames
                recordings.append(
                    Recording(
                        id=session_name,
                        sources=[
                            AudioSource(
                                type="file",
                                # NOTE(review): assumes the stem ends with a two-digit,
                                # 1-based channel number - confirm against the corpus.
                                channels=[int(audio.stem[-2:]) - 1],
                                source=str(audio),
                            ) for audio in sorted(audios)
                        ],
                        sampling_rate=sampling_rate,
                        num_samples=num_samples,
                        duration=num_samples / sampling_rate,
                    ))
            recording_set = RecordingSet.from_recordings(recordings)

        # Read STM file and prepare segments
        segments = []
        with open(stm_file[part]) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line carries no segment and would
                    # otherwise break the 6-field unpacking below.
                    continue
                session, _, speaker, start, end, text = line.split(maxsplit=5)
                segments.append(
                    AspireSegmentAnnotation(session, speaker, float(start),
                                            float(end), text))

        # Group the segments by session and speaker
        segments_grouped = defaultdict(list)
        for segment in segments:
            segments_grouped[(segment.session,
                              segment.speaker)].append(segment)

        # Create the supervisions; the running index restarts for every
        # (session, speaker) group.
        supervisions = []
        for k, segs in segments_grouped.items():
            session, speaker = k
            supervisions += [
                SupervisionSegment(
                    id=f"{session}-{speaker}-{i:03d}",
                    recording_id=session,
                    start=seg.start,
                    duration=round(seg.end - seg.start, 4),
                    speaker=speaker,
                    text=seg.text,
                    language="English",
                ) for i, seg in enumerate(segs)
            ]
        supervision_set = SupervisionSet.from_segments(supervisions)

        recording_set, supervision_set = fix_manifests(recording_set,
                                                       supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir /
                                    f"aspire_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir /
                                  f"aspire_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
Exemplo n.º 11
0
def prepare_peoples_speech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare :class:`~lhotse.RecordingSet` and :class:`~lhotse.SupervisionSet` manifests
    for The People's Speech.

    The corpus metadata is consumed entry by entry and the manifests are written
    incrementally, which keeps the CPU RAM usage low. To turn the result into
    a :class:`~lhotse.CutSet` without using excessive memory, call it like::

        >>> peoples_speech = prepare_peoples_speech(corpus_dir=..., output_dir=...)
        >>> cuts = CutSet.from_manifests(
        ...     recordings=peoples_speech["recordings"],
        ...     supervisions=peoples_speech["supervisions"],
        ...     output_path=...,
        ...     lazy=True,
        ... )

    If both manifest files already exist in ``output_dir``, they are opened as-is
    and nothing is recomputed.

    :param corpus_dir: Pathlike, the path of the main data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "recordings" and "supervisions" with lazily opened manifests.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    recs_path = output_dir / "peoples-speech_recordings.jsonl.gz"
    sups_path = output_dir / "peoples-speech_supervisions.jsonl.gz"

    already_prepared = recs_path.is_file() and sups_path.is_file()
    if already_prepared:
        # Both manifests are on disk: hand them back in lazy mode.
        return {
            "recordings": RecordingSet.from_jsonl_lazy(recs_path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sups_path),
        }

    # Bookkeeping for the summary warning issued at the end.
    tot = 0  # entries listed in the original manifest
    exist = 0  # entries whose audio file is present on disk
    err = 0  # present entries that failed during processing

    with RecordingSet.open_writer(recs_path) as rec_writer, \
            SupervisionSet.open_writer(sups_path) as sup_writer:
        # Note: People's Speech manifest.json is really a JSONL.
        progress = tqdm(
            load_jsonl(corpus_dir / "manifest.json"),
            desc="Converting People's Speech manifest.json to Lhotse manifests",
        )
        for item in progress:
            utterances = zip(*item["training_data"].values())
            for duration_ms, text, audio_path in utterances:
                full_path = corpus_dir / audio_path

                tot += 1
                if not full_path.exists():
                    # Missing audio is tolerated here; the shortfall is reported
                    # in the warning after the loop.
                    continue
                exist += 1

                try:
                    audio_info = info(full_path)
                    duration = duration_ms / 1000
                    source = AudioSource(
                        type="file",
                        channels=[0],
                        source=str(full_path),
                    )
                    r = Recording(
                        id=full_path.stem,
                        sampling_rate=audio_info.samplerate,
                        num_samples=compute_num_samples(
                            duration, audio_info.samplerate),
                        duration=duration,
                        sources=[source],
                    )
                    s = SupervisionSegment(
                        id=r.id,
                        recording_id=r.id,
                        start=0,
                        duration=r.duration,
                        channel=0,
                        text=text,
                        language="English",
                        custom={"session_id": item["identifier"]},
                    )

                    validate_recordings_and_supervisions(recordings=r,
                                                         supervisions=s)

                    rec_writer.write(r)
                    sup_writer.write(s)
                except AssertionError:
                    # Violated assertions are the only failures that stop processing.
                    raise
                except Exception:
                    # Any other failure (e.g. somebody is working on a subset of
                    # 30.000 hours) is counted and the entry is skipped.
                    err += 1

    if exist < tot or err > 0:
        warnings.warn(
            f"We finished preparing The People's Speech Lhotse manifests. "
            f"Out of {tot} entries in the original manifest, we found {exist} "
            f"audio files existed, out of which {err} had errors during processing."
        )

    return {
        "recordings": rec_writer.open_manifest(),
        "supervisions": sup_writer.open_manifest(),
    }
Exemplo n.º 12
0
def prepare_ljspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    The utterances are listed in ``<corpus_dir>/metadata.csv`` (three ``|``-separated
    fields per line; the first is the utterance id, the second the text, the third
    is ignored). Utterances whose ``wavs/<id>.wav`` file is missing are skipped
    with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
        When None, nothing is written to disk.
    :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / 'metadata.csv'
    assert metadata_csv_path.is_file(), f'No such file: {metadata_csv_path}'
    metadata = {}
    # Decode explicitly as UTF-8 so that parsing the transcripts does not depend
    # on the platform's default text encoding.
    with open(metadata_csv_path, encoding='utf-8') as f:
        for line in f:
            idx, text, _ = line.split('|')
            audio_path = corpus_dir / 'wavs' / f'{idx}.wav'
            if audio_path.is_file():
                # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                info = torchaudio.info(str(audio_path))
                metadata[idx] = LJSpeechMetaData(audio_path=audio_path,
                                                 audio_info=info[0],
                                                 text=text)
            else:
                logging.warning(f'No such file: {audio_path}')

    # Audio
    audio = RecordingSet.from_recordings(
        Recording(id=idx,
                  sources=[
                      AudioSource(type='file',
                                  channels=[0],
                                  source=str(metadata[idx].audio_path))
                  ],
                  sampling_rate=int(metadata[idx].audio_info.rate),
                  num_samples=metadata[idx].audio_info.length,
                  duration=(metadata[idx].audio_info.length /
                            metadata[idx].audio_info.rate))
        for idx in metadata)

    # Supervision: one segment per recording, spanning its whole duration.
    supervision = SupervisionSet.from_segments(
        SupervisionSegment(id=idx,
                           recording_id=idx,
                           start=0.0,
                           duration=audio.recordings[idx].duration,
                           channel=0,
                           language='English',
                           gender='female',
                           text=metadata[idx].text)
        for idx in audio.recordings)

    if output_dir is not None:
        supervision.to_json(output_dir / 'supervisions.json')
        audio.to_json(output_dir / 'audio.json')

    return {'audio': audio, 'supervisions': supervision}
Exemplo n.º 13
0
def prepare_earnings22(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    normalize_text: bool = False,
) -> Union[RecordingSet, SupervisionSet]:
    """
    Build the Earnings22 recording and supervision manifests.

    One recording is created per ``*.mp3`` file and one supervision per ``*.nlp``
    reference file; the supervision covers the whole recording with the same id
    (the file stem). Exactly 125 files of each kind are expected.

    :param corpus_dir: Pathlike, the path of the data dir. The structure is
        expected to mimic the structure in the github repository, notably
        the mp3 files will be searched for in [corpus_dir]/media and transcriptions
        in the directory [corpus_dir]/transcripts/nlp_references
    :param output_dir: Pathlike, the path where to write the manifests.
        When None, nothing is written to disk.
    :param normalize_text: Bool, if True, normalize the text.
    :return: (recordings, supervisions) pair

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case
        to lower case. This includes removing possibly important punctuations such as
        dashes and apostrophes.
    """

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Recordings: every mp3 in the media directory, in sorted path order.
    mp3_paths = sorted((corpus_dir / "media").glob("*.mp3"))
    assert len(mp3_paths) == 125
    recording_set = RecordingSet.from_recordings(
        Recording.from_file(mp3_path) for mp3_path in mp3_paths)

    # Supervisions: one per NLP reference file, in sorted path order.
    reference_paths = sorted(
        (corpus_dir / "transcripts" / "nlp_references").glob("*.nlp"))
    assert len(reference_paths) == 125

    metadata = read_metadata(corpus_dir / "metadata.csv")

    supervision_segments = []
    for reference_path in reference_paths:
        call_id = reference_path.stem
        transcript = " ".join(parse_nlp_file(reference_path))
        if normalize_text:
            transcript = normalize(transcript)

        supervision_segments.append(
            SupervisionSegment(
                id=call_id,
                recording_id=call_id,
                start=0.0,
                duration=recording_set[call_id].duration,
                channel=0,
                # The fifth metadata column qualifies the language label.
                language=f"English-{metadata[call_id][4]}",
                text=transcript,
            ))
    supervision_set = SupervisionSet.from_segments(supervision_segments)

    validate_recordings_and_supervisions(recording_set, supervision_set)
    if output_dir is not None:
        supervision_set.to_file(
            output_dir / "earnings22_supervisions_all.jsonl.gz")
        recording_set.to_file(
            output_dir / "earnings22_recordings_all.jsonl.gz")

    return recording_set, supervision_set
Exemplo n.º 14
0
def create_recording(
    audio_path_and_rel_path_depth: Tuple[Pathlike, Union[int,
                                                         None]]) -> Recording:
    """
    Create a Recording from a single ``(audio_path, relative_path_depth)`` pair.

    Both values arrive packed in one argument; they are unpacked and forwarded
    to ``Recording.from_file``.
    """
    path, depth = audio_path_and_rel_path_depth
    return Recording.from_file(path, relative_path_depth=depth)
Exemplo n.º 15
0
def recording(file_source):
    """Build a 0.5 s, 8 kHz test Recording that lists the same ``file_source`` object twice."""
    duplicated_sources = [file_source, file_source]
    return Recording(
        id='rec',
        sources=duplicated_sources,
        sampling_rate=8000,
        num_samples=4000,
        duration=0.5,
    )
Exemplo n.º 16
0
def prepare_aishell(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build the AISHELL recording and supervision manifests for 'train', 'dev' and 'test'.

    The transcript file maps an utterance id (first whitespace-separated field)
    to its text (remaining fields joined by single spaces). Wav files without
    a transcript are skipped with a warning; the speaker label is the name of
    the directory that holds the wav file.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
        When None, nothing is written to disk.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # utterance id -> transcript text
    transcript_path = corpus_dir / "data_aishell/transcript/aishell_transcript_v0.8.txt"
    transcript_dict = {}
    with open(transcript_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            transcript_dict[fields[0]] = " ".join(fields[1:])

    manifests = defaultdict(dict)
    for part in ["train", "dev", "test"]:
        part_recordings = []
        part_supervisions = []
        wav_root = corpus_dir / "data_aishell" / "wav" / f"{part}"
        for audio_path in wav_root.rglob("**/*.wav"):
            idx = audio_path.stem
            if idx not in transcript_dict:
                logging.warning(f"No transcript: {idx}")
                continue
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue

            rec = Recording.from_file(audio_path)
            part_recordings.append(rec)
            # A single supervision spans the entire recording.
            part_supervisions.append(
                SupervisionSegment(
                    id=idx,
                    recording_id=idx,
                    start=0.0,
                    duration=rec.duration,
                    channel=0,
                    language="Chinese",
                    speaker=audio_path.parts[-2],
                    text=transcript_dict[idx].strip(),
                ))

        recording_set = RecordingSet.from_recordings(part_recordings)
        supervision_set = SupervisionSet.from_segments(part_supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
Exemplo n.º 17
0
def dummy_recording():
    """Return a placeholder 10 s, 16 kHz Recording whose id and source path are both 'irrelevant'."""
    placeholder_source = AudioSource(
        type='file',
        channels=[0],
        source='irrelevant',
    )
    return Recording(
        id='irrelevant',
        sources=[placeholder_source],
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
    )
Exemplo n.º 18
0
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. A feats.scp file is imported as a FeatureSet only when
    the ``kaldiio`` module is available and ``frame_shift`` is given; otherwise
    no FeatureSet is returned.

    :param path: path of the Kaldi data directory.
    :param sampling_rate: sampling rate assigned to every recording (and to the
        imported features); it is not read from the audio.
    :param frame_shift: frame shift of the features stored in feats.scp; when None,
        the feature import is skipped with a warning.
    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisonSegment IDs to underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    :param num_jobs: number of worker processes used to compute the recording durations.
    :return: a ``(recording_set, supervision_set, feature_set)`` triple; the last
        two items are None when they could not be created.
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: Optional[str]) -> Optional[str]:
        # A missing value (e.g. no speaker because utt2spk is absent) is passed
        # through untouched instead of failing on ``None.replace``.
        if t is None or map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    # ``map`` yields results in submission order, so they line up with the keys.
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    # A wav.scp entry ending with "|" is a command whose output is
                    # the audio; the trailing pipe is dropped from the stored source.
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=path_or_cmd[:-1]
                    if path_or_cmd.endswith("|") else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id],
                                            sampling_rate),
            duration=durations[recording_id],
        ) for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        # Each line: <segment-id> <recording-id> <start> <end>
        with segments.open() as f:
            supervision_segments = [
                sup_string.strip().split() for sup_string in f
            ]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            ) for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldiio"):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldiio",
                    num_frames=mat.shape[0],
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat.shape[0] * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    # With supervisions, the utterance id is resolved to its
                    # recording; without them it is used as the recording id.
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None else utt_id,
                    channels=0,
                )
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp)))
        else:
            warnings.warn("Failed to import Kaldi 'feats.scp' to Lhotse: "
                          "frame_shift must be not None. "
                          "Feature import omitted.")

    return recording_set, supervision_set, feature_set
Exemplo n.º 19
0
def recording():
    """Create a Recording from the libri fixture wav file (path relative to the working directory)."""
    fixture_path = "test/fixtures/libri/libri-1088-134315-0000.wav"
    return Recording.from_file(fixture_path)
Exemplo n.º 20
0
def prepare_ami(
        data_dir: Pathlike,
        output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    The function reads ``annotations.gzip`` and the ``wav_db`` directory found
    under ``data_dir`` and builds one multi-source Recording per session, with
    one SupervisionSegment per non-empty annotated sub-segment.

    :param data_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests (nothing is written when None).
    :return: a Dict whose key is the dataset part (e.g. 'train', 'dev', 'eval'), and the value is a Dict
        with the keys 'recordings' and 'supervisions'.
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f'No such directory: {data_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    annotation_lists = parse_ami_annotations(data_dir / 'annotations.gzip')
    # Create a mapping from a tuple of (session_id, channel) to the list of annotations.
    # This way we can map the supervisions to the right channels in a multi-channel recording.
    annotation_by_id_and_channel = {
        (filename.split('.')[0], int(filename[-5])): annotations
        for filename, annotations in annotation_lists.items()
    }
    wav_dir = data_dir / 'wav_db'
    audio_paths = wav_dir.rglob('*.wav')
    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby
    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    manifests = defaultdict(dict)

    for part in dataset_parts:
        # Audio
        recordings = []
        for session_name, channel_paths in channel_wavs.items():
            if session_name not in dataset_parts[part]:
                continue
            # The signal info of the first channel file is used for the whole session.
            audio_info = torchaudio.info(str(channel_paths[0]))[0]
            recordings.append(Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[idx],
                        source=str(audio_path)
                    )
                    for idx, audio_path in enumerate(sorted(channel_paths))
                ],
                sampling_rate=int(audio_info.rate),
                num_samples=audio_info.length,
                duration=audio_info.length / audio_info.rate,
            ))
        audio = RecordingSet.from_recordings(recordings)

        # Supervisions
        segments_by_pause = []
        for recording in audio:
            for source in recording.sources:
                # In AMI "source.channels" will always be a one-element list
                channel, = source.channels
                annotation = annotation_by_id_and_channel.get((recording.id, channel))
                if annotation is None:
                    logging.warning(f'No annotation found for recording "{recording.id}" channel {channel} '
                                    f'(file {source.source})')
                    continue
                for seg_idx, seg_info in enumerate(annotation):
                    for subseg_idx, subseg_info in enumerate(seg_info):
                        duration = subseg_info.end_time - subseg_info.begin_time
                        if duration > 0:
                            segments_by_pause.append(SupervisionSegment(
                                # The segment counters restart for every channel, so the channel
                                # has to be part of the id to keep ids unique within a recording.
                                id=f'{recording.id}-{channel}-{seg_idx}-{subseg_idx}',
                                recording_id=recording.id,
                                start=subseg_info.begin_time,
                                duration=duration,
                                channel=channel,
                                language='English',
                                speaker=subseg_info.speaker,
                                gender=subseg_info.gender,
                                text=subseg_info.text
                            ))
        supervision = SupervisionSet.from_segments(segments_by_pause)
        if output_dir is not None:
            audio.to_json(output_dir / f'recordings_{part}.json')
            supervision.to_json(output_dir / f'supervisions_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
Exemplo n.º 21
0
def prepare_libricss(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    type: str = "mdm",
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    NOTE: The recordings contain all 7 channels. If you want to use only one channel, you can
    use either ``recording.load_audio(channel=0)`` or ``MonoCut(id=...,recording=recording,channel=0)``
    while creating the CutSet.

    :param corpus_dir: Pathlike, the path to the extracted corpus. A ``for_release`` sub-directory
        is appended unless the path already ends with it.
    :param output_dir: Pathlike, the path where to write the manifests (nothing is written when None).
    :param type: str, the type of data to prepare; must be one of 'mdm', 'ihm-mix' or 'ihm'
        (any other value fails the assertion). These settings are similar to the ones in
        AMI and ICSI recipes.
    :return: a Dict with the keys 'recordings' and 'supervisions', covering all sessions of all
        overlap ratios.
    """
    assert type in ["mdm", "ihm-mix", "ihm"]

    corpus_dir = Path(corpus_dir)
    if corpus_dir.stem != "for_release":
        corpus_dir = corpus_dir / "for_release"

    recordings = []
    segments = []

    for ov in OVERLAP_RATIOS:
        for session in (corpus_dir / ov).iterdir():
            # The session directory name has seven "_"-separated fields; only the
            # sixth one (the session name) is needed to build the recording id.
            _, _, _, _, _, name, _ = session.name.split("_")
            recording_id = f"{ov}_{name}"

            if type == "ihm-mix":
                audio_path = session / "clean" / "mix.wav"
            elif type == "ihm":
                audio_path = session / "clean" / "each_spk.wav"
            else:
                audio_path = session / "record" / "raw_recording.wav"

            recordings.append(
                Recording.from_file(audio_path, recording_id=recording_id)
            )
            for idx, seg in enumerate(
                parse_transcript(session / "transcription" / "meeting_info.txt")
            ):
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=seg[0],
                        duration=seg[1] - seg[0],
                        text=seg[4],
                        language="English",
                        speaker=seg[2],
                        # Only the 'ihm' audio is addressed per speaker channel.
                        channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
                        if type == "ihm"
                        else 0,
                    )
                )

    supervisions = SupervisionSet.from_segments(segments)
    recordings = RecordingSet.from_recordings(recordings)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        recordings.to_file(output_dir / "libricss_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "libricss_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Exemplo n.º 22
0
def prepare_cslu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    normalize_text: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CSLU Kids corpus. The supervision contains either the
    prompted text, or a transcription of the spontaneous speech, depending on
    whether the utterance was scripted or spontaneous.

    Additionally, the following information is present in the `custom` tag:
    scripted/spontaneous utterance, and verification label (rating between 1 and 4)
    for scripted utterances (see https://catalog.ldc.upenn.edu/docs/LDC2007S18/verification-note.txt
    or top documentation in this script for more information).

    Audio files that are located under neither a ``scripted`` nor a ``spontaneous``
    directory are skipped with a warning.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param normalize_text: remove noise tags (<bn>, <bs>) from spontaneous speech transcripts (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir) if isinstance(corpus_dir,
                                                str) else corpus_dir

    # Get list of all recordings
    audio_paths = check_and_rglob(corpus_dir, '*.wav')

    # Read verification labels
    verification = {}
    for file in check_and_rglob(corpus_dir, "*-verified.txt"):
        with open(file, 'r') as f:
            for line in f:
                path, label = line.strip().split()
                utt = Path(path).stem
                verification[utt] = int(label)

    # Read prompted transcriptions
    prompts = {}
    with open(corpus_dir / 'docs' / 'all.map', 'r') as f:
        for line in f:
            if line.strip() != "":
                prompt, text = line.strip().split(maxsplit=1)
                prompts[prompt] = text[1:-1]  # remove " " around the text

    recordings = []
    supervisions = []
    for p in tqdm(audio_paths, desc="Preparing manifests"):

        # /data/corpora/LDC2007S18/speech/scripted/00/0/ks001/ks001000.wav
        uttid = p.stem  # ks001000
        spk = p.parent.stem  # ks001
        cat = p.parent.parent.stem  # 0
        prompt = p.parent.parent.parent.stem  # 00
        utt_type = p.parent.parent.parent.parent.stem  # scripted

        if utt_type == "scripted":
            text = prompts[prompt]
            # The verification label is None when the utterance was not rated.
            custom = {
                'type': utt_type,
                'verification_label': verification.get(uttid),
            }
        elif utt_type == "spontaneous":
            text = read_text(
                corpus_dir / 'trans' / utt_type / prompt / cat / spk /
                f'{uttid}.txt',
                normalize=normalize_text,
            )
            custom = {'type': utt_type}
        else:
            # Without this branch the text/custom of the previous utterance
            # would be silently attached to this file.
            logging.warning(
                f'Skipping {p}: unexpected utterance type "{utt_type}"')
            continue

        recording = Recording.from_file(
            p, relative_path_depth=None if absolute_paths else 3)
        recordings.append(recording)

        supervisions.append(
            SupervisionSegment(
                id=uttid,
                recording_id=uttid,
                start=0,
                duration=recording.duration,
                speaker=spk,
                language='English',
                text=text,
                custom=custom,
            ))

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        'recordings': recordings,
        'supervisions': supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / 'recordings.json')
        manifests["supervisions"].to_json(output_dir / 'supervisions.json')

    return manifests
Exemplo n.º 23
0
def prepare_timit(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_phones: int = 48,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    The DEV and TEST parts are both selected from the corpus ``TEST`` directory by
    speaker; the SA (dialect) sentences are excluded from every part.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write and save the manifests.
    :param num_phones: int=48, the number of phones (60, 48 or 39) for modeling and 48 is regarded as the default value.
    :param num_jobs: int, kept for interface compatibility; the files are processed sequentially
        and this value is not used.
    :return: a Dict whose key is the dataset part ('TRAIN', 'DEV', 'TEST'), and the value is a Dict with
        the keys 'recordings' and 'supervisions'. A part without any usable utterance is omitted.
    :raises ValueError: if ``num_phones`` is not one of 60, 48 or 39.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    if num_phones not in [60, 48, 39]:
        raise ValueError("The value of num_phones must be in [60, 48, 39].")
    phones_dict = get_phonemes(num_phones)

    dev_spks, test_spks = get_speakers()
    # (part name, corpus sub-directory, allowed speakers or None for "all").
    part_specs = [
        ("TRAIN", "TRAIN", None),
        ("DEV", "TEST", dev_spks),
        ("TEST", "TEST", test_spks),
    ]

    manifests = defaultdict(dict)

    for part, subdir, allowed_speakers in part_specs:
        wav_files = []
        for wav_file in glob.glob(f"{corpus_dir}/{subdir}/*/*/*.WAV"):
            wav_path = Path(wav_file)
            # filter the SA (dialect sentences)
            if wav_path.name[:2] == "SA":
                continue
            # The speaker is the name of the directory holding the file.
            if (allowed_speakers is not None
                    and wav_path.parent.name.lower() not in allowed_speakers):
                continue
            wav_files.append(wav_file)

        logging.debug(f"{part} dataset manifest generation.")
        recordings = []
        supervisions = []

        for wav_file in tqdm(wav_files):
            wav_path = Path(wav_file)
            speaker = wav_path.parent.name
            idx = f"{speaker}-{wav_path.stem}"
            transcript_file = wav_path.with_suffix(".PHN")
            if not wav_path.is_file():
                logging.warning(f"No such file: {wav_file}")
                continue
            if not transcript_file.is_file():
                logging.warning(f"No transcript: {transcript_file}")
                continue

            # The phone label is the last space-separated field of every line.
            with open(transcript_file, "r") as f:
                phones = [line.rstrip("\n").split(" ")[-1] for line in f]
            if num_phones != 60:
                # Map the original labels to the reduced phone set.
                phones = [phones_dict[str(phone)] for phone in phones]
            text = " ".join(phones).replace("h#", "sil")

            recording = Recording.from_file(path=wav_file, recording_id=idx)
            recordings.append(recording)
            supervisions.append(
                SupervisionSegment(
                    id=idx,
                    recording_id=idx,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language="English",
                    speaker=speaker,
                    text=text.strip(),
                )
            )

        if not recordings:
            logging.warning(f"No usable utterances found for {part} - skipping.")
            continue

        # Build, validate and store the manifests once per part instead of
        # repeating all of it after every single file.
        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
Exemplo n.º 24
0
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Optional[Tuple[str]] = dataset_parts_mini,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build recording and supervision manifests for the requested LibriSpeech parts.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: dataset part name, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is a Dict holding the manifests
        under 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    for part in dataset_parts:
        part_path = corpus_dir / part

        # Collect utt_id -> (audio_path, audio_info, text) from the transcript files.
        metadata = {}
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    utt_id, transcript = line.split(maxsplit=1)
                    # The dash-separated id minus its last field gives the sub-directory.
                    utt_dir = Path(utt_id.replace('-', '/')).parent
                    audio_path = part_path / utt_dir / f'{utt_id}.flac'
                    if not audio_path.is_file():
                        logging.warning(f'No such file: {audio_path}')
                        continue
                    # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                    # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                    info = torchaudio.info(str(audio_path))
                    metadata[utt_id] = LibriSpeechMetaData(
                        audio_path=audio_path,
                        audio_info=info[0],
                        text=transcript)

        # Audio
        recordings = []
        for utt_id in metadata:
            item = metadata[utt_id]
            source = AudioSource(type='file',
                                 channels=[0],
                                 source=str(item.audio_path))
            recordings.append(
                Recording(id=utt_id,
                          sources=[source],
                          sampling_rate=int(item.audio_info.rate),
                          num_samples=item.audio_info.length,
                          duration=(item.audio_info.length /
                                    item.audio_info.rate)))
        audio = RecordingSet.from_recordings(recordings)

        # Supervision
        segments = []
        for utt_id in audio.recordings:
            segments.append(
                SupervisionSegment(id=utt_id,
                                   recording_id=utt_id,
                                   start=0.0,
                                   duration=audio.recordings[utt_id].duration,
                                   channel=0,
                                   language='English',
                                   speaker=re.sub(r'-.*', r'', utt_id),
                                   text=metadata[utt_id].text.strip()))
        supervision = SupervisionSet.from_segments(segments)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {'recordings': audio, 'supervisions': supervision}

    return manifests
Exemplo n.º 25
0
def dummy_recording(unique_id: int) -> Recording:
    """Create a source-less placeholder Recording whose id embeds ``unique_id`` padded to 4 digits."""
    recording_id = f'dummy-recording-{unique_id:04d}'
    return Recording(
        id=recording_id,
        sources=[],
        sampling_rate=16000,
        num_samples=16000,
        duration_seconds=1.0,
    )
Exemplo n.º 26
0
def load_kaldi_data_dir(
        path: Pathlike,
        sampling_rate: int) -> Tuple[RecordingSet, Optional[SupervisionSet]]:
    """
    Convert a Kaldi data directory into Lhotse RecordingSet and SupervisionSet manifests.

    The ``wav.scp`` and ``reco2dur`` files are required (a missing ``reco2dur`` raises ValueError).
    A SupervisionSet is returned only when a ``segments`` file exists, otherwise None takes its place.
    The ``text``, ``utt2spk``, ``spk2gender`` and ``utt2lang`` files are read for the supervisions;
    ``feats.scp`` is not read at all.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    wav_scp = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    # Recordings that are absent from reco2dur fall back to a duration of 0.0.
    durations = defaultdict(float)
    with reco2dur.open() as f:
        for line in f:
            reco_id, dur = line.strip().split()
            durations[reco_id] = float(dur)

    recording_list = []
    for reco_id, path_or_cmd in wav_scp.items():
        # A trailing pipe marks a command whose output is the audio.
        is_command = path_or_cmd.endswith('|')
        source = AudioSource(
            type='command' if is_command else 'file',
            channels=[0],
            source=path_or_cmd[:-1] if is_command else path_or_cmd)
        reco_duration = durations[reco_id]
        recording_list.append(
            Recording(id=reco_id,
                      sources=[source],
                      sampling_rate=sampling_rate,
                      num_samples=int(reco_duration * sampling_rate),
                      duration=reco_duration))
    audio_set = RecordingSet.from_recordings(recording_list)

    # must exist for SupervisionSet
    segments = path / 'segments'
    if not segments.is_file():
        return audio_set, None

    with segments.open() as f:
        segment_rows = [row.strip().split() for row in f]

    texts = load_kaldi_text_mapping(path / 'text')
    speakers = load_kaldi_text_mapping(path / 'utt2spk')
    genders = load_kaldi_text_mapping(path / 'spk2gender')
    languages = load_kaldi_text_mapping(path / 'utt2lang')

    segment_list = []
    # Every row holds: segment id, recording id, start time, end time.
    for segment_id, recording_id, start, end in segment_rows:
        segment_list.append(
            SupervisionSegment(id=segment_id,
                               recording_id=recording_id,
                               start=float(start),
                               duration=float(end) - float(start),
                               channel=0,
                               text=texts[segment_id],
                               language=languages[segment_id],
                               speaker=speakers[segment_id],
                               gender=genders[speakers[segment_id]]))
    supervision_set = SupervisionSet.from_segments(segment_list)

    return audio_set, supervision_set
Exemplo n.º 27
0
def prepare_mobvoihotwords(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build recording and supervision manifests for the 'train', 'dev' and 'test' parts.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is a Dict holding the manifests
        under 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    dataset_parts = ["train", "dev", "test"]
    resources_dir = corpus_dir / "mobvoi_hotword_dataset_resources"
    audio_dir = corpus_dir / "mobvoi_hotword_dataset"

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir)

    for part in dataset_parts:
        logging.info(f"Preparing MobvoiHotwords subset: {part}")
        if manifests_exist(part=part, output_dir=output_dir):
            logging.info(
                f"MobvoiHotwords subset: {part} already prepared - skipping.")
            continue

        recordings = []
        supervisions = []
        # Every part is described by two JSON files, prefixed "p_" and "n_".
        for prefix in ["p_", "n_"]:
            json_path = resources_dir / f"{prefix + part}.json"
            with open(json_path, "r", encoding="utf-8") as f:
                entries = json.load(f)

            for entry in entries:
                utt_id = entry["utt_id"]
                speaker_id = entry["speaker_id"]
                # Entries without a speaker id use the utterance id as the speaker.
                speaker = utt_id if speaker_id is None else speaker_id
                audio_path = audio_dir / f"{utt_id}.wav"

                keyword_id = entry["keyword_id"]
                if keyword_id == 0:
                    text = "HiXiaowen"
                elif keyword_id == 1:
                    text = "NihaoWenwen"
                else:
                    assert keyword_id == -1
                    text = "FREETEXT"

                if not audio_path.is_file():
                    logging.warning(f"No such file: {audio_path}")
                    continue

                recording = Recording.from_file(audio_path)
                recordings.append(recording)
                supervisions.append(
                    SupervisionSegment(
                        id=utt_id,
                        recording_id=utt_id,
                        start=0.0,
                        duration=recording.duration,
                        channel=0,
                        language="Chinese",
                        speaker=speaker,
                        text=text.strip(),
                    ))

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests