Example #1
def prepare_aishell4(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AISHELL-4 data, please 'pip install textgrid' first.")
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    global_spk_id = {}
    for part in ["train_L", "train_M", "train_S", "test"]:
        recordings = []
        supervisions = []
        wav_path = corpus_dir / part / "wav"
        for audio_path in wav_path.rglob("*.flac"):
            idx = audio_path.stem

            try:
                tg = textgrid.TextGrid.fromFile(
                    f"{corpus_dir}/{part}/TextGrid/{idx}.TextGrid")
            except ValueError:
                logging.warning(
                    f"{idx} has annotation issues. Skipping this recording.")
                continue

            recording = Recording.from_file(audio_path)
            recordings.append(recording)

            for tier in tg.tiers:
                local_spk_id = tier.name
                key = (idx, local_spk_id)
                if key not in global_spk_id:
                    global_spk_id[key] = f"SPK{len(global_spk_id)+1:04d}"
                spk_id = global_spk_id[key]
                for j, interval in enumerate(tier.intervals):
                    if interval.mark != "":
                        start = interval.minTime
                        end = interval.maxTime
                        text = interval.mark
                        segment = SupervisionSegment(
                            id=f"{idx}-{spk_id}-{j}",
                            recording_id=idx,
                            start=start,
                            duration=round(end - start, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            text=text.strip(),
                        )
                        supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir /
                                    f"aishell4_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir /
                                  f"aishell4_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
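
A minimal standalone sketch of the TextGrid traversal this recipe relies on, assuming 'pip install textgrid' has been run; the file path is hypothetical:

import textgrid

# One tier per speaker; each tier holds time-aligned annotation intervals.
tg = textgrid.TextGrid.fromFile("/data/aishell4/train_L/TextGrid/SessionA.TextGrid")  # hypothetical file
for tier in tg.tiers:
    for interval in tier.intervals:
        if interval.mark != "":  # skip empty (silence) intervals
            print(tier.name, interval.minTime, interval.maxTime, interval.mark)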
Example #2
def prepare_icsi(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "ihm",
    normalize_text: str = "kaldi",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param audio_dir: Pathlike, the path which holds the audio data
    :param transcripts_dir: Pathlike, the path which holds the transcripts data
    :param output_dir: Pathlike, the path where to write the manifests - `None` means manifests aren't stored on disk.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
    :return: a Dict whose keys are 'train', 'dev' and 'test', and the values are dicts of manifests under keys
        'recordings' and 'supervisions'.
    """
    audio_dir = Path(audio_dir)
    transcripts_dir = Path(transcripts_dir)

    assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
    assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}"
    assert mic in MIC_TO_CHANNELS.keys(), f"Mic {mic} not supported"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Parsing ICSI transcripts")
    annotations, channel_to_idx_map = parse_icsi_annotations(
        transcripts_dir, normalize=normalize_text)

    # Audio
    logging.info("Preparing recording manifests")

    channels = "".join(MIC_TO_CHANNELS[mic])
    if mic == "ihm" or mic == "mdm":
        audio_paths = audio_dir.rglob(f"chan[{channels}].sph")
        audio = prepare_audio_grouped(
            list(audio_paths), channel_to_idx_map if mic == "ihm" else None)
    elif mic == "sdm" or mic == "ihm-mix":
        audio_paths = (audio_dir.rglob(f"chan[{channels}].sph")
                       if len(channels) else audio_dir.rglob("*.wav"))
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info("Preparing supervision manifests")
    supervision = (prepare_supervision_ihm(
        audio, annotations, channel_to_idx_map) if mic == "ihm" else
                   prepare_supervision_other(audio, annotations))

    manifests = defaultdict(dict)

    for part in ["train", "dev", "test"]:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in PARTITIONS[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in PARTITIONS[part])

        # Fix and validate the manifests before writing or returning them
        audio_part, supervision_part = fix_manifests(audio_part,
                                                     supervision_part)
        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Write to output directory if a path is provided
        if output_dir is not None:
            audio_part.to_file(output_dir /
                               f"icsi-{mic}_recordings_{part}.jsonl.gz")
            supervision_part.to_file(
                output_dir / f"icsi-{mic}_supervisions_{part}.jsonl.gz")

        # Combine all manifests into one dictionary
        manifests[part] = {
            "recordings": audio_part,
            "supervisions": supervision_part
        }

    return dict(manifests)
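
A hedged usage sketch; the import path and the directory paths are assumptions, the recipe itself is shown above:

from lhotse.recipes import prepare_icsi

manifests = prepare_icsi(
    audio_dir="/data/icsi/audio",
    transcripts_dir="/data/icsi/transcripts",
    output_dir="data/manifests",
    mic="ihm",
    normalize_text="kaldi",
)
train_recordings = manifests["train"]["recordings"]
train_supervisions = manifests["train"]["supervisions"]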
Example #3
def prepare_ami(
    data_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param data_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param data_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose keys are 'train', 'dev' and 'eval', and the value is Dicts with keys 'recordings' and 'supervisions'.
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f'No such directory: {data_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    annotation_lists = parse_ami_annotations(data_dir / 'annotations.gzip')
    # Create a mapping from a tuple of (session_id, channel) to the list of annotations.
    # This way we can map the supervisions to the right channels in a multi-channel recording.
    annotation_by_id_and_channel = {
        (filename.split('.')[0], int(filename[-5])): annotations
        for filename, annotations in annotation_lists.items()
    }
    wav_dir = data_dir / 'wav_db'
    audio_paths = wav_dir.rglob('*.wav')
    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby
    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    manifests = defaultdict(dict)

    for part in dataset_parts:
        # Audio
        recordings = []
        for session_name, channel_paths in channel_wavs.items():
            if session_name not in dataset_parts[part]:
                continue
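            # Note: this uses the legacy torchaudio API, where info() returns a
            # (signal_info, encoding_info) tuple - hence the [0] indexing below.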
            audio_info = torchaudio.info(str(channel_paths[0]))[0]
            recordings.append(
                Recording(
                    id=session_name,
                    sources=[
                        AudioSource(type='file',
                                    channels=[idx],
                                    source=str(audio_path))
                        for idx, audio_path in enumerate(sorted(channel_paths))
                    ],
                    sampling_rate=int(audio_info.rate),
                    num_samples=audio_info.length,
                    duration=audio_info.length / audio_info.rate,
                ))
        audio = RecordingSet.from_recordings(recordings)

        # Supervisions
        segments_by_pause = []
        for recording in audio:
            for source in recording.sources:
                # In AMI "source.channels" will always be a one-element list
                channel, = source.channels
                annotation = annotation_by_id_and_channel.get(
                    (recording.id, channel))
                if annotation is None:
                    logging.warning(
                        f'No annotation found for recording "{recording.id}" channel {channel} '
                        f'(file {source.source})')
                    continue
                for seg_idx, seg_info in enumerate(annotation):
                    for subseg_idx, subseg_info in enumerate(seg_info):
                        duration = subseg_info.end_time - subseg_info.begin_time
                        if duration > 0:
                            segments_by_pause.append(
                                SupervisionSegment(
                                    id=f'{recording.id}-{seg_idx}-{subseg_idx}',
                                    recording_id=recording.id,
                                    start=subseg_info.begin_time,
                                    duration=duration,
                                    channel=channel,
                                    language='English',
                                    speaker=subseg_info.speaker,
                                    gender=subseg_info.gender,
                                    text=subseg_info.text))
        supervision = SupervisionSet.from_segments(segments_by_pause)
        if output_dir is not None:
            audio.to_json(output_dir / f'recordings_{part}.json')
            supervision.to_json(output_dir / f'supervisions_{part}.json')

        manifests[part] = {'recordings': audio, 'supervisions': supervision}

    return manifests
Example #4
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = 'auto',
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    if dataset_parts == 'auto':
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir,
                                             prefix='libritts')

    # Contents of the file
    #   ;ID  |SEX| SUBSET           |MINUTES| NAME
    #   14   | F | train-clean-360  | 25.03 | ...
    #   16   | F | train-clean-360  | 25.11 | ...
    #   17   | M | train-clean-360  | 25.04 | ...
    spk2gender = {
        spk_id.strip(): gender.strip()
        for spk_id, gender, *_ in (line.split('|') for line in (
            corpus_dir / 'SPEAKERS.txt').read_text().splitlines()
                                   if not line.startswith(';'))
    }

    for part in tqdm(dataset_parts, desc='Preparing LibriTTS parts'):
        if manifests_exist(part=part, output_dir=output_dir,
                           prefix='libritts'):
            logging.info(
                f'LibriTTS subset: {part} already prepared - skipping.')
            continue
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path,
                                           '*.wav',
                                           num_jobs=num_jobs)
        supervisions = []
        for trans_path in tqdm(
                part_path.rglob('*.trans.tsv'),
                desc='Scanning transcript files (progbar per speaker)',
                leave=False):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief.    Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # book.tsv contains additional metadata
            utt2snr = {
                rec_id: float(snr)
                for rec_id, *_, snr in map(str.split, (
                    trans_path.parent /
                    trans_path.name.replace('.trans.tsv', '.book.tsv')
                ).read_text().splitlines())
            }
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split('\t')
                spk_id = rec_id.split('_')[0]
                supervisions.append(
                    SupervisionSegment(id=rec_id,
                                       recording_id=rec_id,
                                       start=0.0,
                                       duration=recordings[rec_id].duration,
                                       channel=0,
                                       text=norm_text,
                                       language='English',
                                       speaker=spk_id,
                                       gender=spk2gender[spk_id],
                                       custom={
                                           'orig_text': orig_text,
                                           'snr': utt2snr[rec_id]
                                       }))

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_json(output_dir /
                                 f'libritts_supervisions_{part}.json')
            recordings.to_json(output_dir / f'libritts_recordings_{part}.json')

        manifests[part] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

    return manifests
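
A usage sketch under assumed paths; preparation is restricted to two parts, and the directory scan is parallelized:

from lhotse.recipes import prepare_libritts

manifests = prepare_libritts(
    corpus_dir="/data/LibriTTS",
    dataset_parts=["dev-clean", "test-clean"],
    output_dir="data/manifests",  # already-prepared parts are read back from here
    num_jobs=4,
)
dev_supervisions = manifests["dev-clean"]["supervisions"]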
Example #5
def prepare_ami(
    data_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = 'ihm',
    partition: Optional[str] = 'full-corpus',
    max_pause: Optional[float] = 0.0
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param data_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param partition: str {'full-corpus','full-corpus-asr','scenario-only'}, AMI official data split
    :param max_pause: float (default = 0.0), max pause allowed between word segments to combine segments
    :return: a Dict whose keys are 'train', 'dev' and 'test', and the values are dicts of manifests under keys
        'recordings' and 'supervisions'.
    
    The `partition` and `max_pause` must be chosen depending on the task. For example:
    - Speaker diarization: set `partition="full-corpus"` and `max_pause=0`
    - ASR: set `partition="full-corpus-asr"` and `max_pause=0.3` (or some value in the range 0.2-0.5)
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f'No such directory: {data_dir}'
    assert mic in MICS, f'Mic {mic} not supported'
    assert partition in PARTITIONS, f'Partition {partition} not supported'

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info('Parsing AMI annotations')
    annotations = parse_ami_annotations(data_dir / 'annotations.zip',
                                        max_pause=max_pause)

    # Audio
    logging.info('Preparing recording manifests')
    wav_dir = data_dir / 'wav_db'

    if mic in ['ihm', 'mdm']:
        audio_paths = wav_dir.rglob('*Headset-?.wav') if mic == 'ihm' else \
            wav_dir.rglob('*Array?-0?.wav')
        audio = prepare_audio_grouped(list(audio_paths))
    elif mic in ['ihm-mix', 'sdm']:
        audio_paths = wav_dir.rglob('*Mix-Headset.wav') if mic == 'ihm-mix' else \
            wav_dir.rglob('*Array1-01.wav')
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info('Preparing supervision manifests')
    supervision = prepare_supervision_ihm(audio, annotations) if mic == 'ihm' \
        else prepare_supervision_other(audio, annotations)

    manifests = defaultdict(dict)

    dataset_parts = PARTITIONS[partition]
    for part in ['train', 'dev', 'test']:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in dataset_parts[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in dataset_parts[part])

        # Write to output directory if a path is provided
        if output_dir is not None:
            audio_part.to_json(output_dir / f'recordings_{part}.json')
            supervision_part.to_json(output_dir / f'supervisions_{part}.json')

        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Combine all manifests into one dictionary
        manifests[part] = {
            'recordings': audio_part,
            'supervisions': supervision_part
        }

    return dict(manifests)
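
Hedged sketches of the task-dependent settings the docstring recommends; the import path and corpus path are assumptions:

from lhotse.recipes import prepare_ami

# Speaker diarization: keep the word-level segments as they are.
diar_manifests = prepare_ami("/data/ami", mic="sdm",
                             partition="full-corpus", max_pause=0.0)
# ASR: merge word segments separated by pauses of up to 0.3 s.
asr_manifests = prepare_ami("/data/ami", mic="ihm",
                            partition="full-corpus-asr", max_pause=0.3)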
Example #6
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
    link_previous_utt: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :param link_previous_utt: If true, adds the previous utterance id to the supervisions.
        Useful for reconstructing chains of utterances as they were read.
        If the previous utterance was skipped from the LibriTTS dataset, the prev_utt label is None.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "auto":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir,
                                             prefix="libritts")

    # Contents of the file
    #   ;ID  |SEX| SUBSET           |MINUTES| NAME
    #   14   | F | train-clean-360  | 25.03 | ...
    #   16   | F | train-clean-360  | 25.11 | ...
    #   17   | M | train-clean-360  | 25.04 | ...
    spk2gender = {
        spk_id.strip(): gender.strip()
        for spk_id, gender, *_ in (line.split("|") for line in (
            corpus_dir / "SPEAKERS.txt").read_text().splitlines()
                                   if not line.startswith(";"))
    }

    for part in tqdm(dataset_parts, desc="Preparing LibriTTS parts"):
        if manifests_exist(part=part, output_dir=output_dir,
                           prefix="libritts"):
            logging.info(
                f"LibriTTS subset: {part} already prepared - skipping.")
            continue
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path,
                                           "*.wav",
                                           num_jobs=num_jobs)
        supervisions = []
        for trans_path in tqdm(
                part_path.rglob("*.trans.tsv"),
                desc="Scanning transcript files (progbar per speaker)",
                leave=False,
        ):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief.    Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # book.tsv contains additional metadata
            utt2snr = [(rec_id, float(snr)) for rec_id, *_, snr in map(
                str.split,
                (trans_path.parent /
                 trans_path.name.replace(".trans.tsv", ".book.tsv")
                 ).read_text().splitlines(),
            )]
            # keeps the order of uttids as they appear in book.tsv
            uttids = [r for r, _ in utt2snr]
            utt2snr = dict(utt2snr)

            if link_previous_utt:
                # Use the sorted-key order from book.tsv to find the previous utterance.
                # The keys have the structure speaker_book_x_y, e.g. 1089_134691_000004_000001
                utt2prevutt = dict(zip(uttids + [None], [None] + uttids))

            prev_rec_id = None
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split("\t")
                spk_id = rec_id.split("_")[0]
                customd = {"orig_text": orig_text, "snr": utt2snr[rec_id]}
                if link_previous_utt:
                    # all recording ids should be in the book.tsv,
                    # but some are missing, e.g. 446_123502_000030_000003
                    prev_utt = utt2prevutt.get(rec_id, None)
                    # the previous utterance must be present in trans.tsv - otherwise it was skipped
                    prev_utt = prev_utt if prev_utt == prev_rec_id else None
                    customd["prev_utt"] = prev_utt
                    prev_rec_id = rec_id
                supervisions.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language="English",
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom=customd,
                    ))

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_file(output_dir /
                                 f"libritts_supervisions_{part}.jsonl.gz")
            recordings.to_file(output_dir /
                               f"libritts_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
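
The utt2prevutt construction above is a general idiom: zipping a list against a copy of itself shifted by one yields an element-to-predecessor mapping. A self-contained sketch with made-up utterance ids:

uttids = ["84_121123_000007_000001",
          "84_121123_000008_000000",
          "84_121123_000009_000000"]
# Pair each id with the id that precedes it in the list (None for the first).
utt2prevutt = dict(zip(uttids + [None], [None] + uttids))
assert utt2prevutt[uttids[0]] is None
assert utt2prevutt[uttids[2]] == uttids[1]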
Example #7
def prepare_aspire(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: str = "single"
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the corpus dir (LDC2017S21).
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, the microphone type, either "single" or "multi".
    :return: a Dict whose key is the dataset part ('dev' and 'dev_test'), and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    assert mic in [
        "single",
        "multi",
    ], f"mic must be either 'single' or 'multi', got {mic}"
    corpus_dir = corpus_dir / "IARPA-ASpIRE-Dev-Sets-v2.0" / "data"
    audio_dir = corpus_dir / "dev_and_dev_test_audio"
    stm_dir = corpus_dir / "dev_and_dev_test_STM_files"

    if mic == "single":
        audio_paths = {
            "dev": audio_dir / "ASpIRE_single_dev",
            "dev_test": audio_dir / "ASpIRE_single_dev_test",
        }
        stm_file = {
            "dev": stm_dir / "dev.stm",
            "dev_test": stm_dir / "dev_test.stm",
        }
    else:
        audio_paths = {
            "dev": audio_dir / "ASpIRE_multi_dev",
            "dev_test": audio_dir / "ASpIRE_multi_dev_test",
        }
        stm_file = {
            "dev": stm_dir / "multi_dev.stm",
            "dev_test": stm_dir / "multi_dev_test.stm",
        }
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["dev", "dev_test"]:
        recordings = []
        supervisions = []

        # Prepare the recordings
        if mic == "single":
            recording_set = RecordingSet.from_dir(audio_paths[part], "*.wav")
        else:
            import soundfile as sf

            audio_groups = {
                k: list(v)
                for k, v in itertools.groupby(
                    sorted(audio_paths[part].glob("*.wav")),
                    key=lambda x: "_".join(x.stem.split("_")[:-1]),
                )
            }  # group audios so that each entry is a session containing all channels
            for session_name, audios in audio_groups.items():
                audio_sf = sf.SoundFile(str(audios[0]))
                recordings.append(
                    Recording(
                        id=session_name,
                        sources=[
                            AudioSource(
                                type="file",
                                channels=[int(audio.stem[-2:]) - 1],
                                source=str(audio),
                            ) for audio in sorted(audios)
                        ],
                        sampling_rate=audio_sf.samplerate,
                        num_samples=audio_sf.frames,
                        duration=audio_sf.frames / audio_sf.samplerate,
                    ))
            recording_set = RecordingSet.from_recordings(recordings)

        # Read STM file and prepare segments
        segments = []
        with open(stm_file[part]) as f:
            for line in f:
                session, _, speaker, start, end, text = line.strip().split(
                    maxsplit=5)
                segments.append(
                    AspireSegmentAnnotation(session, speaker, float(start),
                                            float(end), text))

        # Group the segments by session and speaker
        segments_grouped = defaultdict(list)
        for segment in segments:
            segments_grouped[(segment.session,
                              segment.speaker)].append(segment)

        # Create the supervisions
        supervisions = []
        for k, segs in segments_grouped.items():
            session, speaker = k
            supervisions += [
                SupervisionSegment(
                    id=f"{session}-{speaker}-{i:03d}",
                    recording_id=session,
                    start=seg.start,
                    duration=round(seg.end - seg.start, 4),
                    speaker=speaker,
                    text=seg.text,
                    language="English",
                ) for i, seg in enumerate(segs)
            ]
        supervision_set = SupervisionSet.from_segments(supervisions)

        recording_set, supervision_set = fix_manifests(recording_set,
                                                       supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir /
                                    f"aspire_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir /
                                  f"aspire_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
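
A standalone sketch of the channel-grouping idiom used in the multi-mic branch above. itertools.groupby only groups consecutive items, so the paths are sorted first; the session key is everything before the final "_<channel>" suffix (the directory is hypothetical):

import itertools
from pathlib import Path

paths = sorted(Path("/data/aspire/ASpIRE_multi_dev").glob("*.wav"))
sessions = {
    session: list(channel_paths)
    for session, channel_paths in itertools.groupby(
        paths, key=lambda p: "_".join(p.stem.split("_")[:-1]))
}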
Example #8
def prepare_timit(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_phones: int = 48,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consists of the Recodings and Supervisions.
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write and save the manifests.
    :param num_phones: int=48, the number of phones (60, 48 or 39) for modeling and 48 is regarded as the default value.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["TRAIN", "DEV", "TEST"]

    phones_dict = {}

    if num_phones in [60, 48, 39]:
        phones_dict = get_phonemes(num_phones)
    else:
        raise ValueError("The value of num_phones must be in [60, 48, 39].")

    dev_spks, test_spks = get_speakers()

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in dataset_parts:
            wav_files = []

            if part == "TRAIN":
                print("starting....")
                wav_files = glob.glob(str(corpus_dir) + "/TRAIN/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
            elif part == "DEV":
                wav_files = glob.glob(str(corpus_dir) + "/TEST/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
                wav_files = list(
                    filter(lambda x: x.split("/")[-2].lower() in dev_spks,
                           wav_files))
            else:
                wav_files = glob.glob(str(corpus_dir) + "/TEST/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
                wav_files = list(
                    filter(lambda x: x.split("/")[-2].lower() in test_spks,
                           wav_files))

            logging.debug(f"{part} dataset manifest generation.")
            recordings = []
            supervisions = []

            for wav_file in tqdm(wav_files):
                items = str(wav_file).strip().split("/")
                idx = items[-2] + "-" + items[-1][:-4]
                speaker = items[-2]
                transcript_file = Path(wav_file).with_suffix(".PHN")
                if not Path(wav_file).is_file():
                    logging.warning(f"No such file: {wav_file}")
                    continue
                if not Path(transcript_file).is_file():
                    logging.warning(f"No transcript: {transcript_file}")
                    continue
                text = []
                with open(transcript_file, "r") as f:
                    lines = f.readlines()
                    for line in lines:
                        phone = line.rstrip("\n").split(" ")[-1]
                        if num_phones != 60:
                            phone = phones_dict[str(phone)]
                        text.append(phone)

                    text = " ".join(text).replace("h#", "sil")

                recording = Recording.from_file(path=wav_file,
                                                recording_id=idx)
                recordings.append(recording)
                segment = SupervisionSegment(
                    id=idx,
                    recording_id=idx,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language="English",
                    speaker=speaker,
                    text=text.strip(),
                )

                supervisions.append(segment)

            # Build the per-part manifests once, after all files have been scanned.
            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)
            validate_recordings_and_supervisions(recording_set,
                                                 supervision_set)

            if output_dir is not None:
                supervision_set.to_json(output_dir /
                                        f"supervisions_{part}.json")
                recording_set.to_json(output_dir /
                                      f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
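
A sketch of the .PHN parsing step above: each line holds "<start_sample> <end_sample> <phone>", only the phone symbol is kept, and "h#" (TIMIT's silence marker) is rewritten as "sil". The file path is hypothetical:

phones = []
with open("/data/timit/TRAIN/DR1/FCJF0/SI1027.PHN") as f:
    for line in f:
        phones.append(line.rstrip("\n").split(" ")[-1])
text = " ".join(phones).replace("h#", "sil")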
Example #9
def prepare_earnings22(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    normalize_text: bool = False,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. The structure is
        expected to mimic the structure in the github repository, notably
        the mp3 files will be searched for in [corpus_dir]/media and transcriptions
        in the directory [corpus_dir]/transcripts/nlp_references
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text.
    :return: (recordings, supervisions) pair

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case
        to lower case. This includes removing possibly important punctuations such as
        dashes and apostrophes.
    """

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    media_dir = corpus_dir / "media"
    audio_files = list(media_dir.glob("*.mp3"))
    assert len(audio_files) == 125

    audio_files.sort()
    recording_set = RecordingSet.from_recordings(
        Recording.from_file(p) for p in audio_files)

    nlp_dir = corpus_dir / "transcripts" / "nlp_references"
    nlp_files = list(nlp_dir.glob("*.nlp"))
    assert len(nlp_files) == 125

    metadata = read_metadata(corpus_dir / "metadata.csv")

    nlp_files.sort()
    supervision_segments = list()
    for nlp_file in nlp_files:
        id = nlp_file.stem
        text = " ".join(parse_nlp_file(nlp_file))
        if normalize_text:
            text = normalize(text)

        s = SupervisionSegment(
            id=id,
            recording_id=id,
            start=0.0,
            duration=recording_set[id].duration,
            channel=0,
            language=f"English-{metadata[id][4]}",
            text=text,
        )
        supervision_segments.append(s)
    supervision_set = SupervisionSet.from_segments(supervision_segments)

    validate_recordings_and_supervisions(recording_set, supervision_set)
    if output_dir is not None:
        supervision_set.to_file(output_dir /
                                "earnings22_supervisions_all.jsonl.gz")
        recording_set.to_file(output_dir /
                              "earnings22_recordings_all.jsonl.gz")

    return recording_set, supervision_set
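
A usage sketch (the import path and corpus path are assumptions). Unlike the per-split recipes above, this one returns a single (RecordingSet, SupervisionSet) pair:

from lhotse.recipes import prepare_earnings22

recordings, supervisions = prepare_earnings22(
    corpus_dir="/data/earnings22",
    output_dir="data/manifests",
    normalize_text=False,  # keep punctuation and casing
)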
Example #10
def prepare_mobvoihotwords(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    dataset_parts = ["train", "dev", "test"]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir)

    for part in dataset_parts:
        logging.info(f"Preparing MobvoiHotwords subset: {part}")
        if manifests_exist(part=part, output_dir=output_dir):
            logging.info(
                f"MobvoiHotwords subset: {part} already prepared - skipping.")
            continue
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        for prefix in ["p_", "n_"]:
            prefixed_part = prefix + part
            json_path = (corpus_dir / "mobvoi_hotword_dataset_resources" /
                         f"{prefixed_part}.json")
            with open(json_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)
                for entry in json_data:
                    idx = entry["utt_id"]
                    speaker = (idx if entry["speaker_id"] is None else
                               entry["speaker_id"])
                    audio_path = corpus_dir / "mobvoi_hotword_dataset" / f"{idx}.wav"
                    text = "FREETEXT"
                    if entry["keyword_id"] == 0:
                        text = "HiXiaowen"
                    elif entry["keyword_id"] == 1:
                        text = "NihaoWenwen"
                    else:
                        assert entry["keyword_id"] == -1
                    if not audio_path.is_file():
                        logging.warning(f"No such file: {audio_path}")
                        continue
                    recording = Recording.from_file(audio_path)
                    recordings.append(recording)
                    segment = SupervisionSegment(
                        id=idx,
                        recording_id=idx,
                        start=0.0,
                        duration=recording.duration,
                        channel=0,
                        language="Chinese",
                        speaker=speaker,
                        text=text.strip(),
                    )
                    supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
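
A usage sketch (the import path and corpus path are assumptions) of the caching behavior: a second call with the same output_dir short-circuits through manifests_exist / read_manifests_if_cached instead of re-scanning the corpus:

from lhotse.recipes import prepare_mobvoihotwords

manifests = prepare_mobvoihotwords("/data/mobvoi", output_dir="data/manifests")
# Re-running is cheap: already-prepared parts are read back from disk.
manifests = prepare_mobvoihotwords("/data/mobvoi", output_dir="data/manifests")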
Example #12
def prepare_ami(
    data_dir: Pathlike,
    annotations_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "ihm",
    partition: Optional[str] = "full-corpus",
    normalize_text: str = "kaldi",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param data_dir: Pathlike, the path of the data dir.
    :param annotations_dir: Pathlike, the path of the annotations dir or zip file.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param partition: str {'full-corpus','full-corpus-asr','scenario-only'}, AMI official data split
    :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
    :return: a Dict whose keys are 'train', 'dev' and 'test', and the values are dicts of manifests under keys
        'recordings' and 'supervisions'.

    Example usage:
    1. Prepare IHM-Mix data for ASR:
    >>> manifests = prepare_ami('/path/to/ami-corpus', mic='ihm-mix', partition='full-corpus-asr')
    2. Prepare SDM data:
    >>> manifests = prepare_ami('/path/to/ami-corpus', mic='sdm', partition='full-corpus')
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f"No such directory: {data_dir}"
    assert mic in MICS, f"Mic {mic} not supported"
    assert partition in PARTITIONS, f"Partition {partition} not supported"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Parsing AMI annotations")
    if not annotations_dir:
        if (data_dir / "ami_public_manual_1.6.2").is_dir():
            annotations_dir = data_dir / "ami_public_manual_1.6.2"
        elif (data_dir / "ami_public_manual_1.6.2.zip").is_file():
            annotations_dir = data_dir / "ami_public_manual_1.6.2.zip"
        else:
            raise ValueError(
                f"No annotations directory specified and no zip file found in {data_dir}"
            )
    # Prepare annotations which is a list of segment-level transcriptions
    annotations = parse_ami_annotations(annotations_dir,
                                        normalize=normalize_text)

    # Audio
    logging.info("Preparing recording manifests")
    wav_dir = data_dir

    if mic in ["ihm", "mdm"]:
        audio_paths = (wav_dir.rglob("*Headset-?.wav")
                       if mic == "ihm" else wav_dir.rglob("*Array?-0?.wav"))
        audio = prepare_audio_grouped(list(audio_paths))
    elif mic in ["ihm-mix", "sdm"]:
        audio_paths = (wav_dir.rglob("*Mix-Headset.wav") if mic == "ihm-mix"
                       else wav_dir.rglob("*Array1-01.wav"))
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info("Preparing supervision manifests")
    supervision = (prepare_supervision_ihm(audio, annotations) if mic == "ihm"
                   else prepare_supervision_other(audio, annotations))

    manifests = defaultdict(dict)

    dataset_parts = PARTITIONS[partition]
    for part in ["train", "dev", "test"]:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in dataset_parts[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in dataset_parts[part])

        # Fix and validate the manifests before writing or returning them
        audio_part, supervision_part = fix_manifests(audio_part,
                                                     supervision_part)
        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Write to output directory if a path is provided
        if output_dir is not None:
            audio_part.to_file(output_dir /
                               f"ami-{mic}_recordings_{part}.jsonl.gz")
            supervision_part.to_file(output_dir /
                                     f"ami-{mic}_supervisions_{part}.jsonl.gz")

        # Combine all manifests into one dictionary
        manifests[part] = {
            "recordings": audio_part,
            "supervisions": supervision_part
        }

    return dict(manifests)
Example #13
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f'{wav.parent.parent.name.lower()}-{wav.stem}')
        for wav in corpus_dir.rglob('*.wav'))
    supervisions = []
    for path in corpus_dir.rglob('*.txt'):
        # One utterance (line) per file
        text = path.read_text().strip()

        is_suitcase_corpus = 'suitcase_corpus' in path.parts

        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            speaker = path.stem  # <root>/suitcase_corpus/transcript/aba.txt -> aba

        seg_id = f'suitcase_corpus-{speaker}' if is_suitcase_corpus else f'{speaker}-{path.stem}'
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language='English',
                speaker=speaker,
                gender=speaker_meta[speaker]['gender'],
                custom={'accent': speaker_meta[speaker]['native_lang']}))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        'read': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' not in r.id),
            'supervisions':
            supervisions.filter(
                lambda s: 'suitcase_corpus' not in s.recording_id)
        },
        'suitcase': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' in r.id),
            'supervisions':
            supervisions.filter(lambda s: 'suitcase_corpus' in s.recording_id)
        }
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for key, manifests in splits.items():
            manifests['recordings'].to_json(output_dir /
                                            f'recordings-{key}.json')
            manifests['supervisions'].to_json(output_dir /
                                              f'supervisions-{key}.json')

    return splits
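
A usage sketch (the import path and corpus path are assumptions); the returned dict is keyed by "read" and "suitcase":

from lhotse.recipes import prepare_l2_arctic

splits = prepare_l2_arctic("/data/l2arctic", output_dir="data/manifests")
read_supervisions = splits["read"]["supervisions"]
suitcase_recordings = splits["suitcase"]["recordings"]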
Example #14
def prepare_ali_meeting(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "far",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, "near" or "far", specifies whether to prepare the near-field or far-field data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AliMeeting data, please 'pip install textgrid' first."
        )
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["Train", "Eval", "Test"]:
        recordings = []
        supervisions = []
        # Eval and Test may further be inside another folder (since the "far" and "near" are grouped together)
        corpus_dir_split = corpus_dir
        if part == "Eval" or part == "Test":
            corpus_dir_split = (
                corpus_dir / f"{part}_Ali"
                if (corpus_dir / f"{part}_Ali").is_dir()
                else corpus_dir
            )
        wav_paths = corpus_dir_split / f"{part}_Ali_{mic}" / "audio_dir"
        text_paths = corpus_dir_split / f"{part}_Ali_{mic}" / "textgrid_dir"

        # For 'near' setting:
        #  - wav files have names like R0003_M0046_F_SPK0093.wav
        #  - textgrid files have names like R0003_M0046_F_SPK0093.TextGrid
        # Speaker ID information is present in the file name itself

        # For 'far' setting:
        #  - wav files have names like R0015_M0151_MS002.wav
        #  - textgrid files have names like R0015_M015.TextGrid
        # Speaker ID information is present inside the TextGrid file

        for text_path in tqdm(
            list(text_paths.rglob("*.TextGrid")), desc=f"Preparing {part}"
        ):
            session_id = text_path.stem

            if mic == "near":
                _, _, gender, spk_id = session_id.split("_")
                spk_id = spk_id[3:]  # SPK1953 -> 1953

            try:
                tg = textgrid.TextGrid.fromFile(str(text_path))
            except ValueError:
                logging.warning(
                    f"{session_id} has annotation issues. Skipping this recording."
                )
                continue

            wav_path = list(wav_paths.rglob(f"{session_id}*.wav"))[0]

            recording = Recording.from_file(wav_path, recording_id=session_id)
            recordings.append(recording)

            for tier in tg.tiers:
                if mic == "far":
                    parts = tier.name.split("_")
                    if len(parts) == 4:
                        _, _, gender, spk_id = parts
                    elif len(parts) == 2:
                        gender, spk_id = parts
                    spk_id = spk_id[3:]  # SPK1953 -> 1953

                for i, interval in enumerate(tier.intervals):
                    if interval.mark != "":
                        start = interval.minTime
                        end = interval.maxTime
                        text = interval.mark
                        segment = SupervisionSegment(
                            id=f"{session_id}-{spk_id}-{i}",
                            recording_id=recording.id,
                            start=start,
                            duration=round(end - start, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            gender=gender,
                            text=text.strip(),
                        )
                        supervisions.append(segment)

        # Fix and validate the manifests.
        recording_set, supervision_set = fix_manifests(
            RecordingSet.from_recordings(recordings),
            SupervisionSet.from_segments(supervisions),
        )
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"alimeeting_supervisions_{part.lower()}.jsonl.gz"
            )
            recording_set.to_file(
                output_dir / f"alimeeting_recordings_{part.lower()}.jsonl.gz"
            )

        manifests[part.lower()] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
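
A minimal usage sketch for the recipe above, assuming it is exposed as prepare_alimeeting in lhotse.recipes as in upstream Lhotse; the import path and corpus location are assumptions, not part of the recipe:

from lhotse.recipes import prepare_alimeeting  # assumed import path

# "/data/AliMeeting" is a placeholder for the downloaded corpus root.
manifests = prepare_alimeeting(
    corpus_dir="/data/AliMeeting",
    output_dir="manifests/alimeeting",
    mic="far",
)
# Parts are keyed by lowercased split names.
print(manifests["train"]["recordings"])
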
Example No. 15
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VCTK manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys 'recordings' and 'supervisions',
        holding the RecordingSet and SupervisionSet respectively.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav) for wav in (corpus_dir / "wav48").rglob("*.wav")
    )
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            # Fall back to a mapping that yields None for every field.
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            )
        )
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    #                 I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "vctk_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "vctk_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Example No. 16
def prepare_wenet_speech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "all",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: Which parts of the dataset to prepare; "all" prepares
                          every part.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: Number of workers used to extract the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
             the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    subsets = WETNET_SPEECH_PARTS if "all" in dataset_parts else dataset_parts

    manifests = defaultdict(dict)
    for sub in subsets:
        if sub not in WETNET_SPEECH_PARTS:
            raise ValueError(f"No such part of dataset in WenetSpeech : {sub}")
        manifests[sub] = {"recordings": [], "supervisions": []}

    raw_manifests_path = corpus_dir / "WenetSpeech.json"
    assert raw_manifests_path.is_file(), f"No such file : {raw_manifests_path}"
    logging.info(f"Loading raw manifests from : {raw_manifests_path}")
    with open(raw_manifests_path, "r", encoding="utf8") as f:
        raw_manifests = json.load(f)

    with ProcessPoolExecutor(num_jobs) as ex:
        for recording, segments in tqdm(
                ex.map(
                    parse_utterance,
                    raw_manifests["audios"],
                    repeat(corpus_dir),
                    repeat(subsets),
                ),
                desc="Processing WenetSpeech JSON entries",
        ):
            for part in segments:
                manifests[part]["recordings"].append(recording)
                manifests[part]["supervisions"].extend(segments[part])

    for sub in subsets:
        recordings, supervisions = fix_manifests(
            recordings=RecordingSet.from_recordings(
                manifests[sub]["recordings"]),
            supervisions=SupervisionSet.from_segments(
                manifests[sub]["supervisions"]),
        )
        validate_recordings_and_supervisions(recordings=recordings,
                                             supervisions=supervisions)

        if output_dir is not None:
            supervisions.to_file(output_dir /
                                 f"wenetspeech_supervisions_{sub}.jsonl.gz")
            recordings.to_file(output_dir /
                               f"wenetspeech_recordings_{sub}.jsonl.gz")

        manifests[sub] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
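
A usage sketch; the subset names passed below follow the WenetSpeech release (as enumerated in the WETNET_SPEECH_PARTS constant) and, like the corpus path, are assumptions for illustration:

from lhotse.recipes import prepare_wenet_speech

# The corpus root must contain WenetSpeech.json.
manifests = prepare_wenet_speech(
    corpus_dir="/data/WenetSpeech",      # placeholder path
    dataset_parts=["S", "DEV"],          # assumed subset names from the release
    output_dir="manifests/wenetspeech",
    num_jobs=4,
)
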
Example No. 17
def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f'No such directory: {speech_dir}'
    assert transcript_dir.is_dir(), f'No such directory: {transcript_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile(r'Answers_Spanish')
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    heroico_recitations_devtest_path_pattern = re.compile(r'Recordings_Spanish')
    heroico_recitations_train_path_pattern = re.compile(r'Recordings_Spanish')
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native-[fm]-\w+-\S+-\S+-\S+-\S+-\w+\d+")
    usma_native_path_pattern = re.compile(r'usma/native')
    usma_native_prompt_id_pattern = re.compile(r's\d+')
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative-[fm]-[a-zA-Z]+\d*-[a-zA-Z]+-[a-zA-Z]+-[a-zA-Z]+-[a-zA-Z]+-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r'nonnative.+\.wav')

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)

    transcripts = defaultdict(dict)
    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split('/')
            utt_id = '-'.join(['answers', spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir,
                                          heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['heroico-recitations', idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['usma', idx])
            transcripts[utt_id] = text

    # store utterance info
    audio_paths = speech_dir.rglob('*.wav')
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        path_components = wav_path.parts
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # store utterance info for Heroico Answers
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['answers', spk, pid])
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='answers',
                                             utterance_id=utt_id,
                                             transcript=transcripts[utt_id])
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # store utterance info for usma native data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # store utterance data for usma nonnative data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) <= 354 or int(pid) >= 562:
            # store utterance info for heroico recitations for train dataset
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations', spk, pid])
            trans_id = '-'.join(['heroico-recitations', pid])
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='heroico-recitations',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) > 354 and int(pid) < 562:
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations-repeats', spk, pid])
            trans_id = '-'.join(['heroico-recitations-repeats', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='devtest',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations-repeats',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        else:
            logging.warning(
                f'File does not match any known subcorpus: {wav_file}')

    audio_files = list(speech_dir.rglob('*.wav'))

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # skip files with no record
            if not uttdata[str(wav_file)]:
                continue
            # only process the current fold
            if uttdata[str(wav_file)].fold != fld:
                continue
            path_components = wav_path.parts
            prompt_id = wav_path.stem
            # soundfile.info returns an object describing the raw audio
            # (e.g. number of channels, sample rate, number of frames, duration).
            info = soundfile.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = '-'.join(
                [uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info,
                text=uttdata[str(wav_file)].transcript)

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(id=idx,
                      sources=[
                          AudioSource(type='file',
                                      channels=[0],
                                      source=str(metadata[idx].audio_path))
                      ],
                      sampling_rate=int(metadata[idx].audio_info.samplerate),
                      num_samples=metadata[idx].audio_info.frames,
                      duration=metadata[idx].audio_info.duration)
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(id=idx,
                               recording_id=idx,
                               start=0.0,
                               duration=audio.recordings[idx].duration,
                               channel=0,
                               language='Spanish',
                               speaker=idx.split('-')[-2],
                               text=metadata[idx].text)
            for idx in audio.recordings)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{fld}.json')
            audio.to_json(output_dir / f'recordings_{fld}.json')

        manifests[fld] = {'recordings': audio, 'supervisions': supervision}

    return manifests
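
A short sketch of calling the recipe and inspecting the per-fold manifests; the LDC paths are placeholders:

from lhotse.recipes import prepare_heroico

manifests = prepare_heroico(
    speech_dir="/data/LDC2006S37/speech",           # placeholder paths
    transcript_dir="/data/LDC2006S37/transcripts",
)
for fold, mans in manifests.items():
    print(fold, len(mans["recordings"]), len(mans["supervisions"]))
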
Example No. 18
def prepare_ljspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with the keys 'audio' and 'supervisions', holding the RecordingSet and SupervisionSet.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / 'metadata.csv'
    assert metadata_csv_path.is_file(), f'No such file: {metadata_csv_path}'
    metadata = {}
    with open(metadata_csv_path) as f:
        for line in f:
            idx, text, _ = line.split('|')
            audio_path = corpus_dir / 'wavs' / f'{idx}.wav'
            if audio_path.is_file():
                # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                info = torchaudio.info(str(audio_path))
                metadata[idx] = LJSpeechMetaData(audio_path=audio_path,
                                                 audio_info=info[0],
                                                 text=text)
            else:
                logging.warning(f'No such file: {audio_path}')

    # Audio
    audio = RecordingSet.from_recordings(
        Recording(id=idx,
                  sources=[
                      AudioSource(type='file',
                                  channels=[0],
                                  source=str(metadata[idx].audio_path))
                  ],
                  sampling_rate=int(metadata[idx].audio_info.rate),
                  num_samples=metadata[idx].audio_info.length,
                  duration=(metadata[idx].audio_info.length /
                            metadata[idx].audio_info.rate))
        for idx in metadata)

    # Supervision
    supervision = SupervisionSet.from_segments(
        SupervisionSegment(id=idx,
                           recording_id=idx,
                           start=0.0,
                           duration=audio.recordings[idx].duration,
                           channel=0,
                           language='English',
                           gender='female',
                           text=metadata[idx].text)
        for idx in audio.recordings)

    if output_dir is not None:
        supervision.to_json(output_dir / 'supervisions.json')
        audio.to_json(output_dir / 'audio.json')

    return {'audio': audio, 'supervisions': supervision}
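
Since every supervision spans the whole recording, corpus statistics fall out directly from the returned manifests; a sketch with a placeholder corpus path:

from lhotse.recipes import prepare_ljspeech

lj = prepare_ljspeech("/data/LJSpeech-1.1")  # placeholder path
total_hours = sum(r.duration for r in lj["audio"]) / 3600
print(f"LJSpeech: {len(lj['audio'])} utterances, ~{total_hours:.1f} h")
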
Example No. 19
def prepare_rir_noise(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    parts: Sequence[str] = ("point_noise", "iso_noise", "real_rir", "sim_rir"),
) -> Dict[str, Dict[str, Union[RecordingSet, CutSet]]]:
    """
    Prepare the RIR Noise corpus.

    :param corpus_dir: Pathlike, the path of the dir to store the dataset.
    :param output_dir: Pathlike, the path of the dir to write the manifests.
    :param parts: Sequence[str], the parts of the dataset to prepare.
    :return: a Dict whose key is the corpus part, and the value is a Dict with a 'recordings' manifest.

    The corpus contains four components: point-source noises (point_noise), isotropic noises (iso_noise),
    real RIRs (real_rir), and simulated RIRs (sim_rir). Each is prepared under the
    corresponding dict key.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if not parts:
        raise ValueError("No parts specified for manifest preparation.")
    if isinstance(parts, str):
        parts = [parts]

    manifests = defaultdict(dict)
    for part in parts:
        logging.info(f"Preparing {part}...")
        audio_dir = corpus_dir / PARTS[part]
        assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
        if part == "sim_rir":
            # The "small", "medium", and "large" rooms have the same file names, so
            # we have to handle them separately to avoid duplicating manifests.
            recordings = []
            for room_type in ("small", "medium", "large"):
                room_dir = audio_dir / f"{room_type}room"
                recordings += [
                    Recording.from_file(
                        file, recording_id=f"{room_type}-{file.stem}")
                    for file in room_dir.rglob("*.wav")
                ]
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                recordings)
        elif part == "point_noise":
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                Recording.from_file(file) for file in audio_dir.rglob("*.wav"))
        elif part == "iso_noise":
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                Recording.from_file(file) for file in audio_dir.rglob("*.wav")
                if "noise" in file.stem)
        elif part == "real_rir":
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                Recording.from_file(file) for file in audio_dir.rglob("*.wav")
                if "rir" in file.stem)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in manifests:
            for key, manifest in manifests[part].items():
                manifest.to_file(output_dir /
                                 f"{part.replace('_','-')}_{key}_all.jsonl.gz")

    return manifests
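
A sketch that prepares only a subset of the corpus parts; the corpus path is a placeholder:

from lhotse.recipes import prepare_rir_noise

manifests = prepare_rir_noise(
    corpus_dir="/data/RIRS_NOISES",      # placeholder path
    output_dir="manifests/rir-noise",
    parts=("sim_rir", "point_noise"),    # skip the other two components
)
print(manifests["sim_rir"]["recordings"])
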
Example No. 20
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f"{wav.parent.parent.name.lower()}-{wav.stem}")
        for wav in corpus_dir.rglob("*.wav"))
    supervisions = []
    for path in corpus_dir.rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()

        is_suitcase_corpus = "suitcase_corpus" in path.parts

        speaker = (path.parent.parent.name.lower()
                   )  # <root>/ABA/transcript/arctic_a0051.txt -> aba
        if is_suitcase_corpus:
            speaker = path.stem  # <root>/suitcase_corpus/transcript/aba.txt -> aba

        seg_id = (f"suitcase_corpus-{speaker}"
                  if is_suitcase_corpus else f"{speaker}-{path.stem}")
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=speaker_meta[speaker]["gender"],
                custom={"accent": speaker_meta[speaker]["native_lang"]},
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        "read": {
            "recordings":
            recordings.filter(lambda r: "suitcase_corpus" not in r.id),
            "supervisions":
            supervisions.filter(
                lambda s: "suitcase_corpus" not in s.recording_id),
        },
        "suitcase": {
            "recordings":
            recordings.filter(lambda r: "suitcase_corpus" in r.id),
            "supervisions":
            supervisions.filter(lambda s: "suitcase_corpus" in s.recording_id),
        },
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for key, manifests in splits.items():
            manifests["recordings"].to_file(
                output_dir / f"l2-arctic_recordings_{key}.jsonl.gz")
            manifests["supervisions"].to_file(
                output_dir / f"l2-arctic_supervisions_{key}.jsonl.gz")

    return splits
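
A sketch showing how the two returned splits are consumed; the corpus path is a placeholder:

from lhotse.recipes import prepare_l2_arctic

splits = prepare_l2_arctic("/data/l2arctic")  # placeholder path
for key in ("read", "suitcase"):
    sups = splits[key]["supervisions"]
    print(key, len(sups))
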
Example No. 21
def prepare_aishell(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = corpus_dir / 'data_aishell/transcript/aishell_transcript_v0.8.txt'
    transcript_dict = {}
    with open(transcript_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            idx_transcript = line.split()
            transcript_dict[idx_transcript[0]] = ' '.join(idx_transcript[1:])
    manifests = defaultdict(dict)
    dataset_parts = ['train', 'dev', 'test']
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        wav_path = corpus_dir / 'data_aishell' / 'wav' / f'{part}'
        for audio_path in wav_path.rglob('**/*.wav'):
            idx = audio_path.stem
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f'No transcript: {idx}')
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f'No such file: {audio_path}')
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(id=idx,
                                         recording_id=idx,
                                         start=0.0,
                                         duration=recording.duration,
                                         channel=0,
                                         language='Chinese',
                                         speaker=speaker,
                                         text=text.strip())
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set
        }

    return manifests
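
A sketch that prepares the corpus and inspects one supervision segment; the corpus path is a placeholder:

from lhotse.recipes import prepare_aishell

manifests = prepare_aishell("/data/aishell", output_dir="manifests/aishell")
sup = next(iter(manifests["dev"]["supervisions"]))
print(sup.id, sup.speaker, sup.text)
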
Example No. 22
def prepare_mini_librispeech(
        corpus_dir: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, text)
        metadata = {}
        part_path = corpus_dir / part
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    idx, text = line.split(maxsplit=1)
                    audio_path = part_path / Path(idx.replace('-', '/')).parent / f'{idx}.flac'
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = LibriSpeechMetaData(audio_path=audio_path, audio_info=info[0], text=text)
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=(metadata[idx].audio_info.length / metadata[idx].audio_info.rate)
            )
            for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='English',
                speaker=re.sub(r'-.*', r'', idx),
                text=metadata[idx].text.strip()
            )
            for idx in audio.recordings
        )

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
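
A usage sketch, assuming the function is importable as prepare_mini_librispeech and that the module-level dataset_parts constant enumerates the Mini LibriSpeech splits; paths are placeholders:

from lhotse.recipes import prepare_mini_librispeech  # assumed import path

manifests = prepare_mini_librispeech(
    corpus_dir="/data/LibriSpeech",            # placeholder path
    output_dir="manifests/mini-librispeech",
)
for part, mans in manifests.items():
    print(part, len(mans["recordings"]), len(mans["supervisions"]))
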
Example No. 23
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp files are ignored.

    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisionSegment IDs with underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    # Kaldi-style piped commands end with '|'; strip it here.
                    source=path_or_cmd[:-1]
                    if path_or_cmd.endswith("|")
                    else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id],
                                            sampling_rate),
            duration=durations[recording_id],
        ) for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [
                sup_string.strip().split() for sup_string in f
            ]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            ) for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is not None:
            import kaldi_native_io
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat.shape[0],
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat.shape[0] * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=(supervision_set[utt_id].recording_id
                                  if supervision_set is not None
                                  else utt_id),
                    channels=0,
                )
                for utt_id, mat in kaldi_native_io.SequentialFloatMatrixReader(
                    f"scp:{feats_scp}"))
        else:
            warnings.warn("Failed to import Kaldi 'feats.scp' to Lhotse: "
                          "frame_shift must be not None. "
                          "Feature import omitted.")

    return recording_set, supervision_set, feature_set
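
A sketch of importing a Kaldi data directory, assuming the function lives in lhotse.kaldi as in upstream Lhotse; the directory layout is a placeholder:

from lhotse.kaldi import load_kaldi_data_dir  # assumed import path

recordings, supervisions, features = load_kaldi_data_dir(
    path="data/train",    # a Kaldi data dir containing at least wav.scp
    sampling_rate=16000,
    frame_shift=0.01,     # required to import feats.scp, if present
    num_jobs=4,
)
recordings.to_file("kaldi_train_recordings.jsonl.gz")
if supervisions is not None:
    supervisions.to_file("kaldi_train_supervisions.jsonl.gz")
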
Example No. 24
def prepare_adept(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
):
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        Recording.from_file(
            path=path,
            # converts:
            #   path/to/ADEPT/wav_44khz/propositional_attitude/surprise/ad01_0204.wav
            # to:
            #   propositional_attitude_surprise_ad01_0204
            recording_id=str(path.relative_to(path.parent.parent.parent))[:-4]
            .replace("/", "_"),
        )
        for path in (corpus_dir / "wav_44khz").rglob("*.wav")
    )

    supervisions = []

    with open(corpus_dir / "adept_prompts.json") as f:
        interpretation_map = json.load(f)

    for path in (corpus_dir / "txt").rglob("*.txt"):
        annotation_type, label, prompt_id = str(
            path.relative_to(path.parent.parent.parent))[:-4].split("/")
        speaker_id = "ADEPT_" + prompt_id.split("_")[0]
        recording_id = "_".join((annotation_type, label, prompt_id))
        interpretation_group = interpretation_map.get(annotation_type)
        interpretation = (interpretation_group[prompt_id][label]
                          if interpretation_group else None)
        recording = recordings[recording_id]
        custom = {
            "type": annotation_type,
            "label": label,
            "prompt_id": prompt_id
        }
        if interpretation:
            # label is "interpretation_1", "interpretation_2", ..., "middle", "end", etc
            # Interpretations' labels meaning is defined by their textual realisation:
            #  {..., "middle": "Galleries are WHAT on Thursdays?", "end": "Galleries are free WHEN?"}
            custom["text"] = interpretation
        supervisions.append(
            SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=path.read_text(),
                language="English",
                speaker=speaker_id,
                custom=custom,
            ))

    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        supervisions.to_file(output_dir / "adept_supervisions.json")
        recordings.to_file(output_dir / "adept_recordings.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example No. 25
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "mini_librispeech":
        dataset_parts = set(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*")
        )
    elif dataset_parts == "auto":
        dataset_parts = (
            set(LIBRISPEECH)
            .union(MINI_LIBRISPEECH)
            .intersection(path.name for path in corpus_dir.glob("*"))
        )
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir
        )

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc="Dataset parts"):
            logging.info(f"Processing LibriSpeech subset: {part}")
            if manifests_exist(part=part, output_dir=output_dir):
                logging.info(f"LibriSpeech subset: {part} already prepared - skipping.")
                continue
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(
                part_path.rglob("*.trans.txt"), desc="Distributing tasks", leave=False
            ):
                alignments = {}
                ali_path = trans_path.parent / (
                    trans_path.stem.split(".")[0] + ".alignment.txt"
                )
                if ali_path.exists():
                    alignments = parse_alignments(ali_path)
                # "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(
                            ex.submit(parse_utterance, part_path, line, alignments)
                        )

            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set, supervision_set)

            if output_dir is not None:
                supervision_set.to_file(output_dir / f"supervisions_{part}.json")
                recording_set.to_file(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
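
A sketch of a parallel preparation run; re-invoking it with the same output_dir returns the cached manifests instead of re-scanning the corpus. Paths are placeholders:

from lhotse.recipes import prepare_librispeech

manifests = prepare_librispeech(
    corpus_dir="/data/LibriSpeech",                     # placeholder path
    dataset_parts=["train-clean-100", "dev-clean"],
    output_dir="manifests/librispeech",
    num_jobs=8,    # transcript files are parsed by a thread pool
)
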
Example No. 26
def prepare_aishell(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = corpus_dir / "data_aishell/transcript/aishell_transcript_v0.8.txt"
    transcript_dict = {}
    with open(transcript_path, "r", encoding="utf-8") as f:
        for line in f.readlines():
            idx_transcript = line.split()
            transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:])
    manifests = defaultdict(dict)
    dataset_parts = ["train", "dev", "test"]
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        wav_path = corpus_dir / "data_aishell" / "wav" / f"{part}"
        for audio_path in wav_path.rglob("**/*.wav"):
            idx = audio_path.stem
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f"No transcript: {idx}")
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="Chinese",
                speaker=speaker,
                text=text.strip(),
            )
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir /
                                    f"aishell_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir /
                                  f"aishell_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
Example No. 27
def prepare_mls(
        corpus_dir: Pathlike,
        output_dir: Optional[Pathlike] = None,
        opus: bool = True,
        num_jobs: int = 1
) -> Dict[str, Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]:
    """
    Prepare Multilingual LibriSpeech corpus.

    Returns a dict structured like the following:

    .. code-block:: python

        {
            'english': {
                'train': {'recordings': RecordingSet(...), 'supervisions': SupervisionSet(...)},
                'dev': ...,
                'test': ...
            },
            'polish': { ... },
            ...
        }

    :param corpus_dir: Path to the corpus root (directories with specific languages should be inside).
    :param output_dir: Optional path where the manifests should be stored.
    :param opus: Should we scan for OPUS files (otherwise we'll look for FLAC files).
    :param num_jobs: How many jobs should be used for creating recording manifests.
    :return: A dict with structure: ``d[language][split] = {recordings, supervisions}``.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir) if output_dir is not None else None
    assert corpus_dir.is_dir()

    languages = {
        d.name.split('_')[1]: d
        for d in corpus_dir.glob('mls_*')
        if d.is_dir()
           and '_lm_' not in d.name
           and (opus or not d.name.endswith('opus'))
    }
    logging.info(f'Found MLS languages: {list(languages)}')

    manifests = defaultdict(dict)
    for lang, lang_dir in tqdm(languages.items(), desc='Languages', total=len(languages)):
        logging.info(f'Processing language: {lang}')

        # Read the speaker to gender mapping.
        spk2gender = {}
        for line in (lang_dir / 'metainfo.txt').read_text().splitlines():
            spk, gender, *_ = line.split('|')
            spk2gender[spk.strip()] = gender.strip()

        for split in tqdm(['test', 'dev', 'train'], desc='Splits'):

            # If everything is ready, read it and skip it.
            recordings_path = None if output_dir is None else output_dir / f'recordings_{lang}_{split}.jsonl.gz'
            supervisions_path = None if output_dir is None else output_dir / f'supervisions_{lang}_{split}.jsonl.gz'
            if (
                    recordings_path is not None and recordings_path.is_file() and
                    supervisions_path is not None and supervisions_path.is_file()
            ):
                logging.info(f'Skipping - {lang}/{split} - already exists!')
                recordings = RecordingSet.from_file(recordings_path)
                supervisions = SupervisionSet.from_file(supervisions_path)
                manifests[lang][split] = {
                    'recordings': recordings,
                    'supervisions': supervisions
                }
                continue

            # Create recordings manifest.
            split_dir = lang_dir / split
            recordings = RecordingSet.from_dir(
                path=split_dir,
                pattern='*.opus' if opus else '*.flac',
                num_jobs=num_jobs
            )

            # Create supervisions manifest.
            supervisions = []
            for line in (split_dir / 'transcripts.txt').read_text().splitlines():
                recording_id, text = line.split('\t')
                speaker = recording_id.split('_')[0]
                supervisions.append(SupervisionSegment(
                    id=recording_id,
                    recording_id=recording_id,
                    text=text,
                    speaker=speaker,
                    gender=spk2gender[speaker],
                    start=0.0,
                    duration=recordings.duration(recording_id),
                    language=lang
                ))
            supervisions = SupervisionSet.from_segments(supervisions)

            # Fix any missing recordings/supervisions.
            recordings, supervisions = fix_manifests(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            # Save for return.
            manifests[lang][split] = {
                'recordings': recordings,
                'supervisions': supervisions
            }

            # Optional storage on disk.
            if output_dir is not None:
                output_dir.mkdir(exist_ok=True, parents=True)
                recordings.to_jsonl(recordings_path)
                supervisions.to_jsonl(supervisions_path)

    return dict(manifests)
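
A sketch of preparing MLS and indexing the nested result; the corpus path is a placeholder, and the language key follows the mls_* directory naming:

from lhotse.recipes import prepare_mls

mls = prepare_mls(
    corpus_dir="/data/MLS",        # contains e.g. mls_english_opus/
    output_dir="manifests/mls",
    opus=True,
    num_jobs=8,
)
print(mls["english"]["train"]["recordings"])
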
Example No. 28
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_indic_ben_rm_bn_00001
        Recording.from_file(
            wav,
            recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}")
        for wav in corpus_dir.rglob("*.wav"))
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[
            0]  # example: 'ben_rm' -> 'ben' (Bengali)
        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int((path.parent /
                       "voice.feats").read_text().splitlines()[1].replace(
                           "age ", "").strip())
        except Exception:
            # voice.feats may be missing or lack a parseable age entry.
            age = None
        for line in lines:
            line = line[2:-2]  # strip the parentheses and surrounding whitespace
            seg_id, text = line.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            language = LANGUAGE_MAP[lang_code]
            is_english = "arctic" in seg_id

            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age

            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks,
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                ))
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        recordings.to_json(output_dir / "cmu_indic_recordings.json")
        supervisions.to_json(output_dir / "cmu_indic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}