Example No. 1
def test_mix_same_recording_channels():
    recording = Recording('rec',
                          sampling_rate=8000,
                          num_samples=30 * 8000,
                          duration=30,
                          sources=[
                              AudioSource('file',
                                          channels=[0],
                                          source='irrelevant1.wav'),
                              AudioSource('file',
                                          channels=[1],
                                          source='irrelevant2.wav')
                          ])
    cut_set = CutSet.from_cuts([
        Cut('cut1', start=0, duration=30, channel=0, recording=recording),
        Cut('cut2', start=0, duration=30, channel=1, recording=recording)
    ])

    mixed = cut_set.mix_same_recording_channels()
    assert len(mixed) == 1

    cut = mixed[0]
    assert isinstance(cut, MixedCut)
    assert len(cut.tracks) == 2
    assert cut.tracks[0].cut == cut_set[0]
    assert cut.tracks[1].cut == cut_set[1]
Example No. 2
def test_serialization(format, compressed):
    recording_set = RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='text/fixtures/mono_c0.wav'
                ),
                AudioSource(
                    type='command',
                    channels=[1],
                    source='cat text/fixtures/mono_c1.wav'
                )
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration=0.5
        )
    ])
    with NamedTemporaryFile(suffix='.gz' if compressed else '') as f:
        if format == 'yaml':
            recording_set.to_yaml(f.name)
            deserialized = RecordingSet.from_yaml(f.name)
        if format == 'json':
            recording_set.to_json(f.name)
            deserialized = RecordingSet.from_json(f.name)
    assert deserialized == recording_set
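Note: `format` and `compressed` are supplied by the test runner; a minimal pytest parametrization sketch (assumed here, not part of the original snippet) would be:

import pytest

@pytest.mark.parametrize('format', ['yaml', 'json'])
@pytest.mark.parametrize('compressed', [False, True])
def test_serialization(format, compressed):
    ...  # body as above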
Example No. 3
def test_serialization():
    audio_set = RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=[
                AudioSource(
                    type='file',
                    channel_ids=[0],
                    source='text/fixtures/mono_c0.wav'
                ),
                AudioSource(
                    type='command',
                    channel_ids=[1],
                    source='cat text/fixtures/mono_c1.wav'
                )
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration_seconds=0.5
        )
    ])
    with NamedTemporaryFile() as f:
        audio_set.to_yaml(f.name)
        deserialized = RecordingSet.from_yaml(f.name)
    assert deserialized == audio_set
Example No. 4
def test_audio_url_downloading():
    audio_source = AudioSource(
        type="url",
        channels=[0],
        source="https://github.com/lhotse-speech/lhotse/blob/master/test/fixtures/mono_c0.wav?raw=true",
    )
    audio_source.load_audio()
Example No. 5
def parse_utterance(
    audio: Any, root_path: Path
) -> Optional[Tuple[Recording, List[SupervisionSegment]]]:
    sampling_rate = int(audio["sample_rate"])
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=list(range(int(audio["channels"]))),
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(
            duration=Seconds(audio["duration"]), sampling_rate=sampling_rate
        ),
        sampling_rate=sampling_rate,
        duration=Seconds(audio["duration"]),
    )
    segments = []
    for seg in audio["segments"]:
        segments.append(
            SupervisionSegment(
                id=seg["sid"],
                recording_id=audio["aid"],
                start=Seconds(seg["begin_time"]),
                duration=round(Seconds(seg["end_time"] - seg["begin_time"]), ndigits=8),
                channel=0,
                language="English",
                speaker=seg["speaker"],
                text=seg["text_tn"],
            )
        )
    return recording, segments
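For reference, the dictionary layout that `parse_utterance` expects can be read off the keys it accesses; a hypothetical minimal payload and call (all values below are made up) would be:

from pathlib import Path

audio = {
    "aid": "AUD0001",
    "path": "audio/AUD0001.wav",
    "channels": 1,
    "sample_rate": 16000,
    "duration": 12.5,
    "segments": [{
        "sid": "AUD0001_S001",
        "begin_time": 0.0,
        "end_time": 3.2,
        "speaker": "spk-001",
        "text_tn": "HELLO WORLD",
    }],
}
recording, segments = parse_utterance(audio, root_path=Path("/data/corpus"))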
Example No. 6
def libri_cut():
    return Cut(
        channel=0,
        duration=16.04,
        features=Features(
            channels=0,
            duration=16.04,
            num_features=40,
            num_frames=1604,
            recording_id='recording-1',
            sampling_rate=16000,
            start=0.0,
            storage_path='test/fixtures/libri/storage/fc37eb69-43a8-4e6f-a302-646a76606b38.llc',
            storage_type='lilcom',
            type='fbank',
        ),
        recording=Recording(
            id='recording-1',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/libri/libri-1088-134315-0000.wav',
                )
            ],
            sampling_rate=16000,
            num_samples=256640,
            duration=16.04,
        ),
        id='849e13d8-61a2-4d09-a542-dac1aee1b544',
        start=0.0,
        supervisions=[],
    )
Example No. 7
def prepare_audio_single(audio_paths: List[Pathlike], ) -> RecordingSet:
    import soundfile as sf

    recordings = []
    for audio_path in tqdm(audio_paths, desc="Preparing audio"):
        session_name = audio_path.parts[-2]
        if audio_path.suffix == ".wav":
            audio_sf = sf.SoundFile(str(audio_path))
            num_frames = audio_sf.frames
            num_channels = audio_sf.channels
            samplerate = audio_sf.samplerate
        else:
            audio_sf, samplerate = read_sph(audio_path)
            num_channels, num_frames = audio_sf.shape
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=list(range(num_channels)),
                        source=str(audio_path),
                    )
                ],
                sampling_rate=samplerate,
                num_samples=num_frames,
                duration=num_frames / samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example No. 8
def dummy_recording():
    return Recording(
        id='irrelevant',
        sources=[AudioSource(type='file', channels=[0], source='irrelevant')],
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0)
Example No. 9
def prepare_audio_grouped(audio_paths: List[Pathlike], ) -> RecordingSet:
    import soundfile as sf

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    recordings = []
    for session_name, channel_paths in channel_wavs.items():
        audio_sf = sf.SoundFile(str(channel_paths[0]))

        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(type="file",
                                channels=[idx],
                                source=str(audio_path))
                    for idx, audio_path in enumerate(sorted(channel_paths))
                ],
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example No. 10
def parse_utterance(
        audio: Any,
        root_path: Path
) -> Optional[Tuple[Recording, List[SupervisionSegment]]]:
    # Opus-format audio would be decoded at 48kHz by force, with the original sampling rate being ignored.
    opus_decoding_sample_rate = 48000

    recording = Recording(id=audio['aid'],
                          sources=[AudioSource(type='file',
                                               channels=list(range(int(audio['channels']))),
                                               source=f'{root_path}/{audio["path"]}')],
                          num_samples=round(opus_decoding_sample_rate * Seconds(audio['duration']), ndigits=8),
                          sampling_rate=opus_decoding_sample_rate,
                          duration=Seconds(audio['duration'])).resample(int(audio['sample_rate']))
    segments = []
    for seg in audio['segments']:
        segments.append(SupervisionSegment(id=seg['sid'],
                                           recording_id=audio['aid'],
                                           start=Seconds(seg['begin_time']),
                                           duration=round(Seconds(seg['end_time'] - seg['begin_time']), ndigits=8),
                                           channel=0,
                                           language='English',
                                           speaker=seg['speaker'],
                                           text=seg['text_tn']))
    return recording, segments
Example No. 11
def parse_utterance(
    audio: Any, root_path: Path, subsets: Sequence
) -> Tuple[Recording, Dict[str, List[SupervisionSegment]]]:
    sampling_rate = 16000
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=[0],
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(duration=audio["duration"],
                                        sampling_rate=sampling_rate),
        sampling_rate=sampling_rate,
        duration=audio["duration"],
    )
    segments = defaultdict(dict)
    for sub in subsets:
        segments[sub] = []
    for seg in audio["segments"]:
        segment = SupervisionSegment(
            id=seg["sid"],
            recording_id=audio["aid"],
            start=seg["begin_time"],
            duration=add_durations(seg["end_time"], -seg["begin_time"],
                                   sampling_rate),
            language="Chinese",
            text=seg["text"].strip(),
        )
        for sub in seg["subsets"]:
            if sub in subsets:
                segments[sub].append(segment)
    return recording, segments
Example No. 12
def libri_cut():
    return Cut(
        channel=0,
        duration=16.04,
        features=Features(
            channels=0,
            duration=16.04,
            num_features=40,
            num_frames=1604,
            frame_shift=0.01,
            recording_id='recording-1',
            sampling_rate=16000,
            start=0.0,
            storage_path='test/fixtures/libri/storage',
            storage_key='30c2440c-93cb-4e83-b382-f2a59b3859b4.llc',
            storage_type='lilcom_files',
            type='fbank',
        ),
        recording=Recording(
            id='recording-1',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/libri/libri-1088-134315-0000.wav',
                )],
            sampling_rate=16000,
            num_samples=256640,
            duration=16.04,
        ),
        id='849e13d8-61a2-4d09-a542-dac1aee1b544',
        start=0.0,
        supervisions=[],
    )
Example No. 13
def dummy_recording():
    return Recording(
        id="irrelevant",
        sources=[AudioSource(type="file", channels=[0], source="irrelevant")],
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
    )
Example No. 14
def recording():
    return Recording(
        id='rec',
        sources=[AudioSource(type='file', channels=[0, 1], source='test/fixtures/stereo.wav')],
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0
    )
Example No. 15
def cut_with_relative_paths():
    return Cut('cut', 0, 10, 0,
               features=Features(type='fbank', num_frames=1000, num_features=40, sampling_rate=8000,
                                 storage_type='lilcom_files', storage_path='storage_dir', storage_key='feats.llc',
                                 start=0,
                                 duration=10),
               recording=Recording('rec', [AudioSource('file', [0], 'audio.wav')], 8000, 80000, 10.0)
               )
Example No. 16
def dummy_recording_set():
    return RecordingSet.from_recordings([
        Recording(
            id="rec1",
            sampling_rate=16000,
            num_samples=160000,
            duration=10,
            sources=[
                AudioSource(type="file", channels=[0], source="dummy.wav")
            ],
        )
    ])
Example No. 17
def recording():
    return Recording(
        id="rec",
        sources=[
            AudioSource(type="file",
                        channels=[0, 1],
                        source="test/fixtures/stereo.wav")
        ],
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
    )
Example No. 18
def dummy_recording_set():
    return RecordingSet.from_recordings([
        Recording(id='rec1',
                  sampling_rate=16000,
                  num_samples=160000,
                  duration=10,
                  sources=[
                      AudioSource(type='file',
                                  channels=[0],
                                  source='dummy.wav')
                  ])
    ])
Example No. 19
def test_recording_from_sphere(relative_path_depth, expected_source_path):
    rec = Recording.from_sphere('test/fixtures/stereo.sph',
                                relative_path_depth=relative_path_depth)
    assert rec == Recording(id='stereo',
                            sampling_rate=8000,
                            num_samples=8000,
                            duration=1.0,
                            sources=[
                                AudioSource(type='file',
                                            channels=[0, 1],
                                            source=expected_source_path)
                            ])
Example No. 20
def test_mix_same_recording_channels():
    recording = Recording(
        "rec",
        sampling_rate=8000,
        num_samples=30 * 8000,
        duration=30,
        sources=[
            AudioSource("file", channels=[0], source="irrelevant1.wav"),
            AudioSource("file", channels=[1], source="irrelevant2.wav"),
        ],
    )
    cut_set = CutSet.from_cuts([
        MonoCut("cut1", start=0, duration=30, channel=0, recording=recording),
        MonoCut("cut2", start=0, duration=30, channel=1, recording=recording),
    ])

    mixed = cut_set.mix_same_recording_channels()
    assert len(mixed) == 1

    cut = mixed[0]
    assert isinstance(cut, MixedCut)
    assert len(cut.tracks) == 2
    assert cut.tracks[0].cut == cut_set[0]
    assert cut.tracks[1].cut == cut_set[1]
Example No. 21
def dummy_recording_set_lazy():
    with NamedTemporaryFile(suffix=".jsonl.gz") as f:
        recs = RecordingSet.from_recordings([
            Recording(
                id="rec1",
                sampling_rate=16000,
                num_samples=160000,
                duration=10,
                sources=[
                    AudioSource(type="file", channels=[0], source="dummy.wav")
                ],
            )
        ])
        recs.to_file(f.name)
        f.flush()
        yield RecordingSet.from_jsonl_lazy(f.name)
Example No. 22
def prepare_audio_grouped(audio_paths: List[Pathlike], ) -> RecordingSet:
    import soundfile as sf

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    recordings = []
    for session_name, channel_paths in tqdm(channel_wavs.items(),
                                            desc="Processing audio files"):
        audio_sf = sf.SoundFile(str(channel_paths[0]))

        sources = []
        all_mono = True
        for idx, audio_path in enumerate(sorted(channel_paths)):
            audio = sf.SoundFile(str(audio_path))
            if audio.channels > 1:
                logging.warning(
                    f"Skipping recording {session_name} since it has a stereo channel"
                )
                all_mono = False
                break
            sources.append(
                AudioSource(type="file",
                            channels=[idx],
                            source=str(audio_path)))

        if not all_mono:
            continue

        recordings.append(
            Recording(
                id=session_name,
                sources=sources,
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example No. 23
def prepare_audio_grouped(
    audio_paths: List[Pathlike],
    channel_to_idx_map: Optional[Dict[str, Dict[str, int]]] = None,
) -> RecordingSet:

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-2], audio_paths)

    if channel_to_idx_map is None:
        channel_to_idx_map = defaultdict(dict)
    recordings = []
    for session_name, channel_paths in tqdm(channel_wavs.items(),
                                            desc="Preparing audio"):
        if session_name not in channel_to_idx_map:
            channel_to_idx_map[session_name] = {
                c: idx
                for idx, c in enumerate(["chanE", "chanF", "chan6", "chan7"])
            }
        audio_sf, samplerate = read_sph(channel_paths[0])

        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=[
                            channel_to_idx_map[session_name][audio_path.stem]
                        ],
                        source=str(audio_path),
                    ) for audio_path in sorted(channel_paths)
                    if audio_path.stem in channel_to_idx_map[session_name]
                ],
                sampling_rate=samplerate,
                num_samples=audio_sf.shape[1],
                duration=audio_sf.shape[1] / samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example No. 24
def cut_with_relative_paths():
    return MonoCut(
        "cut",
        0,
        10,
        0,
        features=Features(
            type="fbank",
            num_frames=1000,
            num_features=40,
            sampling_rate=8000,
            storage_type="lilcom_files",
            storage_path="storage_dir",
            storage_key="feats.llc",
            start=0,
            duration=10,
            frame_shift=0.01,
        ),
        recording=Recording("rec", [AudioSource("file", [0], "audio.wav")],
                            8000, 80000, 10.0),
    )
Example No. 25
def prepare_audio_single(audio_paths: List[Pathlike], ) -> RecordingSet:
    import soundfile as sf

    recording_manifest = defaultdict(dict)

    recordings = []
    for audio_path in audio_paths:
        session_name = audio_path.parts[-3]
        audio_sf = sf.SoundFile(str(audio_path))
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(type='file',
                                channels=list(range(audio_sf.channels)),
                                source=str(audio_path))
                ],
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example No. 26
def prepare_audio_single(audio_paths: List[Pathlike], ) -> RecordingSet:
    import soundfile as sf

    recordings = []
    for audio_path in tqdm(audio_paths, desc="Processing audio files"):
        session_name = audio_path.parts[-3]
        audio_sf = sf.SoundFile(str(audio_path))
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=list(range(audio_sf.channels)),
                        source=str(audio_path),
                    )
                ],
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example No. 27
def prepare_librispeech(
        corpus_dir: Pathlike,
        dataset_parts: Optional[Tuple[str]] = dataset_parts_mini,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: dataset part name, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, text)
        metadata = {}
        part_path = corpus_dir / part
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    idx, text = line.split(maxsplit=1)
                    audio_path = part_path / Path(idx.replace('-', '/')).parent / f'{idx}.flac'
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = LibriSpeechMetaData(audio_path=audio_path, audio_info=info[0], text=text)
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=metadata[idx].audio_info.length / metadata[idx].audio_info.rate
            )
            for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='English',
                speaker=re.sub(r'-.*', r'', idx),
                text=metadata[idx].text.strip()
            )
            for idx in audio.recordings
        )

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
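A minimal usage sketch for the recipe above; the corpus and output paths are placeholders, not taken from the snippet:

manifests = prepare_librispeech(
    corpus_dir='/data/LibriSpeech',        # hypothetical path
    dataset_parts=('dev-clean',),
    output_dir='manifests/librispeech',    # hypothetical path
)
recordings = manifests['dev-clean']['recordings']
supervisions = manifests['dev-clean']['supervisions']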
Example No. 28
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and SupervisionSet manifests.
    For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might not be handled yet.
    In particular, feats.scp files are ignored.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    with reco2dur.open() as f:
        for line in f:
            recording_id, dur = line.strip().split()
            durations[recording_id] = float(dur)

    recording_set = RecordingSet.from_recordings(
        Recording(id=recording_id,
                  sources=[
                      AudioSource(
                          type='command' if path_or_cmd.endswith('|') else 'file',
                          channels=[0],
                          source=path_or_cmd[:-1] if path_or_cmd.endswith('|') else path_or_cmd)
                  ],
                  sampling_rate=sampling_rate,
                  num_samples=int(durations[recording_id] * sampling_rate),
                  duration=durations[recording_id])
        for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [l.strip().split() for l in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(id=segment_id,
                               recording_id=recording_id,
                               start=float(start),
                               duration=float(end) - float(start),
                               channel=0,
                               text=texts[segment_id],
                               language=languages[segment_id],
                               speaker=speakers[segment_id],
                               gender=genders[speakers[segment_id]])
            for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader
            feature_set = FeatureSet.from_features(
                Features(type='kaldiio',
                         num_frames=mat.shape[0],
                         num_features=mat.shape[1],
                         frame_shift=frame_shift,
                         sampling_rate=sampling_rate,
                         start=0,
                         duration=mat.shape[0] * frame_shift,
                         storage_type=KaldiReader.name,
                         storage_path=str(feats_scp),
                         storage_key=utt_id,
                         recording_id=supervision_set[utt_id].recording_id
                         if supervision_set is not None else utt_id,
                         channels=0)
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp)))
        else:
            warnings.warn(f"Failed to import Kaldi 'feats.scp' to Lhotse: "
                          f"frame_shift must be not None. "
                          f"Feature import omitted.")

    return recording_set, supervision_set, feature_set
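A minimal usage sketch, assuming a standard Kaldi data directory containing wav.scp and reco2dur (the path below is a placeholder):

recordings, supervisions, features = load_kaldi_data_dir(
    path='data/train',      # hypothetical Kaldi data dir
    sampling_rate=16000,
)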
Example No. 29
def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f'No such directory: {speech_dir}'
    assert transcript_dir.is_dir(), f'No such directory: {transcript_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile(r'Answers_Spanish')
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    heroico_recitations_devtest_path_pattern = re.compile(r'Recordings_Spanish')
    heroico_recitations_train_path_pattern = re.compile(r'Recordings_Spanish')
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+")
    usma_native_path_pattern = re.compile(r'usma/native')
    usma_native_prompt_id_pattern = re.compile(r's\d+')
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r'nonnative.+\.wav')

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)

    transcripts = defaultdict(dict)
    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split('/')
            utt_id = '-'.join(['answers', spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir,
                                          heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['heroico-recitations', idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['usma', idx])
            transcripts[utt_id] = text

    # store utterance info
    audio_paths = speech_dir.rglob('*.wav')
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        path_components = wav_path.parts
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # store utterance info for Heroico Answers
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['answers', spk, pid])
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='answers',
                                             utterance_id=utt_id,
                                             transcript=transcripts[utt_id])
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # store utterance info for usma native data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # store utterance data for usma nonnative data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) <= 354 or int(pid) >= 562:
            # store utterance info for heroico recitations for train dataset
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations', spk, pid])
            trans_id = '-'.join(['heroico-recitations', pid])
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='heroico-recitations',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) > 354 and int(pid) < 562:
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations-repeats', spk, pid])
            trans_id = '-'.join(['heroico-recitations-repeats', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='devtest',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations-repeats',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        else:
            logging.warning(f'No such file: {wav_file}')

    audio_paths = speech_dir.rglob('*.wav')
    audio_files = [w for w in audio_paths]

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # skip files with no record
            if not uttdata[str(wav_file)]:
                continue
            # only process the current fold
            if uttdata[str(wav_file)].fold != fld:
                continue
            path_components = wav_path.parts
            prompt_id = wav_path.stem
            # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
            # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
            info = soundfile.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = '-'.join(
                [uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info,
                text=uttdata[str(wav_file)].transcript)

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(id=idx,
                      sources=[
                          AudioSource(type='file',
                                      channels=[0],
                                      source=str(metadata[idx].audio_path))
                      ],
                      sampling_rate=int(metadata[idx].audio_info.samplerate),
                      num_samples=metadata[idx].audio_info.frames,
                      duration=metadata[idx].audio_info.duration)
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(id=idx,
                               recording_id=idx,
                               start=0.0,
                               duration=audio.recordings[idx].duration,
                               channel=0,
                               language='Spanish',
                               speaker=idx.split('-')[-2],
                               text=metadata[idx].text)
            for idx in audio.recordings)

        validate_recordings_and_supervisions(audio, supervision)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{fld}.json')
            audio.to_json(output_dir / f'recordings_{fld}.json')

        manifests[fld] = {'recordings': audio, 'supervisions': supervision}

    return manifests
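A minimal usage sketch; the directory paths are placeholders:

manifests = prepare_heroico(
    speech_dir='/data/heroico/speech',            # hypothetical path
    transcript_dir='/data/heroico/transcripts',   # hypothetical path
    output_dir='manifests/heroico',
)
train = manifests['train']    # {'recordings': ..., 'supervisions': ...}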
Example No. 30
def prepare_librimix(
    librimix_csv: Pathlike,
    output_dir: Optional[Pathlike] = None,
    with_precomputed_mixtures: bool = False,
    sampling_rate: int = 16000,
    min_segment_seconds: Seconds = 3.0,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    import pandas as pd

    assert Path(librimix_csv).is_file(), f"No such file: {librimix_csv}"
    df = pd.read_csv(librimix_csv)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)

    # First, create the audio manifest that specifies the pairs of source recordings
    # to be mixed together.
    audio_sources = RecordingSet.from_recordings(
        Recording(
            id=row["mixture_ID"],
            sources=[
                AudioSource(
                    type="file", channels=[0], source=row["source_1_path"]),
                AudioSource(
                    type="file", channels=[1], source=row["source_2_path"]),
            ],
            sampling_rate=sampling_rate,
            num_samples=int(row["length"]),
            duration=row["length"] / sampling_rate,
        ) for idx, row in df.iterrows()
        if row["length"] / sampling_rate > min_segment_seconds)
    supervision_sources = make_corresponding_supervisions(audio_sources)
    validate_recordings_and_supervisions(audio_sources, supervision_sources)
    if output_dir is not None:
        audio_sources.to_file(output_dir /
                              "librimix_recordings_sources.jsonl.gz")
        supervision_sources.to_file(output_dir /
                                    "librimix_supervisions_sources.jsonl.gz")
    manifests["sources"] = {
        "recordings": audio_sources,
        "supervisions": supervision_sources,
    }

    # When requested, create an audio manifest for the pre-computed mixtures.
    # A different way of performing the mix would be using Lhotse's on-the-fly
    # overlaying of audio Cuts.
    if with_precomputed_mixtures:
        audio_mix = RecordingSet.from_recordings(
            Recording(
                id=row["mixture_ID"],
                sources=[
                    AudioSource(
                        type="file", channels=[0], source=row["mixture_path"]),
                ],
                sampling_rate=sampling_rate,
                num_samples=int(row["length"]),
                duration=row["length"] / sampling_rate,
            ) for idx, row in df.iterrows()
            if row["length"] / sampling_rate > min_segment_seconds)
        supervision_mix = make_corresponding_supervisions(audio_mix)
        validate_recordings_and_supervisions(audio_mix, supervision_mix)
        if output_dir is not None:
            audio_mix.to_file(output_dir / "librimix_recordings_mix.jsonl.gz")
            supervision_mix.to_file(output_dir /
                                    "librimix_supervisions_mix.jsonl.gz")
        manifests["premixed"] = {
            "recordings": audio_mix,
            "supervisions": supervision_mix,
        }

    # When the LibriMix CSV specifies noises, we create a separate RecordingSet for them,
    # so that we can extract their features and overlay them as Cuts later.
    if "noise_path" in df:
        audio_noise = RecordingSet.from_recordings(
            Recording(
                id=row["mixture_ID"],
                sources=[
                    AudioSource(
                        type="file", channels=[0], source=row["noise_path"]),
                ],
                sampling_rate=sampling_rate,
                num_samples=int(row["length"]),
                duration=row["length"] / sampling_rate,
            ) for idx, row in df.iterrows()
            if row["length"] / sampling_rate > min_segment_seconds)
        supervision_noise = make_corresponding_supervisions(audio_noise)
        validate_recordings_and_supervisions(audio_noise, supervision_noise)
        if output_dir is not None:
            audio_noise.to_file(output_dir /
                                "librimix_recordings_noise.jsonl.gz")
            supervision_noise.to_file(output_dir /
                                      "libirmix_supervisions_noise.jsonl.gz")
        manifests["noise"] = {
            "recordings": audio_noise,
            "supervisions": supervision_noise,
        }

    return manifests
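A minimal usage sketch; the CSV path is a placeholder:

manifests = prepare_librimix(
    librimix_csv='/data/LibriMix/metadata/mixture_train_mix_both.csv',  # hypothetical path
    output_dir='manifests/librimix',
    with_precomputed_mixtures=True,
)
source_recordings = manifests['sources']['recordings']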