Example #1
0
def parse_utterance(
    audio: Any, root_path: Path, subsets: Sequence
) -> Tuple[Recording, Dict[str, List[SupervisionSegment]]]:
    """Convert one manifest entry into a ``Recording`` plus per-subset supervisions.

    :param audio: manifest dict with keys ``aid``, ``path``, ``duration`` and
        ``segments`` (each segment has ``sid``, ``begin_time``, ``end_time``,
        ``text`` and ``subsets``).
    :param root_path: directory that ``audio["path"]`` is relative to.
    :param subsets: the subset names to collect segments for; segments tagged
        with other subsets are ignored.
    :return: the recording and a dict mapping each requested subset name to
        its list of ``SupervisionSegment`` objects (possibly empty).
    """
    # The corpus audio is assumed to be 16 kHz — TODO confirm against the data.
    sampling_rate = 16000
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=[0],
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(duration=audio["duration"],
                                        sampling_rate=sampling_rate),
        sampling_rate=sampling_rate,
        duration=audio["duration"],
    )
    # Pre-seed every requested subset with an empty list so callers always see
    # all keys.  (The original used defaultdict(dict), whose default factory
    # produced a dict — appending to an auto-created value would have crashed;
    # a plain dict comprehension is both correct and clearer.)
    segments: Dict[str, List[SupervisionSegment]] = {sub: [] for sub in subsets}
    for seg in audio["segments"]:
        segment = SupervisionSegment(
            id=seg["sid"],
            recording_id=audio["aid"],
            start=seg["begin_time"],
            # Subtract begin from end via add_durations to keep the result
            # consistent with the sample grid.
            duration=add_durations(seg["end_time"], -seg["begin_time"],
                                   sampling_rate),
            language="Chinese",
            text=seg["text"].strip(),
        )
        # A segment may belong to several subsets; file it under each one we
        # were asked for.
        for sub in seg["subsets"]:
            if sub in subsets:
                segments[sub].append(segment)
    return recording, segments
Example #2
0
def test_cut_load_custom_recording_pad_left():
    """Left-padding a cut must prepend silence to a dynamically attached recording."""
    sampling_rate = 16000
    duration = 52.4
    num_samples = compute_num_samples(duration, sampling_rate)
    audio = np.random.randn(1, num_samples).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        os.fsync(f)
        recording = Recording.from_file(f.name)

        # Note: MonoCut doesn't normally have a "my_favorite_song" attribute
        #       or a "load_my_favorite_song()" method.
        #       We are dynamically extending it.
        cut = MonoCut(
            id="x",
            start=0,
            duration=duration,
            channel=0,
            recording=dummy_recording(0, duration=duration),
        )
        cut.my_favorite_song = recording

        padded = cut.pad(duration=60.0, direction="left")
        restored = padded.load_my_favorite_song()

        # 60 s at 16 kHz -> 960000 samples total after padding.
        assert restored.shape == (1, 960000)

        pad_len = restored.shape[1] - audio.shape[1]
        # The left-padded prefix is silence...
        np.testing.assert_almost_equal(0, restored[:, :pad_len])
        # ...and the original signal sits at the right end, untouched.
        np.testing.assert_almost_equal(audio, restored[:, pad_len:])
Example #3
0
def parse_utterance(
    audio: Any, root_path: Path
) -> Optional[Tuple[Recording, List[SupervisionSegment]]]:
    """Turn one manifest entry into a ``Recording`` and its supervision segments.

    :param audio: manifest dict with ``aid``, ``path``, ``sample_rate``,
        ``channels``, ``duration`` and a ``segments`` list.
    :param root_path: directory that ``audio["path"]`` is relative to.
    :return: the recording paired with one ``SupervisionSegment`` per entry
        in ``audio["segments"]``.
    """
    sampling_rate = int(audio["sample_rate"])
    total_duration = Seconds(audio["duration"])
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=list(range(int(audio["channels"]))),
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(
            duration=total_duration, sampling_rate=sampling_rate
        ),
        sampling_rate=sampling_rate,
        duration=total_duration,
    )
    # One supervision per manifest segment; durations are rounded to 8 decimal
    # places to avoid float-precision noise.
    segments = [
        SupervisionSegment(
            id=seg["sid"],
            recording_id=audio["aid"],
            start=Seconds(seg["begin_time"]),
            duration=round(Seconds(seg["end_time"] - seg["begin_time"]), ndigits=8),
            channel=0,
            language="English",
            speaker=seg["speaker"],
            text=seg["text_tn"],
        )
        for seg in audio["segments"]
    ]
    return recording, segments
Example #4
0
def test_cut_load_custom_recording_truncate():
    """Truncating a cut must also truncate a dynamically attached recording."""
    sampling_rate = 16000
    duration = 52.4
    n_samples = compute_num_samples(duration, sampling_rate)
    audio = np.random.randn(1, n_samples).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        os.fsync(f)
        recording = Recording.from_file(f.name)

        # Note: a cut doesn't normally have a "my_favorite_song" attribute
        #       or a "load_my_favorite_song()" method.
        #       We are dynamically extending it.
        cut = dummy_cut(0, duration=duration)
        cut.my_favorite_song = recording

        truncated = cut.truncate(duration=5.0)
        restored = truncated.load_my_favorite_song()

        # 5 s at 16 kHz -> 80000 samples.
        assert restored.shape == (1, 80000)
        # The surviving prefix matches the original signal exactly.
        np.testing.assert_almost_equal(audio[:, :80000], restored)