Example No. 1
def test_augmentation_chain_randomized(
    recording: Recording,
    rir: Recording,
    target_sampling_rate: int,
    sp_factor: float,
    vp_factor: float,
    reverb: bool,
    resample_first: bool,
    cut_duration: Seconds,
):
    if resample_first:
        recording_aug = (recording.resample(target_sampling_rate).
                         perturb_speed(sp_factor).perturb_volume(vp_factor))
    else:
        recording_aug = (recording.perturb_speed(sp_factor).resample(
            target_sampling_rate).perturb_volume(vp_factor))
    if reverb:
        recording_aug = recording_aug.reverb_rir(rir)

    audio_aug = recording_aug.load_audio()
    assert audio_aug.shape[1] == recording_aug.num_samples

    cut_aug = MonoCut(
        id="dummy",
        start=0.5125,
        duration=cut_duration,
        channel=0,
        recording=recording_aug,
    )
    assert cut_aug.load_audio().shape[1] == cut_aug.num_samples
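The chain exercised by this test can also be used directly for offline augmentation. Below is a minimal sketch, assuming "utterance.wav" and "rir.wav" are placeholder paths to an utterance and a room impulse response; all methods shown are the same ones used in the test above, and they are applied lazily until load_audio() is called.

from lhotse import Recording

# Placeholder paths -- substitute real files.
recording = Recording.from_file("utterance.wav")
rir = Recording.from_file("rir.wav")

# Each call returns a new, lazily-evaluated Recording manifest.
augmented = (
    recording
    .resample(16000)
    .perturb_speed(1.1)
    .perturb_volume(0.8)
    .reverb_rir(rir)
)
samples = augmented.load_audio()  # (num_channels, num_samples) numpy array
assert samples.shape[1] == augmented.num_samples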
Example No. 2
def mixed_overlapping_cut_set():
    """
    Input mixed cut::
        |---------------mixedcut--------------------|
        |--------rec1 0-30s--------|
                     |-------rec2 15-45s--------|
         |---sup1--|         |-----sup3-----|
                 |sup2|
    """
    cut_set = CutSet.from_cuts([
        MonoCut(
            'cut1', start=0, duration=30, channel=0,
            recording=Recording(
                id='rec1', sources=[], sampling_rate=16000, num_samples=160000, duration=60.0
            ),
            supervisions=[
                SupervisionSegment('sup1', 'rec1', start=1.5, duration=10.5),
                SupervisionSegment('sup2', 'rec1', start=10, duration=6),
            ]
        ).mix(
            MonoCut(
                'cut2', start=15, duration=30, channel=0,
                recording=Recording(
                    id='rec2', sources=[], sampling_rate=16000, num_samples=160000, duration=60.0
                ),
                supervisions=[
                    SupervisionSegment('sup3', 'rec2', start=8, duration=18),
                ]
            ),
            offset_other_by=15.0
        )
    ])
    assert isinstance(cut_set[0], MixedCut)
    return cut_set
Example No. 3
def test_trim_to_supervisions_mixed_cuts():
    cut_set = CutSet.from_cuts([
        Cut('cut1',
            start=0,
            duration=30,
            channel=0,
            recording=Recording(id='rec1',
                                sources=[],
                                sampling_rate=16000,
                                num_samples=160000,
                                duration=10.0),
            supervisions=[
                SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
                SupervisionSegment('sup2', 'rec1', start=10, duration=5),
                SupervisionSegment('sup3', 'rec1', start=20, duration=8),
            ]).append(
                Cut('cut2',
                    start=0,
                    duration=30,
                    channel=0,
                    recording=Recording(id='rec1',
                                        sources=[],
                                        sampling_rate=16000,
                                        num_samples=160000,
                                        duration=10.0),
                    supervisions=[
                        SupervisionSegment('sup4',
                                           'rec1',
                                           start=0,
                                           duration=30),
                    ]))
    ])
    assert isinstance(cut_set[0], MixedCut)
    cuts = cut_set.trim_to_supervisions()
    assert len(cuts) == 4
    # After "trimming", the MixedCut "decayed" into simple, unmixed cuts, as they did not overlap
    assert all(isinstance(cut, Cut) for cut in cuts)
    assert all(len(cut.supervisions) == 1 for cut in cuts)
    assert all(cut.supervisions[0].start == 0 for cut in cuts)
    cut = cuts[0]
    # Check that the cuts preserved their start/duration/supervisions after trimming
    assert cut.start == 1.5
    assert cut.duration == 8.5
    assert cut.supervisions[0].id == 'sup1'
    cut = cuts[1]
    assert cut.start == 10
    assert cut.duration == 5
    assert cut.supervisions[0].id == 'sup2'
    cut = cuts[2]
    assert cut.start == 20
    assert cut.duration == 8
    assert cut.supervisions[0].id == 'sup3'
    cut = cuts[3]
    assert cut.start == 0
    assert cut.duration == 30
    assert cut.supervisions[0].id == 'sup4'
Example No. 4
def test_resample_opus():
    # Test that reading OPUS files after resampling
    # does not raise an exception.
    r = Recording.from_file("test/fixtures/mono_c0.opus")
    r.load_audio()
    r1 = r.resample(24000)
    r1.load_audio()
Example No. 5
def test_audio_caching_disabled_works():
    lhotse.set_caching_enabled(False)  # Disable caching.

    np.random.seed(89)  # Reproducibility.

    # Prepare two different waveforms.
    noise1 = np.random.rand(1, 32000).astype(np.float32)
    noise2 = np.random.rand(1, 32000).astype(np.float32)
    # Sanity check -- the noises are different
    assert np.abs(noise1 - noise2).sum() != 0

    # Save the first waveform in a file.
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(noise1), sample_rate=16000)
        recording = Recording.from_file(f.name)

        # Read the audio -- should be equal to noise1.
        audio = recording.load_audio()
        np.testing.assert_almost_equal(audio, noise1)

        # Save noise2 to the same location.
        torchaudio.save(f.name, torch.from_numpy(noise2), sample_rate=16000)

        # Read the audio again -- since caching is disabled, the file is
        # re-read from disk, so the result should be equal to noise2.
        audio = recording.load_audio()
        np.testing.assert_almost_equal(audio, noise2)
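For contrast, a minimal sketch of the opposite setting, assuming the cache behaves symmetrically (i.e. with caching enabled, a repeated load_audio() is served from the cache even after the underlying file changes):

import numpy as np
import torch
import torchaudio
from tempfile import NamedTemporaryFile

import lhotse
from lhotse import Recording

lhotse.set_caching_enabled(True)  # Enable caching.

noise1 = np.random.rand(1, 32000).astype(np.float32)
noise2 = np.random.rand(1, 32000).astype(np.float32)

with NamedTemporaryFile(suffix=".wav") as f:
    torchaudio.save(f.name, torch.from_numpy(noise1), sample_rate=16000)
    recording = Recording.from_file(f.name)
    audio = recording.load_audio()  # reads and caches noise1

    # Overwrite the file with a different waveform.
    torchaudio.save(f.name, torch.from_numpy(noise2), sample_rate=16000)

    # The second read is served from the cache -- still (approximately) noise1.
    audio = recording.load_audio()
    np.testing.assert_almost_equal(audio, noise1)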
Example No. 6
def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import Features, MonoCut, Recording, SupervisionSegment
    from lhotse.array import deserialize_array
    from lhotse.cut import MixedCut

    if "shape" in data or "array" in data:
        return deserialize_array(data)
    if "sources" in data:
        return Recording.from_dict(data)
    if "num_features" in data:
        return Features.from_dict(data)
    if "type" not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop("type")
    if cut_type == "MonoCut":
        return MonoCut.from_dict(data)
    if cut_type == "Cut":
        warnings.warn(
            "Your manifest was created with Lhotse version earlier than v0.8, when MonoCut was called Cut. "
            "Please re-generate it with Lhotse v0.8 as it might stop working in a future version "
            "(using manifest.from_file() and then manifest.to_file() should be sufficient)."
        )
        return MonoCut.from_dict(data)
    if cut_type == "MixedCut":
        return MixedCut.from_dict(data)
    raise ValueError(
        f"Unexpected cut type during deserialization: '{cut_type}'")
Example No. 7
def _process_file(
    file_path: Pathlike,
    speaker_metadata: Dict[str, SpeakerMetadata],
) -> Tuple[Recording, SupervisionSegment]:
    """
    Process a single wav file and return a Recording and a SupervisionSegment.
    """
    speaker_id = file_path.parent.parent.stem
    session_id = file_path.parent.stem
    uttid = file_path.stem
    recording_id = f"{speaker_id}-{session_id}-{uttid}"
    recording = Recording.from_file(file_path, recording_id=recording_id)
    supervision = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        speaker=speaker_id,
        gender=speaker_metadata[speaker_id].gender,
        start=0.0,
        duration=recording.duration,
        custom={
            "speaker_name": speaker_metadata[speaker_id].name,
            "nationality": speaker_metadata[speaker_id].nationality,
            "split": speaker_metadata[speaker_id].split,
        },
    )
    return recording, supervision
Example No. 8
def test_cut_load_custom_recording_pad_left():
    sampling_rate = 16000
    duration = 52.4
    audio = np.random.randn(1, compute_num_samples(
        duration, sampling_rate)).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        os.fsync(f)
        recording = Recording.from_file(f.name)

        # Note: MonoCut doesn't normally have a "my_favorite_song" attribute
        #       or a "load_my_favorite_song()" method.
        #       We are dynamically extending it via the custom fields mechanism.
        cut = MonoCut(
            id="x",
            start=0,
            duration=duration,
            channel=0,
            recording=dummy_recording(0, duration=duration),
        )
        cut.my_favorite_song = recording

        cut_pad = cut.pad(duration=60.0, direction="left")

        restored_audio = cut_pad.load_my_favorite_song()
        assert restored_audio.shape == (1, 960000)  # 16000 * 60

        np.testing.assert_almost_equal(0, restored_audio[:, :-audio.shape[1]])
        np.testing.assert_almost_equal(audio, restored_audio[:,
                                                             -audio.shape[1]:])
Example No. 9
def prepare_tedlium(
    tedlium_root: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / "sph").glob("*.sph"))
        stms = list((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM...")
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        ))
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            recordings.to_file(output_dir /
                               f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir /
                                 f"tedlium_supervisions_{split}.jsonl.gz")

    return corpus
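A minimal usage sketch; the corpus path is a placeholder for wherever the TED-LIUM v3 release has been unpacked:

from lhotse.recipes import prepare_tedlium

corpus = prepare_tedlium("/data/TEDLIUM_release-3")  # placeholder path
train_recordings = corpus["train"]["recordings"]
train_supervisions = corpus["train"]["supervisions"]
print(len(train_recordings), "recordings /", len(train_supervisions), "supervision segments")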
Example No. 10
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VCTK manifests, which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav)
        for wav in (corpus_dir / "wav48").rglob("*.wav"))
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    #                 I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example No. 11
def process_file(
    file_path: Any,
    transcript_dict: Dict
) -> Tuple[Recording, SupervisionSegment]:
    """
    Process a single wav file and return a Recording and a SupervisionSegment.
    """
    try:
        speaker_id = Path(file_path).stem
        uttid = speaker_id
        recording_id = speaker_id
        recording = Recording.from_file(file_path)
        text = transcript_dict[uttid]
        supervision = SupervisionSegment(
            id=recording_id,
            recording_id=recording_id,
            speaker=speaker_id,
            start=0.0,
            duration=recording.duration,
            channel=0,
            language="Chinese",
            text=text.strip(),
        )
        return recording, supervision
    except Exception as e:
        logging.error(f"process_file error for {file_path}: {e}")
        return None, None
Example No. 12
def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    recordings = []
    supervisions = []
    for meta in load_jsonl(raw_manifest_path):
        recording = Recording.from_file(corpus_dir / meta["audio_filepath"])
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=recording.id,
                recording_id=recording.id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=meta["text"],
                speaker=ID2SPEAKER[speaker_id],
                gender=ID2GENDER[speaker_id],
                custom={
                    "text_punct": meta["text_normalized"],
                    "split": clean_or_other
                },
            ))
    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)
    return recordings, supervisions
Example No. 13
def test_mix_same_recording_channels():
    recording = Recording('rec',
                          sampling_rate=8000,
                          num_samples=30 * 8000,
                          duration=30,
                          sources=[
                              AudioSource('file',
                                          channels=[0],
                                          source='irrelevant1.wav'),
                              AudioSource('file',
                                          channels=[1],
                                          source='irrelevant2.wav')
                          ])
    cut_set = CutSet.from_cuts([
        Cut('cut1', start=0, duration=30, channel=0, recording=recording),
        Cut('cut2', start=0, duration=30, channel=1, recording=recording)
    ])

    mixed = cut_set.mix_same_recording_channels()
    assert len(mixed) == 1

    cut = mixed[0]
    assert isinstance(cut, MixedCut)
    assert len(cut.tracks) == 2
    assert cut.tracks[0].cut == cut_set[0]
    assert cut.tracks[1].cut == cut_set[1]
Example No. 14
def prepare_tedlium(
        tedlium_root: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ('train', 'dev', 'test'):
        root = tedlium_root / 'legacy' / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / 'sph').glob('*.sph')
        )
        stms = list((root / 'stm').glob('*.stm'))
        assert len(stms) == len(recordings), f'Mismatch: found {len(recordings)} ' \
                                             f'sphere files and {len(stms)} STM files. ' \
                                             f'You might be missing some parts of TEDLIUM...'
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = ' '.join(words).replace('{NOISE}', '[NOISE]')
                    if text == 'ignore_time_segment_in_scoring':
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f'{rec_id}-{idx}',
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language='English',
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            recordings.to_json(output_dir / f'{split}_recordings.json')
            supervisions.to_json(output_dir / f'{split}_supervisions.json')

    return corpus
Example No. 15
def test_cut_with_audio_move_to_memory():
    path = "test/fixtures/mono_c0.wav"
    cut = dummy_cut(0, duration=0.5).drop_recording()
    cut.recording = Recording.from_file(path)

    memory_cut = cut.move_to_memory()

    np.testing.assert_equal(memory_cut.load_audio(), cut.load_audio())
Example No. 16
def cut_with_relative_paths():
    return Cut('cut', 0, 10, 0,
               features=Features(type='fbank', num_frames=1000, num_features=40, sampling_rate=8000,
                                 storage_type='lilcom_files', storage_path='storage_dir', storage_key='feats.llc',
                                 start=0,
                                 duration=10),
               recording=Recording('rec', [AudioSource('file', [0], 'audio.wav')], 8000, 80000, 10.0)
               )
Example No. 17
def recording():
    return Recording(
        id='rec',
        sources=[AudioSource(type='file', channels=[0, 1], source='test/fixtures/stereo.wav')],
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0
    )
Example No. 18
def recording(file_source):
    return Recording(
        id="rec",
        sources=[file_source],
        sampling_rate=8000,
        num_samples=4000,
        duration=0.5,
    )
Example No. 19
def prepare_single_babel_language(corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None):
    manifests = defaultdict(dict)
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(Recording.from_sphere(p) for p in audio_dir.glob('*.sph'))
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=BABELCODE2LANG[lang_code],
                        speaker=speaker,
                    )
                )
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        manifests[split]['supervisions'] = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(
            manifests[split]['recordings'],
            manifests[split]['supervisions']
        )

        if output_dir is not None:
            output_dir = Path(output_dir)
            language = BABELCODE2LANG[lang_code]
            # Rename 'training' to 'train' only in the output file names;
            # the manifests dict keeps the original split key.
            save_split = 'train' if split == 'training' else split
            manifests[split]['recordings'].to_json(output_dir / f'recordings_{language}_{save_split}.json')
            manifests[split]['supervisions'].to_json(output_dir / f'supervisions_{language}_{save_split}.json')

    return manifests
Example No. 20
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks,
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example No. 21
    def __post_init__(self):
        if isinstance(self.rir, dict):
            from lhotse import Recording

            # Pass a shallow copy of the RIR dict since `from_dict()` pops the `sources` key.
            self.rir = Recording.from_dict(self.rir.copy())
        assert all(
            c < self.rir.num_channels for c in
            self.rir_channels), "Invalid channel index in `rir_channels`"
Example No. 22
def test_cut_with_audio_move_to_memory_large_offset():
    path = "test/fixtures/mono_c0.wav"
    cut = dummy_cut(0, duration=0.1).drop_recording()
    cut.recording = Recording.from_file(path)
    cut.start = 0.4
    assert isclose(cut.end, 0.5)

    memory_cut = cut.move_to_memory()

    np.testing.assert_equal(memory_cut.load_audio(), cut.load_audio())
Example No. 23
def random_cut_set(n_cuts=100) -> CutSet:
    return CutSet.from_cuts(
        MonoCut(id=uuid4(),
                start=round(random.uniform(0, 5), ndigits=8),
                duration=round(random.uniform(3, 10), ndigits=8),
                channel=0,
                recording=Recording(id=uuid4(),
                                    sources=[],
                                    sampling_rate=16000,
                                    num_samples=1600000,
                                    duration=100.0)) for _ in range(n_cuts))
Example No. 24
def test_cut_trim_to_supervisions_extend_handles_end_of_recording(mono_cut):
    """
    Scenario::

        |----------Recording---------|
        |---Sup1----|       |--Sup2--|
        |------------Cut-------------|

    Into::

        |----------Recording---------|
        |---Cut1----|     |---Cut2---|
        |---Sup1----|       |--Sup2--|
    """
    cut = MonoCut(
        id="X",
        start=0.0,
        duration=10.0,
        channel=0,
        supervisions=[
            SupervisionSegment(id="X",
                               recording_id="X",
                               start=0.0,
                               duration=4.0),
            SupervisionSegment(id="X",
                               recording_id="X",
                               start=7.0,
                               duration=3.0),
        ],
        recording=Recording(id="X",
                            sources=[],
                            sampling_rate=8000,
                            num_samples=80000,
                            duration=10.0),
    )

    cuts = cut.trim_to_supervisions(min_duration=4.0)

    assert len(cuts) == 2
    c1, c2 = cuts

    assert c1.start == 0
    assert c1.duration == 4.0
    assert len(c1.supervisions) == 1
    (c1_s1, ) = c1.supervisions
    assert c1_s1.start == 0.0
    assert c1_s1.duration == 4.0

    assert c2.start == 6.5
    assert c2.duration == 3.5
    assert len(c2.supervisions) == 1
    (c2_s1, ) = c2.supervisions
    assert c2_s1.start == 0.5
    assert c2_s1.duration == 3.0
Example No. 25
def recording():
    return Recording(
        id="rec",
        sources=[
            AudioSource(type="file",
                        channels=[0, 1],
                        source="test/fixtures/stereo.wav")
        ],
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
    )
Example No. 26
def with_recording(self, sampling_rate: int,
                   num_samples: int) -> Recording:
    f = NamedTemporaryFile('wb', suffix='.wav')
    self.files.append(f)
    duration = num_samples / sampling_rate
    samples = np.random.rand(num_samples)
    soundfile.write(f.name, samples, samplerate=sampling_rate)
    return Recording(
        id=str(uuid4()),
        sources=[AudioSource(type='file', channels=[0], source=f.name)],
        sampling_rate=sampling_rate,
        num_samples=num_samples,
        duration=duration)
Example No. 27
def make_recording(sampling_rate: int, num_samples: int) -> Recording:
    # The idea is that we're going to write to a temporary file with a sine wave recording
    # of specified duration and sampling rate, and clean up only after the test is executed.
    with NamedTemporaryFile('wb', suffix='.wav') as f:
        duration = num_samples / sampling_rate
        samples: np.ndarray = np.sin(2 * np.pi * np.arange(0, num_samples) /
                                     sampling_rate)
        soundfile.write(f, samples, samplerate=sampling_rate)
        yield Recording(
            id=f'recording-{sampling_rate}-{duration}',
            sources=[AudioSource(type='file', channels=[0], source=f.name)],
            sampling_rate=sampling_rate,
            num_samples=num_samples,
            duration=duration)
Example No. 28
def test_cut_move_to_memory_audio_serialization():
    path = "test/fixtures/mono_c0.wav"
    cut = dummy_cut(0, duration=0.5).drop_recording()
    cut.recording = Recording.from_file(path)

    cut_with_audio = cut.move_to_memory()

    assert cut.custom is None  # original cut is unmodified

    data = cut_with_audio.to_dict()
    cut_deserialized = MonoCut.from_dict(data)

    np.testing.assert_equal(cut_deserialized.load_audio(),
                            cut_with_audio.load_audio())
Example No. 29
def recording_set():
    return RecordingSet.from_recordings([
        Recording(id='x',
                  sources=[
                      AudioSource(type='file',
                                  channels=[0],
                                  source='test/fixtures/mono_c0.wav'),
                      AudioSource(type='command',
                                  channels=[1],
                                  source='cat test/fixtures/mono_c1.wav')
                  ],
                  sampling_rate=8000,
                  num_samples=4000,
                  duration=0.5)
    ])
Example No. 30
def cut_set():
    cut = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=Features(
            type="fbank",
            num_frames=100,
            num_features=40,
            frame_shift=0.01,
            sampling_rate=16000,
            start=0.0,
            duration=10.0,
            storage_type="lilcom",
            storage_path="irrelevant",
            storage_key="irrelevant",
        ),
        recording=Recording(
            id="rec-1",
            sampling_rate=16000,
            num_samples=160000,
            duration=10.0,
            sources=[
                AudioSource(type="file", channels=[0], source="irrelevant")
            ],
        ),
        supervisions=[
            SupervisionSegment(id="sup-1",
                               recording_id="irrelevant",
                               start=0.5,
                               duration=6.0),
            SupervisionSegment(id="sup-2",
                               recording_id="irrelevant",
                               start=7.0,
                               duration=2.0),
        ],
    )
    return CutSet.from_cuts([
        cut,
        fastcopy(cut, id="cut-nosup", supervisions=[]),
        fastcopy(cut, id="cut-norec", recording=None),
        fastcopy(cut, id="cut-nofeat", features=None),
        cut.pad(duration=30.0, direction="left"),
        cut.pad(duration=30.0, direction="right"),
        cut.pad(duration=30.0, direction="both"),
        cut.mix(cut, offset_other_by=5.0, snr=8),
    ])