def test_augmentation_chain_randomized(
    recording: Recording,
    rir: Recording,
    target_sampling_rate: int,
    sp_factor: float,
    vp_factor: float,
    reverb: bool,
    resample_first: bool,
    cut_duration: Seconds,
):
    if resample_first:
        recording_aug = (
            recording.resample(target_sampling_rate)
            .perturb_speed(sp_factor)
            .perturb_volume(vp_factor)
        )
    else:
        recording_aug = (
            recording.perturb_speed(sp_factor)
            .resample(target_sampling_rate)
            .perturb_volume(vp_factor)
        )
    if reverb:
        recording_aug = recording_aug.reverb_rir(rir)

    audio_aug = recording_aug.load_audio()
    assert audio_aug.shape[1] == recording_aug.num_samples

    cut_aug = MonoCut(
        id="dummy",
        start=0.5125,
        duration=cut_duration,
        channel=0,
        recording=recording_aug,
    )
    assert cut_aug.load_audio().shape[1] == cut_aug.num_samples
def mixed_overlapping_cut_set():
    """
    Input mixed cut::

        |---------------mixedcut--------------------|
        |--------rec1 0-30s--------|
                       |-------rec2 15-45s--------|
          |---sup1--|
                               |-----sup3-----|
                  |sup2|
    """
    cut_set = CutSet.from_cuts([
        MonoCut(
            'cut1',
            start=0,
            duration=30,
            channel=0,
            recording=Recording(
                id='rec1', sources=[], sampling_rate=16000, num_samples=160000, duration=60.0
            ),
            supervisions=[
                SupervisionSegment('sup1', 'rec1', start=1.5, duration=10.5),
                SupervisionSegment('sup2', 'rec1', start=10, duration=6),
            ],
        ).mix(
            MonoCut(
                'cut2',
                start=15,
                duration=30,
                channel=0,
                recording=Recording(
                    id='rec2', sources=[], sampling_rate=16000, num_samples=160000, duration=60.0
                ),
                supervisions=[
                    SupervisionSegment('sup3', 'rec2', start=8, duration=18),
                ],
            ),
            offset_other_by=15.0,
        )
    ])
    assert isinstance(cut_set[0], MixedCut)
    return cut_set
def test_trim_to_supervisions_mixed_cuts():
    cut_set = CutSet.from_cuts([
        Cut(
            'cut1',
            start=0,
            duration=30,
            channel=0,
            recording=Recording(
                id='rec1', sources=[], sampling_rate=16000, num_samples=160000, duration=10.0
            ),
            supervisions=[
                SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
                SupervisionSegment('sup2', 'rec1', start=10, duration=5),
                SupervisionSegment('sup3', 'rec1', start=20, duration=8),
            ],
        ).append(
            Cut(
                'cut2',
                start=0,
                duration=30,
                channel=0,
                recording=Recording(
                    id='rec1', sources=[], sampling_rate=16000, num_samples=160000, duration=10.0
                ),
                supervisions=[
                    SupervisionSegment('sup4', 'rec1', start=0, duration=30),
                ],
            )
        )
    ])
    assert isinstance(cut_set[0], MixedCut)

    cuts = cut_set.trim_to_supervisions()
    assert len(cuts) == 4
    # After "trimming", the MixedCut "decayed" into simple, unmixed cuts, as they did not overlap
    assert all(isinstance(cut, Cut) for cut in cuts)
    assert all(len(cut.supervisions) == 1 for cut in cuts)
    assert all(cut.supervisions[0].start == 0 for cut in cuts)

    cut = cuts[0]
    # Check that the cuts preserved their start/duration/supervisions after trimming
    assert cut.start == 1.5
    assert cut.duration == 8.5
    assert cut.supervisions[0].id == 'sup1'
    cut = cuts[1]
    assert cut.start == 10
    assert cut.duration == 5
    assert cut.supervisions[0].id == 'sup2'
    cut = cuts[2]
    assert cut.start == 20
    assert cut.duration == 8
    assert cut.supervisions[0].id == 'sup3'
    cut = cuts[3]
    assert cut.start == 0
    assert cut.duration == 30
    assert cut.supervisions[0].id == 'sup4'
def test_resample_opus():
    # Test that reading OPUS files after resampling
    # does not raise an exception.
    r = Recording.from_file("test/fixtures/mono_c0.opus")
    r.load_audio()
    r1 = r.resample(24000)
    r1.load_audio()
def test_audio_caching_disabled_works():
    lhotse.set_caching_enabled(False)  # Disable caching.

    np.random.seed(89)  # Reproducibility.

    # Prepare two different waveforms.
    noise1 = np.random.rand(1, 32000).astype(np.float32)
    noise2 = np.random.rand(1, 32000).astype(np.float32)
    # Sanity check -- the noises are different.
    assert np.abs(noise1 - noise2).sum() != 0

    # Save the first waveform in a file.
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(noise1), sample_rate=16000)
        recording = Recording.from_file(f.name)

        # Read the audio -- should be equal to noise1.
        audio = recording.load_audio()
        np.testing.assert_almost_equal(audio, noise1)

        # Save noise2 to the same location.
        torchaudio.save(f.name, torch.from_numpy(noise2), sample_rate=16000)

        # Read the audio -- should be equal to noise2,
        # since caching is disabled and no stale data is returned.
        audio = recording.load_audio()
        np.testing.assert_almost_equal(audio, noise2)
def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import Features, MonoCut, Recording, SupervisionSegment
    from lhotse.array import deserialize_array
    from lhotse.cut import MixedCut

    if "shape" in data or "array" in data:
        return deserialize_array(data)
    if "sources" in data:
        return Recording.from_dict(data)
    if "num_features" in data:
        return Features.from_dict(data)
    if "type" not in data:
        return SupervisionSegment.from_dict(data)

    cut_type = data.pop("type")
    if cut_type == "MonoCut":
        return MonoCut.from_dict(data)
    if cut_type == "Cut":
        warnings.warn(
            "Your manifest was created with Lhotse version earlier than v0.8, when MonoCut was called Cut. "
            "Please re-generate it with Lhotse v0.8 as it might stop working in a future version "
            "(using manifest.from_file() and then manifest.to_file() should be sufficient)."
        )
        return MonoCut.from_dict(data)
    if cut_type == "MixedCut":
        return MixedCut.from_dict(data)
    raise ValueError(f"Unexpected cut type during deserialization: '{cut_type}'")
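# Usage sketch (illustration only, not part of the original module; assumes a
# standard Lhotse install). A SupervisionSegment dict has no "type" key, so
# deserialize_item falls through to the SupervisionSegment.from_dict() branch.
def _example_deserialize_item_roundtrip():
    from lhotse import SupervisionSegment

    seg = SupervisionSegment(id="utt1", recording_id="rec1", start=0.0, duration=1.0)
    restored = deserialize_item(seg.to_dict())
    assert isinstance(restored, SupervisionSegment)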
def _process_file(
    file_path: Pathlike,
    speaker_metadata: Dict[str, SpeakerMetadata],
) -> Tuple[Recording, SupervisionSegment]:
    """
    Process a single wav file and return a Recording and a SupervisionSegment.
    """
    speaker_id = file_path.parent.parent.stem
    session_id = file_path.parent.stem
    uttid = file_path.stem
    recording_id = f"{speaker_id}-{session_id}-{uttid}"
    recording = Recording.from_file(file_path, recording_id=recording_id)
    supervision = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        speaker=speaker_id,
        gender=speaker_metadata[speaker_id].gender,
        start=0.0,
        duration=recording.duration,
        custom={
            "speaker_name": speaker_metadata[speaker_id].name,
            "nationality": speaker_metadata[speaker_id].nationality,
            "split": speaker_metadata[speaker_id].split,
        },
    )
    return recording, supervision
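# Assembly sketch (illustration only; the corpus path and speaker_metadata contents
# are hypothetical). Shows how the per-file (Recording, SupervisionSegment) pairs
# returned by _process_file could be collected into Lhotse manifests.
def _example_build_manifests(corpus_dir, speaker_metadata):
    from pathlib import Path

    from lhotse import RecordingSet, SupervisionSet

    recordings, supervisions = [], []
    for wav in sorted(Path(corpus_dir).rglob("*.wav")):
        recording, supervision = _process_file(wav, speaker_metadata)
        recordings.append(recording)
        supervisions.append(supervision)
    return RecordingSet.from_recordings(recordings), SupervisionSet.from_segments(supervisions)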
def test_cut_load_custom_recording_pad_left():
    sampling_rate = 16000
    duration = 52.4
    audio = np.random.randn(1, compute_num_samples(duration, sampling_rate)).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]

    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        os.fsync(f)
        recording = Recording.from_file(f.name)

        # Note: MonoCut doesn't normally have a "my_favorite_song" attribute
        # or a "load_my_favorite_song()" method.
        # We are dynamically extending it.
        cut = MonoCut(
            id="x",
            start=0,
            duration=duration,
            channel=0,
            recording=dummy_recording(0, duration=duration),
        )
        cut.my_favorite_song = recording

        cut_pad = cut.pad(duration=60.0, direction="left")

        restored_audio = cut_pad.load_my_favorite_song()
        assert restored_audio.shape == (1, 960000)  # 16000 * 60

        np.testing.assert_almost_equal(0, restored_audio[:, :-audio.shape[1]])
        np.testing.assert_almost_equal(audio, restored_audio[:, -audio.shape[1]:])
def prepare_tedlium(
    tedlium_root: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Optional path where the manifests should be written.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / "sph").glob("*.sph")
        )
        stms = list((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM..."
        )
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {"recordings": recordings, "supervisions": supervisions}
        validate_recordings_and_supervisions(**corpus[split])
        if output_dir is not None:
            recordings.to_file(output_dir / f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir / f"tedlium_supervisions_{split}.jsonl.gz")
    return corpus
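# Usage sketch (illustration only; the corpus and output paths are hypothetical,
# and the output directory is assumed to exist). prepare_tedlium returns a nested
# dict keyed by split and then by manifest kind.
def _example_prepare_tedlium():
    manifests = prepare_tedlium("/data/TEDLIUM_release-3", output_dir="data/manifests")
    train_recordings = manifests["train"]["recordings"]
    train_supervisions = manifests["train"]["supervisions"]
    return train_recordings, train_supervisions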
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VCTK manifests, which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav) for wav in (corpus_dir / "wav48").rglob("*.wav")
    )
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            )
        )
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    # I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def process_file(
    file_path: Any,
    transcript_dict: Dict,
) -> Tuple[Recording, SupervisionSegment]:
    """
    Process a single wav file and return a Recording and a SupervisionSegment.
    """
    try:
        speaker_id = Path(file_path).stem
        uttid = speaker_id
        recording_id = speaker_id
        recording = Recording.from_file(file_path)
        text = transcript_dict[uttid]
        supervision = SupervisionSegment(
            id=recording_id,
            recording_id=recording_id,
            speaker=speaker_id,
            start=0.0,
            duration=recording.duration,
            channel=0,
            language="Chinese",
            text=text.strip(),
        )
        return recording, supervision
    except Exception as e:
        logging.info(f"process_file err: {file_path} ({e})")
        return None, None
def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    recordings = []
    supervisions = []
    for meta in load_jsonl(raw_manifest_path):
        recording = Recording.from_file(corpus_dir / meta["audio_filepath"])
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=recording.id,
                recording_id=recording.id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=meta["text"],
                speaker=ID2SPEAKER[speaker_id],
                gender=ID2GENDER[speaker_id],
                custom={"text_punct": meta["text_normalized"], "split": clean_or_other},
            )
        )
    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    return recordings, supervisions
def test_mix_same_recording_channels():
    recording = Recording(
        'rec',
        sampling_rate=8000,
        num_samples=30 * 8000,
        duration=30,
        sources=[
            AudioSource('file', channels=[0], source='irrelevant1.wav'),
            AudioSource('file', channels=[1], source='irrelevant2.wav'),
        ],
    )
    cut_set = CutSet.from_cuts([
        Cut('cut1', start=0, duration=30, channel=0, recording=recording),
        Cut('cut2', start=0, duration=30, channel=1, recording=recording),
    ])

    mixed = cut_set.mix_same_recording_channels()
    assert len(mixed) == 1

    cut = mixed[0]
    assert isinstance(cut, MixedCut)
    assert len(cut.tracks) == 2
    assert cut.tracks[0].cut == cut_set[0]
    assert cut.tracks[1].cut == cut_set[1]
def prepare_tedlium(
        tedlium_root: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ('train', 'dev', 'test'):
        root = tedlium_root / 'legacy' / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / 'sph').glob('*.sph')
        )
        stms = list((root / 'stm').glob('*.stm'))
        assert len(stms) == len(recordings), \
            f'Mismatch: found {len(recordings)} ' \
            f'sphere files and {len(stms)} STM files. ' \
            f'You might be missing some parts of TEDLIUM...'
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = ' '.join(words).replace('{NOISE}', '[NOISE]')
                    if text == 'ignore_time_segment_in_scoring':
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f'{rec_id}-{idx}',
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language='English',
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }
        validate_recordings_and_supervisions(**corpus[split])
        if output_dir is not None:
            recordings.to_json(output_dir / f'{split}_recordings.json')
            supervisions.to_json(output_dir / f'{split}_supervisions.json')
    return corpus
def test_cut_with_audio_move_to_memory():
    path = "test/fixtures/mono_c0.wav"
    cut = dummy_cut(0, duration=0.5).drop_recording()
    cut.recording = Recording.from_file(path)

    memory_cut = cut.move_to_memory()

    np.testing.assert_equal(memory_cut.load_audio(), cut.load_audio())
def cut_with_relative_paths():
    return Cut(
        'cut', 0, 10, 0,
        features=Features(
            type='fbank',
            num_frames=1000,
            num_features=40,
            sampling_rate=8000,
            storage_type='lilcom_files',
            storage_path='storage_dir',
            storage_key='feats.llc',
            start=0,
            duration=10,
        ),
        recording=Recording('rec', [AudioSource('file', [0], 'audio.wav')], 8000, 80000, 10.0),
    )
def recording():
    return Recording(
        id='rec',
        sources=[AudioSource(type='file', channels=[0, 1], source='test/fixtures/stereo.wav')],
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
    )
def recording(file_source):
    return Recording(
        id="rec",
        sources=[file_source],
        sampling_rate=8000,
        num_samples=4000,
        duration=0.5,
    )
def prepare_single_babel_language(corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None):
    manifests = defaultdict(dict)
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(
            Recording.from_sphere(p) for p in audio_dir.glob('*.sph')
        )
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            # 0 -> BABEL
            # 1 -> BP
            # 2 -> <language-code> (101)
            # 3 -> <speaker-id> (10033)
            # 4 -> <date> (20111024)
            # 5 -> <hour> (205740)
            # 6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=BABELCODE2LANG[lang_code],
                        speaker=speaker,
                    )
                )
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        manifests[split]['supervisions'] = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(
            manifests[split]['recordings'],
            manifests[split]['supervisions']
        )

        if output_dir is not None:
            language = BABELCODE2LANG[lang_code]
            if split == 'training':
                split = 'train'
            manifests[split]['recordings'].to_json(f'recordings_{language}_{split}.json')
            manifests[split]['supervisions'].to_json(f'supervisions_{language}_{split}.json')
    return manifests
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def __post_init__(self):
    if isinstance(self.rir, dict):
        from lhotse import Recording

        # Pass a shallow copy of the RIR dict since `from_dict()` pops the `sources` key.
        self.rir = Recording.from_dict(self.rir.copy())
    assert all(
        c < self.rir.num_channels for c in self.rir_channels
    ), "Invalid channel index in `rir_channels`"
def test_cut_with_audio_move_to_memory_large_offset():
    path = "test/fixtures/mono_c0.wav"
    cut = dummy_cut(0, duration=0.1).drop_recording()
    cut.recording = Recording.from_file(path)
    cut.start = 0.4
    assert isclose(cut.end, 0.5)

    memory_cut = cut.move_to_memory()

    np.testing.assert_equal(memory_cut.load_audio(), cut.load_audio())
def random_cut_set(n_cuts=100) -> CutSet:
    return CutSet.from_cuts(
        MonoCut(
            id=uuid4(),
            start=round(random.uniform(0, 5), ndigits=8),
            duration=round(random.uniform(3, 10), ndigits=8),
            channel=0,
            recording=Recording(
                id=uuid4(),
                sources=[],
                sampling_rate=16000,
                num_samples=1600000,
                duration=100.0,
            ),
        )
        for _ in range(n_cuts)
    )
def test_cut_trim_to_supervisions_extend_handles_end_of_recording(mono_cut):
    """
    Scenario::

        |----------Recording---------|
        |---Sup1----|       |--Sup2--|
        |------------Cut-------------|

    Into::

        |----------Recording---------|
        |---Cut1----|     |---Cut2---|
        |---Sup1----|       |--Sup2--|
    """
    cut = MonoCut(
        id="X",
        start=0.0,
        duration=10.0,
        channel=0,
        supervisions=[
            SupervisionSegment(id="X", recording_id="X", start=0.0, duration=4.0),
            SupervisionSegment(id="X", recording_id="X", start=7.0, duration=3.0),
        ],
        recording=Recording(
            id="X", sources=[], sampling_rate=8000, num_samples=80000, duration=10.0
        ),
    )

    cuts = cut.trim_to_supervisions(min_duration=4.0)

    assert len(cuts) == 2
    c1, c2 = cuts

    assert c1.start == 0
    assert c1.duration == 4.0
    assert len(c1.supervisions) == 1
    (c1_s1,) = c1.supervisions
    assert c1_s1.start == 0.0
    assert c1_s1.duration == 4.0

    assert c2.start == 6.5
    assert c2.duration == 3.5
    assert len(c2.supervisions) == 1
    (c2_s1,) = c2.supervisions
    assert c2_s1.start == 0.5
    assert c2_s1.duration == 3.0
def recording():
    return Recording(
        id="rec",
        sources=[
            AudioSource(type="file", channels=[0, 1], source="test/fixtures/stereo.wav")
        ],
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
    )
def with_recording(self, sampling_rate: int, num_samples: int) -> Recording:
    f = NamedTemporaryFile('wb', suffix='.wav')
    self.files.append(f)
    duration = num_samples / sampling_rate
    samples = np.random.rand(num_samples)
    soundfile.write(f.name, samples, samplerate=sampling_rate)
    return Recording(
        id=str(uuid4()),
        sources=[AudioSource(type='file', channels=[0], source=f.name)],
        sampling_rate=sampling_rate,
        num_samples=num_samples,
        duration=duration,
    )
def make_recording(sampling_rate: int, num_samples: int) -> Recording:
    # The idea is that we're going to write to a temporary file with a sine wave recording
    # of specified duration and sampling rate, and clean up only after the test is executed.
    with NamedTemporaryFile('wb', suffix='.wav') as f:
        duration = num_samples / sampling_rate
        samples: np.ndarray = np.sin(2 * np.pi * np.arange(0, num_samples) / sampling_rate)
        soundfile.write(f, samples, samplerate=sampling_rate)
        yield Recording(
            id=f'recording-{sampling_rate}-{duration}',
            sources=[AudioSource(type='file', channels=[0], source=f.name)],
            sampling_rate=sampling_rate,
            num_samples=num_samples,
            duration=duration,
        )
def test_cut_move_to_memory_audio_serialization():
    path = "test/fixtures/mono_c0.wav"
    cut = dummy_cut(0, duration=0.5).drop_recording()
    cut.recording = Recording.from_file(path)

    cut_with_audio = cut.move_to_memory()

    assert cut.custom is None  # original cut is unmodified

    data = cut_with_audio.to_dict()
    cut_deserialized = MonoCut.from_dict(data)

    np.testing.assert_equal(cut_deserialized.load_audio(), cut_with_audio.load_audio())
def recording_set():
    return RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=[
                AudioSource(type='file', channels=[0], source='text/fixtures/mono_c0.wav'),
                AudioSource(type='command', channels=[1], source='cat text/fixtures/mono_c1.wav'),
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration=0.5,
        )
    ])
def cut_set():
    cut = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=Features(
            type="fbank",
            num_frames=100,
            num_features=40,
            frame_shift=0.01,
            sampling_rate=16000,
            start=0.0,
            duration=10.0,
            storage_type="lilcom",
            storage_path="irrelevant",
            storage_key="irrelevant",
        ),
        recording=Recording(
            id="rec-1",
            sampling_rate=16000,
            num_samples=160000,
            duration=10.0,
            sources=[AudioSource(type="file", channels=[0], source="irrelevant")],
        ),
        supervisions=[
            SupervisionSegment(id="sup-1", recording_id="irrelevant", start=0.5, duration=6.0),
            SupervisionSegment(id="sup-2", recording_id="irrelevant", start=7.0, duration=2.0),
        ],
    )
    return CutSet.from_cuts([
        cut,
        fastcopy(cut, id="cut-nosup", supervisions=[]),
        fastcopy(cut, id="cut-norec", recording=None),
        fastcopy(cut, id="cut-nofeat", features=None),
        cut.pad(duration=30.0, direction="left"),
        cut.pad(duration=30.0, direction="right"),
        cut.pad(duration=30.0, direction="both"),
        cut.mix(cut, offset_other_by=5.0, snr=8),
    ])