def test_cut_features_mask(self, supervisions):
    """Check the supervision feature mask of a cut that carries supervisions.

    The mask is expected to be 1 on frames [0, 50) and 0 on every later frame.
    ``supervisions`` is presumably supplied by a pytest fixture defined
    elsewhere in the file -- confirm.
    """
    mocked_features = Mock(
        sampling_rate=16000,
        frame_shift=0.01,
        num_frames=2000,
    )
    cut = Cut(
        'cut',
        start=0,
        duration=2,
        channel=0,
        features=mocked_features,
        supervisions=supervisions,
    )

    feature_mask = cut.supervisions_feature_mask()

    # Frames up to index 50 must be flagged, everything after must be clear.
    for frames, expected in ((slice(None, 50), 1), (slice(50, None), 0)):
        assert (feature_mask[frames] == expected).all()
def test_cut_speakers_features_mask(self, supervisions, alignment):
    """Check the per-speaker feature mask, with and without word alignment.

    Row 0 and row 1 of the mask are checked separately. When ``alignment``
    equals ``"word"`` the expected active frames for row 0 are split into
    [0, 10) and [20, 40); otherwise row 0 is active on [0, 50). Row 1 is
    expected to be active on [60, 80) in both cases.
    """

    def frames(*spans):
        # Flatten the given ranges into one explicit index list.
        return np.index_exp[list(chain(*spans))]

    mocked_features = Mock(
        sampling_rate=16000,
        frame_shift=0.01,
        num_frames=2000,
    )
    cut = Cut(
        'cut',
        start=0,
        duration=2,
        channel=0,
        features=mocked_features,
        supervisions=supervisions,
    )
    mask = cut.speakers_feature_mask(use_alignment_if_exists=alignment)

    if alignment == "word":
        ones = [
            frames(range(0, 10), range(20, 40)),
            frames(range(60, 80)),
        ]
        zeros = [
            frames(range(10, 20), range(40, 200)),
            frames(range(0, 60), range(80, 200)),
        ]
    else:
        ones = [
            frames(range(0, 50)),
            frames(range(60, 80)),
        ]
        zeros = [
            frames(range(50, 200)),
            frames(range(0, 60), range(80, 200)),
        ]

    # All "active" checks run first, then all "inactive" checks.
    for expected, per_row in ((1, ones), (0, zeros)):
        for row, index in enumerate(per_row):
            assert (mask[row, index] == expected).all()
def test_cut_speakers_audio_mask(self, supervisions, alignment):
    """Check the per-speaker audio mask, with and without word alignment.

    Row 0 and row 1 of the mask are checked separately. When ``alignment``
    equals ``"word"`` the expected active samples for row 0 are
    [0, 1600) and [3200, 6400); otherwise row 0 is active on [0, 8000).
    Row 1 is expected to be active on [9600, 12800) in both cases.
    """

    def samples(*spans):
        # Flatten the given ranges into one explicit index list.
        return np.index_exp[list(chain(*spans))]

    cut = Cut(
        'cut',
        start=0,
        duration=2,
        channel=0,
        recording=Mock(sampling_rate=16000),
        supervisions=supervisions,
    )
    mask = cut.speakers_audio_mask(use_alignment_if_exists=alignment)

    if alignment == "word":
        ones = [
            samples(range(0, 1600), range(3200, 6400)),
            samples(range(9600, 12800)),
        ]
        zeros = [
            samples(range(1600, 3200), range(6400, 32000)),
            samples(range(0, 9600), range(12800, 32000)),
        ]
    else:
        ones = [
            samples(range(0, 8000)),
            samples(range(9600, 12800)),
        ]
        zeros = [
            samples(range(8000, 32000)),
            samples(range(0, 9600), range(12800, 32000)),
        ]

    # All "active" checks run first, then all "inactive" checks.
    for expected, per_row in ((1, ones), (0, zeros)):
        for row, index in enumerate(per_row):
            assert (mask[row, index] == expected).all()
def with_cut(
        self,
        sampling_rate: int,
        num_samples: int,
        features: bool = True,
        supervision: bool = False
) -> Cut:
    """Create a single-channel cut covering a freshly made recording.

    :param sampling_rate: forwarded to ``self.with_recording``; also used to
        derive the cut duration as ``num_samples / sampling_rate``.
    :param num_samples: forwarded to ``self.with_recording``.
    :param features: when true, the cut is passed through
        ``self._with_features`` and the result is used instead.
    :param supervision: when true, one supervision spanning the whole cut
        is appended to ``cut.supervisions``.
    :return: the resulting cut, with a random UUID string as its id.
    """
    duration = num_samples / sampling_rate
    cut_id = str(uuid4())
    recording = self.with_recording(
        sampling_rate=sampling_rate,
        num_samples=num_samples,
    )
    cut = Cut(
        id=cut_id,
        start=0,
        duration=duration,
        channel=0,
        recording=recording,
    )

    if features:
        cut = self._with_features(cut)

    if not supervision:
        return cut

    # One supervision covering the full extent of the cut.
    whole_cut_supervision = SupervisionSegment(
        id=f'sup-{cut.id}',
        recording_id=cut.recording_id,
        start=0,
        duration=cut.duration,
        text='irrelevant',
    )
    cut.supervisions.append(whole_cut_supervision)
    return cut
def test_cut_features_mask(self):
    """A cut built without supervisions must yield an all-zero feature mask."""
    mocked_features = Mock(sampling_rate=16000, frame_shift=0.01)
    cut = Cut(
        'cut',
        start=0,
        duration=2,
        channel=0,
        features=mocked_features,
    )

    total_active = cut.supervisions_feature_mask().sum()

    assert total_active == 0
def make_cut(sampling_rate: int, num_samples: int) -> Cut:
    """Yield a cut that starts at 0 and lasts ``num_samples / sampling_rate``.

    The recording comes from ``make_recording`` (defined elsewhere), which is
    used as a context manager, so the cut is yielded while that context is
    still open.

    NOTE(review): this is a generator function; it is presumably wrapped by
    ``contextlib.contextmanager`` outside the visible span -- confirm.
    """
    with make_recording(sampling_rate, num_samples) as recording:
        duration = num_samples / sampling_rate
        cut_id = f'cut-{sampling_rate}-{duration}'
        yield Cut(
            id=cut_id,
            start=0,
            duration=duration,
            channel=0,
            recording=recording,
        )
def test_cut_audio_mask(self):
    """A cut built without supervisions must yield an all-zero audio mask."""
    mocked_recording = Mock(sampling_rate=16000)
    cut = Cut(
        'cut',
        start=0,
        duration=2,
        channel=0,
        recording=mocked_recording,
    )

    total_active = cut.supervisions_audio_mask().sum()

    assert total_active == 0
def test_mixed_cut_audio_mask(self, supervisions):
    """Check the audio mask of a cut appended to itself.

    Expected pattern over samples: 1 on [0, 8000), 0 on [8000, 32000),
    1 on [32000, 40000) and 0 from 40000 onwards.
    """
    cut = Cut(
        'cut',
        start=0,
        duration=2,
        channel=0,
        recording=Mock(sampling_rate=16000),
        supervisions=supervisions,
    )
    mixed_cut = cut.append(cut)
    mask = mixed_cut.supervisions_audio_mask()

    # (sample span, expected mask value), checked left to right.
    expectations = (
        (slice(None, 8000), 1),
        (slice(8000, 32000), 0),
        (slice(32000, 40000), 1),
        (slice(40000, None), 0),
    )
    for span, value in expectations:
        assert (mask[span] == value).all()
def test_mixed_cut_features_mask(self, supervisions):
    """Check the feature mask of a cut appended to itself.

    Expected pattern over frames: 1 on [0, 50), 0 on [50, 200),
    1 on [200, 250) and 0 from 250 onwards.
    """
    cut = Cut(
        'cut',
        start=0,
        duration=2,
        channel=0,
        features=Mock(sampling_rate=16000, frame_shift=0.01),
        supervisions=supervisions,
    )
    mixed_cut = cut.append(cut)
    mask = mixed_cut.supervisions_feature_mask()

    # (frame span, expected mask value), checked left to right.
    expectations = (
        (slice(None, 50), 1),
        (slice(50, 200), 0),
        (slice(200, 250), 1),
        (slice(250, None), 0),
    )
    for span, value in expectations:
        assert (mask[span] == value).all()
def random_cut_set(n_cuts=100) -> CutSet:
    """Build a ``CutSet`` of ``n_cuts`` cuts with random start and duration.

    Each cut gets a start drawn uniformly from [0, 5] and a duration drawn
    uniformly from [3, 10], both rounded to 8 decimal digits, and its own
    source-less 16 kHz recording declared as 100.0 long (1600000 samples).

    Ids are random UUIDs converted to strings. Passing the raw ``UUID``
    object would make these ids a different type from the ``str(uuid4())``
    ids used by the other cut builders, and would break anything that
    treats an id as text.

    :param n_cuts: number of cuts to generate.
    :return: the ``CutSet`` built from the generated cuts.
    """
    return CutSet.from_cuts(
        Cut(
            id=str(uuid4()),
            start=round(random.uniform(0, 5), ndigits=8),
            duration=round(random.uniform(3, 10), ndigits=8),
            channel=0,
            recording=Recording(
                id=str(uuid4()),
                sources=[],
                sampling_rate=16000,
                num_samples=1600000,
                duration=100.0,
            ),
        )
        for _ in range(n_cuts)
    )
def cut_with_supervision_start01(recording):
    """Return a cut starting at 0.1 that holds one supervision.

    The cut has ``start=0.1`` and ``duration=0.4``; its single supervision
    has ``start=0.1`` and ``duration=0.3`` and refers to recording id
    ``'rec'``.
    """
    supervision = SupervisionSegment(
        id='sup',
        recording_id='rec',
        start=0.1,
        duration=0.3,
    )
    return Cut(
        id='cut',
        start=0.1,
        duration=0.4,
        channel=0,
        supervisions=[supervision],
        recording=recording,
    )
def with_cut(self, sampling_rate: int, num_samples: int, features: bool = True) -> Cut:
    """Create a single-channel cut covering a freshly made recording.

    :param sampling_rate: forwarded to ``self.with_recording``; also used to
        derive the cut duration as ``num_samples / sampling_rate``.
    :param num_samples: forwarded to ``self.with_recording``.
    :param features: when true, the cut is passed through
        ``self._with_features`` and the result is returned instead.
    :return: the resulting cut, with a random UUID string as its id.
    """
    duration = num_samples / sampling_rate
    cut_id = str(uuid4())
    recording = self.with_recording(
        sampling_rate=sampling_rate,
        num_samples=num_samples,
    )
    plain_cut = Cut(
        id=cut_id,
        start=0,
        duration=duration,
        channel=0,
        recording=recording,
    )
    return self._with_features(plain_cut) if features else plain_cut
def cut_set():
    """Build a ``CutSet`` holding one base cut and several variants of it.

    The base cut ``'cut-1'`` has features, a recording and two supervisions.
    The variants are, in order: copies without supervisions, without a
    recording and without features; the cut padded with ``duration=30.0``
    on the left, on the right and on both sides; and the cut mixed with
    itself using ``offset_other_by=5.0`` and ``snr=8``.
    """
    fbank_features = Features(
        type='fbank',
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=10.0,
        storage_type='lilcom',
        storage_path='irrelevant',
        storage_key='irrelevant',
    )
    recording = Recording(
        id='rec-1',
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
        sources=[
            AudioSource(type='file', channels=[0], source='irrelevant'),
        ],
    )
    supervisions = [
        SupervisionSegment(id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0),
        SupervisionSegment(id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0),
    ]
    cut = Cut(
        id='cut-1',
        start=0.0,
        duration=10.0,
        channel=0,
        features=fbank_features,
        recording=recording,
        supervisions=supervisions,
    )

    # Copies of the base cut with one component removed.
    stripped = [
        fastcopy(cut, id='cut-nosup', supervisions=[]),
        fastcopy(cut, id='cut-norec', recording=None),
        fastcopy(cut, id='cut-nofeat', features=None),
    ]
    # The base cut padded in each supported direction.
    padded = [
        cut.pad(duration=30.0, direction='left'),
        cut.pad(duration=30.0, direction='right'),
        cut.pad(duration=30.0, direction='both'),
    ]
    mixed = cut.mix(cut, offset_other_by=5.0, snr=8)

    return CutSet.from_cuts([cut, *stripped, *padded, mixed])
def test_mixed_cut_features_mask(self, supervisions):
    """Check the feature mask of a cut appended to itself.

    Frames in [0, 50), [60, 80), [200, 250) and [260, 280) are expected to
    be 1; frames in [50, 60), [80, 200), [250, 260) and [280, 400) are
    expected to be 0.
    """
    cut = Cut(
        'cut',
        start=0,
        duration=2,
        channel=0,
        features=Mock(sampling_rate=16000, frame_shift=0.01),
        supervisions=supervisions,
    )
    mixed_cut = cut.append(cut)
    mask = mixed_cut.supervisions_feature_mask()

    active_spans = (
        range(0, 50), range(60, 80), range(200, 250), range(260, 280),
    )
    inactive_spans = (
        range(50, 60), range(80, 200), range(250, 260), range(280, 400),
    )
    ones = np.index_exp[list(chain.from_iterable(active_spans))]
    zeros = np.index_exp[list(chain.from_iterable(inactive_spans))]

    assert (mask[ones] == 1).all()
    assert (mask[zeros] == 0).all()
def test_mixed_cut_audio_mask(self, supervisions):
    """Check the audio mask of a cut appended to itself.

    Samples in [0, 8000), [9600, 12800), [32000, 40000) and [41600, 44800)
    are expected to be 1; samples in [8000, 9600), [12800, 32000),
    [40000, 41600) and [44800, 64000) are expected to be 0.
    """
    cut = Cut(
        'cut',
        start=0,
        duration=2,
        channel=0,
        recording=Mock(sampling_rate=16000),
        supervisions=supervisions,
    )
    mixed_cut = cut.append(cut)
    mask = mixed_cut.supervisions_audio_mask()

    active_spans = (
        range(0, 8000), range(9600, 12800),
        range(32000, 40000), range(41600, 44800),
    )
    inactive_spans = (
        range(8000, 9600), range(12800, 32000),
        range(40000, 41600), range(44800, 64000),
    )
    ones = np.index_exp[list(chain.from_iterable(active_spans))]
    zeros = np.index_exp[list(chain.from_iterable(inactive_spans))]

    assert (mask[ones] == 1).all()
    assert (mask[zeros] == 0).all()
def test_augmentation_chain_randomized(
        target_sampling_rate: int,
        sp_factor: float,
        resample_first: bool,
        cut_duration: Seconds
):
    """Check sample counts after chaining resampling and speed perturbation.

    The two augmentations are applied in the order selected by
    ``resample_first``. The loaded audio must then have exactly
    ``num_samples`` samples along axis 1, both for the augmented recording
    and for a cut taken from it at ``start=0.5125``.
    """
    source = Recording.from_file(
        'test/fixtures/libri/libri-1088-134315-0000.wav'
    )

    if resample_first:
        resampled = source.resample(target_sampling_rate)
        recording_aug = resampled.perturb_speed(sp_factor)
    else:
        perturbed = source.perturb_speed(sp_factor)
        recording_aug = perturbed.resample(target_sampling_rate)

    # Whole-recording check.
    audio_aug = recording_aug.load_audio()
    assert audio_aug.shape[1] == recording_aug.num_samples

    # Same check on a cut taken from the augmented recording.
    cut_aug = Cut(
        id='dummy',
        start=0.5125,
        duration=cut_duration,
        channel=0,
        recording=recording_aug,
    )
    assert cut_aug.load_audio().shape[1] == cut_aug.num_samples
def cut(recording):
    """Return a cut with id ``'cut'`` over ``recording``.

    The cut uses ``start=0``, ``duration=1.0`` and ``channel=0``.
    """
    fields = dict(
        id='cut',
        start=0,
        duration=1.0,
        channel=0,
        recording=recording,
    )
    return Cut(**fields)
def random_cut_set(n_cuts=100) -> CutSet:
    """Build a ``CutSet`` of ``n_cuts`` cuts with random start and duration.

    Each cut gets a start drawn uniformly from [0, 5] and a duration drawn
    uniformly from [3, 10], both rounded to 8 decimal digits. No recording,
    features or supervisions are attached.

    Ids are random UUIDs converted to strings. Passing the raw ``UUID``
    object would make these ids a different type from the ``str(uuid4())``
    ids used by the other cut builders, and would break anything that
    treats an id as text.

    :param n_cuts: number of cuts to generate.
    :return: the ``CutSet`` built from the generated cuts.
    """
    return CutSet.from_cuts(
        Cut(
            id=str(uuid4()),
            start=round(random.uniform(0, 5), ndigits=8),
            duration=round(random.uniform(3, 10), ndigits=8),
            channel=0,
        )
        for _ in range(n_cuts)
    )