def test_combine(manifest_type):
    """Check that ``combine`` of three adjacent dummy manifests equals one big manifest.

    The parts are built with (begin_id, end_id) of (0, 68), (68, 136) and
    (136, 200); the reference manifest is built with begin_id=0, end_id=200.
    """
    whole = DummyManifest(manifest_type, begin_id=0, end_id=200)
    # Adjacent (begin_id, end_id) pairs: each part starts where the previous one ended.
    part_bounds = [(0, 68), (68, 136), (136, 200)]
    parts = [
        DummyManifest(manifest_type, begin_id=lo, end_id=hi)
        for lo, hi in part_bounds
    ]
    merged = combine(*parts)
    assert merged == whole
def test_split_randomize(manifest_type):
    """Check that a randomized two-way split keeps the item count but not the order."""
    source = DummyManifest(manifest_type, begin_id=0, end_id=100)
    halves = source.split(num_splits=2, randomize=True)
    assert len(halves) == 2
    # Flatten both subsets into one plain list, first subset's items first.
    rejoined = [*halves[0], *halves[1]]
    assert len(rejoined) == len(source)
    # The ordering must differ from the source. Plain lists are compared (rather than
    # re-building a *Set object) because the *Set classes might internally re-order
    # after concatenation, e.g. by using dict or post-init sorting.
    original_order = list(source)
    assert rejoined != original_order
def test_split_even(manifest_type):
    """Check that ``split`` into 2 parts of a manifest built with ids 0..100 gives 0..50 and 50..100."""
    full = DummyManifest(manifest_type, begin_id=0, end_id=100)
    chunks = split(full, num_splits=2)
    assert len(chunks) == 2
    # Expected (begin_id, end_id) of each chunk, in order.
    expected_bounds = [(0, 50), (50, 100)]
    for position, (lo, hi) in enumerate(expected_bounds):
        assert chunks[position] == DummyManifest(manifest_type, begin_id=lo, end_id=hi)
def test_split_odd(manifest_type):
    """Check ``split`` into 3 parts when the split is uneven.

    A manifest built with begin_id=0, end_id=100 is expected to yield chunks
    equal to manifests built with (0, 34), (34, 68) and (68, 100).
    """
    full = DummyManifest(manifest_type, begin_id=0, end_id=100)
    chunks = split(full, num_splits=3)
    assert len(chunks) == 3
    # Expected (begin_id, end_id) of each chunk, in order; the last one is shorter.
    expected_bounds = [(0, 34), (34, 68), (68, 100)]
    for position, (lo, hi) in enumerate(expected_bounds):
        assert chunks[position] == DummyManifest(manifest_type, begin_id=lo, end_id=hi)
def test_k2_speech_recognition_iterable_dataset_shuffling():
    """Check that a shuffling dataset, read through a multi-worker DataLoader, yields
    every cut exactly once and in a different order than the source CutSet.
    """
    # The dummy cuts have a duration of 1 second each
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)

    dataset = K2SpeechRecognitionIterableDataset(
        cuts=cut_set,
        shuffle=True,
        # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames
        # This way we're testing that it works okay when returning multiple batches in
        # a full epoch.
        max_frames=1000,
    )
    dloader = DataLoader(dataset, batch_size=None, num_workers=2)

    # Only the cut ids are needed for the invariants below, so the batches themselves
    # are not retained once their ids have been collected.
    dloader_cut_ids = []
    for batch in dloader:
        dloader_cut_ids.extend(batch['supervisions']['cut_id'])

    # Invariant 1: we receive the same amount of items in a dataloader epoch as there were in the CutSet
    assert len(dloader_cut_ids) == len(cut_set)

    # Invariant 2: the items are not duplicated
    assert len(set(dloader_cut_ids)) == len(dloader_cut_ids)

    # Invariant 3: the items are shuffled, i.e. the order is different than that in the CutSet
    assert dloader_cut_ids != [c.id for c in cut_set]
def test_add_feature_sets():
    """Check that ``+`` on two FeatureSet dummies (ids 0..5 and 5..10) equals the 0..10 dummy."""
    whole = DummyManifest(FeatureSet, begin_id=0, end_id=10)
    lower_half = DummyManifest(FeatureSet, begin_id=0, end_id=5)
    upper_half = DummyManifest(FeatureSet, begin_id=5, end_id=10)
    assert lower_half + upper_half == whole
def test_cannot_split_to_more_chunks_than_items(manifest_type):
    """Check that requesting 10 splits of a manifest built with begin_id=0, end_id=1 raises ValueError."""
    tiny = DummyManifest(manifest_type, begin_id=0, end_id=1)
    requested_splits = 10
    with pytest.raises(ValueError):
        tiny.split(num_splits=requested_splits)
def test_add_audio_sets():
    """Check that ``+`` on two RecordingSet dummies (ids 0..5 and 5..10) equals the 0..10 dummy.

    NOTE(review): despite the "audio sets" name this exercises RecordingSet, the same
    type as test_add_recording_sets -- confirm whether both tests are still needed.
    """
    whole = DummyManifest(RecordingSet, begin_id=0, end_id=10)
    first_part = DummyManifest(RecordingSet, begin_id=0, end_id=5)
    second_part = DummyManifest(RecordingSet, begin_id=5, end_id=10)
    summed = first_part + second_part
    assert summed == whole
def test_add_recording_sets():
    """Check that ``+`` on two RecordingSet dummies (ids 0..5 and 5..10) equals the 0..10 dummy."""
    whole = DummyManifest(RecordingSet, begin_id=0, end_id=10)
    head = DummyManifest(RecordingSet, begin_id=0, end_id=5)
    tail = DummyManifest(RecordingSet, begin_id=5, end_id=10)
    assert head + tail == whole
def test_add_supervision_sets():
    """Check that ``+`` on two SupervisionSet dummies (ids 0..5 and 5..10) equals the 0..10 dummy."""
    whole = DummyManifest(SupervisionSet, begin_id=0, end_id=10)
    head = DummyManifest(SupervisionSet, begin_id=0, end_id=5)
    tail = DummyManifest(SupervisionSet, begin_id=5, end_id=10)
    joined = head + tail
    assert joined == whole