# NOTE: the imports and @pytest.mark.parametrize decorators below are
# reconstructed so the tests are self-contained; the module paths and the
# parametrize values are assumed (inferred from the test bodies) and may
# need adjusting to the actual test setup.
import pickle
import random
from itertools import groupby
from statistics import mean
from tempfile import NamedTemporaryFile, TemporaryDirectory

import pytest
from torch.utils.data import DataLoader

from lhotse import (
    CutSet,
    FeatureSet,
    RecordingSet,
    SupervisionSet,
    combine,
    load_manifest_lazy,
)
from lhotse.cut import MixedCut
from lhotse.dataset import (
    CutConcatenate,
    CutMix,
    ExtraPadding,
    K2SpeechRecognitionDataset,
)
from lhotse.dataset.sampling import (
    BucketingSampler,
    CutPairsSampler,
    RoundRobinSampler,
    SimpleCutSampler,
    SingleCutSampler,
)
from lhotse.lazy import LazyIteratorMultiplexer
from lhotse.qa import remove_missing_recordings_and_supervisions
from lhotse.testing.dummies import DummyManifest, as_lazy
from lhotse.utils import fastcopy


@pytest.mark.parametrize("preserve_id", [False, True])
def test_cutmix(preserve_id: bool):
    speech_cuts = DummyManifest(CutSet, begin_id=0, end_id=10)
    for c in speech_cuts:
        c.duration = 10.0
    noise_cuts = DummyManifest(CutSet, begin_id=100, end_id=102)
    for c in noise_cuts:
        c.duration = 1.5

    tfnm = CutMix(noise_cuts, snr=None, prob=1.0, preserve_id=preserve_id)

    tfnm_cuts = tfnm(speech_cuts)
    for c in tfnm_cuts:
        assert isinstance(c, MixedCut)
        assert c.tracks[0].cut.duration == 10.0
        assert sum(t.cut.duration for t in c.tracks[1:]) == 10.0

    if preserve_id:
        assert all(
            cut.id == cut_noisy.id for cut, cut_noisy in zip(speech_cuts, tfnm_cuts)
        )
    else:
        assert all(
            cut.id != cut_noisy.id for cut, cut_noisy in zip(speech_cuts, tfnm_cuts)
        )

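# A sketch (not lhotse code) of the noise-coverage arithmetic that the
# assertion above implies: 1.5s noise cuts are tiled across the 10s speech
# cut, with the final chunk truncated, so the noise track durations sum to
# exactly the speech duration.
def _tiled_noise_durations(speech_dur: float = 10.0, noise_dur: float = 1.5):
    durs = []
    remaining = speech_dur
    while remaining > 1e-9:
        durs.append(min(noise_dur, remaining))
        remaining -= durs[-1]
    return durs  # six full 1.5s chunks plus a 1.0s remainder; sum == 10.0
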
def test_cut_pairs_sampler_lazy_shuffle(sampler_cls):
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)

        sampler = sampler_cls(
            lazy_cuts,
            lazy_cuts,
            shuffle=True,
            # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
            # This way we're testing that it works okay when returning multiple batches in
            # a full epoch.
            max_source_frames=1000,
        )
        sampled_src_cuts = []
        sampled_tgt_cuts = []
        for src_batch, tgt_batch in sampler:
            # Invariant 0: the order of source and target cut IDs is preserved within each batch.
            assert list(src_batch.ids) == list(tgt_batch.ids)
            sampled_src_cuts.extend(src_batch)
            sampled_tgt_cuts.extend(tgt_batch)

        # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet.
        assert len(sampled_src_cuts) == len(cut_set)
        assert len(sampled_tgt_cuts) == len(cut_set)
        # Invariant 2: the items are not duplicated.
        assert len(set(c.id for c in sampled_src_cuts)) == len(sampled_src_cuts)
        # Invariant 3: the items are shuffled.
        assert [c.id for c in sampled_src_cuts] != [c.id for c in lazy_cuts]

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
@pytest.mark.parametrize("preserve_id", [True, False])
def test_repeat(manifest_type, preserve_id):
    data = DummyManifest(manifest_type, begin_id=0, end_id=10)
    expected = data + data

    eager_result = data.repeat(times=2, preserve_id=preserve_id)
    if preserve_id or manifest_type == FeatureSet:
        assert list(eager_result) == list(expected)
    else:
        items = list(eager_result)
        ref_items = list(expected)
        assert len(items) == len(ref_items)
        for i, refi in zip(items, ref_items):
            assert i.id.endswith("_repeat0") or i.id.endswith("_repeat1")
            i_modi = fastcopy(i, id=refi.id)
            assert i_modi == refi

    with as_lazy(data) as lazy_data:
        lazy_result = lazy_data.repeat(times=2, preserve_id=preserve_id)
        if preserve_id or manifest_type == FeatureSet:
            assert list(lazy_result) == list(expected)
        else:
            items = list(lazy_result)
            ref_items = list(expected)
            assert len(items) == len(ref_items)
            for i, refi in zip(items, ref_items):
                assert i.id.endswith("_repeat0") or i.id.endswith("_repeat1")
                i_modi = fastcopy(i, id=refi.id)
                assert i_modi == refi

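# A minimal sketch of what the as_lazy helper used above might look like
# (an assumption, not lhotse's actual implementation): serialize the eager
# manifest to a temporary JSONL file and re-open it lazily, so the eager and
# lazy code paths of each test see the same data.
from contextlib import contextmanager


@contextmanager
def _as_lazy_sketch(manifest):
    with NamedTemporaryFile(suffix=".jsonl") as f:
        manifest.to_file(f.name)
        yield load_manifest_lazy(f.name)
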
def test_bucketing_sampler_buckets_have_different_durations():
    cut_set_1s = DummyManifest(CutSet, begin_id=0, end_id=10)
    cut_set_2s = DummyManifest(CutSet, begin_id=10, end_id=20)
    for c in cut_set_2s:
        c.duration = 2.0
    cut_set = cut_set_1s + cut_set_2s

    # The bucketing sampler should return 5 batches with two 1s cuts,
    # and 10 batches with one 2s cut.
    sampler = BucketingSampler(
        cut_set, sampler_type=SingleCutSampler, max_frames=200, num_buckets=2
    )
    batches = list(sampler)
    assert len(batches) == 15

    # Within each batch, all cuts have the same duration
    # (i.e. they come from the same bucket).
    for batch in batches:
        batch_durs = [c.duration for c in batch]
        assert all(d == batch_durs[0] for d in batch_durs)

    batches = sorted(batches, key=len)
    assert all(len(b) == 1 for b in batches[:10])
    assert all(len(b) == 2 for b in batches[10:])

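# A sketch of the batch-count arithmetic behind the assertion above, assuming
# the dummy cuts have 100 frames per second: max_frames=200 fits two 1s cuts
# (2 x 100 frames) or one 2s cut (200 frames), i.e. 10/2 + 10/1 == 15 batches.
def _expected_num_batches(frames_per_cut=(100, 200), cuts_per_dur=10, max_frames=200):
    return sum(cuts_per_dur // (max_frames // f) for f in frames_per_cut)  # 5 + 10
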
@pytest.mark.parametrize("shuffle", [False, True])
def test_bucketing_sampler_cut_pairs_equal_duration(shuffle):
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=1000)
    for idx, c in enumerate(cut_set):
        # Each cut has a different duration in the range [3, 23).
        c.duration = 3 + idx * 1 / 50
    # The target CutSet is going to have different durations
    # -- make sure the bucketing works well with that.
    cut_set_tgt = cut_set.map(lambda c: fastcopy(c, duration=1 / c.duration))

    sampler = BucketingSampler(
        cut_set,
        cut_set_tgt,
        sampler_type=CutPairsSampler,
        bucket_method="equal_duration",
        num_buckets=10,
        shuffle=shuffle,
    )

    # Ensure that each consecutive bucket has fewer cuts than the previous one
    # (durations grow, so equal-duration buckets need fewer of the longer cuts).
    prev_len = float("inf")
    bucket_cum_durs = []
    for bucket_src, bucket_tgt in sampler.buckets:
        assert list(bucket_src.ids) == list(bucket_tgt.ids)
        bucket_cum_durs.append(sum(c.duration for c in bucket_src))
        curr_len = len(bucket_src)
        assert curr_len < prev_len
        prev_len = curr_len

    # Assert that all bucket cumulative durations are within 10% of the mean.
    mean_bucket_dur = mean(bucket_cum_durs)  # ~1300s
    for d in bucket_cum_durs:
        assert abs(d - mean_bucket_dur) < 0.1 * mean_bucket_dur

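# Where the "~1300s" figure above comes from (a quick sanity check, not part
# of the test suite): the source durations are 3 + idx / 50 for idx in
# [0, 1000), so they total 3000 + (999 * 1000 / 2) / 50 = 12990s, which is
# about 1299s per bucket when split into 10 equal-duration buckets.
def _expected_mean_bucket_dur(num_cuts: int = 1000, num_buckets: int = 10) -> float:
    return sum(3 + idx / 50 for idx in range(num_cuts)) / num_buckets  # ~1299.0
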
def test_single_cut_sampler_lazy_shuffle(sampler_cls):
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)

        sampler = sampler_cls(
            lazy_cuts,
            shuffle=True,
            # Set an effective batch size of 10 cuts, as all cuts are 1s long.
            # This way we're testing that it works okay when returning multiple
            # batches in a full epoch.
            max_duration=10.0,
        )
        sampled_cuts = []
        for batch in sampler:
            sampled_cuts.extend(batch)

        # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet.
        assert len(sampled_cuts) == len(cut_set)
        # Invariant 2: the items are not duplicated.
        assert len(set(c.id for c in sampled_cuts)) == len(sampled_cuts)
        # Invariant 3: the items are shuffled.
        assert [c.id for c in sampled_cuts] != [c.id for c in lazy_cuts]

def test_single_cut_sampler_time_constraints(
    max_duration, max_frames, max_samples, exception_expectation
):
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    if max_frames is None:
        cut_set = cut_set.drop_features()

    with exception_expectation:
        sampler = SimpleCutSampler(
            cut_set,
            shuffle=True,
            # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
            # This way we're testing that it works okay when returning multiple batches in
            # a full epoch.
            max_frames=max_frames,
            max_samples=max_samples,
            max_duration=max_duration,
        )
        sampled_cuts = []
        for batch in sampler:
            sampled_cuts.extend(batch)

        # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet.
        assert len(sampled_cuts) == len(cut_set)
        # Invariant 2: the items are not duplicated.
        assert len(set(c.id for c in sampled_cuts)) == len(sampled_cuts)
        # Invariant 3: the items are shuffled, i.e. their order differs from the order in the CutSet.
        assert [c.id for c in sampled_cuts] != [c.id for c in cut_set]

@pytest.mark.parametrize("shuffle", [False, True])
def test_bucketing_sampler_cut_pairs_equal_len(shuffle):
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=1000)
    for idx, c in enumerate(cut_set):
        # Each cut has a different duration in the range [3, 23).
        c.duration = 3 + idx * 1 / 50
    # The target CutSet is going to have different durations
    # -- make sure the bucketing works well with that.
    cut_set_tgt = cut_set.map(lambda c: fastcopy(c, duration=1 / c.duration))

    sampler = BucketingSampler(
        cut_set,
        cut_set_tgt,
        sampler_type=CutPairsSampler,
        bucket_method="equal_len",
        num_buckets=10,
        shuffle=shuffle,
    )

    bucket_cum_durs = []
    for bucket_src, bucket_tgt in sampler.buckets:
        bucket_cum_durs.append(sum(c.duration for c in bucket_src))
        assert len(bucket_src) == 100
        assert list(bucket_src.ids) == list(bucket_tgt.ids)

    # With equal-length buckets, the cumulative durations vary by more than
    # 10% of the mean bucket duration.
    mean_bucket_dur = mean(bucket_cum_durs)
    assert not all(
        abs(d - mean_bucket_dur) < 0.1 * mean_bucket_dur for d in bucket_cum_durs
    )

def test_round_robin_sampler():
    cuts1 = DummyManifest(CutSet, begin_id=0, end_id=30)
    cuts2 = DummyManifest(CutSet, begin_id=1000, end_id=1100)
    sampler = RoundRobinSampler(
        # Note: each cut is 1s long in this test.
        SimpleCutSampler(cuts1, max_duration=10),
        SimpleCutSampler(cuts2, max_duration=2),
    )

    batches = [b for b in sampler]
    assert len(batches) == 3 + 50

    batches_10cuts = [b for b in batches if len(b) == 10]
    assert len(batches_10cuts) == 3

    batches_2cuts = [b for b in batches if len(b) == 2]
    assert len(batches_2cuts) == 50

    # The sub-samplers alternate until the first one is exhausted,
    # then the second one continues on its own.
    assert len(batches[0]) == 10
    assert len(batches[1]) == 2
    assert len(batches[2]) == 10
    assert len(batches[3]) == 2
    assert len(batches[4]) == 10
    assert len(batches[5]) == 2
    assert len(batches[6]) == 2
    assert len(batches[7]) == 2
    assert len(batches[8]) == 2
    assert len(batches[9]) == 2

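# A minimal sketch of the interleaving pattern asserted above (an
# illustration of round-robin semantics, not RoundRobinSampler itself):
# sub-iterators are cycled through, and an exhausted one simply drops out,
# so after the 3 large batches only the 2-cut batches remain.
def _round_robin(*iterables):
    iterators = [iter(it) for it in iterables]
    while iterators:
        for it in list(iterators):
            try:
                yield next(it)
            except StopIteration:
                iterators.remove(it)
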
def test_remove_missing_recordings_and_supervisions():
    recordings = DummyManifest(RecordingSet, begin_id=0, end_id=100)
    supervisions = DummyManifest(SupervisionSet, begin_id=50, end_id=150)

    fix_recs, fix_sups = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )

    # Only the overlapping id range [50, 100) survives on both sides.
    expected_ids = [f"dummy-recording-{idx:04d}" for idx in range(50, 100)]
    assert [r.id for r in fix_recs] == expected_ids
    assert [s.recording_id for s in fix_sups] == expected_ids

def test_cut_set_subset_cut_ids_preserves_order():
    cuts = DummyManifest(CutSet, begin_id=0, end_id=1000)
    cut_ids = ["dummy-cut-0010", "dummy-cut-0171", "dummy-cut-0009"]
    subcuts = cuts.subset(cut_ids=cut_ids)
    cut1, cut2, cut3 = subcuts
    assert cut1.id == "dummy-cut-0010"
    assert cut2.id == "dummy-cut-0171"
    assert cut3.id == "dummy-cut-0009"

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_combine(manifest_type):
    expected = DummyManifest(manifest_type, begin_id=0, end_id=200)
    combined = combine(
        DummyManifest(manifest_type, begin_id=0, end_id=68),
        DummyManifest(manifest_type, begin_id=68, end_id=136),
        DummyManifest(manifest_type, begin_id=136, end_id=200),
    )
    assert combined == expected

def test_multiplexer_with_cuts_pickling():
    cuts1 = DummyManifest(CutSet, begin_id=0, end_id=10)
    cuts2 = DummyManifest(CutSet, begin_id=1000, end_id=1005)
    mux = LazyIteratorMultiplexer(cuts1, cuts2, seed=0)

    data = pickle.dumps(mux)
    mux_rec = pickle.loads(data)

    assert list(mux) == list(mux_rec)

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_split_randomize(manifest_type):
    manifest = DummyManifest(manifest_type, begin_id=0, end_id=100)
    manifest_subsets = manifest.split(num_splits=2, shuffle=True)
    assert len(manifest_subsets) == 2

    recombined_items = list(manifest_subsets[0]) + list(manifest_subsets[1])
    assert len(recombined_items) == len(manifest)
    # Different ordering (we convert to lists first because the *Set classes
    # might internally re-order after concatenation, e.g. by using a dict or
    # post-init sorting).
    assert recombined_items != list(manifest)

def test_bucketing_sampler_raises_value_error_on_lazy_cuts_input():
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=2)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)

        with pytest.raises(ValueError):
            sampler = BucketingSampler(
                lazy_cuts,
                max_duration=10.0,
            )

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_split_even(manifest_type):
    manifest = DummyManifest(manifest_type, begin_id=0, end_id=100)
    manifest_subsets = manifest.split(num_splits=2)
    assert len(manifest_subsets) == 2
    assert manifest_subsets[0] == DummyManifest(manifest_type, begin_id=0, end_id=50)
    assert manifest_subsets[1] == DummyManifest(manifest_type, begin_id=50, end_id=100)

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_combine_lazy(manifest_type):
    expected = DummyManifest(manifest_type, begin_id=0, end_id=200)
    with as_lazy(DummyManifest(manifest_type, begin_id=0, end_id=68)) as part1, as_lazy(
        DummyManifest(manifest_type, begin_id=68, end_id=136)
    ) as part2, as_lazy(
        DummyManifest(manifest_type, begin_id=136, end_id=200)
    ) as part3:
        combined = combine(part1, part2, part3)
        # Equivalent under iteration.
        assert list(combined) == list(expected)

def test_lazy_cuts_combine_split_issue():
    # Test for lack of exception.
    cuts = DummyManifest(CutSet, begin_id=0, end_id=1000)
    with TemporaryDirectory() as d, NamedTemporaryFile(suffix=".jsonl.gz") as f:
        cuts.to_file(f.name)
        f.flush()
        cuts_lazy = load_manifest_lazy(f.name)
        cuts_lazy = combine(cuts_lazy, cuts_lazy.perturb_speed(0.9))
        cuts_lazy.split_lazy(d, chunk_size=100)

def test_bucketing_sampler_cut_pairs():
    cut_set1 = DummyManifest(CutSet, begin_id=0, end_id=1000)
    cut_set2 = DummyManifest(CutSet, begin_id=0, end_id=1000)
    sampler = BucketingSampler(cut_set1, cut_set2, sampler_type=CutPairsSampler)

    src_cuts, tgt_cuts = [], []
    for src_batch, tgt_batch in sampler:
        src_cuts.extend(src_batch)
        tgt_cuts.extend(tgt_batch)
    assert set(cut_set1.ids) == set(c.id for c in src_cuts)
    assert set(cut_set2.ids) == set(c.id for c in tgt_cuts)

def test_cut_set_mux_stop_early():
    cuts1 = DummyManifest(CutSet, begin_id=0, end_id=10)
    cuts2 = DummyManifest(CutSet, begin_id=1000, end_id=1005)

    cuts_mux = CutSet.mux(cuts1, cuts2, seed=0, stop_early=True)

    def cid(i: int) -> str:
        return f"dummy-cut-{i:04d}"

    # Mixing stops as soon as the shorter CutSet is exhausted...
    assert sorted([c.id for c in cuts_mux]) == [
        cid(i) for i in (0, 1, 2, 3, 4, 1000, 1001, 1002, 1003, 1004)
    ]
    # ...and the outputs are interleaved rather than concatenated.
    assert sorted([c.id for c in cuts_mux]) != [c.id for c in cuts_mux]

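# A sketch of the multiplexing semantics exercised above (an illustration,
# not lhotse's implementation): streams are drawn from at random, and with
# stop_early=True the mux ends as soon as any stream is exhausted; in the
# test, seed=0 leaves exactly the first 5 cuts from each input.
def _mux_stop_early(*streams, seed: int = 0):
    rng = random.Random(seed)
    iterators = [iter(s) for s in streams]
    while True:
        chosen = rng.choice(iterators)
        try:
            yield next(chosen)
        except StopIteration:
            return
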
@pytest.mark.parametrize("overwrite", [False, True])
def test_sequential_jsonl_writer_overwrite(overwrite):
    cuts = DummyManifest(CutSet, begin_id=0, end_id=100)
    half = cuts.split(num_splits=2)[0]
    with NamedTemporaryFile(suffix=".jsonl") as jsonl_f:
        # Store the first half.
        half.to_file(jsonl_f.name)

        # Open the sequential writer.
        with CutSet.open_writer(jsonl_f.name, overwrite=overwrite) as writer:
            if overwrite:
                assert all(not writer.contains(id_) for id_ in half.ids)
            else:
                assert all(writer.contains(id_) for id_ in half.ids)

def test_bucketing_sampler_chooses_buckets_randomly():
    # Construct a CutSet that has 1000 cuts with 100 unique durations.
    # This makes it simple to track which bucket was selected.
    cut_set = CutSet({})  # empty
    for i in range(100):
        new_cuts = DummyManifest(CutSet, begin_id=i * 10, end_id=(i + 1) * 10)
        for c in new_cuts:
            c.duration = i
        cut_set = cut_set + new_cuts

    # A sampler that always selects a single cut.
    sampler = BucketingSampler(
        cut_set,
        sampler_type=SimpleCutSampler,
        max_cuts=1,
        max_frames=1000000000,
        num_buckets=100,
    )

    # Batches of 1 guarantee that each item is a single-element batch of cuts.
    durations = [cut_set[item[0].id].duration for item in sampler]

    # This is the "trick" part -- 'groupby' groups the cuts together by their
    # duration. If there is a group of size 10, the same bucket was chosen for
    # 10 consecutive batches, which is not what BucketingSampler is supposed
    # to do (the probability of that is extremely low). We actually set the
    # threshold lower, at 8, which should never be triggered anyway.
    lens = []
    for key, group in groupby(durations):
        lens.append(len(list(group)))
    assert all(l < 8 for l in lens)

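# The groupby-based run-length counting used above, in isolation (a sketch):
# consecutive equal values collapse into one group, so a long run means the
# sampler kept drawing from the same bucket over and over.
def _run_lengths(values):
    return [len(list(group)) for _key, group in groupby(values)]
# e.g. _run_lengths([1, 1, 2, 2, 2, 5]) == [2, 3, 1]
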
@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_shuffle(manifest_type):
    data = DummyManifest(manifest_type, begin_id=0, end_id=4)
    for idx, item in enumerate(data):
        item.duration = idx
    # The expected order of durations after shuffling with this seed.
    expected_durations = [2, 1, 3, 0]

    rng = random.Random(42)
    eager_result = data.shuffle(rng=rng)
    assert [item.duration for item in eager_result] == list(expected_durations)

    # Re-seed so the lazy path sees the same permutation.
    rng = random.Random(42)
    with as_lazy(data) as lazy_data:
        lazy_result = lazy_data.shuffle(rng=rng)
        assert [item.duration for item in lazy_result] == list(expected_durations)

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_repeat_infinite(manifest_type):
    data = DummyManifest(manifest_type, begin_id=0, end_id=10)

    # It's hard to test infinite iterables, so we iterate about 10x more
    # times than the original size.
    eager_result = data.repeat()
    for idx, item in enumerate(eager_result):
        if idx == 105:
            break
    assert idx == 105

    with as_lazy(data) as lazy_data:
        lazy_result = lazy_data.repeat()
        for idx, item in enumerate(lazy_result):
            if idx == 105:
                break
        assert idx == 105

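# A note on the pattern above: with times=None the repeated manifest is an
# infinite stream, so iteration has to be bounded externally; an equivalent
# of the manual break counter is itertools.islice (sketch below).
def _first_n(iterable, n: int):
    from itertools import islice
    return list(islice(iterable, n))
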
@pytest.mark.parametrize("randomized", [False, True])
@pytest.mark.parametrize("preserve_id", [False, True])
def test_extra_padding_frames(randomized: bool, preserve_id: bool):
    cuts = DummyManifest(CutSet, begin_id=0, end_id=10)
    transform = ExtraPadding(
        extra_frames=4, randomized=randomized, preserve_id=preserve_id
    )
    padded_cuts = transform(cuts)

    # Non-randomized test -- check that all cuts are processed in the same way.
    if not randomized:
        for cut, padded in zip(cuts, padded_cuts):
            # The first track is padding.
            assert padded.tracks[0].cut.num_frames == 2
            # The last track is padding.
            assert padded.tracks[-1].cut.num_frames == 2
            # The total number of frames checks out.
            assert padded.num_frames == cut.num_frames + 4

    # Randomized test -- check that the cuts have different properties.
    if randomized:
        nums_frames = [c.num_frames for c in padded_cuts]
        assert len(set(nums_frames)) > 1

    if preserve_id:
        assert all(cut.id == cut_pad.id for cut, cut_pad in zip(cuts, padded_cuts))
    else:
        # Note: using any(), not all(), since some cuts may be unaffected
        # when the transform is randomized.
        assert any(cut.id != cut_pad.id for cut, cut_pad in zip(cuts, padded_cuts))

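# The padding arithmetic asserted above, as a sketch: the non-randomized
# ExtraPadding is expected to split extra_frames symmetrically, half before
# and half after the cut, so each padded cut gains extra_frames in total.
def _padded_num_frames(num_frames: int, extra_frames: int = 4) -> int:
    per_side = extra_frames // 2
    return per_side + num_frames + per_side
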
def test_k2_speech_recognition_iterable_dataset_shuffling():
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)

    dataset = K2SpeechRecognitionDataset(
        return_cuts=True,
        cut_transforms=[
            CutConcatenate(),
        ],
    )
    sampler = SingleCutSampler(
        cut_set,
        shuffle=True,
        # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
        # This way we're testing that it works okay when returning multiple batches in
        # a full epoch.
        max_frames=1000,
    )
    dloader = DataLoader(dataset, batch_size=None, sampler=sampler, num_workers=2)

    dloader_cut_ids = []
    batches = []
    for batch in dloader:
        batches.append(batch)
        dloader_cut_ids.extend(c.id for c in batch["supervisions"]["cut"])

    # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet.
    assert len(dloader_cut_ids) == len(cut_set)
    # Invariant 2: the items are not duplicated.
    assert len(set(dloader_cut_ids)) == len(dloader_cut_ids)
    # Invariant 3: the items are shuffled, i.e. their order differs from the order in the CutSet.
    assert dloader_cut_ids != [c.id for c in cut_set]

def test_bucketing_sampler_time_constraints(constraint):
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=1000)
    sampler = BucketingSampler(cut_set, sampler_type=SimpleCutSampler, **constraint)
    sampled_cuts = []
    for batch in sampler:
        sampled_cuts.extend(batch)
    assert set(cut_set.ids) == set(c.id for c in sampled_cuts)

def test_cut_pairs_sampler_filter():
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    sampler = CutPairsSampler(
        cut_set,
        cut_set,
        shuffle=True,
        # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
        # This way we're testing that it works okay when returning multiple batches in
        # a full epoch.
        max_source_frames=1000,
    )
    removed_cut_id = "dummy-cut-0010"
    sampler.filter(lambda cut: cut.id != removed_cut_id)

    source_cuts, target_cuts = [], []
    for src_batch, tgt_batch in sampler:
        source_cuts.extend(src_batch)
        target_cuts.extend(tgt_batch)

    # The filtered cut is gone from the sampled batches, but not from the CutSet.
    assert removed_cut_id in set(cut_set.ids)
    assert removed_cut_id not in set(c.id for c in source_cuts)

    # Invariant 1: we receive the same number of items in a dataloader epoch as there
    # were in the CutSet, minus the filtered item.
    assert len(source_cuts) == len(cut_set) - 1
    assert len(target_cuts) == len(cut_set) - 1
    # Invariant 2: the items are not duplicated.
    assert len(set(c.id for c in source_cuts)) == len(source_cuts)
    assert len(set(c.id for c in target_cuts)) == len(target_cuts)