def test_cut_pairs_sampler_lazy_shuffle(sampler_cls):
    """Check that a pair sampler shuffles a lazily opened CutSet correctly.

    The same lazy manifest is passed as both the source and the target cuts.
    Over one full pass the sampler must keep source/target IDs aligned in
    every batch, yield every cut exactly once, and yield them in an order
    that differs from the manifest order.
    """
    # Every dummy cut has a duration of 1 second.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    with NamedTemporaryFile(suffix=".jsonl") as manifest_file:
        cut_set.to_jsonl(manifest_file.name)
        lazy_cuts = CutSet.from_jsonl_lazy(manifest_file.name)

        # All cuts are 1s == 100 frames, so a budget of 1000 source frames
        # gives an effective batch size of 10 cuts. A full epoch therefore
        # spans multiple batches, which is what we want to exercise here.
        sampler = sampler_cls(
            lazy_cuts,
            lazy_cuts,
            shuffle=True,
            max_source_frames=1000,
        )

        seen_sources = []
        seen_targets = []
        for sources, targets in sampler:
            # Invariant 0: source and target cut IDs appear in the same
            # order within each batch.
            assert list(sources.ids) == list(targets.ids)
            seen_sources.extend(sources)
            seen_targets.extend(targets)

        # Invariant 1: one epoch yields exactly as many items as there
        # were in the CutSet.
        expected_count = len(cut_set)
        assert len(seen_sources) == expected_count
        assert len(seen_targets) == expected_count

        # Invariant 2: no item is duplicated.
        source_ids = [cut.id for cut in seen_sources]
        assert len(set(source_ids)) == len(seen_sources)

        # Invariant 3: the items come out shuffled relative to the manifest.
        manifest_ids = [cut.id for cut in lazy_cuts]
        assert source_ids != manifest_ids
def test_single_cut_sampler_lazy_shuffle(sampler_cls):
    """Check that a single-cut sampler shuffles a lazily opened CutSet correctly.

    Over one full pass the sampler must yield every cut exactly once, in an
    order that differs from the manifest order.
    """
    # Every dummy cut has a duration of 1 second.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    with NamedTemporaryFile(suffix=".jsonl") as manifest_file:
        cut_set.to_jsonl(manifest_file.name)
        lazy_cuts = CutSet.from_jsonl_lazy(manifest_file.name)

        # All cuts last 1s, so a 10s duration budget gives an effective
        # batch size of 10 cuts. A full epoch therefore spans multiple
        # batches, which is what we want to exercise here.
        sampler = sampler_cls(
            lazy_cuts,
            shuffle=True,
            max_duration=10.0,
        )

        seen_cuts = []
        for cuts_in_batch in sampler:
            seen_cuts.extend(cuts_in_batch)

        # Invariant 1: one epoch yields exactly as many items as there
        # were in the CutSet.
        assert len(seen_cuts) == len(cut_set)

        # Invariant 2: no item is duplicated.
        seen_ids = [cut.id for cut in seen_cuts]
        assert len(set(seen_ids)) == len(seen_cuts)

        # Invariant 3: the items come out shuffled relative to the manifest.
        manifest_ids = [cut.id for cut in lazy_cuts]
        assert seen_ids != manifest_ids
def test_bucketing_sampler_raises_value_error_on_lazy_cuts_input():
    """Check that BucketingSampler raises ValueError for a lazily opened CutSet."""
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=2)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)
        # Only the constructor call is under test, so its result is
        # deliberately not bound to a name.
        with pytest.raises(ValueError):
            BucketingSampler(
                lazy_cuts,
                max_duration=10.0,
            )