# NOTE: the imports and @pytest.mark.parametrize decorators below are
# reconstructed so the tests are self-contained; the module paths and the
# parametrize values are assumed (inferred from the test bodies) and may
# need adjusting to the actual test setup.
import pickle
import random
from itertools import groupby
from statistics import mean
from tempfile import NamedTemporaryFile, TemporaryDirectory

import pytest
from torch.utils.data import DataLoader

from lhotse import (
    CutSet,
    FeatureSet,
    RecordingSet,
    SupervisionSet,
    combine,
    load_manifest_lazy,
)
from lhotse.cut import MixedCut
from lhotse.dataset import (
    CutConcatenate,
    CutMix,
    ExtraPadding,
    K2SpeechRecognitionDataset,
)
from lhotse.dataset.sampling import (
    BucketingSampler,
    CutPairsSampler,
    RoundRobinSampler,
    SimpleCutSampler,
    SingleCutSampler,
)
from lhotse.lazy import LazyIteratorMultiplexer
from lhotse.qa import remove_missing_recordings_and_supervisions
from lhotse.testing.dummies import DummyManifest, as_lazy
from lhotse.utils import fastcopy


@pytest.mark.parametrize("preserve_id", [False, True])
def test_cutmix(preserve_id: bool):
    speech_cuts = DummyManifest(CutSet, begin_id=0, end_id=10)
    for c in speech_cuts:
        c.duration = 10.0
    noise_cuts = DummyManifest(CutSet, begin_id=100, end_id=102)
    for c in noise_cuts:
        c.duration = 1.5

    tfnm = CutMix(noise_cuts, snr=None, prob=1.0, preserve_id=preserve_id)

    tfnm_cuts = tfnm(speech_cuts)
    for c in tfnm_cuts:
        assert isinstance(c, MixedCut)
        assert c.tracks[0].cut.duration == 10.0
        assert sum(t.cut.duration for t in c.tracks[1:]) == 10.0

    if preserve_id:
        assert all(
            cut.id == cut_noisy.id for cut, cut_noisy in zip(speech_cuts, tfnm_cuts)
        )
    else:
        assert all(
            cut.id != cut_noisy.id for cut, cut_noisy in zip(speech_cuts, tfnm_cuts)
        )

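# A sketch (not lhotse code) of the noise-coverage arithmetic that the
# assertion above implies: 1.5s noise cuts are tiled across the 10s speech
# cut, with the final chunk truncated, so the noise track durations sum to
# exactly the speech duration.
def _tiled_noise_durations(speech_dur: float = 10.0, noise_dur: float = 1.5):
    durs = []
    remaining = speech_dur
    while remaining > 1e-9:
        durs.append(min(noise_dur, remaining))
        remaining -= durs[-1]
    return durs  # six full 1.5s chunks plus a 1.0s remainder; sum == 10.0
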
def test_cut_pairs_sampler_lazy_shuffle(sampler_cls):
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)

        sampler = sampler_cls(
            lazy_cuts,
            lazy_cuts,
            shuffle=True,
            # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
            # This way we're testing that it works okay when returning multiple batches in
            # a full epoch.
            max_source_frames=1000,
        )
        sampled_src_cuts = []
        sampled_tgt_cuts = []
        for src_batch, tgt_batch in sampler:
            # Invariant 0: the order of source and target cut IDs is preserved within each batch.
            assert list(src_batch.ids) == list(tgt_batch.ids)
            sampled_src_cuts.extend(src_batch)
            sampled_tgt_cuts.extend(tgt_batch)

        # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet.
        assert len(sampled_src_cuts) == len(cut_set)
        assert len(sampled_tgt_cuts) == len(cut_set)
        # Invariant 2: the items are not duplicated.
        assert len(set(c.id for c in sampled_src_cuts)) == len(sampled_src_cuts)
        # Invariant 3: the items are shuffled.
        assert [c.id for c in sampled_src_cuts] != [c.id for c in lazy_cuts]

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
@pytest.mark.parametrize("preserve_id", [True, False])
def test_repeat(manifest_type, preserve_id):
    data = DummyManifest(manifest_type, begin_id=0, end_id=10)
    expected = data + data

    eager_result = data.repeat(times=2, preserve_id=preserve_id)
    if preserve_id or manifest_type == FeatureSet:
        assert list(eager_result) == list(expected)
    else:
        items = list(eager_result)
        ref_items = list(expected)
        assert len(items) == len(ref_items)
        for i, refi in zip(items, ref_items):
            assert i.id.endswith("_repeat0") or i.id.endswith("_repeat1")
            i_modi = fastcopy(i, id=refi.id)
            assert i_modi == refi

    with as_lazy(data) as lazy_data:
        lazy_result = lazy_data.repeat(times=2, preserve_id=preserve_id)
        if preserve_id or manifest_type == FeatureSet:
            assert list(lazy_result) == list(expected)
        else:
            items = list(lazy_result)
            ref_items = list(expected)
            assert len(items) == len(ref_items)
            for i, refi in zip(items, ref_items):
                assert i.id.endswith("_repeat0") or i.id.endswith("_repeat1")
                i_modi = fastcopy(i, id=refi.id)
                assert i_modi == refi

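# A minimal sketch of what the as_lazy helper used above might look like
# (an assumption, not lhotse's actual implementation): serialize the eager
# manifest to a temporary JSONL file and re-open it lazily, so the eager and
# lazy code paths of each test see the same data.
from contextlib import contextmanager


@contextmanager
def _as_lazy_sketch(manifest):
    with NamedTemporaryFile(suffix=".jsonl") as f:
        manifest.to_file(f.name)
        yield load_manifest_lazy(f.name)
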
def test_bucketing_sampler_buckets_have_different_durations():
    cut_set_1s = DummyManifest(CutSet, begin_id=0, end_id=10)
    cut_set_2s = DummyManifest(CutSet, begin_id=10, end_id=20)
    for c in cut_set_2s:
        c.duration = 2.0
    cut_set = cut_set_1s + cut_set_2s

    # The bucketing sampler should return 5 batches with two 1s cuts,
    # and 10 batches with one 2s cut.
    sampler = BucketingSampler(
        cut_set, sampler_type=SingleCutSampler, max_frames=200, num_buckets=2
    )
    batches = list(sampler)
    assert len(batches) == 15

    # Within each batch, all cuts have the same duration
    # (i.e. they come from the same bucket).
    for batch in batches:
        batch_durs = [c.duration for c in batch]
        assert all(d == batch_durs[0] for d in batch_durs)

    batches = sorted(batches, key=len)
    assert all(len(b) == 1 for b in batches[:10])
    assert all(len(b) == 2 for b in batches[10:])

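# A sketch of the batch-count arithmetic behind the assertion above, assuming
# the dummy cuts have 100 frames per second: max_frames=200 fits two 1s cuts
# (2 x 100 frames) or one 2s cut (200 frames), i.e. 10/2 + 10/1 == 15 batches.
def _expected_num_batches(frames_per_cut=(100, 200), cuts_per_dur=10, max_frames=200):
    return sum(cuts_per_dur // (max_frames // f) for f in frames_per_cut)  # 5 + 10
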
@pytest.mark.parametrize("shuffle", [False, True])
def test_bucketing_sampler_cut_pairs_equal_duration(shuffle):
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=1000)
    for idx, c in enumerate(cut_set):
        # Each cut has a different duration in the range [3, 23).
        c.duration = 3 + idx * 1 / 50
    # The target CutSet is going to have different durations
    # -- make sure the bucketing works well with that.
    cut_set_tgt = cut_set.map(lambda c: fastcopy(c, duration=1 / c.duration))

    sampler = BucketingSampler(
        cut_set,
        cut_set_tgt,
        sampler_type=CutPairsSampler,
        bucket_method="equal_duration",
        num_buckets=10,
        shuffle=shuffle,
    )

    # Ensure that each consecutive bucket has fewer cuts than the previous one
    # (durations grow, so equal-duration buckets need fewer of the longer cuts).
    prev_len = float("inf")
    bucket_cum_durs = []
    for bucket_src, bucket_tgt in sampler.buckets:
        assert list(bucket_src.ids) == list(bucket_tgt.ids)
        bucket_cum_durs.append(sum(c.duration for c in bucket_src))
        curr_len = len(bucket_src)
        assert curr_len < prev_len
        prev_len = curr_len

    # Assert that all bucket cumulative durations are within 10% of the mean.
    mean_bucket_dur = mean(bucket_cum_durs)  # ~1300s
    for d in bucket_cum_durs:
        assert abs(d - mean_bucket_dur) < 0.1 * mean_bucket_dur

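# Where the "~1300s" figure above comes from (a quick sanity check, not part
# of the test suite): the source durations are 3 + idx / 50 for idx in
# [0, 1000), so they total 3000 + (999 * 1000 / 2) / 50 = 12990s, which is
# about 1299s per bucket when split into 10 equal-duration buckets.
def _expected_mean_bucket_dur(num_cuts: int = 1000, num_buckets: int = 10) -> float:
    return sum(3 + idx / 50 for idx in range(num_cuts)) / num_buckets  # ~1299.0
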
def test_single_cut_sampler_lazy_shuffle(sampler_cls):
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)

        sampler = sampler_cls(
            lazy_cuts,
            shuffle=True,
            # Set an effective batch size of 10 cuts, as all cuts are 1s long.
            # This way we're testing that it works okay when returning multiple
            # batches in a full epoch.
            max_duration=10.0,
        )
        sampled_cuts = []
        for batch in sampler:
            sampled_cuts.extend(batch)

        # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet.
        assert len(sampled_cuts) == len(cut_set)
        # Invariant 2: the items are not duplicated.
        assert len(set(c.id for c in sampled_cuts)) == len(sampled_cuts)
        # Invariant 3: the items are shuffled.
        assert [c.id for c in sampled_cuts] != [c.id for c in lazy_cuts]

def test_single_cut_sampler_time_constraints(
    max_duration, max_frames, max_samples, exception_expectation
):
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    if max_frames is None:
        cut_set = cut_set.drop_features()

    with exception_expectation:
        sampler = SimpleCutSampler(
            cut_set,
            shuffle=True,
            # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
            # This way we're testing that it works okay when returning multiple batches in
            # a full epoch.
            max_frames=max_frames,
            max_samples=max_samples,
            max_duration=max_duration,
        )
        sampled_cuts = []
        for batch in sampler:
            sampled_cuts.extend(batch)

        # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet.
        assert len(sampled_cuts) == len(cut_set)
        # Invariant 2: the items are not duplicated.
        assert len(set(c.id for c in sampled_cuts)) == len(sampled_cuts)
        # Invariant 3: the items are shuffled, i.e. their order differs from the order in the CutSet.
        assert [c.id for c in sampled_cuts] != [c.id for c in cut_set]

@pytest.mark.parametrize("shuffle", [False, True])
def test_bucketing_sampler_cut_pairs_equal_len(shuffle):
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=1000)
    for idx, c in enumerate(cut_set):
        # Each cut has a different duration in the range [3, 23).
        c.duration = 3 + idx * 1 / 50
    # The target CutSet is going to have different durations
    # -- make sure the bucketing works well with that.
    cut_set_tgt = cut_set.map(lambda c: fastcopy(c, duration=1 / c.duration))

    sampler = BucketingSampler(
        cut_set,
        cut_set_tgt,
        sampler_type=CutPairsSampler,
        bucket_method="equal_len",
        num_buckets=10,
        shuffle=shuffle,
    )

    bucket_cum_durs = []
    for bucket_src, bucket_tgt in sampler.buckets:
        bucket_cum_durs.append(sum(c.duration for c in bucket_src))
        assert len(bucket_src) == 100
        assert list(bucket_src.ids) == list(bucket_tgt.ids)

    # With equal-length buckets, the cumulative durations vary by more than
    # 10% of the mean bucket duration.
    mean_bucket_dur = mean(bucket_cum_durs)
    assert not all(
        abs(d - mean_bucket_dur) < 0.1 * mean_bucket_dur for d in bucket_cum_durs
    )

def test_round_robin_sampler():
    cuts1 = DummyManifest(CutSet, begin_id=0, end_id=30)
    cuts2 = DummyManifest(CutSet, begin_id=1000, end_id=1100)
    sampler = RoundRobinSampler(
        # Note: each cut is 1s long in this test.
        SimpleCutSampler(cuts1, max_duration=10),
        SimpleCutSampler(cuts2, max_duration=2),
    )

    batches = [b for b in sampler]
    assert len(batches) == 3 + 50

    batches_10cuts = [b for b in batches if len(b) == 10]
    assert len(batches_10cuts) == 3

    batches_2cuts = [b for b in batches if len(b) == 2]
    assert len(batches_2cuts) == 50

    # The sub-samplers alternate until the first one is exhausted,
    # then the second one continues on its own.
    assert len(batches[0]) == 10
    assert len(batches[1]) == 2
    assert len(batches[2]) == 10
    assert len(batches[3]) == 2
    assert len(batches[4]) == 10
    assert len(batches[5]) == 2
    assert len(batches[6]) == 2
    assert len(batches[7]) == 2
    assert len(batches[8]) == 2
    assert len(batches[9]) == 2

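# A minimal sketch of the interleaving pattern asserted above (an
# illustration of round-robin semantics, not RoundRobinSampler itself):
# sub-iterators are cycled through, and an exhausted one simply drops out,
# so after the 3 large batches only the 2-cut batches remain.
def _round_robin(*iterables):
    iterators = [iter(it) for it in iterables]
    while iterators:
        for it in list(iterators):
            try:
                yield next(it)
            except StopIteration:
                iterators.remove(it)
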
def test_remove_missing_recordings_and_supervisions():
    recordings = DummyManifest(RecordingSet, begin_id=0, end_id=100)
    supervisions = DummyManifest(SupervisionSet, begin_id=50, end_id=150)

    fix_recs, fix_sups = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )

    # Only the overlapping id range [50, 100) survives on both sides.
    expected_ids = [f"dummy-recording-{idx:04d}" for idx in range(50, 100)]
    assert [r.id for r in fix_recs] == expected_ids
    assert [s.recording_id for s in fix_sups] == expected_ids

def test_cut_set_subset_cut_ids_preserves_order():
    cuts = DummyManifest(CutSet, begin_id=0, end_id=1000)
    cut_ids = ["dummy-cut-0010", "dummy-cut-0171", "dummy-cut-0009"]
    subcuts = cuts.subset(cut_ids=cut_ids)
    cut1, cut2, cut3 = subcuts
    assert cut1.id == "dummy-cut-0010"
    assert cut2.id == "dummy-cut-0171"
    assert cut3.id == "dummy-cut-0009"

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_combine(manifest_type):
    expected = DummyManifest(manifest_type, begin_id=0, end_id=200)
    combined = combine(
        DummyManifest(manifest_type, begin_id=0, end_id=68),
        DummyManifest(manifest_type, begin_id=68, end_id=136),
        DummyManifest(manifest_type, begin_id=136, end_id=200),
    )
    assert combined == expected

def test_multiplexer_with_cuts_pickling():
    cuts1 = DummyManifest(CutSet, begin_id=0, end_id=10)
    cuts2 = DummyManifest(CutSet, begin_id=1000, end_id=1005)
    mux = LazyIteratorMultiplexer(cuts1, cuts2, seed=0)

    data = pickle.dumps(mux)
    mux_rec = pickle.loads(data)

    assert list(mux) == list(mux_rec)

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_split_randomize(manifest_type):
    manifest = DummyManifest(manifest_type, begin_id=0, end_id=100)
    manifest_subsets = manifest.split(num_splits=2, shuffle=True)
    assert len(manifest_subsets) == 2

    recombined_items = list(manifest_subsets[0]) + list(manifest_subsets[1])
    assert len(recombined_items) == len(manifest)
    # Different ordering (we convert to lists first because the *Set classes
    # might internally re-order after concatenation, e.g. by using a dict or
    # post-init sorting).
    assert recombined_items != list(manifest)

def test_bucketing_sampler_raises_value_error_on_lazy_cuts_input():
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=2)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)

        with pytest.raises(ValueError):
            sampler = BucketingSampler(
                lazy_cuts,
                max_duration=10.0,
            )

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_split_even(manifest_type):
    manifest = DummyManifest(manifest_type, begin_id=0, end_id=100)
    manifest_subsets = manifest.split(num_splits=2)
    assert len(manifest_subsets) == 2
    assert manifest_subsets[0] == DummyManifest(manifest_type, begin_id=0, end_id=50)
    assert manifest_subsets[1] == DummyManifest(manifest_type, begin_id=50, end_id=100)

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_combine_lazy(manifest_type):
    expected = DummyManifest(manifest_type, begin_id=0, end_id=200)
    with as_lazy(DummyManifest(manifest_type, begin_id=0, end_id=68)) as part1, as_lazy(
        DummyManifest(manifest_type, begin_id=68, end_id=136)
    ) as part2, as_lazy(
        DummyManifest(manifest_type, begin_id=136, end_id=200)
    ) as part3:
        combined = combine(part1, part2, part3)
        # Equivalent under iteration.
        assert list(combined) == list(expected)

def test_lazy_cuts_combine_split_issue():
    # Test for lack of exception.
    cuts = DummyManifest(CutSet, begin_id=0, end_id=1000)
    with TemporaryDirectory() as d, NamedTemporaryFile(suffix=".jsonl.gz") as f:
        cuts.to_file(f.name)
        f.flush()
        cuts_lazy = load_manifest_lazy(f.name)
        cuts_lazy = combine(cuts_lazy, cuts_lazy.perturb_speed(0.9))
        cuts_lazy.split_lazy(d, chunk_size=100)

def test_bucketing_sampler_cut_pairs():
    cut_set1 = DummyManifest(CutSet, begin_id=0, end_id=1000)
    cut_set2 = DummyManifest(CutSet, begin_id=0, end_id=1000)
    sampler = BucketingSampler(cut_set1, cut_set2, sampler_type=CutPairsSampler)

    src_cuts, tgt_cuts = [], []
    for src_batch, tgt_batch in sampler:
        src_cuts.extend(src_batch)
        tgt_cuts.extend(tgt_batch)
    assert set(cut_set1.ids) == set(c.id for c in src_cuts)
    assert set(cut_set2.ids) == set(c.id for c in tgt_cuts)

def test_cut_set_mux_stop_early():
    cuts1 = DummyManifest(CutSet, begin_id=0, end_id=10)
    cuts2 = DummyManifest(CutSet, begin_id=1000, end_id=1005)

    cuts_mux = CutSet.mux(cuts1, cuts2, seed=0, stop_early=True)

    def cid(i: int) -> str:
        return f"dummy-cut-{i:04d}"

    # Mixing stops as soon as the shorter CutSet is exhausted...
    assert sorted([c.id for c in cuts_mux]) == [
        cid(i) for i in (0, 1, 2, 3, 4, 1000, 1001, 1002, 1003, 1004)
    ]
    # ...and the outputs are interleaved rather than concatenated.
    assert sorted([c.id for c in cuts_mux]) != [c.id for c in cuts_mux]

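# A sketch of the multiplexing semantics exercised above (an illustration,
# not lhotse's implementation): streams are drawn from at random, and with
# stop_early=True the mux ends as soon as any stream is exhausted; in the
# test, seed=0 leaves exactly the first 5 cuts from each input.
def _mux_stop_early(*streams, seed: int = 0):
    rng = random.Random(seed)
    iterators = [iter(s) for s in streams]
    while True:
        chosen = rng.choice(iterators)
        try:
            yield next(chosen)
        except StopIteration:
            return
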
@pytest.mark.parametrize("overwrite", [False, True])
def test_sequential_jsonl_writer_overwrite(overwrite):
    cuts = DummyManifest(CutSet, begin_id=0, end_id=100)
    half = cuts.split(num_splits=2)[0]
    with NamedTemporaryFile(suffix=".jsonl") as jsonl_f:
        # Store the first half.
        half.to_file(jsonl_f.name)

        # Open the sequential writer.
        with CutSet.open_writer(jsonl_f.name, overwrite=overwrite) as writer:
            if overwrite:
                assert all(not writer.contains(id_) for id_ in half.ids)
            else:
                assert all(writer.contains(id_) for id_ in half.ids)

def test_bucketing_sampler_chooses_buckets_randomly():
    # Construct a CutSet that has 1000 cuts with 100 unique durations.
    # This makes it simple to track which bucket was selected.
    cut_set = CutSet({})  # empty
    for i in range(100):
        new_cuts = DummyManifest(CutSet, begin_id=i * 10, end_id=(i + 1) * 10)
        for c in new_cuts:
            c.duration = i
        cut_set = cut_set + new_cuts

    # A sampler that always selects a single cut.
    sampler = BucketingSampler(
        cut_set,
        sampler_type=SimpleCutSampler,
        max_cuts=1,
        max_frames=1000000000,
        num_buckets=100,
    )

    # Batches of 1 guarantee that each item is a single-element batch of cuts.
    durations = [cut_set[item[0].id].duration for item in sampler]

    # This is the "trick" part -- 'groupby' groups the cuts together by their
    # duration. If there is a group of size 10, the same bucket was chosen for
    # 10 consecutive batches, which is not what BucketingSampler is supposed
    # to do (the probability of that is extremely low). We actually set the
    # threshold lower, at 8, which should never be triggered anyway.
    lens = []
    for key, group in groupby(durations):
        lens.append(len(list(group)))
    assert all(l < 8 for l in lens)

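# The groupby-based run-length counting used above, in isolation (a sketch):
# consecutive equal values collapse into one group, so a long run means the
# sampler kept drawing from the same bucket over and over.
def _run_lengths(values):
    return [len(list(group)) for _key, group in groupby(values)]
# e.g. _run_lengths([1, 1, 2, 2, 2, 5]) == [2, 3, 1]
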
@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_shuffle(manifest_type):
    data = DummyManifest(manifest_type, begin_id=0, end_id=4)
    for idx, item in enumerate(data):
        item.duration = idx
    # The expected order of durations after shuffling with this seed.
    expected_durations = [2, 1, 3, 0]

    rng = random.Random(42)
    eager_result = data.shuffle(rng=rng)
    assert [item.duration for item in eager_result] == list(expected_durations)

    # Re-seed so the lazy path sees the same permutation.
    rng = random.Random(42)
    with as_lazy(data) as lazy_data:
        lazy_result = lazy_data.shuffle(rng=rng)
        assert [item.duration for item in lazy_result] == list(expected_durations)

@pytest.mark.parametrize(
    "manifest_type", [RecordingSet, SupervisionSet, FeatureSet, CutSet]
)
def test_repeat_infinite(manifest_type):
    data = DummyManifest(manifest_type, begin_id=0, end_id=10)

    # It's hard to test infinite iterables, so we iterate about 10x more
    # times than the original size.
    eager_result = data.repeat()
    for idx, item in enumerate(eager_result):
        if idx == 105:
            break
    assert idx == 105

    with as_lazy(data) as lazy_data:
        lazy_result = lazy_data.repeat()
        for idx, item in enumerate(lazy_result):
            if idx == 105:
                break
        assert idx == 105

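# A note on the pattern above: with times=None the repeated manifest is an
# infinite stream, so iteration has to be bounded externally; an equivalent
# of the manual break counter is itertools.islice (sketch below).
def _first_n(iterable, n: int):
    from itertools import islice
    return list(islice(iterable, n))
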
@pytest.mark.parametrize("randomized", [False, True])
@pytest.mark.parametrize("preserve_id", [False, True])
def test_extra_padding_frames(randomized: bool, preserve_id: bool):
    cuts = DummyManifest(CutSet, begin_id=0, end_id=10)
    transform = ExtraPadding(
        extra_frames=4, randomized=randomized, preserve_id=preserve_id
    )
    padded_cuts = transform(cuts)

    # Non-randomized test -- check that all cuts are processed in the same way.
    if not randomized:
        for cut, padded in zip(cuts, padded_cuts):
            # The first track is padding.
            assert padded.tracks[0].cut.num_frames == 2
            # The last track is padding.
            assert padded.tracks[-1].cut.num_frames == 2
            # The total number of frames checks out.
            assert padded.num_frames == cut.num_frames + 4

    # Randomized test -- check that the cuts have different properties.
    if randomized:
        nums_frames = [c.num_frames for c in padded_cuts]
        assert len(set(nums_frames)) > 1

    if preserve_id:
        assert all(cut.id == cut_pad.id for cut, cut_pad in zip(cuts, padded_cuts))
    else:
        # Note: using any(), not all(), since some cuts may be unaffected
        # when the transform is randomized.
        assert any(cut.id != cut_pad.id for cut, cut_pad in zip(cuts, padded_cuts))

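# The padding arithmetic asserted above, as a sketch: the non-randomized
# ExtraPadding is expected to split extra_frames symmetrically, half before
# and half after the cut, so each padded cut gains extra_frames in total.
def _padded_num_frames(num_frames: int, extra_frames: int = 4) -> int:
    per_side = extra_frames // 2
    return per_side + num_frames + per_side
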
def test_k2_speech_recognition_iterable_dataset_shuffling():
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)

    dataset = K2SpeechRecognitionDataset(
        return_cuts=True,
        cut_transforms=[
            CutConcatenate(),
        ],
    )
    sampler = SingleCutSampler(
        cut_set,
        shuffle=True,
        # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
        # This way we're testing that it works okay when returning multiple batches in
        # a full epoch.
        max_frames=1000,
    )
    dloader = DataLoader(dataset, batch_size=None, sampler=sampler, num_workers=2)

    dloader_cut_ids = []
    batches = []
    for batch in dloader:
        batches.append(batch)
        dloader_cut_ids.extend(c.id for c in batch["supervisions"]["cut"])

    # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet.
    assert len(dloader_cut_ids) == len(cut_set)
    # Invariant 2: the items are not duplicated.
    assert len(set(dloader_cut_ids)) == len(dloader_cut_ids)
    # Invariant 3: the items are shuffled, i.e. their order differs from the order in the CutSet.
    assert dloader_cut_ids != [c.id for c in cut_set]

def test_bucketing_sampler_time_constraints(constraint):
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=1000)
    sampler = BucketingSampler(cut_set, sampler_type=SimpleCutSampler, **constraint)
    sampled_cuts = []
    for batch in sampler:
        sampled_cuts.extend(batch)
    assert set(cut_set.ids) == set(c.id for c in sampled_cuts)

def test_cut_pairs_sampler_filter():
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    sampler = CutPairsSampler(
        cut_set,
        cut_set,
        shuffle=True,
        # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
        # This way we're testing that it works okay when returning multiple batches in
        # a full epoch.
        max_source_frames=1000,
    )
    removed_cut_id = "dummy-cut-0010"
    sampler.filter(lambda cut: cut.id != removed_cut_id)

    source_cuts, target_cuts = [], []
    for src_batch, tgt_batch in sampler:
        source_cuts.extend(src_batch)
        target_cuts.extend(tgt_batch)

    # The filtered cut is gone from the sampled batches, but not from the CutSet.
    assert removed_cut_id in set(cut_set.ids)
    assert removed_cut_id not in set(c.id for c in source_cuts)

    # Invariant 1: we receive the same number of items in a dataloader epoch as there
    # were in the CutSet, minus the filtered item.
    assert len(source_cuts) == len(cut_set) - 1
    assert len(target_cuts) == len(cut_set) - 1
    # Invariant 2: the items are not duplicated.
    assert len(set(c.id for c in source_cuts)) == len(source_cuts)
    assert len(set(c.id for c in target_cuts)) == len(target_cuts)