Example #1
0
def test_single_cut_sampler_order_is_deterministic_given_epoch():
    """At a fixed epoch, iterating the sampler twice yields the same order."""
    # 100 dummy cuts, each 1s long (== 100 frames).
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)

    # max_frames=1000 gives an effective batch size of ~10 cuts, so a full
    # epoch spans multiple batches — exactly the case we want to exercise.
    sampler = SingleCutSampler(cut_set, shuffle=True, max_frames=1000)
    sampler.set_epoch(42)

    # Two passes without an epoch update must produce identical orderings.
    first_pass = list(sampler)
    second_pass = list(sampler)
    assert first_pass == second_pass
Example #2
0
def test_single_cut_sampler_len():
    """Check that ``len(sampler)`` matches the number of batches it yields,
    across several epochs."""
    # total duration is 55 seconds
    # each second has 100 frames
    cuts = CutSet.from_cuts(
        dummy_cut(idx, duration=float(idx)) for idx in range(1, 11))
    sampler = SingleCutSampler(cuts,
                               shuffle=True,
                               max_frames=10 * 100,
                               max_cuts=6)

    for epoch in range(5):
        # BUGFIX: set the epoch BEFORE iterating. Previously set_epoch() was
        # called after the assertion, so the first iteration always ran on the
        # default epoch and the final set_epoch() call had no effect at all.
        sampler.set_epoch(epoch)
        # len() must agree with the number of batches actually produced.
        assert len(sampler) == len([batch for batch in sampler])
Example #3
0
def test_single_cut_sampler_time_constraints(max_duration, max_frames,
                                             max_samples,
                                             exception_expectation):
    """Exercise the sampler's time-based constraints and verify the basic
    epoch invariants (full coverage, no duplicates, shuffled order)."""
    # Each dummy cut lasts exactly 1 second.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    if max_frames is None:
        cut_set = cut_set.drop_features()

    with exception_expectation:
        # With an effective batch size of ~10 one-second cuts, a full epoch
        # spans several batches, which is what we want to exercise here.
        sampler = SingleCutSampler(
            cut_set,
            shuffle=True,
            max_frames=max_frames,
            max_samples=max_samples,
            max_duration=max_duration,
        )
        sampled = []
        for batch in sampler:
            sampled.extend(batch)

        # Invariant 1: one epoch yields exactly as many items as the CutSet holds.
        assert len(sampled) == len(cut_set)
        # Invariant 2: no item appears more than once.
        assert len(set(c.id for c in sampled)) == len(sampled)
        # Invariant 3: shuffling changed the order relative to the CutSet.
        assert [c.id for c in sampled] != [c.id for c in cut_set]
def test_k2_speech_recognition_iterable_dataset_shuffling():
    """End-to-end shuffling test through a multi-worker DataLoader: one epoch
    must cover every cut exactly once, in a shuffled order."""
    # The dummy cuts have a duration of 1 second each
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)

    dataset = K2SpeechRecognitionDataset(
        return_cuts=True,
        cut_transforms=[
            CutConcatenate(),
        ],
    )
    sampler = SingleCutSampler(
        cut_set,
        shuffle=True,
        # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames
        # This way we're testing that it works okay when returning multiple batches in
        # a full epoch.
        max_frames=1000,
    )
    dloader = DataLoader(dataset,
                         batch_size=None,
                         sampler=sampler,
                         num_workers=2)
    dloader_cut_ids = []
    # FIX: removed the `batches` list that was appended to but never read.
    for batch in dloader:
        dloader_cut_ids.extend(c.id for c in batch["supervisions"]["cut"])

    # Invariant 1: we receive the same amount of items in a dataloader epoch as there were in the CutSet
    assert len(dloader_cut_ids) == len(cut_set)
    # Invariant 2: the items are not duplicated
    assert len(set(dloader_cut_ids)) == len(dloader_cut_ids)
    # Invariant 3: the items are shuffled, i.e. the order is different than that in the CutSet
    assert dloader_cut_ids != [c.id for c in cut_set]
def test_k2_speech_recognition_audio_inputs_with_workers_in_input_strategy(
        k2_cut_set):
    """Check raw-audio batching when AudioSamples itself uses worker threads."""
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        input_strategy=AudioSamples(num_workers=2), )
    # Huge max_duration => every cut lands in a single batch.
    sampler = SingleCutSampler(k2_cut_set,
                               shuffle=False,
                               max_duration=100000.0)
    # num_workers has to be 0 because DataLoader workers can't spawn subprocesses.
    dloader = DataLoader(
        on_the_fly_dataset,
        batch_size=None,
        sampler=sampler,
        num_workers=0,
    )
    batch = next(iter(dloader))
    assert batch["inputs"].shape == (4, 320000)
    # 5 supervision entries in total: one cut carries two supervisions,
    # the remaining three cuts carry one each.
    expected_seq_idx = tensor([0, 0, 1, 2, 3])
    assert (batch["supervisions"]["sequence_idx"] == expected_seq_idx).all()
    # "text" is a plain list, not a tensor.
    assert (batch["supervisions"]["text"] == ["EXAMPLE OF TEXT"] * 5
            )
    expected_starts = tensor([0, 160000, 0, 0, 0])
    assert (batch["supervisions"]["start_sample"] == expected_starts).all()
    assert (batch["supervisions"]["num_samples"] == tensor([160000] * 5)).all()
def test_k2_speech_recognition_iterable_dataset_multiple_workers(
        k2_cut_set, num_workers):
    """Verify batch contents are complete and correct regardless of how many
    DataLoader workers partition the (small) cut set."""
    k2_cut_set = k2_cut_set.pad()
    dataset = K2SpeechRecognitionDataset(cut_transforms=[CutConcatenate()])
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(dataset,
                         batch_size=None,
                         sampler=sampler,
                         num_workers=num_workers)

    # The batch count varies per num_workers parametrization: only 4 cuts
    # exist, and they get partitioned differently across the workers.
    batches = list(dloader)

    features = torch.cat([b["inputs"] for b in batches])
    assert features.shape == (4, 2000, 40)
    text = [t for b in batches for t in b["supervisions"]["text"]]
    assert text == ["EXAMPLE OF TEXT"] * 5  # a list, not tensor
    start_frame = torch.cat(
        [b["supervisions"]["start_frame"] for b in batches]).tolist()
    # Workers may finish in any order, so the dataloader does not guarantee
    # batch ordering; sort before comparing with the ground-truth starts.
    assert sorted(start_frame) == [0] * 4 + [1000]
    num_frames = torch.cat([b["supervisions"]["num_frames"]
                            for b in batches]).tolist()
    assert num_frames == [1000] * 5
Example #7
0
def test_k2_speech_recognition_on_the_fly_feature_extraction(
        k2_cut_set, use_batch_extract, fault_tolerant):
    """Compare precomputed features against on-the-fly Fbank extraction,
    cut by cut."""
    precomputed_dataset = K2SpeechRecognitionDataset()
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(
            Fbank(FbankConfig(num_mel_bins=40)),
            use_batch_extract=use_batch_extract,
            fault_tolerant=fault_tolerant,
        ))
    # max_cuts=1 => one cut per mini-batch, so we compare pairwise below.
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]

        # Precomputed vs. on-the-fly features differ slightly because mixing
        # happens in different domains (time vs. fbank) and because of lilcom
        # compression — only require the difference to be relatively tiny.
        norm_pc = torch.linalg.norm(batch_pc["inputs"])
        norm_diff = torch.linalg.norm(batch_pc["inputs"] - batch_otf["inputs"])
        assert norm_diff < 0.01 * norm_pc

        # The supervision boundaries, however, must match exactly.
        for key in ("start_frame", "num_frames"):
            assert (batch_pc["supervisions"][key] ==
                    batch_otf["supervisions"][key]).all()
Example #8
0
def test_single_cut_sampler_order_differs_between_epochs():
    """Advancing the epoch must reshuffle: each epoch's order differs from
    the previous one's."""
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)

    # max_frames=1000 yields ~10 one-second cuts per batch, so one epoch
    # produces several batches.
    sampler = SingleCutSampler(cut_set, shuffle=True, max_frames=1000)
    previous = list(sampler)
    for epoch in range(1, 6):
        sampler.set_epoch(epoch)
        current = list(sampler)
        assert current != previous
        previous = current
def test_k2_speech_recognition_iterable_dataset_low_max_frames(k2_cut_set):
    """A frame budget smaller than any single cut must still work: every
    batch degenerates to exactly one item instead of crashing."""
    dataset = K2SpeechRecognitionDataset()
    # max_frames=2 is exceeded by every cut on its own.
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_frames=2)
    dloader = DataLoader(dataset, sampler=sampler, batch_size=None)
    for batch in dloader:
        assert batch["inputs"].shape[0] == 1
Example #10
0
def test_zip_sampler_merge_batches_true():
    """ZipSampler with merging: each merged batch contains the cuts from
    both sub-samplers' batches."""
    # Note: each cut is 1s duration in this test.
    cuts1 = DummyManifest(CutSet, begin_id=0, end_id=100)
    cuts2 = DummyManifest(CutSet, begin_id=1000, end_id=1100)
    sampler = ZipSampler(
        SingleCutSampler(cuts1, max_duration=10),
        SingleCutSampler(cuts2, max_duration=2),
    )
    batches = list(sampler)
    assert len(batches) == 10
    for batch in batches:
        # 10 cuts from cuts1 + 2 cuts from cuts2 = twelve 1s items.
        assert len(batch) == 12
        from_first = [
            c for c in batch if 0 <= int(c.id.split("-")[-1]) <= 100
        ]
        assert len(from_first) == 10  # ten come from cuts1
        from_second = [
            c for c in batch if 1000 <= int(c.id.split("-")[-1]) <= 1100
        ]
        assert len(from_second) == 2  # two come from cuts2
def test_k2_speech_recognition_augmentation(k2_cut_set, k2_noise_cut_set):
    """Smoke test: concatenation + noise-mix augmentation must survive a full
    dataloader pass without raising."""
    dataset = K2SpeechRecognitionDataset(
        cut_transforms=[CutConcatenate(),
                        CutMix(k2_noise_cut_set)])
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(dataset, sampler=sampler, batch_size=None)
    # Drain every iteration; we only check that something was produced.
    batches = list(dloader)
    assert len(batches) > 0
Example #12
0
 def test_no_off_by_one_errors_in_dataset_batch_collation(
         self, sampling_rate: int, data):
     """Property-based (hypothesis) test: after dataset collation, every
     supervision's frame span must lie fully inside its cut — i.e. no
     off-by-one overruns at the batch boundaries.

     ``data`` is a hypothesis ``st.data()`` fixture used for interactive
     draws; ``sampling_rate`` is a parametrized sampling rate in Hz.
     """
     ### Test data preparation ###
     # Generate 10 - 20 cut durations in numbers of samples
     nums_samples = data.draw(
         st.lists(
             st.integers(round(sampling_rate * 0.1),
                         round(sampling_rate * 5.0)),
             min_size=10,
             max_size=20,
         ),
         label="Cuts numbers of samples",
     )
     # Generate random cuts
     cuts = [
         self.with_cut(sampling_rate=sampling_rate,
                       num_samples=num_samples,
                       supervision=True) for num_samples in nums_samples
     ]
     # Mix them with random offsets
     mixed_cuts = CutSet.from_cuts(
         lhs.mix(
             rhs,
             # Sample the offset in terms of number of samples, and then divide by the sampling rate
             # to obtain "realistic" offsets
             offset_other_by=data.draw(
                 st.integers(
                     min_value=int(0.1 * sampling_rate),
                     max_value=int(lhs.duration * sampling_rate),
                 ),
                 label=f"Offset for pair {idx + 1}",
             ) / sampling_rate,
         ) for idx, (lhs, rhs) in enumerate(zip(cuts, cuts[1:])))
     # Create an ASR dataset
     dataset = K2SpeechRecognitionDataset(
         return_cuts=True,
         cut_transforms=[CutConcatenate(duration_factor=3.0)],
     )
     sampler = SingleCutSampler(
         mixed_cuts,
         shuffle=False,
     )
     dloader = DataLoader(dataset, batch_size=None, sampler=sampler)
     ### End of test data preparation ###
     # Test the invariants
     for batch in dloader:
         sups = batch["supervisions"]
         cuts = sups["cut"]
         for idx, cut in enumerate(cuts):
             # The span [start_frame, start_frame + num_frames) must not
             # extend past the end of the cut.
             assert (sups["start_frame"][idx] + sups["num_frames"][idx] <=
                     cut.num_frames), f"Error at index {idx}"
             # assert sups['start_sample'][idx] + sups['num_samples'][
             #     idx] <= cut.num_samples, f"Error at index {idx}"
     # Need to call cleanup manually to free the file handles, otherwise the test may crash
     self.cleanup()
def test_k2_speech_recognition_on_the_fly_feature_extraction_with_randomized_smoothing(
    k2_cut_set, ):
    """Randomized smoothing adds noise to the waveform, so the resulting
    fbank energies should increase relative to the unsmoothed features."""
    dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(extractor=Fbank(), ))
    rs_dataset = K2SpeechRecognitionDataset(input_strategy=OnTheFlyFeatures(
        extractor=Fbank(),
        # p=1.0 guarantees the smoothing transform fires in this test.
        wave_transforms=[RandomizedSmoothing(sigma=0.5, p=1.0)],
    ))
    # One cut per batch; compare with and without smoothing, cut by cut.
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        plain = dataset[cut_ids]
        smoothed = rs_dataset[cut_ids]
        # Additive noise should cause the energies to go up.
        assert (smoothed["inputs"] - plain["inputs"]).sum() > 0
def test_k2_speech_recognition_audio_inputs(k2_cut_set):
    """Check the shapes and supervision fields of a raw-audio (AudioSamples)
    batch containing all cuts at once."""
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        k2_cut_set,
        input_strategy=AudioSamples(),
    )
    # A huge max_frames puts every cut into a single batch.
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_frames=10000000)
    cut_ids = next(iter(sampler))
    batch = on_the_fly_dataset[cut_ids]
    assert batch['inputs'].shape == (4, 320000)
    # 5 supervision entries: one cut has two supervisions, the other three
    # cuts have one each.
    expected_seq_idx = tensor([0, 0, 1, 2, 3])
    assert (batch['supervisions']['sequence_idx'] == expected_seq_idx).all()
    # 'text' is a plain list, not a tensor.
    assert batch['supervisions']['text'] == ['EXAMPLE OF TEXT'] * 5
    expected_starts = tensor([0, 160000, 0, 0, 0])
    assert (batch['supervisions']['start_sample'] == expected_starts).all()
    assert (batch['supervisions']['num_samples'] == tensor([160000] * 5)).all()
def test_k2_speech_recognition_iterable_dataset(k2_cut_set, num_workers):
    """Check the first batch's shapes and supervision fields when iterating
    through a DataLoader with a varying number of workers."""
    dataset = K2SpeechRecognitionDataset(
        k2_cut_set,
        cut_transforms=[CutConcatenate()]
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    # "batch_size=None" disables PyTorch's automatic batching, which is
    # required because the Dataset performs the collation itself.
    dloader = DataLoader(
        dataset, batch_size=None, sampler=sampler, num_workers=num_workers)
    batch = next(iter(dloader))
    assert batch['inputs'].shape == (4, 2000, 40)
    # 5 supervision entries: one cut has two supervisions, the other three
    # cuts have one each.
    assert (batch['supervisions']['sequence_idx'] == tensor([0, 0, 1, 2, 3])).all()
    # 'text' is a plain list, not a tensor.
    assert batch['supervisions']['text'] == ['EXAMPLE OF TEXT'] * 5
    assert (batch['supervisions']['start_frame'] == tensor([0, 1000, 0, 0, 0])).all()
    assert (batch['supervisions']['num_frames'] == tensor([1000] * 5)).all()
Example #16
0
def test_single_cut_sampler_drop_last():
    """With drop_last=True, only full batches survive and the trailing
    incomplete batch is discarded."""
    # The dummy cuts have a duration of 1 second (== 100 frames) each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)

    # max_frames=1500 gives an effective batch size of 15 cuts, so a full
    # epoch spans multiple batches.
    sampler = SingleCutSampler(
        cut_set,
        max_frames=1500,
        drop_last=True,
    )
    batches = []
    for batch in sampler:
        # Every surviving batch must be full-sized.
        assert len(batch) == 15
        batches.append(batch)

    # 100 cuts at 15 per batch => 6 full batches (the last 10 are dropped).
    assert len(batches) == 6
def test_k2_speech_recognition_on_the_fly_feature_extraction(k2_cut_set):
    """Compare precomputed features against on-the-fly Fbank extraction,
    one cut at a time."""
    precomputed_dataset = K2SpeechRecognitionDataset(k2_cut_set)
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        k2_cut_set.drop_features(),
        input_strategy=OnTheFlyFeatures(Fbank())
    )
    # max_cuts=1 => one cut per mini-batch, compared pairwise below.
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]

        # The two feature sets differ slightly (mixing in time vs. fbank
        # domain, plus lilcom compression), so only require the difference
        # to be relatively tiny.
        norm_pc = torch.linalg.norm(batch_pc['inputs'])
        norm_diff = torch.linalg.norm(batch_pc['inputs'] - batch_otf['inputs'])
        assert norm_diff < 0.01 * norm_pc

        # The supervision boundaries, however, must be identical.
        for key in ('start_frame', 'num_frames'):
            assert (batch_pc['supervisions'][key] ==
                    batch_otf['supervisions'][key]).all()
Example #18
0
def test_report_padding_ratio_estimate():
    """Smoke test: the padding-ratio report should run without raising."""
    sampler = SingleCutSampler(DummyManifest(CutSet, begin_id=0, end_id=1000))
    report_padding_ratio_estimate(sampler)
Example #19
0
def test_single_cut_sampler_low_max_frames(libri_cut_set):
    """A frame budget smaller than any single cut must not crash: each batch
    simply degenerates to a single item."""
    # max_frames=2 is exceeded by every cut on its own.
    sampler = SingleCutSampler(libri_cut_set, shuffle=False, max_frames=2)
    for batch in sampler:
        assert len(batch) == 1
Example #20
0
        #   There will be one more batch with a single 3s cut.
        expected_num_batches = 17
        expected_num_cuts = 50
        expected_discarded_cuts = 0

    num_sampled_cuts = sum(len(b) for b in batches)
    num_discarded_cuts = len(cut_set) - num_sampled_cuts
    assert len(batches) == expected_num_batches
    assert num_sampled_cuts == expected_num_cuts
    assert num_discarded_cuts == expected_discarded_cuts


@pytest.mark.parametrize(
    "sampler",
    [
        SingleCutSampler(DummyManifest(CutSet, begin_id=0, end_id=10)),
        CutPairsSampler(
            DummyManifest(CutSet, begin_id=0, end_id=10),
            DummyManifest(CutSet, begin_id=0, end_id=10),
        ),
        BucketingSampler(DummyManifest(CutSet, begin_id=0, end_id=10)),
        ZipSampler(
            SingleCutSampler(DummyManifest(CutSet, begin_id=0, end_id=10)),
            SingleCutSampler(DummyManifest(CutSet, begin_id=10, end_id=20)),
        ),
    ],
)
def test_sampler_get_report(sampler):
    """Smoke test: after draining a full epoch, get_report() must execute."""
    for _ in sampler:
        pass
    print(sampler.get_report())