Example #1
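These snippets are drawn from Lhotse's test suite and omit their imports. Below is a minimal sketch of what they rely on; the exact module paths are my assumption and may differ across Lhotse versions. Note that SingleCutSampler was later renamed SimpleCutSampler, so both spellings appear in these examples, and K2DataLoader in Example #9 appears to come from an early, since-removed API.

import torch
from torch import tensor
from torch.utils.data import DataLoader

from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import K2SpeechRecognitionDataset
from lhotse.dataset.cut_transforms import CutConcatenate, CutMix
from lhotse.dataset.input_strategies import AudioSamples, OnTheFlyFeatures
from lhotse.dataset.sampling import SimpleCutSampler, SingleCutSampler
from lhotse.dataset.signal_transforms import RandomizedSmoothing
from lhotse.testing.dummies import DummyManifest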
def test_k2_speech_recognition_on_the_fly_feature_extraction(
        k2_cut_set, use_batch_extract, fault_tolerant):
    precomputed_dataset = K2SpeechRecognitionDataset()
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(
            Fbank(FbankConfig(num_mel_bins=40)),
            use_batch_extract=use_batch_extract,
            fault_tolerant=fault_tolerant,
        ))
    sampler = SimpleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]

        # Check that the features do not differ too much.
        norm_pc = torch.linalg.norm(batch_pc["inputs"])
        norm_diff = torch.linalg.norm(batch_pc["inputs"] - batch_otf["inputs"])
        # The precomputed and on-the-fly features are different due to mixing in time/fbank domains
        # and lilcom compression.
        assert norm_diff < 0.01 * norm_pc

        # Check that the supervision boundaries are the same.
        assert (batch_pc["supervisions"]["start_frame"] ==
                batch_otf["supervisions"]["start_frame"]).all()
        assert (batch_pc["supervisions"]["num_frames"] ==
                batch_otf["supervisions"]["num_frames"]).all()
Example #2
def test_k2_speech_recognition_on_the_fly_feature_extraction_with_randomized_smoothing(
        k2_cut_set):
    dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(extractor=Fbank()))
    rs_dataset = K2SpeechRecognitionDataset(input_strategy=OnTheFlyFeatures(
        extractor=Fbank(),
        # Use p=1.0 to ensure that smoothing is applied in this test.
        wave_transforms=[RandomizedSmoothing(sigma=0.5, p=1.0)],
    ))
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch = dataset[cut_ids]
        rs_batch = rs_dataset[cut_ids]
        # Additive noise should cause the energies to go up
        assert (rs_batch["inputs"] - batch["inputs"]).sum() > 0
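For intuition, randomized smoothing perturbs the raw waveform with Gaussian noise that is applied with some probability. A minimal standalone sketch of the idea (not Lhotse's actual implementation):

import torch

def randomized_smoothing(audio: torch.Tensor, sigma: float = 0.5, p: float = 1.0) -> torch.Tensor:
    # With probability p, add zero-mean Gaussian noise with standard deviation sigma.
    if torch.rand(1).item() < p:
        return audio + sigma * torch.randn_like(audio)
    return audio

With p=1.0 the noise is always applied, which is why the test above can expect the fbank energies to rise.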
Example #3
def test_k2_speech_recognition_iterable_dataset_shuffling():
    # The dummy cuts have a duration of 1 second each
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)

    dataset = K2SpeechRecognitionDataset(
        return_cuts=True,
        cut_transforms=[
            CutConcatenate(),
        ],
    )
    sampler = SingleCutSampler(
        cut_set,
        shuffle=True,
        # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames
        # This way we're testing that it works okay when returning multiple batches in
        # a full epoch.
        max_frames=1000,
    )
    dloader = DataLoader(dataset,
                         batch_size=None,
                         sampler=sampler,
                         num_workers=2)
    dloader_cut_ids = []
    batches = []
    for batch in dloader:
        batches.append(batch)
        dloader_cut_ids.extend(c.id for c in batch["supervisions"]["cut"])

    # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet
    assert len(dloader_cut_ids) == len(cut_set)
    # Invariant 2: the items are not duplicated
    assert len(set(dloader_cut_ids)) == len(dloader_cut_ids)
    # Invariant 3: the items are shuffled, i.e. the order is different than that in the CutSet
    assert dloader_cut_ids != [c.id for c in cut_set]
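Note that Invariant 3 is technically flaky: a shuffle can, with vanishingly small probability, reproduce the original order. Lhotse's samplers accept a seed argument for reproducible shuffling, e.g. (keyword name assumed):

sampler = SingleCutSampler(cut_set, shuffle=True, seed=42)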
Example #4
def test_k2_speech_recognition_iterable_dataset_multiple_workers(
        k2_cut_set, num_workers):
    k2_cut_set = k2_cut_set.pad()
    dataset = K2SpeechRecognitionDataset(cut_transforms=[CutConcatenate()])
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(dataset,
                         batch_size=None,
                         sampler=sampler,
                         num_workers=num_workers)

    # We expect a variable number of batches for each parametrized num_workers value,
    # because the dataset is small with 4 cuts that are partitioned across the workers.
    batches = [item for item in dloader]

    features = torch.cat([b["inputs"] for b in batches])
    assert features.shape == (4, 2000, 40)
    text = [t for b in batches for t in b["supervisions"]["text"]]
    assert text == ["EXAMPLE OF TEXT"] * 5  # a list, not tensor
    start_frame = torch.cat(
        [b["supervisions"]["start_frame"] for b in batches]).tolist()
    # The multi-worker dataloader might not preserve order, because the workers
    # might finish processing in different order. To compare ground truth
    # start times with actual start times, we need to sort.
    start_frame = sorted(start_frame)
    assert start_frame == [0] * 4 + [1000]
    num_frames = torch.cat([b["supervisions"]["num_frames"]
                            for b in batches]).tolist()
    assert num_frames == [1000] * 5
Example #5
def test_k2_speech_recognition_audio_inputs_with_workers_in_input_strategy(
        k2_cut_set):
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        input_strategy=AudioSamples(num_workers=2))
    # all cuts in one batch
    sampler = SingleCutSampler(k2_cut_set,
                               shuffle=False,
                               max_duration=100000.0)
    dloader = DataLoader(
        on_the_fly_dataset,
        batch_size=None,
        sampler=sampler,
        num_workers=0,  # has to be 0 because DataLoader workers can't spawn subprocesses
    )
    batch = next(iter(dloader))
    assert batch["inputs"].shape == (4, 320000)
    # Each list has 5 items, to account for:
    # one cut with two supervisions + three cuts with one supervision
    assert (batch["supervisions"]["sequence_idx"] == tensor([0, 0, 1, 2, 3])).all()
    assert batch["supervisions"]["text"] == ["EXAMPLE OF TEXT"] * 5  # a list, not tensor
    assert (batch["supervisions"]["start_sample"] == tensor([0, 160000, 0, 0, 0])).all()
    assert (batch["supervisions"]["num_samples"] == tensor([160000] * 5)).all()
Example #6
def test_k2_speech_recognition_iterable_dataset_low_max_frames(k2_cut_set):
    dataset = K2SpeechRecognitionDataset()
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_frames=2)
    dloader = DataLoader(dataset, sampler=sampler, batch_size=None)
    # Check that it does not crash
    for batch in dloader:
        # There will be only a single item in each batch as we're exceeding the limit each time.
        assert batch["inputs"].shape[0] == 1
Example #7
def test_k2_speech_recognition_augmentation(k2_cut_set, k2_noise_cut_set):
    dataset = K2SpeechRecognitionDataset(
        cut_transforms=[CutConcatenate(), CutMix(k2_noise_cut_set)])
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(dataset, sampler=sampler, batch_size=None)
    # Check that it does not crash by just running all dataloader iterations
    batches = [item for item in dloader]
    assert len(batches) > 0
Example #8
def test_k2_speech_recognition_on_the_fly_feature_extraction(k2_cut_set):
    precomputed_dataset = K2SpeechRecognitionDataset(k2_cut_set)
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        k2_cut_set.drop_features(),
        input_strategy=OnTheFlyFeatures(Fbank())
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]

        # Check that the features do not differ too much.
        norm_pc = torch.linalg.norm(batch_pc['inputs'])
        norm_diff = torch.linalg.norm(batch_pc['inputs'] - batch_otf['inputs'])
        # The precomputed and on-the-fly features are different due to mixing in time/fbank domains
        # and lilcom compression.
        assert norm_diff < 0.01 * norm_pc

        # Check that the supervision boundaries are the same.
        assert (batch_pc['supervisions']['start_frame'] == batch_otf['supervisions']['start_frame']).all()
        assert (batch_pc['supervisions']['num_frames'] == batch_otf['supervisions']['num_frames']).all()
Example #9
def test_k2_dataloader(k2_cut_set):
    from torch import tensor
    dataset = K2SpeechRecognitionDataset(k2_cut_set)
    dloader = K2DataLoader(dataset, batch_size=4)
    batch = next(iter(dloader))
    assert batch['features'].shape == (4, 308, 80)
    # Each list has 5 items, to account for:
    # one cut with two supervisions + three cuts with one supervision
    assert (batch['supervisions']['sequence_idx'] == tensor([0, 1, 2, 3, 3])).all()
    assert batch['supervisions']['text'] == ['IN EIGHTEEN THIRTEEN'] * 5  # a list, not tensor
    assert (batch['supervisions']['start_frame'] == tensor([0] * 4 + [154])).all()
    assert (batch['supervisions']['num_frames'] == tensor([154] * 5)).all()
Example #10
def test_k2_speech_recognition_audio_inputs(k2_cut_set):
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        k2_cut_set,
        input_strategy=AudioSamples(),
    )
    # all cuts in one batch
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_frames=10000000)
    cut_ids = next(iter(sampler))
    batch = on_the_fly_dataset[cut_ids]
    assert batch['inputs'].shape == (4, 320000)
    # Each list has 5 items, to account for:
    # one cut with two supervisions + three cuts with one supervision
    assert (batch['supervisions']['sequence_idx'] == tensor([0, 0, 1, 2, 3])).all()
    assert batch['supervisions']['text'] == ['EXAMPLE OF TEXT'] * 5  # a list, not tensor
    assert (batch['supervisions']['start_sample'] == tensor([0, 160000, 0, 0, 0])).all()
    assert (batch['supervisions']['num_samples'] == tensor([160000] * 5)).all()
Example #11
def test_k2_speech_recognition_iterable_dataset(k2_cut_set, num_workers):
    dataset = K2SpeechRecognitionDataset(
        k2_cut_set,
        cut_transforms=[CutConcatenate()]
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    # Note: "batch_size=None" disables the automatic batching mechanism,
    #       which is required when Dataset takes care of the collation itself.
    dloader = DataLoader(dataset, batch_size=None, sampler=sampler, num_workers=num_workers)
    batch = next(iter(dloader))
    assert batch['inputs'].shape == (4, 2000, 40)
    # Each list has 5 items, to account for:
    # one cut with two supervisions + three cuts with one supervision
    assert (batch['supervisions']['sequence_idx'] == tensor([0, 0, 1, 2, 3])).all()
    assert batch['supervisions']['text'] == ['EXAMPLE OF TEXT'] * 5  # a list, not tensor
    assert (batch['supervisions']['start_frame'] == tensor([0, 1000, 0, 0, 0])).all()
    assert (batch['supervisions']['num_frames'] == tensor([1000] * 5)).all()
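In other words, batching here is driven entirely by the sampler: every item the DataLoader yields is already a collated mini-batch. The loop below is an equivalent sketch without a DataLoader, which is what Examples #1 and #8 do directly:

for cut_ids in sampler:
    batch = dataset[cut_ids]  # the dataset collates the whole mini-batch itself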
Example #12
def test_k2_speech_recognition_dataset(k2_cut_set):
    dataset = K2SpeechRecognitionDataset(k2_cut_set)
    for i in range(3):
        example = dataset[i]
        assert example['features'].shape == (308, 80)
        assert len(example['supervisions']) == 1
        assert example['supervisions'][0]['text'] == 'IN EIGHTEEN THIRTEEN'
        assert example['supervisions'][0]['sequence_idx'] == i
        assert example['supervisions'][0]['start_frame'] == 0
        assert example['supervisions'][0]['num_frames'] == 154
    example = dataset[3]
    assert example['features'].shape == (308, 80)
    assert len(example['supervisions']) == 2
    assert example['supervisions'][0]['text'] == 'IN EIGHTEEN THIRTEEN'
    assert example['supervisions'][0]['sequence_idx'] == 3
    assert example['supervisions'][0]['start_frame'] == 0
    assert example['supervisions'][0]['num_frames'] == 154
    assert example['supervisions'][1]['text'] == 'IN EIGHTEEN THIRTEEN'
    assert example['supervisions'][1]['sequence_idx'] == 3
    assert example['supervisions'][1]['start_frame'] == 154
    assert example['supervisions'][1]['num_frames'] == 154