# NOTE(review): a second function with this exact name is defined later in this
# file; at import time the later definition rebinds the name, so only one of the
# two would be collected by pytest -- confirm which one is intended.
def test_k2_speech_recognition_on_the_fly_feature_extraction(
    k2_cut_set, use_batch_extract, fault_tolerant
):
    """Compare a default-constructed dataset with an on-the-fly Fbank dataset.

    One ``K2SpeechRecognitionDataset`` is built with no arguments (it plays the
    role of the pre-computed-features reference) and another one with an
    ``OnTheFlyFeatures`` input strategy wrapping a 40-mel-bin ``Fbank``.  For
    every batch produced by the sampler, the inputs of both datasets must be
    close (relative norm of the difference below 1%) and the supervision
    ``start_frame`` / ``num_frames`` tensors must match exactly.

    The three arguments are not defined in this function; presumably they are
    pytest fixtures / parametrized values declared elsewhere (TODO confirm).
    ``use_batch_extract`` and ``fault_tolerant`` are forwarded unchanged to
    ``OnTheFlyFeatures``.
    """
    reference_dataset = K2SpeechRecognitionDataset()

    extractor = Fbank(FbankConfig(num_mel_bins=40))
    otf_strategy = OnTheFlyFeatures(
        extractor,
        use_batch_extract=use_batch_extract,
        fault_tolerant=fault_tolerant,
    )
    otf_dataset = K2SpeechRecognitionDataset(input_strategy=otf_strategy)

    # Unshuffled sampler configured with max_cuts=1.
    for cut_ids in SimpleCutSampler(k2_cut_set, shuffle=False, max_cuts=1):
        expected = reference_dataset[cut_ids]
        actual = otf_dataset[cut_ids]

        # The features are not expected to be bit-identical: the two paths
        # differ due to mixing in the time vs. fbank domain and lilcom
        # compression, hence a 1% relative tolerance on the norm.
        expected_feats = expected["inputs"]
        reference_norm = torch.linalg.norm(expected_feats)
        deviation = torch.linalg.norm(expected_feats - actual["inputs"])
        assert deviation < 0.01 * reference_norm

        # The supervision boundaries, on the other hand, must be identical.
        expected_sup = expected["supervisions"]
        actual_sup = actual["supervisions"]
        for key in ("start_frame", "num_frames"):
            assert (expected_sup[key] == actual_sup[key]).all()
def test_k2_speech_recognition_on_the_fly_feature_extraction_with_randomized_smoothing(
    k2_cut_set,
):
    """Check that ``RandomizedSmoothing`` raises the summed feature values.

    Two on-the-fly ``Fbank`` datasets are built, one plain and one whose input
    strategy applies ``RandomizedSmoothing(sigma=0.5, p=1.0)`` as a wave
    transform.  For every batch from the sampler, the element-wise difference
    (smoothed minus plain) must sum to a positive value.

    ``k2_cut_set`` is presumably a pytest fixture defined elsewhere.
    """
    plain_strategy = OnTheFlyFeatures(extractor=Fbank())
    plain_dataset = K2SpeechRecognitionDataset(input_strategy=plain_strategy)

    # p=1.0 guarantees the smoothing transform is applied on every call, which
    # keeps this test deterministic with respect to "was it applied at all".
    smoothing = RandomizedSmoothing(sigma=0.5, p=1.0)
    smoothed_strategy = OnTheFlyFeatures(
        extractor=Fbank(),
        wave_transforms=[smoothing],
    )
    smoothed_dataset = K2SpeechRecognitionDataset(input_strategy=smoothed_strategy)

    for cut_ids in SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1):
        plain_batch = plain_dataset[cut_ids]
        smoothed_batch = smoothed_dataset[cut_ids]

        # Additive noise should cause the energies to go up.
        delta = smoothed_batch["inputs"] - plain_batch["inputs"]
        assert delta.sum() > 0
def test_k2_speech_recognition_iterable_dataset_shuffling():
    """Check that a shuffling sampler yields each cut exactly once, reordered.

    A 100-cut dummy ``CutSet`` is served through a ``DataLoader`` with two
    workers and a shuffling ``SingleCutSampler``.  The cut IDs collected over
    one pass of the loader must (1) be as many as there are cuts, (2) contain
    no duplicates and (3) come in a different order than the ``CutSet``.
    """
    # The dummy cuts have a duration of 1 second each.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)

    dataset = K2SpeechRecognitionDataset(
        # return_cuts=True makes the cut objects available under
        # batch["supervisions"]["cut"], which is what the ID checks below read.
        return_cuts=True,
        cut_transforms=[
            CutConcatenate(),
        ],
    )
    sampler = SingleCutSampler(
        cut_set,
        shuffle=True,
        # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
        # This way we're testing that it works okay when returning multiple batches in
        # a full epoch.
        max_frames=1000,
    )
    # batch_size=None: the loader's own batching is turned off and each sampler
    # item is passed to the dataset as-is.
    dloader = DataLoader(dataset, batch_size=None, sampler=sampler, num_workers=2)

    # Only the cut IDs are needed for the invariants, so the batches themselves
    # are not retained.
    dloader_cut_ids = []
    for batch in dloader:
        dloader_cut_ids.extend(c.id for c in batch["supervisions"]["cut"])

    # Invariant 1: we receive the same amount of items in a dataloader epoch
    # as there were in the CutSet.
    assert len(dloader_cut_ids) == len(cut_set)
    # Invariant 2: the items are not duplicated.
    assert len(set(dloader_cut_ids)) == len(dloader_cut_ids)
    # Invariant 3: the items are shuffled, i.e. the order is different than
    # that in the CutSet.
    assert dloader_cut_ids != [c.id for c in cut_set]
def test_k2_speech_recognition_iterable_dataset_multiple_workers(
    k2_cut_set, num_workers
):
    """Check the aggregated output of a ``DataLoader`` for a given worker count.

    The cut set is padded, wrapped in a dataset that applies
    ``CutConcatenate`` and read through a ``DataLoader`` with ``num_workers``
    workers.  All batches are then concatenated and compared against the
    expected feature shape, texts, start frames and frame counts.

    ``k2_cut_set`` and ``num_workers`` are presumably a pytest fixture and a
    parametrized value declared elsewhere (TODO confirm).
    """
    padded_cuts = k2_cut_set.pad()
    dataset = K2SpeechRecognitionDataset(cut_transforms=[CutConcatenate()])
    sampler = SingleCutSampler(padded_cuts, shuffle=False)
    dloader = DataLoader(
        dataset, batch_size=None, sampler=sampler, num_workers=num_workers
    )

    # We expect a variable number of batches for each parametrized num_workers
    # value, because the dataset is small with 4 cuts that are partitioned
    # across the workers -- so everything below works on the concatenation.
    batches = list(dloader)

    features = torch.cat([b["inputs"] for b in batches])
    assert features.shape == (4, 2000, 40)

    # The texts are plain Python lists (not tensors), so they are flattened by hand.
    text = []
    for b in batches:
        text.extend(b["supervisions"]["text"])
    assert text == ["EXAMPLE OF TEXT"] * 5

    # The multi-worker dataloader might not preserve order, because the workers
    # might finish processing in different order; sorting makes the comparison
    # with the ground-truth start frames order-independent.
    all_start_frames = torch.cat([b["supervisions"]["start_frame"] for b in batches])
    start_frame = sorted(all_start_frames.tolist())
    assert start_frame == [0] * 4 + [1000]

    all_num_frames = torch.cat([b["supervisions"]["num_frames"] for b in batches])
    num_frames = all_num_frames.tolist()
    assert num_frames == [1000] * 5
def test_k2_speech_recognition_audio_inputs_with_workers_in_input_strategy(
    k2_cut_set,
):
    """Check a raw-audio batch when ``AudioSamples`` itself uses two workers.

    The dataset uses ``AudioSamples(num_workers=2)`` as its input strategy and
    is read through a ``DataLoader`` with ``num_workers=0``.  The first batch
    is compared against the expected input shape and supervision fields.

    ``k2_cut_set`` is presumably a pytest fixture defined elsewhere.
    """
    audio_dataset = K2SpeechRecognitionDataset(
        input_strategy=AudioSamples(num_workers=2),
    )
    # A very large max_duration so that all cuts land in one batch.
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_duration=100000.0)
    dloader = DataLoader(
        audio_dataset,
        batch_size=None,
        sampler=sampler,
        # Has to be 0 because DataLoader workers can't spawn subprocesses.
        num_workers=0,
    )

    first_batch = next(iter(dloader))
    assert first_batch["inputs"].shape == (4, 320000)

    # Each supervision field has 5 items, to account for:
    # one cut with two supervisions + three cuts with one supervision.
    supervisions = first_batch["supervisions"]
    expected_sequence_idx = tensor([0, 0, 1, 2, 3])
    assert (supervisions["sequence_idx"] == expected_sequence_idx).all()
    # The text field is a list, not a tensor, so plain equality is used.
    assert supervisions["text"] == ["EXAMPLE OF TEXT"] * 5
    expected_start_sample = tensor([0, 160000, 0, 0, 0])
    assert (supervisions["start_sample"] == expected_start_sample).all()
    expected_num_samples = tensor([160000] * 5)
    assert (supervisions["num_samples"] == expected_num_samples).all()
def test_k2_speech_recognition_iterable_dataset_low_max_frames(k2_cut_set):
    """Check that a very small ``max_frames`` limit still yields usable batches.

    The sampler is created with ``max_frames=2``; iterating the ``DataLoader``
    must not crash and every batch must hold exactly one item along the first
    dimension of ``inputs``.

    ``k2_cut_set`` is presumably a pytest fixture defined elsewhere.
    """
    tiny_limit_sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_frames=2)
    dloader = DataLoader(
        K2SpeechRecognitionDataset(),
        sampler=tiny_limit_sampler,
        batch_size=None,
    )

    # Simply running the loop verifies that nothing crashes.
    for batch in dloader:
        # There will be only a single item in each batch as we're exceeding
        # the limit each time.
        batch_size = batch["inputs"].shape[0]
        assert batch_size == 1
def test_k2_speech_recognition_augmentation(k2_cut_set, k2_noise_cut_set):
    """Smoke-test a dataset whose cut transforms are ``CutConcatenate`` + ``CutMix``.

    ``CutMix`` is constructed from ``k2_noise_cut_set``.  The test only runs
    all dataloader iterations and asserts that at least one batch came out.

    Both arguments are presumably pytest fixtures defined elsewhere.
    """
    transforms = [CutConcatenate(), CutMix(k2_noise_cut_set)]
    dataset = K2SpeechRecognitionDataset(cut_transforms=transforms)
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(dataset, sampler=sampler, batch_size=None)

    # Exhausting the loader is the actual check: it must not crash.
    batches = list(dloader)
    assert len(batches) > 0
# NOTE(review): a function with this exact name is also defined earlier in this
# file; this later definition rebinds the name at import time, so only one of
# the two would be collected by pytest -- confirm which one is intended.
def test_k2_speech_recognition_on_the_fly_feature_extraction(k2_cut_set):
    """Compare a dataset built on ``k2_cut_set`` with an on-the-fly Fbank one.

    The reference dataset receives ``k2_cut_set`` directly; the second dataset
    receives ``k2_cut_set.drop_features()`` together with an
    ``OnTheFlyFeatures(Fbank())`` input strategy.  For every batch from the
    sampler, the inputs must be close (relative norm of the difference below
    1%) and the supervision ``start_frame`` / ``num_frames`` must be equal.

    ``k2_cut_set`` is presumably a pytest fixture defined elsewhere.
    """
    reference_dataset = K2SpeechRecognitionDataset(k2_cut_set)

    cuts_without_features = k2_cut_set.drop_features()
    otf_dataset = K2SpeechRecognitionDataset(
        cuts_without_features,
        input_strategy=OnTheFlyFeatures(Fbank()),
    )

    for cut_ids in SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1):
        expected = reference_dataset[cut_ids]
        actual = otf_dataset[cut_ids]

        # The features are not expected to be bit-identical: the two paths
        # differ due to mixing in the time vs. fbank domain and lilcom
        # compression, hence a 1% relative tolerance on the norm.
        expected_feats = expected['inputs']
        reference_norm = torch.linalg.norm(expected_feats)
        deviation = torch.linalg.norm(expected_feats - actual['inputs'])
        assert deviation < 0.01 * reference_norm

        # The supervision boundaries must be identical.
        expected_sup = expected['supervisions']
        actual_sup = actual['supervisions']
        for key in ('start_frame', 'num_frames'):
            assert (expected_sup[key] == actual_sup[key]).all()
def test_k2_dataloader(k2_cut_set):
    """Check the first batch produced by ``K2DataLoader`` with ``batch_size=4``.

    The batch must expose a ``features`` tensor of shape ``(4, 308, 80)`` and
    supervision fields with five entries each.

    ``k2_cut_set`` is presumably a pytest fixture defined elsewhere.
    """
    from torch import tensor

    dloader = K2DataLoader(K2SpeechRecognitionDataset(k2_cut_set), batch_size=4)
    first_batch = next(iter(dloader))

    assert first_batch['features'].shape == (4, 308, 80)

    # Each supervision field has 5 items, to account for:
    # one cut with two supervisions + three cuts with one supervision.
    supervisions = first_batch['supervisions']
    expected_sequence_idx = tensor([0, 1, 2, 3, 3])
    assert (supervisions['sequence_idx'] == expected_sequence_idx).all()
    # The text field is a list, not a tensor, so plain equality is used.
    assert supervisions['text'] == ['IN EIGHTEEN THIRTEEN'] * 5
    expected_start_frame = tensor([0] * 4 + [154])
    assert (supervisions['start_frame'] == expected_start_frame).all()
    expected_num_frames = tensor([154] * 5)
    assert (supervisions['num_frames'] == expected_num_frames).all()
def test_k2_speech_recognition_audio_inputs(k2_cut_set):
    """Check a raw-audio batch obtained by indexing the dataset directly.

    The dataset is built on ``k2_cut_set`` with an ``AudioSamples`` input
    strategy.  The first item of the sampler is used to index the dataset and
    the resulting batch is compared against the expected input shape and
    supervision fields.

    ``k2_cut_set`` is presumably a pytest fixture defined elsewhere.
    """
    audio_dataset = K2SpeechRecognitionDataset(
        k2_cut_set,
        input_strategy=AudioSamples(),
    )
    # A very large max_frames so that all cuts land in one batch.
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_frames=10000000)
    first_cut_ids = next(iter(sampler))
    batch = audio_dataset[first_cut_ids]

    assert batch['inputs'].shape == (4, 320000)

    # Each supervision field has 5 items, to account for:
    # one cut with two supervisions + three cuts with one supervision.
    supervisions = batch['supervisions']
    expected_sequence_idx = tensor([0, 0, 1, 2, 3])
    assert (supervisions['sequence_idx'] == expected_sequence_idx).all()
    # The text field is a list, not a tensor, so plain equality is used.
    assert supervisions['text'] == ['EXAMPLE OF TEXT'] * 5
    expected_start_sample = tensor([0, 160000, 0, 0, 0])
    assert (supervisions['start_sample'] == expected_start_sample).all()
    expected_num_samples = tensor([160000] * 5)
    assert (supervisions['num_samples'] == expected_num_samples).all()
def test_k2_speech_recognition_iterable_dataset(k2_cut_set, num_workers):
    """Check the first batch of a ``DataLoader`` wrapping the dataset.

    The dataset is built on ``k2_cut_set`` with a ``CutConcatenate`` cut
    transform and read through a ``DataLoader`` with ``num_workers`` workers.
    The first batch is compared against the expected input shape and
    supervision fields.

    ``k2_cut_set`` and ``num_workers`` are presumably a pytest fixture and a
    parametrized value declared elsewhere (TODO confirm).
    """
    dataset = K2SpeechRecognitionDataset(
        k2_cut_set,
        cut_transforms=[CutConcatenate()],
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    # Note: "batch_size=None" disables the automatic batching mechanism,
    # which is required when Dataset takes care of the collation itself.
    dloader = DataLoader(
        dataset,
        batch_size=None,
        sampler=sampler,
        num_workers=num_workers,
    )
    first_batch = next(iter(dloader))

    assert first_batch['inputs'].shape == (4, 2000, 40)

    # Each supervision field has 5 items, to account for:
    # one cut with two supervisions + three cuts with one supervision.
    supervisions = first_batch['supervisions']
    expected_sequence_idx = tensor([0, 0, 1, 2, 3])
    assert (supervisions['sequence_idx'] == expected_sequence_idx).all()
    # The text field is a list, not a tensor, so plain equality is used.
    assert supervisions['text'] == ['EXAMPLE OF TEXT'] * 5
    expected_start_frame = tensor([0, 1000, 0, 0, 0])
    assert (supervisions['start_frame'] == expected_start_frame).all()
    expected_num_frames = tensor([1000] * 5)
    assert (supervisions['num_frames'] == expected_num_frames).all()
def test_k2_speech_recognition_dataset(k2_cut_set):
    """Check the four items obtained by integer-indexing the dataset.

    Items 0-2 must each carry a ``(308, 80)`` feature matrix and exactly one
    supervision starting at frame 0.  Item 3 must carry the same feature shape
    and two supervisions, starting at frames 0 and 154.  Every supervision is
    154 frames long, has the same text, and its ``sequence_idx`` equals the
    index of the item it belongs to.

    ``k2_cut_set`` is presumably a pytest fixture defined elsewhere.
    """
    dataset = K2SpeechRecognitionDataset(k2_cut_set)

    # First three items: a single supervision each.
    for item_idx in range(3):
        item = dataset[item_idx]
        assert item['features'].shape == (308, 80)
        supervisions = item['supervisions']
        assert len(supervisions) == 1
        only_supervision = supervisions[0]
        assert only_supervision['text'] == 'IN EIGHTEEN THIRTEEN'
        assert only_supervision['sequence_idx'] == item_idx
        assert only_supervision['start_frame'] == 0
        assert only_supervision['num_frames'] == 154

    # Last item: two back-to-back supervisions.
    item = dataset[3]
    assert item['features'].shape == (308, 80)
    supervisions = item['supervisions']
    assert len(supervisions) == 2
    for position, expected_start_frame in enumerate((0, 154)):
        supervision = supervisions[position]
        assert supervision['text'] == 'IN EIGHTEEN THIRTEEN'
        assert supervision['sequence_idx'] == 3
        assert supervision['start_frame'] == expected_start_frame
        assert supervision['num_frames'] == 154