def test_inference_ctc_batched(self): model = Data2VecAudioForCTC.from_pretrained( "facebook/data2vec-audio-base-960h").to(torch_device) processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) input_speech = self._load_datasamples(4) inputs = processor(input_speech, return_tensors="pt", padding=True) input_values = inputs.input_values.to(torch_device) with torch.no_grad(): logits = model(input_values).logits predicted_ids = torch.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ "a man said to the universe sir i exist", "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about", "his instant of panic was followed by a small sharp blow high on his chest", ] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
def check_ctc_training(self, config, input_values, *args): config.ctc_zero_infinity = True model = Data2VecAudioForCTC(config=config) model.to(torch_device) model.train() # freeze feature encoder model.freeze_feature_encoder() input_values = input_values[:3] input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] max_length_labels = model._get_feat_extract_output_lengths( torch.tensor(input_lengths)) labels = ids_tensor( (input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) # pad input for i in range(len(input_lengths)): input_values[i, input_lengths[i]:] = 0.0 if max_length_labels[i] < labels.shape[-1]: # it's important that we make sure that target lenghts are at least # one shorter than logit lenghts to prevent -inf labels[i, max_length_labels[i] - 1:] = -100 loss = model(input_values, labels=labels).loss self.parent.assertFalse(torch.isinf(loss).item()) loss.backward()
def check_ctc_loss(self, config, input_values, *args): model = Data2VecAudioForCTC(config=config) model.to(torch_device) # make sure that dropout is disabled model.eval() input_values = input_values[:3] attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) # pad input for i in range(len(input_lengths)): input_values[i, input_lengths[i] :] = 0.0 attention_mask[i, input_lengths[i] :] = 0 model.config.ctc_loss_reduction = "sum" sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() model.config.ctc_loss_reduction = "mean" mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() self.parent.assertTrue(isinstance(sum_loss, float)) self.parent.assertTrue(isinstance(mean_loss, float))
def test_mask_time_prob_ctc(self): model = Data2VecAudioForCTC.from_pretrained( "facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2 ) model.to(torch_device).train() processor = Wav2Vec2Processor.from_pretrained( "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True ) batch_duration_in_seconds = [1, 3, 2, 6] input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
def check_labels_out_of_vocab(self, config, input_values, *args): model = Data2VecAudioForCTC(config) model.to(torch_device) model.train() input_values = input_values[:3] input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) with self.parent.assertRaises(ValueError): model(input_values, labels=labels)
def test_inference_ctc_normal(self): model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h") model.to(torch_device) processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) input_speech = self._load_datasamples(1) input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) with torch.no_grad(): logits = model(input_values).logits predicted_ids = torch.argmax(logits, dim=-1) predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)