def test_inference_distilhubert(self):
    model = HubertModel.from_pretrained("ntu-spml/distilhubert").to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("ntu-spml/distilhubert")

    # TODO: can't test on batched inputs due to incompatible padding https://github.com/pytorch/fairseq/pull/3572
    input_speech = self._load_datasamples(1)

    inputs = processor(input_speech, return_tensors="pt", padding=True)

    input_values = inputs.input_values.to(torch_device)

    with torch.no_grad():
        outputs = model(input_values).last_hidden_state

    # expected outputs taken from the original DistilHuBERT implementation
    expected_outputs_first = torch.tensor(
        [
            [
                [-0.3505, 0.1167, 0.0608, 0.1294],
                [-0.3085, 0.0481, 0.1106, 0.0955],
                [-0.3107, -0.0391, 0.0739, 0.1360],
                [-0.2385, -0.1795, -0.0928, 0.2389],
            ]
        ],
        device=torch_device,
    )
    expected_outputs_last = torch.tensor(
        [
            [
                [-0.0732, 0.0255, 0.0529, -0.1372],
                [-0.0812, 0.1259, 0.0564, -0.0438],
                [-0.0054, 0.0758, -0.0002, -0.1617],
                [0.0133, -0.0320, -0.0687, 0.0062],
            ]
        ],
        device=torch_device,
    )
    expected_output_sum = -3776.0730

    self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=5e-3))
    self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=5e-3))
    self.assertTrue(abs(outputs.sum() - expected_output_sum) < 0.1)
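# NOTE: a minimal sketch of the `_load_datasamples` helper used above, assuming
# the `hf-internal-testing/librispeech_asr_dummy` dataset on the Hugging Face
# Hub; the helper actually defined in the test suite may differ.
def _load_datasamples(self, num_samples):
    from datasets import load_dataset

    # load the dummy LibriSpeech validation split and keep the first `num_samples` clips
    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    speech_samples = ds.sort("id")[:num_samples]["audio"]

    # return raw waveforms as float arrays, as expected by the feature extractor
    return [x["array"] for x in speech_samples]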
def test_model_from_pretrained(self):
    model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
    self.assertIsNotNone(model)