def test_inference_emotion_recognition(self):
        model = HubertForSequenceClassification.from_pretrained(
            "superb/hubert-base-superb-er",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "superb/hubert-base-superb-er")
        input_data = self._load_superb("er", 4)
        inputs = processor(input_data["speech"],
                           return_tensors="pt",
                           padding=True)

        input_values = inputs.input_values.half().to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)
        predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)

        expected_labels = [1, 1, 2, 2]
        # s3prl logits for the same batch
        expected_logits = torch.tensor([2.8384, 2.3389, 3.8564, 4.5558],
                                       dtype=torch.float16,
                                       device=torch_device)

        self.assertListEqual(predicted_ids.tolist(), expected_labels)
        # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
        self.assertTrue(
            torch.allclose(predicted_logits, expected_logits, atol=1e-1))
    def test_inference_speaker_identification(self):
        model = HubertForSequenceClassification.from_pretrained(
            "superb/hubert-base-superb-sid",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "superb/hubert-base-superb-sid")
        input_data = self._load_superb("si", 4)

        output_logits = []
        with torch.no_grad():
            for example in input_data["speech"]:
                input = processor(example, return_tensors="pt", padding=True)
                output = model(input.input_values.half().to(torch_device),
                               attention_mask=None)
                output_logits.append(output.logits[0])
        output_logits = torch.stack(output_logits)
        predicted_logits, predicted_ids = torch.max(output_logits, dim=-1)

        expected_labels = [5, 1, 1, 3]
        # s3prl logits for the same batch
        expected_logits = torch.tensor(
            [78231.5547, 123166.6094, 122785.4141, 84851.2969],
            dtype=torch.float16,
            device=torch_device)

        self.assertListEqual(predicted_ids.tolist(), expected_labels)
        # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
        self.assertTrue(
            torch.allclose(predicted_logits, expected_logits, atol=10))
    def test_inference_keyword_spotting(self):
        model = HubertForSequenceClassification.from_pretrained(
            "superb/hubert-base-superb-ks",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "superb/hubert-base-superb-ks")
        input_data = self._load_superb("ks", 4)
        inputs = processor(input_data["speech"],
                           return_tensors="pt",
                           padding=True)

        input_values = inputs.input_values.half().to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)
        predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)

        expected_labels = [2, 6, 10, 9]
        # s3prl logits for the same batch
        expected_logits = torch.tensor([7.6692, 17.7795, 11.1562, 11.8232],
                                       dtype=torch.float16,
                                       device=torch_device)

        self.assertListEqual(predicted_ids.tolist(), expected_labels)
        self.assertTrue(
            torch.allclose(predicted_logits, expected_logits, atol=2e-2))
Пример #4
0
    def test_inference_intent_classification(self):
        model = HubertForSequenceClassification.from_pretrained("superb/hubert-base-superb-ic").to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ic")
        input_data = self._load_superb("ic", 4)
        inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)

        predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1)
        predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1)
        predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1)

        expected_labels_action = [1, 0, 4, 3]
        expected_logits_action = torch.tensor([5.9052, 12.5865, 4.4840, 10.0240], device=torch_device)
        expected_labels_object = [1, 10, 3, 4]
        expected_logits_object = torch.tensor([5.5316, 11.7946, 8.1672, 23.2415], device=torch_device)
        expected_labels_location = [0, 0, 0, 1]
        expected_logits_location = torch.tensor([5.2053, 8.9577, 10.0447, 8.1481], device=torch_device)

        self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action)
        self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object)
        self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location)

        # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
        self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=3e-1))
        self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=3e-1))
        self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=3e-1))
    def check_seq_classifier_loss(self, config, input_values, *args):
        model = HubertForSequenceClassification(config=config)
        model.to(torch_device)

        # make sure that dropout is disabled
        model.eval()

        input_values = input_values[:3]
        attention_mask = torch.ones(input_values.shape,
                                    device=torch_device,
                                    dtype=torch.long)

        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
        labels = ids_tensor((input_values.shape[0], 1),
                            len(model.config.id2label))

        # pad input
        for i in range(len(input_lengths)):
            input_values[i, input_lengths[i]:] = 0.0
            attention_mask[i, input_lengths[i]:] = 0

        masked_loss = model(input_values,
                            attention_mask=attention_mask,
                            labels=labels).loss.item()
        unmasked_loss = model(input_values, labels=labels).loss.item()

        self.parent.assertTrue(isinstance(masked_loss, float))
        self.parent.assertTrue(isinstance(unmasked_loss, float))
        self.parent.assertTrue(masked_loss != unmasked_loss)
Пример #6
0
    def check_seq_classifier_training(self, config, input_values, *args):
        config.ctc_zero_infinity = True
        model = HubertForSequenceClassification(config=config)
        model.to(torch_device)
        model.train()

        # freeze everything but the classification head
        model.freeze_base_model()

        input_values = input_values[:3]

        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
        labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label))

        # pad input
        for i in range(len(input_lengths)):
            input_values[i, input_lengths[i] :] = 0.0

        loss = model(input_values, labels=labels).loss
        self.parent.assertFalse(torch.isinf(loss).item())

        loss.backward()