def test_push_to_hub_in_organization(self):
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
        feature_extractor.push_to_hub("valid_org/test-feature-extractor",
                                      use_auth_token=self._token)

        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            "valid_org/test-feature-extractor")
        for k, v in feature_extractor.__dict__.items():
            self.assertEqual(v, getattr(new_feature_extractor, k))

        # Reset repo
        delete_repo(token=self._token,
                    repo_id="valid_org/test-feature-extractor")

        # Push to hub via save_pretrained
        with tempfile.TemporaryDirectory() as tmp_dir:
            feature_extractor.save_pretrained(
                tmp_dir,
                repo_id="valid_org/test-feature-extractor-org",
                push_to_hub=True,
                use_auth_token=self._token)

        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            "valid_org/test-feature-extractor-org")
        for k, v in feature_extractor.__dict__.items():
            self.assertEqual(v, getattr(new_feature_extractor, k))
Example No. 2
    def test_push_to_hub(self):
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            feature_extractor.save_pretrained(
                os.path.join(tmp_dir, "test-feature-extractor"), push_to_hub=True, use_auth_token=self._token
            )

            new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(f"{USER}/test-feature-extractor")
            for k, v in feature_extractor.__dict__.items():
                self.assertEqual(v, getattr(new_feature_extractor, k))
Example No. 3
    def test_cached_files_are_used_when_internet_is_down(self):
        # A mock response for an HTTP head request to emulate server down
        response_mock = mock.Mock()
        response_mock.status_code = 500
        response_mock.headers = []
        response_mock.raise_for_status.side_effect = HTTPError

        # Download this model to make sure it's in the cache.
        _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
        # Under the mock environment we get a 500 error when trying to reach the model.
        with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head:
            _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
            # This checks that we did call the fake head request
            mock_head.assert_called()
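The same cache fallback can also be exercised explicitly by forcing offline loading; local_files_only is a standard from_pretrained argument, shown here as a small side sketch.

# Once the checkpoint is cached, it can be loaded without any network access.
extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "hf-internal-testing/tiny-random-wav2vec2", local_files_only=True)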
Example No. 4
    def test_inference_pretraining(self):
        model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        model.to(torch_device)
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
        input_speech = self._load_datasamples(2)

        inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)

        with torch.no_grad():
            torch.manual_seed(0)
            outputs = model(
                inputs_dict.input_values.to(torch_device),
                attention_mask=inputs_dict.attention_mask.to(torch_device),
            )

        # compute cosine similarity
        cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)

        # pretrained model should have learned a high cosine similarity
        self.assertTrue(cosine_sim.mean() > 0.5)

        # fmt: off
        expected_cosine_sim_slice = torch.tensor(
            [[0.8290, 0.8335, 0.8815, 0.8580, 0.8249],
             [0.8892, 0.9221, 0.8711, 0.8601, 0.8482]],
            device=torch_device,
        )
        # fmt: on

        self.assertTrue(torch.allclose(cosine_sim[:, :5], expected_cosine_sim_slice, atol=1e-3))
Example No. 5
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path,
                             model_dump_path):
    """
    Copy/paste/tweak the model's weights to the transformers design.
    """
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    if checkpoint["Config"]["downstream_expert"]["modelrc"][
            "select"] not in SUPPORTED_MODELS:
        raise NotImplementedError(
            f"The supported s3prl models are {SUPPORTED_MODELS}")

    downstream_dict = checkpoint["Downstream"]

    hf_config = HubertConfig.from_pretrained(config_path)
    hf_model = HubertForSequenceClassification.from_pretrained(
        base_model_name, config=hf_config)
    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        base_model_name, return_attention_mask=True, do_normalize=False)

    if hf_config.use_weighted_layer_sum:
        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]

    hf_model.projector.weight.data = downstream_dict["projector.weight"]
    hf_model.projector.bias.data = downstream_dict["projector.bias"]
    hf_model.classifier.weight.data = downstream_dict[
        "model.post_net.linear.weight"]
    hf_model.classifier.bias.data = downstream_dict[
        "model.post_net.linear.bias"]

    hf_feature_extractor.save_pretrained(model_dump_path)
    hf_model.save_pretrained(model_dump_path)
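For illustration, a hypothetical invocation of the converter above; every path below is a placeholder and the backbone id is just an example Hubert checkpoint, not something referenced by this snippet.

# Hypothetical example call; none of these paths point to real files.
convert_s3prl_checkpoint(
    base_model_name="facebook/hubert-base-ls960",  # backbone the s3prl downstream head was trained on
    config_path="./hubert_config.json",            # HubertConfig JSON matching the target model
    checkpoint_path="./s3prl_downstream.ckpt",     # s3prl checkpoint containing the "Downstream" weights
    model_dump_path="./converted-model",           # output directory for save_pretrained
)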
Example No. 6
    def __init__(self,
                 source,
                 save_path,
                 output_norm=True,
                 freeze=True,
                 pretrain=True):
        super().__init__()

        # Download the extractor from HuggingFace.
        # The extractor is only used to retrieve the normalisation information
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            source, cache_dir=save_path)

        # Download the model from HuggingFace.
        # If pretrain is False, we do not download the pretrained weights;
        # if it is True, we download and load them.
        if not pretrain:
            config = Wav2Vec2Config.from_pretrained(source,
                                                    cache_dir=save_path)
            self.model = Wav2Vec2Model(config)
        else:
            self.model = Wav2Vec2Model.from_pretrained(source,
                                                       cache_dir=save_path)

        # We check if inputs need to be normalized w.r.t pretrained wav2vec2
        self.normalize_wav = self.feature_extractor.do_normalize

        self.freeze = freeze
        self.output_norm = output_norm
        if self.freeze:
            self.model.eval()
        else:
            self.model.train()
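Assuming this constructor belongs to SpeechBrain's HuggingFaceWav2Vec2 lobe (the class definition is not shown here), instantiation would look roughly like the sketch below; the checkpoint id and save path are illustrative.

# Sketch under the assumption above.
encoder = HuggingFaceWav2Vec2(
    source="facebook/wav2vec2-base-960h",  # HuggingFace model id (or local path) to load
    save_path="./pretrained_wav2vec2/",    # cache_dir passed to from_pretrained above
    output_norm=True,
    freeze=True,                           # puts the wrapped wav2vec2 model in eval mode, as above
    pretrain=True,                         # load pretrained weights instead of a random config init
)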
Example No. 7
    def test_inference_diarization(self):
        model = WavLMForAudioFrameClassification.from_pretrained("microsoft/wavlm-base-plus-sd").to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sd")
        input_data = self._load_superb("sd", 4)
        inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000)

        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)
        # labels is a one-hot array of shape (num_frames, num_speakers)
        labels = (outputs.logits > 0).long()

        # s3prl logits for the same batch
        expected_logits = torch.tensor(
            [
                [[-5.9566, -8.6554], [-5.7137, -8.9386], [-5.7906, -7.0973], [-5.7829, -5.9999]],
                [[-5.2086, -7.7878], [-4.8890, -7.9312], [-4.2004, -3.9101], [-5.4480, -4.6932]],
                [[-4.6105, -6.7178], [-5.1930, -6.1635], [-2.6228, -4.1123], [-2.7646, -3.1576]],
                [[-4.4477, -7.9206], [-3.9339, -7.3707], [-4.9528, -4.8242], [-3.6921, -2.9687]],
            ],
            device=torch_device,
        )
        self.assertEqual(labels[0, :, 0].sum(), 258)
        self.assertEqual(labels[0, :, 1].sum(), 647)
        # TODO: update the tolerance after the CI moves to torch 1.10
        self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2))
Example No. 8
    def test_inference_intent_classification(self):
        model = HubertForSequenceClassification.from_pretrained("superb/hubert-base-superb-ic").to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ic")
        input_data = self._load_superb("ic", 4)
        inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)

        predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1)
        predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1)
        predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1)

        expected_labels_action = [1, 0, 4, 3]
        expected_logits_action = torch.tensor([5.9052, 12.5865, 4.4840, 10.0240], device=torch_device)
        expected_labels_object = [1, 10, 3, 4]
        expected_logits_object = torch.tensor([5.5316, 11.7946, 8.1672, 23.2415], device=torch_device)
        expected_labels_location = [0, 0, 0, 1]
        expected_logits_location = torch.tensor([5.2053, 8.9577, 10.0447, 8.1481], device=torch_device)

        self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action)
        self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object)
        self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location)

        # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
        self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=3e-1))
        self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=3e-1))
        self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=3e-1))
Example No. 9
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
        Instantiate a :class:`~transformers.Wav2Vec2Processor` from a pretrained Wav2Vec2 processor.

        .. note::

            This class method is simply calling Wav2Vec2FeatureExtractor's
            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
            Wav2Vec2CTCTokenizer's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`.
            Please refer to the docstrings of the methods above for more information.

        Args:
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                This can be either:

                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing a feature extractor file saved using the
                  :meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
                  ``./my_model_directory/``.
                - a path or url to a saved feature extractor JSON `file`, e.g.,
                  ``./my_model_directory/feature_extraction_config.json``.
            **kwargs
                Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
                :class:`~transformers.PreTrainedTokenizer`
        """
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            pretrained_model_name_or_path, **kwargs)

        return cls(feature_extractor=feature_extractor)
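For context, a minimal sketch of what the delegation above amounts to, using only the public transformers API; the checkpoint id is an arbitrary public example, not one referenced by this snippet.

from transformers import Wav2Vec2FeatureExtractor

# Load the feature extractor directly, as the classmethod above does internally.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
print(feature_extractor.sampling_rate)  # 16000 for the wav2vec2 checkpoints
print(feature_extractor.do_normalize)   # whether raw audio is zero-mean/unit-variance normalised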
Example No. 10
    def test_inference_large(self):
        model = WavLMModel.from_pretrained("microsoft/wavlm-large").to(
            torch_device)
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            "microsoft/wavlm-large", return_attention_mask=True)

        input_speech = self._load_datasamples(2)

        inputs = feature_extractor(input_speech,
                                   return_tensors="pt",
                                   padding=True)

        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)

        with torch.no_grad():
            hidden_states_slice = (
                model(input_values, attention_mask=attention_mask)
                .last_hidden_state[:, -2:, -2:]
                .cpu()
            )

        EXPECTED_HIDDEN_STATES_SLICE = torch.tensor([[[0.2122, 0.0500],
                                                      [0.2118, 0.0563]],
                                                     [[0.1353, 0.1818],
                                                      [0.2453, 0.0595]]])

        self.assertTrue(
            torch.allclose(hidden_states_slice,
                           EXPECTED_HIDDEN_STATES_SLICE,
                           rtol=5e-2))
Example No. 11
    def test_inference_intent_classification(self):
        model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ic").to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic")
        input_data = self._load_superb("ic", 4)
        inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)

        predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1)
        predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1)
        predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1)

        expected_labels_action = [0, 0, 2, 3]
        expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device)
        expected_labels_object = [3, 10, 3, 4]
        expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device)
        expected_labels_location = [0, 0, 0, 1]
        expected_logits_location = torch.tensor([1.5335, 6.5096, 10.5704, 11.0569], device=torch_device)

        self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action)
        self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object)
        self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location)

        self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2))
        self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2))
        self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2))
Example No. 12
    def test_inference_emotion_recognition(self):
        model = HubertForSequenceClassification.from_pretrained(
            "superb/hubert-base-superb-er",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "superb/hubert-base-superb-er")
        input_data = self._load_superb("er", 4)
        inputs = processor(input_data["speech"],
                           return_tensors="pt",
                           padding=True)

        input_values = inputs.input_values.half().to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)
        predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)

        expected_labels = [1, 1, 2, 2]
        # s3prl logits for the same batch
        expected_logits = torch.tensor([2.8384, 2.3389, 3.8564, 4.5558],
                                       dtype=torch.float16,
                                       device=torch_device)

        self.assertListEqual(predicted_ids.tolist(), expected_labels)
        # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
        self.assertTrue(
            torch.allclose(predicted_logits, expected_logits, atol=1e-1))
Example No. 13
    def test_inference_speaker_identification(self):
        model = Wav2Vec2ForSequenceClassification.from_pretrained(
            "superb/wav2vec2-base-superb-sid").to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "superb/wav2vec2-base-superb-sid")
        input_data = self._load_superb("si", 4)

        output_logits = []
        with torch.no_grad():
            for example in input_data["speech"]:
                input = processor(example, return_tensors="pt", padding=True)
                output = model(input.input_values.to(torch_device),
                               attention_mask=None)
                output_logits.append(output.logits[0])
        output_logits = torch.stack(output_logits)
        predicted_logits, predicted_ids = torch.max(output_logits, dim=-1)

        expected_labels = [251, 1, 1, 3]
        # s3prl logits for the same batch
        expected_logits = torch.tensor([37.5627, 71.6362, 64.2419, 31.7778],
                                       device=torch_device)

        self.assertListEqual(predicted_ids.tolist(), expected_labels)
        self.assertTrue(
            torch.allclose(predicted_logits, expected_logits, atol=1e-2))
Example No. 14
    def test_inference_encoder_large(self):
        model = UniSpeechSatModel.from_pretrained(
            "microsoft/unispeech-sat-large")
        model.to(torch_device)
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            "facebook/wav2vec2-large-xlsr-53")
        input_speech = self._load_datasamples(2)

        inputs_dict = feature_extractor(input_speech,
                                        return_tensors="pt",
                                        padding=True)

        with torch.no_grad():
            outputs = model(
                inputs_dict.input_values.to(torch_device),
                attention_mask=inputs_dict.attention_mask.to(torch_device),
            )

        # fmt: off
        expected_hidden_states_slice = torch.tensor(
            [[[-0.1172, -0.0797], [-0.0012, 0.0213]],
             [[-0.1225, -0.1277], [-0.0668, -0.0585]]],
            device=torch_device,
        )
        # fmt: on

        self.assertTrue(
            torch.allclose(outputs.last_hidden_state[:, :2, -2:],
                           expected_hidden_states_slice,
                           atol=1e-3))
Example No. 15
    def test_inference_speaker_identification(self):
        model = HubertForSequenceClassification.from_pretrained(
            "superb/hubert-base-superb-sid",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "superb/hubert-base-superb-sid")
        input_data = self._load_superb("si", 4)

        output_logits = []
        with torch.no_grad():
            for example in input_data["speech"]:
                input = processor(example, return_tensors="pt", padding=True)
                output = model(input.input_values.half().to(torch_device),
                               attention_mask=None)
                output_logits.append(output.logits[0])
        output_logits = torch.stack(output_logits)
        predicted_logits, predicted_ids = torch.max(output_logits, dim=-1)

        expected_labels = [5, 1, 1, 3]
        # s3prl logits for the same batch
        expected_logits = torch.tensor(
            [78231.5547, 123166.6094, 122785.4141, 84851.2969],
            dtype=torch.float16,
            device=torch_device)

        self.assertListEqual(predicted_ids.tolist(), expected_labels)
        # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
        self.assertTrue(
            torch.allclose(predicted_logits, expected_logits, atol=10))
Example No. 16
    def test_inference_keyword_spotting(self):
        model = HubertForSequenceClassification.from_pretrained(
            "superb/hubert-base-superb-ks",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "superb/hubert-base-superb-ks")
        input_data = self._load_superb("ks", 4)
        inputs = processor(input_data["speech"],
                           return_tensors="pt",
                           padding=True)

        input_values = inputs.input_values.half().to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)
        predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)

        expected_labels = [2, 6, 10, 9]
        # s3prl logits for the same batch
        expected_logits = torch.tensor([7.6692, 17.7795, 11.1562, 11.8232],
                                       dtype=torch.float16,
                                       device=torch_device)

        self.assertListEqual(predicted_ids.tolist(), expected_labels)
        self.assertTrue(
            torch.allclose(predicted_logits, expected_logits, atol=2e-2))
Example No. 17
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path,
                             model_dump_path):
    """
    Copy/paste/tweak the model's weights to the transformers design.
    """
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    downstream_dict = checkpoint["Downstream"]

    hf_config = UniSpeechSatConfig.from_pretrained(config_path)
    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        base_model_name, return_attention_mask=True, do_normalize=False)

    arch = hf_config.architectures[0]
    if arch.endswith("ForSequenceClassification"):
        hf_model = convert_classification(base_model_name, hf_config,
                                          downstream_dict)
    elif arch.endswith("ForAudioFrameClassification"):
        hf_model = convert_diarization(base_model_name, hf_config,
                                       downstream_dict)
    elif arch.endswith("ForXVector"):
        hf_model = convert_xvector(base_model_name, hf_config, downstream_dict)
    else:
        raise NotImplementedError(
            f"S3PRL weights conversion is not supported for {arch}")

    if hf_config.use_weighted_layer_sum:
        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]

    hf_feature_extractor.save_pretrained(model_dump_path)
    hf_model.save_pretrained(model_dump_path)
Example No. 18
    def test_inference_integration(self):
        model = Wav2Vec2ForPreTraining.from_pretrained(
            "facebook/wav2vec2-base")
        model.to(torch_device)
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            "facebook/wav2vec2-base", return_attention_mask=True)
        input_speech = self._load_datasamples(2)

        inputs_dict = feature_extractor(input_speech,
                                        return_tensors="pt",
                                        padding=True)

        features_shape = (
            inputs_dict["input_values"].shape[0],
            model._get_feat_extract_output_lengths(
                torch.tensor(inputs_dict["input_values"].shape[1])),
        )

        torch.manual_seed(0)
        mask_time_indices = _compute_mask_indices(
            features_shape,
            model.config.mask_time_prob,
            model.config.mask_time_length,
            device=inputs_dict["input_values"].device,
            min_masks=2,
        ).to(torch_device)

        with torch.no_grad():
            outputs = model(
                inputs_dict.input_values.to(torch_device),
                attention_mask=inputs_dict.attention_mask.to(torch_device),
                mask_time_indices=mask_time_indices,
            )

        # compute cosine similarity
        cosine_sim = torch.cosine_similarity(
            outputs.projected_states,
            outputs.projected_quantized_states,
            dim=-1)

        # retrieve cosine sim of masked features
        cosine_sim_masked = cosine_sim[mask_time_indices]

        # fmt: off
        expected_cosine_sim_masked = torch.tensor(
            [
                0.7458, 0.7188, 0.6418, 0.3729, 0.3741, 0.3694, 0.3110, 0.2257,
                0.4403, 0.5415, 0.3950, 0.3701, 0.8831, 0.8613, 0.5229, 0.6696,
                0.7206, 0.7877, 0.6758, 0.8746, 0.6596, 0.6282, 0.6178, 0.5839,
                0.5926, 0.6651, 0.4635, 0.6332, 0.6572, 0.8776, 0.4999, 0.7001,
                0.7257, 0.5098, 0.6229, 0.4566, 0.5261, 0.6363, 0.5371, 0.6997
            ],
            device=torch_device,
        )
        # fmt: on

        self.assertTrue(
            torch.allclose(cosine_sim_masked,
                           expected_cosine_sim_masked,
                           atol=1e-3))
Example No. 19
    def test_inference_base(self):
        model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus").to(
            torch_device)
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            "microsoft/wavlm-base-plus", return_attention_mask=True)

        input_speech = self._load_datasamples(2)

        inputs = feature_extractor(input_speech,
                                   return_tensors="pt",
                                   padding=True)

        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)

        with torch.no_grad():
            hidden_states_slice = (
                model(input_values, attention_mask=attention_mask)
                .last_hidden_state[:, -2:, -2:]
                .cpu()
            )

        EXPECTED_HIDDEN_STATES_SLICE = torch.tensor([[[0.0577, 0.1161],
                                                      [0.0579, 0.1165]],
                                                     [[0.0199, 0.1237],
                                                      [0.0059, 0.0605]]])
        # TODO: update the tolerance after the CI moves to torch 1.10
        self.assertTrue(
            torch.allclose(hidden_states_slice,
                           EXPECTED_HIDDEN_STATES_SLICE,
                           atol=5e-2))
Example No. 20
    def test_inference_diarization(self):
        model = UniSpeechSatForAudioFrameClassification.from_pretrained("microsoft/unispeech-sat-base-plus-sd").to(
            torch_device
        )
        processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/unispeech-sat-base-plus-sd")
        input_data = self._load_superb("sd", 4)
        inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000)

        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        with torch.no_grad():
            outputs = model(input_values, attention_mask=attention_mask)
        # labels is a one-hot array of shape (num_frames, num_speakers)
        labels = (outputs.logits > 0).long()

        # s3prl logits for the same batch
        expected_logits = torch.tensor(
            [
                [[-5.6119, -5.5845], [-3.7772, -5.4824], [-3.6914, -5.1619], [-4.7560, -5.0496]],
                [[-6.3785, -4.8365], [-5.5863, -5.4149], [-5.5639, -4.8469], [-6.1511, -4.0052]],
                [[-6.0355, -3.7414], [-5.5968, -4.8061], [-5.4620, -4.7310], [-5.5864, -4.6078]],
                [[-5.9493, -4.8963], [-4.4050, -5.4476], [-4.1755, -5.1395], [-4.0272, -4.3705]],
            ],
            device=torch_device,
        )
        self.assertEqual(labels[0, :, 0].sum(), 270)
        self.assertEqual(labels[0, :, 1].sum(), 647)
        # TODO: update the tolerance after the CI moves to torch 1.10
        self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2))
Example No. 21
    def test_inference_speaker_verification(self):
        model = UniSpeechSatForXVector.from_pretrained(
            "microsoft/unispeech-sat-base-plus-sv").to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "microsoft/unispeech-sat-base-plus-sv")
        input_data = self._load_superb("si", 4)

        inputs = processor(input_data["speech"],
                           return_tensors="pt",
                           padding=True)
        labels = torch.tensor([5, 1, 1, 3], device=torch_device).T

        with torch.no_grad():
            input_values = inputs.input_values.to(torch_device)
            attention_mask = inputs.attention_mask.to(torch_device)
            outputs = model(input_values,
                            attention_mask=attention_mask,
                            labels=labels)
        embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1)

        cosine_sim = torch.nn.CosineSimilarity(dim=-1)
        # id10002 vs id10002
        self.assertAlmostEqual(
            cosine_sim(embeddings[1], embeddings[2]).item(), 0.9671, 3)
        # id10006 vs id10002
        self.assertAlmostEqual(
            cosine_sim(embeddings[0], embeddings[1]).item(), 0.4941, 3)
        # id10002 vs id10004
        self.assertAlmostEqual(
            cosine_sim(embeddings[2], embeddings[3]).item(), 0.5616, 3)

        # TODO: update the tolerance after the CI moves to torch 1.10
        self.assertAlmostEqual(outputs.loss.item(), 18.5925, 2)
Example No. 22
    def test_inference_encoder_base(self):
        model = UniSpeechSatModel.from_pretrained(
            "microsoft/unispeech-sat-base-plus")
        model.to(torch_device)
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            "facebook/wav2vec2-base", return_attention_mask=True)
        input_speech = self._load_datasamples(2)

        inputs_dict = feature_extractor(input_speech,
                                        return_tensors="pt",
                                        padding=True)

        with torch.no_grad():
            outputs = model(
                inputs_dict.input_values.to(torch_device),
                attention_mask=inputs_dict.attention_mask.to(torch_device),
            )

        # fmt: off
        expected_hidden_states_slice = torch.tensor(
            [[[-0.0743, 0.1384], [-0.0845, 0.1704]],
             [[-0.0954, 0.1936], [-0.1123, 0.2095]]],
            device=torch_device,
        )
        # fmt: on

        self.assertTrue(
            torch.allclose(outputs.last_hidden_state[:, :2, -2:],
                           expected_hidden_states_slice,
                           atol=1e-3))
Example No. 23
    def test_inference_pretrained(self):
        model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60", from_pt=True)
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            "facebook/wav2vec2-large-lv60", return_attention_mask=True
        )
        input_speech = self._load_datasamples(2)

        inputs_dict = feature_extractor(input_speech, return_tensors="np", padding=True)

        features_shape = (
            inputs_dict["input_values"].shape[0],
            model._get_feat_extract_output_lengths(np.array(inputs_dict["input_values"].shape[1])),
        )

        mask_time_indices = _compute_mask_indices(
            features_shape,
            model.config.mask_time_prob,
            model.config.mask_time_length,
            min_masks=2,
        )

        outputs = model(
            inputs_dict.input_values,
            attention_mask=inputs_dict.attention_mask,
            mask_time_indices=mask_time_indices,
        )

        # compute cosine similarity
        cosine_sim = optax.cosine_similarity(
            outputs.projected_states, outputs.projected_quantized_states, epsilon=1e-8
        )

        # retrieve cosine sim of masked features
        cosine_sim_masked = cosine_sim[mask_time_indices]

        # ... now compare to randomly initialized model

        config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-large-lv60")
        model_rand = FlaxWav2Vec2ForPreTraining(config)

        outputs_rand = model_rand(
            inputs_dict.input_values,
            attention_mask=inputs_dict.attention_mask,
            mask_time_indices=mask_time_indices,
        )

        # compute cosine similarity
        cosine_sim_rand = optax.cosine_similarity(
            outputs_rand.projected_states, outputs_rand.projected_quantized_states
        )

        # retrieve cosine sim of masked features
        cosine_sim_masked_rand = cosine_sim_rand[mask_time_indices]

        # a pretrained wav2vec2 model has learned to predict the quantized latent states
        # => the cosine similarity between quantized states and predicted states > 0.5
        # a random wav2vec2 model has not learned to predict the quantized latent states
        # => the cosine similarity between quantized states and predicted states is very likely < 0.1
        self.assertTrue(cosine_sim_masked.mean().item() - 5 * cosine_sim_masked_rand.mean().item() > 0)
Example No. 24
    def test_pretrained_checkpoints_are_set_correctly(self):
        # this test makes sure that models that are using
        # group norm don't have their feature extractor return the
        # attention_mask
        for model_id in WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST:
            config = Wav2Vec2Config.from_pretrained(model_id)
            feat_extract = Wav2Vec2FeatureExtractor.from_pretrained(model_id)

            # only "layer" feature extraction norm should make use of
            # attention_mask
            self.assertEqual(feat_extract.return_attention_mask, config.feat_extract_norm == "layer")
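As a concrete illustration of the invariant checked above (using one public checkpoint as an assumed example):

config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-large-xlsr-53")
feat_extract = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
# xlsr-53 uses "layer" feature-extractor norm, so its extractor returns an attention mask by default.
assert config.feat_extract_norm == "layer"
assert feat_extract.return_attention_mask is True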
Example No. 25
    def test_inference_pretrained_batched(self):
        model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k").to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained("asapp/sew-d-tiny-100k")

        input_speech = self._load_datasamples(2)

        inputs = processor(input_speech, return_tensors="pt", padding=True)

        input_values = inputs.input_values.to(torch_device)

        with torch.no_grad():
            outputs = model(input_values).last_hidden_state

        # expected outputs taken from the original SEW-D implementation
        expected_outputs_first = torch.tensor(
            [
                [
                    [-0.1619, 0.6995, 0.4062, -0.1014],
                    [-0.1364, 0.5960, 0.0952, -0.0873],
                    [-0.1572, 0.5718, 0.4228, -0.0864],
                    [-0.1325, 0.6823, 0.1387, -0.0871],
                ],
                [
                    [-0.1296, 0.4008, 0.4952, -0.1450],
                    [-0.1152, 0.3693, 0.3037, -0.1290],
                    [-0.1194, 0.6074, 0.3531, -0.1466],
                    [-0.1113, 0.3135, 0.2224, -0.1338],
                ],
            ],
            device=torch_device,
        )
        expected_outputs_last = torch.tensor(
            [
                [
                    [-0.1577, 0.5108, 0.8553, 0.2550],
                    [-0.1530, 0.3580, 0.6143, 0.2672],
                    [-0.1535, 0.4954, 0.8503, 0.1387],
                    [-0.1572, 0.3363, 0.6217, 0.1490],
                ],
                [
                    [-0.1338, 0.5459, 0.9607, -0.1133],
                    [-0.1502, 0.3738, 0.7313, -0.0986],
                    [-0.0953, 0.4708, 1.0821, -0.0944],
                    [-0.1474, 0.3598, 0.7248, -0.0748],
                ],
            ],
            device=torch_device,
        )
        expected_output_sum = 54201.0469

        self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=1e-3))
        self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=1e-3))
        self.assertTrue(abs(outputs.sum() - expected_output_sum) < 1)
Example No. 26
    def test_inference_pretrained_batched(self):
        model = SEWModel.from_pretrained("asapp/sew-tiny-100k").to(torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained("asapp/sew-tiny-100k")

        input_speech = self._load_datasamples(2)

        inputs = processor(input_speech, return_tensors="pt", padding=True)

        input_values = inputs.input_values.to(torch_device)

        with torch.no_grad():
            outputs = model(input_values).last_hidden_state

        # expected outputs taken from the original SEW implementation
        expected_outputs_first = torch.tensor(
            [
                [
                    [0.1509, 0.5372, 0.3061, -0.1694],
                    [-0.1700, 0.5764, 0.2753, -0.1299],
                    [0.1281, 0.7949, 0.2342, -0.1624],
                    [-0.1627, 0.6710, 0.2215, -0.1317],
                ],
                [
                    [0.0408, 1.4355, 0.8605, -0.0968],
                    [0.0393, 1.2368, 0.6826, 0.0364],
                    [-0.1269, 1.9215, 1.1677, -0.1297],
                    [-0.1654, 1.6524, 0.6877, -0.0196],
                ],
            ],
            device=torch_device,
        )
        expected_outputs_last = torch.tensor(
            [
                [
                    [1.3379, -0.1450, -0.1500, -0.0515],
                    [0.8364, -0.1680, -0.1248, -0.0689],
                    [1.2791, -0.1507, -0.1523, -0.0564],
                    [0.8208, -0.1690, -0.1199, -0.0751],
                ],
                [
                    [0.6959, -0.0861, -0.1235, -0.0861],
                    [0.4700, -0.1686, -0.1141, -0.1199],
                    [1.0776, -0.1137, -0.0124, -0.0472],
                    [0.5774, -0.1675, -0.0376, -0.0823],
                ],
            ],
            device=torch_device,
        )
        expected_output_sum = 62146.7422

        self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=5e-3))
        self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=5e-3))
        self.assertTrue(abs(outputs.sum() - expected_output_sum) < 5)
Example No. 27
    def __init__(
        self,
        source,
        save_path,
        output_norm=True,
        freeze=True,
        freeze_feature_extractor=False,
        pretrain=True,
        apply_spec_augment=False,
    ):
        super().__init__()

        # Download the extractor from HuggingFace.
        # The extractor is only used to retrieve the normalisation information
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            source, cache_dir=save_path
        )

        # Select the specific self-supervised loader (e.g. Wav2Vec2, Hubert)
        if "hubert" in source:
            config = HF_config.get("hubert")
            model = HF_models.get("hubert")
        else:
            config = HF_config.get("wav2vec2")
            model = HF_models.get("wav2vec2")

        # Download the model from HuggingFace.
        # If pretrain is False, we do not download the pretrained weights;
        # if it is True, we download and load them.
        if not pretrain:
            config = config.from_pretrained(source, cache_dir=save_path)
            self.model = model(config)
        else:
            self.model = model.from_pretrained(source, cache_dir=save_path)

        # set apply_spec_augment
        self.model.config.apply_spec_augment = apply_spec_augment

        # We check if inputs need to be normalized w.r.t pretrained wav2vec2
        self.normalize_wav = self.feature_extractor.do_normalize

        self.freeze = freeze
        self.freeze_feature_extractor = freeze_feature_extractor
        self.output_norm = output_norm
        if self.freeze:
            self.model.eval()
        else:
            self.model.train()
            if self.freeze_feature_extractor:
                self.model.feature_extractor._freeze_parameters()
Example No. 28
    def test_loss_pretraining(self):
        model = Wav2Vec2ForPreTraining.from_pretrained(
            "facebook/wav2vec2-base",
            attention_dropout=0.0,
            feat_proj_dropout=0.0,
            hidden_dropout=0.0,
            layerdrop=0.0,
        )
        model.to(torch_device).train()

        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            "facebook/wav2vec2-base", return_attention_mask=True)
        input_speech = self._load_datasamples(2)

        inputs_dict = feature_extractor(input_speech,
                                        return_tensors="pt",
                                        padding=True)

        features_shape = (
            inputs_dict["input_values"].shape[0],
            model._get_feat_extract_output_lengths(
                inputs_dict["input_values"].shape[1]),
        )

        torch.manual_seed(0)
        mask_time_indices = _compute_mask_indices(
            features_shape,
            model.config.mask_time_prob,
            model.config.mask_time_length,
            device=inputs_dict["input_values"].device,
            min_masks=2,
        ).to(torch_device)

        with torch.no_grad():
            outputs = model(
                inputs_dict.input_values.to(torch_device),
                attention_mask=inputs_dict.attention_mask.to(torch_device),
                mask_time_indices=mask_time_indices,
            )

        # check diversity loss
        num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups
        diversity_loss = (num_codevectors -
                          outputs.codevector_perplexity) / num_codevectors
        self.assertTrue(abs(diversity_loss.item() - 0.8859) < 1e-3)

        # check overall loss (contrastive loss + diversity loss)
        expected_loss = 62.5170

        self.assertTrue(abs(outputs.loss.item() - expected_loss) < 1e-3)
Example No. 29
    def __init__(
        self,
        source,
        save_path,
        output_norm=True,
        freeze=True,
        freeze_feature_extractor=False,
        apply_spec_augment=False,
    ):
        super().__init__()

        # Download the extractor from HuggingFace.
        # The extractor is only used to retrieve the normalisation information
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            source, cache_dir=save_path)

        # Select the specific self-supervised loader (e.g. Wav2Vec2, Hubert)
        if "hubert" in source:
            config = HF_config.get("hubert")
            model = HF_models.get("hubert")
        else:
            config = HF_config.get("wav2vec2")
            model = HF_models.get("wav2vec2")

        # Download and load the model
        self._from_pretrained(source,
                              config=config,
                              model=model,
                              save_path=save_path)

        # set apply_spec_augment
        self.model.config.apply_spec_augment = apply_spec_augment

        # We check if inputs need to be normalized w.r.t pretrained wav2vec2
        self.normalize_wav = self.feature_extractor.do_normalize

        self.freeze = freeze
        self.freeze_feature_extractor = freeze_feature_extractor
        self.output_norm = output_norm
        if self.freeze:
            logger.warning(
                "speechbrain.lobes.models.huggingface_wav2vec - wav2vec 2.0 is frozen."
            )
            self.model.eval()
            for param in self.model.parameters():
                param.requires_grad = False
        else:
            self.model.train()
            if self.freeze_feature_extractor:
                self.model.feature_extractor._freeze_parameters()
Example No. 30
    def __init__(self,
                 model_path: str = "facebook/wav2vec2-large-xlsr-53",
                 device: str = "cpu",
                 target_sample_rate: int = 16000) -> None:
        super().__init__()
        self.target_sample_rate = target_sample_rate
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            model_path, cache_dir=".ckpt")  # , map_location="cpu"
        if device != "cpu" and not torch.cuda.is_available():
            logging.warning("GPU is not available, falling back to CPU inference")
            device = "cpu"
        self.device = torch.device(device)
        self.model = Wav2Vec2Model.from_pretrained(
            model_path, cache_dir=".ckpt").to(self.device)
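For reference, a minimal standalone sketch of pushing raw audio through such an extractor/model pair (the surrounding class and its other methods are not shown above; the random waveform stands in for real 16 kHz audio):

import numpy as np
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53").eval()

waveform = np.random.randn(16000).astype(np.float32)  # one second of fake 16 kHz audio
inputs = extractor(waveform, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    hidden_states = model(inputs.input_values).last_hidden_state  # (1, num_frames, hidden_size)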