def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path):
    """Copy/paste/tweak model's weights to transformers design."""
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    downstream_dict = checkpoint["Downstream"]

    hf_config = WavLMConfig.from_pretrained(config_path)
    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        base_model_name, return_attention_mask=True, do_normalize=False
    )

    arch = hf_config.architectures[0]
    if arch.endswith("ForSequenceClassification"):
        hf_model = convert_classification(base_model_name, hf_config, downstream_dict)
    elif arch.endswith("ForAudioFrameClassification"):
        hf_model = convert_diarization(base_model_name, hf_config, downstream_dict)
    elif arch.endswith("ForXVector"):
        hf_model = convert_xvector(base_model_name, hf_config, downstream_dict)
    else:
        raise NotImplementedError(f"S3PRL weights conversion is not supported for {arch}")

    if hf_config.use_weighted_layer_sum:
        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]

    hf_feature_extractor.save_pretrained(model_dump_path)
    hf_model.save_pretrained(model_dump_path)

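# A minimal invocation sketch for the conversion function above. The base model
# name, paths, and output directory below are hypothetical placeholders; in the
# original conversion script such values are typically supplied on the command line.
if __name__ == "__main__":
    convert_s3prl_checkpoint(
        base_model_name="microsoft/wavlm-base-plus",      # upstream model the S3PRL head was trained on
        config_path="./wavlm_sd/config.json",             # hypothetical HF config with `architectures` set
        checkpoint_path="./s3prl_downstream_best.ckpt",   # hypothetical S3PRL downstream checkpoint
        model_dump_path="./converted_wavlm_sd",           # output directory for model + feature extractor
    )
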
def test_inference_integration(self):
    model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")
    model.to(torch_device)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "facebook/wav2vec2-base", return_attention_mask=True
    )
    input_speech = self._load_datasamples(2)

    inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)

    features_shape = (
        inputs_dict["input_values"].shape[0],
        model._get_feat_extract_output_lengths(torch.tensor(inputs_dict["input_values"].shape[1])),
    )

    torch.manual_seed(0)
    mask_time_indices = _compute_mask_indices(
        features_shape,
        model.config.mask_time_prob,
        model.config.mask_time_length,
        device=inputs_dict["input_values"].device,
        min_masks=2,
    ).to(torch_device)

    with torch.no_grad():
        outputs = model(
            inputs_dict.input_values.to(torch_device),
            attention_mask=inputs_dict.attention_mask.to(torch_device),
            mask_time_indices=mask_time_indices,
        )

    # compute cosine similarity
    cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)

    # retrieve cosine sim of masked features
    cosine_sim_masked = cosine_sim[mask_time_indices]

    # fmt: off
    expected_cosine_sim_masked = torch.tensor(
        [
            0.7458, 0.7188, 0.6418, 0.3729, 0.3741, 0.3694, 0.3110, 0.2257, 0.4403, 0.5415,
            0.3950, 0.3701, 0.8831, 0.8613, 0.5229, 0.6696, 0.7206, 0.7877, 0.6758, 0.8746,
            0.6596, 0.6282, 0.6178, 0.5839, 0.5926, 0.6651, 0.4635, 0.6332, 0.6572, 0.8776,
            0.4999, 0.7001, 0.7257, 0.5098, 0.6229, 0.4566, 0.5261, 0.6363, 0.5371, 0.6997
        ],
        device=torch_device,
    )
    # fmt: on

    self.assertTrue(torch.allclose(cosine_sim_masked, expected_cosine_sim_masked, atol=1e-3))

def __init__(self):
    super(ASR_CTC, self).__init__()
    # self.wav2Vec2Tokenizer = Wav2Vec2Tokenizer.from_pretrained('facebook/wav2vec2-base')
    # self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base')
    # self.nb_labels = len(self.wav2Vec2Tokenizer.get_vocab())
    # self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base')
    self.tokenizer = Wav2Vec2CTCTokenizer(
        "./vocab.json", unk_token="<unk>", pad_token="<pad>", word_delimiter_token="|"
    )
    self.feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    self.processor = Wav2Vec2Processor(
        feature_extractor=self.feature_extractor, tokenizer=self.tokenizer
    )
    self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained(
        "facebook/wav2vec2-large-xlsr-53",
        attention_dropout=0.1,
        hidden_dropout=0.1,
        feat_proj_dropout=0.0,
        mask_time_prob=0.05,
        layerdrop=0.1,
        gradient_checkpointing=True,
        ctc_loss_reduction="mean",
        pad_token_id=self.processor.tokenizer.pad_token_id,
        vocab_size=len(self.processor.tokenizer),
    )

def test_inference_keyword_spotting(self):
    model = HubertForSequenceClassification.from_pretrained(
        "superb/hubert-base-superb-ks", torch_dtype=torch.float16
    ).to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ks")
    input_data = self._load_superb("ks", 4)
    inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

    input_values = inputs.input_values.half().to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)
    with torch.no_grad():
        outputs = model(input_values, attention_mask=attention_mask)
    predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)

    expected_labels = [2, 6, 10, 9]
    # s3prl logits for the same batch
    expected_logits = torch.tensor(
        [7.6692, 17.7795, 11.1562, 11.8232], dtype=torch.float16, device=torch_device
    )

    self.assertListEqual(predicted_ids.tolist(), expected_labels)
    self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=2e-2))

def test_inference_diarization(self):
    model = WavLMForAudioFrameClassification.from_pretrained("microsoft/wavlm-base-plus-sd").to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sd")
    input_data = self._load_superb("sd", 4)
    inputs = processor(input_data["speech"], return_tensors="pt", padding=True, sampling_rate=16_000)

    input_values = inputs.input_values.to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)
    with torch.no_grad():
        outputs = model(input_values, attention_mask=attention_mask)
    # labels is a one-hot array of shape (num_frames, num_speakers)
    labels = (outputs.logits > 0).long()

    # s3prl logits for the same batch
    expected_logits = torch.tensor(
        [
            [[-5.9566, -8.6554], [-5.7137, -8.9386], [-5.7906, -7.0973], [-5.7829, -5.9999]],
            [[-5.2086, -7.7878], [-4.8890, -7.9312], [-4.2004, -3.9101], [-5.4480, -4.6932]],
            [[-4.6105, -6.7178], [-5.1930, -6.1635], [-2.6228, -4.1123], [-2.7646, -3.1576]],
            [[-4.4477, -7.9206], [-3.9339, -7.3707], [-4.9528, -4.8242], [-3.6921, -2.9687]],
        ],
        device=torch_device,
    )
    self.assertEqual(labels[0, :, 0].sum(), 258)
    self.assertEqual(labels[0, :, 1].sum(), 647)
    self.assertTrue(torch.allclose(outputs.logits[:, :4], expected_logits, atol=1e-3))

def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    r"""
    Instantiate a :class:`~transformers.Wav2Vec2Processor` from a pretrained Wav2Vec2 processor.

    .. note::

        This class method is simply calling Wav2Vec2FeatureExtractor's
        :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
        Wav2Vec2CTCTokenizer's
        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
        docstrings of the methods above for more information.

    Args:
        pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
            This can be either:

            - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
              namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
            - a path to a `directory` containing a feature extractor file saved using the
              :meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
              ``./my_model_directory/``.
            - a path or url to a saved feature extractor JSON `file`, e.g.,
              ``./my_model_directory/feature_extraction_config.json``.
        **kwargs
            Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
            :class:`~transformers.PreTrainedTokenizer`
    """
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

    return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

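# A minimal usage sketch for the class method above, assuming the public
# "facebook/wav2vec2-base-960h" checkpoint (which ships both a feature extractor
# config and a tokenizer vocabulary). `raw_speech` stands in for a list of 1-D
# float arrays sampled at 16 kHz.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
inputs = processor(raw_speech, sampling_rate=16000, return_tensors="pt", padding=True)
# `inputs` holds `input_values` (and `attention_mask` when the feature extractor returns one).
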
def test_inference_encoder_large(self):
    model = UniSpeechSatModel.from_pretrained("microsoft/unispeech-sat-large")
    model.to(torch_device)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

    input_speech = self._load_datasamples(2)

    inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)

    with torch.no_grad():
        outputs = model(
            inputs_dict.input_values.to(torch_device),
            attention_mask=inputs_dict.attention_mask.to(torch_device),
        )

    # fmt: off
    expected_hidden_states_slice = torch.tensor(
        [[[-0.1172, -0.0797], [-0.0012, 0.0213]],
         [[-0.1225, -0.1277], [-0.0668, -0.0585]]],
        device=torch_device,
    )
    # fmt: on

    self.assertTrue(
        torch.allclose(outputs.last_hidden_state[:, :2, -2:], expected_hidden_states_slice, atol=1e-3)
    )

def test_inference_speaker_verification(self):
    model = UniSpeechSatForXVector.from_pretrained("microsoft/unispeech-sat-base-plus-sv").to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/unispeech-sat-base-plus-sv")
    input_data = self._load_superb("si", 4)

    inputs = processor(input_data["speech"], return_tensors="pt", padding=True)
    labels = torch.tensor([5, 1, 1, 3], device=torch_device).T

    with torch.no_grad():
        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        outputs = model(input_values, attention_mask=attention_mask, labels=labels)
    embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1)

    cosine_sim = torch.nn.CosineSimilarity(dim=-1)
    # id10002 vs id10002
    self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).item(), 0.9671, 3)
    # id10006 vs id10002
    self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).item(), 0.4941, 3)
    # id10002 vs id10004
    self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).item(), 0.5616, 3)

    self.assertAlmostEqual(outputs.loss.item(), 18.5925, 3)

def test_inference_pretraining(self):
    model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
    model.to(torch_device)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

    input_speech = self._load_datasamples(2)

    inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)

    with torch.no_grad():
        torch.manual_seed(0)
        outputs = model(
            inputs_dict.input_values.to(torch_device),
            attention_mask=inputs_dict.attention_mask.to(torch_device),
        )

    # compute cosine similarity
    cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)

    # pretrained model should have learned a high cosine similarity
    self.assertTrue(cosine_sim.mean() > 0.5)

    # fmt: off
    expected_cosine_sim_slice = torch.tensor(
        [[0.8290, 0.8335, 0.8815, 0.8580, 0.8249],
         [0.8892, 0.9221, 0.8711, 0.8601, 0.8482]],
        device=torch_device,
    )
    # fmt: on

    self.assertTrue(torch.allclose(cosine_sim[:, :5], expected_cosine_sim_slice, atol=1e-3))

def test_inference_encoder_base(self):
    model = UniSpeechSatModel.from_pretrained("microsoft/unispeech-sat-base-plus")
    model.to(torch_device)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "facebook/wav2vec2-base", return_attention_mask=True
    )

    input_speech = self._load_datasamples(2)

    inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)

    with torch.no_grad():
        outputs = model(
            inputs_dict.input_values.to(torch_device),
            attention_mask=inputs_dict.attention_mask.to(torch_device),
        )

    # fmt: off
    expected_hidden_states_slice = torch.tensor(
        [[[-0.0743, 0.1384], [-0.0845, 0.1704]],
         [[-0.0954, 0.1936], [-0.1123, 0.2095]]],
        device=torch_device,
    )
    # fmt: on

    self.assertTrue(
        torch.allclose(outputs.last_hidden_state[:, :2, -2:], expected_hidden_states_slice, atol=1e-3)
    )

def test_inference_large(self):
    model = WavLMModel.from_pretrained("microsoft/wavlm-large").to(torch_device)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "microsoft/wavlm-large", return_attention_mask=True
    )

    input_speech = self._load_datasamples(2)

    inputs = feature_extractor(input_speech, return_tensors="pt", padding=True)

    input_values = inputs.input_values.to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)

    with torch.no_grad():
        hidden_states_slice = (
            model(input_values, attention_mask=attention_mask).last_hidden_state[:, -2:, -2:].cpu()
        )

    EXPECTED_HIDDEN_STATES_SLICE = torch.tensor(
        [[[0.2122, 0.0500], [0.2118, 0.0563]],
         [[0.1353, 0.1818], [0.2453, 0.0595]]]
    )

    self.assertTrue(torch.allclose(hidden_states_slice, EXPECTED_HIDDEN_STATES_SLICE, rtol=5e-2))

def test_inference_speaker_verification(self):
    model = WavLMForXVector.from_pretrained("microsoft/wavlm-base-plus-sv").to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sv")
    input_data = self._load_superb("si", 4)

    inputs = processor(input_data["speech"], return_tensors="pt", padding=True)
    labels = torch.tensor([5, 1, 1, 3], device=torch_device).T

    with torch.no_grad():
        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)
        outputs = model(input_values, attention_mask=attention_mask, labels=labels)
    embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1)

    cosine_sim = torch.nn.CosineSimilarity(dim=-1)
    # id10002 vs id10002
    self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).item(), 0.9787, 3)
    # id10006 vs id10002
    self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).item(), 0.5064, 3)
    # id10002 vs id10004
    self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).item(), 0.4780, 3)

    # TODO: update the tolerance after the CI moves to torch 1.10
    self.assertAlmostEqual(outputs.loss.item(), 18.4154, 2)

def test_inference_base(self):
    model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus").to(torch_device)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "microsoft/wavlm-base-plus", return_attention_mask=True
    )

    input_speech = self._load_datasamples(2)

    inputs = feature_extractor(input_speech, return_tensors="pt", padding=True)

    input_values = inputs.input_values.to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)

    with torch.no_grad():
        hidden_states_slice = (
            model(input_values, attention_mask=attention_mask).last_hidden_state[:, -2:, -2:].cpu()
        )

    EXPECTED_HIDDEN_STATES_SLICE = torch.tensor(
        [[[0.0577, 0.1161], [0.0579, 0.1165]],
         [[0.0199, 0.1237], [0.0059, 0.0605]]]
    )

    # TODO: update the tolerance after the CI moves to torch 1.10
    self.assertTrue(torch.allclose(hidden_states_slice, EXPECTED_HIDDEN_STATES_SLICE, atol=5e-2))

def test_inference_emotion_recognition(self):
    model = HubertForSequenceClassification.from_pretrained(
        "superb/hubert-base-superb-er", torch_dtype=torch.float16
    ).to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-er")
    input_data = self._load_superb("er", 4)
    inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

    input_values = inputs.input_values.half().to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)
    with torch.no_grad():
        outputs = model(input_values, attention_mask=attention_mask)
    predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1)

    expected_labels = [1, 1, 2, 2]
    # s3prl logits for the same batch
    expected_logits = torch.tensor(
        [2.8384, 2.3389, 3.8564, 4.5558], dtype=torch.float16, device=torch_device
    )

    self.assertListEqual(predicted_ids.tolist(), expected_labels)
    # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
    self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-1))

def test_inference_speaker_identification(self):
    model = HubertForSequenceClassification.from_pretrained(
        "superb/hubert-base-superb-sid", torch_dtype=torch.float16
    ).to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-sid")
    input_data = self._load_superb("si", 4)

    output_logits = []
    with torch.no_grad():
        for example in input_data["speech"]:
            input = processor(example, return_tensors="pt", padding=True)
            output = model(input.input_values.half().to(torch_device), attention_mask=None)
            output_logits.append(output.logits[0])
    output_logits = torch.stack(output_logits)
    predicted_logits, predicted_ids = torch.max(output_logits, dim=-1)

    expected_labels = [5, 1, 1, 3]
    # s3prl logits for the same batch
    expected_logits = torch.tensor(
        [78231.5547, 123166.6094, 122785.4141, 84851.2969], dtype=torch.float16, device=torch_device
    )

    self.assertListEqual(predicted_ids.tolist(), expected_labels)
    # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
    self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=10))

def test_push_to_hub_in_organization(self):
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(SAMPLE_FEATURE_EXTRACTION_CONFIG_DIR)

    with tempfile.TemporaryDirectory() as tmp_dir:
        feature_extractor.save_pretrained(
            os.path.join(tmp_dir, "test-feature-extractor-org"),
            push_to_hub=True,
            use_auth_token=self._token,
            organization="valid_org",
        )

        new_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("valid_org/test-feature-extractor-org")
        for k, v in feature_extractor.__dict__.items():
            self.assertEqual(v, getattr(new_feature_extractor, k))

def load_feature_extractor():
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    return feature_extractor

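# A minimal usage sketch for the helper above. The waveform is synthetic
# (one second of silence at 16 kHz) and stands in for real audio.
import numpy as np

feature_extractor = load_feature_extractor()
waveform = np.zeros(16000, dtype=np.float32)
features = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
# `features` holds zero-mean/unit-variance `input_values` and, because
# return_attention_mask=True, an `attention_mask` marking non-padded samples.
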
def test_inference_intent_classification(self):
    model = HubertForSequenceClassification.from_pretrained(
        "superb/hubert-base-superb-ic", torch_dtype=torch.float16
    ).to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ic")
    input_data = self._load_superb("ic", 4)
    inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

    input_values = inputs.input_values.half().to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)
    with torch.no_grad():
        outputs = model(input_values, attention_mask=attention_mask)

    predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1)
    predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1)
    predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1)

    expected_labels_action = [1, 0, 4, 3]
    expected_logits_action = torch.tensor(
        [5.9052, 12.5865, 4.4840, 10.0240], dtype=torch.float16, device=torch_device
    )
    expected_labels_object = [1, 10, 3, 4]
    expected_logits_object = torch.tensor(
        [5.5316, 11.7946, 8.1672, 23.2415], dtype=torch.float16, device=torch_device
    )
    expected_labels_location = [0, 0, 0, 1]
    expected_logits_location = torch.tensor(
        [5.2053, 8.9577, 10.0447, 8.1481], dtype=torch.float16, device=torch_device
    )

    self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action)
    self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object)
    self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location)

    # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
    self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=3e-1))
    self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=3e-1))
    self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=3e-1))

def test_inference_pretrained(self):
    model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60", from_pt=True)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "facebook/wav2vec2-large-lv60", return_attention_mask=True
    )
    input_speech = self._load_datasamples(2)

    inputs_dict = feature_extractor(input_speech, return_tensors="np", padding=True)

    features_shape = (
        inputs_dict["input_values"].shape[0],
        model._get_feat_extract_output_lengths(np.array(inputs_dict["input_values"].shape[1])),
    )

    mask_time_indices = _compute_mask_indices(
        features_shape,
        model.config.mask_time_prob,
        model.config.mask_time_length,
        min_masks=2,
    )

    outputs = model(
        inputs_dict.input_values,
        attention_mask=inputs_dict.attention_mask,
        mask_time_indices=mask_time_indices,
    )

    # compute cosine similarity
    cosine_sim = optax.cosine_similarity(
        outputs.projected_states, outputs.projected_quantized_states, epsilon=1e-8
    )

    # retrieve cosine sim of masked features
    cosine_sim_masked = cosine_sim[mask_time_indices]

    # ... now compare to randomly initialized model

    config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-large-lv60")
    model_rand = FlaxWav2Vec2ForPreTraining(config)

    outputs_rand = model_rand(
        inputs_dict.input_values,
        attention_mask=inputs_dict.attention_mask,
        mask_time_indices=mask_time_indices,
    )

    # compute cosine similarity
    cosine_sim_rand = optax.cosine_similarity(
        outputs_rand.projected_states, outputs_rand.projected_quantized_states
    )

    # retrieve cosine sim of masked features
    cosine_sim_masked_rand = cosine_sim_rand[mask_time_indices]

    # a pretrained wav2vec2 model has learned to predict the quantized latent states
    # => the cosine similarity between quantized states and predicted states > 0.5
    # a random wav2vec2 model has not learned to predict the quantized latent states
    # => the cosine similarity between quantized states and predicted states is very likely < 0.1
    self.assertTrue(cosine_sim_masked.mean().item() - 5 * cosine_sim_masked_rand.mean().item() > 0)

def test_cached_files_are_used_when_internet_is_down(self):
    # A mock response for an HTTP head request to emulate server down
    response_mock = mock.Mock()
    response_mock.status_code = 500
    response_mock.headers = []
    response_mock.raise_for_status.side_effect = HTTPError

    # Download this model to make sure it's in the cache.
    _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")

    # Under the mock environment we get a 500 error when trying to reach the model.
    with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head:
        _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
        # This checks that we did call the fake head request
        mock_head.assert_called()

def _init_processor(self, config: EasyDict):
    config.processor.tokenizer.vocab_file = config.common.vocab_file
    tokenizer = Wav2Vec2CTCTokenizer(**config.processor.tokenizer)
    feature_extractor = Wav2Vec2FeatureExtractor(**config.processor.feature_extractor)
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    processor.save_pretrained(config.common.model_path)
    self._processor = processor

def test_pretrained_checkpoints_are_set_correctly(self):
    # this test makes sure that models that are using
    # group norm don't have their feature extractor return the
    # attention_mask
    for model_id in WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST:
        config = Wav2Vec2Config.from_pretrained(model_id)
        feat_extract = Wav2Vec2FeatureExtractor.from_pretrained(model_id)

        # only "layer" feature extraction norm should make use of
        # attention_mask
        self.assertEqual(feat_extract.return_attention_mask, config.feat_extract_norm == "layer")

def test_inference_pretrained_batched(self):
    model = SEWModel.from_pretrained("asapp/sew-tiny-100k").to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("asapp/sew-tiny-100k")

    input_speech = self._load_datasamples(2)

    inputs = processor(input_speech, return_tensors="pt", padding=True)

    input_values = inputs.input_values.to(torch_device)

    with torch.no_grad():
        outputs = model(input_values).last_hidden_state

    # expected outputs taken from the original SEW implementation
    expected_outputs_first = torch.tensor(
        [
            [
                [0.1509, 0.5372, 0.3061, -0.1694],
                [-0.1700, 0.5764, 0.2753, -0.1299],
                [0.1281, 0.7949, 0.2342, -0.1624],
                [-0.1627, 0.6710, 0.2215, -0.1317],
            ],
            [
                [0.0408, 1.4355, 0.8605, -0.0968],
                [0.0393, 1.2368, 0.6826, 0.0364],
                [-0.1269, 1.9215, 1.1677, -0.1297],
                [-0.1654, 1.6524, 0.6877, -0.0196],
            ],
        ],
        device=torch_device,
    )
    expected_outputs_last = torch.tensor(
        [
            [
                [1.3379, -0.1450, -0.1500, -0.0515],
                [0.8364, -0.1680, -0.1248, -0.0689],
                [1.2791, -0.1507, -0.1523, -0.0564],
                [0.8208, -0.1690, -0.1199, -0.0751],
            ],
            [
                [0.6959, -0.0861, -0.1235, -0.0861],
                [0.4700, -0.1686, -0.1141, -0.1199],
                [1.0776, -0.1137, -0.0124, -0.0472],
                [0.5774, -0.1675, -0.0376, -0.0823],
            ],
        ],
        device=torch_device,
    )
    expected_output_sum = 62146.7422

    self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=5e-3))
    self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=5e-3))
    self.assertTrue(abs(outputs.sum() - expected_output_sum) < 5)

def test_inference_pretrained_batched(self):
    model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k").to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("asapp/sew-d-tiny-100k")

    input_speech = self._load_datasamples(2)

    inputs = processor(input_speech, return_tensors="pt", padding=True)

    input_values = inputs.input_values.to(torch_device)

    with torch.no_grad():
        outputs = model(input_values).last_hidden_state

    # expected outputs taken from the original SEW-D implementation
    expected_outputs_first = torch.tensor(
        [
            [
                [-0.1619, 0.6995, 0.4062, -0.1014],
                [-0.1364, 0.5960, 0.0952, -0.0873],
                [-0.1572, 0.5718, 0.4228, -0.0864],
                [-0.1325, 0.6823, 0.1387, -0.0871],
            ],
            [
                [-0.1296, 0.4008, 0.4952, -0.1450],
                [-0.1152, 0.3693, 0.3037, -0.1290],
                [-0.1194, 0.6074, 0.3531, -0.1466],
                [-0.1113, 0.3135, 0.2224, -0.1338],
            ],
        ],
        device=torch_device,
    )
    expected_outputs_last = torch.tensor(
        [
            [
                [-0.1577, 0.5108, 0.8553, 0.2550],
                [-0.1530, 0.3580, 0.6143, 0.2672],
                [-0.1535, 0.4954, 0.8503, 0.1387],
                [-0.1572, 0.3363, 0.6217, 0.1490],
            ],
            [
                [-0.1338, 0.5459, 0.9607, -0.1133],
                [-0.1502, 0.3738, 0.7313, -0.0986],
                [-0.0953, 0.4708, 1.0821, -0.0944],
                [-0.1474, 0.3598, 0.7248, -0.0748],
            ],
        ],
        device=torch_device,
    )
    expected_output_sum = 54201.0469

    self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=1e-3))
    self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=1e-3))
    self.assertTrue(abs(outputs.sum() - expected_output_sum) < 1)

def test_inference_intent_classification(self):
    model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ic").to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic")
    input_data = self._load_superb("ic", 4)
    inputs = processor(input_data["speech"], return_tensors="pt", padding=True)

    input_values = inputs.input_values.to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)
    with torch.no_grad():
        outputs = model(input_values, attention_mask=attention_mask)

    predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1)
    predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1)
    predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1)

    expected_labels_action = [0, 0, 2, 3]
    expected_logits_action = torch.tensor([0.4568, 11.0848, 1.6621, 9.3841], device=torch_device)
    expected_labels_object = [3, 10, 3, 4]
    expected_logits_object = torch.tensor([1.5322, 10.7094, 5.2469, 22.1318], device=torch_device)
    expected_labels_location = [0, 0, 0, 1]
    expected_logits_location = torch.tensor([1.5335, 6.5096, 10.5704, 11.0569], device=torch_device)

    self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action)
    self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object)
    self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location)

    self.assertTrue(torch.allclose(predicted_logits_action, expected_logits_action, atol=1e-2))
    self.assertTrue(torch.allclose(predicted_logits_object, expected_logits_object, atol=1e-2))
    self.assertTrue(torch.allclose(predicted_logits_location, expected_logits_location, atol=1e-2))

def test_loss_pretraining(self):
    model = Wav2Vec2ForPreTraining.from_pretrained(
        "facebook/wav2vec2-base",
        attention_dropout=0.0,
        feat_proj_dropout=0.0,
        hidden_dropout=0.0,
        layerdrop=0.0,
    )
    model.to(torch_device).train()

    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "facebook/wav2vec2-base", return_attention_mask=True
    )
    input_speech = self._load_datasamples(2)

    inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)

    features_shape = (
        inputs_dict["input_values"].shape[0],
        model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1]),
    )

    torch.manual_seed(0)
    mask_time_indices = _compute_mask_indices(
        features_shape,
        model.config.mask_time_prob,
        model.config.mask_time_length,
        device=inputs_dict["input_values"].device,
        min_masks=2,
    ).to(torch_device)

    with torch.no_grad():
        outputs = model(
            inputs_dict.input_values.to(torch_device),
            attention_mask=inputs_dict.attention_mask.to(torch_device),
            mask_time_indices=mask_time_indices,
        )

    # check diversity loss
    num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups
    diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors
    self.assertTrue(abs(diversity_loss.item() - 0.8859) < 1e-3)

    # check overall loss (contrastive loss + diversity loss)
    expected_loss = 62.5170
    self.assertTrue(abs(outputs.loss.item() - expected_loss) < 1e-3)

def __init__(
    self,
    source,
    save_path,
    output_norm=True,
    freeze=True,
    freeze_feature_extractor=False,
    apply_spec_augment=False,
):
    super().__init__()

    # Download the extractor from HuggingFace.
    # The extractor is only used to retrieve the normalisation information.
    self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(source, cache_dir=save_path)

    # Select the specific self-supervised loader (e.g. Wav2Vec2, Hubert)
    if "hubert" in source:
        config = HF_config.get("hubert")
        model = HF_models.get("hubert")
    else:
        config = HF_config.get("wav2vec2")
        model = HF_models.get("wav2vec2")

    # Download and load the model
    self._from_pretrained(source, config=config, model=model, save_path=save_path)

    # set apply_spec_augment
    self.model.config.apply_spec_augment = apply_spec_augment

    # We check if inputs need to be normalized w.r.t. the pretrained wav2vec2
    self.normalize_wav = self.feature_extractor.do_normalize

    self.freeze = freeze
    self.freeze_feature_extractor = freeze_feature_extractor
    self.output_norm = output_norm
    if self.freeze:
        logger.warning(
            "speechbrain.lobes.models.huggingface_wav2vec - wav2vec 2.0 is frozen."
        )
        self.model.eval()
        for param in self.model.parameters():
            param.requires_grad = False
    else:
        self.model.train()
    if self.freeze_feature_extractor:
        self.model.feature_extractor._freeze_parameters()

def processor_init():
    tokenizer = Wav2Vec2CTCTokenizer(
        "./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
    )
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    processor.save_pretrained('asr_output/new_processor/')
    return processor

def __init__(
    self,
    source,
    save_path,
    output_norm=True,
    freeze=True,
    freeze_feature_extractor=False,
    pretrain=True,
    apply_spec_augment=False,
):
    super().__init__()

    # Download the extractor from HuggingFace.
    # The extractor is only used to retrieve the normalisation.
    self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(source, cache_dir=save_path)

    # Select the specific self-supervised loader (e.g. Wav2Vec2, Hubert)
    if "hubert" in source:
        config = HF_config.get("hubert")
        model = HF_models.get("hubert")
    else:
        config = HF_config.get("wav2vec2")
        model = HF_models.get("wav2vec2")

    # Download the model from HuggingFace.
    # If pretrain is False, we do not download the pretrained weights;
    # if it is True, we download and load them.
    if not pretrain:
        config = config.from_pretrained(source, cache_dir=save_path)
        self.model = model(config)
    else:
        self.model = model.from_pretrained(source, cache_dir=save_path)

    # set apply_spec_augment
    self.model.config.apply_spec_augment = apply_spec_augment

    # We check if inputs need to be normalized w.r.t. the pretrained wav2vec2
    self.normalize_wav = self.feature_extractor.do_normalize

    self.freeze = freeze
    self.freeze_feature_extractor = freeze_feature_extractor
    self.output_norm = output_norm
    if self.freeze:
        self.model.eval()
    else:
        self.model.train()
    if self.freeze_feature_extractor:
        self.model.feature_extractor._freeze_parameters()

def __init__(self,
             model_path: str = "facebook/wav2vec2-large-xlsr-53",
             device: str = "cpu",
             target_sample_rate: int = 16000) -> None:
    super().__init__()
    self.target_sample_rate = target_sample_rate
    self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        model_path, cache_dir=".ckpt")  # , map_location="cpu"
    if device != "cpu" and not torch.cuda.is_available():
        logging.warning("GPU is not available, falling back to CPU inference")
        device = "cpu"
    self.device = torch.device(device)
    self.model = Wav2Vec2Model.from_pretrained(model_path, cache_dir=".ckpt").to(self.device)