Example no. 1
    def test_inference_pretraining(self):
        model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        model.to(torch_device)
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
        input_speech = self._load_datasamples(2)

        inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)

        with torch.no_grad():
            torch.manual_seed(0)
            outputs = model(
                inputs_dict.input_values.to(torch_device),
                attention_mask=inputs_dict.attention_mask.to(torch_device),
            )

        # compute cosine similarity
        cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)

        # pretrained model should have learned a high cosine similarity
        self.assertTrue(cosine_sim.mean() > 0.5)

        # fmt: off
        expected_cosine_sim_slice = torch.tensor(
            [[0.8290, 0.8335, 0.8815, 0.8580, 0.8249],
             [0.8892, 0.9221, 0.8711, 0.8601, 0.8482]],
            device=torch_device,
        )
        # fmt: on

        self.assertTrue(torch.allclose(cosine_sim[:, :5], expected_cosine_sim_slice, atol=1e-3))
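The test relies on a _load_datasamples helper that is not part of the snippet. A minimal sketch, assuming the dummy LibriSpeech split (hf-internal-testing/librispeech_asr_dummy) that the transformers speech tests commonly use:

    # sketch of the assumed helper; requires: from datasets import load_dataset
    def _load_datasamples(self, num_samples):
        # dummy LibriSpeech validation split used across the transformers speech tests
        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # sort by id so the returned samples are deterministic
        speech_samples = ds.sort("id")[:num_samples]["audio"]
        return [x["array"] for x in speech_samples]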
Example no. 2
def convert_unispeech_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = UniSpeechConfig.from_pretrained(config_path)
    else:
        config = UniSpeechConfig()

    hf_unispeech = UniSpeechForPreTraining(config)

    model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
    model = model[0].eval()

    recursively_load_weights(model, hf_unispeech)

    hf_unispeech.save_pretrained(pytorch_dump_folder_path)
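A hypothetical call, with placeholder paths, converting a pre-training checkpoint:

# hypothetical invocation; both paths are placeholders
convert_unispeech_checkpoint(
    checkpoint_path="/path/to/fairseq/unispeech.pt",
    pytorch_dump_folder_path="./unispeech-hf",
)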
Example no. 3
def convert_unispeech_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = UniSpeechConfig.from_pretrained(config_path)
    else:
        config = UniSpeechConfig()

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load_from_json(dict_path)

            # important: swap the bos & pad token ids, since the CTC blank
            # symbol is <pad> and not <s> as in fairseq
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error(f"--pytorch_dump_folder_path ({pytorch_dump_folder_path}) should be a directory")
                return
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            vocab_dict = target_dict.indices

            # fairseq has the <pad> and <s> switched
            vocab_dict["<pad>"] = 42
            vocab_dict["<s>"] = 43
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(vocab_dict, vocab_handle)
            tokenizer = Wav2Vec2PhonemeCTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )
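            # models with layer-norm feature extraction (e.g. the large
            # checkpoints) were trained with attention masks; group-norm
            # models were not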
            return_attention_mask = config.feat_extract_norm == "layer"
            feature_extractor = Wav2Vec2FeatureExtractor(
                feature_size=1,
                sampling_rate=16000,
                padding_value=0,
                do_normalize=True,
                return_attention_mask=return_attention_mask,
            )
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_unispeech = UniSpeechForCTC(config)
    else:
        hf_unispeech = UniSpeechForPreTraining(config)

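    # for a fine-tuned checkpoint fairseq must rebuild the fine-tuning task,
    # which needs the directory holding the target dictionary ("data") and the
    # path to the pretrained encoder ("w2v_path")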
    if is_finetuned:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1]), "w2v_path": checkpoint_path}
        )
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])

    model = model[0].eval()

    recursively_load_weights(model, hf_unispeech, is_finetuned)

    hf_unispeech.save_pretrained(pytorch_dump_folder_path)
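In the upstream transformers conversion scripts a function like this is driven from the command line. A minimal argparse wrapper, sketched here with assumed flag names:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", required=True, help="path to the fairseq checkpoint")
    parser.add_argument("--pytorch_dump_folder_path", required=True, help="output directory for the converted model")
    parser.add_argument("--config_path", default=None, help="optional path to a hf config.json")
    parser.add_argument("--dict_path", default=None, help="fairseq dictionary of the fine-tuned model")
    parser.add_argument("--not_finetuned", action="store_true", help="convert a pre-training checkpoint instead")
    args = parser.parse_args()
    convert_unispeech_checkpoint(
        args.checkpoint_path,
        args.pytorch_dump_folder_path,
        args.config_path,
        args.dict_path,
        is_finetuned=not args.not_finetuned,
    )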