Example #1
    def test_recreate_finetune(self, config, factory_func):
        """Imported models can be recreated via a factory function without Hugging Face transformers."""
        imported = import_huggingface_model(self._get_model(config)).eval()
        reloaded = factory_func(aux_num_out=imported.aux.out_features)
        reloaded.load_state_dict(imported.state_dict())
        reloaded.eval()
        self._test_recreate(imported, reloaded, config)
    def test_recreate(self, config, factory_func):
        """Imported models can be recreated via a factory function without Hugging Face transformers."""
        imported = import_huggingface_model(self._get_model(config)).eval()

        reloaded = factory_func(num_out=imported.encoder.readout.out_features)
        reloaded.load_state_dict(imported.state_dict())
        reloaded.eval()

        torch.manual_seed(0)
        # FeatureExtractor
        x = torch.randn(3, 1024)
        ref, _ = imported.feature_extractor(x, None)
        hyp, _ = reloaded.feature_extractor(x, None)
        self.assertEqual(ref, hyp)
        # Feature projection
        x = torch.randn(3, 10, config['conv_dim'][-1])
        ref = imported.encoder.feature_projection(x)
        hyp = reloaded.encoder.feature_projection(x)
        self.assertEqual(ref, hyp)
        # Convolutional Positional Encoder
        x = torch.randn(3, 256, config['hidden_size'])
        ref = imported.encoder.transformer.pos_conv_embed(x)
        hyp = reloaded.encoder.transformer.pos_conv_embed(x)
        self.assertEqual(ref, hyp)
        # Encoder Transformer Layer
        for imported_, reloaded_ in zip(imported.encoder.transformer.layers,
                                        reloaded.encoder.transformer.layers):
            b, l, e = 16, 3, config["hidden_size"]
            x = torch.randn(b, l, e)
            mask = torch.randn(b, 1, l, l)

            ref = imported_(x, mask)
            hyp = reloaded_(x, mask)
            self.assertEqual(ref, hyp)
        # The whole Encoder Transformer
        # TODO: Add mask pattern. Expected mask shapes and values are different.
        b, l, e = 16, 3, config["hidden_size"]
        x = torch.randn(b, l, e)
        mask = torch.randn(b, 1, l, l)  # unused here; see TODO above
        ref = imported.encoder.transformer(x)
        hyp = reloaded.encoder.transformer(x)
        self.assertEqual(ref, hyp)
        # Readout
        x = torch.randn(3, 10, config["hidden_size"])
        ref = imported.encoder.readout(x)
        hyp = reloaded.encoder.readout(x)
        self.assertEqual(ref, hyp)
        # The whole model
        x = torch.randn(3, 1024)
        ref, _ = imported(x)
        hyp, _ = reloaded(x)
        self.assertEqual(ref, hyp)
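Outside the test suite, the same recreate pattern lets you ship a checkpoint that loads with torchaudio alone, with no transformers dependency. Below is a minimal sketch assuming torchaudio's wav2vec2_base factory and a placeholder checkpoint path; the one-time export step is shown commented out.

import torch
import torchaudio

# One-time export (requires transformers); shown for context only:
# from transformers import Wav2Vec2ForCTC
# from torchaudio.models.wav2vec2.utils import import_huggingface_model
# imported = import_huggingface_model(
#     Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h"))
# torch.save(imported.state_dict(), "wav2vec2.pt")

# Later, recreate the model without the transformers package installed.
# aux_num_out=32 assumes a 32-token CTC vocabulary, as in wav2vec2-base-960h.
model = torchaudio.models.wav2vec2_base(aux_num_out=32)
model.load_state_dict(torch.load("wav2vec2.pt"))
model.eval()

with torch.inference_mode():
    waveform = torch.randn(1, 16000)  # one second of dummy 16 kHz audio
    logits, lengths = model(waveform)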
import pandas as pd
import torch
from transformers import Wav2Vec2ForCTC
from torchaudio.models.wav2vec2.utils import import_huggingface_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def main(text, wav_list, checkpoint, ctm):
    # Load the fine-tuned CTC checkpoint and convert it to a torchaudio model.
    original = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    model = import_huggingface_model(original).to(device)

    # Map utterance IDs to audio paths ("<id> <path>" per line).
    wav_dict = pd.read_csv(wav_list, names=["id", "path"],
                           sep=" ").set_index('id')['path'].to_dict()

    with open(text, 'r', encoding='utf-8') as text_file, \
            open(ctm, 'w', encoding='utf-8') as ctm_file:
        for utterance_line in text_file:
            utterance_ID, utterance = utterance_line.rstrip().split(" ",
                                                                    maxsplit=1)
            if utterance_ID in wav_dict:
                # align_utterance is a helper defined elsewhere in this script.
                utterance_ctm_lines = align_utterance(utterance, utterance_ID,
                                                      wav_dict[utterance_ID],
                                                      model)
                # Trailing newline keeps consecutive utterances from
                # running together in the CTM file.
                ctm_file.write("\n".join(utterance_ctm_lines) + "\n")
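A hypothetical command-line wrapper for main(); the flag names are illustrative, not from the original script.

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Force-align utterances and write a CTM file.")
    parser.add_argument("--text", required=True,
                        help="Kaldi-style text file: '<utterance_id> <transcript>' per line")
    parser.add_argument("--wav-list", required=True,
                        help="space-separated '<utterance_id> <wav_path>' list")
    parser.add_argument("--checkpoint", required=True,
                        help="Hugging Face Wav2Vec2ForCTC checkpoint name or path")
    parser.add_argument("--ctm", required=True, help="output CTM path")
    args = parser.parse_args()

    main(args.text, args.wav_list, args.checkpoint, args.ctm)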
Example #4
    def test_import_finetune(self, config, _):
        """wav2vec2 models from HF transformers can be imported and yield the same results."""
        original = self._get_model(config).eval()
        imported = import_huggingface_model(original).eval()
        self._test_import_pretrain(original.wav2vec2, imported, config)
        self._test_import_finetune(original, imported, config)
    def test_import(self, config):
        """wav2vec2 models from HF transformers can be imported and yields the same results"""
        original = self._get_model(config).eval()
        imported = import_huggingface_model(original).eval()

        torch.manual_seed(0)
        # FeatureExtractor
        x = torch.randn(3, 1024)
        ref = original.wav2vec2.feature_extractor(x).transpose(1, 2)
        hyp, _ = imported.feature_extractor(x, None)
        self.assertEqual(ref, hyp)
        # Feature projection
        x = torch.randn(3, 10, config['conv_dim'][-1])
        ref = original.wav2vec2.feature_projection(x)[0]
        hyp = imported.encoder.feature_projection(x)
        self.assertEqual(ref, hyp)
        # Convolutional Positional Encoder
        x = torch.randn(3, 256, config['hidden_size'])
        ref = original.wav2vec2.encoder.pos_conv_embed(x)
        hyp = imported.encoder.transformer.pos_conv_embed(x)
        self.assertEqual(ref, hyp)
        # Encoder Transformer Layer
        for original_, imported_ in zip(original.wav2vec2.encoder.layers,
                                        imported.encoder.transformer.layers):
            b, l, e = 16, 3, config["hidden_size"]
            x = torch.randn(b, l, e)
            mask = torch.randn(b, 1, l, l)

            ref, = original_(x, attention_mask=mask, output_attentions=False)
            hyp = imported_(x, mask)
            self.assertEqual(ref, hyp)
        # The whole Encoder Transformer
        b, l, e = 16, 3, config["hidden_size"]
        x = torch.randn(b, l, e)
        ref = original.wav2vec2.encoder(x).last_hidden_state
        hyp = imported.encoder.transformer(x)
        self.assertEqual(ref, hyp)
        # Readout
        x = torch.randn(3, 10, config["hidden_size"])
        ref = original.lm_head(x)
        hyp = imported.encoder.readout(x)
        self.assertEqual(ref, hyp)
        # The whole model without mask
        batch_size, num_frames = 3, 1024
        x = torch.randn(batch_size, num_frames)
        ref = original(x).logits
        hyp, _ = imported(x)
        self.assertEqual(ref, hyp)

        # The whole model with mask
        batch_size, num_frames = 3, 1024
        x = torch.randn(batch_size, num_frames)
        lengths = torch.randint(low=0, high=num_frames, size=[
            batch_size,
        ])
        mask = torch.arange(num_frames).expand(batch_size,
                                               num_frames) < lengths[:, None]

        ref = original(x, attention_mask=mask).logits
        hyp, output_lengths = imported(x, lengths)

        for i, l in enumerate(output_lengths):
            self.assertEqual(ref[i, :l, ...], hyp[i, :l, ...])
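The final block above highlights the differing mask conventions: HF transformers takes a boolean attention_mask, while the imported torchaudio model takes per-example lengths. Below is a small sketch of converting between the two, using illustrative tensors.

import torch

batch_size, num_frames = 3, 1024
lengths = torch.tensor([1024, 700, 512])  # illustrative valid lengths

# lengths -> boolean mask (True on valid frames), as built in the test.
attention_mask = (torch.arange(num_frames).expand(batch_size, num_frames)
                  < lengths[:, None])

# Round trip: mask -> lengths.
recovered = attention_mask.sum(dim=1)
assert torch.equal(recovered, lengths)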