Example #1
 def test_encodes_lm_like(self):
     """Tests that for GPT-like language models the embedding from first token is used for sequence embedding"""
     from transformers import AutoModelForPreTraining
     for model in ["gpt2", "openai-gpt"]:
         with patch.object(AutoModelForPreTraining,
                           'from_pretrained',
                           return_value=MockPtModel(model)):
             encoder = TransformerTorchEncoder(
                 pretrained_model_name_or_path=model,
                 pooling_strategy='auto',
                 metas={})
             encoded_batch = encoder.encode(self.texts)
             self.assertEqual(encoded_batch.shape, (2, 768))
Example #2
 def test_encodes_bert_like(self):
     """Tests that for BERT-like models the embedding from first token is used for sequence embedding"""
     from transformers import AutoModelForPreTraining
     for model in [
             "bert-base-uncased", "google/electra-base-discriminator",
             "roberta-base"
     ]:
         with patch.object(AutoModelForPreTraining,
                           'from_pretrained',
                           return_value=MockPtModel(model)):
             encoder = TransformerTorchEncoder(
                 pretrained_model_name_or_path=model,
                 pooling_strategy='auto',
                 metas={})
             encoded_batch = encoder.encode(np.asarray(self.texts))
             self.assertEqual(encoded_batch.shape, (2, 768))
Example #3
 def test_encodes_lm_like(self):
     """Tests that for GPT-like language models the embedding from first token is used for sequence embedding"""
     from transformers import AutoModelForPreTraining
     for model in ["gpt2", "openai-gpt"]:
         with patch.object(AutoModelForPreTraining,
                           'from_pretrained',
                           return_value=MockPtModel(model)):
             encoder = TransformerTorchEncoder(
                 pretrained_model_name_or_path=model,
                 pooling_strategy='auto',
                 metas={})
             tokenized_seq_lengths = [
                 len(encoder.tokenizer.tokenize(t)) for t in self.texts
             ]
             encoded_batch = encoder.encode(self.texts)
             assert np.array_equal(encoded_batch.squeeze(),
                                   np.asarray(tokenized_seq_lengths) - 1)
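Note on Example #3: the assertion only holds if the mocked model emits hidden states that encode token positions, so that 'auto' pooling on a GPT-like model (which keeps the last token) yields len(tokens) - 1 for each text. The MockPtModel helper itself is not shown in these snippets; the class below is a hypothetical, simplified stand-in (name and signature invented here) that behaves the way the assertion implies.

import torch

# Hypothetical stand-in for the (unshown) MockPtModel helper: its hidden state
# at token position i is simply the value i, so whichever position the encoder
# pools becomes directly visible in the output.
class FakePositionModel(torch.nn.Module):
    def __init__(self, hidden_size: int = 1):
        super().__init__()
        self.hidden_size = hidden_size

    def forward(self, input_ids, **kwargs):
        batch, seq_len = input_ids.shape
        positions = torch.arange(seq_len, dtype=torch.float32)
        # (batch, seq_len, hidden_size); the value at position i equals i
        hidden = positions.view(1, seq_len, 1).expand(batch, seq_len, self.hidden_size)
        return (hidden,)

# Last-token pooling over such hidden states returns seq_len - 1 for every row:
out, = FakePositionModel()(torch.zeros(2, 5, dtype=torch.long))
print(out[:, -1, 0])  # tensor([4., 4.])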
Example #4
 def _get_encoder(self):
     encoder = TransformerTorchEncoder(model_name='bert-base-uncased',
                                       pooling_strategy='cls')
     return encoder
Example #5
 def _get_encoder(self, metas):
     return TransformerTorchEncoder(model_name='bert-base-uncased',
                                    pooling_strategy='cls',
                                    metas=metas)
Example #6
 def _get_encoder(self, metas):
     return TransformerTorchEncoder(
         pretrained_model_name_or_path='bert-base-uncased', metas=metas)
Example #7
 def _get_encoder(self, metas):
     return TransformerTorchEncoder(
         pooling_strategy='min',
         pretrained_model_name_or_path='xlnet-base-cased',
         metas=metas)
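Taken together, the examples show the encoder being constructed with a pretrained_model_name_or_path (or, in the older snippets, model_name), a pooling_strategy, and a metas dict, and then called via encode() on an array of texts. Below is a minimal usage sketch along those lines; it assumes TransformerTorchEncoder is importable from the package under test (its module path is not shown above) and that the real model weights are downloaded on first use.

import numpy as np
# from <package under test> import TransformerTorchEncoder  # import path not shown in the snippets

encoder = TransformerTorchEncoder(
    pretrained_model_name_or_path='bert-base-uncased',
    pooling_strategy='cls',
    metas={})

texts = np.asarray(['hello world', 'goodbye world'])
embeddings = encoder.encode(texts)
print(embeddings.shape)  # expected (2, 768) for bert-base-uncased, as in Example #2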