def test_bert_tensorizer(self):
    """Numberize wraps the tokens in sentinel ids (201 leading, 202 trailing).

    Interior ids are expected to be the mock token's numeric text minus 100 —
    presumably the offset applied by the mock vocab; verify against
    ``_mock_vocab`` if the fixture changes.
    """
    script_tokenizer, expected_tokens = self._mock_tokenizer()
    script_vocab = self._mock_vocab()
    tensorizer = ScriptBERTTensorizer(script_tokenizer, script_vocab, max_seq_len=100)

    token_ids, _, _, _ = tensorizer.numberize(["mock test"], None)

    # Sentinel ids bookend the sequence.
    self.assertEqual(201, token_ids[0])
    self.assertEqual(202, token_ids[-1])

    # Everything between the sentinels maps 1:1 onto the mocked tokens.
    interior_ids = token_ids[1:-1]
    for produced_id, mock_token in zip(interior_ids, expected_tokens):
        self.assertEqual(produced_id, int(mock_token[0]) - 100)
def torchscriptify(self):
    """Return a TorchScript-compatible ``ScriptBERTTensorizer`` mirroring this one.

    The tokenizer is scripted via its own ``torchscriptify`` hook, and the
    vocabulary is re-wrapped as a ``ScriptVocabulary`` carrying the same
    pad/bos/eos indices.
    """
    script_vocab = ScriptVocabulary(
        list(self.vocab),
        pad_idx=self.vocab.get_pad_index(),
        bos_idx=self.vocab.get_bos_index(),
        eos_idx=self.vocab.get_eos_index(),
    )
    return ScriptBERTTensorizer(
        tokenizer=self.tokenizer.torchscriptify(),
        vocab=script_vocab,
        max_seq_len=self.max_seq_len,
    )
def test_bert_tensorizer(self):
    """With BOS disabled, only the trailing sentinel id (202) is appended.

    Every id before the sentinel is expected to be the mock token's numeric
    text minus 100 — presumably the offset applied by the mock vocab; verify
    against ``_mock_vocab`` if the fixture changes.
    """
    script_tokenizer, expected_tokens = self._mock_tokenizer()
    vocabulary = self._mock_vocab()
    tensorizer = ScriptBERTTensorizer(
        script_tokenizer,
        vocabulary,
        max_seq_len=100,
        add_bos_token=False,
        use_eos_token_for_bos=False,
    )

    token_ids, _, _ = tensorizer.numberize("mock test")

    # Only the EOS sentinel is present; no BOS at the front.
    self.assertEqual(202, token_ids[-1])
    for produced_id, mock_token in zip(token_ids[:-1], expected_tokens):
        self.assertEqual(produced_id, int(mock_token[0]) - 100)