Example #1
    def test_gpt2_bpe_tokenizer(self):
        text = "Prototype"
        expected = [Token("19703", 0, 4), Token("8690", 4, 9)]
        tokenizer = GPT2BPETokenizer.from_config(
            GPT2BPETokenizer.Config(
                bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
                bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
            ))
        tokens = tokenizer.tokenize(text)
        print(tokens)
        self.assertEqual(tokens, expected)
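
Each Token here pairs the BPE id (stored as a string) with character offsets into the input: "Prototype" splits into "Prot" (id 19703, characters 0-4) and "otype" (id 8690, characters 4-9).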
Example #2
    def test_squad_roberta_tensorizer(self):
        row = {
            "id": 0,
            "doc": "Prototype",
            "question": "otype",
            "answers": ["Prot"],
            "answer_starts": [0],
            "has_answer": True,
        }
        tensorizer = SquadForRoBERTaTensorizer.from_config(
            SquadForRoBERTaTensorizer.Config(
                tokenizer=GPT2BPETokenizer.Config(
                    bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
                    bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
                ),
                vocab_file="pytext/data/test/data/gpt2_dict.txt",
                max_seq_len=250,
            )
        )
        tokens, segments, seq_len, positions, start, end = tensorizer.numberize(row)
        # check against manually verified answer positions in tokenized output:
        # the single answer "Prot" starts and ends at token index 3
        self.assertEqual(start, [3])
        self.assertEqual(end, [3])
        self.assertEqual(len(tokens), seq_len)
        self.assertEqual(len(segments), seq_len)
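
Here numberize returns the numberized token ids, the segment labels, the sequence length, the positions, and per-answer lists of start/end token indices. A minimal follow-up sketch (not part of the original test), assuming tokens is the flat list of BPE token ids as returned above, of recovering the answer span from those indices:

    # Hypothetical check: slice out the ids that cover the answer "Prot".
    # With one answer and start == end == [3], this is tokens[3:4].
    answer_ids = tokens[start[0] : end[0] + 1]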
Example #3
    def test_roberta_tensorizer(self):
        text = "Prototype"
        tokens = [[0, 4, 5, 2]]
        pad_masks = [[1, 1, 1, 1]]
        segment_labels = [[0, 0, 0, 0]]
        positions = [[0, 1, 2, 3]]
        expected = [tokens, pad_masks, segment_labels, positions]

        tensorizer = RoBERTaTensorizer.from_config(
            RoBERTaTensorizer.Config(
                tokenizer=GPT2BPETokenizer.Config(
                    bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
                    bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
                ),
                vocab_file="pytext/data/test/data/gpt2_dict.txt",
                max_seq_len=256,
            ))
        tensors = tensorizer.tensorize([tensorizer.numberize({"text": text})])
        for tensor, expect in zip(tensors, expected):
            self.assertEqual(tensor.tolist(), expect)

        tensorizer_impl = RoBERTaTensorizerScriptImpl(
            tokenizer=DoNothingTokenizer(),
            vocab=tensorizer.vocab,
            max_seq_len=tensorizer.max_seq_len,
        ).torchscriptify()
        per_sentence_tokens = [tensorizer.tokenizer.tokenize(text)]
        tokens_2d, segment_labels_2d, seq_lens_1d, positions_2d = zip(
            *[tensorizer_impl.numberize(per_sentence_tokens)])
        script_tensors = tensorizer_impl.tensorize(tokens_2d,
                                                   segment_labels_2d,
                                                   seq_lens_1d, positions_2d)
        for tensor, expect in zip(script_tensors, expected):
            self.assertEqual(tensor.tolist(), expect)
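
This test exercises both execution paths and asserts that they agree: the eager RoBERTaTensorizer (numberize then tensorize) and the TorchScript-compiled RoBERTaTensorizerScriptImpl. The scripted implementation is constructed with DoNothingTokenizer because the text is pre-tokenized with the eager tokenizer before being handed to its numberize.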
Example #4
    def test_gpt2_bpe_tokenizer(self):
        tokenizer = GPT2BPETokenizer.from_config(
            GPT2BPETokenizer.Config(
                bpe_vocab_path="pytext/data/test/data/gpt2_vocab.bpe",
                bpe_encoder_path="pytext/data/test/data/gpt2_encoder.json",
            ))
        text_list = ["Prototype", " Prototype"]
        expected_list = [
            [Token("19703", 0, 4), Token("8690", 4, 9)],
            [Token("220", 0, 0),
             Token("19703", 1, 5),
             Token("8690", 5, 10)],
        ]

        for (text, expected) in zip(text_list, expected_list):
            tokens = tokenizer.tokenize(text)
            self.assertEqual(tokens, expected)
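
The second input demonstrates leading-space handling: the space is emitted as its own Token (id 220, the GPT-2 BPE encoding of a single space) with the zero-width span (0, 0), shifting the character offsets of the remaining tokens right by one. A small sanity-check sketch (hypothetical, assuming Token exposes start and end fields as in the tuples above):

    for text, expected in zip(text_list, expected_list):
        for tok in expected:
            # every (start, end) pair must index back into the input string
            assert 0 <= tok.start <= tok.end <= len(text)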
Example #5
    class Config(BERTTensorizerBase.Config):
        # any unittest should be overriding this with a small local file
        vocab_file: str = resources.roberta.GPT2_BPE_DICT
        tokenizer: Tokenizer.Config = GPT2BPETokenizer.Config()
        max_seq_len: int = 256
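
The comment spells out the convention used in the tests above: the default vocab_file points at a packaged resource (resources.roberta.GPT2_BPE_DICT), and unit tests override it with a small local fixture. A minimal sketch of such an override, assuming the enclosing tensorizer is RoBERTaTensorizer as in Example #3:

    config = RoBERTaTensorizer.Config(
        vocab_file="pytext/data/test/data/gpt2_dict.txt",  # small local file
        max_seq_len=256,
    )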
Example #6
    class Config(BERTTensorizerBase.Config):
        vocab_file: str = (
            "manifold://pytext_training/tree/static/vocabs/bpe/gpt2/dict.txt")
        tokenizer: Tokenizer.Config = GPT2BPETokenizer.Config()
        max_seq_len: int = 256
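
This is the same Config shape as Example #5, but the default vocab_file is a manifold:// URI (Facebook-internal storage); outside that environment it must be overridden, e.g. with the local gpt2_dict.txt fixture used in the tests above.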
Example #7
    class Config(Tensorizer.Config):
        columns: List[str] = ["text"]
        tokenizer: GPT2BPETokenizer.Config = GPT2BPETokenizer.Config()
        max_seq_len: int = 256
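
Unlike Examples #5 and #6, which accept any Tokenizer.Config, this Config pins the tokenizer field to GPT2BPETokenizer.Config, and it reads from a configurable list of columns (defaulting to ["text"]) rather than a fixed schema. A minimal instantiation sketch, where MyTensorizer is a placeholder name for the enclosing class, which the snippet above does not show:

    # MyTensorizer stands in for the (unnamed) enclosing tensorizer class.
    config = MyTensorizer.Config(columns=["question", "doc"], max_seq_len=128)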