示例#1
0
 def test_bert_pair_tensorizer(self):
     sentences = ["Focus", "Driving School"]
     expected_tokens = [101, 175, 287, 766, 462, 102, 100, 379, 102]
     expected_segment_labels = [0, 0, 0, 0, 0, 0, 1, 1, 1]
     row = {"text1": sentences[0], "text2": sentences[1]}
     tensorizer = BERTTensorizer.from_config(
         BERTTensorizer.Config(
             columns=["text1", "text2"],
             tokenizer=WordPieceTokenizer.Config(
                 wordpiece_vocab_path=
                 "pytext/data/test/data/wordpiece_1k.txt"),
         ))
     tokens, segment_labels, seq_len = tensorizer.numberize(row)
     self.assertEqual(tokens, expected_tokens)
     self.assertEqual(segment_labels, expected_segment_labels)
     self.assertEqual(seq_len, len(expected_tokens))
    def test_bert_tensorizer(self):
        sentence = "<SOS>  Focus Driving School Mulungushi bus station along Kasuba road, wamkopeka building.  Ndola,  Zambia."
        # expected result was obtained offline by running BertModelDataHandler
        expected = [
            101,
            133,
            278,
            217,
            135,
            175,
            287,
            766,
            462,
            100,
            379,
            182,
            459,
            334,
            459,
            280,
            504,
            462,
            425,
            283,
            171,
            462,
            567,
            474,
            180,
            262,
            217,
            459,
            931,
            262,
            913,
            117,
            192,
            262,
            407,
            478,
            287,
            744,
            263,
            478,
            262,
            560,
            119,
            183,
            282,
            287,
            843,
            117,
            195,
            262,
            407,
            931,
            566,
            119,
            102,
        ]
        row = {"text": sentence}
        tensorizer = BERTTensorizer.from_config(
            BERTTensorizer.Config(
                tokenizer=WordPieceTokenizer.Config(
                    wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
                )
            )
        )
        tokens, segment_label, seq_len = tensorizer.numberize(row)
        self.assertEqual(tokens, expected)
        self.assertEqual(seq_len, len(expected))
        self.assertEqual(segment_label, [0] * len(expected))

        tokens, pad_mask, segment_labels = tensorizer.tensorize(
            [(tokens, segment_label, seq_len)]
        )
        self.assertEqual(pad_mask[0].tolist(), [1] * len(expected))