Example #1
    def test__tokenize_values(self):
        # GIVEN
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        values = ['English', 'French']

        # WHEN
        all_value_tokens, value_token_lengths, segment_ids = _tokenize_values(
            values, tokenizer)

        # THEN
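        # Each value is lowercased by the uncased tokenizer and followed
        # by [SEP]; the per-value length of 2 includes that separator,
        # and segment ids mark value tokens (1) versus separators (0).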
        self.assertEqual(['english', '[SEP]', 'french', '[SEP]'],
                         all_value_tokens)
        self.assertEqual([2, 2], value_token_lengths)
        self.assertEqual([1, 0, 1, 0], segment_ids)
Example #2
    def test__tokenize_values__float(self):
        # GIVEN
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        values = ['4.5']

        # WHEN
        all_value_tokens, value_token_lengths, segment_ids = _tokenize_values(
            values, tokenizer)

        # THEN
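        # BERT's basic tokenizer splits on punctuation, so '4.5' yields
        # three tokens before the trailing [SEP].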
        self.assertEqual(['4', '.', '5', '[SEP]'], all_value_tokens)

        ids = tokenizer.convert_tokens_to_ids(all_value_tokens)

        # make sure the token is not an unknown token
        self.assertNotEqual(tokenizer.unk_token_id, ids[0])
Example #3
    def test__tokenize_values__subword_tokenization(self):
        # GIVEN
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        values = ['Protoporphyrinogen IX', 'dummy']

        # WHEN
        all_value_tokens, value_token_lengths, segment_ids = _tokenize_values(
            values, tokenizer)

        # THEN
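        # WordPiece splits the out-of-vocabulary word into '##'-prefixed
        # subword tokens, all of which count towards the value's length.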
        self.assertEqual([
            'proto', '##por', '##phy', '##rino', '##gen', 'ix', '[SEP]',
            'dummy', '[SEP]'
        ], all_value_tokens)
        self.assertEqual([7, 2], value_token_lengths)
        self.assertEqual([1, 1, 1, 1, 1, 1, 0, 1, 0], segment_ids)
Example #4
    def test__tokenize_values__empty_string(self):
        # GIVEN
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        values = ['']

        # WHEN
        all_value_tokens, value_token_lengths, segment_ids = _tokenize_values(
            values, tokenizer)

        # THEN
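        # An empty string is mapped to the placeholder token 'empty'
        # instead of producing no tokens.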
        self.assertEqual(['empty', '[SEP]'], all_value_tokens)

        ids = tokenizer.convert_tokens_to_ids(all_value_tokens)

        # make sure no token is unknown
        self.assertNotIn(tokenizer.unk_token_id, ids)