def test_passes_through_correctly(self):
    tokenizer = WordTokenizer(start_tokens=['@@', '%%'], end_tokens=['^^'])
    sentence = "this (sentence) has 'crazy' \"punctuation\"."
    tokens = [t.text for t in tokenizer.tokenize(sentence)]
    expected_tokens = ["@@", "%%", "this", "(", "sentence", ")", "has", "'", "crazy", "'", "\"",
                       "punctuation", "\"", ".", "^^"]
    assert tokens == expected_tokens
def __init__(self,
             lazy: bool = False,
             tables_directory: str = None,
             dpd_output_directory: str = None,
             max_dpd_logical_forms: int = 10,
             sort_dpd_logical_forms: bool = True,
             max_dpd_tries: int = 20,
             keep_if_no_dpd: bool = False,
             tokenizer: Tokenizer = None,
             question_token_indexers: Dict[str, TokenIndexer] = None,
             table_token_indexers: Dict[str, TokenIndexer] = None,
             use_table_for_vocab: bool = False,
             linking_feature_extractors: List[str] = None,
             include_table_metadata: bool = False,
             max_table_tokens: int = None,
             output_agendas: bool = False) -> None:
    super().__init__(lazy=lazy)
    self._tables_directory = tables_directory
    self._dpd_output_directory = dpd_output_directory
    self._max_dpd_logical_forms = max_dpd_logical_forms
    self._sort_dpd_logical_forms = sort_dpd_logical_forms
    self._max_dpd_tries = max_dpd_tries
    self._keep_if_no_dpd = keep_if_no_dpd
    self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter(pos_tags=True))
    self._question_token_indexers = question_token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._table_token_indexers = table_token_indexers or self._question_token_indexers
    self._use_table_for_vocab = use_table_for_vocab
    self._linking_feature_extractors = linking_feature_extractors
    self._include_table_metadata = include_table_metadata
    self._basic_types = set(str(type_) for type_ in wt_types.BASIC_TYPES)
    self._max_table_tokens = max_table_tokens
    self._output_agendas = output_agendas
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             tokenizer: Tokenizer = None) -> None:
    super().__init__(lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._tokenizer = tokenizer or WordTokenizer(SpacyWordSplitter(pos_tags=True))
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(JustSpacesWordSplitter())
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
def test_char_span_to_token_span_handles_easy_cases(self):
    # These are _inclusive_ spans, on both sides.
    tokenizer = WordTokenizer()
    passage = "On January 7, 2012, Beyoncé gave birth to her first child, a daughter, Blue Ivy " +\
              "Carter, at Lenox Hill Hospital in New York. Five months later, she performed for four " +\
              "nights at Revel Atlantic City's Ovation Hall to celebrate the resort's opening, her " +\
              "first performances since giving birth to Blue Ivy."
    tokens = tokenizer.tokenize(passage)
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
    # "January 7, 2012"
    token_span = util.char_span_to_token_span(offsets, (3, 18))[0]
    assert token_span == (1, 4)
    # "Lenox Hill Hospital"
    token_span = util.char_span_to_token_span(offsets, (91, 110))[0]
    assert token_span == (22, 24)
    # "Lenox Hill Hospital in New York."
    token_span = util.char_span_to_token_span(offsets, (91, 123))[0]
    assert token_span == (22, 28)
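# The helper exercised above maps character offsets (built from token.idx) back onto token
# indices, returning the inclusive token span plus a flag for inexact matches, which is why
# the tests index the result with [0].  The sketch below is an illustrative reimplementation
# under those assumptions, not AllenNLP's actual util.char_span_to_token_span.
from typing import List, Tuple

def char_span_to_token_span_sketch(token_offsets: List[Tuple[int, int]],
                                   character_span: Tuple[int, int]) -> Tuple[Tuple[int, int], bool]:
    error = False
    start_index = 0
    # Skip tokens that end at or before the span's first character.
    while start_index < len(token_offsets) - 1 and token_offsets[start_index][1] <= character_span[0]:
        start_index += 1
    if token_offsets[start_index][0] != character_span[0]:
        error = True
    # Extend the span until the current token reaches the span's last character.
    end_index = start_index
    while end_index < len(token_offsets) - 1 and token_offsets[end_index][1] < character_span[1]:
        end_index += 1
    if token_offsets[end_index][1] != character_span[1]:
        error = True
    return (start_index, end_index), error

# With the offsets computed in the test above, this sketch also yields ((1, 4), False)
# for the "January 7, 2012" span.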
def test_batch_tokenization(self):
    tokenizer = WordTokenizer()
    # Note the comma after the third sentence: without it, implicit string concatenation
    # would silently merge the third and fourth sentences into one.
    sentences = ["This is a sentence",
                 "This isn't a sentence.",
                 "This is the 3rd sentence.",
                 "Here's the 'fourth' sentence."]
    batch_tokenized = tokenizer.batch_tokenize(sentences)
    separately_tokenized = [tokenizer.tokenize(sentence) for sentence in sentences]
    assert len(batch_tokenized) == len(separately_tokenized)
    for batch_sentence, separate_sentence in zip(batch_tokenized, separately_tokenized):
        assert len(batch_sentence) == len(separate_sentence)
        for batch_word, separate_word in zip(batch_sentence, separate_sentence):
            assert batch_word.text == separate_word.text
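# The invariant this test checks is simply that batch tokenization agrees with tokenizing each
# sentence on its own.  A minimal, hypothetical tokenizer satisfying that contract is sketched
# below (whitespace splitting stands in for the real word splitter; this is not AllenNLP's code).
from typing import List

class WhitespaceTokenizerSketch:
    def tokenize(self, text: str) -> List[str]:
        # Stand-in for whatever per-sentence tokenization the real tokenizer performs.
        return text.split()

    def batch_tokenize(self, texts: List[str]) -> List[List[str]]:
        # The simplest batch implementation that preserves the invariant above:
        # delegate to tokenize() for every sentence.
        return [self.tokenize(text) for text in texts]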
def setUp(self):
    self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
    self.utterance = self.tokenizer.tokenize("where is mersin?")
    self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}
    json = {'question': self.utterance,
            'columns': ['Name in English', 'Location in English'],
            'cells': [['Paradeniz', 'Mersin'],
                      ['Lake Gala', 'Edirne']]}
    self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
    self.vocab = Vocabulary()
    self.name_index = self.vocab.add_token_to_namespace("name", namespace='tokens')
    self.in_index = self.vocab.add_token_to_namespace("in", namespace='tokens')
    self.english_index = self.vocab.add_token_to_namespace("english", namespace='tokens')
    self.location_index = self.vocab.add_token_to_namespace("location", namespace='tokens')
    self.paradeniz_index = self.vocab.add_token_to_namespace("paradeniz", namespace='tokens')
    self.mersin_index = self.vocab.add_token_to_namespace("mersin", namespace='tokens')
    self.lake_index = self.vocab.add_token_to_namespace("lake", namespace='tokens')
    self.gala_index = self.vocab.add_token_to_namespace("gala", namespace='tokens')
    self.negative_one_index = self.vocab.add_token_to_namespace("-1", namespace='tokens')
    self.zero_index = self.vocab.add_token_to_namespace("0", namespace='tokens')
    self.one_index = self.vocab.add_token_to_namespace("1", namespace='tokens')
    self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens')
    self.edirne_index = self.oov_index
    self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)
    super(KnowledgeGraphFieldTest, self).setUp()
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             num_context_answers: int = 0) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._num_context_answers = num_context_answers
def __init__(self, utterances: List[str], tokenizer=None) -> None:
    self.utterances: List[str] = utterances
    self.tokenizer = tokenizer if tokenizer else WordTokenizer()
    self.tokenized_utterances = [self.tokenizer.tokenize(utterance) for utterance in self.utterances]
    valid_actions, linking_scores = self.init_all_valid_actions()
    self.valid_actions: Dict[str, List[str]] = valid_actions
    # This has shape (num_entities, num_utterance_tokens).
    self.linking_scores: numpy.ndarray = linking_scores
    self.grammar_str: str = self.get_grammar_str()
    self.grammar_with_context: Grammar = Grammar(self.grammar_str)
def test_char_span_to_token_span_handles_hard_cases(self):
    # An earlier version of the code had a hard time when the answer was the last token in the
    # passage.  This tests that case, on the instance that used to fail.
    tokenizer = WordTokenizer()
    passage = "Beyonc\u00e9 is believed to have first started a relationship with Jay Z " +\
              "after a collaboration on \"'03 Bonnie & Clyde\", which appeared on his seventh " +\
              "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay " +\
              "Z's girlfriend in the music video for the song, which would further fuel " +\
              "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were " +\
              "married without publicity. As of April 2014, the couple have sold a combined 300 " +\
              "million records together. The couple are known for their private relationship, " +\
              "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 " +\
              "suffered a miscarriage in 2010 or 2011, describing it as \"the saddest thing\" " +\
              "she had ever endured. She returned to the studio and wrote music in order to cope " +\
              "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order " +\
              "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
    start = 912
    end = 912 + len("Paris.")
    tokens = tokenizer.tokenize(passage)
    offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
    token_span = util.char_span_to_token_span(offsets, (start, end))[0]
    assert token_span == (184, 185)
def test_stems_and_filters_correctly(self):
    tokenizer = WordTokenizer.from_params(Params({'word_stemmer': {'type': 'porter'},
                                                  'word_filter': {'type': 'stopwords'}}))
    sentence = "this (sentence) has 'crazy' \"punctuation\"."
    expected_tokens = ["sentenc", "ha", "crazi", "punctuat"]
    tokens = [t.text for t in tokenizer.tokenize(sentence)]
    assert tokens == expected_tokens
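# For reference, the Params-based construction above should be roughly equivalent to building the
# tokenizer directly.  The concrete classes behind the registered names 'porter' and 'stopwords'
# are assumed here to be PorterStemmer and StopwordFilter; module paths and class names may differ
# between AllenNLP releases, so treat this as a sketch rather than canonical usage.
from allennlp.data.tokenizers.word_tokenizer import WordTokenizer
from allennlp.data.tokenizers.word_stemmer import PorterStemmer
from allennlp.data.tokenizers.word_filter import StopwordFilter

tokenizer = WordTokenizer(word_stemmer=PorterStemmer(), word_filter=StopwordFilter())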
def __init__(self,
             source_tokenizer: Tokenizer = None,
             target_tokenizer: Tokenizer = None,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             target_token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._source_tokenizer = source_tokenizer or WordTokenizer()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._target_token_indexers = target_token_indexers or self._source_token_indexers
    self._source_add_start_token = source_add_start_token
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             sentence_token_indexers: Dict[str, TokenIndexer] = None,
             nonterminal_indexers: Dict[str, TokenIndexer] = None,
             terminal_indexers: Dict[str, TokenIndexer] = None,
             output_agendas: bool = True) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._sentence_token_indexers = sentence_token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._nonterminal_indexers = nonterminal_indexers or {"tokens": SingleIdTokenIndexer("rule_labels")}
    self._terminal_indexers = terminal_indexers or {"tokens": SingleIdTokenIndexer("rule_labels")}
    self._output_agendas = output_agendas
def __init__(self,
             tokens_per_instance: int = None,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._tokens_per_instance = tokens_per_instance

    # No matter how you want to represent the input, we'll always represent the output as a
    # single token id.  This code lets you learn a language model that concatenates word
    # embeddings with character-level encoders, in order to predict the word token that comes
    # next.
    self._output_indexer: Dict[str, TokenIndexer] = None
    for name, indexer in self._token_indexers.items():
        if isinstance(indexer, SingleIdTokenIndexer):
            self._output_indexer = {name: indexer}
            break
    else:
        self._output_indexer = {"tokens": SingleIdTokenIndexer()}
class KnowledgeGraphFieldTest(AllenNlpTestCase):
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}
        json = {'question': self.utterance,
                'columns': ['Name in English', 'Location in English'],
                'cells': [['Paradeniz', 'Mersin'],
                          ['Lake Gala', 'Edirne']]}
        self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name", namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in", namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace("english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace("location", namespace='tokens')
        self.paradeniz_index = self.vocab.add_token_to_namespace("paradeniz", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace("mersin", namespace='tokens')
        self.lake_index = self.vocab.add_token_to_namespace("lake", namespace='tokens')
        self.gala_index = self.vocab.add_token_to_namespace("gala", namespace='tokens')
        self.negative_one_index = self.vocab.add_token_to_namespace("-1", namespace='tokens')
        self.zero_index = self.vocab.add_token_to_namespace("0", namespace='tokens')
        self.one_index = self.vocab.add_token_to_namespace("1", namespace='tokens')
        self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)
        super(KnowledgeGraphFieldTest, self).setUp()

    def test_count_vocab_items(self):
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        self.field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["tokens"] == {
                '-1': 1,
                '0': 1,
                '1': 1,
                'name': 1,
                'in': 2,
                'english': 2,
                'location': 1,
                'paradeniz': 1,
                'mersin': 1,
                'lake': 1,
                'gala': 1,
                'edirne': 1,
                }

    def test_index_converts_field_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field._indexed_entity_texts.keys() == {'tokens'}
        # Note that these are sorted by their _identifiers_, not their cell text, so the
        # `fb:row.rows` show up after the `fb:cells`.
        expected_array = [[self.negative_one_index],
                          [self.zero_index],
                          [self.one_index],
                          [self.edirne_index],
                          [self.lake_index, self.gala_index],
                          [self.mersin_index],
                          [self.paradeniz_index],
                          [self.location_index, self.in_index, self.english_index],
                          [self.name_index, self.in_index, self.english_index]]
        assert self.field._indexed_entity_texts['tokens'] == expected_array

    def test_get_padding_lengths_raises_if_not_indexed(self):
        with pytest.raises(AssertionError):
            self.field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {'num_entities': 9,
                                                    'num_entity_tokens': 3,
                                                    'num_utterance_tokens': 4}
        self.field._token_indexers['token_characters'] = TokenCharactersIndexer()
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {'num_entities': 9,
                                                    'num_entity_tokens': 3,
                                                    'num_utterance_tokens': 4,
                                                    'num_token_characters': 9}

    def test_as_tensor_produces_correct_output(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        padding_lengths['num_utterance_tokens'] += 1
        padding_lengths['num_entities'] += 1
        tensor_dict = self.field.as_tensor(padding_lengths)
        assert tensor_dict.keys() == {'text', 'linking'}
        expected_text_tensor = [[self.negative_one_index, 0, 0],
                                [self.zero_index, 0, 0],
                                [self.one_index, 0, 0],
                                [self.edirne_index, 0, 0],
                                [self.lake_index, self.gala_index, 0],
                                [self.mersin_index, 0, 0],
                                [self.paradeniz_index, 0, 0],
                                [self.location_index, self.in_index, self.english_index],
                                [self.name_index, self.in_index, self.english_index],
                                [0, 0, 0]]
        assert_almost_equal(tensor_dict['text']['tokens'].detach().cpu().numpy(), expected_text_tensor)

        linking_tensor = tensor_dict['linking'].detach().cpu().numpy()
        expected_linking_tensor = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -1, "where"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -1, "is"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -1, "mersin"
                                    [0, 0, 0, 0, 0, -1, 0, 0, 0, 0]],  # -1, "?"
                                   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0, "where"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0, "is"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0, "mersin"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # 0, "?"
                                   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 1, "where"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 1, "is"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 1, "mersin"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # 1, "?"
                                   [[0, 0, 0, 0, 0, .2, 0, 0, 0, 0],  # fb:cell.edirne, "where"
                                    [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],  # fb:cell.edirne, "is"
                                    [0, 0, 0, 0, 0, .1666, 0, 0, 0, 0],  # fb:cell.edirne, "mersin"
                                    [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],  # fb:cell.edirne, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:cell.edirne, padding
                                   [[0, 0, 0, 0, 0, -.6, 0, 0, 0, 0],  # fb:cell.lake_gala, "where"
                                    [0, 0, 0, 0, 0, -3.5, 0, 0, 0, 0],  # fb:cell.lake_gala, "is"
                                    [0, 0, 0, 0, 0, -.3333, 0, 0, 0, 0],  # fb:cell.lake_gala, "mersin"
                                    [0, 0, 0, 0, 0, -8, 0, 0, 0, 0],  # fb:cell.lake_gala, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:cell.lake_gala, padding
                                   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # fb:cell.mersin, "where"
                                    [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],  # fb:cell.mersin, "is"
                                    [0, 1, 1, 1, 1, 1, 0, 0, 1, 1],  # fb:cell.mersin, "mersin"
                                    [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],  # fb:cell.mersin, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:cell.mersin, padding
                                   [[0, 0, 0, 0, 0, -.6, 0, 0, 0, 0],  # fb:cell.paradeniz, "where"
                                    [0, 0, 0, 0, 0, -3, 0, 0, 0, 0],  # fb:cell.paradeniz, "is"
                                    [0, 0, 0, 0, 0, -.1666, 0, 0, 0, 0],  # fb:cell.paradeniz, "mersin"
                                    [0, 0, 0, 0, 0, -8, 0, 0, 0, 0],  # fb:cell.paradeniz, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:cell.paradeniz, padding
                                   [[0, 0, 0, 0, 0, -2.6, 0, 0, 0, 0],  # fb:row.row.name_in_english, "where"
                                    [0, 0, 0, 0, 0, -7.5, 0, 0, 0, 0],  # fb:row.row.name_in_english, "is"
                                    [0, 0, 0, 0, 0, -1.8333, 1, 1, 0, 0],  # fb:row.row.name_in_english, "mersin"
                                    [0, 0, 0, 0, 0, -18, 0, 0, 0, 0],  # fb:row.row.name_in_english, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:row.row.name_in_english, padding
                                   [[0, 0, 0, 0, 0, -1.6, 0, 0, 0, 0],  # fb:row.row.location_in_english, "where"
                                    [0, 0, 0, 0, 0, -5.5, 0, 0, 0, 0],  # fb:row.row.location_in_english, "is"
                                    [0, 0, 0, 0, 0, -1, 0, 0, 0, 0],  # fb:row.row.location_in_english, "mersin"
                                    [0, 0, 0, 0, 0, -14, 0, 0, 0, 0],  # fb:row.row.location_in_english, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:row.row.location_in_english, padding
                                   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "where"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "is"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "mersin"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]  # padding, padding
        for entity_index, entity_features in enumerate(expected_linking_tensor):
            for question_index, feature_vector in enumerate(entity_features):
                assert_almost_equal(linking_tensor[entity_index, question_index],
                                    feature_vector,
                                    decimal=4,
                                    err_msg=f"{entity_index} {question_index}")

    def test_lemma_feature_extractor(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize("Names in English")
        field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)
        entity = 'fb:row.row.name_in_english'
        lemma_feature = field._contains_lemma_match(entity,
                                                    field._entity_text_map[entity],
                                                    utterance[0],
                                                    0,
                                                    utterance)
        assert lemma_feature == 1

    def test_span_overlap_fraction(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize("what is the name in english of mersin?")
        field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)
        entity = 'fb:row.row.name_in_english'
        entity_text = field._entity_text_map[entity]
        feature_values = [field._span_overlap_fraction(entity, entity_text, token, i, utterance)
                          for i, token in enumerate(utterance)]
        assert feature_values == [0, 0, 0, 1, 2 / 3, 1 / 3, 0, 0, 0]

    def test_batch_tensors(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        tensor_dict1 = self.field.as_tensor(padding_lengths)
        tensor_dict2 = self.field.as_tensor(padding_lengths)
        batched_tensor_dict = self.field.batch_tensors([tensor_dict1, tensor_dict2])
        assert batched_tensor_dict.keys() == {'text', 'linking'}
        expected_single_tensor = [[self.negative_one_index, 0, 0],
                                  [self.zero_index, 0, 0],
                                  [self.one_index, 0, 0],
                                  [self.edirne_index, 0, 0],
                                  [self.lake_index, self.gala_index, 0],
                                  [self.mersin_index, 0, 0],
                                  [self.paradeniz_index, 0, 0],
                                  [self.location_index, self.in_index, self.english_index],
                                  [self.name_index, self.in_index, self.english_index]]
        expected_batched_tensor = [expected_single_tensor, expected_single_tensor]
        assert_almost_equal(batched_tensor_dict['text']['tokens'].detach().cpu().numpy(),
                            expected_batched_tensor)
        expected_linking_tensor = torch.stack([tensor_dict1['linking'], tensor_dict2['linking']])
        assert_almost_equal(batched_tensor_dict['linking'].detach().cpu().numpy(),
                            expected_linking_tensor.detach().cpu().numpy())