Exemplo n.º 1
0
 def test_as_tensor_handles_words(self):
     field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                       token_indexers={"words": SingleIdTokenIndexer("words")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     tensor_dict = field.as_tensor(padding_lengths)
     numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                             numpy.array([1, 1, 1, 2, 1]))
Exemplo n.º 2
0
 def test_as_tensor_handles_characters(self):
     field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                       token_indexers={"characters": TokenCharactersIndexer("characters")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     tensor_dict = field.as_tensor(padding_lengths)
     expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                             [1, 3, 0, 0, 0, 0, 0, 0],
                                             [1, 0, 0, 0, 0, 0, 0, 0],
                                             [3, 4, 5, 6, 4, 5, 7, 4],
                                             [1, 0, 0, 0, 0, 0, 0, 0]])
     numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                             expected_character_array)
Exemplo n.º 3
0
    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField([Token(t) for t in ["a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
Exemplo n.º 4
0
    def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding='utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = self.TEST_DIR / 'vocab_save'
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2
Exemplo n.º 5
0
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
Exemplo n.º 6
0
 def test_token_embedder_returns_dict(self):
     field = TextField([Token(t) for t in ["A", "sentence"]],
                       token_indexers={"field_with_dict": DictReturningTokenIndexer(),
                                       "words": SingleIdTokenIndexer("words"),
                                       "characters": TokenCharactersIndexer("characters")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     assert padding_lengths == {
             'token_ids': 5,
             'additional_key': 2,
             'words': 2,
             'characters': 2,
             'num_token_characters': 8
     }
     padding_lengths['token_ids'] = 7
     padding_lengths['additional_key'] = 3
     padding_lengths['words'] = 4
     padding_lengths['characters'] = 4
     tensors = field.as_tensor(padding_lengths)
     assert list(tensors['token_ids'].shape) == [7]
     assert list(tensors['additional_key'].shape) == [3]
     assert list(tensors['words'].shape) == [4]
     assert list(tensors['characters'].shape) == [4, 8]
Exemplo n.º 7
0
    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters"),
                                          "words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}