def test_tokens_to_indices_uses_pos_tags(self):
    """Tokens should be indexed by their POS tag via the ``pos_tags`` namespace."""
    sentence = self.tokenizer.tokenize("This is a sentence.")
    tokens = list(sentence)
    tokens.append(Token("</S>"))
    # Force a known tag pair on "is" so the expected indices are deterministic.
    tokens[1] = Token("is", tag_="VBZ", pos_="VERB")

    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace("VERB", namespace="pos_tags")
    cop_index = vocab.add_token_to_namespace("VBZ", namespace="pos_tags")
    none_index = vocab.add_token_to_namespace("NONE", namespace="pos_tags")
    # `tokens_to_indices` runs over every token, so the remaining tags need entries too.
    for extra_tag in ("DET", "NOUN", "PUNCT"):
        vocab.add_token_to_namespace(extra_tag, namespace="pos_tags")

    indexer = PosTagIndexer(namespace="pos_tags", coarse_tags=True)
    indices = indexer.tokens_to_indices(tokens, vocab)

    assert set(indices.keys()) == {"tokens"}
    assert indices["tokens"][1] == verb_index
    assert indices["tokens"][-1] == none_index

    # With coarse tags disabled, "is" indexes by its fine-grained tag (VBZ).
    indexer._coarse_tags = False
    assert indexer.tokens_to_indices([tokens[1]], vocab) == {"tokens": [cop_index]}
Example #2
    def test_tokens_to_indices_uses_pos_tags(self):
        u"""Legacy-API variant: the index name is passed explicitly to the indexer."""
        tokens = list(self.tokenizer.split_words(u"This is a sentence.")) + [Token(u"</S>")]

        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace(u'VERB', namespace=u'pos_tags')
        cop_index = vocab.add_token_to_namespace(u'VBZ', namespace=u'pos_tags')
        none_index = vocab.add_token_to_namespace(u'NONE', namespace=u'pos_tags')
        # `tokens_to_indices` touches every token, so their tags need entries as well.
        for extra_tag in (u'DET', u'NOUN', u'PUNCT'):
            vocab.add_token_to_namespace(extra_tag, namespace=u'pos_tags')

        indexer = PosTagIndexer(coarse_tags=True)
        indices = indexer.tokens_to_indices(tokens, vocab, u"tokens")

        assert list(indices.keys()) == [u"tokens"]
        assert indices[u"tokens"][1] == verb_index
        assert indices[u"tokens"][-1] == none_index

        # Flip to fine-grained tags: "is" should now index as VBZ.
        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, u"coarse") == {u"coarse": [cop_index]}
Example #3
    def test_tokens_to_indices_uses_pos_tags(self):
        """Variant with an explicit index name and an explicit indexer namespace."""
        tokens = list(self.tokenizer.split_words("This is a sentence.")) + [Token("</S>")]

        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        # All tokens are indexed, so the other tags appearing in the sentence
        # must also be present in the vocabulary.
        for extra_tag in ('DET', 'NOUN', 'PUNCT'):
            vocab.add_token_to_namespace(extra_tag, namespace='pos_tags')

        indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)
        indices = indexer.tokens_to_indices(tokens, vocab, "tokens")

        assert list(indices.keys()) == ["tokens"]
        assert indices["tokens"][1] == verb_index
        assert indices["tokens"][-1] == none_index

        # Switching off coarse tags makes "is" index by its fine-grained tag (VBZ).
        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, "coarse") == {"coarse": [cop_index]}
Example #4
 def test_blank_pos_tag(self):
     """An explicitly blank spaCy POS tag must be counted and indexed as ``NONE``."""
     tokens = [Token(word) for word in "allennlp is awesome .".split(" ")]
     # spaCy signals "no POS tag" with the empty string; blank every token.
     for token in tokens:
         token.pos_ = ""

     indexer = PosTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     # The indexer converts the empty tag to the sentinel "NONE".
     assert counter["pos_tokens"]["NONE"] == len(tokens)

     vocab = Vocabulary(counter)
     none_index = vocab.get_token_index('NONE', 'pos_tokens')
     # Indexing must not raise, and every position maps to the NONE id.
     indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
     assert indices == {"pos": [none_index] * len(tokens)}
 def test_blank_pos_tag(self):
     """Tokens created without a POS tag must be counted and indexed as ``NONE``."""
     tokens = [Token(word) for word in "allennlp is awesome .".split(" ")]

     indexer = PosTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     # spaCy uses an empty string for "no POS tag"; the indexer converts it to "NONE".
     assert counter["pos_tokens"]["NONE"] == len(tokens)

     vocab = Vocabulary(counter)
     none_index = vocab.get_token_index("NONE", "pos_tokens")
     # Indexing must not raise, and every position maps to the NONE id.
     indices = indexer.tokens_to_indices(tokens, vocab)
     assert indices == {"tokens": [none_index] * len(tokens)}