def test_count_vocab_items_uses_pos_tags(self):
    """Counting should record fine-grained tags by default and coarse tags when enabled.

    Sentence-boundary tokens have no POS tag, so they are counted under 'NONE'.
    """
    words = self.tokenizer.split_words("This is a sentence.")
    # Wrap the sentence with boundary tokens that carry no POS information.
    words = [Token("<S>")] + list(words) + [Token("</S>")]
    indexer = PosTagIndexer()

    fine_counts = defaultdict(lambda: defaultdict(int))
    for word in words:
        indexer.count_vocab_items(word, fine_counts)
    assert fine_counts["pos_tags"] == {
            'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2
    }

    # Switch the same indexer to coarse (universal) tags and recount.
    indexer._coarse_tags = True  # pylint: disable=protected-access
    coarse_counts = defaultdict(lambda: defaultdict(int))
    for word in words:
        indexer.count_vocab_items(word, coarse_counts)
    assert coarse_counts["pos_tags"] == {
            'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2
    }
def test_count_vocab_items_uses_pos_tags(self):
    """Counting should use fine-grained tags by default and coarse tags when enabled.

    The "is" token's tags are pinned manually because spacy's tagger output has
    drifted between releases (sometimes emitting AUX instead of VERB).
    """
    sentence = self.tokenizer.tokenize("This is a sentence.")
    # Add boundary markers, which carry no POS tag and count as 'NONE'.
    sentence = [Token("<S>")] + list(sentence) + [Token("</S>")]
    # Hard-coding this because spacy's POS tagger keeps changing on us, wanting
    # to call this AUX in some runs.
    sentence[2] = Token("is", tag_="VBZ", pos_="VERB")
    indexer = PosTagIndexer()

    fine_counter = defaultdict(lambda: defaultdict(int))
    for tok in sentence:
        indexer.count_vocab_items(tok, fine_counter)
    assert fine_counter["pos_tokens"] == {
            "DT": 2, "VBZ": 1, ".": 1, "NN": 1, "NONE": 2
    }

    # Recount with universal (coarse) tags on the same indexer instance.
    indexer._coarse_tags = True
    coarse_counter = defaultdict(lambda: defaultdict(int))
    for tok in sentence:
        indexer.count_vocab_items(tok, coarse_counter)
    assert coarse_counter["pos_tokens"] == {
            "VERB": 1, "PUNCT": 1, "DET": 2, "NOUN": 1, "NONE": 2
    }
def test_tokens_to_indices_uses_pos_tags(self):
    """Indexing should map tokens through their (coarse or fine) POS tags.

    The "is" token's tags are pinned manually because spacy's tagger output
    varies between releases.
    """
    sentence = self.tokenizer.tokenize("This is a sentence.")
    sentence = list(sentence) + [Token("</S>")]
    sentence[1] = Token("is", tag_="VBZ", pos_="VERB")

    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace("VERB", namespace="pos_tags")
    cop_index = vocab.add_token_to_namespace("VBZ", namespace="pos_tags")
    none_index = vocab.add_token_to_namespace("NONE", namespace="pos_tags")
    # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
    for tag in ("DET", "NOUN", "PUNCT"):
        vocab.add_token_to_namespace(tag, namespace="pos_tags")

    indexer = PosTagIndexer(namespace="pos_tags", coarse_tags=True)
    index_map = indexer.tokens_to_indices(sentence, vocab)
    assert len(index_map) == 1
    assert "tokens" in index_map
    # Coarse tag of "is" is VERB; the boundary token falls back to NONE.
    assert index_map["tokens"][1] == verb_index
    assert index_map["tokens"][-1] == none_index

    # With coarse tags off, "is" indexes through its fine tag VBZ instead.
    indexer._coarse_tags = False
    assert indexer.tokens_to_indices([sentence[1]], vocab) == {
            "tokens": [cop_index]
    }
def test_count_vocab_items_uses_pos_tags(self):
    """Vocabulary counting should key on fine tags by default, coarse when enabled.

    Boundary tokens lack POS tags, so they are tallied under 'NONE'.
    """
    words = self.tokenizer.split_words("This is a sentence.")
    words = [Token("<S>")] + list(words) + [Token("</S>")]
    indexer = PosTagIndexer()

    fine = defaultdict(lambda: defaultdict(int))
    for word in words:
        indexer.count_vocab_items(word, fine)
    assert fine["pos_tokens"] == {
            "DT": 2, "VBZ": 1, ".": 1, "NN": 1, "NONE": 2
    }

    # Flip the same indexer to universal (coarse) tags and count again.
    indexer._coarse_tags = True
    coarse = defaultdict(lambda: defaultdict(int))
    for word in words:
        indexer.count_vocab_items(word, coarse)
    assert coarse["pos_tokens"] == {
            "VERB": 1, "PUNCT": 1, "DET": 2, "NOUN": 1, "NONE": 2
    }
def test_token_to_indices_uses_pos_tags(self):
    """A single token should index through its coarse or fine POS tag.

    Tokens without a tag (the boundary marker) fall back to the 'NONE' entry.
    """
    words = self.tokenizer.split_words("This is a sentence.")
    words = list(words) + [Token("</S>")]

    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
    cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')

    indexer = PosTagIndexer(coarse_tags=True)
    # "is" maps to the universal tag VERB; the untagged boundary maps to NONE.
    assert indexer.token_to_indices(words[1], vocab) == verb_index
    assert indexer.token_to_indices(words[-1], vocab) == none_index

    # With coarse tags off, "is" indexes through its Penn Treebank tag VBZ.
    indexer._coarse_tags = False  # pylint: disable=protected-access
    assert indexer.token_to_indices(words[1], vocab) == cop_index
def test_count_vocab_items_uses_pos_tags(self):
    """Counting vocab items should tally fine-grained tags, or coarse ones when enabled.

    The sentence boundary markers have no POS tag and are counted as 'NONE'.
    """
    sentence = self.tokenizer.split_words("This is a sentence.")
    sentence = [Token("<S>")] + list(sentence) + [Token("</S>")]
    indexer = PosTagIndexer()

    fine_tally = defaultdict(lambda: defaultdict(int))
    for tok in sentence:
        indexer.count_vocab_items(tok, fine_tally)
    expected_fine = {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}
    assert fine_tally["pos_tags"] == expected_fine

    # Re-run the count with coarse (universal) tags enabled.
    indexer._coarse_tags = True  # pylint: disable=protected-access
    coarse_tally = defaultdict(lambda: defaultdict(int))
    for tok in sentence:
        indexer.count_vocab_items(tok, coarse_tally)
    expected_coarse = {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
    assert coarse_tally["pos_tags"] == expected_coarse
def test_tokens_to_indices_uses_pos_tags(self):
    """Indexing a token list should go through coarse or fine POS tags.

    Untagged boundary tokens resolve to the 'NONE' entry. The u-prefixed
    literals are kept for Python 2 compatibility in this variant.
    """
    sentence = self.tokenizer.split_words(u"This is a sentence.")
    sentence = list(sentence) + [Token(u"</S>")]

    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace(u'VERB', namespace=u'pos_tags')
    cop_index = vocab.add_token_to_namespace(u'VBZ', namespace=u'pos_tags')
    none_index = vocab.add_token_to_namespace(u'NONE', namespace=u'pos_tags')
    # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
    for tag in (u'DET', u'NOUN', u'PUNCT'):
        vocab.add_token_to_namespace(tag, namespace=u'pos_tags')

    indexer = PosTagIndexer(coarse_tags=True)
    index_map = indexer.tokens_to_indices(sentence, vocab, u"tokens")
    assert len(index_map) == 1
    assert u"tokens" in index_map
    # Coarse tag of "is" is VERB; the boundary token maps to NONE.
    assert index_map[u"tokens"][1] == verb_index
    assert index_map[u"tokens"][-1] == none_index

    # With coarse tags off, "is" indexes through its fine tag VBZ.
    indexer._coarse_tags = False  # pylint: disable=protected-access
    assert indexer.tokens_to_indices([sentence[1]], vocab, u"coarse") == {u"coarse": [cop_index]}
def test_tokens_to_indices_uses_pos_tags(self):
    """Indexing should look tokens up by coarse or fine POS tag in the vocabulary.

    The untagged boundary token falls back to the 'NONE' entry.
    """
    sentence = self.tokenizer.split_words("This is a sentence.")
    sentence = list(sentence) + [Token("</S>")]

    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
    cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
    # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
    for tag in ('DET', 'NOUN', 'PUNCT'):
        vocab.add_token_to_namespace(tag, namespace='pos_tags')

    indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)
    index_map = indexer.tokens_to_indices(sentence, vocab, "tokens")
    assert len(index_map) == 1
    assert "tokens" in index_map
    # Coarse tag of "is" is VERB; the boundary token maps to NONE.
    assert index_map["tokens"][1] == verb_index
    assert index_map["tokens"][-1] == none_index

    # With coarse tags disabled, "is" indexes through its fine tag VBZ.
    indexer._coarse_tags = False  # pylint: disable=protected-access
    assert indexer.tokens_to_indices([sentence[1]], vocab, "coarse") == {"coarse": [cop_index]}