def test_tokens_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.tokenize("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    tokens[1] = Token("is", tag_="VBZ", pos_="VERB")
    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace("VERB", namespace="pos_tags")
    cop_index = vocab.add_token_to_namespace("VBZ", namespace="pos_tags")
    none_index = vocab.add_token_to_namespace("NONE", namespace="pos_tags")
    # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
    vocab.add_token_to_namespace("DET", namespace="pos_tags")
    vocab.add_token_to_namespace("NOUN", namespace="pos_tags")
    vocab.add_token_to_namespace("PUNCT", namespace="pos_tags")
    indexer = PosTagIndexer(namespace="pos_tags", coarse_tags=True)
    indices = indexer.tokens_to_indices(tokens, vocab)
    assert len(indices) == 1
    assert "tokens" in indices
    assert indices["tokens"][1] == verb_index
    assert indices["tokens"][-1] == none_index
    indexer._coarse_tags = False
    assert indexer.tokens_to_indices([tokens[1]], vocab) == {"tokens": [cop_index]}
def test_as_array_produces_token_sequence(self):
    indexer = PosTagIndexer()
    padded_tokens = indexer.as_padded_tensor_dict({"tokens": [1, 2, 3, 4, 5]}, {"tokens": 10})
    assert padded_tokens["tokens"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
def __init__(self,
             word_indexer: Optional[TokenIndexer] = None,
             is_bert: bool = False,
             conceptnet_path: Optional[Path] = None):
    super().__init__(lazy=False)

    self.pos_indexers = {"pos_tokens": PosTagIndexer()}
    self.ner_indexers = {"ner_tokens": NerTagIndexer()}
    self.rel_indexers = {"rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')}

    if is_bert:
        splitter = BertBasicWordSplitter()
    else:
        splitter = SpacyWordSplitter()
    self.tokeniser = WordTokenizer(word_splitter=splitter)
    self.word_indexers = {'tokens': word_indexer}

    word_splitter = SpacyWordSplitter(pos_tags=True, ner=True, parse=True)
    self.word_tokeniser = WordTokenizer(word_splitter=word_splitter)
    bert_splitter = BertBasicWordSplitter()
    self.bert_tokeniser = WordTokenizer(word_splitter=bert_splitter)

    if word_indexer is None:
        if is_bert:
            word_indexer = PretrainedBertIndexer(
                pretrained_model='bert-base-uncased',
                truncate_long_sequences=False)
        else:
            word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    self.word_indexers = {'tokens': word_indexer}

    self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
def test_blank_pos_tag(self):
    tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
    for token in tokens:
        token.pos_ = ""
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    # spacy uses an empty string to indicate "no POS tag";
    # we convert it to "NONE"
    assert counter["pos_tokens"]["NONE"] == 4
    vocab = Vocabulary(counter)
    none_index = vocab.get_token_index('NONE', 'pos_tokens')
    # should raise no exception
    indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
    assert {"pos": [none_index, none_index, none_index, none_index]} == indices
def test_blank_pos_tag(self):
    tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    # spacy uses an empty string to indicate "no POS tag";
    # we convert it to "NONE"
    assert counter["pos_tokens"]["NONE"] == 4
    vocab = Vocabulary(counter)
    none_index = vocab.get_token_index("NONE", "pos_tokens")
    # should raise no exception
    indices = indexer.tokens_to_indices(tokens, vocab)
    assert {"tokens": [none_index, none_index, none_index, none_index]} == indices
def update_sentence_pos(self, fields, tokens) -> Field:
    indexers = {'pos_tag': PosTagIndexer(namespace='pos_tag')}
    textfield = TextField(tokens, indexers)
    return textfield
def test_count_vocab_items_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}
    indexer._coarse_tags = True  # pylint: disable=protected-access
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
def test_count_vocab_items_uses_pos_tags(self):
    tokens = self.tokenizer.tokenize("This is a sentence.")
    tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
    # Hard-coding this because spacy's POS tagger keeps changing on us, wanting to call this AUX
    # in some runs.
    tokens[2] = Token("is", tag_="VBZ", pos_="VERB")
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tokens"] == {"DT": 2, "VBZ": 1, ".": 1, "NN": 1, "NONE": 2}
    indexer._coarse_tags = True
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tokens"] == {"VERB": 1, "PUNCT": 1, "DET": 2, "NOUN": 1, "NONE": 2}
def test_count_vocab_items_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tokens"] == {"DT": 2, "VBZ": 1, ".": 1, "NN": 1, "NONE": 2}
    indexer._coarse_tags = True
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tokens"] == {"VERB": 1, "PUNCT": 1, "DET": 2, "NOUN": 1, "NONE": 2}
def test_tokens_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.split_words(u"This is a sentence.")
    tokens = [t for t in tokens] + [Token(u"</S>")]
    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace(u'VERB', namespace=u'pos_tags')
    cop_index = vocab.add_token_to_namespace(u'VBZ', namespace=u'pos_tags')
    none_index = vocab.add_token_to_namespace(u'NONE', namespace=u'pos_tags')
    # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
    vocab.add_token_to_namespace(u'DET', namespace=u'pos_tags')
    vocab.add_token_to_namespace(u'NOUN', namespace=u'pos_tags')
    vocab.add_token_to_namespace(u'PUNCT', namespace=u'pos_tags')
    indexer = PosTagIndexer(coarse_tags=True)
    indices = indexer.tokens_to_indices(tokens, vocab, u"tokens")
    assert len(indices) == 1
    assert u"tokens" in indices
    assert indices[u"tokens"][1] == verb_index
    assert indices[u"tokens"][-1] == none_index
    indexer._coarse_tags = False  # pylint: disable=protected-access
    assert indexer.tokens_to_indices([tokens[1]], vocab, u"coarse") == {u"coarse": [cop_index]}
def test_tokens_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
    cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
    # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
    vocab.add_token_to_namespace('DET', namespace='pos_tags')
    vocab.add_token_to_namespace('NOUN', namespace='pos_tags')
    vocab.add_token_to_namespace('PUNCT', namespace='pos_tags')
    indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)
    indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
    assert len(indices) == 1
    assert "tokens" in indices
    assert indices["tokens"][1] == verb_index
    assert indices["tokens"][-1] == none_index
    indexer._coarse_tags = False  # pylint: disable=protected-access
    assert indexer.tokens_to_indices([tokens[1]], vocab, "coarse") == {"coarse": [cop_index]}
def test_token_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
    cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
    indexer = PosTagIndexer(coarse_tags=True)
    assert indexer.token_to_indices(tokens[1], vocab) == verb_index
    assert indexer.token_to_indices(tokens[-1], vocab) == none_index
    indexer._coarse_tags = False  # pylint: disable=protected-access
    assert indexer.token_to_indices(tokens[1], vocab) == cop_index
def test_as_array_produces_token_sequence(self):
    indexer = PosTagIndexer()
    padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
    assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer, PosTagIndexer
from allennlp.data.tokenizers import WordTokenizer, CharacterTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data import Vocabulary

# Splits text into words (instead of wordpieces or characters), and does part-of-speech tagging
# with spacy while we're at it.
tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(pos_tags=True))

# Represents each token with (1) an id from a vocabulary, (2) a sequence of characters, and (3)
# part of speech tag ids.
token_indexers = {
    'tokens': SingleIdTokenIndexer(namespace='token_vocab'),
    'token_characters': TokenCharactersIndexer(namespace='character_vocab'),
    'pos_tags': PosTagIndexer(namespace='pos_tag_vocab')
}

vocab = Vocabulary()
vocab.add_tokens_to_namespace(['This', 'is', 'some', 'text', '.'], namespace='token_vocab')
vocab.add_tokens_to_namespace(['T', 'h', 'i', 's', ' ', 'o', 'm', 'e', 't', 'x', '.'],
                              namespace='character_vocab')
vocab.add_tokens_to_namespace(['DT', 'VBZ', 'NN', '.'], namespace='pos_tag_vocab')

text = "This is some text."
tokens = tokenizer.tokenize(text)
print(tokens)
print([token.tag_ for token in tokens])
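# A minimal usage sketch continuing the snippet above (assuming the classic AllenNLP TextField API
# that the imports imply): wrap the tagged tokens in a TextField, index it against the vocabulary,
# and turn it into padded tensors, one entry per indexer (including the POS tag ids).
text_field = TextField(tokens, token_indexers)
text_field.index(vocab)

# Each indexer reports its own padding lengths; `as_tensor` uses them to build the tensor dict.
padding_lengths = text_field.get_padding_lengths()
tensor_dict = text_field.as_tensor(padding_lengths)
print(tensor_dict)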
def test_as_array_produces_token_sequence(self):
    indexer = PosTagIndexer()
    padded_tokens = indexer.as_padded_tensor({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
    assert padded_tokens["key"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
def test_padding_functions(self):
    indexer = PosTagIndexer()
    assert indexer.get_padding_token() == 0
    assert indexer.get_padding_lengths(0) == {}
def test_as_array_produces_token_sequence(self):
    indexer = PosTagIndexer()
    padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
    assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
model_dir = "C:/Users/t-ofarvi/PycharmProjects/tryout-model"
vocab_dir = f'{model_dir}/vocabulary'

# NOTE: The word tokenizer is a SpaCy tokenizer, which is a little different from the BERT tokenizer.
# This was done for convenience.
word_tokenizer = SpacyMultilingualWhitespaceWordSplitter()
bert_indexer = PretrainedBertIndexer(pretrained_model=bert_mode,
                                     do_lowercase=bert_do_lowercase,
                                     truncate_long_sequences=False)
word_indexer = {
    "bert": bert_indexer,
    "deps": DepLabelIndexer(namespace="deps_tags"),
    "ner": NerTagIndexer(),
    "pos": PosTagIndexer(),
    "lang": LanguageIndexer()
}

train_ds, validation_ds = (
    UccaSpanParserDatasetReader(word_tokenizer, word_indexer).read(folder)
    for folder in [train_dataset_folder, validation_dataset_folder])

if os.path.exists(vocab_dir):
    vocab = Vocabulary.from_files(vocab_dir)
else:
    vocab = Vocabulary.from_instances(itertools.chain(train_ds, validation_ds))
    vocab.save_to_files(vocab_dir)
vocab_namespaces = vocab._index_to_token.keys()
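# A small, hedged follow-up to the script above: once the vocabulary exists, it can be useful to
# sanity-check how many entries each namespace ended up with (the namespace names themselves depend
# on the indexers and the data). `Vocabulary.get_vocab_size` is a standard AllenNLP method.
for namespace in vocab_namespaces:
    print(namespace, vocab.get_vocab_size(namespace))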