def test_as_array_produces_token_sequence(self):
     indexer = NerTagIndexer()
     padded_tokens = indexer.as_padded_tensor_dict(
         {"tokens": [1, 2, 3, 4, 5]}, {"tokens": 10})
     assert padded_tokens["tokens"].tolist() == [
         1, 2, 3, 4, 5, 0, 0, 0, 0, 0
     ]
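These test methods are shown out of their test classes; roughly the following imports are assumed throughout (inferred from the names used, not shown on the original page):

 from collections import defaultdict

 from allennlp.data import Token, Vocabulary
 from allennlp.data.token_indexers import NerTagIndexer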
 def test_count_vocab_items_uses_ner_tags(self):
     tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
     tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
     indexer = NerTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     assert counter["ner_tags"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}
 def test_count_vocab_items_uses_ner_tags(self):
     tokens = self.tokenizer.tokenize("Larry Page is CEO of Google.")
     tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
     indexer = NerTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     assert counter["ner_tokens"] == {"PERSON": 2, "ORG": 1, "NONE": 6}
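The self.tokenizer used above is presumably a spaCy-backed word splitter constructed with NER enabled, so that each Token carries an ent_type_. A minimal setup sketch for the older split_words API might look like this:

 from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

 # NER must be switched on, otherwise ent_type_ is never populated.
 tokenizer = SpacyWordSplitter(ner=True)
 tokens = tokenizer.split_words("Larry Page is CEO of Google.")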
Example #5
 def test_token_to_indices_uses_ner_tags(self):
     tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
     none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
     vocab.add_token_to_namespace('ORG', namespace='ner_tags')
     indexer = NerTagIndexer()
     assert indexer.token_to_indices(tokens[1], vocab) == person_index
     assert indexer.token_to_indices(tokens[-1], vocab) == none_index
 def test_tokens_to_indices_uses_ner_tags(self):
     tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     person_index = vocab.add_token_to_namespace("PERSON", namespace="ner_tags")
     none_index = vocab.add_token_to_namespace("NONE", namespace="ner_tags")
     vocab.add_token_to_namespace("ORG", namespace="ner_tags")
     indexer = NerTagIndexer(namespace="ner_tags")
     assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {
         "tokens1": [person_index]
     }
     assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {
         "tokens-1": [none_index]
     }
 def test_tokens_to_indices_uses_ner_tags(self):
     tokens = self.tokenizer.split_words(u"Larry Page is CEO of Google.")
     tokens = [t for t in tokens] + [Token(u"</S>")]
     vocab = Vocabulary()
     person_index = vocab.add_token_to_namespace(u'PERSON',
                                                 namespace=u'ner_tags')
     none_index = vocab.add_token_to_namespace(u'NONE',
                                               namespace=u'ner_tags')
     vocab.add_token_to_namespace(u'ORG', namespace=u'ner_tags')
     indexer = NerTagIndexer()
     assert indexer.tokens_to_indices([tokens[1]], vocab, u"tokens1") == {
         u"tokens1": [person_index]
     }
     assert indexer.tokens_to_indices([tokens[-1]], vocab, u"tokens-1") == {
         u"tokens-1": [none_index]
     }
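Calling tokens_to_indices directly is mostly a test-only pattern; in a dataset reader the indexer would normally be attached to a TextField instead. A rough sketch against the same 0.x-era API (not taken from the examples above):

 from allennlp.data.fields import TextField
 from allennlp.data.token_indexers import NerTagIndexer

 # Each token's NER tag is mapped to an id from the indexer's default "ner_tokens" namespace.
 field = TextField(tokens, token_indexers={"ner_tokens": NerTagIndexer()})
 field.index(vocab)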
Example #8
 def test_blank_ner_tag(self):
     tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
     for token in tokens:
         token.ent_type_ = ""
     indexer = NerTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     # spaCy uses an empty string to indicate "no NER tag";
     # we convert it to "NONE"
     assert counter["ner_tokens"]["NONE"] == 4
     vocab = Vocabulary(counter)
     none_index = vocab.get_token_index('NONE', 'ner_tokens')
     # should raise no exception
     indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
     assert {"ner": [none_index, none_index, none_index, none_index]} == indices
 def test_blank_ner_tag(self):
     tokens = [
         Token(token)._replace(ent_type_="") for token in "allennlp is awesome .".split(" ")
     ]
     indexer = NerTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     # spaCy uses an empty string to indicate "no NER tag";
     # we convert it to "NONE"
     assert counter["ner_tokens"]["NONE"] == 4
     vocab = Vocabulary(counter)
     none_index = vocab.get_token_index("NONE", "ner_tokens")
     # should raise no exception
     indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
     assert {"ner": [none_index, none_index, none_index, none_index]} == indices
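The empty-string handling that the comments mention amounts to a fallback inside count_vocab_items; a rough reconstruction, inferred from the tests' behaviour rather than copied from the library source:

 def count_vocab_items(self, token, counter):
     tag = token.ent_type_
     if not tag:
         # spaCy leaves ent_type_ empty for tokens outside any entity
         tag = "NONE"
     counter[self._namespace][tag] += 1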
Example #10
    def __init__(self,
                 word_indexer: Optional[TokenIndexer] = None,
                 is_bert: bool = False,
                 conceptnet_path: Optional[Path] = None):
        super().__init__(lazy=False)
        self.pos_indexers = {"pos_tokens": PosTagIndexer()}
        self.ner_indexers = {"ner_tokens": NerTagIndexer()}
        self.rel_indexers = {
            "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')
        }

        if is_bert:
            splitter = BertBasicWordSplitter()
        else:
            splitter = SpacyWordSplitter()
        self.tokeniser = WordTokenizer(word_splitter=splitter)

        word_splitter = SpacyWordSplitter(pos_tags=True, ner=True, parse=True)
        self.word_tokeniser = WordTokenizer(word_splitter=word_splitter)
        bert_splitter = BertBasicWordSplitter()
        self.bert_tokeniser = WordTokenizer(word_splitter=bert_splitter)

        if word_indexer is None:
            if is_bert:
                word_indexer = PretrainedBertIndexer(
                    pretrained_model='bert-base-uncased',
                    truncate_long_sequences=False)
            else:
                word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
        self.word_indexers = {'tokens': word_indexer}

        self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
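The __init__ above belongs to a dataset reader whose class name is not shown; a hypothetical usage sketch (the class name ConceptNetReader and the paths are invented for illustration, only the constructor arguments are real):

 from pathlib import Path

 # Hypothetical reader class and paths, shown only to illustrate the constructor arguments.
 reader = ConceptNetReader(is_bert=True,
                           conceptnet_path=Path("data/conceptnet.csv"))
 instances = reader.read("data/train.jsonl")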
Example #11
 def test_as_array_produces_token_sequence(self):
     indexer = NerTagIndexer()
     padded_tokens = indexer.as_padded_tensor({'key': [1, 2, 3, 4, 5]},
                                              {'key': 10}, {})
     assert padded_tokens["key"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
Example #12
 def test_padding_functions(self):
     indexer = NerTagIndexer()
     assert indexer.get_padding_lengths(0) == {}
 def test_as_array_produces_token_sequence(self):
     indexer = NerTagIndexer()
     padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]},
                                                {'key': 10}, {})
     assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
Example #14
 def test_as_array_produces_token_sequence(self):
     indexer = NerTagIndexer()
     padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
     assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
 def test_padding_functions(self):
     indexer = NerTagIndexer()
     assert indexer.get_padding_token() == 0
     assert indexer.get_padding_lengths(0) == {}
Example #16
 def test_as_array_produces_token_sequence(self):
     indexer = NerTagIndexer()
     padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
     assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
Example #17
    validation_dataset_folder = "C:/Users/t-ofarvi/PycharmProjects/UCCA_Dataset_29-06-09/tryout-validation"  #"C:/Users/t-ofarvi/Desktop/dev_allen"

    model_dir = "C:/Users/t-ofarvi/PycharmProjects/tryout-model"
    vocab_dir = f'{model_dir}/vocabulary'

    # NOTE: The word tokenizer is a spaCy tokenizer, which is slightly different from the BERT tokenizer.
    # This was done for convenience.
    word_tokenizer = SpacyMultilingualWhitespaceWordSplitter()

    bert_indexer = PretrainedBertIndexer(pretrained_model=bert_mode,
                                         do_lowercase=bert_do_lowercase,
                                         truncate_long_sequences=False)
    word_indexer = {
        "bert": bert_indexer,
        "deps": DepLabelIndexer(namespace="deps_tags"),
        "ner": NerTagIndexer(),
        "pos": PosTagIndexer(),
        "lang": LanguageIndexer()
    }

    train_ds, validation_ds = (
        UccaSpanParserDatasetReader(word_tokenizer, word_indexer).read(folder)
        for folder in [train_dataset_folder, validation_dataset_folder])

    if os.path.exists(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(
            itertools.chain(train_ds, validation_ds))
        vocab.save_to_files(vocab_dir)