def test_count_vocab_items_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
    indexer = DepLabelIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["dep_labels"] == {"ROOT": 1, "nsubj": 1, "det": 1, "NONE": 2, "attr": 1, "punct": 1}

def test_tokens_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
    none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
    indexer = DepLabelIndexer()
    assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
    assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}

def test_tokens_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.tokenize("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
    none_index = vocab.add_token_to_namespace("NONE", namespace="dep_labels")
    indexer = DepLabelIndexer()
    assert indexer.tokens_to_indices([tokens[1]], vocab) == {"tokens": [root_index]}
    assert indexer.tokens_to_indices([tokens[-1]], vocab) == {"tokens": [none_index]}

def test_count_vocab_items_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
    indexer = DepLabelIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["dep_labels"] == {"ROOT": 1, "nsubj": 1, "det": 1, "NONE": 2, "attr": 1, "punct": 1}

def test_count_vocab_items_uses_pos_tags(self):
    tokens = self.tokenizer.split_words(u"This is a sentence.")
    tokens = [Token(u"<S>")] + [t for t in tokens] + [Token(u"</S>")]
    indexer = DepLabelIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter[u"dep_labels"] == {
        u"ROOT": 1, u"nsubj": 1, u"det": 1, u"NONE": 2, u"attr": 1, u"punct": 1
    }

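# A hedged sketch (assumed usage, not from the tests above): the nested counter that
# count_vocab_items fills is the same structure the AllenNLP Vocabulary constructor
# consumes when it builds namespaces such as "dep_labels".
from collections import defaultdict
from allennlp.data import Vocabulary

counter = defaultdict(lambda: defaultdict(int))
counter["dep_labels"]["nsubj"] += 1
counter["dep_labels"]["ROOT"] += 1

vocab = Vocabulary(counter=counter)
# Every counted label now has an id in the "dep_labels" namespace.
nsubj_id = vocab.get_token_index("nsubj", namespace="dep_labels")
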
def update_sentence_dep(self, fields, tokens) -> Field:
    indexers = {'dep_tag': DepLabelIndexer(namespace='dep_tag')}
    textfield = TextField(tokens, indexers)
    return textfield

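# A minimal end-to-end sketch (assumed usage, AllenNLP 0.x-style API as in the snippets
# above): a TextField built with a DepLabelIndexer is indexed against a Vocabulary and
# converted to id tensors. The tokens, labels, and namespace below are illustrative.
from allennlp.data import Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import DepLabelIndexer
from allennlp.data.tokenizers import Token

# DepLabelIndexer reads each token's `dep_` attribute (its dependency label).
tokens = [Token("This", dep_="nsubj"), Token("is", dep_="ROOT"),
          Token("a", dep_="det"), Token("sentence", dep_="attr")]
field = TextField(tokens, {"dep_tag": DepLabelIndexer(namespace="dep_tag")})

vocab = Vocabulary()
for label in ("nsubj", "ROOT", "det", "attr"):
    vocab.add_token_to_namespace(label, namespace="dep_tag")

field.index(vocab)                                     # map each dep label to its vocabulary id
tensors = field.as_tensor(field.get_padding_lengths())
# `tensors` is a dict keyed by indexer name, e.g. {"dep_tag": tensor([...])}
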
def test_as_array_produces_token_sequence(self):
    indexer = DepLabelIndexer()
    padded_tokens = indexer.as_padded_tensor({"key": [1, 2, 3, 4, 5]}, {"key": 10}, {})
    assert padded_tokens["key"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]

def test_padding_functions(self):
    indexer = DepLabelIndexer()
    assert indexer.get_padding_lengths(0) == {}

def test_as_array_produces_token_sequence(self):
    indexer = DepLabelIndexer()
    padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
    assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]

def test_as_array_produces_token_sequence(self):
    indexer = DepLabelIndexer()
    padded_tokens = indexer.pad_token_sequence({u'key': [1, 2, 3, 4, 5]}, {u'key': 10}, {})
    assert padded_tokens == {u'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}

def test_as_array_produces_token_sequence(self):
    indexer = DepLabelIndexer()
    padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
    assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}

def test_padding_functions(self):
    indexer = DepLabelIndexer()
    assert indexer.get_padding_token() == 0
    assert indexer.get_padding_lengths(0) == {}

train_dataset_folder = "C:/Users/t-ofarvi/PycharmProjects/UCCA_Dataset_29-06-09/tryout"  # "C:/Users/t-ofarvi/Desktop/train_allen"
validation_dataset_folder = "C:/Users/t-ofarvi/PycharmProjects/UCCA_Dataset_29-06-09/tryout-validation"  # "C:/Users/t-ofarvi/Desktop/dev_allen"
model_dir = "C:/Users/t-ofarvi/PycharmProjects/tryout-model"
vocab_dir = f'{model_dir}/vocabulary'

# NOTE: The word tokenizer is a SpaCy tokenizer, which is a little different from the BERT tokenizer.
# This was done for convenience.
word_tokenizer = SpacyMultilingualWhitespaceWordSplitter()
bert_indexer = PretrainedBertIndexer(pretrained_model=bert_mode,
                                     do_lowercase=bert_do_lowercase,
                                     truncate_long_sequences=False)
word_indexer = {
    "bert": bert_indexer,
    "deps": DepLabelIndexer(namespace="deps_tags"),
    "ner": NerTagIndexer(),
    "pos": PosTagIndexer(),
    "lang": LanguageIndexer(),
}

train_ds, validation_ds = (
    UccaSpanParserDatasetReader(word_tokenizer, word_indexer).read(folder)
    for folder in [train_dataset_folder, validation_dataset_folder])

if os.path.exists(vocab_dir):
    vocab = Vocabulary.from_files(vocab_dir)
else:
    vocab = Vocabulary.from_instances(itertools.chain(train_ds, validation_ds))
    vocab.save_to_files(vocab_dir)
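
# A hedged sketch (not part of the original script) of how the datasets and vocabulary
# built above are typically wired into training with the AllenNLP 0.x API; `model` is a
# hypothetical placeholder for the project's actual UCCA span parser Model instance.
import torch
from allennlp.data.iterators import BasicIterator
from allennlp.training.trainer import Trainer

iterator = BasicIterator(batch_size=8)
iterator.index_with(vocab)                           # batches get indexed with the saved vocabulary

trainer = Trainer(model=model,                       # hypothetical Model instance
                  optimizer=torch.optim.Adam(model.parameters()),
                  iterator=iterator,
                  train_dataset=train_ds,
                  validation_dataset=validation_ds,
                  serialization_dir=model_dir,
                  num_epochs=10)
trainer.train()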