def test_from_dataset_respects_max_vocab_size_single_int(self):
    max_vocab_size = 1
    vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
    words = vocab.get_index_to_token_vocabulary().values()
    # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
    assert len(words) == max_vocab_size + 2

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert len(words) == 5
def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("this") self.vocab.add_token_to_namespace("is") self.vocab.add_token_to_namespace("a") self.vocab.add_token_to_namespace("sentence") self.vocab.add_token_to_namespace(".") self.token_indexer = {"tokens": SingleIdTokenIndexer()} self.instances = self.get_instances() super(TestDataset, self).setUp()
def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("sentence", namespace='words') self.vocab.add_token_to_namespace("A", namespace='words') self.vocab.add_token_to_namespace("A", namespace='characters') self.vocab.add_token_to_namespace("s", namespace='characters') self.vocab.add_token_to_namespace("e", namespace='characters') self.vocab.add_token_to_namespace("n", namespace='characters') self.vocab.add_token_to_namespace("t", namespace='characters') self.vocab.add_token_to_namespace("c", namespace='characters') super(TestTextField, self).setUp()
def test_unknown_token(self):
    # pylint: disable=protected-access
    # We're putting this behavior in a test so that the behavior is documented. There is
    # solver code that depends in a small way on how we treat the unknown token, so any
    # breaking change to this behavior should break a test, so you know you've done something
    # that needs more consideration.
    vocab = Vocabulary()
    oov_token = vocab._oov_token
    oov_index = vocab.get_token_index(oov_token)
    assert oov_index == 1
    assert vocab.get_token_index("unseen word") == oov_index
def test_from_dataset_respects_min_count(self):
    vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4})
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5}, "text2": {"num_tokens": 6}}

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].asnumpy()
        text2 = tensors["text2"]["tokens"].asnumpy()
        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))

    def get_instances(self):
        field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                           self.token_indexer)
        field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [Instance({"text1": field1, "text2": field2}),
                     Instance({"text1": field3, "text2": field4})]
        return instances
def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
    indexer = ELMoTokenCharactersIndexer()
    indices = [indexer.token_to_indices(Token(token), Vocabulary())
               for token in ['Second', '.']]
    padded_tokens = indexer.pad_token_sequence(indices,
                                               desired_num_tokens=3,
                                               padding_lengths={})
    expected_padded_tokens = [
            [259, 84, 102, 100, 112, 111, 101, 260, 261, 261,
             261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
             261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
             261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
             261, 261, 261, 261, 261, 261, 261, 261, 261, 261],
            [259, 47, 260, 261, 261, 261, 261, 261, 261, 261,
             261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
             261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
             261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
             261, 261, 261, 261, 261, 261, 261, 261, 261, 261],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    ]

    assert padded_tokens == expected_padded_tokens
def test_label_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("entailment", namespace="labels")
    vocab.add_token_to_namespace("contradiction", namespace="labels")
    vocab.add_token_to_namespace("neutral", namespace="labels")
    label = LabelField("entailment")
    label.index(vocab)
    tensor = label.as_tensor(label.get_padding_lengths()).asnumpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0]))
def test_bos_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.token_to_indices(Token('<S>'), Vocabulary())
    expected_indices = [259, 257, 260, 261, 261, 261, 261, 261, 261, 261,
                        261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                        261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                        261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                        261, 261, 261, 261, 261, 261, 261, 261, 261, 261]
    assert indices == expected_indices
def test_unicode_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.token_to_indices(Token(chr(256) + 't'), Vocabulary())
    expected_indices = [259, 197, 129, 117, 260, 261, 261, 261, 261, 261,
                        261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                        261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                        261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                        261, 261, 261, 261, 261, 261, 261, 261, 261, 261]
    assert indices == expected_indices
def get_data(options, train_path, vocab_path):
    max_word_length = options['char_cnn']['max_characters_per_token']
    batch_size = options['batch_size']
    steps = options['unroll_steps']

    # Use a character-aware vocabulary when a maximum word length is configured,
    # otherwise fall back to a plain token vocabulary built from the vocab file.
    if max_word_length is not None:
        vocab = UnicodeCharsVocabulary(vocab_path, max_word_length=max_word_length)
    else:
        vocab = Vocabulary(vocab_path)

    data = BidirectionalLMDataset(train_path, vocab)
    data_gen = data.iter_batches(batch_size, steps)
    return data_gen
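A minimal sketch of how get_data might be called, assuming a bilm-style options dict with the keys read above; the option values and the file paths ('data/train_*.txt', 'data/vocab.txt') are illustrative placeholders, not from the original:

# Hypothetical options and paths, for illustration only.
options = {
    'batch_size': 128,
    'unroll_steps': 20,
    'char_cnn': {'max_characters_per_token': 50},
}
data_gen = get_data(options, 'data/train_*.txt', 'data/vocab.txt')
for batch in data_gen:
    pass  # each yielded batch would feed one step of bidirectional LM training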
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
    if getattr(token, 'text_id', None) is not None:
        # `text_id` being set on the token means that we aren't using the vocab, we just use
        # this id instead.
        index = token.text_id
    else:
        text = token.text
        if self.lowercase_tokens:
            text = text.lower()
        index = vocabulary.get_token_index(text, self.namespace)
    return index
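A small usage sketch of this single-id indexer, using the default 'tokens' namespace and assuming, as the getattr check above implies, that Token can carry a text_id attribute:

# Illustrative only: index one token against a small vocabulary.
vocab = Vocabulary()
word_index = vocab.add_token_to_namespace("sentence")
indexer = SingleIdTokenIndexer()
assert indexer.token_to_indices(Token("sentence"), vocab) == word_index
# A token with a precomputed text_id bypasses the vocabulary lookup entirely.
assert indexer.token_to_indices(Token("sentence", text_id=7), vocab) == 7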
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding='utf-8')
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

    vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)
    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
    assert indexed_tokens == indexed_tokens2
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
    capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
    s_index = vocab.add_token_to_namespace("s", namespace='characters')
    e_index = vocab.add_token_to_namespace("e", namespace='characters')
    n_index = vocab.add_token_to_namespace("n", namespace='characters')
    t_index = vocab.add_token_to_namespace("t", namespace='characters')
    c_index = vocab.add_token_to_namespace("c", namespace='characters')

    field = TextField([Token(t) for t in ["A", "sentence"]],
                      {"words": SingleIdTokenIndexer(namespace="words")})
    field.index(vocab)
    # pylint: disable=protected-access
    assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

    field1 = TextField([Token(t) for t in ["A", "sentence"]],
                       {"characters": TokenCharactersIndexer(namespace="characters")})
    field1.index(vocab)
    assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                    [s_index, e_index, n_index, t_index,
                                                     e_index, n_index, c_index, e_index]]

    field2 = TextField([Token(t) for t in ["A", "sentence"]],
                       token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                       "characters": TokenCharactersIndexer(namespace="characters")})
    field2.index(vocab)
    assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
    assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                    [s_index, e_index, n_index, t_index,
                                                     e_index, n_index, c_index, e_index]]
def test_from_dataset_respects_inclusive_embedding_file(self):
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
def test_set_from_file_reads_non_padded_files(self):
    # pylint: disable=protected-access
    vocab_filename = self.TEST_DIR + 'vocab_file'
    with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
        vocab_file.write('B-PERS\n')
        vocab_file.write('I-PERS\n')
        vocab_file.write('O\n')
        vocab_file.write('B-ORG\n')
        vocab_file.write('I-ORG\n')

    vocab = Vocabulary()
    vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags')

    assert vocab.get_token_index("B-PERS", namespace='tags') == 0
    assert vocab.get_token_index("I-PERS", namespace='tags') == 1
    assert vocab.get_token_index("O", namespace='tags') == 2
    assert vocab.get_token_index("B-ORG", namespace='tags') == 3
    assert vocab.get_token_index("I-ORG", namespace='tags') == 4
    assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS"
    assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS"
    assert vocab.get_token_from_index(2, namespace='tags') == "O"
    assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG"
    assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> List[int]:
    indices = []
    if token.text is None:
        raise ConfigurationError('TokenCharactersIndexer needs a tokenizer that retains text')
    for character in self._character_tokenizer.tokenize(token.text):
        if getattr(character, 'text_id', None) is not None:
            # `text_id` being set on the token means that we aren't using the vocab, we just
            # use this id instead.
            index = character.text_id
        else:
            index = vocabulary.get_token_index(character.text, self._namespace)
        indices.append(index)
    return indices
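A short usage sketch of this character-level indexer, mirroring the character assertions in the TextField tests; the 'characters' namespace and the two-letter token are illustrative:

# Illustrative only: one token maps to a list of per-character indices.
vocab = Vocabulary()
a_index = vocab.add_token_to_namespace("a", namespace="characters")
s_index = vocab.add_token_to_namespace("s", namespace="characters")
indexer = TokenCharactersIndexer(namespace="characters")
assert indexer.token_to_indices(Token("as"), vocab) == [a_index, s_index]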
def test_saving_and_loading(self):
    # pylint: disable=protected-access
    vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')

    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")

    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    assert vocab2._non_padded_namespaces == ["a", "c"]

    # Check namespace a.
    assert vocab2.get_vocab_size(namespace='a') == 3
    assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
    assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
    assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
    assert vocab2.get_token_index('a0', namespace='a') == 0
    assert vocab2.get_token_index('a1', namespace='a') == 1
    assert vocab2.get_token_index('a2', namespace='a') == 2

    # Check namespace b.
    assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
    assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
    assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
    assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
    assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
    assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
    assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
    assert vocab2.get_token_index('b2', namespace='b') == 2
    assert vocab2.get_token_index('b3', namespace='b') == 3

    # Check the dictionaries containing the reverse mapping are identical.
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]

        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters"),
                                          "words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

    def test_as_tensor_handles_words(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"].asnumpy(),
                                                numpy.array([1, 1, 1, 2, 1]))

    def test_as_tensor_handles_longer_lengths(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"].asnumpy(),
                                                numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].asnumpy(),
                                                expected_character_array)

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField([Token(t) for t in ["a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(tensor_dict["words"].asnumpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].asnumpy(),
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
def index(self, vocab: Vocabulary):
    if self._label_id is None:
        self._label_id = vocab.get_token_index(self.label, self._label_namespace)  # type: ignore
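A brief usage sketch of LabelField indexing, mirroring the test_label_field_can_index_with_vocab test above (the label strings are the ones used there):

# Illustrative only: indexing a LabelField fills in its integer label id.
vocab = Vocabulary()
vocab.add_token_to_namespace("entailment", namespace="labels")
vocab.add_token_to_namespace("contradiction", namespace="labels")
label = LabelField("entailment")
label.index(vocab)  # looks up "entailment" in the default "labels" namespace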
def test_add_word_to_index_gives_consistent_results(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1

    # Now add it again, and make sure nothing changes.
    vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1
def test_from_params(self):
    # Save a vocab to check we can load it from_params.
    vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
    vocab = Vocabulary(non_padded_namespaces=["a", "c"])
    vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
    vocab.add_token_to_namespace("a1", namespace="a")
    vocab.add_token_to_namespace("a2", namespace="a")
    vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
    vocab.add_token_to_namespace("b3", namespace="b")
    vocab.save_to_files(vocab_dir)

    params = Params({"directory_path": vocab_dir})
    vocab2 = Vocabulary.from_params(params)
    assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
    assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

    # Test case where we build a vocab from a dataset.
    vocab2 = Vocabulary.from_params(Params({}), self.dataset)
    assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                              1: '@@UNKNOWN@@',
                                                              2: 'a', 3: 'c', 4: 'b'}

    # Test from_params raises when we have neither a dataset nor a vocab directory.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({}))

    # Test from_params raises when there are any other dict keys
    # present apart from 'vocabulary_directory' and we aren't calling from_dataset.
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(Params({"directory_path": vocab_dir,
                                           "min_count": {'tokens': 2}}))
def test_namespaces(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word", namespace='1')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
    assert vocab.get_token_index("word", namespace='1') == word_index
    assert vocab.get_token_from_index(word_index, namespace='1') == "word"
    assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

    # Now add it again, in a different namespace and a different word, and make sure it's like
    # new.
    word2_index = vocab.add_token_to_namespace("word2", namespace='2')
    word_index = vocab.add_token_to_namespace("word", namespace='2')
    assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
    assert vocab.get_token_index("word", namespace='2') == word_index
    assert vocab.get_token_index("word2", namespace='2') == word2_index
    assert vocab.get_token_from_index(word_index, namespace='2') == "word"
    assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
    assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
def test_set_from_file_reads_padded_files(self):
    # pylint: disable=protected-access
    vocab_filename = self.TEST_DIR + 'vocab_file'
    with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
        vocab_file.write('<S>\n')
        vocab_file.write('</S>\n')
        vocab_file.write('<UNK>\n')
        vocab_file.write('a\n')
        vocab_file.write('tricky\x0bchar\n')
        vocab_file.write('word\n')
        vocab_file.write('another\n')

    vocab = Vocabulary()
    vocab.set_from_file(vocab_filename, is_padded=True, oov_token="<UNK>")

    assert vocab._oov_token == DEFAULT_OOV_TOKEN
    assert vocab.get_token_index("random string") == 3
    assert vocab.get_token_index("<S>") == 1
    assert vocab.get_token_index("</S>") == 2
    assert vocab.get_token_index(DEFAULT_OOV_TOKEN) == 3
    assert vocab.get_token_index("a") == 4
    assert vocab.get_token_index("tricky\x0bchar") == 5
    assert vocab.get_token_index("word") == 6
    assert vocab.get_token_index("another") == 7
    assert vocab.get_token_from_index(0) == vocab._padding_token
    assert vocab.get_token_from_index(1) == "<S>"
    assert vocab.get_token_from_index(2) == "</S>"
    assert vocab.get_token_from_index(3) == DEFAULT_OOV_TOKEN
    assert vocab.get_token_from_index(4) == "a"
    assert vocab.get_token_from_index(5) == "tricky\x0bchar"
    assert vocab.get_token_from_index(6) == "word"
    assert vocab.get_token_from_index(7) == "another"