def test_from_instances_exclusive_embeddings_file_inside_archive(self):
    """ Just for ensuring there are no problems when reading pretrained tokens from an archive """
    # Read embeddings file from archive
    archive_path = str(self.TEST_DIR / "embeddings-archive.zip")

    with zipfile.ZipFile(archive_path, 'w') as archive:
        file_path = 'embedding.3d.vec'
        with archive.open(file_path, 'w') as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))
        with archive.open('dummy.vec', 'w') as dummy_file:
            dummy_file.write("c 1.0 2.3 -1.0 3.0\n".encode('utf-8'))

    embeddings_file_uri = format_embeddings_file_uri(archive_path, file_path)
    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)

    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_file_uri},
                                      only_include_pretrained_words=True)

    words = set(vocab.get_index_to_token_vocabulary().values())
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words
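# Note on the test above: format_embeddings_file_uri() combines the archive path
# and the member path into a single URI (roughly "(<archive_path>)#<file_path>",
# as we understand the helper) so that the embedding reader can pick out one
# member file from a multi-file archive; 'dummy.vec' exists only to check that
# the reader doesn't accidentally pick up the other member.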

def test_from_dataset_respects_max_vocab_size_single_int(self):
    max_vocab_size = 1
    vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
    words = vocab.get_index_to_token_vocabulary().values()
    # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
    assert len(words) == max_vocab_size + 2

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert len(words) == 5
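# The expected count of 5 above follows from the fixture: assuming self.dataset
# contains exactly the three distinct tokens 'a', 'b', and 'c' (as the min_count
# test below suggests), the full vocabulary is those three tokens plus the two
# default special tokens.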

def test_from_dataset_respects_min_count(self):
    vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4})
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' not in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset, min_count=None)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
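# min_count maps each namespace to the minimum number of occurrences a token
# needs in order to enter the vocabulary; passing min_count=None disables the
# threshold entirely, which is why all of 'a', 'b', and 'c' survive in the
# second vocabulary.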

def test_from_dataset_respects_inclusive_embedding_file(self):
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

    vocab = Vocabulary.from_instances(self.dataset,
                                      min_count={'tokens': 4},
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' not in words

    vocab = Vocabulary.from_instances(self.dataset,
                                      pretrained_files={'tokens': embeddings_filename},
                                      only_include_pretrained_words=False)
    words = vocab.get_index_to_token_vocabulary().values()
    assert 'a' in words
    assert 'b' in words
    assert 'c' in words
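# Taken together with the archive test above: with
# only_include_pretrained_words=False ("inclusive"), a token is kept if it
# meets min_count OR appears in the pretrained file ('b' survives via the file
# despite its low count), whereas with True ("exclusive") a token must appear
# in the pretrained file AND clear any min_count threshold.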

def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding='utf-8')
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})

    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
    assert indexed_tokens == indexed_tokens2
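# The non-ASCII tokens above ("Øyvind", "für", "汉字") are deliberate: with
# byte_encoding='utf-8' each token is indexed as a sequence of UTF-8 byte ids
# rather than unicode characters, so this exercises multi-byte round-tripping
# through save_to_files()/from_files().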