def test_make_vocab_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    # If extend is False, it is the user's responsibility to make sure that dataset
    # instances will be indexable by the provided vocabulary. At minimum, @@UNKNOWN@@
    # should be present in every namespace for which there could be OOV entries seen
    # in the dataset during indexing. For the 'tokens' namespace, new words will be
    # seen, but 'tokens' has the @@UNKNOWN@@ token; for the 'labels' namespace there
    # is no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
    vocab.add_token_to_namespace('N', namespace='labels')
    vocab.add_token_to_namespace('V', namespace='labels')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = False
    make_vocab_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'
    assert len(tokens) == 3
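# A hedged sketch (not part of the original suite) of the indexing contract the
# comment in the test above relies on: in a padded namespace such as 'tokens',
# Vocabulary.get_token_index falls back to the @@UNKNOWN@@ index for unseen
# words, while 'labels' matches the default non-padded pattern '*labels', has
# no @@UNKNOWN@@, and raises KeyError instead. The function name is hypothetical.
def _sketch_oov_fallback():
    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    unk_index = vocab.get_token_index('@@UNKNOWN@@', namespace='tokens')
    # 'cats' was never added, so it resolves to the @@UNKNOWN@@ index.
    assert vocab.get_token_index('cats', namespace='tokens') == unk_index
    # An unseen label has no @@UNKNOWN@@ to fall back to.
    try:
        vocab.get_token_index('N', namespace='labels')
    except KeyError:
        pass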
def test_make_vocab_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = True
    self.params['vocabulary']['min_count'] = {"tokens": 3}
    make_vocab_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    # Tokens loaded from the existing vocabulary keep their original indices ...
    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'

    # ... and extension adds only the dataset tokens meeting min_count; the
    # pre-existing tokens are retained even though they never occur in the data.
    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                      'some_weird_token_1', 'some_weird_token_2']

    with open(extended_vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]

    labels.sort()
    assert labels == ['N', 'V']
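# A hedged sketch (an assumption based on the public Vocabulary API, not copied
# from make_vocab_from_params itself) of the extension path the test above
# exercises: load the saved files, extend from counted dataset instances with
# the same min_count, then re-save. `instances` is a hypothetical iterable of
# dataset instances; Params is assumed imported from allennlp.common.params.
def _sketch_manual_extension(instances, existing_vocab_path, extended_vocab_path):
    loaded = Vocabulary.from_files(existing_vocab_path)
    loaded.extend_from_instances(Params({'min_count': {'tokens': 3}}),
                                 instances=instances)
    loaded.save_to_files(extended_vocab_path)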