Example #1
0
    def test_make_vocab_without_extension(self):
        """With ``extend: False``, the pre-built vocabulary is copied as-is."""
        existing_dir = self.TEST_DIR / 'existing'
        extended_dir = self.TEST_DIR / 'extended'
        existing_vocab_dir = existing_dir / 'vocabulary'
        extended_vocab_dir = extended_dir / 'vocabulary'

        vocab = Vocabulary()
        for token in ('some_weird_token_1', 'some_weird_token_2'):
            vocab.add_token_to_namespace(token, namespace='tokens')
        # With extend=False it is the user's responsibility to guarantee that
        # dataset instances remain indexable by the supplied vocabulary. Any
        # namespace that may see OOV entries during indexing needs at least
        # @@UNKNOWN@@. The 'tokens' namespace has @@UNKNOWN@@, so unseen words
        # are fine there; 'labels' does not, so 'N' and 'V' must be added
        # up front.
        for label in ('N', 'V'):
            vocab.add_token_to_namespace(label, namespace='labels')
        os.makedirs(existing_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_dir)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_dir
        self.params['vocabulary']['extend'] = False
        make_vocab_from_params(self.params, extended_dir)

        with open(extended_vocab_dir / 'tokens.txt') as token_file:
            tokens = list(map(str.strip, token_file))

        # Exactly the original three entries, in their original order.
        assert tokens == ['@@UNKNOWN@@', 'some_weird_token_1', 'some_weird_token_2']
Example #2
0
 def test_make_vocab_doesnt_overwrite_vocab(self):
     """Building into a non-empty vocabulary directory must fail, not clobber it."""
     vocab_dir = self.TEST_DIR / 'vocabulary'
     os.mkdir(vocab_dir)
     # Seed the directory so it is non-empty.
     with open(vocab_dir / "test.txt", "a+") as handle:
         handle.write("test")
     # A non-empty vocab directory is expected to raise rather than overwrite.
     with pytest.raises(ConfigurationError):
         make_vocab_from_params(self.params, self.TEST_DIR)
Example #3
0
    def test_make_vocab_with_extension(self):
        """With ``extend: True``, new dataset tokens are appended after the
        pre-existing entries of the loaded vocabulary."""
        existing_dir = self.TEST_DIR / 'existing'
        extended_dir = self.TEST_DIR / 'extended'
        existing_vocab_dir = existing_dir / 'vocabulary'
        extended_vocab_dir = extended_dir / 'vocabulary'

        vocab = Vocabulary()
        for token in ('some_weird_token_1', 'some_weird_token_2'):
            vocab.add_token_to_namespace(token, namespace='tokens')
        os.makedirs(existing_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_dir)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_dir
        self.params['vocabulary']['extend'] = True
        self.params['vocabulary']['min_count'] = {"tokens": 3}
        make_vocab_from_params(self.params, extended_dir)

        # The extended vocabulary should have been serialized in full.
        assert set(os.listdir(extended_vocab_dir)) == {
            'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'
        }

        with open(extended_vocab_dir / 'tokens.txt') as token_file:
            tokens = list(map(str.strip, token_file))

        # Pre-existing entries keep their original positions...
        assert tokens[:3] == ['@@UNKNOWN@@', 'some_weird_token_1', 'some_weird_token_2']

        # ...and the dataset's tokens were added on top of them.
        assert sorted(tokens) == [
            '.', '@@UNKNOWN@@', 'animals', 'are', 'some_weird_token_1',
            'some_weird_token_2'
        ]

        with open(extended_vocab_dir / 'labels.txt') as label_file:
            labels = list(map(str.strip, label_file))

        assert sorted(labels) == ['N', 'V']
Example #4
0
    def test_make_vocab_makes_vocab_with_config(self):
        """A ``vocabulary`` config with ``min_count`` filters rare tokens out."""
        vocab_dir = self.TEST_DIR / 'vocabulary'

        self.params['vocabulary'] = {}
        self.params['vocabulary']['min_count'] = {"tokens": 3}

        make_vocab_from_params(self.params, self.TEST_DIR)

        # All three namespace files should have been written.
        assert set(os.listdir(vocab_dir)) == {
            'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'
        }

        with open(vocab_dir / 'tokens.txt') as token_file:
            tokens = list(map(str.strip, token_file))

        # Only tokens meeting the min_count threshold survive.
        assert sorted(tokens) == ['.', '@@UNKNOWN@@', 'animals', 'are']

        with open(vocab_dir / 'labels.txt') as label_file:
            labels = list(map(str.strip, label_file))

        assert sorted(labels) == ['N', 'V']
Example #5
0
    def test_make_vocab_makes_vocab(self):
        """With no ``vocabulary`` config, every dataset token ends up in the vocab."""
        vocab_dir = self.TEST_DIR / 'vocabulary'

        make_vocab_from_params(self.params, self.TEST_DIR)

        # All three namespace files should have been written.
        assert set(os.listdir(vocab_dir)) == {
            'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'
        }

        with open(vocab_dir / 'tokens.txt') as token_file:
            tokens = list(map(str.strip, token_file))

        # No min_count filtering, so every word from the dataset is present.
        assert sorted(tokens) == [
            '.', '@@UNKNOWN@@', 'animals', 'are', 'birds', 'cats', 'dogs',
            'snakes'
        ]

        with open(vocab_dir / 'labels.txt') as label_file:
            labels = list(map(str.strip, label_file))

        assert sorted(labels) == ['N', 'V']
Example #6
0
 def test_make_vocab_succeeds_without_vocabulary_key(self):
     # The 'vocabulary' key is optional in the params: building the vocab
     # must succeed (no exception) when it is absent entirely.
     make_vocab_from_params(self.params, self.TEST_DIR)