Пример #1
0
    def test_instance(self):
        sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_word': Counter(), 'my_char': Counter()}
        vocab = Vocabulary()
        glove = ['This', 'is', 'glove', 'sentence', 'vocabulary']
        vocab.extend_from_pretrained_vocab({'glove': glove})
        single_id = SingleIdTokenIndexer(['my_word', 'glove'])
        char = CharTokenIndexer(['my_char'])
        sent = TextField('sentence', sentence, [single_id, char])
        data = Instance([sent])

        # Test count_vocab_items()
        data.count_vocab_items(counter)
        assert counter['my_word']['This'] == 1
        assert counter['my_word']['is'] == 2
        assert counter['my_word']['That'] == 0
        assert counter['my_char']['s'] == 5
        assert counter['my_char']['T'] == 1
        assert counter['my_char']['t'] == 3
        assert counter['my_char']['A'] == 0

        vocab.extend_from_counter(counter)

        # Test index()
        result = data.index_fields(vocab)
        assert result['sentence']['glove'] == [2, 3, 3, 0, 0, 0, 5]
        assert result['sentence']['my_word'] == [2, 3, 3, 4, 4, 5, 6]
        assert result['sentence']['my_char'][0] == [2, 3, 4, 5]  # 'This'
        assert result['sentence']['my_char'][1] == result['sentence'][
            'my_char'][2]
        assert result['sentence']['my_char'][3] == result['sentence'][
            'my_char'][4]
Пример #2
0
    def test_single_id_token_indexer(self):
        sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_word': Counter()}
        vocab = Vocabulary()
        glove = ['This', 'is', 'glove', 'sentence', 'vocabulary']
        vocab.extend_from_pretrained_vocab({'glove': glove})
        indexer = SingleIdTokenIndexer(['my_word', 'glove'])
        sent = TextField('sentence', sentence, [indexer])

        # Test count_vocab_items()
        sent.count_vocab_items(counter)
        assert counter['my_word']['This'] == 1
        assert counter['my_word']['is'] == 2
        assert counter['my_word']['That'] == 0

        vocab.extend_from_counter(counter)

        # Test index()
        sent.index(vocab)
        assert sent.indexes['glove'] == [2, 3, 3, 0, 0, 0, 5]
        assert sent.indexes['my_word'] == [2, 3, 3, 4, 4, 5, 6]
Пример #3
0
    def test_char_token_indexer(self):
        sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_char': Counter()}
        vocab = Vocabulary()
        glove = ['a', 'b', 'c', 'd', 'e']
        vocab.extend_from_pretrained_vocab({'glove': glove})
        indexer = CharTokenIndexer(['my_char', 'glove'])
        sent = TextField('sentence', sentence, [indexer])

        # Test count_vocab_items()
        sent.count_vocab_items(counter)
        assert counter['my_char']['s'] == 5
        assert counter['my_char']['T'] == 1
        assert counter['my_char']['t'] == 3
        assert counter['my_char']['A'] == 0

        vocab.extend_from_counter(counter)

        # Test index()
        sent.index(vocab)
        assert sent.indexes['glove'][0] == [0, 0, 0, 0] # 'This'
        assert sent.indexes['glove'][3] == [2]  # 'a'
        assert sent.indexes['my_char'][0] == [2, 3, 4, 5] # 'This'
Пример #4
0
    def test_extend_from_counter(self):
        vocab = Vocabulary()

        # Test extend a vocabulary from a simple counter
        counter = {'w': Counter(["This", "is", "a", "test", "sentence", '.'])}
        vocab.extend_from_counter(counter)
        assert vocab.get_token_index('a', 'w') == 4
        assert vocab.get_token_index('.', 'w') == 7
        assert vocab.get_token_index('That', 'w') == 0

        # Test extend a vocabulary from a counter with min_count
        counter = {'w_m': Counter(['This', 'is', 'is'])}
        min_count = {'w_m': 2}
        vocab.extend_from_counter(counter, min_count)
        assert vocab.get_token_index('is', 'w_m') == 2
        assert vocab.get_token_index('This', 'w_m') == 0
        assert vocab.get_token_index('That', 'w_m') == 0

        # Test extend a vocabulary from a counter without oov token
        counter = {'w_nounk': Counter(['This', 'is'])}
        vocab.extend_from_counter(counter, no_unk_namespace={
            'w_nounk',
        })
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('That', 'w_nounk')
        assert excinfo.type == RuntimeError
        assert vocab.get_token_index('This', 'w_nounk') == 1

        # Test extend a vocabulary from a counter without pad & unk token
        counter = {'w_nounk_nopad': Counter(['This', 'is', 'a'])}
        vocab.extend_from_counter(counter,
                                  no_unk_namespace={'w_nounk_nopad'},
                                  no_pad_namespace={'w_nounk_nopad'})
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('That', 'w_nounk_nopad')
        assert excinfo.type == RuntimeError
        assert vocab.get_token_index('This', 'w_nounk_nopad') == 0