def test_instance(self):
    """Count and index a TextField wrapped in an Instance.

    The field carries a single-id indexer (namespaces 'my_word' and
    'glove') and a char indexer (namespace 'my_char').
    """
    tokens = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
    counts = {'my_word': Counter(), 'my_char': Counter()}

    vocabulary = Vocabulary()
    pretrained = ['This', 'is', 'glove', 'sentence', 'vocabulary']
    vocabulary.extend_from_pretrained_vocab({'glove': pretrained})

    word_indexer = SingleIdTokenIndexer(['my_word', 'glove'])
    char_indexer = CharTokenIndexer(['my_char'])
    field = TextField('sentence', tokens, [word_indexer, char_indexer])
    instance = Instance([field])

    # Test count_vocab_items()
    instance.count_vocab_items(counts)
    expected_counts = (
        ('my_word', 'This', 1),
        ('my_word', 'is', 2),
        ('my_word', 'That', 0),
        ('my_char', 's', 5),
        ('my_char', 'T', 1),
        ('my_char', 't', 3),
        ('my_char', 'A', 0),
    )
    for namespace, item, expected in expected_counts:
        assert counts[namespace][item] == expected

    vocabulary.extend_from_counter(counts)

    # Test index()
    indexed = instance.index_fields(vocabulary)
    sentence_ids = indexed['sentence']
    # 'a' and 'test' are not in the pretrained list and are expected at 0.
    assert sentence_ids['glove'] == [2, 3, 3, 0, 0, 0, 5]
    assert sentence_ids['my_word'] == [2, 3, 3, 4, 4, 5, 6]

    char_ids = sentence_ids['my_char']
    assert char_ids[0] == [2, 3, 4, 5]  # 'This'
    # Repeated tokens ('is'/'is', 'a'/'a') must get identical char ids.
    assert char_ids[1] == char_ids[2]
    assert char_ids[3] == char_ids[4]
def test_single_id_token_indexer(self):
    """Count and index a TextField that uses only a SingleIdTokenIndexer.

    The indexer is configured with the 'my_word' and 'glove' namespaces.
    """
    tokens = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
    counts = {'my_word': Counter()}

    vocabulary = Vocabulary()
    pretrained = ['This', 'is', 'glove', 'sentence', 'vocabulary']
    vocabulary.extend_from_pretrained_vocab({'glove': pretrained})

    word_indexer = SingleIdTokenIndexer(['my_word', 'glove'])
    field = TextField('sentence', tokens, [word_indexer])

    # Test count_vocab_items()
    field.count_vocab_items(counts)
    word_counts = counts['my_word']
    for word, expected in (('This', 1), ('is', 2), ('That', 0)):
        assert word_counts[word] == expected

    vocabulary.extend_from_counter(counts)

    # Test index()
    field.index(vocabulary)
    # 'a' and 'test' are not in the pretrained list and are expected at 0.
    assert field.indexes['glove'] == [2, 3, 3, 0, 0, 0, 5]
    assert field.indexes['my_word'] == [2, 3, 3, 4, 4, 5, 6]
def test_char_token_indexer(self):
    """Count and index a TextField that uses only a CharTokenIndexer.

    The indexer is configured with the 'my_char' and 'glove' namespaces;
    the pretrained 'glove' list holds single characters here.
    """
    tokens = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
    counts = {'my_char': Counter()}

    vocabulary = Vocabulary()
    pretrained_chars = ['a', 'b', 'c', 'd', 'e']
    vocabulary.extend_from_pretrained_vocab({'glove': pretrained_chars})

    char_indexer = CharTokenIndexer(['my_char', 'glove'])
    field = TextField('sentence', tokens, [char_indexer])

    # Test count_vocab_items()
    field.count_vocab_items(counts)
    char_counts = counts['my_char']
    for char, expected in (('s', 5), ('T', 1), ('t', 3), ('A', 0)):
        assert char_counts[char] == expected

    vocabulary.extend_from_counter(counts)

    # Test index()
    field.index(vocabulary)
    glove_ids = field.indexes['glove']
    # None of 'T', 'h', 'i', 's' is in the pretrained list.
    assert glove_ids[0] == [0, 0, 0, 0]  # 'This'
    assert glove_ids[3] == [2]  # 'a'
    assert field.indexes['my_char'][0] == [2, 3, 4, 5]  # 'This'
def test_extend_from_counter(self):
    """Exercise Vocabulary.extend_from_counter in four configurations.

    Covers a plain counter, a counter filtered by min_count, a namespace
    listed in no_unk_namespace, and a namespace listed in both
    no_unk_namespace and no_pad_namespace.
    """
    vocabulary = Vocabulary()

    # Test extend a vocabulary from a simple counter
    plain = {'w': Counter(["This", "is", "a", "test", "sentence", '.'])}
    vocabulary.extend_from_counter(plain)
    for token, expected in (('a', 4), ('.', 7), ('That', 0)):
        assert vocabulary.get_token_index(token, 'w') == expected

    # Test extend a vocabulary from a counter with min_count
    thresholded = {'w_m': Counter(['This', 'is', 'is'])}
    thresholds = {'w_m': 2}
    vocabulary.extend_from_counter(thresholded, thresholds)
    # 'This' occurs once, below the threshold of 2, so it is expected at 0
    # just like the never-seen 'That'.
    for token, expected in (('is', 2), ('This', 0), ('That', 0)):
        assert vocabulary.get_token_index(token, 'w_m') == expected

    # Test extend a vocabulary from a counter without oov token
    without_unk = {'w_nounk': Counter(['This', 'is'])}
    vocabulary.extend_from_counter(without_unk, no_unk_namespace={'w_nounk'})
    with pytest.raises(RuntimeError) as excinfo:
        vocabulary.get_token_index('That', 'w_nounk')
    assert excinfo.type == RuntimeError
    assert vocabulary.get_token_index('This', 'w_nounk') == 1

    # Test extend a vocabulary from a counter without pad & unk token
    bare_namespace = 'w_nounk_nopad'
    without_specials = {bare_namespace: Counter(['This', 'is', 'a'])}
    vocabulary.extend_from_counter(
        without_specials,
        no_unk_namespace={bare_namespace},
        no_pad_namespace={bare_namespace},
    )
    with pytest.raises(RuntimeError) as excinfo:
        vocabulary.get_token_index('That', bare_namespace)
    assert excinfo.type == RuntimeError
    assert vocabulary.get_token_index('This', bare_namespace) == 0