Example #1
    def test_instance(self):
        sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_word': Counter(), 'my_char': Counter()}
        vocab = Vocabulary()
        glove = ['This', 'is', 'glove', 'sentence', 'vocabulary']
        vocab.extend_from_pretrained_vocab({'glove': glove})
        single_id = SingleIdTokenIndexer(['my_word', 'glove'])
        char = CharTokenIndexer(['my_char'])
        sent = TextField('sentence', sentence, [single_id, char])
        data = Instance([sent])

        # Test count_vocab_items()
        data.count_vocab_items(counter)
        assert counter['my_word']['This'] == 1
        assert counter['my_word']['is'] == 2
        assert counter['my_word']['That'] == 0
        assert counter['my_char']['s'] == 5
        assert counter['my_char']['T'] == 1
        assert counter['my_char']['t'] == 3
        assert counter['my_char']['A'] == 0

        vocab.extend_from_counter(counter)

        # Test index()
        result = data.index_fields(vocab)
        assert result['sentence']['glove'] == [2, 3, 3, 0, 0, 0, 5]
        assert result['sentence']['my_word'] == [2, 3, 3, 4, 4, 5, 6]
        assert result['sentence']['my_char'][0] == [2, 3, 4, 5]  # 'This'
        assert result['sentence']['my_char'][1] == result['sentence'][
            'my_char'][2]
        assert result['sentence']['my_char'][3] == result['sentence'][
            'my_char'][4]
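
The expected ids above follow from the vocabulary layout shown in Example #5 below: each default namespace reserves two ids for special tokens before the real entries, and an out-of-vocabulary token maps to id 0. A minimal plain-Python sketch of the 'glove' lookup (an illustration, not the library's implementation):

# Plain-Python illustration of the 'glove' indexing asserted above;
# ids 0 and 1 are assumed reserved for special tokens, so real entries start at 2.
glove = ['This', 'is', 'glove', 'sentence', 'vocabulary']
token2id = {tok: i + 2 for i, tok in enumerate(glove)}
sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
assert [token2id.get(tok, 0) for tok in sentence] == [2, 3, 3, 0, 0, 0, 5]
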
Example #2
def main():
    # Configuration file processing
    ...

    # DyNet setting
    ...

    # Build the datasets for training
    ## Build data reader
    data_reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rpos**\t_\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',)
    ## Build vocabulary with pretrained glove
    vocabulary = Vocabulary()
    g_word, _ = glove_reader(cfg.GLOVE)
    pretrained_vocabs = {'glove': g_word}
    vocabulary.extend_from_pretrained_vocab(pretrained_vocabs)
    ## Set up datasets
    datasets_settings = {
        'train': DatasetSetting(cfg.TRAIN, True),
        'dev': DatasetSetting(cfg.DEV, True),
        'test': DatasetSetting(cfg.TEST, True),}
    datasets = SingleTaskDataset(vocabulary, datasets_settings, data_reader)
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    datasets.build_dataset(
        counters, no_pad_namespace={'rel'}, no_unk_namespace={'rel'})

    # Build model
    ...

    # Train model
    train_batch = datasets.get_batches('train', cfg.TRAIN_BATCH_SIZE, True, cmp, True)
    valid_batch = datasets.get_batches('dev', cfg.TEST_BATCH_SIZE, True, cmp, False)
    test_batch  = datasets.get_batches('test', cfg.TEST_BATCH_SIZE, True, cmp, False)
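
The configuration, DyNet, and model-building steps are elided in this example. The `cfg` object produced by the configuration step is used throughout `main()`, and `cmp` is presumably a batch sort key defined elsewhere in the same file. A hypothetical stand-in for `cfg` (only the attributes the example references, with placeholder values rather than settings from the original code):

from types import SimpleNamespace

# Hypothetical configuration object; attribute names are the ones main() uses,
# all values are placeholders.
cfg = SimpleNamespace(
    GLOVE='data/glove.6B.100d.txt',   # pretrained GloVe vectors
    TRAIN='data/train.conll',         # training set (tab-separated, read by PTBReader)
    DEV='data/dev.conll',             # development set
    TEST='data/test.conll',           # test set
    TRAIN_BATCH_SIZE=32,
    TEST_BATCH_SIZE=64,
)
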
Example #3
    def test_single_id_token_indexer(self):
        sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_word': Counter()}
        vocab = Vocabulary()
        glove = ['This', 'is', 'glove', 'sentence', 'vocabulary']
        vocab.extend_from_pretrained_vocab({'glove': glove})
        indexer = SingleIdTokenIndexer(['my_word', 'glove'])
        sent = TextField('sentence', sentence, [indexer])

        # Test count_vocab_items()
        sent.count_vocab_items(counter)
        assert counter['my_word']['This'] == 1
        assert counter['my_word']['is'] == 2
        assert counter['my_word']['That'] == 0

        vocab.extend_from_counter(counter)

        # Test index()
        sent.index(vocab)
        assert sent.indexes['glove'] == [2, 3, 3, 0, 0, 0, 5]
        assert sent.indexes['my_word'] == [2, 3, 3, 4, 4, 5, 6]
Example #4
    def test_char_token_indexer(self):
        sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_char': Counter()}
        vocab = Vocabulary()
        glove = ['a', 'b', 'c', 'd', 'e']
        vocab.extend_from_pretrained_vocab({'glove': glove})
        indexer = CharTokenIndexer(['my_char', 'glove'])
        sent = TextField('sentence', sentence, [indexer])

        # Test count_vocab_items()
        sent.count_vocab_items(counter)
        assert counter['my_char']['s'] == 5
        assert counter['my_char']['T'] == 1
        assert counter['my_char']['t'] == 3
        assert counter['my_char']['A'] == 0

        vocab.extend_from_counter(counter)

        # Test index()
        sent.index(vocab)
        assert sent.indexes['glove'][0] == [0, 0, 0, 0] # 'This'
        assert sent.indexes['glove'][3] == [2]  # 'a'
        assert sent.indexes['my_char'][0] == [2, 3, 4, 5] # 'This'
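
In this example the 'glove' namespace contains only the single characters 'a' through 'e', so every character of 'This' falls back to id 0 while the token 'a' maps to [2]. A plain-Python sketch of the per-character lookup (an illustration, not the library's implementation):

# Per-character indexing against a character-level namespace; ids 0 and 1
# are assumed reserved, so 'a' starts at 2 and unknown characters map to 0.
glove_chars = ['a', 'b', 'c', 'd', 'e']
char2id = {ch: i + 2 for i, ch in enumerate(glove_chars)}
sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
indexed = [[char2id.get(ch, 0) for ch in token] for token in sentence]
assert indexed[0] == [0, 0, 0, 0]  # 'This'
assert indexed[3] == [2]           # 'a'
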
Example #5
    def test_extend_from_pretrained_vocab(self):
        vocab = Vocabulary()

        # Test extending a vocabulary from a simple pretrained vocab
        pretrained_vocabs = {'glove': ['a', 'b', 'c']}
        vocab.extend_from_pretrained_vocab(pretrained_vocabs)
        assert vocab.get_token_index('a', 'glove') == 2
        assert vocab.get_token_index('c', 'glove') == 4
        assert vocab.get_token_index('d', 'glove') == 0

        # Test extending a vocabulary from a pretrained vocabulary
        # and intersecting it with another vocabulary.
        pretrained_vocabs = {'w2v': ['b', 'c', 'd']}
        vocab.extend_from_pretrained_vocab(pretrained_vocabs, {'w2v': 'glove'})
        assert vocab.get_token_index('b', 'w2v') == 2
        assert vocab.get_token_index('d', 'w2v') == 0
        assert vocab.get_token_from_index(2, 'w2v') == 'b'
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_from_index(4, 'w2v')
        assert excinfo.type == RuntimeError

        # Test extending a vocabulary from a pretrained vocabulary with no OOV (unknown) token
        pretrained_vocabs = {'glove_nounk': ['a', 'b', 'c']}
        vocab.extend_from_pretrained_vocab(pretrained_vocabs,
                                           no_unk_namespace={
                                               'glove_nounk',
                                           })
        assert vocab.get_token_index('a', 'glove_nounk') == 1
        assert vocab.get_token_index('c', 'glove_nounk') == 3
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('d', 'glove_nounk')
        assert excinfo.type == RuntimeError

        # Test extending a vocabulary from a pretrained vocabulary with no OOV (unknown) and no padding tokens
        pretrained_vocabs = {'glove_nounk_nopad': ['a', 'b', 'c']}
        vocab.extend_from_pretrained_vocab(
            pretrained_vocabs,
            no_unk_namespace={
                'glove_nounk_nopad',
            },
            no_pad_namespace={"glove_nounk_nopad"})
        assert vocab.get_token_index('a', 'glove_nounk_nopad') == 0
        assert vocab.get_token_index('c', 'glove_nounk_nopad') == 2
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('d', 'glove_nounk_nopad')
        assert excinfo.type == RuntimeError
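
The assertions above imply a simple layout: each special token (unknown, padding) takes one id in front of the real entries, so the first pretrained token gets id 2 by default, id 1 when the namespace is listed in no_unk_namespace, and id 0 when it is listed in both no_unk_namespace and no_pad_namespace. Looking up an out-of-vocabulary token returns the unknown id when one exists and raises a RuntimeError otherwise; the {'w2v': 'glove'} argument intersects the new namespace with the existing 'glove' one, which is why 'd' is treated as unknown there. A plain-Python sketch of the offset pattern (an illustration, not the library's implementation):

# Each reserved special id shifts the first real entry by one.
def build_index(tokens, num_reserved):
    return {tok: i + num_reserved for i, tok in enumerate(tokens)}

assert build_index(['a', 'b', 'c'], 2)['a'] == 2  # unknown + padding, like 'glove'
assert build_index(['a', 'b', 'c'], 1)['a'] == 1  # padding only, like 'glove_nounk'
assert build_index(['a', 'b', 'c'], 0)['a'] == 0  # no specials, like 'glove_nounk_nopad'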