Пример #1
0
def main():
    """Build the vocabulary, datasets and batches used for training.

    This is an abridged example: the configuration, DyNet set-up and
    model-building steps are elided (``...``).

    NOTE(review): ``cfg`` is read below but never assigned in the visible
    code; presumably the elided configuration step creates it -- confirm.
    """
    # Configuration file processing
    ...

    # DyNet setting
    ...

    # Build the dataset of the training process
    ## Build data reader
    # The reader is given four field names and a tab ``spacer``.  ``root`` is
    # a ten-column, tab-separated line; presumably it is the artificial root
    # token added to each sentence -- confirm against PTBReader.
    data_reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rpos**\t_\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',)
    ## Build vocabulary with pretrained glove
    vocabulary = Vocabulary()
    # glove_reader returns a pair; only the first element is used here (as
    # the pretrained token list for the 'glove' namespace), the second is
    # discarded.
    g_word, _ = glove_reader(cfg.GLOVE)
    pretrained_vocabs = {'glove': g_word}
    vocabulary.extend_from_pretrained_vocab(pretrained_vocabs)
    ## Setup datasets
    # One DatasetSetting per split, keyed by split name.
    # NOTE(review): the meaning of the second positional argument (True for
    # every split) is not visible here -- check DatasetSetting.
    datasets_settings = {
        'train': DatasetSetting(cfg.TRAIN, True),
        'dev': DatasetSetting(cfg.DEV, True),
        'test': DatasetSetting(cfg.TEST, True),}
    datasets = SingleTaskDataset(vocabulary, datasets_settings, data_reader)
    # One Counter per namespace; note that the 'head' field gets no counter.
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    # The 'rel' namespace is built without padding and without unknown tokens.
    datasets.build_dataset(
        counters, no_pad_namespace={'rel'}, no_unk_namespace={'rel'})

    # Build model
    ...

    # Train model
    # Batches for each split; the train split uses TRAIN_BATCH_SIZE and True
    # as its last argument, dev/test use TEST_BATCH_SIZE and False.
    # NOTE(review): ``cmp`` is not a Python 3 builtin and is not defined in
    # the visible code, so it must come from elsewhere -- confirm.  The
    # meaning of the positional boolean arguments is defined by get_batches.
    train_batch = datasets.get_batches('train', cfg.TRAIN_BATCH_SIZE, True, cmp, True)
    valid_batch = datasets.get_batches('dev', cfg.TEST_BATCH_SIZE, True, cmp, False)
    test_batch  = datasets.get_batches('test', cfg.TEST_BATCH_SIZE, True, cmp, False)
Пример #2
0
    def tokens_to_indices(self, tokens: List[str],
                          vocab: Vocabulary) -> Dict[str, List[int]]:
        """
        Map every token to one index per related vocabulary.

        Each token is passed through ``self.transform`` and the result is
        looked up with ``vocab.get_token_index`` once for every name in
        ``self.related_vocabs``.

        Parameters
        ----------
        tokens : ``List[str]``
            The tokens to index.
        vocab : ``Vocabulary``
            Supplies ``get_token_index(token, vocab_name)``.

        Returns
        -------
        res : ``Dict[str, List[int]]``
            One list of indices per related vocabulary name; e.g. tokens
            indexed as [w1:5, w2:3, w3:0] yield {'vocab_name' : [5, 3, 0]}.
        """
        return {
            name: [
                vocab.get_token_index(self.transform(word), name)
                for word in tokens
            ]
            for name in self.related_vocabs
        }
Пример #3
0
    def test_instance(self):
        """Count and index a single-field ``Instance``.

        The field carries a single-id indexer (namespaces 'my_word' and
        'glove') and a character indexer (namespace 'my_char').
        """
        words = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_word': Counter(), 'my_char': Counter()}

        vocab = Vocabulary()
        vocab.extend_from_pretrained_vocab(
            {'glove': ['This', 'is', 'glove', 'sentence', 'vocabulary']})

        indexers = [
            SingleIdTokenIndexer(['my_word', 'glove']),
            CharTokenIndexer(['my_char']),
        ]
        data = Instance([TextField('sentence', words, indexers)])

        # Test count_vocab_items()
        data.count_vocab_items(counter)
        expected_counts = [
            ('my_word', 'This', 1),
            ('my_word', 'is', 2),
            ('my_word', 'That', 0),
            ('my_char', 's', 5),
            ('my_char', 'T', 1),
            ('my_char', 't', 3),
            ('my_char', 'A', 0),
        ]
        for namespace, item, count in expected_counts:
            assert counter[namespace][item] == count

        vocab.extend_from_counter(counter)

        # Test index()
        result = data.index_fields(vocab)
        assert result['sentence']['glove'] == [2, 3, 3, 0, 0, 0, 5]
        assert result['sentence']['my_word'] == [2, 3, 3, 4, 4, 5, 6]
        assert result['sentence']['my_char'][0] == [2, 3, 4, 5]  # 'This'
        # Repeated words ('is'/'is', 'a'/'a') must get equal character indices.
        for left, right in ((1, 2), (3, 4)):
            assert (result['sentence']['my_char'][left]
                    == result['sentence']['my_char'][right])
    def __init__(self, model, cfg, vocabulary: Vocabulary):
        """Create the word, tag and GloVe lookup tables.

        All three tables are registered, in that order, in a fresh
        sub-collection of ``model``.

        Parameters
        ----------
        model :
            Object providing ``add_subcollection()``.
        cfg :
            Configuration; ``WORD_DIM``, ``TAG_DIM`` and ``GLOVE`` are read.
        vocabulary : ``Vocabulary``
            Supplies the sizes of the 'word', 'tag' and 'glove' namespaces.
        """
        pc = model.add_subcollection()

        # Word table: starts as all zeros, one row per 'word' entry.
        word_shape = (vocabulary.get_vocab_size('word'), cfg.WORD_DIM)
        self.wlookup = pc.lookup_parameters_from_numpy(
            np.zeros(word_shape, dtype=np.float32))

        # Tag table: standard-normal initial values.
        tag_rows = vocabulary.get_vocab_size('tag')
        tag_init = np.random.randn(tag_rows, cfg.TAG_DIM)
        self.tlookup = pc.lookup_parameters_from_numpy(
            tag_init.astype(np.float32))

        # GloVe table: two all-zero rows are placed before the pretrained
        # vectors (presumably the slots of the two special tokens -- confirm
        # against Vocabulary), then everything is divided by the standard
        # deviation of the whole matrix, zero rows included.
        _, glove_vec = glove_reader(cfg.GLOVE)
        zero_row = [0.0] * len(glove_vec[0])
        # The result of this call is not used; it is retained only so the
        # sequence of calls on ``vocabulary`` is unchanged.
        vocabulary.get_vocab_size('glove')
        glove_rows = [zero_row, zero_row] + glove_vec
        scaled = np.array(glove_rows, dtype=np.float32) / np.std(glove_rows)
        self.glookup = pc.lookup_parameters_from_numpy(
            scaled.astype(np.float32))

        self.pc, self.cfg = pc, cfg
        self.vocabulary = vocabulary
        self.spec = (cfg, vocabulary)
        self.token_dim = cfg.WORD_DIM + cfg.TAG_DIM
Пример #5
0
    def tokens_to_indices(self, tokens: List[str],
                          vocab: Vocabulary) -> Dict[str, List[List[int]]]:
        """
        Map every token to a list of per-character indices.

        Each character of each token is passed through ``self.transform`` and
        the result is looked up with ``vocab.get_token_index`` once for every
        name in ``self.related_vocabs``.

        Parameters
        ----------
        tokens : ``List[str]``
            The tokens to index; each one is iterated item by item.
        vocab : ``Vocabulary``
            Supplies ``get_token_index(item, vocab_name)``.

        Returns
        -------
        res : ``Dict[str, List[List[int]]]``
            For every related vocabulary name, one inner list per token
            holding the indices of that token's characters.
        """
        return {
            name: [
                [
                    vocab.get_token_index(self.transform(char), name)
                    for char in word
                ]
                for word in tokens
            ]
            for name in self.related_vocabs
        }
Пример #6
0
    def test_single_id_token_indexer(self):
        """Check counting and indexing of a ``TextField`` with a single-id indexer.

        The indexer is tied to the counted 'my_word' namespace and to the
        pretrained 'glove' namespace.
        """
        words = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_word': Counter()}

        vocab = Vocabulary()
        vocab.extend_from_pretrained_vocab(
            {'glove': ['This', 'is', 'glove', 'sentence', 'vocabulary']})

        sent = TextField(
            'sentence', words, [SingleIdTokenIndexer(['my_word', 'glove'])])

        # Test count_vocab_items()
        sent.count_vocab_items(counter)
        for word, count in (('This', 1), ('is', 2), ('That', 0)):
            assert counter['my_word'][word] == count

        vocab.extend_from_counter(counter)

        # Test index()
        sent.index(vocab)
        # Words missing from the pretrained list are expected at index 0.
        assert sent.indexes['glove'] == [2, 3, 3, 0, 0, 0, 5]
        assert sent.indexes['my_word'] == [2, 3, 3, 4, 4, 5, 6]
Пример #7
0
    def test_char_token_indexer(self):
        """Check counting and indexing of a ``TextField`` with a character indexer.

        The indexer is tied to the counted 'my_char' namespace and to a
        pretrained 'glove' namespace that holds the letters 'a' to 'e'.
        """
        words = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_char': Counter()}

        vocab = Vocabulary()
        vocab.extend_from_pretrained_vocab(
            {'glove': ['a', 'b', 'c', 'd', 'e']})

        sent = TextField(
            'sentence', words, [CharTokenIndexer(['my_char', 'glove'])])

        # Test count_vocab_items()
        sent.count_vocab_items(counter)
        for char, count in (('s', 5), ('T', 1), ('t', 3), ('A', 0)):
            assert counter['my_char'][char] == count

        vocab.extend_from_counter(counter)

        # Test index()
        sent.index(vocab)
        glove_indexes = sent.indexes['glove']
        assert glove_indexes[0] == [0, 0, 0, 0]  # 'This'
        assert glove_indexes[3] == [2]  # 'a'
        assert sent.indexes['my_char'][0] == [2, 3, 4, 5]  # 'This'
Пример #8
0
    def test_vocabulary(self):
        """Build every namespace through the ``Vocabulary`` constructor.

        Covers pretrained and counted namespaces, with and without the
        unknown/padding entries, plus a ``min_count`` filter and an
        intersection of 'w2v' with 'glove'.
        """
        vocab = Vocabulary(
            counters={
                'w': Counter(["This", "is", "a", "test", "sentence", '.']),
                'w_m': Counter(['This', 'is', 'is']),
                'w_nounk': Counter(['This', 'is']),
                'w_nounk_nopad': Counter(['This', 'is', 'a']),
            },
            min_count={'w_m': 2},
            pretrained_vocab={
                'glove': ['a', 'b', 'c'],
                'w2v': ['b', 'c', 'd'],
                'glove_nounk': ['a', 'b', 'c'],
                'glove_nounk_nopad': ['a', 'b', 'c'],
            },
            intersection_vocab={'w2v': 'glove'},
            no_pad_namespace={'glove_nounk_nopad', 'w_nounk_nopad'},
            no_unk_namespace={
                'glove_nounk', 'w_nounk', 'glove_nounk_nopad', 'w_nounk_nopad',
            })

        def check_indices(namespace, expected):
            # Assert each (token, index) pair in order.
            for token, index in expected:
                assert vocab.get_token_index(token, namespace) == index

        def check_rejected(token, namespace):
            # Looking the token up must raise exactly RuntimeError.
            with pytest.raises(RuntimeError) as excinfo:
                vocab.get_token_index(token, namespace)
            assert excinfo.type == RuntimeError

        # Test glove
        print(vocab.get_vocab_size('glove'))
        check_indices('glove', [('a', 2), ('c', 4), ('d', 0)])

        # Test w2v
        check_indices('w2v', [('b', 2), ('d', 0)])
        assert vocab.get_token_from_index(2, 'w2v') == 'b'
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_from_index(4, 'w2v')
        assert excinfo.type == RuntimeError

        # Test glove_nounk
        check_indices('glove_nounk', [('a', 1), ('c', 3)])
        check_rejected('d', 'glove_nounk')

        # Test glove_nounk_nopad
        check_indices('glove_nounk_nopad', [('a', 0), ('c', 2)])
        check_rejected('d', 'glove_nounk_nopad')

        # Test w
        check_indices('w', [('a', 4), ('.', 7), ('That', 0)])

        # Test w_m
        check_indices('w_m', [('is', 2), ('This', 0), ('That', 0)])

        # Test w_nounk
        check_rejected('That', 'w_nounk')
        check_indices('w_nounk', [('This', 1)])

        # Test w_nounk_nopad
        check_rejected('That', 'w_nounk_nopad')
        check_indices('w_nounk_nopad', [('This', 0)])
Пример #9
0
    def test_extend_from_pretrained_vocab(self):
        """Grow one ``Vocabulary`` from pretrained lists, one namespace at a time."""
        vocab = Vocabulary()

        def check_indices(namespace, expected):
            # Assert each (token, index) pair in order.
            for token, index in expected:
                assert vocab.get_token_index(token, namespace) == index

        def check_rejected(token, namespace):
            # Looking the token up must raise exactly RuntimeError.
            with pytest.raises(RuntimeError) as excinfo:
                vocab.get_token_index(token, namespace)
            assert excinfo.type == RuntimeError

        # Plain pretrained vocabulary: first token expected at index 2,
        # a token outside the list at index 0.
        vocab.extend_from_pretrained_vocab({'glove': ['a', 'b', 'c']})
        check_indices('glove', [('a', 2), ('c', 4), ('d', 0)])

        # Pretrained vocabulary intersected with the existing 'glove' one.
        vocab.extend_from_pretrained_vocab(
            {'w2v': ['b', 'c', 'd']}, {'w2v': 'glove'})
        check_indices('w2v', [('b', 2), ('d', 0)])
        assert vocab.get_token_from_index(2, 'w2v') == 'b'
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_from_index(4, 'w2v')
        assert excinfo.type == RuntimeError

        # Pretrained vocabulary without an unknown token.
        vocab.extend_from_pretrained_vocab(
            {'glove_nounk': ['a', 'b', 'c']},
            no_unk_namespace={'glove_nounk'})
        check_indices('glove_nounk', [('a', 1), ('c', 3)])
        check_rejected('d', 'glove_nounk')

        # Pretrained vocabulary without unknown and padding tokens.
        vocab.extend_from_pretrained_vocab(
            {'glove_nounk_nopad': ['a', 'b', 'c']},
            no_unk_namespace={'glove_nounk_nopad'},
            no_pad_namespace={"glove_nounk_nopad"})
        check_indices('glove_nounk_nopad', [('a', 0), ('c', 2)])
        check_rejected('d', 'glove_nounk_nopad')
Пример #10
0
    def test_extend_from_counter(self):
        """Grow one ``Vocabulary`` from counters, one namespace at a time."""
        vocab = Vocabulary()

        def check_indices(namespace, expected):
            # Assert each (token, index) pair in order.
            for token, index in expected:
                assert vocab.get_token_index(token, namespace) == index

        def check_rejected(token, namespace):
            # Looking the token up must raise exactly RuntimeError.
            with pytest.raises(RuntimeError) as excinfo:
                vocab.get_token_index(token, namespace)
            assert excinfo.type == RuntimeError

        # Plain counter: an uncounted token is expected at index 0.
        vocab.extend_from_counter(
            {'w': Counter(["This", "is", "a", "test", "sentence", '.'])})
        check_indices('w', [('a', 4), ('.', 7), ('That', 0)])

        # Counter with min_count: 'This' occurs once, below the threshold of
        # 2, and is expected at index 0 like an uncounted token.
        vocab.extend_from_counter(
            {'w_m': Counter(['This', 'is', 'is'])}, {'w_m': 2})
        check_indices('w_m', [('is', 2), ('This', 0), ('That', 0)])

        # Counter without an unknown token.
        vocab.extend_from_counter(
            {'w_nounk': Counter(['This', 'is'])},
            no_unk_namespace={'w_nounk'})
        check_rejected('That', 'w_nounk')
        check_indices('w_nounk', [('This', 1)])

        # Counter without padding and unknown tokens.
        vocab.extend_from_counter(
            {'w_nounk_nopad': Counter(['This', 'is', 'a'])},
            no_unk_namespace={'w_nounk_nopad'},
            no_pad_namespace={'w_nounk_nopad'})
        check_rejected('That', 'w_nounk_nopad')
        check_indices('w_nounk_nopad', [('This', 0)])