Exemplo n.º 1
0
    def test_train(self):
        """End-to-end training smoke test on the CoNLL-2003 English tagging data."""
        here = os.path.dirname(__file__)
        data_dir = os.path.join(here, '../data/conll2003/en/tagging')
        save_dir = os.path.join(here, '../models')  # trained model
        log_dir = os.path.join(here, '../logs')     # checkpoint, tensorboard
        glove_path = os.path.join(here, '../data/glove.6B/glove.6B.100d.txt')

        model_config = ModelConfig()
        training_config = TrainingConfig()

        x_train, y_train = load_data_and_labels(os.path.join(data_dir, 'train.txt'))
        x_valid, y_valid = load_data_and_labels(os.path.join(data_dir, 'valid.txt'))
        x_test, y_test = load_data_and_labels(os.path.join(data_dir, 'test.txt'))

        # np.r_ concatenates the three splits so the word vocabulary covers
        # every split (vocabulary expansion), not just the training data.
        p = prepare_preprocessor(np.r_[x_train, x_valid, x_test], y_train)
        p.save(os.path.join(save_dir, 'preprocessor.pkl'))
        embeddings = load_word_embeddings(p.vocab_word, glove_path,
                                          model_config.word_embedding_size)
        model_config.char_vocab_size = len(p.vocab_char)

        trainer = anago.Trainer(model_config,
                                training_config,
                                checkpoint_path=log_dir,
                                save_path=save_dir,
                                preprocessor=p,
                                embeddings=embeddings)
        trainer.train(x_train, y_train, x_test, y_test)
Exemplo n.º 2
0
    def test_train(self):
        """Train a SeqLabeling model on train/valid splits and persist its weights."""
        model_config = ModelConfig()
        training_config = TrainingConfig()

        x_train, y_train = load_data_and_labels(os.path.join(DATA_ROOT, 'train.txt'))
        x_valid, y_valid = load_data_and_labels(os.path.join(DATA_ROOT, 'valid.txt'))

        preproc = prepare_preprocessor(x_train, y_train)
        preproc.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        embeddings = load_word_embeddings(preproc.vocab_word, EMBEDDING_PATH,
                                          model_config.word_embedding_size)
        model_config.char_vocab_size = len(preproc.vocab_char)

        model = SeqLabeling(model_config, embeddings, len(preproc.vocab_tag))

        trainer = anago.Trainer(model,
                                training_config,
                                checkpoint_path=LOG_ROOT,
                                save_path=SAVE_ROOT,
                                preprocessor=preproc,
                                embeddings=embeddings)
        trainer.train(x_train, y_train, x_valid, y_valid)

        # Persist the trained weights for the eval tests to reload.
        model.save(os.path.join(SAVE_ROOT, 'model_weights.h5'))
Exemplo n.º 3
0
 def test_unknown_word(self):
     """Transforming out-of-vocabulary tokens must not raise."""
     X, y = reader.load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=False).fit(X, y)
     # Neither token below occurs in the fitted training data.
     X, y = p.transform([['$unknownword$', 'あ']], [['O', 'O']])
Exemplo n.º 4
0
 def test_calc_sequence_lengths(self):
     """Derive per-sentence lengths from padded one-hot label output."""
     X, y = reader.load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=True).fit(X, y)
     _, y = p.transform(X, y)
     # argmax decodes one-hot rows back to tag ids; argmin then finds the
     # first position holding the smallest id, which is taken as the
     # sequence length (assumes padding maps to the smallest tag id —
     # TODO confirm against the preprocessor).
     tag_ids = np.argmax(y, -1).astype(np.int32)
     sequence_lengths = np.argmin(tag_ids, -1)
Exemplo n.º 5
0
    def test_predict(self):
        """Model prediction on a small data slice completes without errors."""
        X, y = load_data_and_labels(self.filename)
        X, y = X[:100], y[:100]  # small slice keeps the test fast
        preproc = prepare_preprocessor(X, y)
        self.model_config.char_vocab_size = len(preproc.vocab_char)

        model = SeqLabeling(self.model_config, self.embeddings,
                            ntags=len(preproc.vocab_tag))
        model.predict(preproc.transform(X))
Exemplo n.º 6
0
 def test_transform_with_padding(self):
     """With padding enabled, transformed word/char ids coerce cleanly to int."""
     X, y = reader.load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=True).fit(X, y)
     words, chars = p.transform(X)
     first_word = words[0][0]
     first_char = chars[0][0][0]
     self.assertIsInstance(int(first_word), int)
     self.assertIsInstance(int(first_char), int)
Exemplo n.º 7
0
 def test_transform_only_words(self):
     """Without padding, transform (no labels) yields plain int ids."""
     X, y = reader.load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=False).fit(X, y)
     words, chars = p.transform(X)
     self.assertIsInstance(words[0][0], int)
     self.assertIsInstance(chars[0][0][0], int)
Exemplo n.º 8
0
 def test_preprocessor(self):
     """fit/transform round-trip yields int ids and invertible tag labels."""
     X, y = reader.load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=False).fit(X, y)
     X, y = p.transform(X, y)
     words, chars = X
     self.assertIsInstance(words[0][0], int)
     self.assertIsInstance(chars[0][0][0], int)
     self.assertIsInstance(y[0][0], int)
     # inverse_transform maps tag ids back to their string labels
     self.assertIsInstance(p.inverse_transform(y[0])[0], str)
Exemplo n.º 9
0
    def test_eval(self):
        """Reload a saved preprocessor + weights and evaluate on the test split."""
        x_test, y_test = load_data_and_labels(os.path.join(DATA_ROOT, 'test.txt'))

        preproc = WordPreprocessor.load(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        config = ModelConfig()
        config.vocab_size = len(preproc.vocab_word)
        config.char_vocab_size = len(preproc.vocab_char)

        model = SeqLabeling(config, ntags=len(preproc.vocab_tag))
        model.load(filepath=os.path.join(SAVE_ROOT, 'model_weights.h5'))

        anago.Evaluator(model, preprocessor=preproc).eval(x_test, y_test)
Exemplo n.º 10
0
    def test_load(self):
        """Saving and reloading a fitted preprocessor must preserve transforms.

        The pickle is written under the test's data directory and is always
        removed afterwards — including when an assertion fails — so later
        runs never pick up a stale file. (The original cleanup ran after the
        assertions and was skipped on failure.)
        """
        X, y = reader.load_data_and_labels(self.filename)
        p = WordPreprocessor()
        p.fit(X, y)
        filepath = os.path.join(os.path.dirname(__file__), 'data/preprocessor.pkl')
        p.save(filepath)
        try:
            self.assertTrue(os.path.exists(filepath))

            loaded_p = WordPreprocessor.load(filepath)
            x_test1, y_test1 = p.transform(X, y)
            x_test2, y_test2 = loaded_p.transform(X, y)
            np.testing.assert_array_equal(x_test1[0], x_test2[0])  # word
            np.testing.assert_array_equal(x_test1[1], x_test2[1])  # char
            np.testing.assert_array_equal(y_test1, y_test2)
        finally:
            # Guaranteed cleanup of the temporary pickle.
            if os.path.exists(filepath):
                os.remove(filepath)
Exemplo n.º 11
0
    def test_vocab_init(self):
        """Seeding vocab_init should keep a word out of the UNK bucket."""
        X, y = reader.load_data_and_labels(self.filename)
        unknown_word = 'unknownword'
        X_test, y_test = [[unknown_word]], [['O']]

        # Without seeding, the unseen word maps to the UNK id.
        p = WordPreprocessor(padding=False).fit(X, y)
        X_pred, _ = p.transform(X_test, y_test)
        self.assertEqual(X_pred[0][1], [p.vocab_word[UNK]])

        # After seeding the vocabulary, it receives an id of its own.
        p = WordPreprocessor(vocab_init={unknown_word}, padding=False).fit(X, y)
        X_pred, _ = p.transform(X_test, y_test)
        self.assertNotEqual(X_pred[0][1], [p.vocab_word[UNK]])
Exemplo n.º 12
0
    def test_eval(self):
        """Evaluate previously saved weights through the Evaluator API."""
        here = os.path.dirname(__file__)
        data_dir = os.path.join(here, '../data/conll2003/en/tagging')
        save_dir = os.path.join(here, '../models')

        model_config = ModelConfig()

        x_test, y_test = load_data_and_labels(os.path.join(data_dir, 'test.txt'))

        p = WordPreprocessor.load(os.path.join(save_dir, 'preprocessor.pkl'))
        model_config.vocab_size = len(p.vocab_word)
        model_config.char_vocab_size = len(p.vocab_char)

        evaluator = anago.Evaluator(model_config,
                                    'model_weights.h5',
                                    save_path=save_dir,
                                    preprocessor=p)
        evaluator.eval(x_test, y_test)
Exemplo n.º 13
0
"""
    说明:
        1:pip中已经安装了anago这个包,但是这里的测试代码是依托repositoriesgit库中的这个anago项目
        2:目的是跑通网络,查看比较
"""

import os
from anago import config
from anago.data import reader, preprocess

if __name__ == '__main__':

    # 设置参数
    DATA_ROOT = 'data/conll2003/en/ner'
    SAVE_ROOT = './models'  # trained model
    LOG_ROOT = './logs'  # checkpoint, tensorboard
    embedding_path = './data/glove.6B/glove.6B.100d.txt'
    model_config = config.ModelConfig()
    training_config = config.TrainingConfig()

    # 加载数据
    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
    test_path = os.path.join(DATA_ROOT, 'test.txt')
    x_train, y_train = reader.load_data_and_labels(train_path)
    x_valid, y_valid = reader.load_data_and_labels(valid_path)
    x_test, y_test = reader.load_data_and_labels(test_path)
    print(x_train.shape, y_train.shape)
    print(x_valid.shape, y_valid.shape)
    print(x_test.shape, y_test.shape)
Exemplo n.º 14
0
 def test_data_loading(self):
     # Smoke test: loading the data file must not raise.
     X, y = reader.load_data_and_labels(self.filename)
Exemplo n.º 15
0
 def test_batch_iter(self):
     """batch_iter must yield exactly `steps` batches."""
     sents, labels = load_data_and_labels(self.filename)
     batch_size = 32
     p = prepare_preprocessor(sents, labels)
     steps, batches = batch_iter(list(zip(sents, labels)), batch_size, preprocessor=p)
     # TODO: batches may be an endless generator; exhausting it can hang.
     self.assertEqual(sum(1 for _ in batches), steps)
Exemplo n.º 16
0
 def test_extract(self):
     """Every loaded sentence must come with a matching label sequence."""
     sents, labels = load_data_and_labels(self.filename)
     # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
     # which only prints "False is not true".
     self.assertEqual(len(sents), len(labels))
Exemplo n.º 17
0
 def test_pad_sequences(self):
     """transform with padding enabled completes without errors."""
     X, y = reader.load_data_and_labels(self.filename)
     wp = WordPreprocessor(padding=True)
     X, y = wp.fit(X, y).transform(X, y)
Exemplo n.º 18
0
 def test_to_numpy_array(self):
     """Unpadded transformed labels can still be wrapped as a numpy array."""
     X, y = reader.load_data_and_labels(self.filename)
     wp = WordPreprocessor(padding=False)
     X, y = wp.fit(X, y).transform(X, y)
     y = np.asarray(y)
Exemplo n.º 19
0
 def test_fit(self):
     """Fitting the preprocessor on loaded data must not raise."""
     X, y = reader.load_data_and_labels(self.filename)
     p = WordPreprocessor().fit(X, y)