示例#1
0
    def test_train(self):
        """Train a tagger end-to-end on the CoNLL 2003 tagging data.

        Builds the preprocessor over the train/valid/test words so the
        word vocabulary covers every split, then trains while monitoring
        on the *validation* split (the test split must stay unseen
        during training).
        """
        base_dir = os.path.dirname(__file__)
        DATA_ROOT = os.path.join(base_dir, '../data/conll2003/en/tagging')
        SAVE_ROOT = os.path.join(base_dir, '../models')  # trained model
        LOG_ROOT = os.path.join(base_dir, '../logs')  # checkpoint, tensorboard
        embedding_path = os.path.join(base_dir,
                                      '../data/glove.6B/glove.6B.100d.txt')

        model_config = ModelConfig()
        training_config = TrainingConfig()

        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        test_path = os.path.join(DATA_ROOT, 'test.txt')
        x_train, y_train = load_data_and_labels(train_path)
        x_valid, y_valid = load_data_and_labels(valid_path)
        x_test, y_test = load_data_and_labels(test_path)

        # np.r_ concatenates the splits so the preprocessor's word
        # vocabulary covers all of them (vocabulary expansion).
        p = prepare_preprocessor(np.r_[x_train, x_valid, x_test], y_train)
        p.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                          model_config.word_embedding_size)
        model_config.char_vocab_size = len(p.vocab_char)

        trainer = anago.Trainer(model_config,
                                training_config,
                                checkpoint_path=LOG_ROOT,
                                save_path=SAVE_ROOT,
                                preprocessor=p,
                                embeddings=embeddings)
        # BUG FIX: the original passed the *test* split as the validation
        # data (leaving x_valid/y_valid unused), leaking test data into
        # training-time model selection. Use the validation split, as the
        # sibling test_train does.
        trainer.train(x_train, y_train, x_valid, y_valid)
示例#2
0
    def test_train(self):
        """Train a SeqLabeling model on CoNLL data and persist its weights."""
        model_config = ModelConfig()
        training_config = TrainingConfig()

        x_train, y_train = load_data_and_labels(
            os.path.join(DATA_ROOT, 'train.txt'))
        x_valid, y_valid = load_data_and_labels(
            os.path.join(DATA_ROOT, 'valid.txt'))

        # Fit the preprocessor on the training split only, then save it so
        # tagging/evaluation can reuse the same vocabularies.
        p = prepare_preprocessor(x_train, y_train)
        p.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        embeddings = load_word_embeddings(p.vocab_word, EMBEDDING_PATH,
                                          model_config.word_embedding_size)
        model_config.char_vocab_size = len(p.vocab_char)

        model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))

        trainer = anago.Trainer(model,
                                training_config,
                                checkpoint_path=LOG_ROOT,
                                save_path=SAVE_ROOT,
                                preprocessor=p,
                                embeddings=embeddings)
        trainer.train(x_train, y_train, x_valid, y_valid)

        model.save(os.path.join(SAVE_ROOT, 'model_weights.h5'))
示例#3
0
    def __init__(self,
                 char_emb_size=25,
                 word_emb_size=100,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 dropout=0.5,
                 char_feature=True,
                 crf=True,
                 batch_size=1024,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=15,
                 early_stopping=True,
                 patience=3,
                 train_embeddings=True,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 embeddings=()):
        """Bundle the hyper-parameters into model/training config objects.

        The flat keyword arguments are split into a ModelConfig
        (architecture) and a TrainingConfig (optimization schedule).
        """
        # Architecture hyper-parameters.
        self.model_config = ModelConfig(
            char_emb_size, word_emb_size, char_lstm_units, word_lstm_units,
            dropout, char_feature, crf)
        # Optimization / checkpointing hyper-parameters.
        self.training_config = TrainingConfig(
            batch_size, optimizer, learning_rate, lr_decay, clip_gradients,
            max_epoch, early_stopping, patience, train_embeddings,
            max_checkpoints_to_keep)
        # Built lazily later; None until then.
        self.model = None
        self.p = None  # preprocessor
        self.log_dir = log_dir
        self.embeddings = embeddings
示例#4
0
 def setUp(self):
     """Prepare shared fixtures: configs, dummy embeddings, and data paths."""
     self.model_config = ModelConfig()
     self.training_config = TrainingConfig()
     self.model_config.char_vocab_size = 80
     # All-zero embedding matrix shaped (vocab, word embedding size);
     # real vectors are not needed for these tests.
     vocab_size = 10000
     self.embeddings = np.zeros((vocab_size,
                                 self.model_config.word_embedding_size))
     data_dir = os.path.join(os.path.dirname(__file__),
                             '../data/conll2003/en/ner')
     self.filename = os.path.join(data_dir, 'test.txt')
     self.valid_file = os.path.join(data_dir, 'valid.txt')
示例#5
0
    def setUp(self):
        """Build a Tagger fixture from the persisted preprocessor and model."""
        preprocessor = WordPreprocessor.load(
            os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

        # Vocabulary sizes must match what the saved weights were trained with.
        config = ModelConfig()
        config.vocab_size = len(preprocessor.vocab_word)
        config.char_vocab_size = len(preprocessor.vocab_char)

        model = SeqLabeling(config, ntags=len(preprocessor.vocab_tag))
        model.load(filepath=os.path.join(SAVE_ROOT, 'model_weights.h5'))

        self.tagger = anago.Tagger(model, preprocessor=preprocessor)
        self.sent = 'President Obama is speaking at the White House.'
示例#6
0
    def test_eval(self):
        """Evaluate the saved model on the CoNLL test split."""
        x_test, y_test = load_data_and_labels(
            os.path.join(DATA_ROOT, 'test.txt'))

        # Restore the preprocessor so vocabulary sizes match the weights.
        preprocessor = WordPreprocessor.load(
            os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        config = ModelConfig()
        config.vocab_size = len(preprocessor.vocab_word)
        config.char_vocab_size = len(preprocessor.vocab_char)

        model = SeqLabeling(config, ntags=len(preprocessor.vocab_tag))
        model.load(filepath=os.path.join(SAVE_ROOT, 'model_weights.h5'))

        evaluator = anago.Evaluator(model, preprocessor=preprocessor)
        evaluator.eval(x_test, y_test)
示例#7
0
    def setUp(self):
        """Construct a Tagger from a saved preprocessor and a weight filename."""
        save_dir = os.path.join(os.path.dirname(__file__), '../models')

        config = ModelConfig()

        preprocessor = WordPreprocessor.load(
            os.path.join(save_dir, 'preprocessor.pkl'))
        # Sync vocabulary sizes with the saved preprocessor.
        config.vocab_size = len(preprocessor.vocab_word)
        config.char_vocab_size = len(preprocessor.vocab_char)

        self.tagger = anago.Tagger(config,
                                   'model_weights.h5',
                                   save_path=save_dir,
                                   preprocessor=preprocessor)
        self.sent = 'President Obama is speaking at the White House.'
示例#8
0
    def test_eval(self):
        """Evaluate via an Evaluator built from config + weight filename."""
        base_dir = os.path.dirname(__file__)
        data_dir = os.path.join(base_dir, '../data/conll2003/en/tagging')
        save_dir = os.path.join(base_dir, '../models')

        config = ModelConfig()

        x_test, y_test = load_data_and_labels(
            os.path.join(data_dir, 'test.txt'))

        preprocessor = WordPreprocessor.load(
            os.path.join(save_dir, 'preprocessor.pkl'))
        # Sync vocabulary sizes with the saved preprocessor.
        config.vocab_size = len(preprocessor.vocab_word)
        config.char_vocab_size = len(preprocessor.vocab_char)

        evaluator = anago.Evaluator(config,
                                    'model_weights.h5',
                                    save_path=save_dir,
                                    preprocessor=preprocessor)
        evaluator.eval(x_test, y_test)
示例#9
0
文件: tag.py 项目: janjve/anago
import os
import anago
from anago.preprocess import prepare_preprocessor
from anago.config import ModelConfig, TrainingConfig
from anago.models import SeqLabeling
import numpy as np
from anago.reader import load_word_embeddings, load_data_and_labels

# Paths and configuration for tagging a small data sample with a trained model.
DATA_ROOT = 'data/conll2003/en/ner'
LOAD_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
# NOTE(review): machine-specific absolute path to pretrained word vectors —
# verify this exists before running.
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'
model_config = ModelConfig()

# Despite the variable name, the file tagged here is the small training sample.
test_path = os.path.join(DATA_ROOT, 'train.small.txt')
x_test, y_test = load_data_and_labels(test_path)

# Build word/char/tag vocabularies from the data being tagged.
p = prepare_preprocessor(x_test, y_test)

embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
# Vocabulary sizes must match the preprocessor for the loaded weights to fit.
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

# Restore the trained model and predict tags for the transformed input.
model_path = os.path.join(LOAD_ROOT, 'mymodel.h5')
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
model.load(model_path)
X, y = p.transform(x_test, y_test)
predictions = model.predict(X)

for words, prediction, sentence_length in zip(x_test, predictions, X[2]):