def test_train(self):
    """Train a model end to end on the CoNLL-2003 tagging data.

    Loads the train/valid/test splits, fits and saves a preprocessor,
    loads the word embeddings for its vocabulary and runs
    ``anago.Trainer.train`` with the validation split as held-out data.
    """
    base_dir = os.path.dirname(__file__)
    DATA_ROOT = os.path.join(base_dir, '../data/conll2003/en/tagging')
    SAVE_ROOT = os.path.join(base_dir, '../models')  # trained model
    LOG_ROOT = os.path.join(base_dir, '../logs')  # checkpoint, tensorboard
    embedding_path = os.path.join(base_dir, '../data/glove.6B/glove.6B.100d.txt')

    model_config = ModelConfig()
    training_config = TrainingConfig()

    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
    test_path = os.path.join(DATA_ROOT, 'test.txt')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    # Only the test sentences are needed (for the vocabulary below); the
    # test labels are deliberately not used during training.
    x_test, _ = load_data_and_labels(test_path)

    # np.r_ is for vocabulary expansion: the preprocessor sees the words of
    # every split, while the labels come from the training split only.
    p = prepare_preprocessor(np.r_[x_train, x_valid, x_test], y_train)
    p.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

    embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                      model_config.word_embedding_size)
    model_config.char_vocab_size = len(p.vocab_char)

    trainer = anago.Trainer(model_config, training_config,
                            checkpoint_path=LOG_ROOT, save_path=SAVE_ROOT,
                            preprocessor=p, embeddings=embeddings)
    # Validate on the validation split rather than the test split, so the
    # test labels never take part in the training run.
    trainer.train(x_train, y_train, x_valid, y_valid)
def test_train(self):
    """Train a ``SeqLabeling`` model and save its weights.

    Reads the train and valid splits from ``DATA_ROOT``, fits and saves a
    preprocessor, builds the model with the loaded word embeddings, trains
    it through ``anago.Trainer`` and writes the weights under ``SAVE_ROOT``.
    """
    model_config = ModelConfig()
    training_config = TrainingConfig()

    # Load the two splits used for fitting and validation.
    x_train, y_train = load_data_and_labels(os.path.join(DATA_ROOT, 'train.txt'))
    x_valid, y_valid = load_data_and_labels(os.path.join(DATA_ROOT, 'valid.txt'))

    # Fit the preprocessor on the training split and persist it.
    preprocessor = prepare_preprocessor(x_train, y_train)
    preprocessor.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

    embeddings = load_word_embeddings(
        preprocessor.vocab_word,
        EMBEDDING_PATH,
        model_config.word_embedding_size,
    )
    model_config.char_vocab_size = len(preprocessor.vocab_char)

    model = SeqLabeling(model_config, embeddings, len(preprocessor.vocab_tag))
    trainer = anago.Trainer(
        model,
        training_config,
        checkpoint_path=LOG_ROOT,
        save_path=SAVE_ROOT,
        preprocessor=preprocessor,
        embeddings=embeddings,
    )
    trainer.train(x_train, y_train, x_valid, y_valid)

    model.save(os.path.join(SAVE_ROOT, 'model_weights.h5'))
def __init__(self, char_emb_size=25, word_emb_size=100, char_lstm_units=25,
             word_lstm_units=100, dropout=0.5, char_feature=True, crf=True,
             batch_size=1024, optimizer='adam', learning_rate=0.001,
             lr_decay=0.9, clip_gradients=5.0, max_epoch=15,
             early_stopping=True, patience=3, train_embeddings=True,
             max_checkpoints_to_keep=5, log_dir=None, embeddings=()):
    """Build the model and training configurations from keyword settings.

    The first seven parameters (``char_emb_size`` through ``crf``) are
    forwarded positionally to ``ModelConfig``; the next ten
    (``batch_size`` through ``max_checkpoints_to_keep``) are forwarded
    positionally to ``TrainingConfig``. ``log_dir`` and ``embeddings`` are
    stored unchanged on the instance. ``model`` and ``p`` start as ``None``.
    """
    model_settings = (
        char_emb_size,
        word_emb_size,
        char_lstm_units,
        word_lstm_units,
        dropout,
        char_feature,
        crf,
    )
    training_settings = (
        batch_size,
        optimizer,
        learning_rate,
        lr_decay,
        clip_gradients,
        max_epoch,
        early_stopping,
        patience,
        train_embeddings,
        max_checkpoints_to_keep,
    )

    self.model_config = ModelConfig(*model_settings)
    self.training_config = TrainingConfig(*training_settings)

    # Nothing is built or loaded yet; both are placeholders.
    self.model = None
    self.p = None

    self.log_dir = log_dir
    self.embeddings = embeddings
def setUp(self):
    """Create default configs, an all-zero embedding matrix and data paths."""
    here = os.path.dirname(__file__)

    self.model_config = ModelConfig()
    self.training_config = TrainingConfig()

    vocab_size = 10000
    self.model_config.char_vocab_size = 80

    # Zero matrix with one row per word and one column per embedding dim.
    embedding_shape = (vocab_size, self.model_config.word_embedding_size)
    self.embeddings = np.zeros(embedding_shape)

    self.filename = os.path.join(here, '../data/conll2003/en/ner/test.txt')
    self.valid_file = os.path.join(here, '../data/conll2003/en/ner/valid.txt')
def setUp(self):
    """Restore the saved preprocessor and model and wrap them in a Tagger."""
    preprocessor_path = os.path.join(SAVE_ROOT, 'preprocessor.pkl')
    preprocessor = WordPreprocessor.load(preprocessor_path)

    # Size the model from the vocabularies stored in the preprocessor.
    model_config = ModelConfig()
    model_config.vocab_size = len(preprocessor.vocab_word)
    model_config.char_vocab_size = len(preprocessor.vocab_char)

    model = SeqLabeling(model_config, ntags=len(preprocessor.vocab_tag))
    weights_path = os.path.join(SAVE_ROOT, 'model_weights.h5')
    model.load(filepath=weights_path)

    self.tagger = anago.Tagger(model, preprocessor=preprocessor)
    self.sent = 'President Obama is speaking at the White House.'
def test_eval(self):
    """Evaluate the saved model on the test split under ``DATA_ROOT``."""
    x_test, y_test = load_data_and_labels(os.path.join(DATA_ROOT, 'test.txt'))

    preprocessor = WordPreprocessor.load(
        os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

    # Size the model from the vocabularies stored in the preprocessor.
    model_config = ModelConfig()
    model_config.vocab_size = len(preprocessor.vocab_word)
    model_config.char_vocab_size = len(preprocessor.vocab_char)

    model = SeqLabeling(model_config, ntags=len(preprocessor.vocab_tag))
    model.load(filepath=os.path.join(SAVE_ROOT, 'model_weights.h5'))

    evaluator = anago.Evaluator(model, preprocessor=preprocessor)
    evaluator.eval(x_test, y_test)
def setUp(self):
    """Build a Tagger from a model config, a weights file name and the
    saved preprocessor found in the ``../models`` directory."""
    save_root = os.path.join(os.path.dirname(__file__), '../models')

    model_config = ModelConfig()
    preprocessor = WordPreprocessor.load(
        os.path.join(save_root, 'preprocessor.pkl'))

    # Vocabulary sizes come from the restored preprocessor.
    model_config.vocab_size = len(preprocessor.vocab_word)
    model_config.char_vocab_size = len(preprocessor.vocab_char)

    weights_file = 'model_weights.h5'
    self.tagger = anago.Tagger(
        model_config,
        weights_file,
        save_path=save_root,
        preprocessor=preprocessor,
    )
    self.sent = 'President Obama is speaking at the White House.'
def test_eval(self):
    """Evaluate on the CoNLL-2003 tagging test split using an Evaluator
    built from a model config, a weights file name and the saved
    preprocessor."""
    here = os.path.dirname(__file__)
    data_root = os.path.join(here, '../data/conll2003/en/tagging')
    save_root = os.path.join(here, '../models')

    model_config = ModelConfig()

    x_test, y_test = load_data_and_labels(os.path.join(data_root, 'test.txt'))

    preprocessor = WordPreprocessor.load(
        os.path.join(save_root, 'preprocessor.pkl'))

    # Vocabulary sizes come from the restored preprocessor.
    model_config.vocab_size = len(preprocessor.vocab_word)
    model_config.char_vocab_size = len(preprocessor.vocab_char)

    weights_file = 'model_weights.h5'
    evaluator = anago.Evaluator(
        model_config,
        weights_file,
        save_path=save_root,
        preprocessor=preprocessor,
    )
    evaluator.eval(x_test, y_test)
import os import anago from anago.preprocess import prepare_preprocessor from anago.config import ModelConfig, TrainingConfig from anago.models import SeqLabeling import numpy as np from anago.reader import load_word_embeddings, load_data_and_labels DATA_ROOT = 'data/conll2003/en/ner' LOAD_ROOT = './models' # trained model LOG_ROOT = './logs' # checkpoint, tensorboard embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec' model_config = ModelConfig() test_path = os.path.join(DATA_ROOT, 'train.small.txt') x_test, y_test = load_data_and_labels(test_path) p = prepare_preprocessor(x_test, y_test) embeddings = load_word_embeddings(p.vocab_word, embedding_path, model_config.word_embedding_size) model_config.vocab_size = len(p.vocab_word) model_config.char_vocab_size = len(p.vocab_char) model_path = os.path.join(LOAD_ROOT, 'mymodel.h5') model = SeqLabeling(model_config, embeddings, len(p.vocab_tag)) model.load(model_path) X, y = p.transform(x_test, y_test) predictions = model.predict(X) for words, prediction, sentence_length in zip(x_test, predictions, X[2]):