def test_train(self):
    model_config = ModelConfig()
    training_config = TrainingConfig()

    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)

    p = prepare_preprocessor(x_train, y_train)
    p.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
    embeddings = load_word_embeddings(p.vocab_word, EMBEDDING_PATH,
                                      model_config.word_embedding_size)
    model_config.char_vocab_size = len(p.vocab_char)

    model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))

    trainer = anago.Trainer(model,
                            training_config,
                            checkpoint_path=LOG_ROOT,
                            save_path=SAVE_ROOT,
                            preprocessor=p,
                            embeddings=embeddings)
    trainer.train(x_train, y_train, x_valid, y_valid)

    model.save(os.path.join(SAVE_ROOT, 'model_weights.h5'))
def test_unknown_word(self):
    X, y = load_data_and_labels(self.filename)
    preprocessor = WordPreprocessor(padding=False)
    p = preprocessor.fit(X, y)
    X = [['$unknownword$', 'あ']]
    y = [['O', 'O']]
    X, y = p.transform(X, y)
def test_predict(self):
    X, y = load_data_and_labels(self.filename)
    X, y = X[:100], y[:100]
    p = prepare_preprocessor(X, y)
    self.model_config.char_vocab_size = len(p.vocab_char)
    model = SeqLabeling(self.model_config, self.embeddings, ntags=len(p.vocab_tag))
    model.predict(p.transform(X))
def test_calc_sequence_lengths(self):
    X, y = load_data_and_labels(self.filename)
    preprocessor = WordPreprocessor(padding=True)
    p = preprocessor.fit(X, y)
    _, y = p.transform(X, y)
    y_t = np.argmax(y, -1)
    y_t = y_t.astype(np.int32)
    sequence_lengths = np.argmin(y_t, -1)
def test_transform_only_words(self):
    X, y = load_data_and_labels(self.filename)
    preprocessor = WordPreprocessor(padding=False)
    p = preprocessor.fit(X, y)
    X = p.transform(X)
    words, chars = X
    char, word = chars[0][0][0], words[0][0]
    self.assertIsInstance(word, int)
    self.assertIsInstance(char, int)
def test_transform_with_padding(self):
    X, y = load_data_and_labels(self.filename)
    preprocessor = WordPreprocessor(padding=True)
    p = preprocessor.fit(X, y)
    X = p.transform(X)
    words, chars = X
    word, char = words[0][0], chars[0][0][0]
    self.assertIsInstance(int(word), int)
    self.assertIsInstance(int(char), int)
def test_batch_iter(self):
    sents, labels = load_data_and_labels(self.filename)
    batch_size = 32
    p = prepare_preprocessor(sents, labels)
    steps, batches = batch_iter(list(zip(sents, labels)), batch_size, preprocessor=p)
    self.assertEqual(len([_ for _ in batches]), steps)  # Todo: infinite loop
def setUpClass(cls):
    if not os.path.exists(LOG_ROOT):
        os.mkdir(LOG_ROOT)
    if not os.path.exists(SAVE_ROOT):
        os.mkdir(SAVE_ROOT)

    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')
    test_path = os.path.join(DATA_ROOT, 'test.txt')
    cls.x_train, cls.y_train = load_data_and_labels(train_path)
    cls.x_valid, cls.y_valid = load_data_and_labels(valid_path)
    cls.x_test, cls.y_test = load_data_and_labels(test_path)

    cls.embeddings = load_glove(EMBEDDING_PATH)
    cls.words = 'President Obama is speaking at the White House.'.split()
    cls.dir_path = 'models'
def train_base_model(batch_size: int, max_epoch: int, log_dir: str, patience: int, no_log: bool) -> None:
    """Train a base NER model (note: not optimized for web parsing).

    Args:
        batch_size (int): number of samples per training batch
        max_epoch (int): maximum number of epochs to train for; early stopping is on by default
        patience (int): number of epochs to wait before stopping early
        log_dir (str): path to save TensorBoard log information
        no_log (bool): don't log training data
    """
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    if not os.path.exists(BASE_MODEL_PATH):
        os.mkdir(BASE_MODEL_PATH)

    train_path = os.path.join(DATA_TRAIN, 'train.txt')
    valid_path = os.path.join(DATA_TRAIN, 'valid.txt')

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = load_glove(EMBEDDING_PATH)

    if no_log:
        log_dir = None

    model = anago.Sequence(batch_size=batch_size,
                           max_epoch=max_epoch,
                           log_dir=log_dir,
                           embeddings=embeddings,
                           patience=patience)
    model.train(x_train, y_train, x_valid, y_valid)
    model.save(BASE_MODEL_PATH)
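# Minimal sketch of calling train_base_model(); the hyperparameter values below
# are illustrative assumptions, not defaults defined elsewhere in this project.
if __name__ == '__main__':
    train_base_model(batch_size=32, max_epoch=15, log_dir='logs',
                     patience=3, no_log=False)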
def test_preprocessor(self):
    X, y = load_data_and_labels(self.filename)
    preprocessor = WordPreprocessor(padding=False)
    p = preprocessor.fit(X, y)
    X, y = p.transform(X, y)
    words, chars = X
    char, word = chars[0][0][0], words[0][0]
    tag = y[0][0]
    self.assertIsInstance(word, int)
    self.assertIsInstance(char, int)
    self.assertIsInstance(tag, int)
    self.assertIsInstance(p.inverse_transform(y[0])[0], str)
def test_eval(self):
    test_path = os.path.join(DATA_ROOT, 'test.txt')
    x_test, y_test = load_data_and_labels(test_path)

    p = WordPreprocessor.load(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
    config = ModelConfig()
    config.vocab_size = len(p.vocab_word)
    config.char_vocab_size = len(p.vocab_char)

    model = SeqLabeling(config, ntags=len(p.vocab_tag))
    model.load(filepath=os.path.join(SAVE_ROOT, 'model_weights.h5'))

    evaluator = anago.Evaluator(model, preprocessor=p)
    evaluator.eval(x_test, y_test)
def evaluate(text_file: str, model_dir: str):
    """Evaluate a model's performance on a TSV file with word/label pairs.

    Args:
        text_file (str): TSV text file to evaluate on
        model_dir (str): path to the model to use for analysis
    """
    model = anago.Sequence.load(model_dir)
    print('Loading data...')
    x_test, y_test = load_data_and_labels(text_file)
    model.eval(x_test, y_test)
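# Example invocation of evaluate(); the TSV path and model directory below are
# assumed placeholders, not files that ship with this repository.
# evaluate('data/test.txt', 'models/')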
def test_load(self):
    X, y = load_data_and_labels(self.filename)
    p = WordPreprocessor()
    p.fit(X, y)
    filepath = os.path.join(os.path.dirname(__file__), 'data/preprocessor.pkl')
    p.save(filepath)
    self.assertTrue(os.path.exists(filepath))

    loaded_p = WordPreprocessor.load(filepath)
    x_test1, y_test1 = p.transform(X, y)
    x_test2, y_test2 = loaded_p.transform(X, y)
    np.testing.assert_array_equal(x_test1[0], x_test2[0])  # word
    np.testing.assert_array_equal(x_test1[1], x_test2[1])  # char
    np.testing.assert_array_equal(y_test1, y_test2)

    if os.path.exists(filepath):
        os.remove(filepath)
def test_vocab_init(self):
    X, y = load_data_and_labels(self.filename)
    unknown_word = 'unknownword'
    X_test, y_test = [[unknown_word]], [['O']]

    preprocessor = WordPreprocessor(padding=False)
    p = preprocessor.fit(X, y)
    X_pred, _ = p.transform(X_test, y_test)
    words = X_pred[0][1]
    self.assertEqual(words, [p.vocab_word[UNK]])

    vocab_init = {unknown_word}
    preprocessor = WordPreprocessor(vocab_init=vocab_init, padding=False)
    p = preprocessor.fit(X, y)
    X_pred, _ = p.transform(X_test, y_test)
    words = X_pred[0][1]
    self.assertNotEqual(words, [p.vocab_word[UNK]])
import os

import numpy as np

import anago
from anago.preprocess import prepare_preprocessor
from anago.config import ModelConfig, TrainingConfig
from anago.models import SeqLabeling
from anago.reader import load_word_embeddings, load_data_and_labels

DATA_ROOT = 'data/conll2003/en/ner'
LOAD_ROOT = './models'  # trained model
LOG_ROOT = './logs'     # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'

model_config = ModelConfig()

test_path = os.path.join(DATA_ROOT, 'train.small.txt')
x_test, y_test = load_data_and_labels(test_path)

p = prepare_preprocessor(x_test, y_test)
embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

model_path = os.path.join(LOAD_ROOT, 'mymodel.h5')
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
model.load(model_path)

X, y = p.transform(x_test, y_test)
predictions = model.predict(X)

for words, prediction, sentence_length in zip(x_test, predictions, X[2]):
    # Loop body not shown in the original snippet; a typical use would decode
    # prediction[:sentence_length] back into tags and print them alongside words.
    pass
def test_extract(self):
    sents, labels = load_data_and_labels(self.filename)
    self.assertTrue(len(sents) == len(labels))
import os

from gensim.models.keyedvectors import KeyedVectors

import anago
from anago.reader import load_data_and_labels

DATA_ROOT = os.path.join(os.path.dirname(__file__), './data/conll2003/en/ner')
EMBEDDING_PATH = 'model.txt'

train_path = os.path.join(DATA_ROOT, 'train.txt')
valid_path = os.path.join(DATA_ROOT, 'valid.txt')

print('Loading data...')
x_train, y_train = load_data_and_labels(train_path)
x_valid, y_valid = load_data_and_labels(valid_path)
print(len(x_train), 'train sequences')
print(len(x_valid), 'valid sequences')

embeddings = KeyedVectors.load_word2vec_format(EMBEDDING_PATH).wv  # Use pre-trained word embeddings

model = anago.Sequence(max_epoch=1, embeddings=embeddings)
model.train(x_train, y_train, x_valid, y_valid)
def test_to_numpy_array(self):
    X, y = load_data_and_labels(self.filename)
    preprocessor = WordPreprocessor(padding=False)
    p = preprocessor.fit(X, y)
    X, y = p.transform(X, y)
    y = np.asarray(y)
def pretty_print_json(json_str):
    analyzed_json = json.loads(json_str.replace('\'', '"'))
    print(json.dumps(analyzed_json, indent=2))


def load_model(path):
    model_path = os.path.realpath(__file__).replace('main.py', '') + path
    print("loading model from {}".format(model_path))
    return anago.Sequence().load(model_path)


# x_train, y_train = load_data_and_labels('anago/data/conll2003/en/ner/train.txt')
# x_valid, y_valid = load_data_and_labels('anago/data/conll2003/en/ner/valid.txt')
# x_test, y_test = load_data_and_labels('anago/data/conll2003/en/ner/test.txt')

x_train, y_train = load_data_and_labels('data/training.txt')
x_valid, y_valid = load_data_and_labels('data/validation.txt')
# x_test, y_test = load_data_and_labels('data/test.txt')

test_examples_num = 100000

model = anago.Sequence()
model.train(x_train[:test_examples_num], y_train[:test_examples_num],
            x_valid[:test_examples_num], y_valid[:test_examples_num])
# model.eval(x_test, y_test)
model.save('big-model/')

# model = load_model('model/')
# model = load_model('my-model/')
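# Sketch of inspecting the saved model with the helpers above (assumption:
# Sequence.analyze accepts a token list, as in the other examples here, and
# returns a dict whose str() form pretty_print_json can parse):
# model = load_model('big-model/')
# result = model.analyze('President Obama is speaking at the White House.'.split())
# pretty_print_json(str(result))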
def test_data_loading(self):
    X, y = load_data_and_labels(self.filename)
def test_fit(self):
    X, y = load_data_and_labels(self.filename)
    preprocessor = WordPreprocessor()
    p = preprocessor.fit(X, y)
import anago
from anago.reader import load_data_and_labels

x_train, y_train = load_data_and_labels('./data/train.txt')
x_valid, y_valid = load_data_and_labels('./data/dev.txt')
x_test, y_test = load_data_and_labels('./data/test.txt')

model = anago.Sequence()  # .load('./models')
model.train(x_train, y_train, x_valid, y_valid)
model.save(dir_path='./models')
model.eval(x_test, y_test)
import anago
from anago.reader import load_data_and_labels, load_glove

x_train, y_train = load_data_and_labels('train.txt')
x_valid, y_valid = load_data_and_labels('valid.txt')
x_test, y_test = load_data_and_labels('test.txt')

EMBEDDING_PATH = 'vectors-ind.txt'
embeddings = load_glove(EMBEDDING_PATH)

# model = anago.Sequence()
model = anago.Sequence(char_emb_size=100, word_emb_size=50,
                       char_lstm_units=25, word_lstm_units=100,
                       dropout=0.5, char_feature=True, crf=True,
                       batch_size=3, optimizer='adam', learning_rate=0.005,
                       lr_decay=0.7, clip_gradients=5.0,
                       embeddings=embeddings)
model.train(x_train, y_train, x_valid, y_valid)
model.eval(x_test, y_test)

# Run the tagger over the test set and convert the predicted entity spans
# back into per-token BIO tags.
matres = []
for sent in x_test:
    res = model.analyze(sent)['entities']
    matres.append(res)

y_resu = []
for i, sent in enumerate(matres):
    sent_pred = ['O'] * len(y_test[i])
    for enti in sent:
        bo = enti['beginOffset']
        sent_pred[bo] = 'B-' + enti['type']
        for x in range(bo + 1, enti['endOffset']):
            sent_pred[x] = 'I-' + enti['type']
    y_resu.append(sent_pred)
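# Optional follow-up (assumption: the seqeval package is installed): score the
# reconstructed BIO tags in y_resu against the gold labels at the entity level.
from seqeval.metrics import f1_score, classification_report

print('F1:', f1_score(y_test, y_resu))
print(classification_report(y_test, y_resu))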
import os
import random as rn

import numpy as np

import anago
from anago.reader import load_data_and_labels

namaDir = "data/"
namaFileTrain = namaDir + "TrainNER.txt"
namaFileValid = namaDir + "ValidNER.txt"
namaFileTest = namaDir + "TestNER.txt"

x_train, y_train = load_data_and_labels(namaFileTrain)
x_valid, y_valid = load_data_and_labels(namaFileValid)
x_test, y_test = load_data_and_labels(namaFileTest)

# Because results were inconsistent, the random seeds are set manually.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
rn.seed(12345)

import tensorflow as tf
from keras import backend as K
# tf.set_random_seed(1234)

# Set the model parameters here.
model = anago.Sequence(char_emb_size=25,
                       word_emb_size=100,
                       char_lstm_units=25,
                       word_lstm_units=100,
                       dropout=0.5)
def test_pad_sequences(self):
    X, y = load_data_and_labels(self.filename)
    preprocessor = WordPreprocessor(padding=True)
    p = preprocessor.fit(X, y)
    X, y = p.transform(X, y)