Пример #1
0
    def test_train(self):
        """Train a sequence-labeling model end to end and save its weights."""
        model_cfg = ModelConfig()
        train_cfg = TrainingConfig()

        x_train, y_train = load_data_and_labels(os.path.join(DATA_ROOT, 'train.txt'))
        x_valid, y_valid = load_data_and_labels(os.path.join(DATA_ROOT, 'valid.txt'))

        # Fit vocabularies on the training split and persist them for reuse.
        preproc = prepare_preprocessor(x_train, y_train)
        preproc.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        embeddings = load_word_embeddings(preproc.vocab_word, EMBEDDING_PATH,
                                          model_cfg.word_embedding_size)
        model_cfg.char_vocab_size = len(preproc.vocab_char)

        model = SeqLabeling(model_cfg, embeddings, len(preproc.vocab_tag))

        trainer = anago.Trainer(model,
                                train_cfg,
                                checkpoint_path=LOG_ROOT,
                                save_path=SAVE_ROOT,
                                preprocessor=preproc,
                                embeddings=embeddings)
        trainer.train(x_train, y_train, x_valid, y_valid)

        model.save(os.path.join(SAVE_ROOT, 'model_weights.h5'))
Пример #2
0
 def test_unknown_word(self):
     """Transforming an out-of-vocabulary word must not raise."""
     X, y = load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=False).fit(X, y)
     # '$unknownword$' is guaranteed absent from the fitted vocabulary.
     X, y = p.transform([['$unknownword$', 'あ']], [['O', 'O']])
Пример #3
0
    def test_predict(self):
        """Prediction on a small slice of the data should run end to end."""
        data, labels = load_data_and_labels(self.filename)
        # Keep the test fast by using only the first 100 sentences.
        data, labels = data[:100], labels[:100]
        p = prepare_preprocessor(data, labels)
        self.model_config.char_vocab_size = len(p.vocab_char)

        model = SeqLabeling(self.model_config, self.embeddings, ntags=len(p.vocab_tag))
        model.predict(p.transform(data))
Пример #4
0
 def test_calc_sequence_lengths(self):
     """Derive per-sentence lengths from padded one-hot label arrays."""
     X, y = load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=True).fit(X, y)
     _, y = p.transform(X, y)
     # First occurrence of tag id 0 (padding) marks the sequence end.
     tag_ids = np.argmax(y, -1).astype(np.int32)
     sequence_lengths = np.argmin(tag_ids, -1)
Пример #5
0
 def test_transform_only_words(self):
     """transform(X) without labels yields integer word and char ids."""
     X, y = load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=False).fit(X, y)
     words, chars = p.transform(X)
     word = words[0][0]
     char = chars[0][0][0]
     self.assertIsInstance(word, int)
     self.assertIsInstance(char, int)
Пример #6
0
 def test_transform_with_padding(self):
     """With padding enabled, word/char ids remain numeric scalars."""
     X, y = load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=True).fit(X, y)
     words, chars = p.transform(X)
     char = chars[0][0][0]
     word = words[0][0]
     # int(...) accepts both plain ints and numpy integer scalars.
     self.assertIsInstance(int(word), int)
     self.assertIsInstance(int(char), int)
Пример #7
0
 def test_batch_iter(self):
     """batch_iter must yield exactly `steps` batches per epoch."""
     sents, labels = load_data_and_labels(self.filename)
     p = prepare_preprocessor(sents, labels)
     steps, batches = batch_iter(list(zip(sents, labels)), 32,
                                 preprocessor=p)
     # Todo: infinite loop
     self.assertEqual(sum(1 for _ in batches), steps)
Пример #8
0
    def setUpClass(cls):
        """One-time fixture: create output dirs and load all data splits."""
        for directory in (LOG_ROOT, SAVE_ROOT):
            if not os.path.exists(directory):
                os.mkdir(directory)

        cls.x_train, cls.y_train = load_data_and_labels(
            os.path.join(DATA_ROOT, 'train.txt'))
        cls.x_valid, cls.y_valid = load_data_and_labels(
            os.path.join(DATA_ROOT, 'valid.txt'))
        cls.x_test, cls.y_test = load_data_and_labels(
            os.path.join(DATA_ROOT, 'test.txt'))

        cls.embeddings = load_glove(EMBEDDING_PATH)

        # Sample sentence used by prediction/analysis tests.
        cls.words = 'President Obama is speaking at the White House.'.split()

        cls.dir_path = 'models'
Пример #9
0
def train_base_model(batch_size: int, max_epoch: int, log_dir: str,
                     patience: int, no_log: bool) -> None:
    """Train a base NER model

    (Note: Not optimized for web parsing)

    Args:
        batch_size (int): number of examples per training batch
        max_epoch (int): number of epochs to train the data on, early stopping
            is on by default
        log_dir (str): path to save tensorboard log information
        patience (int): number of epochs to wait before stopping early
        no_log (bool): don't log training data

    """
    # Make sure every output directory exists before training starts.
    for directory in (log_dir, SAVE_DIR, BASE_MODEL_PATH):
        if not os.path.exists(directory):
            os.mkdir(directory)

    train_path = os.path.join(DATA_TRAIN, 'train.txt')
    valid_path = os.path.join(DATA_TRAIN, 'valid.txt')

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = load_glove(EMBEDDING_PATH)

    # Disabling logging is expressed by passing log_dir=None downstream.
    if no_log:
        log_dir = None

    model = anago.Sequence(batch_size=batch_size, max_epoch=max_epoch,
                           log_dir=log_dir, embeddings=embeddings,
                           patience=patience)
    model.train(x_train, y_train, x_valid, y_valid)
    model.save(BASE_MODEL_PATH)
Пример #10
0
 def test_preprocessor(self):
     """Fitted preprocessor maps words, chars and tags to integer ids."""
     X, y = load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=False).fit(X, y)
     X, y = p.transform(X, y)
     words, chars = X
     word = words[0][0]
     char = chars[0][0][0]
     tag = y[0][0]
     self.assertIsInstance(word, int)
     self.assertIsInstance(char, int)
     self.assertIsInstance(tag, int)
     # Round trip: tag ids decode back to string labels.
     self.assertIsInstance(p.inverse_transform(y[0])[0], str)
Пример #11
0
    def test_eval(self):
        """Evaluate a previously trained and saved model on the test split."""
        x_test, y_test = load_data_and_labels(os.path.join(DATA_ROOT, 'test.txt'))

        # Restore the preprocessor saved by the training test.
        preproc = WordPreprocessor.load(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        config = ModelConfig()
        config.vocab_size = len(preproc.vocab_word)
        config.char_vocab_size = len(preproc.vocab_char)

        model = SeqLabeling(config, ntags=len(preproc.vocab_tag))
        model.load(filepath=os.path.join(SAVE_ROOT, 'model_weights.h5'))

        evaluator = anago.Evaluator(model, preprocessor=preproc)
        evaluator.eval(x_test, y_test)
Пример #12
0
def evaluate(text_file: str, model_dir: str):
    """Evaluates a models performance on a TSV file with word/label pairs


    Args:
        text_file (str): TSV text file to evaluate
        model_dir (str): path to model to use for analysis

    """
    model = anago.Sequence.load(model_dir)

    print('Loading data...')
    sentences, gold_labels = load_data_and_labels(text_file)

    # anago prints precision/recall/F1 as a side effect of eval().
    model.eval(sentences, gold_labels)
Пример #13
0
    def test_load(self):
        """A saved preprocessor must reload and transform identically.

        Fix: the temporary pickle was leaked when any assertion failed;
        cleanup now runs in a ``finally`` block.
        """
        X, y = load_data_and_labels(self.filename)
        p = WordPreprocessor()
        p.fit(X, y)
        filepath = os.path.join(os.path.dirname(__file__),
                                'data/preprocessor.pkl')
        p.save(filepath)
        try:
            self.assertTrue(os.path.exists(filepath))

            loaded_p = WordPreprocessor.load(filepath)
            x_test1, y_test1 = p.transform(X, y)
            x_test2, y_test2 = loaded_p.transform(X, y)
            np.testing.assert_array_equal(x_test1[0], x_test2[0])  # word
            np.testing.assert_array_equal(x_test1[1], x_test2[1])  # char
            np.testing.assert_array_equal(y_test1, y_test2)
        finally:
            # Remove the temp file even if an assertion above failed.
            if os.path.exists(filepath):
                os.remove(filepath)
Пример #14
0
    def test_vocab_init(self):
        """vocab_init lets a normally-unknown word get its own index."""
        X, y = load_data_and_labels(self.filename)
        unknown_word = 'unknownword'
        X_test, y_test = [[unknown_word]], [['O']]

        # Without vocab_init the word maps to the UNK index.
        p = WordPreprocessor(padding=False).fit(X, y)
        X_pred, _ = p.transform(X_test, y_test)
        self.assertEqual(X_pred[0][1], [p.vocab_word[UNK]])

        # With vocab_init it receives a dedicated index instead.
        p = WordPreprocessor(vocab_init={unknown_word}, padding=False).fit(X, y)
        X_pred, _ = p.transform(X_test, y_test)
        self.assertNotEqual(X_pred[0][1], [p.vocab_word[UNK]])
Пример #15
0
import os
import anago
from anago.preprocess import prepare_preprocessor
from anago.config import ModelConfig, TrainingConfig
from anago.models import SeqLabeling
import numpy as np
from anago.reader import load_word_embeddings, load_data_and_labels

# Paths/config for restoring a previously trained model.
DATA_ROOT = 'data/conll2003/en/ner'
LOAD_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'
model_config = ModelConfig()

# Load evaluation sentences and gold labels (small training subset here).
test_path = os.path.join(DATA_ROOT, 'train.small.txt')
x_test, y_test = load_data_and_labels(test_path)

# Rebuild vocabularies from the loaded data itself.
p = prepare_preprocessor(x_test, y_test)

embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

# Restore trained weights, then predict tags for the transformed input.
model_path = os.path.join(LOAD_ROOT, 'mymodel.h5')
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
model.load(model_path)
X, y = p.transform(x_test, y_test)
predictions = model.predict(X)
for words, prediction, sentence_length in zip(x_test, predictions, X[2]):
Пример #16
0
 def test_extract(self):
     """Loader must return exactly one label sequence per sentence."""
     sents, labels = load_data_and_labels(self.filename)
     self.assertEqual(len(sents), len(labels))
Пример #17
0
import os

from gensim.models.keyedvectors import KeyedVectors

import anago
from anago.reader import load_data_and_labels

# Data lives next to this script; embeddings come from a word2vec text file.
DATA_ROOT = os.path.join(os.path.dirname(__file__), './data/conll2003/en/ner')
EMBEDDING_PATH = 'model.txt'

train_path = os.path.join(DATA_ROOT, 'train.txt')
valid_path = os.path.join(DATA_ROOT, 'valid.txt')

print('Loading data...')
x_train, y_train = load_data_and_labels(train_path)
x_valid, y_valid = load_data_and_labels(valid_path)
print(len(x_train), 'train sequences')
print(len(x_valid), 'valid sequences')

embeddings = KeyedVectors.load_word2vec_format(EMBEDDING_PATH).wv

# Use pre-trained word embeddings
model = anago.Sequence(max_epoch=1, embeddings=embeddings)
model.train(x_train, y_train, x_valid, y_valid)
Пример #18
0
 def test_to_numpy_array(self):
     """Transformed labels can be wrapped in a numpy array."""
     X, y = load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=False).fit(X, y)
     X, y = p.transform(X, y)
     y = np.asarray(y)
Пример #19
0
def pretty_print_json(json_str):
    """Parse *json_str* (single-quoted keys tolerated) and print it indented."""
    # NOTE(review): the quote swap breaks on values containing apostrophes.
    normalized = json_str.replace('\'', '"')
    print(json.dumps(json.loads(normalized), indent=2))


def load_model(path):
    """Load an anago model stored under *path* next to this script."""
    base_dir = os.path.realpath(__file__).replace('main.py', '')
    model_path = base_dir + path
    print("loading model from {}".format(model_path))
    return anago.Sequence().load(model_path)


# x_train, y_train = load_data_and_labels('anago/data/conll2003/en/ner/train.txt')
# x_valid, y_valid = load_data_and_labels('anago/data/conll2003/en/ner/valid.txt')
# x_test, y_test = load_data_and_labels('anago/data/conll2003/en/ner/test.txt')

# Load training and validation word/label sequences from disk.
x_train, y_train = load_data_and_labels('data/training.txt')
x_valid, y_valid = load_data_and_labels('data/validation.txt')
# x_test, y_test = load_data_and_labels('data/test.txt')

# Cap the number of examples so a training run stays tractable.
test_examples_num = 100000
model = anago.Sequence()
model.train(x_train[:test_examples_num], y_train[:test_examples_num],
            x_valid[:test_examples_num], y_valid[:test_examples_num])

# model.eval(x_test, y_test)

model.save('big-model/')

# model = load_model('model/')
# model = load_model('my-model/')
Пример #20
0
 def test_data_loading(self):
     """Smoke test: the data file parses without raising."""
     sents, labels = load_data_and_labels(self.filename)
Пример #21
0
 def test_fit(self):
     """Fitting the preprocessor on loaded data must succeed."""
     sents, labels = load_data_and_labels(self.filename)
     p = WordPreprocessor().fit(sents, labels)
Пример #22
0
import anago
from anago.reader import load_data_and_labels
# Load train/dev/test splits in CoNLL column format.
x_train, y_train = load_data_and_labels('./data/train.txt')
x_valid, y_valid = load_data_and_labels('./data/dev.txt')
x_test, y_test = load_data_and_labels('./data/test.txt')
# Train a fresh sequence-labeling model, save it, then score the test set.
model = anago.Sequence()  #.load('./models')
model.train(x_train, y_train, x_valid, y_valid)
model.save(dir_path='./models')
model.eval(x_test, y_test)
Пример #23
0
import anago
from anago.reader import load_data_and_labels, load_glove

x_train, y_train = load_data_and_labels('train.txt')
x_valid, y_valid = load_data_and_labels('valid.txt')
x_test, y_test = load_data_and_labels('test.txt')

# GloVe-format embeddings (Indonesian vectors, by the file name).
EMBEDDING_PATH = 'vectors-ind.txt'
embeddings = load_glove(EMBEDDING_PATH)

# model = anago.Sequence()
model = anago.Sequence(char_emb_size=100,word_emb_size=50,char_lstm_units=25,word_lstm_units=100,dropout=0.5,char_feature=True,crf=True,batch_size=3,optimizer='adam', learning_rate=0.005,lr_decay=0.7,clip_gradients=5.0, embeddings=embeddings)
model.train(x_train, y_train, x_valid, y_valid)

model.eval(x_test, y_test)

# Collect the entity spans predicted for each test sentence.
matres = []
for sent in x_test:
	res = model.analyze(sent)['entities']
	matres.append(res)

# Convert entity spans back into per-token BIO tags, defaulting to 'O'.
y_resu = []
for i, sent in enumerate(matres):
	sent_pred = ['O']*len(y_test[i])
	for enti in sent:
		bo = enti['beginOffset']
		sent_pred[bo] = 'B-'+enti['type']
		# endOffset is exclusive here; interior tokens get the I- prefix.
		for x in range(bo+1, enti['endOffset']):
			sent_pred[x] = 'I-'+enti['type']
	y_resu.append(sent_pred)
Пример #24
0
import anago
from anago.reader import load_data_and_labels
import os
import numpy as np
import random as rn

# Directory and file names for the NER data splits.
namaDir = "data/"

namaFileTrain = namaDir + "TrainNER.txt"
namaFileValid = namaDir + "ValidNER.txt"
namaFileTest = namaDir + "TestNER.txt"

x_train, y_train = load_data_and_labels(namaFileTrain)
x_valid, y_valid = load_data_and_labels(namaFileValid)
x_test, y_test = load_data_and_labels(namaFileTest)

# Because results were inconsistent, the random seeds are set manually.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
rn.seed(12345)
import tensorflow as tf
from keras import backend as K

# tf.set_random_seed(1234)

# Set the model hyperparameters here.
model = anago.Sequence(char_emb_size=25,
                       word_emb_size=100,
                       char_lstm_units=25,
                       word_lstm_units=100,
                       dropout=0.5,
Пример #25
0
 def test_pad_sequences(self):
     """Transforming with padding enabled must succeed."""
     sents, labels = load_data_and_labels(self.filename)
     p = WordPreprocessor(padding=True).fit(sents, labels)
     X, y = p.transform(sents, labels)