Exemplo n.º 1
0
    def test_train(self):
        """Train for one epoch, first without and then with ``self.embeddings``."""
        # First run: no pre-trained word embeddings are supplied.
        plain_tagger = anago.Sequence(max_epoch=1)
        plain_tagger.train(self.x_train, self.y_train,
                           self.x_valid, self.y_valid)

        # Second run: pre-trained word embeddings are passed to the constructor.
        pretrained_tagger = anago.Sequence(embeddings=self.embeddings,
                                           max_epoch=1)
        pretrained_tagger.train(self.x_train, self.y_train,
                                self.x_valid, self.y_valid)
Exemplo n.º 2
0
 def train(self):
     """Create a fresh ``anago.Sequence``, store it on ``self.model`` and fit it.

     The train split is passed first, followed by the test split, and
     ``self.epoch`` is forwarded as ``epochs``.
     """
     tagger = anago.Sequence()
     self.model = tagger
     tagger.fit(self.x_train, self.y_train,
                self.x_test, self.y_test,
                epochs=self.epoch)
Exemplo n.º 3
0
 def test_train_vocab_init(self):
     """Fit a model whose initial vocabulary holds every word of both splits."""
     # Each split only needs to be visited once: the vocabulary is a set, so
     # concatenating x_test a second time cannot add any new word.
     vocab = {word
              for words in np.r_[self.x_train, self.x_test]
              for word in words}
     model = anago.Sequence(initial_vocab=vocab, embeddings=self.embeddings)
     model.fit(self.x_train, self.y_train, self.x_test, self.y_test)
Exemplo n.º 4
0
    def test_analyze(self):
        """Fit a default model and check the keys of the ``analyze`` result."""
        tagger = anago.Sequence()
        tagger.fit(self.x_train, self.y_train)

        analysis = tagger.analyze(self.text)
        pprint(analysis)

        # The result must expose both the tokens and the detected entities.
        for expected_key in ('words', 'entities'):
            self.assertIn(expected_key, analysis)
Exemplo n.º 5
0
 def test_train_vocab_init(self):
     """Train with a vocabulary gathered from all three splits, then save."""
     # Collect every distinct word seen in the train, valid and test inputs.
     vocab = {word
              for words in np.r_[self.x_train, self.x_valid, self.x_test]
              for word in words}

     tagger = anago.Sequence(max_epoch=15,
                             embeddings=self.embeddings,
                             log_dir='logs')
     tagger.train(self.x_train, self.y_train,
                  self.x_test, self.y_test,
                  vocab_init=vocab)
     tagger.save(dir_path=self.dir_path)
Exemplo n.º 6
0
    def test_load(self):
        """Train, evaluate and save a model, then reload it and evaluate again."""
        trained = anago.Sequence(max_epoch=1, embeddings=self.embeddings)
        trained.train(self.x_train, self.y_train, self.x_valid, self.y_valid)
        trained.eval(self.x_test, self.y_test)
        trained.save(dir_path=self.dir_path)

        # The reloaded model is evaluated on the same test split.
        restored = anago.Sequence.load(self.dir_path)
        restored.eval(self.x_test, self.y_test)
Exemplo n.º 7
0
    def test_analyze(self):
        """Train for one epoch and check the keys of the ``analyze`` result."""
        tagger = anago.Sequence(max_epoch=1, embeddings=self.embeddings)
        tagger.train(self.x_train, self.y_train, self.x_valid, self.y_valid)

        analysis = tagger.analyze(self.words)
        pprint(analysis)

        # The result must expose both the tokens and the detected entities.
        for expected_key in ('words', 'entities'):
            self.assertIn(expected_key, analysis)
Exemplo n.º 8
0
def get_intonator():
    """Return a function that tags a sentence character by character.

    The model stored at "char_model/best_model" is loaded once, when this
    factory is called. The returned function splits its argument into
    characters, runs ``analyze`` on them and passes the result through
    ``marked_output``.
    """
    tagger = anago.Sequence().load("char_model/best_model")
    # NOTE(review): presumably builds the Keras predict function up front so
    # later calls do not have to - confirm why this is needed.
    tagger.model.model._make_predict_function()

    def intonator(sentence):
        return marked_output(tagger.analyze(list(sentence)))

    return intonator
Exemplo n.º 9
0
    def test_save(self):
        """Train for one epoch, save, and check that all three files exist."""
        tagger = anago.Sequence(max_epoch=1, embeddings=self.embeddings)
        tagger.train(self.x_train, self.y_train, self.x_valid, self.y_valid)
        tagger.save(dir_path=self.dir_path)

        # File names are read from the model; each must exist under dir_path.
        saved_names = (tagger.config_file,
                       tagger.weight_file,
                       tagger.preprocessor_file)
        saved_paths = [os.path.join(self.dir_path, file_name)
                       for file_name in saved_names]
        for saved_path in saved_paths:
            self.assertTrue(os.path.exists(saved_path))
Exemplo n.º 10
0
    def fit(self,
            X,
            y=None,
            char_emb_size=32,
            word_emb_size=128,
            char_lstm_units=32,
            word_lstm_units=128,
            dropout=0.1,
            batch_size=16,
            learning_rate=0.001,
            num_epochs=10):
        """ Trains the NER model. The input is a list of
            `AnnotatedDocument` instances.

            Args:
                X: Annotated documents; each exposes `annotations`, whose
                    items carry a `label`.
                y: Not read by this method.
                char_emb_size, word_emb_size, char_lstm_units,
                word_lstm_units, dropout, batch_size, learning_rate:
                    Passed unchanged to `anago.Sequence`.
                num_epochs: Passed to `anago.Sequence` as `max_epoch`.

            Returns:
                This instance, with `self.model` holding the
                `anago.Sequence` that was trained.
        """

        # Anago splits the BIO tags on the dash "-", so if the label contains
        # a dash, it corrupts it. This is a workaround for this behavior:
        # remember which original label each (possibly mangled) tag maps to.
        for annotated_document in X:
            for annotation in annotated_document.annotations:
                label = annotation.label
                if "-" in label:
                    self._label_map[label.split("-")[-1]] = label
                else:
                    self._label_map["B_" + label] = label
                    self._label_map["I_" + label] = label

        self.model = anago.Sequence(char_emb_size=char_emb_size,
                                    word_emb_size=word_emb_size,
                                    char_lstm_units=char_lstm_units,
                                    word_lstm_units=word_lstm_units,
                                    dropout=dropout,
                                    batch_size=batch_size,
                                    learning_rate=learning_rate,
                                    max_epoch=num_epochs)

        log.info("Transforming {} items to BIO format...".format(len(X)))
        training_data = transform_annotated_documents_to_bio_format(X)

        # Drop empty sequences. The names are plain ASCII on purpose, so they
        # can be typed and searched for (no look-alike Unicode letters).
        bio_x = np.asarray([x_i for x_i in training_data[0] if len(x_i) > 0])
        bio_y = np.asarray([y_i for y_i in training_data[1] if len(y_i) > 0])

        log.info("Training the BiLSTM...")
        # Hold out 10% of the sequences as the validation split.
        X_train, X_valid, y_train, y_valid = train_test_split(bio_x,
                                                              bio_y,
                                                              test_size=0.1)

        self.model.train(X_train, y_train, X_valid, y_valid)
        return self
Exemplo n.º 11
0
    def test_save_and_load(self):
        """Round-trip a fitted model through save/load and compare its scores."""
        weights_file = os.path.join(SAVE_ROOT, 'weights.h5')
        params_file = os.path.join(SAVE_ROOT, 'params.json')
        preprocessor_file = os.path.join(SAVE_ROOT, 'preprocessor.pickle')

        model = anago.Sequence()
        model.fit(self.x_train, self.y_train)
        model.save(weights_file, params_file, preprocessor_file)
        score1 = model.score(self.x_test, self.y_test)

        # Asserting on the path string itself always passes (any non-empty
        # string is truthy); check that each file was actually written.
        for saved_path in (weights_file, params_file, preprocessor_file):
            self.assertTrue(os.path.exists(saved_path))

        model = anago.Sequence.load(weights_file, params_file, preprocessor_file)
        score2 = model.score(self.x_test, self.y_test)

        # The reloaded model must score exactly like the one that was saved.
        self.assertEqual(score1, score2)
Exemplo n.º 12
0
    def test_train_callbacks(self):
        """Fit for 30 epochs with a ``ModelCheckpoint`` callback, then save."""
        weights_file = os.path.join(SAVE_ROOT, 'weights.h5')
        params_file = os.path.join(SAVE_ROOT, 'params.json')
        preprocessor_file = os.path.join(SAVE_ROOT, 'preprocessor.pickle')

        # Checkpoint file names embed the epoch number and the monitored f1.
        log_dir = os.path.join(os.path.dirname(__file__), 'logs')
        file_name = '_'.join(['model_weights', '{epoch:02d}', '{f1:2.4f}']) + '.h5'
        callback = ModelCheckpoint(os.path.join(log_dir, file_name),
                                   monitor='f1',
                                   save_weights_only=True)

        # Each split only needs to be visited once: the vocabulary is a set,
        # so concatenating x_test a second time cannot add any new word.
        vocab = {word
                 for words in np.r_[self.x_train, self.x_test]
                 for word in words}
        model = anago.Sequence(initial_vocab=vocab, embeddings=self.embeddings)
        model.fit(self.x_train, self.y_train, self.x_test, self.y_test,
                  epochs=30, callbacks=[callback])
        model.save(weights_file, params_file, preprocessor_file)
Exemplo n.º 13
0
def train_base_model(batch_size: int, max_epoch: int, log_dir: str,
                     patience: int, no_log: bool) -> None:
    """Train a base NER model and save it to ``BASE_MODEL_PATH``.

    (Note: Not optimized for web parsing)

    Args:
        batch_size (int): batch size forwarded to ``anago.Sequence``
        max_epoch (int): number of epochs to train the data on, early stopping
            is on by default
        log_dir (str): path to save tensorboard log information; ignored
            (and not created) when ``no_log`` is true
        patience (int): number of epochs to wait before stopping early
        no_log (bool): don't log training data

    """
    # Decide on logging first so that no directory is created (and no
    # empty/None path is touched) when logging is switched off.
    if no_log:
        log_dir = None
    else:
        os.makedirs(log_dir, exist_ok=True)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    for directory in (SAVE_DIR, BASE_MODEL_PATH):
        os.makedirs(directory, exist_ok=True)

    train_path = os.path.join(DATA_TRAIN, 'train.txt')
    valid_path = os.path.join(DATA_TRAIN, 'valid.txt')

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = load_glove(EMBEDDING_PATH)

    model = anago.Sequence(batch_size=batch_size, max_epoch=max_epoch,
                           log_dir=log_dir, embeddings=embeddings,
                           patience=patience)
    model.train(x_train, y_train, x_valid, y_valid)
    model.save(BASE_MODEL_PATH)
Exemplo n.º 14
0
def train(log_dir: str) -> None:
    """Fine-tune base model

    Trains on the split returned by ``train_test_split_from_queries`` and
    saves the result to ``CUSTOM_MODEL_PATH``.

    Args:
        log_dir (str): path to save tensorboard log information

    """
    # A single idempotent call replaces the duplicated exists()/mkdir() pair.
    os.makedirs(log_dir, exist_ok=True)

    x_train, y_train, x_valid, y_valid = train_test_split_from_queries()
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = load_glove(EMBEDDING_PATH)

    # Honor the log_dir argument (not the module-level LOG_DIR) so that the
    # directory prepared above is the one the model logs to.
    model = anago.Sequence(log_dir=log_dir, embeddings=embeddings)
    # NOTE(review): the return value of load() is discarded. If Sequence.load
    # returns a new instance instead of mutating `model`, the base model is
    # never applied here - confirm against the installed anago version.
    model.load(BASE_MODEL_PATH)
    model.train(x_train, y_train, x_valid, y_valid)
    model.save(CUSTOM_MODEL_PATH)
Exemplo n.º 15
0
def load_model(path):
    """Load an ``anago.Sequence`` model from ``path`` next to this file.

    Args:
        path: Location of the model, appended to the directory that
            contains this source file.

    Returns:
        Whatever ``anago.Sequence().load`` returns for the resolved path.
    """
    # Derive the directory with dirname() instead of stripping the literal
    # 'main.py': that broke when the file was renamed or when a parent
    # directory name happened to contain 'main.py'. Joining with '' keeps the
    # trailing separator, so the result is unchanged for main.py itself.
    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '')
    model_path = base_dir + path
    print("loading model from {}".format(model_path))
    return anago.Sequence().load(model_path)
Exemplo n.º 16
0
from anago.utils import load_data_and_labels
import anago

# Load the three CoNLL-2003 English NER files.
x_train, y_train = load_data_and_labels('data/conll2003/en/ner/train.txt')
x_test, y_test = load_data_and_labels('data/conll2003/en/ner/test.txt')
x_dev, y_dev = load_data_and_labels('data/conll2003/en/ner/valid.txt')

# Fit on the training data, passing the dev data alongside it.
model = anago.Sequence()
model.fit(x_train, y_train, x_dev, y_dev, epochs=15)

# Print the test score; a bare call would compute it and throw it away.
print(model.score(x_test, y_test))
Exemplo n.º 17
0
 def test_eval(self):
     """Train for one epoch, then run ``eval`` on the test split."""
     tagger = anago.Sequence(embeddings=self.embeddings, max_epoch=1)
     tagger.train(self.x_train, self.y_train,
                  self.x_valid, self.y_valid)
     tagger.eval(self.x_test, self.y_test)
Exemplo n.º 18
0
# Location and file names of the saved anago NER tagger.
MODEL_DIRPATH = "pretrained/ner_tagger/"
WEIGHT_FILENAME = "nerweight.h5"
PARAMS_FILENAME = "nerparams.json"
PREPROCESSOR_FILENAME = "nerprepro.pkl"

# Second model, loaded through load_model from its own directory.
MODEL_NAME = "model_ner_12514_softmax_v5_w2v_100_POS_LSTM_EmbNotTrainable_OOV"
loaded_model = load_model(f"pretrained/{MODEL_NAME}")

# dict.pickle holds four objects pickled one after another; read them back
# in the order they are stored.
with open(f"pretrained/{MODEL_NAME}/dict.pickle", 'rb') as handle:
    (idx2token_loaded,
     idx2tag_loaded,
     token2idx_loaded,
     tag2idx_loaded) = [pickle.load(handle) for _ in range(4)]

model = anago.Sequence().load(
    weights_file=f"{MODEL_DIRPATH}{WEIGHT_FILENAME}",
    params_file=f"{MODEL_DIRPATH}{PARAMS_FILENAME}",
    preprocessor_file=f"{MODEL_DIRPATH}{PREPROCESSOR_FILENAME}",
)

def __get_ner_tags(result):
    """Return one tag per word of ``result``.

    Every position starts as "O"; the position given by each entity's
    'beginOffset' is then overwritten with that entity's 'type'.
    """
    tags = ["O"] * len(result['words'])
    for ent in result['entities']:
        start = ent['beginOffset']
        tags[start] = ent['type']
    return tags

def get_entities(sentence):
    """Analyze ``sentence`` with the module-level ``model`` and return its tags.

    The result of ``model.analyze`` is converted to a per-word tag list by
    ``__get_ner_tags``.
    """
    return __get_ner_tags(model.analyze(sentence))

def get_entities_tf(sentence,  pad_token = np.nan, n_timesteps = 48): 
    """Split ``sentence`` on whitespace and right-pad the tokens with ``pad_token``.

    NOTE(review): the visible body stops after building ``padded_sentence``
    and returns nothing - the rest of this function is not shown here.
    """
    sentence=sentence.split()
    # A sentence longer than n_timesteps gets no padding (a list repeated a
    # negative number of times is empty) and is not truncated.
    padded_sentence = sentence + [pad_token] * (n_timesteps - len(sentence))
Exemplo n.º 19
0
 def test_train_all(self):
     """Train on the concatenation of all three splits, then save the model."""
     merged_x = np.r_[self.x_train, self.x_valid, self.x_test]
     merged_y = np.r_[self.y_train, self.y_valid, self.y_test]

     tagger = anago.Sequence(max_epoch=15,
                             embeddings=self.embeddings,
                             log_dir='logs')
     # The test split is passed again as the third and fourth arguments.
     tagger.train(merged_x, merged_y, self.x_test, self.y_test)
     tagger.save(dir_path=self.dir_path)
import os

from gensim.models.keyedvectors import KeyedVectors

import anago
from anago.utils import load_data_and_labels

if __name__ == '__main__':
    # Data location is resolved relative to this script.
    DATA_ROOT = os.path.join(os.path.dirname(__file__),
                             '../data/conll2003/en/ner')
    EMBEDDING_PATH = 'model.txt'

    train_path = os.path.join(DATA_ROOT, 'train.txt')
    valid_path = os.path.join(DATA_ROOT, 'valid.txt')

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    # load_word2vec_format already returns the KeyedVectors object. The `.wv`
    # alias on KeyedVectors only returned the object itself, was deprecated
    # in gensim 3.x and no longer exists in gensim 4, so it is not used.
    embeddings = KeyedVectors.load_word2vec_format(EMBEDDING_PATH)

    # Use pre-trained word embeddings
    model = anago.Sequence(embeddings=embeddings)
    model.fit(x_train, y_train, x_valid, y_valid)
Exemplo n.º 21
0
import os

from gensim.models.keyedvectors import KeyedVectors

import anago
from anago.reader import load_data_and_labels

# Data location is resolved relative to this script.
DATA_ROOT = os.path.join(os.path.dirname(__file__), './data/conll2003/en/ner')
EMBEDDING_PATH = 'model.txt'

train_path = os.path.join(DATA_ROOT, 'train.txt')
valid_path = os.path.join(DATA_ROOT, 'valid.txt')

print('Loading data...')
x_train, y_train = load_data_and_labels(train_path)
x_valid, y_valid = load_data_and_labels(valid_path)
print(len(x_train), 'train sequences')
print(len(x_valid), 'valid sequences')

# load_word2vec_format already returns the KeyedVectors object. The `.wv`
# alias on KeyedVectors only returned the object itself, was deprecated in
# gensim 3.x and no longer exists in gensim 4, so it is not used.
embeddings = KeyedVectors.load_word2vec_format(EMBEDDING_PATH)

# Use pre-trained word embeddings
model = anago.Sequence(max_epoch=1, embeddings=embeddings)
model.train(x_train, y_train, x_valid, y_valid)
Exemplo n.º 22
0
 def test_score(self):
     """Fit a default model and check that ``score`` returns a float."""
     tagger = anago.Sequence()
     tagger.fit(self.x_train, self.y_train)

     test_score = tagger.score(self.x_test, self.y_test)
     self.assertIsInstance(test_score, float)
Exemplo n.º 23
0
 def test_train_with_pretrained_embedding(self):
     """Fit a model constructed with ``self.embeddings``."""
     tagger = anago.Sequence(embeddings=self.embeddings)
     # The test split is passed after the training split.
     tagger.fit(self.x_train, self.y_train,
                self.x_test, self.y_test)
Exemplo n.º 24
0
# Seed `rn` before the TensorFlow/Keras imports below.
# NOTE(review): `rn` is presumably an alias for the stdlib `random` module,
# imported above this snippet - confirm.
rn.seed(12345)
import tensorflow as tf
from keras import backend as K

# tf.set_random_seed(1234)

# Set the model parameters here.
model = anago.Sequence(char_emb_size=25,
                       word_emb_size=100,
                       char_lstm_units=25,
                       word_lstm_units=100,
                       dropout=0.5,
                       char_feature=True,
                       crf=True,
                       batch_size=20,
                       optimizer='adam',
                       learning_rate=0.001,
                       lr_decay=0.9,
                       clip_gradients=5.0,
                       max_epoch=30,
                       early_stopping=True,
                       patience=3,
                       train_embeddings=True,
                       max_checkpoints_to_keep=5,
                       log_dir=None)

# Train, passing the validation data alongside the training data.
model.train(x_train, y_train, x_valid, y_valid)

# Evaluate on the test data; the header printed first is Indonesian.
print("\n\nEvaluasi Test:")
model.eval(x_test, y_test)

# Save the trained model.
model.save('model/ModelAnagoIndo')
Exemplo n.º 25
0
import anago
from anago.reader import load_data_and_labels
# Load the train, dev and test files, in that order.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = [
    load_data_and_labels(data_file)
    for data_file in ('./data/train.txt', './data/dev.txt', './data/test.txt')
]

# A fresh model is trained here; a disabled alternative was
# `.load('./models')` on the same expression.
model = anago.Sequence()
model.train(x_train, y_train, x_valid, y_valid)

# Save first, then evaluate on the test data.
model.save(dir_path='./models')
model.eval(x_test, y_test)
Exemplo n.º 26
0
import anago
from anago.reader import load_data_and_labels, load_glove

x_train, y_train = load_data_and_labels('train.txt')
x_valid, y_valid = load_data_and_labels('valid.txt')
x_test, y_test = load_data_and_labels('test.txt')

EMBEDDING_PATH = 'vectors-ind.txt'
embeddings = load_glove(EMBEDDING_PATH)

# model = anago.Sequence()
model = anago.Sequence(
    char_emb_size=100,
    word_emb_size=50,
    char_lstm_units=25,
    word_lstm_units=100,
    dropout=0.5,
    char_feature=True,
    crf=True,
    batch_size=3,
    optimizer='adam',
    learning_rate=0.005,
    lr_decay=0.7,
    clip_gradients=5.0,
    embeddings=embeddings,
)
model.train(x_train, y_train, x_valid, y_valid)

model.eval(x_test, y_test)

# Entities found by the model for every test sentence.
matres = [model.analyze(sentence)['entities'] for sentence in x_test]

# Turn the entity spans back into one tag list per sentence: 'B-<type>' at
# beginOffset, 'I-<type>' up to (not including) endOffset, 'O' elsewhere.
y_resu = []
for sent_idx, entities in enumerate(matres):
    sent_pred = ['O'] * len(y_test[sent_idx])
    for entity in entities:
        begin = entity['beginOffset']
        sent_pred[begin] = 'B-' + entity['type']
        for inside_idx in range(begin + 1, entity['endOffset']):
            sent_pred[inside_idx] = 'I-' + entity['type']
    y_resu.append(sent_pred)
Exemplo n.º 27
0
    def fit(self,
            X,
            y=None,
            char_emb_size=32,
            word_emb_size=128,
            char_lstm_units=32,
            word_lstm_units=128,
            dropout=0.5,
            batch_size=8,
            num_epochs=10):
        """ Trains the NER model. The input is a list of
            `AnnotatedDocument` instances.

            We should be careful with batch size:
            it must satisfy len(X) % batch_size == 0.
            Otherwise, Anago crashes with an error from time to time.
            An example here is a token assigned a tag (the BIO scheme).
            Trailing examples that do not fill a whole batch are therefore
            dropped before training.

            `y` is not read by this method. All other keyword arguments are
            stored in `self.config` and validated before use.

            Returns this instance. Training is skipped, with an error
            logged, when the numbers of feature vectors and labels differ
            or when fewer than `batch_size` examples are available.
        """

        log.info("Checking parameters...")
        self.config.set_parameters({
            "num_epochs": num_epochs,
            "dropout": dropout,
            "batch_size": batch_size,
            "char_emb_size": char_emb_size,
            "word_emb_size": word_emb_size,
            "char_lstm_units": char_lstm_units,
            "word_lstm_units": word_lstm_units
        })
        self.config.validate()

        # Anago splits the BIO tags on the dash "-", so if the label contains
        # a dash, it corrupts it. This is a workaround for this behavior:
        # remember which original label each (possibly mangled) tag maps to.
        label_map = {}
        for annotated_document in X:
            for annotation in annotated_document.annotations:
                label = annotation.label
                if "-" in label:
                    label_map[label.split("-")[-1]] = label
                else:
                    label_map["B_" + label] = label
                    label_map["I_" + label] = label
        self.config.set_parameter("label_map", label_map)

        self.model = anago.Sequence(
            char_embedding_dim=self.config.get_parameter("char_emb_size"),
            word_embedding_dim=self.config.get_parameter("word_emb_size"),
            char_lstm_size=self.config.get_parameter("char_lstm_units"),
            word_lstm_size=self.config.get_parameter("word_lstm_units"),
            dropout=self.config.get_parameter("dropout"))

        log.info("Transforming {} items to BIO format...".format(len(X)))
        train_data = transform_annotated_documents_to_bio_format(
            X, entity_labels=self.entity_labels)

        # The new Anago version does not take numpy arrays as arguments, and
        # no validation split is made here as we normally use Optimizer.
        X_train = list(train_data[0])
        y_train = list(train_data[1])

        # check sizes
        if len(X_train) != len(y_train):
            log.error(
                "Got {} feature vectors but {} labels, cannot train!".format(
                    len(X_train), len(y_train)))
            return self

        # number of examples must be divisible by batch_size,
        # so skip examples in the end if needed
        exm_num = len(X_train)
        usable_num = exm_num - exm_num % batch_size
        if usable_num == 0:
            # Trimming would leave nothing to train on.
            log.error(
                "Got {} examples, fewer than batch size {}, "
                "cannot train!".format(exm_num, batch_size))
            return self
        X_train = X_train[:usable_num]
        y_train = y_train[:usable_num]

        log.info("Training BiLSTM...")
        self.model.fit(X_train,
                       y_train,
                       epochs=self.config.get_parameter("num_epochs"),
                       batch_size=self.config.get_parameter("batch_size"))
        return self
import pandas as pd
from pathlib import Path
import anago
from anago.utils import load_glove
import utils

# CSV files are looked up under the current working directory.
train_path = Path.cwd() / 'data/semeval-2016/train.csv'
test_path = Path.cwd() / 'data/semeval-2016/test.csv'

# Read both CSV files into data frames.
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)

# Convert each data frame into an (inputs, labels) pair.
x_train, y_train = utils.df2data(data_train)
x_test, y_test = utils.df2data(data_test)

# Load the GloVe vectors from the given file.
EMBEDDING_PATH = '../embedding_weights/glove.840B.300d.txt'
embeddings = load_glove(EMBEDDING_PATH)

# Train with the pre-trained word embeddings; the test data is passed
# after the training data.
model = anago.Sequence(word_embedding_dim=300, embeddings=embeddings)
model.fit(x_train, y_train, x_test, y_test, epochs=10)