Пример #1
0
    def test_train(self):
        model_config = ModelConfig()
        training_config = TrainingConfig()

        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        x_train, y_train = load_data_and_labels(train_path)
        x_valid, y_valid = load_data_and_labels(valid_path)

        p = prepare_preprocessor(x_train, y_train)
        p.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        embeddings = load_word_embeddings(p.vocab_word, EMBEDDING_PATH,
                                          model_config.word_embedding_size)
        model_config.char_vocab_size = len(p.vocab_char)

        model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))

        trainer = anago.Trainer(model,
                                training_config,
                                checkpoint_path=LOG_ROOT,
                                save_path=SAVE_ROOT,
                                preprocessor=p,
                                embeddings=embeddings)
        trainer.train(x_train, y_train, x_valid, y_valid)

        model.save(os.path.join(SAVE_ROOT, 'model_weights.h5'))
Пример #2
0
    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              vocab_init=None,
              verbose=1):
        self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
        embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                       self.model_config.word_embedding_size)
        self.model_config.vocab_size = len(self.p.vocab_word)
        self.model_config.char_vocab_size = len(self.p.vocab_char)

        self.model = SeqLabeling(self.model_config, embeddings,
                                 len(self.p.vocab_tag))

        if not os.path.exists(self.log_dir):
            print('Successfully made a directory: {}'.format(self.log_dir))
            os.mkdir(self.log_dir)
        self.p.save(os.path.join(self.log_dir, self.preprocessor_file))
        self.model_config.save(os.path.join(self.log_dir, self.config_file))
        print('Successfully save config and preprocess files')

        trainer = Trainer(self.model,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        return trainer.train(x_train, y_train, x_valid, y_valid, verbose)
Пример #3
0
    def test_predict(self):
        X, y = load_data_and_labels(self.filename)
        X, y = X[:100], y[:100]
        p = prepare_preprocessor(X, y)
        self.model_config.char_vocab_size = len(p.vocab_char)

        model = SeqLabeling(self.model_config, self.embeddings, ntags=len(p.vocab_tag))
        model.predict(p.transform(X))
Пример #4
0
    def setUp(self):
        p = WordPreprocessor.load(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

        config = ModelConfig()
        config.vocab_size = len(p.vocab_word)
        config.char_vocab_size = len(p.vocab_char)

        model = SeqLabeling(config, ntags=len(p.vocab_tag))
        model.load(filepath=os.path.join(SAVE_ROOT, 'model_weights.h5'))

        self.tagger = anago.Tagger(model, preprocessor=p)
        self.sent = 'President Obama is speaking at the White House.'
Пример #5
0
    def load(cls, dir_path):
        self = cls()
        self.p = WordPreprocessor.load(
            os.path.join(dir_path, cls.preprocessor_file))
        config = ModelConfig.load(os.path.join(dir_path, cls.config_file))
        dummy_embeddings = np.zeros(
            (config.vocab_size, config.word_embedding_size), dtype=np.float32)
        self.model = SeqLabeling(config,
                                 dummy_embeddings,
                                 ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(dir_path, cls.weight_file))

        return self
Пример #6
0
    def __init__(self,
                 config,
                 weights,
                 save_path='',
                 preprocessor=None,
                 tokenizer=str.split):

        self.preprocessor = preprocessor
        self.tokenizer = tokenizer

        # Build the model
        self.model = SeqLabeling(config,
                                 ntags=len(self.preprocessor.vocab_tag))
        self.model.load(filepath=os.path.join(save_path, weights))
Пример #7
0
    def test_eval(self):
        test_path = os.path.join(DATA_ROOT, 'test.txt')
        x_test, y_test = load_data_and_labels(test_path)

        p = WordPreprocessor.load(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))
        config = ModelConfig()
        config.vocab_size = len(p.vocab_word)
        config.char_vocab_size = len(p.vocab_char)

        model = SeqLabeling(config, ntags=len(p.vocab_tag))
        model.load(filepath=os.path.join(SAVE_ROOT, 'model_weights.h5'))

        evaluator = anago.Evaluator(model, preprocessor=p)
        evaluator.eval(x_test, y_test)
Пример #8
0
    def train(self, x_train, y_train, x_valid=None, y_valid=None):

        # Prepare training and validation data(steps, generator)
        train_steps, train_batches = batch_iter(
            list(zip(x_train, y_train)),
            self.training_config.batch_size,
            preprocessor=self.preprocessor)
        valid_steps, valid_batches = batch_iter(
            list(zip(x_valid, y_valid)),
            self.training_config.batch_size,
            preprocessor=self.preprocessor)

        # Build the model
        model = SeqLabeling(self.model_config, self.embeddings,
                            len(self.preprocessor.vocab_tag))
        model.compile(
            loss=model.crf.loss,
            optimizer=Adam(lr=self.training_config.learning_rate),
        )

        # Prepare callbacks for training
        callbacks = get_callbacks(
            log_dir=self.checkpoint_path,
            tensorboard=self.tensorboard,
            eary_stopping=self.training_config.early_stopping,
            valid=(valid_steps, valid_batches, self.preprocessor))

        # Train the model
        model.fit_generator(generator=train_batches,
                            steps_per_epoch=train_steps,
                            epochs=self.training_config.max_epoch,
                            callbacks=callbacks)

        # Save the model
        model.save(os.path.join(self.save_path, 'model_weights.h5'))
Пример #9
0
    def eval(self, x_test, y_test):

        # Prepare test data(steps, generator)
        train_steps, train_batches = batch_iter(list(zip(x_test, y_test)),
                                                self.config.batch_size,
                                                preprocessor=self.preprocessor)

        # Build the model
        model = SeqLabeling(self.config,
                            ntags=len(self.preprocessor.vocab_tag))
        model.load(filepath=os.path.join(self.save_path, self.weights))

        # Build the evaluator and evaluate the model
        f1score = F1score(train_steps, train_batches, self.preprocessor)
        f1score.model = model
        f1score.on_epoch_end(epoch=-1)  # epoch takes any integer.
Пример #10
0
    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              vocab_init=None):
        self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
        embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                       self.model_config.word_embedding_size)
        self.model_config.vocab_size = len(self.p.vocab_word)
        self.model_config.char_vocab_size = len(self.p.vocab_char)

        self.model = SeqLabeling(self.model_config, embeddings,
                                 len(self.p.vocab_tag))

        trainer = Trainer(self.model,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p,
                          save_path='./models')
        trainer.train(x_train, y_train, x_valid, y_valid)
Пример #11
0
DATA_ROOT = 'data/conll2003/en/ner'
LOAD_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'
model_config = ModelConfig()

test_path = os.path.join(DATA_ROOT, 'train.small.txt')
x_test, y_test = load_data_and_labels(test_path)

p = prepare_preprocessor(x_test, y_test)

embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

model_path = os.path.join(LOAD_ROOT, 'mymodel.h5')
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
model.load(model_path)
X, y = p.transform(x_test, y_test)
predictions = model.predict(X)

for words, prediction, sentence_length in zip(x_test, predictions, X[2]):
    nopad_prediction = prediction[:sentence_length.item()]
    label_indices = [np.argmax(x) for x in nopad_prediction]
    labels = p.inverse_transform(label_indices)

    print "\n".join(["{}\t{}".format(w, l) for w, l in zip(words, labels)])
    print ''
Пример #12
0
class Tagger(object):
    def __init__(self,
                 config,
                 weights,
                 save_path='',
                 preprocessor=None,
                 tokenizer=str.split):

        self.preprocessor = preprocessor
        self.tokenizer = tokenizer

        # Build the model
        self.model = SeqLabeling(config,
                                 ntags=len(self.preprocessor.vocab_tag))
        self.model.load(filepath=os.path.join(save_path, weights))

    def predict(self, words):
        sequence_lengths = [len(words)]
        X = self.preprocessor.transform([words])
        pred = self.model.predict(X, sequence_lengths)
        pred = np.argmax(pred, -1)
        pred = self.preprocessor.inverse_transform(pred[0])

        return pred

    def tag(self, sent):
        """Tags a sentence named entities.

        Args:
            sent: a sentence

        Return:
            labels_pred: list of (word, tag) for a sentence

        Example:
            >>> sent = 'President Obama is speaking at the White House.'
            >>> print(self.tag(sent))
            [('President', 'O'), ('Obama', 'PERSON'), ('is', 'O'),
             ('speaking', 'O'), ('at', 'O'), ('the', 'O'),
             ('White', 'LOCATION'), ('House', 'LOCATION'), ('.', 'O')]
        """
        assert isinstance(sent, str)

        words = self.tokenizer(sent)
        pred = self.predict(words)
        pred = [t.split('-')[-1]
                for t in pred]  # remove prefix: e.g. B-Person -> Person

        return list(zip(words, pred))

    def get_entities(self, sent):
        """Gets entities from a sentence.

        Args:
            sent: a sentence

        Return:
            labels_pred: dict of entities for a sentence

        Example:
            sent = 'President Obama is speaking at the White House.'
            result = {'Person': ['Obama'], 'LOCATION': ['White House']}
        """
        assert isinstance(sent, str)

        words = self.tokenizer(sent)
        pred = self.predict(words)
        entities = self._get_chunks(words, pred)

        return entities

    def _get_chunks(self, words, tags):
        """
        Args:
            words: sequence of word
            tags: sequence of labels

        Returns:
            dict of entities for a sequence

        Example:
            words = ['President', 'Obama', 'is', 'speaking', 'at', 'the', 'White', 'House', '.']
            tags = ['O', 'B-Person', 'O', 'O', 'O', 'O', 'B-Location', 'I-Location', 'O']
            result = {'Person': ['Obama'], 'LOCATION': ['White House']}
        """
        chunks = get_entities(tags)
        res = defaultdict(list)
        for chunk_type, chunk_start, chunk_end in chunks:
            res[chunk_type].append(' '.join(
                words[chunk_start:chunk_end]))  # todo delimiter changeable

        return res
Пример #13
0
 def test_compile(self):
     model = SeqLabeling(self.model_config, self.embeddings, ntags=10)
     model.compile(loss=model.crf.loss,
                   optimizer=Adam(lr=self.training_config.learning_rate)
                   )
Пример #14
0
 def test_build(self):
     model = SeqLabeling(self.model_config, self.embeddings, ntags=10)
Пример #15
0
DATA_ROOT = 'data/conll2003/en/ner'
SAVE_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'
model_config = ModelConfig()
training_config = TrainingConfig()

model_path = os.path.join(SAVE_ROOT, 'mymodel.h5')

train_path = os.path.join(DATA_ROOT, 'train.small.txt')
valid_path = os.path.join(DATA_ROOT, 'valid.small.txt')

x_train, y_train = load_data_and_labels(train_path)
x_valid, y_valid = load_data_and_labels(valid_path)

p = prepare_preprocessor(x_train, y_train)
embeddings = load_word_embeddings(p.vocab_word, embedding_path,
                                  model_config.word_embedding_size)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
trainer = anago.Trainer(model,
                        training_config,
                        checkpoint_path=LOG_ROOT,
                        save_path=SAVE_ROOT,
                        preprocessor=p)
trainer.train(x_train, y_train, x_valid, y_valid)
evaluator = anago.Evaluator(model, preprocessor=p)
model.save(model_path)
Пример #16
0
class Sequence(object):

    config_file = 'config.json'
    weight_file = 'model_weights.h5'
    preprocessor_file = 'preprocessor.pkl'

    def __init__(self,
                 char_emb_size=25,
                 word_emb_size=100,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 dropout=0.5,
                 char_feature=True,
                 crf=True,
                 batch_size=1024,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=15,
                 early_stopping=True,
                 patience=3,
                 train_embeddings=True,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 embeddings=()):

        self.model_config = ModelConfig(char_emb_size, word_emb_size,
                                        char_lstm_units, word_lstm_units,
                                        dropout, char_feature, crf)
        self.training_config = TrainingConfig(batch_size, optimizer,
                                              learning_rate, lr_decay,
                                              clip_gradients, max_epoch,
                                              early_stopping, patience,
                                              train_embeddings,
                                              max_checkpoints_to_keep)
        self.model = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings = embeddings

    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              vocab_init=None,
              verbose=1):
        self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
        embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                       self.model_config.word_embedding_size)
        self.model_config.vocab_size = len(self.p.vocab_word)
        self.model_config.char_vocab_size = len(self.p.vocab_char)

        self.model = SeqLabeling(self.model_config, embeddings,
                                 len(self.p.vocab_tag))

        if not os.path.exists(self.log_dir):
            print('Successfully made a directory: {}'.format(self.log_dir))
            os.mkdir(self.log_dir)
        self.p.save(os.path.join(self.log_dir, self.preprocessor_file))
        self.model_config.save(os.path.join(self.log_dir, self.config_file))
        print('Successfully save config and preprocess files')

        trainer = Trainer(self.model,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        return trainer.train(x_train, y_train, x_valid, y_valid, verbose)

    def eval(self, x_test, y_test):
        if self.model:
            evaluator = Evaluator(self.model, preprocessor=self.p)
            evaluator.eval(x_test, y_test)
        else:
            raise (OSError('Could not find a model. Call load(dir_path).'))

    def analyze(self, words):
        if self.model:
            tagger = Tagger(self.model, preprocessor=self.p)
            return tagger.analyze(words)
        else:
            raise (OSError('Could not find a model. Call load(dir_path).'))

    def save(self, dir_path):
        self.p.save(os.path.join(dir_path, self.preprocessor_file))
        self.model_config.save(os.path.join(dir_path, self.config_file))
        self.model.save(os.path.join(dir_path, self.weight_file))

    @classmethod
    def load(cls, dir_path):
        self = cls()
        self.p = WordPreprocessor.load(
            os.path.join(dir_path, cls.preprocessor_file))
        config = ModelConfig.load(os.path.join(dir_path, cls.config_file))
        dummy_embeddings = np.zeros(
            (config.vocab_size, config.word_embedding_size), dtype=np.float32)
        self.model = SeqLabeling(config,
                                 dummy_embeddings,
                                 ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(dir_path, cls.weight_file))
        self.model._make_predict_function()

        return self