Пример #1
0
    def test_train(self):
        """Run one full training pass and write the resulting artifacts.

        Reads ``train.txt`` and ``valid.txt`` from ``DATA_ROOT``, fits a
        preprocessor on the training split, builds a ``SeqLabeling`` model
        and trains it through ``anago.Trainer``. The preprocessor and the
        model are saved under ``SAVE_ROOT``. The test makes no assertions;
        it passes when the pipeline runs without raising.
        """
        model_cfg = ModelConfig()
        train_cfg = TrainingConfig()

        # Load the training split first, then the validation split.
        x_train, y_train = load_data_and_labels(
            os.path.join(DATA_ROOT, 'train.txt'))
        x_valid, y_valid = load_data_and_labels(
            os.path.join(DATA_ROOT, 'valid.txt'))

        # The preprocessor is fitted on the training data only and saved
        # before anything else is built from it.
        preprocessor = prepare_preprocessor(x_train, y_train)
        preprocessor.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

        word_vectors = load_word_embeddings(
            preprocessor.vocab_word,
            EMBEDDING_PATH,
            model_cfg.word_embedding_size)
        model_cfg.char_vocab_size = len(preprocessor.vocab_char)

        tag_count = len(preprocessor.vocab_tag)
        model = SeqLabeling(model_cfg, word_vectors, tag_count)

        anago.Trainer(
            model,
            train_cfg,
            checkpoint_path=LOG_ROOT,
            save_path=SAVE_ROOT,
            preprocessor=preprocessor,
            embeddings=word_vectors,
        ).train(x_train, y_train, x_valid, y_valid)

        weights_path = os.path.join(SAVE_ROOT, 'model_weights.h5')
        model.save(weights_path)
Пример #2
0
    def train(self, x_train, y_train, x_valid=None, y_valid=None):
        """Build, train and save a ``SeqLabeling`` model.

        Args:
            x_train: training inputs, zipped pairwise with ``y_train``.
            y_train: training labels.
            x_valid: optional validation inputs.
            y_valid: optional validation labels.

        Validation batches are prepared, and handed to the callbacks, only
        when both ``x_valid`` and ``y_valid`` are given. Previously the
        ``None`` defaults were passed straight to ``zip`` and raised
        ``TypeError``.

        The trained model is written to ``model_weights.h5`` inside
        ``self.save_path``.
        """
        batch_size = self.training_config.batch_size

        # Prepare training data (steps, generator)
        train_steps, train_batches = batch_iter(
            list(zip(x_train, y_train)),
            batch_size,
            preprocessor=self.preprocessor)

        # Prepare validation data only if the caller supplied it.
        has_validation = x_valid is not None and y_valid is not None
        if has_validation:
            valid_steps, valid_batches = batch_iter(
                list(zip(x_valid, y_valid)),
                batch_size,
                preprocessor=self.preprocessor)

        # Build the model
        model = SeqLabeling(self.model_config, self.embeddings,
                            len(self.preprocessor.vocab_tag))
        model.compile(
            loss=model.crf.loss,
            optimizer=Adam(lr=self.training_config.learning_rate),
        )

        # Prepare callbacks for training.
        # 'eary_stopping' (sic) is the keyword name this call site has always
        # used; it is kept verbatim so the callee's interface still matches.
        callback_kwargs = {
            'log_dir': self.checkpoint_path,
            'tensorboard': self.tensorboard,
            'eary_stopping': self.training_config.early_stopping,
        }
        if has_validation:
            callback_kwargs['valid'] = (valid_steps, valid_batches,
                                        self.preprocessor)
        # NOTE(review): without validation data `valid` is omitted, which
        # assumes get_callbacks has a default for it -- confirm.
        callbacks = get_callbacks(**callback_kwargs)

        # Train the model
        model.fit_generator(generator=train_batches,
                            steps_per_epoch=train_steps,
                            epochs=self.training_config.max_epoch,
                            callbacks=callbacks)

        # Save the model
        model.save(os.path.join(self.save_path, 'model_weights.h5'))
Пример #3
0
# --- Locations -------------------------------------------------------------
DATA_ROOT = 'data/conll2003/en/ner'
SAVE_ROOT = './models'  # trained model
LOG_ROOT = './logs'  # checkpoint, tensorboard
embedding_path = '/media/jan/OS/Dataset/WordEmbeddings/wiki.en.vec'

model_path = os.path.join(SAVE_ROOT, 'mymodel.h5')
train_path = os.path.join(DATA_ROOT, 'train.small.txt')
valid_path = os.path.join(DATA_ROOT, 'valid.small.txt')

# --- Configuration ---------------------------------------------------------
model_config = ModelConfig()
training_config = TrainingConfig()

# --- Data ------------------------------------------------------------------
x_train, y_train = load_data_and_labels(train_path)
x_valid, y_valid = load_data_and_labels(valid_path)

# The preprocessor is fitted on the training split; its vocabularies size
# the model's word and character embedding tables.
p = prepare_preprocessor(x_train, y_train)
embeddings = load_word_embeddings(
    p.vocab_word,
    embedding_path,
    model_config.word_embedding_size,
)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

# --- Model and training ----------------------------------------------------
model = SeqLabeling(model_config, embeddings, len(p.vocab_tag))
trainer = anago.Trainer(
    model,
    training_config,
    checkpoint_path=LOG_ROOT,
    save_path=SAVE_ROOT,
    preprocessor=p,
)
trainer.train(x_train, y_train, x_valid, y_valid)

# The evaluator is constructed but never run by this script.
evaluator = anago.Evaluator(model, preprocessor=p)

model.save(model_path)
Пример #4
0
class Sequence(object):
    """Facade that bundles a preprocessor, a ``SeqLabeling`` model and
    their configuration behind train / eval / analyze / save / load.

    ``self.model`` and ``self.p`` (the preprocessor) stay ``None`` until
    ``train`` is called or the instance is produced by ``load``.
    """

    # File names of the artifacts written by train()/save() and read by load().
    config_file = 'config.json'
    weight_file = 'model_weights.h5'
    preprocessor_file = 'preprocessor.pkl'

    def __init__(self,
                 char_emb_size=25,
                 word_emb_size=100,
                 char_lstm_units=25,
                 word_lstm_units=100,
                 dropout=0.5,
                 char_feature=True,
                 crf=True,
                 batch_size=1024,
                 optimizer='adam',
                 learning_rate=0.001,
                 lr_decay=0.9,
                 clip_gradients=5.0,
                 max_epoch=15,
                 early_stopping=True,
                 patience=3,
                 train_embeddings=True,
                 max_checkpoints_to_keep=5,
                 log_dir=None,
                 embeddings=()):
        """Store the model and training hyper-parameters.

        The first seven arguments are forwarded positionally to
        ``ModelConfig`` and the next ten to ``TrainingConfig``.

        Args:
            log_dir: directory for the preprocessor/config files written by
                ``train`` and for trainer checkpoints; ``None`` disables
                writing those files.
            embeddings: pre-trained word embeddings, filtered against the
                vocabulary in ``train``.
        """
        self.model_config = ModelConfig(char_emb_size, word_emb_size,
                                        char_lstm_units, word_lstm_units,
                                        dropout, char_feature, crf)
        self.training_config = TrainingConfig(batch_size, optimizer,
                                              learning_rate, lr_decay,
                                              clip_gradients, max_epoch,
                                              early_stopping, patience,
                                              train_embeddings,
                                              max_checkpoints_to_keep)
        self.model = None
        self.p = None
        self.log_dir = log_dir
        self.embeddings = embeddings

    def _ensure_log_dir(self):
        """Create ``self.log_dir`` if needed; return whether one is configured.

        Returns ``False`` without touching the filesystem when ``log_dir``
        is ``None`` or empty, ``True`` otherwise.
        """
        if not self.log_dir:
            return False
        if not os.path.exists(self.log_dir):
            # makedirs so that a nested log directory works as well.
            os.makedirs(self.log_dir)
            print('Successfully made a directory: {}'.format(self.log_dir))
        return True

    def train(self,
              x_train,
              y_train,
              x_valid=None,
              y_valid=None,
              vocab_init=None,
              verbose=1):
        """Fit the preprocessor, build the model and train it.

        Args:
            x_train: training inputs.
            y_train: training labels.
            x_valid: optional validation inputs, passed to the trainer.
            y_valid: optional validation labels, passed to the trainer.
            vocab_init: forwarded to ``prepare_preprocessor``.
            verbose: forwarded to ``Trainer.train``.

        Returns:
            Whatever ``Trainer.train`` returns.

        When ``log_dir`` is set, the preprocessor and model configuration
        are saved there before training starts. When it is ``None`` (the
        default) nothing is written; previously that default raised
        ``TypeError`` in ``os.path.exists``.
        """
        self.p = prepare_preprocessor(x_train, y_train, vocab_init=vocab_init)
        embeddings = filter_embeddings(self.embeddings, self.p.vocab_word,
                                       self.model_config.word_embedding_size)
        self.model_config.vocab_size = len(self.p.vocab_word)
        self.model_config.char_vocab_size = len(self.p.vocab_char)

        self.model = SeqLabeling(self.model_config, embeddings,
                                 len(self.p.vocab_tag))

        if self._ensure_log_dir():
            self.p.save(os.path.join(self.log_dir, self.preprocessor_file))
            self.model_config.save(
                os.path.join(self.log_dir, self.config_file))
            print('Successfully save config and preprocess files')

        # NOTE(review): with log_dir=None the trainer receives
        # checkpoint_path=None -- confirm Trainer accepts that.
        trainer = Trainer(self.model,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p)
        return trainer.train(x_train, y_train, x_valid, y_valid, verbose)

    def eval(self, x_test, y_test):
        """Evaluate the current model on ``x_test`` / ``y_test``.

        Returns:
            Whatever ``Evaluator.eval`` returns.

        Raises:
            OSError: if no model has been trained or loaded.
        """
        if not self.model:
            raise OSError('Could not find a model. Call load(dir_path).')
        evaluator = Evaluator(self.model, preprocessor=self.p)
        return evaluator.eval(x_test, y_test)

    def analyze(self, words):
        """Tag ``words`` with the current model and return the analysis.

        Raises:
            OSError: if no model has been trained or loaded.
        """
        if not self.model:
            raise OSError('Could not find a model. Call load(dir_path).')
        tagger = Tagger(self.model, preprocessor=self.p)
        return tagger.analyze(words)

    def save(self, dir_path):
        """Write the preprocessor, model config and weights into ``dir_path``."""
        self.p.save(os.path.join(dir_path, self.preprocessor_file))
        self.model_config.save(os.path.join(dir_path, self.config_file))
        self.model.save(os.path.join(dir_path, self.weight_file))

    @classmethod
    def load(cls, dir_path):
        """Alternate constructor: restore an instance saved with ``save``.

        Reads the preprocessor, model config and weights from ``dir_path``
        using the class-level file names.

        Returns:
            A new instance with ``p`` and ``model`` populated.
        """
        self = cls()
        self.p = WordPreprocessor.load(
            os.path.join(dir_path, cls.preprocessor_file))
        config = ModelConfig.load(os.path.join(dir_path, cls.config_file))
        # Zero-filled placeholder with the embedding matrix's shape; the
        # weights file is loaded into the model right after it is built.
        dummy_embeddings = np.zeros(
            (config.vocab_size, config.word_embedding_size), dtype=np.float32)
        self.model = SeqLabeling(config,
                                 dummy_embeddings,
                                 ntags=len(self.p.vocab_tag))
        self.model.load(filepath=os.path.join(dir_path, cls.weight_file))
        # NOTE(review): presumably pre-builds the predict function for later
        # use (e.g. from other threads) -- confirm.
        self.model._make_predict_function()

        return self