Example No. 1
def load_and_predict():
    test = pd.read_csv(config.data_folder + "test.csv",
                       converters={"pos": literal_eval})
    x_test = [x.split() for x in test['sentence'].tolist()]

    p = IndexTransformer(use_char=True)
    p = p.load('../models/best_transform.it')

    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      word_embedding_dim=300,
                      char_embedding_dim=100,
                      word_lstm_size=100,
                      char_lstm_size=50,
                      fc_dim=100,
                      dropout=0.5,
                      embeddings=None,
                      use_char=True,
                      use_crf=True)

    model, loss = model.build()
    model.load_weights('../models/' + 'best_model.h5')

    predict(model, p, x_test)
Example No. 2
def predict_with_folds(swa):
    test = pd.read_csv(config.data_folder + "test.csv",
                       converters={"pos": literal_eval})
    x_test = [x.split() for x in test['sentence'].tolist()]

    p = IndexTransformer(use_char=True)
    p = p.load('../models/best_transform.it')
    lengths = map(len, x_test)
    x_test = p.transform(x_test)

    fold_result = []
    for n_fold in range(config.nfolds):

        path = '../models/best_model_' + str(n_fold)

        if swa:
            path += '_swa'

        model = load_model(path + '.h5',
                           custom_objects={
                               'CRF': CRF,
                               'RAdam': RAdam,
                               'crf_loss': crf_loss,
                               'crf_viterbi_accuracy': crf_viterbi_accuracy
                           })
        y_pred = model.predict(x_test, verbose=True)

        fold_result.append(y_pred)

    final_pred = np.mean(fold_result, axis=0)
    y_pred = p.inverse_transform(final_pred, lengths)
    build_submission(y_pred, 'fold')
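The fold ensemble above works because every fold predicts label probabilities with the same padded shape, so `np.mean` can average them element-wise before a single `inverse_transform`. A minimal sketch of that averaging step, with made-up shapes:

import numpy as np

# Sketch of the fold-averaging step above; each fold is assumed to
# return probabilities of shape (n_samples, max_len, n_labels).
fold_result = [np.random.rand(2, 4, 3) for _ in range(5)]
final_pred = np.mean(fold_result, axis=0)  # element-wise mean over 5 folds
assert final_pred.shape == (2, 4, 3)
# argmax over the label axis then recovers one tag id per token.
tag_ids = final_pred.argmax(axis=-1)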
Example No. 3
    def test_vocab_size_lower_off(self):
        word_vocab_size = 5
        char_vocab_size = 4
        label_size = 3

        # Lowercasing is disabled, so case variants count as distinct words.
        it = IndexTransformer(lower=False)
        it.fit(self.x, self.y)
        self.assertEqual(it.word_vocab_size, word_vocab_size + 2)  # pad, unk
        self.assertEqual(it.char_vocab_size, char_vocab_size + 2)  # pad, unk
        self.assertEqual(it.label_size, label_size + 1)            # pad
Example No. 4
    def test_vocab_size_with_initial_vocab(self):
        vocab = {'aaa', 'aab', 'aac'}
        word_vocab_size = 4 + len(vocab)
        char_vocab_size = 4
        label_size = 3

        # Add initial vocab.
        it = IndexTransformer(lower=True, initial_vocab=vocab)
        it.fit(self.x, self.y)
        self.assertEqual(it.word_vocab_size, word_vocab_size + 2)  # pad, unk
        self.assertEqual(it.char_vocab_size, char_vocab_size + 2)  # pad, unk
        self.assertEqual(it.label_size, label_size + 1)            # pad
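The `+ 2` and `+ 1` offsets in these assertions come from reserved indices: the word and character vocabularies each hold a padding and an unknown token, while the label vocabulary only reserves padding. A toy sketch of that bookkeeping (not anago's actual `IndexTransformer` internals):

# Toy sketch of the reserved-id bookkeeping behind the assertions above.
def build_vocab(tokens, specials=('<pad>', '<unk>')):
    vocab = {tok: i for i, tok in enumerate(specials)}
    for tok in tokens:
        vocab.setdefault(tok, len(vocab))
    return vocab

word_vocab = build_vocab(['a', 'b', 'c', 'd', 'e'])
label_vocab = build_vocab(['O', 'B-X', 'I-X'], specials=('<pad>',))
assert len(word_vocab) == 5 + 2    # pad, unk
assert len(label_vocab) == 3 + 1   # pad only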
Example No. 5
    def test_batch_iter(self):
        X, y = load_data_and_labels(self.filename)
        batch_size = 32
        p = IndexTransformer()
        p.fit(X, y)
        gen = NERSequence(X, y, batch_size, preprocess=p.transform)

        y_gen = []
        for i in range(len(gen)):
            x1, y1 = gen[i]
            y_gen.extend(y1)
        self.assertEqual(len(y_gen), len(y))
Example No. 6
    def test_transform_without_character(self):
        # No character feature.
        it = IndexTransformer(use_char=False)
        x, y = it.fit_transform(self.x, self.y)

        # Check sequence length.
        self.assertEqual(len(x), len(self.x))
        self.assertEqual(len(y), len(self.y))

        # Check sequence type.
        self.assertIsInstance(x, np.ndarray)
        self.assertIsInstance(y, np.ndarray)
Example No. 7
    def test_train_no_character(self):
        p = IndexTransformer(use_char=False)
        p.fit(self.x_train, self.y_train)
        model = BiLSTMCRF(word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          use_crf=False,
                          use_char=False)
        model, loss = model.build()
        model.compile(loss=loss, optimizer='adam')
        trainer = Trainer(model, preprocessor=p)
        trainer.train(self.x_train,
                      self.y_train,
                      x_valid=self.x_valid,
                      y_valid=self.y_valid)
Example No. 8
def char_feature():
    word_map = get_word_dict()
    result = []
    c = IndexTransformer()
    for item in word_map.keys():
        vec = []
        train = [[item]]
        word_vec = c.transform(train)
        print(word_vec)
        vec.append(item)
        meta_vec = word_vec[0][0][0]
        meta_vec = meta_vec.tolist()
        vec.append(meta_vec)
        result.append(vec)
        print(result)
Example No. 9
    def test_transform_with_character(self):
        # With character feature.
        it = IndexTransformer(use_char=True)
        X, y = it.fit_transform(self.x, self.y)
        words, chars, length = X

        # Check sequence length.
        self.assertEqual(len(words), len(self.x))
        self.assertEqual(len(chars), len(self.x))
        self.assertEqual(len(y), len(self.y))

        # Check sequence type.
        self.assertIsInstance(words, np.ndarray)
        self.assertIsInstance(chars, np.ndarray)
        self.assertIsInstance(y, np.ndarray)
Example No. 10
    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        if not (os.path.exists(weights_file) and
                os.path.exists(params_file) and
                os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")

        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = IndexTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
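Given the artifact layout this `load` expects (weights.h5, params.json, and preprocessor.pkl under one directory), a save/load round trip looks roughly like the sketch below; the class name matches the `BiLstmCrfNER` wrapper shown in Example No. 30, and the path is illustrative:

# Hypothetical round trip over the directory contract above.
ner = BiLstmCrfNER()
ner.fit(X, y)                       # X, y: lists of token/tag lists
ner.save("models/bilstm_crf")       # writes the three artifact files
ner = BiLstmCrfNER().load("models/bilstm_crf")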
Example No. 11
    def test_batch_iter(self):
        X, y = load_data_and_labels(self.filename)
        batch_size = 32
        p = IndexTransformer()
        p.fit(X, y)
        steps, generator = batch_iter(X,
                                      y,
                                      batch_size,
                                      shuffle=False,
                                      preprocessor=p)

        y_gen = []
        for _ in range(steps):
            x1, y1 = next(generator)
            y_gen.extend(y1)
        self.assertEqual(len(y_gen), len(y))
Example No. 12
    def fit(self, x_train, y_train, x_valid=None, y_valid=None,
            epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        """Fit the model for a fixed number of epochs.

        Args:
            x_train: list of training data.
            y_train: list of training target (label) data.
            x_valid: list of validation data.
            y_valid: list of validation target (label) data.
            epochs: Integer. Number of epochs to train the model.
            batch_size: Integer.
                Number of samples per gradient update.
                If unspecified, `batch_size` will default to 32.
            verbose: Integer. 0, 1, or 2. Verbosity mode.
                0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
            shuffle: Boolean (whether to shuffle the training data
                before each epoch). `shuffle` will default to True.
        """
        p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
        p.fit(x_train, y_train)
        embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab, self.word_embedding_dim)

        model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                          word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          word_embedding_dim=self.word_embedding_dim,
                          char_embedding_dim=self.char_embedding_dim,
                          word_lstm_size=self.word_lstm_size,
                          char_lstm_size=self.char_lstm_size,
                          fc_dim=self.fc_dim,
                          dropout=self.dropout,
                          embeddings=embeddings,
                          use_char=self.use_char,
                          use_crf=self.use_crf)
        model, loss = model.build()
        model.compile(loss=loss, optimizer=self.optimizer)

        trainer = Trainer(model, preprocessor=p)
        trainer.train(x_train, y_train, x_valid, y_valid,
                      epochs=epochs, batch_size=batch_size,
                      verbose=verbose, callbacks=callbacks,
                      shuffle=shuffle)

        self.p = p
        self.model = model
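A minimal sketch of driving the `fit` above end to end, assuming it belongs to an anago-style wrapper class (called `Sequence` here, as in anago) whose constructor fills the attributes the body reads (`initial_vocab`, `use_char`, `embeddings`, the layer sizes, and `optimizer`):

# Hypothetical usage of the fit() above; `Sequence` is the assumed
# wrapper class that owns this method.
x_train = [['President', 'Obama', 'is', 'speaking', '.'],
           ['He', 'lives', 'in', 'Washington', '.']]
y_train = [['O', 'B-PER', 'O', 'O', 'O'],
           ['O', 'O', 'O', 'B-LOC', 'O']]

wrapper = Sequence()
wrapper.fit(x_train, y_train, epochs=1, batch_size=2)
# After fitting, the trained pieces are exposed as attributes:
preds = wrapper.model.predict(wrapper.p.transform(x_train))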
Example No. 13
    def fit(self, X, y):
        """ Trains the NER model. Input is list of list of tokens and tags.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        log.info("Preprocessing dataset...")
        self.preprocessor_ = IndexTransformer(use_char=self.use_char)
        self.preprocessor_.fit(X, y)

        log.info("Building model...")
        self.model_ = BiLSTMCRF(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            dropout=self.dropout,
            use_char=self.use_char,
            use_crf=self.use_crf)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X, y, 
            test_size=0.1, random_state=42)
        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
            batch_size=self.batch_size, epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
Example No. 14
    def test_transform_unknown_token(self):
        it = IndexTransformer()
        it.fit(self.x, self.y)

        x_train, y_train = [['aaa']], [['X']]
        X, y = it.transform(x_train, y_train)
        words, chars, length = X

        # Check sequence length.
        self.assertEqual(len(words), len(x_train))
        self.assertEqual(len(chars), len(x_train))
        self.assertEqual(len(y), len(y_train))

        # Check sequence type.
        self.assertIsInstance(words, np.ndarray)
        self.assertIsInstance(chars, np.ndarray)
        self.assertIsInstance(y, np.ndarray)
Example No. 15
    def setUp(self):
        # Load datasets.
        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        self.x_train, self.y_train = load_data_and_labels(train_path)
        self.x_valid, self.y_valid = load_data_and_labels(valid_path)

        # Fit transformer.
        self.p = IndexTransformer()
        self.p.fit(self.x_train, self.y_train)

        # Build a model.
        self.model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                               word_vocab_size=self.p.word_vocab_size,
                               num_labels=self.p.label_size)
        self.model, loss = self.model.build()
        self.model.compile(loss=loss, optimizer='adam')
Example No. 16
def evaluate(swa):
    train = pd.read_csv(config.data_folder + "train.csv", converters={"pos": literal_eval, "tag": literal_eval})
    x_train = [x.split() for x in train['sentence'].tolist()]
    y_train = train['tag'].tolist()

    p = IndexTransformer(use_char=True)
    p = p.load('../models/best_transform.it')

    oof_data = []
    oof_data_pred = []

    skf = KFold(n_splits=config.nfolds, random_state=config.seed, shuffle=True)

    for n_fold, (train_indices, val_indices) in enumerate(skf.split(x_train)):

        x_val = list(np.array(x_train)[val_indices])
        y_val = list(np.array(y_train)[val_indices])
        print(y_val[:5])
        oof_data.extend([x for line in y_val for x in line])
        print(oof_data[:5])
        lengths = map(len, x_val)
        x_val = p.transform(x_val)

        path = '../models/best_model_' + str(n_fold)

        if swa:
            path += '_swa'

        model = load_model(path + '.h5',
                           custom_objects={'CRF': CRF,
                                           'RAdam': RAdam,
                                           'crf_loss': crf_loss,
                                           'crf_viterbi_accuracy': crf_viterbi_accuracy})

        # model.load_weights('../models/best_model_' + str(n_fold) + '.h5')

        y_pred = model.predict(x_val,
                               verbose=True)
        print(y_pred[:5])
        y_pred = p.inverse_transform(y_pred, lengths)
        print(y_pred[:5])
        oof_data_pred.extend([pred for line in y_pred for pred in line])
        print(oof_data_pred[:5])

    bacc = balanced_accuracy_score(oof_data, oof_data_pred)
    print("Final CV: ", bacc * 100)
Example No. 17
    def load(cls, weights_file, params_file, preprocessor_file):
        self = cls()
        self.p = IndexTransformer.load(preprocessor_file)
        self.model = load_model(weights_file, params_file)
        # Added by Sonvx on Jan 14, 2021: fix issue ("<tensor> is not an element of this graph." when loading model)
        self.model._make_predict_function()

        return self
Example No. 18
    def __init__(self, process_proper_nouns=False):
        super().__init__(process_proper_nouns)
        model = load_model(os.path.join(ELMO_TAGGER_PATH, 'weights.h5'),
                           os.path.join(ELMO_TAGGER_PATH, 'params.json'))
        it = IndexTransformer.load(
            os.path.join(ELMO_TAGGER_PATH, 'preprocessor.pkl'))
        self.pos_tagger = Tagger(model,
                                 preprocessor=it,
                                 tokenizer=wordpunct_tokenize)
Example No. 19
def main(args):
    print('Loading objects...')
    model = BiLSTMCRF.load(args.weights_file, args.params_file)
    it = IndexTransformer.load(args.preprocessor_file)
    tagger = Tagger(model, preprocessor=it)

    print('Tagging a sentence...')
    res = tagger.analyze(args.sent)
    pprint(res)
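For reference, anago's `Tagger.analyze` returns a JSON-like dict; the pretty-printed result is roughly of the form below, where the offsets and scores are illustrative, not real output:

# Illustrative shape of tagger.analyze(args.sent) output for a sentence
# like 'President Obama is speaking at the White House.'
res = {
    'words': ['President', 'Obama', 'is', 'speaking',
              'at', 'the', 'White', 'House', '.'],
    'entities': [
        {'text': 'Obama', 'type': 'PER',
         'score': 1.0, 'beginOffset': 1, 'endOffset': 2},
        {'text': 'White House', 'type': 'LOC',
         'score': 1.0, 'beginOffset': 6, 'endOffset': 8},
    ],
}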
Example No. 20
def main(args):
    print('Loading datasets...')
    X, y = load_data_and_labels(args.data_path)
    x_train, x_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.1,
                                                          random_state=42)
    embeddings = KeyedVectors.load(args.embedding_path).wv

    print('Transforming datasets...')
    p = IndexTransformer()
    p.fit(X, y)
    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab,
                                   embeddings.vector_size)

    print('Building a model...')
    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      embeddings=embeddings,
                      char_embedding_dim=50)
    model.build()

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
Example No. 21
    def test_inverse_transform_unknown_token(self):
        x_train, y_train = [['a', 'b']], [['X', 'O']]
        it = IndexTransformer()
        it.fit(self.x, self.y)
        _, y = it.transform(x_train, y_train)
        inv_y = it.inverse_transform(y)
        self.assertNotEqual(inv_y, self.y)
Example No. 22
    def test_inverse_transform_one_cat(self):
        x_train, y_train = [['a']], [['O']]
        it = IndexTransformer()
        it.fit(self.x, self.y)
        _, y = it.transform(x_train, y_train)
        inv_y = it.inverse_transform(y)
        self.assertNotEqual(inv_y, self.y)
Example No. 23
def main(args):
    print('Loading dataset...')
    x_train, y_train = load_data_and_labels(args.train_data)
    x_valid, y_valid = load_data_and_labels(args.valid_data)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=args.no_char_feature)
    p.fit(x_train, y_train)

    print('Building a model.')
    model = BiLSTMCRF(char_embedding_dim=args.char_emb_size,
                      word_embedding_dim=args.word_emb_size,
                      char_lstm_size=args.char_lstm_units,
                      word_lstm_size=args.word_lstm_units,
                      char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      dropout=args.dropout,
                      use_char=args.no_char_feature,
                      use_crf=args.no_use_crf)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
Example No. 24
    def setUpClass(cls):
        weights_file = os.path.join(SAVE_ROOT, 'weights.h5')
        params_file = os.path.join(SAVE_ROOT, 'params.json')
        preprocessor_file = os.path.join(SAVE_ROOT, 'preprocessor.pickle')

        # Load preprocessor
        p = IndexTransformer.load(preprocessor_file)

        # Load the model.
        model = load_model(weights_file, params_file)

        # Build a tagger
        cls.tagger = anago.Tagger(model, preprocessor=p)

        cls.sent = 'President Obama is speaking at the White House.'
Example No. 25
    def test_save_and_load(self):
        it = IndexTransformer(lower=False)
        x1, y1 = it.fit_transform(self.x, self.y)
        x1_word, x1_char, x1_length = x1

        self.assertFalse(os.path.exists(self.preprocessor_file))
        it.save(self.preprocessor_file)
        self.assertTrue(os.path.exists(self.preprocessor_file))

        it = IndexTransformer.load(self.preprocessor_file)
        x2, y2 = it.transform(self.x, self.y)
        x2_word, x2_char, x2_length = x2

        np.testing.assert_array_equal(x1_word, x2_word)
        np.testing.assert_array_equal(x1_char, x2_char)
        np.testing.assert_array_equal(y1, y2)
Example No. 26
    def test_inverse_transform(self):
        it = IndexTransformer()
        x, y = it.fit_transform(self.x, self.y)
        _, _, length = x
        inv_y = it.inverse_transform(y, length)
        self.assertEqual(inv_y, self.y)
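The `length` argument matters because `transform` pads every sequence to a common length, and `inverse_transform` uses the true lengths to strip that padding before mapping ids back to tags. A toy sketch of that truncation step (the ids and id-to-tag mapping are made up, not anago's internals):

# Toy sketch of padding removal in an inverse transform.
padded_ids = [[2, 3, 0, 0], [3, 0, 0, 0]]   # 0 = pad id
lengths = [2, 1]
id2label = {2: 'B-PER', 3: 'O'}
tags = [[id2label[i] for i in row[:n]]
        for row, n in zip(padded_ids, lengths)]
assert tags == [['B-PER', 'O'], ['O']]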
Example No. 27
def training(train, test):
    x_train = [x.split() for x in train['sentence'].tolist()]
    y_train = train['tag'].tolist()

    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      train_size=0.8,
                                                      random_state=233)

    print('Transforming datasets...')
    p = IndexTransformer(use_char=True)
    p.fit(x_train, y_train)

    embeddings = load_glove(config.glove_file)

    embeddings = filter_embeddings(embeddings, p._word_vocab.vocab,
                                   config.glove_size)

    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      word_embedding_dim=300,
                      char_embedding_dim=100,
                      word_lstm_size=100,
                      char_lstm_size=50,
                      fc_dim=100,
                      dropout=0.5,
                      embeddings=embeddings,
                      use_char=True,
                      use_crf=True)

    opt = Adam(lr=0.001)
    model, loss = model.build()
    model.compile(loss=loss, optimizer=opt, metrics=[crf_viterbi_accuracy])

    filepath = '../models/' + 'best_model'
    ckp = ModelCheckpoint(filepath + '.h5',
                          monitor='val_crf_viterbi_accuracy',
                          verbose=1,
                          save_best_only=True,
                          mode='max',
                          save_weights_only=True)

    es = EarlyStopping(monitor='val_crf_viterbi_accuracy',
                       min_delta=0.00001,
                       patience=3,
                       verbose=1,
                       mode='max')
    rlr = ReduceLROnPlateau(monitor='val_crf_viterbi_accuracy',
                            factor=0.2,
                            patience=2,
                            verbose=1,
                            mode='max',
                            min_delta=0.0001)

    callbacks = [ckp, es, rlr]

    train_seq = NERSequence(x_train, y_train, config.batch_size, p.transform)

    valid_seq = None
    if x_val and y_val:
        valid_seq = NERSequence(x_val, y_val, config.batch_size, p.transform)
        f1 = F1score(valid_seq, preprocessor=p)
        callbacks.append(f1)

    model.fit_generator(generator=train_seq,
                        validation_data=valid_seq,
                        epochs=config.nepochs,
                        callbacks=callbacks,
                        verbose=True,
                        shuffle=True,
                        use_multiprocessing=True,
                        workers=42)
Example No. 28
    def load(cls, weights_file, params_file, preprocessor_file):
        self = cls()
        self.p = IndexTransformer.load(preprocessor_file)
        self.model = load_model(weights_file, params_file)

        return self
Example No. 29
        "wiki_cbow_100/wikipedia_cbow_100").wv
    train_path = '../../data/collected/NER/train.txt'
    valid_path = '../../data/collected/NER/valid.txt'

    print('Loading data...')
    x_train, y_train = load_data_and_labels(train_path)
    x_valid, y_valid = load_data_and_labels(valid_path)
    print("got ", len(x_train), " entries for training and ", len(x_valid),
          " entries for testing")
    entities = set()
    for s in y_train:
        for w in s:
            entities.add(w)
    print("Defined entities are :", entities)

    preprocessor = IndexTransformer(use_char=True)
    x = x_train + x_valid
    y = y_train + y_valid
    preprocessor.fit(x, y)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'valid sequences')

    embeddings = filter_embeddings(wv_model, preprocessor._word_vocab.vocab,
                                   wv_model.vector_size)
    # Use pre-trained word embeddings

    model = anago.models.BiLSTMCRF(
        embeddings=embeddings,
        use_crf=False,
        use_char=True,
        num_labels=preprocessor.label_size,
Example No. 30
class BiLstmCrfNER(NERModel):

    def __init__(self,
            word_embedding_dim=100,
            char_embedding_dim=25,
            word_lstm_size=100,
            char_lstm_size=25,
            fc_dim=100,
            dropout=0.5,
            embeddings=None,
            use_char=True,
            use_crf=True,
            batch_size=16, 
            learning_rate=0.001, 
            max_iter=10):
        """ Construct a BiLSTM-CRF NER model. Model is augmented with character
            level embeddings as well as word embeddings by default. Implementation 
            is provided by the Anago project.

            Parameters
            ----------
            word_embedding_dim : int, optional, default 100
                word embedding dimensions.
            char_embedding_dim : int, optional, default 25
                character embedding dimensions.
            word_lstm_size : int, optional, default 100
                word-level tagger LSTM output dimensions.
            char_lstm_size : int, optional, default 25
                character LSTM feature extractor output dimensions.
            fc_dim : int, optional, default 100
                output fully-connected layer size.
            dropout : float, optional, default 0.5
                dropout rate.
            embeddings : numpy array
                word embedding matrix.
            use_char : bool, optional, default True
                add char feature.
            use_crf : bool, optional, default True
                use crf as last layer.
            batch_size : int, optional, default 16
                training batch size.
            learning_rate : float, optional, default 0.001
                learning rate for Adam optimizer
            max_iter : int
                number of epochs of training

            Attributes
            ----------
            preprocessor_ : reference to preprocessor
            model_ : reference to generated model
            trainer_ : internal reference to Anago Trainer (model)
            tagger_ : internal reference to Anago Tagger (predictor)
        """
        super().__init__()
        self.word_embedding_dim = word_embedding_dim
        self.char_embedding_dim = char_embedding_dim
        self.word_lstm_size = word_lstm_size
        self.char_lstm_size = char_lstm_size
        self.fc_dim = fc_dim
        self.dropout = dropout
        self.embeddings = embeddings
        self.use_char = use_char
        self.use_crf = use_crf
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        # populated by fit() and load(), expected by save() and transform()
        self.preprocessor_ = None
        self.model_ = None
        self.trainer_ = None
        self.tagger_ = None


    def fit(self, X, y):
        """ Trains the NER model. Input is list of list of tokens and tags.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens
            y : list(list(str))
                list of list of BIO tags

            Returns
            -------
            self
        """
        log.info("Preprocessing dataset...")
        self.preprocessor_ = IndexTransformer(use_char=self.use_char)
        self.preprocessor_.fit(X, y)

        log.info("Building model...")
        self.model_ = BiLSTMCRF(
            char_embedding_dim=self.char_embedding_dim,
            word_embedding_dim=self.word_embedding_dim,
            char_lstm_size=self.char_lstm_size,
            word_lstm_size=self.word_lstm_size,
            char_vocab_size=self.preprocessor_.char_vocab_size,
            word_vocab_size=self.preprocessor_.word_vocab_size,
            num_labels=self.preprocessor_.label_size,
            dropout=self.dropout,
            use_char=self.use_char,
            use_crf=self.use_crf)
        self.model_, loss = self.model_.build()
        optimizer = Adam(lr=self.learning_rate)
        self.model_.compile(loss=loss, optimizer=optimizer)
        self.model_.summary()

        log.info('Training the model...')
        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)

        x_train, x_valid, y_train, y_valid = train_test_split(X, y, 
            test_size=0.1, random_state=42)
        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
            batch_size=self.batch_size, epochs=self.max_iter)

        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self


    def predict(self, X):
        """ Predicts using the NER model.

            Parameters
            ----------
            X : list(list(str))
                list of list of tokens.

            Returns
            -------
            y : list(list(str))
                list of list of predicted BIO tags.
        """
        if self.tagger_ is None:
            raise ValueError("No tagger found, either run fit() to train or load() a trained model")

        log.info("Predicting from model...")
        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
        return ypreds


    def save(self, dirpath):
        """ Saves model to local disk, given a dirpath 
        
            Parameters
            ----------
            dirpath : str
                a directory where model artifacts will be saved.
                Model saves a weights.h5 weights file, a params.json parameter
                file, and a preprocessor.pkl preprocessor file.

            Returns
            -------
            None
        """
        if self.model_ is None or self.preprocessor_ is None:
            raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model")

        if not os.path.exists(dirpath):
            os.makedirs(dirpath)

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        save_model(self.model_, weights_file, params_file)
        self.preprocessor_.save(preprocessor_file)

        write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml"))


    def load(self, dirpath):
        """ Loads a trained model from local disk, given the dirpath

            Parameters
            ----------
            dirpath : str
                a directory where model artifacts are saved.

            Returns
            -------
            self
        """
        if not os.path.exists(dirpath):
            raise ValueError("Model directory not found: {:s}".format(dirpath))

        weights_file = os.path.join(dirpath, "weights.h5")
        params_file = os.path.join(dirpath, "params.json")
        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")

        if not (os.path.exists(weights_file) and
                os.path.exists(params_file) and
                os.path.exists(preprocessor_file)):
            raise ValueError("Model files may be corrupted, exiting")

        self.model_ = load_model(weights_file, params_file)
        self.preprocessor_ = IndexTransformer.load(preprocessor_file)
        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)

        return self
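Tying the class together, a typical lifecycle under the API defined above; the path is illustrative, and the repeated toy sentences stand in for a corpus large enough to survive the internal 90/10 split in fit():

# Hypothetical end-to-end run of the BiLstmCrfNER class defined above.
X = [['President', 'Obama', 'is', 'speaking', '.']] * 20   # toy corpus
y = [['O', 'B-PER', 'O', 'O', 'O']] * 20

ner = BiLstmCrfNER(max_iter=2)
ner.fit(X, y)                      # fit preprocessor, build and train model
ner.save("models/bilstm_crf")      # weights.h5, params.json, preprocessor.pkl
ner = BiLstmCrfNER().load("models/bilstm_crf")
y_pred = ner.predict(X)            # list of list of predicted BIO tags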