def test_vocab_size_lower_off(self):
    """With lowercasing off, vocab sizes still account for pad/unk tokens."""
    expected_words = 5
    expected_chars = 4
    expected_labels = 3

    # lower is not effective.
    transformer = IndexTransformer(lower=False)
    transformer.fit(self.x, self.y)

    # Words and chars gain two reserved entries (pad, unk); labels gain one (pad).
    self.assertEqual(transformer.word_vocab_size, expected_words + 2)
    self.assertEqual(transformer.char_vocab_size, expected_chars + 2)
    self.assertEqual(transformer.label_size, expected_labels + 1)
def test_transform_without_character(self):
    """Without the character feature, transform yields plain word/label arrays."""
    transformer = IndexTransformer(use_char=False)
    x, y = transformer.fit_transform(self.x, self.y)

    # Output lengths mirror the raw inputs.
    self.assertEqual(len(x), len(self.x))
    self.assertEqual(len(y), len(self.y))

    # Outputs are numpy arrays.
    self.assertIsInstance(x, np.ndarray)
    self.assertIsInstance(y, np.ndarray)
def test_vocab_size_with_initial_vocab(self):
    """Seeding an initial vocabulary enlarges the word vocab by its size."""
    seed_vocab = {'aaa', 'aab', 'aac'}
    expected_words = 4 + len(seed_vocab)
    expected_chars = 4
    expected_labels = 3

    # Add initial vocab.
    transformer = IndexTransformer(lower=True, initial_vocab=seed_vocab)
    transformer.fit(self.x, self.y)

    self.assertEqual(transformer.word_vocab_size, expected_words + 2)  # pad, unk
    self.assertEqual(transformer.char_vocab_size, expected_chars + 2)  # pad, unk
    self.assertEqual(transformer.label_size, expected_labels + 1)      # pad
def test_batch_iter(self):
    """Walking every batch of NERSequence yields one label row per example."""
    X, y = load_data_and_labels(self.filename)
    batch_size = 32
    preprocessor = IndexTransformer()
    preprocessor.fit(X, y)

    sequence = NERSequence(X, y, batch_size, preprocess=preprocessor.transform)

    collected_labels = []
    for index in range(len(sequence)):
        _, batch_labels = sequence[index]
        collected_labels.extend(batch_labels)

    self.assertEqual(len(collected_labels), len(y))
def test_train_no_character(self):
    """Training completes when the character feature is disabled."""
    preprocessor = IndexTransformer(use_char=False)
    preprocessor.fit(self.x_train, self.y_train)

    model = BiLSTMCRF(word_vocab_size=preprocessor.word_vocab_size,
                      num_labels=preprocessor.label_size,
                      use_crf=False,
                      use_char=False)
    model, loss = model.build()
    model.compile(loss=loss, optimizer='adam')

    trainer = Trainer(model, preprocessor=preprocessor)
    trainer.train(self.x_train, self.y_train,
                  x_valid=self.x_valid, y_valid=self.y_valid)
def test_transform_with_character(self):
    """With the character feature, transform yields word, char and label arrays."""
    transformer = IndexTransformer(use_char=True)
    X, y = transformer.fit_transform(self.x, self.y)
    words, chars = X

    # Output lengths mirror the raw inputs.
    self.assertEqual(len(words), len(self.x))
    self.assertEqual(len(chars), len(self.x))
    self.assertEqual(len(y), len(self.y))

    # Outputs are numpy arrays.
    self.assertIsInstance(words, np.ndarray)
    self.assertIsInstance(chars, np.ndarray)
    self.assertIsInstance(y, np.ndarray)
def setUp(self):
    """Load the datasets, fit the preprocessor and build a compiled model."""
    # Load datasets.
    self.x_train, self.y_train = load_data_and_labels(
        os.path.join(DATA_ROOT, 'train.txt'))
    self.x_valid, self.y_valid = load_data_and_labels(
        os.path.join(DATA_ROOT, 'valid.txt'))

    # Fit transformer.
    self.p = IndexTransformer()
    self.p.fit(self.x_train, self.y_train)

    # Build and compile a model.
    network = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                        word_vocab_size=self.p.word_vocab_size,
                        num_labels=self.p.label_size)
    self.model, loss = network.build()
    self.model.compile(loss=loss, optimizer='adam')
def test_transform_unknown_token(self):
    """Tokens and labels unseen at fit time still transform into arrays."""
    transformer = IndexTransformer()
    transformer.fit(self.x, self.y)

    x_train, y_train = [['aaa']], [['X']]
    X, y = transformer.transform(x_train, y_train)
    words, chars = X

    # Output lengths mirror the given inputs.
    self.assertEqual(len(words), len(x_train))
    self.assertEqual(len(chars), len(x_train))
    self.assertEqual(len(y), len(y_train))

    # Outputs are numpy arrays.
    self.assertIsInstance(words, np.ndarray)
    self.assertIsInstance(chars, np.ndarray)
    self.assertIsInstance(y, np.ndarray)
def fit(self, x_train, y_train, x_valid=None, y_valid=None,
        epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
    """Fit the model for a fixed number of epochs.

    Args:
        x_train: list of training data.
        y_train: list of training label data.
        x_valid: list of validation data.
        y_valid: list of validation label data.
        epochs: Integer. Number of epochs to train the model.
        batch_size: Integer. Number of samples per gradient update.
        verbose: Integer. 0 = silent, 1 = progress bar,
            2 = one line per epoch.
        callbacks: List of `keras.callbacks.Callback` instances.
        shuffle: Boolean. Indicate whether to shuffle the training data
            before each epoch.
    """
    # Build the vocabularies from the training data.
    p = IndexTransformer(initial_vocab=self.initial_vocab, use_char=self.use_char)
    p.fit(x_train, y_train)
    # Keep only the pretrained vectors for words actually in the vocab.
    embeddings = filter_embeddings(self.embeddings, p._word_vocab.vocab,
                                   self.word_embedding_dim)

    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      word_embedding_dim=self.word_embedding_dim,
                      char_embedding_dim=self.char_embedding_dim,
                      word_lstm_size=self.word_lstm_size,
                      char_lstm_size=self.char_lstm_size,
                      fc_dim=self.fc_dim,
                      dropout=self.dropout,
                      embeddings=embeddings,
                      use_char=self.use_char,
                      use_crf=self.use_crf)
    model, loss = model.build()
    model.compile(loss=loss, optimizer=self.optimizer)

    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid,
                  epochs=epochs, batch_size=batch_size,
                  verbose=verbose, callbacks=callbacks,
                  shuffle=shuffle)

    self.p = p
    self.model = model
def test_inverse_transform_one_cat(self):
    """Inverse-transforming a single-category sequence does not equal self.y."""
    x_train, y_train = [['a']], [['O']]
    transformer = IndexTransformer()
    transformer.fit(self.x, self.y)

    _, encoded = transformer.transform(x_train, y_train)
    decoded = transformer.inverse_transform(encoded)

    self.assertNotEqual(decoded, self.y)
def test_inverse_transform_unknown_token(self):
    """Inverse-transforming a sequence with an unknown label does not equal self.y."""
    x_train, y_train = [['a', 'b']], [['X', 'O']]
    transformer = IndexTransformer()
    transformer.fit(self.x, self.y)

    _, encoded = transformer.transform(x_train, y_train)
    decoded = transformer.inverse_transform(encoded)

    self.assertNotEqual(decoded, self.y)
def setUpClass(cls):
    """Restore a saved preprocessor/model pair and build the shared tagger."""
    weights_file = os.path.join(SAVE_ROOT, 'weights.h5')
    params_file = os.path.join(SAVE_ROOT, 'params.json')
    preprocessor_file = os.path.join(SAVE_ROOT, 'preprocessor.pickle')

    # Load preprocessor, then the model.
    preprocessor = IndexTransformer.load(preprocessor_file)
    model = load_model(weights_file, params_file)

    # Build a tagger shared by every test in the class.
    cls.tagger = polygo.Tagger(model, preprocessor=preprocessor)
    cls.sent = 'President Obama is speaking at the White House.'
def test_save_and_load(self):
    """A reloaded transformer reproduces the original transform exactly."""
    transformer = IndexTransformer(lower=False)
    x_before, y_before = transformer.fit_transform(self.x, self.y)
    words_before, chars_before = x_before

    # Round-trip the transformer through disk.
    self.assertFalse(os.path.exists(self.preprocessor_file))
    transformer.save(self.preprocessor_file)
    self.assertTrue(os.path.exists(self.preprocessor_file))
    restored = IndexTransformer.load(self.preprocessor_file)

    x_after, y_after = restored.transform(self.x, self.y)
    words_after, chars_after = x_after

    np.testing.assert_array_equal(words_before, words_after)
    np.testing.assert_array_equal(chars_before, chars_after)
    np.testing.assert_array_equal(y_before, y_after)
class TestTrainer(unittest.TestCase):
    """End-to-end training tests for Trainer under various configurations."""

    @classmethod
    def setUpClass(cls):
        # Make sure the output directories exist and define the save paths.
        if not os.path.exists(LOG_ROOT):
            os.mkdir(LOG_ROOT)
        if not os.path.exists(SAVE_ROOT):
            os.mkdir(SAVE_ROOT)
        cls.weights_file = os.path.join(SAVE_ROOT, 'weights.h5')
        cls.params_file = os.path.join(SAVE_ROOT, 'params.json')
        cls.preprocessor_file = os.path.join(SAVE_ROOT, 'preprocessor.pickle')

    def setUp(self):
        # Load datasets.
        self.x_train, self.y_train = load_data_and_labels(
            os.path.join(DATA_ROOT, 'train.txt'))
        self.x_valid, self.y_valid = load_data_and_labels(
            os.path.join(DATA_ROOT, 'valid.txt'))

        # Fit transformer.
        self.p = IndexTransformer()
        self.p.fit(self.x_train, self.y_train)

        # Build and compile a model.
        network = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                            word_vocab_size=self.p.word_vocab_size,
                            num_labels=self.p.label_size)
        self.model, loss = network.build()
        self.model.compile(loss=loss, optimizer='adam')

    def test_train(self):
        """Training with a validation set completes."""
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train,
                      x_valid=self.x_valid, y_valid=self.y_valid)

    def test_train_no_valid(self):
        """Training without a validation set completes."""
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train)

    def test_train_no_crf(self):
        """Training a model without the CRF output layer completes."""
        model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                          word_vocab_size=self.p.word_vocab_size,
                          num_labels=self.p.label_size,
                          use_crf=False)
        model, loss = model.build()
        model.compile(loss=loss, optimizer='adam')
        trainer = Trainer(model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train,
                      x_valid=self.x_valid, y_valid=self.y_valid)

    def test_train_no_character(self):
        """Training without the character feature completes."""
        p = IndexTransformer(use_char=False)
        p.fit(self.x_train, self.y_train)
        model = BiLSTMCRF(word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          use_crf=False,
                          use_char=False)
        model, loss = model.build()
        model.compile(loss=loss, optimizer='adam')
        trainer = Trainer(model, preprocessor=p)
        trainer.train(self.x_train, self.y_train,
                      x_valid=self.x_valid, y_valid=self.y_valid)

    def test_save(self):
        """A trained model and its preprocessor can be saved to disk."""
        # Train the model.
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train)

        # Save the model and the preprocessor.
        save_model(self.model, self.weights_file, self.params_file)
        self.p.save(self.preprocessor_file)
def load(cls, weights_file, params_file, preprocessor_file):
    """Build an instance from previously saved artifacts.

    Args:
        weights_file: path to the saved model weights.
        params_file: path to the saved model parameters.
        preprocessor_file: path to the pickled preprocessor.

    Returns:
        A new instance with its preprocessor and model restored.
    """
    instance = cls()
    instance.p = IndexTransformer.load(preprocessor_file)
    instance.model = load_model(weights_file, params_file)
    return instance
def test_inverse_transform(self):
    """transform followed by inverse_transform reproduces the labels."""
    transformer = IndexTransformer()
    _, y = transformer.fit_transform(self.x, self.y)

    # Materialize the lengths: a bare `map` object is a single-use iterator,
    # so it would be silently exhausted if inverse_transform iterated it
    # more than once (or checked it after a first pass).
    lengths = [len(seq) for seq in self.y]
    inv_y = transformer.inverse_transform(y, lengths)

    self.assertEqual(inv_y, self.y)