def test_train(self):
    """End-to-end training smoke test for CharNER on the TSV dataset fixture.

    Loads the dataset, splits it 70/30, fits a preprocessor, builds the
    model, and runs one full Trainer.train() cycle.
    """
    from sklearn.model_selection import train_test_split

    model_config = ModelConfig()
    training_config = TrainingConfig()

    path = os.path.join(DATA_ROOT, 'dataset.tsv')
    X, y = load_data_and_labels(path)
    x_train, x_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.3, random_state=42)

    # NOTE(review): the preprocessor is fitted on the FULL dataset (train +
    # valid). For a char-level vocabulary this avoids unknown characters at
    # validation time, but it leaks validation vocabulary into training —
    # confirm this is intended.
    p = prepare_preprocessor(X, y)
    p.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

    model_file = os.path.join(SAVE_ROOT, 'model.h5')
    model = CharNER(model_config, p.vocab_size(), p.tag_size())
    trainer = namaco.Trainer(model,
                             model.loss,
                             training_config,
                             log_dir=LOG_ROOT,
                             save_path=model_file,
                             preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)
def test_predict(self):
    """Prediction smoke test: preprocess the fixture data and run a forward pass."""
    data, labels = load_data_and_labels(self.filename)
    p = prepare_preprocessor(data, labels)
    self.model_config.vocab_size = len(p.vocab_char)
    ner = CharNER(self.model_config, p.vocab_size(), p.tag_size())
    ner.predict(p.transform(data))
def test_unknown_word(self):
    """An out-of-vocabulary token must be mapped to the UNK character id."""
    X, y = reader.load_data_and_labels(self.filename)
    p = Preprocessor(padding=False, return_lengths=False).fit(X, y)
    transformed = p.transform([['$unknownword$']])
    self.assertEqual(transformed[0][0], p.vocab_char[UNK])
def test_batch_iter(self):
    """Consume one full epoch of batches from batch_iter without error."""
    sentences, tags = load_data_and_labels(self.filename)
    p = prepare_preprocessor(sentences, tags)
    steps, generator = batch_iter(sentences, tags, 32, preprocessor=p)
    for _ in range(steps):
        next(generator)
def test_transform(self):
    """Without padding, transform yields plain Python int ids for chars and tags."""
    X, y = reader.load_data_and_labels(self.filename)
    p = Preprocessor(padding=False, return_lengths=False).fit(X, y)
    X, y = p.transform(X, y)
    # First character id / first tag id of the first sentence.
    first_char_id, first_tag_id = X[0][0], y[0][0]
    self.assertIsInstance(first_char_id, int)
    self.assertIsInstance(first_tag_id, int)
def test_fit(self):
    """fit() registers PAD/UNK and keeps every real vocab entry a single char."""
    X, y = reader.load_data_and_labels(self.filename)
    p = Preprocessor(padding=False).fit(X, y)
    self.assertIn(PAD, p.vocab_char)
    self.assertIn(UNK, p.vocab_char)
    self.assertIn(PAD, p.vocab_tag)
    # Everything besides the special tokens must be exactly one character.
    for ch in set(p.vocab_char) - {PAD, UNK}:
        self.assertEqual(len(ch), 1)
def test_eval(self):
    """Evaluate a previously trained model against the CoNLL test fixture."""
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    model_dir = os.path.join(os.path.dirname(__file__), 'models')
    x_test, y_test = load_data_and_labels(os.path.join(data_dir, 'conll.txt'))
    p = Preprocessor.load(os.path.join(model_dir, 'preprocessor.pkl'))
    evaluator = namaco.Evaluator(os.path.join(model_dir, 'model.h5'),
                                 preprocessor=p)
    evaluator.eval(x_test, y_test)
def test_return_lengths(self):
    """With return_lengths=True, X is (chars, lengths) and each length matches its sequence."""
    X, y = reader.load_data_and_labels(self.filename)
    p = Preprocessor(padding=False, return_lengths=True).fit(X, y)
    X, y = p.transform(X, y)
    chars, lengths = X
    self.assertIsInstance(chars[0][0], int)
    self.assertIsInstance(y[0][0], int)
    for sequence, length in zip(chars, lengths):
        self.assertEqual(len(sequence), length)
def test_train(self):
    """End-to-end training smoke test: fit CharNER on the CoNLL fixture."""
    model_config = ModelConfig()
    training_config = TrainingConfig()

    # The same fixture file serves as both the train and validation split.
    x_train, y_train = load_data_and_labels(os.path.join(DATA_ROOT, 'conll.txt'))
    x_valid, y_valid = load_data_and_labels(os.path.join(DATA_ROOT, 'conll.txt'))

    p = prepare_preprocessor(x_train, y_train)
    p.save(os.path.join(SAVE_ROOT, 'preprocessor.pkl'))

    model = CharNER(model_config, p.vocab_size(), p.tag_size())
    trainer = namaco.Trainer(model, model.loss, training_config,
                             log_dir=LOG_ROOT,
                             save_path=os.path.join(SAVE_ROOT, 'model.h5'),
                             preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)
def test_transform_with_padding(self):
    """With padding, ids are np.int32 and every sequence shares one length."""
    X, y = reader.load_data_and_labels(self.filename)
    p = Preprocessor(padding=True, return_lengths=False).fit(X, y)
    X, y = p.transform(X, y)
    self.assertIsInstance(X[0][0], np.int32)
    self.assertIsInstance(y[0][0], np.int32)
    # Padded output: exactly one distinct sequence length on each side.
    self.assertEqual(len({len(seq) for seq in X}), 1)
    self.assertEqual(len({len(seq) for seq in y}), 1)
def test_load(self):
    """A saved-then-loaded preprocessor must transform identically to the original.

    The pickle is written next to the test data and always removed, even
    when an assertion fails (the original leaked it on failure).
    """
    X, y = reader.load_data_and_labels(self.filename)
    p = Preprocessor()
    p.fit(X, y)
    filepath = os.path.join(os.path.dirname(__file__), 'data/preprocessor.pkl')
    p.save(filepath)
    try:
        self.assertTrue(os.path.exists(filepath))
        loaded_p = Preprocessor.load(filepath)
        x_test1, y_test1 = p.transform(X, y)
        x_test2, y_test2 = loaded_p.transform(X, y)
        # NOTE(review): the original comments labelled these "word" and
        # "char", but this preprocessor is char-level — confirm what
        # x[0] / x[1] actually hold against Preprocessor.transform().
        np.testing.assert_array_equal(x_test1[0], x_test2[0])
        np.testing.assert_array_equal(x_test1[1], x_test2[1])
        np.testing.assert_array_equal(y_test1, y_test2)
    finally:
        # Cleanup runs regardless of assertion outcome.
        if os.path.exists(filepath):
            os.remove(filepath)
def test_extract(self):
    """load_data_and_labels must return parallel sentence/label sequences."""
    sents, labels = load_data_and_labels(self.filename)
    # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
    # which only prints "False is not true".
    self.assertEqual(len(sents), len(labels))