示例#1
0
    def test_vocab_size_lower_off(self):
        # Verifies the vocabulary and label sizes reported after fitting a
        # transformer created with lower=False.

        # Expected entry counts before the reserved ids are added.
        expected_words, expected_chars, expected_labels = 5, 4, 3

        # Fit with lowercasing switched off.
        transformer = IndexTransformer(lower=False)
        transformer.fit(self.x, self.y)

        # Word and char vocabularies carry two reserved entries (pad, unk);
        # the label set carries one (pad).
        self.assertEqual(transformer.word_vocab_size, expected_words + 2)
        self.assertEqual(transformer.char_vocab_size, expected_chars + 2)
        self.assertEqual(transformer.label_size, expected_labels + 1)
示例#2
0
    def test_transform_without_character(self):
        # With use_char=False, fit_transform returns one input object and one
        # label object; both are checked for length and type.
        transformer = IndexTransformer(use_char=False)
        features, labels = transformer.fit_transform(self.x, self.y)

        # One entry per input sequence.
        self.assertEqual(len(features), len(self.x))
        self.assertEqual(len(labels), len(self.y))

        # Both outputs must be numpy arrays.
        for output in (features, labels):
            self.assertIsInstance(output, np.ndarray)
示例#3
0
    def test_vocab_size_with_initial_vocab(self):
        # Verifies the vocabulary and label sizes when the transformer is
        # created with an initial vocabulary.
        seed_vocab = {'aaa', 'aab', 'aac'}

        # The seed words are counted on top of a base of 4 words.
        expected_words = 4 + len(seed_vocab)
        expected_chars, expected_labels = 4, 3

        # Fit with lowercasing on and the seed vocabulary supplied.
        transformer = IndexTransformer(lower=True, initial_vocab=seed_vocab)
        transformer.fit(self.x, self.y)

        # Word and char vocabularies carry two reserved entries (pad, unk);
        # the label set carries one (pad).
        self.assertEqual(transformer.word_vocab_size, expected_words + 2)
        self.assertEqual(transformer.char_vocab_size, expected_chars + 2)
        self.assertEqual(transformer.label_size, expected_labels + 1)
示例#4
0
    def test_batch_iter(self):
        # Walks every batch of an NERSequence by index and checks that the
        # total number of label entries equals the number of loaded labels.
        sentences, labels = load_data_and_labels(self.filename)
        batch_size = 32

        preprocessor = IndexTransformer()
        preprocessor.fit(sentences, labels)
        sequence = NERSequence(sentences, labels, batch_size,
                               preprocess=preprocessor.transform)

        # Each batch unpacks into (inputs, labels); only the labels are kept.
        collected_labels = []
        for batch_index in range(len(sequence)):
            _, batch_labels = sequence[batch_index]
            collected_labels.extend(batch_labels)

        self.assertEqual(len(collected_labels), len(labels))
示例#5
0
 def test_train_no_character(self):
     # Trains a model built with use_char=False and use_crf=False, using a
     # transformer that was also created with use_char=False.
     preprocessor = IndexTransformer(use_char=False)
     preprocessor.fit(self.x_train, self.y_train)

     # Size the network from the fitted transformer.
     network = BiLSTMCRF(word_vocab_size=preprocessor.word_vocab_size,
                         num_labels=preprocessor.label_size,
                         use_crf=False,
                         use_char=False)
     model, loss = network.build()
     model.compile(loss=loss, optimizer='adam')

     # Run training with the validation split supplied.
     trainer = Trainer(model, preprocessor=preprocessor)
     trainer.train(self.x_train, self.y_train,
                   x_valid=self.x_valid, y_valid=self.y_valid)
示例#6
0
    def test_transform_with_character(self):
        # With use_char=True, the inputs returned by fit_transform unpack into
        # a word part and a character part; both are checked along with the
        # labels.
        transformer = IndexTransformer(use_char=True)
        features, labels = transformer.fit_transform(self.x, self.y)
        word_ids, char_ids = features

        # One entry per input sequence in every output.
        self.assertEqual(len(word_ids), len(self.x))
        self.assertEqual(len(char_ids), len(self.x))
        self.assertEqual(len(labels), len(self.y))

        # Every output must be a numpy array.
        for output in (word_ids, char_ids, labels):
            self.assertIsInstance(output, np.ndarray)
示例#7
0
    def setUp(self):
        """Load both data splits, fit a transformer and compile a model."""
        # Read the training and validation files found under DATA_ROOT.
        self.x_train, self.y_train = load_data_and_labels(
            os.path.join(DATA_ROOT, 'train.txt'))
        self.x_valid, self.y_valid = load_data_and_labels(
            os.path.join(DATA_ROOT, 'valid.txt'))

        # Fit the index transformer on the training split.
        self.p = IndexTransformer()
        self.p.fit(self.x_train, self.y_train)

        # Size the network from the fitted transformer, then build and
        # compile it with the loss returned by build().
        builder = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                            word_vocab_size=self.p.word_vocab_size,
                            num_labels=self.p.label_size)
        self.model, loss = builder.build()
        self.model.compile(loss=loss, optimizer='adam')
示例#8
0
    def test_transform_unknown_token(self):
        # Fits on the fixture, then transforms a separate one-token sample
        # ('aaa' / 'X' -- presumably absent from the fixture) and checks the
        # size and type of every output.
        transformer = IndexTransformer()
        transformer.fit(self.x, self.y)

        x_train, y_train = [['aaa']], [['X']]
        features, labels = transformer.transform(x_train, y_train)
        word_ids, char_ids = features

        # One entry per sample in every output.
        self.assertEqual(len(word_ids), len(x_train))
        self.assertEqual(len(char_ids), len(x_train))
        self.assertEqual(len(labels), len(y_train))

        # Every output must be a numpy array.
        for output in (word_ids, char_ids, labels):
            self.assertIsInstance(output, np.ndarray)
示例#9
0
    def fit(self, x_train, y_train, x_valid=None, y_valid=None,
            epochs=1, batch_size=32, verbose=1, callbacks=None, shuffle=True):
        """Fit the model for a fixed number of epochs.

        A fresh IndexTransformer is fitted on the training data, a BiLSTMCRF
        is built and compiled from this object's settings, and a Trainer runs
        the training. The fitted transformer and the compiled model are stored
        on ``self.p`` and ``self.model`` only after training has returned.

        Args:
            x_train: list of training data
            y_train: list of training label data
            x_valid: list of validation data
            y_valid: list of validation label data
            epochs: Integer. Number of epochs, forwarded to the trainer
            batch_size: Integer. Number of samples per gradient update
            verbose: Integer. 0 = silent, 1 = progress bar, 2 = one line per epoch.
            callbacks: List of `keras.callbacks.Callback` instances
            shuffle: Boolean. Indicate whether to shuffle training data before each epoch
        """
        # Index the training data.
        preprocessor = IndexTransformer(initial_vocab=self.initial_vocab,
                                        use_char=self.use_char)
        preprocessor.fit(x_train, y_train)

        # Restrict the embeddings to the fitted word vocabulary.
        embeddings = filter_embeddings(self.embeddings,
                                       preprocessor._word_vocab.vocab,
                                       self.word_embedding_dim)

        # Assemble the network from the fitted sizes and stored settings.
        network = BiLSTMCRF(char_vocab_size=preprocessor.char_vocab_size,
                            word_vocab_size=preprocessor.word_vocab_size,
                            num_labels=preprocessor.label_size,
                            word_embedding_dim=self.word_embedding_dim,
                            char_embedding_dim=self.char_embedding_dim,
                            word_lstm_size=self.word_lstm_size,
                            char_lstm_size=self.char_lstm_size,
                            fc_dim=self.fc_dim,
                            dropout=self.dropout,
                            embeddings=embeddings,
                            use_char=self.use_char,
                            use_crf=self.use_crf)
        model, loss = network.build()
        model.compile(loss=loss, optimizer=self.optimizer)

        # Train; the validation data is passed positionally.
        trainer = Trainer(model, preprocessor=preprocessor)
        trainer.train(x_train, y_train, x_valid, y_valid,
                      epochs=epochs,
                      batch_size=batch_size,
                      verbose=verbose,
                      callbacks=callbacks,
                      shuffle=shuffle)

        # Publish the results once training has finished.
        self.p = preprocessor
        self.model = model
示例#10
0
 def test_inverse_transform_one_cat(self):
     # Transforms a single-token, single-label sample and maps the result
     # back to labels. The only check is that the recovered labels differ
     # from the fixture labels in self.y.
     sample_x, sample_y = [['a']], [['O']]

     transformer = IndexTransformer()
     transformer.fit(self.x, self.y)

     _, encoded_labels = transformer.transform(sample_x, sample_y)
     recovered = transformer.inverse_transform(encoded_labels)
     self.assertNotEqual(recovered, self.y)
示例#11
0
 def test_inverse_transform_unknown_token(self):
     # Transforms a sample whose first label is 'X' (presumably not in the
     # fixture labels) and maps the result back. The only check is that the
     # recovered labels differ from the fixture labels in self.y.
     sample_x, sample_y = [['a', 'b']], [['X', 'O']]

     transformer = IndexTransformer()
     transformer.fit(self.x, self.y)

     _, encoded_labels = transformer.transform(sample_x, sample_y)
     recovered = transformer.inverse_transform(encoded_labels)
     self.assertNotEqual(recovered, self.y)
示例#12
0
    def setUpClass(cls):
        """Restore the saved preprocessor and model and build a shared tagger."""
        # Restore the preprocessor stored under SAVE_ROOT.
        preprocessor = IndexTransformer.load(
            os.path.join(SAVE_ROOT, 'preprocessor.pickle'))

        # Restore the model from its weights and parameters files.
        model = load_model(os.path.join(SAVE_ROOT, 'weights.h5'),
                           os.path.join(SAVE_ROOT, 'params.json'))

        # Tagger and sample sentence shared by the tests of this class.
        cls.tagger = polygo.Tagger(model, preprocessor=preprocessor)
        cls.sent = 'President Obama is speaking at the White House.'
示例#13
0
    def test_save_and_load(self):
        # A transformer that is saved and loaded again must produce the same
        # word, character and label arrays as the one that was fitted.
        fitted = IndexTransformer(lower=False)
        x_before, y_before = fitted.fit_transform(self.x, self.y)
        words_before, chars_before = x_before

        # The file must be absent before saving and present afterwards.
        self.assertFalse(os.path.exists(self.preprocessor_file))
        fitted.save(self.preprocessor_file)
        self.assertTrue(os.path.exists(self.preprocessor_file))

        # Transform the same data with the reloaded transformer.
        restored = IndexTransformer.load(self.preprocessor_file)
        x_after, y_after = restored.transform(self.x, self.y)
        words_after, chars_after = x_after

        np.testing.assert_array_equal(words_before, words_after)
        np.testing.assert_array_equal(chars_before, chars_after)
        np.testing.assert_array_equal(y_before, y_after)
示例#14
0
class TestTrainer(unittest.TestCase):
    """Training tests for Trainer with several BiLSTMCRF configurations."""

    @classmethod
    def setUpClass(cls):
        """Create the output directories and record the artifact paths."""
        # makedirs(exist_ok=True) replaces the check-then-mkdir sequence: it
        # has no window between the check and the creation, and it also
        # creates missing parent directories.
        os.makedirs(LOG_ROOT, exist_ok=True)
        os.makedirs(SAVE_ROOT, exist_ok=True)

        cls.weights_file = os.path.join(SAVE_ROOT, 'weights.h5')
        cls.params_file = os.path.join(SAVE_ROOT, 'params.json')
        cls.preprocessor_file = os.path.join(SAVE_ROOT, 'preprocessor.pickle')

    def setUp(self):
        """Load both data splits, fit a transformer and compile a model."""
        # Load datasets.
        train_path = os.path.join(DATA_ROOT, 'train.txt')
        valid_path = os.path.join(DATA_ROOT, 'valid.txt')
        self.x_train, self.y_train = load_data_and_labels(train_path)
        self.x_valid, self.y_valid = load_data_and_labels(valid_path)

        # Fit transformer.
        self.p = IndexTransformer()
        self.p.fit(self.x_train, self.y_train)

        # Build a model sized from the fitted transformer.
        self.model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                               word_vocab_size=self.p.word_vocab_size,
                               num_labels=self.p.label_size)
        self.model, loss = self.model.build()
        self.model.compile(loss=loss, optimizer='adam')

    def test_train(self):
        # Train the default model with a validation split.
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train,
                      self.y_train,
                      x_valid=self.x_valid,
                      y_valid=self.y_valid)

    def test_train_no_valid(self):
        # Train the default model without validation data.
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train)

    def test_train_no_crf(self):
        # Train a model built with use_crf=False.
        model = BiLSTMCRF(char_vocab_size=self.p.char_vocab_size,
                          word_vocab_size=self.p.word_vocab_size,
                          num_labels=self.p.label_size,
                          use_crf=False)
        model, loss = model.build()
        model.compile(loss=loss, optimizer='adam')
        trainer = Trainer(model, preprocessor=self.p)
        trainer.train(self.x_train,
                      self.y_train,
                      x_valid=self.x_valid,
                      y_valid=self.y_valid)

    def test_train_no_character(self):
        # Train a model built with use_char=False and use_crf=False; the
        # transformer is refitted with use_char=False to match.
        p = IndexTransformer(use_char=False)
        p.fit(self.x_train, self.y_train)
        model = BiLSTMCRF(word_vocab_size=p.word_vocab_size,
                          num_labels=p.label_size,
                          use_crf=False,
                          use_char=False)
        model, loss = model.build()
        model.compile(loss=loss, optimizer='adam')
        trainer = Trainer(model, preprocessor=p)
        trainer.train(self.x_train,
                      self.y_train,
                      x_valid=self.x_valid,
                      y_valid=self.y_valid)

    def test_save(self):
        # Train the model.
        trainer = Trainer(self.model, preprocessor=self.p)
        trainer.train(self.x_train, self.y_train)

        # Save the model and the preprocessor to the paths set in setUpClass.
        save_model(self.model, self.weights_file, self.params_file)
        self.p.save(self.preprocessor_file)
示例#15
0
 def load(cls, weights_file, params_file, preprocessor_file):
     """Create an instance from a saved preprocessor and saved model files.

     Args:
         weights_file: path handed to ``load_model`` as its first argument.
         params_file: path handed to ``load_model`` as its second argument.
         preprocessor_file: path handed to ``IndexTransformer.load``.

     Returns:
         A new ``cls`` instance whose ``p`` and ``model`` attributes are set.
     """
     instance = cls()
     # The preprocessor is restored first, then the model.
     instance.p = IndexTransformer.load(preprocessor_file)
     instance.model = load_model(weights_file, params_file)
     return instance
示例#16
0
 def test_inverse_transform(self):
     # Round trip: labels encoded by fit_transform must come back unchanged
     # when inverse_transform is given the true sequence lengths.
     it = IndexTransformer()
     _, y = it.fit_transform(self.x, self.y)

     # Build a real list: in Python 3, map() returns a one-shot iterator
     # that cannot be measured, indexed or iterated a second time.
     lengths = [len(labels) for labels in self.y]

     inv_y = it.inverse_transform(y, lengths)
     self.assertEqual(inv_y, self.y)