示例#1
0
    def test_preprocessing(self):
        """Verify preprocessor output for default and customised settings.

        The expectations pin that each text is wrapped in start/end tokens,
        that punctuation is split off as its own token, and that lower-casing
        and digit hashing follow ``lower_case`` / ``hash_numbers``.
        """
        samples = (('First text!', 'first head'), ('2-nd täxt', 'Second head'))
        token_kwargs = {'start_token': '<start>', 'end_token': '<end>'}

        # Default settings: expectations show lower-casing and '#' for digits.
        default_prep = Preprocessor()
        first_pair, second_pair = [default_prep(pair) for pair in samples]
        self.assertEqual(
            ('<start> first text ! <end>', '<start> first head <end>'),
            first_pair)
        self.assertEqual(
            ('<start> #-nd täxt <end>', '<start> second head <end>'),
            second_pair)

        # Digit hashing disabled: the '2' has to survive untouched.
        keep_digits = Preprocessor(lower_case=True,
                                   hash_numbers=False,
                                   **token_kwargs)
        _, second_pair = [keep_digits(pair) for pair in samples]
        self.assertEqual(
            ('<start> 2-nd täxt <end>', '<start> second head <end>'),
            second_pair)

        # Lower-casing disabled: the capital 'S' of the headline is kept.
        keep_case = Preprocessor(lower_case=False,
                                 hash_numbers=True,
                                 **token_kwargs)
        _, second_pair = [keep_case(pair) for pair in samples]
        self.assertEqual(
            ('<start> #-nd täxt <end>', '<start> Second head <end>'),
            second_pair)
    def test_serde_happy_path(self) -> None:
        """Save an attention summarizer, reload it and compare both copies.

        The reloaded model must expose the same hyper-parameters and
        components, and produce the same logits as the instance it was
        saved from.
        """
        preprocessor = Preprocessor()
        tokenizer = KerasTokenizer(oov_token='<unk>')
        vocab_line = f'a b c {preprocessor.start_token} {preprocessor.end_token}'
        tokenizer.fit([vocab_line])
        # The same tokenizer serves both the encoder and the decoder side.
        vectorizer = Vectorizer(tokenizer, tokenizer)

        model_config = dict(lstm_size=10,
                            max_prediction_len=10,
                            embedding_size=10,
                            embedding_encoder_trainable=False)
        summarizer = AttentionSummarizer(**model_config)
        summarizer.init_model(preprocessor=preprocessor,
                              vectorizer=vectorizer)

        # we need at least a train step to init the weights
        run_step = summarizer.new_train_step(masked_crossentropy,
                                             batch_size=1,
                                             apply_gradients=True)
        dummy_batch = tf.convert_to_tensor(np.array([[1, 1, 1]]),
                                           dtype=tf.int32)
        run_step(dummy_batch, dummy_batch)

        target_dir = os.path.join(self.temp_dir, 'summarizer_serde_happy_path')
        summarizer.save(target_dir)
        restored = AttentionSummarizer.load(target_dir)

        # Hyper-parameters survive the round trip.
        self.assertEqual(10, restored.lstm_size)
        self.assertEqual(10, restored.max_prediction_len)

        # All components are present again after loading.
        self.assertIsNotNone(restored.preprocessor)
        self.assertIsNotNone(restored.vectorizer)
        self.assertIsNotNone(restored.encoder)
        self.assertIsNotNone(restored.decoder)

        # Trainable flags: encoder embedding frozen, decoder embedding not.
        self.assertFalse(restored.encoder.embedding.trainable)
        self.assertTrue(restored.decoder.embedding.trainable)
        self.assertIsNotNone(restored.optimizer)

        # Both instances have to agree on the prediction logits.
        original_out = summarizer.predict_vectors('a c', '')
        restored_out = restored.predict_vectors('a c', '')
        np.testing.assert_almost_equal(original_out['logits'],
                                       restored_out['logits'],
                                       decimal=6)
示例#3
0
    def test_init(self) -> None:
        """Check that Trainer exposes its constructor arguments as attributes.

        Arguments that are not passed (``embedding_path_decoder``,
        ``use_bucketing``) are expected to keep their defaults.
        """
        custom_prep = Preprocessor(start_token='<custom_start_token>',
                                   lower_case=False,
                                   hash_numbers=False)
        trainer_kwargs = dict(
            max_output_len=9,
            batch_size=1,
            max_vocab_size_encoder=2,
            max_vocab_size_decoder=3,
            embedding_path_encoder='glove.txt',
            steps_per_epoch=4,
            tensorboard_dir='tensor_dir',
            model_save_path='model_save_path',
            shuffle_buffer_size=10,
            bucketing_buffer_size_batches=5,
            bucketing_batches_to_bucket=6,
            steps_to_log=7,
            logging_level=logging.DEBUG,
            preprocessor=custom_prep,
        )
        trainer = Trainer(**trainer_kwargs)

        # Batch and vocabulary sizes.
        self.assertEqual(1, trainer.batch_size)
        self.assertEqual(2, trainer.max_vocab_size_encoder)
        self.assertEqual(3, trainer.max_vocab_size_decoder)

        # Embedding files: only the encoder path was supplied.
        self.assertEqual('glove.txt', trainer.embedding_path_encoder)
        self.assertIsNone(trainer.embedding_path_decoder)

        # Schedule and output locations.
        self.assertEqual(4, trainer.steps_per_epoch)
        self.assertEqual('tensor_dir', trainer.tensorboard_dir)
        self.assertEqual('model_save_path', trainer.model_save_path)

        # Shuffling and bucketing configuration.
        self.assertFalse(trainer.use_bucketing)
        self.assertEqual(10, trainer.shuffle_buffer_size)
        self.assertEqual(5, trainer.bucketing_buffer_size_batches)
        self.assertEqual(6, trainer.bucketing_batches_to_bucket)

        # Logging and sequence-length settings.
        self.assertEqual(7, trainer.steps_to_log)
        self.assertEqual(9, trainer.max_output_len)
        self.assertEqual(logging.DEBUG, trainer.logger.level)

        # The custom preprocessor must be used as passed in.
        used_prep = trainer.preprocessor
        self.assertEqual('<custom_start_token>', used_prep.start_token)
        self.assertEqual(False, used_prep.lower_case)
        self.assertEqual(False, used_prep.hash_numbers)
    def test_serde_happy_path(self) -> None:
        """Save a BERT summarizer, reload it and compare both copies.

        The reloaded model must report the same hyper-parameters, contain
        all components, and produce the same logits as the original.
        """
        preprocessor = Preprocessor(start_token='[CLS]', end_token='[SEP]')

        # Encoder side uses the pretrained BERT vocabulary, the decoder side
        # a small Keras tokenizer fitted on a toy sentence.
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        keras_tokenizer = KerasTokenizer(oov_token='<unk>')
        vocab_line = f'a b c {preprocessor.start_token} {preprocessor.end_token}'
        keras_tokenizer.fit([vocab_line])
        vectorizer = Vectorizer(bert_tokenizer, keras_tokenizer)

        model_config = dict(num_layers_encoder=1,
                            num_layers_decoder=1,
                            bert_embedding_encoder='bert-base-uncased',
                            num_heads=2,
                            max_prediction_len=3,
                            embedding_size_encoder=768,
                            embedding_size_decoder=10,
                            embedding_encoder_trainable=False)
        summarizer = SummarizerBert(**model_config)
        summarizer.init_model(preprocessor=preprocessor, vectorizer=vectorizer)

        # we need at least a train step to init the weights
        run_step = summarizer.new_train_step(masked_crossentropy,
                                             batch_size=1,
                                             apply_gradients=True)
        dummy_batch = tf.convert_to_tensor(np.array([[1, 1, 1]]),
                                           dtype=tf.int32)
        run_step(dummy_batch, dummy_batch)

        target_dir = os.path.join(self.temp_dir, 'summarizer_serde_happy_path')
        summarizer.save(target_dir)
        restored = SummarizerBert.load(target_dir)

        # Hyper-parameters survive the round trip.
        self.assertEqual(1, restored.num_layers_encoder)
        self.assertEqual(1, restored.num_layers_decoder)
        self.assertEqual(2, restored.num_heads)
        self.assertEqual(3, restored.max_prediction_len)
        self.assertEqual(768, restored.embedding_size_encoder)
        self.assertEqual(10, restored.embedding_size_decoder)

        # All components are present again after loading.
        self.assertIsNotNone(restored.preprocessor)
        self.assertIsNotNone(restored.vectorizer)
        self.assertIsNotNone(restored.transformer)

        # Trainable flags: encoder embedding frozen, decoder embedding not.
        restored_transformer = restored.transformer
        self.assertFalse(restored_transformer.encoder.embedding.trainable)
        self.assertTrue(restored_transformer.decoder.embedding.trainable)
        self.assertIsNotNone(restored.optimizer_encoder)
        self.assertIsNotNone(restored.optimizer_decoder)

        # Both instances have to agree on the prediction logits.
        original_out = summarizer.predict_vectors('a c', '')
        restored_out = restored.predict_vectors('a c', '')
        np.testing.assert_almost_equal(original_out['logits'],
                                       restored_out['logits'],
                                       decimal=6)
示例#5
0
 def setUp(self) -> None:
     """Seed the RNGs and build the toy dataset shared by the tests.

     Stores ``data``, ``vectorizer``, ``preprocessor``, ``dataset`` and
     ``loss_func`` on the test instance.
     """
     # Fixed seeds for TensorFlow and NumPy.
     tf.random.set_seed(42)
     np.random.seed(42)

     self.data = [('a b', 'c'), ('a b c', 'd')]

     # Separate vocabularies for source and target side; no lower-casing
     # and no character filtering.
     encoder_tok = KerasTokenizer(lower=False, filters='')
     decoder_tok = KerasTokenizer(lower=False, filters='')
     encoder_tok.fit(['a b c <start> <end>'])
     decoder_tok.fit(['c d <start> <end>'])
     self.vectorizer = Vectorizer(tokenizer_encoder=encoder_tok,
                                  tokenizer_decoder=decoder_tok,
                                  max_output_len=3)
     self.preprocessor = Preprocessor()

     make_dataset = DatasetGenerator(2)
     # Preprocess every pair first, then vectorize the preprocessed pairs.
     prepared_pairs = list(map(self.preprocessor, self.data))
     vector_pairs = list(map(self.vectorizer, prepared_pairs))

     def vector_source():
         # The generator expects a callable that returns the vectors.
         return vector_pairs

     self.dataset = make_dataset(vector_source)
     self.loss_func = masked_crossentropy
示例#6
0
    def __init__(self,
                 max_input_len: Union[int, None] = None,
                 max_output_len: Union[int, None] = None,
                 batch_size: int = 16,
                 max_vocab_size_encoder: int = 200000,
                 max_vocab_size_decoder: int = 200000,
                 embedding_path_encoder: Union[str, None] = None,
                 embedding_path_decoder: Union[str, None] = None,
                 steps_per_epoch: int = 500,
                 tensorboard_dir: Union[str, None] = None,
                 model_save_path: Union[str, None] = None,
                 shuffle_buffer_size: int = 100000,
                 use_bucketing: bool = False,
                 bucketing_buffer_size_batches: int = 10000,
                 bucketing_batches_to_bucket: int = 100,
                 logging_level: int = logging.INFO,
                 num_print_predictions: int = 5,
                 steps_to_log: int = 10,
                 preprocessor: Union[Preprocessor, None] = None) -> None:
        """
        Initializes the trainer.

        Args:
            max_input_len (optional): Maximum length of input sequences, longer sequences will be truncated.
            max_output_len (optional): Maximum length of output sequences, longer sequences will be truncated.
            batch_size: Size of mini-batches for stochastic gradient descent.
            max_vocab_size_encoder: Maximum number of unique tokens to consider for encoder embeddings.
            max_vocab_size_decoder: Maximum number of unique tokens to consider for decoder embeddings.
            embedding_path_encoder (optional): Path to embedding file for the encoder.
            embedding_path_decoder (optional): Path to embedding file for the decoder.
            steps_per_epoch: Number of steps to train until callbacks are invoked.
            tensorboard_dir (optional): Directory for saving tensorboard logs.
            model_save_path (optional): Directory for saving the best model.
            shuffle_buffer_size: Size of the buffer for shuffling the files before batching.
                Stored as None when use_bucketing is True.
            use_bucketing: Whether to bucket the sequences by length to reduce the amount of padding.
            bucketing_buffer_size_batches: Number of batches to buffer when bucketing sequences.
            bucketing_batches_to_bucket: Number of buffered batches from which sequences are collected for bucketing.
            logging_level: Level of logging to use, e.g. logging.INFO or logging.DEBUG.
            num_print_predictions: Number of sample predictions to print in each evaluation.
            steps_to_log: Number of steps to wait for logging output.
            preprocessor (optional): custom preprocessor, if None a standard preprocessor will be created.
        """

        self.max_input_len = max_input_len
        self.max_output_len = max_output_len
        self.batch_size = batch_size
        self.max_vocab_size_encoder = max_vocab_size_encoder
        self.max_vocab_size_decoder = max_vocab_size_decoder
        self.bucketing_buffer_size_batches = bucketing_buffer_size_batches
        self.bucketing_batches_to_bucket = bucketing_batches_to_bucket
        self.embedding_path_encoder = embedding_path_encoder
        self.embedding_path_decoder = embedding_path_decoder
        self.steps_per_epoch = steps_per_epoch
        self.tensorboard_dir = tensorboard_dir
        self.model_save_path = model_save_path
        self.loss_function = masked_crossentropy
        self.use_bucketing = use_bucketing
        # With bucketing enabled the shuffle buffer size is dropped; the
        # bucket generator below is created with shuffle=True instead.
        self.shuffle_buffer_size = None if use_bucketing else shuffle_buffer_size

        self.bucket_generator = None
        if use_bucketing:
            # Elements are bucketed by the length of their first component.
            self.bucket_generator = BucketGenerator(
                element_length_function=lambda vecs: len(vecs[0]),
                batch_size=self.batch_size,
                buffer_size_batches=self.bucketing_buffer_size_batches,
                batches_to_bucket=self.bucketing_batches_to_bucket,
                shuffle=True,
                seed=42)

        self.logger = get_logger(__name__)
        self.logger.setLevel(logging_level)
        self.num_print_predictions = num_print_predictions
        self.steps_to_log = steps_to_log

        # Fall back to the standard preprocessor only when none was passed;
        # an explicit `is None` test keeps any supplied object, even one
        # that happens to evaluate as falsy.
        if preprocessor is None:
            preprocessor = Preprocessor(start_token=START_TOKEN,
                                        end_token=END_TOKEN)
        self.preprocessor = preprocessor
示例#7
0
    def test_training(self) -> None:
        """Train three summarizer variants on a toy dataset and pin the results.

        Each model (attention, basic, transformer) is trained for 10 passes
        over a two-example dataset. The test then checks the final loss, the
        first row of prediction logits, the preprocessed input text and the
        predicted text against recorded reference values.
        """
        data = [('a b', 'c'), ('a b c', 'd')]
        tokenizer_encoder = KerasTokenizer(lower=False, filters='')
        tokenizer_decoder = KerasTokenizer(lower=False, filters='')
        tokenizer_encoder.fit(['a b c <start> <end>'])
        tokenizer_decoder.fit(['c d <start> <end>'])
        vectorizer = Vectorizer(tokenizer_encoder=tokenizer_encoder,
                                tokenizer_decoder=tokenizer_decoder,
                                max_output_len=3)
        preprocessor = Preprocessor()
        batch_generator = DatasetGenerator(2)
        data_prep = [preprocessor(d) for d in data]
        data_vecs = [vectorizer(d) for d in data_prep]
        dataset = batch_generator(lambda: data_vecs)

        # NOTE: the models are created and initialised in this exact order;
        # the reference values below were recorded with it.
        summarizer_transformer = SummarizerTransformer(num_heads=1,
                                                       num_layers=1,
                                                       feed_forward_dim=20,
                                                       embedding_size=10,
                                                       dropout_rate=0,
                                                       max_prediction_len=3)

        summarizer_transformer.init_model(preprocessor=preprocessor,
                                          vectorizer=vectorizer,
                                          embedding_weights_encoder=None,
                                          embedding_weights_decoder=None)

        summarizer_attention = SummarizerAttention(lstm_size=10,
                                                   embedding_size=10)

        summarizer_attention.init_model(preprocessor=preprocessor,
                                        vectorizer=vectorizer,
                                        embedding_weights_encoder=None,
                                        embedding_weights_decoder=None)

        summarizer = SummarizerBasic(lstm_size=10, embedding_size=10)

        summarizer.init_model(preprocessor=preprocessor,
                              vectorizer=vectorizer,
                              embedding_weights_encoder=None,
                              embedding_weights_decoder=None)

        loss_func = masked_crossentropy

        def train(model):
            """Run 10 passes over the dataset; return the last step's loss."""
            last_loss = 0
            train_step = model.new_train_step(loss_function=loss_func,
                                              batch_size=2)
            for _ in range(10):
                for source_seq, target_seq in dataset.take(-1):
                    last_loss = train_step(source_seq, target_seq)
            return last_loss

        # The losses are compared to 5 decimal places. Ten places would be
        # far stricter than the 1e-6 tolerance applied to the logits and
        # would make the test sensitive to tiny numeric differences.
        loss_attention = train(summarizer_attention)
        self.assertAlmostEqual(1.5810251235961914,
                               float(loss_attention),
                               places=5)
        output_attention = summarizer_attention.predict_vectors('a c', '')
        expected_first_logits = np.array(
            [-0.069454, 0.00272, 0.007199, -0.039547, 0.014357])
        np.testing.assert_allclose(expected_first_logits,
                                   output_attention['logits'][0],
                                   atol=1e-6)
        self.assertEqual('a c', output_attention['preprocessed_text'][0])
        self.assertEqual('<end>', output_attention['predicted_text'])

        loss = train(summarizer)
        self.assertAlmostEqual(1.5771859884262085, float(loss), places=5)
        output = summarizer.predict_vectors('a c', '')
        expected_first_logits = np.array(
            [-0.03838864, 0.01226684, 0.01055636, -0.05209339, 0.02549592])
        np.testing.assert_allclose(expected_first_logits,
                                   output['logits'][0],
                                   atol=1e-6)
        self.assertEqual('a c', output['preprocessed_text'][0])
        self.assertEqual('<end>', output['predicted_text'])

        loss_transformer = train(summarizer_transformer)
        self.assertAlmostEqual(1.2841172218322754,
                               float(loss_transformer),
                               places=5)
        output_transformer = summarizer_transformer.predict_vectors('a c', '')

        expected_first_logits = np.array(
            [0.094787, 0.516092, 1.165521, 0.271338, 0.670318])
        np.testing.assert_allclose(expected_first_logits,
                                   output_transformer['logits'][0],
                                   atol=1e-6)
        self.assertEqual('a c', output_transformer['preprocessed_text'][0])
        self.assertEqual('d <end>', output_transformer['predicted_text'])
示例#8
0
    def test_training(self) -> None:
        """Train three summarizer variants on a toy dataset and pin the results.

        Each model (attention, basic, transformer) is trained for 10 passes
        over a two-example dataset; the final loss, the first row of logits,
        the preprocessed text and the predicted text are then compared with
        recorded reference values.
        """
        raw_pairs = [('a b', 'c'), ('a b c', 'd')]
        encoder_tok = KerasTokenizer(lower=False, filters='')
        decoder_tok = KerasTokenizer(lower=False, filters='')
        encoder_tok.fit(['a b c <start> <end>'])
        decoder_tok.fit(['c d <start> <end>'])
        vectorizer = Vectorizer(tokenizer_encoder=encoder_tok,
                                tokenizer_decoder=decoder_tok,
                                max_output_len=3)
        preprocessor = Preprocessor()
        make_dataset = DatasetGenerator(2)
        # Preprocess every pair first, then vectorize the preprocessed pairs.
        prepared_pairs = list(map(preprocessor, raw_pairs))
        vector_pairs = list(map(vectorizer, prepared_pairs))
        dataset = make_dataset(lambda: vector_pairs)

        # Every model is initialised with the same arguments.
        init_kwargs = dict(preprocessor=preprocessor,
                           vectorizer=vectorizer,
                           embedding_weights_encoder=None,
                           embedding_weights_decoder=None)

        # Creation order is kept as transformer -> attention -> basic.
        summarizer_transformer = SummarizerTransformer(num_heads=1,
                                                       num_layers=1,
                                                       feed_forward_dim=20,
                                                       embedding_size=10,
                                                       dropout_rate=0,
                                                       max_prediction_len=3)
        summarizer_transformer.init_model(**init_kwargs)

        summarizer_attention = SummarizerAttention(lstm_size=10,
                                                   embedding_size=10)
        summarizer_attention.init_model(**init_kwargs)

        summarizer = SummarizerBasic(lstm_size=10, embedding_size=10)
        summarizer.init_model(**init_kwargs)

        loss_func = masked_crossentropy

        def fit(model, echo):
            """Run 10 passes over the dataset; return the last step's loss.

            When ``echo`` is true every step loss is printed.
            """
            step_loss = 0
            run_step = model.new_train_step(loss_function=loss_func,
                                            batch_size=2)
            for _ in range(10):
                for source_seq, target_seq in dataset.take(-1):
                    step_loss = run_step(source_seq, target_seq)
                    if echo:
                        print(str(step_loss))
            return step_loss

        # --- attention model -------------------------------------------
        loss_attention = fit(summarizer_attention, echo=True)
        self.assertAlmostEqual(1.577033519744873, float(loss_attention), 5)
        output_attention = summarizer_attention.predict_vectors('a c', '')
        reference_logits = np.array(
            [-0.077805, 0.012667, 0.021359, -0.04872, 0.014989])
        np.testing.assert_allclose(reference_logits,
                                   output_attention['logits'][0],
                                   atol=1e-6)
        self.assertEqual('<start> a c <end>',
                         output_attention['preprocessed_text'][0])
        self.assertEqual('d <end>', output_attention['predicted_text'])

        # --- basic model -----------------------------------------------
        loss = fit(summarizer, echo=False)
        self.assertAlmostEqual(1.5713274478912354, float(loss), 5)
        output = summarizer.predict_vectors('a c', '')
        reference_logits = np.array(
            [-0.051753, 0.013869, 0.010337, -0.073727, 0.033059])
        np.testing.assert_allclose(reference_logits,
                                   output['logits'][0],
                                   atol=1e-6)
        self.assertEqual('<start> a c <end>', output['preprocessed_text'][0])
        self.assertEqual('<end>', output['predicted_text'])

        # --- transformer model -----------------------------------------
        loss_transformer = fit(summarizer_transformer, echo=True)
        self.assertAlmostEqual(1.175953984260559, float(loss_transformer), 5)
        output_transformer = summarizer_transformer.predict_vectors('a c', '')
        reference_logits = np.array(
            [-0.197903, 0.884185, 1.147212, 0.318798, 0.97936])
        np.testing.assert_allclose(reference_logits,
                                   output_transformer['logits'][0],
                                   atol=1e-6)
        self.assertEqual('<start> a c <end>',
                         output_transformer['preprocessed_text'][0])
        self.assertEqual('d <end>', output_transformer['predicted_text'])