def test_preprocessing(self):
    preprocessor = Preprocessor()
    data = (('First text!', 'first head'), ('2-nd täxt', 'Second head'))
    data_preprocessed = [preprocessor(d) for d in data]
    self.assertEqual(('<start> first text ! <end>', '<start> first head <end>'),
                     data_preprocessed[0])
    self.assertEqual(('<start> #-nd täxt <end>', '<start> second head <end>'),
                     data_preprocessed[1])

    preprocessor = Preprocessor(start_token='<start>',
                                end_token='<end>',
                                lower_case=True,
                                hash_numbers=False)
    data_preprocessed = [preprocessor(d) for d in data]
    self.assertEqual(('<start> 2-nd täxt <end>', '<start> second head <end>'),
                     data_preprocessed[1])

    preprocessor = Preprocessor(start_token='<start>',
                                end_token='<end>',
                                lower_case=False,
                                hash_numbers=True)
    data_preprocessed = [preprocessor(d) for d in data]
    self.assertEqual(('<start> #-nd täxt <end>', '<start> Second head <end>'),
                     data_preprocessed[1])
def test_serde_happy_path(self) -> None:
    preprocessor = Preprocessor()
    tokenizer = KerasTokenizer(oov_token='<unk>')
    tokenizer.fit(['a b c {} {}'.format(preprocessor.start_token,
                                        preprocessor.end_token)])
    vectorizer = Vectorizer(tokenizer, tokenizer)
    summarizer = AttentionSummarizer(lstm_size=10,
                                     max_prediction_len=10,
                                     embedding_size=10,
                                     embedding_encoder_trainable=False)
    summarizer.init_model(preprocessor=preprocessor, vectorizer=vectorizer)

    # we need at least one train step to initialize the weights
    train_step = summarizer.new_train_step(masked_crossentropy,
                                           batch_size=1,
                                           apply_gradients=True)
    train_seq = tf.convert_to_tensor(np.array([[1, 1, 1]]), dtype=tf.int32)
    train_step(train_seq, train_seq)

    save_dir = os.path.join(self.temp_dir, 'summarizer_serde_happy_path')
    summarizer.save(save_dir)
    summarizer_loaded = AttentionSummarizer.load(save_dir)

    self.assertEqual(10, summarizer_loaded.lstm_size)
    self.assertEqual(10, summarizer_loaded.max_prediction_len)
    self.assertIsNotNone(summarizer_loaded.preprocessor)
    self.assertIsNotNone(summarizer_loaded.vectorizer)
    self.assertIsNotNone(summarizer_loaded.encoder)
    self.assertIsNotNone(summarizer_loaded.decoder)
    self.assertFalse(summarizer_loaded.encoder.embedding.trainable)
    self.assertTrue(summarizer_loaded.decoder.embedding.trainable)
    self.assertIsNotNone(summarizer_loaded.optimizer)

    pred = summarizer.predict_vectors('a c', '')
    pred_loaded = summarizer_loaded.predict_vectors('a c', '')
    np.testing.assert_almost_equal(pred['logits'],
                                   pred_loaded['logits'],
                                   decimal=6)
def test_init(self) -> None:
    preprocessor = Preprocessor(start_token='<custom_start_token>',
                                lower_case=False,
                                hash_numbers=False)
    trainer = Trainer(max_output_len=9,
                      batch_size=1,
                      max_vocab_size_encoder=2,
                      max_vocab_size_decoder=3,
                      embedding_path_encoder='glove.txt',
                      steps_per_epoch=4,
                      tensorboard_dir='tensor_dir',
                      model_save_path='model_save_path',
                      shuffle_buffer_size=10,
                      bucketing_buffer_size_batches=5,
                      bucketing_batches_to_bucket=6,
                      steps_to_log=7,
                      logging_level=logging.DEBUG,
                      preprocessor=preprocessor)

    self.assertEqual(1, trainer.batch_size)
    self.assertEqual(2, trainer.max_vocab_size_encoder)
    self.assertEqual(3, trainer.max_vocab_size_decoder)
    self.assertEqual('glove.txt', trainer.embedding_path_encoder)
    self.assertIsNone(trainer.embedding_path_decoder)
    self.assertEqual(4, trainer.steps_per_epoch)
    self.assertEqual('tensor_dir', trainer.tensorboard_dir)
    self.assertEqual('model_save_path', trainer.model_save_path)
    self.assertFalse(trainer.use_bucketing)
    self.assertEqual(10, trainer.shuffle_buffer_size)
    self.assertEqual(5, trainer.bucketing_buffer_size_batches)
    self.assertEqual(6, trainer.bucketing_batches_to_bucket)
    self.assertEqual(7, trainer.steps_to_log)
    self.assertEqual(9, trainer.max_output_len)
    self.assertEqual(logging.DEBUG, trainer.logger.level)
    self.assertEqual('<custom_start_token>', trainer.preprocessor.start_token)
    self.assertFalse(trainer.preprocessor.lower_case)
    self.assertFalse(trainer.preprocessor.hash_numbers)
def test_serde_happy_path(self) -> None:
    preprocessor = Preprocessor(start_token='[CLS]', end_token='[SEP]')
    tokenizer_encoder = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer_decoder = KerasTokenizer(oov_token='<unk>')
    tokenizer_decoder.fit(['a b c {} {}'.format(preprocessor.start_token,
                                                preprocessor.end_token)])
    vectorizer = Vectorizer(tokenizer_encoder, tokenizer_decoder)
    summarizer = SummarizerBert(num_layers_encoder=1,
                                num_layers_decoder=1,
                                bert_embedding_encoder='bert-base-uncased',
                                num_heads=2,
                                max_prediction_len=3,
                                embedding_size_encoder=768,
                                embedding_size_decoder=10,
                                embedding_encoder_trainable=False)
    summarizer.init_model(preprocessor=preprocessor, vectorizer=vectorizer)

    # we need at least one train step to initialize the weights
    train_step = summarizer.new_train_step(masked_crossentropy,
                                           batch_size=1,
                                           apply_gradients=True)
    train_seq = tf.convert_to_tensor(np.array([[1, 1, 1]]), dtype=tf.int32)
    train_step(train_seq, train_seq)

    save_dir = os.path.join(self.temp_dir, 'summarizer_serde_happy_path')
    summarizer.save(save_dir)
    summarizer_loaded = SummarizerBert.load(save_dir)

    self.assertEqual(1, summarizer_loaded.num_layers_encoder)
    self.assertEqual(1, summarizer_loaded.num_layers_decoder)
    self.assertEqual(2, summarizer_loaded.num_heads)
    self.assertEqual(3, summarizer_loaded.max_prediction_len)
    self.assertEqual(768, summarizer_loaded.embedding_size_encoder)
    self.assertEqual(10, summarizer_loaded.embedding_size_decoder)
    self.assertIsNotNone(summarizer_loaded.preprocessor)
    self.assertIsNotNone(summarizer_loaded.vectorizer)
    self.assertIsNotNone(summarizer_loaded.transformer)
    self.assertFalse(summarizer_loaded.transformer.encoder.embedding.trainable)
    self.assertTrue(summarizer_loaded.transformer.decoder.embedding.trainable)
    self.assertIsNotNone(summarizer_loaded.optimizer_encoder)
    self.assertIsNotNone(summarizer_loaded.optimizer_decoder)

    pred = summarizer.predict_vectors('a c', '')
    pred_loaded = summarizer_loaded.predict_vectors('a c', '')
    np.testing.assert_almost_equal(pred['logits'],
                                   pred_loaded['logits'],
                                   decimal=6)
def setUp(self) -> None:
    tf.random.set_seed(42)
    np.random.seed(42)
    self.data = [('a b', 'c'), ('a b c', 'd')]
    tokenizer_encoder = KerasTokenizer(lower=False, filters='')
    tokenizer_decoder = KerasTokenizer(lower=False, filters='')
    tokenizer_encoder.fit(['a b c <start> <end>'])
    tokenizer_decoder.fit(['c d <start> <end>'])
    self.vectorizer = Vectorizer(tokenizer_encoder=tokenizer_encoder,
                                 tokenizer_decoder=tokenizer_decoder,
                                 max_output_len=3)
    self.preprocessor = Preprocessor()
    batch_generator = DatasetGenerator(2)
    data_prep = [self.preprocessor(d) for d in self.data]
    data_vecs = [self.vectorizer(d) for d in data_prep]
    self.dataset = batch_generator(lambda: data_vecs)
    self.loss_func = masked_crossentropy
def __init__(self,
             max_input_len=None,
             max_output_len=None,
             batch_size=16,
             max_vocab_size_encoder=200000,
             max_vocab_size_decoder=200000,
             embedding_path_encoder=None,
             embedding_path_decoder=None,
             steps_per_epoch=500,
             tensorboard_dir=None,
             model_save_path=None,
             shuffle_buffer_size=100000,
             use_bucketing=False,
             bucketing_buffer_size_batches=10000,
             bucketing_batches_to_bucket=100,
             logging_level=logging.INFO,
             num_print_predictions=5,
             steps_to_log=10,
             preprocessor: Union[Preprocessor, None] = None) -> None:
    """
    Initializes the trainer.

    Args:
        max_input_len (optional): Maximum length of input sequences;
            longer sequences will be truncated.
        max_output_len (optional): Maximum length of output sequences;
            longer sequences will be truncated.
        batch_size: Size of mini-batches for stochastic gradient descent.
        max_vocab_size_encoder: Maximum number of unique tokens to consider
            for encoder embeddings.
        max_vocab_size_decoder: Maximum number of unique tokens to consider
            for decoder embeddings.
        embedding_path_encoder: Path to embedding file for the encoder.
        embedding_path_decoder: Path to embedding file for the decoder.
        steps_per_epoch: Number of steps to train until callbacks are invoked.
        tensorboard_dir: Directory for saving tensorboard logs.
        model_save_path: Directory for saving the best model.
        shuffle_buffer_size: Size of the buffer for shuffling the files
            before batching.
        use_bucketing: Whether to bucket the sequences by length to reduce
            the amount of padding.
        bucketing_buffer_size_batches: Number of batches to buffer when
            bucketing sequences.
        bucketing_batches_to_bucket: Number of buffered batches from which
            sequences are collected for bucketing.
        logging_level: Level of logging to use, e.g. logging.INFO or
            logging.DEBUG.
        num_print_predictions: Number of sample predictions to print in
            each evaluation.
        steps_to_log: Number of steps to wait between logging outputs.
        preprocessor (optional): Custom preprocessor; if None, a standard
            preprocessor will be created.
    """

    self.max_input_len = max_input_len
    self.max_output_len = max_output_len
    self.batch_size = batch_size
    self.max_vocab_size_encoder = max_vocab_size_encoder
    self.max_vocab_size_decoder = max_vocab_size_decoder
    self.bucketing_buffer_size_batches = bucketing_buffer_size_batches
    self.bucketing_batches_to_bucket = bucketing_batches_to_bucket
    self.embedding_path_encoder = embedding_path_encoder
    self.embedding_path_decoder = embedding_path_decoder
    self.steps_per_epoch = steps_per_epoch
    self.tensorboard_dir = tensorboard_dir
    self.model_save_path = model_save_path
    self.loss_function = masked_crossentropy
    self.use_bucketing = use_bucketing
    self.shuffle_buffer_size = None if use_bucketing else shuffle_buffer_size
    self.bucket_generator = None
    if use_bucketing:
        self.bucket_generator = BucketGenerator(
            element_length_function=lambda vecs: len(vecs[0]),
            batch_size=self.batch_size,
            buffer_size_batches=self.bucketing_buffer_size_batches,
            batches_to_bucket=self.bucketing_batches_to_bucket,
            shuffle=True,
            seed=42)
    self.logger = get_logger(__name__)
    self.logger.setLevel(logging_level)
    self.num_print_predictions = num_print_predictions
    self.steps_to_log = steps_to_log
    self.preprocessor = preprocessor or Preprocessor(start_token=START_TOKEN,
                                                     end_token=END_TOKEN)
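# Usage sketch (illustration only, not exercised by the tests below): shows how
# a Trainer might be constructed with a custom Preprocessor, using only the
# constructor arguments documented in the docstring above. The commented-out
# `trainer.train(...)` call is an assumption about the training entry point and
# is not confirmed by this module; the actual method name and signature may differ.
def _example_trainer_setup():
    # Custom preprocessing: lower-case the text and replace digits with '#'.
    preprocessor = Preprocessor(lower_case=True, hash_numbers=True)
    # Hypothetical configuration values chosen purely for illustration.
    trainer = Trainer(batch_size=32,
                      max_output_len=50,
                      use_bucketing=True,
                      tensorboard_dir='/tmp/tensorboard',
                      model_save_path='/tmp/summarizer_model',
                      preprocessor=preprocessor)
    # Hypothetical training call, shown commented out:
    # trainer.train(summarizer, train_data, num_epochs=5)
    return trainer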
def test_training(self) -> None:
    data = [('a b', 'c'), ('a b c', 'd')]
    tokenizer_encoder = KerasTokenizer(lower=False, filters='')
    tokenizer_decoder = KerasTokenizer(lower=False, filters='')
    tokenizer_encoder.fit(['a b c <start> <end>'])
    tokenizer_decoder.fit(['c d <start> <end>'])
    vectorizer = Vectorizer(tokenizer_encoder=tokenizer_encoder,
                            tokenizer_decoder=tokenizer_decoder,
                            max_output_len=3)
    preprocessor = Preprocessor()
    batch_generator = DatasetGenerator(2)
    data_prep = [preprocessor(d) for d in data]
    data_vecs = [vectorizer(d) for d in data_prep]
    dataset = batch_generator(lambda: data_vecs)

    summarizer_transformer = SummarizerTransformer(num_heads=1,
                                                   num_layers=1,
                                                   feed_forward_dim=20,
                                                   embedding_size=10,
                                                   dropout_rate=0,
                                                   max_prediction_len=3)
    summarizer_transformer.init_model(preprocessor=preprocessor,
                                      vectorizer=vectorizer,
                                      embedding_weights_encoder=None,
                                      embedding_weights_decoder=None)

    summarizer_attention = SummarizerAttention(lstm_size=10,
                                               embedding_size=10)
    summarizer_attention.init_model(preprocessor=preprocessor,
                                    vectorizer=vectorizer,
                                    embedding_weights_encoder=None,
                                    embedding_weights_decoder=None)

    summarizer = SummarizerBasic(lstm_size=10, embedding_size=10)
    summarizer.init_model(preprocessor=preprocessor,
                          vectorizer=vectorizer,
                          embedding_weights_encoder=None,
                          embedding_weights_decoder=None)

    loss_func = masked_crossentropy

    loss_attention = 0
    train_step = summarizer_attention.new_train_step(loss_function=loss_func,
                                                     batch_size=2)
    for _ in range(10):
        for source_seq, target_seq in dataset.take(-1):
            loss_attention = train_step(source_seq, target_seq)
            print(str(loss_attention))

    self.assertAlmostEqual(1.5810251235961914, float(loss_attention), 10)
    output_attention = summarizer_attention.predict_vectors('a c', '')
    expected_first_logits = np.array(
        [-0.069454, 0.00272, 0.007199, -0.039547, 0.014357])
    np.testing.assert_allclose(expected_first_logits,
                               output_attention['logits'][0],
                               atol=1e-6)
    self.assertEqual('a c', output_attention['preprocessed_text'][0])
    self.assertEqual('<end>', output_attention['predicted_text'])

    loss = 0
    train_step = summarizer.new_train_step(loss_function=loss_func,
                                           batch_size=2)
    for _ in range(10):
        for source_seq, target_seq in dataset.take(-1):
            loss = train_step(source_seq, target_seq)

    self.assertAlmostEqual(1.5771859884262085, float(loss), 10)
    output = summarizer.predict_vectors('a c', '')
    expected_first_logits = np.array(
        [-0.03838864, 0.01226684, 0.01055636, -0.05209339, 0.02549592])
    np.testing.assert_allclose(expected_first_logits,
                               output['logits'][0],
                               atol=1e-6)
    self.assertEqual('a c', output['preprocessed_text'][0])
    self.assertEqual('<end>', output['predicted_text'])

    loss_transformer = 0
    train_step = summarizer_transformer.new_train_step(loss_function=loss_func,
                                                       batch_size=2)
    for _ in range(10):
        for source_seq, target_seq in dataset.take(-1):
            loss_transformer = train_step(source_seq, target_seq)
            print(str(loss_transformer))

    self.assertAlmostEqual(1.2841172218322754, float(loss_transformer), 10)
    output_transformer = summarizer_transformer.predict_vectors('a c', '')
    expected_first_logits = np.array(
        [0.094787, 0.516092, 1.165521, 0.271338, 0.670318])
    np.testing.assert_allclose(expected_first_logits,
                               output_transformer['logits'][0],
                               atol=1e-6)
    self.assertEqual('a c', output_transformer['preprocessed_text'][0])
    self.assertEqual('d <end>', output_transformer['predicted_text'])
def test_training(self) -> None:
    data = [('a b', 'c'), ('a b c', 'd')]
    tokenizer_encoder = KerasTokenizer(lower=False, filters='')
    tokenizer_decoder = KerasTokenizer(lower=False, filters='')
    tokenizer_encoder.fit(['a b c <start> <end>'])
    tokenizer_decoder.fit(['c d <start> <end>'])
    vectorizer = Vectorizer(tokenizer_encoder=tokenizer_encoder,
                            tokenizer_decoder=tokenizer_decoder,
                            max_output_len=3)
    preprocessor = Preprocessor()
    batch_generator = DatasetGenerator(2)
    data_prep = [preprocessor(d) for d in data]
    data_vecs = [vectorizer(d) for d in data_prep]
    dataset = batch_generator(lambda: data_vecs)

    summarizer_transformer = SummarizerTransformer(num_heads=1,
                                                   num_layers=1,
                                                   feed_forward_dim=20,
                                                   embedding_size=10,
                                                   dropout_rate=0,
                                                   max_prediction_len=3)
    summarizer_transformer.init_model(preprocessor=preprocessor,
                                      vectorizer=vectorizer,
                                      embedding_weights_encoder=None,
                                      embedding_weights_decoder=None)

    summarizer_attention = SummarizerAttention(lstm_size=10,
                                               embedding_size=10)
    summarizer_attention.init_model(preprocessor=preprocessor,
                                    vectorizer=vectorizer,
                                    embedding_weights_encoder=None,
                                    embedding_weights_decoder=None)

    summarizer = SummarizerBasic(lstm_size=10, embedding_size=10)
    summarizer.init_model(preprocessor=preprocessor,
                          vectorizer=vectorizer,
                          embedding_weights_encoder=None,
                          embedding_weights_decoder=None)

    loss_func = masked_crossentropy

    loss_attention = 0
    train_step = summarizer_attention.new_train_step(loss_function=loss_func,
                                                     batch_size=2)
    for _ in range(10):
        for source_seq, target_seq in dataset.take(-1):
            loss_attention = train_step(source_seq, target_seq)
            print(str(loss_attention))

    self.assertAlmostEqual(1.577033519744873, float(loss_attention), 5)
    output_attention = summarizer_attention.predict_vectors('a c', '')
    expected_first_logits = np.array(
        [-0.077805, 0.012667, 0.021359, -0.04872, 0.014989])
    np.testing.assert_allclose(expected_first_logits,
                               output_attention['logits'][0],
                               atol=1e-6)
    self.assertEqual('<start> a c <end>',
                     output_attention['preprocessed_text'][0])
    self.assertEqual('d <end>', output_attention['predicted_text'])

    loss = 0
    train_step = summarizer.new_train_step(loss_function=loss_func,
                                           batch_size=2)
    for _ in range(10):
        for source_seq, target_seq in dataset.take(-1):
            loss = train_step(source_seq, target_seq)

    self.assertAlmostEqual(1.5713274478912354, float(loss), 5)
    output = summarizer.predict_vectors('a c', '')
    expected_first_logits = np.array(
        [-0.051753, 0.013869, 0.010337, -0.073727, 0.033059])
    np.testing.assert_allclose(expected_first_logits,
                               output['logits'][0],
                               atol=1e-6)
    self.assertEqual('<start> a c <end>', output['preprocessed_text'][0])
    self.assertEqual('<end>', output['predicted_text'])

    loss_transformer = 0
    train_step = summarizer_transformer.new_train_step(loss_function=loss_func,
                                                       batch_size=2)
    for _ in range(10):
        for source_seq, target_seq in dataset.take(-1):
            loss_transformer = train_step(source_seq, target_seq)
            print(str(loss_transformer))

    self.assertAlmostEqual(1.175953984260559, float(loss_transformer), 5)
    output_transformer = summarizer_transformer.predict_vectors('a c', '')
    expected_first_logits = np.array(
        [-0.197903, 0.884185, 1.147212, 0.318798, 0.97936])
    np.testing.assert_allclose(expected_first_logits,
                               output_transformer['logits'][0],
                               atol=1e-6)
    self.assertEqual('<start> a c <end>',
                     output_transformer['preprocessed_text'][0])
    self.assertEqual('d <end>', output_transformer['predicted_text'])