def test_output_layer(self):
    """Tests the various ways of specifying the output layer."""
    decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                 output_layer=None)
    self.assertIsInstance(decoder, TransformerDecoder)

    decoder = TransformerDecoder(output_layer=texar.core.identity)
    self.assertIsInstance(decoder, TransformerDecoder)

    tensor = torch.rand(self._vocab_size, self._emb_dim, dtype=torch.float)
    decoder = TransformerDecoder(output_layer=tensor)
    self.assertIsInstance(decoder, TransformerDecoder)
    self.assertEqual(decoder.vocab_size, self._vocab_size)
def test_output_layer(self):
    """Tests the various ways of specifying the output layer."""
    decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                 output_layer=None)
    self.assertIsInstance(decoder, TransformerDecoder)

    decoder = TransformerDecoder(output_layer=tf.identity)
    self.assertIsInstance(decoder, TransformerDecoder)

    tensor = tf.random_uniform(
        [self._emb_dim, self._vocab_size], maxval=1, dtype=tf.float32)
    decoder = TransformerDecoder(output_layer=tensor)
    self.assertIsInstance(decoder, TransformerDecoder)
    self.assertEqual(decoder.vocab_size, self._vocab_size)
def test_beam_search(self):
    """Tests beam search decoding."""
    decoder = TransformerDecoder(
        vocab_size=self._vocab_size,
        output_layer=self._output_layer)
    outputs = decoder(
        memory=self._memory,
        memory_sequence_length=self._memory_sequence_length,
        memory_attention_bias=None,
        inputs=None,
        embedding=self._embedding_fn,
        beam_width=5,
        start_tokens=self._start_tokens,
        end_token=self._end_token,
        max_decoding_length=self._max_decode_len,
        mode=tf.estimator.ModeKeys.PREDICT)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs_ = sess.run(outputs)
        self.assertEqual(outputs_['log_prob'].shape,
                         (self._batch_size, 5))
        self.assertEqual(outputs_['sample_id'].shape,
                         (self._batch_size, self._max_decode_len, 5))
def test_infer_greedy_with_context_without_memory(self):
    """Tests infer_greedy decoding with context but without memory."""
    decoder = TransformerDecoder(
        vocab_size=self._vocab_size,
        output_layer=self._output_layer)
    helper = tx_helper.GreedyEmbeddingHelper(
        self._embedding_fn, self._start_tokens, self._end_token)
    outputs, length = decoder(
        memory=None,
        memory_sequence_length=None,
        memory_attention_bias=None,
        inputs=None,
        decoding_strategy='infer_greedy',
        helper=helper,
        context=self._context,
        context_sequence_length=self._context_length,
        end_token=self._end_token,
        max_decoding_length=self._max_decode_len,
        mode=tf.estimator.ModeKeys.PREDICT)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs_ = sess.run(outputs)
        self.assertIsInstance(outputs_, TransformerDecoderOutput)
def __init__(self, config_model, config_data):
    ModuleBase.__init__(self)
    self.config_model = config_model
    self.config_data = config_data

    # Load the vocabulary produced by pre-processing.
    with open(config_data.vocab_file, "rb") as f:
        id2w = pickle.load(f)
    self.id2w = id2w
    self.vocab_size = len(id2w)

    self.pad_token_id, self.bos_token_id = (0, 1)
    self.eos_token_id, self.unk_token_id = (2, 3)

    self.word_embedder = WordEmbedder(
        vocab_size=self.vocab_size,
        hparams=config_model.emb)
    self.pos_embedder = SinusoidsPositionEmbedder(
        position_size=config_data.max_decoding_length,
        hparams=config_model.position_embedder_hparams)

    self.encoder = TransformerEncoder(hparams=config_model.encoder)
    self.decoder = TransformerDecoder(
        vocab_size=self.vocab_size,
        output_layer=self.word_embedder.embedding,
        hparams=config_model.decoder)

    self.smoothed_loss_func = LabelSmoothingLoss(
        label_confidence=self.config_model.loss_label_confidence,
        tgt_vocab_size=self.vocab_size,
        ignore_index=0)
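# A minimal sketch (not part of the original module) of a training-mode
# forward pass wiring the modules above together. The method and argument
# names are assumptions; the decoder call follows the
# decoding_strategy='train_greedy' pattern used in the tests below.
def _train_forward_sketch(self, src_ids, src_lengths, tgt_input_ids, labels):
    # Embed source tokens, add sinusoid position embeddings, and encode.
    src_embeds = (self.word_embedder(src_ids)
                  + self.pos_embedder(sequence_length=src_lengths))
    memory = self.encoder(inputs=src_embeds, sequence_length=src_lengths)

    # Teacher-forced decoding over the gold target prefix.
    tgt_lengths = (tgt_input_ids != self.pad_token_id).long().sum(dim=-1)
    tgt_embeds = (self.word_embedder(tgt_input_ids)
                  + self.pos_embedder(sequence_length=tgt_lengths))
    outputs = self.decoder(
        memory=memory,
        memory_sequence_length=src_lengths,
        inputs=tgt_embeds,
        decoding_strategy='train_greedy')

    # Label-smoothed cross entropy; PAD (index 0) is ignored.
    return self.smoothed_loss_func(outputs.logits, labels, tgt_lengths)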
def test_greedy_embedding_helper(self):
    """Tests decoding with decoder_helpers.GreedyEmbeddingHelper."""
    decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                 output_layer=self._output_layer)
    decoder.eval()
    helper = decoder_helpers.GreedyEmbeddingHelper(
        self._embedding, self._start_tokens, self._end_token)
    outputs, length = decoder(
        memory=self._memory,
        memory_sequence_length=self._memory_sequence_length,
        memory_attention_bias=None,
        helper=helper,
        max_decoding_length=self._max_decode_len)
    self.assertIsInstance(outputs, TransformerDecoderOutput)
def test_beam_search(self):
    """Tests beam search decoding."""
    decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                 output_layer=self._output_layer)
    decoder.eval()
    outputs = decoder(
        memory=self._memory,
        memory_sequence_length=self._memory_sequence_length,
        memory_attention_bias=None,
        inputs=None,
        beam_width=5,
        start_tokens=self._start_tokens,
        end_token=self._end_token,
        max_decoding_length=self._max_decode_len)
    self.assertEqual(outputs['log_prob'].shape,
                     (self._batch_size, 5))
    self.assertEqual(outputs['sample_id'].shape,
                     (self._batch_size, self._max_decode_len, 5))
def test_infer_greedy_with_context_without_memory(self):
    """Tests infer_greedy decoding with context but without memory."""
    decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                 output_layer=self._output_layer)
    decoder.eval()
    outputs, length = decoder(
        memory=None,
        memory_sequence_length=None,
        memory_attention_bias=None,
        inputs=None,
        decoding_strategy='infer_greedy',
        context=self._context,
        context_sequence_length=self._context_length,
        end_token=self._end_token,
        embedding=self._embedding_fn,
        max_decoding_length=self._max_decode_len)
    self.assertIsInstance(outputs, TransformerDecoderOutput)
def test_decode_infer_sample(self):
    """Tests infer_sample decoding."""
    decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                 output_layer=self._output_layer)
    decoder.eval()
    helper = decoder_helpers.SampleEmbeddingHelper(
        self._embedding_fn, self._start_tokens, self._end_token)
    outputs, length = decoder(
        memory=self._memory,
        memory_sequence_length=self._memory_sequence_length,
        memory_attention_bias=None,
        inputs=None,
        helper=helper,
        max_decoding_length=self._max_decode_len)
    self.assertIsInstance(outputs, TransformerDecoderOutput)
def test_decode_train(self):
    """Tests train_greedy decoding."""
    decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                 output_layer=self._output_layer)
    decoder.train()

    # Expected trainable parameters, per block (6 blocks in total):
    # - self multi-head attention: 4 dense layers without bias
    #   + 2 layer-norm variables
    # - enc-dec multi-head attention: 4 dense layers without bias
    #   + 2 layer-norm variables
    # - position-wise network: 2 dense layers with bias
    #   + 2 layer-norm variables
    # plus 2 final layer-norm variables:
    # 6 * (4 + 2 + 4 + 2 + 2 * 2 + 2) + 2 = 6 * 18 + 2 = 110
    outputs = decoder(
        memory=self._memory,
        memory_sequence_length=self._memory_sequence_length,
        memory_attention_bias=None,
        inputs=self._inputs,
        decoding_strategy='train_greedy')
    self.assertEqual(len(decoder.trainable_variables), 110)
    self.assertIsInstance(outputs, TransformerDecoderOutput)
def test_beam_search(self):
    """Tests beam search decoding."""
    decoder = TransformerDecoder(
        token_pos_embedder=self._embedding_fn,
        vocab_size=self._vocab_size,
        output_layer=self._output_layer)
    decoder.eval()
    beam_width = 5
    outputs = decoder(
        memory=self._memory,
        memory_sequence_length=self._memory_sequence_length,
        memory_attention_bias=None,
        inputs=None,
        beam_width=beam_width,
        start_tokens=self._start_tokens,
        end_token=self._end_token,
        max_decoding_length=self._max_decode_len)
    self.assertEqual(outputs['log_prob'].size(),
                     (self._batch_size, beam_width))
    # sample_id has shape [batch_size, length, beam_width]; length may be
    # shorter than max_decode_len if all beams finish early.
    self.assertEqual(outputs['sample_id'].size(0), self._batch_size)
    self.assertLessEqual(outputs['sample_id'].size(1),
                         self._max_decode_len)
    self.assertEqual(outputs['sample_id'].size(2), beam_width)
def test_greedy_embedding_helper(self):
    """Tests decoding with tf.contrib.seq2seq.GreedyEmbeddingHelper."""
    decoder = TransformerDecoder(embedding=self._embedding)
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        self._embedding, self._start_tokens, self._end_token)
    outputs, length = decoder(
        memory=self._memory,
        memory_sequence_length=self._memory_sequence_length,
        memory_attention_bias=None,
        helper=helper,
        max_decoding_length=self._max_decode_len,
        mode=tf.estimator.ModeKeys.PREDICT)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs_ = sess.run(outputs)
        self.assertIsInstance(outputs_, TransformerDecoderOutput)
def test_infer_greedy(self):
    """Tests infer_greedy decoding."""
    decoder = TransformerDecoder(embedding=self._embedding)
    outputs, length = decoder(
        memory=self._memory,
        memory_sequence_length=self._memory_sequence_length,
        memory_attention_bias=None,
        inputs=None,
        decoding_strategy='infer_greedy',
        start_tokens=self._start_tokens,
        end_token=self._end_token,
        max_decoding_length=self._max_decode_len,
        mode=tf.estimator.ModeKeys.PREDICT)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs_ = sess.run(outputs)
        self.assertIsInstance(outputs_, TransformerDecoderOutput)
def __init__(self, gpt2_config, top_k, temperature):
    super().__init__()
    self.word_embedder = WordEmbedder(
        vocab_size=gpt2_config.vocab_size,
        hparams=gpt2_config.embed)
    self.pos_embedder = PositionEmbedder(
        position_size=gpt2_config.position_size,
        hparams=gpt2_config.pos_embed)
    self.decoder = TransformerDecoder(
        vocab_size=gpt2_config.vocab_size,
        output_layer=self.word_embedder.embedding,
        hparams=gpt2_config.decoder)
    self.top_k = top_k
    self.temperature = temperature

    # Maps (token ids, position ids) to summed word + position embeddings.
    self._embedding_fn = lambda x, y: (
        self.word_embedder(x) + self.pos_embedder(y))
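# A minimal sketch (not part of the original module) of generating
# continuations of `context` with top-k sampling. The method name, the
# TopKSampleEmbeddingHelper signature, and the `end_token` argument are
# assumptions; the decoder call mirrors the context-conditioned decoding
# in the tests above.
def _sample_sketch(self, context, context_length, end_token,
                   max_decoding_length):
    helper = TopKSampleEmbeddingHelper(
        start_tokens=context[:, 0],
        end_token=end_token,
        top_k=self.top_k,
        softmax_temperature=self.temperature)
    output, _ = self.decoder(
        context=context,
        context_sequence_length=context_length,
        embedding=self._embedding_fn,
        helper=helper,
        max_decoding_length=max_decoding_length)
    return output.sample_id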
def test_train(self):
    """Tests train_greedy decoding."""
    decoder = TransformerDecoder(embedding=self._embedding)

    # Expected trainable variables, per block (6 blocks in total):
    # - self multi-head attention: 4 dense layers without bias
    #   + 2 layer-norm variables
    # - enc-dec multi-head attention: 4 dense layers without bias
    #   + 2 layer-norm variables
    # - position-wise network: 2 dense layers with bias
    #   + 2 layer-norm variables
    # plus 2 final layer-norm variables.
    outputs = decoder(
        memory=self._memory,
        memory_sequence_length=self._memory_sequence_length,
        memory_attention_bias=None,
        inputs=self._inputs,
        decoding_strategy='train_greedy',
        mode=tf.estimator.ModeKeys.TRAIN)
    self.assertEqual(len(decoder.trainable_variables), 110)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs_ = sess.run(outputs)
        self.assertIsInstance(outputs_, TransformerDecoderOutput)
def test_decode_infer_sample(self):
    """Tests infer_sample decoding."""
    decoder = TransformerDecoder(
        vocab_size=self._vocab_size,
        output_layer=self._output_layer)
    helper = tx_helper.SampleEmbeddingHelper(
        self._embedding_fn, self._start_tokens, self._end_token)
    outputs, length = decoder(
        memory=self._memory,
        memory_sequence_length=self._memory_sequence_length,
        memory_attention_bias=None,
        inputs=None,
        helper=helper,
        max_decoding_length=self._max_decode_len,
        mode=tf.estimator.ModeKeys.PREDICT)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs_ = sess.run(outputs)
        self.assertIsInstance(outputs_, TransformerDecoderOutput)
def __init__(self,
             pretrained_model_name: Optional[str] = None,
             cache_dir: Optional[str] = None,
             hparams=None):
    super().__init__(pretrained_model_name=pretrained_model_name,
                     cache_dir=cache_dir,
                     hparams=hparams)

    if self.pretrained_model_dir:
        self._hparams = HParams(self.pretrained_model_hparams,
                                self._hparams.todict())

    # Word embedding
    self.word_embedder = WordEmbedder(
        vocab_size=self._hparams.vocab_size,
        hparams=self._hparams.embed)

    # Position embedding
    self.position_embedder = PositionEmbedder(
        position_size=self._hparams.position_size,
        hparams=self._hparams.position_embed)

    # The GPT-2 decoder (a TransformerDecoder)
    self.decoder = TransformerDecoder(
        vocab_size=self._hparams.vocab_size,
        output_layer=self.word_embedder.embedding,
        hparams=self._hparams.decoder)

    if self.pretrained_model_dir:
        gpt2_utils.init_gpt2_checkpoint(self, self.pretrained_model_dir)
    elif self._hparams.initializer:
        initialize = layers.get_initializer(self._hparams.initializer)
        assert initialize is not None
        # Do not re-initialize LayerNorm modules.
        for name, param in self.named_parameters():
            if name.split('.')[-1] == 'weight' and 'layer_norm' not in name:
                initialize(param)
@staticmethod
def default_hparams():
    r"""Returns a dictionary of hyperparameters with default values.

    * The decoder arch is determined by the constructor argument
      :attr:`pretrained_model_name` if it's specified. In this case,
      `hparams` are ignored.
    * Otherwise, the decoder arch is determined by
      `hparams['pretrained_model_name']` if it's specified. All other
      configurations in `hparams` are ignored.
    * If the above two are `None`, the decoder arch is defined by the
      configurations in `hparams` and weights are randomly initialized.

    .. code-block:: python

        {
            "name": "gpt2_decoder",
            "pretrained_model_name": "117M",
            "vocab_size": 50257,
            "context_size": 1024,
            "embedding_size": 768,
            "embed": {
                "dim": 768,
                "name": "word_embeddings"
            },
            "position_size": 1024,
            "position_embed": {
                "dim": 768,
                "name": "position_embeddings"
            },

            # hparams for TransformerDecoder
            "decoder": {
                "dim": 768,
                "num_blocks": 12,
                "use_gpt_config": True,
                "embedding_dropout": 0,
                "residual_dropout": 0,
                "multihead_attention": {
                    "use_bias": True,
                    "num_units": 768,
                    "num_heads": 12,
                    "dropout_rate": 0.0,
                    "output_dim": 768
                },
                "initializer": {
                    "type": "variance_scaling_initializer",
                    "kwargs": {
                        "factor": 1.0,
                        "mode": "FAN_AVG",
                        "uniform": True
                    }
                },
                "poswise_feedforward": {
                    "layers": [
                        {
                            "type": "Linear",
                            "kwargs": {
                                "in_features": 768,
                                "out_features": 3072,
                                "bias": True
                            }
                        },
                        {
                            "type": "GPTGELU",
                            "kwargs": {}
                        },
                        {
                            "type": "Linear",
                            "kwargs": {
                                "in_features": 3072,
                                "out_features": 768,
                                "bias": True
                            }
                        }
                    ],
                    "name": "ffn"
                }
            },
        }

    Here:

    The default parameters are the values used in the 117M GPT-2 model.

    `"pretrained_model_name"`: str or None
        The name of the pre-trained GPT-2 model. If None, the model
        will be randomly initialized.

    `"embed"`: dict
        Hyperparameters for the word embedding layer.

    `"vocab_size"`: int
        The vocabulary size of `inputs` in `GPT2Model`.

    `"position_embed"`: dict
        Hyperparameters for the position embedding layer.

    `"position_size"`: int
        The maximum sequence length that this model might ever be
        used with.

    `"name"`: str
        Name of the module.
    """
    return {
        **TransformerDecoder.default_hparams(),
        'dim': 768,
        'num_blocks': 12,
        'use_gpt_config': True,
        'embedding_dropout': 0,
        'residual_dropout': 0,
        'multihead_attention': {
            'use_bias': True,
            'num_units': 768,
            'num_heads': 12,
            'dropout_rate': 0.0,
            'output_dim': 768
        },
        'initializer': {
            'type': 'variance_scaling_initializer',
            'kwargs': {
                'factor': 1.0,
                'mode': 'FAN_AVG',
                'uniform': True
            }
        },
        'poswise_feedforward': {
            'layers': [
                {
                    'type': 'Linear',
                    'kwargs': {
                        'in_features': 768,
                        'out_features': 3072,
                        'bias': True
                    }
                },
                {
                    'type': 'GPTGELU',
                    'kwargs': {}
                },
                {
                    'type': 'Linear',
                    'kwargs': {
                        'in_features': 3072,
                        'out_features': 768,
                        'bias': True
                    }
                }
            ],
            'name': 'ffn'
        },
        'pretrained_model_name': '117M',
        'vocab_size': 50257,
        'context_size': 1024,
        'embedding_size': 768,
        'embed': {
            'dim': 768,
            'name': 'word_embeddings'
        },
        'position_size': 1024,
        'position_embed': {
            'dim': 768,
            'name': 'position_embeddings'
        },
        'name': 'gpt2_decoder',
        '@no_typecheck': ['pretrained_model_name'],
    }
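# A brief usage sketch for the hyperparameters above. The class name
# `GPT2Decoder` is inferred from the surrounding constructor and should be
# treated as an assumption.
def build_decoder_sketch():
    # Load pre-trained 117M weights; `hparams` are then ignored.
    pretrained = GPT2Decoder(pretrained_model_name="117M")
    # Randomly initialized, overriding selected defaults above.
    scratch = GPT2Decoder(hparams={
        "pretrained_model_name": None,
        "num_blocks": 6,
    })
    return pretrained, scratch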
def main(_):
    """Builds the model and runs."""
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    nsamples = FLAGS.nsamples
    batch_size = FLAGS.batch_size
    max_decoding_length = FLAGS.max_decoding_length
    ckpt_path = FLAGS.checkpoint

    # Load GPT-2 model configuration
    if FLAGS.config_type == "json":
        gpt2_config = model_utils.transform_gpt2_to_texar_config(
            FLAGS.config_model)
    elif FLAGS.config_type == 'texar':
        gpt2_config = importlib.import_module(FLAGS.config_model)
    else:
        raise ValueError('Unknown config_type.')

    assert max_decoding_length <= gpt2_config.decoder["position_size"], (
        "max_decoding_length should not exceed position_size")
    assert nsamples % batch_size == 0, (
        "nsamples must be divisible by batch_size")

    # Create a data pre-processor for, e.g., BPE encoding
    proc = processor.get_encoder("gpt2_pretrained_models/model_117M")

    context = tf.placeholder(tf.int32, [batch_size, None])
    context_length = tf.placeholder(tf.int32, [batch_size])

    end_token = proc.encoder['<|endoftext|>']
    if FLAGS.is_interactive:
        start_tokens = context[:, 0]
    else:
        start_tokens = tf.fill([batch_size], end_token)

    # Build the GPT-2 model
    embedder = tx.modules.WordEmbedder(
        vocab_size=gpt2_config.vocab_size,
        hparams=gpt2_config.embed)

    helper = tx.modules.TopKSampleEmbeddingHelper(
        embedding=embedder,
        start_tokens=start_tokens,
        end_token=end_token,
        top_k=FLAGS.top_k,
        softmax_temperature=FLAGS.temperature)

    decoder = TransformerDecoder(
        embedding=embedder.embedding,
        hparams=gpt2_config.decoder)

    with tf.Session() as sess:
        if FLAGS.is_interactive:
            # Generate continuations of context
            lm_output, _ = decoder(
                context=context,
                context_sequence_length=context_length,
                max_decoding_length=max_decoding_length,
                helper=helper,
                mode=tf.estimator.ModeKeys.PREDICT)

            # Load model checkpoint
            model_utils.init_gpt2_checkpoint(sess, ckpt_path)
            print("\nFinished loading\n")

            # Enter interactive mode
            while True:
                raw_text = input("Model input >>> ")
                while not raw_text:
                    print('Input should not be empty!')
                    raw_text = input("Model input >>> ")

                context_tokens = proc.encode(raw_text)
                feed_dict = {
                    context: [context_tokens for _ in range(batch_size)],
                    context_length:
                        [len(context_tokens) for _ in range(batch_size)],
                    tx.context.global_mode():
                        tf.estimator.ModeKeys.PREDICT
                }
                generated = 0
                for _ in range(nsamples // batch_size):
                    output = sess.run(lm_output, feed_dict=feed_dict)
                    sample_id = output.sample_id
                    for i in range(batch_size):
                        generated += 1
                        print("=" * 40 + " SAMPLE " + str(generated)
                              + " " + "=" * 40)
                        # Strip the input context from the generated sample
                        si = sample_id[i][len(context_tokens):]
                        print(proc.decode(si))
                print("=" * 80)
        else:
            # Generate samples from scratch
            lm_output, _ = decoder(
                max_decoding_length=max_decoding_length,
                helper=helper,
                mode=tf.estimator.ModeKeys.PREDICT)

            # Load model checkpoint
            model_utils.init_gpt2_checkpoint(sess, ckpt_path)
            print("\nFinished loading\n")

            feed_dict = {
                tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT
            }
            generated = 0
            while nsamples == 0 or generated < nsamples:
                output = sess.run(lm_output, feed_dict=feed_dict)
                sample_id = output.sample_id
                for i in range(batch_size):
                    generated += 1
                    text = proc.decode(sample_id[i])
                    print("=" * 40 + " SAMPLE " + str(generated)
                          + " " + "=" * 40)
                    print(text)