def build_generator(self, inputs):
    """Encode `inputs` and decode a sequence with the generator.

    The padding bias and encoder outputs are stored on
    `self.attention_bias` and `self.encoder_outputs`. Decoding uses
    `self.rl_predict` when `self.is_train` is truthy and
    `self.argmax_predict` otherwise.

    Args:
        inputs: source tensor handed to `self.encode`.

    Returns:
        A 3-tuple `(decoded_ids, decoded_logits, log_probs)`. On the
        argmax path no log-probabilities are produced, so `log_probs`
        is `None`.
    """
    if ModeKeys.is_predict_one(self.mode):
        self.attention_bias = None
    else:
        # [batch, 1, 1, src_len]
        self.attention_bias = model_utils.get_padding_bias(inputs)

    # [batch, src_len, hidden_size]
    self.encoder_outputs = self.encode(inputs, self.attention_bias)

    if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
        # The result is discarded; presumably the call is made only for its
        # graph-building side effects -- confirm against decoder_stack.
        fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
        self.decoder_stack(fake_decoder_inputs, self.encoder_outputs,
                           None, None, None)

    if self.is_train:
        tf.logging.info("!!!!!! using rl predict in traning !!!!!!")
        decoded_ids, decoded_logits, log_probs = self.rl_predict(
            self.encoder_outputs, self.attention_bias)
        return decoded_ids, decoded_logits, log_probs

    tf.logging.info(
        "!!!!!!! using argmax_predict in prediction/evaluation !!!!!!!!"
    )
    decoded_ids, decoded_logits = self.argmax_predict(
        self.encoder_outputs, self.attention_bias)
    # The original returned an unbound `_` here (NameError); keep the 3-tuple
    # shape of the training path with an explicit None placeholder.
    return decoded_ids, decoded_logits, None
def build_padding_rollout_generator(self, real_inputs, gen_samples, max_len, given_num):
    """Complete partially generated samples by sampling the remaining tokens.

    The first `given_num` columns of `gen_samples` are kept; positions
    `given_num .. max_len - 1` are filled one step at a time by sampling
    from the decoder's distribution at that position. Between steps the
    sequence is zero-padded back to width `max_len`.

    Args:
        real_inputs: source tensor handed to `self.encode`.
        gen_samples: previously generated ids; sliced as
            `gen_samples[:, :given_num]`.
        max_len: total length of the rolled-out sequence.
        given_num: number of leading tokens to keep. Must be a tensor,
            since `get_shape()` is called on it.

    Returns:
        The rolled-out sample tensor produced by the while loop.
    """
    with tf.variable_scope(self.name_scope, initializer=self.initializer,
                           reuse=tf.AUTO_REUSE):
        if ModeKeys.is_predict_one(self.mode):
            self.attention_bias = None
        else:
            self.attention_bias = model_utils.get_padding_bias(real_inputs)
        self.encoder_outputs = self.encode(real_inputs, self.attention_bias)

        def condition(step, _):
            return step < max_len

        def inner_loop(step, partial_y):
            logits = self.decode(partial_y, self.encoder_outputs,
                                 self.attention_bias)
            next_logits = logits[:, step, :]  # [batch, decoder_vocab_size]
            # log_softmax is the numerically stable form of log(softmax(x)):
            # the two-step version underflows to log(0) = -inf for very
            # unlikely tokens. The sampling distribution is unchanged.
            log_probs = tf.nn.log_softmax(next_logits)
            next_sample = tf.multinomial(log_probs, num_samples=1)
            next_sample = tf.cast(next_sample, dtype=tf.int32)
            partial_y = tf.concat([partial_y[:, :step], next_sample], axis=1)
            # Re-pad so the loop variable keeps width max_len.
            partial_y = tf.pad(partial_y, [[0, 0], [0, max_len - step - 1]])
            return step + 1, partial_y

        prefix = gen_samples[:, :given_num]
        init_given_y = tf.pad(prefix, [[0, 0], [0, max_len - given_num]])
        init_given_num = given_num
        _, roll_sample = tf.while_loop(
            cond=condition,
            body=inner_loop,
            loop_vars=[init_given_num, init_given_y],
            shape_invariants=[init_given_num.get_shape(),
                              tf.TensorShape([None, None])],
        )
        return roll_sample
def build_no_teacher_discriminator(self, origin_inputs, gen_target, real_loss, margin=1.0):
    """Score generated targets by how well they reconstruct the inputs.

    `gen_target` is encoded and greedily decoded (`argmax_predict`); the
    resulting logits are compared against `origin_inputs` with a padded
    cross-entropy. The per-example loss is turned into a reward
    `1 / max(margin, fake_loss / (real_loss + 1e-12) - 1)`.

    Args:
        origin_inputs: reference ids used as labels for the cross-entropy.
        gen_target: generated sequence fed to the encoder.
        real_loss: loss on real data used to normalise the fake loss.
        margin: lower bound on the reward denominator.

    Returns:
        The rewards tensor, one value per example ([batch]).
    """
    bias = model_utils.get_padding_bias(gen_target)  # [batch, 1, 1, src_len]
    encoded = self.encode(gen_target, bias)  # [batch, src_len, hidden_size]
    _, fake_logits = self.argmax_predict(encoded, bias)

    # Both results are [batch, origin_length].
    xentropy, weights = metrics.padded_cross_entropy_loss(
        fake_logits,
        origin_inputs,
        self.params.label_smoothing,
        self.params.target_vocab_size,
    )
    fake_loss = tf.reduce_sum(xentropy, axis=1) / tf.reduce_sum(weights, axis=1)

    # Named ops/summaries so the values can be looked up by name elsewhere.
    tf.identity(fake_loss[:5], "fake_loss")
    mean_fake_loss = tf.reduce_mean(fake_loss, name="mean_fake_loss")
    tf.summary.scalar("mean_fake_loss", mean_fake_loss)

    loss_ratio = fake_loss / (real_loss + 1e-12)  # epsilon guards against /0
    rewards = 1 / tf.maximum(margin, loss_ratio - 1)  # [batch]

    tf.identity(rewards[:5], "rewards")
    mean_wards = tf.reduce_mean(rewards, name="mean_wards")
    tf.summary.scalar("mean_wards", mean_wards)
    return rewards
def build_pretrain(self, inputs, targets):
    """Build the pre-training graph: greedy ids or teacher-forced logits.

    Args:
        inputs: source tensor handed to `self.encode`.
        targets: target tensor for teacher forcing, or None to decode
            greedily with `self.argmax_predict`.

    Returns:
        The predicted ids when `targets` is None, otherwise the decoder
        logits ([batch, tgt_len, vocab_size]).
    """
    # No padding bias is computed in the "predict one" modes.
    attention_bias = (
        None
        if ModeKeys.is_predict_one(self.mode)
        else model_utils.get_padding_bias(inputs)  # [batch, 1, 1, src_len]
    )
    # [batch, src_len, hidden_size]
    encoder_outputs = self.encode(inputs, attention_bias)

    if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
        # Result is unused; presumably run only for its graph side effects
        # -- confirm against decoder_stack.
        empty_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
        self.decoder_stack(empty_decoder_inputs, encoder_outputs,
                           None, None, None)

    if targets is not None:
        return self.decode(targets, encoder_outputs, attention_bias)

    predicted_ids, _ = self.argmax_predict(encoder_outputs, attention_bias)
    return predicted_ids
def build_generator(self, inputs):
    """Encode `inputs` and greedily decode inside the "Transformer" scope.

    The padding bias and encoder outputs are kept on `self.attention_bias`
    and `self.encoder_outputs`.

    Args:
        inputs: source tensor handed to `self.encode`.

    Returns:
        Whatever `self.argmax_predict` returns for the encoded inputs.
    """
    scope = tf.variable_scope(
        "Transformer", initializer=self._initializer, reuse=tf.AUTO_REUSE)
    with scope:
        # [batch, 1, 1, src_len]
        self.attention_bias = model_utils.get_padding_bias(inputs)
        # [batch, src_len, hidden_size]
        self.encoder_outputs = self.encode(inputs, self.attention_bias)
        tf.logging.info("!!!!!!! using argmax_predict in generator !!!!!!!!")
        return self.argmax_predict(self.encoder_outputs, self.attention_bias)
def __call__(self, inputs, targets=None):
    """Compute target logits, or infer a target sequence.

    Args:
        inputs: int tensor with shape [batch_size, input_length].
        targets: None or int tensor with shape [batch_size, target_length].

    Returns:
        With `targets`: float logits of shape
        [batch_size, target_length, vocab_size].
        Without `targets`: the result of `self.predict`, which decodes one
        token at a time and yields a dictionary
        {output: [batch_size, decoded length], score: [batch_size, float]}.
    """
    # Variance scaling tends to work across many problems; other reasonable
    # initializers may do just as well.
    scaling_init = tf.variance_scaling_initializer(
        self.params.initializer_gain, mode="fan_avg", distribution="uniform")

    with tf.variable_scope("Transformer", initializer=scaling_init):
        # The same bias serves encoder self-attention and decoder
        # multi-headed attention.
        padding_bias = model_utils.get_padding_bias(inputs)
        # Map symbol representations to continuous representations.
        encoded = self.encode(inputs, padding_bias)

        if targets is not None:
            return self.decode(targets, encoded, padding_bias)
        return self.predict(encoded, padding_bias)
def build_pretrain_mono(self, inputs, targets):
    """Re-pad `inputs` row by row, then run the pre-training graph on them.

    Each row is cut at `argmin(row) + 1` (presumably just past the first
    padding id 0 -- confirm the padding convention) and zero-padded to the
    longest such length in the batch. The re-padded batch is then encoded
    in the "Transformer" scope.

    Args:
        inputs: 2-D int tensor of source ids.
        targets: target tensor for teacher forcing, or None for greedy
            decoding.

    Returns:
        The result of `self.argmax_predict` when `targets` is None,
        otherwise decoder logits ([batch, tgt_len, vocab_size]).
    """
    row_lengths = tf.argmin(inputs, axis=-1) + 1
    longest = row_lengths[tf.argmax(row_lengths)]
    num_rows = tf.shape(inputs)[0]
    # Start from zero rows and append one re-padded row per iteration.
    repadded = tf.zeros([0, longest], dtype=tf.int32)

    def append_row(i, acc):
        keep = row_lengths[i]
        row = tf.reshape(inputs[i][:keep], [1, -1])
        row = tf.pad(row, [[0, 0], [0, longest - keep]])
        acc = tf.concat([acc, row], axis=0)
        return i + 1, acc

    def rows_remaining(i, _):
        return i < num_rows

    _, repadded = tf.while_loop(
        cond=rows_remaining,
        body=append_row,
        loop_vars=[tf.constant(0), repadded],
        # The accumulator grows along axis 0, so its shape must stay open.
        shape_invariants=[tf.TensorShape([]), tf.TensorShape([None, None])],
    )

    with tf.variable_scope("Transformer", initializer=self._initializer,
                           reuse=tf.AUTO_REUSE):
        # [batch, 1, 1, src_len]
        attention_bias = model_utils.get_padding_bias(repadded)
        # [batch, src_len, hidden_size]
        encoder_outputs = self.encode(repadded, attention_bias)

        if targets is None:
            return self.argmax_predict(encoder_outputs, attention_bias)

        tf.logging.info("!!! mono decoder by techer forcing !!!")
        # [batch, tgt_len, vocab_size]
        return self.decode(targets, encoder_outputs, attention_bias)
def build_generator(self, inputs):
    """Encode `inputs` and decode with the mode-appropriate predictor.

    The padding bias and encoder outputs are stored on
    `self.attention_bias` and `self.encoder_outputs`.

    Args:
        inputs: source tensor handed to `self.encode`.

    Returns:
        The result of `self.rl_predict` when `self.is_train` is truthy,
        otherwise the result of `self.argmax_predict`.
    """
    # Bias shared by encoder self-attention and decoder multi-headed
    # attention; skipped in the "predict one" modes.
    self.attention_bias = (
        None
        if ModeKeys.is_predict_one(self.mode)
        else model_utils.get_padding_bias(inputs)  # [batch, 1, 1, src_len]
    )

    # Map symbol representations to continuous representations.
    # [batch, src_len, hidden_size]
    self.encoder_outputs = self.encode(inputs, self.attention_bias)

    if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
        # Run the decoder stack on an empty input only to obtain the
        # encoder-decoder attention k/v; the output itself is discarded.
        empty_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
        self.decoder_stack(empty_decoder_inputs, self.encoder_outputs,
                           None, None, None)

    if not self.is_train:
        tf.logging.info(
            "!!!!!!! using argmax_predict in inference !!!!!!!!")
        return self.argmax_predict(self.encoder_outputs, self.attention_bias)

    tf.logging.info("!!!!!! using rl predict in traning !!!!!!")
    return self.rl_predict(self.encoder_outputs, self.attention_bias)
def get_bleu(self, gen_targets, real_inputs):
    """Compute a per-example BLEU between `real_inputs` and a reconstruction.

    `gen_targets` is encoded, `real_inputs` is decoded against it with
    teacher forcing, and the argmax of the logits is scored against
    `real_inputs` by `metrics.compute_bleu_batch` (run through
    `tf.py_func`).

    Args:
        gen_targets: generated sequence fed to the encoder.
        real_inputs: reference sequence, used both as decoder input and as
            the BLEU reference.

    Returns:
        A float32 tensor reshaped to [batch, 1].
    """
    with tf.variable_scope(self.name_scope, initializer=self.initializer,
                           reuse=tf.AUTO_REUSE):
        padding_bias = model_utils.get_padding_bias(gen_targets)
        encoded = self.encode(gen_targets, padding_bias)
        logits = self.decode(real_inputs, encoded, padding_bias)
        predicted_ids = tf.argmax(logits, axis=-1)  # [batch, ori_inp_len]
        scores = tf.py_func(
            metrics.compute_bleu_batch,
            (real_inputs, predicted_ids),
            tf.float32,
        )
        return tf.reshape(scores, (-1, 1))  # [batch, 1]
def get_loss(self, origin_inputs, targets):
    """Per-example cross-entropy of reconstructing `origin_inputs`.

    `targets` is encoded in the "Discriminator" scope and `origin_inputs`
    is decoded against it with teacher forcing. The per-example loss is
    also stored on `self.loss`.

    Args:
        origin_inputs: reference ids; decoder input and loss labels.
        targets: sequence fed to the encoder.

    Returns:
        The loss reshaped to [batch, 1].
    """
    with tf.variable_scope("Discriminator", initializer=self._initializer,
                           reuse=tf.AUTO_REUSE):
        # [batch, 1, 1, src_len]
        padding_bias = model_utils.get_padding_bias(targets)
        # [batch, src_len, hidden_size]
        encoded = self.encode(targets, padding_bias)
        logits = self.decode(origin_inputs, encoded, padding_bias)

        # Both results are [batch, origin_length].
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits,
            origin_inputs,
            self.params.label_smoothing,
            self.params.target_vocab_size,
        )
        summed_xentropy = tf.reduce_sum(xentropy, axis=1)
        summed_weights = tf.reduce_sum(weights, axis=1)
        self.loss = summed_xentropy / summed_weights  # [batch]
        return tf.reshape(self.loss, (-1, 1))  # [batch, 1]
def call(self, inputs, targets: Optional[np.ndarray] = None):
    """Encode `inputs` and return the decoder logits.

    Args:
        inputs: source tensor; also used to derive the padding bias.
        targets: optional targets forwarded unchanged to `self._decode`
            (None is forwarded as-is).

    Returns:
        The value returned by `self._decode`.
    """
    padding_bias = model_utils.get_padding_bias(inputs)
    encoded = self._encode(inputs, padding_bias)
    # The `targets is None` and `targets is not None` paths issue the very
    # same call, so no branching is needed.
    return self._decode(encoded, targets, padding_bias)
def get_loss(self, gen_targets, real_inputs):
    """Per-example cross-entropy of reconstructing `real_inputs`.

    `gen_targets` is encoded under `self.name_scope` and `real_inputs` is
    decoded against it with teacher forcing.

    Args:
        gen_targets: generated sequence fed to the encoder.
        real_inputs: reference ids; decoder input and loss labels.

    Returns:
        The per-example loss reshaped to [batch, 1].
    """
    with tf.variable_scope(self.name_scope, initializer=self.initializer,
                           reuse=tf.AUTO_REUSE):
        padding_bias = model_utils.get_padding_bias(gen_targets)
        encoded = self.encode(gen_targets, padding_bias)
        logits = self.decode(real_inputs, encoded, padding_bias)

        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits,
            real_inputs,
            self.params.label_smoothing,
            self.params.target_vocab_size,
        )
        # Weighted mean over the sequence axis -> [batch].
        per_example = tf.reduce_sum(xentropy, axis=1) / tf.reduce_sum(weights, axis=1)
        return tf.reshape(per_example, (-1, 1))  # [batch, 1]
def build_pretrain(self, inputs, targets):
    """Build the pre-training graph inside the "Transformer" scope.

    Calls `self.init_embed("Transformer")` first, then encodes `inputs`.

    Args:
        inputs: source tensor handed to `self.encode`.
        targets: target tensor for teacher forcing, or None for greedy
            decoding.

    Returns:
        The result of `self.argmax_predict` when `targets` is None,
        otherwise decoder logits ([batch, tgt_len, vocab_size]).
    """
    self.init_embed("Transformer")
    scope = tf.variable_scope(
        "Transformer", initializer=self._initializer, reuse=tf.AUTO_REUSE)
    with scope:
        # [batch, 1, 1, src_len]
        padding_bias = model_utils.get_padding_bias(inputs)
        # [batch, src_len, hidden_size]
        encoded = self.encode(inputs, padding_bias)

        if targets is None:
            return self.argmax_predict(encoded, padding_bias)

        tf.logging.info("!!!!!!!!!! pretrain decoder !!!!!!!!!!!!!!!!!!")
        # [batch, tgt_len, vocab_size]
        return self.decode(targets, encoded, padding_bias)
def test_get_padding_bias(self):
    """get_padding_bias puts NEG_INF at zero ids and has shape [3, 1, 1, 5]."""
    token_ids = tf.constant([[1, 0, 0, 0, 2],
                             [3, 4, 0, 0, 0],
                             [0, 5, 6, 0, 7]])
    bias = model_utils.get_padding_bias(token_ids)
    shape_op = tf.shape(bias)
    # Drop the two singleton axes so the values can be compared row by row.
    flat_op = tf.reshape(bias, [3, 5])

    with self.test_session() as sess:
        flat_value, shape_value = sess.run((flat_op, shape_op))

    expected_bias = [
        [0, NEG_INF, NEG_INF, NEG_INF, 0],
        [0, 0, NEG_INF, NEG_INF, NEG_INF],
        [NEG_INF, 0, 0, NEG_INF, 0],
    ]
    self.assertAllEqual(expected_bias, flat_value)
    self.assertAllEqual([3, 1, 1, 5], shape_value)
def __call__(self, inputs, targets=None, eos_id=None):
    """Calculate target logits or inferred target sequences.

    Args:
        inputs: int tensor with shape [batch_size, input_length].
        targets: None or int tensor with shape [batch_size, target_length].
        eos_id: forwarded unchanged to `self.predict`; only used when
            `targets` is None.

    Returns:
        If targets is defined, then return logits for each word in the
        target sequence: float tensor with shape
        [batch_size, target_length, vocab_size].
        If targets is None, return the result of `self.predict`, which
        generates the output sequence one token at a time and yields a
        dictionary {
            output: [batch_size, decoded length]
            score: [batch_size, float]}
    """
    # Variance scaling is used here because it seems to work in many
    # problems. Other reasonable initializers may also work just as well.
    initializer = tf.variance_scaling_initializer(
        self.params.initializer_gain, mode="fan_avg", distribution="uniform")

    with tf.variable_scope("Transformer", initializer=initializer):
        # Calculate attention bias for encoder self-attention and decoder
        # multi-headed attention layers.
        attention_bias = model_utils.get_padding_bias(inputs)

        # Run the inputs through the encoder layer to map the symbol
        # representations to continuous representations.
        encoder_outputs = self.encode(inputs, attention_bias)

        # Generate output sequence if targets is None, or return logits if
        # the target sequence is known. (Debug code that used to follow the
        # return below could never execute and has been removed.)
        if targets is None:
            return self.predict(encoder_outputs, attention_bias, eos_id)

        logits = self.decode(targets, encoder_outputs, attention_bias)
        return logits
def get_real_loss(self, origin_inputs, origin_target):
    """Cross-entropy of reconstructing `origin_inputs` from the real target.

    `origin_target` is encoded in the "Discriminator" scope and
    `origin_inputs` is decoded against it with teacher forcing. The result
    is also stored on `self.real_loss`.

    Args:
        origin_inputs: reference ids; decoder input and loss labels.
        origin_target: real target sequence fed to the encoder.

    Returns:
        The loss. Both sums reduce over every axis, so this is a single
        scalar for the whole batch, not a per-example value.
    """
    with tf.variable_scope("Discriminator", initializer=self._initializer,
                           reuse=tf.AUTO_REUSE):
        # [batch, 1, 1, src_len]
        padding_bias = model_utils.get_padding_bias(origin_target)
        # [batch, src_len, hidden_size]
        encoded = self.encode(origin_target, padding_bias)
        logits = self.decode(origin_inputs, encoded, padding_bias)

        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits,
            origin_inputs,
            self.params.label_smoothing,
            self.params.target_vocab_size,
        )
        total_xentropy = tf.reduce_sum(xentropy)
        total_weight = tf.reduce_sum(weights)
        self.real_loss = total_xentropy / total_weight  # scalar
        return self.real_loss
def inference(self, inputs, targets=None, reuse=None):
    """Run the model under `self.name_scope`.

    Args:
        inputs: source tensor handed to `self.encode`.
        targets: target tensor for teacher forcing, or None to generate
            with `self.predict`.
        reuse: forwarded to `tf.variable_scope`.

    Returns:
        The result of `self.predict` when `targets` is None, otherwise the
        decoder logits.
    """
    with tf.variable_scope(self.name_scope, initializer=self.initializer,
                           reuse=reuse):
        # No padding bias is computed in the "predict one" modes.
        padding_bias = (
            None
            if ModeKeys.is_predict_one(self.mode)
            else model_utils.get_padding_bias(inputs)
        )
        encoded = self.encode(inputs, padding_bias)

        if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
            # Result is unused; presumably run only for its graph side
            # effects -- confirm against decoder_stack.
            empty_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
            self.decoder_stack(empty_decoder_inputs, encoded,
                               None, None, None)

        if targets is not None:
            return self.decode(targets, encoded, padding_bias)
        return self.predict(encoded, padding_bias)
def get_fake_loss(self, origin_inputs, gen_targets):
    """Cross-entropy of reconstructing `origin_inputs` from generated targets.

    `gen_targets` is first re-padded row by row: each row is cut at
    `argmin(row) + 1` (presumably just past the first padding id 0 --
    confirm the padding convention) and zero-padded to the longest such
    length. The re-padded batch is encoded in the "Discriminator" scope
    and `origin_inputs` is decoded against it. The result is also stored
    on `self.fake_loss`.

    Args:
        origin_inputs: reference ids; decoder input and loss labels.
        gen_targets: 2-D int tensor of generated ids.

    Returns:
        The loss. Both sums reduce over every axis, so this is a single
        scalar for the whole batch.
    """
    row_lengths = tf.argmin(gen_targets, axis=-1) + 1
    longest = row_lengths[tf.argmax(row_lengths)]
    num_rows = tf.shape(gen_targets)[0]
    # Start from zero rows and append one re-padded row per iteration.
    repadded = tf.zeros([0, longest], dtype=tf.int32)

    def append_row(i, acc):
        keep = row_lengths[i]
        row = tf.reshape(gen_targets[i][:keep], [1, -1])
        row = tf.pad(row, [[0, 0], [0, longest - keep]])
        acc = tf.concat([acc, row], axis=0)
        return i + 1, acc

    def rows_remaining(i, _):
        return i < num_rows

    _, repadded = tf.while_loop(
        cond=rows_remaining,
        body=append_row,
        loop_vars=[tf.constant(0), repadded],
        # The accumulator grows along axis 0, so its shape must stay open.
        shape_invariants=[tf.TensorShape([]), tf.TensorShape([None, None])],
    )
    gen_targets = repadded

    with tf.variable_scope("Discriminator", initializer=self._initializer,
                           reuse=tf.AUTO_REUSE):
        # [batch, 1, 1, src_len]
        padding_bias = model_utils.get_padding_bias(gen_targets)
        # [batch, src_len, hidden_size]
        encoded = self.encode(gen_targets, padding_bias)
        logits = self.decode(origin_inputs, encoded, padding_bias)

        # Both results are [batch, origin_length].
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits,
            origin_inputs,
            self.params.label_smoothing,
            self.params.target_vocab_size,
        )
        total_xentropy = tf.reduce_sum(xentropy)
        total_weight = tf.reduce_sum(weights)
        self.fake_loss = total_xentropy / total_weight  # scalar
        return self.fake_loss
def call(self, inputs, targets: Optional[np.ndarray] = None):
    """Run encoder and decoder and return logits plus the ACT ponder loss.

    Args:
        inputs: source tensor; also used to derive the padding bias.
        targets: decoder targets. Required: although the parameter defaults
            to None, calling without targets is rejected.

    Returns:
        A tuple `(logits, act_loss)`, where `act_loss` is
        `hparams['act_loss_weight']` times the sum of the mean
        (ponders + remainders) of the encoder and of the decoder.

    Raises:
        ValueError: if `targets` is None.
    """
    # Fail fast, before any encoder/decoder work is done. ValueError is a
    # subclass of Exception, so existing `except Exception` callers still
    # catch it.
    if targets is None:
        raise ValueError(
            "targets must be provided: call() does not support inference "
            "without targets")

    attention_bias = model_utils.get_padding_bias(inputs)
    encoder_outputs, enc_ponders, enc_remainders = self._encode(
        inputs, attention_bias)
    logits, dec_ponders, dec_remainders = self._decode(
        encoder_outputs, targets, attention_bias)

    enc_act_loss = tf.reduce_mean(enc_ponders + enc_remainders)
    dec_act_loss = tf.reduce_mean(dec_ponders + dec_remainders)
    act_loss = self.hparams['act_loss_weight'] * (enc_act_loss + dec_act_loss)

    if self.is_train:
        # Only record the ponder summaries every 10 global steps.
        with tf.contrib.summary.record_summaries_every_n_global_steps(10):
            tf.contrib.summary.scalar('summary/ponder_times_enc',
                                      tf.reduce_mean(enc_ponders))
            tf.contrib.summary.scalar('summary/ponder_times_dec',
                                      tf.reduce_mean(dec_ponders))
    return logits, act_loss
def __call__(self, feature, targets=None):
    """Compute target logits, or infer a target sequence.

    :param feature: source tensor handed to `self.encode`; also used to
        derive the padding bias.
    :param targets: target tensor for teacher forcing, or None to generate
        with `self.predict`.
    :return: the result of `self.predict` when `targets` is None, otherwise
        the decoder logits.
    """
    scaling_init = tf.variance_scaling_initializer(
        scale=self.params.get('initializer_gain'),
        mode='fan_avg',
        distribution='uniform',
    )
    with tf.variable_scope('transformer', initializer=scaling_init):
        padding_bias = model_utils.get_padding_bias(feature)  # [batch_size, 1, 1, length]
        encoded = self.encode(feature, padding_bias)

        if targets is not None:
            return self.decode(targets, encoded, padding_bias)
        return self.predict(encoded, padding_bias)
def build_pretrain(self, inputs, targets):
    """Build the pre-training graph: greedy ids or teacher-forced logits.

    Args:
        inputs: source tensor handed to `self.encode`.
        targets: target tensor for teacher forcing, or None to decode
            greedily with `self.argmax_predict`.

    Returns:
        The predicted ids when `targets` is None, otherwise the decoder
        logits ([batch, tgt_len, vocab_size]).
    """
    # Bias shared by encoder self-attention and decoder multi-headed
    # attention; skipped in the "predict one" modes.
    attention_bias = (
        None
        if ModeKeys.is_predict_one(self.mode)
        else model_utils.get_padding_bias(inputs)  # [batch, 1, 1, src_len]
    )

    # Map symbol representations to continuous representations.
    # [batch, src_len, hidden_size]
    encoder_outputs = self.encode(inputs, attention_bias)

    if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
        # Run the decoder stack on an empty input only to obtain the
        # encoder-decoder attention k/v; the output itself is discarded.
        empty_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
        self.decoder_stack(empty_decoder_inputs, encoder_outputs,
                           None, None, None)

    # Known targets -> logits; otherwise generate a sequence.
    if targets is not None:
        return self.decode(targets, encoder_outputs, attention_bias)

    tf.logging.info(
        "!!!!!!!!!!!prediction using argmax prediction!!!!!!!!!!!!!")
    predicted_ids, _ = self.argmax_predict(encoder_outputs, attention_bias)
    return predicted_ids
def forward(self, inputs, targets=None):
    """Compute target logits, or infer a target sequence.

    Args:
        inputs: int tensor with shape [batch_size, input_length].
        targets: None or int tensor with shape [batch_size, target_length].

    Returns:
        With `targets`: float logits of shape
        [batch_size, target_length, vocab_size].
        Without `targets`: the result of `self.predict`, which decodes one
        token at a time and yields a dictionary
        {output: [batch_size, decoded length], score: [batch_size, float]}.
    """
    padding_bias = model_utils.get_padding_bias(inputs)
    encoded = self.encode(inputs, padding_bias)
    if targets is not None:
        return self.decode(targets, encoded, padding_bias)
    return self.predict(encoded, padding_bias)
if __name__ == "__main__": import os tf.enable_eager_execution() os.environ["CUDA_VISIBLE_DEVICES"] = "0" params = model_params.TransformerBaseParams() x_inputs = tf.constant([[1, 2, 3, 0, 0], [3, 4, 5, 6, 8]], dtype=tf.int32) Enc_Embedding = embedding_layer.EmbeddingWeights(params.source_vocab_size, params.hidden_size, "source_embedding") embedded_inputs = Enc_Embedding( x_inputs, not ModeKeys.is_predict_one(ModeKeys.TRAIN)) print(embedded_inputs.shape) attention_bias = model_utils.get_padding_bias(x_inputs) print(attention_bias.shape) encoder_stack = EncoderStack(params, is_train=True, mode=ModeKeys.TRAIN) enc_out = encoder_stack(embedded_inputs, attention_bias, None) print(enc_out.shape) decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias( 10) self_attention_bias = decoder_self_attention_bias[:, :, 0:1, :1] print(self_attention_bias) attention_bias = model_utils.get_padding_bias(x_inputs) cache = { "layer_%d" % layer: { "k": tf.zeros([2, 0, params.hidden_size]), "v": tf.zeros([2, 0, params.hidden_size]), } for layer in range(params.num_hidden_layers)