def cycle_gan_internal(inputs, targets, _, hparams):
  """Cycle GAN, main step used for training."""
  with tf.variable_scope("cycle_gan"):
    # Embed inputs and targets.
    inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)
    inputs = common_layers.embedding(
        inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed")
    targets = common_layers.embedding(
        targets_orig, hparams.vocab_size, hparams.hidden_size,
        "embed", reuse=True)

    # Split the batch into input-input and target-target parts.
    inputs1, _ = split_on_batch(inputs)
    _, targets2 = split_on_batch(targets)

    # Define F and G, called inp2tgt and tgt2inp here.
    def inp2tgt(x, reuse=False):
      return transformer_vae.residual_conv(x, 1, hparams, "inp2tgt", reuse)
    def tgt2inp(x, reuse=False):
      return transformer_vae.residual_conv(x, 1, hparams, "tgt2inp", reuse)

    # Input-input part.
    inp1_tgt = inp2tgt(inputs1)
    inp1_back = tgt2inp(inp1_tgt)

    # Target-target part.
    tgt2_inp = tgt2inp(targets2, reuse=True)
    tgt2_back = inp2tgt(tgt2_inp, reuse=True)

    # Reconstruction losses.
    inp1_orig, _ = split_on_batch(inputs_orig)
    _, tgt2_orig = split_on_batch(targets_orig)
    inp1_loss = reconstruct_loss(
        inp1_back, tf.squeeze(inp1_orig, axis=3), hparams)
    tgt2_loss = reconstruct_loss(
        tgt2_back, tf.squeeze(tgt2_orig, axis=3), hparams, reuse=True)

    # Discriminator losses.
    dloss1 = discriminate_loss(inputs1, tgt2_inp, True, hparams, "inp_disc")
    dloss2 = discriminate_loss(targets2, inp1_tgt, True, hparams, "tgt_disc")

    # Reconstruct targets from inputs.
    tgt = inp2tgt(inputs, reuse=True)
    tgt = tf.layers.dense(tgt, hparams.vocab_size, name="softmax", reuse=True)
    # We use the reconstruction only for tracking progress, no gradients here!
    tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2))

    losses = {"input_input": hparams.cycle_loss_multiplier * inp1_loss,
              "target_target": hparams.cycle_loss_multiplier * tgt2_loss,
              "input_disc": dloss1,
              "target_disc": dloss2}
    return tgt, losses
def bottom(self, x):
  with tf.variable_scope(self.name):
    return common_layers.embedding(
        x,
        self._vocab_size,
        self._body_input_depth,
        multiplier=self._body_input_depth**0.5 if
        self._model_hparams.multiply_embedding_mode == "sqrt_depth" else 1.0)
def cycle_gan_internal(inputs, targets, _, hparams):
  """Cycle GAN, main step used for training."""
  with tf.variable_scope("cycle_gan"):
    # Embed inputs and targets.
    inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)
    inputs = common_layers.embedding(
        inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed")
    targets = common_layers.embedding(
        targets_orig, hparams.vocab_size, hparams.hidden_size,
        "embed", reuse=True)

    x, _ = split_on_batch(inputs)
    _, y = split_on_batch(targets)

    # Y --> X
    y_fake = generator(y, hparams, "Fy", reuse=False)
    y_to_x_loss = lossfn(y, y_fake, True, hparams, True, "YtoX")

    # X --> Y
    x_fake = generator(x, hparams, "Gx", reuse=False)
    x_to_y_loss = lossfn(y, x_fake, True, hparams, True, "XtoY")

    # Cycle-Consistency
    y_fake_ = generator(y_fake, hparams, "Gx", reuse=True)
    x_fake_ = generator(x_fake, hparams, "Fy", reuse=True)
    x_to_x_loss = hparams.cycle_loss_multiplier1 * tf.reduce_mean(
        tf.abs(x_fake_ - x))
    y_to_y_loss = hparams.cycle_loss_multiplier2 * tf.reduce_mean(
        tf.abs(y_fake_ - y))
    cycloss = x_to_x_loss + y_to_y_loss

    sample_generated = generator(inputs, hparams, "Gx", reuse=True)
    sample_generated = tf.layers.dense(
        sample_generated, hparams.vocab_size, name="softmax", reuse=None)
    sample_generated = tf.stop_gradient(
        tf.expand_dims(sample_generated, axis=2))

    losses = {"cycloss": cycloss,
              "y_to_x_loss": y_to_x_loss,
              "x_to_y_loss": x_to_y_loss}

    return sample_generated, losses
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset. Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    encoder_self_attention_bias = common_attention.attention_bias_same_segment(
        inputs_segmentation, inputs_segmentation)
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(
            targets_segmentation, inputs_segmentation))
  else:
    # Usual case - not a packed dataset.
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def targets_bottom(self, inputs):
  with tf.variable_scope(self.name):
    # Reshape inputs to 2-d tensor and embed the RGB pixel values.
    ret = common_layers.embedding(
        tf.to_int32(common_layers.flatten4d3d(inputs)),
        self.top_dimensionality,
        self._body_input_depth,
        name="input_rgb_embedding")
    if self._model_hparams.multiply_embedding_mode == "sqrt_depth":
      ret *= self._body_input_depth**0.5

    reshape_shape = common_layers.shape_list(inputs)[:3]
    reshape_shape.append(self._body_input_depth * 3)
    ret = tf.reshape(ret, reshape_shape)
    return tf.layers.dense(ret, self._body_input_depth)
def targets_bottom(self, inputs):
  with tf.variable_scope(self.name):
    # Reshape inputs to 2-d tensor and embed the RGB pixel values.
    ret = common_layers.embedding(
        tf.to_int32(common_layers.flatten4d3d(inputs)),
        self.top_dimensionality,
        self._body_input_depth,
        name="input_rgb_embedding")
    if self._model_hparams.multiply_embedding_mode == "sqrt_depth":
      ret *= self._body_input_depth**0.5

    reshape_shape = [common_layers.shape_dim(inputs, i) for i in range(3)]
    reshape_shape.append(self._body_input_depth * 3)
    ret = tf.reshape(ret, reshape_shape)
    return tf.layers.dense(ret, self._body_input_depth)
def set_embedding(x, vocab_size, dense_size, **kwargs):
  """Each (ID, position) tuple gets a unique embedding.

  Args:
    x: An int Tensor with shape [batch_size, length] whose elements are in
      [0, vocab_size).
    vocab_size: Int. The range of valid ID values elements in x can take.
    dense_size: Int. The dimensionality of an embedding vector.

  Returns:
    A float Tensor with shape [batch_size, length, dense_size].
  """
  x = keep_first_dims(x, 2)
  seq_length = common_layers.shape_list(x)[1]
  x += tf.range(seq_length, dtype=x.dtype) * vocab_size
  new_vocab_size = vocab_size * seq_length
  return common_layers.embedding(x, new_vocab_size, dense_size, **kwargs)
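# A minimal sketch (added for illustration, not part of the snippet above) of
# the index arithmetic set_embedding relies on: token ID `i` at position `p`
# is looked up at row `i + p * vocab_size` of an enlarged
# [vocab_size * length, dense_size] table, so every (ID, position) pair gets
# its own embedding vector. The helper name below is hypothetical.
def _set_embedding_index(token_id, position, vocab_size):
  return token_id + position * vocab_size

assert _set_embedding_index(token_id=3, position=0, vocab_size=10) == 3
assert _set_embedding_index(token_id=3, position=2, vocab_size=10) == 23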
def body(self, features):
  observations = features["inputs_raw"]
  # Axis 0    - Batch.
  # Axis 1    - Input Frames, 4 frames.
  # Axis 2, 3 - Height & Width.
  # Axis 4    - Channels RGB, 3 colours.
  x = tf.transpose(observations, [0, 2, 3, 1, 4])
  x_shape = common_layers.shape_list(x)
  x = tf.reshape(x, x_shape[:-2] + [-1])
  dropout = getattr(self.hparams, "dropout_ppo", 0.0)
  with tf.variable_scope("feed_forward_cnn_small"):
    x = tf.cast(x, tf.float32) / 255.0
    x = tf.layers.conv2d(
        x, 32, (5, 5), strides=(2, 2), activation=tf.nn.relu, padding="same")
    x = tf.layers.conv2d(
        x, 32, (5, 5), strides=(2, 2), activation=tf.nn.relu, padding="same")

    flat_x = tf.layers.flatten(x)
    if self.use_epochs:
      epoch = features["epoch"] + tf.zeros([x_shape[0]], dtype=tf.int32)
      # Randomly set epoch to 0 in some cases as that's the inference value.
      rand = tf.random.uniform([x_shape[0]])
      epoch = tf.where(rand < 0.1, tf.zeros_like(epoch), epoch)
      # Embed the epoch number.
      emb_epoch = common_layers.embedding(epoch, 32, 32)  # [batch, 32]
      flat_x = tf.concat([flat_x, emb_epoch], axis=1)
    flat_x = tf.layers.dropout(flat_x, rate=dropout)
    x = tf.layers.dense(flat_x, 128, activation=tf.nn.relu)

    logits = tf.layers.dense(
        x, self.hparams.problem.num_actions, name="dense2")
    logits = clip_logits(logits, self.hparams)
    logits = tf.expand_dims(logits, axis=1)

    value = tf.layers.dense(x, self.distributional_value_size)
  return {"target_policy": logits, "target_value": value}
def transformer_error_tag_prediction_layer(x, hparams, features, loss_mask,
                                           layer_collection=None):
  """Layer that predicts the error tag."""
  with tf.variable_scope('error_tag_prediction'):
    x = maybe_flatten4d3d(x)
    vocab_size = hparams.problem.feature_info['targets_error_tag'].vocab_size
    labels = features['targets_error_tag_raw']
    with tf.variable_scope('projection'):
      bottleneck = common_layers.dense(
          x,
          hparams.error_tag_embed_size,
          layer_collection=layer_collection,
          name='bottleneck',
      )
      logits = common_layers.dense(
          bottleneck,
          vocab_size,
          use_bias=False,
          layer_collection=layer_collection,
          name='logits',
      )
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)
    loss = tf.reduce_sum(xent * loss_mask)
    with tf.variable_scope('embedding'):
      # embed_mat = get_error_tag_embedding_matrix()
      y = common_layers.layer_preprocess(
          common_layers.embedding(
              labels,
              vocab_size,
              hparams.hidden_size,
              embedding_var=None,
          ),
          hparams,
          layer_collection=layer_collection,
      )
      x = common_layers.layer_postprocess(x, y, hparams)
    return x, logits, loss
def transformer_prepare_encoder(inputs, target_space, hparams):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: Tensor with shape [batch, memory_length, depth]
    target_space: a Tensor.
    hparams: run hyperparameters

  Returns:
    EncoderState holding the prepared encoder input, the encoder
    self-attention bias, and the encoder-decoder attention bias.
  """
  ignore_padding = get_ignore_padding(inputs)
  encoder_self_attention_bias = ignore_padding

  # Bias for self-attention to encourage attention to close positions.
  if hparams.proximity_bias:
    encoder_self_attention_bias += comm_attn.attention_bias_proximal(
        length=tf.shape(inputs)[1])

  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      x=target_space,
      vocab_size=32,
      dense_size=inputs.shape.as_list()[-1],
      name='target_space_embedding')
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input = inputs + emb_target_space

  if hparams.pos == 'timing':
    encoder_input = comm_attn.add_timing_signal_1d(encoder_input)

  # Putting this here since it is always called immediately after.
  encoder_input = with_dropout(encoder_input, hparams)

  return EncoderState(input=encoder_input,
                      self_attn_bias=encoder_self_attention_bias,
                      decoder_attn_bias=ignore_padding,
                      output=None)
def body(self, features):
  filters = self.hparams.hidden_size
  cur_frame = features["inputs_0"]
  prev_frame = features["inputs_1"]
  if self.hparams.per_image_standardization:
    cur_frame = tf.map_fn(
        lambda frame: tf.image.per_image_standardization(frame), cur_frame)
    prev_frame = tf.map_fn(
        lambda frame: tf.image.per_image_standardization(frame), prev_frame)

  action = common_layers.embedding(tf.to_int64(features["action"]),
                                   10, filters)
  action = tf.reshape(action, [-1, 1, 1, filters])

  frames = tf.concat([cur_frame, prev_frame], axis=3)
  h1 = tf.layers.conv2d(frames, filters, kernel_size=(3, 3), padding="SAME")
  h2 = tf.layers.conv2d(tf.nn.relu(h1 + action), filters,
                        kernel_size=(5, 5), padding="SAME")
  res = tf.layers.conv2d(tf.nn.relu(h2 + action), 3 * 256,
                         kernel_size=(3, 3), padding="SAME")

  reward_pred_h1 = tf.reduce_mean(res, axis=[1, 2])
  reward_pred = tf.layers.dense(reward_pred_h1, 2, name="reward")
  # reward_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
  #     labels=tf.to_int32(features["reward"]), logits=reward_pred)
  # reward_loss = tf.reduce_mean(reward_loss)
  x = tf.layers.flatten(h2)
  # l = tf.shape(res)[1]
  # w = tf.shape(res)[2]
  l = 210
  w = 160
  res = tf.reshape(res, [-1, l, w, 768])
  return {"targets": res, "reward": x}
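# A small NumPy sketch (shapes are illustrative assumptions, not taken from
# the model above) of why the action embedding is reshaped to
# [-1, 1, 1, filters]: the two singleton axes let `h1 + action` broadcast one
# action vector per example over every spatial position of the feature map.
import numpy as np

batch, height, width, filters = 2, 4, 4, 8
feature_map = np.zeros((batch, height, width, filters), dtype=np.float32)
action_emb = np.ones((batch, filters), dtype=np.float32).reshape(
    batch, 1, 1, filters)
assert (feature_map + action_emb).shape == (batch, height, width, filters)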
def body(self, features):
  filters = self.hparams.hidden_size
  cur_frame = tf.to_float(features["inputs"])
  prev_frame = tf.to_float(features["inputs_prev"])
  action_embedding_size = 32
  action_space_size = 10
  kernel = (3, 3)
  # Gather all inputs.
  action = common_layers.embedding(tf.to_int64(features["action"]),
                                   action_space_size, action_embedding_size)
  action = tf.reshape(action, [-1, 1, 1, action_embedding_size])
  frames = tf.concat([cur_frame, prev_frame, action], axis=3)
  x = tf.layers.conv2d(frames, filters, kernel, activation=tf.nn.relu,
                       strides=(2, 2), padding="SAME")
  # Run a stack of convolutions.
  for _ in xrange(self.num_hidden_layers):
    y = tf.layers.conv2d(frames, filters, kernel, activation=tf.nn.relu,
                         strides=(1, 1), padding="SAME")
    x = common_layers.layer_norm(x + y)
  # Up-convolve.
  x = tf.layers.conv2d_transpose(frames, filters, kernel,
                                 activation=tf.nn.relu,
                                 strides=(2, 2), padding="SAME")
  # Output size is 3 * 256 for 3-channel color space.
  res = tf.layers.conv2d(x, 3 * 256, kernel, padding="SAME")
  height = tf.shape(res)[1]
  width = tf.shape(res)[2]
  res = tf.reshape(res, [-1, height, width, 3, 256])
  return res
def transformer_prepare_encoder(inputs, target_space, hparams):
  """Copied from tensor2tensor.models.transformer."""
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        tf.shape(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def transformer_prepare_encoder(inputs, target_space, hparams):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space,
      32,
      ishape_static[-1],
      name="target_space_embedding",
      use_eager_mode=hparams.use_eager_mode)
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def transformer_prepare_encoder(inputs, target_space, hparams):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        tf.shape(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  # random_uniform_mask = tf.expand_dims(
  #     tf.to_float(tf.to_int32(
  #         tf.random_uniform([tf.shape(encoder_input)[0],
  #                            tf.shape(encoder_input)[1]])
  #         < hparams.mask_noise_prob)), axis=2)
  # encoder_input = encoder_input * (1 - random_uniform_mask)
  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def embed_target_space(target_space_id, hidden_size):
  target_space_emb = common_layers.embedding(
      target_space_id, 32, hidden_size, name="target_space_embedding")
  return tf.reshape(target_space_emb, [1, 1, 1, -1])
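# A small NumPy sketch (shapes are illustrative assumptions) of what the
# [1, 1, 1, -1] reshape above buys: a single shared target-space embedding
# broadcasts over the batch and length axes when added to a 4-D
# [batch, length, 1, hidden_size] input tensor.
import numpy as np

hidden_size = 6
target_space_emb = np.arange(hidden_size, dtype=np.float32).reshape(1, 1, 1, -1)
inputs = np.zeros((3, 5, 1, hidden_size), dtype=np.float32)
assert (inputs + target_space_emb).shape == (3, 5, 1, hidden_size)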
def testEmbedding(self):
  x = np.random.randint(1, high=9, size=(3, 5))
  y = common_layers.embedding(x, 10, 16)
  self.evaluate(tf.global_variables_initializer())
  res = self.evaluate(y)
  self.assertEqual(res.shape, (3, 5, 16))
def testFlatten4D3D(self):
  x = np.random.randint(1, high=9, size=(3, 5, 2))
  y = common_layers.flatten4d3d(common_layers.embedding(x, 10, 7))
  self.evaluate(tf.global_variables_initializer())
  res = self.evaluate(y)
  self.assertEqual(res.shape, (3, 5 * 2, 7))
def cycle_vae_gan_internal(inputs, targets, _, hparams):
  """Cycle GAN, main step used for training."""
  with tf.variable_scope("cycle_vae_gan"):
    # Embed inputs and targets.
    inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)
    k = 2**hparams.num_compress_steps
    inputs_orig, targets_orig = common_layers.pad_to_same_length(
        inputs_orig, targets_orig, final_length_divisible_by=k)
    inputs = common_layers.embedding(
        inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed")
    targets = common_layers.embedding(
        targets_orig, hparams.vocab_size, hparams.hidden_size,
        "embed", reuse=True)

    # Split the batch into input-input and target-target parts.
    inputs1, _ = split_on_batch(inputs)
    _, targets2 = split_on_batch(targets)

    # Input-input part.
    inp1_back, kl_loss1, inp1_mu, inp1_log_sigma = transformer_vae.vae_compress(
        inputs1, None, hparams, "inp2hyp", "hyp2inp")
    inp1_hyp = tf.concat([inp1_mu, inp1_log_sigma], axis=3)

    # Target-target part.
    tgt2_back, kl_loss2, tgt2_mu, tgt2_log_sigma = transformer_vae.vae_compress(
        targets2, None, hparams, "tgt2hyp", "hyp2tgt")
    tgt2_hyp = tf.concat([tgt2_mu, tgt2_log_sigma], axis=3)

    # Reconstruction losses.
    inp1_orig, _ = split_on_batch(inputs_orig)
    _, tgt2_orig = split_on_batch(targets_orig)
    inp1_loss = reconstruct_loss(
        inp1_back, tf.squeeze(inp1_orig, axis=3), hparams)
    tgt2_loss = reconstruct_loss(
        tgt2_back, tf.squeeze(tgt2_orig, axis=3), hparams, reuse=True)

    # Discriminator loss.
    dloss = discriminate_loss(inp1_hyp, tgt2_hyp, False, hparams, "dloss")

    # Reconstruct targets from inputs.
    tgt, _, _, _ = transformer_vae.vae_compress(
        inputs, None, hparams, "inp2hyp", "hyp2tgt", reuse=True)
    tgt = tf.layers.dense(tgt, hparams.vocab_size, name="softmax", reuse=True)
    # We use the reconstruction only for tracking progress, no gradients here!
    tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2))

    kl_rev_decay = common_layers.inverse_exp_decay(hparams.kl_warmup_steps)
    losses = {
        "input_input": hparams.cycle_loss_multiplier * inp1_loss,
        "target_target": hparams.cycle_loss_multiplier * tgt2_loss,
        "input_kl": kl_loss1 * kl_rev_decay * 15.0,
        "target_kl": kl_loss2 * kl_rev_decay * 15.0,
        "discriminator": dloss
    }
    return tgt, losses
def cycle_gan_internal(inputs, targets, _, hparams):
  """Cycle GAN, main step used for training."""
  with tf.variable_scope("cycle_gan"):
    # Embed inputs and targets.
    inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)  # [batch, length, 1, 1]
    inputs = common_layers.embedding(
        inputs_orig, hparams.vocab_size, hparams.hidden_size,
        "embed")  # [batch, length, 1, hidden_size]
    targets = common_layers.embedding(
        targets_orig, hparams.vocab_size, hparams.hidden_size,
        "embed", reuse=True)

    x, _ = split_on_batch(inputs)
    _, y = split_on_batch(targets)

    whether_compress = True

    # Y --> X
    y_fake = generator(y, hparams, "Fy", reuse=False)  # [batch, length, 1, hidden_size]
    # Compare the translated y against x; the upstream version compared it
    # against y itself, which looks wrong.
    y_to_x_loss = lossfn(x, y_fake, whether_compress, hparams, True, "YtoX")

    # X --> Y
    x_fake = generator(x, hparams, "Gx", reuse=False)
    x_to_y_loss = lossfn(y, x_fake, whether_compress, hparams, True, "XtoY")

    # Cycle-Consistency
    y_fake_ = generator(y_fake, hparams, "Gx", reuse=True)
    x_fake_ = generator(x_fake, hparams, "Fy", reuse=True)
    x_to_x_loss = hparams.cycle_loss_multiplier1 * tf.reduce_mean(
        tf.abs(x_fake_ - x))
    y_to_y_loss = hparams.cycle_loss_multiplier2 * tf.reduce_mean(
        tf.abs(y_fake_ - y))
    cycloss = x_to_x_loss + y_to_y_loss

    sample_generated = generator(inputs, hparams, "Gx", reuse=True)
    sample_generated = tf.layers.dense(
        sample_generated, hparams.vocab_size, name="softmax",
        reuse=None)  # [batch, length, 1, vocab_size]
    sample_generated = tf.stop_gradient(
        tf.expand_dims(sample_generated, axis=2))

    # Note: cycloss and y_to_x_loss occasionally turn NaN during training.
    losses = {"cycloss": cycloss,
              "y_to_x_loss": y_to_x_loss,
              "x_to_y_loss": x_to_y_loss}

    return sample_generated, losses  # [batch, length, 1, 1, vocab_size], losses
def body(self, features):
  hparams = self._hparams
  ps_devices = self._ps_devices
  single_device = (len(ps_devices) == 1)
  assert hparams.num_model_shards % len(ps_devices) == 0
  shards_per_device = hparams.num_model_shards // len(ps_devices)
  model_devices = [ps_devices[i // shards_per_device]
                   for i in range(hparams.num_model_shards)]
  print("model_devices = %s" % model_devices)
  mp = expert_utils.Parallelism(model_devices, reuse=False)
  targets_vocab_size = self._problem_hparams.vocabulary["targets"].vocab_size
  # squeeze out channels, heights
  targets = tf.squeeze(features["targets_raw"], [2, 3])
  targets_embedding_var = mp(
      tf.get_variable, "embedding",
      [[targets_vocab_size, hparams.hidden_size]] * mp.n,
      initializer=tf.random_normal_initializer(
          0.0, hparams.hidden_size**-0.5))
  shifted_targets = common_layers.shift_right_2d(targets)
  # Bypass the symbol modality and use a different embedding on each shard.
  if single_device:
    targets_embedding_var_combined = tf.concat(targets_embedding_var, 1)
    decoder_input_combined = common_layers.embedding(
        shifted_targets, targets_vocab_size,
        hparams.hidden_size * mp.n,
        multiplier=hparams.hidden_size**0.5,
        embedding_var=targets_embedding_var_combined,
    )
    decoder_input = tf.split(decoder_input_combined, mp.n, axis=2)
  else:
    targets_embedding_var_combined = None
    decoder_input = mp(
        common_layers.embedding, shifted_targets, targets_vocab_size,
        hparams.hidden_size,
        multiplier=hparams.hidden_size**0.5,
        embedding_var=targets_embedding_var,
    )
  decoder_self_attention_bias = mp(
      common_attention.attention_bias_lower_triangle,
      tf.shape(targets)[1])
  if "targets_segmentation" in features:
    # "Packed" dataset - keep the examples from seeing each other.
    targets_segmentation = features["targets_segmentation"]
    targets_position = features["targets_position"]
    decoder_self_attention_bias = mp(
        tf.add, decoder_self_attention_bias,
        mp(common_attention.attention_bias_same_segment,
           targets_segmentation, targets_segmentation))
    decoder_input = mp(
        common_attention.add_timing_signal_1d_given_position,
        decoder_input, targets_position)
  else:
    targets_position = None
    decoder_self_attention_bias = mp(
        common_attention.attention_bias_lower_triangle,
        tf.shape(targets)[1])
    decoder_input = mp(common_attention.add_timing_signal_1d, decoder_input)

  if self.has_input:
    inputs = tf.squeeze(features["inputs_raw"], [2, 3])
    inputs_vocab_size = self._problem_hparams.vocabulary["inputs"].vocab_size
    # share everything for now
    share_inputs_and_targets_embedding = True
    if share_inputs_and_targets_embedding:
      assert inputs_vocab_size == targets_vocab_size
      inputs_embedding_var = targets_embedding_var
      inputs_embedding_var_combined = targets_embedding_var_combined
    if single_device:
      encoder_input_combined = common_layers.embedding(
          inputs, inputs_vocab_size,
          hparams.hidden_size * mp.n,
          multiplier=hparams.hidden_size**0.5,
          embedding_var=inputs_embedding_var_combined,
      )
      encoder_input = tf.split(encoder_input_combined, mp.n, axis=2)
    else:
      encoder_input = mp(
          common_layers.embedding, inputs, inputs_vocab_size,
          hparams.hidden_size,
          multiplier=hparams.hidden_size**0.5,
          embedding_var=inputs_embedding_var,
      )
    if "inputs_segmentation" in features:
      # "Packed" dataset - keep the examples from seeing each other.
      inputs_segmentation = features["inputs_segmentation"]
      inputs_position = features["inputs_position"]
      encoder_self_attention_bias = mp(
          common_attention.attention_bias_same_segment,
          inputs_segmentation, inputs_segmentation)
      encoder_decoder_attention_bias = mp(
          common_attention.attention_bias_same_segment,
          targets_segmentation, inputs_segmentation)
      encoder_input = mp(
          common_attention.add_timing_signal_1d_given_position,
          encoder_input, inputs_position)
    else:
      encoder_padding = tf.to_float(tf.equal(inputs, 0))
      ignore_padding = common_attention.attention_bias_ignore_padding(
          encoder_padding)
      encoder_self_attention_bias = ignore_padding
      encoder_decoder_attention_bias = ignore_padding
      inputs_position = None
      encoder_input = mp(common_attention.add_timing_signal_1d, encoder_input)

    # encoder stack here
    with tf.variable_scope("encoder"):
      encoder_input = mp(
          tf.nn.dropout, encoder_input,
          1.0 - hparams.layer_prepostprocess_dropout)
      encoder_output = _layer_stack(
          mp,
          encoder_input,
          encoder_self_attention_bias,
          hparams.encoder_layers,
          hparams)
  else:
    encoder_decoder_attention_bias = None
    encoder_output = None

  with tf.variable_scope("decoder"):
    decoder_input = mp(
        tf.nn.dropout, decoder_input,
        1.0 - hparams.layer_prepostprocess_dropout)
    decoder_output = _layer_stack(
        mp,
        decoder_input,
        decoder_self_attention_bias,
        layers=hparams.decoder_layers,
        hparams=hparams,
        encoder_output=encoder_output,
        encoder_decoder_attention_bias=encoder_decoder_attention_bias)

  # Bypass the symbol modality and compute logits directly.
  # We compute a different set of logits on each shard, and sum them.
  # Share the weights with the target embedding.
  output_var = targets_embedding_var
  output_var_combined = targets_embedding_var_combined
  if single_device:
    decoder_output = tf.concat(decoder_output, 2)
    logits = tf.tensordot(decoder_output, output_var_combined, [[2], [1]])
    num, denom = common_layers.padded_cross_entropy(
        logits, targets, hparams.label_smoothing)
    training_loss = num / denom
  else:
    logits = mp(
        tf.tensordot, decoder_output, output_var, [[[2], [1]]] * mp.n)
    logits = expert_utils.all_reduce_ring(logits, mp)
    # On each device, we compute the loss for a part of the batch.
    # This is faster than computing the whole loss on one shard.
    mp, logits = expert_utils.reduce_by_device(mp, logits, lambda l: l[0])

    def _loss_for_shard(logits, targets, shard):
      logits = common_layers.approximate_split(logits, mp.n, 0)[shard]
      targets = common_layers.approximate_split(targets, mp.n, 0)[shard]
      return common_layers.padded_cross_entropy(
          logits, targets, hparams.label_smoothing)

    num, denom = mp(_loss_for_shard, logits, targets, range(mp.n))
    training_loss = tf.add_n(num) / tf.add_n(denom)
    logits = logits[0]
  logits = tf.expand_dims(tf.expand_dims(logits, 2), 3)
  # override training loss so that it is not computed externally.
  losses = {"training": training_loss}
  return logits, losses
def testFlatten4D3D(self):
  x = np.random.random_integers(1, high=8, size=(3, 5, 2))
  y = common_layers.flatten4d3d(common_layers.embedding(x, 10, 7))
  self.evaluate(tf.global_variables_initializer())
  res = self.evaluate(y)
  self.assertEqual(res.shape, (3, 5 * 2, 7))
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset. Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    if (hasattr(hparams, "unidirectional_encoder") and
        hparams.unidirectional_encoder):
      tf.logging.info("Using unidirectional encoder")
      encoder_self_attention_bias = (
          common_attention.attention_bias_lower_triangle(
              common_layers.shape_list(inputs)[1]))
    else:
      encoder_self_attention_bias = (
          common_attention.attention_bias_same_segment(
              inputs_segmentation, inputs_segmentation))
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(targets_segmentation,
                                                      inputs_segmentation))
  else:
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    if (hasattr(hparams, "unidirectional_encoder") and
        hparams.unidirectional_encoder):
      tf.logging.info("Using unidirectional encoder")
      encoder_self_attention_bias = (
          common_attention.attention_bias_lower_triangle(
              common_layers.shape_list(inputs)[1]))
    else:
      # Usual case - not a packed dataset.
      encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  if hparams.get("use_target_space_embedding", True):
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(
        target_space,
        32,
        ishape_static[-1],
        name="target_space_embedding",
        dtype=tf.bfloat16 if hparams.activation_dtype == "bfloat16"
        else tf.float32)
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    encoder_input += emb_target_space
  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  elif hparams.pos == "emb":
    encoder_input = common_attention.add_positional_embedding(
        encoder_input, hparams.max_length, "inputs_positional_embedding",
        inputs_position)
  if hparams.activation_dtype == "bfloat16":
    encoder_self_attention_bias = tf.cast(encoder_self_attention_bias,
                                          tf.bfloat16)
    encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias,
                                             tf.bfloat16)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
      sg: inputs here have been flattened to 3-d,
      [batch, height, width, embed_size] -> [batch, height*width, embed_size]
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset. Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    encoder_self_attention_bias = common_attention.attention_bias_same_segment(
        inputs_segmentation, inputs_segmentation)
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(
            targets_segmentation, inputs_segmentation))
  else:
    # Usual case - not a packed dataset.
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    # sg: [batch_size, sentence_len]
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    # sg: [batch_size, 1, 1, sentence_len]
    # A bias tensor to be added to attention logits: padded positions get
    # -1e9, non-padded positions get 0.
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space,
      # sg: 32 is the vocab size here; at present t2t only defines SpaceIDs
      # 1..32 in problem.py, so this may not match the docstring exactly.
      32,
      ishape_static[-1],  # sg: embedding dimension
      name="target_space_embedding",
      dtype=tf.bfloat16 if hparams.activation_dtype == "bfloat16"
      else tf.float32)
  # sg: [1, 128], a dense vector representing the SpaceID.
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])  # sg: [1, 1, 128]
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  if hparams.activation_dtype == "bfloat16":
    encoder_self_attention_bias = tf.cast(encoder_self_attention_bias,
                                          tf.bfloat16)
    encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias,
                                             tf.bfloat16)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def transformer_prepare_encoder(inputs, target_space, hparams, features=None,
                                type_ids=None, num_types=None,
                                reuse_target_embedding=tf.AUTO_REUSE):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.
    type_ids: optional, an int64 Tensor of shape [batch, length] that allows
      for adding type embeddings, similar to positional embeddings.
    num_types: optional, an int that decides the number of types in type_ids.
    reuse_target_embedding: option to reuse variable name in the case that
      symbol modalities are reused between inputs/targets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset. Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    if (hasattr(hparams, "unidirectional_encoder") and
        hparams.unidirectional_encoder):
      tf.logging.info("Using unidirectional encoder")
      encoder_self_attention_bias = (
          common_attention.attention_bias_lower_triangle(
              common_layers.shape_list(inputs)[1]))
    else:
      encoder_self_attention_bias = (
          common_attention.attention_bias_same_segment(
              inputs_segmentation, inputs_segmentation))
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(
            targets_segmentation, inputs_segmentation))
  else:
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    if (hasattr(hparams, "unidirectional_encoder") and
        hparams.unidirectional_encoder):
      tf.logging.info("Using unidirectional encoder")
      encoder_self_attention_bias = (
          common_attention.attention_bias_lower_triangle(
              common_layers.shape_list(inputs)[1]))
    else:
      # Usual case - not a packed dataset.
      encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  if target_space is not None and hparams.get("use_target_space_embedding",
                                              True):
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(
        target_space,
        32,
        ishape_static[-1],
        name="target_space_embedding",
        dtype=hparams.get("activation_dtype", "float32"),
        reuse=reuse_target_embedding)
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    encoder_input += emb_target_space
  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  elif hparams.pos == "timing_from_features":
    encoder_input = common_attention.add_timing_signals_from_features(
        encoder_input, features, hparams.position_features)
  elif hparams.pos == "emb":
    encoder_input = common_attention.add_positional_embedding(
        encoder_input, hparams.max_length, "inputs_positional_embedding",
        inputs_position)

  # Add type embeddings
  if type_ids is not None:
    if not num_types:
      raise ValueError("Need to set num_types as well.")
    encoder_input = common_attention.add_positional_embedding(
        encoder_input, num_types, "inputs_type_embedding", type_ids)

  encoder_self_attention_bias = common_layers.cast_like(
      encoder_self_attention_bias, encoder_input)
  encoder_decoder_attention_bias = common_layers.cast_like(
      encoder_decoder_attention_bias, encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def _embedding(self, x, reuse=None):
  with tf.variable_scope(self.name):
    return common_layers.embedding(
        x, self.vocab_size, self.dense_size,
        reuse=reuse, multiplier=self.multiplier)
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset. Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    encoder_self_attention_bias = common_attention.attention_bias_same_segment(
        inputs_segmentation, inputs_segmentation)
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(
            targets_segmentation, inputs_segmentation))
  else:
    # Usual case - not a packed dataset.
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  # if hparams.pos == "timing":
  #   if inputs_position is not None:
  #     encoder_input = common_attention.add_timing_signal_1d_given_position(
  #         encoder_input, inputs_position)
  #   else:
  #     encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  raw_encoder_input = tf.squeeze(features['inputs_raw'], axis=[-2, -1])
  pos_signals = generate_positional_signals(raw_encoder_input, hparams)
  pos_embeddings = generate_positional_embeddings(pos_signals,
                                                  hparams.encoder_pos, hparams)
  if "sum" in hparams.encoder_pos_integration:
    encoder_input = encoder_input + pos_embeddings
  elif "ffn" in hparams.encoder_pos_integration:
    with tf.variable_scope("encoder_pos_ffn"):
      encoder_input = tf.concat([encoder_input, pos_embeddings], axis=2)
      encoder_input = transformer_ffn_layer(encoder_input, hparams,
                                            conv_padding="SAME")
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def encode_lex(self, encoder_input, target_space, hparams):
  """Encoder with an extra self-attention pass over the lexicon dimension.

  Args:
    encoder_input: [batch_size, input_len, lex_cap, hidden_dim]

  Returns:
    encoder_output: [batch_size, input_len, hidden_dim]
    encoder_decoder_attention_bias: [batch_size, input_len]
  """
  encoder_output_slices = []
  for i in range(encoder_input.get_shape()[2].value):
    encoder_input_slice = encoder_input[:, :, i, :]

    # bias
    encoder_padding = common_attention.embedding_to_padding(
        encoder_input_slice)
    print(encoder_padding.shape.as_list())  # ==> [None, None]
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    print(ignore_padding.shape.as_list())  # ==> [None, 1, 1, None]

    # add target space to encoder input
    ishape_static = encoder_input_slice.shape.as_list()
    print(ishape_static)  # ==> [None, None, 300]
    emb_target_space = common_layers.embedding(
        target_space, 32, ishape_static[-1], name="target_space_embedding")
    print(emb_target_space.shape.as_list())  # ==> [300]
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    print(emb_target_space.shape.as_list())  # ==> [1, 1, 300]
    encoder_input_slice += emb_target_space
    print(encoder_input_slice.shape.as_list())  # ==> [None, None, 300]

    # add timing signals to encoder input
    if hparams.pos == "timing":
      encoder_input_slice = common_attention.add_timing_signal_1d(
          encoder_input_slice)

    # dropout
    encoder_input_slice = tf.nn.dropout(
        encoder_input_slice, 1.0 - hparams.layer_prepostprocess_dropout)

    # encoder
    # multihead_attention(
    #     query_antecedent: [batch, length_q, channels],   -- x, x
    #     memory_antecedent: [batch, length_m, channels],  -- None, encoder_output
    #     bias: bias tensor,       -- encoder_self_attention_bias
    #     total_key_depth: int,    -- hparams.attention_key_channels or hidden_size
    #     total_value_depth: int,  -- hparams.attention_value_channels or hidden_size
    #     output_depth: int,       -- hparams.hidden_size
    #     num_heads: int dividing total_key_depth and total_value_depth,
    #     dropout_rate: float,     -- hparams.attention_dropout
    #     ...
    #     cache=None: dict containing tensors which are the results of previous
    #         attentions, used for fast decoding: {'k': [batch_size, 0,
    #         key_channels], 'v': [batch_size, 0, value_channels]}, used in
    #         decoder self-attention)
    x = encoder_input_slice
    with tf.variable_scope("encoder" + str(i)):
      # remove pad
      pad_remover = None
      if hparams.use_pad_remover:
        pad_remover = expert_utils.PadRemover(
            common_attention.attention_bias_to_padding(
                encoder_self_attention_bias))

      # self-attention along the sentence dimension
      for layer in xrange(hparams.num_encoder_layers or
                          hparams.num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer):
          with tf.variable_scope("self_attention"):
            query_antecedent = common_layers.layer_preprocess(x, hparams)
            y = common_attention.multihead_attention(
                query_antecedent=query_antecedent,
                memory_antecedent=None,
                bias=encoder_self_attention_bias,
                total_key_depth=hparams.attention_key_channels or
                hparams.hidden_size,
                total_value_depth=hparams.attention_value_channels or
                hparams.hidden_size,
                output_depth=hparams.hidden_size,
                num_heads=hparams.num_heads,
                dropout_rate=hparams.attention_dropout,
                attention_type=hparams.self_attention_type,
                max_relative_position=hparams.max_relative_position)
            x = common_layers.layer_postprocess(x, y, hparams)
          with tf.variable_scope("ffn"):
            y = transformer.transformer_ffn_layer(
                common_layers.layer_preprocess(x, hparams), hparams,
                pad_remover)
            x = common_layers.layer_postprocess(x, y, hparams)

      encoder_output_slice = common_layers.layer_preprocess(x, hparams)
      print(encoder_output_slice.shape.as_list())  # ==> [None, None, 300]

    encoder_output_slices.append(encoder_output_slice)
  encoder_output = tf.stack(encoder_output_slices, 2)
  print(encoder_output.shape.as_list())  # ==> [None, None, 4, 300]

  # --------

  encoder_output_slices = []
  # hparams2 = copy.deepcopy(hparams)
  # hparams2.hidden_size = hparams.lex_cap
  num_heads = int(hparams.lex_cap / 2)
  hparams2 = tf.contrib.training.HParams(
      layer_preprocess_sequence=hparams.layer_preprocess_sequence,
      layer_postprocess_sequence=hparams.layer_postprocess_sequence,
      layer_prepostprocess_dropout=hparams.layer_prepostprocess_dropout,
      norm_type=hparams.norm_type,
      hidden_size=hparams.lex_cap,
      norm_epsilon=hparams.norm_epsilon,
      ffn_layer=hparams.ffn_layer,
      filter_size=hparams.filter_size,
      relu_dropout=hparams.relu_dropout,
      num_heads=num_heads,
      attention_dropout=hparams.attention_dropout,
      parameter_attention_key_channels=hparams
      .parameter_attention_key_channels,
      parameter_attention_value_channels=hparams
      .parameter_attention_value_channels)

  for i in range(encoder_output.get_shape()[3].value):
    encoder_input_slice = encoder_output[:, :, :, i]
    # print(encoder_input_slice.shape.as_list())  # ==> [None, None, 4]

    encoder_padding = common_attention.embedding_to_padding(
        encoder_input_slice)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    # print(encoder_self_attention_bias.shape.as_list())  # ==> [None, 1, 1, None]

    # encoder (see the multihead_attention signature notes above)
    x = encoder_input_slice
    with tf.variable_scope("encoder_extra" + str(i)):
      # remove pad
      pad_remover = None
      if hparams.use_pad_remover:
        pad_remover = expert_utils.PadRemover(
            common_attention.attention_bias_to_padding(
                encoder_self_attention_bias))

      # self-attention along the lexicon dimension
      with tf.variable_scope("layer_extra"):
        with tf.variable_scope("self_attention"):
          # query_antecedent = layer_preprocess2(x, hparams, hparams.lex_cap)
          query_antecedent = common_layers.layer_preprocess(x, hparams2)
          y = common_attention.multihead_attention(
              query_antecedent=query_antecedent,
              memory_antecedent=None,
              bias=encoder_self_attention_bias,
              total_key_depth=hparams.attention_key_channels or
              hparams.lex_cap,
              total_value_depth=hparams.attention_value_channels or
              hparams.lex_cap,
              output_depth=hparams.lex_cap,
              num_heads=num_heads,
              dropout_rate=hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position)
          # x = layer_postprocess2(x, y, hparams, hparams.lex_cap)
          x = common_layers.layer_postprocess(x, y, hparams2)
        with tf.variable_scope("ffn"):
          y = transformer.transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams2), hparams2,
              pad_remover)
          # x = layer_postprocess2(x, y, hparams, hparams.lex_cap)
          x = common_layers.layer_postprocess(x, y, hparams2)

      # encoder_output_slice = layer_preprocess2(x, hparams, hparams.lex_cap)
      encoder_output_slice = common_layers.layer_preprocess(x, hparams2)
      # print(encoder_output_slice.shape.as_list())  # ==> [None, None, 4]

    encoder_output_slices.append(encoder_output_slice)
  encoder_output = tf.stack(encoder_output_slices, 3)
  print(encoder_output.shape.as_list())  # ==> [None, None, 4, 300]

  # --------

  lex_cap = encoder_output.get_shape()[2].value
  embed_len = encoder_output.get_shape()[3].value
  assert lex_cap == hparams.lex_cap
  aggregate_layer = tf.get_variable(
      name="Aggregate",
      shape=[embed_len, embed_len, lex_cap],
      initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
  encoder_output = tf.tensordot(
      encoder_output, aggregate_layer, axes=[[2, 3], [1, 2]])
  print(encoder_output.shape.as_list())  # ==> [None, None, 300]

  return encoder_output, encoder_decoder_attention_bias
def testEmbedding(self):
  x = np.random.random_integers(1, high=8, size=(3, 5))
  y = common_layers.embedding(x, 10, 16)
  self.evaluate(tf.global_variables_initializer())
  res = self.evaluate(y)
  self.assertEqual(res.shape, (3, 5, 16))
def embed_target_space(target_space_id, model_d):
  target_space_emb = common_layers.embedding(
      target_space_id, 32, model_d, name="target_space_embedding")
  return tf.reshape(target_space_emb, [1, 1, 1, -1])