def body(self, features):
  """Runs the base Transformer body, then expands every decoder position
  into `block_size` parallel hidden vectors with one dense-relu-dense layer.

  Args:
    features: feature dict consumed by the parent Transformer body; must not
      contain "targets_segmentation" (packed examples are unsupported).

  Returns:
    Tensor of shape [batch, length, block_size, hidden_size].
  """
  hp = self._hparams
  # Block-parallel prediction needs a positive block size and is
  # incompatible with XLA compilation and packed datasets.
  assert hp.block_size > 0
  assert not common_layers.is_xla_compiled()
  assert "targets_segmentation" not in features

  decoded = super(TransformerBlockParallel, self).body(features)
  assert not isinstance(decoded, tuple)
  assert len(decoded.shape) == 4

  broadcast_dims = common_layers.comma_separated_string_to_integer_list(
      getattr(hp, "relu_dropout_broadcast_dims", ""))

  with tf.variable_scope("block_size_%d" % hp.block_size):
    # One feed-forward layer emits all block_size hidden vectors at once.
    projected = common_layers.dense_relu_dense(
        decoded,
        hp.block_size * hp.filter_size,
        hp.block_size * hp.hidden_size,
        dropout=hp.relu_dropout,
        dropout_broadcast_dims=broadcast_dims)

  batch_size, length = common_layers.shape_list(decoded)[:2]
  projected = tf.reshape(
      projected, [batch_size, length, hp.block_size, hp.hidden_size])
  return common_layers.layer_postprocess(decoded, projected, hp)
def body(self, features):
  """Image-transformer body with block-parallel output heads.

  Encodes `features["inputs"]`, decodes `features["targets"]`, then projects
  each decoder position into `block_size` parallel hidden vectors.

  Args:
    features: dict with "inputs" and "targets" image tensors.

  Returns:
    Tensor of shape [batch, rows, cols, block_size, hidden_size].
  """
  # Block-parallel prediction needs a positive block size and is not
  # supported under XLA compilation.
  assert self._hparams.block_size > 0
  assert not common_layers.is_xla_compiled()
  hparams = copy.copy(self._hparams)
  targets = features["targets"]
  inputs = features["inputs"]
  # Only emit image summaries on the first (non-reused) training graph.
  if not (tf.get_variable_scope().reuse or
          hparams.mode == tf.estimator.ModeKeys.PREDICT):
    tf.summary.image("inputs", inputs, max_outputs=1)
    tf.summary.image("targets", targets, max_outputs=1)
  encoder_input = cia.prepare_encoder(inputs, hparams)
  encoder_output = cia.transformer_encoder_layers(
      encoder_input,
      hparams.num_encoder_layers,
      hparams,
      attention_type=hparams.enc_attention_type,
      name="encoder")
  # rows/cols returned here are shadowed below by the decoder output shape.
  decoder_input, rows, cols = cia.prepare_decoder(targets, hparams)
  decoder_output = cia.transformer_decoder_layers(
      decoder_input,
      encoder_output,
      hparams.num_decoder_layers,
      hparams,
      attention_type=hparams.dec_attention_type,
      name="decoder")
  assert not isinstance(decoder_output, tuple)
  assert len(decoder_output.shape) == 4
  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(self._hparams, "relu_dropout_broadcast_dims", "")))
  with tf.variable_scope("block_size_%d" % self._hparams.block_size):
    tf.logging.info("Using block_size %d", self._hparams.block_size)
    # One feed-forward layer emits all block_size hidden vectors at once.
    block_output = common_layers.dense_relu_dense(
        decoder_output,
        self._hparams.block_size * self._hparams.filter_size,
        self._hparams.block_size * self._hparams.hidden_size,
        dropout=self._hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)
  batch_size, rows, cols = common_layers.shape_list(decoder_output)[:3]
  # Insert a singleton block axis so layer_postprocess can combine the
  # residual with the block outputs (presumably via broadcasting — the
  # block axis is 1 vs block_size; TODO confirm layer_postprocess semantics).
  decoder_output = tf.reshape(
      decoder_output, [batch_size, rows, cols, 1, self._hparams.hidden_size])
  block_output = tf.reshape(block_output, [
      batch_size, rows, cols, self._hparams.block_size,
      self._hparams.hidden_size
  ])
  block_output = common_layers.layer_postprocess(decoder_output, block_output,
                                                 self._hparams)
  return block_output
def decode(self,
           decoder_input,
           encoder_output,
           encoder_decoder_attention_biases,
           decoder_self_attention_biases,
           hparams,
           cache=None,
           decode_loop_step=None,
           nonpadding=None,
           losses=None):
  """Decode Transformer outputs from encoder representation.

  Args:
    decoder_input: inputs to bottom of the model.
        [batch_size, decoder_length, hidden_dim]
    encoder_output: Encoder representation.
        [batch_size, input_length, hidden_dim]
    encoder_decoder_attention_biases: Bias and mask weights for
        encoder-decoder attention. [batch_size, input_length]
    decoder_self_attention_biases: Bias and mask weights for decoder
        self-attention. [batch_size, decoder_length]
    hparams: hyperparameters for model.
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop. Only
        used for inference on TPU.
    nonpadding: optional Tensor with shape [batch_size, decoder_length]
    losses: optional list onto which to append extra training losses

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  dropped = tf.nn.dropout(decoder_input,
                          1.0 - hparams.layer_prepostprocess_dropout)
  outputs = transformer_decoder(
      dropped,
      encoder_output,
      decoder_self_attention_biases,
      encoder_decoder_attention_biases,
      hparams,
      cache=cache,
      decode_loop_step=decode_loop_step,
      nonpadding=nonpadding,
      save_weights_to=self.attention_weights,
      losses=losses)

  xla_training = (common_layers.is_xla_compiled() and
                  hparams.mode == tf.estimator.ModeKeys.TRAIN)
  if xla_training:
    # TPU does not react kindly to extra dimensions, so return 3-D here.
    # TODO(noam): remove this once TPU is more forgiving of extra dims.
    return outputs
  # Expand since t2t expects 4d tensors elsewhere in the stack.
  return tf.expand_dims(outputs, axis=2)
def decode(self,
           decoder_input,
           encoder_output,
           encoder_decoder_attention_bias,
           decoder_self_attention_bias,
           hparams,
           cache=None,
           nonpadding=None,
           losses=None):
  """Decode inputs using _decoder().

  This performs the same way as transformer.Transformer.decode with the
  decoder portion replaced with _decoder().

  Args:
    decoder_input: Inputs to bottom of the model.
        [batch_size, decoder_length, hidden_dim]
    encoder_output: Encoder representation.
        [batch_size, input_length, hidden_dim]
    encoder_decoder_attention_bias: Bias and mask weights for
        encoder-decoder attention. [batch_size, input_length]
    decoder_self_attention_bias: Bias and mask weights for decoder
        self-attention. [batch_size, decoder_length]
    hparams: Hyperparmeters for model.
    cache: Dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    nonpadding: Optional Tensor with shape [batch_size, decoder_length]
    losses: Unused losses.

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  del losses  # Unused by this decoder variant.
  keep_rate = 1.0 - hparams.layer_prepostprocess_dropout
  decoded = self._decoder(
      tf.nn.dropout(decoder_input, keep_rate),
      encoder_output,
      decoder_self_attention_bias,
      encoder_decoder_attention_bias,
      hparams,
      cache=cache,
      nonpadding=nonpadding,
      save_weights_to=self.attention_weights)

  if (common_layers.is_xla_compiled() and
      hparams.mode == tf.estimator.ModeKeys.TRAIN):
    # TPU does not react kindly to extra dimensions; keep the 3-D tensor.
    return decoded
  # Expand since t2t expects 4d tensors.
  return tf.expand_dims(decoded, axis=2)
def transformer_encoder(features,
                        hparams,
                        embed_scope=None,
                        embed_token_fn=common_embed.embed_tokens,
                        attention_weights=None):
  """Encodes a screen using Transformer.

  Args:
    features: the feature dict.
    hparams: the hyperparameter.
    embed_scope: the scope for token embedding.
    embed_token_fn: the embed function.
    attention_weights: the attention_weights dict.
  Returns:
    encoder_outputs: a Tensor of shape
        [batch_size, num_steps, max_object_count, hidden_size]
    encoder_attn_bias: A tensor of shape
        [batch_size, num_steps, max_object_count]
  """
  tf.logging.info("Using Transformer screen encoder")
  # Remove the default positional encoding in Transformer; the input
  # preparation below supplies the embeddings, mask and bias itself.
  object_embed, object_mask, encoder_attn_bias = prepare_encoder_input(
      features=features,
      hparams=hparams,
      embed_scope=embed_scope,
      embed_token_fn=embed_token_fn)
  with tf.variable_scope("encode_screen", reuse=tf.AUTO_REUSE):
    dims = tf.shape(object_embed)
    flat_rows = dims[0] * dims[1]  # fold batch and step axes together
    with tf.control_dependencies(
        [tf.assert_equal(dims[3], hparams.hidden_size)]):
      flattened = tf.reshape(object_embed,
                             [flat_rows, dims[2], hparams.hidden_size])
    encoder_input = tf.nn.dropout(
        flattened, keep_prob=1.0 - hparams.layer_prepostprocess_dropout)
    # [flat_rows, objects] -> [flat_rows, 1, 1, objects] for attention.
    bias_2d = tf.reshape(encoder_attn_bias, [flat_rows, dims[2]])
    self_attention_bias = tf.expand_dims(
        tf.expand_dims(bias_2d, axis=1), axis=1)
    encoded = transformer.transformer_encoder(
        encoder_input=encoder_input,
        encoder_self_attention_bias=self_attention_bias,
        hparams=hparams,
        save_weights_to=attention_weights,
        make_image_summary=not common_layers.is_xla_compiled())
    # Restore the [batch, steps, objects, hidden] layout.
    encoded = tf.reshape(encoded, [dims[0], dims[1], dims[2], dims[3]])
  return encoded, object_mask, encoder_attn_bias
def estimator_spec_eval(self, features, logits, labels, loss, losses_dict):
  """Construct EstimatorSpec for EVAL mode."""
  del losses_dict
  hparams = self.hparams
  problem = hparams.problem
  if common_layers.is_xla_compiled():
    raise NotImplementedError("TPU usage is not supported")

  # Greedy token ids from logits; drop the two singleton axes t2t adds.
  decoded_ids = tf.contrib.framework.nest.map_structure(
      lambda x: tf.squeeze(tf.argmax(x, axis=-1), axis=[2, 3]), logits)

  # Problems may post-process raw ids into task-level predictions.
  if hasattr(problem, "compute_predictions"):
    predictions = problem.compute_predictions(
        decoded_ids, features, hparams, decode=False)
  else:
    predictions = decoded_ids

  problem_metrics = problem.eval_metrics()
  if isinstance(problem_metrics, list):
    # Standard t2t metric list: build fns and apply them to the logits.
    eval_metrics = metrics.create_evaluation_metrics([problem], hparams)
    for metric_name, metric_fn in eval_metrics.items():
      eval_metrics[metric_name] = metric_fn(logits, features,
                                            features["targets"])
  else:
    # Dict of custom metric fns returning either (scores, weights) tensors
    # or a ready-made (value, update_op) pair.
    eval_metrics = {}
    for metric_key, metric_fn in problem_metrics.items():
      metric_name = "metrics-%s/%s" % (problem.name, metric_key)
      first, second = metric_fn(predictions, labels, features)
      if isinstance(second, tf.Tensor):
        eval_metrics[metric_name] = tf.metrics.mean(first, second)
      else:
        eval_metrics[metric_name] = (first, second)

  return tf.estimator.EstimatorSpec(tf.estimator.ModeKeys.EVAL,
                                    eval_metric_ops=eval_metrics,
                                    loss=loss)
def estimator_spec_eval(self, features, logits, labels, loss, losses_dict):
  """Constructs `tf.estimator.EstimatorSpec` for EVAL (evaluation) mode."""
  spec = super(TransformerAE, self).estimator_spec_eval(
      features, logits, labels, loss, losses_dict)
  if common_layers.is_xla_compiled():
    # For TPUs (and XLA more broadly?), do not add summary hooks that depend
    # on losses; they are not supported.
    return spec

  # Collect loss-related summaries plus an explicit scalar for the loss.
  summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope="losses")
  summaries.extend(tf.get_collection(tf.GraphKeys.SUMMARIES, scope="loss"))
  summaries.append(tf.summary.scalar("loss", loss))

  saver_hook = tf.train.SummarySaverHook(
      save_steps=100,
      summary_op=summaries,
      output_dir=os.path.join(self.hparams.model_dir, "eval"))

  hooks = list(spec.evaluation_hooks)
  hooks.append(saver_hook)
  return spec._replace(evaluation_hooks=hooks)
def estimator_spec_eval(self, features, logits, labels, loss, losses_dict):
  """Constructs `tf.estimator.EstimatorSpec` for EVAL (evaluation) mode."""
  base_spec = super(TransformerAE, self).estimator_spec_eval(
      features, logits, labels, loss, losses_dict)
  # Summary hooks that depend on losses are unsupported under XLA/TPU,
  # so return the parent's spec untouched in that case.
  if common_layers.is_xla_compiled():
    return base_spec

  loss_summaries = (
      tf.get_collection(tf.GraphKeys.SUMMARIES, scope="losses") +
      tf.get_collection(tf.GraphKeys.SUMMARIES, scope="loss") +
      [tf.summary.scalar("loss", loss)])

  hook = tf.train.SummarySaverHook(
      save_steps=100,
      summary_op=loss_summaries,
      output_dir=os.path.join(self.hparams.model_dir, "eval"))

  all_hooks = list(base_spec.evaluation_hooks) + [hook]
  return base_spec._replace(evaluation_hooks=all_hooks)
def top(self, body_output, _):
  """Generate logits.

  Args:
    body_output: A Tensor with shape [batch, p0, p1, body_input_depth]
  Returns:
    logits: A Tensor with shape [batch, p0, p1, ?, vocab_size].
  """
  hp = self._model_hparams
  if hp.symbol_modality_skip_top:
    return tf.expand_dims(body_output, 3)

  # Share the softmax weights with the embedding when requested.
  if hp.shared_embedding_and_softmax_weights:
    scope_name, reuse = "shared", True
  else:
    scope_name, reuse = "softmax", False

  with tf.variable_scope(scope_name, reuse=reuse):
    out_shape = common_layers.shape_list(body_output)
    var = self._get_weights(out_shape[-1])
    if hp.factored_logits and hp.mode == tf.estimator.ModeKeys.TRAIN:
      # insert channels dimension and defer the matmul via FactoredTensor.
      return common_layers.FactoredTensor(
          tf.expand_dims(body_output, 3), var)

    flat = tf.reshape(body_output, [-1, out_shape[-1]])
    logits = tf.matmul(flat, var, transpose_b=True)
    if (common_layers.is_xla_compiled() and
        hp.mode == tf.estimator.ModeKeys.TRAIN):
      # TPU does not react kindly to extra dimensions.
      # TODO(noam): remove this once TPU is more forgiving of extra dims.
      return logits
    return tf.reshape(logits, out_shape[:-1] + [1, self._vocab_size])
def compute_gradients(self, loss, var_list=None, **kwargs):  # pylint: disable=arguments-differ
  """Computes gradients, casting each gradient to its variable's dtype.

  TPU/XLA and GPU paths cast differently; see the per-path notes below.
  """
  raw_grads_and_vars = self._opt.compute_gradients(loss, var_list, **kwargs)

  def cast_grad_tpu(g, v):
    """Should match upstream t2t
    https://github.com/tensorflow/tensor2tensor/blob/1547c25571633f828ddd74accba76d07d8d043af/tensor2tensor/utils/optimize.py#L232
    """
    if g is not None and v is not None:
      g = common_layers.cast_like(g, v)
    if g is None and self._zero_grads:
      g = tf.zeros_like(v)
    return (g, v)

  def cast_grad_gpu(g, v):
    """Old-style cast kept for GPU/CPU speed.

    August 7 2018: We still need this code block instead — refer to
    https://github.com/tensorflow/tensor2tensor/issues/979. We need
    `use_resource=False` in model_fn in utils/t2t_model.py and the old
    version of cast_grad here. Without both changes, large word embeddings
    on the CPU become very slow.
    Sept 30 2019: We tried removing this since we are off word embeddings
    but the slowdown seems to still be around.
    """
    if v is None or g is None:
      return (g, v)
    if v.dtype.base_dtype == g.dtype.base_dtype:
      return (g, v)
    return (tf.cast(g, v.dtype), v)

  # Keep the tpu vs gpu cast split so that changes in
  # https://github.com/medicode/tensor2tensor/pull/130/files#diff-2b8e7a5e8b58c8e97ae722ba253dff43
  # preserve speed on gpus.
  caster = cast_grad_tpu if common_layers.is_xla_compiled() else cast_grad_gpu
  return [caster(g, v) for g, v in raw_grads_and_vars]
def encode_decode_task(features, hparams, train, attention_weights=None):
  """Model core graph for the one-shot action.

  Args:
    features: a dictionary contains "inputs" that is a tensor in shape of
        [batch_size, num_tokens], "verb_id_seq" that is in shape of
        [batch_size, num_actions], "object_spans" and "param_span" tensor
        in shape of [batch_size, num_actions, 2]. 0 is used as padding or
        non-existent values.
    hparams: the general hyperparameters for the model.
    train: the train mode (unused).
    attention_weights: the dict to keep attention weights for analysis.

  Returns:
    decoder_output: the decoder representation.
    decoder_nonpadding: float mask of non-padding decoder positions.
    areas: dict with "encodings", "starts", "ends" and "bias" for the
        span (area) representations of the task.
    scope: the embedding scope.

  Raises:
    ValueError: if the configured encoder, decoder or span representation
        is not supported.
  """
  del train
  input_embeddings, scope = common_embed.embed_tokens(
      features["task"], hparams.task_vocab_size, hparams.hidden_size, hparams)
  with tf.variable_scope("encode_decode", reuse=tf.AUTO_REUSE):
    # Zero out embeddings at padding (token id 0) positions.
    encoder_nonpadding = tf.minimum(tf.to_float(features["task"]), 1.0)
    input_embeddings = tf.multiply(
        tf.expand_dims(encoder_nonpadding, 2), input_embeddings)
    encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
        transformer.transformer_prepare_encoder(
            input_embeddings, None, hparams, features=None))
    encoder_input = tf.nn.dropout(
        encoder_input, keep_prob=1.0 - hparams.layer_prepostprocess_dropout)
    if hparams.instruction_encoder == "transformer":
      encoder_output = transformer.transformer_encoder(
          encoder_input,
          self_attention_bias,
          hparams,
          save_weights_to=attention_weights,
          make_image_summary=not common_layers.is_xla_compiled())
    else:
      raise ValueError("Unsupported instruction encoder %s" %
                       (hparams.instruction_encoder))
    span_rep = hparams.get("span_rep", "area")
    # Area sums are always computed once for the starts/ends bookkeeping;
    # for span_rep == "area" they are also the encodings themselves, so no
    # second compute_sum_image call is needed (the original recomputed it).
    area_encodings, area_starts, area_ends = area_utils.compute_sum_image(
        encoder_output, max_area_width=hparams.max_span)
    current_shape = tf.shape(area_encodings)
    if span_rep == "area":
      pass  # area_encodings already holds the area-sum representation.
    elif span_rep == "basic":
      area_encodings = area_utils.compute_alternative_span_rep(
          encoder_output,
          input_embeddings,
          max_area_width=hparams.max_span,
          hidden_size=hparams.hidden_size,
          advanced=False)
    elif span_rep == "coref":
      area_encodings = area_utils.compute_alternative_span_rep(
          encoder_output,
          input_embeddings,
          max_area_width=hparams.max_span,
          hidden_size=hparams.hidden_size,
          advanced=True)
    else:
      # Fixed: was a placeholder message ("xyz").
      raise ValueError("Unsupported span_rep %s" % span_rep)
    areas = {}
    areas["encodings"] = area_encodings
    areas["starts"] = area_starts
    areas["ends"] = area_ends
    # NOTE(review): tf.print here runs every step — looks like a debugging
    # leftover; kept to preserve behavior.
    with tf.control_dependencies([
        tf.print("encoder_output", tf.shape(encoder_output)),
        tf.assert_equal(current_shape, tf.shape(area_encodings), summarize=100)
    ]):
      paddings = tf.cast(tf.less(self_attention_bias, -1), tf.int32)
    padding_sum, _, _ = area_utils.compute_sum_image(
        tf.expand_dims(tf.squeeze(paddings, [1, 2]), 2),
        max_area_width=hparams.max_span)
    num_areas = common_layers.shape_list(area_encodings)[1]
    # An area is padding if it covers any padded token.
    area_paddings = tf.reshape(
        tf.minimum(tf.to_float(padding_sum), 1.0), [-1, num_areas])
    areas["bias"] = area_paddings
    decoder_nonpadding = tf.to_float(
        tf.greater(features["verb_refs"][:, :, 1],
                   features["verb_refs"][:, :, 0]))
    if hparams.instruction_encoder == "lstm":
      hparams_decoder = copy.copy(hparams)
      hparams_decoder.set_hparam("pos", "none")
    else:
      hparams_decoder = hparams
    decoder_input, decoder_self_attention_bias = _prepare_decoder_input(
        area_encodings,
        decoder_nonpadding,
        features,
        hparams_decoder,
        embed_scope=scope)
    decoder_input = tf.nn.dropout(
        decoder_input, keep_prob=1.0 - hparams.layer_prepostprocess_dropout)
    if hparams.instruction_decoder == "transformer":
      decoder_output = transformer.transformer_decoder(
          decoder_input=decoder_input,
          encoder_output=encoder_output,
          decoder_self_attention_bias=decoder_self_attention_bias,
          encoder_decoder_attention_bias=encoder_decoder_attention_bias,
          hparams=hparams_decoder)
    else:
      # Fixed: this branch tests the *decoder* but the original message
      # reported the encoder setting.
      raise ValueError("Unsupported instruction decoder %s" %
                       (hparams.instruction_decoder))
  return decoder_output, decoder_nonpadding, areas, scope
def perf_transformer_encode(encoder_function,
                            inputs,
                            target_space,
                            hparams,
                            baseline,
                            attention_weights=None,
                            features=None,
                            losses=None,
                            prepare_encoder_fn=None,
                            **kwargs):
  """Encoding for performance autoencoder, which mean-aggregates across time.

  Args:
    encoder_function: the encoder function
    inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim]
        which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    baseline: if True, does not mean-aggregate the encoder output.
    attention_weights: weight to store attention to.
    features: optionally pass the entire features dictionary as well. This
        is needed now for "packed" datasets.
    losses: optional list onto which to append extra training losses
    prepare_encoder_fn: optional, alternative to transformer_prepare_encoder.
    **kwargs: additional arguments to pass to encoder_function

  Returns:
    Tuple of:
        encoder_output: Encoder representation.
            [batch_size, input_length, hidden_dim]
        encoder_decoder_attention_bias: Bias and mask weights for
            encoder-decoder attention. [batch_size, input_length]
  """
  flat_inputs = common_layers.flatten4d3d(inputs)
  prepare_fn = prepare_encoder_fn or transformer_prepare_encoder
  encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
      prepare_fn(flat_inputs,
                 target_space,
                 hparams,
                 features=features,
                 reuse_target_embedding=tf.AUTO_REUSE))

  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
      value=hparams.layer_prepostprocess_dropout,
      hparams=hparams)
  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)

  # With a bidirectional encoder the function just uses the
  # encoder_self_attention_bias for padding.
  attn_bias_for_padding = (encoder_decoder_attention_bias
                           if hparams.unidirectional_encoder else None)

  encoder_output = encoder_function(
      encoder_input,
      self_attention_bias,
      hparams,
      name="encoder",
      nonpadding=features_to_nonpadding(features, "inputs"),
      save_weights_to=attention_weights,
      make_image_summary=not common_layers.is_xla_compiled(),
      losses=losses,
      attn_bias_for_padding=attn_bias_for_padding,
      **kwargs)

  if not baseline:
    # Collapse the time axis to a single mean-pooled vector (and bias).
    encoder_output = tf.math.reduce_mean(
        encoder_output, axis=1, keep_dims=True)
    encoder_decoder_attention_bias = tf.math.reduce_mean(
        encoder_decoder_attention_bias, axis=-1, keep_dims=True)

  return encoder_output, encoder_decoder_attention_bias
def mel_perf_transformer_encode(encoder_function,
                                perf_inputs,
                                mel_inputs,
                                target_space,
                                hparams,
                                attention_weights=None,
                                features=None,
                                losses=None,
                                prepare_encoder_fn=None,
                                **kwargs):
  """Encode transformer inputs. Used for melody & performance autoencoder.

  Performance is mean-aggregated across time and combined with melody in a
  variety of different ways.

  Args:
    encoder_function: the encoder function
    perf_inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim]
        which will be flattened along the two spatial dimensions.
    mel_inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim]
        which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    attention_weights: weight to store attention to.
    features: optionally pass the entire features dictionary as well. This
        is needed now for "packed" datasets.
    losses: optional list onto which to append extra training losses
    prepare_encoder_fn: optional, alternative to transformer_prepare_encoder.
    **kwargs: additional arguments to pass to encoder_function

  Returns:
    Tuple of:
        encoder_output: Encoder representation.
            [batch_size, input_length, hidden_dim]
        encoder_decoder_attention_bias: Bias and mask weights for
            encoder-decoder attention. [batch_size, input_length]

  Raises:
    NotImplementedError: if hparams.aggregation is not one of
        "sum", "concat" or "tile".
  """
  perf_inputs = common_layers.flatten4d3d(perf_inputs)
  mel_inputs = common_layers.flatten4d3d(mel_inputs)
  if not prepare_encoder_fn:
    prepare_encoder_fn = transformer_prepare_encoder
  perf_encoder_input, perf_self_attention_bias, perf_encdec_attention_bias = (
      prepare_encoder_fn(perf_inputs,
                         target_space,
                         hparams,
                         features=features,
                         reuse_target_embedding=tf.AUTO_REUSE))

  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
      value=hparams.layer_prepostprocess_dropout,
      hparams=hparams)
  perf_encoder_input = tf.nn.dropout(
      perf_encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

  perf_attn_bias_for_padding = None
  # Otherwise the encoder will just use encoder_self_attention_bias.
  if hparams.unidirectional_encoder:
    perf_attn_bias_for_padding = perf_encdec_attention_bias

  # do the same thing for melody
  mel_encoder_input, mel_self_attention_bias, mel_encdec_attention_bias = (
      prepare_encoder_fn(mel_inputs,
                         target_space,
                         hparams,
                         features=features,
                         reuse_target_embedding=tf.AUTO_REUSE))

  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
      value=hparams.layer_prepostprocess_dropout,
      hparams=hparams)
  mel_encoder_input = tf.nn.dropout(
      mel_encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

  mel_attn_bias_for_padding = None
  # Otherwise the encoder will just use encoder_self_attention_bias.
  if hparams.unidirectional_encoder:
    mel_attn_bias_for_padding = mel_encdec_attention_bias

  # use the proper encoder function for perf/melody
  perf_encoder_output = encoder_function(
      perf_encoder_input,
      perf_self_attention_bias,
      hparams,
      name="perf_encoder",
      nonpadding=features_to_nonpadding(features, "inputs"),
      save_weights_to=attention_weights,
      make_image_summary=not common_layers.is_xla_compiled(),
      losses=losses,
      attn_bias_for_padding=perf_attn_bias_for_padding,
      **kwargs)
  # same thing for melody
  mel_encoder_output = encoder_function(
      mel_encoder_input,
      mel_self_attention_bias,
      hparams,
      name="mel_encoder",
      nonpadding=features_to_nonpadding(features, "inputs"),
      save_weights_to=attention_weights,
      make_image_summary=not common_layers.is_xla_compiled(),
      losses=losses,
      attn_bias_for_padding=mel_attn_bias_for_padding,
      **kwargs)

  # concatenate the global mean vector/bias term with the full melody encoding
  perf_mean_vector = tf.math.reduce_mean(
      perf_encoder_output, axis=1, keep_dims=True)

  # different methods of aggregating over the performance + melody vectors!
  if hparams.aggregation == "sum":
    # add both mean performance and melody vectors together
    perf_mean_bias = tf.math.reduce_mean(
        perf_encdec_attention_bias, axis=-1, keep_dims=True)
    encoder_output = mel_encoder_output + perf_mean_vector
    encoder_decoder_attention_bias = mel_encdec_attention_bias + perf_mean_bias
  elif hparams.aggregation == "concat":
    # concatenate melody with mean-aggregated performance embedding.
    # Fixed: was hard-coded to 384, which only worked when
    # hparams.hidden_size == 384; the last dim must match the encodings.
    # NOTE(review): the leading batch dim of 1 still assumes batch_size 1 —
    # TODO confirm against callers.
    stop_token = tf.zeros((1, 1, hparams.hidden_size))
    encoder_output = tf.concat(
        [mel_encoder_output, stop_token, perf_mean_vector], axis=1)
    perf_mean_bias = tf.math.reduce_mean(
        perf_encdec_attention_bias, axis=-1, keep_dims=True)
    stop_bias = tf.zeros((1, 1, 1, 1))
    encoder_decoder_attention_bias = tf.concat(
        [mel_encdec_attention_bias, stop_bias, perf_mean_bias], axis=-1)
  elif hparams.aggregation == "tile":
    # tile performance embedding across each dimension of melody embedding!
    dynamic_val = tf.shape(mel_encoder_output)[1]
    shp = tf.convert_to_tensor([1, dynamic_val, 1], dtype=tf.int32)
    tiled_mean = tf.tile(perf_mean_vector, shp)
    encoder_output = tf.concat([mel_encoder_output, tiled_mean], axis=-1)
    encoder_decoder_attention_bias = mel_encdec_attention_bias
  else:
    # Fixed: the exception was constructed but never raised, so an invalid
    # aggregation silently fell through with undefined encoder_output.
    raise NotImplementedError(
        "aggregation method must be in [sum, concat, tile].")

  return encoder_output, encoder_decoder_attention_bias
def hierarchical_attention_network_encoder(
    encoder_input,
    encoder_self_attention_bias,
    contexts,
    context_self_attention_biases,
    features,
    hparams,
    name="hierarchical_attention_network_encoder",
    save_weights_to=None,
    make_image_summary=True,
    losses=None):
  """Encoder with hierarchical (word- and sentence-level) context attention.

  Runs an (num_hidden_layers - 1)-layer Transformer encoder over the input
  and a full encoder over each context, then combines them with two levels
  of attention ("word_abstraction" and "sentence_abstraction") and a sigmoid
  gate ("context_gating").

  Args:
    encoder_input: input tensor for the main sequence.
    encoder_self_attention_bias: self-attention bias for the input.
    contexts: dict of context name -> context input tensor.
    context_self_attention_biases: dict of context name -> attention bias.
    features: feature dict, used to derive nonpadding masks.
    hparams: hyperparameters.
    name: variable scope name.
    save_weights_to: optional dict to store attention weights in.
    make_image_summary: whether attention layers emit image summaries.
    losses: unused here.

  Returns:
    The gated, layer-preprocessed encoder output.
  """
  input_x = encoder_input
  context_xs = {}
  for context_name in contexts:
    context_xs[context_name] = contexts[context_name]
  context_paddings = {}
  context_nonpaddings = {}
  context_pad_removers = {}

  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))

  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    # Derive padding masks for the input and each context from the biases.
    input_padding = common_attention.attention_bias_to_padding(
        encoder_self_attention_bias)
    input_nonpadding = 1.0 - input_padding
    for context_name in context_self_attention_biases:
      context_paddings[
          context_name] = common_attention.attention_bias_to_padding(
              context_self_attention_biases[context_name])
      context_nonpaddings[
          context_name] = 1.0 - context_paddings[context_name]

    # Pad removers are only usable outside XLA compilation.
    input_pad_remover = None
    for context_name in context_paddings:
      context_pad_removers[context_name] = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      input_pad_remover = expert_utils.PadRemover(input_padding)
      for context_name in context_paddings:
        context_pad_removers[context_name] = expert_utils.PadRemover(
            context_paddings[context_name])

    temp_hparam = tf.contrib.training.HParams(
    )  # copy hparams except num_hidden_layers -> num_hidden_layers - 1
    for key, val in hparams.values().items():
      temp_hparam.add_hparam(key, val)
    temp_hparam.set_hparam("num_hidden_layers", hparams.num_hidden_layers - 1)

    # Main sequence uses one fewer layer; the final "layer" is effectively
    # replaced by the context-combination machinery below.
    encoder_output = transformer_with_contexts_layers.transformer_encoder(
        input_x,
        encoder_self_attention_bias,
        temp_hparam,
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=save_weights_to,
        make_image_summary=make_image_summary)

    # Each context is encoded with the full-depth encoder.
    context_encoded_outputs = {}
    for context_name in context_xs:
      context_encoded_outputs[
          context_name] = transformer_with_contexts_layers.transformer_encoder(
              context_xs[context_name],
              context_self_attention_biases[context_name],
              hparams,
              nonpadding=features_to_nonpadding(features, context_name),
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary)

    with tf.variable_scope('word_abstraction', reuse=tf.AUTO_REUSE):
      # Word-level query attends into every context separately.
      encoder_word_level_query = common_layers.dense(
          encoder_output, hparams.hidden_size)  # q_w = f_w(h_t)
      encoder_word_level_abstraction = {}
      for context_name in context_encoded_outputs:
        encoder_word_level_abstraction[
            context_name] = transformer_with_contexts_layers.multihead_attention(
                common_layers.layer_preprocess(
                    encoder_word_level_query, hparams),
                context_encoded_outputs[context_name],
                context_self_attention_biases[context_name],
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                attention_type=hparams.self_attention_type,
                save_weights_to=save_weights_to,
                make_image_summary=make_image_summary,
                max_relative_position=hparams.max_relative_position,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"),
                vars_3d=hparams.get("attention_variables_3d"))  # s^j,

      # Concatenate the per-context abstractions along the length axis.
      sentence_information = tf.concat([
          encoder_word_level_abstraction[context_name]
          for context_name in encoder_word_level_abstraction
      ], axis=1)

    with tf.variable_scope('sentence_abstraction', reuse=tf.AUTO_REUSE):
      # Sentence-level query attends over the pooled context information.
      encoder_sentence_level_query = common_layers.dense(
          encoder_output, hparams.hidden_size)  # q_s = f_s(h_t)
      context_padding = common_attention.embedding_to_padding(
          sentence_information)
      ignore_padding = common_attention.attention_bias_ignore_padding(
          context_padding)
      contextual_information = transformer_with_contexts_layers.multihead_attention(
          common_layers.layer_preprocess(encoder_sentence_level_query,
                                         hparams),
          sentence_information,
          ignore_padding,
          hparams.attention_key_channels or hparams.hidden_size,
          hparams.attention_value_channels or hparams.hidden_size,
          hparams.hidden_size,
          hparams.num_heads,
          hparams.attention_dropout,
          attention_type=hparams.self_attention_type,
          save_weights_to=save_weights_to,
          make_image_summary=make_image_summary,
          max_relative_position=hparams.max_relative_position,
          dropout_broadcast_dims=attention_dropout_broadcast_dims,
          max_length=hparams.get("max_length"),
          vars_3d=hparams.get("attention_variables_3d")
      )  # MultiHead(q_s, s^j), [batch, encoder_length, hidden_dim]

      contextual_information = common_layers.dense_relu_dense(
          contextual_information, hparams.filter_size, hparams.hidden_size)

    with tf.variable_scope('context_gating', reuse=tf.AUTO_REUSE):
      # Sigmoid gate mixes the original encoding with the contextual one.
      gate_lambda = tf.nn.sigmoid(
          common_layers.dense(contextual_information, hparams.hidden_size) +
          common_layers.dense(encoder_output, hparams.hidden_size))
      encoder_output = gate_lambda * encoder_output + (
          1 - gate_lambda) * contextual_information

  return common_layers.layer_preprocess(encoder_output, hparams)
def hierarchical_context_encoder(encoder_input,
                                 encoder_self_attention_bias,
                                 contexts,
                                 context_self_attention_biases,
                                 features,
                                 hparams,
                                 name="discourse_aware_encoder",
                                 save_weights_to=None,
                                 make_image_summary=True,
                                 losses=None):
  """Discourse-aware encoder that fuses context sequences into the input.

  Pipeline (as built below):
    1. Encode the input and every context with an (N-1)-layer transformer
       encoder, where N = hparams.num_hidden_layers.
    2. Pool each encoded context to a single vector: attentive
       self-attention feed-forward followed by a mean over the length axis.
    3. Re-encode the concatenated pooled context vectors with a 1-layer
       transformer encoder.
    4. In a scope named like the encoder's top (N-th) layer, mix context
       and input representations: cross-attention over the encoded
       contexts, input self-attention, a learned sigmoid gate combining
       the two streams, and a final feed-forward sublayer.

  Args:
    encoder_input: input sequence Tensor fed to the main encoder.
    encoder_self_attention_bias: self-attention bias Tensor for the input
      (see common_attention.attention_bias()).
    contexts: dict mapping context name -> context input Tensor.
    context_self_attention_biases: dict mapping context name -> attention
      bias Tensor for that context.
    features: feature dict; used only via features_to_nonpadding to obtain
      nonpadding masks for the input and each context.
    hparams: model hyperparameters.
    name: outer variable-scope name (reused via tf.AUTO_REUSE).
    save_weights_to: optional dict capturing attention weights; this
      function additionally stores the gate tensor under key "gated_sum".
    make_image_summary: whether to make attention image summaries.
    losses: optional list onto which the final ffn layer may append extra
      training losses.

  Returns:
    A layer-preprocessed (typically layer-normed) encoder output Tensor.
  """
  input_x = encoder_input
  # Shallow copy of the contexts dict so the caller's dict is not aliased
  # by later in-place replacement of its values.
  context_xs = {}
  for context_name in contexts:
    context_xs[context_name] = contexts[context_name]
  context_paddings = {}
  context_nonpaddings = {}
  context_pad_removers = {}
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    # Recover padding masks from the attention biases (input and contexts).
    input_padding = common_attention.attention_bias_to_padding(
        encoder_self_attention_bias)
    input_nonpadding = 1.0 - input_padding
    for context_name in context_self_attention_biases:
      context_paddings[
          context_name] = common_attention.attention_bias_to_padding(
              context_self_attention_biases[context_name])
      context_nonpaddings[context_name] = 1.0 - context_paddings[context_name]
    input_pad_remover = None
    for context_name in context_paddings:
      context_pad_removers[context_name] = None
    # PadRemover is an efficiency optimization that is incompatible with XLA.
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      input_pad_remover = expert_utils.PadRemover(input_padding)
      for context_name in context_paddings:
        context_pad_removers[context_name] = expert_utils.PadRemover(
            context_paddings[context_name])
    # NOTE: tf.contrib.training.HParams has no public copy; clone field by
    # field, then shrink the stack by one layer (the top layer is built
    # manually below under "encoder/layer_%d").
    temp_hparam = tf.contrib.training.HParams(
    )  # copy hparams except num_hidden_layers -> num_hidden_layers - 1
    for key, val in hparams.values().items():
      temp_hparam.add_hparam(key, val)
    temp_hparam.set_hparam("num_hidden_layers", hparams.num_hidden_layers - 1)
    encoder_output = transformer_with_contexts_layers.transformer_encoder(
        input_x,
        encoder_self_attention_bias,
        temp_hparam,
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=save_weights_to,
        make_image_summary=make_image_summary)
    # Encode each context with the same (N-1)-layer configuration; weights
    # are shared across contexts because the outer scope uses AUTO_REUSE.
    context_encoded_outputs = {}
    for context_name in context_xs:
      context_encoded_outputs[
          context_name] = transformer_with_contexts_layers.transformer_encoder(
              context_xs[context_name],
              context_self_attention_biases[context_name],
              temp_hparam,
              nonpadding=features_to_nonpadding(features, context_name),
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary)
    with tf.variable_scope("hierarchical_context_encoder",
                           reuse=tf.AUTO_REUSE):
      for context_name in context_encoded_outputs:
        # self attention feed-forward
        _y = ffn_self_attention_layer(
            context_encoded_outputs[context_name],
            hparams.hidden_size,
            hparams.hidden_size,
            hparams.num_heads,
            hparams.attention_dropout,
            save_weights_to=save_weights_to,
            name="attentive_sum")
        # mean over sequence length
        # (keep_dims is the deprecated TF1 spelling of keepdims; kept as-is
        # for compatibility with the TF version this file targets.)
        context_encoded_outputs[context_name] = tf.reduce_mean(
            _y, axis=1, keep_dims=True)
      # One pooled vector per context, concatenated along the length axis.
      encoded_contexts = [
          context_encoded_outputs[context_name]
          for context_name in context_encoded_outputs
      ]
      encoded_contexts = tf.concat(encoded_contexts, axis=1)
      temp_hparam = tf.contrib.training.HParams(
      )  # copy hparams except num_hidden_layers -> 1
      for key, val in hparams.values().items():
        temp_hparam.add_hparam(key, val)
      temp_hparam.set_hparam("num_hidden_layers", 1)
      context_padding = common_attention.embedding_to_padding(encoded_contexts)
      ignore_padding = common_attention.attention_bias_ignore_padding(
          context_padding)
      # Single-layer encoder over the sequence of pooled context vectors.
      encoded_contexts = transformer_encoder(encoded_contexts, ignore_padding,
                                             temp_hparam)
    # Manually-built top layer; scope name matches what an N-layer encoder
    # would have used, so checkpoints line up with a full-depth encoder.
    with tf.variable_scope("encoder/layer_%d" % hparams.num_hidden_layers,
                           reuse=tf.AUTO_REUSE):
      with tf.variable_scope("context_input_attention"):
        context_padding = common_attention.embedding_to_padding(
            encoded_contexts)
        ignore_padding = common_attention.attention_bias_ignore_padding(
            context_padding)
        # Cross-attention: input encoding queries the encoded contexts.
        _y = common_attention.multihead_attention(
            common_layers.layer_preprocess(encoder_output, hparams),
            encoded_contexts,
            ignore_padding,
            hparams.attention_key_channels or hparams.hidden_size,
            hparams.attention_value_channels or hparams.hidden_size,
            hparams.hidden_size,
            hparams.num_heads,
            hparams.attention_dropout,
            attention_type=hparams.self_attention_type,
            save_weights_to=save_weights_to,
            make_image_summary=make_image_summary,
            max_relative_position=hparams.max_relative_position,
            dropout_broadcast_dims=attention_dropout_broadcast_dims,
            max_length=hparams.get("max_length"),
            vars_3d=hparams.get("attention_variables_3d"))
        encoded_contexts = common_layers.layer_postprocess(
            encoder_output, _y, hparams)
      with tf.variable_scope("input_self_attention"):
        _y = common_attention.multihead_attention(
            common_layers.layer_preprocess(encoder_output, hparams),
            None,
            encoder_self_attention_bias,
            hparams.attention_key_channels or hparams.hidden_size,
            hparams.attention_value_channels or hparams.hidden_size,
            hparams.hidden_size,
            hparams.num_heads,
            hparams.attention_dropout,
            attention_type=hparams.self_attention_type,
            save_weights_to=save_weights_to,
            max_relative_position=hparams.max_relative_position,
            make_image_summary=make_image_summary,
            dropout_broadcast_dims=attention_dropout_broadcast_dims,
            max_length=hparams.get("max_length"),
            vars_3d=hparams.get("attention_variables_3d"))
        encoder_output = common_layers.layer_postprocess(
            encoder_output, _y, hparams)
      with tf.variable_scope("gated_sum"):
        # Learned sigmoid gate blending the self-attended input stream with
        # the context-attended stream, elementwise per feature.
        _depth = common_layers.shape_list(encoder_output)[-1]
        gate = tf.layers.dense(
            tf.concat([encoded_contexts, encoder_output], axis=-1),
            _depth,
            activation=tf.nn.sigmoid)
        if save_weights_to:
          save_weights_to["gated_sum"] = gate
        encoder_output = gate * encoder_output + (
            1. - gate) * encoded_contexts
      with tf.variable_scope("ffn"):
        _y = transformer_ffn_layer(
            common_layers.layer_preprocess(encoder_output, hparams),
            hparams,
            input_pad_remover,
            conv_padding="SAME",
            nonpadding_mask=input_nonpadding,
            losses=losses)
        encoder_output = common_layers.layer_postprocess(
            encoder_output, _y, hparams)
  # Final preprocess so residual sums from the stack are normalized on output.
  return common_layers.layer_preprocess(encoder_output, hparams)
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None,
                        attn_bias_for_padding=None):
  """A stack of transformer layers.

  Each layer is self-attention followed by a feed-forward sublayer, both
  wrapped in layer_preprocess / layer_postprocess. Layers below
  hparams.num_area_layers additionally use area attention.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias. The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in
      convolutional layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses
    attn_bias_for_padding: Padded attention bias in case a unidirectional
      encoder is being used where future attention is masked.

  Returns:
    y: a Tensors
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  # MLPerf compliance logging of the key model hyperparameters.
  mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
                               value=hparams.num_encoder_layers
                               or hparams.num_hidden_layers)
  mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
                               value=hparams.attention_dropout)
  mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
                               value={
                                   "use_bias": "false",
                                   "num_heads": hparams.num_heads,
                                   "hidden_size": hparams.hidden_size
                               })
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      # Infer padding from the attention bias. For unidirectional encoders
      # the self-attention bias also masks the future, so a separate
      # padding-only bias may be supplied via attn_bias_for_padding.
      attention_bias = encoder_self_attention_bias
      if attn_bias_for_padding is not None:
        attention_bias = attn_bias_for_padding
      padding = common_attention.attention_bias_to_padding(attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    # PadRemover is an efficiency optimization that is incompatible with XLA.
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers
                       or hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          # Only the first num_area_layers layers use area attention; the
          # rest fall back to 1x1 areas (= ordinary attention).
          if layer < hparams.get("num_area_layers", 0):
            max_area_width = hparams.get("max_area_width", 1)
            max_area_height = hparams.get("max_area_height", 1)
            memory_height = hparams.get("memory_height", 1)
          else:
            max_area_width = 1
            max_area_height = 1
            memory_height = 1
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"),
              hard_attention_k=hparams.get("hard_attention_k", 0),
              gumbel_noise_weight=hparams.get("gumbel_noise_weight", 0.0),
              max_area_width=max_area_width,
              max_area_height=max_area_height,
              memory_height=memory_height,
              area_key_mode=hparams.get("area_key_mode", "none"),
              area_value_mode=hparams.get("area_value_mode", "none"),
              training=(hparams.get("mode", tf.estimator.ModeKeys.TRAIN) ==
                        tf.estimator.ModeKeys.TRAIN))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(common_layers.layer_preprocess(
              x, hparams),
                                    hparams,
                                    pad_remover,
                                    conv_padding="SAME",
                                    nonpadding_mask=nonpadding,
                                    losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)
def universal_transformer_encoder(encoder_input,
                                  encoder_self_attention_bias,
                                  hparams,
                                  name="encoder",
                                  nonpadding=None,
                                  save_weights_to=None,
                                  make_image_summary=True):
  """Universal Transformer encoder function.

  Prepares all the arguments and the inputs and passes it to a
  universal_transformer_layer to encode the encoder_input.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias. The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in
      convolutional layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensors as the output of the encoder
    extra_output: which can be used to pass extra information to the body
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    # PadRemover is an efficiency optimization that is incompatible with XLA.
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    # Bind the ffn and attention sub-units to their fixed arguments; the
    # universal transformer layer applies them repeatedly (shared weights
    # across recurrent steps).
    ffn_unit = functools.partial(
        universal_transformer_util.transformer_encoder_ffn_unit,
        hparams=hparams,
        nonpadding_mask=nonpadding,
        pad_remover=pad_remover)
    attention_unit = functools.partial(
        universal_transformer_util.transformer_encoder_attention_unit,
        hparams=hparams,
        encoder_self_attention_bias=encoder_self_attention_bias,
        attention_dropout_broadcast_dims=attention_dropout_broadcast_dims,
        save_weights_to=save_weights_to,
        make_image_summary=make_image_summary)
    x, extra_output = universal_transformer_layer(x,
                                                  hparams,
                                                  ffn_unit,
                                                  attention_unit,
                                                  pad_remover=pad_remover)
    if hparams.get("use_memory_as_last_state", False):
      x = extra_output  # which is memory
    return common_layers.layer_preprocess(x, hparams), extra_output
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True):
  """A stack of transformer layers.

  This variant routes self-attention through sparse_attention, which
  supports weight-sparsity techniques (pruning masks, thresholds).

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias. The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in
      convolutional layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensors
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  # MLPerf compliance logging of the key model hyperparameters.
  mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
                               value=hparams.num_encoder_layers
                               or hparams.num_hidden_layers)
  mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
                               value=hparams.attention_dropout)
  mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
                               value={
                                   "use_bias": "false",
                                   "num_heads": hparams.num_heads,
                                   "hidden_size": hparams.hidden_size
                               })
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    # PadRemover is an efficiency optimization that is incompatible with XLA.
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers
                       or hparams.num_hidden_layers):
      # When resuming from saved pruning masks, seed each layer with the
      # configured initial sparsity.
      initial_sparsity = None
      if hparams.get("load_masks_from"):
        initial_sparsity = hparams.get("initial_sparsity")
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = sparse_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"),
              sparsity_technique=hparams.get("sparsity_technique"),
              threshold=hparams.get("log_alpha_threshold"),
              training=hparams.get("mode") == tf_estimator.ModeKeys.TRAIN,
              clip_alpha=hparams.get("clip_log_alpha"),
              initial_sparsity=initial_sparsity,
              split_heads=hparams.get("split_heads"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams), hparams,
              pad_remover)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias. The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in
      convolutional layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses

  Returns:
    y: a Tensors
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  # MLPerf compliance logging of the key model hyperparameters.
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
      value=hparams.num_encoder_layers or hparams.num_hidden_layers)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
      value=hparams.attention_dropout)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
      value={
          "use_bias": "false",
          "num_heads": hparams.num_heads,
          "hidden_size": hparams.hidden_size
      })
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    # PadRemover is an efficiency optimization that is incompatible with XLA.
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)