def body(self, features):
  assert self._hparams.block_size > 0
  assert not common_layers.is_xla_compiled()
  assert "targets_segmentation" not in features

  decoder_output = super(TransformerBlockParallel, self).body(features)
  assert not isinstance(decoder_output, tuple)
  assert len(decoder_output.shape) == 4

  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(self._hparams, "relu_dropout_broadcast_dims", "")))

  with tf.variable_scope("block_size_%d" % self._hparams.block_size):
    block_output = common_layers.dense_relu_dense(
        decoder_output,
        self._hparams.block_size * self._hparams.filter_size,
        self._hparams.block_size * self._hparams.hidden_size,
        dropout=self._hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)

  batch_size, length = common_layers.shape_list(decoder_output)[:2]
  block_output = tf.reshape(block_output, [
      batch_size,
      length,
      self._hparams.block_size,
      self._hparams.hidden_size
  ])

  block_output = common_layers.layer_postprocess(
      decoder_output, block_output, self._hparams)

  return block_output
def recurrent_transformer_decoder(decoder_input,
                                  encoder_output,
                                  decoder_self_attention_bias,
                                  encoder_decoder_attention_bias,
                                  hparams,
                                  name="decoder",
                                  nonpadding=None,
                                  save_weights_to=None,
                                  make_image_summary=True):
  """Recurrent decoder function."""
  x = decoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    ffn_unit = functools.partial(
        # Use the encoder ffn unit, since the decoder ffn uses left padding.
        universal_transformer_util.transformer_encoder_ffn_unit,
        hparams=hparams,
        nonpadding_mask=nonpadding)

    attention_unit = functools.partial(
        universal_transformer_util.transformer_decoder_attention_unit,
        hparams=hparams,
        encoder_output=encoder_output,
        decoder_self_attention_bias=decoder_self_attention_bias,
        encoder_decoder_attention_bias=encoder_decoder_attention_bias,
        attention_dropout_broadcast_dims=attention_dropout_broadcast_dims,
        save_weights_to=save_weights_to,
        make_image_summary=make_image_summary)

    x, extra_output = universal_transformer_util.universal_transformer_layer(
        x, hparams, ffn_unit, attention_unit)

    return common_layers.layer_preprocess(x, hparams), extra_output
def body(self, features):
  assert self._hparams.block_size > 0
  assert not common_layers.is_xla_compiled()

  hparams = copy.copy(self._hparams)
  targets = features["targets"]
  inputs = features["inputs"]
  if not (tf.get_variable_scope().reuse or
          hparams.mode == tf.estimator.ModeKeys.PREDICT):
    tf.summary.image("inputs", inputs, max_outputs=1)
    tf.summary.image("targets", targets, max_outputs=1)

  encoder_input = cia.prepare_encoder(inputs, hparams)
  encoder_output = cia.transformer_encoder_layers(
      encoder_input,
      hparams.num_encoder_layers,
      hparams,
      attention_type=hparams.enc_attention_type,
      name="encoder")
  decoder_input, rows, cols = cia.prepare_decoder(targets, hparams)
  decoder_output = cia.transformer_decoder_layers(
      decoder_input,
      encoder_output,
      hparams.num_decoder_layers,
      hparams,
      attention_type=hparams.dec_attention_type,
      name="decoder")

  assert not isinstance(decoder_output, tuple)
  assert len(decoder_output.shape) == 4

  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(self._hparams, "relu_dropout_broadcast_dims", "")))

  with tf.variable_scope("block_size_%d" % self._hparams.block_size):
    tf.logging.info("Using block_size %d", self._hparams.block_size)
    block_output = common_layers.dense_relu_dense(
        decoder_output,
        self._hparams.block_size * self._hparams.filter_size,
        self._hparams.block_size * self._hparams.hidden_size,
        dropout=self._hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)

  batch_size, rows, cols = common_layers.shape_list(decoder_output)[:3]
  decoder_output = tf.reshape(
      decoder_output,
      [batch_size, rows, cols, 1, self._hparams.hidden_size])
  block_output = tf.reshape(block_output, [
      batch_size,
      rows,
      cols,
      self._hparams.block_size,
      self._hparams.hidden_size
  ])

  block_output = common_layers.layer_postprocess(
      decoder_output, block_output, self._hparams)

  return block_output
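# Hypothetical shape sketch, not part of the original code: demonstrates how
# the block-parallel head above widens the channel axis by block_size and then
# splits it into an explicit block dimension. The sizes are illustrative
# assumptions; only numpy is needed.
def _block_parallel_shape_demo():
  import numpy as np
  batch, rows, cols, hidden_size, block_size = 2, 8, 8, 16, 4
  # dense_relu_dense output: the channel axis is widened by block_size.
  widened = np.zeros([batch, rows, cols, block_size * hidden_size])
  # The reshape splits the widened axis into a block dimension, so every
  # position predicts block_size output vectors in parallel.
  blocks = widened.reshape([batch, rows, cols, block_size, hidden_size])
  assert blocks.shape == (batch, rows, cols, block_size, hidden_size)
  return blocks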
def transformer_n_encoder(encoder_input,
                          encoder_self_attention_bias,
                          hparams,
                          customize_params,
                          name="encoder",
                          nonpadding=None,
                          save_weights_to=None,
                          make_image_summary=True,
                          losses=None):
  """Transformer encoder for models with two sets of encoders; per-set
  overrides (layer count, head count, ffn type) come from customize_params."""
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_on_tpu():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(customize_params.num_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              customize_params.num_heads or hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              max_relative_position=hparams.max_relative_position,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=customize_params.get("max_length"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              customized_ffn=customize_params.ffn_layer,
              hparams=hparams,
              pad_remover=pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # If normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
def transformer_dual_decoder(decoder_input,
                             wav_encoder_output,
                             txt_encoder_output,
                             decoder_self_attention_bias,
                             wav_enc_dec_attention_bias,
                             txt_enc_dec_attention_bias,
                             hparams,
                             cache=None,
                             name="dual_decoder",
                             nonpadding=None,
                             save_weights_to=None,
                             make_image_summary=True,
                             losses=None):
  """A stack of transformer layers: a decoder attending to two encoders.

  Args:
    decoder_input: a Tensor
    wav_encoder_output: a Tensor
    txt_encoder_output: a Tensor
    decoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    wav_enc_dec_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias())
    txt_enc_dec_attention_bias: same as the former, for the text encoder
    hparams: hyperparameters for model
    cache: dict, containing tensors which are the results of previous
      attentions, used for fast decoding.
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This is used to mask out
      padding in convolutional layers. We generally only need this mask for
      "packed" datasets, because for ordinary datasets, no padding is ever
      followed by nonpadding.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses

  Returns:
    y: a Tensor
  """
  x = decoder_input
  x1 = None
  x2 = None
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    for layer in range(hparams.num_decoder_layers or
                       hparams.num_hidden_layers):
      layer_name = "layer_%d" % layer
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.variable_scope(layer_name):
        with tf.variable_scope("self_attention"):
          # Decoder self-attention.
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              decoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              max_relative_position=hparams.max_relative_position,
              cache=layer_cache,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_target_seq_length"))
          # Apply the self-attention residual before the cross-attentions
          # (otherwise y would be computed and then discarded).
          x = common_layers.layer_postprocess(x, y, hparams)
        if wav_encoder_output is not None:
          with tf.variable_scope("wav_encdec_attention"):
            y1 = common_attention.multihead_attention(
                common_layers.layer_preprocess(x, hparams),
                wav_encoder_output,
                wav_enc_dec_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.wav_num_heads or hparams.num_heads,
                hparams.attention_dropout,
                save_weights_to=save_weights_to,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"))
            x1 = common_layers.layer_postprocess(x, y1, hparams)
        if txt_encoder_output is not None:
          with tf.variable_scope("txt_encdec_attention"):
            y2 = common_attention.multihead_attention(
                common_layers.layer_preprocess(x, hparams),
                txt_encoder_output,
                txt_enc_dec_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.txt_num_heads or hparams.num_heads,
                hparams.attention_dropout,
                save_weights_to=save_weights_to,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_txt_seq_length"))
            x2 = common_layers.layer_postprocess(x, y2, hparams)
        with tf.variable_scope("ffn"):
          if wav_encoder_output is not None and txt_encoder_output is not None:
            # With two encoders to attend to, feed the concatenation of both
            # cross-attention branches to the ffn.
            y = transformer_ffn_layer(
                common_layers.layer_preprocess(
                    tf.concat([x1, x2], axis=-1), hparams),
                hparams,
                conv_padding="LEFT",
                nonpadding_mask=nonpadding,
                losses=losses,
                cache=layer_cache)
          else:
            y = transformer_ffn_layer(
                common_layers.layer_preprocess(x, hparams),
                hparams,
                conv_padding="LEFT",
                nonpadding_mask=nonpadding,
                losses=losses,
                cache=layer_cache)
          x = common_layers.layer_postprocess(x, y, hparams)
    # If normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
def evolved_transformer_encoder(encoder_input,
                                encoder_self_attention_bias,
                                hparams,
                                name="encoder",
                                nonpadding=None,
                                save_weights_to=None,
                                make_image_summary=True,
                                losses=None,
                                attn_bias_for_padding=None):
  """Evolved Transformer encoder. See arxiv.org/abs/1901.11117 for details.

  Note: Pad remover is not supported.

  Args:
    encoder_input: a Tensor.
    encoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias()).
    hparams: hyperparameters for model.
    name: a string.
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This must either be passed
      in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias. The knowledge about padding is used for
      pad_remover (efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: Not used.
    attn_bias_for_padding: Padded attention bias in case a unidirectional
      encoder is being used where future attention is masked.

  Returns:
    Tensor encoder output.
  """
  del losses

  hidden_state = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))

  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      attention_bias = encoder_self_attention_bias
      if attn_bias_for_padding is not None:
        attention_bias = attn_bias_for_padding
      padding = common_attention.attention_bias_to_padding(attention_bias)
      nonpadding = 1.0 - padding

    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):

        with tf.variable_scope("gated_linear_unit"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          values = layers().Dense(hparams.hidden_size)(hidden_state)
          gates = layers().Dense(
              hparams.hidden_size, activation=tf.nn.sigmoid)(hidden_state)
          hidden_state = values * gates

          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        with tf.variable_scope("conv_branches"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          # Mask padding from conv layers.
          mask = tf.tile(
              tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size])
          hidden_state *= mask

          left_output_dim = int(hparams.hidden_size * 4)
          left_state = layers().Dense(
              left_output_dim, activation=tf.nn.relu)(hidden_state)
          left_state = tf.nn.dropout(
              left_state, 1 - hparams.layer_prepostprocess_dropout)

          right_output_dim = int(hparams.hidden_size / 2)
          right_state = layers().Conv1D(
              right_output_dim,
              3,
              padding="SAME",
              name="standard_conv_3x1",
              activation=tf.nn.relu)(hidden_state)
          right_state = tf.nn.dropout(
              right_state, 1 - hparams.layer_prepostprocess_dropout)

          right_state = tf.pad(
              right_state,
              [[0, 0], [0, 0], [0, left_output_dim - right_output_dim]],
              constant_values=0)
          hidden_state = left_state + right_state

          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)
          # Mask padding from conv layer.
          mask = tf.tile(
              tf.expand_dims(nonpadding, 2), [1, 1, left_output_dim])
          hidden_state *= mask

          separable_conv_9x1 = layers().SeparableConv1D(
              right_output_dim, 9, padding="SAME", name="separable_conv_9x1")
          hidden_state = separable_conv_9x1(hidden_state)
          hidden_state = tf.pad(
              hidden_state,
              [[0, 0], [0, 0], [0, hparams.hidden_size - right_output_dim]],
              constant_values=0)

          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        with tf.variable_scope("self_attention"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          hidden_state = common_attention.multihead_attention(
              hidden_state,
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"))

          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        with tf.variable_scope("dense_layers"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          hidden_state = layers().Dense(
              int(hparams.hidden_size * 4),
              activation=tf.nn.relu)(hidden_state)
          hidden_state = tf.nn.dropout(
              hidden_state, 1 - hparams.layer_prepostprocess_dropout)

          hidden_state = layers().Dense(hparams.hidden_size)(hidden_state)
          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

  # If normalization is done in layer_preprocess, then it should also be done
  # on the output, since the output can grow very large, being the sum of
  # a whole stack of unnormalized layer outputs.
  return common_layers.layer_preprocess(hidden_state, hparams)
def transformer_decoder(decoder_input,
                        encoder_output,
                        decoder_self_attention_bias,
                        encoder_decoder_attention_bias,
                        hparams,
                        cache=None,
                        name="decoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True):
  """A stack of transformer layers.

  Args:
    decoder_input: a Tensor
    encoder_output: a Tensor
    decoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    cache: dict, containing tensors which are the results of previous
      attentions, used for fast decoding.
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This is used to mask out
      padding in convolutional layers. We generally only need this mask for
      "packed" datasets, because for ordinary datasets, no padding is ever
      followed by nonpadding.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor
  """
  x = decoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    for layer in xrange(hparams.num_decoder_layers or
                        hparams.num_hidden_layers):
      layer_name = "layer_%d" % layer
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.variable_scope(layer_name):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              decoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              max_relative_position=hparams.max_relative_position,
              cache=layer_cache,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims)
          x = common_layers.layer_postprocess(x, y, hparams)
        if encoder_output is not None:
          with tf.variable_scope("encdec_attention"):
            # TODO(llion): Add caching.
            y = common_attention.multihead_attention(
                common_layers.layer_preprocess(x, hparams),
                encoder_output,
                encoder_decoder_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                save_weights_to=save_weights_to,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=attention_dropout_broadcast_dims)
            x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              conv_padding="LEFT",
              nonpadding_mask=nonpadding)
          x = common_layers.layer_postprocess(x, y, hparams)
    # If normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
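# Hypothetical usage sketch, not part of the original code: wiring
# transformer_decoder into a decode step. `target_embeddings` (an assumed
# [batch, length, hidden] Tensor) and `inputs_padding` are illustrative
# inputs; the bias helpers are the existing common_attention ones.
def _transformer_decoder_demo(target_embeddings, encoder_output,
                              inputs_padding, hparams):
  # Lower-triangular bias: position i may only attend to positions <= i.
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(
          common_layers.shape_list(target_embeddings)[1]))
  # Large negative bias on encoder padding positions, so the decoder never
  # attends to padding in the encoder output.
  encoder_decoder_attention_bias = (
      common_attention.attention_bias_ignore_padding(inputs_padding))
  return transformer_decoder(
      target_embeddings,
      encoder_output,
      decoder_self_attention_bias,
      encoder_decoder_attention_bias,
      hparams)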
def _layer_stack(mp,
                 inputs,
                 self_attention_bias,
                 layers,
                 hparams,
                 encoder_output=None,
                 encoder_decoder_attention_bias=None):
  """A stack of layers.

  Args:
    mp: a Parallelism object
    inputs: a list of Tensors
    self_attention_bias: list of bias Tensor for self-attention
      (see common_attention.attention_bias())
    layers: a string
    hparams: hyperparameters for model
    encoder_output: optional list of tensors
    encoder_decoder_attention_bias: optional list of tensors

  Returns:
    y: a list of Tensors
  """
  layers = layers.strip(",").split(",")

  # scaled_dot_product_attention_with_projections uses a 3d attention bias
  # (no heads), where multihead_attention uses 4d attention bias.
  self_attention_bias_3d = mp(tf.squeeze, self_attention_bias, 1)
  if encoder_decoder_attention_bias is not None:
    encoder_decoder_attention_bias_3d = mp(
        tf.squeeze, encoder_decoder_attention_bias, 1)
  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "relu_dropout_broadcast_dims", "")))
  mix_size = int(hparams.mix_fraction * hparams.hidden_size)
  accumulator = inputs
  x = inputs
  for layer_num, layer_type in enumerate(layers):
    with tf.variable_scope("%s_%d" % (layer_type, layer_num)):
      tf.logging.info("%s_%d" % (layer_type, layer_num))
      if layer_type == "a":  # accumulate
        accumulator = mp(tf.add, x, accumulator)
        x = accumulator
      elif layer_type == "n":  # normalize
        x = mp(common_layers.apply_norm,
               x, hparams.norm_type, hparams.hidden_size,
               hparams.norm_epsilon)
      elif layer_type == "d":  # dropout
        x = mp(tf.nn.dropout, x, 1.0 - hparams.layer_prepostprocess_dropout)
      elif layer_type == "m":
        if mix_size > 0:
          # mix across shards
          def _split(t):
            return tuple(tf.split(
                t, [mix_size, hparams.hidden_size - mix_size], 2))
          to_mix, to_keep = mp(_split, x)
          mixed = expert_utils.all_reduce_ring(to_mix, mp)
          mixed = mp(tf.multiply, mixed, mp.n ** -0.5)
          x = mp(lambda a, b: tf.concat([a, b], 2), mixed, to_keep)
      elif layer_type == "att":  # single-head attention
        q = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False,
               name="q_transform")
        x = mp(
            common_attention.scaled_dot_product_attention_simple,
            q, x, x, self_attention_bias_3d)
        x = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False,
               name="o_transform")
      elif layer_type == "enc-att":  # single-head attention over encoder
        q = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False,
               name="q_transform")
        assert encoder_output is not None
        x = mp(
            common_attention.scaled_dot_product_attention_simple,
            q, encoder_output, encoder_output,
            encoder_decoder_attention_bias_3d)
        x = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False,
               name="o_transform")
      elif layer_type == "multihead-att":  # multi-head attention
        x = mp(
            common_attention.multihead_attention,
            x,
            None,
            self_attention_bias,  # bias
            hparams.multihead_attention_key_channels or hparams.hidden_size,
            hparams.multihead_attention_value_channels or hparams.hidden_size,
            hparams.hidden_size,
            hparams.multihead_attention_num_heads,
            hparams.attention_dropout)
      elif layer_type == "enc-multihead-att":  # multi-head attention
        x = mp(
            common_attention.multihead_attention,
            x,
            encoder_output,
            encoder_decoder_attention_bias,  # bias
            hparams.multihead_attention_key_channels or hparams.hidden_size,
            hparams.multihead_attention_value_channels or hparams.hidden_size,
            hparams.hidden_size,
            hparams.multihead_attention_num_heads,
            hparams.attention_dropout)
      elif layer_type == "ffn":
        x = mp(
            common_layers.dense_relu_dense, x,
            hparams.filter_size, hparams.hidden_size,
            dropout=hparams.relu_dropout,
            dropout_broadcast_dims=[relu_dropout_broadcast_dims] * mp.n)
      else:
        assert False, "unknown sublayer %s" % layer_type
  return x
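# Hypothetical usage sketch, not part of the original code: the `layers`
# argument to _layer_stack is a comma-separated program over the sublayer
# codes handled above ("a" accumulate, "n" normalize, "d" dropout, "m" mix,
# "att"/"enc-att" single-head attention, "multihead-att"/"enc-multihead-att",
# "ffn"). A block of self-attention then ffn, each followed by dropout and a
# residual accumulate, could be spelled like this:
def _layer_stack_demo(mp, inputs, self_attention_bias, hparams):
  return _layer_stack(
      mp,
      inputs,
      self_attention_bias,
      layers="n,att,d,a,n,ffn,d,a",
      hparams=hparams)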
def hierarchical_context_encoder(encoder_input,
                                 encoder_self_attention_bias,
                                 contexts,
                                 context_self_attention_biases,
                                 features,
                                 hparams,
                                 name="discourse_aware_encoder",
                                 save_weights_to=None,
                                 make_image_summary=True,
                                 losses=None):
  """Hierarchical discourse-aware encoder over an input and its contexts."""
  input_x = encoder_input
  context_xs = {}
  for context_name in contexts:
    context_xs[context_name] = contexts[context_name]
  context_paddings = {}
  context_nonpaddings = {}
  context_pad_removers = {}

  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))

  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    input_padding = common_attention.attention_bias_to_padding(
        encoder_self_attention_bias)
    input_nonpadding = 1.0 - input_padding
    for context_name in context_self_attention_biases:
      context_paddings[context_name] = (
          common_attention.attention_bias_to_padding(
              context_self_attention_biases[context_name]))
      context_nonpaddings[context_name] = (
          1.0 - context_paddings[context_name])

    input_pad_remover = None
    for context_name in context_paddings:
      context_pad_removers[context_name] = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      input_pad_remover = expert_utils.PadRemover(input_padding)
      for context_name in context_paddings:
        context_pad_removers[context_name] = expert_utils.PadRemover(
            context_paddings[context_name])

    # Copy hparams, except num_hidden_layers -> num_hidden_layers - 1.
    temp_hparam = tf.contrib.training.HParams()
    for key, val in hparams.values().items():
      temp_hparam.add_hparam(key, val)
    temp_hparam.set_hparam("num_hidden_layers", hparams.num_hidden_layers - 1)
    encoder_output = transformer_with_contexts_layers.transformer_encoder(
        input_x,
        encoder_self_attention_bias,
        temp_hparam,
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=save_weights_to,
        make_image_summary=make_image_summary)

    context_encoded_outputs = {}
    for context_name in context_xs:
      context_encoded_outputs[context_name] = (
          transformer_with_contexts_layers.transformer_encoder(
              context_xs[context_name],
              context_self_attention_biases[context_name],
              temp_hparam,
              nonpadding=features_to_nonpadding(features, context_name),
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary))

    with tf.variable_scope("hierarchical_context_encoder",
                           reuse=tf.AUTO_REUSE):
      for context_name in context_encoded_outputs:
        # Self-attention feed-forward.
        _y = ffn_self_attention_layer(
            context_encoded_outputs[context_name],
            hparams.hidden_size,
            hparams.hidden_size,
            hparams.num_heads,
            hparams.attention_dropout,
            save_weights_to=save_weights_to,
            name="attentive_sum")
        # Mean over sequence length.
        context_encoded_outputs[context_name] = tf.reduce_mean(
            _y, axis=1, keep_dims=True)

      encoded_contexts = [
          context_encoded_outputs[context_name]
          for context_name in context_encoded_outputs
      ]
      encoded_contexts = tf.concat(encoded_contexts, axis=1)

      # Copy hparams, except num_hidden_layers -> 1.
      temp_hparam = tf.contrib.training.HParams()
      for key, val in hparams.values().items():
        temp_hparam.add_hparam(key, val)
      temp_hparam.set_hparam("num_hidden_layers", 1)
      context_padding = common_attention.embedding_to_padding(
          encoded_contexts)
      ignore_padding = common_attention.attention_bias_ignore_padding(
          context_padding)
      encoded_contexts = transformer_encoder(
          encoded_contexts, ignore_padding, temp_hparam)

    with tf.variable_scope("encoder/layer_%d" % hparams.num_hidden_layers,
                           reuse=tf.AUTO_REUSE):
      with tf.variable_scope("context_input_attention"):
        context_padding = common_attention.embedding_to_padding(
            encoded_contexts)
        ignore_padding = common_attention.attention_bias_ignore_padding(
            context_padding)
        _y = common_attention.multihead_attention(
            common_layers.layer_preprocess(encoder_output, hparams),
            encoded_contexts,
            ignore_padding,
            hparams.attention_key_channels or hparams.hidden_size,
            hparams.attention_value_channels or hparams.hidden_size,
            hparams.hidden_size,
            hparams.num_heads,
            hparams.attention_dropout,
            attention_type=hparams.self_attention_type,
            save_weights_to=save_weights_to,
            make_image_summary=make_image_summary,
            max_relative_position=hparams.max_relative_position,
            dropout_broadcast_dims=attention_dropout_broadcast_dims,
            max_length=hparams.get("max_length"),
            vars_3d=hparams.get("attention_variables_3d"))
        encoded_contexts = common_layers.layer_postprocess(
            encoder_output, _y, hparams)

      with tf.variable_scope("input_self_attention"):
        _y = common_attention.multihead_attention(
            common_layers.layer_preprocess(encoder_output, hparams),
            None,
            encoder_self_attention_bias,
            hparams.attention_key_channels or hparams.hidden_size,
            hparams.attention_value_channels or hparams.hidden_size,
            hparams.hidden_size,
            hparams.num_heads,
            hparams.attention_dropout,
            attention_type=hparams.self_attention_type,
            save_weights_to=save_weights_to,
            max_relative_position=hparams.max_relative_position,
            make_image_summary=make_image_summary,
            dropout_broadcast_dims=attention_dropout_broadcast_dims,
            max_length=hparams.get("max_length"),
            vars_3d=hparams.get("attention_variables_3d"))
        encoder_output = common_layers.layer_postprocess(
            encoder_output, _y, hparams)

      with tf.variable_scope("gated_sum"):
        _depth = common_layers.shape_list(encoder_output)[-1]
        gate = tf.layers.dense(
            tf.concat([encoded_contexts, encoder_output], axis=-1),
            _depth,
            activation=tf.nn.sigmoid)
        if save_weights_to:
          save_weights_to["gated_sum"] = gate
        encoder_output = gate * encoder_output + (
            1. - gate) * encoded_contexts

      with tf.variable_scope("ffn"):
        _y = transformer_ffn_layer(
            common_layers.layer_preprocess(encoder_output, hparams),
            hparams,
            input_pad_remover,
            conv_padding="SAME",
            nonpadding_mask=input_nonpadding,
            losses=losses)
        encoder_output = common_layers.layer_postprocess(
            encoder_output, _y, hparams)

  return common_layers.layer_preprocess(encoder_output, hparams)
def universal_transformer_encoder(encoder_input,
                                  encoder_self_attention_bias,
                                  hparams,
                                  name="encoder",
                                  nonpadding=None,
                                  save_weights_to=None,
                                  make_image_summary=True):
  """Universal Transformer encoder function.

  Prepares all the arguments and the inputs and passes it to a
  universal_transformer_layer to encode the encoder_input.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This must either be passed
      in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias. The knowledge about padding is used for
      pad_remover (efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor as the output of the encoder
    extra_output: which can be used to pass extra information to the body
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)

    ffn_unit = functools.partial(
        universal_transformer_util.transformer_encoder_ffn_unit,
        hparams=hparams,
        nonpadding_mask=nonpadding,
        pad_remover=pad_remover)

    attention_unit = functools.partial(
        universal_transformer_util.transformer_encoder_attention_unit,
        hparams=hparams,
        encoder_self_attention_bias=encoder_self_attention_bias,
        attention_dropout_broadcast_dims=attention_dropout_broadcast_dims,
        save_weights_to=save_weights_to,
        make_image_summary=make_image_summary)

    x, extra_output = universal_transformer_layer(
        x, hparams, ffn_unit, attention_unit, pad_remover=pad_remover)

    if hparams.get("use_memory_as_last_state", False):
      x = extra_output  # which is memory
    return common_layers.layer_preprocess(x, hparams), extra_output
def evolved_transformer_decoder(decoder_input,
                                encoder_output,
                                decoder_self_attention_bias,
                                encoder_decoder_attention_bias,
                                hparams,
                                cache=None,
                                decode_loop_step=None,
                                name="decoder",
                                nonpadding=None,
                                save_weights_to=None,
                                make_image_summary=True,
                                losses=None):
  """Evolved Transformer decoder. See arxiv.org/abs/1901.11117 for details.

  Args:
    decoder_input: a Tensor.
    encoder_output: a Tensor.
    decoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias()).
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias()).
    hparams: hyperparameters for model.
    cache: Not supported.
    decode_loop_step: An integer, step number of the decoding loop. Only used
      for inference on TPU.
    name: a string.
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This is used to mask out
      padding in convolutional layers. We generally only need this mask for
      "packed" datasets, because for ordinary datasets, no padding is ever
      followed by nonpadding.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: Not supported.

  Returns:
    Decoder output tensor.
  """
  del cache, losses

  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))

  with tf.variable_scope(name):
    hidden_state = decoder_input
    layer_cache = None

    for layer in range(hparams.num_decoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):

        with tf.variable_scope("16_head_self_attention"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          # 16-head attention; the head count is hard coded to replicate the
          # paper.
          left_state = common_attention.multihead_attention(
              hidden_state,
              None,
              decoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              16,  # Heads are hard coded to replicate paper.
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              cache=layer_cache,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              decode_loop_step=decode_loop_step,
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"))

        if encoder_output is not None:
          with tf.variable_scope("first_attend_to_encoder"):
            right_state = common_attention.multihead_attention(
                hidden_state,
                encoder_output,
                encoder_decoder_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                max_relative_position=hparams.max_relative_position,
                heads_share_relative_embedding=(
                    hparams.heads_share_relative_embedding),
                add_relative_to_values=hparams.add_relative_to_values,
                save_weights_to=save_weights_to,
                cache=layer_cache,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"),
                vars_3d=hparams.get("attention_variables_3d"),
                activation_dtype=hparams.get("activation_dtype", "float32"),
                weight_dtype=hparams.get("weight_dtype", "float32"))

            left_state = tf.nn.dropout(
                left_state, 1 - hparams.layer_prepostprocess_dropout)
            right_state = tf.nn.dropout(
                right_state, 1 - hparams.layer_prepostprocess_dropout)
            hidden_state = residual_state + left_state + right_state
        else:
          hidden_state = common_layers.layer_postprocess(
              residual_state, left_state, hparams)

        with tf.variable_scope("conv_branches"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          if nonpadding is not None:
            # Mask padding from conv layers.
            mask = tf.tile(
                tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size])
            hidden_state *= mask

          # Shift inputs so that future tokens cannot be seen.
          left_state = tf.pad(
              hidden_state, paddings=[[0, 0], [10, 0], [0, 0]])
          left_output_dim = int(hparams.hidden_size * 2)
          separable_conv_11x1 = tf.layers.SeparableConv1D(
              left_output_dim,
              11,
              padding="VALID",
              name="separable_conv11x1",
              activation=tf.nn.relu)
          left_state = separable_conv_11x1.apply(left_state)
          left_state = tf.nn.dropout(
              left_state, 1 - hparams.layer_prepostprocess_dropout)

          right_state = tf.pad(
              hidden_state, paddings=[[0, 0], [6, 0], [0, 0]])
          right_output_dim = int(hparams.hidden_size / 2)
          separable_conv_7x1_1 = tf.layers.SeparableConv1D(
              right_output_dim,
              7,
              padding="VALID",
              name="separable_conv_7x1_1")
          right_state = separable_conv_7x1_1.apply(right_state)
          right_state = tf.nn.dropout(
              right_state, 1 - hparams.layer_prepostprocess_dropout)
          right_state = tf.pad(
              right_state,
              [[0, 0], [0, 0], [0, left_output_dim - right_output_dim]],
              constant_values=0)

          hidden_state = left_state + right_state

          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)
          if nonpadding is not None:
            # Mask padding from conv layers.
            mask = tf.tile(
                tf.expand_dims(nonpadding, 2),
                [1, 1, hparams.hidden_size * 2])
            hidden_state *= mask

          hidden_state = tf.pad(
              hidden_state, paddings=[[0, 0], [6, 0], [0, 0]])
          separable_conv_7x1_2 = tf.layers.SeparableConv1D(
              hparams.hidden_size,
              7,
              padding="VALID",
              name="separable_conv_7x1_2")
          hidden_state = separable_conv_7x1_2.apply(hidden_state)

          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        with tf.variable_scope("self_attention"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          hidden_state = common_attention.multihead_attention(
              hidden_state,
              None,
              decoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              cache=layer_cache,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              decode_loop_step=decode_loop_step,
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"))
          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        if encoder_output is not None:
          with tf.variable_scope("second_attend_to_encoder"):
            residual_state = hidden_state
            hidden_state = common_layers.layer_preprocess(
                hidden_state, hparams)

            hidden_state = common_attention.multihead_attention(
                hidden_state,
                encoder_output,
                encoder_decoder_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                max_relative_position=hparams.max_relative_position,
                heads_share_relative_embedding=(
                    hparams.heads_share_relative_embedding),
                add_relative_to_values=hparams.add_relative_to_values,
                save_weights_to=save_weights_to,
                cache=layer_cache,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"),
                vars_3d=hparams.get("attention_variables_3d"),
                activation_dtype=hparams.get("activation_dtype", "float32"),
                weight_dtype=hparams.get("weight_dtype", "float32"))
            hidden_state = common_layers.layer_postprocess(
                residual_state, hidden_state, hparams)

        with tf.variable_scope("dense_layers"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          hidden_state = tf.layers.dense(
              hidden_state,
              int(hparams.hidden_size * 4),
              activation=tf.nn.swish)
          hidden_state = tf.nn.dropout(
              hidden_state, 1 - hparams.layer_prepostprocess_dropout)

          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)
          hidden_state = tf.layers.dense(hidden_state, hparams.hidden_size)
          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

    return common_layers.layer_preprocess(hidden_state, hparams)
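# Hypothetical arithmetic sketch, not part of the original code: why the
# decoder conv branches above left-pad by (kernel_size - 1) and then apply a
# VALID convolution. The padding keeps the output length equal to the input
# length while ensuring position t only sees positions <= t (causality).
def _causal_conv_length_demo(length=7, kernel=11):
  padded_length = length + (kernel - 1)  # left-pad with kernel - 1 zeros
  output_length = padded_length - kernel + 1  # VALID conv output length
  assert output_length == length
  return output_length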
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This must either be passed
      in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias. The knowledge about padding is used for
      pad_remover (efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses

  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_on_tpu():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              max_relative_position=hparams.max_relative_position,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # If normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This must either be passed
      in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias. The knowledge about padding is used for
      pad_remover (efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses

  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
      value=hparams.num_encoder_layers or hparams.num_hidden_layers)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
      value=hparams.attention_dropout)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
      value={
          "use_bias": "false",
          "num_heads": hparams.num_heads,
          "hidden_size": hparams.hidden_size
      })
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # If normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)
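# Hypothetical usage sketch, not part of the original code: building the
# self-attention bias from embedded inputs and running the encoder stack.
# `input_embeddings` is an assumed [batch, length, hidden] Tensor.
def _transformer_encoder_demo(input_embeddings, hparams):
  # Treat all-zero embedding positions as padding, then turn the padding into
  # a large negative attention bias so those positions are never attended to.
  padding = common_attention.embedding_to_padding(input_embeddings)
  encoder_self_attention_bias = (
      common_attention.attention_bias_ignore_padding(padding))
  return transformer_encoder(
      input_embeddings,
      encoder_self_attention_bias,
      hparams,
      nonpadding=1.0 - padding)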
def transformer_ffn_layer(x,
                          hparams,
                          pad_remover=None,
                          conv_padding="LEFT",
                          nonpadding_mask=None,
                          losses=None,
                          cache=None,
                          decode_loop_step=None,
                          readout_filter_size=0):
  """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model
    pad_remover: an expert_utils.PadRemover object tracking the padding
      positions. If provided, when using convolutional settings, the padding
      is removed before applying the convolution, and restored afterward.
      This can give a significant speedup.
    conv_padding: a string - either "LEFT" or "SAME".
    nonpadding_mask: an optional Tensor with shape [batch_size, length].
      Needed for convolutional layers with "SAME" padding. Contains 1.0 in
      positions corresponding to nonpadding.
    losses: optional list onto which to append extra training losses
    cache: dict, containing tensors which are the results of previous
      attentions, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop. Only used
      for inference on TPU.
    readout_filter_size: if it's greater than 0, then it will be used instead
      of filter_size

  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]

  Raises:
    ValueError: If losses arg is None, but layer generates extra losses.
  """
  ffn_layer = hparams.ffn_layer
  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "relu_dropout_broadcast_dims", "")))
  if ffn_layer == "conv_hidden_relu":
    # Backwards compatibility
    ffn_layer = "dense_relu_dense"
  if ffn_layer == "dense_relu_dense":
    # In simple convolution mode, use `pad_remover` to speed up processing.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE,
        value={
            "filter_size": hparams.filter_size,
            "use_bias": "True",
            "activation": mlperf_log.RELU
        })
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE,
        value={
            "hidden_size": hparams.hidden_size,
            "use_bias": "True",
        })
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_RELU_DROPOUT, value=hparams.relu_dropout)
    if pad_remover:
      original_shape = common_layers.shape_list(x)
      # Collapse `x` across examples, and remove padding positions.
      x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
      x = tf.expand_dims(pad_remover.remove(x), axis=0)
    conv_output = common_layers.dense_relu_dense(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)
    if pad_remover:
      # Restore `conv_output` to the original shape of `x`, including padding.
      conv_output = tf.reshape(
          pad_remover.restore(tf.squeeze(conv_output, axis=0)),
          original_shape)
    return conv_output
  elif ffn_layer == "conv_relu_conv":
    return common_layers.conv_relu_conv(
        x,
        readout_filter_size or hparams.filter_size,
        hparams.hidden_size,
        first_kernel_size=hparams.conv_first_kernel,
        second_kernel_size=1,
        padding=conv_padding,
        nonpadding_mask=nonpadding_mask,
        dropout=hparams.relu_dropout,
        cache=cache,
        decode_loop_step=decode_loop_step)
  elif ffn_layer == "parameter_attention":
    return common_attention.parameter_attention(
        x,
        hparams.parameter_attention_key_channels or hparams.hidden_size,
        hparams.parameter_attention_value_channels or hparams.hidden_size,
        hparams.hidden_size,
        readout_filter_size or hparams.filter_size,
        hparams.num_heads,
        hparams.attention_dropout)
  elif ffn_layer == "conv_hidden_relu_with_sepconv":
    return common_layers.conv_hidden_relu(
        x,
        readout_filter_size or hparams.filter_size,
        hparams.hidden_size,
        kernel_size=(3, 1),
        second_kernel_size=(31, 1),
        padding="LEFT",
        dropout=hparams.relu_dropout)
  elif ffn_layer == "sru":
    return common_layers.sru(x)
  elif ffn_layer == "local_moe_tpu":
    overhead = (
        hparams.moe_overhead_train
        if hparams.mode == tf.estimator.ModeKeys.TRAIN
        else hparams.moe_overhead_eval)
    ret, loss = expert_utils.local_moe_tpu(
        x,
        hparams.filter_size // 2,
        hparams.hidden_size,
        hparams.moe_num_experts,
        overhead=overhead,
        loss_coef=hparams.moe_loss_coef)
    # The extra loss must be collected; otherwise this branch would fall
    # through without returning.
    if losses is None:
      raise ValueError(
          "transformer_ffn_layer with type local_moe_tpu must pass in "
          "a losses list")
    losses.append(loss)
    return ret
  elif ffn_layer == "local_moe":
    overhead = (
        hparams.moe_overhead_train
        if hparams.mode == tf.estimator.ModeKeys.TRAIN
        else hparams.moe_overhead_eval)
    ret, loss = expert_utils.local_moe(
        x,
        True,
        expert_utils.ffn_expert_fn(hparams.hidden_size, [hparams.filter_size],
                                   hparams.hidden_size),
        hparams.moe_num_experts,
        k=hparams.moe_k,
        hparams=hparams)
    losses.append(loss)
    return ret
  else:
    assert ffn_layer == "none"
    return x
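# Hypothetical sketch, not part of the original code, of the PadRemover trick
# used in the dense_relu_dense branch above: padded positions are dropped
# before the position-wise ffn and reinserted afterwards, so the dense layers
# only run on real tokens.
def _pad_remover_demo(x, padding):
  # x: [batch, length, hidden]; padding: [batch, length], 1.0 at pad positions.
  original_shape = common_layers.shape_list(x)
  pad_remover = expert_utils.PadRemover(padding)
  # Collapse batch and length, drop padded rows, add a dummy batch dim.
  flat = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
  compact = tf.expand_dims(pad_remover.remove(flat), axis=0)
  # ... any position-wise layer can run on `compact` here ...
  # Reinsert zeros at the padded positions and restore the original shape.
  return tf.reshape(
      pad_remover.restore(tf.squeeze(compact, axis=0)), original_shape)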
def transformer_ffn_layer(x,
                          hparams,
                          pad_remover=None,
                          conv_padding="LEFT",
                          nonpadding_mask=None):
  """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model
    pad_remover: an expert_utils.PadRemover object tracking the padding
      positions. If provided, when using convolutional settings, the padding
      is removed before applying the convolution, and restored afterward.
      This can give a significant speedup.
    conv_padding: a string - either "LEFT" or "SAME".
    nonpadding_mask: an optional Tensor with shape [batch_size, length].
      Needed for convolutional layers with "SAME" padding. Contains 1.0 in
      positions corresponding to nonpadding.

  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]
  """
  ffn_layer = hparams.ffn_layer
  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "relu_dropout_broadcast_dims", "")))
  if ffn_layer == "conv_hidden_relu":
    # Backwards compatibility
    ffn_layer = "dense_relu_dense"
  if ffn_layer == "dense_relu_dense":
    # In simple convolution mode, use `pad_remover` to speed up processing.
    if pad_remover:
      original_shape = common_layers.shape_list(x)
      # Collapse `x` across examples, and remove padding positions.
      x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
      x = tf.expand_dims(pad_remover.remove(x), axis=0)
    conv_output = common_layers.dense_relu_dense(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)
    if pad_remover:
      # Restore `conv_output` to the original shape of `x`, including padding.
      conv_output = tf.reshape(
          pad_remover.restore(tf.squeeze(conv_output, axis=0)),
          original_shape)
    return conv_output
  elif ffn_layer == "conv_relu_conv":
    return common_layers.conv_relu_conv(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        first_kernel_size=3,
        second_kernel_size=1,
        padding=conv_padding,
        nonpadding_mask=nonpadding_mask,
        dropout=hparams.relu_dropout)
  elif ffn_layer == "parameter_attention":
    return common_attention.parameter_attention(
        x,
        hparams.parameter_attention_key_channels or hparams.hidden_size,
        hparams.parameter_attention_value_channels or hparams.hidden_size,
        hparams.hidden_size,
        hparams.filter_size,
        hparams.num_heads,
        hparams.attention_dropout)
  elif ffn_layer == "conv_hidden_relu_with_sepconv":
    return common_layers.conv_hidden_relu(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        kernel_size=(3, 1),
        second_kernel_size=(31, 1),
        padding="LEFT",
        dropout=hparams.relu_dropout)
  else:
    assert ffn_layer == "none"
    return x
def transformer_ffn_layer(x, hparams, customized_ffn=None, pad_remover=None, conv_padding="LEFT", nonpadding_mask=None, losses=None, cache=None): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparameters for model customized_ffn: an optional ffn_layer string; if set, overrides hparams.ffn_layer pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. conv_padding: a string - either "LEFT" or "SAME". nonpadding_mask: an optional Tensor with shape [batch_size, length]. Needed for convolutional layers with "SAME" padding. Contains 1.0 in positions corresponding to nonpadding. losses: optional list onto which to append extra training losses cache: dict, containing tensors which are the results of previous attentions, used for fast decoding. Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] Raises: ValueError: If losses arg is None, but layer generates extra losses. """ ffn_layer = customized_ffn or hparams.ffn_layer relu_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "relu_dropout_broadcast_dims", ""))) if ffn_layer == "conv_hidden_relu": # Backwards compatibility ffn_layer = "dense_relu_dense" if ffn_layer == "dense_relu_dense": # In simple convolution mode, use `pad_remover` to speed up processing. if pad_remover: original_shape = common_layers.shape_list(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) conv_output = common_layers.dense_relu_dense( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout, dropout_broadcast_dims=relu_dropout_broadcast_dims) if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output elif ffn_layer == "conv_relu_conv": return common_layers.conv_relu_conv( x, hparams.filter_size, hparams.hidden_size, first_kernel_size=hparams.conv_first_kernel, second_kernel_size=1, padding=conv_padding, nonpadding_mask=nonpadding_mask, dropout=hparams.relu_dropout, cache=cache) elif ffn_layer == "parameter_attention": return common_attention.parameter_attention( x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu( x, hparams.filter_size, hparams.hidden_size, kernel_size=(3, 1), second_kernel_size=(31, 1), padding="LEFT", dropout=hparams.relu_dropout) elif ffn_layer == "sru": return common_layers.sru(x) elif ffn_layer == "local_moe_tpu": overhead = (hparams.moe_overhead_train if hparams.mode == tf.estimator.ModeKeys.TRAIN else hparams.moe_overhead_eval) ret, loss = expert_utils.local_moe_tpu( x, hparams.filter_size // 2, hparams.hidden_size, hparams.moe_num_experts, overhead=overhead, loss_coef=hparams.moe_loss_coef) if losses is None: raise ValueError( "transformer_ffn_layer with type local_moe_tpu must pass in " "a losses list") losses.append(loss) return ret else: assert ffn_layer == "none" return x
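# Hedged numpy sketch of what common_layers.dense_relu_dense computes in the
# "dense_relu_dense" branch, ignoring dropout and broadcast dims: two dense
# layers with a ReLU in between, mapping hidden -> filter_size -> hidden.
# Weight initialization here is an illustrative assumption.
import numpy as np

def dense_relu_dense(x, filter_size, hidden_size, rng):
  d = x.shape[-1]
  w1 = rng.standard_normal((d, filter_size)) / np.sqrt(d)
  w2 = rng.standard_normal((filter_size, hidden_size)) / np.sqrt(filter_size)
  return np.maximum(x @ w1, 0.0) @ w2

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 5, 8))   # [batch, length, hidden]
y = dense_relu_dense(x, filter_size=32, hidden_size=8, rng=rng)
print(y.shape)                       # (2, 5, 8)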
def universal_transformer_decoder(decoder_input, encoder_output, decoder_self_attention_bias, encoder_decoder_attention_bias, hparams, name="decoder", nonpadding=None, save_weights_to=None, make_image_summary=True): """Universal Transformer decoder function. Prepares all the arguments and the inputs and passes them to a core universal_transformer_layer to decode. Args: decoder_input: a Tensor encoder_output: a Tensor decoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This is used to mask out padding in convolutional layers. We generally only need this mask for "packed" datasets, because for ordinary datasets, no padding is ever followed by nonpadding. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. Returns: y: the output Tensor extra_output: which can be used to pass extra information to the body """ x = decoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) with tf.variable_scope(name): ffn_unit = functools.partial( universal_transformer_util.transformer_decoder_ffn_unit, hparams=hparams, nonpadding_mask=nonpadding) attention_unit = functools.partial( universal_transformer_util.transformer_decoder_attention_unit, hparams=hparams, encoder_output=encoder_output, decoder_self_attention_bias=decoder_self_attention_bias, encoder_decoder_attention_bias=encoder_decoder_attention_bias, attention_dropout_broadcast_dims=attention_dropout_broadcast_dims, save_weights_to=save_weights_to, make_image_summary=make_image_summary) x, extra_output = universal_transformer_layer(x, hparams, ffn_unit, attention_unit) return common_layers.layer_preprocess(x, hparams), extra_output
def transformer_bidirectional_joint_decoder(left_decoder_output, right_decoder_output, encoder_output, encoder_decoder_attention_bias, hparams, cache=None, decode_loop_step=None, name="decoder", nonpadding=None, save_weights_to=None, make_image_summary=True, losses=None): """A stack of transformer layers. Args: left_decoder_output: a Tensor, the output of the left-to-right decoder stack right_decoder_output: a Tensor, the output of the right-to-left decoder stack encoder_output: a Tensor encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention (see common_attention.attention_bias()) hparams: hyperparameters for model cache: dict, containing tensors which are the results of previous attentions, used for fast decoding. decode_loop_step: An integer, step number of the decoding loop. Only used for inference on TPU. name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This is used to mask out padding in convolutional layers. We generally only need this mask for "packed" datasets, because for ordinary datasets, no padding is ever followed by nonpadding. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. losses: optional list onto which to append extra training losses Returns: y: a Tensor """ x = left_decoder_output + right_decoder_output attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) with tf.variable_scope(name): for layer in range(hparams.num_bidirectional_decoder_joint_layers): layer_name = "joint_layer_%d" % layer layer_cache = cache[layer_name] if cache is not None else None with tf.variable_scope(layer_name): if encoder_output is not None: with tf.variable_scope("encdec_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), encoder_output, encoder_decoder_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, cache=layer_cache, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d")) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, conv_padding="LEFT", nonpadding_mask=nonpadding, losses=losses, cache=layer_cache, decode_loop_step=decode_loop_step) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. return common_layers.layer_preprocess(x, hparams)
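# Illustrative numpy sketch of the layer_preprocess / layer_postprocess
# sandwich used throughout these stacks, assuming the common
# layer_preprocess="n" (normalize) and layer_postprocess="da"
# (dropout + residual add) configuration; dropout is omitted. This is a
# simplified stand-in, not the common_layers implementation.
import numpy as np

def layer_norm(x, eps=1e-6):
  mean = x.mean(-1, keepdims=True)
  var = x.var(-1, keepdims=True)
  return (x - mean) / np.sqrt(var + eps)

def residual_block(x, sublayer):
  y = sublayer(layer_norm(x))   # preprocess: normalize the sublayer input
  return x + y                  # postprocess: residual add

x = np.random.default_rng(0).standard_normal((2, 5, 8))
out = residual_block(x, lambda h: 0.5 * h)  # stand-in for attention/ffn
# Because the stack output is a sum of unnormalized residuals, stacks that
# normalize in layer_preprocess also normalize the final output once more.
out = layer_norm(out)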
def apply_nas_layers(input_tensor, left_inputs, left_layers, left_activations, left_output_dims, left_norms, right_inputs, right_layers, right_activations, right_output_dims, right_norms, combiner_functions, final_combiner_function, num_cells, nonpadding, layer_registry, mask_future, hparams, var_scope, encoder_decoder_attention_bias=None, encoder_cell_outputs=None, decoder_self_attention_bias=None, final_layer_norm=True, enforce_fixed_output_sizes=True): """Applies layers with NasNet search space style branching. Args: input_tensor: Input [batch_size, input_length, hidden_dim] sequence tensor. left_inputs: Int list of left branch hidden layer input indexes. left_layers: String list of left branch layers. left_activations: String list of left branch activations. left_output_dims: String list of left branch output dimensions. left_norms: String list of left branch norms. right_inputs: Int list of right branch hidden layer input indexes. right_layers: String list of right branch layers. right_activations: String list of right branch activations. right_output_dims: String list of right branch output dimensions. right_norms: String list of right branch norms. combiner_functions: String list of branch combining functions. final_combiner_function: String. The final combiner function that combines all the unused hidden layers in a cell. num_cells: The number of cells. This is the number of times the given layers will be repeated. nonpadding: Tensor with 1s at all nonpadding time step positions and 0s everywhere else. layer_registry: The LayerRegistry that holds all valid layers. mask_future: Whether or not to mask future sequence values. hparams: Hyperparameters for the model. var_scope: The variable scope name. encoder_decoder_attention_bias: The attention bias for decoder attending to `encoder_output`. encoder_cell_outputs: List of tensors. The encoder cell outputs, listed in order. decoder_self_attention_bias: The self attention bias for decoders. This needs to be set for decoders. final_layer_norm: Whether or not to apply a final layer_norm to the output of the model. enforce_fixed_output_sizes: Whether or not to automatically resize output dimensions to match the input dimension if `should_alter_output_dim()` returns True. Raises: ValueError: When branching inputs are not of the same length. ValueError: If item in left_norms is not LAYER_NORM_KEY or NO_NORM_KEY. ValueError: If item in right_norms is not LAYER_NORM_KEY or NO_NORM_KEY. Returns: Output of applied layers and list of each cell's outputs in order. """ if not (len(left_inputs) == len(left_layers) == len(left_activations) == len(left_output_dims) == len(left_norms) == len(right_inputs) == len(right_layers) == len(right_activations) == len(right_output_dims) == len(right_norms) == len(combiner_functions)): raise ValueError("All branching inputs must be of the same length.") cell_output = None modified_left_inputs = [ left_inputs[i] for i in range(len(left_inputs)) if left_layers[i] != DEAD_BRANCH_KEY ] modified_right_inputs = [ right_inputs[i] for i in range(len(right_inputs)) if right_layers[i] != DEAD_BRANCH_KEY ] unused_cell_hidden_states = [ i for i in range(len(left_inputs) + 1) if i not in modified_left_inputs and i not in modified_right_inputs ] assert unused_cell_hidden_states cell_outputs = [] with tf.variable_scope(var_scope): dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) for cell_num in range(num_cells): # h_0 is the input tensor. 
# Keep a dict for layer norm states. if cell_output is not None: cell_hidden_states = [cell_output] else: cell_hidden_states = [input_tensor] layer_norm_dict = {} with tf.variable_scope("cell_%d" % cell_num): for i, (left_input, left_layer_name, left_activation_name, left_output_dim, left_norm, right_input, right_layer_name, right_activation_name, right_output_dim, right_norm, combiner) in enumerate( zip(left_inputs, left_layers, left_activations, left_output_dims, left_norms, right_inputs, right_layers, right_activations, right_output_dims, right_norms, combiner_functions)): left_input = int(left_input) right_input = int(right_input) with tf.variable_scope("layer_%d" % i): assert not (left_layer_name == DEAD_BRANCH_KEY and right_layer_name == DEAD_BRANCH_KEY) if left_layer_name != DEAD_BRANCH_KEY: left_raw_input_tensor = cell_hidden_states[left_input] left_input_dim = left_raw_input_tensor.shape.as_list()[-1] if should_alter_output_dim(left_layer_name, enforce_fixed_output_sizes, left_input_dim, left_output_dim): left_output_dim = left_input_dim # First process the left branch. left_tensor = _apply_nas_branch( norm=left_norm, layer_norm_dict=layer_norm_dict, hidden_states=cell_hidden_states, nonpadding=nonpadding, hparams=hparams, input_index=left_input, layer_name=left_layer_name, activation_name=left_activation_name, layer_registry=layer_registry, output_dim=left_output_dim, branch_scope_name="left_%s" % str(i), mask_future=mask_future, dropout_broadcast_dims=dropout_broadcast_dims, encoder_decoder_attention_bias=encoder_decoder_attention_bias, encoder_cell_outputs=encoder_cell_outputs, decoder_self_attention_bias=decoder_self_attention_bias, cell_number=cell_num) if right_layer_name != DEAD_BRANCH_KEY: right_raw_input_tensor = cell_hidden_states[right_input] right_input_dim = right_raw_input_tensor.shape.as_list()[-1] if should_alter_output_dim(right_layer_name, enforce_fixed_output_sizes, right_input_dim, right_output_dim): right_output_dim = right_input_dim # Next process the right branch. right_tensor = _apply_nas_branch( norm=right_norm, layer_norm_dict=layer_norm_dict, hidden_states=cell_hidden_states, nonpadding=nonpadding, hparams=hparams, input_index=right_input, layer_name=right_layer_name, activation_name=right_activation_name, layer_registry=layer_registry, output_dim=right_output_dim, branch_scope_name="right_%s" % str(i), mask_future=mask_future, dropout_broadcast_dims=dropout_broadcast_dims, encoder_decoder_attention_bias=encoder_decoder_attention_bias, encoder_cell_outputs=encoder_cell_outputs, decoder_self_attention_bias=decoder_self_attention_bias, cell_number=cell_num) # Combine the branches. if left_layer_name == DEAD_BRANCH_KEY: hidden_tensor = right_tensor elif right_layer_name == DEAD_BRANCH_KEY: hidden_tensor = left_tensor else: hidden_tensor = COMBINER_FUNCTIONS[combiner]().combine_tensors( [left_tensor, right_tensor]) cell_hidden_states.append(hidden_tensor) states_to_combine = [ cell_hidden_states[j] for j in unused_cell_hidden_states ] cell_output = COMBINER_FUNCTIONS[final_combiner_function]( ).combine_tensors(states_to_combine) cell_outputs.append(cell_output) if final_layer_norm: final_output = common_layers.layer_preprocess(cell_output, hparams) cell_outputs = [ common_layers.layer_preprocess(cell_output, hparams) for cell_output in cell_outputs ] return final_output, cell_outputs else: return cell_output, cell_outputs
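# Toy numpy sketch of the branching scheme in apply_nas_layers (names and
# ops here are illustrative stand-ins, not the NAS search-space registry):
# each step reads two hidden states by index, applies a branch op to each,
# combines them, and appends the result; states never consumed as inputs
# are combined at the end to form the cell output.
import numpy as np

LAYERS = {"identity": lambda x: x, "relu": lambda x: np.maximum(x, 0.0)}
COMBINERS = {"add": lambda a, b: a + b, "mul": lambda a, b: a * b}

def run_cell(input_tensor, left_inputs, left_layers, right_inputs,
             right_layers, combiners, final_combiner="add"):
  states = [input_tensor]
  for li, ll, ri, rl, c in zip(left_inputs, left_layers, right_inputs,
                               right_layers, combiners):
    left = LAYERS[ll](states[li])
    right = LAYERS[rl](states[ri])
    states.append(COMBINERS[c](left, right))
  used = set(left_inputs) | set(right_inputs)
  unused = [s for j, s in enumerate(states) if j not in used]
  out = unused[0]
  for s in unused[1:]:
    out = COMBINERS[final_combiner](out, s)
  return out

x = np.random.default_rng(0).standard_normal((2, 4, 8))
y = run_cell(x, [0], ["relu"], [0], ["identity"], ["add"])
print(y.shape)  # (2, 4, 8)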
def transformer_ffn_layer(x, hparams, pad_remover=None, conv_padding="LEFT", nonpadding_mask=None, losses=None, cache=None, decode_loop_step=None, readout_filter_size=0): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparameters for model pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. conv_padding: a string - either "LEFT" or "SAME". nonpadding_mask: an optional Tensor with shape [batch_size, length]. needed for convolutional layers with "SAME" padding. Contains 1.0 in positions corresponding to nonpadding. losses: optional list onto which to append extra training losses cache: dict, containing tensors which are the results of previous attentions, used for fast decoding. decode_loop_step: An integer, step number of the decoding loop. Only used for inference on TPU. readout_filter_size: if it's greater than 0, then it will be used instead of filter_size Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] Raises: ValueError: If losses arg is None, but layer generates extra losses. """ ffn_layer = hparams.ffn_layer relu_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "relu_dropout_broadcast_dims", ""))) if ffn_layer == "conv_hidden_relu": # Backwards compatibility ffn_layer = "dense_relu_dense" if ffn_layer == "dense_relu_dense": # In simple convolution mode, use `pad_remover` to speed up processing. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE, value={ "filter_size": hparams.filter_size, "use_bias": "True", "activation": mlperf_log.RELU }) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE, value={ "hidden_size": hparams.hidden_size, "use_bias": "True", }) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_RELU_DROPOUT, value=hparams.relu_dropout) if pad_remover: original_shape = common_layers.shape_list(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) conv_output = common_layers.dense_relu_dense( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout, dropout_broadcast_dims=relu_dropout_broadcast_dims) if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. 
conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output elif ffn_layer == "conv_relu_conv": return common_layers.conv_relu_conv( x, readout_filter_size or hparams.filter_size, hparams.hidden_size, first_kernel_size=hparams.conv_first_kernel, second_kernel_size=1, padding=conv_padding, nonpadding_mask=nonpadding_mask, dropout=hparams.relu_dropout, cache=cache, decode_loop_step=decode_loop_step) elif ffn_layer == "parameter_attention": return common_attention.parameter_attention( x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, hparams.hidden_size, readout_filter_size or hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu( x, readout_filter_size or hparams.filter_size, hparams.hidden_size, kernel_size=(3, 1), second_kernel_size=(31, 1), padding="LEFT", dropout=hparams.relu_dropout) elif ffn_layer == "sru": return common_layers.sru(x) elif ffn_layer == "local_moe_tpu": overhead = ( hparams.moe_overhead_train if hparams.mode == tf.estimator.ModeKeys.TRAIN else hparams.moe_overhead_eval) ret, loss = expert_utils.local_moe_tpu( x, hparams.filter_size // 2, hparams.hidden_size, hparams.moe_num_experts, overhead=overhead, loss_coef=hparams.moe_loss_coef) losses.append(loss) return ret elif ffn_layer == "local_moe": overhead = ( hparams.moe_overhead_train if hparams.mode == tf.estimator.ModeKeys.TRAIN else hparams.moe_overhead_eval) ret, loss = expert_utils.local_moe( x, True, expert_utils.ffn_expert_fn(hparams.hidden_size, [hparams.filter_size], hparams.hidden_size), hparams.moe_num_experts, k=hparams.moe_k, hparams=hparams) losses.append(loss) return ret else: assert ffn_layer == "none" return x
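# Minimal numpy sketch of the mixture-of-experts contract used by the
# local_moe branches above: the layer returns its output and appends an
# auxiliary load-balancing loss to the caller-supplied `losses` list (which
# is why callers must pass one in). The top-1 gating and the loss term here
# are simplified stand-ins, not expert_utils.
import numpy as np

def tiny_moe(x, experts, gate_w, losses):
  logits = x @ gate_w                          # [tokens, num_experts]
  probs = np.exp(logits - logits.max(-1, keepdims=True))
  probs /= probs.sum(-1, keepdims=True)
  choice = probs.argmax(-1)                    # top-1 routing
  out = np.stack([experts[c](row) for row, c in zip(x, choice)])
  # Encourage balanced routing: penalize squared mean gate probability.
  losses.append(float((probs.mean(0) ** 2).sum()))
  return out

rng = np.random.default_rng(0)
x = rng.standard_normal((6, 4))                # 6 tokens, hidden=4
experts = [lambda r: r * 2.0, lambda r: r + 1.0]
losses = []
y = tiny_moe(x, experts, rng.standard_normal((4, 2)), losses)
print(y.shape, len(losses))                    # (6, 4) 1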
def transformer_ffn_layer(x, hparams, pad_remover=None): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparameters for model pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] Raises: ValueError: If hparams.ffn_layer is anything other than dense_relu_dense. """ ffn_layer = hparams.ffn_layer if ffn_layer != "dense_relu_dense": raise ValueError( "sparse transformer only supports dense_relu_dense ffn.") relu_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "relu_dropout_broadcast_dims", ""))) # In simple convolution mode, use `pad_remover` to speed up processing. mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE, value={ "filter_size": hparams.filter_size, "use_bias": "True", "activation": mlperf_log.RELU }) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE, value={ "hidden_size": hparams.hidden_size, "use_bias": "True", }) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_RELU_DROPOUT, value=hparams.relu_dropout) if pad_remover: original_shape = common_layers.shape_list(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) initial_sparsity = None if hparams.get("load_masks_from"): initial_sparsity = hparams.get("initial_sparsity") conv_output = sparse_layers.dense_relu_dense( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout, dropout_broadcast_dims=relu_dropout_broadcast_dims, sparsity_technique=hparams.get("sparsity_technique"), threshold=hparams.get("log_alpha_threshold"), training=hparams.get("mode") == tf_estimator.ModeKeys.TRAIN, clip_alpha=hparams.get("clip_log_alpha"), initial_sparsity=initial_sparsity) if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output
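# Hedged numpy sketch of the sparsity idea behind a sparse dense layer such
# as sparse_layers.dense_relu_dense: the weight matrix is multiplied by a
# binary mask derived from a threshold. Simple magnitude pruning is shown
# here as an assumed example; the real sparsity_technique hparam selects
# among several schemes (e.g. variational dropout with a log-alpha threshold).
import numpy as np

def pruned_dense(x, w, keep_fraction=0.5):
  threshold = np.quantile(np.abs(w), 1.0 - keep_fraction)
  mask = (np.abs(w) >= threshold).astype(w.dtype)
  return x @ (w * mask)

rng = np.random.default_rng(0)
x = rng.standard_normal((3, 8))
w = rng.standard_normal((8, 8))
y = pruned_dense(x, w, keep_fraction=0.25)   # keep the largest 25% of weights
print(y.shape)                               # (3, 8)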
def hierarchical_attention_network_encoder( encoder_input, encoder_self_attention_bias, contexts, context_self_attention_biases, features, hparams, name="hierarchical_attention_network_encoder", save_weights_to=None, make_image_summary=True, losses=None): input_x = encoder_input context_xs = {} for context_name in contexts: context_xs[context_name] = contexts[context_name] context_paddings = {} context_nonpaddings = {} context_pad_removers = {} attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) with tf.variable_scope(name, reuse=tf.AUTO_REUSE): input_padding = common_attention.attention_bias_to_padding( encoder_self_attention_bias) input_nonpadding = 1.0 - input_padding for context_name in context_self_attention_biases: context_paddings[ context_name] = common_attention.attention_bias_to_padding( context_self_attention_biases[context_name]) context_nonpaddings[ context_name] = 1.0 - context_paddings[context_name] input_pad_remover = None for context_name in context_paddings: context_pad_removers[context_name] = None if hparams.use_pad_remover and not common_layers.is_xla_compiled(): input_pad_remover = expert_utils.PadRemover(input_padding) for context_name in context_paddings: context_pad_removers[context_name] = expert_utils.PadRemover( context_paddings[context_name]) temp_hparam = tf.contrib.training.HParams( ) # copy hparams except num_hidden_layers -> num_hidden_layers - 1 for key, val in hparams.values().items(): temp_hparam.add_hparam(key, val) temp_hparam.set_hparam("num_hidden_layers", hparams.num_hidden_layers - 1) encoder_output = transformer_with_contexts_layers.transformer_encoder( input_x, encoder_self_attention_bias, temp_hparam, nonpadding=features_to_nonpadding(features, "inputs"), save_weights_to=save_weights_to, make_image_summary=make_image_summary) context_encoded_outputs = {} for context_name in context_xs: context_encoded_outputs[ context_name] = transformer_with_contexts_layers.transformer_encoder( context_xs[context_name], context_self_attention_biases[context_name], hparams, nonpadding=features_to_nonpadding(features, context_name), save_weights_to=save_weights_to, make_image_summary=make_image_summary) with tf.variable_scope('word_abstraction', reuse=tf.AUTO_REUSE): encoder_word_level_query = common_layers.dense( encoder_output, hparams.hidden_size) # q_w = f_w(h_t) encoder_word_level_abstraction = {} for context_name in context_encoded_outputs: encoder_word_level_abstraction[ context_name] = transformer_with_contexts_layers.multihead_attention( common_layers.layer_preprocess( encoder_word_level_query, hparams), context_encoded_outputs[context_name], context_self_attention_biases[context_name], hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, save_weights_to=save_weights_to, make_image_summary=make_image_summary, max_relative_position=hparams.max_relative_position, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d")) # s^j, sentence_information = tf.concat([ encoder_word_level_abstraction[context_name] for context_name in encoder_word_level_abstraction ], axis=1) with tf.variable_scope('sentence_abstraction', reuse=tf.AUTO_REUSE): encoder_sentence_level_query = common_layers.dense( encoder_output, 
hparams.hidden_size) # q_s = f_s(h_t) context_padding = common_attention.embedding_to_padding( sentence_information) ignore_padding = common_attention.attention_bias_ignore_padding( context_padding) contextual_information = transformer_with_contexts_layers.multihead_attention( common_layers.layer_preprocess(encoder_sentence_level_query, hparams), sentence_information, ignore_padding, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, save_weights_to=save_weights_to, make_image_summary=make_image_summary, max_relative_position=hparams.max_relative_position, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d") ) # MultiHead(q_s, s^j), [batch, encoder_length, hidden_dim] contextual_information = common_layers.dense_relu_dense( contextual_information, hparams.filter_size, hparams.hidden_size) with tf.variable_scope('context_gating', reuse=tf.AUTO_REUSE): gate_lambda = tf.nn.sigmoid( common_layers.dense(contextual_information, hparams.hidden_size) + common_layers.dense(encoder_output, hparams.hidden_size)) encoder_output = gate_lambda * encoder_output + ( 1 - gate_lambda) * contextual_information return common_layers.layer_preprocess(encoder_output, hparams)
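# Numpy sketch of the context gate at the end of the encoder above:
# lambda = sigmoid(W_c c + W_h h), output = lambda * h + (1 - lambda) * c,
# so each position interpolates between its own encoding h and the
# contextual summary c. Weights here are random stand-ins for the two
# common_layers.dense projections.
import numpy as np

def context_gate(h, c, w_h, w_c):
  gate = 1.0 / (1.0 + np.exp(-(c @ w_c + h @ w_h)))
  return gate * h + (1.0 - gate) * c

rng = np.random.default_rng(0)
hidden = 8
h = rng.standard_normal((2, 5, hidden))   # encoder output
c = rng.standard_normal((2, 5, hidden))   # contextual information
out = context_gate(h, c, rng.standard_normal((hidden, hidden)),
                   rng.standard_normal((hidden, hidden)))
print(out.shape)                          # (2, 5, 8)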
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder", nonpadding=None, save_weights_to=None, make_image_summary=True): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover (efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. Returns: y: a Tensor """ x = encoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: padding = common_attention.attention_bias_to_padding( encoder_self_attention_bias) nonpadding = 1.0 - padding pad_remover = None if hparams.use_pad_remover and not common_layers.is_on_tpu(): pad_remover = expert_utils.PadRemover(padding) for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, save_weights_to=save_weights_to, max_relative_position=hparams.max_relative_position, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, pad_remover, conv_padding="SAME", nonpadding_mask=nonpadding) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. return common_layers.layer_preprocess(x, hparams)
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder", nonpadding=None, save_weights_to=None, make_image_summary=True, losses=None, attn_bias_for_padding=None): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover (efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. losses: optional list onto which to append extra training losses attn_bias_for_padding: Padded attention bias in case a unidirectional encoder is being used where future attention is masked. Returns: y: a Tensor """ x = encoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=hparams.num_encoder_layers or hparams.num_hidden_layers) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT, value=hparams.attention_dropout) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE, value={ "use_bias": "false", "num_heads": hparams.num_heads, "hidden_size": hparams.hidden_size }) with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: attention_bias = encoder_self_attention_bias if attn_bias_for_padding is not None: attention_bias = attn_bias_for_padding padding = common_attention.attention_bias_to_padding( attention_bias) nonpadding = 1.0 - padding pad_remover = None if hparams.use_pad_remover and not common_layers.is_xla_compiled(): pad_remover = expert_utils.PadRemover(padding) for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): if layer < hparams.get("num_area_layers", 0): max_area_width = hparams.get("max_area_width", 1) max_area_height = hparams.get("max_area_height", 1) memory_height = hparams.get("memory_height", 1) else: max_area_width = 1 max_area_height = 1 memory_height = 1 y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d"), activation_dtype=hparams.get("activation_dtype", "float32"), weight_dtype=hparams.get("weight_dtype", "float32"),
hard_attention_k=hparams.get("hard_attention_k", 0), gumbel_noise_weight=hparams.get( "gumbel_noise_weight", 0.0), max_area_width=max_area_width, max_area_height=max_area_height, memory_height=memory_height, area_key_mode=hparams.get("area_key_mode", "none"), area_value_mode=hparams.get("area_value_mode", "none"), training=(hparams.get("mode", tf.estimator.ModeKeys.TRAIN) == tf.estimator.ModeKeys.TRAIN)) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer(common_layers.layer_preprocess( x, hparams), hparams, pad_remover, conv_padding="SAME", nonpadding_mask=nonpadding, losses=losses) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NORM, value={"hidden_size": hparams.hidden_size}) return common_layers.layer_preprocess(x, hparams)
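# Sketch of how padding is recovered from an additive attention bias when
# `nonpadding` is not passed in: padded positions carry a large negative
# bias, so thresholding the bias recovers the padding indicator. This is a
# numpy stand-in for common_attention.attention_bias_to_padding.
import numpy as np

NEG_INF = -1e9

def attention_bias_from_padding(padding):      # [batch, length] of 0/1
  return padding[:, None, None, :] * NEG_INF   # [batch, 1, 1, length]

def padding_from_attention_bias(bias):
  return (bias[:, 0, 0, :] < -1.0).astype(np.float64)

padding = np.array([[0., 0., 1.], [0., 1., 1.]])
bias = attention_bias_from_padding(padding)
recovered = padding_from_attention_bias(bias)
assert np.array_equal(recovered, padding)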
def evolved_transformer_decoder(decoder_input, encoder_output, decoder_self_attention_bias, encoder_decoder_attention_bias, hparams, cache=None, decode_loop_step=None, name="decoder", nonpadding=None, save_weights_to=None, make_image_summary=True, losses=None): """Evolved Transformer decoder. See arxiv.org/abs/1901.11117 for more details. Args: decoder_input: a Tensor. encoder_output: a Tensor. decoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()). encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention (see common_attention.attention_bias()). hparams: hyperparameters for model. cache: dict, containing tensors which are the results of previous layers, used for fast decoding. decode_loop_step: An integer, step number of the decoding loop. Only used for inference on TPU. name: a string. nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This is used to mask out padding in convolutional layers. We generally only need this mask for "packed" datasets, because for ordinary datasets, no padding is ever followed by nonpadding. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. losses: Not supported. Returns: Decoder output tensor. """ del losses num_trainable_top_decoder_layers = hparams.get( "num_trainable_top_decoder_layers", -1) # -1 means train all weights. if num_trainable_top_decoder_layers >= 0: encoder_output = tf.stop_gradient(encoder_output) attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) with tf.variable_scope(name): hidden_state = decoder_input num_layers = hparams.num_decoder_layers or hparams.num_hidden_layers for layer in range(num_layers): if num_trainable_top_decoder_layers == num_layers - layer: hidden_state = tf.stop_gradient(hidden_state) layer_name = "layer_%d" % layer layer_cache = cache[layer_name] if cache is not None else None with tf.variable_scope(layer_name): with tf.variable_scope(_SIXTEEN_HEAD_ATTENTION_NAME): residual_state = hidden_state hidden_state = common_layers.layer_preprocess(hidden_state, hparams) attention_cache = layer_cache[ _SIXTEEN_HEAD_ATTENTION_NAME] if layer_cache is not None else None left_state = common_attention.multihead_attention( hidden_state, None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, _capped_double_heads(hparams.num_heads), hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, cache=attention_cache, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), decode_loop_step=decode_loop_step, vars_3d=hparams.get("attention_variables_3d"), activation_dtype=hparams.get("activation_dtype", "float32"), weight_dtype=hparams.get("weight_dtype", "float32")) if encoder_output is not None: with tf.variable_scope(_FIRST_ATTEND_TO_ENCODER_NAME): attention_cache = ( 
layer_cache[_FIRST_ATTEND_TO_ENCODER_NAME] if layer_cache is not None else None) right_state = common_attention.multihead_attention( hidden_state, encoder_output, encoder_decoder_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, cache=attention_cache, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d"), activation_dtype=hparams.get("activation_dtype", "float32"), weight_dtype=hparams.get("weight_dtype", "float32")) left_state = tf.nn.dropout(left_state, 1 - hparams.layer_prepostprocess_dropout) right_state = tf.nn.dropout( right_state, 1 - hparams.layer_prepostprocess_dropout) hidden_state = residual_state + left_state + right_state else: hidden_state = common_layers.layer_postprocess( residual_state, left_state, hparams) with tf.variable_scope(_CONV_BRANCHES_NAME): residual_state = hidden_state hidden_state = common_layers.layer_preprocess(hidden_state, hparams) if nonpadding is not None: # Mask padding from conv layers. mask = tf.tile( tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size]) hidden_state *= mask if layer_cache: if decode_loop_step is None: hidden_state = layer_cache[ _CONV_BRANCHES_FIRST_LAYER_NAME] = tf.concat( [ layer_cache[_CONV_BRANCHES_FIRST_LAYER_NAME], hidden_state ], axis=1)[:, -1 * _DECODER_LEFT_CONV_PADDING - 1:, :] left_state = hidden_state right_state = hidden_state[:, _DECODER_LEFT_CONV_PADDING - _DECODER_RIGHT_CONV_PADDING:, :] else: # Inplace update is required for inference on TPU. # Inplace_ops only supports inplace_update on the first dimension. tmp = tf.transpose( layer_cache[_CONV_BRANCHES_FIRST_LAYER_NAME], perm=[1, 0, 2]) tmp = tf.expand_dims(tmp, axis=1) tmp = inplace_ops.alias_inplace_update( tmp, decode_loop_step * tf.shape(hidden_state)[1] + _DECODER_LEFT_CONV_PADDING, tf.transpose(hidden_state, perm=[1, 0, 2])) tmp = tf.squeeze(tmp, axis=1) hidden_state = layer_cache[ _CONV_BRANCHES_FIRST_LAYER_NAME] = tf.transpose( tmp, perm=[1, 0, 2]) batch_size = hidden_state.shape.as_list()[0] left_state = tf.slice(hidden_state, [0, decode_loop_step, 0], [ batch_size, _DECODER_LEFT_CONV_PADDING + 1, hparams.hidden_size ]) right_state = tf.slice(hidden_state, [ 0, decode_loop_step + _DECODER_LEFT_CONV_PADDING - _DECODER_RIGHT_CONV_PADDING, 0 ], [ batch_size, _DECODER_RIGHT_CONV_PADDING + 1, hparams.hidden_size ]) else: # No caching. 
left_state = tf.pad( hidden_state, paddings=[[0, 0], [_DECODER_LEFT_CONV_PADDING, 0], [0, 0]]) right_state = tf.pad( hidden_state, paddings=[[0, 0], [_DECODER_RIGHT_CONV_PADDING, 0], [0, 0]]) left_output_dim = int(hparams.hidden_size * 2) separable_conv_11x1 = tf.layers.SeparableConv1D( left_output_dim, 11, padding="VALID", name="separable_conv11x1", activation=tf.nn.relu) left_state = separable_conv_11x1.apply(left_state) left_state = tf.nn.dropout(left_state, 1 - hparams.layer_prepostprocess_dropout) right_output_dim = int(hparams.hidden_size / 2) separable_conv_7x1_1 = tf.layers.SeparableConv1D( right_output_dim, 7, padding="VALID", name="separable_conv_7x1_1") right_state = separable_conv_7x1_1.apply(right_state) right_state = tf.nn.dropout(right_state, 1 - hparams.layer_prepostprocess_dropout) right_state = tf.pad( right_state, [[0, 0], [0, 0], [0, left_output_dim - right_output_dim]], constant_values=0) hidden_state = left_state + right_state hidden_state = common_layers.layer_preprocess(hidden_state, hparams) if nonpadding is not None: # Mask padding from conv layers. mask = tf.tile( tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size * 2]) hidden_state *= mask if layer_cache: if decode_loop_step is None: hidden_state = layer_cache[ _CONV_BRANCHES_SECOND_LAYER_NAME] = tf.concat( [ layer_cache[_CONV_BRANCHES_SECOND_LAYER_NAME], hidden_state ], axis=1)[:, -1 * _DECODER_FINAL_CONV_PADDING - 1:, :] else: # Inplace update is required for inference on TPU. # Inplace_ops only supports inplace_update on the first dimension. tmp = tf.transpose( layer_cache[_CONV_BRANCHES_SECOND_LAYER_NAME], perm=[1, 0, 2]) tmp = tf.expand_dims(tmp, axis=1) tmp = inplace_ops.alias_inplace_update( tmp, (decode_loop_step + _DECODER_FINAL_CONV_PADDING) * tf.shape(hidden_state)[1], tf.transpose(hidden_state, perm=[1, 0, 2])) tmp = tf.squeeze(tmp, axis=1) hidden_state = layer_cache[ _CONV_BRANCHES_SECOND_LAYER_NAME] = tf.transpose( tmp, perm=[1, 0, 2]) batch_size = hidden_state.shape.as_list()[0] hidden_state = tf.slice(hidden_state, [0, decode_loop_step, 0], [ batch_size, _DECODER_FINAL_CONV_PADDING + 1, hparams.hidden_size * 2 ]) else: hidden_state = tf.pad( hidden_state, paddings=[[0, 0], [_DECODER_FINAL_CONV_PADDING, 0], [0, 0]]) separable_conv_7x1_2 = tf.layers.SeparableConv1D( hparams.hidden_size, 7, padding="VALID", name="separable_conv_7x1_2") hidden_state = separable_conv_7x1_2.apply(hidden_state) hidden_state = common_layers.layer_postprocess( residual_state, hidden_state, hparams) with tf.variable_scope(_VANILLA_ATTENTION_NAME): residual_state = hidden_state hidden_state = common_layers.layer_preprocess(hidden_state, hparams) attention_cache = layer_cache[ _VANILLA_ATTENTION_NAME] if layer_cache is not None else None hidden_state = common_attention.multihead_attention( hidden_state, None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, cache=attention_cache, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), decode_loop_step=decode_loop_step, vars_3d=hparams.get("attention_variables_3d"), 
activation_dtype=hparams.get("activation_dtype", "float32"), weight_dtype=hparams.get("weight_dtype", "float32")) hidden_state = common_layers.layer_postprocess( residual_state, hidden_state, hparams) if encoder_output is not None: with tf.variable_scope(_SECOND_ATTEND_TO_ENCODER_NAME): residual_state = hidden_state hidden_state = common_layers.layer_preprocess(hidden_state, hparams) attention_cache = ( layer_cache[_SECOND_ATTEND_TO_ENCODER_NAME] if layer_cache is not None else None) hidden_state = common_attention.multihead_attention( hidden_state, encoder_output, encoder_decoder_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, cache=attention_cache, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d"), activation_dtype=hparams.get("activation_dtype", "float32"), weight_dtype=hparams.get("weight_dtype", "float32")) hidden_state = common_layers.layer_postprocess( residual_state, hidden_state, hparams) with tf.variable_scope("dense_layers"): residual_state = hidden_state hidden_state = common_layers.layer_preprocess(hidden_state, hparams) hidden_state = tf.layers.dense( hidden_state, int(hparams.hidden_size * 4), activation=tf.nn.swish) hidden_state = tf.nn.dropout(hidden_state, 1 - hparams.layer_prepostprocess_dropout) hidden_state = common_layers.layer_preprocess(hidden_state, hparams) hidden_state = tf.layers.dense(hidden_state, hparams.hidden_size) hidden_state = common_layers.layer_postprocess( residual_state, hidden_state, hparams) decoder_output = common_layers.layer_preprocess(hidden_state, hparams) if num_trainable_top_decoder_layers == 0: decoder_output = tf.stop_gradient(decoder_output) return decoder_output
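# Simplified numpy sketch of the decoder conv branches above: convolutions
# stay causal by left-padding the sequence with (kernel - 1) zeros, so each
# output position only sees current and past positions. The depthwise
# weights here are random stand-ins; the real layers are SeparableConv1D
# with the stated kernel sizes (11 and 7) and output dims (2d and d/2).
import numpy as np

def causal_depthwise_conv(x, kernel):          # x: [length, channels]
  k = kernel.shape[0]
  padded = np.pad(x, [(k - 1, 0), (0, 0)])     # LEFT padding only
  out = np.zeros_like(x)
  for t in range(x.shape[0]):
    out[t] = (padded[t:t + k] * kernel).sum(0)
  return out

rng = np.random.default_rng(0)
x = rng.standard_normal((5, 8))                # length=5, hidden=8
y = causal_depthwise_conv(x, rng.standard_normal((7, 8)))
assert y.shape == x.shape                      # causal: y[t] uses x[:t+1] only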
def _layer_stack(mp, inputs, self_attention_bias, layers, hparams, encoder_output=None, encoder_decoder_attention_bias=None): """A stack of layers. Args: mp: a Parallelism object inputs: a list of Tensors self_attention_bias: list of bias Tensor for self-attention (see common_attention.attention_bias()) layers: a string hparams: hyperparameters for model encoder_output: optional list of tensors encoder_decoder_attention_bias: optional list of tensors Returns: y: a list of Tensors """ layers = layers.strip(",").split(",") # scaled_dot_product_attention_with_projections uses a 3d attention bias # (no heads), where multihead_attention uses 4d attention bias. self_attention_bias_3d = mp(tf.squeeze, self_attention_bias, 1) if encoder_decoder_attention_bias is not None: encoder_decoder_attention_bias_3d = mp( tf.squeeze, encoder_decoder_attention_bias, 1) relu_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "relu_dropout_broadcast_dims", ""))) mix_size = int(hparams.mix_fraction * hparams.hidden_size) accumulator = inputs x = inputs for layer_num, layer_type in enumerate(layers): with tf.variable_scope("%s_%d" % (layer_type, layer_num)): tf.logging.info("%s_%d" % (layer_type, layer_num)) if layer_type == "a": # accumulate accumulator = mp(tf.add, x, accumulator) x = accumulator elif layer_type == "n": # normalize x = mp(common_layers.apply_norm, x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon) elif layer_type == "d": # dropout x = mp(tf.nn.dropout, x, 1.0 - hparams.layer_prepostprocess_dropout) elif layer_type == "m": if mix_size > 0: # mix across shards def _split(t): return tuple(tf.split( t, [mix_size, hparams.hidden_size - mix_size], 2)) to_mix, to_keep = mp(_split, x) mixed = expert_utils.all_reduce_ring(to_mix, mp) mixed = mp(tf.multiply, mixed, mp.n ** -0.5) x = mp(lambda a, b: tf.concat([a, b], 2), mixed, to_keep) elif layer_type == "att": # single-head attention q = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, name="q_transform") x = mp( common_attention.scaled_dot_product_attention_simple, q, x, x, self_attention_bias_3d) x = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, name="o_transform") elif layer_type == "enc-att": # single-head attention over encoder q = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, name="q_transform") assert encoder_output is not None x = mp( common_attention.scaled_dot_product_attention_simple, q, encoder_output, encoder_output, encoder_decoder_attention_bias_3d) x = mp(tf.layers.dense, x, hparams.hidden_size, use_bias=False, name="o_transform") elif layer_type == "multihead-att": # multi-head attention x = mp( common_attention.multihead_attention, x, None, self_attention_bias, # bias hparams.multihead_attention_key_channels or hparams.hidden_size, hparams.multihead_attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.multihead_attention_num_heads, hparams.attention_dropout) elif layer_type == "enc-multihead-att": # multi-head attention x = mp( common_attention.multihead_attention, x, encoder_output, encoder_decoder_attention_bias, # bias hparams.multihead_attention_key_channels or hparams.hidden_size, hparams.multihead_attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.multihead_attention_num_heads, hparams.attention_dropout) elif layer_type == "ffn": x = mp( common_layers.dense_relu_dense, x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout, 
dropout_broadcast_dims=[relu_dropout_broadcast_dims] * mp.n) else: assert False, "unknown sublayer %s" % layer_type return x
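# Toy version of the layer-string dispatch in _layer_stack: the stack is
# described by a comma-separated spec such as "n,ffn,d,a", and each token
# selects a sublayer. The ops are numpy stand-ins for the real sublayers,
# and sharding via the Parallelism object is omitted.
import numpy as np

def run_layer_string(x, spec):
  accumulator = x
  for token in spec.strip(",").split(","):
    if token == "a":                         # accumulate (residual)
      accumulator = x + accumulator
      x = accumulator
    elif token == "n":                       # normalize
      x = (x - x.mean(-1, keepdims=True)) / (x.std(-1, keepdims=True) + 1e-6)
    elif token == "d":                       # dropout (disabled at eval)
      pass
    elif token == "ffn":                     # stand-in feed-forward
      x = np.maximum(x, 0.0)
    else:
      raise AssertionError("unknown sublayer %s" % token)
  return x

x = np.random.default_rng(0).standard_normal((2, 4, 8))
y = run_layer_string(x, "n,ffn,d,a,n,ffn,d,a")
print(y.shape)  # (2, 4, 8)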
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder", nonpadding=None, save_weights_to=None, make_image_summary=True): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover (efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. Returns: y: a Tensor """ x = encoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=hparams.num_encoder_layers or hparams.num_hidden_layers) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT, value=hparams.attention_dropout) mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE, value={ "use_bias": "false", "num_heads": hparams.num_heads, "hidden_size": hparams.hidden_size }) with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: padding = common_attention.attention_bias_to_padding( encoder_self_attention_bias) nonpadding = 1.0 - padding pad_remover = None if hparams.use_pad_remover and not common_layers.is_xla_compiled(): pad_remover = expert_utils.PadRemover(padding) for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): initial_sparsity = None if hparams.get("load_masks_from"): initial_sparsity = hparams.get("initial_sparsity") with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = sparse_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d"), sparsity_technique=hparams.get("sparsity_technique"), threshold=hparams.get("log_alpha_threshold"), training=hparams.get("mode") == tf_estimator.ModeKeys.TRAIN, clip_alpha=hparams.get("clip_log_alpha"), initial_sparsity=initial_sparsity, split_heads=hparams.get("split_heads")) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, pad_remover) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very
large, being the sum of # a whole stack of unnormalized layer outputs. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NORM, value={"hidden_size": hparams.hidden_size}) return common_layers.layer_preprocess(x, hparams)
def moe_transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="moe-encoder", nonpadding=None, save_weights_to=None, make_image_summary=True, attn_bias_for_padding=None): """A stack of transformer moe layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover (efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. attn_bias_for_padding: Padded attention bias in case a unidirectional encoder is being used where future attention is masked. Returns: y: a Tensor """ x = encoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS, value=hparams.num_encoder_layers or hparams.num_hidden_layers) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT, value=hparams.attention_dropout) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_ATTENTION_DENSE, value={ "use_bias": "false", "num_heads": hparams.num_heads, "hidden_size": hparams.hidden_size }) with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: attention_bias = encoder_self_attention_bias if attn_bias_for_padding is not None: attention_bias = attn_bias_for_padding padding = common_attention.attention_bias_to_padding(attention_bias) nonpadding = 1.0 - padding for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): x = moe_transformer_encoder_layer( layer, x, encoder_self_attention_bias, hparams, attention_dropout_broadcast_dims, save_weights_to, make_image_summary) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_NORM, value={"hidden_size": hparams.hidden_size}) return common_layers.layer_preprocess(x, hparams)