# Requires TensorFlow 1.x; `common_attention` and `common_layers` are the
# attention/layer utility modules from the tensor2tensor codebase.
import tensorflow as tf
from six.moves import xrange  # pylint: disable=redefined-builtin


def transformer_ffn_layer(x, hparams):
  """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model

  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]
  """
  if hparams.ffn_layer == "conv_hidden_relu":
    # Position-wise feed-forward: two linear layers with a ReLU in between.
    return common_layers.conv_hidden_relu(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout)
  elif hparams.ffn_layer == "parameter_attention":
    # Attention over a learned set of parameter vectors instead of a dense FFN.
    return common_attention.parameter_attention(
        x,
        hparams.parameter_attention_key_channels or hparams.hidden_size,
        hparams.parameter_attention_value_channels or hparams.hidden_size,
        hparams.hidden_size,
        hparams.filter_size,
        hparams.num_heads,
        hparams.attention_dropout)
  elif hparams.ffn_layer == "conv_hidden_relu_with_sepconv":
    # Convolutional variant: kernels span the length dimension, with LEFT
    # padding so each position only sees earlier positions.
    return common_layers.conv_hidden_relu(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        kernel_size=(3, 1),
        second_kernel_size=(31, 1),
        padding="LEFT",
        dropout=hparams.relu_dropout)
  else:
    assert hparams.ffn_layer == "none"
    return x
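# A minimal usage sketch, not part of the original module: `_example_ffn_call`
# and the hparams values below are hypothetical, chosen only to exercise the
# "conv_hidden_relu" branch; `transformer_ffn_layer` reads just the hparams
# fields that the selected branch needs.
def _example_ffn_call(x):
  example_hparams = tf.contrib.training.HParams(
      ffn_layer="conv_hidden_relu",
      filter_size=2048,
      hidden_size=512,
      relu_dropout=0.1)
  # x: [batch_size, length, hidden_size]; the output keeps the same shape.
  return transformer_ffn_layer(x, example_hparams)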
def attention_lm_decoder(decoder_input,
                         residual_fn,
                         decoder_self_attention_bias,
                         hparams,
                         name="decoder"):
  """A stack of attention_lm layers.

  Args:
    decoder_input: a Tensor
    residual_fn: a function from (layer_input, layer_output) -> combined_output
    decoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string

  Returns:
    y: a Tensor
  """
  x = decoder_input
  # Summaries don't work in multi-problem setting yet.
  summaries = "problems" not in hparams.values() or len(hparams.problems) == 1
  with tf.variable_scope(name):
    for layer in xrange(hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        # Masked self-attention sublayer over the decoder input.
        x = residual_fn(
            x,
            common_attention.multihead_attention(
                x,
                None,
                decoder_self_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                summaries=summaries,
                name="decoder_self_attention"))
        # Position-wise feed-forward sublayer.
        x = residual_fn(
            x,
            common_layers.conv_hidden_relu(
                x,
                hparams.filter_size,
                hparams.hidden_size,
                dropout=hparams.relu_dropout))
    return x
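# A hedged usage sketch, not part of the original module: it assumes the
# tensor2tensor helpers `common_attention.attention_bias_lower_triangle` and
# `common_layers.layer_norm`, plus an `hparams.residual_dropout` field, to show
# how a caller typically builds `residual_fn` and the self-attention bias.
def _example_decode(decoder_input, hparams):
  """Runs the decoder stack on an embedded [batch_size, length, hidden] input."""

  def residual_fn(x, y):
    # Residual connection with dropout on the sublayer output, then layer norm.
    return common_layers.layer_norm(
        x + tf.nn.dropout(y, 1.0 - hparams.residual_dropout))

  # Lower-triangular bias masks out future positions for causal self-attention.
  decoder_self_attention_bias = common_attention.attention_bias_lower_triangle(
      tf.shape(decoder_input)[1])
  return attention_lm_decoder(decoder_input, residual_fn,
                              decoder_self_attention_bias, hparams)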