def transformer_ffn_layer(x, hparams): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparmeters for model Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] """ if hparams.ffn_layer == "conv_hidden_relu": return common_layers.conv_hidden_relu( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) elif hparams.ffn_layer == "parameter_attention": return common_attention.parameter_attention( x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif hparams.ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu( x, hparams.filter_size, hparams.hidden_size, kernel_size=(3, 1), second_kernel_size=(31, 1), padding="LEFT", dropout=hparams.relu_dropout) else: assert hparams.ffn_layer == "none" return x
def transformer_ffn_layer(x, hparams, pad_remover=None): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparmeters for model pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] """ if hparams.ffn_layer == "conv_hidden_relu": # In simple convolution mode, use `pad_remover` to speed up processing. if pad_remover: original_shape = tf.shape(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], tf.shape(x)[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) conv_output = common_layers.conv_hidden_relu( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) # two layers transformation. hidden_size->filter_size->hidden_size if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output elif hparams.ffn_layer == "parameter_attention": return common_attention.parameter_attention( x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif hparams.ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu( x, hparams.filter_size, hparams.hidden_size, kernel_size=(3, 1), second_kernel_size=(31, 1), padding="LEFT", dropout=hparams.relu_dropout) else: assert hparams.ffn_layer == "none" return x
def transformer_ffn_layer(x, hparams, pad_remover=None): """Copied from tensor2tensor.models.transformer.""" if hparams.ffn_layer == "conv_hidden_relu": # In simple convolution mode, use `pad_remover` to speed up processing. if pad_remover: original_shape = tf.shape(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], tf.shape(x)[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) conv_output = common_layers.conv_hidden_relu( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output elif hparams.ffn_layer == "parameter_attention": return common_attention.parameter_attention( x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif hparams.ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu(x, hparams.filter_size, hparams.hidden_size, kernel_size=(3, 1), second_kernel_size=(31, 1), padding="LEFT", dropout=hparams.relu_dropout) else: assert hparams.ffn_layer == "none" return x
def ffn_layer_for_baseline(self, body_output, hparams): """Feed-forward layer for baseline in the REINFORCE-based transformer. Args: body_output: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparmeters for model Returns: a Tensor of shape [batch_size, length, 1], which indicates the baseline value """ body_output = tf.squeeze( body_output, 0) # add one squeeze to remove non useful dimension conv_output = common_layers.conv_hidden_relu( body_output, hparams.filter_size, 1, dropout=hparams.relu_dropout ) # two layers transformation. hidden_size->filter_size->1 return conv_output
def attention_lm_decoder(decoder_input, residual_fn, decoder_self_attention_bias, hparams, name="decoder"): """A stack of attention_lm layers. Args: decoder_input: a Tensor residual_fn: a function from (layer_input, layer_output) -> combined_output decoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string Returns: y: a Tensors """ x = decoder_input with tf.variable_scope(name): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): x = residual_fn( x, common_attention.multihead_attention( x, None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, name="decoder_self_attention")) x = residual_fn( x, common_layers.conv_hidden_relu( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout)) return x
def cat_and_compress(self, decoder_output_context, decoder_output, hparams): """cantenant decoder_output_context and decoder_output_context at the last dimenstion, and then compress. Args: decoder_output_context: [batch_size, decoder_length, hidden_dim] decoder_output: [batch_size, input_length, hidden_dim] decoder_self_attention_bias: Bias and mask weights for decoder self-attention. [batch_size, decoder_length] Returns: Final decoder representaiton. [batch_size, decoder_length, hidden_dim] """ with tf.variable_scope("cat_and_compress"): decoder_output_context = common_layers.flatten4d3d(decoder_output_context) decoder_output = common_layers.flatten4d3d(decoder_output) x = tf.concat([decoder_output_context, decoder_output], 2) conv_output = common_layers.conv_hidden_relu(x, 1, hparams.hidden_size) conv_output = tf.expand_dims(conv_output, axis=2) return conv_output
def attention_lm_decoder(decoder_input, decoder_self_attention_bias, hparams, name="decoder"): """A stack of attention_lm layers. Args: decoder_input: a Tensor decoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string Returns: y: a Tensors """ x = decoder_input with tf.variable_scope(name): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = common_layers.conv_hidden_relu( common_layers.layer_preprocess(x, hparams), hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) x = common_layers.layer_postprocess(x, y, hparams) return common_layers.layer_preprocess(x, hparams)
def attention_lm_decoder(decoder_input, decoder_self_attention_bias, hparams, name="decoder"): """A stack of attention_lm layers. Args: decoder_input: a Tensor decoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string Returns: y: a Tensors """ x = decoder_input with tf.variable_scope(name): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess( x, hparams), None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = common_layers.conv_hidden_relu( common_layers.layer_preprocess(x, hparams), hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) x = common_layers.layer_postprocess(x, y, hparams) return common_layers.layer_preprocess(x, hparams)
def transformer_ffn_layer(x, hparams, pad_remover=None, conv_padding="LEFT", nonpadding_mask=None): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparmeters for model pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. conv_padding: a string - either "LEFT" or "SAME". nonpadding_mask: an optional Tensor with shape [batch_size, length]. needed for convolutoinal layers with "SAME" padding. Contains 1.0 in positions corresponding to nonpadding. Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] """ ffn_layer = hparams.ffn_layer if ffn_layer == "conv_hidden_relu": # Backwards compatibility ffn_layer = "dense_relu_dense" if ffn_layer == "dense_relu_dense": # In simple convolution mode, use `pad_remover` to speed up processing. if pad_remover: original_shape = common_layers.shape_list(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) conv_output = common_layers.dense_relu_dense( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output elif ffn_layer == "conv_relu_conv": return common_layers.conv_relu_conv(x, hparams.filter_size, hparams.hidden_size, first_kernel_size=3, second_kernel_size=1, padding=conv_padding, nonpadding_mask=nonpadding_mask, dropout=hparams.relu_dropout) elif ffn_layer == "parameter_attention": return common_attention.parameter_attention( x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu(x, hparams.filter_size, hparams.hidden_size, kernel_size=(3, 1), second_kernel_size=(31, 1), padding="LEFT", dropout=hparams.relu_dropout) else: assert ffn_layer == "none" return x
def transformer_ffn_layer(x, hparams, pad_remover=None, conv_padding="LEFT", nonpadding_mask=None, losses=None, cache=None, decode_loop_step=None, readout_filter_size=0): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparameters for model pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. conv_padding: a string - either "LEFT" or "SAME". nonpadding_mask: an optional Tensor with shape [batch_size, length]. needed for convolutional layers with "SAME" padding. Contains 1.0 in positions corresponding to nonpadding. losses: optional list onto which to append extra training losses cache: dict, containing tensors which are the results of previous attentions, used for fast decoding. decode_loop_step: An integer, step number of the decoding loop. Only used for inference on TPU. readout_filter_size: if it's greater than 0, then it will be used instead of filter_size Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] Raises: ValueError: If losses arg is None, but layer generates extra losses. """ ffn_layer = hparams.ffn_layer relu_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "relu_dropout_broadcast_dims", ""))) if ffn_layer == "conv_hidden_relu": # Backwards compatibility ffn_layer = "dense_relu_dense" if ffn_layer == "dense_relu_dense": # In simple convolution mode, use `pad_remover` to speed up processing. mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE, value={ "filter_size": hparams.filter_size, "use_bias": "True", "activation": mlperf_log.RELU }) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE, value={ "hidden_size": hparams.hidden_size, "use_bias": "True", }) mlperf_log.transformer_print( key=mlperf_log.MODEL_HP_RELU_DROPOUT, value=hparams.relu_dropout) if pad_remover: original_shape = common_layers.shape_list(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) conv_output = common_layers.dense_relu_dense( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout, dropout_broadcast_dims=relu_dropout_broadcast_dims) if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output elif ffn_layer == "conv_relu_conv": return common_layers.conv_relu_conv( x, readout_filter_size or hparams.filter_size, hparams.hidden_size, first_kernel_size=hparams.conv_first_kernel, second_kernel_size=1, padding=conv_padding, nonpadding_mask=nonpadding_mask, dropout=hparams.relu_dropout, cache=cache, decode_loop_step=decode_loop_step) elif ffn_layer == "parameter_attention": return common_attention.parameter_attention( x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, hparams.hidden_size, readout_filter_size or hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu( x, readout_filter_size or hparams.filter_size, hparams.hidden_size, kernel_size=(3, 1), second_kernel_size=(31, 1), padding="LEFT", dropout=hparams.relu_dropout) elif ffn_layer == "sru": return common_layers.sru(x) elif ffn_layer == "local_moe_tpu": overhead = ( hparams.moe_overhead_train if hparams.mode == tf.estimator.ModeKeys.TRAIN else hparams.moe_overhead_eval) ret, loss = expert_utils.local_moe_tpu( x, hparams.filter_size // 2, hparams.hidden_size, hparams.moe_num_experts, overhead=overhead, loss_coef=hparams.moe_loss_coef) elif ffn_layer == "local_moe": overhead = ( hparams.moe_overhead_train if hparams.mode == tf.estimator.ModeKeys.TRAIN else hparams.moe_overhead_eval) ret, loss = expert_utils.local_moe( x, True, expert_utils.ffn_expert_fn(hparams.hidden_size, [hparams.filter_size], hparams.hidden_size), hparams.moe_num_experts, k=hparams.moe_k, hparams=hparams) losses.append(loss) return ret else: assert ffn_layer == "none" return x
def transformer_ffn_layer(x, hparams, customized_ffn=None, pad_remover=None, conv_padding="LEFT", nonpadding_mask=None, losses=None, cache=None): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparameters for model customized_ffn: customized the ffn_layer string pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. conv_padding: a string - either "LEFT" or "SAME". nonpadding_mask: an optional Tensor with shape [batch_size, length]. needed for convolutional layers with "SAME" padding. Contains 1.0 in positions corresponding to nonpadding. losses: optional list onto which to append extra training losses cache: dict, containing tensors which are the results of previous attentions, used for fast decoding. Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] Raises: ValueError: If losses arg is None, but layer generates extra losses. """ ffn_layer = customized_ffn or hparams.ffn_layer relu_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "relu_dropout_broadcast_dims", ""))) if ffn_layer == "conv_hidden_relu": # Backwards compatibility ffn_layer = "dense_relu_dense" if ffn_layer == "dense_relu_dense": # In simple convolution mode, use `pad_remover` to speed up processing. if pad_remover: original_shape = common_layers.shape_list(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) conv_output = common_layers.dense_relu_dense( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout, dropout_broadcast_dims=relu_dropout_broadcast_dims) if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output elif ffn_layer == "conv_relu_conv": return common_layers.conv_relu_conv( x, hparams.filter_size, hparams.hidden_size, first_kernel_size=hparams.conv_first_kernel, second_kernel_size=1, padding=conv_padding, nonpadding_mask=nonpadding_mask, dropout=hparams.relu_dropout, cache=cache) elif ffn_layer == "parameter_attention": return common_attention.parameter_attention( x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu( x, hparams.filter_size, hparams.hidden_size, kernel_size=(3, 1), second_kernel_size=(31, 1), padding="LEFT", dropout=hparams.relu_dropout) elif ffn_layer == "sru": return common_layers.sru(x) elif ffn_layer == "local_moe_tpu": overhead = (hparams.moe_overhead_train if hparams.mode == tf.estimator.ModeKeys.TRAIN else hparams.moe_overhead_eval) ret, loss = expert_utils.local_moe_tpu( x, hparams.filter_size // 2, hparams.hidden_size, hparams.moe_num_experts, overhead=overhead, loss_coef=hparams.moe_loss_coef) if losses is None: raise ValueError( "transformer_ffn_layer with type local_moe_tpu must pass in " "a losses list") losses.append(loss) return ret else: assert ffn_layer == "none" return x
def transformer_ffn_layer(x, hparams, pad_remover=None, conv_padding="LEFT", nonpadding_mask=None): """Feed-forward layer in the transformer. Args: x: a Tensor of shape [batch_size, length, hparams.hidden_size] hparams: hyperparmeters for model pad_remover: an expert_utils.PadRemover object tracking the padding positions. If provided, when using convolutional settings, the padding is removed before applying the convolution, and restored afterward. This can give a significant speedup. conv_padding: a string - either "LEFT" or "SAME". nonpadding_mask: an optional Tensor with shape [batch_size, length]. needed for convolutoinal layers with "SAME" padding. Contains 1.0 in positions corresponding to nonpadding. Returns: a Tensor of shape [batch_size, length, hparams.hidden_size] """ ffn_layer = hparams.ffn_layer relu_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "relu_dropout_broadcast_dims", ""))) if ffn_layer == "conv_hidden_relu": # Backwards compatibility ffn_layer = "dense_relu_dense" if ffn_layer == "dense_relu_dense": # In simple convolution mode, use `pad_remover` to speed up processing. if pad_remover: original_shape = common_layers.shape_list(x) # Collapse `x` across examples, and remove padding positions. x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0)) x = tf.expand_dims(pad_remover.remove(x), axis=0) conv_output = common_layers.dense_relu_dense( x, hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout, dropout_broadcast_dims=relu_dropout_broadcast_dims) if pad_remover: # Restore `conv_output` to the original shape of `x`, including padding. conv_output = tf.reshape( pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape) return conv_output elif ffn_layer == "conv_relu_conv": return common_layers.conv_relu_conv( x, hparams.filter_size, hparams.hidden_size, first_kernel_size=3, second_kernel_size=1, padding=conv_padding, nonpadding_mask=nonpadding_mask, dropout=hparams.relu_dropout) elif ffn_layer == "parameter_attention": return common_attention.parameter_attention( x, hparams.parameter_attention_key_channels or hparams.hidden_size, hparams.parameter_attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.filter_size, hparams.num_heads, hparams.attention_dropout) elif ffn_layer == "conv_hidden_relu_with_sepconv": return common_layers.conv_hidden_relu( x, hparams.filter_size, hparams.hidden_size, kernel_size=(3, 1), second_kernel_size=(31, 1), padding="LEFT", dropout=hparams.relu_dropout) else: assert ffn_layer == "none" return x