@classmethod
def Params(cls):
  """Configs for `MTEncoderBiRNN`."""
  p = super(MTEncoderBiRNN, cls).Params()
  p.Define('emb', layers.EmbeddingLayer.Params(), 'Embedding layer params.')
  p.Define('lstm_tpl', rnn_cell.LSTMCellSimple.Params(),
           'Configs template for the RNN layer.')
  p.Define('proj_tpl', layers.ProjectionLayer.Params(),
           'Configs template for the projection layer.')
  p.Define('lstm_cell_size', 512, 'LSTM cell size for the RNN layer.')
  p.Define('num_lstm_layers', 8, 'Number of RNN layers to create.')
  p.Define('dropout_prob', 0.0, 'Probability at which we apply dropout.')
  p.Define('residual_start', 2,
           'Layer at which we start residual connections.')
  p.Define('encoder_out_dim', 1024, 'Depth of the encoder output.')
  p.Define('bidi_rnn_type', 'func', 'Options: func. '
           'func: BidirectionalFRNN.')
  p.Define('cc_schedule', None, 'Clipping cap schedule.')
  p.Define('is_transparent', False,
           'If set, outputs a merger of layer outputs.')
  p.Define(
      'transparent_merger_tpl',
      layers.WeightedSumLayer.Params().Set(add_weight_summaries=True),
      'Merger op for layer outputs.')
  p.Define(
      'packed_input', False, 'If True, encoder and all layers support '
      'multiple examples in a single sequence.')

  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Uniform(0.04)

  # Default config for the embedding.
  p.emb.vn = disable_vn
  p.emb.vocab_size = 32000
  p.emb.embedding_dim = 1024
  p.emb.max_num_shards = 16
  p.emb.params_init = default_params_init

  p.lstm_tpl.vn = disable_vn
  p.lstm_tpl.params_init = default_params_init

  return p
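
# A minimal usage sketch (assumption: `MTEncoderBiRNN` is the enclosing
# class; the field values shown simply restate the defaults above):
#
#   p = MTEncoderBiRNN.Params()
#   p.name = 'birnn_encoder'
#   p.lstm_cell_size = 512
#   p.encoder_out_dim = 1024
#   enc = p.Instantiate()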
@classmethod
def Params(cls):
  """Configs for `MTEncoderV1`."""
  p = super(MTEncoderV1, cls).Params()
  p.Define('emb', layers.EmbeddingLayer.Params(), 'Embedding layer params.')
  p.Define('lstm_tpl', rnn_cell.LSTMCellSimple.Params(),
           'Configs template for the RNN layer.')
  p.Define(
      'lstm_tpl_uni', None,
      'Override configs template for the unidirectional RNN layers.')
  p.Define('lstm_tpl_bidi', None,
           'Override configs template for the bidirectional RNN layer.')
  p.Define('lstm_cell_size', 1024, 'LSTM cell size for the RNN layer.')
  p.Define('num_lstm_layers', 8, 'Number of RNN layers to create.')
  p.Define('dropout_prob', 0.0, 'Probability at which we apply dropout.')
  p.Define('unidi_rnn_type', 'func', 'Options: func. '
           'func: FRNN.')
  p.Define('bidi_rnn_type', 'func', 'Options: func. '
           'func: BidirectionalFRNN.')
  p.Define('cc_schedule', None, 'Clipping cap schedule.')
  p.Define(
      'packed_input', False, 'If True, encoder and all layers support '
      'multiple examples in a single sequence.')

  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Uniform(0.04)

  # Default config for the embedding.
  p.emb.vn = disable_vn
  p.emb.vocab_size = 32000
  p.emb.embedding_dim = 1024
  p.emb.max_num_shards = 16
  p.emb.params_init = default_params_init

  for tpl in [p.lstm_tpl, p.lstm_tpl_uni, p.lstm_tpl_bidi]:
    if tpl is not None:
      tpl.vn = disable_vn
      tpl.params_init = default_params_init

  return p
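
# A minimal usage sketch (assumption: `MTEncoderV1` is the enclosing class;
# `lstm_tpl_uni` and `lstm_tpl_bidi` default to None, in which case
# `lstm_tpl` is used for all layers):
#
#   p = MTEncoderV1.Params()
#   p.name = 'rnn_encoder'
#   p.num_lstm_layers = 8
#   enc = p.Instantiate()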
def SetupTransformerBatchMajorEncoderV1(model_dim,
                                        vocab_size,
                                        num_layers,
                                        num_heads,
                                        hidden_dim,
                                        residual_dropout_prob=0.1,
                                        input_dropout_prob=0.0,
                                        atten_dropout_prob=0.0,
                                        relu_dropout_prob=0.0,
                                        activation='RELU',
                                        add_unnormalized_residuals=False,
                                        packed_input=False,
                                        use_fast_projection_layer=False,
                                        enable_per_dim_scale=True,
                                        use_fused_layernorm=False,
                                        use_bf16_activations=False,
                                        use_bias=True,
                                        xla_num_partitions=None):
  """Common setup for the batch-major transformer model encoder.

  Args:
    model_dim: Dimension of transformer layers, token embeddings, and
      positional embeddings, as well as context vectors (attention values).
    vocab_size: Vocabulary size for token embeddings.
    num_layers: Number of transformer layers.
    num_heads: Number of attention heads.
    hidden_dim: Hidden dimension of the transformer feedforward layer.
    residual_dropout_prob: Used in the transformer feedforward and attention
      layers.
    input_dropout_prob: Input dropout probability.
    atten_dropout_prob: Used in the attention layer.
    relu_dropout_prob: Used in the transformer feedforward layer.
    activation: Non-linearity for feed-forward layers. (Unused)
    add_unnormalized_residuals: If set, uses un-normalized residuals in
      TransformerAttentionLayer.
    packed_input: Whether to enable packed input.
    use_fast_projection_layer: Whether to use the fast projection layer to
      remove data formatting overheads.
    enable_per_dim_scale: Whether to enable per_dim_scale.
    use_fused_layernorm: Whether to use fused layer normalization.
    use_bf16_activations: Whether to use bfloat16 for activations.
    use_bias: Whether to use bias for the projection layer.
    xla_num_partitions: Number of partitions for XLA model-parallelism
      sharding, or None.

  Returns:
    Encoder params.
  """
  # TODO(shibow) Add 'GELU' activation option for batch major encoder.
  # GELU should be used to reduce quality loss when casting weights from fp32
  # to bf16 for inference (b/123993529).
  del activation
  # EncoderV1 already uses the fast projection layer by default.
  del use_fast_projection_layer

  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Xavier(1.0)
  emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

  # Encoder
  encoder_params = encoder.TransformerBatchMajorEncoder.Params()
  encoder_params.params_init = default_params_init
  encoder_params.model_dim = model_dim
  encoder_params.input_dropout_prob = input_dropout_prob
  encoder_params.packed_input = packed_input
  encoder_params.use_fused_layernorm = use_fused_layernorm
  encoder_params.token_emb.Set(
      embedding_dim=model_dim,
      max_num_shards=16,
      params_init=emb_params_init,
      vocab_size=vocab_size,
      vn=disable_vn,
      scale_sqrt_depth=True)
  encoder_params.position_emb.Set(
      embedding_dim=model_dim, trainable_scaling=False, vn=disable_vn)

  builder_params = self_attention_layer.Builder.Params().Set(
      model_dim=model_dim,
      num_heads=num_heads,
      ff_hidden_dim=hidden_dim,
      residual_dropout_prob=residual_dropout_prob,
      relu_dropout_prob=relu_dropout_prob,
      atten_dropout_prob=atten_dropout_prob,
      selfatten_add_unnormalized_input=add_unnormalized_residuals,
      selfatten_enable_value_proj=True,
      packed_input=packed_input,
      enable_per_dim_scale=enable_per_dim_scale,
      use_fused_layernorm=use_fused_layernorm,
      fprop_dtype=tf.bfloat16 if use_bf16_activations else None,
      use_bias=use_bias,
      xla_num_partitions=xla_num_partitions)
  stack = builder_params.Instantiate().TransformerStack(
      name='transformer_stack', num_layers=num_layers)
  encoder_params.transformer_stack = (
      self_attention_layer.StackedTransformerEncoderLayers.Cast(stack))
  if add_unnormalized_residuals:
    encoder_params.final_layer_norm = True
  return encoder_params
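
# A minimal sketch of how this helper might be called. The sizes below are
# illustrative ("transformer base"-like), not tuned or recommended defaults.


def _ExampleBatchMajorEncoderParams():
  """Returns example batch-major encoder params; values are illustrative."""
  return SetupTransformerBatchMajorEncoderV1(
      model_dim=512,
      vocab_size=32000,
      num_layers=6,
      num_heads=8,
      hidden_dim=2048,
      packed_input=True)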
def SetupTransformerBatchMajorDecoderV1(model_dim,
                                        vocab_size,
                                        num_layers,
                                        num_heads,
                                        hidden_dim,
                                        residual_dropout_prob=0.1,
                                        input_dropout_prob=0.0,
                                        atten_dropout_prob=0.0,
                                        relu_dropout_prob=0.0,
                                        label_smoothing_uncertainty=0.1,
                                        activation='RELU',
                                        add_unnormalized_residuals=False,
                                        atten_hidden_dim=0,
                                        packed_input=False,
                                        use_fast_projection_layer=False,
                                        enable_per_dim_scale=True,
                                        use_fused_layernorm=False,
                                        use_bias=True):
  """Common setup for the batch-major transformer model decoder.

  Args:
    model_dim: Dimension of transformer layers, token embeddings, and
      positional embeddings, as well as context vectors (attention values).
    vocab_size: Vocabulary size for token embeddings.
    num_layers: Number of transformer layers.
    num_heads: Number of attention heads.
    hidden_dim: Hidden dimension of the transformer feedforward layer.
    residual_dropout_prob: Used in the transformer feedforward and attention
      layers.
    input_dropout_prob: Input dropout probability.
    atten_dropout_prob: Used in the attention layer.
    relu_dropout_prob: Used in the transformer feedforward layer.
    label_smoothing_uncertainty: A float, the uncertainty used for label
      smoothing. If this value is 0, no label smoothing is applied.
    activation: Non-linearity for feed-forward layers.
    add_unnormalized_residuals: If set, uses un-normalized residuals in
      TransformerAttentionLayer.
    atten_hidden_dim: Explicitly set attention hidden dim. If 0, defaults to
      model_dim.
    packed_input: Whether to enable packed input.
    use_fast_projection_layer: Whether to use the fast projection layer to
      remove data formatting overheads.
    enable_per_dim_scale: Whether to enable per_dim_scale.
    use_fused_layernorm: Whether to use fused layer normalization.
    use_bias: Whether to use bias for the projection layer.

  Returns:
    Decoder params.
  """
  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Xavier(1.0)
  emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

  # Decoder
  decoder_params = decoder.TransformerBatchMajorDecoder.Params()
  decoder_params.source_dim = model_dim
  decoder_params.model_dim = model_dim
  decoder_params.num_trans_layers = num_layers
  decoder_params.input_dropout_prob = input_dropout_prob
  decoder_params.packed_input = packed_input
  decoder_params.use_fused_layernorm = use_fused_layernorm
  decoder_params.token_emb.Set(
      vocab_size=vocab_size,
      embedding_dim=model_dim,
      max_num_shards=16,
      params_init=emb_params_init,
      vn=disable_vn,
      scale_sqrt_depth=True)
  decoder_params.position_emb.Set(
      embedding_dim=model_dim, trainable_scaling=False, vn=disable_vn)
  decoder_params.trans_decoder_tpl.packed_input = packed_input
  decoder_params.trans_decoder_tpl.input_dim = model_dim
  decoder_params.trans_decoder_tpl.tr_atten_tpl.Set(
      input_dim=model_dim,
      num_heads=num_heads,
      residual_dropout_prob=residual_dropout_prob,
      atten_dropout_prob=atten_dropout_prob,
      params_init=default_params_init,
      add_unnormalized_input=add_unnormalized_residuals,
      hidden_dim=atten_hidden_dim,
      vn=disable_vn)
  decoder_params.trans_decoder_tpl.tr_atten_tpl.ln_tpl.Set(
      use_fused_layernorm=use_fused_layernorm)
  decoder_params.trans_decoder_tpl.tr_atten_tpl.atten_tpl.Set(
      vn=disable_vn,
      packed_input=packed_input,
      enable_per_dim_scale=enable_per_dim_scale,
      use_bias=use_bias)
  decoder_params.trans_decoder_tpl.tr_fflayer_tpl.Set(
      input_dim=model_dim,
      hidden_dim=hidden_dim,
      residual_dropout_prob=residual_dropout_prob,
      relu_dropout_prob=relu_dropout_prob,
      params_init=default_params_init,
      vn=disable_vn,
      activation=activation)
  decoder_params.trans_decoder_tpl.tr_fflayer_tpl.ln_tpl.Set(
      use_fused_layernorm=use_fused_layernorm)
  decoder_params.trans_decoder_tpl.tr_fflayer_tpl.fflayer_tpl.projection.Set(
      use_einsum=use_fast_projection_layer)
  if add_unnormalized_residuals:
    decoder_params.final_layer_norm = True
  decoder_params.softmax.Set(
      num_classes=vocab_size,
      vn=disable_vn,
      params_init=emb_params_init,
      num_shards=16)
  decoder_params.per_word_avg_loss = True
  decoder_params.label_smoothing = layers.UniformLabelSmoother.Params()
  decoder_params.label_smoothing.num_classes = vocab_size
  decoder_params.label_smoothing.uncertainty = label_smoothing_uncertainty
  return decoder_params
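
# A minimal sketch pairing this decoder with a matching encoder; the sizes
# are illustrative. Since `source_dim` is set to `model_dim` inside the
# helper, the encoder and decoder must be built with the same `model_dim`.


def _ExampleBatchMajorDecoderParams():
  """Returns example batch-major decoder params; values are illustrative."""
  return SetupTransformerBatchMajorDecoderV1(
      model_dim=512,
      vocab_size=32000,
      num_layers=6,
      num_heads=8,
      hidden_dim=2048,
      label_smoothing_uncertainty=0.1)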
def SetupTransformerEncoder(model_dim,
                            vocab_size,
                            num_layers,
                            num_heads,
                            hidden_dim,
                            residual_dropout_prob=0.1,
                            input_dropout_prob=0.0,
                            atten_dropout_prob=0.0,
                            relu_dropout_prob=0.0,
                            is_transparent=False,
                            activation='RELU',
                            add_unnormalized_residuals=False,
                            atten_hidden_dim=0):
  """Common setup for the transformer model encoder.

  Args:
    model_dim: Dimension of transformer layers, token embeddings, and
      positional embeddings, as well as context vectors (attention values).
    vocab_size: Vocabulary size for token embeddings.
    num_layers: Number of transformer layers.
    num_heads: Number of attention heads.
    hidden_dim: Hidden dimension of the transformer feedforward layer.
    residual_dropout_prob: Used in the transformer feedforward and attention
      layers.
    input_dropout_prob: Input dropout probability.
    atten_dropout_prob: Used in the attention layer.
    relu_dropout_prob: Used in the transformer feedforward layer.
    is_transparent: If set, outputs a merger of embeddings and layer outputs.
    activation: Non-linearity for feed-forward layers.
    add_unnormalized_residuals: If set, uses un-normalized residuals in
      TransformerAttentionLayer.
    atten_hidden_dim: Explicitly set attention hidden dim.

  Returns:
    Encoder params.
  """
  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Xavier(1.0)
  emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

  # Encoder
  encoder_params = encoder.TransformerEncoder.Params()
  encoder_params.token_emb.Set(
      embedding_dim=model_dim,
      max_num_shards=16,
      params_init=emb_params_init,
      vocab_size=vocab_size,
      vn=disable_vn,
      scale_sqrt_depth=True)
  encoder_params.position_emb.Set(
      embedding_dim=model_dim, trainable_scaling=False, vn=disable_vn)

  # Encoder TransformerStack params
  encoder_params.model_dim = model_dim
  encoder_params.transformer_stack.model_dim = model_dim
  encoder_params.transformer_stack.num_transformer_layers = num_layers
  encoder_params.input_dropout_prob = input_dropout_prob
  encoder_params.transformer_stack.transformer_tpl.tr_atten_tpl.Set(
      num_attention_heads=num_heads,
      residual_dropout_prob=residual_dropout_prob,
      atten_dropout_prob=atten_dropout_prob,
      params_init=default_params_init,
      add_unnormalized_input=add_unnormalized_residuals,
      atten_hidden_dim=atten_hidden_dim,
      vn=disable_vn)
  encoder_params.transformer_stack.transformer_tpl.tr_atten_tpl.atten_tpl.Set(
      num_attention_heads=num_heads,
      enable_ctx_pre_proj=True,
      enable_ctx_post_proj=True,
      context_dim=model_dim,
      vn=disable_vn)
  encoder_params.transformer_stack.transformer_tpl.tr_fflayer_tpl.Set(
      hidden_dim=hidden_dim,
      residual_dropout_prob=residual_dropout_prob,
      relu_dropout_prob=relu_dropout_prob,
      params_init=default_params_init,
      vn=disable_vn,
      activation=activation)
  if is_transparent:
    encoder_params.transformer_stack.is_transparent = True
  return encoder_params
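
# A minimal sketch showing the transparent-encoder option: with
# `is_transparent=True` the stack outputs a weighted merger of embeddings
# and per-layer outputs instead of only the last layer. Values below are
# illustrative.


def _ExampleTransparentEncoderParams():
  """Returns example encoder params with transparent outputs enabled."""
  return SetupTransformerEncoder(
      model_dim=512,
      vocab_size=32000,
      num_layers=6,
      num_heads=8,
      hidden_dim=2048,
      is_transparent=True)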
def SetupTransformerDecoder(model_dim,
                            vocab_size,
                            num_layers,
                            num_heads,
                            hidden_dim,
                            residual_dropout_prob=0.1,
                            input_dropout_prob=0.0,
                            atten_dropout_prob=0.0,
                            relu_dropout_prob=0.0,
                            label_smoothing_uncertainty=0.1,
                            is_transparent=False,
                            activation='RELU',
                            add_unnormalized_residuals=False,
                            atten_hidden_dim=0):
  """Common setup for the transformer model decoder."""
  disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
  default_params_init = py_utils.WeightInit.Xavier(1.0)
  emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

  # Decoder
  decoder_params = decoder.TransformerDecoder.Params()
  decoder_params.source_dim = model_dim
  decoder_params.model_dim = model_dim
  decoder_params.num_trans_layers = num_layers
  decoder_params.input_dropout_prob = input_dropout_prob
  decoder_params.token_emb.Set(
      vocab_size=vocab_size,
      embedding_dim=model_dim,
      max_num_shards=16,
      params_init=emb_params_init,
      vn=disable_vn,
      scale_sqrt_depth=True)
  decoder_params.position_emb.Set(
      embedding_dim=model_dim, trainable_scaling=False, vn=disable_vn)
  decoder_params.trans_tpl.source_dim = model_dim
  decoder_params.trans_tpl.tr_atten_tpl.Set(
      source_dim=model_dim,
      num_attention_heads=num_heads,
      residual_dropout_prob=residual_dropout_prob,
      atten_dropout_prob=atten_dropout_prob,
      params_init=default_params_init,
      add_unnormalized_input=add_unnormalized_residuals,
      atten_hidden_dim=atten_hidden_dim,
      vn=disable_vn)
  decoder_params.trans_tpl.tr_atten_tpl.atten_tpl.Set(
      enable_ctx_pre_proj=True,
      enable_ctx_post_proj=True,
      context_dim=model_dim,
      vn=disable_vn)
  decoder_params.trans_tpl.tr_fflayer_tpl.Set(
      input_dim=model_dim,
      hidden_dim=hidden_dim,
      residual_dropout_prob=residual_dropout_prob,
      relu_dropout_prob=relu_dropout_prob,
      params_init=default_params_init,
      vn=disable_vn,
      activation=activation)
  decoder_params.softmax.Set(
      num_classes=vocab_size,
      vn=disable_vn,
      params_init=emb_params_init,
      num_shards=16)
  decoder_params.per_word_avg_loss = True
  decoder_params.label_smoothing = layers.UniformLabelSmoother.Params()
  decoder_params.label_smoothing.num_classes = vocab_size
  decoder_params.label_smoothing.uncertainty = label_smoothing_uncertainty
  if is_transparent:
    decoder_params.is_transparent = True
  return decoder_params
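
# A minimal sketch wiring the encoder and decoder helpers together for a
# "transformer base"-like config; values are illustrative. When transparent
# outputs are wanted, `is_transparent` should be set on both sides so the
# decoder consumes the encoder's per-layer mergers.


def _ExampleTransformerEncDecParams():
  """Returns an (encoder_params, decoder_params) pair; illustrative only."""
  enc_params = SetupTransformerEncoder(
      model_dim=512, vocab_size=32000, num_layers=6, num_heads=8,
      hidden_dim=2048)
  dec_params = SetupTransformerDecoder(
      model_dim=512, vocab_size=32000, num_layers=6, num_heads=8,
      hidden_dim=2048)
  return enc_params, dec_params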