Example #1
    def __init__(self, params):
        super(TransformerBatchMajorEncoder, self).__init__(params)
        p = self.params

        assert p.output_data_format in ('TBC', 'BTC')

        if p.shared_emb:
            with tf.variable_scope('shared_emb', reuse=tf.AUTO_REUSE):
                self.CreateChild('softmax', p.shared_emb)

        with tf.variable_scope(p.name):
            p.token_emb.dtype = p.dtype
            if not p.shared_emb:
                self.CreateChild('token_emb', p.token_emb)
            self.CreateChild('position_emb', p.position_emb)

            dropout_tpl = p.input_dropout_tpl.Copy()
            dropout_tpl.keep_prob = (1.0 - p.input_dropout_prob)
            self.CreateChild('input_dropout', dropout_tpl)

            if p.transformer_stack:
                self.CreateChild('transformer_stack', p.transformer_stack)

            if p.final_layer_norm:
                layer_norm_p = layers.LayerNorm.Params().Set(
                    name='final_ln',
                    input_dim=p.model_dim,
                    use_fused_layernorm=p.use_fused_layernorm,
                    fprop_dtype=p.input_dropout_tpl.fprop_dtype)
                self.CreateChild('final_ln', layer_norm_p)
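The 'TBC'/'BTC' assertion above only records which layout the encoder emits (time-major vs. batch-major); converting between the two is a single transpose. A minimal plain-TF1 sketch, where the tensor name x is illustrative and not part of the layer:

import tensorflow.compat.v1 as tf

def tbc_to_btc(x):
  # x: [time, batch, channels] -> [batch, time, channels].
  return tf.transpose(x, [1, 0, 2])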
Example #2
  def __init__(self, params):
    super(DepthwiseConv2DLayer, self).__init__(params)
    p = self.params
    assert p.name
    w_pc = py_utils.WeightParams(
        shape=p.filter_shape,
        init=p.params_init,
        dtype=p.dtype,
        collections=[self.__class__.__name__ + '_vars'])

    with tf.variable_scope(p.name):
      self.CreateVariable('w', w_pc)
      if p.weight_norm:
        self.CreateVariable(
            'g',
            py_utils.WeightParams(
                shape=[p.filter_shape[2], p.filter_shape[3]],
                init=py_utils.WeightInit.Constant(0.0),
                dtype=p.dtype,
                collections=[self.__class__.__name__ + '_vars']))
      if p.bias:
        # NOTE(jiahuiyu): bias is subject to LP regularization in this version.
        self.CreateVariable(
            'b',
            py_utils.WeightParams(
                shape=[self.output_channels],
                init=py_utils.WeightInit.Constant(0.0),
                dtype=p.dtype,
                collections=[self.__class__.__name__ + '_vars']))
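When p.weight_norm is set, the extra 'g' variable created above acts as a learned per-channel scale on an L2-normalized filter. A minimal sketch of one common convention (not the layer's actual FProp; the (1 + g) offset is an assumption that matches the zero initialization of 'g'):

import tensorflow.compat.v1 as tf

def weight_normed_filter(w, g):
  # w: [height, width, in_channels, channel_multiplier]
  # g: [in_channels, channel_multiplier]
  norm = tf.sqrt(tf.reduce_sum(tf.square(w), axis=[0, 1], keepdims=True) + 1e-12)
  # Broadcasting applies the per-channel scale to the normalized filter.
  return w / norm * (1.0 + g)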
Example #3
 def __init__(self, params):
   super(AttentionBlockStep, self).__init__(params)
   p = self.params
   name = p.name
   with tf.variable_scope(name):
     self.CreateChild('query_generator', p.query_generator)
     self.CreateChild('attention', p.attention)
Example #4
 def __init__(self, params):
     super(StackedRevNetLayer, self).__init__(params)
     p = params
     assert p.name
     assert p.sub_layer_params
     with tf.variable_scope(p.name):
         self.CreateChildren('sub_layers', p.sub_layer_params)
Example #5
    def __init__(self, params):
        super(GPipeTransformerEmbeddingLayer, self).__init__(params)
        p = self.params
        assert p.name
        with tf.variable_scope(p.name):
            p.token_emb.name = 'src_token_emb'
            p.position_emb.name = 'src_position_emb'
            self.CreateChild('src_token_emb', p.token_emb)
            self.CreateChild('src_pos_emb', p.position_emb)
            if p.enc_task_emb:
                self.CreateChild('src_task_emb', p.enc_task_emb)

            p.dropout_tpl.keep_prob = (1.0 - p.input_dropout_prob)
            p.dropout_tpl.name = 'src_dropout'
            self.CreateChild('src_dropout', p.dropout_tpl)

            if p.add_tgt_embedding_layer:
                params = p.token_emb.Copy()
                if p.target_vocab_size:
                    params.vocab_size = p.target_vocab_size
                params.name = 'tgt_token_emb'
                self.CreateChild('tgt_token_emb', params)
                params = p.position_emb.Copy()
                params.name = 'tgt_position_emb'
                self.CreateChild('tgt_pos_emb', params)
                if p.dec_task_emb:
                    self.CreateChild('tgt_task_emb', p.dec_task_emb)

                params = p.dropout_tpl.Copy()
                params.keep_prob = (1.0 - p.input_dropout_prob)
                params.name = 'tgt_dropout'
                self.CreateChild('tgt_dropout', params)
Example #6
 def __init__(self, params):
     super(GraphStep, self).__init__(params)
     p = self.params
     assert p.name
     with tf.variable_scope(p.name):
         self._seq = []
         for i, (signature, external_signature,
                 sub_params) in enumerate(p.sub):
             assert signature
             sig = builder_layers.GraphSignature(signature)
             assert len(sig.inputs) == 1
             assert sig.outputs
             external_sig = None
             if external_signature:
                 external_sig = builder_layers.GraphSignature(
                     external_signature)
                 assert len(external_sig.inputs) == 1
                 assert not external_sig.outputs
             name = sub_params.name
             if not name:
                 name = '%s_%02d' % (sig.outputs[0], i)
                 sub_params.name = name
             self.CreateChild(name, sub_params)
             self._seq.append(
                 GraphStep._seq(name, sig, external_sig,
                                self.children[name]))
         self.output_signature = builder_layers.GraphSignature(
             p.output_signature)
Example #7
 def __init__(self, params):
   super(RepeatLayer, self).__init__(params)
   p = self.params
   assert p.name
   assert p.repeat > 0
   with tf.variable_scope(p.name):
     with py_utils.VariableShapePrefixContext(p.repeat):
       self.CreateChild('body', p.body)
Example #8
 def _CreateQStateVar(self, t_name, suffix, params):
     name = t_name + '_' + suffix
     assert name not in self._qvars, 'QState var already exists: %s' % (
         name, )
     var_name = self._qvars_scope.name + '/' + name
     with tf.variable_scope(py_utils.GetGlobalVariableScope()):
         _, v = py_utils.CreateVariable(var_name, params, trainable=False)
     self._qvars[name] = v
     return v
Example #9
 def __init__(self, params):
     super(RevNetLayer, self).__init__(params)
     p = params
     assert p.name
     assert p.f_params
     assert p.g_params
     with tf.variable_scope(p.name):
         self.CreateChild('f_block', p.f_params)
         self.CreateChild('g_block', p.g_params)
Example #10
 def __init__(self, params):
   super(ParallelLayer, self).__init__(params)
   p = self.params
   assert p.name
   self._seq = []
   with tf.variable_scope(p.name):
     for sub in p.sub:
       self.CreateChild(sub.name, sub)
       self._seq.append((sub.name, self.children[sub.name]))
Example #11
    def __init__(self, params):
        super(TransformerEncoder, self).__init__(params)
        p = self.params

        if p.shared_emb:
            with tf.variable_scope('shared_emb', reuse=tf.AUTO_REUSE):
                # Naming this 'softmax' to match the name of the same component in the
                # decoder. Variable names need to be the same in order to be reused.
                self.CreateChild('softmax', p.shared_emb)

        with tf.variable_scope(p.name):
            assert p.token_emb.embedding_dim == p.position_emb.embedding_dim
            p.transformer_stack.Set(model_dim=p.model_dim,
                                    packed_input=p.packed_input)
            if p.model_dim != p.token_emb.embedding_dim:
                tf.logging.warning(
                    'token_emb.embedding_dim != model_dim (%s vs. %s), '
                    'creating a projection!', p.token_emb.embedding_dim,
                    p.model_dim)
                proj_p = layers.ProjectionLayer.Params().Copy()
                proj_p.name = 'emb_proj'
                proj_p.input_dim = p.token_emb.embedding_dim
                proj_p.output_dim = p.model_dim
                proj_p.batch_norm = True
                self.CreateChild('emb_proj', proj_p)

            # Token embeddings
            if not p.shared_emb:
                p.token_emb.dtype = p.dtype
                self.CreateChild('token_emb', p.token_emb)

            # Positional embeddings
            self.CreateChild('position_emb', p.position_emb)

            # Task embeddings.
            if p.task_emb:
                assert p.task_emb.embedding_dim == p.token_emb.embedding_dim
                self.CreateChild('task_emb', p.task_emb)

            dropout_tpl = layers.DropoutLayer.Params()
            dropout_tpl.keep_prob = (1.0 - p.input_dropout_prob)
            self.CreateChild('input_dropout', dropout_tpl)

        p.transformer_stack.name = p.name
        self.CreateChild('transformer_stack', p.transformer_stack)
Example #12
    def __init__(self, params):
        super(PassiveAsymQDomain, self).__init__(params)
        p = self.params

        self._t_names = set()  # set of known t_name (from CreateTensor)
        self._qvars = py_utils.NestedMap()  # var_name -> tf.Variable

        # Save a scope for lazily created variables.
        with tf.variable_scope(p.name + '/q'):
            self._qvars_scope = tf.get_variable_scope()
Example #13
    def __init__(self, params):
        super(MTBaseModel, self).__init__(params)
        p = self.params

        with tf.variable_scope(p.name):
            with self._EncoderDevice():
                if p.encoder:
                    self.CreateChild('enc', p.encoder)
            with self._DecoderDevice():
                self.CreateChild('dec', p.decoder)
Example #14
    def __init__(self, params):
        super(BatchNormLayer, self).__init__(params)
        p = self.params
        assert p.name

        pc = py_utils.WeightParams(
            shape=[p.dim],
            init=py_utils.WeightInit.Constant(0.0),
            dtype=p.dtype,
            collections=[self.__class__.__name__ + '_vars'])

        with tf.variable_scope(p.name):
            if not p.use_moving_avg_in_training:
                self.CreateVariable('beta', pc)
                if p.gamma_zero_init:
                    # Zero-initialize the BN gamma.
                    self.CreateVariable('gamma', pc)
                else:
                    # Note: the gamma actually applied is 1 + gamma.
                    self.CreateVariable('gamma', pc, lambda x: 1.0 + x)

            # Two statistics.
            moving_collections = [
                'moving_vars', self.__class__.__name__ + '_vars'
            ]
            if p.add_stats_to_moving_average_variables:
                moving_collections += [tf.GraphKeys.MOVING_AVERAGE_VARIABLES]
            elif p.add_stats_to_moving_average_variables is None:
                # TODO(rpang): force all models to set this param explicitly.
                tf.logging.warning(
                    'BatchNormLayer.add_stats_to_moving_average_variables should be '
                    'set to True for new models, and to False explicitly for '
                    'checkpoint compatibility.')
            # Add to the MOVING_AVERAGE_VARIABLES collection so that they are returned
            # by tf.moving_average_variables() and included in EMA variables if
            # ema_decay is enabled.
            mva = py_utils.WeightParams(shape=[p.dim],
                                        init=py_utils.WeightInit.Constant(0.0),
                                        dtype=p.dtype,
                                        collections=moving_collections)
            self.CreateVariable('moving_mean',
                                mva,
                                trainable=False,
                                aggregation=tf.VariableAggregation.MEAN)

            mvv = py_utils.WeightParams(shape=[p.dim],
                                        init=py_utils.WeightInit.Constant(1.0),
                                        dtype=p.dtype,
                                        collections=moving_collections)
            self.CreateVariable('moving_variance',
                                mvv,
                                trainable=False,
                                aggregation=tf.VariableAggregation.MEAN)
        self._epsilon = 0.001
        self._decay = p.decay
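The moving_mean and moving_variance variables created above are typically refreshed with an exponential moving average during training, with p.decay as the smoothing factor. A minimal sketch of that update (not the layer's actual FProp; names are illustrative):

import tensorflow.compat.v1 as tf

def update_moving_stat(moving_stat, batch_stat, decay):
  # Equivalent to: moving_stat <- decay * moving_stat + (1 - decay) * batch_stat.
  return tf.assign_sub(moving_stat, (1.0 - decay) * (moving_stat - batch_stat))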
Example #15
 def __init__(self, params):
   super(BiasLayer, self).__init__(params)
   p = self.params
   with tf.variable_scope(p.name):
     self.CreateVariable(
         'b',
         py_utils.WeightParams(
             shape=[p.dims],
             init=py_utils.WeightInit.Constant(0.0),
             dtype=p.dtype,
             collections=[self.__class__.__name__ + '_vars']))
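All of these constructors rely on the same mechanism: tf.variable_scope(p.name) prefixes the names of variables created inside it, which is what keeps 'b', 'w', etc. unique across layers. A minimal plain-TF1 illustration (the scope name 'my_bias' is hypothetical):

import tensorflow.compat.v1 as tf

with tf.variable_scope('my_bias'):
  b = tf.get_variable('b', shape=[8], initializer=tf.zeros_initializer())
print(b.name)  # my_bias/b:0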
Example #16
 def __init__(self, params):
   super(LinearLayer, self).__init__(params)
   p = self.params
   with tf.variable_scope(p.name):
     self.CreateVariable(
         'w',
         py_utils.WeightParams(
             shape=[p.input_dims, p.output_dims],
             init=p.params_init,
             dtype=p.dtype,
             collections=[self.__class__.__name__ + '_vars']))
Example #17
    def __init__(self, params):
        super(Learner, self).__init__(params)
        p = self.params

        self._var_grads = None
        self._eval_metrics = {}
        if p.grad_norm_tracker:
            # Use parent's name for backwards compatibility.
            with tf.variable_scope(self.parent.params.name):
                self.CreateChild('grad_norm_tracker', p.grad_norm_tracker)
        self.CreateChild('lr_schedule', p.lr_schedule)
        self.CreateChild('optimizer', p.optimizer)
Example #18
        def _Acc(vg):
            """Updating accumulators."""

            v, g = vg
            with tf.variable_scope(v.op.name):
                _, a = py_utils.CreateVariable(
                    'grad_accumulator',
                    py_utils.WeightParams(v.get_shape(),
                                          py_utils.WeightInit.Constant(0.0),
                                          self.params.dtype),
                    trainable=False)
                a = tf.assign_add(a, g)

            return py_utils.VarGrad(v, a)
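The _Acc helper above adds the current gradient into a persistent, non-trainable accumulator variable. A minimal sketch of how such accumulators are typically driven over several micro-batches before a single optimizer step (argument names are illustrative, not part of the original code):

import tensorflow.compat.v1 as tf

def accumulation_ops(variables, grads, accumulators, optimizer):
  # Add one micro-batch of gradients into the accumulators.
  accum_op = tf.group(*[tf.assign_add(a, g) for a, g in zip(accumulators, grads)])
  # Apply the summed gradients, then zero the accumulators for the next round.
  apply_op = optimizer.apply_gradients(list(zip(accumulators, variables)))
  reset_op = tf.group(*[tf.assign(a, tf.zeros_like(a)) for a in accumulators])
  return accum_op, apply_op, reset_op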
Example #19
 def __init__(self, params):
     super(IdentityRegressionTask, self).__init__(params)
     with tf.variable_scope('IdentityRegressionTask'):
         self.CreateVariable(
             'm',
             py_utils.WeightParams(shape=[],
                                   init=py_utils.WeightInit.Uniform()))
         self.CreateVariable(
             'b',
             py_utils.WeightParams(shape=[],
                                   init=py_utils.WeightInit.Uniform()))
     self.global_steps = []
     self.metrics = []
     self.result_per_example_tensors = []
Example #20
    def __init__(self, params):
        super(TransformerStack, self).__init__(params)
        p = self.params

        with tf.variable_scope(p.name):
            # Add transformer layers.
            transformer_layer_params = []
            denom = 1
            if isinstance(p.transformer_tpl, list):
                denom = len(p.transformer_tpl)
                assert p.num_transformer_layers % len(p.transformer_tpl) == 0
            for i in range(p.num_transformer_layers // denom):
                if isinstance(p.transformer_tpl, list):
                    for q in p.transformer_tpl:
                        params = q.Copy()
                        transformer_layer_params.append(params)
                else:
                    params = p.transformer_tpl.Copy()
                    transformer_layer_params.append(params)

            for i, params in enumerate(transformer_layer_params):
                params.name = 'trans_%d' % (i)
                params.source_dim = p.model_dim
                params.packed_input = p.packed_input
                params.has_aux_atten = p.has_aux_attention
                params.mask_self_atten = p.mask_self_atten

            self.CreateChildren('trans', transformer_layer_params)

            # Initialize TransformerStack output layer norm
            if p.ln_output:
                params = p.ln_tpl.Copy()
                # Keeping historic 'enc_out_ln' name for checkpoint compatibility.
                params.name = 'enc_out_ln'
                params.input_dim = p.model_dim
                self.CreateChild('layer_norm_out', params)

            if p.is_transparent:
                transparent_params = []
                if not p.num_transparent_outputs:
                    raise ValueError(
                        'num_transparent_outputs should be greater than 0.')
                for i in range(p.num_transparent_outputs):
                    transparent_param = p.transparent_merger_tpl.Copy()
                    transparent_param.name = 'transparent_%d' % i
                    transparent_param.num_sources = 1 + len(
                        transformer_layer_params)
                    transparent_params.append(transparent_param)
                self.CreateChildren('transparent_merger', transparent_params)
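When p.transformer_tpl is a list, the loop above interleaves the templates, which is why num_transformer_layers must be divisible by the number of templates. A pure-Python sketch of the resulting order:

templates = ['A', 'B']          # stand-ins for two transformer templates
num_transformer_layers = 4
stack = []
for _ in range(num_transformer_layers // len(templates)):
  for t in templates:
    stack.append(t)
assert stack == ['A', 'B', 'A', 'B']  # named trans_0 .. trans_3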
Example #21
 def __init__(self, params):
   super(GraphLayer, self).__init__(params)
   p = self.params
   assert p.name
   assert p.input_endpoints
   with tf.variable_scope(p.name):
     self._seq = []
     for i, (signature, sub) in enumerate(p.sub):
       assert signature
       sig = GraphSignature(signature)
       assert sig.outputs, '{}'.format(signature)
       name = sub.name
       if not name:
         name = '%s_%02d' % (sig.outputs[0], i)
         sub.name = name
       self.CreateChild(name, sub)
       self._seq.append((name, sig, self.children[name]))
Example #22
    def __init__(self, params):
        super(DeterministicWeightsLayer, self).__init__(params)
        p = self.params
        if not p.name:
            raise ValueError('Layer must have a specified name!')

        assert p.num_sources > 0, ('Must specify num_sources > 0.')
        params_init = py_utils.WeightInit.Constant(0.0)
        # Weights to be learned.
        pw = py_utils.WeightParams(
            shape=[p.num_sources],
            init=params_init,
            dtype=p.dtype,
            collections=[self.__class__.__name__ + '_vars'])
        with tf.variable_scope(p.name):
            self.CreateVariable('sum_weight', pw)
        p.dropout_tpl.name = 'dropout'
        self.CreateChild('weighted_merger_dropout', p.dropout_tpl)
Example #23
 def __init__(self, params):
   super(SequentialLayer, self).__init__(params)
   p = self.params
   assert p.name
   with tf.variable_scope(p.name):
     if p.repeat <= 1:
       self._seq = []
       for sub in p.sub:
         self.CreateChild(sub.name, sub)
         self._seq.append((sub.name, self.children[sub.name]))
     else:
       # We create 'repeat' number of sub layers. Each sub layer is a
       # sequential layer specified by 'sub'.  This allows us to name each
       # repetition with a unique name.
       children = []
       for i in range(p.repeat):
         children.append(p.Copy().Set(name='%03d' % i, repeat=1))
       self.CreateChildren('rep', children)
Example #24
    def __init__(self, params):
        super(BatchNormLayerNoPadding, self).__init__(params)
        p = self.params
        assert p.name, 'Name of BatchNormLayerNoPadding is not set.'
        p.fprop_dtype = None

        # Skip L-P regularization for these variables.
        collections = [
            self.__class__.__name__ + '_vars', py_utils.SKIP_LP_REGULARIZATION
        ]
        pc = py_utils.WeightParams(shape=[p.dim],
                                   init=py_utils.WeightInit.Constant(0.0),
                                   dtype=p.dtype,
                                   collections=collections)

        with tf.variable_scope(p.name):
            self.CreateVariable('beta', pc)
            # Note: the gamma actually applied is 1 + gamma.
            self.CreateVariable('gamma', pc, lambda x: 1.0 + x)

            moving_collections = [
                'moving_vars', tf.GraphKeys.MOVING_AVERAGE_VARIABLES,
                self.__class__.__name__ + '_vars'
            ]
            mva = py_utils.WeightParams(shape=[p.dim],
                                        init=py_utils.WeightInit.Constant(0.0),
                                        dtype=p.dtype,
                                        collections=moving_collections)
            # Two statistics computed from sufficient stats.
            self.CreateVariable('moving_mean', mva, trainable=False)
            mvv = py_utils.WeightParams(shape=[p.dim],
                                        init=py_utils.WeightInit.Constant(1.0),
                                        dtype=p.dtype,
                                        collections=moving_collections)
            self.CreateVariable('moving_variance', mvv, trainable=False)

        # Accumulate bn sufficient stats over micro-batches.
        dim = self.vars.beta.shape[0]
        self.RegisterAccumulator('counts', AddingAccumulator([], p.dtype))
        self.RegisterAccumulator('mean_ss', AddingAccumulator([dim], p.dtype))
        self.RegisterAccumulator('variance_ss',
                                 AddingAccumulator([dim], p.dtype))
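The counts / mean_ss / variance_ss accumulators above typically hold sufficient statistics (element count, sum of x, sum of x squared) aggregated over micro-batches. Recovering the batch moments from them is straightforward; a minimal sketch (tf.nn.normalize_moments performs an equivalent computation):

import tensorflow.compat.v1 as tf

def moments_from_sufficient_stats(counts, mean_ss, variance_ss):
  mean = mean_ss / counts
  variance = variance_ss / counts - tf.square(mean)
  return mean, variance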
Example #25
    def __init__(self, params):
        super(MTEncoderUniRNN, self).__init__(params)
        p = self.params
        assert not p.packed_input, ('Packed inputs are not yet supported for '
                                    'MTEncoderUniRNN.')

        with tf.variable_scope(p.name):
            if p.cc_schedule is None:
                self.cc_schedule = None
            else:
                self.CreateChild('cc_schedule', p.cc_schedule)

            self.CreateChild('emb', p.emb)

            rnn_layers_params = []

            num_input_nodes = p.emb.embedding_dim
            for i in range(p.num_lstm_layers):
                cell = p.lstm_tpl.Copy()
                cell.name = 'L%d_rnn' % i
                cell.num_input_nodes = num_input_nodes
                cell.num_output_nodes = p.lstm_cell_size
                params = model_helper.CreateUnidirectionalRNNParams(
                    self.params, cell)
                params.name = 'L%d' % i
                rnn_layers_params.append(params)
                num_input_nodes = cell.num_output_nodes

            self.CreateChildren('rnn', rnn_layers_params)

            dropout_p = layers.DropoutLayer.Params().Set(
                name='dropout_layer',
                keep_prob=1.0 - p.dropout_prob,
                random_seed=(p.random_seed +
                             827366448) if p.random_seed else None)
            self.CreateChild('dropout', dropout_p)

            if p.is_transparent:
                transparent_params = p.transparent_merger_tpl.Copy()
                transparent_params.name = 'transparent'
                transparent_params.num_sources = p.num_lstm_layers
                self.CreateChild('transparent_merger', transparent_params)
Example #26
    def __init__(self, params):
        super(QuantizableLayer, self).__init__(params)
        p = self.params

        self._tracked_tensors = dict()  # tracked t_name -> (QDomain)
        self._qstate = None  # t_name -> Tensor

        # Instantiate quantization domains.
        with tf.variable_scope(p.name + '/q'):
            self._qdomains = dict()  # Dict of qdname -> QDomain or None
            for qdname in dir(p.qdomain):
                qdparams = p.qdomain.Get(qdname)
                if qdparams is None:
                    continue
                assert issubclass(qdparams.cls, QDomain), (
                    'Expected quantized domain %s to extend QDomain' % qdname)
                qdchild_name = 'qdomain_' + qdname
                self.CreateChild(qdchild_name, qdparams)
                self._qdomains[qdname] = self.children[qdchild_name]
        self._AddQuantizationFunctions()
Example #27
 def __init__(self, params):
   super(SoftCondLayer, self).__init__(params)
   p = self.params
   assert p.name
   assert p.num_experts
   assert p.cond_dim
   with tf.variable_scope(p.name):
     # Create Variables for task weight mapping.
     collections = [
         self.__class__.__name__ + '_vars',
     ]
     w_p = py_utils.WeightParams(
         shape=[p.cond_dim, p.num_experts],
         init=p.params_init,  # TODO(huangyp): try zero init instead.
         dtype=p.dtype,
         collections=collections)
     self.CreateVariable('w', w_p)
     # Prepends p.num_experts to the tensor shape of every variable created
     # by p.body.
     with py_utils.VariableShapePrefixContext(p.num_experts):
       self.CreateChild('body', p.body)
Example #28
    def __init__(self, params):
        super(DevBasedSchedule, self).__init__(params)

        p = self.params

        with tf.variable_scope(p.name):
            wp = py_utils.WeightParams(shape=[],
                                       init=py_utils.WeightInit.Constant(1.0),
                                       collections=['DevBasedSchedule_vars'],
                                       dtype=tf.float32)
            _, self._cur_factor, = py_utils.CreateVariable('cur_factor',
                                                           wp,
                                                           trainable=False)
            wp = py_utils.WeightParams(shape=[],
                                       init=py_utils.WeightInit.Constant(0),
                                       collections=['DevBasedSchedule_vars'],
                                       dtype=tf.int64)
            _, self._ref_step, = py_utils.CreateVariable('ref_step',
                                                         wp,
                                                         trainable=False)

            self._metric_history = early_stop.MetricHistory(p.metric_history)
            self._best_step = ops.best_step(self._metric_history.hist_file,
                                            p.tolerance)
Example #29
    def Apply(self, lr, var_grad):
        """Applies the gradient to the variable.

        Args:
          lr: A scalar. The base learning rate.
          var_grad: A `.NestedMap` of (var, grad) pairs.

        Returns:
          The variable update op.
        """
        optimizer = self.GetOptimizer(lr)

        def _Apply():
            if self.params.use_bf16_gradients_ar:
                return optimizer.apply_gradients(
                    [(tf.cast(g, tf.float32), v)
                     for (v, g) in var_grad.Flatten()],
                    name='meta_backprop')
            else:
                return optimizer.apply_gradients(
                    [(g, v) for (v, g) in var_grad.Flatten()],
                    name='meta_backprop')

        if not py_utils.use_resource_variables():
            var_update_op = _Apply()
        else:
            # Many optimizers, e.g., Adam, Adagrad, etc., create
            # variables. We need to ensure name scope and variable scope are
            # cleared. Otherwise, tpu.batch_parallel does not work.
            with tf.name_scope(None):
                with tf.variable_scope(
                        tf.VariableScope(use_resource=True,
                                         reuse=self.VarReuseForSlotVars())):
                    var_update_op = _Apply()
        self.AddSummary(lr, optimizer, var_grad)
        return var_update_op
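Note the swap inside _Apply above: the flattened var_grad entries unpack as (variable, gradient), while apply_gradients expects (gradient, variable) pairs. A minimal plain-TF1 illustration with hypothetical values:

import tensorflow.compat.v1 as tf

v = tf.get_variable('v', shape=[2], initializer=tf.zeros_initializer())
g = tf.ones_like(v)                        # a stand-in gradient
opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)
update_op = opt.apply_gradients([(g, v)])  # (grad, var) order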
Example #30
    def __init__(self, params):
        p = params.Copy()
        num_layers = p.num_encoder_layers + p.num_decoder_layers

        if isinstance(p.splits, (list, tuple)):
            assert p.splits[-1] == num_layers
            for i, j in zip(p.splits[:-1], p.splits[1:]):
                assert i <= j, 'Splits must be in increasing order.'
        else:
            num_splits = p.splits
            layers_per_split = (num_layers - 1) // num_splits + 1
            p.splits = []
            for i in range(num_splits):
                p.splits.append((i + 1) * layers_per_split)
            p.splits[-1] = num_layers

        with tf.variable_scope(p.name):
            transformers = []

            if p.is_transparent:
                p.transparent_merger_tpl.num_sources = p.num_encoder_layers + 1
                p.transparent_merger_tpl.dropout_tpl.keep_prob = (
                    1 - p.transparent_merger_dropout_prob)

            # Encoder Embedding layer.
            if len(p.splits) > 1 or p.num_micro_batches > 1:
                p.emb_tpl.dropout_tpl = layers.DeterministicDropoutLayer.Params()
            p.emb_tpl.packed_input = p.packed_input
            p.emb_tpl.is_transparent = p.is_transparent
            p.emb_tpl.add_tgt_embedding_layer = (p.num_decoder_layers > 0)
            p.emb_tpl.name = 'emb'
            p.emb_tpl.batch_dim = p.batch_dim
            transformers.append(p.emb_tpl)
            if p.softmax_tpl:
                p.softmax_tpl.name = 'softmax'
                p.softmax_tpl.inputs_from_decoder = p.num_decoder_layers > 0
            # Encoder layers.
            for i in range(p.num_encoder_layers):
                params = p.encoder_tpl.Copy()
                params.name = 'encoder_%d' % (i)
                if p.is_transparent:
                    params.is_transparent = p.is_transparent
                    params.final_enc_layer = (i == (p.num_encoder_layers - 1))
                if p.normalize_encoder and (i == (p.num_encoder_layers - 1)):
                    params.normalize_output = p.normalize_encoder
                    params.final_enc_layer = (i == (p.num_encoder_layers - 1))
                if p.packed_input:
                    params.packed_input = p.packed_input
                # Use DeterministicDropoutLayer when used in temp graphs.
                if len(p.splits) > 1 or p.num_micro_batches > 1:
                    params = params.cls.SetupDeterministicDropout(params)
                assert not params.has_aux_atten
                if p.is_transparent and i == 0:
                    params.transparent_merger_tpl = p.transparent_merger_tpl.Copy()
                transformers.append(params)

            # Decoder layers.
            for i in range(p.num_decoder_layers):
                params = p.decoder_tpl.Copy()
                params.name = 'decoder_%d' % (i)
                params.mask_self_atten = True
                if p.packed_input:
                    params.packed_input = p.packed_input
                if len(p.splits) > 1 or p.num_micro_batches > 1:
                    params = params.cls.SetupDeterministicDropout(params)
                assert params.has_aux_atten
                transformers.append(params)
            cells = []
            cell_start = 0
            # To account for embedding layers in the pipeline.
            offset = 1
            for split, cell_end in enumerate(p.splits):
                # Layer 0 (embeddings) is always in split 0.
                sub = transformers[cell_start:(cell_end + offset)]
                if split == len(p.splits) - 1 and p.softmax_tpl:
                    sub.append(p.softmax_tpl)
                cell = FeatureExtractionLayer.Params().Set(
                    name='cell_{}'.format(split), sub=sub)
                cells.append(cell)
                cell_start = cell_end + offset
            p.cell_tpl = cells
        super(GPipeTransformerStack, self).__init__(p)

        if p.label_smoothing:
            self.CreateChild('smoother', p.label_smoothing)
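In the last example, when p.splits is given as an integer it is expanded into cumulative layer boundaries of roughly equal size, with the final boundary clamped to the total layer count. A pure-Python sketch of that computation with hypothetical sizes:

num_layers = 10   # e.g. num_encoder_layers + num_decoder_layers
num_splits = 4
layers_per_split = (num_layers - 1) // num_splits + 1                # -> 3
splits = [(i + 1) * layers_per_split for i in range(num_splits)]     # [3, 6, 9, 12]
splits[-1] = num_layers                                              # [3, 6, 9, 10]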