Example #1
    def __init__(self, params):
        super(TransformerBatchMajorEncoder, self).__init__(params)
        p = self.params

        assert p.output_data_format in ('TBC', 'BTC')

        if p.shared_emb:
            with tf.variable_scope('shared_emb', reuse=tf.AUTO_REUSE):
                self.CreateChild('softmax', p.shared_emb)

        with tf.variable_scope(p.name):
            p.token_emb.dtype = p.dtype
            if not p.shared_emb:
                self.CreateChild('token_emb', p.token_emb)
            self.CreateChild('position_emb', p.position_emb)

            dropout_tpl = p.input_dropout_tpl.Copy()
            dropout_tpl.keep_prob = (1.0 - p.input_dropout_prob)
            self.CreateChild('input_dropout', dropout_tpl)

            if p.transformer_stack:
                self.CreateChild('transformer_stack', p.transformer_stack)

            if p.final_layer_norm:
                layer_norm_p = layers.LayerNorm.Params().Set(
                    name='final_ln',
                    input_dim=p.model_dim,
                    use_fused_layernorm=p.use_fused_layernorm,
                    fprop_dtype=p.input_dropout_tpl.fprop_dtype)
                self.CreateChild('final_ln', layer_norm_p)
Example #2
 def __init__(self, params):
     super(StackedRevNetLayer, self).__init__(params)
     p = params
     assert p.name
     assert p.sub_layer_params
     with tf.variable_scope(p.name):
         self.CreateChildren('sub_layers', p.sub_layer_params)
Example #3
    def __init__(self, params):
        super(DepthwiseConv2DLayer, self).__init__(params)
        p = self.params
        assert p.name
        w_pc = py_utils.WeightParams(
            shape=p.filter_shape,
            init=p.params_init,
            dtype=p.dtype,
            collections=[self.__class__.__name__ + '_vars'])

        with tf.variable_scope(p.name):
            self.CreateVariable('w', w_pc)
            if p.weight_norm:
                self.CreateVariable(
                    'g',
                    py_utils.WeightParams(
                        shape=[p.filter_shape[2], p.filter_shape[3]],
                        init=py_utils.WeightInit.Constant(0.0),
                        dtype=p.dtype,
                        collections=[self.__class__.__name__ + '_vars']))
            if p.bias:
                # NOTE(jiahuiyu): bias is subject to LP regularization in this version.
                self.CreateVariable(
                    'b',
                    py_utils.WeightParams(
                        shape=[self.output_channels],
                        init=py_utils.WeightInit.Constant(0.0),
                        dtype=p.dtype,
                        collections=[self.__class__.__name__ + '_vars']))
Example #4
 def __init__(self, params):
     super(GraphStep, self).__init__(params)
     p = self.params
     assert p.name
     with tf.variable_scope(p.name):
         self._seq = []
         for i, (signature, external_signature,
                 sub_params) in enumerate(p.sub):
             assert signature
             sig = builder_layers.GraphSignature(signature)
             assert len(sig.inputs) == 1
             assert sig.outputs
             external_sig = None
             if external_signature:
                 external_sig = builder_layers.GraphSignature(
                     external_signature)
                 assert len(external_sig.inputs) == 1
                 assert not external_sig.outputs
             name = sub_params.name
             if not name:
                 name = '%s_%02d' % (sig.outputs[0], i)
                 sub_params.name = name
             self.CreateChild(name, sub_params)
             self._seq.append(
                 GraphStep._seq(name, sig, external_sig,
                                self.children[name]))
         self.output_signature = builder_layers.GraphSignature(
             p.output_signature)
Example #5
 def __init__(self, params):
     super(AttentionBlockStep, self).__init__(params)
     p = self.params
     name = p.name
     with tf.variable_scope(name):
         self.CreateChild('query_generator', p.query_generator)
         self.CreateChild('attention', p.attention)
Example #6
 def __init__(self, params):
     super(RepeatLayer, self).__init__(params)
     p = self.params
     assert p.name
     assert p.repeat > 0
     with tf.variable_scope(p.name):
         with py_utils.VariableShapePrefixContext(p.repeat):
             self.CreateChild('body', p.body)
Example #7
 def __init__(self, params):
     super(RevNetLayer, self).__init__(params)
     p = params
     assert p.name
     assert p.f_params
     assert p.g_params
     with tf.variable_scope(p.name):
         self.CreateChild('f_block', p.f_params)
         self.CreateChild('g_block', p.g_params)
Example #8
 def __init__(self, params):
     super(ParallelLayer, self).__init__(params)
     p = self.params
     assert p.name
     self._seq = []
     with tf.variable_scope(p.name):
         for sub in p.sub:
             self.CreateChild(sub.name, sub)
             self._seq.append((sub.name, self.children[sub.name]))
Example #9
  def __init__(self, params):
    super(BatchNormLayer, self).__init__(params)
    p = self.params
    assert p.name

    pc = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(0.0),
        dtype=p.dtype,
        collections=[self.__class__.__name__ + '_vars'])

    with tf.variable_scope(p.name):
      if not p.use_moving_avg_in_training:
        self.CreateVariable('beta', pc)
        if p.gamma_zero_init:
          # Zero-initialize the BN gamma.
          self.CreateVariable('gamma', pc)
        else:
          # Note: the gamma actually applied is 1 + gamma.
          self.CreateVariable('gamma', pc, lambda x: 1.0 + x)

      # The two BN statistics: moving mean and moving variance.
      moving_collections = ['moving_vars', self.__class__.__name__ + '_vars']
      if p.add_stats_to_moving_average_variables:
        moving_collections += [tf.GraphKeys.MOVING_AVERAGE_VARIABLES]
      elif p.add_stats_to_moving_average_variables is None:
        # TODO(rpang): force all models to set this param explicitly.
        tf.logging.warning(
            'BatchNormLayer.add_stats_to_moving_average_variables should be '
            'set to True for new models, and to False explicitly for '
            'checkpoint compatibility.')
      # Add to the MOVING_AVERAGE_VARIABLES collection so that they are returned
      # by tf.moving_average_variables() and included in EMA variables if
      # ema_decay is enabled.
      mva = py_utils.WeightParams(
          shape=[p.dim],
          init=py_utils.WeightInit.Constant(0.0),
          dtype=p.dtype,
          collections=moving_collections)
      self.CreateVariable(
          'moving_mean',
          mva,
          trainable=False,
          aggregation=tf.VariableAggregation.MEAN)

      mvv = py_utils.WeightParams(
          shape=[p.dim],
          init=py_utils.WeightInit.Constant(1.0),
          dtype=p.dtype,
          collections=moving_collections)
      self.CreateVariable(
          'moving_variance',
          mvv,
          trainable=False,
          aggregation=tf.VariableAggregation.MEAN)
    self._epsilon = 0.001
    self._decay = p.decay
Example #10
 def _CreateQStateVar(self, t_name, suffix, params):
     name = t_name + '_' + suffix
     assert name not in self._qvars, 'QState var already exists: %s' % (
         name, )
     var_name = self._qvars_scope.name + '/' + name
     with tf.variable_scope(py_utils.GetGlobalVariableScope()):
         _, v = py_utils.CreateVariable(var_name, params, trainable=False)
     self._qvars[name] = v
     return v
Example #11
    def __init__(self, params):
        super(TransformerEncoder, self).__init__(params)
        p = self.params

        if p.shared_emb:
            with tf.variable_scope('shared_emb', reuse=tf.AUTO_REUSE):
                # Naming this 'softmax' to match the name of the same component in the
                # decoder. Variable names need to be the same in order to be reused.
                self.CreateChild('softmax', p.shared_emb)

        with tf.variable_scope(p.name):
            assert p.token_emb.embedding_dim == p.position_emb.embedding_dim
            p.transformer_stack.Set(model_dim=p.model_dim,
                                    packed_input=p.packed_input)
            if p.model_dim != p.token_emb.embedding_dim:
                tf.logging.warning(
                    'token_emb.embedding_dim != model_dim (%s vs. %s), '
                    'creating a projection!', p.token_emb.embedding_dim,
                    p.model_dim)
                proj_p = layers.ProjectionLayer.Params().Copy()
                proj_p.name = 'emb_proj'
                proj_p.input_dim = p.token_emb.embedding_dim
                proj_p.output_dim = p.model_dim
                proj_p.batch_norm = True
                self.CreateChild('emb_proj', proj_p)

            # Token embeddings
            if not p.shared_emb:
                p.token_emb.dtype = p.dtype
                self.CreateChild('token_emb', p.token_emb)

            # Positional embeddings
            self.CreateChild('position_emb', p.position_emb)

            # Task embeddings.
            if p.task_emb:
                assert p.task_emb.embedding_dim == p.token_emb.embedding_dim
                self.CreateChild('task_emb', p.task_emb)

            dropout_tpl = layers.DropoutLayer.Params()
            dropout_tpl.keep_prob = (1.0 - p.input_dropout_prob)
            self.CreateChild('input_dropout', dropout_tpl)

        p.transformer_stack.name = p.name
        self.CreateChild('transformer_stack', p.transformer_stack)
Example #12
    def __init__(self, params):
        super(MTBaseModel, self).__init__(params)
        p = self.params

        with tf.variable_scope(p.name):
            with self._EncoderDevice():
                if p.encoder:
                    self.CreateChild('enc', p.encoder)
            with self._DecoderDevice():
                self.CreateChild('dec', p.decoder)
Example #13
    def __init__(self, params):
        super(PassiveAsymQDomain, self).__init__(params)
        p = self.params

        self._t_names = set()  # set of known t_name (from CreateTensor)
        self._qvars = py_utils.NestedMap()  # var_name -> tf.Variable

        # Save a scope for lazily created variables.
        with tf.variable_scope(p.name + '/q'):
            self._qvars_scope = tf.get_variable_scope()
Example #14
 def __init__(self, params):
     super(LinearLayer, self).__init__(params)
     p = self.params
     with tf.variable_scope(p.name):
         self.CreateVariable(
             'w',
             py_utils.WeightParams(
                 shape=[p.input_dims, p.output_dims],
                 init=p.params_init,
                 dtype=p.dtype,
                 collections=[self.__class__.__name__ + '_vars']))
Example #15
 def __init__(self, params):
     super(BiasLayer, self).__init__(params)
     p = self.params
     with tf.variable_scope(p.name):
         self.CreateVariable(
             'b',
             py_utils.WeightParams(
                 shape=[p.dims],
                 init=py_utils.WeightInit.Constant(0.0),
                 dtype=p.dtype,
                 collections=[self.__class__.__name__ + '_vars']))
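The LinearLayer and BiasLayer examples above are the minimal form of the pattern: open tf.variable_scope(p.name) and create a single weight with self.CreateVariable. As a usage sketch only (not part of the examples), a layer like this would normally be built from its Params; the import path below and the assumption that only name and dims need to be set are ours, not the source's.

# Usage sketch (assumptions noted above): build the BiasLayer of Example #15
# from its Params and instantiate it, which runs the __init__ shown above.
from lingvo.core import builder_layers  # Assumed location of BiasLayer.

bias_p = builder_layers.BiasLayer.Params().Set(name='bias', dims=128)
bias_layer = bias_p.Instantiate()  # Creates variable 'b' under scope 'bias'.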
Example #16
    def __init__(self, params):
        super(Learner, self).__init__(params)
        p = self.params

        self._var_grads = None
        self._eval_metrics = {}
        if p.grad_norm_tracker:
            # Use parent's name for backwards compatibility.
            with tf.variable_scope(self.parent.params.name):
                self.CreateChild('grad_norm_tracker', p.grad_norm_tracker)
        self.CreateChild('lr_schedule', p.lr_schedule)
        self.CreateChild('optimizer', p.optimizer)
Example #17
 def __init__(self, params):
   super(IdentityRegressionTask, self).__init__(params)
   with tf.variable_scope('IdentityRegressionTask'):
     self.CreateVariable(
         'm',
         py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Uniform()))
     self.CreateVariable(
         'b',
         py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Uniform()))
   self.global_steps = []
   self.metrics = []
   self.result_per_example_tensors = []
Example #18
        def _Acc(vg):
            """Updating accumulators."""

            v, g = vg
            with tf.variable_scope(v.op.name):
                _, a = py_utils.CreateVariable(
                    'grad_accumulator',
                    py_utils.WeightParams(v.get_shape(),
                                          py_utils.WeightInit.Constant(0.0),
                                          self.params.dtype),
                    trainable=False)
                a = tf.assign_add(a, g)

            return py_utils.VarGrad(v, a)
Example #19
    def __init__(self, params):
        super(TransformerStack, self).__init__(params)
        p = self.params

        with tf.variable_scope(p.name):
            # Add transformer layers.
            transformer_layer_params = []
            denom = 1
            if isinstance(p.transformer_tpl, list):
                denom = len(p.transformer_tpl)
                assert p.num_transformer_layers % len(p.transformer_tpl) == 0
            for i in range(p.num_transformer_layers // denom):
                if isinstance(p.transformer_tpl, list):
                    for q in p.transformer_tpl:
                        params = q.Copy()
                        transformer_layer_params.append(params)
                else:
                    params = p.transformer_tpl.Copy()
                    transformer_layer_params.append(params)

            for i, params in enumerate(transformer_layer_params):
                params.name = 'trans_%d' % (i)
                params.source_dim = p.model_dim
                params.packed_input = p.packed_input
                params.has_aux_atten = p.has_aux_attention
                params.mask_self_atten = p.mask_self_atten

            self.CreateChildren('trans', transformer_layer_params)

            # Initialize TransformerStack output layer norm
            if p.ln_output:
                params = p.ln_tpl.Copy()
                # Keeping historic 'enc_out_ln' name for checkpoint compatibility.
                params.name = 'enc_out_ln'
                params.input_dim = p.model_dim
                self.CreateChild('layer_norm_out', params)

            if p.is_transparent:
                transparent_params = []
                if not p.num_transparent_outputs:
                    raise ValueError(
                        'num_transparent_outputs should be greater than 0.')
                for i in range(p.num_transparent_outputs):
                    transparent_param = p.transparent_merger_tpl.Copy()
                    transparent_param.name = 'transparent_%d' % i
                    transparent_param.num_sources = 1 + len(
                        transformer_layer_params)
                    transparent_params.append(transparent_param)
                self.CreateChildren('transparent_merger', transparent_params)
Example #20
 def __init__(self, params):
     super(GraphLayer, self).__init__(params)
     p = self.params
     assert p.name
     assert p.input_endpoints
     with tf.variable_scope(p.name):
         self._seq = []
         for i, (signature, sub) in enumerate(p.sub):
             assert signature
             sig = GraphSignature(signature)
             assert sig.outputs, '{}'.format(signature)
             name = sub.name
             if not name:
                 name = '%s_%02d' % (sig.outputs[0], i)
                 sub.name = name
             self.CreateChild(name, sub)
             self._seq.append((name, sig, self.children[name]))
Example #21
 def __init__(self, params):
     super(SequentialLayer, self).__init__(params)
     p = self.params
     assert p.name
     with tf.variable_scope(p.name):
         if p.repeat <= 1:
             self._seq = []
             for sub in p.sub:
                 self.CreateChild(sub.name, sub)
                 self._seq.append((sub.name, self.children[sub.name]))
         else:
             # Create 'repeat' copies of this sequential layer (each with
             # repeat=1 over the same 'sub' list), so that each repetition
             # gets a unique name.
             children = []
             for i in range(p.repeat):
                 children.append(p.Copy().Set(name='%03d' % i, repeat=1))
             self.CreateChildren('rep', children)
Example #22
  def __init__(self, params):
    super(BatchNormLayerNoPadding, self).__init__(params)
    p = self.params
    assert p.name, 'Name of BatchNormLayerNoPadding is not set.'
    p.fprop_dtype = None

    # Skip L-P regularization for these variables.
    collections = [
        self.__class__.__name__ + '_vars', py_utils.SKIP_LP_REGULARIZATION
    ]
    pc = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(0.0),
        dtype=p.dtype,
        collections=collections)

    with tf.variable_scope(p.name):
      self.CreateVariable('beta', pc)
      # Note: the gamma actually applied is 1 + gamma.
      self.CreateVariable('gamma', pc, lambda x: 1.0 + x)

      moving_collections = [
          'moving_vars', tf.GraphKeys.MOVING_AVERAGE_VARIABLES,
          self.__class__.__name__ + '_vars'
      ]
      mva = py_utils.WeightParams(
          shape=[p.dim],
          init=py_utils.WeightInit.Constant(0.0),
          dtype=p.dtype,
          collections=moving_collections)
      # Two statistics computed from sufficient stats.
      self.CreateVariable('moving_mean', mva, trainable=False)
      mvv = py_utils.WeightParams(
          shape=[p.dim],
          init=py_utils.WeightInit.Constant(1.0),
          dtype=p.dtype,
          collections=moving_collections)
      self.CreateVariable('moving_variance', mvv, trainable=False)

    # Accumulate bn sufficient stats over micro-batches.
    dim = self.vars.beta.shape[0]
    self.RegisterAccumulator('counts', AddingAccumulator([], p.dtype))
    self.RegisterAccumulator('mean_ss', AddingAccumulator([dim], p.dtype))
    self.RegisterAccumulator('variance_ss', AddingAccumulator([dim], p.dtype))
Example #23
    def __init__(self, params):
        super(MTEncoderUniRNN, self).__init__(params)
        p = self.params
        assert not p.packed_input, ('Packed inputs are not yet supported for '
                                    'MTEncoderUniRNN.')

        with tf.variable_scope(p.name):
            if p.cc_schedule is None:
                self.cc_schedule = None
            else:
                self.CreateChild('cc_schedule', p.cc_schedule)

            self.CreateChild('emb', p.emb)

            rnn_layers_params = []

            num_input_nodes = p.emb.embedding_dim
            for i in range(p.num_lstm_layers):
                cell = p.lstm_tpl.Copy()
                cell.name = 'L%d_rnn' % i
                cell.num_input_nodes = num_input_nodes
                cell.num_output_nodes = p.lstm_cell_size
                params = model_helper.CreateUnidirectionalRNNParams(
                    self.params, cell)
                params.name = 'L%d' % i
                rnn_layers_params.append(params)
                num_input_nodes = cell.num_output_nodes

            self.CreateChildren('rnn', rnn_layers_params)

            dropout_p = layers.DropoutLayer.Params().Set(
                name='dropout_layer',
                keep_prob=1.0 - p.dropout_prob,
                random_seed=((p.random_seed + 827366448)
                             if p.random_seed else None))
            self.CreateChild('dropout', dropout_p)

            if p.is_transparent:
                transparent_params = p.transparent_merger_tpl.Copy()
                transparent_params.name = 'transparent'
                transparent_params.num_sources = p.num_lstm_layers
                self.CreateChild('transparent_merger', transparent_params)
Example #24
    def __init__(self, params):
        super(QuantizableLayer, self).__init__(params)
        p = self.params

        self._tracked_tensors = dict()  # tracked t_name -> (QDomain)
        self._qstate = None  # t_name -> Tensor

        # Instantiate quantization domains.
        with tf.variable_scope(p.name + '/q'):
            self._qdomains = dict()  # Dict of qdname -> QDomain or None
            for qdname in dir(p.qdomain):
                qdparams = p.qdomain.Get(qdname)
                if qdparams is None:
                    continue
                assert issubclass(qdparams.cls, QDomain), (
                    'Expected quantized domain %s to extend QDomain' % qdname)
                qdchild_name = 'qdomain_' + qdname
                self.CreateChild(qdchild_name, qdparams)
                self._qdomains[qdname] = self.children[qdchild_name]
        self._AddQuantizationFunctions()
Example #25
 def __init__(self, params):
     super(SoftCondLayer, self).__init__(params)
     p = self.params
     assert p.name
     assert p.num_experts
     assert p.cond_dim
     with tf.variable_scope(p.name):
         # Create Variables for task weight mapping.
         collections = [
             self.__class__.__name__ + '_vars',
         ]
         w_p = py_utils.WeightParams(
             shape=[p.cond_dim, p.num_experts],
             init=p.params_init,  # TODO(huangyp): try zero init instead.
             dtype=p.dtype,
             collections=collections)
         self.CreateVariable('w', w_p)
         # Prepends p.num_experts to the tensor shape of every variable created
         # by p.body.
         with py_utils.VariableShapePrefixContext(p.num_experts):
             self.CreateChild('body', p.body)
Example #26
 def CreateChildrenHelper(params_list, child_scopes):
     """Helper to create children recursively."""
     if child_scopes and len(child_scopes) != len(params_list):
         raise ValueError(
             'child_scopes must have the same structure as params_list.')
     children = []
     for i, p in enumerate(params_list):
         if isinstance(p, list):
             children.append(
                 CreateChildrenHelper(
                     p, child_scopes[i] if child_scopes else None))
         else:
             p = self.CopyBaseParams(self.params, p.Copy())
             if not p.name:
                 p.name = '%s_%d' % (name, i)
             if child_scopes:
                 with tf.variable_scope(child_scopes[i]):
                     children.append(p.Instantiate())
             else:
                 children.append(p.Instantiate())
     return children
Example #27
    def __init__(self, params):
        super(DevBasedSchedule, self).__init__(params)

        p = self.params

        with tf.variable_scope(p.name):
            wp = py_utils.WeightParams(shape=[],
                                       init=py_utils.WeightInit.Constant(1.0),
                                       collections=['DevBasedSchedule_vars'],
                                       dtype=tf.float32)
            _, self._cur_factor, = py_utils.CreateVariable('cur_factor',
                                                           wp,
                                                           trainable=False)
            wp = py_utils.WeightParams(shape=[],
                                       init=py_utils.WeightInit.Constant(0),
                                       collections=['DevBasedSchedule_vars'],
                                       dtype=tf.int64)
            _, self._ref_step, = py_utils.CreateVariable('ref_step',
                                                         wp,
                                                         trainable=False)

            self._metric_history = early_stop.MetricHistory(p.metric_history)
            self._best_step = ops.best_step(self._metric_history.hist_file,
                                            p.tolerance)
Example #28
    def Apply(self, lr, var_grad):
        """Applies the gradient to the variable.

    Args:
      lr: A scalar. The base learning rate.
      var_grad: A `.NestedMap` of (var, grad) pairs.

    Returns:
      The variable update op.
    """
        optimizer = self.GetOptimizer(lr)

        def _Apply():
            if self.params.use_bf16_gradients_ar:
                return optimizer.apply_gradients(
                    [(tf.cast(g, tf.float32), v)
                     for (v, g) in var_grad.Flatten()],
                    name='meta_backprop')
            else:
                return optimizer.apply_gradients(
                    [(g, v) for (v, g) in var_grad.Flatten()],
                    name='meta_backprop')

        if not py_utils.use_resource_variables():
            var_update_op = _Apply()
        else:
            # Many optimizers, e.g., Adam, Adagrad, etc., create
            # variables. We need to ensure name scope and variable scope are
            # cleared. Otherwise, tpu.batch_parallel does not work.
            with tf.name_scope(None):
                with tf.variable_scope(
                        tf.VariableScope(use_resource=True,
                                         reuse=self.VarReuseForSlotVars())):
                    var_update_op = _Apply()
        self.AddSummary(lr, optimizer, var_grad)
        return var_update_op
Example #29
    def __init__(self, params):
        super(MTEncoderV1, self).__init__(params)
        p = self.params
        assert not p.packed_input, ('Packed inputs are not yet supported for '
                                    'MTEncoderV1.')

        with tf.variable_scope(p.name):
            if p.cc_schedule is not None:
                self.CreateChild('cc_schedule', p.cc_schedule)

            self.CreateChild('emb', p.emb)

            rnn_layers_params = []

            # L0 is a bi-directional lstm.

            # L0's forward lstm cell
            if p.lstm_tpl_bidi is None:
                params = p.lstm_tpl.Copy()
            else:
                params = p.lstm_tpl_bidi.Copy()
            params.name = 'L0_rnn_fwd'
            params.num_input_nodes = p.emb.embedding_dim
            params.num_output_nodes = p.lstm_cell_size
            forward_lstm = params

            # L0's backward lstm cell
            params = params.Copy()
            params.name = 'L0_rnn_bak'
            backward_lstm = params

            # L0 layer.
            params = model_helper.CreateBidirectionalRNNParams(
                self.params, forward_lstm, backward_lstm)
            params.name = 'L0'
            rnn_layers_params.append(params)

            # The remaining layers are all uni-directional LSTMs.
            input_size = 2 * p.lstm_cell_size
            for i in range(1, p.num_lstm_layers):
                # Forward lstm cell.
                if p.lstm_tpl_uni is None:
                    cell = p.lstm_tpl.Copy()
                else:
                    cell = p.lstm_tpl_uni.Copy()
                cell.name = 'L%d_rnn' % i
                cell.num_input_nodes = input_size
                cell.num_output_nodes = p.lstm_cell_size
                # Forward lstm layer.
                params = model_helper.CreateUnidirectionalRNNParams(
                    self.params, cell)
                params.name = 'L%d' % i
                rnn_layers_params.append(params)
                input_size = p.lstm_cell_size

            self.CreateChildren('rnn', rnn_layers_params)

            dropout_p = layers.DropoutLayer.Params().Set(
                name='dropout_layer',
                keep_prob=1.0 - p.dropout_prob,
                random_seed=((p.random_seed + 84828474)
                             if p.random_seed else None))
            self.CreateChild('dropout', dropout_p)
Example #30
    def __init__(self, params):
        super(MTEncoderBiRNN, self).__init__(params)
        p = self.params

        with tf.variable_scope(p.name):
            if p.cc_schedule is None:
                self.cc_schedule = None
            else:
                self.CreateChild('cc_schedule', p.cc_schedule)

            self.CreateChild('emb', p.emb)

            rnn_layers_params = []

            for i in range(p.num_lstm_layers):
                params = p.lstm_tpl.Copy()
                params.name = 'L%d_rnn_fwd' % i
                if i == 0:
                    params.num_input_nodes = p.emb.embedding_dim
                else:
                    params.num_input_nodes = 2 * p.lstm_cell_size
                params.num_output_nodes = p.lstm_cell_size
                params.reset_cell_state = p.packed_input
                forward_lstm = params

                params = params.Copy()
                params.name = 'L%d_rnn_bak' % i
                params.reset_cell_state = p.packed_input
                backward_lstm = params

                params = model_helper.CreateBidirectionalRNNParams(
                    self.params, forward_lstm, backward_lstm)
                params.packed_input = p.packed_input
                params.name = 'L%d' % i
                rnn_layers_params.append(params)

            self.CreateChildren('rnn', rnn_layers_params)

            if p.lstm_cell_size * 2 != p.encoder_out_dim:
                # Project the encoder output to the desired dim.
                proj_p = p.proj_tpl.Copy().Set(name='proj',
                                               batch_norm=False,
                                               input_dim=p.lstm_cell_size * 2,
                                               output_dim=p.encoder_out_dim)
                if p.cc_schedule is not None:
                    proj_p.has_bias = False
                    proj_p.activation = 'TANH'
                else:
                    proj_p.has_bias = True
                    proj_p.activation = 'NONE'
                self.CreateChild('final_proj', proj_p)

            dropout_p = layers.DropoutLayer.Params().Set(
                name='dropout_layer',
                keep_prob=1.0 - p.dropout_prob,
                random_seed=((p.random_seed + 827366448)
                             if p.random_seed else None))
            self.CreateChild('dropout', dropout_p)

            if p.is_transparent:
                transparent_params = p.transparent_merger_tpl.Copy()
                transparent_params.name = 'transparent'
                transparent_params.num_sources = p.num_lstm_layers
                self.CreateChild('transparent_merger', transparent_params)
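Taken together, the examples share one construction pattern: inside __init__, read p = self.params, assert p.name, open tf.variable_scope(p.name), and create weights with self.CreateVariable(...) and sub-layers with self.CreateChild(...) or self.CreateChildren(...). The sketch below restates that pattern as a minimal new layer; the class name MyScaleLayer is hypothetical, and the import paths are assumptions based on the APIs used in the examples.

# Minimal sketch of the shared pattern. MyScaleLayer is a hypothetical layer;
# the imports are assumptions based on the APIs used in the examples above.
from lingvo import compat as tf  # TF1-style tf, as the examples assume.
from lingvo.core import base_layer
from lingvo.core import py_utils


class MyScaleLayer(base_layer.BaseLayer):
  """Learns a per-channel scale, following the construction pattern above."""

  @classmethod
  def Params(cls):
    p = super(MyScaleLayer, cls).Params()
    p.Define('dim', 0, 'Depth of the input.')
    return p

  def __init__(self, params):
    super(MyScaleLayer, self).__init__(params)
    p = self.params
    assert p.name
    assert p.dim > 0
    with tf.variable_scope(p.name):
      self.CreateVariable(
          'scale',
          py_utils.WeightParams(
              shape=[p.dim],
              init=py_utils.WeightInit.Constant(1.0),
              dtype=p.dtype,
              collections=[self.__class__.__name__ + '_vars']))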