Example #1
def _BuildDummyPipelineCnn(num_splits=4,
                           num_micro_batches=8,
                           micro_batch_size=None):
    """Construct a dummy layer that consist of 16 3x3 conv layers.

  In addition, each conv layer increments a count every time step.

  Args:
    num_splits: number of cells for pipeline cnn
    num_micro_batches: number of time steps.
    micro_batch_size: Size of a micro batch.

  Returns:
    A PipeliningLayer layer.
  """
    assert num_splits in [1, 2, 4, 8, 16]
    num_layers = 16
    layers = []
    for i in range(num_layers):
        layers.append(_SimpyLayer.Params().Set(name='layer_{}'.format(i)))

    if num_splits == 1:
        p = FeatureExtractionLayer.Params().Set(name='seq', sub=layers)
    else:
        cell_tpl = []
        layers_per_split = num_layers // num_splits
        num_act_outputs = 0
        num_act_inputs = 0
        act_fetch_layers = None
        for split in range(num_splits):
            sub = layers[split * layers_per_split:(split + 1) *
                         layers_per_split]
            if split == 0:
                sub.append(FetchLayer.Params().Set(name='fetch'))
                num_act_outputs = 1
                act_fetch_layers = ['fetch']
            else:
                num_act_inputs = 1
                act_fetch_layers = []
            split_layer = FeatureExtractionLayer.Params().Set(
                name='split_{}'.format(split),
                sub=sub,
                act_fetch_layers=act_fetch_layers,
                num_act_inputs=num_act_inputs,
                num_act_outputs=num_act_outputs)
            cell_tpl.append(split_layer)
        p = PipeliningLayer.Params().Set(name='pipeline',
                                         num_micro_batches=num_micro_batches,
                                         micro_batch_size=micro_batch_size,
                                         cell_tpl=cell_tpl,
                                         before_tpl=[])
    layer = p.Instantiate()
    return layer
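
A minimal usage sketch (an assumption, not part of the original example): the returned PipeliningLayer is driven like any other lingvo layer via FPropDefaultTheta. The input shape below is a placeholder; the real shape depends on how _SimpyLayer configures its conv.

def _RunDummyPipelineCnn():
    # Build a 4-split pipeline over the 16 conv layers defined above.
    layer = _BuildDummyPipelineCnn(num_splits=4, num_micro_batches=8)
    # The per-step counters need a global step to exist (see Example #4).
    py_utils.GetOrCreateGlobalStep()
    # Placeholder shape [batch, height, width, channels]; an assumption.
    inputs = tf.zeros([8, 16, 16, 1])
    outputs = layer.FPropDefaultTheta(inputs)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(outputs)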
Example #2
    def __init__(self, params):
        p = params.Copy()
        num_layers = p.num_encoder_layers + p.num_decoder_layers

        if isinstance(p.splits, (list, tuple)):
            assert p.splits[-1] == num_layers
            for i, j in zip(p.splits[:-1], p.splits[1:]):
                assert i < j, 'Splits must be in increasing order.'
        else:
            num_splits = max(p.splits,
                             p.num_splits)  # Supporting deprecated param.
            layers_per_split = num_layers // num_splits
            p.splits = []
            for i in range(num_splits):
                p.splits.append((i + 1) * layers_per_split)

        with tf.variable_scope(p.name):
            p.encoder_tpl.source_dim = p.model_dim
            p.decoder_tpl.source_dim = p.model_dim
            transformers = []
            for i in range(p.num_encoder_layers):
                params = p.encoder_tpl.Copy()
                params.name = 'encoder_%d' % (i)
                params.is_transparent = p.is_transparent
                params.packed_input = p.packed_input
                # Use DeterministicDropoutLayer when used in temp graphs.
                if len(p.splits) > 1 or p.num_micro_batches > 1:
                    params = self.SetupDeterministicDropout(params)
                assert not params.has_aux_atten
                last_layer = (i == p.num_encoder_layers - 1)
                if p.is_transparent and last_layer:
                    transparent_merger_tpl = (
                        DeterministicWeightedSumLayer.Params())
                    transparent_merger_tpl.num_sources = p.num_encoder_layers + 1
                    transparent_merger_tpl.dropout_tpl.keep_prob = (
                        1 - p.transparent_merger_dropout_prob)
                    params.transparent_merger_tpl = transparent_merger_tpl
                    params.num_transparent_outputs = p.num_transparent_outputs
                transformers.append(params)
            for i in range(p.num_decoder_layers):
                params = p.decoder_tpl.Copy()
                params.name = 'decoder_%d' % (i)
                params.mask_self_atten = True
                params.packed_input = p.packed_input
                params.is_transparent = p.is_transparent and (
                    p.num_transparent_outputs == p.num_decoder_layers)
                if len(p.splits) > 1 or p.num_micro_batches > 1:
                    params = self.SetupDeterministicDropout(params)
                assert params.has_aux_atten
                transformers.append(params)
            cells = []
            cell_start = 0
            for split, cell_end in enumerate(p.splits):
                sub = transformers[cell_start:cell_end]
                cell = FeatureExtractionLayer.Params().Set(
                    name='cell_{}'.format(split), sub=sub)
                cells.append(cell)
                cell_start = cell_end
            p.cell_tpl = cells
        super(GPipeTransformerStack, self).__init__(p)
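
The boundary arithmetic above is easy to check in isolation. A standalone sketch (plain Python, no lingvo dependency): with 6 encoder and 6 decoder layers split 4 ways, each cell receives 3 consecutive layers.

num_layers = 6 + 6
num_splits = 4
layers_per_split = num_layers // num_splits            # 3
splits = [(i + 1) * layers_per_split for i in range(num_splits)]
assert splits == [3, 6, 9, 12]
# transformers[0:3], [3:6], [6:9], [9:12] become cells 0..3.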
Example #3
  def __init__(self, params):
    p = params.Copy()
    num_layers = p.num_encoder_layers + p.num_decoder_layers
    assert num_layers % p.num_splits == 0
    with tf.variable_scope(p.name):
      p.encoder_tpl.source_dim = p.model_dim
      p.decoder_tpl.source_dim = p.model_dim
      layers_per_split = num_layers // p.num_splits
      transformers = []
      for i in range(p.num_encoder_layers):
        params = p.encoder_tpl.Copy()
        params.name = 'encoder_%d' % (i)
        params.is_transparent = p.is_transparent
        params.packed_input = p.packed_input
        # Use DeterministicDropoutLayer when used in temp graphs.
        if p.num_splits > 1:
          params.tr_atten_tpl.residual_dropout_tpl = (
              DeterministicDropoutLayer.Params())
          params.tr_atten_tpl.atten_tpl.atten_dropout_deterministic = True
          inner_atten_params = params.tr_atten_tpl.atten_tpl.inner_atten_params
          inner_atten_params.atten_dropout_deterministic = True
          params.tr_fflayer_tpl.residual_dropout_tpl = (
              DeterministicDropoutLayer.Params())
          params.tr_fflayer_tpl.fflayer_tpl.dropout = (
              DeterministicDropoutLayer.Params())

        assert not params.has_aux_atten
        last_layer = (i == p.num_encoder_layers - 1)
        if p.is_transparent and last_layer:
          transparent_merger_tpl = DeterministicWeightedSumLayer.Params()
          transparent_merger_tpl.num_sources = p.num_encoder_layers + 1
          transparent_merger_tpl.dropout_tpl.keep_prob = (
              1 - p.transparent_merger_dropout_prob)
          params.transparent_merger_tpl = transparent_merger_tpl
          params.num_transparent_outputs = p.num_transparent_outputs
        transformers.append(params)
      for i in range(p.num_decoder_layers):
        params = p.decoder_tpl.Copy()
        params.name = 'decoder_%d' % (i)
        params.mask_self_atten = True
        params.packed_input = p.packed_input
        params.is_transparent = p.is_transparent and (
            p.num_transparent_outputs == p.num_decoder_layers)
        assert params.has_aux_atten
        transformers.append(params)
      cells = []
      for split in range(p.num_splits):
        sub = transformers[split * layers_per_split:(split + 1) *
                           layers_per_split]
        cell = FeatureExtractionLayer.Params().Set(
            name='cell_{}'.format(split), sub=sub)
        cells.append(cell)
      p.cell_tpl = cells
    super(GPipeTransformerStack, self).__init__(p)
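
Note that the slicing is purely positional, so a cell can mix encoder and decoder layers when a split boundary straddles the encoder/decoder seam. A plain-Python sketch of the partitioning:

transformers = ['enc_0', 'enc_1', 'enc_2', 'dec_0', 'dec_1', 'dec_2']
num_splits = 2
layers_per_split = len(transformers) // num_splits     # 3
cells = [
    transformers[s * layers_per_split:(s + 1) * layers_per_split]
    for s in range(num_splits)
]
assert cells == [['enc_0', 'enc_1', 'enc_2'], ['dec_0', 'dec_1', 'dec_2']]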
Example #4
 def testDeterministicDropoutInsideFunctionalWhile(self):
     with self.session() as sess:
         cells = FeatureExtractionLayer.Params().Set(
             name='cell',
             sub=[
                 DeterministicDropoutLayer.Params().Set(name='dropout',
                                                        keep_prob=0.7)
             ])
         p = PipeliningLayer.Params().Set(name='pipe', cell_tpl=[cells])
         x = tf.ones([2, 3], dtype=tf.float32)
         model = p.cls(p)
         y = model.FPropDefaultTheta(x)
         py_utils.GetOrCreateGlobalStep()
         tf.global_variables_initializer().run()
         y_val = sess.run(y)
         self.assertAllClose([
             [1.0 / 0.7, 1.0 / 0.7, 1.0 / 0.7],
             [0.0, 0.0, 1.0 / 0.7],
         ], y_val)
         self.assertAllClose(5.7142859, np.sum(y_val))
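
The expected values follow from inverted dropout: surviving units are scaled by 1/keep_prob. In this seeded run four of the six units survive, which pins down the asserted sum (a quick arithmetic check, not from the source):

keep_prob = 0.7
surviving_units = 4            # per the expected matrix above
expected_sum = surviving_units * (1.0 / keep_prob)
print(expected_sum)  # 5.714285..., matching assertAllClose(5.7142859, ...)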
Example #5
 def testDropoutInRecurrent(self, splits=1, num_micro_batches=1):
   assert splits in [1, 2, 4]
   with self.session() as sess:
     tf.set_random_seed(12345)
     num_layers = 4
     py_utils.GetOrCreateGlobalStep()
     # Build a model with 4 dropout layers.
     layers = []
     for l in range(num_layers):
       layers.append(DeterministicDropoutLayer.Params().Set(
           name='dropout_{}'.format(l), keep_prob=0.7))
     # Divide the model into splits partitions.
     cell_tpl = []
     layers_per_split = num_layers // splits
     for i in range(splits):
       sub = layers[i * layers_per_split:(i + 1) * layers_per_split]
       cell_tpl.append(FeatureExtractionLayer.Params().Set(
           name='cell_{}'.format(i), sub=sub))
     # Parallelize partitions using pipeline.
     p = PipeliningLayer.Params().Set(
         name='pipeline',
         num_micro_batches=num_micro_batches,
         cell_tpl=cell_tpl)
     # Fake input
     x = tf.ones([2, 3])
     # Construct weights.
     w = tf.get_variable(
         'w', shape=[2, 3], initializer=tf.constant_initializer([[1] * 3] * 2))
     mdl = p.cls(p)
     y = mdl.FPropDefaultTheta(x * w)
     # Construct loss function such that gradients = final activation.
     loss = tf.reduce_sum(y)
     grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
     tf.global_variables_initializer().run()
     y_val = sess.run(y)
     grads_val = sess.run(grads)['w'][1]
     self.assertAllClose(y_val, grads_val)
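
Why grads should equal the final activation: with x and w all ones, deterministic dropout applies the same mask in the forward and backward pass, so y = mask * (x * w) / keep_prob and d(sum(y))/dw = mask * x / keep_prob = y. A single-layer NumPy sketch of the same algebra (the four-layer case composes the masks the same way):

import numpy as np

keep_prob = 0.7
mask = np.array([[1., 1., 0.], [1., 0., 1.]])  # an arbitrary dropout mask
x = np.ones([2, 3])
w = np.ones([2, 3])
y = mask * (x * w) / keep_prob
dloss_dw = mask * x / keep_prob                # gradient of sum(y) w.r.t. w
np.testing.assert_allclose(y, dloss_dw)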
Example #6
def _Partition(params, num_splits, *shapes):
    seqs = PartitionSequentialLayers(params, num_splits, *shapes)
    return [
        FeatureExtractionLayer.Params().Set(name='d%d' % i, sub=seqs[i].sub)
        for i in range(len(seqs))
    ]
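
The rewrap pattern, stubbed in plain Python (the stub class and dicts are illustrative, not lingvo API): each partition keeps its sub-layers but is renamed 'd0', 'd1', ... so the pipeline cells get uniform names.

class _Seq(object):
    def __init__(self, sub):
        self.sub = sub

seqs = [_Seq(['conv_0', 'conv_1']), _Seq(['conv_2', 'conv_3'])]
cells = [{'name': 'd%d' % i, 'sub': seqs[i].sub} for i in range(len(seqs))]
assert cells == [{'name': 'd0', 'sub': ['conv_0', 'conv_1']},
                 {'name': 'd1', 'sub': ['conv_2', 'conv_3']}]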
Example #7
    def __init__(self, params):
        p = params.Copy()
        num_layers = p.num_encoder_layers + p.num_decoder_layers

        if isinstance(p.splits, (list, tuple)):
            assert p.splits[-1] == num_layers
            for i, j in zip(p.splits[:-1], p.splits[1:]):
                assert i <= j, 'Splits must be in increasing order.'
        else:
            num_splits = p.splits
            layers_per_split = (num_layers - 1) // num_splits + 1
            p.splits = []
            for i in range(num_splits):
                p.splits.append((i + 1) * layers_per_split)
            p.splits[-1] = num_layers

        with tf.variable_scope(p.name):
            transformers = []

            if p.is_transparent:
                p.transparent_merger_tpl.num_sources = p.num_encoder_layers + 1
                p.transparent_merger_tpl.dropout_tpl.keep_prob = (
                    1 - p.transparent_merger_dropout_prob)

            # Encoder Embedding layer.
            if len(p.splits) > 1 or p.num_micro_batches > 1:
                p.emb_tpl.dropout_tpl = (
                    layers.DeterministicDropoutLayer.Params())
            p.emb_tpl.packed_input = p.packed_input
            p.emb_tpl.is_transparent = p.is_transparent
            p.emb_tpl.add_tgt_embedding_layer = (p.num_decoder_layers > 0)
            p.emb_tpl.name = 'emb'
            p.emb_tpl.batch_dim = p.batch_dim
            transformers.append(p.emb_tpl)
            if p.softmax_tpl:
                p.softmax_tpl.name = 'softmax'
                p.softmax_tpl.inputs_from_decoder = p.num_decoder_layers > 0
            # Encoder layers.
            for i in range(p.num_encoder_layers):
                params = p.encoder_tpl.Copy()
                params.name = 'encoder_%d' % (i)
                if p.is_transparent:
                    params.is_transparent = p.is_transparent
                    params.final_enc_layer = (i == (p.num_encoder_layers - 1))
                if p.normalize_encoder and (i == (p.num_encoder_layers - 1)):
                    params.normalize_output = p.normalize_encoder
                    params.final_enc_layer = (i == (p.num_encoder_layers - 1))
                if p.packed_input:
                    params.packed_input = p.packed_input
                # Use DeterministicDropoutLayer when used in temp graphs.
                if len(p.splits) > 1 or p.num_micro_batches > 1:
                    params = params.cls.SetupDeterministicDropout(params)
                assert not params.has_aux_atten
                if p.is_transparent and i == 0:
                    params.transparent_merger_tpl = (
                        p.transparent_merger_tpl.Copy())
                transformers.append(params)

            # Decoder layers.
            for i in range(p.num_decoder_layers):
                params = p.decoder_tpl.Copy()
                params.name = 'decoder_%d' % (i)
                params.mask_self_atten = True
                if p.packed_input:
                    params.packed_input = p.packed_input
                if len(p.splits) > 1 or p.num_micro_batches > 1:
                    params = params.cls.SetupDeterministicDropout(params)
                assert params.has_aux_atten
                transformers.append(params)
            cells = []
            cell_start = 0
            # To account for embedding layers in the pipeline.
            offset = 1
            for split, cell_end in enumerate(p.splits):
                # Layer 0 (embeddings) is always in split 0.
                sub = transformers[cell_start:(cell_end + offset)]
                if split == len(p.splits) - 1 and p.softmax_tpl:
                    sub.append(p.softmax_tpl)
                cell = FeatureExtractionLayer.Params().Set(
                    name='cell_{}'.format(split), sub=sub)
                cells.append(cell)
                cell_start = cell_end + offset
            p.cell_tpl = cells
        super(GPipeTransformerStack, self).__init__(p)

        if p.label_smoothing:
            self.CreateChild('smoother', p.label_smoothing)
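
A worked example of the ceiling-division boundaries above (plain Python): with 9 layers over 4 splits the raw boundaries overshoot, the last one is clamped to num_layers, and adjacent boundaries can then collide; that is why this variant asserts i <= j instead of strict i < j.

num_layers = 9
num_splits = 4
layers_per_split = (num_layers - 1) // num_splits + 1   # ceil(9/4) == 3
splits = [(i + 1) * layers_per_split for i in range(num_splits)]
assert splits == [3, 6, 9, 12]
splits[-1] = num_layers
assert splits == [3, 6, 9, 9]   # non-decreasing; the last cell may be empty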
Example #8
    def __init__(self, params):
        p = params.Copy()
        num_layers = p.num_encoder_layers + p.num_decoder_layers

        if isinstance(p.splits, (list, tuple)):
            assert p.splits[-1] == num_layers
            for i, j in zip(p.splits[:-1], p.splits[1:]):
                assert i < j, 'Splits must be in increasing order.'
        else:
            num_splits = max(p.splits,
                             p.num_splits)  # Supporting deprecated param.
            layers_per_split = num_layers // num_splits
            p.splits = []
            for i in range(num_splits):
                p.splits.append((i + 1) * layers_per_split)

        with tf.variable_scope(p.name):
            p.encoder_tpl.source_dim = p.model_dim
            p.decoder_tpl.source_dim = p.model_dim
            transformers = []

            # Encoder Embedding layer.
            if p.use_pipelined_embeddings:
                if len(p.splits) > 1 or p.num_micro_batches > 1:
                    p.emb_tpl.dropout_tpl = DeterministicDropoutLayer.Params()
                p.emb_tpl.packed_input = p.packed_input
                p.emb_tpl.is_transparent = p.is_transparent
                p.emb_tpl.add_tgt_embedding_layer = (p.num_decoder_layers > 0)
                p.emb_tpl.name = 'emb'
                transformers.append(p.emb_tpl)

            # Encoder layers.
            for i in range(p.num_encoder_layers):
                params = p.encoder_tpl.Copy()
                if i % p.apply_dropout_every_n != 0:
                    params.tr_atten_tpl.residual_dropout_prob = 0.0
                    params.tr_atten_tpl.atten_dropout_prob = 0.0
                    params.tr_fflayer_tpl.residual_dropout_prob = 0.0
                    params.tr_fflayer_tpl.relu_dropout_prob = 0.0
                params.name = 'encoder_%d' % (i)
                params.is_transparent = p.is_transparent
                params.packed_input = p.packed_input
                # Use DeterministicDropoutLayer when used in temp graphs.
                if len(p.splits) > 1 or p.num_micro_batches > 1:
                    params = self.SetupDeterministicDropout(params)
                assert not params.has_aux_atten
                last_layer = (i == p.num_encoder_layers - 1)
                if p.is_transparent and last_layer:
                    transparent_merger_tpl = (
                        DeterministicWeightedSumLayer.Params())
                    transparent_merger_tpl.num_sources = p.num_encoder_layers + 1
                    transparent_merger_tpl.dropout_tpl.keep_prob = (
                        1 - p.transparent_merger_dropout_prob)
                    params.transparent_merger_tpl = transparent_merger_tpl
                    params.num_transparent_outputs = p.num_transparent_outputs
                transformers.append(params)

            # Decoder layers.
            for i in range(p.num_decoder_layers):
                params = p.decoder_tpl.Copy()
                params.name = 'decoder_%d' % (i)
                params.mask_self_atten = True
                params.packed_input = p.packed_input
                params.is_transparent = p.is_transparent and (
                    p.num_transparent_outputs == p.num_decoder_layers)
                if i % p.apply_dropout_every_n != 0:
                    params.tr_atten_tpl.residual_dropout_prob = 0.0
                    params.tr_atten_tpl.atten_dropout_prob = 0.0
                    params.tr_fflayer_tpl.residual_dropout_prob = 0.0
                    params.tr_fflayer_tpl.relu_dropout_prob = 0.0
                if len(p.splits) > 1 or p.num_micro_batches > 1:
                    params = self.SetupDeterministicDropout(params)
                assert params.has_aux_atten
                transformers.append(params)
            cells = []
            cell_start = 0
            # To account for embedding layers in the pipeline.
            offset = 0
            if p.use_pipelined_embeddings:
                offset += 1
            for split, cell_end in enumerate(p.splits):
                # Layer 0 (embeddings) is always in split 0.
                sub = transformers[cell_start:(cell_end + offset)]
                cell = FeatureExtractionLayer.Params().Set(
                    name='cell_{}'.format(split), sub=sub)
                cells.append(cell)
                cell_start = cell_end + offset
            p.cell_tpl = cells
        super(GPipeTransformerStack, self).__init__(p)
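
The offset bookkeeping in miniature (plain Python): with pipelined embeddings, transformers[0] is the embedding template, so every split boundary shifts by one when slicing.

transformers = ['emb', 'enc_0', 'enc_1', 'dec_0', 'dec_1']
splits = [2, 4]        # boundaries in transformer-layer indices
offset = 1             # the embedding layer occupies index 0
cells, cell_start = [], 0
for cell_end in splits:
    cells.append(transformers[cell_start:cell_end + offset])
    cell_start = cell_end + offset
assert cells == [['emb', 'enc_0', 'enc_1'], ['dec_0', 'dec_1']]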