def _BuildDummyPipelineCnn(num_splits=4,
                           num_micro_batches=8,
                           micro_batch_size=None):
  """Constructs a dummy layer that consists of 16 3x3 conv layers.

  In addition, each conv layer increments a count every time step.

  Args:
    num_splits: Number of cells for the pipeline CNN.
    num_micro_batches: Number of time steps.
    micro_batch_size: Size of a micro batch.

  Returns:
    A PipeliningLayer layer.
  """
  assert num_splits in [1, 2, 4, 8, 16]
  num_layers = 16
  layers = []
  for i in range(num_layers):
    layers.append(_SimpyLayer.Params().Set(name='layer_{}'.format(i)))

  if num_splits == 1:
    p = FeatureExtractionLayer.Params().Set(name='seq', sub=layers)
  else:
    cell_tpl = []
    layers_per_split = num_layers // num_splits
    num_act_outputs = 0
    num_act_inputs = 0
    act_fetch_layers = None
    for split in range(num_splits):
      sub = layers[split * layers_per_split:(split + 1) * layers_per_split]
      if split == 0:
        sub.append(FetchLayer.Params().Set(name='fetch'))
        num_act_outputs = 1
        act_fetch_layers = ['fetch']
      else:
        num_act_inputs = 1
        act_fetch_layers = []
      split_layer = FeatureExtractionLayer.Params().Set(
          name='split_{}'.format(split),
          sub=sub,
          act_fetch_layers=act_fetch_layers,
          num_act_inputs=num_act_inputs,
          num_act_outputs=num_act_outputs)
      cell_tpl.append(split_layer)
    p = PipeliningLayer.Params().Set(
        name='pipeline',
        num_micro_batches=num_micro_batches,
        micro_batch_size=micro_batch_size,
        cell_tpl=cell_tpl,
        before_tpl=[])
  layer = p.Instantiate()
  return layer
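# Hedged usage sketch (not part of the original utilities): instantiate the
# dummy pipeline for a few split counts and log the top-level layer name
# ('seq' for a single split, 'pipeline' otherwise). Each instantiation is
# placed in its own graph so variable scopes do not collide across runs.
def _ExampleBuildDummyPipelines():
  for num_splits in [1, 2, 4]:
    with tf.Graph().as_default():
      layer = _BuildDummyPipelineCnn(
          num_splits=num_splits, num_micro_batches=8)
      tf.logging.info('num_splits=%d -> layer %s', num_splits,
                      layer.params.name)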
def __init__(self, params):
  p = params.Copy()
  num_layers = p.num_encoder_layers + p.num_decoder_layers

  if isinstance(p.splits, (list, tuple)):
    assert p.splits[-1] == num_layers
    for i, j in zip(p.splits[:-1], p.splits[1:]):
      assert i < j, 'Splits must be in increasing order.'
  else:
    num_splits = max(p.splits, p.num_splits)  # Supporting deprecated param.
    layers_per_split = num_layers // num_splits
    p.splits = []
    for i in range(num_splits):
      p.splits.append((i + 1) * layers_per_split)

  with tf.variable_scope(p.name):
    p.encoder_tpl.source_dim = p.model_dim
    p.decoder_tpl.source_dim = p.model_dim
    transformers = []
    for i in range(p.num_encoder_layers):
      params = p.encoder_tpl.Copy()
      params.name = 'encoder_%d' % (i)
      params.is_transparent = p.is_transparent
      params.packed_input = p.packed_input
      # Use DeterministicDropoutLayer when used in temp graphs.
      if len(p.splits) > 1 or p.num_micro_batches > 1:
        params = self.SetupDeterministicDropout(params)
      assert not params.has_aux_atten
      last_layer = (i == p.num_encoder_layers - 1)
      if p.is_transparent and last_layer:
        transparent_merger_tpl = DeterministicWeightedSumLayer.Params()
        transparent_merger_tpl.num_sources = p.num_encoder_layers + 1
        transparent_merger_tpl.dropout_tpl.keep_prob = (
            1 - p.transparent_merger_dropout_prob)
        params.transparent_merger_tpl = transparent_merger_tpl
        params.num_transparent_outputs = p.num_transparent_outputs
      transformers.append(params)
    for i in range(p.num_decoder_layers):
      params = p.decoder_tpl.Copy()
      params.name = 'decoder_%d' % (i)
      params.mask_self_atten = True
      params.packed_input = p.packed_input
      params.is_transparent = p.is_transparent and (
          p.num_transparent_outputs == p.num_decoder_layers)
      if len(p.splits) > 1 or p.num_micro_batches > 1:
        params = self.SetupDeterministicDropout(params)
      assert params.has_aux_atten
      transformers.append(params)
    cells = []
    cell_start = 0
    for split, cell_end in enumerate(p.splits):
      sub = transformers[cell_start:cell_end]
      cell = FeatureExtractionLayer.Params().Set(
          name='cell_{}'.format(split), sub=sub)
      cells.append(cell)
      cell_start = cell_end
    p.cell_tpl = cells
  super(GPipeTransformerStack, self).__init__(p)
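# Illustrative sketch (not part of the class above): how a legacy scalar
# `splits` / `num_splits` value is expanded into cumulative layer boundaries.
# For example, 6 layers split 3 ways yields boundaries [2, 4, 6], so the cells
# hold transformers [0:2], [2:4], and [4:6].
def _EvenSplitBoundaries(num_layers, num_splits):
  layers_per_split = num_layers // num_splits
  return [(i + 1) * layers_per_split for i in range(num_splits)]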
def __init__(self, params):
  p = params.Copy()
  num_layers = p.num_encoder_layers + p.num_decoder_layers
  assert num_layers % p.num_splits == 0

  with tf.variable_scope(p.name):
    p.encoder_tpl.source_dim = p.model_dim
    p.decoder_tpl.source_dim = p.model_dim
    layers_per_split = num_layers // p.num_splits
    transformers = []
    for i in range(p.num_encoder_layers):
      params = p.encoder_tpl.Copy()
      params.name = 'encoder_%d' % (i)
      params.is_transparent = p.is_transparent
      params.packed_input = p.packed_input
      # Use DeterministicDropoutLayer when used in temp graphs.
      if p.num_splits > 1:
        params.tr_atten_tpl.residual_dropout_tpl = (
            DeterministicDropoutLayer.Params())
        params.tr_atten_tpl.atten_tpl.atten_dropout_deterministic = True
        params.tr_atten_tpl.atten_tpl.inner_atten_params \
            .atten_dropout_deterministic = True
        params.tr_fflayer_tpl.residual_dropout_tpl = (
            DeterministicDropoutLayer.Params())
        params.tr_fflayer_tpl.fflayer_tpl.dropout = (
            DeterministicDropoutLayer.Params())
      assert not params.has_aux_atten
      last_layer = (i == p.num_encoder_layers - 1)
      if p.is_transparent and last_layer:
        transparent_merger_tpl = DeterministicWeightedSumLayer.Params()
        transparent_merger_tpl.num_sources = p.num_encoder_layers + 1
        transparent_merger_tpl.dropout_tpl.keep_prob = (
            1 - p.transparent_merger_dropout_prob)
        params.transparent_merger_tpl = transparent_merger_tpl
        params.num_transparent_outputs = p.num_transparent_outputs
      transformers.append(params)
    for i in range(p.num_decoder_layers):
      params = p.decoder_tpl.Copy()
      params.name = 'decoder_%d' % (i)
      params.mask_self_atten = True
      params.packed_input = p.packed_input
      params.is_transparent = p.is_transparent and (
          p.num_transparent_outputs == p.num_decoder_layers)
      assert params.has_aux_atten
      transformers.append(params)
    cells = []
    for split in range(p.num_splits):
      sub = transformers[split * layers_per_split:(split + 1) *
                         layers_per_split]
      cell = FeatureExtractionLayer.Params().Set(
          name='cell_{}'.format(split), sub=sub)
      cells.append(cell)
    p.cell_tpl = cells
  super(GPipeTransformerStack, self).__init__(p)
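# Illustrative sketch (not part of the class above): the even slicing used to
# assign transformer layers to pipeline cells. With 8 layers over 4 splits,
# cells receive layers [0, 1], [2, 3], [4, 5], and [6, 7].
def _EvenCellSlices(transformers, num_splits):
  layers_per_split = len(transformers) // num_splits
  return [
      transformers[i * layers_per_split:(i + 1) * layers_per_split]
      for i in range(num_splits)
  ]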
def testDeterministicDropoutInsideFunctionalWhile(self):
  with self.session() as sess:
    cells = FeatureExtractionLayer.Params().Set(
        name='cell',
        sub=[
            DeterministicDropoutLayer.Params().Set(
                name='dropout', keep_prob=0.7)
        ])
    p = PipeliningLayer.Params().Set(name='pipe', cell_tpl=[cells])
    x = tf.ones([2, 3], dtype=tf.float32)
    model = p.cls(p)
    y = model.FPropDefaultTheta(x)
    py_utils.GetOrCreateGlobalStep()
    tf.global_variables_initializer().run()
    y_val = sess.run(y)
    self.assertAllClose([
        [1.0 / 0.7, 1.0 / 0.7, 1.0 / 0.7],
        [0.0, 0.0, 1.0 / 0.7],
    ], y_val)
    self.assertAllClose(5.7142859, np.sum(y_val))
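# Note on the expected values in the test above: with inverted dropout at
# keep_prob=0.7, kept entries are scaled to 1 / 0.7 ~= 1.4286 and dropped
# entries become 0, so the four kept entries sum to 4 / 0.7 ~= 5.7142859,
# which is the asserted total.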
def testDropoutInRecurrent(self, splits=1, num_micro_batches=1):
  assert splits in [1, 2, 4]
  with self.session() as sess:
    tf.set_random_seed(12345)
    num_layers = 4
    py_utils.GetOrCreateGlobalStep()
    # Build a model with 4 dropout layers.
    layers = []
    for l in range(num_layers):
      layers.append(DeterministicDropoutLayer.Params().Set(
          name='dropout_{}'.format(l), keep_prob=0.7))
    # Divide the model into `splits` partitions.
    cell_tpl = []
    layers_per_split = num_layers // splits
    for i in range(splits):
      sub = layers[i * layers_per_split:(i + 1) * layers_per_split]
      cell_tpl.append(FeatureExtractionLayer.Params().Set(
          name='cell_{}'.format(i), sub=sub))
    # Parallelize the partitions using the pipeline.
    p = PipeliningLayer.Params().Set(
        name='pipeline',
        num_micro_batches=num_micro_batches,
        cell_tpl=cell_tpl)
    # Fake input.
    x = tf.ones([2, 3])
    # Construct weights.
    w = tf.get_variable(
        'w',
        shape=[2, 3],
        initializer=tf.constant_initializer([[1] * 3] * 2))
    mdl = p.cls(p)
    y = mdl.FPropDefaultTheta(x * w)
    # Construct a loss function such that gradients = final activation.
    loss = tf.reduce_sum(y)
    grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
    tf.global_variables_initializer().run()
    y_val = sess.run(y)
    grads_val = sess.run(grads)['w'][1]
    self.assertAllClose(y_val, grads_val)
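# Why the test above expects grads == y: each DeterministicDropoutLayer is an
# elementwise linear scaling, so y = (x * w) * M for some fixed mask/scale
# tensor M. With x all-ones, d(sum(y))/dw = x * M = M, and with w initialized
# to ones, y = M as well, so the gradient w.r.t. w equals the final activation.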
def _Partition(params, num_splits, *shapes):
  seqs = PartitionSequentialLayers(params, num_splits, *shapes)
  return [
      FeatureExtractionLayer.Params().Set(name='d%d' % i, sub=seqs[i].sub)
      for i in range(len(seqs))
  ]
def __init__(self, params):
  p = params.Copy()
  num_layers = p.num_encoder_layers + p.num_decoder_layers

  if isinstance(p.splits, (list, tuple)):
    assert p.splits[-1] == num_layers
    for i, j in zip(p.splits[:-1], p.splits[1:]):
      assert i <= j, 'Splits must be in increasing order.'
  else:
    num_splits = p.splits
    layers_per_split = (num_layers - 1) // num_splits + 1
    p.splits = []
    for i in range(num_splits):
      p.splits.append((i + 1) * layers_per_split)
    p.splits[-1] = num_layers

  with tf.variable_scope(p.name):
    transformers = []

    if p.is_transparent:
      p.transparent_merger_tpl.num_sources = p.num_encoder_layers + 1
      p.transparent_merger_tpl.dropout_tpl.keep_prob = (
          1 - p.transparent_merger_dropout_prob)

    # Encoder Embedding layer.
    if len(p.splits) > 1 or p.num_micro_batches > 1:
      p.emb_tpl.dropout_tpl = layers.DeterministicDropoutLayer.Params()
    p.emb_tpl.packed_input = p.packed_input
    p.emb_tpl.is_transparent = p.is_transparent
    p.emb_tpl.add_tgt_embedding_layer = (p.num_decoder_layers > 0)
    p.emb_tpl.name = 'emb'
    p.emb_tpl.batch_dim = p.batch_dim
    transformers.append(p.emb_tpl)
    if p.softmax_tpl:
      p.softmax_tpl.name = 'softmax'
      p.softmax_tpl.inputs_from_decoder = p.num_decoder_layers > 0

    # Encoder layers.
    for i in range(p.num_encoder_layers):
      params = p.encoder_tpl.Copy()
      params.name = 'encoder_%d' % (i)
      if p.is_transparent:
        params.is_transparent = p.is_transparent
        params.final_enc_layer = (i == (p.num_encoder_layers - 1))
      if p.normalize_encoder and (i == (p.num_encoder_layers - 1)):
        params.normalize_output = p.normalize_encoder
        params.final_enc_layer = (i == (p.num_encoder_layers - 1))
      if p.packed_input:
        params.packed_input = p.packed_input
      # Use DeterministicDropoutLayer when used in temp graphs.
      if len(p.splits) > 1 or p.num_micro_batches > 1:
        params = params.cls.SetupDeterministicDropout(params)
      assert not params.has_aux_atten
      if p.is_transparent and i == 0:
        params.transparent_merger_tpl = p.transparent_merger_tpl.Copy()
      transformers.append(params)

    # Decoder layers.
    for i in range(p.num_decoder_layers):
      params = p.decoder_tpl.Copy()
      params.name = 'decoder_%d' % (i)
      params.mask_self_atten = True
      if p.packed_input:
        params.packed_input = p.packed_input
      if len(p.splits) > 1 or p.num_micro_batches > 1:
        params = params.cls.SetupDeterministicDropout(params)
      assert params.has_aux_atten
      transformers.append(params)

    cells = []
    cell_start = 0
    # To account for embedding layers in the pipeline.
    offset = 1
    for split, cell_end in enumerate(p.splits):
      # Layer 0 (embeddings) is always in split 0.
      sub = transformers[cell_start:(cell_end + offset)]
      if split == len(p.splits) - 1 and p.softmax_tpl:
        sub.append(p.softmax_tpl)
      cell = FeatureExtractionLayer.Params().Set(
          name='cell_{}'.format(split), sub=sub)
      cells.append(cell)
      cell_start = cell_end + offset
    p.cell_tpl = cells
  super(GPipeTransformerStack, self).__init__(p)

  if p.label_smoothing:
    self.CreateChild('smoother', p.label_smoothing)
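# Illustrative sketch (not part of the class above): the ceil-style boundary
# computation used when `splits` is a scalar, which forces the final boundary
# back to num_layers. For example, 10 layers over 4 splits gives
# layers_per_split = 3 and boundaries [3, 6, 9, 10].
def _CeilSplitBoundaries(num_layers, num_splits):
  layers_per_split = (num_layers - 1) // num_splits + 1
  splits = [(i + 1) * layers_per_split for i in range(num_splits)]
  splits[-1] = num_layers
  return splits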
def __init__(self, params):
  p = params.Copy()
  num_layers = p.num_encoder_layers + p.num_decoder_layers

  if isinstance(p.splits, (list, tuple)):
    assert p.splits[-1] == num_layers
    for i, j in zip(p.splits[:-1], p.splits[1:]):
      assert i < j, 'Splits must be in increasing order.'
  else:
    num_splits = max(p.splits, p.num_splits)  # Supporting deprecated param.
    layers_per_split = num_layers // num_splits
    p.splits = []
    for i in range(num_splits):
      p.splits.append((i + 1) * layers_per_split)

  with tf.variable_scope(p.name):
    p.encoder_tpl.source_dim = p.model_dim
    p.decoder_tpl.source_dim = p.model_dim
    transformers = []

    # Encoder Embedding layer.
    if p.use_pipelined_embeddings:
      if len(p.splits) > 1 or p.num_micro_batches > 1:
        p.emb_tpl.dropout_tpl = DeterministicDropoutLayer.Params()
      p.emb_tpl.packed_input = p.packed_input
      p.emb_tpl.is_transparent = p.is_transparent
      p.emb_tpl.add_tgt_embedding_layer = (p.num_decoder_layers > 0)
      p.emb_tpl.name = 'emb'
      transformers.append(p.emb_tpl)

    # Encoder layers.
    for i in range(p.num_encoder_layers):
      params = p.encoder_tpl.Copy()
      if i % p.apply_dropout_every_n != 0:
        params.tr_atten_tpl.residual_dropout_prob = 0.0
        params.tr_atten_tpl.atten_dropout_prob = 0.0
        params.tr_fflayer_tpl.residual_dropout_prob = 0.0
        params.tr_fflayer_tpl.relu_dropout_prob = 0.0
      params.name = 'encoder_%d' % (i)
      params.is_transparent = p.is_transparent
      params.packed_input = p.packed_input
      # Use DeterministicDropoutLayer when used in temp graphs.
      if len(p.splits) > 1 or p.num_micro_batches > 1:
        params = self.SetupDeterministicDropout(params)
      assert not params.has_aux_atten
      last_layer = (i == p.num_encoder_layers - 1)
      if p.is_transparent and last_layer:
        transparent_merger_tpl = DeterministicWeightedSumLayer.Params()
        transparent_merger_tpl.num_sources = p.num_encoder_layers + 1
        transparent_merger_tpl.dropout_tpl.keep_prob = (
            1 - p.transparent_merger_dropout_prob)
        params.transparent_merger_tpl = transparent_merger_tpl
        params.num_transparent_outputs = p.num_transparent_outputs
      transformers.append(params)

    # Decoder layers.
    for i in range(p.num_decoder_layers):
      params = p.decoder_tpl.Copy()
      params.name = 'decoder_%d' % (i)
      params.mask_self_atten = True
      params.packed_input = p.packed_input
      params.is_transparent = p.is_transparent and (
          p.num_transparent_outputs == p.num_decoder_layers)
      if i % p.apply_dropout_every_n != 0:
        params.tr_atten_tpl.residual_dropout_prob = 0.0
        params.tr_atten_tpl.atten_dropout_prob = 0.0
        params.tr_fflayer_tpl.residual_dropout_prob = 0.0
        params.tr_fflayer_tpl.relu_dropout_prob = 0.0
      if len(p.splits) > 1 or p.num_micro_batches > 1:
        params = self.SetupDeterministicDropout(params)
      assert params.has_aux_atten
      transformers.append(params)

    cells = []
    cell_start = 0
    # To account for embedding layers in the pipeline.
    offset = 0
    if p.use_pipelined_embeddings:
      offset += 1
    for split, cell_end in enumerate(p.splits):
      # Layer 0 (embeddings) is always in split 0.
      sub = transformers[cell_start:(cell_end + offset)]
      cell = FeatureExtractionLayer.Params().Set(
          name='cell_{}'.format(split), sub=sub)
      cells.append(cell)
      cell_start = cell_end + offset
    p.cell_tpl = cells
  super(GPipeTransformerStack, self).__init__(p)
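# Illustrative sketch (not part of the class above): how the embedding offset
# shifts the cell slices when use_pipelined_embeddings is set. With
# splits=[3, 6] and offset=1, the cells take transformers[0:4] (embedding plus
# encoders 0-2) and transformers[4:7].
def _OffsetCellSlices(splits, offset):
  slices = []
  cell_start = 0
  for cell_end in splits:
    slices.append((cell_start, cell_end + offset))
    cell_start = cell_end + offset
  return slices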