def __init__(self, params):
  super(TransformerBatchMajorEncoder, self).__init__(params)
  p = self.params
  assert p.output_data_format in ('TBC', 'BTC')

  if p.shared_emb:
    with tf.variable_scope('shared_emb', reuse=tf.AUTO_REUSE):
      self.CreateChild('softmax', p.shared_emb)

  with tf.variable_scope(p.name):
    p.token_emb.dtype = p.dtype
    if not p.shared_emb:
      self.CreateChild('token_emb', p.token_emb)
    self.CreateChild('position_emb', p.position_emb)

    dropout_tpl = p.input_dropout_tpl.Copy()
    dropout_tpl.keep_prob = (1.0 - p.input_dropout_prob)
    self.CreateChild('input_dropout', dropout_tpl)

    if p.transformer_stack:
      self.CreateChild('transformer_stack', p.transformer_stack)

    if p.final_layer_norm:
      layer_norm_p = layers.LayerNorm.Params().Set(
          name='final_ln',
          input_dim=p.model_dim,
          use_fused_layernorm=p.use_fused_layernorm,
          fprop_dtype=p.input_dropout_tpl.fprop_dtype)
      self.CreateChild('final_ln', layer_norm_p)
def __init__(self, params):
  super(StackedRevNetLayer, self).__init__(params)
  p = params
  assert p.name
  assert p.sub_layer_params
  with tf.variable_scope(p.name):
    self.CreateChildren('sub_layers', p.sub_layer_params)
def __init__(self, params):
  super(DepthwiseConv2DLayer, self).__init__(params)
  p = self.params
  assert p.name

  w_pc = py_utils.WeightParams(
      shape=p.filter_shape,
      init=p.params_init,
      dtype=p.dtype,
      collections=[self.__class__.__name__ + '_vars'])

  with tf.variable_scope(p.name):
    self.CreateVariable('w', w_pc)
    if p.weight_norm:
      self.CreateVariable(
          'g',
          py_utils.WeightParams(
              shape=[p.filter_shape[2], p.filter_shape[3]],
              init=py_utils.WeightInit.Constant(0.0),
              dtype=p.dtype,
              collections=[self.__class__.__name__ + '_vars']))
    if p.bias:
      # NOTE(jiahuiyu): bias is subject to LP regularization in this version.
      self.CreateVariable(
          'b',
          py_utils.WeightParams(
              shape=[self.output_channels],
              init=py_utils.WeightInit.Constant(0.0),
              dtype=p.dtype,
              collections=[self.__class__.__name__ + '_vars']))
def __init__(self, params):
  super(GraphStep, self).__init__(params)
  p = self.params
  assert p.name
  with tf.variable_scope(p.name):
    self._seq = []
    for i, (signature, external_signature, sub_params) in enumerate(p.sub):
      assert signature
      sig = builder_layers.GraphSignature(signature)
      assert len(sig.inputs) == 1
      assert sig.outputs
      external_sig = None
      if external_signature:
        external_sig = builder_layers.GraphSignature(external_signature)
        assert len(external_sig.inputs) == 1
        assert not external_sig.outputs
      name = sub_params.name
      if not name:
        name = '%s_%02d' % (sig.outputs[0], i)
        sub_params.name = name
      self.CreateChild(name, sub_params)
      self._seq.append(
          GraphStep._seq(name, sig, external_sig, self.children[name]))
  self.output_signature = builder_layers.GraphSignature(p.output_signature)
def __init__(self, params):
  super(AttentionBlockStep, self).__init__(params)
  p = self.params
  name = p.name
  with tf.variable_scope(name):
    self.CreateChild('query_generator', p.query_generator)
    self.CreateChild('attention', p.attention)
def __init__(self, params):
  super(RepeatLayer, self).__init__(params)
  p = self.params
  assert p.name
  assert p.repeat > 0
  with tf.variable_scope(p.name):
    with py_utils.VariableShapePrefixContext(p.repeat):
      self.CreateChild('body', p.body)
def __init__(self, params):
  super(RevNetLayer, self).__init__(params)
  p = params
  assert p.name
  assert p.f_params
  assert p.g_params
  with tf.variable_scope(p.name):
    self.CreateChild('f_block', p.f_params)
    self.CreateChild('g_block', p.g_params)
def __init__(self, params):
  super(ParallelLayer, self).__init__(params)
  p = self.params
  assert p.name
  self._seq = []
  with tf.variable_scope(p.name):
    for sub in p.sub:
      self.CreateChild(sub.name, sub)
      self._seq.append((sub.name, self.children[sub.name]))
def __init__(self, params):
  super(BatchNormLayer, self).__init__(params)
  p = self.params
  assert p.name

  pc = py_utils.WeightParams(
      shape=[p.dim],
      init=py_utils.WeightInit.Constant(0.0),
      dtype=p.dtype,
      collections=[self.__class__.__name__ + '_vars'])

  with tf.variable_scope(p.name):
    if not p.use_moving_avg_in_training:
      self.CreateVariable('beta', pc)
      if p.gamma_zero_init:
        # Zero initialization of BN gamma.
        self.CreateVariable('gamma', pc)
      else:
        # Note: the real gamma to use is 1 + gamma.
        self.CreateVariable('gamma', pc, lambda x: 1.0 + x)

    # Two statistics.
    moving_collections = ['moving_vars', self.__class__.__name__ + '_vars']
    if p.add_stats_to_moving_average_variables:
      moving_collections += [tf.GraphKeys.MOVING_AVERAGE_VARIABLES]
    elif p.add_stats_to_moving_average_variables is None:
      # TODO(rpang): force all models to set this param explicitly.
      tf.logging.warning(
          'BatchNormLayer.add_stats_to_moving_average_variables should be '
          'set to True for new models, and to False explicitly for '
          'checkpoint compatibility.')
    # Add to the MOVING_AVERAGE_VARIABLES collection so that they are returned
    # by tf.moving_average_variables() and included in EMA variables if
    # ema_decay is enabled.
    mva = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(0.0),
        dtype=p.dtype,
        collections=moving_collections)
    self.CreateVariable(
        'moving_mean',
        mva,
        trainable=False,
        aggregation=tf.VariableAggregation.MEAN)
    mvv = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(1.0),
        dtype=p.dtype,
        collections=moving_collections)
    self.CreateVariable(
        'moving_variance',
        mvv,
        trainable=False,
        aggregation=tf.VariableAggregation.MEAN)
  self._epsilon = 0.001
  self._decay = p.decay
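# Aside (a hedged numpy sketch, not part of the layer above): the custom
# getter `lambda x: 1.0 + x` means the stored 'gamma' variable holds an
# *offset* from 1. The checkpoint holds zeros at initialization, but the scale
# actually applied during normalization is 1, so training starts as an
# identity scale. A minimal illustration of that reparameterization:
import numpy as np

stored_gamma = np.zeros([8], dtype=np.float32)  # what CreateVariable initializes
effective_gamma = 1.0 + stored_gamma            # what the getter hands to FProp
assert np.allclose(effective_gamma, np.ones([8]))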
def _CreateQStateVar(self, t_name, suffix, params):
  name = t_name + '_' + suffix
  assert name not in self._qvars, 'QState var already exists: %s' % name
  var_name = self._qvars_scope.name + '/' + name
  with tf.variable_scope(py_utils.GetGlobalVariableScope()):
    _, v = py_utils.CreateVariable(var_name, params, trainable=False)
  self._qvars[name] = v
  return v
def __init__(self, params):
  super(TransformerEncoder, self).__init__(params)
  p = self.params

  if p.shared_emb:
    with tf.variable_scope('shared_emb', reuse=tf.AUTO_REUSE):
      # Naming this 'softmax' to match the name of the same component in the
      # decoder. Variable names need to be the same in order to be reused.
      self.CreateChild('softmax', p.shared_emb)

  with tf.variable_scope(p.name):
    assert p.token_emb.embedding_dim == p.position_emb.embedding_dim
    p.transformer_stack.Set(model_dim=p.model_dim, packed_input=p.packed_input)
    if p.model_dim != p.token_emb.embedding_dim:
      tf.logging.warning(
          'token_emb.embedding_dim != model_dim (%s vs. %s), '
          'creating a projection!', p.token_emb.embedding_dim, p.model_dim)
      proj_p = layers.ProjectionLayer.Params().Copy()
      proj_p.name = 'emb_proj'
      proj_p.input_dim = p.token_emb.embedding_dim
      proj_p.output_dim = p.model_dim
      proj_p.batch_norm = True
      self.CreateChild('emb_proj', proj_p)

    # Token embeddings.
    if not p.shared_emb:
      p.token_emb.dtype = p.dtype
      self.CreateChild('token_emb', p.token_emb)

    # Positional embeddings.
    self.CreateChild('position_emb', p.position_emb)

    # Task embeddings.
    if p.task_emb:
      assert p.task_emb.embedding_dim == p.token_emb.embedding_dim
      self.CreateChild('task_emb', p.task_emb)

    dropout_tpl = layers.DropoutLayer.Params()
    dropout_tpl.keep_prob = (1.0 - p.input_dropout_prob)
    self.CreateChild('input_dropout', dropout_tpl)

    p.transformer_stack.name = p.name
    self.CreateChild('transformer_stack', p.transformer_stack)
def __init__(self, params):
  super(MTBaseModel, self).__init__(params)
  p = self.params

  with tf.variable_scope(p.name):
    with self._EncoderDevice():
      if p.encoder:
        self.CreateChild('enc', p.encoder)
    with self._DecoderDevice():
      self.CreateChild('dec', p.decoder)
def __init__(self, params):
  super(PassiveAsymQDomain, self).__init__(params)
  p = self.params

  self._t_names = set()  # Set of known t_name (from CreateTensor).
  self._qvars = py_utils.NestedMap()  # var_name -> tf.Variable

  # Save a scope for lazily created variables.
  with tf.variable_scope(p.name + '/q'):
    self._qvars_scope = tf.get_variable_scope()
def __init__(self, params):
  super(LinearLayer, self).__init__(params)
  p = self.params
  with tf.variable_scope(p.name):
    self.CreateVariable(
        'w',
        py_utils.WeightParams(
            shape=[p.input_dims, p.output_dims],
            init=p.params_init,
            dtype=p.dtype,
            collections=[self.__class__.__name__ + '_vars']))
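# Aside (usage sketch under assumptions): the snippet above follows the usual
# Lingvo Params()/Instantiate() idiom. Assuming this LinearLayer is the one in
# lingvo.core.builder_layers (an assumption for illustration; adjust the
# import to wherever the class lives in your checkout), a layer with a
# [128, 64] weight 'w' could be built like this:
from lingvo.core import builder_layers

proj_p = builder_layers.LinearLayer.Params().Set(
    name='proj', input_dims=128, output_dims=64)
proj = proj_p.Instantiate()  # creates variable 'proj/w' with shape [128, 64]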
def __init__(self, params):
  super(BiasLayer, self).__init__(params)
  p = self.params
  with tf.variable_scope(p.name):
    self.CreateVariable(
        'b',
        py_utils.WeightParams(
            shape=[p.dims],
            init=py_utils.WeightInit.Constant(0.0),
            dtype=p.dtype,
            collections=[self.__class__.__name__ + '_vars']))
def __init__(self, params):
  super(Learner, self).__init__(params)
  p = self.params
  self._var_grads = None
  self._eval_metrics = {}
  if p.grad_norm_tracker:
    # Use parent's name for backwards compatibility.
    with tf.variable_scope(self.parent.params.name):
      self.CreateChild('grad_norm_tracker', p.grad_norm_tracker)
  self.CreateChild('lr_schedule', p.lr_schedule)
  self.CreateChild('optimizer', p.optimizer)
def __init__(self, params):
  super(IdentityRegressionTask, self).__init__(params)
  with tf.variable_scope('IdentityRegressionTask'):
    self.CreateVariable(
        'm',
        py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Uniform()))
    self.CreateVariable(
        'b',
        py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Uniform()))
  self.global_steps = []
  self.metrics = []
  self.result_per_example_tensors = []
def _Acc(vg):
  """Updating accumulators."""
  v, g = vg
  with tf.variable_scope(v.op.name):
    _, a = py_utils.CreateVariable(
        'grad_accumulator',
        py_utils.WeightParams(v.get_shape(),
                              py_utils.WeightInit.Constant(0.0),
                              self.params.dtype),
        trainable=False)
    a = tf.assign_add(a, g)
  return py_utils.VarGrad(v, a)
def __init__(self, params):
  super(TransformerStack, self).__init__(params)
  p = self.params

  with tf.variable_scope(p.name):
    # Add transformer layers.
    transformer_layer_params = []
    denom = 1
    if isinstance(p.transformer_tpl, list):
      denom = len(p.transformer_tpl)
      assert p.num_transformer_layers % len(p.transformer_tpl) == 0
    for i in range(p.num_transformer_layers // denom):
      if isinstance(p.transformer_tpl, list):
        for q in p.transformer_tpl:
          params = q.Copy()
          transformer_layer_params.append(params)
      else:
        params = p.transformer_tpl.Copy()
        transformer_layer_params.append(params)

    for i, params in enumerate(transformer_layer_params):
      params.name = 'trans_%d' % i
      params.source_dim = p.model_dim
      params.packed_input = p.packed_input
      params.has_aux_atten = p.has_aux_attention
      params.mask_self_atten = p.mask_self_atten

    self.CreateChildren('trans', transformer_layer_params)

    # Initialize TransformerStack output layer norm.
    if p.ln_output:
      params = p.ln_tpl.Copy()
      # Keeping historic 'enc_out_ln' name for checkpoint compatibility.
      params.name = 'enc_out_ln'
      params.input_dim = p.model_dim
      self.CreateChild('layer_norm_out', params)

    if p.is_transparent:
      transparent_params = []
      if not p.num_transparent_outputs:
        raise ValueError('num_transparent_outputs should be greater than 0.')
      for i in range(p.num_transparent_outputs):
        transparent_param = p.transparent_merger_tpl.Copy()
        transparent_param.name = 'transparent_%d' % i
        transparent_param.num_sources = 1 + len(transformer_layer_params)
        transparent_params.append(transparent_param)
      self.CreateChildren('transparent_merger', transparent_params)
def __init__(self, params):
  super(GraphLayer, self).__init__(params)
  p = self.params
  assert p.name
  assert p.input_endpoints
  with tf.variable_scope(p.name):
    self._seq = []
    for i, (signature, sub) in enumerate(p.sub):
      assert signature
      sig = GraphSignature(signature)
      assert sig.outputs, '{}'.format(signature)
      name = sub.name
      if not name:
        name = '%s_%02d' % (sig.outputs[0], i)
        sub.name = name
      self.CreateChild(name, sub)
      self._seq.append((name, sig, self.children[name]))
def __init__(self, params):
  super(SequentialLayer, self).__init__(params)
  p = self.params
  assert p.name
  with tf.variable_scope(p.name):
    if p.repeat <= 1:
      self._seq = []
      for sub in p.sub:
        self.CreateChild(sub.name, sub)
        self._seq.append((sub.name, self.children[sub.name]))
    else:
      # We create 'repeat' number of sub layers. Each sub layer is a
      # sequential layer specified by 'sub'. This allows us to name each
      # repetition with a unique name.
      children = []
      for i in range(p.repeat):
        children.append(p.Copy().Set(name='%03d' % i, repeat=1))
      self.CreateChildren('rep', children)
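# Aside (illustration under the same builder_layers assumption as above): with
# repeat > 1 the layer wraps copies of itself, so each repetition becomes a
# child of 'rep' named '000', '001', ..., and its variables live under scopes
# such as 'seq/000/fc/w', 'seq/001/fc/w', etc. A hedged sketch:
from lingvo.core import builder_layers

body_p = builder_layers.LinearLayer.Params().Set(
    name='fc', input_dims=16, output_dims=16)
seq_p = builder_layers.SequentialLayer.Params().Set(
    name='seq', repeat=3, sub=[body_p])
seq = seq_p.Instantiate()  # seq.rep holds 3 single-repetition copies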
def __init__(self, params):
  super(BatchNormLayerNoPadding, self).__init__(params)
  p = self.params
  assert p.name, 'Name of BatchNormLayerNoPadding is not set.'
  p.fprop_dtype = None

  # Skip L-P regularization for these variables.
  collections = [
      self.__class__.__name__ + '_vars', py_utils.SKIP_LP_REGULARIZATION
  ]
  pc = py_utils.WeightParams(
      shape=[p.dim],
      init=py_utils.WeightInit.Constant(0.0),
      dtype=p.dtype,
      collections=collections)

  with tf.variable_scope(p.name):
    self.CreateVariable('beta', pc)
    # Note: the real gamma to use is 1 + gamma.
    self.CreateVariable('gamma', pc, lambda x: 1.0 + x)
    moving_collections = [
        'moving_vars', tf.GraphKeys.MOVING_AVERAGE_VARIABLES,
        self.__class__.__name__ + '_vars'
    ]
    mva = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(0.0),
        dtype=p.dtype,
        collections=moving_collections)
    # Two statistics computed from sufficient stats.
    self.CreateVariable('moving_mean', mva, trainable=False)
    mvv = py_utils.WeightParams(
        shape=[p.dim],
        init=py_utils.WeightInit.Constant(1.0),
        dtype=p.dtype,
        collections=moving_collections)
    self.CreateVariable('moving_variance', mvv, trainable=False)

  # Accumulate bn sufficient stats over micro-batches.
  dim = self.vars.beta.shape[0]
  self.RegisterAccumulator('counts', AddingAccumulator([], p.dtype))
  self.RegisterAccumulator('mean_ss', AddingAccumulator([dim], p.dtype))
  self.RegisterAccumulator('variance_ss', AddingAccumulator([dim], p.dtype))
def __init__(self, params):
  super(MTEncoderUniRNN, self).__init__(params)
  p = self.params
  assert not p.packed_input, ('Packed inputs are not yet supported for '
                              'MTEncoderUniRNN.')

  with tf.variable_scope(p.name):
    if p.cc_schedule is None:
      self.cc_schedule = None
    else:
      self.CreateChild('cc_schedule', p.cc_schedule)

    self.CreateChild('emb', p.emb)

    rnn_layers_params = []
    num_input_nodes = p.emb.embedding_dim
    for i in range(p.num_lstm_layers):
      cell = p.lstm_tpl.Copy()
      cell.name = 'L%d_rnn' % i
      cell.num_input_nodes = num_input_nodes
      cell.num_output_nodes = p.lstm_cell_size
      params = model_helper.CreateUnidirectionalRNNParams(self.params, cell)
      params.name = 'L%d' % i
      rnn_layers_params.append(params)
      num_input_nodes = cell.num_output_nodes

    self.CreateChildren('rnn', rnn_layers_params)

    dropout_p = layers.DropoutLayer.Params().Set(
        name='dropout_layer',
        keep_prob=1.0 - p.dropout_prob,
        random_seed=p.random_seed + 827366448 if p.random_seed else None)
    self.CreateChild('dropout', dropout_p)

    if p.is_transparent:
      transparent_params = p.transparent_merger_tpl.Copy()
      transparent_params.name = 'transparent'
      transparent_params.num_sources = p.num_lstm_layers
      self.CreateChild('transparent_merger', transparent_params)
def __init__(self, params):
  super(QuantizableLayer, self).__init__(params)
  p = self.params
  self._tracked_tensors = dict()  # Tracked t_name -> (QDomain).
  self._qstate = None  # t_name -> Tensor

  # Instantiate quantization domains.
  with tf.variable_scope(p.name + '/q'):
    self._qdomains = dict()  # Dict of qdname -> QDomain or None.
    for qdname in dir(p.qdomain):
      qdparams = p.qdomain.Get(qdname)
      if qdparams is None:
        continue
      assert issubclass(qdparams.cls, QDomain), (
          'Expected quantized domain %s to extend QDomain' % qdname)
      qdchild_name = 'qdomain_' + qdname
      self.CreateChild(qdchild_name, qdparams)
      self._qdomains[qdname] = self.children[qdchild_name]
  self._AddQuantizationFunctions()
def __init__(self, params):
  super(SoftCondLayer, self).__init__(params)
  p = self.params
  assert p.name
  assert p.num_experts
  assert p.cond_dim

  with tf.variable_scope(p.name):
    # Create variables for task weight mapping.
    collections = [
        self.__class__.__name__ + '_vars',
    ]
    w_p = py_utils.WeightParams(
        shape=[p.cond_dim, p.num_experts],
        init=p.params_init,  # TODO(huangyp): try zero init instead.
        dtype=p.dtype,
        collections=collections)
    self.CreateVariable('w', w_p)
    # Prepends p.num_experts to the tensor shape of every variable created
    # by p.body.
    with py_utils.VariableShapePrefixContext(p.num_experts):
      self.CreateChild('body', p.body)
def CreateChildrenHelper(params_list, child_scopes):
  """Helper to create children recursively."""
  if child_scopes and len(child_scopes) != len(params_list):
    raise ValueError('child_scopes must be same structure as params_list.')
  children = []
  for i, p in enumerate(params_list):
    if isinstance(p, list):
      children.append(
          CreateChildrenHelper(p, child_scopes[i] if child_scopes else None))
    else:
      p = self.CopyBaseParams(self.params, p.Copy())
      if not p.name:
        p.name = '%s_%d' % (name, i)
      if child_scopes:
        with tf.variable_scope(child_scopes[i]):
          children.append(p.Instantiate())
      else:
        children.append(p.Instantiate())
  return children
def __init__(self, params):
  super(DevBasedSchedule, self).__init__(params)
  p = self.params

  with tf.variable_scope(p.name):
    wp = py_utils.WeightParams(
        shape=[],
        init=py_utils.WeightInit.Constant(1.0),
        collections=['DevBasedSchedule_vars'],
        dtype=tf.float32)
    _, self._cur_factor = py_utils.CreateVariable(
        'cur_factor', wp, trainable=False)
    wp = py_utils.WeightParams(
        shape=[],
        init=py_utils.WeightInit.Constant(0),
        collections=['DevBasedSchedule_vars'],
        dtype=tf.int64)
    _, self._ref_step = py_utils.CreateVariable(
        'ref_step', wp, trainable=False)
    self._metric_history = early_stop.MetricHistory(p.metric_history)
    self._best_step = ops.best_step(self._metric_history.hist_file,
                                    p.tolerance)
def Apply(self, lr, var_grad):
  """Applies the gradient to the variable.

  Args:
    lr: A scalar. The base learning rate.
    var_grad: A `.NestedMap` of (var, grad) pairs.

  Returns:
    The variable update op.
  """
  optimizer = self.GetOptimizer(lr)

  def _Apply():
    if self.params.use_bf16_gradients_ar:
      return optimizer.apply_gradients(
          [(tf.cast(g, tf.float32), v) for (v, g) in var_grad.Flatten()],
          name='meta_backprop')
    else:
      return optimizer.apply_gradients(
          [(g, v) for (v, g) in var_grad.Flatten()], name='meta_backprop')

  if not py_utils.use_resource_variables():
    var_update_op = _Apply()
  else:
    # Many optimizers, e.g., Adam, Adagrad, etc., create variables. We need to
    # ensure name scope and variable scope are cleared. Otherwise,
    # tpu.batch_parallel does not work.
    with tf.name_scope(None):
      with tf.variable_scope(
          tf.VariableScope(use_resource=True,
                           reuse=self.VarReuseForSlotVars())):
        var_update_op = _Apply()
  self.AddSummary(lr, optimizer, var_grad)
  return var_update_op
def __init__(self, params):
  super(MTEncoderV1, self).__init__(params)
  p = self.params
  assert not p.packed_input, ('Packed inputs are not yet supported for '
                              'MTEncoderV1.')

  with tf.variable_scope(p.name):
    if p.cc_schedule is not None:
      self.CreateChild('cc_schedule', p.cc_schedule)

    self.CreateChild('emb', p.emb)

    rnn_layers_params = []

    # L0 is a bi-directional lstm.

    # L0's forward lstm cell.
    if p.lstm_tpl_bidi is None:
      params = p.lstm_tpl.Copy()
    else:
      params = p.lstm_tpl_bidi.Copy()
    params.name = 'L0_rnn_fwd'
    params.num_input_nodes = p.emb.embedding_dim
    params.num_output_nodes = p.lstm_cell_size
    forward_lstm = params

    # L0's backward lstm cell.
    params = params.Copy()
    params.name = 'L0_rnn_bak'
    backward_lstm = params

    # L0 layer.
    params = model_helper.CreateBidirectionalRNNParams(
        self.params, forward_lstm, backward_lstm)
    params.name = 'L0'
    rnn_layers_params.append(params)

    # The latter layers are all uni-directional lstm.
    input_size = 2 * p.lstm_cell_size
    for i in range(1, p.num_lstm_layers):
      # Forward lstm cell.
      if p.lstm_tpl_uni is None:
        cell = p.lstm_tpl.Copy()
      else:
        cell = p.lstm_tpl_uni.Copy()
      cell.name = 'L%d_rnn' % i
      cell.num_input_nodes = input_size
      cell.num_output_nodes = p.lstm_cell_size
      # Forward lstm layer.
      params = model_helper.CreateUnidirectionalRNNParams(self.params, cell)
      params.name = 'L%d' % i
      rnn_layers_params.append(params)
      input_size = p.lstm_cell_size

    self.CreateChildren('rnn', rnn_layers_params)

    dropout_p = layers.DropoutLayer.Params().Set(
        name='dropout_layer',
        keep_prob=1.0 - p.dropout_prob,
        random_seed=p.random_seed + 84828474 if p.random_seed else None)
    self.CreateChild('dropout', dropout_p)
def __init__(self, params):
  super(MTEncoderBiRNN, self).__init__(params)
  p = self.params

  with tf.variable_scope(p.name):
    if p.cc_schedule is None:
      self.cc_schedule = None
    else:
      self.CreateChild('cc_schedule', p.cc_schedule)

    self.CreateChild('emb', p.emb)

    rnn_layers_params = []
    for i in range(p.num_lstm_layers):
      params = p.lstm_tpl.Copy()
      params.name = 'L%d_rnn_fwd' % i
      if i == 0:
        params.num_input_nodes = p.emb.embedding_dim
      else:
        params.num_input_nodes = 2 * p.lstm_cell_size
      params.num_output_nodes = p.lstm_cell_size
      params.reset_cell_state = p.packed_input
      forward_lstm = params

      params = params.Copy()
      params.name = 'L%d_rnn_bak' % i
      params.reset_cell_state = p.packed_input
      backward_lstm = params

      params = model_helper.CreateBidirectionalRNNParams(
          self.params, forward_lstm, backward_lstm)
      params.packed_input = p.packed_input
      params.name = 'L%d' % i
      rnn_layers_params.append(params)

    self.CreateChildren('rnn', rnn_layers_params)

    if p.lstm_cell_size * 2 != p.encoder_out_dim:
      # Project the encoder output to the desired dim.
      proj_p = p.proj_tpl.Copy().Set(
          name='proj',
          batch_norm=False,
          input_dim=p.lstm_cell_size * 2,
          output_dim=p.encoder_out_dim)
      if p.cc_schedule is not None:
        proj_p.has_bias = False
        proj_p.activation = 'TANH'
      else:
        proj_p.has_bias = True
        proj_p.activation = 'NONE'
      self.CreateChild('final_proj', proj_p)

    dropout_p = layers.DropoutLayer.Params().Set(
        name='dropout_layer',
        keep_prob=1.0 - p.dropout_prob,
        random_seed=p.random_seed + 827366448 if p.random_seed else None)
    self.CreateChild('dropout', dropout_p)

    if p.is_transparent:
      transparent_params = p.transparent_merger_tpl.Copy()
      transparent_params.name = 'transparent'
      transparent_params.num_sources = p.num_lstm_layers
      self.CreateChild('transparent_merger', transparent_params)