def Apply(self, lr, var_grad):
  p = self.params

  def _Acc(vg):
    """Updating accumulators."""
    v, g = vg
    with tf.variable_scope(v.op.name):
      a = py_utils.CreateVariable(
          'grad_accumulator',
          py_utils.WeightParams(v.get_shape(),
                                py_utils.WeightInit.Constant(0.0),
                                self.params.dtype),
          trainable=False)
      a = tf.assign_add(a, g)

    return py_utils.VarGrad(v, a)

  var_grad = var_grad.Transform(_Acc)

  def _ApplyAndReset():
    with tf.control_dependencies([
        self._opt.Apply(
            lr, py_utils.ApplyGradMultiplier(var_grad, 1. / p.accum_steps))
    ]):
      return tf.group(
          *[tf.assign(a, tf.zeros_like(a)) for _, a in var_grad.Flatten()])

  if self.params.add_summary_in_apply:
    self.AddSummary(lr, self.GetOptimizer(lr), var_grad)
  return tf.cond(
      tf.equal(
          tf.math.floormod(self.global_step, p.accum_steps),
          p.accum_steps - 1), _ApplyAndReset,
      lambda: tf.group(tf.no_op()))
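# Hedged sketch (not part of the codebase): a plain-Python analogue of the
# accumulation schedule above. Gradients are summed into an accumulator each
# step; only when step % accum_steps == accum_steps - 1 is the averaged
# gradient applied and the accumulator reset. The names `accum_steps` and
# `apply_fn` are illustrative assumptions, not library API.
import numpy as np

def accumulate_and_maybe_apply(step, grad, accumulator, accum_steps, apply_fn):
  """Adds `grad` to `accumulator`; applies the mean gradient every N steps."""
  accumulator += grad
  if step % accum_steps == accum_steps - 1:
    apply_fn(accumulator / accum_steps)        # analogous to ApplyGradMultiplier
    accumulator = np.zeros_like(accumulator)   # analogous to the reset tf.assign
  return accumulator

# Example: with accum_steps=4, apply_fn fires on steps 3 and 7 only.
acc = np.zeros(2)
for step in range(8):
  acc = accumulate_and_maybe_apply(
      step, np.ones(2), acc, accum_steps=4,
      apply_fn=lambda g: print('apply mean grad:', g))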
def invoke_async_preconditioner_computation(self, global_step_int32):
  """Invokes the SVD preconditioner computation; the graph runs on the CPU."""
  keys_stats_and_rank = []
  for var in self._all_vars_for_preconditioning:
    shape = var.get_shape()
    if not self._fallback_to_diagonal_for_shape(shape):
      partitioned_v = TensorPartitioner.partition_tensor(
          var, self._partition_info)
      num_partitions = len(partitioned_v)
      for pt_idx, pt_v in enumerate(partitioned_v):
        pt_v_shape = pt_v.get_shape()
        preconditioner_exists_for_dim = (
            self._preconditioner_available_for_dims(pt_v_shape))
        for i in range(len(pt_v_shape)):
          if preconditioner_exists_for_dim[i]:
            rank = sum(preconditioner_exists_for_dim)
            key = self._key_for_var(var, i, pt_idx)
            stat = self.get_slot(
                var,
                self._statistics_key_for_partition_and_dim(
                    i, pt_idx, num_partitions))
            keys_stats_and_rank.append((key, stat, rank))

  if not keys_stats_and_rank:
    return tf.no_op()

  keys, stats, ranks = zip(*keys_stats_and_rank)
  return x_ops.compute_preconditioners(
      stats, [-1.0 / (2.0 * r) for r in ranks],
      global_step_int32,
      keys=keys,
      sync=self._synchronous_preconditioning,
      preconditioner_compute_graphdef=self._preconditioner_compute_graphdef)
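# Hedged sketch (plain numpy, not the x_ops kernel): the exponent list passed
# above, -1.0 / (2.0 * rank), is the matrix power applied to each dimension's
# statistics matrix. One way to compute such an inverse matrix root for a
# symmetric PSD statistics matrix is via eigendecomposition; the `ridge`
# parameter is an illustrative assumption for numerical stability.
import numpy as np

def inverse_matrix_root(stat, exponent, ridge=1e-6):
  """Computes stat ** exponent (exponent < 0) for a symmetric PSD matrix."""
  # Regularize so tiny or zero eigenvalues do not blow up under a negative power.
  w, v = np.linalg.eigh(stat + ridge * np.eye(stat.shape[0]))
  return (v * np.power(w, exponent)) @ v.T  # v diag(w**exponent) v^T

stat = np.array([[4.0, 1.0], [1.0, 3.0]])
rank = 2  # number of preconditioned dims, as in the loop above
precond = inverse_matrix_root(stat, -1.0 / (2.0 * rank))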
def ApplyPostTrainingLoop(self):
  """Applies any computation to run after each TPU training loop.

  Returns:
    Ops to run after training loop ends.
  """
  return tf.no_op()
def _OutfeedEnqueue(self, per_example_tensors):
  if not per_example_tensors:
    return tf.no_op()
  per_example_tensors = py_utils.NestedMap(per_example_tensors)
  device = tpu.core(0) if self.spmd else ''
  with tf.device(device):
    return tpu_ops.outfeed_enqueue_tuple(per_example_tensors.Flatten())
def FProp(self, theta, inputs, *extra_inputs):
  initial_step_seed = py_utils.GetStepSeed()
  final_step_seed = py_utils.GenerateSeedFromName(
      tf.no_op(name='new_step_seed').name)
  num_layers = len(self.sub_layers)

  def Bak(inputs, outputs, d_outputs):
    """Backward step."""
    del inputs  # unused
    output_acts, step_seeds = outputs
    d_outputs = d_outputs[0]

    d_layer_thetas = []
    for layer_idx in reversed(range(num_layers)):
      f_seed, g_seed = step_seeds[layer_idx]
      layer = self.sub_layers[layer_idx]
      layer_theta = theta.sub_layers[layer_idx]
      input_acts, d_inputs, d_theta = layer.ReverseAndGrad(
          layer_theta, output_acts, d_outputs, f_seed, g_seed, *extra_inputs)

      d_layer_thetas.append(d_theta)
      # Passes reconstructed inputs to the previous layer.
      output_acts = input_acts
      d_outputs = d_inputs
    py_utils.ResetStepSeed(final_step_seed)
    d_theta = py_utils.NestedMap()
    d_theta.sub_layers = list(reversed(d_layer_thetas))

    extra_grads = [tf.zeros_like(t) for t in extra_inputs]
    return [tf.zeros_like(initial_step_seed), d_theta, d_inputs, extra_grads]

  def Fwd(xs):
    """Forward pass."""
    initial_step_seed, theta, acts, extra_inputs = xs
    py_utils.ResetStepSeed(initial_step_seed)
    layer_step_seeds = []

    for layer_theta, layer in zip(theta.sub_layers, self.sub_layers):
      acts, f_seed, g_seed = layer.FProp(layer_theta, acts, *extra_inputs)
      layer_step_seeds += [(f_seed, g_seed)]
    return [acts, layer_step_seeds]

  if self.params.custom_gradient:
    acts, _ = py_utils.CallDefun(
        Fwd, [initial_step_seed, theta, inputs, extra_inputs], Bak)
    py_utils.ResetStepSeed(final_step_seed)
    return acts
  else:
    acts = inputs
    for layer_theta, layer in zip(theta.sub_layers, self.sub_layers):
      acts, _, _ = layer.FProp(layer_theta, acts, *extra_inputs)
    return acts
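# Hedged sketch (illustrative, not the layer API above): the custom-gradient
# path relies on reversible couplings whose inputs can be reconstructed
# exactly from their outputs, so per-layer activations need not be stored for
# the backward pass. A minimal RevNet-style coupling in numpy, with toy f and
# g standing in for the sub-layer residual branches:
import numpy as np

def f(x):  # toy residual branch
  return np.tanh(x)

def g(x):  # toy residual branch
  return 0.5 * x

def forward(x1, x2):
  y1 = x1 + f(x2)
  y2 = x2 + g(y1)
  return y1, y2

def reverse(y1, y2):
  # Invert the coupling; ReverseAndGrad plays an analogous role per layer
  # before handing the reconstructed inputs to the previous layer.
  x2 = y2 - g(y1)
  x1 = y1 - f(x2)
  return x1, x2

x1, x2 = np.random.randn(3), np.random.randn(3)
assert np.allclose((x1, x2), reverse(*forward(x1, x2)))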
def ApplyPostTrainingLoop(self, global_step):
  """Applies any computation to run after each TPU training loop.

  Args:
    global_step: Global step variable.

  Returns:
    Ops to run after training loop ends.
  """
  return tf.no_op()
def Apply(self, metrics, vmap, gradient_mask=None, gradient_adjuster=None):
  """Computes updates on 'vmap' to optimize 'loss'.

  TODO(rpang): explore merging gradient_mask and gradient_adjuster.

  Args:
    metrics: A Dict[str, (value, weight)], from which loss can be extracted
      according to p.loss_name.
    vmap: A `.NestedMap` object containing variables to optimize.
    gradient_mask: if not None, a dict mapping variable names to a 0/1 scalar.
    gradient_adjuster: if not None, a function that mutates a given var_grads.

  Returns:
    (losses, op, eval_metrics), where
      - losses is a list of scalar tensors;
      - op is a tf.Operation to update variables;
      - eval_metrics is a Dict[str, (value, weight)], where each value/weight
        is a scalar tensor.
  """
  # We apply gradients outside the name_scope to maintain backwards
  # compatibility on variables created by self.optimizer.Apply().
  losses, var_grads, eval_metrics = self._ComputeLossesAndGradients(
      metrics, vmap)
  if 'tpu_embedding_var_grads' in var_grads:
    tpu_embedding_var_grads = var_grads.tpu_embedding_var_grads
    del var_grads.tpu_embedding_var_grads

    tpu_embedding_collection = py_utils.GetTpuEmbeddingGraphCollection()[0]
    assert tpu_embedding_collection
    tpu_emb_update_op, stats = tpu_embedding_collection.ApplyGradients(
        py_utils.GetTaskCallScope(),
        tpu_embedding_var_grads.Transform(lambda var_grad: var_grad.grad))
    eval_metrics.update(stats)
  else:
    tpu_emb_update_op = tf.no_op()

  assert py_utils.GetGlobalStep() is not None
  lr = self.LearningRate()

  var_grads, stats = self.AdjustGradients(
      var_grads,
      gradient_mask=gradient_mask,
      gradient_adjuster=gradient_adjuster)
  eval_metrics.update(stats)
  self._var_grads = var_grads

  eval_metrics['learning_rate'] = (tf.convert_to_tensor(lr),
                                   tf.convert_to_tensor(1.))

  var_update_op = tf.group(
      [tpu_emb_update_op, self.optimizer.Apply(lr, var_grads)])
  return losses, var_update_op, eval_metrics
def mask_update_op(self):
  with tf.name_scope(self._spec.name):
    if not self._assign_ops:
      self._get_mask_assign_ops()
    with tf.control_dependencies([
        tf.assign(
            self._last_update_step,
            self._global_step,
            name='last_mask_update_step_assign')
    ]):
      with tf.control_dependencies(self._assign_ops):
        tf.logging.info('Updating masks.')
        return tf.no_op('mask_update')
def _GetMaskUpdateOp(self):
  """Returns op to update masks and threshold variables for model pruning."""
  p = self.params
  tp = p.train
  mask_update_op = tf.no_op()
  if tp.pruning_hparams_dict:
    assert isinstance(tp.pruning_hparams_dict, dict)
    pruning_hparams = pruning.get_pruning_hparams().override_from_dict(
        tp.pruning_hparams_dict)
    pruning_obj = pruning.Pruning(
        pruning_hparams, global_step=self.global_step)
    pruning_obj.add_pruning_summaries()
    mask_update_op = pruning_obj.conditional_mask_update_op()
  return mask_update_op
def _Apply():
  if not var_grad.Flatten():
    tf.logging.warning(
        'No gradients are available for optimizer.Apply(). '
        'Make sure this is expected.')
    return tf.no_op()
  if self.params.use_bf16_gradients_ar:
    return self._optimizer.apply_gradients(
        [(tf.cast(g, tf.float32), v) for (v, g) in var_grad.Flatten()],
        name='meta_backprop')
  else:
    return self._optimizer.apply_gradients(
        [(g, v) for (v, g) in var_grad.Flatten()], name='meta_backprop')
def testExponentialMovingAverage(self):
  p = base_model.SingleTaskModel.Params()
  p.task = BaseTaskTest.TestParams()
  p.task.input = base_input_generator.BaseSequenceInputGenerator.Params()
  p.train.ema_decay = 0.9
  model = p.Instantiate()
  model._task.CreateChild(
      'a', layers.BatchNormLayer.Params().Set(name='a', dim=1))
  model._task._train_op = tf.no_op()
  model._task.ApplyExponentialMovingAverage(model.ema)
  with tf.variable_scope('', reuse=True):
    beta = tf.get_variable('a/beta/var')
    mean = tf.get_variable('a/moving_mean/var')
  self.assertIsNotNone(model.ema.average(beta))
  self.assertIsNone(model.ema.average(mean))
def testExponentialMovingAverage(self):
  p = base_model.SingleTaskModel.Params()
  p.task = BaseTaskTest.TestParams()
  p.task.input = base_input_generator.BaseSequenceInputGenerator.Params()
  p.task.train.ema_decay = 0.9
  p.task.train.ema_decay_moving_vars = False
  model = p.Instantiate()
  task = model._task
  task._train_op = tf.no_op()
  task.ApplyExponentialMovingAverage(model.ema)
  with tf.variable_scope('base_mdl', reuse=True):
    beta = tf.get_variable('x/beta/var')
    mean = tf.get_variable('x/moving_mean/var')
  self.assertIsNotNone(model.ema.average(beta))
  self.assertIsNone(model.ema.average(mean))
def assign_preconditioner_to_host_vars(self):
  """Assigns the latest copy of the preconditioners to the host variables."""
  keys_shapes_and_preconditioner_vars = []
  assign_ops = []
  for var in self._all_vars_for_preconditioning:
    shape = var.get_shape()
    if not self._fallback_to_diagonal_for_shape(shape):
      partitioned_v = TensorPartitioner.partition_tensor(
          var, self._partition_info)
      num_partitions = len(partitioned_v)
      for pt_idx, pt in enumerate(partitioned_v):
        pt_shape = pt.get_shape()
        preconditioner_exists_for_dim = (
            self._preconditioner_available_for_dims(pt_shape))
        var_rank = len(pt_shape)
        for i in range(var_rank):
          if preconditioner_exists_for_dim[i]:
            key = self._key_for_var(var, i, pt_idx)
            preconditioner = self.get_slot(
                var,
                self._preconditioner_key_for_partition_and_dim(
                    i, pt_idx, num_partitions))
            keys_shapes_and_preconditioner_vars.append(
                (key, tf.shape(preconditioner), preconditioner))

  if not keys_shapes_and_preconditioner_vars:
    return tf.no_op()

  keys, shapes, preconditioner_vars = zip(
      *keys_shapes_and_preconditioner_vars)
  preconditioner_vals, successes = x_ops.get_preconditioners(
      shapes,
      keys=keys,
      preconditioner_compute_graphdef=(
          self._preconditioner_compute_graphdef))

  for preconditioner_var, preconditioner_val, success in zip(
      preconditioner_vars, preconditioner_vals, successes):
    # Only overwrite the host copy when the preconditioner computation
    # succeeded; otherwise keep the previous value. (Cast against the
    # assigned variable's dtype rather than the loop variable of the
    # collection pass above.)
    success_mult = tf.cast(success, preconditioner_var.dtype)
    assign_ops.append(
        state_ops.assign(
            preconditioner_var, (1.0 - success_mult) * preconditioner_var +
            success_mult * preconditioner_val))
  return tf.group(*assign_ops)
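# Hedged sketch (plain numpy, illustrative only): the blended assignment above,
# (1 - success_mult) * old + success_mult * new, keeps the previous host copy
# whenever the asynchronous preconditioner computation reports failure.
import numpy as np

def blended_update(old, new, success):
  m = np.float64(success)  # 1.0 on success, 0.0 on failure
  return (1.0 - m) * old + m * new

old = np.eye(2)
new = 2.0 * np.eye(2)
assert np.allclose(blended_update(old, new, success=False), old)
assert np.allclose(blended_update(old, new, success=True), new)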
def _OutfeedDequeueLoop(self, per_example_tensors, num_loops, num_devices):
  """Process all per-example tensor outfeed data for a TPU sess.run.

  Args:
    per_example_tensors: dict of key -> tensor as generated by TpuTrainStep.
    num_loops: number of times that TpuTrainStep will be executed by TpuTrain.
    num_devices: number of TPU cores assigned to this process.

  Returns:
    A dict of per-example tensors from the latest TpuTrainStep.
  """
  if not per_example_tensors:
    return tf.no_op()

  tensor_shapes = [
      py_utils.GetShape(per_example_tensors[key])
      for key in sorted(per_example_tensors)
  ]
  tensor_types = [
      tf.as_dtype(per_example_tensors[key].dtype)
      for key in sorted(per_example_tensors)
  ]

  def LoopBody(i, *input_arrays):
    """Process outfeed data for a single TpuTrainStep.

    Args:
      i: current loop index.
      *input_arrays: One tf.TensorArray per outfeed tensor.

    Returns:
      i+1 (new index) plus post-write tf.TensorArray handles.
    """
    # Outfeed ops execute on each JF node, so they must be located on the
    # nodes.
    outfeed_devices = []
    device_assignment = py_utils.GetTpuDeviceAssignment()
    assert device_assignment
    for replica in range(device_assignment.num_replicas):
      for core in range(device_assignment.num_cores_per_replica):
        with tf.device(device_assignment.host_device(replica, core)):
          outfeed_devices.append(
              tpu_ops.outfeed_dequeue_tuple(
                  tensor_types,
                  tensor_shapes,
                  device_ordinal=device_assignment.tpu_ordinal(replica,
                                                               core)))
    offset = i * num_devices
    output_arrays = list(input_arrays)
    # Each output_array holds a different per-example tensor. We get results
    # for each tensor from each TPU for each TpuTrainStep call.
    for j in range(len(output_arrays)):
      for k in range(len(outfeed_devices)):
        output_arrays[j] = output_arrays[j].write(offset + k,
                                                  outfeed_devices[k][j])

    return tuple([i + 1] + output_arrays)

  def LoopCond(i, *output_arrays):
    del output_arrays
    return i < num_loops

  output_arrays = []
  for i in range(len(tensor_shapes)):
    output_arrays.append(
        tf.TensorArray(
            tensor_types[i],
            size=num_loops * num_devices,
            element_shape=tensor_shapes[i]))
  # Loop once for each time that TpuTrainStep runs.
  output_arrays = tf.while_loop(
      LoopCond, LoopBody, [0] + output_arrays, parallel_iterations=1)[1:]
  concatenated_arrays = [array.concat() for array in output_arrays]
  return dict(zip(sorted(per_example_tensors), concatenated_arrays))
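# Hedged sketch (plain Python, illustrative only): the TensorArray indexing in
# LoopBody places the result of device k at loop iteration i into slot
# i * num_devices + k, so the final concat is ordered loop-major, device-minor.
num_loops, num_devices = 3, 2
slots = [None] * (num_loops * num_devices)
for i in range(num_loops):        # one iteration per TpuTrainStep
  offset = i * num_devices
  for k in range(num_devices):    # one dequeue per TPU core
    slots[offset + k] = ('loop%d' % i, 'device%d' % k)
assert slots[0] == ('loop0', 'device0') and slots[3] == ('loop1', 'device1')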
def _OutfeedEnqueue(self, per_example_tensors):
  if not per_example_tensors:
    return tf.no_op()
  per_example_tensors = py_utils.NestedMap(per_example_tensors)
  return tpu_ops.outfeed_enqueue_tuple(per_example_tensors.Flatten())
def _apply_sparse(self, grad, var):
  return tf.no_op()
def _resource_apply_sparse(self, grad_values, var, grad_indices):
  return tf.no_op()
def PostTrainingStepUpdate(self, global_step):
  summary_utils.scalar('cap', self._Value(global_step))
  return tf.no_op()
def NoOP(*args, **kwargs):
  return tf.no_op()
def testBatchNormLayer(self):
  p = base_model.SingleTaskModel.Params()
  p.task = self.TestParams(layers.BatchNormLayer.Params().Set(dim=1))
  p.task.train.ema_decay = 0.9
  p.task.train.ema_decay_moving_vars = True
  model = p.Instantiate()
  self.assertIsNotNone(model.ema)
  task = model._task
  task._train_op = tf.no_op()
  task.ApplyExponentialMovingAverage(model.ema)

  layer = task.encoder
  self.assertLen(layer.vars, 4)
  for var in layer.vars.Flatten():
    self.assertIsNotNone(model.ema.average(var), msg=var.name)
  beta = layer.vars.beta
  mean = layer.vars.moving_mean

  global_step = 100
  beta_1 = np.asarray([.2])
  mean_1 = np.asarray([.03])
  beta_1_ema = beta_1 * .1
  mean_1_ema = mean_1 * .1
  with self.session() as sess:
    # Test EMA values.
    sess.run(tf.global_variables_initializer())
    sess.run(tf.assign(py_utils.GetOrCreateGlobalStepVar(), global_step))
    sess.run(tf.assign(beta, beta_1))
    sess.run(tf.assign(mean, mean_1))
    sess.run(task._post_train_ops)
    self.assertAllClose(
        [beta_1, beta_1_ema, mean_1, mean_1_ema],
        sess.run(
            [beta, model.ema.average(beta), mean, model.ema.average(mean)]))

    # Test checkpointer.
    train_dir = os.path.join(self.get_temp_dir(), 'testSaveRestore')
    os.mkdir(train_dir)
    saver = checkpointer.Checkpointer(train_dir, model)
    saver.Save(sess, model.global_step)
    self.assertTrue(
        os.path.isfile(
            os.path.join(train_dir, 'ckpt-%08d.index' % global_step)))

  # Restore from ckpt in training mode.
  with self.session(graph=tf.Graph()) as sess:
    model = p.Instantiate()
    self.assertIsNotNone(model.ema)
    task = model._task
    task._train_op = tf.no_op()
    task.ApplyExponentialMovingAverage(model.ema)
    layer = task.encoder
    for var in layer.vars.Flatten():
      self.assertIsNotNone(model.ema.average(var), msg=var.name)
    beta = layer.vars.beta
    mean = layer.vars.moving_mean

    saver = checkpointer.Checkpointer(train_dir, model)
    saver.RestoreIfNeeded(sess)

    self.assertAllClose(
        [beta_1, beta_1_ema, mean_1, mean_1_ema],
        sess.run(
            [beta, model.ema.average(beta), mean, model.ema.average(mean)]))

  # Restore from ckpt in eval mode.
  with self.session(graph=tf.Graph()) as sess, self.SetEval(True):
    model = p.Instantiate()
    self.assertIsNotNone(model.ema)
    task = model._task
    # task._train_op = tf.no_op()
    # task.ApplyExponentialMovingAverage(model.ema)
    layer = task.encoder
    # for var in layer.vars.Flatten():
    #   self.assertIsNotNone(model.ema.average(var), msg=var.name)
    beta = layer.vars.beta
    mean = layer.vars.moving_mean

    saver = checkpointer.Checkpointer(train_dir, model)
    saver.RestoreIfNeeded(sess)

    # Both beta and mean should use the EMA value.
    self.assertAllClose([beta_1_ema, mean_1_ema], sess.run([beta, mean]))
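# Hedged sketch (plain Python, illustrative only): why the test above expects
# beta_1 * .1 and mean_1 * .1. The EMA shadow starts at the variable's initial
# value (0 for beta and moving_mean here), so with ema_decay = 0.9 a single
# update gives ema = decay * ema + (1 - decay) * value = 0.9 * 0 + 0.1 * value.
def ema_update(ema, value, decay=0.9):
  return decay * ema + (1.0 - decay) * value

assert abs(ema_update(0.0, 0.2) - 0.02) < 1e-12   # matches beta_1 * .1
assert abs(ema_update(0.0, 0.03) - 0.003) < 1e-12  # matches mean_1 * .1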
def _Accum():
  return tf.no_op()
def no_update_op():
  return tf.no_op()
def control_after_assigns(self):
  if not self._assign_ops:
    return tf.no_op()
  with tf.control_dependencies(self._assign_ops):
    return tf.no_op()