def _keras_callback_on_batch_end(callback, batch, logs=None):
    """Broadcast should be done after the first gradient step to ensure optimizer initialization."""
    if callback.broadcast_done:
        return
    if _tf_major_version == 2:
        if hasattr(callback.model, 'variables'):
            # Synchronize the model weights across all workers.
            for v in callback.model.variables:
                _tf_assign(v, broadcast(v))
            # The optimizer state (slot variables) only exists after the
            # first gradient step, so it can be broadcast here as well.
            if hasattr(callback.model.optimizer, 'variables'):
                opt_variables = callback.model.optimizer.variables()
            else:
                opt_variables = callback.model.optimizer.optimizer.variables()
            for v in opt_variables:
                _tf_assign(v, broadcast(v))
        else:
            raise RuntimeError('No variables() in %s' % callback.model)
    if _tf_major_version == 1:
        tf.keras.backend.get_session().run(BroadcastGlobalVariablesOp())
    callback.broadcast_done = True
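# Illustrative sketch (assumption, not part of the library API): a thin Keras
# callback that wires the hook above into ``on_batch_end``. The class name is
# hypothetical; it only shows how ``broadcast_done`` and the hook interact.
import tensorflow as tf

class _BroadcastOnceCallback(tf.keras.callbacks.Callback):
    def __init__(self):
        super(_BroadcastOnceCallback, self).__init__()
        # Read and set by _keras_callback_on_batch_end so the broadcast
        # happens exactly once, after the first training batch.
        self.broadcast_done = False

    def on_batch_end(self, batch, logs=None):
        _keras_callback_on_batch_end(self, batch, logs)

# Usage: model.fit(dataset, callbacks=[_BroadcastOnceCallback()])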
def broadcast_variables(variables):
    """A TensorFlow function that broadcasts global variables.

    This function is often used with ``tf.GradientTape`` or
    embedded as part of a training program.
    """
    for v in variables:
        _tf_assign(v, broadcast(v))
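# Usage sketch (assumption: ``model``, ``optimizer``, and ``loss_fn`` are
# defined elsewhere; this is not part of the library). The broadcast runs
# once, after the first apply_gradients call, so that lazily created
# optimizer slot variables are synchronized together with the model weights.
import tensorflow as tf

@tf.function
def training_step(x, y, first_batch):
    with tf.GradientTape() as tape:
        loss = loss_fn(model(x, training=True), y)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    if first_batch:
        broadcast_variables(model.variables)
        broadcast_variables(optimizer.variables())
    return loss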
def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    np, rank = current_cluster_size(), current_rank()
    target = get_random_peer(np, rank)
    gradients, variables = list(zip(*grads_and_vars))

    # On the first step, initialize the local variable store that peers request from.
    init_store_op = tf.cond(tf.equal(self._step, 0),
                            lambda: self.init_store(variables), tf.no_op)
    with tf.control_dependencies([init_store_op]):
        other_peer_vars = self._build_request_ops(target, variables)
    save_model_op = self._build_save_op(variables)

    # Average the local model with the model requested from the chosen peer.
    assign_ops = [
        _tf_assign(v, 0.5 * (v + other_v))
        for v, other_v in zip(variables, other_peer_vars)
    ]

    # We need to re-zip gradients and variables as grads_and_vars
    # can only be unzipped once.
    new_grads_and_vars = zip(gradients, variables)
    apply_op = apply_grads_func(new_grads_and_vars, **kwargs)

    with tf.control_dependencies(assign_ops):
        with tf.control_dependencies([apply_op]):
            with tf.control_dependencies([save_model_op]):
                return tf.group(apply_op)
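# Illustrative sketch (assumption: class and attribute names are hypothetical).
# The apply_gradients hook above is typically driven by a wrapper optimizer
# that passes the underlying TensorFlow optimizer's own apply_gradients in as
# apply_grads_func, so the peer averaging is chained before the local update.
class _WrappedOptimizerSketch(object):
    def __init__(self, optimizer, algo):
        self._optimizer = optimizer  # a plain TensorFlow optimizer
        self._algo = algo            # object providing the apply_gradients hook above

    def compute_gradients(self, *args, **kwargs):
        return self._optimizer.compute_gradients(*args, **kwargs)

    def apply_gradients(self, grads_and_vars, **kwargs):
        return self._algo.apply_gradients(self._optimizer.apply_gradients,
                                          grads_and_vars, **kwargs)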
def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    # It is important to apply model averaging every iteration [2].
    gradients, variables = list(zip(*grads_and_vars))
    sum_vars = group_all_reduce(variables)
    avg_vars = [s / self._num_workers for s in sum_vars]

    # TODO: Apply momentum to the averaged model [2]
    assign_ops = [
        _tf_assign(v, avg_v) for v, avg_v in zip(variables, avg_vars)
    ]

    # We need to re-zip gradients and variables as grads_and_vars
    # can only be unzipped once.
    new_grads_and_vars = zip(gradients, variables)

    # We can overlap model averaging and local SGD [2].
    with tf.control_dependencies(assign_ops):
        return apply_grads_func(new_grads_and_vars, **kwargs)
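# Toy check of the averaging arithmetic (no cluster required; values are made
# up for illustration). With two workers, group_all_reduce would return the
# element-wise sum of each variable across peers, and dividing by the worker
# count yields the same averaged model on every worker.
import tensorflow as tf

v_on_worker_0 = tf.constant([1.0, 3.0])
v_on_worker_1 = tf.constant([5.0, 7.0])
num_workers = 2

summed = v_on_worker_0 + v_on_worker_1   # what group_all_reduce computes across peers
averaged = summed / num_workers          # [3.0, 5.0] on both workers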