def _monitor(self, grads, reduced_grads):
    # Only the master node is doing the global monitoring.
    noise_op = global_noise_scale(self._device_batch_size,
                                  self._global_batch_size,
                                  fuse(grads), fuse(reduced_grads))
    print_op = tf.print('Gradient Noise Scale:', noise_op)
    return print_op
def _monitor(self, grads, reduced_grads):
    self._noise_op = global_noise_scale(self._device_batch_size,
                                        self._global_batch_size,
                                        fuse(grads), fuse(reduced_grads))
    print_op = tf.print('Gradient Noise Scale:', self._noise_op)
    with tf.control_dependencies([print_op]):
        return tf.no_op()
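# A minimal sketch (not KungFu's actual implementation) of what an op like
# global_noise_scale is assumed to compute, following the two-batch-size
# estimator from McCandlish et al., "An Empirical Model of Large-Batch
# Training": grad is the per-device gradient (batch size b_small) and
# avg_grad the all-reduced, averaged gradient (batch size b_big).
def _noise_scale_sketch(b_small, b_big, grad, avg_grad):
    g_small = tf.reduce_sum(tf.square(grad))    # |G_small|^2
    g_big = tf.reduce_sum(tf.square(avg_grad))  # |G_big|^2
    # Unbiased estimates of the true gradient norm squared and of the
    # trace of the per-example gradient covariance.
    g_sq = (b_big * g_big - b_small * g_small) / (b_big - b_small)
    trace = (g_small - g_big) / (1.0 / b_small - 1.0 / b_big)
    # The "simple" noise scale B_simple = tr(Sigma) / |G|^2.
    return trace / g_sq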
def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    gradients, variables = list(zip(*grads_and_vars))
    if self._nccl:
        # FIXME: KungFu schedules NCCL operations in the order of the
        # given gradients. This order is sub-optimal compared to the
        # topological sorting order of the dataflow graph. We work
        # around this by fusing all gradients into one tensor. We still
        # need to figure out how to get the optimal topological sorting
        # order from TensorFlow.
        if self._nccl_fusion:
            fused_grad = fuse(gradients)
            summed_fused_gradients = self._group_all_reduce_fn(
                [fused_grad])
            summed_gradients = defuse(summed_fused_gradients[0],
                                      [g.shape for g in gradients])
        else:
            summed_gradients = self._group_all_reduce_fn(gradients)
    else:
        if self._monitor:
            summed_gradients = map_maybe(
                lambda g: monitored_all_reduce(g, []), gradients)
        else:
            summed_gradients = self._group_all_reduce_fn(gradients)
    num_workers = tf.cast(self._num_workers, tf.float32)
    reduced_grads = map_maybe(lambda g: g / num_workers, summed_gradients)
    # We need to re-zip gradients and variables as grads_and_vars
    # can only be unzipped once.
    reduced_grads_and_vars = zip(reduced_grads, variables)
    return apply_grads_func(reduced_grads_and_vars, **kwargs)
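# Hedged sketch of the fuse/defuse helpers as used above (assumed
# semantics; KungFu's own helpers may differ in detail): fuse flattens
# and concatenates a list of tensors into a single buffer so that one
# all-reduce replaces many small ones; defuse splits the buffer back
# into the original shapes.
def fuse_sketch(tensors):
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)

def defuse_sketch(fused, shapes):
    sizes = [s.num_elements() for s in shapes]
    parts = tf.split(fused, sizes)
    return [tf.reshape(p, s) for p, s in zip(parts, shapes)]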
def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    gradients, variables = list(zip(*grads_and_vars))
    if self._reshape_strategy:
        reshape_strategy(1)
    else:
        reshape_strategy(0)
    if self._nccl:
        # FIXME: KungFu schedules NCCL operations in the order of the
        # given gradients. This order is sub-optimal compared to the
        # topological sorting order of the dataflow graph. We work
        # around this by fusing all gradients into one tensor. We still
        # need to figure out how to get the optimal topological sorting
        # order from TensorFlow.
        if self._nccl_fusion:
            fused_grad = fuse(gradients)
            summed_fused_gradients = group_nccl_all_reduce([fused_grad])
            summed_gradients = defuse(summed_fused_gradients[0],
                                      [g.shape for g in gradients])
        else:
            summed_gradients = group_nccl_all_reduce(gradients)
    else:
        summed_gradients = group_all_reduce(gradients)
    reduced_grads = map_maybe(lambda g: g / self._num_workers,
                              summed_gradients)
    # We need to re-zip gradients and variables as grads_and_vars
    # can only be unzipped once.
    reduced_grads_and_vars = zip(reduced_grads, variables)
    return apply_grads_func(reduced_grads_and_vars, **kwargs)
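# Hedged sketch of the map_maybe helper used above (assumed semantics,
# not necessarily KungFu's exact code): apply fn to each entry while
# passing None through, since variables without gradients show up as
# None entries in grads_and_vars.
def map_maybe_sketch(fn, tensors):
    return [fn(t) if t is not None else None for t in tensors]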
def _monitor(self, grads, reduced_grads):
    # Only the master node is doing the global monitoring.
    noise_op = global_noise_scale(self._device_batch_size,
                                  self._global_batch_size,
                                  fuse(grads), fuse(reduced_grads),
                                  alpha=self._alpha)
    if self._verbose:
        print_op = tf.print('Gradient Noise Scale:', noise_op)
        return print_op
    else:
        gns = create_global_variable(GraphKeys.GRADIENT_NOISE_SCALE,
                                     shape=[],
                                     dtype=tf.float32)
        with tf.control_dependencies([gns.assign(noise_op)]):
            monitor_op = tf.no_op()
        return monitor_op
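# The non-verbose branch stores the latest noise scale in a global
# variable instead of printing it. Assuming create_global_variable also
# registers the variable under the GraphKeys.GRADIENT_NOISE_SCALE
# collection (an assumption, not confirmed by this file), user code
# could read it back after a training step roughly like this:
#
#     gns = tf.get_collection(GraphKeys.GRADIENT_NOISE_SCALE)[0]
#     noise = sess.run(gns)  # run after monitor_op has executed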
def _build_request_and_save_ops(self, target, variables):
    var_fused = fuse(variables)
    save_model_op = save_variable(var_fused)
    other_peer_var_fused = request_variable_with_template(
        target, var_fused)
    other_peer_vars = defuse(other_peer_var_fused,
                             [v.shape for v in variables])
    self._save_model_op = save_model_op  # save for _get_initializer_op
    return other_peer_vars, save_model_op
def _build_request_ops(self, target, variables):
    if self._fuse_requests:
        var_fused = fuse(variables)
        other_peer_var_fused = request_variable(
            target,
            version=None,
            name=self._fused_model_name,
            shape=var_fused.shape,
            dtype=var_fused.dtype)
        return defuse(other_peer_var_fused,
                      [v.shape for v in variables])
    else:
        return [
            request_variable_with_template(target, v) for v in variables
        ]
def _build_save_op(self, variables):
    if self._fuse_requests:
        var_fused = fuse(variables)
        return save_variable(var_fused, name=self._fused_model_name)
    else:
        return tf.group([save_variable(v) for v in variables])
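# Design note: with _fuse_requests enabled, each peer saves and serves a
# single fused buffer under one name, so requesting a peer's model costs
# one network round trip instead of one per variable; the trade-off is
# copying every variable into the fused buffer even when few change. The
# save and request paths must mirror each other exactly: same variable
# order, same fused name (self._fused_model_name), and defuse with the
# same shape list, which is why _build_request_ops and _build_save_op
# branch on the same _fuse_requests flag.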