def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    np, rank = current_cluster_size(), current_rank()
    target = get_random_peer(np, rank)
    gradients, variables = list(zip(*grads_and_vars))

    # On the first step, initialise the local model store before requesting a peer's model.
    init_store_op = tf.cond(tf.equal(self._step, 0),
                            lambda: self.init_store(variables), tf.no_op)
    with tf.control_dependencies([init_store_op]):
        other_peer_vars = self._build_request_ops(target, variables)

    save_model_op = self._build_save_op(variables)

    # Average the local variables with the randomly selected peer's variables.
    assign_ops = [
        _tf_assign(v, 0.5 * (v + other_v))
        for v, other_v in zip(variables, other_peer_vars)
    ]

    # We need to re-zip gradients and variables as grads_and_vars can only be unzipped once.
    new_grads_and_vars = zip(gradients, variables)
    apply_op = apply_grads_func(new_grads_and_vars, **kwargs)

    # Order the ops: average with the peer, apply the gradients, then save the model.
    with tf.control_dependencies(assign_ops):
        with tf.control_dependencies([apply_op]):
            with tf.control_dependencies([save_model_op]):
                return tf.group(apply_op)

def __init__(self, optimizer, interval, name=None, use_locking=False):
    super(AdaptiveSGDOptimizer, self).__init__(optimizer,
                                               name,
                                               use_locking=use_locking)
    self._num_workers = current_cluster_size()
    self._rank = current_rank()
    self._step = tf.Variable(0, trainable=False, dtype=tf.int32)
    self._interval = interval

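# Usage sketch (an assumption, not shown in the snippets above): if AdaptiveSGDOptimizer
# ultimately exposes the standard tf.train.Optimizer interface of the wrapped base
# optimizer, a TF1-style training graph could drive it as below. The toy variable and
# loss are hypothetical; `interval` is presumably the number of local steps between
# model synchronisations.
import tensorflow as tf

w = tf.compat.v1.get_variable('w', shape=[], initializer=tf.zeros_initializer())
loss = tf.square(w - 1.0)  # toy scalar loss, for illustration only

base_opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
opt = AdaptiveSGDOptimizer(base_opt, interval=10)
train_op = opt.minimize(loss)
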
def log_final_result(value, error):
    from kungfu.tensorflow.ops import current_rank, current_cluster_size
    if current_rank() != 0:
        return
    attrs = {
        'np': current_cluster_size(),
        'strategy': os.getenv('KUNGFU_ALLREDUCE_STRATEGY'),
        'bs': args.batch_size,
        'model': args.model,
        'kf-opt': args.kf_optimizer,
    }
    log_detailed_result(value, error, attrs)

def log_final_result(value, error):
    if current_rank() != 0:
        return
    attrs = {
        'framework': 'kungfu',
        'np': current_cluster_size(),
        'strategy': os.getenv('KUNGFU_ALLREDUCE_STRATEGY'),
        'bs': args.batch_size,
        'model': args.model,
        'xla': args.xla,
        'kf-opt': args.kf_optimizer,
        'fuse': args.fuse,
        'nvlink': os.getenv('KUNGFU_ALLOW_NVLINK'),
        'data': 'disk' if args.data_dir else 'memory',
    }
    log_detailed_result(value, error, attrs)

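# `log_detailed_result` is called above but not defined in these snippets. A minimal
# hypothetical stand-in (an assumption, not the benchmark's real helper) could emit one
# machine-readable line that combines the measurement with its attributes:
import json

def log_detailed_result(value, error, attrs):
    # A single line per run makes it easy to grep results out of multi-worker logs.
    print('RESULT: %f +-%f %s' % (value, error, json.dumps(attrs, sort_keys=True)))
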
def _rank(method):
    if method == 'HOROVOD':
        import horovod.tensorflow as hvd
        return hvd.rank()
    else:
        return current_rank()

def log(s, nl=True):
    from kungfu.tensorflow.ops import current_rank
    if current_rank() != 0:
        return
    print(s, end='\n' if nl else '')

def show_info_example():
    rank = current_rank()
    np = current_cluster_size()
    print('rank=%d, np=%d' % (rank, np))

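# To see different rank values, a script like this is typically started with the
# kungfu-run launcher rather than plain python, e.g. with 4 local workers (shown as a
# comment because the exact flags depend on the installed KungFu version; the script
# name is a placeholder):
#   kungfu-run -np 4 python3 show_info.py
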
def fake_get_shard_info(use_kungfu):
    if use_kungfu:
        from kungfu.tensorflow.ops import current_cluster_size, current_rank
        return current_rank(), current_cluster_size()
    return 0, 1

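# Usage sketch (an assumption, not part of the snippet above): the (rank, size) pair
# returned by fake_get_shard_info is typically used to shard an input pipeline so that
# each worker reads a disjoint slice of the data.
import tensorflow as tf

shard_index, num_shards = fake_get_shard_info(use_kungfu=False)
dataset = tf.data.Dataset.range(1000)
# Worker i keeps every num_shards-th record, starting at offset i.
dataset = dataset.shard(num_shards, shard_index)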