예제 #1
0
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        """Apply gradients, then average this worker's variables with one
        randomly chosen peer's variables (pair-averaging style update).

        Args:
            apply_grads_func: the wrapped optimizer's ``apply_gradients``
                (or equivalent) taking ``(grads_and_vars, **kwargs)``.
            grads_and_vars: iterable of ``(gradient, variable)`` pairs.
            **kwargs: forwarded unchanged to ``apply_grads_func``.

        Returns:
            A grouped op that runs the gradient application, the model
            save, and the peer-averaging assignments.
        """
        # NOTE: `np` here is the cluster size (number of peers), not numpy.
        np, rank = current_cluster_size(), current_rank()
        # Pick the peer whose variables we will request this step.
        target = get_random_peer(np, rank)
        gradients, variables = list(zip(*grads_and_vars))

        # On the very first step (self._step == 0), initialize the variable
        # store before requesting anything from peers; otherwise no-op.
        init_store_op = tf.cond(tf.equal(self._step, 0),
                                lambda: self.init_store(variables), tf.no_op)
        with tf.control_dependencies([init_store_op]):
            # Fetch the target peer's copy of every variable; only runs
            # after the store has been initialized.
            other_peer_vars = self._build_request_ops(target, variables)

        # Publish our own variables so peers can request them.
        save_model_op = self._build_save_op(variables)

        # Average each local variable with the requested peer value.
        assign_ops = [
            _tf_assign(v, 0.5 * (v + other_v))
            for v, other_v in zip(variables, other_peer_vars)
        ]

        # We need to re-zip gradients and variables as grads_and_vars can be only unzipped once.
        new_grads_and_vars = zip(gradients, variables)
        apply_op = apply_grads_func(new_grads_and_vars, **kwargs)

        # The returned group runs after the averaging assignments, the
        # gradient application, and the model save have all executed.
        # NOTE(review): `apply_op` appears both as a control dependency and
        # as the grouped op — presumably intentional to force ordering;
        # confirm against the KungFu pair-averaging design before changing.
        with tf.control_dependencies(assign_ops):
            with tf.control_dependencies([apply_op]):
                with tf.control_dependencies([save_model_op]):
                    return tf.group(apply_op)
예제 #2
0
 def __init__(self, optimizer, interval, name=None, use_locking=False):
     """Wrap *optimizer*, recording cluster topology and a step counter.

     Args:
         optimizer: the underlying TensorFlow optimizer to wrap.
         interval: number of steps between synchronization events.
         name: optional name for the wrapper; defaults per the base class.
         use_locking: forwarded to the base optimizer wrapper.
     """
     super(AdaptiveSGDOptimizer, self).__init__(optimizer, name, use_locking=use_locking)
     self._interval = interval
     # Cluster topology, captured once at construction time.
     self._rank = current_rank()
     self._num_workers = current_cluster_size()
     # Non-trainable global step counter used to schedule synchronization.
     self._step = tf.Variable(0, trainable=False, dtype=tf.int32)
예제 #3
0
def log_final_result(value, error):
    """Report the final benchmark result with run attributes, rank 0 only."""
    from kungfu.tensorflow.ops import current_rank, current_cluster_size
    # All workers call this; only the chief (rank 0) actually logs.
    if current_rank() != 0:
        return
    log_detailed_result(
        value, error, {
            'np': current_cluster_size(),
            'strategy': os.getenv('KUNGFU_ALLREDUCE_STRATEGY'),
            'bs': args.batch_size,
            'model': args.model,
            'kf-opt': args.kf_optimizer,
        })
예제 #4
0
def log_final_result(value, error):
    """Report the final benchmark result with run attributes, rank 0 only."""
    # All workers call this; only the chief (rank 0) actually logs.
    if current_rank() != 0:
        return
    attrs = {}
    attrs['framework'] = 'kungfu'
    attrs['np'] = current_cluster_size()
    attrs['strategy'] = os.getenv('KUNGFU_ALLREDUCE_STRATEGY')
    attrs['bs'] = args.batch_size
    attrs['model'] = args.model
    attrs['xla'] = args.xla
    attrs['kf-opt'] = args.kf_optimizer
    attrs['fuse'] = args.fuse
    attrs['nvlink'] = os.getenv('KUNGFU_ALLOW_NVLINK')
    # Record whether input came from disk or was preloaded into memory.
    if args.data_dir:
        attrs['data'] = 'disk'
    else:
        attrs['data'] = 'memory'
    log_detailed_result(value, error, attrs)
예제 #5
0
File: __main__.py  Project: zuston/KungFu
def _rank(method):
    """Return this worker's rank, via Horovod when *method* is 'HOROVOD'."""
    if method != 'HOROVOD':
        return current_rank()
    # Import lazily so Horovod is only required when actually selected.
    import horovod.tensorflow as hvd
    return hvd.rank()
예제 #6
0
def log(s, nl=True):
    """Print *s* on rank 0 only; omit the trailing newline if *nl* is False."""
    from kungfu.tensorflow.ops import current_rank
    if current_rank() == 0:
        terminator = '\n' if nl else ''
        print(s, end=terminator)
예제 #7
0
def show_info_example():
    """Print this worker's rank and the cluster size."""
    my_rank = current_rank()
    cluster_size = current_cluster_size()
    print('rank=%d, np=%d' % (my_rank, cluster_size))
예제 #8
0
def fake_get_shard_info(use_kungfu):
    """Return ``(rank, cluster_size)``; the single-process pair ``(0, 1)``
    when KungFu is disabled."""
    if not use_kungfu:
        return 0, 1
    # Import lazily so KungFu is only required when actually enabled.
    from kungfu.tensorflow.ops import current_cluster_size, current_rank
    return current_rank(), current_cluster_size()