Example #1
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        gradients, variables = list(zip(*grads_and_vars))

        if self._nccl:
            # FIXME: We have a limitation that KungFu schedules NCCL operations
            # in the order of the given gradients. This order is sub-optimal
            # compared to the topological sorting order of the dataflow. We get
            # around this issue by fusing all gradients. We still need to figure
            # out how to obtain the optimal topological sorting order from TensorFlow.
            if self._nccl_fusion:
                fused_grad = fuse(gradients)
                summed_fused_gradients = group_nccl_all_reduce([fused_grad])
                summed_gradients = defuse(summed_fused_gradients[0],
                                          [g.shape for g in gradients])
            else:
                summed_gradients = group_nccl_all_reduce(gradients)
        else:
            summed_gradients = group_all_reduce(gradients)

        reduced_grads = map_maybe(lambda g: g / self._num_workers,
                                  summed_gradients)

        # We need to re-zip gradients and variables, as grads_and_vars can only be unzipped once.
        reduced_grads_and_vars = zip(reduced_grads, variables)

        return apply_grads_func(reduced_grads_and_vars, **kwargs)
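The fusion workaround above depends on fuse and defuse helpers. They are not shown here; a minimal sketch of what such helpers could look like, assuming they simply flatten-and-concatenate the gradients and later split-and-reshape them back (a hypothetical re-implementation, not necessarily KungFu's actual code):

import tensorflow as tf


def fuse(tensors):
    # Flatten every tensor and concatenate them into a single 1-D buffer,
    # so one NCCL all-reduce can cover all gradients at once.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)


def defuse(fused, shapes):
    # Split the flat buffer back into per-gradient pieces and restore the
    # original shapes (assumes all shapes are fully defined).
    sizes = [s.num_elements() for s in shapes]
    flat_pieces = tf.split(fused, sizes)
    return [tf.reshape(p, s.as_list()) for p, s in zip(flat_pieces, shapes)]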
def test_group_all_reduce():
    sizes = [i % 5 for i in range(10)]
    xs = [tf.Variable(tf.ones([n], tf.int32)) if n else None for n in sizes]
    ys = group_all_reduce(xs)
    op = [y for y in ys if y is not None]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(op)
    def _ssgd(self, apply_grads_func, gradients, variables, **kwargs):
        sum_grads = group_all_reduce(gradients)
        avg_grads = map_maybe(lambda g: g / self._num_workers, sum_grads)

        # We need to re-zip gradients and variables, as grads_and_vars can only be unzipped once.
        grads_and_vars = zip(avg_grads, variables)

        return apply_grads_func(grads_and_vars, **kwargs)
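The re-zip comments above refer to a Python 3 detail: zip returns a one-shot iterator, so grads_and_vars cannot be traversed twice. A small plain-Python illustration of why the gradients and variables are unzipped into tuples first and re-zipped at the end:

grads_and_vars = zip(['g0', 'g1'], ['v0', 'v1'])  # one-shot iterator in Python 3

grads, variables = zip(*grads_and_vars)  # unpacking consumes the iterator
print(list(grads_and_vars))              # [] -- already exhausted

# Hence the pattern above: unzip once into tuples, then build a fresh
# zip object before handing it to apply_grads_func.
fresh_grads_and_vars = zip(grads, variables)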
Example #4
def build_fake_train_op(use_nccl):
    xs = [tf.Variable(tf.ones((2, 5)))]
    if use_nccl:
        from kungfu.tensorflow.ops import group_nccl_all_reduce
        ys = group_nccl_all_reduce(xs)
    else:
        from kungfu.tensorflow.ops import group_all_reduce
        ys = group_all_reduce(xs)
    return ys
def gen_fake_train_op(sizes):
    grads = []
    for size in sizes:
        grads.append(tf.Variable(tf.ones(shape=(size, ), dtype=tf.float32)))
    new_grads = group_all_reduce(grads)
    ops = []
    for g, new_g in zip(grads, new_grads):
        ops.append(tf.assign(g, new_g))
    return tf.group(ops)
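A possible way to drive the op returned by gen_fake_train_op in a TF1 session, assuming the definition above (and its group_all_reduce import) is in scope; the sizes below are arbitrary:

import tensorflow as tf

train_op = gen_fake_train_op([1024, 4096, 16384])
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(10):
        sess.run(train_op)  # all-reduce the fake gradients and assign them back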
Example #6
    def _sync_ma_sgd(self, grads_and_vars, **kwargs):
        _, variables = list(zip(*grads_and_vars))
        sum_vars = group_all_reduce(variables)
        avg_vars = [g / self._num_workers for g in sum_vars]
        assign_ops = [
            tf.assign(v, avg_v) for v, avg_v in zip(variables, avg_vars)
        ]

        with tf.control_dependencies(assign_ops):
            return self._optimizer.apply_gradients(grads_and_vars, **kwargs)
Example #7
    def _monitor(self, grads, reduced_grads):
        square_grads = [tf.square(g) for g in grads]
        summed_square_grads = group_all_reduce(square_grads)
        reduced_square_grads = map_maybe(lambda g: g / self._num_workers,
                                         summed_square_grads)
        grad_variances = [
            square_grad - tf.square(grad)
            for square_grad, grad in zip(reduced_square_grads, reduced_grads)
        ]
        variances = [
            tf.norm(grad_variance) for grad_variance in grad_variances
        ]
        summed_variance = tf.reduce_sum(variances)
        return tf.print('Variance:', summed_variance)
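The monitor above estimates the across-worker variance of each gradient via the identity Var[g] = E[g^2] - (E[g])^2: reduced_square_grads is the worker-average of the squared gradients, and reduced_grads is the worker-average of the gradients themselves. A tiny NumPy check of that identity, outside the TensorFlow graph:

import numpy as np

# Pretend three workers produced these values for one scalar gradient.
worker_grads = np.array([0.9, 1.0, 1.4])

mean_of_squares = np.mean(worker_grads ** 2)  # role of reduced_square_grads
square_of_mean = np.mean(worker_grads) ** 2   # role of tf.square(reduced_grads)

variance = mean_of_squares - square_of_mean
assert np.isclose(variance, np.var(worker_grads))  # population variance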
Example #8
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        grads, variables = list(zip(*grads_and_vars))

        # Synchronization logic
        summed_grads = group_all_reduce(grads)
        reduced_grads = map_maybe(lambda g: g / self._num_workers,
                                  summed_grads)

        # Monitoring logic
        monitor_grads_op = tf.cond(
            tf.equal(tf.mod(self._step, self._interval), 0),
            lambda: self._monitor(grads, reduced_grads), lambda: tf.no_op())

        with tf.control_dependencies([monitor_grads_op]):
            return apply_grads_func(zip(reduced_grads, variables), **kwargs)
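self._step and self._interval are not defined in this snippet. One plausible setup, assuming _step is a non-trainable tf.Variable that the wrapper increments once per apply_gradients call (hypothetical sketch, names assumed):

import tensorflow as tf

interval = 10                                            # monitor every 10 steps
step = tf.Variable(0, dtype=tf.int32, trainable=False)   # per-graph step counter

# The counter would be advanced alongside the gradient application, e.g.:
increment_step = tf.assign_add(step, 1)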
Example #9
    def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
        # It is important to apply model averaging every iteration [2]
        gradients, variables = list(zip(*grads_and_vars))
        sum_vars = group_all_reduce(variables)
        avg_vars = [g / self._num_workers for g in sum_vars]

        # TODO: Apply momentum to the averaged model [2]
        assign_ops = [
            _tf_assign(v, avg_v) for v, avg_v in zip(variables, avg_vars)
        ]

        # We need to re-zip gradients and variables, as grads_and_vars can only be unzipped once.
        new_grads_and_vars = zip(gradients, variables)

        # We can overlap model averaging and local SGD [2].
        with tf.control_dependencies(assign_ops):
            return apply_grads_func(new_grads_and_vars, **kwargs)
    def _monitor(self, grads, reduced_grads):
        square_grads = [tf.square(g) for g in grads]
        summed_square_grads = group_all_reduce(square_grads)
        reduced_square_grads = [
            g / self._num_workers for g in summed_square_grads
        ]
        grad_variances = [
            square_grad - tf.square(grad)
            for square_grad, grad in zip(reduced_square_grads, reduced_grads)
        ]
        self._variances = [
            tf.norm(grad_variance) for grad_variance in grad_variances
        ]
        self._summed_variance = tf.reduce_sum(self._variances)
        print_op = tf.print('Sum of gradient variance:', self._summed_variance)

        with tf.control_dependencies([print_op]):
            return tf.no_op()
Example #11
def all_reduce_benchmark(sizes, dtype=tf.float32):
    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    np = current_cluster_size()
    multiplier = 4 * (np - 1)
    print('all reduce total size: %s among %d peers' %
          (show_size(tot_size), np))
    ys = group_all_reduce(xs)
    init = tf.global_variables_initializer()
    warmup_steps = 5
    bench_steps = 10
    with tf.Session() as sess:
        sess.run(init)
        for step in range(warmup_steps):
            sess.run(ys)
        for step in range(bench_steps):
            t0 = time.time()
            sess.run(ys)
            d = time.time() - t0
            print('step %d, took %.2fs, equivalent data rate: %s' %
                  (step, d, show_rate(tot_size * multiplier, d)))
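all_reduce_benchmark uses current_cluster_size from KungFu plus three small helpers (_tensor_size, show_size, show_rate) that are not shown above. A minimal sketch of what the helpers might look like (assumed implementations, not necessarily the originals):

import tensorflow as tf


def _tensor_size(t):
    # Size of a dense tensor in bytes: element count times bytes per element.
    return t.shape.num_elements() * t.dtype.size


def show_size(n):
    # Human-readable byte count, e.g. 1048576 -> '1.00MiB'.
    for unit in ['B', 'KiB', 'MiB', 'GiB']:
        if n < 1024 or unit == 'GiB':
            return '%.2f%s' % (n, unit)
        n /= 1024.0


def show_rate(n_bytes, duration):
    # Data rate over a duration in seconds, formatted with show_size.
    return show_size(n_bytes / duration) + '/s'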