Пример #1
0
def test_group_all_reduce():
    """Smoke-test ``group_all_reduce`` on a list mixing variables and None.

    Ten inputs are built with lengths cycling through 0..4. A zero length
    is represented by ``None`` rather than by an empty variable, so the
    call is exercised with ``None`` placeholders in its input list. Only
    the non-None results are fetched from the session.
    """
    inputs = []
    for index in range(10):
        length = index % 5
        if length:
            inputs.append(tf.Variable(tf.ones([length], tf.int32)))
        else:
            inputs.append(None)

    outputs = group_all_reduce(inputs)
    fetches = [out for out in outputs if out is not None]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(fetches)
Пример #2
0
def gen_fake_train_op(sizes):
    """Build one op that writes all-reduced values back into fake gradients.

    Args:
        sizes: iterable of lengths; one float32 variable of ones is created
            per entry to stand in for a gradient.

    Returns:
        The ``tf.group`` of the assign ops that store each result of
        ``group_all_reduce`` into its corresponding variable.
    """
    grads = [
        tf.Variable(tf.ones(shape=(size, ), dtype=tf.float32))
        for size in sizes
    ]
    reduced = group_all_reduce(grads)
    assignments = [
        tf.assign(grad, new_grad) for grad, new_grad in zip(grads, reduced)
    ]
    return tf.group(assignments)
Пример #3
0
    def _sync_ma_sgd(self, grads_and_vars, **kwargs):
        """Overwrite variables with their cross-worker average, then step.

        Every variable is passed through ``group_all_reduce``, divided by
        ``self._num_workers`` and assigned back (assuming the all-reduce
        yields a sum over workers, this is the mean -- TODO confirm). The
        wrapped optimizer's ``apply_gradients`` is created under a control
        dependency on those assignments.

        Args:
            grads_and_vars: (gradient, variable) pairs.
            **kwargs: forwarded unchanged to the wrapped optimizer.

        Returns:
            Whatever ``self._optimizer.apply_gradients`` returns.
        """
        _, variables = zip(*grads_and_vars)

        averaged = []
        for total in group_all_reduce(variables):
            averaged.append(total / self._num_workers)

        assign_ops = []
        for var, mean in zip(variables, averaged):
            assign_ops.append(tf.assign(var, mean))

        with tf.control_dependencies(assign_ops):
            return self._optimizer.apply_gradients(grads_and_vars, **kwargs)
Пример #4
0
    def apply_gradients(self, grads_and_vars, **kwargs):
        """Average the model across workers, then apply the local gradients.

        Args:
            grads_and_vars: (gradient, variable) pairs.
            **kwargs: forwarded unchanged to the wrapped optimizer.

        Returns:
            Whatever ``self._optimizer.apply_gradients`` returns; it is
            created under a control dependency on the averaging assigns.
        """
        # Model averaging has to happen on every iteration [2].
        _, variables = zip(*grads_and_vars)

        averaged = []
        for total in group_all_reduce(variables):
            averaged.append(total / self._num_workers)

        # TODO: Apply momentum to the averaged model [2]
        assign_ops = []
        for var, mean in zip(variables, averaged):
            assign_ops.append(tf.assign(var, mean))

        # Model averaging and local SGD can be overlapped [2].
        with tf.control_dependencies(assign_ops):
            return self._optimizer.apply_gradients(grads_and_vars, **kwargs)
Пример #5
0
    def apply_gradients(self, grads_and_vars, **kwargs):
        """Apply worker-averaged gradients, monitoring them periodically.

        Gradients go through ``group_all_reduce`` and are divided by
        ``self._num_workers``. Whenever ``self._step`` is a multiple of
        ``self._interval`` the ``self._monitor`` branch runs; otherwise a
        no-op is used. The wrapped optimizer's update depends on that op.

        Args:
            grads_and_vars: (gradient, variable) pairs.
            **kwargs: forwarded unchanged to the wrapped optimizer.

        Returns:
            Whatever ``self._optimizer.apply_gradients`` returns.
        """
        grads, variables = zip(*grads_and_vars)

        # Synchronization: all-reduce, then scale by the worker count.
        reduced_grads = [
            total / self._num_workers for total in group_all_reduce(grads)
        ]

        # Monitoring: only on steps that are a multiple of the interval.
        def run_monitor():
            return self._monitor(grads, reduced_grads)

        def skip_monitor():
            return tf.no_op()

        on_monitor_step = tf.equal(tf.mod(self._step, self._interval), 0)
        monitor_grads_op = tf.cond(on_monitor_step, run_monitor, skip_monitor)

        with tf.control_dependencies([monitor_grads_op]):
            averaged_pairs = zip(reduced_grads, variables)
            return self._optimizer.apply_gradients(averaged_pairs, **kwargs)
Пример #6
0
    def _monitor(self, grads, reduced_grads):
        """Print the summed norm of a per-tensor gradient variance estimate.

        For each gradient the squared values are all-reduced and divided by
        ``self._num_workers``; the square of the matching entry of
        ``reduced_grads`` is subtracted from that. The norms of the
        differences are stored in ``self._variances`` and their sum in
        ``self._summed_variance``.

        Args:
            grads: local gradient tensors.
            reduced_grads: the corresponding gradients after reduction and
                scaling by the caller.

        Returns:
            A no-op that depends on the op printing the summed variance.
        """
        squared = [tf.square(grad) for grad in grads]

        mean_squares = []
        for total in group_all_reduce(squared):
            mean_squares.append(total / self._num_workers)

        # Mean of squares minus square of the mean, tensor by tensor.
        differences = []
        for mean_square, mean_grad in zip(mean_squares, reduced_grads):
            differences.append(mean_square - tf.square(mean_grad))

        self._variances = [tf.norm(diff) for diff in differences]
        self._summed_variance = tf.reduce_sum(self._variances)

        report = tf.print('Sum of gradient variance:', self._summed_variance)
        with tf.control_dependencies([report]):
            return tf.no_op()
Пример #7
0
    def apply_gradients(self, grads_and_vars, **kwargs):
        """All-reduce the gradients, scale them, and apply them.

        The reduction path is chosen from two flags on the instance:
        without ``self._nccl`` it uses ``group_all_reduce``; with NCCL and
        ``self._nccl_fusion`` the gradients are fused into one tensor,
        reduced, and split back to their original shapes; with NCCL alone
        each gradient goes through ``group_nccl_all_reduce``.

        Args:
            grads_and_vars: (gradient, variable) pairs.
            **kwargs: forwarded unchanged to the wrapped optimizer.

        Returns:
            Whatever ``self._optimizer.apply_gradients`` returns.
        """
        gradients, variables = zip(*grads_and_vars)

        if not self._nccl:
            summed = group_all_reduce(gradients)
        elif self._nccl_fusion:
            fused = fuse(gradients)
            reduced_fused = group_nccl_all_reduce([fused])
            original_shapes = [grad.shape for grad in gradients]
            summed = defuse(reduced_fused[0], original_shapes)
        else:
            summed = group_nccl_all_reduce(gradients)

        scaled = [total / self._num_workers for total in summed]
        return self._optimizer.apply_gradients(zip(scaled, variables),
                                               **kwargs)
Пример #8
0
def all_reduce_benchmark(sizes, dtype=tf.float32, warmup_steps=5,
                         bench_steps=10):
    """Time ``group_all_reduce`` over variables of the given sizes.

    One variable of ones is created per entry of ``sizes``. The grouped
    all-reduce is run ``warmup_steps`` times untimed and then
    ``bench_steps`` times with wall-clock timing, printing one line per
    timed step.

    Args:
        sizes: iterable of 1-D lengths, one variable per entry.
        dtype: element type of the variables.
        warmup_steps: number of untimed runs before measuring.
        bench_steps: number of timed runs.
    """
    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    # Named num_peers rather than np so it cannot be mistaken for numpy.
    num_peers = current_cluster_size()
    # NOTE(review): presumably the factor that turns payload size into the
    # amount of data moved per all-reduce -- confirm against the algorithm.
    multiplier = 4 * (num_peers - 1)
    print('all reduce total size: %s among %d peers' %
          (show_size(tot_size), num_peers))
    ys = group_all_reduce(xs)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for _ in range(warmup_steps):
            sess.run(ys)
        for step in range(bench_steps):
            t0 = time.time()
            sess.run(ys)
            d = time.time() - t0
            print('step %d, took %.2fs, equivalent data rate: %s' %
                  (step, d, show_rate(tot_size * multiplier, d)))