Example #1
def _buildShuffle(self, num_workers, num_gpus, num_shards):
    # Use local CPU for all shuffle shards
    gather_devices = [
        "/replica:0/task:0/device:CPU:0" for _ in range(num_shards)
    ]
    return lambda x, un_op: ar.build_shuffle_all_reduce(
        x, gather_devices, math_ops.add_n, un_op)
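A minimal usage sketch for the builder above (the input tensor list and the scaling unary op are assumptions, not part of the original test): the returned callable takes one tensor per participating device plus an optional unary op that is applied to each reduced output.

build_f = self._buildShuffle(num_workers=1, num_gpus=2, num_shards=2)
# input_tensors: hypothetical list with one tensor per device, built by the test harness.
output_tensors = build_f(input_tensors, lambda t: math_ops.multiply(t, 0.5))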
Example #2
def sum_grad_and_var_all_reduce(single_session,
                                grad_and_vars,
                                num_workers,
                                alg,
                                gpu_indices,
                                aux_devices=None,
                                num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors."""
    scaled_grads = [g for g, _ in grad_and_vars]
    if alg == 'collective':
        assert not single_session
        summed_grads = build_collective_reduce(scaled_grads, num_workers,
                                               num_shards, 'Add', 'Id')
    else:
        with tf.name_scope('allreduce'):
            # Note that each grad_and_vars looks like the following:
            #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
            if alg == 'nccl':
                summed_grads = all_reduce.build_nccl_all_reduce(
                    scaled_grads, tf.add)
            elif alg == 'xring':
                summed_grads = all_reduce.build_ring_all_reduce(
                    scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
            elif alg == 'nccl/xring':
                summed_grads = all_reduce.build_nccl_then_ring(
                    scaled_grads, num_shards, tf.add)
            elif alg == 'nccl/rechd':
                summed_grads = all_reduce.build_nccl_then_recursive_hd(
                    scaled_grads, tf.add)
            elif alg == 'nccl/pscpu':
                summed_grads = all_reduce.build_nccl_then_shuffle(
                    scaled_grads, aux_devices, tf.add, tf.add_n)
            elif alg == 'pscpu/pscpu':
                summed_grads = all_reduce.build_shuffle_then_shuffle(
                    scaled_grads,
                    aux_devices,
                    # TODO(tucker): devise a way of better specifying the device set
                    # for the second level.
                    [aux_devices[0]],
                    tf.add_n)
            elif alg in ['pscpu', 'psgpu']:
                summed_grads = all_reduce.build_shuffle_all_reduce(
                    scaled_grads, aux_devices, tf.add_n)
            else:
                raise ValueError('unsupported all_reduce alg: ', alg)

    result = []
    for (_, v), g in zip(grad_and_vars, summed_grads):
        result.append([g, v])
    return result
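A hedged call sketch for this variant (grad_gpu0, var_gpu0, grad_gpu1 and var_gpu1 are hypothetical placeholders for gradient tensors and variables already placed on two local GPUs): with alg='nccl' the per-GPU gradients are summed with NCCL and paired back with their variables.

summed = sum_grad_and_var_all_reduce(
    single_session=True,
    grad_and_vars=[(grad_gpu0, var_gpu0), (grad_gpu1, var_gpu1)],
    num_workers=1,
    alg='nccl',
    gpu_indices=[0, 1])
# summed is a list of [summed_gradient, variable] pairs, one per input pair.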
Example #3
def sum_grad_and_var_all_reduce(grad_and_vars,
                                num_workers,
                                alg,
                                gpu_indices,
                                aux_devices=None,
                                num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors."""
    with tf.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        scaled_grads = [g for g, _ in grad_and_vars]
        if alg == 'nccl':
            from tensorflow.python.ops import nccl_ops
            summed_grads = nccl_ops.all_sum(scaled_grads)
        elif alg == 'simple':
            summed_grads = build_reduce_sum(scaled_grads)
        elif alg == 'trivial':
            summed_grads = build_trivial_sum(scaled_grads)
        elif alg == 'xring':
            summed_grads = all_reduce.build_ring_all_reduce(
                scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
        elif alg == 'nccl/xring':
            summed_grads = all_reduce.build_nccl_then_ring(
                scaled_grads, num_shards, tf.add)
        elif alg == 'nccl/rechd':
            summed_grads = all_reduce.build_nccl_then_recursive_hd(
                scaled_grads, tf.add)
        elif alg == 'nccl/pscpu':
            summed_grads = all_reduce.build_nccl_then_shuffle(
                scaled_grads, aux_devices, tf.add, tf.add_n)
        elif alg == 'pscpu/pscpu':
            summed_grads = all_reduce.build_shuffle_then_shuffle(
                scaled_grads,
                aux_devices,
                # TODO(tucker): devise a way of better specifying the device
                # for the second level.
                [aux_devices[0]],
                tf.add_n)
        elif alg in ['pscpu', 'psgpu']:
            summed_grads = all_reduce.build_shuffle_all_reduce(
                scaled_grads, aux_devices, tf.add_n)
        else:
            raise ValueError('unsupported all_reduce alg: ', alg)

        result = []
        for (_, v), g in zip(grad_and_vars, summed_grads):
            result.append([g, v])
        return result
Example #4
def sum_grad_and_var_all_reduce(grad_and_vars,
                                num_workers,
                                alg,
                                gpu_indices,
                                aux_devices=None,
                                num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors."""
    with tf.name_scope("allreduce"):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        scaled_grads = [g for g, _ in grad_and_vars]
        if alg == "nccl":
            from tensorflow.python.ops import nccl_ops
            summed_grads = nccl_ops.all_sum(scaled_grads)
        elif alg == "simple":
            summed_grads = build_reduce_sum(scaled_grads)
        elif alg == "trivial":
            summed_grads = build_trivial_sum(scaled_grads)
        elif alg == "xring":
            summed_grads = all_reduce.build_ring_all_reduce(
                scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
        elif alg == "nccl/xring":
            summed_grads = all_reduce.build_nccl_then_ring(
                scaled_grads, num_shards, tf.add)
        elif alg == "nccl/rechd":
            summed_grads = all_reduce.build_nccl_then_recursive_hd(
                scaled_grads, tf.add)
        elif alg == "nccl/pscpu":
            summed_grads = all_reduce.build_nccl_then_shuffle(
                scaled_grads, aux_devices, tf.add, tf.add_n)
        elif alg == "pscpu/pscpu":
            summed_grads = all_reduce.build_shuffle_then_shuffle(
                scaled_grads,
                aux_devices,
                # TODO(tucker): devise a way of better specifying the device
                # for the second level.
                [aux_devices[0]],
                tf.add_n)
        elif alg in ["pscpu", "psgpu"]:
            summed_grads = all_reduce.build_shuffle_all_reduce(
                scaled_grads, aux_devices, tf.add_n)
        else:
            raise ValueError("unsupported all_reduce alg: ", alg)

        result = []
        for (_, v), g in zip(grad_and_vars, summed_grads):
            result.append([g, v])
        return result
Example #5
def sum_grad_and_var_all_reduce(grad_and_vars,
                                num_workers,
                                alg,
                                gpu_indices,
                                aux_devices=None,
                                num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors."""
    with ops.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        scaled_grads = [g for g, _ in grad_and_vars]
        if alg == 'nccl':
            summed_grads = nccl.all_sum(scaled_grads)
        elif alg == 'xring':
            summed_grads = all_reduce.build_ring_all_reduce(
                scaled_grads, num_workers, num_shards, gpu_indices,
                math_ops.add)
        elif alg == 'nccl/xring':
            summed_grads = all_reduce.build_nccl_then_ring(
                scaled_grads, num_shards, math_ops.add)
        elif alg == 'nccl/rechd':
            summed_grads = all_reduce.build_nccl_then_recursive_hd(
                scaled_grads, math_ops.add)
        elif alg == 'nccl/pscpu':
            summed_grads = all_reduce.build_nccl_then_shuffle(
                scaled_grads, aux_devices, math_ops.add, math_ops.add_n)
        elif alg == 'pscpu/pscpu':
            second_gather_devices = aux_devices[:num_shards]
            summed_grads = all_reduce.build_shuffle_then_shuffle(
                scaled_grads, aux_devices, second_gather_devices,
                math_ops.add_n)
        elif alg in ['pscpu', 'psgpu']:
            summed_grads = all_reduce.build_shuffle_all_reduce(
                scaled_grads, aux_devices, math_ops.add_n)
        else:
            raise ValueError('unsupported all_reduce alg: ', alg)

    result = []
    for (_, v), g in zip(grad_and_vars, summed_grads):
        result.append([g, v])
    return result
Example #6
def sum_grad_and_var_all_reduce(grad_and_vars,
                                num_workers,
                                alg,
                                gpu_indices,
                                aux_devices=None,
                                num_shards=1):
  """Apply all-reduce algorithm over specified gradient tensors."""
  with ops.name_scope('allreduce'):
    # Note that each grad_and_vars looks like the following:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    scaled_grads = [g for g, _ in grad_and_vars]
    if alg == 'nccl':
      summed_grads = nccl.all_sum(scaled_grads)
    elif alg == 'xring':
      summed_grads = all_reduce.build_ring_all_reduce(
          scaled_grads, num_workers, num_shards, gpu_indices, math_ops.add)
    elif alg == 'nccl/xring':
      summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards,
                                                     math_ops.add)
    elif alg == 'nccl/rechd':
      summed_grads = all_reduce.build_nccl_then_recursive_hd(
          scaled_grads, math_ops.add)
    elif alg == 'nccl/pscpu':
      summed_grads = all_reduce.build_nccl_then_shuffle(
          scaled_grads, aux_devices, math_ops.add, math_ops.add_n)
    elif alg == 'pscpu/pscpu':
      second_gather_devices = aux_devices[:num_shards]
      summed_grads = all_reduce.build_shuffle_then_shuffle(
          scaled_grads, aux_devices, second_gather_devices, math_ops.add_n)
    elif alg in ['pscpu', 'psgpu']:
      summed_grads = all_reduce.build_shuffle_all_reduce(
          scaled_grads, aux_devices, math_ops.add_n)
    else:
      raise ValueError('unsupported all_reduce alg: ', alg)

    result = []
    for (_, v), g in zip(grad_and_vars, summed_grads):
      result.append([g, v])
    return result
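A hedged call sketch for the 'pscpu' path of the variant above (the gradient and variable names are placeholders, and the device string is only an example): the per-GPU gradients are gathered and summed with math_ops.add_n on the CPU device listed in aux_devices.

aux_devices = ['/job:worker/task:0/device:CPU:0']
reduced = sum_grad_and_var_all_reduce(
    grad_and_vars=[(grad_gpu0, var_gpu0), (grad_gpu1, var_gpu1)],
    num_workers=1,
    alg='pscpu',
    gpu_indices=[0, 1],
    aux_devices=aux_devices,
    num_shards=1)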
Example #7
def _buildShuffle(self, num_workers, num_gpus, num_shards):
  # Use local CPU for all shuffle shards
  gather_devices = ["/replica:0/task:0/device:CPU:0"
                    for _ in range(num_shards)]
  return lambda x, un_op: ar.build_shuffle_all_reduce(
      x, gather_devices, math_ops.add_n, un_op)