Example #1
  def _batch_all_reduce(self, aggregation, per_device_values):
    """All reduce algorithm in a batch."""
    logging.log_first_n(
        logging.INFO, "batch_all_reduce invoked for batches size = %d with "
        "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
        "agg_small_grads_max_group = %d" %
        (len(per_device_values), self._all_reduce_alg, self._num_packs,
         self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
    destinations = per_device_values[0].devices
    grouped = _group_value_by_device(per_device_values)

    device_grad_packs, tensor_packer = _pack_tensors(
        grouped, self._num_packs, self._agg_small_grads_max_bytes,
        self._agg_small_grads_max_group)

    # The actual aggregation of the repacked gradients. Note that they are
    # sharded among different aggregation trees. So it is important to strike
    # the balance on num_splits.
    if self._all_reduce_alg == "nccl":
      # TODO(yuefengz): merge this into the all-reduce library.
      reduced = cross_tower_utils.aggregate_gradients_using_nccl(
          device_grad_packs)
    else:
      # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
      # order.
      reduced = (
          cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
              destinations, device_grad_packs))

    reduced = _unpack_tensors(reduced, tensor_packer)
    return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices,
                                      aggregation)
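
The packing helpers `_pack_tensors` / `_unpack_tensors` are not shown in these snippets. As a rough illustration of the concat-and-split idea behind `num_packs`, here is a minimal standalone NumPy sketch; the names `pack_tensors` / `unpack_tensors` and the flat-buffer approach are stand-ins for illustration, not the TensorFlow implementation.

import numpy as np

def pack_tensors(grads, num_packs):
  """Sketch: flatten all grads, concatenate, split into num_packs chunks."""
  shapes = [g.shape for g in grads]
  sizes = [g.size for g in grads]
  flat = np.concatenate([g.reshape(-1) for g in grads])
  packs = np.array_split(flat, num_packs)
  return packs, (shapes, sizes)

def unpack_tensors(packs, packer_state):
  """Sketch: rebuild the original tensors from the (reduced) packs."""
  shapes, sizes = packer_state
  flat = np.concatenate(packs)
  out, offset = [], 0
  for shape, size in zip(shapes, sizes):
    out.append(flat[offset:offset + size].reshape(shape))
    offset += size
  return out

grads = [np.ones((2, 3)), np.arange(4.0)]
packs, state = pack_tensors(grads, num_packs=2)
restored = unpack_tensors(packs, state)
assert all(np.array_equal(a, b) for a, b in zip(grads, restored))

Packing fewer, larger buffers amortizes per-tensor all-reduce overhead, which is why the snippet above repacks gradients before handing them to the aggregation step.
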
Example #2
  def _batch_all_reduce(self, aggregation, per_replica_values):
    """All reduce algorithm in a batch."""
    logging.log_first_n(
        logging.INFO, "batch_all_reduce invoked for batches size = %d with "
        "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
        "agg_small_grads_max_group = %d" %
        (len(per_replica_values), self._all_reduce_alg, self._num_packs,
         self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
    destinations = per_replica_values[0].devices
    grouped = _group_value_by_device(per_replica_values)

    device_grad_packs, tensor_packer = _pack_tensors(
        grouped, self._num_packs, self._agg_small_grads_max_bytes,
        self._agg_small_grads_max_group)

    # The actual aggregation of the repacked gradients. Note that they are
    # sharded among different aggregation trees. So it is important to strike
    # the balance on num_splits.
    if self._all_reduce_alg == "nccl":
      # TODO(yuefengz): merge this into the all-reduce library.
      reduced = cross_tower_utils.aggregate_gradients_using_nccl(
          device_grad_packs)
    else:
      # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
      # order.
      reduced = (
          cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
              destinations, device_grad_packs))

    reduced = _unpack_tensors(reduced, tensor_packer)
    return _ungroup_and_make_mirrored(reduced, per_replica_values[0].devices,
                                      aggregation)
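
This variant follows the same flow with per-device values renamed to per-replica values. The grouping step it relies on, `_group_value_by_device`, can be thought of as a transposition: a list of values, each replicated across devices, becomes one list of tensors per device. A minimal sketch of that idea, with plain dicts standing in for the per-replica containers (the name `group_value_by_device` and the dict representation are assumptions for illustration):

def group_value_by_device(per_replica_values):
  """Sketch: [{device: tensor}, ...] -> [[tensor per value] per device]."""
  devices = list(per_replica_values[0].keys())
  return [[v[d] for v in per_replica_values] for d in devices]

values = [
    {"/gpu:0": 1.0, "/gpu:1": 2.0},   # gradient for variable a
    {"/gpu:0": 3.0, "/gpu:1": 4.0},   # gradient for variable b
]
print(group_value_by_device(values))
# [[1.0, 3.0], [2.0, 4.0]]  -> one gradient list per device
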
Example #3
  def _batch_all_reduce(self, method_string, per_device_values):
    """All reduce algorithm in a batch."""
    destinations = per_device_values[0].devices
    grouped = _group_value_by_device(per_device_values)
    if self.num_packs > 0:
      logging.info(
          "batch_all_reduce invoked for batches size = %d with "
          "algorithm = %s and num_packs = %d", len(per_device_values),
          self.all_reduce_alg, self.num_packs)
      tensor_packer = ConcatAndSplitPacker(self.num_packs)
      device_grad_packs = tensor_packer.pack(grouped)
    elif (self.agg_small_grads_max_bytes > 0 and
          self.agg_small_grads_max_group > 0):
      logging.info(
          "batch_all_reduce invoked for batches size = %d with "
          "algorithm = %s, agg_small_grads_max_bytes = %d and "
          "agg_small_grads_max_group = %d", len(per_device_values),
          self.all_reduce_alg, self.agg_small_grads_max_bytes,
          self.agg_small_grads_max_group)
      tensor_packer = AggregateSmallTensorPacker(
          self.agg_small_grads_max_bytes, self.agg_small_grads_max_group)
      device_grad_packs = tensor_packer.pack(grouped)
    else:
      logging.info(
          "batch_all_reduce invoked for batches size = %d with algorithm = %s",
          len(per_device_values), self.all_reduce_alg)
      tensor_packer = None
      device_grad_packs = grouped

    # The actual aggregation of the repacked gradients. Note that they are
    # sharded among different aggregation trees. So it is important to strike
    # the balance on num_splits.
    if self.all_reduce_alg == "nccl":
      reduced = cross_tower_utils.aggregate_gradients_using_nccl(
          device_grad_packs)
    else:
      # TODO (yuefengz): check that gpu ids in `destinations` are in ascending
      # order. id:637
      # https://github.com/imdone/tensorflow/issues/638
      reduced = (
          cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
              destinations, device_grad_packs))

    if tensor_packer:
      reduced = tensor_packer.unpack(reduced)

    return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices,
                                      method_string)
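
The `AggregateSmallTensorPacker` branch above bundles many small gradients into larger buffers, bounded by `agg_small_grads_max_bytes` and `agg_small_grads_max_group`, before running the all-reduce. A minimal sketch of that grouping policy; the function `group_small_tensors` and its exact rules are assumptions for illustration, not the packer's actual logic:

import numpy as np

def group_small_tensors(tensors, max_bytes, max_group):
  """Sketch: bundle consecutive small tensors under a byte and count budget."""
  groups, current, current_bytes = [], [], 0
  for t in tensors:
    if t.nbytes > max_bytes:          # large tensors stay on their own
      if current:
        groups.append(current)
        current, current_bytes = [], 0
      groups.append([t])
      continue
    if current and (current_bytes + t.nbytes > max_bytes or
                    len(current) >= max_group):
      groups.append(current)
      current, current_bytes = [], 0
    current.append(t)
    current_bytes += t.nbytes
  if current:
    groups.append(current)
  return groups

tensors = [np.zeros(4, np.float32), np.zeros(2, np.float32),
           np.zeros(1024, np.float32), np.zeros(3, np.float32)]
groups = group_small_tensors(tensors, max_bytes=64, max_group=2)
print([len(g) for g in groups])  # [2, 1, 1]
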
Example #4
  def _batch_all_reduce(self, method_string, per_device_values):
    """All reduce algorithm in a batch."""
    destinations = per_device_values[0].devices
    grouped = _group_value_by_device(per_device_values)
    if self.num_packs > 0:
      logging.info(
          "batch_all_reduce invoked for batches size = %d with "
          "algorithm = %s and num_packs = %d", len(per_device_values),
          self.all_reduce_alg, self.num_packs)
      tensor_packer = ConcatAndSplitPacker(self.num_packs)
      device_grad_packs = tensor_packer.pack(grouped)
    elif (self.agg_small_grads_max_bytes > 0 and
          self.agg_small_grads_max_group > 0):
      logging.info(
          "batch_all_reduce invoked for batches size = %d with "
          "algorithm = %s, agg_small_grads_max_bytes = %d and "
          "agg_small_grads_max_group = %d", len(per_device_values),
          self.all_reduce_alg, self.agg_small_grads_max_bytes,
          self.agg_small_grads_max_group)
      tensor_packer = AggregateSmallTensorPacker(
          self.agg_small_grads_max_bytes, self.agg_small_grads_max_group)
      device_grad_packs = tensor_packer.pack(grouped)
    else:
      logging.info(
          "batch_all_reduce invoked for batches size = %d with algorithm = %s",
          len(per_device_values), self.all_reduce_alg)
      tensor_packer = None
      device_grad_packs = grouped

    # The actual aggregation of the repacked gradients. Note that they are
    # sharded among different aggregation trees. So it is important to strike
    # the balance on num_splits.
    if self.all_reduce_alg == "nccl":
      reduced = cross_tower_utils.aggregate_gradients_using_nccl(
          device_grad_packs)
    else:
      # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
      # order.
      reduced = (
          cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
              destinations, device_grad_packs))

    if tensor_packer:
      reduced = tensor_packer.unpack(reduced)

    return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices,
                                      method_string)
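
After the reduction, `_ungroup_and_make_mirrored` reverses the earlier grouping: the reduced per-device lists are transposed back so each original value again has one tensor per device. A rough sketch of that final step, with plain dicts standing in for the mirrored containers (the name `ungroup_and_make_mirrored` and the dict representation are assumptions for illustration):

def ungroup_and_make_mirrored(grouped_reduced, devices):
  """Sketch: [[tensor per value] per device] -> [{device: tensor} per value]."""
  num_values = len(grouped_reduced[0])
  return [
      {dev: grouped_reduced[d][i] for d, dev in enumerate(devices)}
      for i in range(num_values)
  ]

reduced = [[4.0, 6.0], [4.0, 6.0]]          # same sums on both devices
devices = ["/gpu:0", "/gpu:1"]
print(ungroup_and_make_mirrored(reduced, devices))
# [{'/gpu:0': 4.0, '/gpu:1': 4.0}, {'/gpu:0': 6.0, '/gpu:1': 6.0}]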