def _batch_all_reduce(self, aggregation, per_device_values):
  """Run one batched all-reduce over `per_device_values`.

  Gradients are grouped by device, repacked to reduce per-tensor overhead,
  aggregated with either NCCL or a hierarchical copy depending on
  `self._all_reduce_alg`, then unpacked and returned as mirrored values.
  """
  logging.log_first_n(
      logging.INFO, "batch_all_reduce invoked for batches size = %d with "
      "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
      "agg_small_grads_max_group = %d" %
      (len(per_device_values), self._all_reduce_alg, self._num_packs,
       self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)

  destinations = per_device_values[0].devices
  per_device_grads = _group_value_by_device(per_device_values)
  packed_grads, packer = _pack_tensors(per_device_grads, self._num_packs,
                                       self._agg_small_grads_max_bytes,
                                       self._agg_small_grads_max_group)

  # Aggregate the repacked gradients. Note that they are sharded among
  # different aggregation trees, so it is important to strike the balance on
  # num_splits.
  if self._all_reduce_alg == "nccl":
    # TODO(yuefengz): merge this into the all-reduce library.
    aggregated = cross_tower_utils.aggregate_gradients_using_nccl(packed_grads)
  else:
    # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
    # order.
    aggregated = (
        cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
            destinations, packed_grads))

  return _ungroup_and_make_mirrored(
      _unpack_tensors(aggregated, packer), destinations, aggregation)
def _batch_all_reduce(self, aggregation, per_replica_values):
  """Batched all-reduce of `per_replica_values`.

  Values are grouped by device and repacked before aggregation; the
  aggregation backend (NCCL or hierarchical copy) is chosen by
  `self._all_reduce_alg`. The result is unpacked and mirrored.
  """
  logging.log_first_n(
      logging.INFO, "batch_all_reduce invoked for batches size = %d with "
      "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
      "agg_small_grads_max_group = %d" %
      (len(per_replica_values), self._all_reduce_alg, self._num_packs,
       self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)

  destinations = per_replica_values[0].devices
  grouped_grads = _group_value_by_device(per_replica_values)
  grad_packs, packer = _pack_tensors(grouped_grads, self._num_packs,
                                     self._agg_small_grads_max_bytes,
                                     self._agg_small_grads_max_group)

  # The actual aggregation of the repacked gradients. They are sharded among
  # different aggregation trees, so it is important to strike the balance on
  # num_splits.
  if self._all_reduce_alg == "nccl":
    # TODO(yuefengz): merge this into the all-reduce library.
    reduced_packs = cross_tower_utils.aggregate_gradients_using_nccl(
        grad_packs)
  else:
    # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
    # order.
    reduced_packs = (
        cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
            destinations, grad_packs))

  reduced_grads = _unpack_tensors(reduced_packs, packer)
  return _ungroup_and_make_mirrored(reduced_grads, destinations, aggregation)
def _batch_all_reduce(self, method_string, per_device_values):
  """All reduce algorithm in a batch.

  Optionally repacks the grouped gradients (concat-and-split or
  small-tensor aggregation, depending on which knobs are set), aggregates
  them with NCCL or a hierarchical copy, then unpacks and mirrors the
  result.
  """
  destinations = per_device_values[0].devices
  grouped = _group_value_by_device(per_device_values)
  # Choose a packing strategy: num_packs takes precedence over the
  # small-gradient aggregation thresholds; with neither set, gradients are
  # aggregated as-is.
  if self.num_packs > 0:
    logging.info(
        "batch_all_reduce invoked for batches size = %d with "
        "algorithm = %s and num_packs = %d", len(per_device_values),
        self.all_reduce_alg, self.num_packs)
    tensor_packer = ConcatAndSplitPacker(self.num_packs)
    device_grad_packs = tensor_packer.pack(grouped)
  elif (self.agg_small_grads_max_bytes > 0 and
        self.agg_small_grads_max_group > 0):
    logging.info(
        "batch_all_reduce invoked for batches size = %d with "
        "algorithm = %s, agg_small_grads_max_bytes = %d and "
        "agg_small_grads_max_group = %d", len(per_device_values),
        self.all_reduce_alg, self.agg_small_grads_max_bytes,
        self.agg_small_grads_max_group)
    tensor_packer = AggregateSmallTensorPacker(self.agg_small_grads_max_bytes,
                                               self.agg_small_grads_max_group)
    device_grad_packs = tensor_packer.pack(grouped)
  else:
    logging.info(
        "batch_all_reduce invoked for batches size = %d with algorithm = %s",
        len(per_device_values), self.all_reduce_alg)
    tensor_packer = None
    device_grad_packs = grouped

  # The actual aggregation of the repacked gradients. Note that they are
  # sharded among different aggregation trees. So it is important to strike
  # the balance on num_splits.
  if self.all_reduce_alg == "nccl":
    reduced = cross_tower_utils.aggregate_gradients_using_nccl(
        device_grad_packs)
  else:
    # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
    # order.
    reduced = (
        cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
            destinations, device_grad_packs))

  if tensor_packer:
    reduced = tensor_packer.unpack(reduced)
  return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices,
                                    method_string)
def _batch_all_reduce(self, method_string, per_device_values):
  """Batched all-reduce with optional gradient packing.

  Depending on configuration, the grouped gradients are packed with a
  concat-and-split scheme, a small-tensor aggregation scheme, or not at
  all; they are then aggregated (NCCL or hierarchical copy), unpacked,
  and returned as mirrored values.
  """
  destinations = per_device_values[0].devices
  grads_by_device = _group_value_by_device(per_device_values)

  # Pick a packer: num_packs wins over the small-gradient thresholds.
  packer = None
  if self.num_packs > 0:
    logging.info(
        "batch_all_reduce invoked for batches size = %d with "
        "algorithm = %s and num_packs = %d", len(per_device_values),
        self.all_reduce_alg, self.num_packs)
    packer = ConcatAndSplitPacker(self.num_packs)
    packed = packer.pack(grads_by_device)
  elif (self.agg_small_grads_max_bytes > 0 and
        self.agg_small_grads_max_group > 0):
    logging.info(
        "batch_all_reduce invoked for batches size = %d with "
        "algorithm = %s, agg_small_grads_max_bytes = %d and "
        "agg_small_grads_max_group = %d", len(per_device_values),
        self.all_reduce_alg, self.agg_small_grads_max_bytes,
        self.agg_small_grads_max_group)
    packer = AggregateSmallTensorPacker(self.agg_small_grads_max_bytes,
                                        self.agg_small_grads_max_group)
    packed = packer.pack(grads_by_device)
  else:
    logging.info(
        "batch_all_reduce invoked for batches size = %d with algorithm = %s",
        len(per_device_values), self.all_reduce_alg)
    packed = grads_by_device

  # Aggregate the (possibly repacked) gradients. They are sharded among
  # different aggregation trees, so it is important to strike the balance
  # on num_splits.
  if self.all_reduce_alg == "nccl":
    aggregated = cross_tower_utils.aggregate_gradients_using_nccl(packed)
  else:
    # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
    # order.
    aggregated = (
        cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
            destinations, packed))

  if packer:
    aggregated = packer.unpack(aggregated)
  return _ungroup_and_make_mirrored(aggregated, destinations, method_string)