Example No. 1
 def _do_all_reduce_pack_test(self, tt):
   """Test that all-reduce results are the same with or without packing."""
   with ops.Graph().as_default():
     tower_grads, consts, _, _ = self._init_tensors(
         tt.num_devices, tt.in_shapes)
     dev_prefixes = ['/job:localhost']
     num_workers = 1
     alg = 'xring'
     shards = 1
     gpu_indices = range(0, tt.num_devices)
     assert len(gpu_indices) == len(tower_grads)
     no_pack_all_reduce = allreduce.sum_gradients_all_reduce(
         dev_prefixes, tower_grads, num_workers, alg, shards,
         gpu_indices,
         agg_small_grads_max_bytes=0, agg_small_grads_max_group=1)
     packed_tg, packing = allreduce.pack_small_tensors(tower_grads, 100, 100)
     packed_all_reduce = allreduce.sum_gradients_all_reduce(
         dev_prefixes, packed_tg, num_workers, alg, shards,
         gpu_indices,
         agg_small_grads_max_bytes=0, agg_small_grads_max_group=1)
     unpacked_tg = allreduce.unpack_small_tensors(packed_all_reduce, packing)
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
       no_pack_values = sess.run(no_pack_all_reduce)
       pack_unpack_values = sess.run(unpacked_tg)
       for d in range(1, tt.num_devices):
         for t in range(0, len(tt.in_shapes)):
           self.assertTrue(np.allclose(no_pack_values[d][t][0],
                                       tt.num_devices * consts[0][t]))
           self.assertTrue(np.array_equal(no_pack_values[d][t][0],
                                          pack_unpack_values[d][t][0]))
Example No. 2
 def preprocess_device_grads(self, device_grads):
     remaining_grads = device_grads
     aggregated_grads = []
     for spec_tuple in self._all_reduce_spec:
         if spec_tuple.limit < 0:
             this_grads = remaining_grads
             remaining_grads = []
         else:
             (this_grads, remaining_grads) = allreduce.split_grads_by_size(
                 spec_tuple.limit, remaining_grads)
         if this_grads:
             range_agg_grads = allreduce.sum_gradients_all_reduce(
                 self._all_reduce_device_prefixes,
                 this_grads,
                 self._num_workers,
                 spec_tuple.alg,
                 spec_tuple.shards,
                 self.benchmark_cnn.gpu_indices,
                 agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
                 agg_small_grads_max_group=self._agg_small_grads_max_group)
             if not aggregated_grads:
                 aggregated_grads = range_agg_grads
             else:
                 assert len(aggregated_grads) == len(range_agg_grads)
                 for i in range(len(aggregated_grads)):
                     aggregated_grads[i] += range_agg_grads[i]
     assert not remaining_grads
     full_device_set = []
     for grads in device_grads:
         g, v = grads[0]
         del v
         full_device_set.append(g.device)
     return (full_device_set, aggregated_grads)
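The merge loop above (`aggregated_grads[i] += range_agg_grads[i]`) extends each device's gradient list with the results of every size range, so the final per-device lists are grouped by all-reduce spec rather than kept in the original variable order. Below is a minimal plain-Python sketch of that merge; the `range1`/`range2` values are toy stand-ins for `range_agg_grads`, not real tensors.

    # Toy stand-ins for two per-range all-reduce results over two devices.
    # Each inner list holds (gradient, variable) pairs, modeled as strings.
    range1 = [[('g_small_dev0', 'v_small')], [('g_small_dev1', 'v_small')]]
    range2 = [[('g_large_dev0', 'v_large')], [('g_large_dev1', 'v_large')]]

    aggregated = []
    for range_agg in (range1, range2):
        if not aggregated:
            aggregated = range_agg
        else:
            assert len(aggregated) == len(range_agg)
            for i in range(len(aggregated)):
                aggregated[i] += range_agg[i]  # extend device i's list in place

    # Each device now lists the small-range pairs followed by the large-range pairs.
    print(aggregated[0])  # [('g_small_dev0', 'v_small'), ('g_large_dev0', 'v_large')]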
Example No. 3
 def preprocess_device_grads(self, device_grads):
     if self._all_reduce_spec:
         aggregated_device_grads = allreduce.sum_gradients_all_reduce(
             ['/job:localhost'],
             device_grads,
             1,
             self._all_reduce_spec.alg,
             self._all_reduce_spec.shards,
             self.benchmark_cnn.gpu_indices,
             agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
             agg_small_grads_max_group=self._agg_small_grads_max_group)
     else:
         agg_grads, self.grad_has_inf_nan = (
             variable_mgr_util.
             aggregate_gradients_using_copy_with_device_selection(
                 self.benchmark_cnn,
                 device_grads,
                 use_mean=False,
                 check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
         aggregated_device_grads = []
         for arr in device_grads:
             aggregated_device_grads.append([
                 (g, v) for (_, v), (g, _) in zip(arr, agg_grads)
             ])
     return self.benchmark_cnn.devices, aggregated_device_grads
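In the copy-aggregation branch, the list comprehension re-pairs each device's own variable references with the gradients produced by the aggregation. A minimal sketch of that `zip` pattern on plain tuples (toy string values, no TensorFlow):

    # One device's original (gradient, variable) pairs, plus the aggregated
    # (gradient, variable) pairs returned by the copy-based aggregation.
    arr = [('g0_dev0', 'v0'), ('g1_dev0', 'v1')]
    agg_grads = [('g0_summed', 'v0'), ('g1_summed', 'v1')]

    # Keep the variable from the device's own list, substitute the summed gradient.
    repaired = [(g, v) for (_, v), (g, _) in zip(arr, agg_grads)]
    print(repaired)  # [('g0_summed', 'v0'), ('g1_summed', 'v1')]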
Example No. 4
 def preprocess_device_grads(self, device_grads):
     reduced_grads = allreduce.sum_gradients_all_reduce(
         self._single_session,
         self._all_reduce_device_prefixes,
         device_grads,
         self._num_workers,
         'collective',
         1,  # spec_tuple.shards,
         self.benchmark_cnn.gpu_indices,
         allreduce_merge_scope=self._allreduce_merge_scope)
     assert len(reduced_grads) == len(device_grads)
     full_device_set = []
     for grads in device_grads:
         g, _ = grads[0]
         full_device_set.append(g.device)
     return (full_device_set, reduced_grads)
Example No. 5
    def _aggregate_grads(self, device_grads):
        """Aggregate gradients across GPUs.

        Args:
          device_grads: List of lists of (gradient, variable) tuples.
            device_grads[t][g] = (gradient, variable), where t is the index of
            the tower and g is the index of the gradient-variable pair.

        Returns:
          List of lists of (gradient, variable) tuples, in the same form
          as `device_grads`. Each gradient has been summed over the towers.
        """
        if self._all_reduce_spec:
            # TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other
            # gradient aggregation code, since gradient aggregation is doing an all
            # reduce. Currently, we do gradient repacking in two different places.
            aggregated_device_grads = allreduce.sum_gradients_all_reduce(
                ['/job:localhost'],
                device_grads,
                1,
                self._all_reduce_spec.alg,
                self._all_reduce_spec.shards,
                self.benchmark_cnn.gpu_indices,
                agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
                agg_small_grads_max_group=self._agg_small_grads_max_group)
        elif self.benchmark_cnn.params.hierarchical_copy:
            aggregated_device_grads, self.grad_has_inf_nan = (
                variable_mgr_util.aggregate_gradients_using_hierarchical_copy(
                    self.benchmark_cnn,
                    device_grads,
                    use_mean=False,
                    check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
        else:
            agg_grads, self.grad_has_inf_nan = (
                variable_mgr_util.
                aggregate_gradients_using_copy_with_device_selection(
                    self.benchmark_cnn,
                    device_grads,
                    use_mean=False,
                    check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
            aggregated_device_grads = []
            for arr in device_grads:
                aggregated_device_grads.append([
                    (g, v) for (_, v), (g, _) in zip(arr, agg_grads)
                ])
        return aggregated_device_grads
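The docstring above defines the layout this code assumes: `device_grads[t][g]` is the (gradient, variable) pair for gradient `g` on tower `t`. A minimal NumPy sketch of that structure with two towers and two toy shapes (the `var_*` strings are placeholders for real variables):

    import numpy as np

    # Two towers, each holding the same two gradient/variable pairs (toy shapes).
    num_towers, shapes = 2, [(3,), (2, 2)]
    device_grads = [
        [(np.ones(shape), 'var_%d' % g) for g, shape in enumerate(shapes)]
        for _ in range(num_towers)
    ]

    # device_grads[t][g] -> (gradient, variable) for gradient g on tower t.
    grad, var = device_grads[1][0]
    print(grad.shape, var)  # (3,) var_0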
Example No. 6
 def _do_batch_all_reduce(self, all_device_tensors):
     # TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other
     # gradient aggregation code, since gradient aggregation is doing an all
     # reduce. Currently, we do gradient repacking in two different places.
     # TODO(reedwm): Change the allreduce code to reduce tensors instead of
     # tower_grads.
     tower_grads = [[(t, None) for t in device_tensors]
                    for device_tensors in all_device_tensors]
     aggregated_device_grads = allreduce.sum_gradients_all_reduce(
         ['/job:localhost'],
         tower_grads,
         1,
         self._all_reduce_spec.alg,
         self._all_reduce_spec.shards,
         self._gpu_indices,
         agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
         agg_small_grads_max_group=self._agg_small_grads_max_group)
     return [[t for t, _ in grad_vars]
             for grad_vars in aggregated_device_grads]
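As the TODO above notes, the all-reduce helper expects tower_grads, so bare tensors are wrapped as (tensor, None) pairs before the reduction and unwrapped afterwards. A minimal sketch of that wrap/unwrap round trip on plain lists (toy string values standing in for tensors):

    # Per-device tensors, modeled here as plain strings.
    all_device_tensors = [['t0_dev0', 't1_dev0'], ['t0_dev1', 't1_dev1']]

    # Wrap each tensor as a (tensor, None) pair so it looks like tower_grads.
    tower_grads = [[(t, None) for t in device_tensors]
                   for device_tensors in all_device_tensors]

    # ... the wrapped structure is what gets handed to the all-reduce ...

    # Unwrap: drop the dummy variable slot to recover per-device tensors.
    unwrapped = [[t for t, _ in grad_vars] for grad_vars in tower_grads]
    assert unwrapped == all_device_tensors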
Example No. 7
  def preprocess_device_grads(self, device_grads):
    if self._all_reduce_spec:
      aggregated_device_grads = allreduce.sum_gradients_all_reduce(
          ['/job:localhost'],
          device_grads,
          1,
          self._all_reduce_spec.alg,
          self._all_reduce_spec.shards,
          self.benchmark_cnn.gpu_indices,
          agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
          agg_small_grads_max_group=self._agg_small_grads_max_group)
    else:
      if not self.benchmark_cnn.params.hierarchical_copy:
        agg_grads, self.grad_has_inf_nan = (
            variable_mgr_util.
            aggregate_gradients_using_copy_with_device_selection(
                self.benchmark_cnn,
                device_grads,
                use_mean=False,
                check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
        aggregated_device_grads = []
        for arr in device_grads:
          aggregated_device_grads.append(
              [(g, v) for (_, v), (g, _) in zip(arr, agg_grads)])
      elif self.benchmark_cnn.params.gradient_repacking == 0:
        aggregated_device_grads, self.grad_has_inf_nan = (
            variable_mgr_util.aggregate_gradients_using_hierarchical_copy(
                self.benchmark_cnn,
                device_grads,
                use_mean=False,
                check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
      else:
        device_grad_packs = []
        all_tower_shapes = []
        all_tower_sizes = []
        compact_gradient = self.benchmark_cnn.params.compact_gradient_transfer
        use_fp16 = self.benchmark_cnn.params.use_fp16
        variable_consistency = self.benchmark_cnn.params.variable_consistency
        deferred_gradient = (variable_consistency == 'relaxed')
        gradient_put_ops = []
        for tower_grads_and_vars in device_grads:
          with tf.colocate_with(tower_grads_and_vars[0][0]):
            # Flatten all the grads.
            flat_grads = [tf.reshape(g, [-1]) for g, _ in tower_grads_and_vars]
            # Remember the original shape of all the grads.
            tower_shapes = [tf.shape(g) for g, _ in tower_grads_and_vars]
            # Remember the original sizes of all the grads.
            tower_sizes = [tf.size(g) for g, _ in tower_grads_and_vars]
            # Concat all the flat grads into a big flat tensor.
            concat_grads = tf.concat(flat_grads, 0)
            grads_dtype = concat_grads.dtype
            if use_fp16 and compact_gradient:
              concat_grads = tf.cast(concat_grads, tf.float16)

            # With deferred-gradients, place the gradients in a staging area.
            if deferred_gradient:
              total_var_size = sum(
                  [v.shape.num_elements() for _, v in tower_grads_and_vars])
              gradient_stage = data_flow_ops.StagingArea([concat_grads.dtype])

              # Push the concat-gradients into the staging area.
              gradient_put_op = gradient_stage.put([concat_grads])
              gradient_put_ops.append(gradient_put_op)

              # Push an empty set of gradients into the staging area.
              warmup_op = gradient_stage.put(
                  [tf.zeros([total_var_size], dtype=concat_grads.dtype)])
              self._warmup_ops.append(warmup_op)

              # Fetch the next set of gradients to use.
              concat_grads = gradient_stage.get()
              concat_grads = tf.reshape(concat_grads, [-1])
            else:
              gradient_put_ops.append(None)

            # Split the big tensor into num_splits packs. In cases where the
            # total size is not divisible by num_splits, the last pack gets
            # more elements.
            # TODO(zhengxq): it is possible to optimize away the additional
            # data movement by copying along the original variable boundary.
            # TODO(zhengxq): it is also possible to optimize away all the concat
            # as well.
            num_splits = self.benchmark_cnn.params.gradient_repacking
            total_grad_size = tf.size(concat_grads)
            split_size = total_grad_size // num_splits
            split_size_last = total_grad_size - split_size * (num_splits - 1)
            split_sizes = [split_size] * (num_splits - 1) + [split_size_last]
            grad_packs = tf.split(concat_grads, split_sizes)

            # Ready to aggregate the repacked gradients, with fake variables.
            # TODO(zhengxq): It is hacky to have to use fake variables.
            # We should remove the need for variables in
            # aggregate_gradients_using*.
            device_grad_packs.append(zip(grad_packs, [None] * num_splits))
            all_tower_shapes.append(tower_shapes)
            all_tower_sizes.append(tower_sizes)

        # The actual aggregation of the repacked gradients. Note that they are
        # sharded among different aggregation trees. So it is important to
        # strike a balance when choosing num_splits.
        summed_device_grad_packs, self.grad_has_inf_nan = (
            variable_mgr_util.aggregate_gradients_using_hierarchical_copy(
                self.benchmark_cnn,
                device_grad_packs,
                use_mean=False,
                check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))

        aggregated_device_grads = []
        # pylint: disable=line-too-long
        for (summed_tower_grad_packs, tower_grads_and_vars,
             tower_shapes, tower_sizes, gradient_put_op) in zip(
                 summed_device_grad_packs, device_grads, all_tower_shapes,
                 all_tower_sizes, gradient_put_ops):
          # pylint: enable=line-too-long
          # Reverse the packing operations in the previous steps. Form the
          # summed gradients back into their original shapes.
          with tf.colocate_with(summed_tower_grad_packs[0][0]):
            # Form a list of the summed grad packs.
            device_grad_packs = [g for g, _ in summed_tower_grad_packs]

            # Concat them back into a big flat tensor.
            device_grads_concat = tf.concat(device_grad_packs, 0)
            device_grads_concat = tf.cast(device_grads_concat, grads_dtype)

            # Split the tensors back into their original sizes.
            grads_with_sizes = tf.split(device_grads_concat, tower_sizes)

            # Reshape the tensors back into their original shapes.
            grads_with_shapes = [
                tf.reshape(grad, shape)
                for shape, grad in zip(tower_shapes, grads_with_sizes)
            ]

            # With deferred gradient, add a dependency on the put_op so we
            # don't have a race condition with the update operation that
            # follows. Also it triggers the gradient_put_op. Otherwise, the
            # caller has to explicitly pump it.
            if deferred_gradient:
              assert gradient_put_op
              grads_with_shapes = [
                  control_flow_ops.with_dependencies([gradient_put_op], g)
                  for g in grads_with_shapes
              ]
            # Form the list with the original list of variables.
            summed_tower_grads = [
                (g, v)
                for g, (_, v) in zip(grads_with_shapes, tower_grads_and_vars)
            ]
            aggregated_device_grads.append(summed_tower_grads)

    return self.benchmark_cnn.devices, aggregated_device_grads
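The repacking branch above flattens every gradient, concatenates the flats, splits the result into num_splits packs (the last pack absorbs the remainder), and later reverses the process with a concat, a size-based split, and per-gradient reshapes. The following is a minimal NumPy round-trip sketch of the same arithmetic, with toy shapes and without the all-reduce, fp16 cast, or staging area; note that np.split takes boundary indices where tf.split takes piece sizes.

    import numpy as np

    grads = [np.arange(6.).reshape(2, 3), np.arange(4.), np.arange(5.)]  # toy grads
    shapes = [g.shape for g in grads]
    sizes = [g.size for g in grads]

    # Pack: flatten, concat, and split into num_splits pieces (last one larger).
    num_splits = 4
    concat = np.concatenate([g.reshape(-1) for g in grads])
    split_size = concat.size // num_splits
    split_sizes = [split_size] * (num_splits - 1) + [concat.size - split_size * (num_splits - 1)]
    packs = np.split(concat, np.cumsum(split_sizes)[:-1])

    # (the packs are what would be summed across devices)

    # Unpack: concat the packs, split by the original sizes, reshape to the
    # original shapes.
    restored_flat = np.split(np.concatenate(packs), np.cumsum(sizes)[:-1])
    restored = [flat.reshape(shape) for flat, shape in zip(restored_flat, shapes)]
    assert all(np.array_equal(r, g) for r, g in zip(restored, grads))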
Example No. 8
    def preprocess_device_grads(self, device_grads):
        if self._all_reduce_spec:
            aggregated_device_grads = allreduce.sum_gradients_all_reduce(
                ['/job:localhost'],
                device_grads,
                1,
                self._all_reduce_spec.alg,
                self._all_reduce_spec.shards,
                self.benchmark_cnn.gpu_indices,
                agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
                agg_small_grads_max_group=self._agg_small_grads_max_group)
        else:
            if not self.benchmark_cnn.params.hierarchical_copy:
                agg_grads, self.grad_has_inf_nan = (
                    variable_mgr_util.
                    aggregate_gradients_using_copy_with_device_selection(
                        self.benchmark_cnn,
                        device_grads,
                        use_mean=False,
                        check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale
                    ))
                aggregated_device_grads = []
                for arr in device_grads:
                    aggregated_device_grads.append([
                        (g, v) for (_, v), (g, _) in zip(arr, agg_grads)
                    ])
            elif self.benchmark_cnn.params.gradient_repacking == 0:
                aggregated_device_grads, self.grad_has_inf_nan = (
                    variable_mgr_util.
                    aggregate_gradients_using_hierarchical_copy(
                        self.benchmark_cnn,
                        device_grads,
                        use_mean=False,
                        check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale
                    ))
            else:
                device_grad_packs = []
                all_tower_shapes = []
                all_tower_sizes = []
                compact_gradient = self.benchmark_cnn.params.compact_gradient_transfer
                use_fp16 = self.benchmark_cnn.params.use_fp16
                for tower_grads_and_vars in device_grads:
                    with tf.colocate_with(tower_grads_and_vars[0][0]):
                        # Flatten all the grads.
                        flat_grads = [
                            tf.reshape(g, [-1])
                            for g, _ in tower_grads_and_vars
                        ]
                        # Remember the original shape of all the grads.
                        tower_shapes = [
                            tf.shape(g) for g, _ in tower_grads_and_vars
                        ]
                        # Remember the original sizes of all the grads.
                        tower_sizes = [
                            tf.size(g) for g, _ in tower_grads_and_vars
                        ]
                        # Concat all the flat grads into a big flat tensor.
                        concat_grads = tf.concat(flat_grads, 0)
                        grads_dtype = concat_grads.dtype
                        if use_fp16 and compact_gradient:
                            concat_grads = tf.cast(concat_grads, tf.float16)

                        # Split the big tensor into num_splits packs. In cases where the
                        # total size is not divisible by num_splits, the last pack gets
                        # more elements.
                        # TODO(zhengxq): it is possible to optimize away the additional
                        # data movement by copying along the original variable boundary.
                        # TODO(zhengxq): it is also possible to optimize away all the concat
                        # as well.
                        num_splits = self.benchmark_cnn.params.gradient_repacking
                        total_grad_size = tf.size(concat_grads)
                        split_size = total_grad_size // num_splits
                        split_size_last = total_grad_size - split_size * (
                            num_splits - 1)
                        split_sizes = [split_size] * (num_splits - 1) + [
                            split_size_last
                        ]
                        grad_packs = tf.split(concat_grads, split_sizes)

                        # Ready to aggregate the repacked gradients, with fake variables.
                        # TODO(zhengxq): It is hacky to have to use fake variables.
                        # We should remove the need for variables in
                        # aggregate_gradients_using*.
                        device_grad_packs.append(
                            zip(grad_packs, [None] * num_splits))
                        all_tower_shapes.append(tower_shapes)
                        all_tower_sizes.append(tower_sizes)

                # The actual aggregation of the repacked gradients. Note that they are
                # sharded among different aggregation trees. So it is important to
                # strike a balance when choosing num_splits.
                summed_device_grad_packs, self.grad_has_inf_nan = (
                    variable_mgr_util.
                    aggregate_gradients_using_hierarchical_copy(
                        self.benchmark_cnn,
                        device_grad_packs,
                        use_mean=False,
                        check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale
                    ))

                aggregated_device_grads = []
                # pylint: disable=line-too-long
                for (summed_tower_grad_packs, tower_grads_and_vars,
                     tower_shapes,
                     tower_sizes) in zip(summed_device_grad_packs,
                                         device_grads, all_tower_shapes,
                                         all_tower_sizes):
                    # pylint: enable=line-too-long
                    # Reverse the packing operations in the previous steps. Form the
                    # summed gradients back into their original shapes.
                    with tf.colocate_with(summed_tower_grad_packs[0][0]):
                        # Form a list of the summed grad packs.
                        device_grad_packs = [
                            g for g, _ in summed_tower_grad_packs
                        ]

                        # Concat them back into a big flat tensor.
                        device_grads_concat = tf.concat(device_grad_packs, 0)
                        device_grads_concat = tf.cast(device_grads_concat,
                                                      grads_dtype)

                        # Split the tensors back into their original sizes.
                        grads_with_sizes = tf.split(device_grads_concat,
                                                    tower_sizes)

                        # Reshape the tensors back into their original shapes.
                        grads_with_shapes = [
                            tf.reshape(grad, shape) for shape, grad in zip(
                                tower_shapes, grads_with_sizes)
                        ]

                        # Form the list with the original list of variables.
                        summed_tower_grads = [(g, v) for g, (
                            _,
                            v) in zip(grads_with_shapes, tower_grads_and_vars)]
                        aggregated_device_grads.append(summed_tower_grads)

        return self.benchmark_cnn.devices, aggregated_device_grads
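When use_fp16 and compact_gradient_transfer are both enabled, the concatenated gradients are cast to float16 before aggregation and cast back to the original dtype afterwards, trading a little precision for smaller transfers. A minimal NumPy sketch of just that cast round trip (toy values; the acceptable rounding error depends on the actual gradient magnitudes):

    import numpy as np

    concat_grads = np.linspace(-1.0, 1.0, 8, dtype=np.float32)
    grads_dtype = concat_grads.dtype

    # Cast down before the transfer/aggregation step...
    compact = concat_grads.astype(np.float16)

    # ...and cast the (aggregated) result back to the original dtype afterwards.
    restored = compact.astype(grads_dtype)
    print(restored.dtype, np.max(np.abs(restored - concat_grads)))  # float32, small rounding error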