Example #1
def testErrors(self):
    with self.assertRaisesRegex(ValueError,
                                'Device assignment .* required'):
        nccl_ops.all_sum(
            [array_ops.identity(np.random.random_sample((3, 4)))])
    with self.assertRaisesRegex(ValueError, 'Must pass >0 tensors'):
        nccl_ops.all_sum([])
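The test above encodes the two preconditions of nccl_ops.all_sum: every input tensor must already be assigned to a GPU, and the input list must be non-empty. A minimal usage sketch that satisfies both, assuming TF 1.x graph mode and at least two visible GPUs (not part of the original test):

import numpy as np
import tensorflow as tf
from tensorflow.python.ops import array_ops, nccl_ops

# Place one tensor per GPU so every input carries an explicit device assignment.
inputs = []
for gpu_idx in range(2):
    with tf.device('/gpu:%d' % gpu_idx):
        inputs.append(array_ops.identity(np.random.random_sample((3, 4))))

summed = nccl_ops.all_sum(inputs)  # one all-summed tensor per input device

with tf.Session() as sess:
    print(sess.run(summed))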
Example #2
def allreduce_tensors(all_tensors, average=True):
    """
    REFERENCE : https://github.com/ppwwyyxx/tensorpack/blob/83e4e187af5765792408e7b7163efd4744d63628/tensorpack/graph_builder/utils.py
    All-reduce average the per-device tensors of the variables among K devices.

    Args:
        all_tensors (K x N): List of list of tensors. N is the number of (independent) variables.
        average (bool): divide the tensors by N or not.
    Returns:
        K x N: same as input, but each tensor is replaced by the all reduce over K devices.
    """
    nr_tower = len(all_tensors)
    if nr_tower == 1:
        return all_tensors
    new_all_tensors = []  # N x K
    for tensors in zip(*all_tensors):
        summed = nccl.all_sum(tensors)

        tensors_for_devices = []  # K
        for tensor in summed:
            with tf.device(tensor.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    tensor = tf.multiply(tensor, 1.0 / nr_tower, name='allreduce_avg')
            tensors_for_devices.append(tensor)
        new_all_tensors.append(tensors_for_devices)

    # transpose to K x N
    ret = list(zip(*new_all_tensors))
    return ret
Example #3
def allreduce_grads(all_grads, average):
    """
    All-reduce average the gradients among K devices. Results are broadcasted to all devices.

    Args:
        all_grads (K x N): List of list of gradients. N is the number of variables.
        average (bool): average gradients or not.

    Returns:
        K x N: same as input, but each grad is replaced by the average over K devices.
    """

    if get_tf_version_tuple() <= (1, 12):
        from tensorflow.contrib import nccl
    else:
        from tensorflow.python.ops import nccl_ops as nccl
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads
    new_all_grads = []  # N x K
    for grads in zip(*all_grads):
        summed = nccl.all_sum(grads)

        grads_for_devices = []  # K
        for g in summed:
            with tf.device(g.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    g = tf.multiply(g, 1.0 / nr_tower)
            grads_for_devices.append(g)
        new_all_grads.append(grads_for_devices)

    # transpose to K x N
    ret = list(zip(*new_all_grads))
    return ret
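A hedged sketch of how allreduce_grads might be invoked, assuming the function above (with its imports) is available, TF 1.x graph mode, and two visible GPUs; the build_tower_loss helper and the variable-scope layout are illustrative assumptions, not part of the original:

import tensorflow as tf

def build_tower_loss():
    # Hypothetical per-tower model: one weight vector and a squared-error loss.
    w = tf.get_variable('w', shape=[3], initializer=tf.ones_initializer())
    return tf.reduce_sum(tf.square(w))

all_grads = []  # K x N: one gradient list per tower
for gpu_idx in range(2):
    with tf.device('/gpu:%d' % gpu_idx), tf.variable_scope('tower_%d' % gpu_idx):
        loss = build_tower_loss()
        tower_vars = tf.trainable_variables('tower_%d' % gpu_idx)
        all_grads.append(tf.gradients(loss, tower_vars))

synced = allreduce_grads(all_grads, average=True)  # still K x N; values now match across towers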
Example #4
    def apply_update(self):
        device_list = list(self._dev_opt.keys())
        for dev in device_list:
            self._dev_grad_sum[dev] = []

        ops = []
        if len(device_list) > 1:
            with tf.name_scope("all_reduce"), tf.device(None):
                var_length = len(self._dev_grad[device_list[0]])
                for var_idx in range(var_length):
                    g = [
                        self._dev_grad[dev][var_idx][0] for dev in device_list
                    ]
                    g = nccl_ops.all_sum(g)
                    for dev, gg in zip(device_list, g):
                        self._dev_grad_sum[dev].append(
                            (gg, self._dev_grad[dev][var_idx][1]))

            for dev_idx, (device,
                          grads) in enumerate(self._dev_grad_sum.items()):
                with tf.name_scope("Apply_grad%d" %
                                   dev_idx), tf.device(device):
                    update_op = self._dev_opt[device].apply_gradients(grads)
                    ops.append(update_op)
        else:
            for device, grads in self._dev_grad.items():
                with tf.name_scope("Apply_grad"), tf.device(device):
                    update_op = self._dev_opt[device].apply_gradients(grads)
                    ops.append(update_op)

        ops.extend(self.reset_opt_state())
        return tf.group(*ops, name='TrainingOp')
Example #5
def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
  """Build a subgraph that does one full all-reduce, using NCCL.

  Args:
    input_tensors: list of T `tf.Tensor` of same-shape and type values to
      be reduced.
    red_op: binary elementwise reduction operator.  Must be one of
      {tf.add}
    un_op: optional unary elementwise Op to apply to fully-reduced values.

  Returns:
    list of T `tf.Tensor` of reduced values.

  Raises:
    ValueError: red_op not supported.
  """
  if red_op == math_ops.add:
    output_tensors = nccl_ops.all_sum(input_tensors)
  else:
    raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
  if un_op:
    un_op_wrapped = []
    for t in output_tensors:
      with ops.colocate_with(t):
        un_op_wrapped.append(un_op(t))
    output_tensors = un_op_wrapped
  return output_tensors
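A usage sketch for build_nccl_all_reduce, assuming the function above with its module imports (math_ops, nccl_ops, ops), TF 1.x graph mode, and two visible GPUs: passing a unary un_op that rescales by the device count turns the NCCL all-sum into an all-mean. The inputs below are illustrative assumptions.

import tensorflow as tf
from tensorflow.python.ops import math_ops

num_gpus = 2
inputs = []
for gpu_idx in range(num_gpus):
    with tf.device('/gpu:%d' % gpu_idx):
        inputs.append(tf.fill([4], float(gpu_idx + 1)))  # device-placed inputs

# red_op must be tf.add / math_ops.add; un_op is applied to each reduced tensor
# on its own device via colocate_with.
averaged = build_nccl_all_reduce(inputs, math_ops.add,
                                 un_op=lambda t: t * (1.0 / num_gpus))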
Example #6
def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
    """Build a subgraph that does one full all-reduce, using NCCL.

  Args:
    input_tensors: list of T `tf.Tensor` of same-shape and type values to
      be reduced.
    red_op: binary elementwise reduction operator.  Must be one of
      {tf.add}
    un_op: optional unary elementwise Op to apply to fully-reduce values.

  Returns:
    list of T `tf.Tensor` of reduced values.

  Raises:
    ValueError: red_op not supported.
  """
    if red_op == math_ops.add:
        output_tensors = nccl_ops.all_sum(input_tensors)
    else:
        raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
    if un_op:
        un_op_wrapped = []
        for t in output_tensors:
            with ops.colocate_with(t):
                un_op_wrapped.append(un_op(t))
        output_tensors = un_op_wrapped
    return output_tensors
Example #7
def allreduce_grads(all_grads, average=True):
    """
    REFERENCE : https://github.com/ppwwyyxx/tensorpack/blob/83e4e187af5765792408e7b7163efd4744d63628/tensorpack/graph_builder/utils.py
    All-reduce average the gradients among K devices. Results are broadcasted to all devices.
    Args:
        all_grads (K x N): List of list of gradients. N is the number of variables.
        average (bool): average gradients or not.
    Returns:
        K x N: same as input, but each grad is replaced by the average over K devices.
    """
    # from tensorflow.contrib import nccl
    from tensorflow.python.ops import nccl_ops
    nr_tower = len(all_grads)
    if nr_tower == 1:
        return all_grads
    new_all_grads = []  # N x K
    for grads in zip(*all_grads):
        summed = nccl_ops.all_sum(grads)

        grads_for_devices = []  # K
        for g in summed:
            with tf.device(g.device):
                # tensorflow/benchmarks didn't average gradients
                if average:
                    g = tf.multiply(g, 1.0 / nr_tower, name='allreduce_avg')
            grads_for_devices.append(g)
        new_all_grads.append(grads_for_devices)

    # transpose to K x N
    ret = list(zip(*new_all_grads))
    return ret
Example #8
def _broadcast_nccl(self):
    """Sum gradients across devices using NCCL ops (fast path)."""
    from tensorflow.python.ops import nccl_ops  # pylint: disable=no-name-in-module
    for all_vars in zip(*[device.grad_clean.keys() for device in self._devices.values()]):
        if any(x.shape.num_elements() > 0 for x in all_vars):
            all_grads = [device.grad_clean[var] for device, var in zip(self._devices.values(), all_vars)]
            all_grads = nccl_ops.all_sum(all_grads)
            for device, var, grad in zip(self._devices.values(), all_vars, all_grads):
                device.grad_clean[var] = grad
Example #9
def aggregate_gradients_using_nccl(replica_grads):
  """Aggregate gradients using nccl allreduce."""
  agg_all_g_and_v = []
  for single_g_and_v in zip(*replica_grads):
    single_grads = [g for g, _ in single_g_and_v]
    agg_grads = nccl_ops.all_sum(single_grads)
    agg_all_g_and_v.append(
        [(g, v) for g, (_, v) in zip(agg_grads, single_g_and_v)])

  agg_all_g_and_v = list(zip(*agg_all_g_and_v))

  return agg_all_g_and_v
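replica_grads here follows the usual per-replica convention: a list with one entry per GPU, each entry a list of (gradient, variable) pairs in the same variable order. A hedged sketch of building such a structure and aggregating it, assuming the function above, TF 1.x graph mode, two visible GPUs, and per-replica variable copies (the model is an illustrative assumption):

import tensorflow as tf

replica_grads = []  # K x N list of (grad, var) pairs
for gpu_idx in range(2):
    with tf.device('/gpu:%d' % gpu_idx), tf.variable_scope('replica_%d' % gpu_idx):
        v = tf.get_variable('v', shape=[4], initializer=tf.zeros_initializer())
        loss = tf.reduce_sum(tf.square(v - 1.0))
        (grad,) = tf.gradients(loss, [v])
        replica_grads.append([(grad, v)])

agg = aggregate_gradients_using_nccl(replica_grads)
# agg keeps the K x N layout; agg[k][0][0] is the all-summed gradient placed on GPU k.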
Example #11
def sum_grad_and_var_all_reduce(grad_and_vars,
                                num_workers,
                                alg,
                                gpu_indices,
                                aux_devices=None,
                                num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors."""
    with tf.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        scaled_grads = [g for g, _ in grad_and_vars]
        if alg == 'nccl':
            from tensorflow.python.ops import nccl_ops
            summed_grads = nccl_ops.all_sum(scaled_grads)
        elif alg == 'simple':
            summed_grads = build_reduce_sum(scaled_grads)
        elif alg == 'trivial':
            summed_grads = build_trivial_sum(scaled_grads)
        elif alg == 'xring':
            summed_grads = all_reduce.build_ring_all_reduce(
                scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
        elif alg == 'nccl/xring':
            summed_grads = all_reduce.build_nccl_then_ring(
                scaled_grads, num_shards, tf.add)
        elif alg == 'nccl/rechd':
            summed_grads = all_reduce.build_nccl_then_recursive_hd(
                scaled_grads, tf.add)
        elif alg == 'nccl/pscpu':
            summed_grads = all_reduce.build_nccl_then_shuffle(
                scaled_grads, aux_devices, tf.add, tf.add_n)
        elif alg == 'pscpu/pscpu':
            summed_grads = all_reduce.build_shuffle_then_shuffle(
                scaled_grads,
                aux_devices,
                # TODO(tucker): devise a way of better specifying the device
                # for the second level.
                [aux_devices[0]],
                tf.add_n)
        elif alg in ['pscpu', 'psgpu']:
            summed_grads = all_reduce.build_shuffle_all_reduce(
                scaled_grads, aux_devices, tf.add_n)
        else:
            raise ValueError('unsupported all_reduce alg: ', alg)

        result = []
        for (_, v), g in zip(grad_and_vars, summed_grads):
            result.append([g, v])
        return result
Example #13
def sum_grad_and_var_all_reduce(grad_and_vars,
                                num_workers,
                                alg,
                                gpu_indices,
                                aux_devices=None,
                                num_shards=1):
    """Apply all-reduce algorithm over specified gradient tensors."""
    with ops.name_scope('allreduce'):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        scaled_grads = [g for g, _ in grad_and_vars]
        if alg == 'nccl':
            summed_grads = nccl_ops.all_sum(scaled_grads)
        elif alg == 'xring':
            summed_grads = all_reduce.build_ring_all_reduce(
                scaled_grads, num_workers, num_shards, gpu_indices,
                math_ops.add)
        elif alg == 'nccl/xring':
            summed_grads = all_reduce.build_nccl_then_ring(
                scaled_grads, num_shards, math_ops.add)
        elif alg == 'nccl/rechd':
            summed_grads = all_reduce.build_nccl_then_recursive_hd(
                scaled_grads, math_ops.add)
        elif alg == 'nccl/pscpu':
            summed_grads = all_reduce.build_nccl_then_shuffle(
                scaled_grads, aux_devices, math_ops.add, math_ops.add_n)
        elif alg == 'pscpu/pscpu':
            second_gather_devices = aux_devices[:num_shards]
            summed_grads = all_reduce.build_shuffle_then_shuffle(
                scaled_grads, aux_devices, second_gather_devices,
                math_ops.add_n)
        elif alg in ['pscpu', 'psgpu']:
            summed_grads = all_reduce.build_shuffle_all_reduce(
                scaled_grads, aux_devices, math_ops.add_n)
        else:
            raise ValueError('unsupported all_reduce alg: ', alg)

    result = []
    for (_, v), g in zip(grad_and_vars, summed_grads):
        result.append([g, v])
    return result
Example #15
def all_avg_gradients(tower_gradvars, devices, param_server_device='/gpu:0'):
    if len(devices) == 1:
        return tower_gradvars

    if have_nccl and FLAGS.nccl:
        new_tower_grads = []
        contig_list = []
        for d, grad_list in zip(devices, tower_gradvars):
            with tf.device(d):
                flat_grads = [tf.reshape(g, [-1]) for (g, _) in grad_list]
                contig_grads = tf.concat(flat_grads, 0)
                contig_list.append(contig_grads)

        summed_grads = nccl_ops.all_sum(contig_list)
        for d, s, grad_list in zip(devices, summed_grads, tower_gradvars):
            with tf.device(d):
                new_grad_list = []
                sizes = [tf.size(g) for (g, _) in grad_list]
                flat_grads = tf.split(s, sizes)
                for newg, (oldg, v) in zip(flat_grads, grad_list):
                    newg = tf.reshape(newg, tf.shape(oldg))
                    newg *= 1. / len(devices)
                    new_grad_list.append((newg, v))
                new_tower_grads.append(new_grad_list)
        return new_tower_grads
    else:
        num_devices = len(tower_gradvars)
        avg_gradvars = []
        for layer in zip(*tower_gradvars):
            grads_on_devices, vars_on_devices = zip(*layer)
            with tf.device(param_server_device):
                avg_grad = tf.reduce_mean(tf.stack(grads_on_devices), 0)
            avg_grads_on_devices = [avg_grad]*num_devices
            avg_gradvars_on_devices = zip(*(avg_grads_on_devices, vars_on_devices))
            avg_gradvars.append(avg_gradvars_on_devices)
        return list(zip(*avg_gradvars))
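The NCCL branch above first flattens and concatenates each device's gradients into one contiguous tensor, so a single nccl_ops.all_sum call covers every variable at once rather than launching one all-reduce per variable, and then splits and reshapes the summed result back. A stripped-down sketch of that concat/all_sum/split round trip, assuming TF 1.x and gradients that already carry GPU device assignments (fused_all_sum is a hypothetical helper name):

import tensorflow as tf
from tensorflow.python.ops import nccl_ops

def fused_all_sum(tower_grads):
    """All-sum each tower's gradient list with a single NCCL call per device."""
    flat = []
    for grads in tower_grads:
        with tf.device(grads[0].device):
            flat.append(tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0))
    summed = nccl_ops.all_sum(flat)
    out = []
    for grads, s in zip(tower_grads, summed):
        with tf.device(grads[0].device):
            sizes = [tf.size(g) for g in grads]
            pieces = tf.split(s, sizes)
            out.append([tf.reshape(p, tf.shape(g)) for p, g in zip(pieces, grads)])
    return out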
Example #16
    def apply_updates(self):
        assert not self._updates_applied
        self._updates_applied = True
        devices = list(self._dev_grads.keys())
        total_grads = sum(len(grads) for grads in self._dev_grads.values())
        assert len(devices) >= 1 and total_grads >= 1
        ops = []
        with absolute_name_scope(self.scope):

            # Cast gradients to FP32 and calculate partial sum within each device.
            dev_grads = OrderedDict()  # device => [(grad, var), ...]
            for dev_idx, dev in enumerate(devices):
                with tf.name_scope('ProcessGrads%d' % dev_idx), tf.device(dev):
                    sums = []
                    for gv in zip(*self._dev_grads[dev]):
                        assert all(v is gv[0][1] for g, v in gv)
                        g = [tf.cast(g, tf.float32) for g, v in gv]
                        g = g[0] if len(g) == 1 else tf.add_n(g)
                        sums.append((g, gv[0][1]))
                    dev_grads[dev] = sums

            # Sum gradients across devices.
            if len(devices) > 1:
                with tf.name_scope('SumAcrossGPUs'), tf.device(None):
                    for var_idx, grad_shape in enumerate(self._grad_shapes):
                        g = [dev_grads[dev][var_idx][0] for dev in devices]
                        if np.prod(grad_shape):  # nccl does not support zero-sized tensors
                            g = all_sum(g)
                        for dev, gg in zip(devices, g):
                            dev_grads[dev][var_idx] = (
                                gg, dev_grads[dev][var_idx][1])

            # Apply updates separately on each device.
            for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
                with tf.name_scope('ApplyGrads%d' % dev_idx), tf.device(dev):

                    # Scale gradients as needed.
                    if self.use_loss_scaling or total_grads > 1:
                        with tf.name_scope('Scale'):
                            coef = tf.constant(np.float32(1.0 / total_grads),
                                               name='coef')
                            coef = self.undo_loss_scaling(coef)
                            grads = [(g * coef, v) for g, v in grads]

                    # Check for overflows.
                    with tf.name_scope('CheckOverflow'):
                        grad_ok = tf.reduce_all(
                            tf.stack([
                                tf.reduce_all(tf.is_finite(g))
                                for g, v in grads
                            ]))

                    # Update weights and adjust loss scaling.
                    with tf.name_scope('UpdateWeights'):
                        opt = self._dev_opt[dev]
                        ls_var = self.get_loss_scaling_var(dev)
                        if not self.use_loss_scaling:
                            ops.append(
                                tf.cond(grad_ok,
                                        lambda: opt.apply_gradients(grads),
                                        tf.no_op))
                        else:
                            ops.append(
                                tf.cond(
                                    grad_ok, lambda: tf.group(
                                        tf.assign_add(ls_var, self.
                                                      loss_scaling_inc),
                                        opt.apply_gradients(grads)),
                                    lambda: tf.group(
                                        tf.assign_sub(ls_var, self.
                                                      loss_scaling_dec))))

                    # Report statistics on the last device.
                    if dev == devices[-1]:
                        with tf.name_scope('Statistics'):
                            ops.append(
                                autosummary(self.id + '/learning_rate',
                                            self.learning_rate))
                            ops.append(
                                autosummary(self.id + '/overflow_frequency',
                                            tf.where(grad_ok, 0, 1)))
                            if self.use_loss_scaling:
                                ops.append(
                                    autosummary(self.id + '/loss_scaling_log2',
                                                ls_var))

            # Initialize variables and group everything into a single op.
            self.reset_optimizer_state()
            init_uninited_vars(list(self._dev_ls_var.values()))
            return tf.group(*ops, name='TrainingOp')
Example #17
    def model_fn(features, labels, mode, params):
        """Defines how to train, evaluate and predict from the transformer model."""
        num_gpus = flags_core.get_num_gpus(flags_obj)
        print("num_gpus: ", num_gpus)
        #    num_gpus=params["num_gpus"]

        learning_rate = get_learning_rate(
            learning_rate=params["learning_rate"],
            hidden_size=params["hidden_size"],
            learning_rate_warmup_steps=params["learning_rate_warmup_steps"])
        optimizers = [
            tf.contrib.opt.LazyAdamOptimizer(
                learning_rate,
                beta1=params["optimizer_adam_beta1"],
                beta2=params["optimizer_adam_beta2"],
                epsilon=params["optimizer_adam_epsilon"])
            for _ in range(num_gpus)
        ]

        if params["dtype"] == "fp16":
            optimizers = [
                tf.train.experimental.enable_mixed_precision_graph_rewrite(
                    optimizer) for optimizer in optimizers
            ]

#    feature_shards, label_shards = replicate_model_fn._split_batch(features, labels, num_gpus, device=consolidation_device)
#    feature_shards, label_shards = split_batch(features, labels, num_gpus)

        model = transformer.Transformer(params,
                                        mode == tf.estimator.ModeKeys.TRAIN)
        grad_list = []
        losses = []
        logits = []
        for gpu_idx in range(num_gpus):
            device_setter = local_device_setter(
                ps_device_type='cpu', worker_device='/gpu:{}'.format(gpu_idx))
            with tf.device(device_setter):
                #      with tf.device(tf.DeviceSpec(device_type="GPU", device_index=gpu_idx)), tf.variable_scope('tower%d'%gpu_idx):
                #with tf.device(tf.compat.v1.train.replica_device_setter(cluster=cluster_spec)):
                logit, loss = create_tower_network(model, params, features,
                                                   labels)
                #        feature_shard, label_shard = next(iterator)
                #        logit, loss = create_tower_network(model, params, features, labels)
                logits.append(logit)
                losses.append(loss)
                grad_list.append([
                    x for x in optimizers[gpu_idx].compute_gradients(loss)
                    if x[0] is not None
                ])

#    output_train = tf.concat(logits, axis=0)
        output_train = tf.reduce_mean(logits, axis=0)
        loss_train = tf.reduce_mean(losses, name='loss')

        #    grads = []
        #    all_vars= []
        sparse_grads = []
        sparse_vars = []
        dense_grads = []
        dense_vars = []
        for tower in grad_list:
            sp_grad = []
            sp_var = []
            dn_grad = []
            dn_var = []
            for x in tower:
                if isinstance(x[1], ops.IndexedSlices):
                    sp_grad.append(x[0])
                    sp_var.append(x[1])
                else:
                    dn_grad.append(x[0])
                    dn_var.append(x[1])

            if (len(sp_var) > 0):
                sparse_grads.append(sp_grad)
                sparse_vars.append(sp_var)
            if (len(dn_var) > 0):
                dense_grads.append(dn_grad)
                dense_vars.append(dn_var)

        #SPARSE


#    for var, grad in zip(sparse_vars, sparse_grads):
#      if len(grad) == 1:
#        avg_grad = grad
#      else:
#        avg_grad = tf.multiply(tf.add_n(grad), 1. /len(grad))
#      gradvars.append((avg_grad, var))
        if len(sparse_vars) > 0:
            if num_gpus == 1:
                reduced_grad = sparse_grads
            else:
                new_all_grads = []
                for grad in sparse_grads:
                    new_grads = []
                    for tower_grad in grad:
                        new_grads.append(tower_grad)
                    summed = tf.add_n(new_grads)
                    grads_for_devices = []
                    for g in summed:
                        with tf.device(g.device):
                            g = tf.multiply(g,
                                            1.0 / num_gpus,
                                            name='allreduce_avg')
                        grads_for_devices.append(g)
                    new_all_grads.append(grads_for_devices)
                reduced_grad = list(zip(*new_all_grads))
            gradvars = [
                list(zip(gs, vs)) for gs, vs in zip(reduced_grad, sparse_vars)
            ]

        #DENSE
        reduced_grad = []
        from tensorflow.python.ops import nccl_ops
        if num_gpus == 1:
            reduced_grad = dense_grads
        else:
            new_all_grads = []
            for grad in dense_grads:
                summed = nccl_ops.all_sum(grad)
                grads_for_devices = []
                for g in summed:
                    with tf.device(g.device):
                        g = tf.multiply(g,
                                        1.0 / num_gpus,
                                        name='allreduce_avg')
                    grads_for_devices.append(g)
                new_all_grads.append(grads_for_devices)
            reduced_grad = list(zip(*new_all_grads))

        grads = [list(zip(gs, vs)) for gs, vs in zip(reduced_grad, dense_vars)]

        #apply gradients to each GPU by broadcasting summed gradient
        train_ops = []
        for idx, grad_and_vars in enumerate(grads):
            with tf.name_scope('apply_gradients'), tf.device(
                    tf.DeviceSpec(device_type="GPU", device_index=idx)):
                global_step = tf.train.get_global_step()
                update_ops = tf.assign(global_step,
                                       global_step + 1,
                                       name='update_global_step')
                #update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope='tower%d'%idx)
                #with tf.control_dependencies(update_ops):
                train_ops.append(optimizers[idx].apply_gradients(
                    grad_and_vars, name='apply_grad_{}'.format(idx)))

                #SPARSE
                if idx == 0 and len(sparse_vars) > 0:
                    learning_rate = get_learning_rate(
                        learning_rate=params["learning_rate"],
                        hidden_size=params["hidden_size"],
                        learning_rate_warmup_steps=params[
                            "learning_rate_warmup_steps"])
                    optimizer = tf.contrib.opt.LazyAdamOptimizer(
                        learning_rate,
                        beta1=params["optimizer_adam_beta1"],
                        beta2=params["optimizer_adam_beta2"],
                        epsilon=params["optimizer_adam_epsilon"])
                    optimizer = tf.train.SyncReplicasOptimizer(
                        optimizer, replicas_to_aggregate=num_devices)
                    sync_hook = optimizer.make_session_run_hook(is_chief)

                    minimize_op = optimizer.apply_gradients(
                        gradvars, global_step=tf.train.get_global_step())
                    train_ops.append(minimize_op)

        optimize_op = tf.group(update_ops, *train_ops, name='train_op')
        train_metrics = {"learning_rate": learning_rate}

        tf.identity(loss_train, "cross_entropy")

        if mode == tf.estimator.ModeKeys.TRAIN:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss_train,
                                              train_op=optimize_op)
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss_train,
                predictions={"predictions": output_train},
                eval_metric_ops=metrics.get_eval_metrics(
                    output_train, labels, params))
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=output_train,
                export_outputs={
                    "translate":
                    tf.estimator.export.PredictOutput(output_train)
                })
Example #18
def testErrors(self):
  with self.assertRaisesRegexp(ValueError, 'Device assignment required'):
    nccl_ops.all_sum([array_ops.identity(np.random.random_sample((3, 4)))])
  with self.assertRaisesRegexp(ValueError, 'Must pass >0 tensors'):
    nccl_ops.all_sum([])
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpus', default='0,1', type=str)
    parser.add_argument('--max_step', default=10000, type=int)
    args = parser.parse_args()
    args.num_gpus = len(args.gpus.split(","))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

    # avoid unimplemented gpu kernel error
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:

        dataset = build_dataset(args.num_gpus)
        iterator = dataset.make_initializable_iterator()
        tower_batches = iterator.get_next()

        tower_grads_list = []
        tower_tvars_list = []
        tower_gvars_list = []
        tower_loss_list = []
        for index, tower_batch in enumerate(tower_batches):
            # by-device variable scope
            with tf.variable_scope("tower_%d" % index) as scope, \
                    tf.device('/gpu:%d' % index):

                tower_loss = build_tower(tower_batch)
                tower_gvars = tf.global_variables(scope._name)
                tower_tvars = tf.trainable_variables(scope._name)
                tower_grads = tf.gradients(tower_loss, tower_tvars)

                tower_loss_list.append(tower_loss)
                tower_tvars_list.append(tower_tvars)
                tower_gvars_list.append(tower_gvars)
                tower_grads_list.append(tower_grads)

                if index == 0:
                    # only one variable global saver
                    def clean(name):
                        name = re.sub(r'^tower_\d+/', '', name)
                        name = re.sub(r':\d+$', '', name)
                        return name

                    save_dict = {clean(var.name): var
                                 for var in tower_gvars}
                    saver = tf.train.Saver(save_dict)

        with tf.name_scope("tower_gvar_sync"):
            # different device is init with different random seed
            # need explicit synchronization before training!!!
            if len(tower_gvars_list) == 1:
                tower_gvar_sync = tf.no_op()
            else:
                sync_ops = []
                for vars in zip(*tower_gvars_list):
                    for var in vars[1:]:
                        sync_ops.append(tf.assign(var, vars[0]))
                tower_gvar_sync = tf.group(*sync_ops)

        with tf.name_scope('all_reduce'):
            avg_tower_grads_list = []
            for grads_to_avg in zip(*tower_grads_list):
                # nccl.all_sum will automatically
                # convert sparse gradients into dense one
                avg_tower_grads_list.append(all_sum(grads_to_avg))
            avg_tower_grads_list = zip(*avg_tower_grads_list)

        with tf.name_scope('metrics'):
            loss = tf.add_n(tower_loss_list) / len(tower_loss_list)

        train_ops = []
        for index, (tower_vars, tower_grads) in \
                enumerate(zip(tower_tvars_list, avg_tower_grads_list)):
            with tf.variable_scope("tower_%d" % index), \
                 tf.device('/gpu:%d' % index):
                tower_grads = [grad / len(tower_batches) for grad in tower_grads]
                if index == 0:
                    # only increment global step with the first worker
                    step = tf.train.get_or_create_global_step()

                tower_optimizer = tf.train.AdamOptimizer()
                tower_train_op = tower_optimizer.apply_gradients(zip(tower_grads, tower_vars),
                                                                 global_step=step if index == 0 else None)
                train_ops.append(tower_train_op)
        train_op = tf.group(*train_ops)

        # start running
        sess.run(tf.global_variables_initializer())
        sess.run(iterator.initializer)
        # important to sync variables before training!
        sess.run(tower_gvar_sync)
        while True:
            try:
                fetch_loss, fetch_step, _ = sess.run([loss, step, train_op])
                if fetch_step % 20 == 0:
                    print("step: %d, loss: %.4f" % (fetch_step, fetch_loss))
                if fetch_step > args.max_step:
                    break
            except tf.errors.OutOfRangeError:
                break
        saver.save(sess, "./model")
Example #20
    def apply_updates(self, allow_no_op: bool = False) -> tf.Operation:
        """Construct training op to update the registered variables based on their gradients."""
        tfutil.assert_tf_initialized()
        assert not self._updates_applied
        self._updates_applied = True
        all_ops = []

        # Check for no-op.
        if allow_no_op and len(self._devices) == 0:
            with tfutil.absolute_name_scope(self.scope):
                return tf.no_op(name='TrainingOp')

        # Clean up gradients.
        for device_idx, device in enumerate(self._devices.values()):
            with tfutil.absolute_name_scope(self.scope + "/Clean%d" %
                                            device_idx), tf.device(
                                                device.name):
                for var, grad in device.grad_raw.items():

                    # Filter out disconnected gradients and convert to float32.
                    grad = [g for g in grad if g is not None]
                    grad = [tf.cast(g, tf.float32) for g in grad]

                    # Sum within the device.
                    if len(grad) == 0:
                        grad = tf.zeros(var.shape)  # No gradients => zero.
                    elif len(grad) == 1:
                        grad = grad[0]  # Single gradient => use as is.
                    else:
                        grad = tf.add_n(grad)  # Multiple gradients => sum.

                    # Scale as needed.
                    scale = 1.0 / len(device.grad_raw[var]) / len(
                        self._devices)
                    scale = tf.constant(scale, dtype=tf.float32, name="scale")
                    if self.minibatch_multiplier is not None:
                        scale /= tf.cast(self.minibatch_multiplier, tf.float32)
                    scale = self.undo_loss_scaling(scale)
                    device.grad_clean[var] = grad * scale

        # Sum gradients across devices.
        if len(self._devices) > 1:
            with tfutil.absolute_name_scope(self.scope +
                                            "/Broadcast"), tf.device(None):
                for all_vars in zip(*[
                        device.grad_clean.keys()
                        for device in self._devices.values()
                ]):
                    if len(all_vars) > 0 and all(
                            dim > 0 for dim in all_vars[0].shape.as_list()
                    ):  # NCCL does not support zero-sized tensors.
                        all_grads = [
                            device.grad_clean[var] for device, var in zip(
                                self._devices.values(), all_vars)
                        ]
                        all_grads = nccl_ops.all_sum(all_grads)
                        for device, var, grad in zip(self._devices.values(),
                                                     all_vars, all_grads):
                            device.grad_clean[var] = grad

        # Apply updates separately on each device.
        for device_idx, device in enumerate(self._devices.values()):
            with tfutil.absolute_name_scope(self.scope + "/Apply%d" %
                                            device_idx), tf.device(
                                                device.name):
                # pylint: disable=cell-var-from-loop

                # Accumulate gradients over time.
                if self.minibatch_multiplier is None:
                    acc_ok = tf.constant(True, name='acc_ok')
                    device.grad_acc = OrderedDict(device.grad_clean)
                else:
                    # Create variables.
                    with tf.control_dependencies(None):
                        for var in device.grad_clean.keys():
                            device.grad_acc_vars[var] = tf.Variable(
                                tf.zeros(var.shape),
                                trainable=False,
                                name="grad_acc_var")
                        device.grad_acc_count = tf.Variable(
                            tf.zeros([]),
                            trainable=False,
                            name="grad_acc_count")

                    # Track counter.
                    count_cur = device.grad_acc_count + 1.0
                    count_inc_op = lambda: tf.assign(device.grad_acc_count,
                                                     count_cur)
                    count_reset_op = lambda: tf.assign(device.grad_acc_count,
                                                       tf.zeros([]))
                    acc_ok = (count_cur >= tf.cast(self.minibatch_multiplier,
                                                   tf.float32))
                    all_ops.append(
                        tf.cond(acc_ok, count_reset_op, count_inc_op))

                    # Track gradients.
                    for var, grad in device.grad_clean.items():
                        acc_var = device.grad_acc_vars[var]
                        acc_cur = acc_var + grad
                        device.grad_acc[var] = acc_cur
                        with tf.control_dependencies([acc_cur]):
                            acc_inc_op = lambda: tf.assign(acc_var, acc_cur)
                            acc_reset_op = lambda: tf.assign(
                                acc_var, tf.zeros(var.shape))
                            all_ops.append(
                                tf.cond(acc_ok, acc_reset_op, acc_inc_op))

                # No overflow => apply gradients.
                all_ok = tf.reduce_all(
                    tf.stack([acc_ok] + [
                        tf.reduce_all(tf.is_finite(g))
                        for g in device.grad_acc.values()
                    ]))
                apply_op = lambda: device.optimizer.apply_gradients(
                    [(tf.cast(grad, var.dtype), var)
                     for var, grad in device.grad_acc.items()])
                all_ops.append(tf.cond(all_ok, apply_op, tf.no_op))

                # Adjust loss scaling.
                if self.use_loss_scaling:
                    ls_inc_op = lambda: tf.assign_add(device.loss_scaling_var,
                                                      self.loss_scaling_inc)
                    ls_dec_op = lambda: tf.assign_sub(device.loss_scaling_var,
                                                      self.loss_scaling_dec)
                    ls_update_op = lambda: tf.group(
                        tf.cond(all_ok, ls_inc_op, ls_dec_op))
                    all_ops.append(tf.cond(acc_ok, ls_update_op, tf.no_op))

                # Last device => report statistics.
                if device_idx == len(self._devices) - 1:
                    all_ops.append(
                        autosummary.autosummary(self.id + "/learning_rate",
                                                self.learning_rate))
                    all_ops.append(
                        autosummary.autosummary(self.id +
                                                "/overflow_frequency",
                                                tf.where(all_ok, 0, 1),
                                                condition=acc_ok))
                    if self.use_loss_scaling:
                        all_ops.append(
                            autosummary.autosummary(
                                self.id + "/loss_scaling_log2",
                                device.loss_scaling_var))

        # Initialize variables.
        self.reset_optimizer_state()
        if self.use_loss_scaling:
            tfutil.init_uninitialized_vars(
                [device.loss_scaling_var for device in self._devices.values()])
        if self.minibatch_multiplier is not None:
            tfutil.run([
                var.initializer for device in self._devices.values()
                for var in list(device.grad_acc_vars.values()) +
                [device.grad_acc_count]
            ])

        # Group everything into a single op.
        with tfutil.absolute_name_scope(self.scope):
            return tf.group(*all_ops, name="TrainingOp")
Example #21
def all_sum_gpu(g, *args, **kws):
    return nccl_ops.all_sum(g, *args, **kws)