def _do_batch_all_reduce(self, reduce_op, dense_values):
    """Run batch all-reduces.

    Groups the values by device, packs them, aggregates the packs with the
    configured all-reduce algorithm, then unpacks the result and rebuilds the
    output from it.

    Args:
      reduce_op: forwarded unchanged to `_ungroup_and_make_mirrored`.
      dense_values: non-empty sequence of values to reduce. The first element
        supplies the destination devices (`dense_values[0].devices`) and is
        also handed to `_ungroup_and_make_mirrored`.

    Returns:
      The value produced by `_ungroup_and_make_mirrored` from the reduced
      tensors.
    """
    # The format arguments are handed to the logger rather than applied with
    # `%` up front, so the message does not have to be rendered on calls that
    # `log_first_n` suppresses.
    logging.log_first_n(
        logging.INFO,
        "batch_all_reduce: %d all-reduces with algorithm = %s, "
        "num_packs = %d, agg_small_grads_max_bytes = %d and "
        "agg_small_grads_max_group = %d",
        10,
        len(dense_values), self._all_reduce_alg, self._num_packs,
        self._agg_small_grads_max_bytes, self._agg_small_grads_max_group)

    destinations = dense_values[0].devices
    grouped = _group_value_by_device(dense_values)

    device_grad_packs, tensor_packer = _pack_tensors(
        grouped, self._num_packs, self._agg_small_grads_max_bytes,
        self._agg_small_grads_max_group)

    # The actual aggregation of the repacked gradients. Note that they are
    # sharded among different aggregation trees. So it is important to strike
    # the balance on num_splits.
    if self._all_reduce_alg == "nccl":
        # TODO(yuefengz): merge this into the all-reduce library.
        reduced = cross_device_utils.aggregate_gradients_using_nccl(
            device_grad_packs)
    else:
        # TODO(yuefengz): check that gpu ids in `destinations` are in
        # ascending order.
        reduced = (
            cross_device_utils.aggregate_gradients_using_hierarchical_copy(
                destinations, device_grad_packs))

    reduced = _unpack_tensors(reduced, tensor_packer)
    return _ungroup_and_make_mirrored(reduced, dense_values[0], reduce_op)
def _do_batch_all_reduce(self, reduce_op, dense_values):
    """Run batch all-reduces.

    Groups the values by device, packs them, aggregates the packs with the
    configured all-reduce algorithm, then unpacks the result and rebuilds the
    output from it.

    Args:
      reduce_op: forwarded unchanged to `_ungroup_and_make_mirrored`.
      dense_values: non-empty sequence of values to reduce. The first element
        supplies the destination devices (`dense_values[0].devices`) and is
        also handed to `_ungroup_and_make_mirrored`.

    Returns:
      The value produced by `_ungroup_and_make_mirrored` from the reduced
      tensors.
    """
    # Let the logger apply the format arguments instead of pre-formatting with
    # `%`, so suppressed calls (after the first 10) skip the string rendering.
    logging.log_first_n(
        logging.INFO,
        "batch_all_reduce invoked for batches size = %d with "
        "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
        "agg_small_grads_max_group = %d",
        10,
        len(dense_values), self._all_reduce_alg, self._num_packs,
        self._agg_small_grads_max_bytes, self._agg_small_grads_max_group)

    destinations = dense_values[0].devices
    grouped = _group_value_by_device(dense_values)

    device_grad_packs, tensor_packer = _pack_tensors(
        grouped, self._num_packs, self._agg_small_grads_max_bytes,
        self._agg_small_grads_max_group)

    # The actual aggregation of the repacked gradients. Note that they are
    # sharded among different aggregation trees. So it is important to strike
    # the balance on num_splits.
    if self._all_reduce_alg == "nccl":
        # TODO(yuefengz): merge this into the all-reduce library.
        reduced = cross_device_utils.aggregate_gradients_using_nccl(
            device_grad_packs)
    else:
        # TODO(yuefengz): check that gpu ids in `destinations` are in
        # ascending order.
        reduced = (
            cross_device_utils.aggregate_gradients_using_hierarchical_copy(
                destinations, device_grad_packs))

    reduced = _unpack_tensors(reduced, tensor_packer)
    return _ungroup_and_make_mirrored(reduced, dense_values[0], reduce_op)
def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model.

    Builds one tower per GPU, aggregates the tower gradients with
    `cross_device_utils.aggregate_gradients_using_nccl`, and applies the
    aggregated gradients with a per-tower optimizer. The whole graph,
    including the training ops, is built for every mode; only the returned
    `EstimatorSpec` differs.

    Args:
      features: model inputs, passed unchanged to `create_tower_network`.
      labels: targets, passed unchanged to `create_tower_network` and, in EVAL
        mode, to `metrics.get_eval_metrics`.
      mode: a `tf.estimator.ModeKeys` value.
      params: hyperparameter mapping. This function reads "learning_rate",
        "hidden_size", "learning_rate_warmup_steps", "optimizer_adam_beta1",
        "optimizer_adam_beta2", "optimizer_adam_epsilon" and "dtype", and also
        passes the whole mapping to the model and the tower builder.

    Returns:
      A `tf.estimator.EstimatorSpec` for TRAIN, EVAL or PREDICT. Falls through
      and returns None for any other mode.
    """
    # The tower count is hard-coded. Every tower is fed the same
    # `features`/`labels`; no input sharding happens in this function.
    num_gpus = 2

    learning_rate = get_learning_rate(
        learning_rate=params["learning_rate"],
        hidden_size=params["hidden_size"],
        learning_rate_warmup_steps=params["learning_rate_warmup_steps"])

    # One optimizer instance per tower.
    optimizers = [
        tf.contrib.opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params["optimizer_adam_beta1"],
            beta2=params["optimizer_adam_beta2"],
            epsilon=params["optimizer_adam_epsilon"])
        for _ in range(num_gpus)
    ]
    if params["dtype"] == "fp16":
        optimizers = [
            tf.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer)
            for optimizer in optimizers
        ]

    model = transformer.Transformer(
        params, mode == tf.estimator.ModeKeys.TRAIN)

    grad_list = []
    losses = []
    logits = []
    for gpu_idx in range(num_gpus):
        device_setter = local_device_setter(
            cluster, worker_device="gpu:%d" % gpu_idx)
        with tf.device(device_setter):
            logit, loss = create_tower_network(
                model, params, features, labels)
            logits.append(logit)
            losses.append(loss)
            # Keep only the (gradient, variable) pairs that actually have a
            # gradient for this tower's loss.
            grad_list.append(
                [x for x in optimizers[gpu_idx].compute_gradients(loss)
                 if x[0] is not None])

    # Element-wise mean of the per-tower logits and mean of the tower losses.
    output_train = tf.reduce_mean(logits, axis=0)
    loss_train = tf.reduce_mean(losses, name='loss')

    # NOTE(review): an earlier hand-written all-reduce (now removed) scaled the
    # summed gradients by 1.0 / num_gpus. Confirm that the gradients returned
    # here have the scale the optimizer is expected to see.
    from tensorflow.python.distribute import cross_device_utils
    grads = cross_device_utils.aggregate_gradients_using_nccl(grad_list)

    # Build the global-step increment exactly once. The optimizers below are
    # called without `global_step`, so this op is what advances it.
    global_step = tf.train.get_global_step()
    increment_global_step = tf.assign(
        global_step, global_step + 1, name='update_global_step')

    # Apply each tower's aggregated gradients on that tower's GPU.
    train_ops = []
    for idx, grad_and_vars in enumerate(grads):
        with tf.name_scope('apply_gradients'), tf.device(
                tf.DeviceSpec(device_type="GPU", device_index=idx)):
            train_ops.append(
                optimizers[idx].apply_gradients(
                    grad_and_vars, name='apply_grad_{}'.format(idx)))

    optimize_op = tf.group(
        increment_global_step, *train_ops, name='train_op')

    # Adds a tensor named "cross_entropy" to the graph; presumably looked up
    # by name elsewhere (e.g. a logging hook) -- TODO confirm.
    tf.identity(loss_train, "cross_entropy")

    if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss_train, train_op=optimize_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss_train,
            predictions={"predictions": output_train},
            eval_metric_ops=metrics.get_eval_metrics(
                output_train, labels, params))

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=output_train,
            export_outputs={
                "translate": tf.estimator.export.PredictOutput(output_train)
            })