def get_gradients(self, loss, params):
    """
    Compute gradients of all trainable variables.

    See Optimizer.get_gradients() for more info.

    In DistributedOptimizer, get_gradients() is overridden to also
    push_pull the gradients before returning them.
    """
    gradients = super(self.__class__, self).get_gradients(loss, params)
    if bps.size() > 1:
        averaged_gradients = []
        with tf.name_scope(self._name + "_Push_Pull") as scope:
            for grad in gradients:
                if grad is not None:
                    if self._sparse_as_dense and \
                            isinstance(grad, tf.IndexedSlices):
                        grad = tf.convert_to_tensor(grad)
                    avg_grad = bps.push_pull(
                        grad, scope,
                        device_dense=self._device_dense,
                        device_sparse=self._device_sparse,
                        compression=self._compression)
                    averaged_gradients.append(avg_grad)
                else:
                    averaged_gradients.append(None)
        return averaged_gradients
    else:
        return gradients
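# A hedged usage sketch of the optimizer wrapper around get_gradients():
# the import paths below are assumed (byteps.tensorflow for size(), the
# Keras wrapper for DistributedOptimizer), mirroring the Horovod-style API.
import tensorflow as tf
import byteps.tensorflow as bps
import byteps.tensorflow.keras as bps_keras

bps.init()
opt = tf.keras.optimizers.SGD(0.01 * bps.size())  # scale lr by worker count
opt = bps_keras.DistributedOptimizer(opt)         # get_gradients() now push_pulls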
def _byteps_average_metrics_in_place(self, logs):
    logs = logs or {}
    reduced_logs = {}
    import byteps.tensorflow as bps
    if self._allreduce_ranks <= 1.:
        self._allreduce_ranks = float(bps.size())
    # Reduce every metric among workers. Sort metrics by name
    # to ensure consistent order.
    for metric, value in sorted(logs.items()):
        from tensorflow.python.eager import context
        if context.executing_eagerly():
            with tf.device(self._device):
                reduced_logs[metric] = bps.push_pull(
                    K.constant(value, name=metric)).numpy()
        else:
            if metric not in self._m_vars:
                with tf.name_scope('MetricAverageCallback') as scope:
                    var = tf.Variable(value, name=metric)
                    K.get_session().run(var.initializer)
                    self._m_vars[metric] = var
                    self._allreduce_ops[metric] = bps.push_pull(
                        var, scope, device_dense=self._device)
            else:
                K.set_value(self._m_vars[metric], value)
            reduced_logs[metric] = K.get_session().run(
                self._allreduce_ops[metric])
    # Write the reduced values back into the logs dictionary
    # for other callbacks to use.
    for metric, value in reduced_logs.items():
        logs[metric] = value / self._allreduce_ranks
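# The division by _allreduce_ranks above assumes the reduction yields the
# cross-worker sum; a toy illustration of the arithmetic (plain Python,
# no BytePS, illustrative values):
worker_values = [0.90, 0.92, 0.88, 0.94]  # e.g. per-worker accuracy
summed = sum(worker_values)               # what a sum-reduction produces
print(summed / len(worker_values))        # 0.91, the averaged metric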
def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step,
                             number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
        (bps.size(), device,
         bps.size() * img_sec_mean, bps.size() * img_sec_conf))
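# run() expects a zero-argument callable that performs one synchronous
# training step; a hypothetical graph-mode invocation (session and train_op
# are assumed names from the surrounding script):
run(lambda: session.run(train_op))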
def reduce_implementation(self, reduce_op, per_replica_value, destinations):
    if tf_cross_device_ops.check_destinations(destinations):
        devices = tf_cross_device_ops.get_devices_from(destinations)
    else:
        devices = tf_cross_device_ops.get_devices_from(per_replica_value)

    reduce_to_device = devices[0]
    logging.log_first_n(
        logging.INFO, "Using byteps push pull to aggregate values", 1)
    reduced = _simple_reduce(per_replica_value, reduce_to_device,
                             self.accumulation_fn, reduce_op)
    if size() > 1:
        reduced = _push_pull(reduced)

    return reduced
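# The two-stage pattern above in isolation: reduce the per-replica tensors
# onto one local device, then aggregate across workers. A sum-only sketch
# with illustrative names (two_stage_reduce is not part of the module):
import tensorflow as tf

def two_stage_reduce(per_replica_tensors, device):
    with tf.device(device):
        local_sum = tf.add_n(per_replica_tensors)  # stage 1: local reduce
    # stage 2: cross-worker aggregation via BytePS, skipped for one worker
    return _push_pull(local_sum) if size() > 1 else local_sum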
def on_batch_end(self, batch, logs=None):
    if self.broadcast_done:
        return

    if bps.size() <= 1:
        return

    with tf.device(self.device):
        if bps._executing_eagerly() and hasattr(self.model, 'variables'):
            # TensorFlow 2.0 or TensorFlow eager
            bps.broadcast_variables(self.model.variables,
                                    root_rank=self.root_rank)
            bps.broadcast_variables(self.model.optimizer.variables(),
                                    root_rank=self.root_rank)
        else:
            bcast_op = bps.broadcast_global_variables(self.root_rank)
            self.backend.get_session().run(bcast_op)

    self.broadcast_done = True
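# Hedged usage sketch: the on_batch_end hook above typically lives in a
# broadcast callback passed to model.fit(); the import path and callback
# class name are assumed to mirror the Horovod-style BytePS Keras API, and
# model/x_train/y_train are assumed names from the surrounding script.
import byteps.tensorflow.keras as bps_keras

callbacks = [bps_keras.callbacks.BroadcastGlobalVariablesCallback(0)]
model.fit(x_train, y_train, batch_size=32, epochs=1, callbacks=callbacks)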
def _push_pull(self, gradients):
    self._aggregated_gradients = True
    if bps.size() > 1:
        averaged_gradients = []
        with tf.name_scope(self._name + "_Push_Pull") as scope:
            for grad in gradients:
                if grad is not None:
                    if self._sparse_as_dense and \
                            isinstance(grad, tf.IndexedSlices):
                        grad = tf.convert_to_tensor(grad)
                    avg_grad = bps.push_pull(
                        grad, scope,
                        device_dense=self._device_dense,
                        device_sparse=self._device_sparse,
                        compression=self._compression)
                    averaged_gradients.append(avg_grad)
                else:
                    averaged_gradients.append(None)
        return averaged_gradients
    else:
        return gradients
def _push_pull(self, grads):
    self._aggregated_gradients = True
    import byteps.tensorflow as bps
    if bps.size() > 1:
        averaged_gradients = []
        with tf.name_scope(
                "DistributedLossScaleOptimizer_Push_Pull") as scope:
            for grad in grads:
                if grad is not None:
                    if self._sparse_as_dense and isinstance(
                            grad, tf.IndexedSlices):
                        grad = tf.convert_to_tensor(grad)
                    avg_grad = bps.push_pull(
                        grad, scope,
                        device_dense=self._device_dense,
                        device_sparse=self._device_sparse,
                        compression=self._compression)
                    averaged_gradients.append(avg_grad)
                else:
                    averaged_gradients.append(None)
        return averaged_gradients
    else:
        return grads
def main(_):
    # BytePS: initialize BytePS.
    bps.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race condition among
    # the workers that share the same filesystem. If the directory already
    # exists by the time this worker gets around to creating it, ignore the
    # resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % bps.rank())

    # The shape of downloaded data is (-1, 28, 28), so we need to reshape it
    # into (-1, 784) to feed into our network. We also need to normalize the
    # features to the range [0, 1].
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # BytePS: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * bps.size())

    # BytePS: add BytePS Distributed Optimizer.
    opt = bps.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # BytePS: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        bps.BroadcastGlobalVariablesHook(0),

        # BytePS: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=200000 // bps.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # BytePS: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())

    # BytePS: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if bps.rank() == 0 else None

    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
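# main() relies on a train_input_generator helper defined elsewhere in the
# script; a hedged sketch of what such a helper might look like
# (hypothetical implementation: reshuffle each pass, yield batches forever):
import numpy as np

def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        # Reshuffle once per pass over the data.
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield (x_train[index:index + batch_size],
                   y_train[index:index + batch_size])
            index += batch_size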
def loss_function():
    logits = model(data, training=True)
    return tf.losses.sparse_softmax_cross_entropy(target, logits)


def log(s, nl=True):
    if bps.rank() != 0:
        return
    print(s, end='\n' if nl else '')
    sys.stdout.flush()


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, bps.size()))


def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step,
                             number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)
def multiplier(epoch):
    # Adjust epoch to produce round numbers at the end of each epoch,
    # so that TensorBoard learning rate graphs look better.
    epoch += 1. / self.steps_per_epoch
    return 1. / bps.size() * (epoch * (bps.size() - 1) / warmup_epochs + 1)
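# Quick check of the warmup schedule's endpoints (standalone sketch with
# illustrative values): at epoch ~0 each worker starts near lr/size, and at
# epoch == warmup_epochs the multiplier reaches 1, the full scaled rate.
size, warmup_epochs = 4, 5.0
for epoch in (0.0, warmup_epochs):
    print(1.0 / size * (epoch * (size - 1) / warmup_epochs + 1))  # 0.25, 1.0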
dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
     tf.cast(mnist_labels, tf.int64)))
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])
loss = tf.losses.SparseCategoricalCrossentropy()

opt = tf.optimizers.Adam(0.001 * bps.size())

checkpoint_dir = './checkpoints'
checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)


@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    tape = bps.DistributedGradientTape(tape)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
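# A hypothetical driver loop for training_step above (the step count and
# logging cadence are illustrative, not from the original script):
for batch, (images, labels) in enumerate(dataset.take(1000 // bps.size())):
    training_step(images, labels, batch == 0)
    if batch % 10 == 0 and bps.rank() == 0:
        print('Step #%d' % batch)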
def on_train_begin(self, logs=None):
    if bps.size() <= 1:
        return
    with tf.device(self.device):
        bcast_op = bps.broadcast_global_variables(self.root_rank)
        self.backend.get_session().run(bcast_op)
# Note: broadcast is done after the first gradient step to ensure optimizer
# initialization.
if first_batch:
    bps.broadcast_variables(model.variables, root_rank=0)
    bps.broadcast_variables(opt.variables(), root_rank=0)


def log(s, nl=True):
    if bps.rank() != 0:
        return
    print(s, end='\n' if nl else '')


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, bps.size()))

with tf.device(device):
    # Warm-up
    log('Running warmup...')
    benchmark_step(first_batch=True)
    timeit.timeit(lambda: benchmark_step(first_batch=False),
                  number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(lambda: benchmark_step(first_batch=False),
                             number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
def main(_):
    bps.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = modeling.BertConfig(256)
    model_fn = model_fn_builder(bert_config=bert_config,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps)

    max_seq_length = FLAGS.max_seq_length
    max_predictions_per_seq = FLAGS.max_predictions_per_seq

    with tf.name_scope("input"):
        input_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        input_mask = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        segment_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        masked_lm_positions = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.int32)
        masked_lm_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.int32)
        masked_lm_weights = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.float32)
        next_sentence_labels = tf.placeholder(
            shape=[FLAGS.train_batch_size, 1], dtype=tf.int32)

    features = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "masked_lm_positions": masked_lm_positions,
        "masked_lm_ids": masked_lm_ids,
        "masked_lm_weights": masked_lm_weights,
        "next_sentence_labels": next_sentence_labels
    }

    train_op = model_fn(features, None, None, None)
    infer_shape_ops = add_infer_shape_ops()

    hooks = [
        # BytePS: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        bps.BroadcastGlobalVariablesHook(0),

        # BytePS: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=205 // bps.size()),
    ]

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())

    training_batch_generator = train_input_generator(features)
    with tf.train.MonitoredTrainingSession(hooks=hooks,
                                           config=config) as mon_sess:
        mon_sess = TimelineSession(mon_sess, infer_shape_ops)
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            feed_dict = next(training_batch_generator)
            mon_sess.run([train_op], feed_dict=feed_dict)