def test_train_model_lr_schedule(self):
    initial_lr = 0.1 * hvd.size()
    opt = tf.keras.optimizers.Adam()
    opt = hvd.DistributedOptimizer(opt)

    def linear_multiplier(epoch):
        return epoch

    model = keras.models.Sequential()
    model.add(keras.layers.Dense(2, input_shape=(3,)))
    model.add(keras.layers.RepeatVector(3))
    model.add(keras.layers.ThresholdedReLU(0.5))
    model.compile(loss=keras.losses.mean_squared_error,
                  optimizer=opt,
                  metrics=[keras.metrics.categorical_accuracy],
                  experimental_run_tf_function=False)
    x = np.random.random((10, 3))
    y = np.random.random((10, 3, 2))

    class StoreLearningRateCallback(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            lr = self.model.optimizer.lr.numpy()

            # test learning rate warmup
            if epoch >= 0 and epoch < 5:
                assert lr <= initial_lr or np.isclose(lr, initial_lr)

            # test learning rate schedule callbacks
            if epoch > 5 and epoch < 10:
                assert lr <= initial_lr * 1e-1 or np.isclose(lr, initial_lr * 1e-1)
            if epoch > 10 and epoch < 15:
                assert lr < initial_lr * 1e-2 or np.isclose(lr, initial_lr * 1e-2)
            if epoch >= 15 and epoch < 20:
                assert np.isclose(lr, initial_lr * linear_multiplier(epoch))

    # No assertions needed for BroadcastGlobalVariablesCallback;
    # we just need to verify that it doesn't hang or error.
    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        hvd.callbacks.LearningRateWarmupCallback(initial_lr=initial_lr,
                                                 warmup_epochs=5),
        hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr,
                                                   multiplier=1e-1,
                                                   start_epoch=5,
                                                   end_epoch=10),
        hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr,
                                                   multiplier=1e-2,
                                                   start_epoch=10,
                                                   end_epoch=15),
        hvd.callbacks.LearningRateScheduleCallback(initial_lr=initial_lr,
                                                   multiplier=linear_multiplier,
                                                   start_epoch=15,
                                                   end_epoch=20),
        StoreLearningRateCallback()
    ]

    train_history = model.fit(x, y,
                              steps_per_epoch=5,
                              callbacks=callbacks,
                              epochs=20)

    # test that the metric averaging is being respected:
    # the averaged loss history must be identical on every rank.
    loss_metrics = train_history.history["loss"]
    loss_metrics_tensor = tf.convert_to_tensor(loss_metrics, dtype=tf.float32)
    expected_loss_metrics_tensor = hvd.broadcast(loss_metrics_tensor, root_rank=0)
    self.assertAllClose(expected_loss_metrics_tensor, loss_metrics_tensor)
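# A minimal sketch (illustrative only, not part of the test) of the per-epoch
# learning-rate bands that the assertions above encode. `expected_lr_band` is a
# hypothetical helper; the first three bands only capture the upper bounds the
# assertions check, since the exact per-batch ramp is handled by the callbacks.
def expected_lr_band(epoch, initial_lr):
    if epoch < 5:
        # LearningRateWarmupCallback: lr ramps up but should not exceed initial_lr
        return (0.0, initial_lr)
    elif epoch < 10:
        # first LearningRateScheduleCallback: at most initial_lr * 1e-1
        return (0.0, initial_lr * 1e-1)
    elif epoch < 15:
        # second LearningRateScheduleCallback: at most initial_lr * 1e-2
        return (0.0, initial_lr * 1e-2)
    else:
        # linear multiplier: exactly initial_lr * epoch
        target = initial_lr * epoch
        return (target, target)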
resume_from_epoch = 0
if args.use_checkpointing:
    # Checkpointing should only be done on the root worker.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))
        resume_from_epoch = restart_epoch(args)

    # Broadcast `resume_from_epoch` from the first (root) process to all others.
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0)

# Create/load the model.
model = create_model(resume_from_epoch)

# Train the model.
model.fit_generator(train_iter,
                    # Keep the total number of steps the same despite the increased number of workers.
                    steps_per_epoch=len(train_iter) // hvd.size(),
                    callbacks=callbacks,
                    epochs=args.epochs,
                    verbose=verbose,
                    workers=4,
                    initial_epoch=resume_from_epoch,
                    validation_data=test_iter,
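# A possible implementation of the `restart_epoch(args)` helper used above (a
# sketch, not from the original): it assumes `args.checkpoint_format` is a
# Keras-style pattern such as 'checkpoint-{epoch}.h5' and returns the most
# recent epoch for which a checkpoint file already exists.
import os

def restart_epoch(args):
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            return try_epoch
    return 0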
import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

# Ensure only 1 process downloads the data on each node; the dummy broadcast
# acts as a barrier so the other local ranks wait until the download finishes.
if hvd.local_rank() == 0:
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    hvd.broadcast(0, 0)
else:
    hvd.broadcast(0, 0)
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Data partition for different workers
num_pics_per_rank = x_train.shape[0] // hvd.size()
pic_begin = num_pics_per_rank * hvd.rank()
pic_end = pic_begin + num_pics_per_rank
x_train = x_train[pic_begin:pic_end, ]
y_train = y_train[pic_begin:pic_end, ]

x_train, x_test = x_train / 255.0, x_test / 255.0

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(10),
])
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Use hvd.size() (the number of workers) to scale the learning rate and wrap
# the optimizer with the DistributedOptimizer class provided by Horovod.
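# A minimal sketch (an assumed continuation, not from the original) of what the
# comment above describes: scale a base learning rate by hvd.size() and wrap
# the optimizer in hvd.DistributedOptimizer before compiling. The base learning
# rate of 0.001 and the fit() hyperparameters are illustrative assumptions.
scaled_lr = 0.001 * hvd.size()
opt = hvd.DistributedOptimizer(tf.keras.optimizers.Adam(scaled_lr))

model.compile(loss=loss,
              optimizer=opt,
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=32,
          epochs=4,
          # Broadcast initial variable states from rank 0 so all workers start in sync.
          callbacks=[hvd.callbacks.BroadcastGlobalVariablesCallback(0)],
          # Only the root worker prints progress, to avoid duplicated output.
          verbose=1 if hvd.rank() == 0 else 0)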
    .map(preprocess, num_parallel_calls=AUTOTUNE)
    .batch(args.val_batch_size))

# Look for a pre-existing checkpoint from which to resume training
existing_checkpoints_dir = pathlib.Path(args.read_checkpoints_from)
checkpoint_filepath = None
initial_epoch = 0
for _most_recent_epoch in range(args.epochs, 0, -1):
    _checkpoint_filepath = f"{existing_checkpoints_dir}/checkpoint-epoch-{_most_recent_epoch:02d}.h5"
    if os.path.exists(_checkpoint_filepath):
        checkpoint_filepath = _checkpoint_filepath
        initial_epoch = _most_recent_epoch
        break

# Make sure that all workers agree to resume training from the same epoch
initial_epoch = hvd.broadcast(initial_epoch, root_rank=0, name='initial_epoch')

_loss_fn = (keras.losses
            .CategoricalCrossentropy())

# Adjust the initial learning rate based on the number of "effective GPUs".
_global_batch_size = args.batch_size * hvd.size()
_n_effective_gpus = _global_batch_size // args.base_batch_size
_initial_lr = args.base_lr * _n_effective_gpus

_optimizer = (keras.optimizers
              .SGD(lr=_initial_lr, momentum=args.momentum))
_distributed_optimizer = hvd.DistributedOptimizer(_optimizer)

_metrics = [
    keras.metrics.CategoricalAccuracy(),
    keras.metrics.TopKCategoricalAccuracy(k=5)