def test_result_reduce_horovod(enable_pl_optimizer, tmpdir):
    """Check that logged results are reduced correctly when running under Horovod.

    Horovod counterpart of tests/core/test_results.py::_ddp_test_fn
    """
    tutils.reset_seed()
    tutils.set_random_master_port()

    def hvd_test_fn():
        # Put the directory two levels above this file on sys.path before
        # importing from the `tests` package below.
        this_dir = os.path.abspath(os.path.dirname(__file__))
        root_dir = os.path.abspath(os.path.join(this_dir, '..', '..'))
        sys.path.insert(0, os.path.abspath(root_dir))

        import horovod.torch as hvd

        from tests.base.boring_model import BoringModel

        class TestModel(BoringModel):

            def training_step(self, batch, batch_idx):
                self.training_step_called = True
                value = torch.tensor([1.0])
                self.log(
                    "test_tensor",
                    value,
                    sync_dist=True,
                    sync_dist_op='sum',
                    on_step=True,
                    on_epoch=True,
                )

                # Every rank logs 1.0 with a 'sum' reduction, so the synced
                # value must equal the number of Horovod workers.
                logged = self._results["test_tensor"]
                assert logged.item() == hvd.size(), \
                    "Result-Log does not work properly with Horovod and Tensors"

            def training_epoch_end(self, outputs) -> None:
                # training_step returns nothing, so no outputs are expected.
                assert len(outputs) == 0

        model = TestModel()
        model.val_dataloader = None

        trainer_kwargs = dict(
            default_root_dir=tmpdir,
            limit_train_batches=2,
            limit_val_batches=2,
            max_epochs=1,
            log_every_n_steps=1,
            weights_summary=None,
            enable_pl_optimizer=enable_pl_optimizer,
        )
        Trainer(**trainer_kwargs).fit(model)

    # Execute the check on two Horovod processes.
    horovod.run(hvd_test_fn, np=2)
def test_accuracy_metric_horovod():
    """Compare the Horovod-synchronised ``Accuracy`` metric with scikit-learn."""
    num_batches = 10
    batch_size = 16
    threshold = 0.5

    def sk_metric(preds, target):
        # Reference value: binarise predictions at `threshold`, then let
        # scikit-learn compute the accuracy on the flattened arrays.
        binarised = (preds.view(-1).numpy() >= threshold).astype(np.uint8)
        flat_target = target.view(-1).numpy()
        return accuracy_score(y_true=flat_target, y_pred=binarised)

    preds = torch.rand(num_batches, batch_size)
    target = torch.randint(high=2, size=(num_batches, batch_size))

    def _compute_batch():
        import horovod.torch as hvd

        trainer = Trainer(fast_dev_run=True, distributed_backend='horovod')
        backend = trainer.accelerator_connector.select_accelerator()
        assert isinstance(backend, HorovodAccelerator)

        metric = Accuracy(
            compute_on_step=True,
            dist_sync_on_step=True,
            dist_sync_fn=backend.gather_all_tensors,
            threshold=threshold,
        )

        rank = hvd.rank()
        world_size = hvd.size()

        # Each rank processes every `world_size`-th batch, starting at its own rank.
        for i in range(rank, num_batches, world_size):
            batch_result = metric(preds[i], target[i])
            if rank != 0:
                continue

            # Rank 0 rebuilds the batches seen by all ranks in this step and
            # checks the synced step result against the reference.
            peers = range(world_size)
            dist_preds = torch.stack([preds[i + r] for r in peers])
            dist_target = torch.stack([target[i + r] for r in peers])
            expected_step = sk_metric(dist_preds, dist_target)
            assert np.allclose(batch_result.numpy(), expected_step)

        # check on all batches on all ranks
        result = metric.compute()
        assert isinstance(result, torch.Tensor)

        batch_ids = range(num_batches)
        total_preds = torch.stack([preds[i] for i in batch_ids])
        total_target = torch.stack([target[i] for i in batch_ids])
        expected_total = sk_metric(total_preds, total_target)

        assert np.allclose(result.numpy(), expected_total)

    horovod.run(_compute_batch, np=2)
def test_accuracy_metric_horovod():
    """Validate ``Accuracy`` synced through the training type plugin against scikit-learn."""
    num_batches = 10
    batch_size = 16
    threshold = 0.5

    def sk_metric(preds, target):
        # scikit-learn reference on flattened inputs; predictions are
        # thresholded to 0/1 first.
        return accuracy_score(
            y_true=target.view(-1).numpy(),
            y_pred=(preds.view(-1).numpy() >= threshold).astype(np.uint8),
        )

    preds = torch.rand(num_batches, batch_size)
    target = torch.randint(high=2, size=(num_batches, batch_size))

    def _compute_batch():
        trainer = Trainer(
            fast_dev_run=True,
            accelerator='horovod',
        )

        assert isinstance(trainer.accelerator, CPUAccelerator)
        # TODO: test that we selected the correct training_type_plugin based on horovod flags

        gather_fn = trainer.training_type_plugin.gather_all_tensors
        metric = Accuracy(
            compute_on_step=True,
            dist_sync_on_step=True,
            dist_sync_fn=gather_fn,
            threshold=threshold,
        )

        def _stack_rows(tensor, indices):
            # Collect the selected rows of `tensor` into a new stacked tensor.
            return torch.stack([tensor[j] for j in indices])

        for i in range(hvd.rank(), num_batches, hvd.size()):
            batch_result = metric(preds[i], target[i])
            if hvd.rank() == 0:
                # Rows handled by all ranks during this step.
                dist_preds = _stack_rows(preds, [i + r for r in range(hvd.size())])
                dist_target = _stack_rows(target, [i + r for r in range(hvd.size())])
                sk_batch_result = sk_metric(dist_preds, dist_target)
                assert np.allclose(batch_result.numpy(), sk_batch_result)

        # check on all batches on all ranks
        result = metric.compute()
        assert isinstance(result, torch.Tensor)

        total_preds = _stack_rows(preds, range(num_batches))
        total_target = _stack_rows(target, range(num_batches))
        sk_result = sk_metric(total_preds, total_target)

        assert np.allclose(result.numpy(), sk_result)

    horovod.run(_compute_batch, np=2)
def test_run_with_hosts(self):
    """Happy path: two usable hosts with two slots each, passed via ``hosts``."""
    expected = [(0, 2), (1, 2)]
    actual = horovod.run(
        train,
        num_proc=2,
        min_num_proc=2,
        max_num_proc=2,
        hosts='localhost:2,127.0.0.1:2',
    )
    self.assertEqual(expected, actual)
def test_run_with_discovery_script(self):
    """Happy path: two usable hosts with two slots each, reported by a discovery script."""
    script_lines = [
        'echo "localhost:2"\n',
        'echo "127.0.0.1:2"\n',
    ]
    expected = [(0, 2), (1, 2)]

    with NamedTemporaryFile(mode='w') as script:
        script.writelines(script_lines)
        # Close the underlying file object before the script is made
        # executable and handed to horovod.run.
        script.file.close()
        os.chmod(script.name, 0o700)

        actual = horovod.run(
            train,
            num_proc=2,
            min_num_proc=2,
            max_num_proc=2,
            host_discovery_script=script.name,
        )
        self.assertEqual(expected, actual)
# NOTE(review): the statements up to the `__main__` guard are the tail of a
# function whose header is outside this view (presumably `main`, which is
# called below); their indentation is reconstructed -- confirm against the file.
    train(state)

    checkpoint_dir = './checkpoints'
    checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting it.
    if hvd.rank() == 0:
        checkpoint.save(checkpoint_dir)


# Script entry point: with exactly four command-line arguments the training is
# launched through horovod.run; otherwise main() is called directly.
if __name__ == '__main__':
    if len(sys.argv) == 5:
        # run training through horovod.run
        # Positional arguments, in order: num_proc, min_num_proc, max_num_proc, hosts.
        num_proc = int(sys.argv[1])
        min_num_proc = int(sys.argv[2])
        max_num_proc = int(sys.argv[3])
        hosts = sys.argv[4]

        print('Running training through horovod.run')
        horovod.run(main, num_proc=num_proc, min_num_proc=min_num_proc, max_num_proc=max_num_proc, hosts=hosts, use_gloo=True, verbose=2)
    else:
        # this is running via horovodrun
        main()
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1), ] # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. if hvd.rank() == 0: callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')) # Horovod: write logs on worker 0. verbose = 1 if hvd.rank() == 0 else 0 # Train the model. # Horovod: adjust number of steps based on number of GPUs. mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(), callbacks=callbacks, epochs=24, verbose=verbose) if __name__ == '__main__': if len(sys.argv) == 4: # run training through horovod.run np = int(sys.argv[1]) hosts = sys.argv[2] comm = sys.argv[3] print('Running training through horovod.run') horovod.run(main, np=np, hosts=hosts, use_gloo=comm == 'gloo', use_mpi=comm == 'mpi') else: # this is running via horovodrun main()
# NOTE(review): the statements up to the `__main__` guard are the tail of a
# function whose header is outside this view (presumably `main(args)`, which is
# called below); their indentation is reconstructed -- confirm against the file.
    if args.use_mixed_precision:
        # Initialize scaler in global scale
        scaler = torch.cuda.amp.GradScaler()

    # Epochs are numbered from 1 to args.epochs inclusive.
    for epoch in range(1, args.epochs + 1):
        if args.use_mixed_precision:
            train_mixed_precision(epoch, scaler)
        else:
            train_epoch(epoch)
        # Keep test in full precision since computation is relatively light.
        test()


# Script entry point: parse the CLI arguments, then either launch the training
# through horovod.run (when --num_proc is set) or call main(args) directly.
if __name__ == '__main__':
    args = parser.parse_args()
    # CUDA is used only when it is not disabled on the command line and is available.
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    if args.num_proc:
        # run training through horovod.run
        print('Running training through horovod.run')
        horovod.run(main, args=(args, ), np=args.num_proc, hosts=args.hosts, use_gloo=args.communication == 'gloo', use_mpi=args.communication == 'mpi')
    else:
        # this is running via horovodrun
        main(args)
_train_step(batch_idx, data, target) # Specific hvd hvd.shutdown() if __name__ == "__main__": parser = argparse.ArgumentParser("Torch Native - Horovod") parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument("--nproc_per_node", type=int, default=2) parser.add_argument("--log_interval", type=int, default=4) parser.add_argument("--nb_samples", type=int, default=128) parser.add_argument("--batch_size", type=int, default=16) args_parsed = parser.parse_args() args_parsed.cuda = not args_parsed.no_cuda and torch.cuda.is_available() config = { "log_interval": args_parsed.log_interval, "batch_size": args_parsed.batch_size, "nb_samples": args_parsed.nb_samples, } args = (args_parsed.nproc_per_node, args_parsed.cuda, config) # Specific hvd run(training, args=args, use_gloo=True, np=args_parsed.nproc_per_node)