def test_horovod_state_dict(ray_start_4_cpus):
    """Train with ``save_model_as_dict`` enabled and check the checkpoint.

    Fits a two-worker ``HorovodTrainer``, rebuilds a ``TorchPredictor`` from
    the resulting checkpoint (supplying a fresh ``Net`` as the model), and
    asserts that the predictions for the first batch of 10 MNIST test images
    match the dataset labels exactly.

    Args:
        ray_start_4_cpus: Not referenced in the body; presumably a pytest
            fixture that provides the Ray cluster -- confirm in conftest.
    """
    epochs = 10

    def train_func(config):
        # Wrap the shared training loop with sanity checks: one value per
        # epoch, and the final value lower than the first (presumably the
        # per-epoch loss -- confirm against hvd_train_func).
        per_epoch = hvd_train_func(config)
        assert len(per_epoch) == epochs
        assert per_epoch[-1] < per_epoch[0]

    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={"num_epochs": epochs, "save_model_as_dict": True},
        scaling_config=ScalingConfig(num_workers=2),
    )
    fit_result = trainer.fit()

    predictor = TorchPredictor.from_checkpoint(fit_result.checkpoint, model=Net())

    # Load the MNIST test split to get data to run inference on.
    preprocessing = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
    )
    mnist_test = datasets.MNIST(
        "./data",
        train=False,
        download=True,
        transform=preprocessing,
    )

    # Only a single batch of 10 images is used for inference.
    images, labels = next(iter(DataLoader(mnist_test, batch_size=10)))

    predicted_labels = run_image_prediction(predictor.model, images)
    assert torch.equal(predicted_labels, labels)
def main(num_workers, use_gpu, kwargs):
    """Fit a ``HorovodTrainer`` on ``train_func`` and print the metrics.

    Args:
        num_workers: Forwarded to ``ScalingConfig(num_workers=...)``.
        use_gpu: Forwarded to ``ScalingConfig(use_gpu=...)``.
        kwargs: Passed unchanged as the trainer's ``train_loop_config``.
    """
    scaling = ScalingConfig(use_gpu=use_gpu, num_workers=num_workers)
    horovod_trainer = HorovodTrainer(
        train_func,
        train_loop_config=kwargs,
        scaling_config=scaling,
    )
    fit_output = horovod_trainer.fit()
    print(fit_output.metrics)
def main(num_workers, use_gpu, kwargs):
    """Fit a ``HorovodTrainer`` on ``train_func`` and print the fit result.

    Only the ``"num_epochs"``, ``"log_interval"`` and ``"use_cuda"`` entries
    of ``kwargs`` are forwarded to the training loop; any other keys are
    ignored.

    Args:
        num_workers: Forwarded to ``ScalingConfig(num_workers=...)``.
        use_gpu: Forwarded to ``ScalingConfig(use_gpu=...)``.
        kwargs: Mapping that must contain the three keys listed above.

    Raises:
        KeyError: If ``kwargs`` lacks one of the required keys.
    """
    # Pick out just the settings the training loop is given.
    loop_config = {
        key: kwargs[key] for key in ("num_epochs", "log_interval", "use_cuda")
    }
    scaling = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)

    horovod_trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=loop_config,
        scaling_config=scaling,
    )
    fit_output = horovod_trainer.fit()
    print(fit_output)