Example #1
def test_horovod_state_dict(ray_start_4_cpus):
    def train_func(config):
        result = hvd_train_func(config)
        assert len(result) == epochs
        assert result[-1] < result[0]

    num_workers = 2
    epochs = 10
    scaling_config = ScalingConfig(num_workers=num_workers)
    config = {"num_epochs": epochs, "save_model_as_dict": True}
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    result = trainer.fit()
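    # Restore the trained weights from the checkpoint into a fresh Net for inference.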
    predictor = TorchPredictor.from_checkpoint(result.checkpoint, model=Net())

    # Find some test data to run on.
    test_set = datasets.MNIST(
        "./data",
        train=False,
        download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )

    test_dataloader = DataLoader(test_set, batch_size=10)
    test_dataloader_iter = iter(test_dataloader)
    images, labels = next(
        test_dataloader_iter
    )  # only running a batch inference of 10 images
    predicted_labels = run_image_prediction(predictor.model, images)
    assert torch.equal(predicted_labels, labels)
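Example #1 is taken from a test module, so it assumes imports and helpers defined elsewhere (hvd_train_func, Net, run_image_prediction, and the ray_start_4_cpus fixture). A minimal sketch of the imports it relies on is below; the module paths for ScalingConfig and TorchPredictor have shifted between Ray releases, so treat these as assumptions for Ray 2.x rather than the original file's exact header.

import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

from ray.air.config import ScalingConfig  # ray.train.ScalingConfig in newer releases
from ray.train.horovod import HorovodTrainer
from ray.train.torch import TorchPredictor

# hvd_train_func, Net, run_image_prediction, and the ray_start_4_cpus fixture are
# helpers from the original test module and are not reproduced here.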
Example #2
def main(num_workers, use_gpu, kwargs):
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=kwargs,
        scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=num_workers),
    )
    results = trainer.fit()
    print(results.metrics)
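Examples #2 and #3 reference a train_func defined elsewhere, and Example #1's hvd_train_func plays the same role. Below is a hypothetical minimal sketch of such a per-worker loop (synthetic data, a single nn.Linear model, all names assumed): initialize Horovod, wrap the optimizer so gradients are averaged across workers, broadcast the initial weights, and report metrics each epoch. The reporting call matches Ray AIR 2.x (ray.air.session.report); newer releases use ray.train.report, and the real function would also report a checkpoint so that result.checkpoint is populated.

import horovod.torch as hvd
import torch
import torch.nn as nn
from ray.air import session  # assumed API; ray.train.report in newer Ray versions


def train_func(config):
    hvd.init()  # one Horovod rank per Ray worker
    model = nn.Linear(4, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * hvd.size())
    # Average gradients across workers and sync the initial weights from rank 0.
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters()
    )
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    loss_fn = nn.MSELoss()
    x, y = torch.randn(32, 4), torch.randn(32, 1)
    losses = []
    for _ in range(config["num_epochs"]):
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        session.report({"loss": loss.item()})  # surfaced through results.metrics
    # Example #1's hvd_train_func returns one loss per epoch, checked by its asserts.
    return losses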
Example #3
def main(num_workers, use_gpu, kwargs):
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={
            "num_epochs": kwargs["num_epochs"],
            "log_interval": kwargs["log_interval"],
            "use_cuda": kwargs["use_cuda"],
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(result)
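The main() wrappers in Examples #2 and #3 still need an entry point that turns CLI flags into num_workers, use_gpu, and the kwargs dict. A hypothetical argparse wiring (flag names and defaults are assumptions, not part of the source) could look like this:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-workers", type=int, default=2)
    parser.add_argument("--use-gpu", action="store_true")
    parser.add_argument("--num-epochs", type=int, default=10)
    parser.add_argument("--log-interval", type=int, default=10)
    args = parser.parse_args()

    main(
        num_workers=args.num_workers,
        use_gpu=args.use_gpu,
        kwargs={
            "num_epochs": args.num_epochs,
            "log_interval": args.log_interval,
            "use_cuda": args.use_gpu,  # Example #3 forwards this flag to the train loop
        },
    )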