Example #1
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

from ray.train import ScalingConfig
from ray.train.horovod import HorovodTrainer
from ray.train.torch import TorchPredictor


# `ray_start_4_cpus` is a pytest fixture; `hvd_train_func`, `Net`, and
# `run_image_prediction` are helpers defined elsewhere in the test module.
def test_horovod_state_dict(ray_start_4_cpus):
    def train_func(config):
        result = hvd_train_func(config)
        assert len(result) == epochs
        assert result[-1] < result[0]

    num_workers = 2
    epochs = 10
    scaling_config = ScalingConfig(num_workers=num_workers)
    config = {"num_epochs": epochs, "save_model_as_dict": True}
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    result = trainer.fit()
    predictor = TorchPredictor.from_checkpoint(result.checkpoint, model=Net())

    # Find some test data to run on.
    test_set = datasets.MNIST(
        "./data",
        train=False,
        download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )

    test_dataloader = DataLoader(test_set, batch_size=10)
    test_dataloader_iter = iter(test_dataloader)
    # Run inference on a single batch of 10 images.
    images, labels = next(test_dataloader_iter)
    predicted_labels = run_image_prediction(predictor.model, images)
    assert torch.equal(predicted_labels, labels)
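The `run_image_prediction` helper is not shown in this excerpt. A minimal sketch of what such a helper might look like, assuming the model maps an image batch to class logits:

import torch


def run_image_prediction(model: torch.nn.Module, images: torch.Tensor) -> torch.Tensor:
    """Return the predicted class index for each image in the batch."""
    model.eval()
    with torch.no_grad():
        # argmax over the class dimension turns logits into label indices,
        # matching the integer labels from the MNIST DataLoader.
        return torch.argmax(model(images), dim=1)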
Example #2
from ray.train import ScalingConfig
from ray.train.horovod import HorovodTrainer


def main(num_workers, use_gpu, kwargs):
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=kwargs,
        scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=num_workers),
    )
    results = trainer.fit()
    print(results.metrics)
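Examples #2 and #3 assume a `train_func` defined elsewhere in the script. A minimal sketch of a Horovod training loop that would fit this trainer; the linear model, random batch, and hyperparameter defaults below are illustrative assumptions, not taken from the source:

import horovod.torch as hvd
import torch
import torch.nn as nn
import ray.train


def train_func(config):
    hvd.init()  # set up the Horovod communicator on this Ray worker
    model = nn.Linear(8, 1)  # placeholder model
    optimizer = torch.optim.SGD(model.parameters(), lr=config.get("lr", 0.01))
    # Average gradients across workers on each step.
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters()
    )
    # Start every worker from rank 0's weights.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    loss_fn = nn.MSELoss()
    for epoch in range(config.get("num_epochs", 1)):
        inputs, targets = torch.randn(32, 8), torch.randn(32, 1)  # placeholder batch
        optimizer.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        optimizer.step()
        # Reported metrics appear in results.metrics after trainer.fit().
        ray.train.report({"loss": loss.item(), "epoch": epoch})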
Example #3
from ray.train import ScalingConfig
from ray.train.horovod import HorovodTrainer


def main(num_workers, use_gpu, kwargs):
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={
            "num_epochs": kwargs["num_epochs"],
            "log_interval": kwargs["log_interval"],
            "use_cuda": kwargs["use_cuda"],
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(result)
Example #4
from ray import tune
from ray.train import ScalingConfig
from ray.train.horovod import HorovodTrainer
from ray.tune import Tuner, TuneConfig


def tune_horovod(num_workers, num_samples, use_gpu, mode="square", x_max=1.0):
    horovod_trainer = HorovodTrainer(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        train_loop_config={"mode": mode, "x_max": x_max},
    )

    tuner = Tuner(
        horovod_trainer,
        param_space={"train_loop_config": {
            "lr": tune.uniform(0.1, 1)
        }},
        tune_config=TuneConfig(mode="min",
                               metric="loss",
                               num_samples=num_samples),
        _tuner_kwargs={"fail_fast": True},
    )

    result_grid = tuner.fit()

    print("Best hyperparameters found were: ",
          result_grid.get_best_result().config)
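`train_loop_per_worker` is defined elsewhere in the script; since the Tuner selects on a reported "loss" metric, the loop must call `ray.train.report`. A minimal sketch under that assumption, with a toy scalar objective that is illustrative rather than the source's:

import horovod.torch as hvd
import torch
import ray.train


def train_loop_per_worker(config):
    hvd.init()
    # Toy objective: drive a scalar x toward 0 from a starting point x_max.
    x = torch.tensor([config["x_max"]], requires_grad=True)
    optimizer = torch.optim.SGD([x], lr=config["lr"])  # "lr" comes from the Tuner's param_space
    for _ in range(100):
        optimizer.zero_grad()
        if config["mode"] == "square":
            loss = (x ** 2).sum()
        else:
            loss = x.abs().sum()
        loss.backward()
        optimizer.step()
        # TuneConfig(metric="loss", mode="min") reads this reported value.
        ray.train.report({"loss": loss.item()})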
Example #5
    # `args`, `CIFAR10_STATS`, and `train_loop_per_worker` are defined
    # earlier in the surrounding script.
    transform_train = transforms.Compose(
        [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(CIFAR10_STATS["mean"], CIFAR10_STATS["std"]),
        ]
    )  # meanstd transformation

    dataset = torchvision.datasets.CIFAR10(
        root="/tmp/data_cifar", train=True, download=True, transform=transform_train
    )

    horovod_trainer = HorovodTrainer(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config=ScalingConfig(
            use_gpu=not args.smoke_test,
            num_workers=2 if args.smoke_test else 4,
        ),
        train_loop_config={"batch_size": 64, "data": ray.put(dataset)},
    )

    # ensure that checkpointing works.
    pbt = create_scheduler(
        "pbt",
        perturbation_interval=2,
        hyperparam_mutations={
            "train_loop_config": {"lr": tune.uniform(0.001, 0.1)},
        },
    )

    tuner = Tuner(
        horovod_trainer,
        # (Truncated in the source; a plausible completion, mirroring the
        # Tuner call in Example #4 with the PBT scheduler attached.)
        param_space={"train_loop_config": {"lr": tune.uniform(0.001, 0.1)}},
        tune_config=TuneConfig(metric="loss", mode="min", scheduler=pbt),
    )
    result_grid = tuner.fit()
Example #6
    # As in Example #5, `args`, `CIFAR10_STATS`, and `train_loop_per_worker`
    # are defined earlier in the surrounding script.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR10_STATS["mean"], CIFAR10_STATS["std"]),
    ])  # meanstd transformation

    dataset = torchvision.datasets.CIFAR10(root="/tmp/data_cifar",
                                           train=True,
                                           download=True,
                                           transform=transform_train)

    horovod_trainer = HorovodTrainer(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config=ScalingConfig(
            use_gpu=not args.smoke_test,
            num_workers=2 if args.smoke_test else 4,
        ),
        train_loop_config={
            "batch_size": 64,
            "data": ray.put(dataset)
        },
    )

    # ensure that checkpointing works.
    pbt = create_scheduler(
        "pbt",
        perturbation_interval=2,
        hyperparam_mutations={
            "train_loop_config": {
                "lr": tune.uniform(0.001, 0.1)
            },
        },
    )
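Examples #5 and #6 ship the CIFAR-10 dataset to the workers via `ray.put(dataset)`, so the (elsewhere-defined) `train_loop_per_worker` fetches it with `ray.get`. A minimal sketch of that pattern; the placeholder model, default learning rate, and epoch count are illustrative assumptions:

import horovod.torch as hvd
import ray
import ray.train
import torch
import torch.nn as nn
from torch.utils.data import DataLoader


def train_loop_per_worker(config):
    hvd.init()
    # config["data"] is the ObjectRef produced by ray.put(dataset); ray.get()
    # fetches the dataset from the object store on each worker.
    dataset = ray.get(config["data"])
    loader = DataLoader(dataset, batch_size=config["batch_size"], shuffle=True)

    model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))  # placeholder
    optimizer = torch.optim.SGD(model.parameters(), lr=config.get("lr", 0.01))
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters()
    )
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(config.get("num_epochs", 1)):
        for images, labels in loader:
            optimizer.zero_grad()
            loss = loss_fn(model(images), labels)
            loss.backward()
            optimizer.step()
        # PBT uses the reported "loss" to decide which trials to exploit.
        ray.train.report({"loss": loss.item(), "epoch": epoch})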