Example #1
def test_validation(ray_start_2_cpus):
    def bad_func(a, b, c):
        return 1

    t_cls = DistributedTrainableCreator(bad_func, num_slots=2)
    with pytest.raises(ValueError):
        t_cls()
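
The ValueError here comes from signature validation: the wrapped function must take a single config dict (optionally followed by a checkpoint_dir). A hedged sketch of a valid training function, in the spirit of the _train_simple helper used by the later tests (the real helper may differ):

# Hypothetical sketch of a valid training function for DistributedTrainableCreator.
import horovod.torch as hvd
from ray import tune


def _train_simple(config, checkpoint_dir=None):
    hvd.init()  # one Horovod process per slot
    for _ in range(config.get("epochs", 1)):
        # ... run one training step ...
        tune.report(loss=0.0, rank=hvd.rank())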
Example #2
def tune_horovod(hosts_per_trial,
                 slots_per_host,
                 num_samples,
                 use_gpu,
                 mode="square",
                 x_max=1.0):
    horovod_trainable = DistributedTrainableCreator(
        train,
        use_gpu=use_gpu,
        num_hosts=hosts_per_trial,
        num_slots=slots_per_host,
        replicate_pem=False,
    )
    analysis = tune.run(
        horovod_trainable,
        metric="loss",
        mode="min",
        config={
            "lr": tune.uniform(0.1, 1),
            "mode": mode,
            "x_max": x_max
        },
        num_samples=num_samples,
        fail_fast=True,
    )
    print("Best hyperparameters found were: ", analysis.best_config)
Example #3
def test_step_after_completion(ray_start_2_cpus):
    trainable_cls = DistributedTrainableCreator(
        _train_simple, num_hosts=1, num_slots=2)
    trainer = trainable_cls(config={"epochs": 1})
    with pytest.raises(RuntimeError):
        for _ in range(10):
            trainer.train()
Example #4
def test_single_step(ray_start_2_cpus):
    trainable_cls = DistributedTrainableCreator(_train_simple,
                                                num_hosts=1,
                                                num_slots=2)
    trainer = trainable_cls()
    trainer.train()
    trainer.stop()
Example #5
def test_simple_tune(ray_start_4_cpus, enabled_checkpoint):
    trainable_cls = DistributedTrainableCreator(_train_simple, num_slots=2)
    analysis = tune.run(trainable_cls,
                        config={"enable_checkpoint": enabled_checkpoint},
                        num_samples=2,
                        stop={"training_iteration": 2})
    assert analysis.trials[0].last_result["training_iteration"] == 2
    assert analysis.trials[0].has_checkpoint() == enabled_checkpoint
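
A minimal sketch of how a training function could honor the enable_checkpoint flag exercised above, using the distributed_checkpoint_dir helper shown in Example #8; the function body and the import path are assumptions and may vary across Ray versions:

import os

import torch
from ray import tune
# Import path for distributed_checkpoint_dir may differ between Ray versions.
from ray.tune.integration.horovod import distributed_checkpoint_dir


def _train_with_optional_checkpoint(config, checkpoint_dir=None):
    net = torch.nn.Linear(1, 1)
    for epoch in range(2):
        # ... train one epoch ...
        if config.get("enable_checkpoint"):
            with distributed_checkpoint_dir(step=epoch) as ckpt_dir:
                torch.save(net.state_dict(),
                           os.path.join(ckpt_dir, "checkpoint"))
        tune.report(loss=0.0)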
Example #6
def test_resource_tune(ray_connect_cluster, use_gpu):
    if use_gpu and ray.cluster_resources().get("GPU", 0) == 0:
        pytest.skip("No GPU available.")
    trainable_cls = DistributedTrainableCreator(
        _train_simple, num_workers=2, use_gpu=use_gpu
    )
    analysis = tune.run(trainable_cls, num_samples=2, stop={"training_iteration": 2})
    assert analysis.trials[0].last_result["training_iteration"] == 2
Example #7
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test",
                        action="store_true",
                        help=("Finish quickly for testing."))
    args = parser.parse_args()

    if args.smoke_test:
        ray.init()
    else:
        ray.init(address="auto")  # assumes ray is started with ray up

    horovod_trainable = DistributedTrainableCreator(
        train,
        use_gpu=not args.smoke_test,
        num_hosts=1 if args.smoke_test else 2,
        num_workers=2,
        replicate_pem=False,
        timeout_s=300,
    )

    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR10_STATS["mean"], CIFAR10_STATS["std"]),
    ])  # meanstd transformation

    dataset = torchvision.datasets.CIFAR10(root="/tmp/data_cifar",
                                           train=True,
                                           download=True,
                                           transform=transform_train)
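
Inside the Horovod training function, a dataset like the one built above is usually sharded across workers with a DistributedSampler; a hedged sketch of that pattern (the helper name is illustrative, and hvd.init() is assumed to have been called already):

import horovod.torch as hvd
import torch


def make_loader(dataset, batch_size=128):
    # Each Horovod worker reads a disjoint shard of the dataset.
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=hvd.size(), rank=hvd.rank())
    return torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=sampler)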
Example #8
        with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
            print("this checkpoint dir: ", checkpoint_dir)
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict(), epoch), path)


if __name__ == "__main__":
    if args.smoke_test:
        ray.init()
    else:
        ray.init(address="auto")  # assumes ray is started with ray up

    horovod_trainable = DistributedTrainableCreator(
        train,
        use_gpu=True,
        num_hosts=1 if args.smoke_test else 2,
        num_slots=2,
        replicate_pem=False)

    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR10_STATS["mean"], CIFAR10_STATS["std"]),
    ])  # meanstd transformation

    dataset = torchvision.datasets.CIFAR10(root="/tmp/data_cifar",
                                           train=True,
                                           download=True,
                                           transform=transform_train)
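
The distributed_checkpoint_dir block at the top of this example only saves state; a hedged sketch of the matching restore side, assuming the training function also receives Tune's checkpoint_dir argument (the helper name is illustrative):

import os

import torch


def maybe_restore(net, optimizer, checkpoint_dir):
    # Mirrors the save above: the checkpoint is a
    # (model_state, optimizer_state, epoch) tuple.
    start_epoch = 0
    if checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        model_state, optimizer_state, saved_epoch = torch.load(path)
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        start_epoch = saved_epoch + 1
    return start_epoch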
Example #9
local_dir = ""
if args.restorePath:
    path = args.restorePath + "/experiment/" + args.restoreFile
    print("Restoring from " + path)
    hyperopt.restore(path)
    local_dir = args.restorePath
    print("Training logs will be saved to " + local_dir)
else:
    local_dir = args.rayResult

hyperopt_limited = ConcurrencyLimiter(hyperopt,
                                      max_concurrent=args.max_concurrent)

trainable = DistributedTrainableCreator(training_initialization(),
                                        num_slots=int(args.numGPU),
                                        use_gpu=True)

analysis = tune.run(
    trainable,
    # resources_per_trial=resources,
    scheduler=asha,
    search_alg=hyperopt_limited,
    num_samples=int(args.numHparams),
    config=config,
    name="experiment",
    local_dir=local_dir)

print("Searcher_state is saved to " + local_dir +
      "/experiment/searcher_state.pkl")
hyperopt.save(local_dir + "/experiment/searcher_state.pkl")
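
The asha, hyperopt, and config objects referenced here are defined elsewhere in the script; a hedged sketch of how they might be constructed (values and import paths are illustrative and depend on the Ray version):

from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.suggest.hyperopt import HyperOptSearch

config = {"lr": tune.loguniform(1e-4, 1e-1)}
asha = ASHAScheduler(metric="loss", mode="min", grace_period=1)
hyperopt = HyperOptSearch(metric="loss", mode="min")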
Example #10
def test_set_global(ray_start_2_cpus):
    trainable_cls = DistributedTrainableCreator(_train_simple, num_slots=2)
    trainable = trainable_cls()
    result = trainable.train()
    trainable.stop()
    assert result["rank"] == 0
Example #11
def test_validate_session(ray_start_2_cpus):
    trainable_cls = DistributedTrainableCreator(_train_validate_session)
    tune.run(trainable_cls)
Example #12
    if hvd.rank() == 0:
        mlflow.set_tracking_uri("file:/home/jovyan/mlruns")
        mlflow.set_experiment("mlflow_example_default_ray_final")
        mlflow.start_run(nested=False, run_name="y")

    for epoch in range(1, config["epochs"] + 1):
        train(epoch)
        test(epoch)

    if hvd.rank() == 0:
        mlflow.end_run()


if __name__ == "__main__":
    trainable = DistributedTrainableCreator(training_function,
                                            num_slots=base_config["num_slots"],
                                            use_gpu=True)
    if base_config["hyperparameters"]["type"] == "grid":
        analysis = tune.run(
            trainable,
            num_samples=1,
            metric="loss",
            mode="min",
            config={
                "epochs": tune.grid_search(
                    base_config["hyperparameters"]["epochs"]),
                "lr": tune.grid_search(base_config["hyperparameters"]["lr"]),
                "mode": "square",
                "x_max":
Example #13
                        type=str,
                        default="square",
                        choices=["square", "cubic"])
    parser.add_argument("--learning_rate",
                        type=float,
                        default=0.1,
                        dest="learning_rate")
    parser.add_argument("--x_max", type=float, default=1., dest="x_max")
    parser.add_argument("--gpu", action="store_true")
    parser.add_argument("--smoke-test",
                        action="store_true",
                        help=("Finish quickly for testing."))
    parser.add_argument("--hosts-per-trial", type=int, default=1)
    parser.add_argument("--slots-per-host", type=int, default=2)
    args = parser.parse_args()

    # import ray
    # ray.init(address="auto")  # assumes ray is started with ray up

    horovod_trainable = DistributedTrainableCreator(
        train,
        use_gpu=args.gpu,
        num_hosts=args.hosts_per_trial,
        num_slots=args.slots_per_host,
        replicate_pem=False)
    analysis = tune.run(horovod_trainable,
                        config={"lr": tune.uniform(0.1, 1)},
                        num_samples=2 if args.smoke_test else 10,
                        fail_fast=True)
    config = analysis.get_best_config(metric="loss", mode="min")
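
The train function handed to DistributedTrainableCreator is not shown in these snippets; a minimal sketch of what a Horovod training function that reports a loss to Tune typically looks like (everything below is an assumption, not the original code):

import horovod.torch as hvd
import torch
from ray import tune


def train(config, checkpoint_dir=None):
    hvd.init()
    model = torch.nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    # Horovod averages gradients across all workers.
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for _ in range(10):
        x = torch.rand(32, 1)
        loss = ((model(x) - x ** 2) ** 2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        tune.report(loss=loss.item())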
Example #14
def test_simple_tune(ray_start_4_cpus):
    trainable_cls = DistributedTrainableCreator(_train_simple, num_slots=2)
    analysis = tune.run(
        trainable_cls, num_samples=2, stop={"training_iteration": 2})
    assert analysis.trials[0].last_result["training_iteration"] == 2