def test_validation(ray_start_2_cpus):
    """Expect ValueError when instantiating a trainable built from ``bad_func``.

    ``bad_func`` takes three positional arguments; presumably that signature
    is not one the creator supports (confirm against the creator's checks).
    The error is expected at instantiation, not when the class is created.

    Args:
        ray_start_2_cpus: pytest fixture; not referenced in the body.
    """

    def bad_func(a, b, c):
        return 1

    trainable_cls = DistributedTrainableCreator(bad_func, num_slots=2)

    # Only the instantiation is inside the ``raises`` context.
    with pytest.raises(ValueError):
        trainable_cls()
def tune_horovod(hosts_per_trial,
                 slots_per_host,
                 num_samples,
                 use_gpu,
                 mode="square",
                 x_max=1.0):
    """Run a Tune experiment over ``lr`` with a distributed ``train`` trainable.

    Args:
        hosts_per_trial: Forwarded as ``num_hosts`` to the trainable creator.
        slots_per_host: Forwarded as ``num_slots`` to the trainable creator.
        num_samples: Forwarded as ``num_samples`` to ``tune.run``.
        use_gpu: Forwarded as ``use_gpu`` to the trainable creator.
        mode: Stored under the ``"mode"`` key of the trial config.
        x_max: Stored under the ``"x_max"`` key of the trial config.

    Prints the best config reported by the analysis; returns nothing.
    """
    creator_kwargs = dict(
        use_gpu=use_gpu,
        num_hosts=hosts_per_trial,
        num_slots=slots_per_host,
        replicate_pem=False,
    )
    horovod_trainable = DistributedTrainableCreator(train, **creator_kwargs)

    # ``lr`` is sampled per trial; ``mode`` and ``x_max`` are passed through.
    trial_config = {
        "lr": tune.uniform(0.1, 1),
        "mode": mode,
        "x_max": x_max,
    }

    # Note: the ``mode="min"`` below is Tune's optimization direction for
    # ``metric="loss"``, unrelated to this function's ``mode`` parameter.
    analysis = tune.run(
        horovod_trainable,
        metric="loss",
        mode="min",
        config=trial_config,
        num_samples=num_samples,
        fail_fast=True,
    )
    print("Best hyperparameters found were: ", analysis.best_config)
def test_step_after_completion(ray_start_2_cpus):
    """Expect RuntimeError when ``train()`` is called repeatedly.

    The trainable is configured with ``{"epochs": 1}`` and ``train()`` is
    called up to ten times; the test passes if any call raises RuntimeError.

    Args:
        ray_start_2_cpus: pytest fixture; not referenced in the body.
    """
    trainable_cls = DistributedTrainableCreator(
        _train_simple,
        num_hosts=1,
        num_slots=2,
    )
    single_epoch_config = {"epochs": 1}
    trainer = trainable_cls(config=single_epoch_config)

    max_steps = 10
    with pytest.raises(RuntimeError):
        for _ in range(max_steps):
            trainer.train()
def test_single_step(ray_start_2_cpus):
    """Run one ``train()`` step on a distributed trainable, then stop it.

    The test has no assertions; it passes when neither call raises.

    Args:
        ray_start_2_cpus: pytest fixture; not referenced in the body.
    """
    creator_kwargs = {"num_hosts": 1, "num_slots": 2}
    trainable_cls = DistributedTrainableCreator(_train_simple, **creator_kwargs)

    trainer = trainable_cls()
    trainer.train()
    trainer.stop()
def test_simple_tune(ray_start_4_cpus, enabled_checkpoint):
    """Run two Tune samples and check iteration count and checkpoint state.

    Args:
        ray_start_4_cpus: pytest fixture; not referenced in the body.
        enabled_checkpoint: Passed to the trainable as ``enable_checkpoint``
            and compared with the first trial's ``has_checkpoint()`` result
            (presumably supplied via parametrization; confirm at the
            decorator, which is outside this block).
    """
    trainable_cls = DistributedTrainableCreator(_train_simple, num_slots=2)

    analysis = tune.run(
        trainable_cls,
        config={"enable_checkpoint": enabled_checkpoint},
        num_samples=2,
        stop={"training_iteration": 2},
    )

    # Only the first trial is inspected.
    first_trial = analysis.trials[0]
    assert first_trial.last_result["training_iteration"] == 2
    assert first_trial.has_checkpoint() == enabled_checkpoint
def test_resource_tune(ray_connect_cluster, use_gpu):
    """Run two Tune samples with two workers, optionally on GPU.

    The test is skipped when ``use_gpu`` is truthy and the cluster reports
    zero (or no) ``"GPU"`` resources.

    Args:
        ray_connect_cluster: pytest fixture; not referenced in the body.
        use_gpu: Forwarded as ``use_gpu`` to the trainable creator.
    """
    if use_gpu:
        # The cluster is only queried when a GPU run was requested.
        gpu_total = ray.cluster_resources().get("GPU", 0)
        if gpu_total == 0:
            pytest.skip("No GPU available.")

    trainable_cls = DistributedTrainableCreator(
        _train_simple,
        num_workers=2,
        use_gpu=use_gpu,
    )
    stop_criteria = {"training_iteration": 2}
    analysis = tune.run(trainable_cls, num_samples=2, stop=stop_criteria)

    # Only the first trial is inspected.
    assert analysis.trials[0].last_result["training_iteration"] == 2
# Command-line interface: a single --smoke-test switch.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test",
    action="store_true",
    help="Finish quickly for testing.",
)
args = parser.parse_args()

if not args.smoke_test:
    ray.init(address="auto")  # assumes ray is started with ray up
else:
    ray.init()

# The smoke test runs without GPUs on one host; otherwise GPUs on two hosts.
horovod_trainable = DistributedTrainableCreator(
    train,
    use_gpu=not args.smoke_test,
    num_hosts=1 if args.smoke_test else 2,
    num_workers=2,  # same value with and without --smoke-test
    replicate_pem=False,
    timeout_s=300,
)

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_STATS["mean"], CIFAR10_STATS["std"]),
])  # meanstd transformation

# download=True is passed, so the dataset may be fetched into /tmp/data_cifar.
dataset = torchvision.datasets.CIFAR10(
    root="/tmp/data_cifar",
    train=True,
    download=True,
    transform=transform_train,
)
with distributed_checkpoint_dir(step=epoch) as checkpoint_dir: print("this checkpoint dir: ", checkpoint_dir) path = os.path.join(checkpoint_dir, "checkpoint") torch.save((net.state_dict(), optimizer.state_dict(), epoch), path) if __name__ == "__main__": if args.smoke_test: ray.init() else: ray.init(address="auto") # assumes ray is started with ray up horovod_trainable = DistributedTrainableCreator( train, use_gpu=True, num_hosts=1 if args.smoke_test else 2, num_slots=2 if args.smoke_test else 2, replicate_pem=False) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize(CIFAR10_STATS["mean"], CIFAR10_STATS["std"]), ]) # meanstd transformation dataset = torchvision.datasets.CIFAR10(root="/tmp/data_cifar", train=True, download=True, transform=transform_train)
# Choose the output directory. When args.restorePath is set, the searcher
# state is restored from "<restorePath>/experiment/<restoreFile>" and that
# same path is reused for output; otherwise args.rayResult is used.
if not args.restorePath:
    local_dir = args.rayResult
else:
    path = args.restorePath + "/experiment/" + args.restoreFile
    print('Restore from ' + path)
    hyperopt.restore(path)
    local_dir = args.restorePath
    print("Training logs will be saved to " + local_dir)

hyperopt_limited = ConcurrencyLimiter(
    hyperopt,
    max_concurrent=args.max_concurrent,
)

# training_initialization() is called here; its return value is what gets
# passed to the creator as the training function.
trainable = DistributedTrainableCreator(
    training_initialization(),
    num_slots=int(args.numGPU),
    use_gpu=True,
)

analysis = tune.run(
    trainable,
    # resources_per_trial=resources,
    scheduler=asha,
    search_alg=hyperopt_limited,
    num_samples=int(args.numHparams),
    config=config,
    name='experiment',
    local_dir=local_dir,
)

# Save the searcher state under the experiment directory; this is the
# location the restore branch above reads from.
print("Searcher_state is saved to " + local_dir
      + "/experiment/searcher_state.pkl")
hyperopt.save(local_dir + "/experiment/searcher_state.pkl")
def test_set_global(ray_start_2_cpus):
    """Check that one training step reports ``rank == 0`` in its result.

    The trainable is stopped before the assertion runs.

    Args:
        ray_start_2_cpus: pytest fixture; not referenced in the body.
    """
    trainable = DistributedTrainableCreator(_train_simple, num_slots=2)()

    step_result = trainable.train()
    trainable.stop()

    assert step_result["rank"] == 0
def test_validate_session(ray_start_2_cpus):
    """Run ``_train_validate_session`` through Tune with default settings.

    The test has no assertions of its own; any checks presumably live inside
    ``_train_validate_session`` (defined outside this block).

    Args:
        ray_start_2_cpus: pytest fixture; not referenced in the body.
    """
    tune.run(DistributedTrainableCreator(_train_validate_session))
if hvd.rank() == 0: mlflow.set_tracking_uri("file:/home/jovyan/mlruns") mlflow.set_experiment("mlflow_example_default_ray_final") mlflow.start_run(nested=False, run_name="y") for epoch in range(1, config["epochs"] + 1): train(epoch) test(epoch) if hvd.rank() == 0: mlflow.end_run() if __name__ == "__main__": trainable = DistributedTrainableCreator(training_function, num_slots=base_config["num_slots"], use_gpu=True) if base_config["hyperparameters"]["type"] == "grid": analysis = tune.run( trainable, num_samples=1, metric="loss", mode="min", config={ "epochs": tune.grid_search(base_config["hyperparameters"]["epochs"]), "lr": tune.grid_search(base_config["hyperparameters"]["lr"]), "mode": "square", "x_max":
type=str, default="square", choices=["square", "cubic"]) parser.add_argument("--learning_rate", type=float, default=0.1, dest="learning_rate") parser.add_argument("--x_max", type=float, default=1., dest="x_max") parser.add_argument("--gpu", action="store_true") parser.add_argument("--smoke-test", action="store_true", help=("Finish quickly for testing.")) parser.add_argument("--hosts-per-trial", type=int, default=1) parser.add_argument("--slots-per-host", type=int, default=2) args = parser.parse_args() # import ray # ray.init(address="auto") # assumes ray is started with ray up horovod_trainable = DistributedTrainableCreator( train, use_gpu=args.gpu, num_hosts=args.hosts_per_trial, num_slots=args.slots_per_host, replicate_pem=False) analysis = tune.run(horovod_trainable, config={"lr": tune.uniform(0.1, 1)}, num_samples=2 if args.smoke_test else 10, fail_fast=True) config = analysis.get_best_config(metric="loss", mode="min")
def test_simple_tune(ray_start_4_cpus):
    """Run two Tune samples and check the first trial reached iteration 2.

    Args:
        ray_start_4_cpus: pytest fixture; not referenced in the body.
    """
    trainable_cls = DistributedTrainableCreator(_train_simple, num_slots=2)

    stop_criteria = {"training_iteration": 2}
    analysis = tune.run(trainable_cls, num_samples=2, stop=stop_criteria)

    # Only the first trial is inspected.
    first_trial = analysis.trials[0]
    assert first_trial.last_result["training_iteration"] == 2