import pytest
import ray
from ray import tune
# Assumed import path: Ray Tune's TensorFlow trainable creator, whose
# num_workers/num_workers_per_host keywords match the calls below.
from ray.tune.integration.tensorflow import DistributedTrainableCreator


def test_validation(ray_start_2_cpus):  # noqa: F811
    def bad_func(a, b, c):  # Wrong signature: not (config, checkpoint_dir).
        return 1

    t_cls = DistributedTrainableCreator(bad_func)
    with pytest.raises(ValueError):
        t_cls()
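
# The ray_start_2_cpus fixture is not part of this excerpt. A minimal sketch,
# assuming it simply brings up and tears down a local 2-CPU Ray instance:
@pytest.fixture
def ray_start_2_cpus():
    address_info = ray.init(num_cpus=2)
    yield address_info
    ray.shutdown()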

def test_colocated(ray_4_node):  # noqa: F811
    assert ray.available_resources()["CPU"] == 4
    trainable_cls = DistributedTrainableCreator(
        _train_check_global, num_workers=4, num_workers_per_host=1)
    trainable = trainable_cls()
    # All four CPUs should now be claimed by the colocated workers.
    assert ray.available_resources().get("CPU", 0) == 0
    trainable.train()
    trainable.stop()
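
# The ray_4_node fixture is also not shown. A plausible sketch, assuming a
# simulated four-node cluster with one CPU per node (matching the assertion
# above) built with ray.cluster_utils:
from ray.cluster_utils import Cluster


@pytest.fixture
def ray_4_node():
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)
    yield
    ray.shutdown()
    cluster.shutdown()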

def test_colocated_gpu_double(ray_4_node_gpu):  # noqa: F811
    assert ray.available_resources()["GPU"] == 8
    trainable_cls = DistributedTrainableCreator(
        _train_check_global,
        num_workers=8,
        num_gpus_per_worker=1,
        num_cpus_per_worker=1,
        num_workers_per_host=2)
    trainable = trainable_cls()
    # Eight workers at one GPU each consume every GPU in the cluster.
    assert ray.available_resources().get("GPU", 0) == 0
    trainable.train()
    trainable.stop()
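
# A matching sketch for ray_4_node_gpu, assuming four simulated nodes with
# two CPUs and two GPUs each, which yields the eight GPUs asserted above:
@pytest.fixture
def ray_4_node_gpu():
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=2, num_gpus=2)
    ray.init(address=cluster.address)
    yield
    ray.shutdown()
    cluster.shutdown()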
    default=2,
    help="Sets number of workers for training.")
parser.add_argument(
    "--use-gpu",
    action="store_true",
    default=False,
    help="enables CUDA training")
parser.add_argument(
    "--cluster",
    action="store_true",
    default=False,
    help="enables multi-node tuning")
args = parser.parse_args()

tf_trainable = DistributedTrainableCreator(
    train_mnist,
    use_gpu=args.use_gpu,
    num_workers=2,
)
sched = AsyncHyperBandScheduler(max_t=400, grace_period=20)
analysis = tune.run(
    tf_trainable,
    name="exp",
    scheduler=sched,
    metric="mean_accuracy",
    mode="max",
    stop={
        "mean_accuracy": 0.99,
        "training_iteration": 10
    },
    default=False,
    help="enables multi-node tuning")
parser.add_argument(
    "--smoke-test",
    action="store_true",
    default=False,
    help="enables small scale testing")
args = parser.parse_args()

if args.cluster:
    options = dict(address="auto")
else:
    options = dict(num_cpus=4)
ray.init(**options)

tf_trainable = DistributedTrainableCreator(
    train_mnist,
    num_workers=args.num_workers,
    num_workers_per_host=args.num_workers_per_host,
    num_cpus_per_worker=args.num_cpus_per_worker,
    num_gpus_per_worker=args.num_gpus_per_worker,
)
sched = AsyncHyperBandScheduler(max_t=400, grace_period=20)
analysis = tune.run(
    tf_trainable,
    name="exp",
    scheduler=sched,
    metric="mean_accuracy",
    mode="max",
    stop={
        "mean_accuracy": 0.99,
        "training_iteration": 10
    },
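
# train_mnist itself is not part of this excerpt. A minimal sketch of what it
# might look like, assuming a Keras MNIST model trained under
# MultiWorkerMirroredStrategy (DistributedTrainableCreator sets up TF_CONFIG
# on each worker) that reports the mean_accuracy metric used in the stop
# criteria above:
import tensorflow as tf
from ray import tune


def train_mnist(config, checkpoint_dir=None):
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    x_train = x_train[..., None].astype("float32") / 255.0
    with strategy.scope():
        model = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        model.compile(
            optimizer=tf.keras.optimizers.SGD(config.get("lr", 0.01)),
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"])
    for _ in range(config.get("epochs", 10)):
        history = model.fit(
            x_train, y_train, batch_size=64, epochs=1, verbose=0)
        tune.report(mean_accuracy=history.history["accuracy"][-1])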
def raytune(config, name, local, cpus, gpus, tune_result_dir, resume, ntrain,
            ntest, seeds):
    if seeds:
        # Set seeds for reproducibility
        random.seed(1234)
        np.random.seed(1234)
        tf.random.set_seed(1234)

    cfg = load_config(config)
    config_file_path = config

    if tune_result_dir is not None:
        os.environ["TUNE_RESULT_DIR"] = tune_result_dir
    else:
        if cfg["raytune"]["local_dir"] is None:
            raise TypeError(
                "Please specify a local_dir in the raytune section of the config file."
            )
        trd = cfg["raytune"]["local_dir"] + "/tune_result_dir"
        os.environ["TUNE_RESULT_DIR"] = trd

    expdir = Path(cfg["raytune"]["local_dir"]) / name
    expdir.mkdir(parents=True, exist_ok=True)

    # Copy the search space definition to the train dir for later reference
    shutil.copy(
        "mlpf/raytune/search_space.py",
        str(Path(cfg["raytune"]["local_dir"]) / name / "search_space.py"))
    # Copy the config file to the train dir for later reference
    shutil.copy(
        config_file_path,
        str(Path(cfg["raytune"]["local_dir"]) / name / "config.yaml"))

    ray.tune.ray_trial_executor.DEFAULT_GET_TIMEOUT = 1 * 60 * 60  # Avoid timeout errors
    if not local:
        ray.init(address="auto")

    sched = get_raytune_schedule(cfg["raytune"])
    search_alg = get_raytune_search_alg(cfg["raytune"], seeds)

    distributed_trainable = DistributedTrainableCreator(
        partial(
            build_model_and_train,
            full_config=config_file_path,
            ntrain=ntrain,
            ntest=ntest,
            name=name,
            seeds=seeds),
        num_workers=1,  # Number of training workers per trial.
        num_cpus_per_worker=cpus,
        num_gpus_per_worker=gpus,
        num_workers_per_host=1,  # Number of workers to colocate per host. None if not specified.
        timeout_s=1 * 60 * 60,
    )

    sync_config = tune.SyncConfig(sync_to_driver=False)

    start = datetime.now()
    analysis = tune.run(
        distributed_trainable,
        config=search_space,  # defined in the search_space module copied above
        name=name,
        scheduler=sched,
        search_alg=search_alg,
        num_samples=raytune_num_samples,
        local_dir=cfg["raytune"]["local_dir"],
        callbacks=[TBXLoggerCallback()],
        log_to_file=True,
        resume=resume,
        max_failures=2,
        sync_config=sync_config,
    )
    end = datetime.now()
    print("Total time of tune.run(...): {}".format(end - start))

    print(
        "Best hyperparameters found according to {} were: ".format(
            cfg["raytune"]["default_metric"]),
        analysis.get_best_config(cfg["raytune"]["default_metric"],
                                 cfg["raytune"]["default_mode"]))

    skip = 20
    if skip > cfg["setup"]["num_epochs"]:
        skip = 0
    analysis.default_metric = cfg["raytune"]["default_metric"]
    analysis.default_mode = cfg["raytune"]["default_mode"]

    plot_ray_analysis(analysis, save=True, skip=skip)
    topk_summary_plot_v2(
        analysis, k=5, save_dir=Path(analysis.get_best_logdir()).parent)
    summarize_top_k(
        analysis, k=5, save_dir=Path(analysis.get_best_logdir()).parent)

    best_params = analysis.get_best_config(cfg["raytune"]["default_metric"],
                                           cfg["raytune"]["default_mode"])
    with open(
            Path(analysis.get_best_logdir()).parent / "best_parameters.txt",
            "a") as best_params_file:
        best_params_file.write("Best hyperparameters according to {}\n".format(
            cfg["raytune"]["default_metric"]))
        for key, val in best_params.items():
            best_params_file.write("{}: {}\n".format(key, val))

    with open(Path(analysis.get_best_logdir()).parent / "time.txt",
              "a") as timefile:
        timefile.write(str(end - start) + "\n")

    num_skipped = count_skipped_configurations(analysis.get_best_logdir())
    print("Number of skipped configurations: {}".format(num_skipped))
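
# Two design choices in raytune() worth noting. First, each trial is a single
# worker (num_workers=1, num_workers_per_host=1), so parallelism comes from
# Tune running num_samples trials concurrently rather than from distributed
# data parallelism within a trial. Second, sync_to_driver=False presumably
# assumes local_dir sits on shared storage visible to all nodes; the one-hour
# DEFAULT_GET_TIMEOUT and timeout_s keep slow model builds from being treated
# as hangs.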

def test_step_after_completion(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(train_mnist, num_workers=2)
    trainer = trainable_cls(config={"epochs": 1})
    with pytest.raises(RuntimeError):
        # Stepping past the single configured epoch should raise.
        for i in range(10):
            trainer.train()

def test_single_step(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(train_mnist, num_workers=2)
    trainer = trainable_cls()
    trainer.train()
    trainer.stop()

def test_validate_session(ray_start_2_cpus):
    trainable_cls = DistributedTrainableCreator(_train_validate_session)
    tune.run(trainable_cls)
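
# _train_validate_session is not shown in this excerpt. A speculative sketch,
# assuming it verifies that each remote worker runs inside a live Tune session
# (i.e. that tune.report() is wired back to the driver):
def _train_validate_session(config, checkpoint_dir=None):
    assert tune.session.get_session() is not None
    tune.report(done=True)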