import ray
from ray import tune

# ProgressCallback and timed_tune_run come from Ray's release-test
# utilities (ray.tune.utils.release_test_util in Ray 1.x).
from ray.tune.utils.release_test_util import ProgressCallback, timed_tune_run


def main(smoke_test: bool = False):
    ray.init(address="auto")

    num_samples = 16
    results_per_second = 1 / 60
    # The smoke test uses the short configuration; the full test runs a day.
    trial_length_s = 3600 if smoke_test else 86400
    max_runtime = 4200 if smoke_test else 90000

    callback = ProgressCallback()

    timed_tune_run(
        name="long running large checkpoints",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=900,  # Once every 15 minutes
        checkpoint_size_b=int(0.75 * 1000**3),  # 0.75 GB per checkpoint
        keep_checkpoints_num=2,  # 2 checkpoints * 16 trials * 0.75 GB = 24 GB
        resources_per_trial={"cpu": 1},
        sync_config=tune.SyncConfig(syncer="auto"),
        callbacks=[callback])
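# For reference, a minimal sketch of what a callback like ProgressCallback
# might look like, assuming it is a ray.tune.Callback subclass that logs
# trial results. This is an illustrative stand-in, not the actual
# release-test implementation:
import time

from ray import tune


class SimpleProgressCallback(tune.Callback):
    """Hypothetical stand-in: prints elapsed time for each trial result."""

    def __init__(self):
        self._start = time.time()

    def on_trial_result(self, iteration, trials, trial, result, **info):
        elapsed = time.time() - self._start
        it = result.get("training_iteration")
        print(f"[{elapsed:8.1f}s] trial {trial.trial_id}: iteration {it}")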
# Use a PBT scheduler to ensure that checkpointing works.
pbt = create_scheduler(
    "pbt",
    perturbation_interval=2,
    hyperparam_mutations={
        "lr": tune.uniform(0.001, 0.1),
    },
)

analysis = tune.run(
    horovod_trainable,
    metric="loss",
    mode="min",
    keep_checkpoints_num=1,
    scheduler=pbt,
    config={
        "lr": 0.1 if args.smoke_test else tune.grid_search(
            [0.1 * i for i in range(1, 10)]),
        "batch_size": 64,
        "data": ray.put(dataset),
    },
    num_samples=1,
    stop={"training_iteration": 1} if args.smoke_test else None,
    callbacks=[ProgressCallback()],  # FailureInjectorCallback()
    fail_fast=True,
)
print("Best hyperparameters found were: ", analysis.best_config)
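# horovod_trainable is not defined in this excerpt. A plausible sketch of
# how it could be built, assuming Ray 1.x's Horovod integration
# (DistributedTrainableCreator); train_fn and its body are illustrative:
import ray
from ray import tune
from ray.tune.integration.horovod import DistributedTrainableCreator


def train_fn(config):
    # The dataset is shipped via ray.put() in the config above; depending on
    # the Ray version it may arrive as an ObjectRef that must be resolved.
    data = config["data"]
    dataset = ray.get(data) if isinstance(data, ray.ObjectRef) else data
    for step in range(10):
        loss = config["lr"] / (step + 1)  # dummy decreasing loss
        tune.report(loss=loss)


horovod_trainable = DistributedTrainableCreator(
    train_fn, num_slots=2, use_gpu=False)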
# num_gpus=0,
# resources={str(i): 2},
# object_store_memory=object_store_memory,
# redis_max_memory=redis_max_memory,
# dashboard_host="0.0.0.0")
# ray.init(address=cluster.address)

if "RAY_ADDRESS" in os.environ:
    del os.environ["RAY_ADDRESS"]
ray.init(num_cpus=10)

# Run the workload.
run_experiments(
    {
        "impala": {
            "run": "IMPALA",
            "env": "CartPole-v0",
            "config": {
                "num_workers": 8,
                "num_gpus": 0,
                "num_envs_per_worker": 5,
                "remote_worker_envs": True,
                "remote_env_batch_wait_ms": 99999999,
                "rollout_fragment_length": 50,
                "train_batch_size": 100,
            },
        },
    },
    callbacks=[ProgressCallback()])
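# The commented-out lines above are the tail of a local mock-cluster setup
# kept around as an alternative to connecting to a real cluster. A sketch of
# what the full block likely looked like, using ray.cluster_utils.Cluster;
# num_nodes and object_store_memory values are assumptions:
from ray.cluster_utils import Cluster

object_store_memory = 10**8  # assumed value
num_nodes = 1  # assumed value

cluster = Cluster()
for i in range(num_nodes):
    cluster.add_node(
        num_cpus=10,
        num_gpus=0,
        resources={str(i): 2},
        object_store_memory=object_store_memory)
# ray.init(address=cluster.address)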
    hyperparam_mutations={
        # distribution for resampling
        "lr": lambda: np.random.uniform(0.001, 1),
        # allow perturbations within this set of categorical values
        "momentum": [0.8, 0.9, 0.99],
    },
)

reporter = CLIReporter()
reporter.add_metric_column("val_loss", "loss")
reporter.add_metric_column("val_accuracy", "acc")

analysis = tune.run(
    TorchTrainable,
    num_samples=4,
    config={
        "lr": tune.choice([0.001, 0.01, 0.1]),
        "momentum": 0.8,
        "head_location": None,
        "worker_locations": None,
    },
    max_failures=-1,  # used for fault tolerance
    checkpoint_freq=2,  # used for fault tolerance
    progress_reporter=reporter,
    scheduler=pbt_scheduler,
    callbacks=[
        FailureInjectorCallback(time_between_checks=90),
        ProgressCallback(),
    ],
    stop={"training_iteration": 1} if args.smoke_test else None,
)
print(analysis.get_best_config(metric="val_loss", mode="min"))
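# The hyperparam_mutations block above is the tail of the pbt_scheduler
# construction, which this excerpt cuts off. A minimal sketch of how such a
# scheduler is typically built; the time_attr, metric, mode, and
# perturbation_interval values are assumptions, not from the original:
import numpy as np

from ray.tune.schedulers import PopulationBasedTraining

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="val_loss",
    mode="min",
    perturbation_interval=1,
    hyperparam_mutations={
        # distribution for resampling
        "lr": lambda: np.random.uniform(0.001, 1),
        # allow perturbations within this set of categorical values
        "momentum": [0.8, 0.9, 0.99],
    },
)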