def testClusterSyncFunction(self):
    def sync_func_driver(source, target):
        assert ":" in source, "Source {} not a remote path.".format(source)
        assert ":" not in target, "Target is supposed to be local."
        with open(os.path.join(target, "test.log2"), "w") as f:
            print("writing to", f.name)
            f.write(source)

    sync_config = tune.SyncConfig(
        sync_to_driver=sync_func_driver, node_sync_period=5)

    [trial] = tune.run(
        "__fake",
        name="foo",
        max_failures=0,
        stop={"training_iteration": 1},
        sync_config=sync_config).trials
    test_file_path = os.path.join(trial.logdir, "test.log2")
    self.assertFalse(os.path.exists(test_file_path))

    with patch("ray.services.get_node_ip_address") as mock_sync:
        mock_sync.return_value = "0.0.0.0"
        sync_config = tune.SyncConfig(sync_to_driver=sync_func_driver)
        [trial] = tune.run(
            "__fake",
            name="foo",
            max_failures=0,
            stop={"training_iteration": 1},
            sync_config=sync_config).trials
        test_file_path = os.path.join(trial.logdir, "test.log2")
        self.assertTrue(os.path.exists(test_file_path))
    os.remove(test_file_path)
def testCloudProperString(self):
    with self.assertRaises(ValueError):
        [trial] = tune.run(
            "__fake",
            name="foo",
            max_failures=0,
            stop={"training_iteration": 1},
            sync_config=tune.SyncConfig(
                **{"upload_dir": "test", "syncer": "ls {target}"}),
        ).trials

    with self.assertRaises(ValueError):
        [trial] = tune.run(
            "__fake",
            name="foo",
            max_failures=0,
            stop={"training_iteration": 1},
            sync_config=tune.SyncConfig(
                **{"upload_dir": "test", "syncer": "ls {source}"}),
        ).trials

    tmpdir = tempfile.mkdtemp()
    logfile = os.path.join(tmpdir, "test.log")

    [trial] = tune.run(
        "__fake",
        name="foo",
        max_failures=0,
        stop={"training_iteration": 1},
        sync_config=tune.SyncConfig(**{
            "upload_dir": "test",
            "syncer": "echo {source} {target} > " + logfile,
        }),
    ).trials

    with open(logfile) as f:
        lines = f.read()
        self.assertTrue("test" in lines)
    shutil.rmtree(tmpdir)
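
For contrast with the two failing cases above, here is a minimal sketch of a well-formed template syncer: the command string must reference both {source} and {target}, which Ray Tune substitutes with the local trial directory and the remote URI. The bucket path and the `aws s3 sync` command are illustrative placeholders, not taken from the test.

from ray import tune

sync_config = tune.SyncConfig(
    upload_dir="s3://my-bucket/tune-results",  # placeholder upload target
    syncer="aws s3 sync {source} {target}",    # any shell command that copies source -> target
)
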
def train():
    register_super_mario_env()
    ray.init(address="auto")
    _ = Counter.options(name="global_counter", max_concurrency=1).remote()
    tune.run(
        AmpedTrainer,
        config={
            "env": "super_mario",
            "framework": "torch",
            "num_workers": 1,
            "log_level": "INFO",
            "seed": 1337,
            "num_envs_per_worker": 3,
            "entropy_coeff": 0.01,
            "kl_coeff": 0.0,
            "train_batch_size": 256,
            "num_sgd_iter": 2,
            "num_simulations": 10,
            "batch_mode": "truncate_episodes",
            "remote_worker_envs": True,
            # "ignore_worker_failures": True,
            "num_gpus_per_worker": 1,
            # "num_cpus_per_worker": 1,
            "num_gpus": 1,
        },
        sync_config=tune.SyncConfig(upload_dir="gs://amp-results"),
        stop={"episodes_total": 100},
        checkpoint_freq=10,
        # checkpoint_at_end=True,
        # resume=True,
    )
def train():
    register_super_mario_env()
    ray.init(address="auto")
    tune.run(
        PPOTrainer,
        config={
            "env": "super_mario",
            "framework": "torch",
            "num_workers": 4,
            "log_level": "INFO",
            "seed": 1337,
            "num_envs_per_worker": 5,
            "entropy_coeff": 0.01,
            "kl_coeff": 0.0,
            "num_sgd_iter": 2,
            "num_gpus": 1,
            "vf_share_layers": False,
        },
        sync_config=tune.SyncConfig(upload_dir="gs://amp-results"),
        stop={"training_iteration": 500},
        checkpoint_freq=500,
        checkpoint_at_end=True,
        # resume=True,
    )
def testCloudSyncPeriod(self):
    """Tests that changing CLOUD_SYNC_PERIOD affects syncing frequency."""
    tmpdir = tempfile.mkdtemp()

    def trainable(config):
        for i in range(10):
            time.sleep(1)
            tune.report(score=i)

    mock = unittest.mock.Mock()

    def counter(local, remote):
        mock()

    sync_config = tune.SyncConfig(
        upload_dir="test", sync_to_cloud=counter, cloud_sync_period=1)

    # This was originally set to 0.5
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0"
    self.addCleanup(
        lambda: os.environ.pop("TUNE_GLOBAL_CHECKPOINT_S", None))

    [trial] = tune.run(
        trainable,
        name="foo",
        max_failures=0,
        local_dir=tmpdir,
        stop={"training_iteration": 10},
        sync_config=sync_config,
    ).trials

    self.assertEqual(mock.call_count, 12)

    shutil.rmtree(tmpdir)
def testNoSyncToDriver(self):
    """Test that sync to driver is disabled."""

    class _Trial:
        def __init__(self, id, logdir):
            self.id = (id, )
            self.logdir = logdir

    trial = _Trial("0", "some_dir")

    sync_config = tune.SyncConfig(syncer=None)

    # Create syncer callbacks
    callbacks = create_default_callbacks([], sync_config, loggers=None)
    syncer_callback = callbacks[-1]

    # Sanity check that we got the syncer callback
    self.assertTrue(isinstance(syncer_callback, SyncerCallback))

    # Sync function should be false (no sync to driver)
    self.assertEqual(syncer_callback._sync_function, False)

    # Sync to driver is disabled, so this should be a no-op
    trial_syncer = syncer_callback._get_trial_syncer(trial)
    self.assertEqual(trial_syncer.sync_client, NOOP)
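
A minimal usage sketch of the behavior the test above checks, assuming the same post-deprecation SyncConfig API: passing syncer=None makes trial-to-driver syncing a no-op. The trivial trainable below is illustrative only.

from ray import tune

def trainable(config):
    # placeholder trainable; reports a single result
    tune.report(score=1)

tune.run(
    trainable,
    num_samples=1,
    sync_config=tune.SyncConfig(syncer=None),  # disable syncing to the driver node
)
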
def testSyncDetection(self):
    kubernetes_conf = {
        "provider": {"type": "kubernetes", "namespace": "test_ray"}
    }
    docker_conf = {"docker": {"image": "bogus"}, "provider": {"type": "aws"}}
    aws_conf = {"provider": {"type": "aws"}}

    with tempfile.TemporaryDirectory() as dir:
        kubernetes_file = os.path.join(dir, "kubernetes.yaml")
        with open(kubernetes_file, "wt") as fp:
            yaml.safe_dump(kubernetes_conf, fp)

        docker_file = os.path.join(dir, "docker.yaml")
        with open(docker_file, "wt") as fp:
            yaml.safe_dump(docker_conf, fp)

        aws_file = os.path.join(dir, "aws.yaml")
        with open(aws_file, "wt") as fp:
            yaml.safe_dump(aws_conf, fp)

        kubernetes_syncer = detect_cluster_syncer(None, kubernetes_file)
        self.assertTrue(issubclass(kubernetes_syncer, KubernetesSyncer))
        self.assertEqual(kubernetes_syncer._namespace, "test_ray")

        docker_syncer = detect_cluster_syncer(None, docker_file)
        self.assertTrue(issubclass(docker_syncer, DockerSyncer))

        aws_syncer = detect_cluster_syncer(None, aws_file)
        self.assertEqual(aws_syncer, None)

        # Should still return DockerSyncer, since it was passed explicitly
        syncer = detect_cluster_syncer(
            tune.SyncConfig(syncer=DockerSyncer), kubernetes_file)
        self.assertTrue(issubclass(syncer, DockerSyncer))
def run_tune(
    sync_to_driver: bool,
    upload_dir: Optional[str] = None,
    durable: bool = False,
    experiment_name: str = "cloud_test",
    indicator_file: str = "/tmp/tune_cloud_indicator",
):
    num_cpus_per_trial = int(os.environ.get("TUNE_NUM_CPUS_PER_TRIAL", "2"))

    if durable:
        trainable = tune.durable(train)
    else:
        trainable = train

    tune.run(
        trainable,
        name=experiment_name,
        resume="AUTO",
        num_samples=4,
        config={
            "max_iterations": 30,
            "sleep_time": 5,
            "checkpoint_freq": 2,
            "score_multiplied": tune.randint(0, 100),
        },
        sync_config=tune.SyncConfig(
            sync_to_driver=sync_to_driver,
            upload_dir=upload_dir,
            sync_on_checkpoint=True,
            cloud_sync_period=0.5,
        ),
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": num_cpus_per_trial},
        callbacks=[IndicatorCallback(indicator_file=indicator_file)],
        verbose=2)
def main(bucket):
    secrets_file = os.path.join(
        os.path.dirname(__file__), "..", "aws_secrets.txt")
    if os.path.isfile(secrets_file):
        print(f"Loading AWS secrets from file {secrets_file}")

        from configparser import ConfigParser

        config = ConfigParser()
        config.read(secrets_file)

        for k, v in config.items():
            for x, y in v.items():
                var = str(x).upper()
                os.environ[var] = str(y)
    else:
        print("No AWS secrets file found. Loading from boto.")
        from boto3 import Session

        session = Session()
        credentials = session.get_credentials()
        current_credentials = credentials.get_frozen_credentials()

        os.environ["AWS_ACCESS_KEY_ID"] = current_credentials.access_key
        os.environ["AWS_SECRET_ACCESS_KEY"] = current_credentials.secret_key
        os.environ["AWS_SESSION_TOKEN"] = current_credentials.token

    if all(
            os.getenv(k, "") for k in [
                "AWS_ACCESS_KEY_ID",
                "AWS_SECRET_ACCESS_KEY",
                "AWS_SESSION_TOKEN",
            ]):
        print("AWS secrets found in env.")
    else:
        print("Warning: No AWS secrets found in env!")

    ray.init(address="auto")

    num_samples = 16
    results_per_second = 10 / 60
    trial_length_s = 300

    max_runtime = 500

    timed_tune_run(
        name="durable trainable",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000**2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=tune.SyncConfig(
            upload_dir=f"s3://{bucket}/durable/",
        ),
    )
def testSyncConfigDeprecation(self):
    with self.assertWarnsRegex(DeprecationWarning, expected_regex="sync_period"):
        sync_conf = tune.SyncConfig(node_sync_period=4, cloud_sync_period=8)
        self.assertEqual(sync_conf.sync_period, 4)

    with self.assertWarnsRegex(DeprecationWarning, expected_regex="sync_period"):
        sync_conf = tune.SyncConfig(node_sync_period=4)
        self.assertEqual(sync_conf.sync_period, 4)

    with self.assertWarnsRegex(DeprecationWarning, expected_regex="sync_period"):
        sync_conf = tune.SyncConfig(cloud_sync_period=8)
        self.assertEqual(sync_conf.sync_period, 8)

    with self.assertWarnsRegex(DeprecationWarning, expected_regex="syncer"):
        sync_conf = tune.SyncConfig(
            sync_to_driver="a", sync_to_cloud="b", upload_dir=None)
        self.assertEqual(sync_conf.syncer, "a")

    with self.assertWarnsRegex(DeprecationWarning, expected_regex="syncer"):
        sync_conf = tune.SyncConfig(
            sync_to_driver="a", sync_to_cloud="b", upload_dir="c")
        self.assertEqual(sync_conf.syncer, "b")

    with self.assertWarnsRegex(DeprecationWarning, expected_regex="syncer"):
        sync_conf = tune.SyncConfig(sync_to_cloud="b", upload_dir=None)
        self.assertEqual(sync_conf.syncer, None)

    with self.assertWarnsRegex(DeprecationWarning, expected_regex="syncer"):
        sync_conf = tune.SyncConfig(sync_to_driver="a", upload_dir="c")
        self.assertEqual(sync_conf.syncer, None)
def testSyncConfigDeprecation(self):
    with self.assertRaisesRegex(DeprecationWarning, expected_regex="sync_period"):
        tune.SyncConfig(node_sync_period=4, cloud_sync_period=8)

    with self.assertRaisesRegex(DeprecationWarning, expected_regex="sync_period"):
        tune.SyncConfig(node_sync_period=4)

    with self.assertRaisesRegex(DeprecationWarning, expected_regex="sync_period"):
        tune.SyncConfig(cloud_sync_period=8)

    with self.assertRaisesRegex(DeprecationWarning, expected_regex="syncer"):
        tune.SyncConfig(sync_to_driver="a", sync_to_cloud="b", upload_dir=None)

    with self.assertRaisesRegex(DeprecationWarning, expected_regex="syncer"):
        tune.SyncConfig(sync_to_driver="a", sync_to_cloud="b", upload_dir="c")

    with self.assertRaisesRegex(DeprecationWarning, expected_regex="syncer"):
        tune.SyncConfig(sync_to_cloud="b", upload_dir=None)

    with self.assertRaisesRegex(DeprecationWarning, expected_regex="syncer"):
        tune.SyncConfig(sync_to_driver="a", upload_dir="c")
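
Taken together, the two deprecation tests above (the first version only warns, the later one raises) exercise the mapping from the legacy keyword arguments to the consolidated ones. The sketch below shows only the new-style equivalent; the bucket path and the numeric values are placeholders.

from ray import tune

# node_sync_period / cloud_sync_period -> sync_period
# sync_to_cloud (with upload_dir set)  -> syncer
# sync_to_driver (without upload_dir)  -> syncer
sync_config = tune.SyncConfig(
    upload_dir="s3://some-bucket/path",  # placeholder remote directory
    syncer="auto",                       # replaces sync_to_cloud / sync_to_driver
    sync_period=300,                     # replaces node_sync_period / cloud_sync_period (seconds)
)
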
def run_tune(
    no_syncer: bool,
    upload_dir: Optional[str] = None,
    experiment_name: str = "cloud_test",
    indicator_file: str = "/tmp/tune_cloud_indicator",
    trainable: str = "function",
    num_cpus_per_trial: int = 2,
):
    if trainable == "function":
        train = fn_trainable
        config = {
            "max_iterations": 100,
            "sleep_time": 5,
            "checkpoint_freq": 2,
            "score_multiplied": tune.randint(0, 100),
        }
        kwargs = {"resources_per_trial": {"cpu": num_cpus_per_trial}}
    elif trainable == "rllib_str" or trainable == "rllib_trainer":
        if trainable == "rllib_str":
            train = "PPO"
        else:
            train = PPO

        config = {
            "env": "CartPole-v1",
            "num_workers": 1,
            "num_envs_per_worker": 1,
            "callbacks": RLlibCallback,
        }
        kwargs = {
            "stop": {"training_iteration": 100},
            "checkpoint_freq": 2,
            "checkpoint_at_end": True,
        }
    else:
        raise RuntimeError(f"Unknown trainable: {trainable}")

    tune.run(
        train,
        name=experiment_name,
        resume="AUTO",
        num_samples=4,
        config=config,
        sync_config=tune.SyncConfig(
            syncer="auto" if not no_syncer else None,
            upload_dir=upload_dir,
            sync_on_checkpoint=True,
            sync_period=0.5,
        ),
        keep_checkpoints_num=2,
        callbacks=[IndicatorCallback(indicator_file=indicator_file)],
        verbose=2,
        **kwargs,
    )
def testClusterProperString(self):
    """Tests that invalid commands throw."""
    with self.assertRaises(TuneError):
        # This raises ValueError because logger is init in safe zone.
        sync_config = tune.SyncConfig(syncer="ls {target}")
        [trial] = tune.run(
            "__fake",
            name="foo",
            max_failures=0,
            stop={"training_iteration": 1},
            sync_config=sync_config,
        ).trials

    with self.assertRaises(TuneError):
        # This raises ValueError because logger is init in safe zone.
        sync_config = tune.SyncConfig(syncer="ls {source}")
        [trial] = tune.run(
            "__fake",
            name="foo",
            max_failures=0,
            sync_config=sync_config,
            stop={"training_iteration": 1},
        ).trials

    with patch.object(CommandBasedClient, "_execute") as mock_fn:
        with patch("ray.tune.syncer.get_node_ip_address") as mock_sync:
            sync_config = tune.SyncConfig(syncer="echo {source} {target}")
            mock_sync.return_value = "0.0.0.0"
            [trial] = tune.run(
                "__fake",
                name="foo",
                max_failures=0,
                sync_config=sync_config,
                stop={"training_iteration": 1},
            ).trials
            self.assertGreater(mock_fn.call_count, 0)
def run(self, fast_dev_run=False, use_gpus=False):
    utils.set_seeds(self.search_params.data.seed)

    search_dict = self.search_params.to_ray_tune_search_dict()
    # see tune.utils.UtilMonitor
    search_dict['log_sys_usage'] = True

    output_str = str(self.search_params.logs.output_dir)
    if output_str.startswith('s3://') or output_str.startswith(
            'gs://') or output_str.startswith('hdfs://'):
        sync_config = tune.SyncConfig(
            upload_dir=self.search_params.logs.output_dir)
    else:
        sync_config = None

    analysis = tune.run(
        run_or_experiment=self._get_train_fn(
            fast_dev_run=fast_dev_run, include_gpus=use_gpus),
        name=self.search_params.exp.get_project_exp_name(),
        stop=self.get_tune_stopper(self.search_params.opt.num_epochs),
        config=search_dict,
        resources_per_trial=self.get_resources_per_trial(
            self.search_params, include_gpu=use_gpus),
        num_samples=self.tune_hp.num_hp_samples,
        sync_config=sync_config,
        loggers=self.get_tune_loggers(),
        log_to_file=self.tune_hp.log_to_file and not self.tune_hp.ray_local_mode,
        keep_checkpoints_num=2,
        checkpoint_score_attr=f'{self.search_params.opt.search_mode}-{self.search_params.opt.search_metric}',
        fail_fast=False,
        scheduler=self.get_tune_scheduler(self.search_params, self.tune_hp),
        verbose=2,
        progress_reporter=self.get_cli_reporter(),
        reuse_actors=False,
    )

    utils.hprint("done with tune.run")

    param_hash = self.search_params.get_short_hash(num_chars=8)
    analysis_file = self.search_params.logs.output_dir / f'tune_analysis_{param_hash}.cloudpickle'
    print(f"Saving {analysis_file}")
    utils.save_cloudpickle(analysis_file, analysis)

    best_trial = analysis.get_best_trial(
        self.search_params.opt.search_metric,
        self.search_params.opt.search_mode, "last-5-avg")
    utils.hprint('best_trial.last_result', do_include_pre_break_line=True)
    utils.print_dict(best_trial.last_result)

    utils.hprint('best_trial.config', do_include_pre_break_line=True)
    utils.print_dict(best_trial.config)
def testNoUploadDir(self):
    """No Upload Dir is given."""
    with self.assertRaises(AssertionError):
        [trial] = tune.run(
            "__fake",
            name="foo",
            max_failures=0,
            stop={"training_iteration": 1},
            sync_config=tune.SyncConfig(
                **{"sync_to_cloud": "echo {source} {target}"})).trials
def train():
    register_super_mario_env()
    client = WebClient(token=os.environ['SLACK_BOT_TOKEN'])
    ray.init(address="auto")

    def send_message(message):
        try:
            _ = client.chat_postMessage(channel='#notifications', text=message)
        except SlackApiError as e:
            print(f"Got an error: {e.response['error']}")

    try:
        tune.run(
            AmpedTrainer,
            config={
                "env": "super_mario",
                "framework": "torch",
                "num_workers": 4,
                "log_level": "INFO",
                "seed": 1337,
                "num_envs_per_worker": 5,
                "entropy_coeff": 0.01,
                "kl_coeff": 0.0,
                "num_sgd_iter": 2,
                "num_gpus": 1,
                "num_simulations": 10,
                # "train_batch_size": 256,
                # "sgd_minibatch_size": 128,
                # "batch_mode": "complete_episodes",
                "remote_worker_envs": True,
                # "ignore_worker_failures": True,
                # "num_cpus_per_worker": 1,
                # "sample_async": True,
                # "no_done_at_end": True,
                # "soft_horizon": True,
                "horizon": 256,
                # "rollout_fragment_length": 256,
            },
            sync_config=tune.SyncConfig(upload_dir="gs://amp-results"),
            stop={"training_iteration": 500},
            checkpoint_freq=500,
            raise_on_failed_trial=True,
            checkpoint_at_end=True,
            # resume=True,
        )
    except TuneError as e:
        print(e)
        send_message("The trial failed :(")
    finally:
        send_message("Trial over")
def run(
    base_config: Dict[str, Any],
    ray_server: str,
    init_kwargs: Dict[str, Any],
    exp_name: str,
    spec: Dict[str, Any],
) -> ray.tune.ExperimentAnalysis:
    ray.init(address=ray_server, **init_kwargs)

    # We have to register the function we're going to call with Ray.
    # We partially apply worker_fn, so it's different for each experiment.
    # Compute a hash based on the config to make sure it has a unique name!
    # Note Ray does let you pass a worker_fn directly without registering, but
    # then it registers using the function name (which may not be unique).
    cfg = {
        # ReadOnlyDict's aren't serializable: see sacred issue #499
        "base_config": utils.sacred_copy(base_config),
        "exp_name": exp_name,
    }
    cfg_str = json.dumps(cfg)
    hasher = hashlib.md5()  # we are not worried about security here
    hasher.update(cfg_str.encode("utf8"))
    cfg_hash = hasher.hexdigest()

    trainable_name = f"{worker_name}-{cfg_hash}"
    base_config = utils.sacred_copy(base_config)
    trainable_fn = functools.partial(worker_fn, base_config)
    tune.register_trainable(trainable_name, trainable_fn)

    exp_id = f"{ex.path}/{exp_name}/{utils.make_timestamp()}-{uuid.uuid4().hex}"
    spec = utils.sacred_copy(spec)

    # Disable TensorBoard logger: fails due to the spec containing string variables.
    tune_loggers = [tune.logger.JsonLogger, tune.logger.CSVLogger]

    sync_config = None
    if "sync_config" in spec:
        sync_config = tune.SyncConfig(**spec["sync_config"])

    try:
        result = tune.run(
            trainable_name,
            name=exp_id,
            config=spec["config"],
            sync_config=sync_config,
            loggers=tune_loggers,
            **spec["run_kwargs"],
        )
    finally:
        ray.shutdown()

    return result, exp_id
def testNoSync(self):
    """Sync should not run on a single node."""

    def sync_func(source, target):
        pass

    sync_config = tune.SyncConfig(sync_to_driver=sync_func)

    with patch.object(CommandBasedClient, "_execute") as mock_sync:
        [trial] = tune.run(
            "__fake",
            name="foo",
            max_failures=0,
            stop={"training_iteration": 1},
            sync_config=sync_config).trials
        self.assertEqual(mock_sync.call_count, 0)
def main(smoke_test: bool = False):
    ray.init(address="auto")

    num_samples = 100 if not smoke_test else 20
    results_per_second = 0.01
    trial_length_s = 300

    max_runtime = 1000

    timed_tune_run(
        name="result network overhead",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        resources_per_trial={"cpu": 2},  # One per node
        sync_config=tune.SyncConfig(syncer="auto"))
def main():
    ray.init(address="auto")

    num_samples = 200
    results_per_second = 0.01
    trial_length_s = 300

    max_runtime = 1000

    timed_tune_run(
        name="result network overhead",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        resources_per_trial={"cpu": 2},  # One per node
        sync_config=tune.SyncConfig(sync_to_driver=True))
def testCloudSyncPeriod(self):
    """Tests that changing SYNC_PERIOD affects syncing frequency."""
    tmpdir = tempfile.mkdtemp()

    def trainable(config):
        for i in range(10):
            time.sleep(1)
            tune.report(score=i)

    def counter(local, remote):
        count_file = os.path.join(tmpdir, "count.txt")
        if not os.path.exists(count_file):
            count = 0
        else:
            with open(count_file, "rb") as fp:
                count = pickle.load(fp)
        count += 1
        with open(count_file, "wb") as fp:
            pickle.dump(count, fp)

    sync_config = tune.SyncConfig(
        upload_dir="test", syncer=counter, sync_period=1)

    # This was originally set to 0.5
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0"
    self.addCleanup(
        lambda: os.environ.pop("TUNE_GLOBAL_CHECKPOINT_S", None))

    [trial] = tune.run(
        trainable,
        name="foo",
        max_failures=0,
        local_dir=tmpdir,
        stop={"training_iteration": 10},
        sync_config=sync_config,
    ).trials

    count_file = os.path.join(tmpdir, "count.txt")
    with open(count_file, "rb") as fp:
        count = pickle.load(fp)

    self.assertEqual(count, 12)

    shutil.rmtree(tmpdir)
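
Outside of a test, the same mechanism can plug in an arbitrary sync function. A hedged sketch assuming the same SyncConfig API, with a stubbed-out copy step and a placeholder bucket path:

import subprocess
from ray import tune

def custom_sync(local_dir, remote_dir):
    # stub: shell out to whatever copy tool you use, e.g. the AWS CLI
    subprocess.check_call(["aws", "s3", "sync", local_dir, remote_dir])

sync_config = tune.SyncConfig(
    upload_dir="s3://my-bucket/experiments",  # placeholder remote directory
    syncer=custom_sync,
    sync_period=60,  # sync at most once per minute
)
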
def run(self, fast_dev_run=False, use_gpus=False, log_to_file=False):
    search_dict = self.search_params.to_ray_tune_search_dict()
    # see tune.utils.UtilMonitor
    search_dict['log_sys_usage'] = True

    # noinspection PyTypeChecker
    analysis = tune.run(
        run_or_experiment=self._get_train_fn(
            fast_dev_run=fast_dev_run, include_gpus=use_gpus),
        name=self.search_params.exp.get_project_exp_name(),
        stop=self.get_tune_stopper(self.search_params.opt.num_epochs),
        config=search_dict,
        resources_per_trial=self.get_resources_per_trial(
            self.search_params, include_gpu=use_gpus),
        num_samples=self.search_params.tune.num_hp_samples,
        sync_config=tune.SyncConfig(
            upload_dir=self.search_params.metrics.output_dir),
        loggers=self.get_tune_loggers(),
        log_to_file=log_to_file,
        keep_checkpoints_num=2,
        checkpoint_score_attr=f'{self.search_params.opt.search_mode}-{self.search_params.opt.search_metric}',
        fail_fast=False,
        scheduler=self.get_tune_scheduler(self.search_params),
        verbose=2,
        progress_reporter=self.get_cli_reporter(),
        reuse_actors=False,
    )

    utils.hprint("done with tune.run")

    param_hash = self.search_params.get_short_hash(num_chars=8)
    analysis_file = self.search_params.metrics.output_dir / f'tune_analysis_{param_hash}.pkl'
    print(f"Saving {analysis_file}")
    utils.save_pickle(analysis_file, analysis)

    best_trial = analysis.get_best_trial(
        self.search_params.opt.search_metric,
        self.search_params.opt.search_mode, "last-5-avg")
    print(f'best_trial.last_result: {best_trial.last_result}')
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final search_metric: {}".format(
        best_trial.last_result[self.search_params.opt.search_metric]))
def main():
    ray.init(address="auto")

    num_samples = 16
    results_per_second = 1 / 60
    trial_length_s = 86400

    max_runtime = 90000

    timed_tune_run(
        name="long running large checkpoints",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=900,  # Once every 15 minutes
        checkpoint_size_b=int(3.75 * 1000**3),
        keep_checkpoints_num=2,  # 2 * 16 * 4 = 128 GB
        resources_per_trial={"cpu": 1},
        sync_config=tune.SyncConfig(sync_to_driver=True))
def main():
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"  # Tweak

    ray.init(address="auto")

    num_samples = 1000
    results_per_second = 0.5
    trial_length_s = 100

    max_runtime = 120

    if is_ray_cluster():
        # Add constant overhead for SSH connection
        max_runtime = 120

    timed_tune_run(
        name="result throughput cluster",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        sync_config=tune.SyncConfig(sync_to_driver=False))  # Tweak!
def main(bucket):
    secrets_file = os.path.join(
        os.path.dirname(__file__), "..", "aws_secrets.txt")
    if os.path.isfile(secrets_file):
        print(f"Loading AWS secrets from file {secrets_file}")

        from configparser import ConfigParser

        config = ConfigParser()
        config.read(secrets_file)

        for k, v in config.items():
            for x, y in v.items():
                var = str(x).upper()
                os.environ[var] = str(y)
    else:
        print("No AWS secrets file found.")

    ray.init(address="auto")

    num_samples = 16
    results_per_second = 10 / 60
    trial_length_s = 300

    max_runtime = 500

    timed_tune_run(
        name="durable trainable",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000**2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=tune.SyncConfig(
            sync_to_driver=False,
            upload_dir=f"s3://{bucket}/durable/",
        ))
def testCloudFunctions(self):
    tmpdir = tempfile.mkdtemp()
    tmpdir2 = tempfile.mkdtemp()
    os.mkdir(os.path.join(tmpdir2, "foo"))

    def sync_func(local, remote):
        for filename in glob.glob(os.path.join(local, "*.json")):
            shutil.copy(filename, remote)

    sync_config = tune.SyncConfig(upload_dir=tmpdir2, syncer=sync_func)
    [trial] = tune.run(
        "__fake",
        name="foo",
        max_failures=0,
        local_dir=tmpdir,
        stop={"training_iteration": 1},
        sync_config=sync_config).trials

    test_file_path = glob.glob(os.path.join(tmpdir2, "foo", "*.json"))
    self.assertTrue(test_file_path)
    shutil.rmtree(tmpdir)
    shutil.rmtree(tmpdir2)
def main():
    ray.init(address="auto")

    num_samples = 16
    results_per_second = 10 / 60
    trial_length_s = 300

    max_runtime = 500

    timed_tune_run(
        name="durable trainable",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000**2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=tune.SyncConfig(
            sync_to_driver=False,
            upload_dir="s3://ray-tune-scalability-test/durable/",
        ))
def main(smoke_test: bool = False):
    ray.init(address="auto")

    num_samples = 16
    results_per_second = 1 / 60
    trial_length_s = 86400 if not smoke_test else 3600

    max_runtime = 90000 if not smoke_test else 4200

    callback = ProgressCallback()

    timed_tune_run(
        name="long running large checkpoints",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=900,  # Once every 15 minutes
        checkpoint_size_b=int(0.75 * 1000**3),
        keep_checkpoints_num=2,  # 2 * 16 * 4 = 128 GB
        resources_per_trial={"cpu": 1},
        sync_config=tune.SyncConfig(syncer="auto"),
        callbacks=[callback])
def execute(
    self,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    # model_load_path=None,
    # model_resume_path=None,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=True,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    callbacks=None,
    backend=None,
    random_seed=default_random_seed,
    debug=False,
    **kwargs,
) -> RayTuneResults:
    if isinstance(dataset, str) and not has_remote_protocol(dataset) and not os.path.isabs(dataset):
        dataset = os.path.abspath(dataset)

    if isinstance(backend, str):
        backend = initialize_backend(backend)

    if gpus is not None:
        raise ValueError(
            "Parameter `gpus` is not supported when using Ray Tune. "
            "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
            "hyperopt config."
        )

    if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
        # Enforce fractional GPU utilization
        gpu_memory_limit = self.gpu_resources_per_trial

    hyperopt_dict = dict(
        config=config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=model_load_path,
        # model_resume_path=model_resume_path,
        eval_split=self.split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
    )

    mode = "min" if self.goal != MAXIMIZE else "max"
    metric = "metric_score"
    if self.search_alg_dict is not None:
        if TYPE not in self.search_alg_dict:
            logger.warning("WARNING: Kindly set type param for search_alg "
                           "to utilize Tune's Search Algorithms.")
            search_alg = None
        else:
            search_alg_type = self.search_alg_dict[TYPE]
            search_alg = tune.create_searcher(
                search_alg_type, metric=metric, mode=mode, **self.search_alg_dict)
    else:
        search_alg = None

    if self.max_concurrent_trials:
        assert (
            self.max_concurrent_trials > 0
        ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
        if isinstance(search_alg, BasicVariantGenerator) or search_alg is None:
            search_alg = BasicVariantGenerator(max_concurrent=self.max_concurrent_trials)
        elif isinstance(search_alg, ConcurrencyLimiter):
            raise ValueError(
                "You have specified `max_concurrent_trials`, but the search "
                "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                "by setting `max_concurrent_trials=None`."
            )
        else:
            search_alg = ConcurrencyLimiter(search_alg, max_concurrent=self.max_concurrent_trials)

    resources_per_trial = {
        "cpu": self._cpu_resources_per_trial_non_none,
        "gpu": self._gpu_resources_per_trial_non_none,
    }

    def run_experiment_trial(config, local_hyperopt_dict, checkpoint_dir=None):
        return self._run_experiment(
            config, checkpoint_dir, local_hyperopt_dict, self.decode_ctx, _is_ray_backend(backend)
        )

    tune_config = {}
    tune_callbacks = []
    for callback in callbacks or []:
        run_experiment_trial, tune_config = callback.prepare_ray_tune(
            run_experiment_trial,
            tune_config,
            tune_callbacks,
        )

    if _is_ray_backend(backend):
        # we can't set Trial actor's CPUs to 0 so we just go very low
        resources_per_trial = PlacementGroupFactory(
            [{"CPU": 0.001}] + ([{"CPU": 1, "GPU": 1}] * self._gpu_resources_per_trial_non_none)
            if self._gpu_resources_per_trial_non_none
            else [{"CPU": 0.001}] + [{"CPU": 1}] * self._cpu_resources_per_trial_non_none
        )

    if has_remote_protocol(output_directory):
        run_experiment_trial = tune.durable(run_experiment_trial)
        self.sync_config = tune.SyncConfig(sync_to_driver=False, upload_dir=output_directory)
        output_directory = None
    elif self.kubernetes_namespace:
        from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer

        self.sync_config = tune.SyncConfig(
            sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))

    run_experiment_trial_params = tune.with_parameters(
        run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
    register_trainable(
        f"trainable_func_f{hash_dict(config).decode('ascii')}", run_experiment_trial_params)

    analysis = tune.run(
        f"trainable_func_f{hash_dict(config).decode('ascii')}",
        config={
            **self.search_space,
            **tune_config,
        },
        scheduler=self.scheduler,
        search_alg=search_alg,
        num_samples=self.num_samples,
        keep_checkpoints_num=1,
        max_failures=1,  # retry a trial failure once
        resources_per_trial=resources_per_trial,
        time_budget_s=self.time_budget_s,
        sync_config=self.sync_config,
        local_dir=output_directory,
        metric=metric,
        mode=mode,
        trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
        trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
        callbacks=tune_callbacks,
    )

    if "metric_score" in analysis.results_df.columns:
        ordered_trials = analysis.results_df.sort_values(
            "metric_score", ascending=self.goal != MAXIMIZE)

        # Catch nans in edge case where the trial doesn't complete
        temp_ordered_trials = []
        for kwargs in ordered_trials.to_dict(orient="records"):
            for key in ["parameters", "training_stats", "eval_stats"]:
                if isinstance(kwargs[key], float):
                    kwargs[key] = {}
            temp_ordered_trials.append(kwargs)

        # Trials w/empty eval_stats fields & non-empty training_stats fields ran intermediate
        # tune.report call(s) but were terminated before reporting eval_stats from post-train
        # evaluation (e.g., trial stopped due to time budget or relatively poor performance.)
        # For any such trials, run model evaluation for the best model in that trial & record
        # results in ordered_trials which is returned & is persisted in hyperopt_statistics.json.
        for trial in temp_ordered_trials:
            if trial["eval_stats"] == "{}" and trial["training_stats"] != "{}":
                # Evaluate the best model on the eval_split, which is validation_set
                if validation_set is not None and validation_set.size > 0:
                    trial_path = trial["trial_dir"]
                    best_model_path = self._get_best_model_path(trial_path, analysis)
                    if best_model_path is not None:
                        self._evaluate_best_model(
                            trial,
                            trial_path,
                            best_model_path,
                            validation_set,
                            data_format,
                            skip_save_unprocessed_output,
                            skip_save_predictions,
                            skip_save_eval_stats,
                            gpus,
                            gpu_memory_limit,
                            allow_parallel_threads,
                            backend,
                            debug,
                        )
                    else:
                        logger.warning("Skipping evaluation as no model checkpoints were available")
                else:
                    logger.warning("Skipping evaluation as no validation set was provided")

        ordered_trials = [TrialResults.from_dict(load_json_values(kwargs)) for kwargs in temp_ordered_trials]
    else:
        logger.warning("No trials reported results; check if time budget lower than epoch latency")
        ordered_trials = []

    return RayTuneResults(ordered_trials=ordered_trials, experiment_analysis=analysis)
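
The two sync branches in the method above reduce to a simple rule for the older sync_to_driver-style API: with a remote output directory, upload directly from the trial nodes and skip driver syncing; on a Kubernetes cluster without a shared remote directory, route driver syncing through the namespaced syncer. A condensed sketch (the namespace and bucket are placeholders, not values from the snippet):

from ray import tune
from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer

# Remote output directory: upload directly, no rsync to the driver.
remote_sync_config = tune.SyncConfig(
    sync_to_driver=False,
    upload_dir="s3://my-bucket/hyperopt-results",  # placeholder
)

# Kubernetes cluster: sync trial directories pod-to-pod within the namespace.
k8s_sync_config = tune.SyncConfig(
    sync_to_driver=NamespacedKubernetesSyncer("my-namespace"),  # placeholder namespace
)
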
def tune_test(path,
              num_trials,
              num_workers,
              num_boost_rounds,
              num_files=0,
              regression=False,
              use_gpu=False,
              fake_data=False,
              smoke_test=False):
    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=0,
        num_actors=num_workers,
        cpus_per_actor=1,
        gpus_per_actor=0 if not use_gpu else 1)

    def local_train(config):
        temp_dir = None
        if fake_data or smoke_test:
            temp_dir = "/tmp/release_test_data"
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)

            os.makedirs(temp_dir, 0o755)
            local_path = os.path.join(temp_dir, "smoketest.parquet")

            create_parquet(
                filename=local_path,
                num_rows=args.num_workers * 500,
                num_features=4,
                num_classes=2,
                num_partitions=args.num_workers * 10)
        else:
            if not os.path.exists(path):
                raise ValueError(
                    f"Benchmarking data not found: {path}."
                    f"\nFIX THIS by running `python create_test_data.py` "
                    f"on all nodes first.")
            local_path = path

        xgboost_params = {
            "tree_method": "hist" if not use_gpu else "gpu_hist",
        }

        xgboost_params.update({
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        })

        xgboost_params.update(config)

        additional_results = {}

        bst, time_taken = train_ray(
            path=local_path,
            num_workers=num_workers,
            num_boost_rounds=num_boost_rounds,
            num_files=num_files,
            regression=regression,
            use_gpu=use_gpu,
            smoke_test=smoke_test,
            ray_params=ray_params,
            xgboost_params=xgboost_params,
            # kwargs
            additional_results=additional_results,
            callbacks=[PlacementCallback(), TuneReportCallback()])

        bst.save_model("tuned.xgb")

        trial_ips = []
        for rank, ips in enumerate(additional_results["callback_returns"]):
            for ip in ips:
                trial_ips.append(ip)

        tune_trial = get_trial_id()
        with tune.checkpoint_dir(num_boost_rounds + 1) as checkpoint_dir:
            with open(os.path.join(checkpoint_dir, "callback_returns.json"),
                      "wt") as f:
                json.dump({tune_trial: trial_ips}, f)

        if temp_dir:
            shutil.rmtree(temp_dir)

    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    analysis = tune.run(
        local_train,
        config=search_space,
        num_samples=num_trials,
        sync_config=tune.SyncConfig(sync_to_driver=DockerSyncer),
        resources_per_trial=ray_params.get_tune_resources())

    # In our PACK scheduling, we expect that each IP hosts only workers
    # for one Ray Tune trial.
    ip_to_trials = defaultdict(list)
    for trial in analysis.trials:
        with open(
                os.path.join(trial.checkpoint.value, "callback_returns.json"),
                "rt") as f:
            trial_to_ips = json.load(f)
        for tune_trial, ips in trial_to_ips.items():
            for node_ip in ips:
                ip_to_trials[node_ip].append(tune_trial)

    fail = False
    for ip, trial_ids in ip_to_trials.items():
        print(f"For IP {ip} got trial IDs {trial_ids}")
        fail = fail or any(trial_id != trial_ids[0] for trial_id in trial_ids)

    if fail:
        raise ValueError("Different trial IDs found on same node.")
    else:
        print("Success.")