Example #1
    def testClusterSyncFunction(self):
        def sync_func_driver(source, target):
            assert ":" in source, "Source {} not a remote path.".format(source)
            assert ":" not in target, "Target is supposed to be local."
            with open(os.path.join(target, "test.log2"), "w") as f:
                print("writing to", f.name)
                f.write(source)

        sync_config = tune.SyncConfig(sync_to_driver=sync_func_driver,
                                      node_sync_period=5)

        [trial] = tune.run("__fake",
                           name="foo",
                           max_failures=0,
                           stop={
                               "training_iteration": 1
                           },
                           sync_config=sync_config).trials
        test_file_path = os.path.join(trial.logdir, "test.log2")
        self.assertFalse(os.path.exists(test_file_path))

        with patch("ray.services.get_node_ip_address") as mock_sync:
            mock_sync.return_value = "0.0.0.0"
            sync_config = tune.SyncConfig(sync_to_driver=sync_func_driver)
            [trial] = tune.run("__fake",
                               name="foo",
                               max_failures=0,
                               stop={
                                   "training_iteration": 1
                               },
                               sync_config=sync_config).trials
        test_file_path = os.path.join(trial.logdir, "test.log2")
        self.assertTrue(os.path.exists(test_file_path))
        os.remove(test_file_path)
Example #2
    def testCloudProperString(self):
        with self.assertRaises(ValueError):
            [trial] = tune.run(
                "__fake",
                name="foo",
                max_failures=0,
                stop={
                    "training_iteration": 1
                },
                sync_config=tune.SyncConfig(**{
                    "upload_dir": "test",
                    "syncer": "ls {target}"
                }),
            ).trials

        with self.assertRaises(ValueError):
            [trial] = tune.run(
                "__fake",
                name="foo",
                max_failures=0,
                stop={
                    "training_iteration": 1
                },
                sync_config=tune.SyncConfig(**{
                    "upload_dir": "test",
                    "syncer": "ls {source}"
                }),
            ).trials

        tmpdir = tempfile.mkdtemp()
        logfile = os.path.join(tmpdir, "test.log")

        [trial] = tune.run(
            "__fake",
            name="foo",
            max_failures=0,
            stop={
                "training_iteration": 1
            },
            sync_config=tune.SyncConfig(
                **{
                    "upload_dir": "test",
                    "syncer": "echo {source} {target} > " + logfile,
                }),
        ).trials
        with open(logfile) as f:
            lines = f.read()
            self.assertTrue("test" in lines)
        shutil.rmtree(tmpdir)
Example #3
def train():
    register_super_mario_env()

    ray.init(address="auto")
    _ = Counter.options(name="global_counter", max_concurrency=1).remote()

    tune.run(
        AmpedTrainer,
        config={
            "env": "super_mario",
            "framework": "torch",
            "num_workers": 1,
            "log_level": "INFO",
            "seed": 1337,
            "num_envs_per_worker": 3,
            "entropy_coeff": 0.01,
            "kl_coeff": 0.0,
            "train_batch_size": 256,
            "num_sgd_iter": 2,
            "num_simulations": 10,
            "batch_mode": "truncate_episodes",
            "remote_worker_envs": True,
            #"ignore_worker_failures": True,
            "num_gpus_per_worker": 1,
            # "num_cpus_per_worker": 1,
            "num_gpus": 1,
        },
        sync_config=tune.SyncConfig(upload_dir="gs://amp-results"),
        stop={"episodes_total": 100},
        checkpoint_freq=10,
        #checkpoint_at_end=True,
        #resume=True,
    )
Example #4
def train():
    register_super_mario_env()

    ray.init(address="auto")

    tune.run(
        PPOTrainer,
        config={
            "env": "super_mario",
            "framework": "torch",
            "num_workers": 4,
            "log_level": "INFO",
            "seed": 1337,
            "num_envs_per_worker": 5,
            "entropy_coeff": 0.01,
            "kl_coeff": 0.0,
            "num_sgd_iter": 2,
            "num_gpus": 1,
            "vf_share_layers": False,
        },
        sync_config=tune.SyncConfig(upload_dir="gs://amp-results"),
        stop={"training_iteration": 500},
        checkpoint_freq=500,
        checkpoint_at_end=True,
        #resume=True,
    )
Example #5
    def testCloudSyncPeriod(self):
        """Tests that changing CLOUD_SYNC_PERIOD affects syncing frequency."""
        tmpdir = tempfile.mkdtemp()

        def trainable(config):
            for i in range(10):
                time.sleep(1)
                tune.report(score=i)

        mock = unittest.mock.Mock()

        def counter(local, remote):
            mock()

        sync_config = tune.SyncConfig(upload_dir="test",
                                      sync_to_cloud=counter,
                                      cloud_sync_period=1)
        # This was originally set to 0.5
        os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0"
        self.addCleanup(
            lambda: os.environ.pop("TUNE_GLOBAL_CHECKPOINT_S", None))
        [trial] = tune.run(
            trainable,
            name="foo",
            max_failures=0,
            local_dir=tmpdir,
            stop={
                "training_iteration": 10
            },
            sync_config=sync_config,
        ).trials

        self.assertEqual(mock.call_count, 12)
        shutil.rmtree(tmpdir)
Example #6
    def testNoSyncToDriver(self):
        """Test that sync to driver is disabled"""

        class _Trial:
            def __init__(self, id, logdir):
                self.id = (id,)
                self.logdir = logdir

        trial = _Trial("0", "some_dir")

        sync_config = tune.SyncConfig(syncer=None)

        # Create syncer callbacks
        callbacks = create_default_callbacks([], sync_config, loggers=None)
        syncer_callback = callbacks[-1]

        # Sanity check that we got the syncer callback
        self.assertTrue(isinstance(syncer_callback, SyncerCallback))

        # Sync function should be false (no sync to driver)
        self.assertEqual(syncer_callback._sync_function, False)

        # Sync to driver is disabled, so this should be no-op
        trial_syncer = syncer_callback._get_trial_syncer(trial)
        self.assertEqual(trial_syncer.sync_client, NOOP)
Example #7
    def testSyncDetection(self):
        kubernetes_conf = {"provider": {"type": "kubernetes", "namespace": "test_ray"}}
        docker_conf = {"docker": {"image": "bogus"}, "provider": {"type": "aws"}}
        aws_conf = {"provider": {"type": "aws"}}

        with tempfile.TemporaryDirectory() as dir:
            kubernetes_file = os.path.join(dir, "kubernetes.yaml")
            with open(kubernetes_file, "wt") as fp:
                yaml.safe_dump(kubernetes_conf, fp)

            docker_file = os.path.join(dir, "docker.yaml")
            with open(docker_file, "wt") as fp:
                yaml.safe_dump(docker_conf, fp)

            aws_file = os.path.join(dir, "aws.yaml")
            with open(aws_file, "wt") as fp:
                yaml.safe_dump(aws_conf, fp)

            kubernetes_syncer = detect_cluster_syncer(None, kubernetes_file)
            self.assertTrue(issubclass(kubernetes_syncer, KubernetesSyncer))
            self.assertEqual(kubernetes_syncer._namespace, "test_ray")

            docker_syncer = detect_cluster_syncer(None, docker_file)
            self.assertTrue(issubclass(docker_syncer, DockerSyncer))

            aws_syncer = detect_cluster_syncer(None, aws_file)
            self.assertEqual(aws_syncer, None)

            # Should still return DockerSyncer, since it was passed explicitly
            syncer = detect_cluster_syncer(
                tune.SyncConfig(syncer=DockerSyncer), kubernetes_file
            )
            self.assertTrue(issubclass(syncer, DockerSyncer))
Example #8
def run_tune(
    sync_to_driver: bool,
    upload_dir: Optional[str] = None,
    durable: bool = False,
    experiment_name: str = "cloud_test",
    indicator_file: str = "/tmp/tune_cloud_indicator",
):
    num_cpus_per_trial = int(os.environ.get("TUNE_NUM_CPUS_PER_TRIAL", "2"))

    if durable:
        trainable = tune.durable(train)
    else:
        trainable = train

    tune.run(trainable,
             name=experiment_name,
             resume="AUTO",
             num_samples=4,
             config={
                 "max_iterations": 30,
                 "sleep_time": 5,
                 "checkpoint_freq": 2,
                 "score_multiplied": tune.randint(0, 100),
             },
             sync_config=tune.SyncConfig(
                 sync_to_driver=sync_to_driver,
                 upload_dir=upload_dir,
                 sync_on_checkpoint=True,
                 cloud_sync_period=0.5,
             ),
             keep_checkpoints_num=2,
             resources_per_trial={"cpu": num_cpus_per_trial},
             callbacks=[IndicatorCallback(indicator_file=indicator_file)],
             verbose=2)
Example #9
def main(bucket):
    secrets_file = os.path.join(os.path.dirname(__file__), "..", "aws_secrets.txt")
    if os.path.isfile(secrets_file):
        print(f"Loading AWS secrets from file {secrets_file}")

        from configparser import ConfigParser

        config = ConfigParser()
        config.read(secrets_file)

        for k, v in config.items():
            for x, y in v.items():
                var = str(x).upper()
                os.environ[var] = str(y)
    else:
        print("No AWS secrets file found. Loading from boto.")
        from boto3 import Session

        session = Session()
        credentials = session.get_credentials()
        current_credentials = credentials.get_frozen_credentials()

        os.environ["AWS_ACCESS_KEY_ID"] = current_credentials.access_key
        os.environ["AWS_SECRET_ACCESS_KEY"] = current_credentials.secret_key
        os.environ["AWS_SESSION_TOKEN"] = current_credentials.token

    if all(
        os.getenv(k, "")
        for k in [
            "AWS_ACCESS_KEY_ID",
            "AWS_SECRET_ACCESS_KEY",
            "AWS_SESSION_TOKEN",
        ]
    ):
        print("AWS secrets found in env.")
    else:
        print("Warning: No AWS secrets found in env!")

    ray.init(address="auto")

    num_samples = 16
    results_per_second = 10 / 60
    trial_length_s = 300

    max_runtime = 500

    timed_tune_run(
        name="durable trainable",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000 ** 2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=tune.SyncConfig(
            upload_dir=f"s3://{bucket}/durable/",
        ),
    )
Example #10
    def testSyncConfigDeprecation(self):
        with self.assertWarnsRegex(DeprecationWarning, expected_regex="sync_period"):
            sync_conf = tune.SyncConfig(node_sync_period=4, cloud_sync_period=8)
            self.assertEqual(sync_conf.sync_period, 4)

        with self.assertWarnsRegex(DeprecationWarning, expected_regex="sync_period"):
            sync_conf = tune.SyncConfig(node_sync_period=4)
            self.assertEqual(sync_conf.sync_period, 4)

        with self.assertWarnsRegex(DeprecationWarning, expected_regex="sync_period"):
            sync_conf = tune.SyncConfig(cloud_sync_period=8)
            self.assertEqual(sync_conf.sync_period, 8)

        with self.assertWarnsRegex(DeprecationWarning, expected_regex="syncer"):
            sync_conf = tune.SyncConfig(
                sync_to_driver="a", sync_to_cloud="b", upload_dir=None
            )
            self.assertEqual(sync_conf.syncer, "a")

        with self.assertWarnsRegex(DeprecationWarning, expected_regex="syncer"):
            sync_conf = tune.SyncConfig(
                sync_to_driver="a", sync_to_cloud="b", upload_dir="c"
            )
            self.assertEqual(sync_conf.syncer, "b")

        with self.assertWarnsRegex(DeprecationWarning, expected_regex="syncer"):
            sync_conf = tune.SyncConfig(sync_to_cloud="b", upload_dir=None)
            self.assertEqual(sync_conf.syncer, None)

        with self.assertWarnsRegex(DeprecationWarning, expected_regex="syncer"):
            sync_conf = tune.SyncConfig(sync_to_driver="a", upload_dir="c")
            self.assertEqual(sync_conf.syncer, None)
Example #11
    def testSyncConfigDeprecation(self):
        with self.assertRaisesRegex(DeprecationWarning,
                                    expected_regex="sync_period"):
            tune.SyncConfig(node_sync_period=4, cloud_sync_period=8)

        with self.assertRaisesRegex(DeprecationWarning,
                                    expected_regex="sync_period"):
            tune.SyncConfig(node_sync_period=4)

        with self.assertRaisesRegex(DeprecationWarning,
                                    expected_regex="sync_period"):
            tune.SyncConfig(cloud_sync_period=8)

        with self.assertRaisesRegex(DeprecationWarning,
                                    expected_regex="syncer"):
            tune.SyncConfig(sync_to_driver="a",
                            sync_to_cloud="b",
                            upload_dir=None)

        with self.assertRaisesRegex(DeprecationWarning,
                                    expected_regex="syncer"):
            tune.SyncConfig(sync_to_driver="a",
                            sync_to_cloud="b",
                            upload_dir="c")

        with self.assertRaisesRegex(DeprecationWarning,
                                    expected_regex="syncer"):
            tune.SyncConfig(sync_to_cloud="b", upload_dir=None)

        with self.assertRaisesRegex(DeprecationWarning,
                                    expected_regex="syncer"):
            tune.SyncConfig(sync_to_driver="a", upload_dir="c")
Example #12
def run_tune(
    no_syncer: bool,
    upload_dir: Optional[str] = None,
    experiment_name: str = "cloud_test",
    indicator_file: str = "/tmp/tune_cloud_indicator",
    trainable: str = "function",
    num_cpus_per_trial: int = 2,
):
    if trainable == "function":
        train = fn_trainable
        config = {
            "max_iterations": 100,
            "sleep_time": 5,
            "checkpoint_freq": 2,
            "score_multiplied": tune.randint(0, 100),
        }
        kwargs = {"resources_per_trial": {"cpu": num_cpus_per_trial}}
    elif trainable == "rllib_str" or trainable == "rllib_trainer":
        if trainable == "rllib_str":
            train = "PPO"
        else:
            train = PPO

        config = {
            "env": "CartPole-v1",
            "num_workers": 1,
            "num_envs_per_worker": 1,
            "callbacks": RLlibCallback,
        }
        kwargs = {
            "stop": {
                "training_iteration": 100
            },
            "checkpoint_freq": 2,
            "checkpoint_at_end": True,
        }
    else:
        raise RuntimeError(f"Unknown trainable: {trainable}")

    tune.run(
        train,
        name=experiment_name,
        resume="AUTO",
        num_samples=4,
        config=config,
        sync_config=tune.SyncConfig(
            syncer="auto" if not no_syncer else None,
            upload_dir=upload_dir,
            sync_on_checkpoint=True,
            sync_period=0.5,
        ),
        keep_checkpoints_num=2,
        callbacks=[IndicatorCallback(indicator_file=indicator_file)],
        verbose=2,
        **kwargs,
    )
Example #13
    def testClusterProperString(self):
        """Tests that invalid commands throw.."""
        with self.assertRaises(TuneError):
            # This raises ValueError because logger is init in safe zone.
            sync_config = tune.SyncConfig(syncer="ls {target}")
            [trial] = tune.run(
                "__fake",
                name="foo",
                max_failures=0,
                stop={
                    "training_iteration": 1
                },
                sync_config=sync_config,
            ).trials

        with self.assertRaises(TuneError):
            # This raises ValueError because logger is init in safe zone.
            sync_config = tune.SyncConfig(syncer="ls {source}")
            [trial] = tune.run(
                "__fake",
                name="foo",
                max_failures=0,
                sync_config=sync_config,
                stop={
                    "training_iteration": 1
                },
            ).trials

        with patch.object(CommandBasedClient, "_execute") as mock_fn:
            with patch("ray.tune.syncer.get_node_ip_address") as mock_sync:
                sync_config = tune.SyncConfig(syncer="echo {source} {target}")
                mock_sync.return_value = "0.0.0.0"
                [trial] = tune.run(
                    "__fake",
                    name="foo",
                    max_failures=0,
                    sync_config=sync_config,
                    stop={
                        "training_iteration": 1
                    },
                ).trials
                self.assertGreater(mock_fn.call_count, 0)
Example #14
    def run(self, fast_dev_run=False, use_gpus=False):
        utils.set_seeds(self.search_params.data.seed)

        search_dict = self.search_params.to_ray_tune_search_dict()
        # see tune.utils.UtilMonitor
        search_dict['log_sys_usage'] = True

        output_str = str(self.search_params.logs.output_dir)
        if output_str.startswith('s3://') or output_str.startswith(
                'gs://') or output_str.startswith('hdfs://'):
            sync_config = tune.SyncConfig(
                upload_dir=self.search_params.logs.output_dir)
        else:
            sync_config = None

        analysis = tune.run(
            run_or_experiment=self._get_train_fn(fast_dev_run=fast_dev_run,
                                                 include_gpus=use_gpus),
            name=self.search_params.exp.get_project_exp_name(),
            stop=self.get_tune_stopper(self.search_params.opt.num_epochs),
            config=search_dict,
            resources_per_trial=self.get_resources_per_trial(
                self.search_params, include_gpu=use_gpus),
            num_samples=self.tune_hp.num_hp_samples,
            sync_config=sync_config,
            loggers=self.get_tune_loggers(),
            log_to_file=self.tune_hp.log_to_file
            and not self.tune_hp.ray_local_mode,
            keep_checkpoints_num=2,
            checkpoint_score_attr=
            f'{self.search_params.opt.search_mode}-{self.search_params.opt.search_metric}',
            fail_fast=False,
            scheduler=self.get_tune_scheduler(self.search_params,
                                              self.tune_hp),
            verbose=2,
            progress_reporter=self.get_cli_reporter(),
            reuse_actors=False,
        )

        utils.hprint("done with tune.run")

        param_hash = self.search_params.get_short_hash(num_chars=8)
        analysis_file = self.search_params.logs.output_dir / f'tune_analysis_{param_hash}.cloudpickle'
        print(f"Saving {analysis_file}")
        utils.save_cloudpickle(analysis_file, analysis)

        best_trial = analysis.get_best_trial(
            self.search_params.opt.search_metric,
            self.search_params.opt.search_mode, "last-5-avg")
        utils.hprint('best_trial.last_result', do_include_pre_break_line=True)
        utils.print_dict(best_trial.last_result)

        utils.hprint('best_trial.config', do_include_pre_break_line=True)
        utils.print_dict(best_trial.config)
Example #15
    def testNoUploadDir(self):
        """No Upload Dir is given."""
        with self.assertRaises(AssertionError):
            [trial] = tune.run(
                "__fake",
                name="foo",
                max_failures=0,
                stop={
                    "training_iteration": 1
                },
                sync_config=tune.SyncConfig(
                    **{"sync_to_cloud": "echo {source} {target}"})).trials
Example #16
def train():
    register_super_mario_env()

    client = WebClient(token=os.environ['SLACK_BOT_TOKEN'])

    ray.init(address="auto")

    def send_message(message):
        try:
            _ = client.chat_postMessage(channel='#notifications', text=message)
        except SlackApiError as e:
            print(f"Got an error: {e.response['error']}")

    try:
        tune.run(
            AmpedTrainer,
            config={
                "env": "super_mario",
                "framework": "torch",
                "num_workers": 4,
                "log_level": "INFO",
                "seed": 1337,
                "num_envs_per_worker": 5,
                "entropy_coeff": 0.01,
                "kl_coeff": 0.0,
                "num_sgd_iter": 2,
                "num_gpus": 1,
                "num_simulations": 10,
                #"train_batch_size": 256,
                # "sgd_minibatch_size": 128,
                # "batch_mode": "complete_episodes",
                "remote_worker_envs": True,
                # "ignore_worker_failures": True,
                # "num_cpus_per_worker": 1,
                # "sample_async": True,
                # "no_done_at_end": True,
                # "soft_horizon": True,
                "horizon": 256,
                # "rollout_fragment_length": 256,
            },
            sync_config=tune.SyncConfig(upload_dir="gs://amp-results"),
            stop={"training_iteration": 500},
            checkpoint_freq=500,
            raise_on_failed_trial=True,
            checkpoint_at_end=True,
            #resume=True,
        )
    except TuneError as e:
        print(e)
        send_message("The trail failed :(")
    finally:
        send_message("Trial over")
Example #17
    def run(
        base_config: Dict[str, Any],
        ray_server: str,
        init_kwargs: Dict[str, Any],
        exp_name: str,
        spec: Dict[str, Any],
    ) -> ray.tune.ExperimentAnalysis:
        ray.init(address=ray_server, **init_kwargs)

        # We have to register the function we're going to call with Ray.
        # We partially apply worker_fn, so it's different for each experiment.
        # Compute a hash based on the config to make sure it has a unique name!
        # Note Ray does let you pass a worker_fn directly without registering, but then
        # it registers using the function name (which may not be unique).
        cfg = {
            # ReadOnlyDict's aren't serializable: see sacred issue #499
            "base_config": utils.sacred_copy(base_config),
            "exp_name": exp_name,
        }
        cfg_str = json.dumps(cfg)
        hasher = hashlib.md5()  # we are not worried about security here
        hasher.update(cfg_str.encode("utf8"))
        cfg_hash = hasher.hexdigest()

        trainable_name = f"{worker_name}-{cfg_hash}"
        base_config = utils.sacred_copy(base_config)
        trainable_fn = functools.partial(worker_fn, base_config)
        tune.register_trainable(trainable_name, trainable_fn)

        exp_id = f"{ex.path}/{exp_name}/{utils.make_timestamp()}-{uuid.uuid4().hex}"
        spec = utils.sacred_copy(spec)

        # Disable TensorBoard logger: fails due to the spec containing string variables.
        tune_loggers = [tune.logger.JsonLogger, tune.logger.CSVLogger]
        sync_config = None
        if "sync_config" in spec:
            sync_config = tune.SyncConfig(**spec["sync_config"])
        try:
            result = tune.run(
                trainable_name,
                name=exp_id,
                config=spec["config"],
                sync_config=sync_config,
                loggers=tune_loggers,
                **spec["run_kwargs"],
            )
        finally:
            ray.shutdown()

        return result, exp_id
Example #18
    def testNoSync(self):
        """Sync should not run on a single node."""
        def sync_func(source, target):
            pass

        sync_config = tune.SyncConfig(sync_to_driver=sync_func)

        with patch.object(CommandBasedClient, "_execute") as mock_sync:
            [trial] = tune.run("__fake",
                               name="foo",
                               max_failures=0,
                               stop={
                                   "training_iteration": 1
                               },
                               sync_config=sync_config).trials
            self.assertEqual(mock_sync.call_count, 0)
Example #19
def main(smoke_test: bool = False):
    ray.init(address="auto")

    num_samples = 100 if not smoke_test else 20
    results_per_second = 0.01
    trial_length_s = 300

    max_runtime = 1000

    timed_tune_run(
        name="result network overhead",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        resources_per_trial={"cpu": 2},  # One per node
        sync_config=tune.SyncConfig(syncer="auto"))
Example #20
def main():
    ray.init(address="auto")

    num_samples = 200
    results_per_second = 0.01
    trial_length_s = 300

    max_runtime = 1000

    timed_tune_run(
        name="result network overhead",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        resources_per_trial={"cpu": 2},  # One per node
        sync_config=tune.SyncConfig(sync_to_driver=True))
Example #21
    def testCloudSyncPeriod(self):
        """Tests that changing SYNC_PERIOD affects syncing frequency."""
        tmpdir = tempfile.mkdtemp()

        def trainable(config):
            for i in range(10):
                time.sleep(1)
                tune.report(score=i)

        def counter(local, remote):
            count_file = os.path.join(tmpdir, "count.txt")
            if not os.path.exists(count_file):
                count = 0
            else:
                with open(count_file, "rb") as fp:
                    count = pickle.load(fp)
            count += 1
            with open(count_file, "wb") as fp:
                pickle.dump(count, fp)

        sync_config = tune.SyncConfig(upload_dir="test",
                                      syncer=counter,
                                      sync_period=1)
        # This was originally set to 0.5
        os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0"
        self.addCleanup(
            lambda: os.environ.pop("TUNE_GLOBAL_CHECKPOINT_S", None))
        [trial] = tune.run(
            trainable,
            name="foo",
            max_failures=0,
            local_dir=tmpdir,
            stop={
                "training_iteration": 10
            },
            sync_config=sync_config,
        ).trials

        count_file = os.path.join(tmpdir, "count.txt")
        with open(count_file, "rb") as fp:
            count = pickle.load(fp)

        self.assertEqual(count, 12)
        shutil.rmtree(tmpdir)
Example #22
    def run(self, fast_dev_run=False, use_gpus=False, log_to_file=False):
        search_dict = self.search_params.to_ray_tune_search_dict()
        # see tune.utils.UtilMonitor
        search_dict['log_sys_usage'] = True

        # noinspection PyTypeChecker
        analysis = tune.run(
            run_or_experiment=self._get_train_fn(fast_dev_run=fast_dev_run,
                                                 include_gpus=use_gpus),
            name=self.search_params.exp.get_project_exp_name(),
            stop=self.get_tune_stopper(self.search_params.opt.num_epochs),
            config=search_dict,
            resources_per_trial=self.get_resources_per_trial(
                self.search_params, include_gpu=use_gpus),
            num_samples=self.search_params.tune.num_hp_samples,
            sync_config=tune.SyncConfig(
                upload_dir=self.search_params.metrics.output_dir),
            loggers=self.get_tune_loggers(),
            log_to_file=log_to_file,
            keep_checkpoints_num=2,
            checkpoint_score_attr=
            f'{self.search_params.opt.search_mode}-{self.search_params.opt.search_metric}',
            fail_fast=False,
            scheduler=self.get_tune_scheduler(self.search_params),
            verbose=2,
            progress_reporter=self.get_cli_reporter(),
            reuse_actors=False,
        )

        utils.hprint("done with tune.run")

        param_hash = self.search_params.get_short_hash(num_chars=8)
        analysis_file = self.search_params.metrics.output_dir / f'tune_analysis_{param_hash}.pkl'
        print(f"Saving {analysis_file}")
        utils.save_pickle(analysis_file, analysis)

        best_trial = analysis.get_best_trial(
            self.search_params.opt.search_metric,
            self.search_params.opt.search_mode, "last-5-avg")
        print(f'best_trial.last_result: {best_trial.last_result}')
        print("Best trial config: {}".format(best_trial.config))
        print("Best trial final search_metric: {}".format(
            best_trial.last_result[self.search_params.opt.search_metric]))
Example #23
def main():
    ray.init(address="auto")

    num_samples = 16
    results_per_second = 1 / 60
    trial_length_s = 86400

    max_runtime = 90000

    timed_tune_run(
        name="long running large checkpoints",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=900,  # Once every 15 minutes
        checkpoint_size_b=int(3.75 * 1000**3),
        keep_checkpoints_num=2,  # 2 * 16 * 4 = 128 GB
        resources_per_trial={"cpu": 1},
        sync_config=tune.SyncConfig(sync_to_driver=True))
Example #24
def main():
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"  # Tweak

    ray.init(address="auto")

    num_samples = 1000
    results_per_second = 0.5
    trial_length_s = 100

    max_runtime = 120

    if is_ray_cluster():
        # Add constant overhead for SSH connection
        max_runtime = 120

    timed_tune_run(name="result throughput cluster",
                   num_samples=num_samples,
                   results_per_second=results_per_second,
                   trial_length_s=trial_length_s,
                   max_runtime=max_runtime,
                   sync_config=tune.SyncConfig(sync_to_driver=False))  # Tweak!
Example #25
def main(bucket):
    secrets_file = os.path.join(os.path.dirname(__file__), "..",
                                "aws_secrets.txt")
    if os.path.isfile(secrets_file):
        print(f"Loading AWS secrets from file {secrets_file}")

        from configparser import ConfigParser
        config = ConfigParser()
        config.read(secrets_file)

        for k, v in config.items():
            for x, y in v.items():
                var = str(x).upper()
                os.environ[var] = str(y)
    else:
        print("No AWS secrets file found.")

    ray.init(address="auto")

    num_samples = 16
    results_per_second = 10 / 60
    trial_length_s = 300

    max_runtime = 500

    timed_tune_run(
        name="durable trainable",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000**2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=tune.SyncConfig(
            sync_to_driver=False,
            upload_dir=f"s3://{bucket}/durable/",
        ))
Example #26
    def testCloudFunctions(self):
        tmpdir = tempfile.mkdtemp()
        tmpdir2 = tempfile.mkdtemp()
        os.mkdir(os.path.join(tmpdir2, "foo"))

        def sync_func(local, remote):
            for filename in glob.glob(os.path.join(local, "*.json")):
                shutil.copy(filename, remote)

        sync_config = tune.SyncConfig(upload_dir=tmpdir2, syncer=sync_func)
        [trial] = tune.run("__fake",
                           name="foo",
                           max_failures=0,
                           local_dir=tmpdir,
                           stop={
                               "training_iteration": 1
                           },
                           sync_config=sync_config).trials
        test_file_path = glob.glob(os.path.join(tmpdir2, "foo", "*.json"))
        self.assertTrue(test_file_path)
        shutil.rmtree(tmpdir)
        shutil.rmtree(tmpdir2)
Example #27
def main():
    ray.init(address="auto")

    num_samples = 16
    results_per_second = 10 / 60
    trial_length_s = 300

    max_runtime = 500

    timed_tune_run(
        name="durable trainable",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000**2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=tune.SyncConfig(
            sync_to_driver=False,
            upload_dir="s3://ray-tune-scalability-test/durable/",
        ))
Example #28
def main(smoke_test: bool = False):
    ray.init(address="auto")

    num_samples = 16
    results_per_second = 1 / 60
    trial_length_s = 86400 if smoke_test else 3600

    max_runtime = 90000 if smoke_test else 4200

    callback = ProgressCallback()

    timed_tune_run(
        name="long running large checkpoints",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=900,  # Once every 15 minutes
        checkpoint_size_b=int(0.75 * 1000**3),
        keep_checkpoints_num=2,  # 2 * 16 * 4 = 128 GB
        resources_per_trial={"cpu": 1},
        sync_config=tune.SyncConfig(syncer="auto"),
        callbacks=[callback])
Example #29
    def execute(
        self,
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        callbacks=None,
        backend=None,
        random_seed=default_random_seed,
        debug=False,
        **kwargs,
    ) -> RayTuneResults:
        if isinstance(dataset, str) and not has_remote_protocol(dataset) and not os.path.isabs(dataset):
            dataset = os.path.abspath(dataset)

        if isinstance(backend, str):
            backend = initialize_backend(backend)

        if gpus is not None:
            raise ValueError(
                "Parameter `gpus` is not supported when using Ray Tune. "
                "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
                "hyperopt config."
            )

        if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
            # Enforce fractional GPU utilization
            gpu_memory_limit = self.gpu_resources_per_trial

        hyperopt_dict = dict(
            config=config,
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            experiment_name=experiment_name,
            model_name=model_name,
            # model_load_path=model_load_path,
            # model_resume_path=model_resume_path,
            eval_split=self.split,
            skip_save_training_description=skip_save_training_description,
            skip_save_training_statistics=skip_save_training_statistics,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            skip_save_processed_input=skip_save_processed_input,
            skip_save_unprocessed_output=skip_save_unprocessed_output,
            skip_save_predictions=skip_save_predictions,
            skip_save_eval_stats=skip_save_eval_stats,
            output_directory=output_directory,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            callbacks=callbacks,
            backend=backend,
            random_seed=random_seed,
            debug=debug,
        )

        mode = "min" if self.goal != MAXIMIZE else "max"
        metric = "metric_score"
        if self.search_alg_dict is not None:
            if TYPE not in self.search_alg_dict:
                logger.warning("WARNING: Kindly set type param for search_alg " "to utilize Tune's Search Algorithms.")
                search_alg = None
            else:
                search_alg_type = self.search_alg_dict[TYPE]
                search_alg = tune.create_searcher(search_alg_type, metric=metric, mode=mode, **self.search_alg_dict)
        else:
            search_alg = None

        if self.max_concurrent_trials:
            assert (
                self.max_concurrent_trials > 0
            ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
            if isinstance(search_alg, BasicVariantGenerator) or search_alg is None:
                search_alg = BasicVariantGenerator(max_concurrent=self.max_concurrent_trials)
            elif isinstance(search_alg, ConcurrencyLimiter):
                raise ValueError(
                    "You have specified `max_concurrent_trials`, but the search "
                    "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                    "by setting `max_concurrent_trials=None`."
                )
            else:
                search_alg = ConcurrencyLimiter(search_alg, max_concurrent=self.max_concurrent_trials)

        resources_per_trial = {
            "cpu": self._cpu_resources_per_trial_non_none,
            "gpu": self._gpu_resources_per_trial_non_none,
        }

        def run_experiment_trial(config, local_hyperopt_dict, checkpoint_dir=None):
            return self._run_experiment(
                config, checkpoint_dir, local_hyperopt_dict, self.decode_ctx, _is_ray_backend(backend)
            )

        tune_config = {}
        tune_callbacks = []
        for callback in callbacks or []:
            run_experiment_trial, tune_config = callback.prepare_ray_tune(
                run_experiment_trial,
                tune_config,
                tune_callbacks,
            )

        if _is_ray_backend(backend):
            # we can't set Trial actor's CPUs to 0 so we just go very low
            resources_per_trial = PlacementGroupFactory(
                [{"CPU": 0.001}] + ([{"CPU": 1, "GPU": 1}] * self._gpu_resources_per_trial_non_none)
                if self._gpu_resources_per_trial_non_none
                else [{"CPU": 0.001}] + [{"CPU": 1}] * self._cpu_resources_per_trial_non_none
            )

        if has_remote_protocol(output_directory):
            run_experiment_trial = tune.durable(run_experiment_trial)
            self.sync_config = tune.SyncConfig(sync_to_driver=False, upload_dir=output_directory)
            output_directory = None
        elif self.kubernetes_namespace:
            from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer

            self.sync_config = tune.SyncConfig(sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))

        run_experiment_trial_params = tune.with_parameters(run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
        register_trainable(f"trainable_func_f{hash_dict(config).decode('ascii')}", run_experiment_trial_params)

        analysis = tune.run(
            f"trainable_func_f{hash_dict(config).decode('ascii')}",
            config={
                **self.search_space,
                **tune_config,
            },
            scheduler=self.scheduler,
            search_alg=search_alg,
            num_samples=self.num_samples,
            keep_checkpoints_num=1,
            max_failures=1,  # retry a trial failure once
            resources_per_trial=resources_per_trial,
            time_budget_s=self.time_budget_s,
            sync_config=self.sync_config,
            local_dir=output_directory,
            metric=metric,
            mode=mode,
            trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
            trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
            callbacks=tune_callbacks,
        )

        if "metric_score" in analysis.results_df.columns:
            ordered_trials = analysis.results_df.sort_values("metric_score", ascending=self.goal != MAXIMIZE)

            # Catch nans in edge case where the trial doesn't complete
            temp_ordered_trials = []
            for kwargs in ordered_trials.to_dict(orient="records"):
                for key in ["parameters", "training_stats", "eval_stats"]:
                    if isinstance(kwargs[key], float):
                        kwargs[key] = {}
                temp_ordered_trials.append(kwargs)

            # Trials w/empty eval_stats fields & non-empty training_stats fields ran intermediate
            # tune.report call(s) but were terminated before reporting eval_stats from post-train
            # evaluation (e.g., trial stopped due to time budget or relatively poor performance).
            # For any such trials, run model evaluation for the best model in that trial & record
            # results in ordered_trials which is returned & is persisted in hyperopt_statistics.json.
            for trial in temp_ordered_trials:
                if trial["eval_stats"] == "{}" and trial["training_stats"] != "{}":
                    # Evaluate the best model on the eval_split, which is validation_set
                    if validation_set is not None and validation_set.size > 0:
                        trial_path = trial["trial_dir"]
                        best_model_path = self._get_best_model_path(trial_path, analysis)
                        if best_model_path is not None:
                            self._evaluate_best_model(
                                trial,
                                trial_path,
                                best_model_path,
                                validation_set,
                                data_format,
                                skip_save_unprocessed_output,
                                skip_save_predictions,
                                skip_save_eval_stats,
                                gpus,
                                gpu_memory_limit,
                                allow_parallel_threads,
                                backend,
                                debug,
                            )
                        else:
                            logger.warning("Skipping evaluation as no model checkpoints were available")
                    else:
                        logger.warning("Skipping evaluation as no validation set was provided")

            ordered_trials = [TrialResults.from_dict(load_json_values(kwargs)) for kwargs in temp_ordered_trials]
        else:
            logger.warning("No trials reported results; check if time budget lower than epoch latency")
            ordered_trials = []

        return RayTuneResults(ordered_trials=ordered_trials, experiment_analysis=analysis)
Example #30
def tune_test(path,
              num_trials,
              num_workers,
              num_boost_rounds,
              num_files=0,
              regression=False,
              use_gpu=False,
              fake_data=False,
              smoke_test=False):
    ray_params = RayParams(elastic_training=False,
                           max_actor_restarts=0,
                           num_actors=num_workers,
                           cpus_per_actor=1,
                           gpus_per_actor=0 if not use_gpu else 1)

    def local_train(config):
        temp_dir = None
        if fake_data or smoke_test:
            temp_dir = "/tmp/release_test_data"
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)

            os.makedirs(temp_dir, 0o755)
            local_path = os.path.join(temp_dir, "smoketest.parquet")

            create_parquet(filename=local_path,
                           num_rows=args.num_workers * 500,
                           num_features=4,
                           num_classes=2,
                           num_partitions=args.num_workers * 10)
        else:
            if not os.path.exists(path):
                raise ValueError(
                    f"Benchmarking data not found: {path}."
                    f"\nFIX THIS by running `python create_test_data.py` "
                    f"on all nodes first.")
            local_path = path

        xgboost_params = {
            "tree_method": "hist" if not use_gpu else "gpu_hist",
        }

        xgboost_params.update({
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        })

        xgboost_params.update(config)

        additional_results = {}

        bst, time_taken = train_ray(
            path=local_path,
            num_workers=num_workers,
            num_boost_rounds=num_boost_rounds,
            num_files=num_files,
            regression=regression,
            use_gpu=use_gpu,
            smoke_test=smoke_test,
            ray_params=ray_params,
            xgboost_params=xgboost_params,
            # kwargs
            additional_results=additional_results,
            callbacks=[PlacementCallback(),
                       TuneReportCallback()])

        bst.save_model("tuned.xgb")

        trial_ips = []
        for rank, ips in enumerate(additional_results["callback_returns"]):
            for ip in ips:
                trial_ips.append(ip)

        tune_trial = get_trial_id()
        with tune.checkpoint_dir(num_boost_rounds + 1) as checkpoint_dir:
            with open(os.path.join(checkpoint_dir, "callback_returns.json"),
                      "wt") as f:
                json.dump({tune_trial: trial_ips}, f)

        if temp_dir:
            shutil.rmtree(temp_dir)

    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    analysis = tune.run(
        local_train,
        config=search_space,
        num_samples=num_trials,
        sync_config=tune.SyncConfig(sync_to_driver=DockerSyncer),
        resources_per_trial=ray_params.get_tune_resources())

    # In our PACK scheduling, we expect that each IP hosts only workers
    # for one Ray Tune trial.
    ip_to_trials = defaultdict(list)
    for trial in analysis.trials:
        with open(
                os.path.join(trial.checkpoint.value, "callback_returns.json"),
                "rt") as f:
            trial_to_ips = json.load(f)
        for tune_trial, ips in trial_to_ips.items():
            for node_ip in ips:
                ip_to_trials[node_ip].append(tune_trial)

    fail = False
    for ip, trial_ids in ip_to_trials.items():
        print(f"For IP {ip} got trial IDs {trial_ids}")
        fail = fail or any(trial_id != trial_ids[0] for trial_id in trial_ids)

    if fail:
        raise ValueError("Different trial IDs found on same node.")
    else:
        print("Success.")