    def testHasResourcesForTrialWithCaching(self):
        pgm = _PlacementGroupManager()
        pgf1 = PlacementGroupFactory([{"CPU": self.head_cpus}])
        pgf2 = PlacementGroupFactory([{"CPU": self.head_cpus - 1}])

        executor = RayTrialExecutor(reuse_actors=True)
        executor._pg_manager = pgm
        executor.set_max_pending_trials(1)

        def train(config):
            yield 1
            yield 2
            yield 3
            yield 4

        register_trainable("resettable", train)

        trial1 = Trial("resettable", placement_group_factory=pgf1)
        trial2 = Trial("resettable", placement_group_factory=pgf1)
        trial3 = Trial("resettable", placement_group_factory=pgf2)

        assert executor.has_resources_for_trial(trial1)
        assert executor.has_resources_for_trial(trial2)
        assert executor.has_resources_for_trial(trial3)

        executor._stage_and_update_status([trial1, trial2, trial3])

        while not pgm.has_ready(trial1):
            time.sleep(1)
            executor._stage_and_update_status([trial1, trial2, trial3])

        # Fill staging
        executor._stage_and_update_status([trial1, trial2, trial3])

        assert executor.has_resources_for_trial(trial1)
        assert executor.has_resources_for_trial(trial2)
        assert not executor.has_resources_for_trial(trial3)

        executor._start_trial(trial1)
        executor._stage_and_update_status([trial1, trial2, trial3])
        # Caches the PG and removes a PG from staging
        executor.pause_trial(trial1)

        assert len(pgm._staging_futures) == 0

        # This will re-schedule a placement group
        pgm.reconcile_placement_groups([trial1, trial2])

        assert len(pgm._staging_futures) == 1

        assert not pgm.can_stage()

        # We should still have resources for this trial as it has a cached PG
        assert executor.has_resources_for_trial(trial1)
        assert executor.has_resources_for_trial(trial2)
        assert not executor.has_resources_for_trial(trial3)
Example #2
    def example_resources_allocation_function(
        trial_runner: "trial_runner.TrialRunner",
        trial: Trial,
        result: Dict[str, Any],
        scheduler: "ResourceChangingScheduler",
    ) -> Optional[Union[PlacementGroupFactory, Resources]]:
        """This is a basic example of a resource allocating function.

        The function naively balances available CPUs over live trials.

        This function returns a new ``PlacementGroupFactory`` with updated
        resource requirements, or None. If the returned
        ``PlacementGroupFactory`` is equal by value to the one the
        trial has currently, the scheduler will skip the update process
        internally (same with None).

        See :class:`DistributeResources` for a more complex,
        robust approach.

        Args:
            trial_runner: Trial runner for this Tune run.
                Can be used to obtain information about other trials.
            trial: The trial to allocate new resources to.
            result: The latest results of trial.
            scheduler: The scheduler calling the function.
        """

        # Get base trial resources as defined in
        # ``tune.run(resources_per_trial)``
        base_trial_resource = scheduler._base_trial_resources

        # Don't bother if this is just the first iteration
        if result["training_iteration"] < 1:
            return None

        # default values if resources_per_trial is unspecified
        if base_trial_resource is None:
            base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

        # Assume that the number of CPUs cannot go below what was
        # specified in tune.run
        min_cpu = base_trial_resource.required_resources.get("CPU", 0)

        # Get the number of CPUs available in total (not just free)
        total_available_cpus = (
            trial_runner.trial_executor._resource_updater.get_num_cpus()
        )

        # Divide the total CPUs evenly among all live trials
        cpu_to_use = max(
            min_cpu, total_available_cpus // len(trial_runner.get_live_trials())
        )

        # Assign new CPUs to the trial in a PlacementGroupFactory
        return PlacementGroupFactory([{"CPU": cpu_to_use, "GPU": 0}])
Example #3
    def testResourceDeadlock(self):
        """Tests that resource deadlock is avoided for heterogeneous PGFs.

        We start 4 trials in a cluster with 2 CPUs. The first two trials
        require 1 CPU each, the third trial 2 CPUs, the fourth trial 1 CPU.

        The second trial needs a bit more time to finish. This means that the
        resources from the first trial will be freed, and the PG of the
        _fourth_ trial becomes ready (not that of the third trial, because that
        requires 2 CPUs - however, one is still occupied by trial 2).

        After the first two trials have finished, the FIFOScheduler tries to
        start the third trial. However, it can't be started because its
        placement group is not ready. Instead, the placement group of the
        fourth trial is ready, so we opt to run the fourth trial instead.
        """
        def train(config):
            time.sleep(config["sleep"])
            return 4

        ray.init(num_cpus=2)

        tune.register_trainable("het", train)
        pgf1 = PlacementGroupFactory([{"CPU": 1}])
        pgf2 = PlacementGroupFactory([{"CPU": 2}])

        trial1 = Trial("het",
                       config={"sleep": 0},
                       placement_group_factory=pgf1)
        trial2 = Trial("het",
                       config={"sleep": 2},
                       placement_group_factory=pgf1)
        trial3 = Trial("het",
                       config={"sleep": 0},
                       placement_group_factory=pgf2)
        trial4 = Trial("het",
                       config={"sleep": 0},
                       placement_group_factory=pgf1)

        runner = TrialRunner(fail_fast=True)
        runner.add_trial(trial1)
        runner.add_trial(trial2)
        runner.add_trial(trial3)
        runner.add_trial(trial4)

        timeout = time.monotonic() + 30
        while not runner.is_finished():
            # We enforce a timeout here
            self.assertLess(time.monotonic(),
                            timeout,
                            msg="Ran into a resource deadlock")

            runner.step()
Example #4
    def testPlacementGroupFactoryEquality(self):
        """
        Test that two different placement group factory objects are considered
        equal and evaluate to the same hash.
        """
        from collections import Counter

        pgf_1 = PlacementGroupFactory(
            [{"CPU": 2, "GPU": 4, "custom": 7}, {"GPU": 2, "custom": 1, "CPU": 3}],
            "PACK",
            "no_name",
            None,
        )

        pgf_2 = PlacementGroupFactory(
            [
                {
                    "custom": 7,
                    "GPU": 4,
                    "CPU": 2,
                },
                {"custom": 1, "GPU": 2, "CPU": 3},
            ],
            strategy="PACK",
            name="no_name",
            lifetime=None,
        )

        pgf_3 = PlacementGroupFactory(
            [
                {"custom": 7, "GPU": 4, "CPU": 2.0, "custom2": 0},
                {"custom": 1.0, "GPU": 2, "CPU": 3, "custom2": 0},
            ],
            strategy="PACK",
            name="no_name",
            lifetime=None,
        )

        self.assertEqual(pgf_1, pgf_2)
        self.assertEqual(pgf_2, pgf_3)

        # Hash testing
        counter = Counter()
        counter[pgf_1] += 1
        counter[pgf_2] += 1
        counter[pgf_3] += 1

        self.assertEqual(counter[pgf_1], 3)
        self.assertEqual(counter[pgf_2], 3)
        self.assertEqual(counter[pgf_3], 3)
Example #5
File: config.py  Project: parasj/ray
    def as_placement_group_factory(self) -> "PlacementGroupFactory":
        """Returns a PlacementGroupFactory to specify resources for Tune."""
        from ray.tune.execution.placement_groups import PlacementGroupFactory

        trainer_resources = self._trainer_resources_not_none
        trainer_bundle = [trainer_resources]
        worker_resources = {
            "CPU": self.num_cpus_per_worker,
            "GPU": self.num_gpus_per_worker,
        }
        worker_resources_extra = ({} if self.resources_per_worker is None else
                                  self.resources_per_worker)
        worker_bundles = [{
            **worker_resources,
            **worker_resources_extra
        } for _ in range(self.num_workers if self.num_workers else 0)]
        bundles = trainer_bundle + worker_bundles
        if self._max_cpu_fraction_per_node is not None:
            kwargs = {
                "_max_cpu_fraction_per_node": self._max_cpu_fraction_per_node,
            }
        else:
            kwargs = {}
        return PlacementGroupFactory(bundles,
                                     strategy=self.placement_strategy,
                                     **kwargs)
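A usage sketch for the method above, assuming it belongs to Ray AIR's ``ScalingConfig`` (the attributes it reads, e.g. ``num_cpus_per_worker`` and ``placement_strategy``, are consistent with that class); exact constructor arguments may differ between Ray versions.

from ray.air.config import ScalingConfig

# Assumed usage: turn a scaling config into a PlacementGroupFactory for Tune.
scaling = ScalingConfig(
    num_workers=2,
    use_gpu=False,
    resources_per_worker={"CPU": 2},
)
pgf = scaling.as_placement_group_factory()
# pgf can then be passed to Tune, e.g. tune.run(..., resources_per_trial=pgf)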
Example #6
    def testResourcesAvailableWithPlacementGroup(self):
        def train(config):
            tune.report(metric=0, resources=ray.available_resources())

        head_bundle = {"CPU": 1, "GPU": 0, "custom": 4}
        child_bundle = {"CPU": 2, "GPU": 1, "custom": 3}

        placement_group_factory = PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle]
        )

        out = tune.run(train, resources_per_trial=placement_group_factory)

        available = {
            key: val
            for key, val in out.trials[0].last_result["resources"].items()
            if key in ["CPU", "GPU", "custom"]
        }

        if not available:
            self.skipTest(
                "Warning: Ray reported no available resources, "
                "but this is an error on the Ray core side. "
                "Skipping this test for now."
            )

        self.assertDictEqual(
            available,
            {
                "CPU": self.head_cpus - 5.0,
                "GPU": self.head_gpus - 2.0,
                "custom": self.head_custom - 10.0,
            },
        )
Example #7
    def default_resource_request(cls, config):
        head_bundle = {"CPU": 1, "GPU": 0}
        child_bundle = {"CPU": 1}
        return PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle],
            strategy=config["placement_strategy"],
        )
Example #8
    def _get_resources_from_bundles(
        self, bundles: List[Dict[str, float]]
    ) -> Dict[str, float]:
        """Get total sums of resources in bundles"""
        if not bundles:
            return {"CPU": 0, "GPU": 0}
        pgf = PlacementGroupFactory(bundles)
        return pgf.required_resources
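For illustration, ``PlacementGroupFactory.required_resources`` (which the helper above relies on) flattens all bundles into a single dict of summed requirements; the exact numeric types in the result may vary.

from ray.tune.execution.placement_groups import PlacementGroupFactory

pgf = PlacementGroupFactory([{"CPU": 1, "GPU": 0}, {"CPU": 2, "GPU": 1}])
# Summed across bundles, so roughly {"CPU": 3, "GPU": 1}.
print(pgf.required_resources)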
Example #9
def test_placement_group_no_cpu_trainer():
    """Bundles with only GPU:1 but no CPU should work"""
    ray.init(num_gpus=1, num_cpus=1)
    pgf = PlacementGroupFactory([{"GPU": 1, "CPU": 0}, {"CPU": 1}])

    def train(config):
        time.sleep(1)
        return 5

    tune.run(train, resources_per_trial=pgf)
Example #10
    def default_resource_request(cls, config):
        cf = dict(cls.get_default_config(), **config)

        eval_config = cf["evaluation_config"]

        # Return PlacementGroupFactory containing all needed resources
        # (already properly defined as device bundles).
        return PlacementGroupFactory(
            bundles=[
                {
                    # Driver + Aggregation Workers:
                    # Force to be on same node to maximize data bandwidth
                    # between aggregation workers and the learner (driver).
                    # Aggregation workers tree-aggregate experiences collected
                    # from RolloutWorkers (n rollout workers map to m
                    # aggregation workers, where m < n) and always use 1 CPU
                    # each.
                    "CPU": cf["num_cpus_for_driver"] + cf["num_aggregation_workers"],
                    "GPU": 0 if cf["_fake_gpus"] else cf["num_gpus"],
                }
            ]
            + [
                {
                    # RolloutWorkers.
                    "CPU": cf["num_cpus_per_worker"],
                    "GPU": cf["num_gpus_per_worker"],
                    **cf["custom_resources_per_worker"],
                }
                for _ in range(cf["num_workers"])
            ]
            + (
                [
                    {
                        # Evaluation (remote) workers.
                        # Note: The local eval worker is located on the driver
                        # CPU or not even created iff >0 eval workers.
                        "CPU": eval_config.get(
                            "num_cpus_per_worker", cf["num_cpus_per_worker"]
                        ),
                        "GPU": eval_config.get(
                            "num_gpus_per_worker", cf["num_gpus_per_worker"]
                        ),
                        **eval_config.get(
                            "custom_resources_per_worker",
                            cf["custom_resources_per_worker"],
                        ),
                    }
                    for _ in range(cf["evaluation_num_workers"])
                ]
                if cf["evaluation_interval"]
                else []
            ),
            strategy=config.get("placement_strategy", "PACK"),
        )
Example #11
    def testNewResources(self):
        sched = ResourceChangingScheduler(resources_allocation_function=(
            lambda a, b, c, d: PlacementGroupFactory([{
                "CPU": 2
            }])))

        def train(config, checkpoint_dir=None):
            tune.report(metric=1, resources=tune.get_trial_resources())

        analysis = tune.run(
            train,
            scheduler=sched,
            stop={"training_iteration": 2},
            resources_per_trial=PlacementGroupFactory([{
                "CPU": 1
            }]),
            num_samples=1,
        )

        results_list = list(analysis.results.values())
        assert results_list[0]["resources"].head_cpus == 2.0
Example #12
    def default_resource_request(cls, config):
        cf = dict(cls.get_default_config(), **config)
        # Construct a dummy LeagueBuilder, such that it gets the opportunity to
        # adjust the multiagent config, according to its setup, and we can then
        # properly infer the resources to allocate.
        from_config(cf["league_builder_config"],
                    trainer=None,
                    trainer_config=cf)

        max_num_policies_to_train = cf["max_num_policies_to_train"] or len(
            cf["multiagent"].get("policies_to_train")
            or cf["multiagent"]["policies"])
        num_learner_shards = min(cf["num_gpus"] or max_num_policies_to_train,
                                 max_num_policies_to_train)
        num_gpus_per_shard = cf["num_gpus"] / num_learner_shards
        num_policies_per_shard = max_num_policies_to_train / num_learner_shards

        fake_gpus = cf["_fake_gpus"]

        eval_config = cf["evaluation_config"]

        # Return PlacementGroupFactory containing all needed resources
        # (already properly defined as device bundles).
        return PlacementGroupFactory(
            bundles=[{
                # Driver (no GPUs).
                "CPU": cf["num_cpus_for_driver"],
            }] + [
                {
                    # RolloutWorkers (no GPUs).
                    "CPU": cf["num_cpus_per_worker"],
                } for _ in range(cf["num_workers"])
            ] + [
                {
                    # Policy learners (and Replay buffer shards).
                    # 1 CPU for the replay buffer.
                    # 1 CPU (or fractional GPU) for each learning policy.
                    "CPU": 1 + (num_policies_per_shard if fake_gpus else 0),
                    "GPU": 0 if fake_gpus else num_gpus_per_shard,
                } for _ in range(num_learner_shards)
            ] + ([
                {
                    # Evaluation (remote) workers.
                    # Note: The local eval worker is located on the driver
                    # CPU or not even created iff >0 eval workers.
                    "CPU":
                    eval_config.get("num_cpus_per_worker",
                                    cf["num_cpus_per_worker"]),
                } for _ in range(cf["evaluation_num_workers"])
            ] if cf["evaluation_interval"] else []),
            strategy=config.get("placement_strategy", "PACK"),
        )
Example #13
def main():
    # __tune_begin__
    from ray import tune

    # Set config
    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9),
    }
    # __tune_end__

    # __tune_run_begin__
    analysis = tune.run(
        train_model,
        config=config,
        metric="eval-error",
        mode="min",
        num_samples=4,
        resources_per_trial=PlacementGroupFactory([{
            "CPU": 1.0
        }] + [{
            "CPU": float(num_cpus_per_actor)
        }] * num_actors),
    )

    # Load in the best performing model.
    best_bst = load_best_model(analysis.best_logdir)

    # Use the following code block instead if using Ray Client.
    # import ray
    # if ray.util.client.ray.is_connected():
    #     # If using Ray Client best_logdir is a directory on the server.
    #     # So we want to make sure we wrap model loading in a task.
    #     remote_load_fn = ray.remote(load_best_model)
    #     best_bst = ray.get(remote_load_fn.remote(analysis.best_logdir))

    # Do something with the best model.
    _ = best_bst

    accuracy = 1.0 - analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")
Example #14
    def default_resource_request(cls, config):
        cf = dict(cls.get_default_config(), **config)

        eval_config = cf["evaluation_config"]

        # Return PlacementGroupFactory containing all needed resources
        # (already properly defined as device bundles).
        return PlacementGroupFactory(
            bundles=[{
                # Local worker + replay buffer actors.
                # Force replay buffers to be on same node to maximize
                # data bandwidth between buffers and the learner (driver).
                # Replay buffer actors each contain one shard of the total
                # replay buffer and use 1 CPU each.
                "CPU":
                cf["num_cpus_for_driver"] +
                cf["optimizer"]["num_replay_buffer_shards"],
                "GPU":
                0 if cf["_fake_gpus"] else cf["num_gpus"],
            }] + [
                {
                    # RolloutWorkers.
                    "CPU": cf["num_cpus_per_worker"],
                    "GPU": cf["num_gpus_per_worker"],
                    **cf["custom_resources_per_worker"],
                } for _ in range(cf["num_workers"])
            ] + ([
                {
                    # Evaluation workers.
                    # Note: The local eval worker is located on the driver
                    # CPU.
                    "CPU":
                    eval_config.get("num_cpus_per_worker",
                                    cf["num_cpus_per_worker"]),
                    "GPU":
                    eval_config.get("num_gpus_per_worker",
                                    cf["num_gpus_per_worker"]),
                    **eval_config.get(
                        "custom_resources_per_worker",
                        cf["custom_resources_per_worker"],
                    ),
                } for _ in range(cf["evaluation_num_workers"])
            ] if cf["evaluation_interval"] else []),
            strategy=config.get("placement_strategy", "PACK"),
        )
Example #15
    def test_default_resource_request_plus_manual_leads_to_error(self):
        config = DEFAULT_CONFIG.copy()
        config["model"]["fcnet_hiddens"] = [10]
        config["num_workers"] = 0
        config["env"] = "CartPole-v0"

        try:
            tune.run(
                "PG",
                config=config,
                stop={"training_iteration": 2},
                resources_per_trial=PlacementGroupFactory([{
                    "CPU": 1
                }]),
                verbose=2,
            )
        except ValueError as e:
            assert "have been automatically set to" in e.args[0]
Example #16
    def testExtraCustomResources(self):
        ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
        # Since each trial will occupy all of the custom resources,
        # at most one trial can run at any given moment.
        snapshot = TrialStatusSnapshot()
        runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
        kwargs = {
            "stopping_criterion": {"training_iteration": 1},
            "placement_group_factory": PlacementGroupFactory([{"CPU": 1}, {"a": 2}]),
        }
        trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        while not runner.is_finished():
            runner.step()

        self.assertLess(snapshot.max_running_trials(), 2)
        self.assertTrue(snapshot.all_trials_are_terminated())
Example #17
    def testExtraResources(self):
        ray.init(num_cpus=4, num_gpus=2)
        snapshot = TrialStatusSnapshot()
        runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
        kwargs = {
            "stopping_criterion": {"training_iteration": 1},
            "placement_group_factory": PlacementGroupFactory(
                [{"CPU": 1}, {"CPU": 3, "GPU": 1}]
            ),
        }
        trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        while not runner.is_finished():
            runner.step()

        self.assertLess(snapshot.max_running_trials(), 2)
        self.assertTrue(snapshot.all_trials_are_terminated())
Example #18
    def as_placement_group_factory(self) -> "PlacementGroupFactory":
        """Returns a PlacementGroupFactory to specify resources for Tune."""
        from ray.tune.execution.placement_groups import PlacementGroupFactory

        trainer_resources = (self.trainer_resources
                             if self.trainer_resources else {
                                 "CPU": 1
                             })
        trainer_bundle = [trainer_resources]
        worker_resources = {
            "CPU": self.num_cpus_per_worker,
            "GPU": self.num_gpus_per_worker,
        }
        worker_resources_extra = ({} if self.resources_per_worker is None else
                                  self.resources_per_worker)
        worker_bundles = [{
            **worker_resources,
            **worker_resources_extra
        } for _ in range(self.num_workers if self.num_workers else 0)]
        bundles = trainer_bundle + worker_bundles
        return PlacementGroupFactory(bundles, strategy=self.placement_strategy)
Example #19
    def testWandbLoggerReporting(self):
        trial_config = {"par1": 4, "par2": 9.12345678}
        trial = Trial(
            trial_config,
            0,
            "trial_0",
            "trainable",
            PlacementGroupFactory([{
                "CPU": 1
            }]),
            "/tmp",
        )

        logger = WandbTestExperimentLogger(project="test_project",
                                           api_key="1234",
                                           excludes=["metric2"])
        logger.on_trial_start(0, [], trial)

        r1 = {
            "metric1": 0.8,
            "metric2": 1.4,
            "metric3": np.asarray(32.0),
            "metric4": np.float32(32.0),
            "const": "text",
            "config": trial_config,
        }

        logger.on_trial_result(0, [], trial, r1)

        logged = logger.trial_processes[trial].logs.get(timeout=10)
        self.assertIn("metric1", logged)
        self.assertNotIn("metric2", logged)
        self.assertIn("metric3", logged)
        self.assertIn("metric4", logged)
        self.assertNotIn("const", logged)
        self.assertNotIn("config", logged)

        del logger
Example #20
def tune_xgboost(use_class_trainable=True):
    search_space = {
        # You can mix constants with search space objects.
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": 9,
        "learning_rate": 1,
        "min_child_weight": tune.grid_search([2, 3]),
        "subsample": tune.grid_search([0.8, 0.9]),
        "colsample_bynode": tune.grid_search([0.8, 0.9]),
        "random_state": 1,
        "num_parallel_tree": 2000,
    }
    # This will enable aggressive early stopping of bad trials.
    base_scheduler = ASHAScheduler(
        max_t=16, grace_period=1, reduction_factor=2  # 16 training iterations
    )

    def example_resources_allocation_function(
        trial_runner: "trial_runner.TrialRunner",
        trial: Trial,
        result: Dict[str, Any],
        scheduler: "ResourceChangingScheduler",
    ) -> Optional[Union[PlacementGroupFactory, Resources]]:
        """This is a basic example of a resource allocating function.

        The function naively balances available CPUs over live trials.

        This function returns a new ``PlacementGroupFactory`` with updated
        resource requirements, or None. If the returned
        ``PlacementGroupFactory`` is equal by value to the one the
        trial has currently, the scheduler will skip the update process
        internally (same with None).

        See :class:`DistributeResources` for a more complex,
        robust approach.

        Args:
            trial_runner: Trial runner for this Tune run.
                Can be used to obtain information about other trials.
            trial: The trial to allocate new resources to.
            result: The latest results of trial.
            scheduler: The scheduler calling the function.
        """

        # Get base trial resources as defined in
        # ``tune.run(resources_per_trial)``
        base_trial_resource = scheduler._base_trial_resources

        # Don't bother if this is just the first iteration
        if result["training_iteration"] < 1:
            return None

        # default values if resources_per_trial is unspecified
        if base_trial_resource is None:
            base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

        # Assume that the number of CPUs cannot go below what was
        # specified in tune.run
        min_cpu = base_trial_resource.required_resources.get("CPU", 0)

        # Get the number of CPUs available in total (not just free)
        total_available_cpus = (
            trial_runner.trial_executor._resource_updater.get_num_cpus()
        )

        # Divide the total CPUs evenly among all live trials
        cpu_to_use = max(
            min_cpu, total_available_cpus // len(trial_runner.get_live_trials())
        )

        # Assign new CPUs to the trial in a PlacementGroupFactory
        return PlacementGroupFactory([{"CPU": cpu_to_use, "GPU": 0}])

    # You can either define your own resources_allocation_function, or
    # use the default one - DistributeResources

    # from ray.tune.schedulers.resource_changing_scheduler import \
    #    DistributeResources

    scheduler = ResourceChangingScheduler(
        base_scheduler=base_scheduler,
        resources_allocation_function=example_resources_allocation_function
        # resources_allocation_function=DistributeResources()  # default
    )

    if use_class_trainable:
        fn = BreastCancerTrainable
    else:
        fn = train_breast_cancer

    analysis = tune.run(
        fn,
        metric="eval-logloss",
        mode="min",
        resources_per_trial=PlacementGroupFactory([{"CPU": 1, "GPU": 0}]),
        config=search_space,
        num_samples=1,
        scheduler=scheduler,
        checkpoint_at_end=use_class_trainable,
    )

    if use_class_trainable:
        assert analysis.results_df["nthread"].max() > 1

    return analysis
Example #21
    def testWandbLoggerConfig(self):
        trial_config = {"par1": 4, "par2": 9.12345678}
        trial = Trial(
            trial_config,
            0,
            "trial_0",
            "trainable",
            PlacementGroupFactory([{
                "CPU": 1
            }]),
            "/tmp",
        )

        if WANDB_ENV_VAR in os.environ:
            del os.environ[WANDB_ENV_VAR]

        # No API key
        with self.assertRaises(ValueError):
            logger = WandbTestExperimentLogger(project="test_project")
            logger.setup()

        # API Key in config
        logger = WandbTestExperimentLogger(project="test_project",
                                           api_key="1234")
        logger.setup()
        self.assertEqual(os.environ[WANDB_ENV_VAR], "1234")

        del logger
        del os.environ[WANDB_ENV_VAR]

        # API Key file
        with tempfile.NamedTemporaryFile("wt") as fp:
            fp.write("5678")
            fp.flush()

            logger = WandbTestExperimentLogger(project="test_project",
                                               api_key_file=fp.name)
            logger.setup()
            self.assertEqual(os.environ[WANDB_ENV_VAR], "5678")

        del logger
        del os.environ[WANDB_ENV_VAR]

        # API Key in env
        os.environ[WANDB_ENV_VAR] = "9012"
        logger = WandbTestExperimentLogger(project="test_project")
        logger.setup()
        del logger

        # From now on, the API key is in the env variable.

        logger = WandbTestExperimentLogger(project="test_project")
        logger.log_trial_start(trial)

        self.assertEqual(logger.trial_processes[trial].kwargs["project"],
                         "test_project")
        self.assertEqual(logger.trial_processes[trial].kwargs["id"],
                         trial.trial_id)
        self.assertEqual(logger.trial_processes[trial].kwargs["name"],
                         trial.trial_name)
        self.assertEqual(logger.trial_processes[trial].kwargs["group"],
                         trial.trainable_name)
        self.assertIn("config", logger.trial_processes[trial]._exclude)

        del logger

        # log config.
        logger = WandbTestExperimentLogger(project="test_project",
                                           log_config=True)
        logger.log_trial_start(trial)
        self.assertNotIn("config", logger.trial_processes[trial]._exclude)
        self.assertNotIn("metric", logger.trial_processes[trial]._exclude)

        del logger

        # Exclude metric.
        logger = WandbTestExperimentLogger(project="test_project",
                                           excludes=["metric"])
        logger.log_trial_start(trial)
        self.assertIn("config", logger.trial_processes[trial]._exclude)
        self.assertIn("metric", logger.trial_processes[trial]._exclude)

        del logger
Example #22
    def testWandbDecoratorConfig(self):
        config = {"par1": 4, "par2": 9.12345678}
        trial = Trial(
            config,
            0,
            "trial_0",
            "trainable",
            PlacementGroupFactory([{
                "CPU": 1
            }]),
            "/tmp",
        )
        trial_info = _TrialInfo(trial)

        @wandb_mixin
        def train_fn(config):
            return 1

        train_fn.__mixins__ = (_MockWandbTrainableMixin, )

        config[TRIAL_INFO] = trial_info

        if WANDB_ENV_VAR in os.environ:
            del os.environ[WANDB_ENV_VAR]

        # Needs at least a project
        with self.assertRaises(ValueError):
            wrapped = wrap_function(train_fn)(config)

        # No API key
        config["wandb"] = {"project": "test_project"}
        with self.assertRaises(ValueError):
            wrapped = wrap_function(train_fn)(config)

        # API Key in config
        config["wandb"] = {"project": "test_project", "api_key": "1234"}
        wrapped = wrap_function(train_fn)(config)
        self.assertEqual(os.environ[WANDB_ENV_VAR], "1234")

        del os.environ[WANDB_ENV_VAR]

        # API Key file
        with tempfile.NamedTemporaryFile("wt") as fp:
            fp.write("5678")
            fp.flush()

            config["wandb"] = {
                "project": "test_project",
                "api_key_file": fp.name
            }

            wrapped = wrap_function(train_fn)(config)
            self.assertEqual(os.environ[WANDB_ENV_VAR], "5678")

        del os.environ[WANDB_ENV_VAR]

        # API Key in env
        os.environ[WANDB_ENV_VAR] = "9012"
        config["wandb"] = {"project": "test_project"}
        wrapped = wrap_function(train_fn)(config)

        # From now on, the API key is in the env variable.

        # Default configuration
        config["wandb"] = {"project": "test_project"}
        config[TRIAL_INFO] = trial_info

        wrapped = wrap_function(train_fn)(config)
        self.assertEqual(wrapped.wandb.kwargs["project"], "test_project")
        self.assertEqual(wrapped.wandb.kwargs["id"], trial.trial_id)
        self.assertEqual(wrapped.wandb.kwargs["name"], trial.trial_name)
Example #23
    def testWandbLoggerConfig(self):
        trial_config = {"par1": 4, "par2": 9.12345678}
        trial = Trial(
            trial_config,
            0,
            "trial_0",
            "trainable",
            PlacementGroupFactory([{
                "CPU": 1
            }]),
            "/tmp",
        )

        if WANDB_ENV_VAR in os.environ:
            del os.environ[WANDB_ENV_VAR]

        # Read project and group name from environment variable
        os.environ[WANDB_PROJECT_ENV_VAR] = "test_project_from_env_var"
        os.environ[WANDB_GROUP_ENV_VAR] = "test_group_from_env_var"
        logger = WandbTestExperimentLogger(api_key="1234")
        logger.setup()
        assert logger.project == "test_project_from_env_var"
        assert logger.group == "test_group_from_env_var"

        del logger
        del os.environ[WANDB_ENV_VAR]

        # No API key
        with self.assertRaises(ValueError):
            logger = WandbTestExperimentLogger(project="test_project")
            logger.setup()

        # API Key in config
        logger = WandbTestExperimentLogger(project="test_project",
                                           api_key="1234")
        logger.setup()
        self.assertEqual(os.environ[WANDB_ENV_VAR], "1234")

        del logger
        del os.environ[WANDB_ENV_VAR]

        # API Key file
        with tempfile.NamedTemporaryFile("wt") as fp:
            fp.write("5678")
            fp.flush()

            logger = WandbTestExperimentLogger(project="test_project",
                                               api_key_file=fp.name)
            logger.setup()
            self.assertEqual(os.environ[WANDB_ENV_VAR], "5678")

        del logger
        del os.environ[WANDB_ENV_VAR]

        # API Key from external hook
        os.environ[
            WANDB_SETUP_API_KEY_HOOK] = "ray._private.test_utils.wandb_setup_api_key_hook"
        logger = WandbTestExperimentLogger(project="test_project")
        logger.setup()
        self.assertEqual(os.environ[WANDB_ENV_VAR], "abcd")

        del logger
        del os.environ[WANDB_ENV_VAR]
        del os.environ[WANDB_SETUP_API_KEY_HOOK]

        # API Key in env
        os.environ[WANDB_ENV_VAR] = "9012"
        logger = WandbTestExperimentLogger(project="test_project")
        logger.setup()
        del logger

        # From now on, the API key is in the env variable.

        logger = WandbTestExperimentLogger(project="test_project")
        logger.log_trial_start(trial)

        self.assertEqual(logger.trial_processes[trial].kwargs["project"],
                         "test_project")
        self.assertEqual(logger.trial_processes[trial].kwargs["id"],
                         trial.trial_id)
        self.assertEqual(logger.trial_processes[trial].kwargs["name"],
                         trial.trial_name)
        self.assertEqual(logger.trial_processes[trial].kwargs["group"],
                         trial.trainable_name)
        self.assertIn("config", logger.trial_processes[trial]._exclude)

        del logger

        # log config.
        logger = WandbTestExperimentLogger(project="test_project",
                                           log_config=True)
        logger.log_trial_start(trial)
        self.assertNotIn("config", logger.trial_processes[trial]._exclude)
        self.assertNotIn("metric", logger.trial_processes[trial]._exclude)

        del logger

        # Exclude metric.
        logger = WandbTestExperimentLogger(project="test_project",
                                           excludes=["metric"])
        logger.log_trial_start(trial)
        self.assertIn("config", logger.trial_processes[trial]._exclude)
        self.assertIn("metric", logger.trial_processes[trial]._exclude)

        del logger
Example #24
    def __call__(
        self,
        trial_runner: "trial_runner.TrialRunner",
        trial: Trial,
        result: Dict[str, Any],
        scheduler: "ResourceChangingScheduler",
    ) -> Optional[PlacementGroupFactory]:
        """Run resource allocation logic.

        Returns a new ``PlacementGroupFactory`` with updated
        resource requirements, or None. If the returned
        ``PlacementGroupFactory`` is equal by value to the one the
        trial has currently, the scheduler will skip the update process
        internally (same with None).

        Args:
            trial_runner: Trial runner for this Tune run.
                Can be used to obtain information about other trials.
            trial: The trial to allocate new resources to.
            result: The latest results of trial.
            scheduler: The scheduler calling
                the function.
        """
        # Get base trial resources as defined in
        # ``tune.run(resources_per_trial)``
        base_trial_resource = scheduler.base_trial_resources

        if not self._validate(base_trial_resource=base_trial_resource, result=result):
            return None

        # default values if resources_per_trial is unspecified
        if base_trial_resource is None:
            base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

        if self.increase_by:
            increase_by = self.increase_by
            assert not self._is_bundle_empty(increase_by)
            assert increase_by.get("CPU", 0) >= 0 and increase_by.get("GPU", 0) >= 0
        elif self.add_bundles:
            increase_by = base_trial_resource.bundles[-1]
        elif base_trial_resource.bundles[0].get("GPU", 0):
            increase_by = {"GPU": 1}
        else:
            increase_by = {"CPU": 1}

        base_bundles = deepcopy(base_trial_resource.bundles)

        (
            total_available_cpus,
            total_available_gpus,
        ) = self._get_total_available_resources(trial_runner=trial_runner)

        all_trials = trial_runner.get_live_trials()

        used_cpus_and_gpus = [self._get_used_cpus_and_gpus(t) for t in all_trials]
        used_cpus, used_gpus = zip(*used_cpus_and_gpus)
        used_cpus = sum(used_cpus)
        used_gpus = sum(used_gpus)

        added_bundles = self._get_new_added_bundles(
            trial,
            all_trials,
            base_bundles,
            increase_by,
            total_available_cpus,
            total_available_gpus,
            used_cpus,
            used_gpus,
        )

        new_bundles = self._add_two_bundles(
            base_bundles, added_bundles, increase_by, False
        )

        pgf = PlacementGroupFactory(new_bundles)
        pgf._head_bundle_is_empty = base_trial_resource._head_bundle_is_empty
        return pgf
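The callable above appears to be ``DistributeResources`` (or a close variant). For reference, such a policy is normally instantiated and passed to ``ResourceChangingScheduler``; a sketch, assuming the constructor mirrors the attributes read above (``add_bundles``, ``increase_by``):

from ray.tune.schedulers.resource_changing_scheduler import (
    DistributeResources,
    ResourceChangingScheduler,
)

# Sketch only: grow trials by whole CPUs/GPUs instead of appending new bundles.
scheduler = ResourceChangingScheduler(
    resources_allocation_function=DistributeResources(add_bundles=False)
)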
Example #25
        T2 = SampleBatch.concat_samples(
            ray.get([w.sample.remote() for w in workers]))

        # Improve the policy using the T1 batch
        policy.learn_on_batch(T1)

        # Do some arbitrary updates based on the T2 batch
        policy.update_some_value(sum(T2["rewards"]))

        reporter(**collect_metrics(remote_workers=workers))


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)

    tune.run(
        training_workflow,
        resources_per_trial=PlacementGroupFactory(([{
            "CPU": 1,
            "GPU": 1 if args.gpu else 0
        }] + [{
            "CPU": 1
        }] * args.num_workers)),
        config={
            "num_workers": args.num_workers,
            "num_iters": args.num_iters,
        },
        verbose=1,
    )
Example #26
    def testPlacementGroupRequests(self, reuse_actors=False, scheduled=10):
        """In this test we try to start 10 trials but only have resources
        for 2. Placement groups should still be created and PENDING.

        Eventually they should be scheduled sequentially (i.e. in pairs
        of two)."""
        # Since we check placement groups at each step, set the reconciliation
        # interval to 0
        os.environ["TUNE_PLACEMENT_GROUP_RECON_INTERVAL"] = "0"

        def train(config):
            time.sleep(1)
            now = time.time()
            tune.report(end=now - config["start_time"])

        head_bundle = {"CPU": 4, "GPU": 0, "custom": 0}
        child_bundle = {"custom": 1}
        # Manually calculated number of parallel trials
        max_num_parallel = 2

        placement_group_factory = PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle])

        trial_executor = RayTrialExecutor(reuse_actors=reuse_actors)

        this = self

        class _TestCallback(Callback):
            def on_step_end(self, iteration, trials, **info):
                num_finished = len([
                    t for t in trials
                    if t.status == Trial.TERMINATED or t.status == Trial.ERROR
                ])

                num_staging = sum(
                    len(s)
                    for s in trial_executor._pg_manager._staging.values())
                num_ready = sum(
                    len(s) for s in trial_executor._pg_manager._ready.values())
                num_in_use = len(trial_executor._pg_manager._in_use_pgs)
                num_cached = len(trial_executor._pg_manager._cached_pgs)

                total_num_tracked = num_staging + num_ready + num_in_use + num_cached

                # All trials should be scheduled
                this.assertEqual(
                    scheduled,
                    min(scheduled, len(trials)),
                    msg=f"Num trials iter {iteration}",
                )

                # The following two tests were relaxed for reuse_actors=True
                # so that up to `max_num_parallel` more placement groups can
                # exist than we would expect. This is because caching
                # relies on reconciliation for cleanup to avoid overscheduling
                # of new placement groups.
                num_parallel_reuse = int(reuse_actors) * max_num_parallel

                # The number of PGs should decrease when trials finish
                this.assertGreaterEqual(
                    max(scheduled, len(trials)) - num_finished +
                    num_parallel_reuse,
                    total_num_tracked,
                    msg=f"Num tracked iter {iteration}",
                )

        start = time.time()
        out = tune.run(
            train,
            config={"start_time": start},
            resources_per_trial=placement_group_factory,
            num_samples=10,
            trial_executor=trial_executor,
            callbacks=[_TestCallback()],
            reuse_actors=reuse_actors,
            verbose=2,
        )

        trial_end_times = sorted(t.last_result["end"] for t in out.trials)
        print("Trial end times:", trial_end_times)
        max_diff = trial_end_times[-1] - trial_end_times[0]

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 3)

        # Some trials should have run in parallel
        # Todo: Re-enable when using buildkite
        # self.assertLess(max_diff, 10)

        self._assertCleanup(trial_executor)
Example #27
    def testPlacementGroupDistributedTraining(self, reuse_actors=False):
        """Run distributed training using placement groups.

        Each trial requests 4 CPUs and starts 4 remote training workers.
        """

        head_bundle = {"CPU": 1, "GPU": 0, "custom": 0}
        child_bundle = {"CPU": 1}

        placement_group_factory = PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle, child_bundle])

        @ray.remote
        class TrainingActor:
            def train(self, val):
                time.sleep(1)
                return val

        def train(config):
            base = config["base"]
            actors = [TrainingActor.remote() for _ in range(4)]
            futures = [
                actor.train.remote(base + 2 * i)
                for i, actor in enumerate(actors)
            ]
            results = ray.get(futures)

            end = time.time() - config["start_time"]
            tune.report(avg=np.mean(results), end=end)

        trial_executor = RayTrialExecutor(reuse_actors=reuse_actors)

        start = time.time()
        out = tune.run(
            train,
            config={
                "start_time": start,
                "base": tune.grid_search(list(range(0, 100, 10))),
            },
            resources_per_trial=placement_group_factory,
            num_samples=1,
            trial_executor=trial_executor,
            reuse_actors=reuse_actors,
            verbose=2,
        )

        avgs = sorted(t.last_result["avg"] for t in out.trials)
        self.assertSequenceEqual(avgs, list(range(3, 103, 10)))

        trial_end_times = sorted(t.last_result["end"] for t in out.trials)
        print("Trial end times:", trial_end_times)
        max_diff = trial_end_times[-1] - trial_end_times[0]

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 3)

        # Some trials should have run in parallel
        # Todo: Re-enable when using buildkite
        # self.assertLess(max_diff, 10)

        self._assertCleanup(trial_executor)