Example #1
    def testAllocateFreeResourcesWithIncreaseByTimes(self):
        scheduler = ResourceChangingScheduler(
            resources_allocation_function=DistributeResources(
                add_bundles=True, increase_by={"GPU": 2}, increase_by_times=2
            )
        )

        base_pgf = PlacementGroupFactory([{"CPU": 1}, {"GPU": 2}])
        trial1, trial2, trial3, trial4 = self._prepareTrials(scheduler, base_pgf)

        decision = scheduler.on_trial_result(
            self.trial_runner, trial1, {"metric": 1, "training_iteration": 4}
        )
        assert decision == TrialScheduler.CONTINUE

        # Terminating trial4 frees its resources, letting trial1 grow by one more GPU bundle.
        trial4.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 1}] + [{"GPU": 2}] * 2)
        )

        trial3.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial2, scheduler, PlacementGroupFactory([{"CPU": 1}] + [{"GPU": 2}] * 2)
        )

        trial2.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 1}] + [{"GPU": 2}] * 3)
        )
Example #2
    def testAllocateFreeResourcesWithIncreaseBy(self):
        scheduler = ResourceChangingScheduler(
            resources_allocation_function=DistributeResourcesToTopJob(
                add_bundles=False,
                increase_by={"CPU": 2, "GPU": 2},
                metric="metric",
                mode="max",
            )
        )

        base_pgf = PlacementGroupFactory([{"CPU": 2, "GPU": 2}])
        trial1, trial2, trial3, trial4 = self._prepareTrials(scheduler, base_pgf)

        decision = scheduler.on_trial_result(
            self.trial_runner, trial2, {"metric": 0.9, "training_iteration": 4}
        )
        assert decision == TrialScheduler.CONTINUE

        decision = scheduler.on_trial_result(
            self.trial_runner, trial1, {"metric": 1.0, "training_iteration": 4}
        )
        assert decision == TrialScheduler.CONTINUE

        trial4.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 4, "GPU": 4}])
        )

        decision = scheduler.on_trial_result(
            self.trial_runner, trial2, {"metric": 1.1, "training_iteration": 4}
        )
        assert decision == TrialScheduler.CONTINUE

        trial3.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial2,
            scheduler,
            PlacementGroupFactory([{"CPU": 4, "GPU": 4}]),
            metric=1.1,
        )

        trial2.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1,
            scheduler,
            PlacementGroupFactory([{"CPU": 8, "GPU": 8}]),
            metric=1.2,
        )
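The two built-in allocation policies used throughout these examples ship with the scheduler itself. As a reference sketch (import path as in Ray Tune 2.x; not part of the original tests):

from ray.tune.schedulers.resource_changing_scheduler import (
    ResourceChangingScheduler,
    DistributeResources,          # spread free resources evenly across running trials
    DistributeResourcesToTopJob,  # give free resources preferentially to the best-performing trial
)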
Example #3
def test_data_parallel_trainer(ray_start_8_cpus):
    num_workers = 2
    trainer = AssertingDataParallelTrainer(
        train_fn, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "num_epochs": 100,
                "metric": tune.grid_search([1, 2, 3, 4, 5]),
            }
        },
        tune_config=TuneConfig(
            mode="max",
            metric="metric",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
    # Baseline is num_workers worker CPUs + 1 CPU for the Trainable itself;
    # anything above that means the scheduler upscaled at least one trial.
    assert result_grid.get_dataframe()["num_cpus"].max() > num_workers + 1
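The train_fn and AssertingDataParallelTrainer referenced above are defined elsewhere in the test module. A minimal sketch of what such a training function could look like, assuming it reports the trial's current CPU count so that the final assertion has something to check (names and details are illustrative, not the test's actual implementation):

from ray.air import session


def train_fn(config):
    for epoch in range(config["num_epochs"]):
        # required_resources sums resources over all bundles of the trial's
        # placement group, so it grows when ResourceChangingScheduler upscales.
        current_cpus = session.get_trial_resources().required_resources.get("CPU", 0)
        session.report(
            {"metric": config["metric"] * epoch, "num_cpus": current_cpus}
        )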
Example #4
    def testAllocateFreeResources(self):
        scheduler = ResourceChangingScheduler(
            resources_allocation_function=DistributeResources(add_bundles=False)
        )

        base_pgf = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])
        trial1, trial2, trial3, trial4 = self._prepareTrials(scheduler, base_pgf)

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 2}])
        )
        self._allocateAndAssertNewResources(
            trial2, scheduler, PlacementGroupFactory([{"CPU": 2}])
        )

        # As trials terminate, their freed CPUs are redistributed to the remaining trials.
        trial4.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 3}])
        )

        trial3.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 4}])
        )

        trial2.status = Trial.TERMINATED

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 8}])
        )
Example #5
    def _create_scheduler(self, scheduler_config, parameters):
        if not scheduler_config:
            return None

        dynamic_resource_allocation = scheduler_config.pop("dynamic_resource_allocation", False)

        # Pop "type" so that only genuine constructor kwargs are forwarded to the
        # scheduler instantiated by tune.create_scheduler below.
        scheduler_type = scheduler_config.pop("type", None)

        if scheduler_type == "pbt":
            scheduler_config.update({"hyperparam_mutations": self.search_space})

        scheduler = tune.create_scheduler(scheduler_type, **scheduler_config)

        if dynamic_resource_allocation:
            # Wrap the base scheduler so trial resources can be re-sized at runtime.
            scheduler = ResourceChangingScheduler(scheduler, ray_resource_allocation_function)
        return scheduler
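ray_resource_allocation_function is imported from elsewhere in this project. A minimal sketch of what such a callable could look like, assuming it follows the resources_allocation_function contract of ResourceChangingScheduler (it receives the trial runner/controller, the trial, the latest result, and the scheduler, and returns a new PlacementGroupFactory, or None to leave resources unchanged); the policy below is purely illustrative:

from ray.tune.execution.placement_groups import PlacementGroupFactory


def ray_resource_allocation_function(trial_runner, trial, result, scheduler):
    # Illustrative policy: grant one extra CPU for every 10 training iterations,
    # on top of whatever the trial was originally configured with.
    base = scheduler.base_trial_resources or PlacementGroupFactory([{"CPU": 1}])
    base_cpus = base.required_resources.get("CPU", 1)
    extra_cpus = result.get("training_iteration", 0) // 10
    return PlacementGroupFactory([{"CPU": base_cpus + extra_cpus}])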
Example #6
    def testDeallocateResources(self):
        scheduler = ResourceChangingScheduler(
            resources_allocation_function=DistributeResourcesToTopJob(
                add_bundles=False, increase_by={"GPU": 2}, metric="metric", mode="max"
            )
        )

        base_pgf = PlacementGroupFactory([{"CPU": 1, "GPU": 2}])
        trial1, trial2, trial3, trial4 = self._prepareTrials(scheduler, base_pgf)
        trial1.placement_group_factory = PlacementGroupFactory([{"CPU": 1, "GPU": 4}])
        # With a PENDING trial waiting for resources, trial1 is scaled back down
        # to the base allocation so the pending trial can be placed.
        trial4.status = Trial.PENDING

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 1, "GPU": 2}])
        )
Example #7
    def testDeallocateResources(self):
        scheduler = ResourceChangingScheduler(
            resources_allocation_function=DistributeResources(
                add_bundles=True, increase_by={"GPU": 2}
            )
        )

        base_pgf = PlacementGroupFactory([{"CPU": 1}, {"GPU": 2}])
        trial1, trial2, trial3, trial4 = self._prepareTrials(scheduler, base_pgf)
        trial1.placement_group_factory = PlacementGroupFactory(
            [{"CPU": 1}] + [{"GPU": 2}] * 2
        )
        trial4.status = Trial.PENDING

        self._allocateAndAssertNewResources(
            trial1, scheduler, PlacementGroupFactory([{"CPU": 1}, {"GPU": 2}])
        )
Example #8
def test_gbdt_trainer(ray_start_8_cpus):
    data_raw = load_breast_cancer()
    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
    dataset_df["target"] = data_raw["target"]
    train_ds = ray.data.from_pandas(dataset_df).repartition(16)
    trainer = AssertingXGBoostTrainer(
        datasets={TRAIN_DATASET_KEY: train_ds},
        label_column="target",
        scaling_config=ScalingConfig(num_workers=2),
        params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss"],
        },
    )
    tuner = Tuner(
        trainer,
        param_space={
            "num_boost_round": 100,
            "params": {
                "eta": tune.grid_search([0.28, 0.29, 0.3, 0.31, 0.32]),
            },
        },
        tune_config=TuneConfig(
            mode="min",
            metric="train-logloss",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
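Both pytest-style examples above (test_data_parallel_trainer and test_gbdt_trainer) rely on a ray_start_8_cpus fixture defined elsewhere in the test suite. A minimal sketch of the usual pattern, assuming a standard local-cluster fixture:

import pytest
import ray


@pytest.fixture
def ray_start_8_cpus():
    # Start a local Ray cluster with 8 CPUs for the duration of the test.
    address_info = ray.init(num_cpus=8)
    yield address_info
    ray.shutdown()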