Exemplo n.º 1
0
    def testTrainingContinuationKilled(self):
        """Check that training runs to completion after one actor died.

        ``xgboost_ray.main._shutdown`` is patched so that the actors handed
        to it can be inspected once ``train`` has returned.
        """
        captured = {}
        extra_results = {}

        def record_actors(actors, *args, **kwargs):
            # Snapshot whatever is passed to the shutdown hook. Returning
            # DEFAULT makes the mock fall back to its normal return value.
            captured["actors"] = actors.copy()
            return DEFAULT

        with patch("xgboost_ray.main._shutdown", side_effect=record_actors):
            dtrain = RayDMatrix(self.x, self.y)
            kill_callbacks = [_kill_callback(self.die_lock_file)]
            ray_params = RayParams(max_actor_restarts=1, num_actors=2)
            booster = train(
                self.params,
                dtrain,
                callbacks=kill_callbacks,
                num_boost_round=20,
                ray_params=ray_params,
                additional_results=extra_results)

        self.assertEqual(20, get_num_trees(booster))

        predictions = booster.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(predictions))
        print(f"Got correct predictions: {predictions}")

        # Training has to end with two working actors.
        final_actors = captured["actors"]
        for rank in (0, 1):
            self.assertTrue(final_actors[rank])

        # Both workers finished, hence the expected total of 32.
        self.assertEqual(extra_results["total_n"], 32)
Exemplo n.º 2
0
    def testTrainingContinuationElasticMultiKilled(self):
        """Expect all 20 boosting rounds to be present after two failures."""
        logging.getLogger().setLevel(logging.DEBUG)

        extra_results = {}

        dtrain = RayDMatrix(self.x, self.y)
        # One kill per actor rank, each using its own lock file and a
        # different iteration.
        kill_callbacks = [
            _kill_callback(
                self.die_lock_file, fail_iteration=6, actor_rank=0),
            _kill_callback(
                self.die_lock_file_2, fail_iteration=14, actor_rank=1),
        ]
        elastic_params = RayParams(
            max_actor_restarts=2,
            num_actors=2,
            elastic_training=True,
            max_failed_actors=2)

        booster = train(
            self.params,
            dtrain,
            callbacks=kill_callbacks,
            num_boost_round=20,
            ray_params=elastic_params,
            additional_results=extra_results)

        self.assertEqual(20, get_num_trees(booster))

        predictions = booster.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(predictions))
        print(f"Got correct predictions: {predictions}")
Exemplo n.º 3
0
    def testTrainPredict(self, init=True, remote=None, **ray_param_dict):
        """Train with evaluation and predict.

        Args:
            init: When true, call ``ray.init`` with 2 CPUs and no GPUs
                before training.
            remote: Forwarded unchanged as ``_remote`` to both ``train``
                and ``predict``.
            **ray_param_dict: Extra keyword arguments passed to every
                ``RayParams`` constructed by this test.
        """
        if init:
            ray.init(num_cpus=2, num_gpus=0)

        dtrain = RayDMatrix(self.x, self.y)

        evals_result = {}
        bst = train(self.params,
                    dtrain,
                    num_boost_round=38,
                    ray_params=RayParams(num_actors=2, **ray_param_dict),
                    evals=[(dtrain, "dtrain")],
                    evals_result=evals_result,
                    _remote=remote)

        self.assertEqual(get_num_trees(bst), 38)

        # assertIn reports the missing key and the container on failure,
        # unlike assertTrue(... in ...), which only says "False is not true".
        self.assertIn("dtrain", evals_result)

        x_mat = RayDMatrix(self.x)
        pred_y = predict(bst,
                         x_mat,
                         ray_params=RayParams(num_actors=2, **ray_param_dict),
                         _remote=remote)
        self.assertSequenceEqual(list(self.y), list(pred_y))
    def testTrainingContinuationElasticKilledRestarted(self):
        """Training should continue after one actor died and restart it.

        A ``FaultToleranceManager`` is told to kill rank 0 at boost round 6
        and to delay the return of rank 1 between boost rounds 12 and 21.
        ``xgboost_ray.main._shutdown`` is patched so the final actors can be
        inspected.
        """
        logging.getLogger().setLevel(logging.DEBUG)

        fault_manager = FaultToleranceManager.remote()
        fault_manager.schedule_kill.remote(rank=0, boost_round=6)
        fault_manager.delay_return.remote(
            rank=1, start_boost_round=12, end_boost_round=21)

        loading_callback = DelayedLoadingCallback(
            fault_manager, reload_data=True, sleep_time=0.1)
        kill_callback = DieCallback(fault_manager, training_delay=0.25)

        captured = {}
        extra_results = {}

        def record_actors(actors, *args, **kwargs):
            # Snapshot whatever is passed to the shutdown hook. Returning
            # DEFAULT makes the mock fall back to its normal return value.
            captured["actors"] = actors.copy()
            return DEFAULT

        with patch("xgboost_ray.main._shutdown", side_effect=record_actors):
            dtrain = RayDMatrix(self.x, self.y)
            elastic_params = RayParams(
                max_actor_restarts=1,
                num_actors=2,
                elastic_training=True,
                max_failed_actors=1,
                distributed_callbacks=[loading_callback])
            booster = train(
                self.params,
                dtrain,
                callbacks=[kill_callback],
                num_boost_round=20,
                ray_params=elastic_params,
                additional_results=extra_results)

        self.assertEqual(20, get_num_trees(booster))

        predictions = booster.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(predictions))
        print(f"Got correct predictions: {predictions}")

        # The first actor gets recreated, so both slots must be populated.
        final_actors = captured["actors"]
        for rank in (0, 1):
            self.assertTrue(final_actors[rank])

        # Both workers finished, so n=32
        self.assertEqual(extra_results["total_n"], 32)
Exemplo n.º 5
0
    def testTrainingContinuationElasticKilledRestarted(self):
        """Training should continue after one actor died and restart it.

        A kill callback fires at iteration 6 and two sleep callbacks pause
        training at iterations 7 (15 seconds) and 9 (5 seconds).
        ``xgboost_ray.main._shutdown`` is patched so the final actors can be
        inspected.
        """
        logging.getLogger().setLevel(logging.DEBUG)

        captured = {}
        extra_results = {}

        def record_actors(actors, *args, **kwargs):
            # Snapshot whatever is passed to the shutdown hook. Returning
            # DEFAULT makes the mock fall back to its normal return value.
            captured["actors"] = actors.copy()
            return DEFAULT

        with patch("xgboost_ray.main._shutdown", side_effect=record_actors):
            dtrain = RayDMatrix(self.x, self.y)
            # NOTE(review): the sleeps presumably give the killed actor time
            # to come back before training ends - confirm.
            training_callbacks = [
                _kill_callback(self.die_lock_file, fail_iteration=6),
                _sleep_callback(sleep_iteration=7, sleep_seconds=15),
                _sleep_callback(sleep_iteration=9, sleep_seconds=5),
            ]
            elastic_params = RayParams(
                max_actor_restarts=1,
                num_actors=2,
                elastic_training=True,
                max_failed_actors=1)
            booster = train(
                self.params,
                dtrain,
                callbacks=training_callbacks,
                num_boost_round=20,
                ray_params=elastic_params,
                additional_results=extra_results)

        self.assertEqual(20, get_num_trees(booster))

        predictions = booster.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(predictions))
        print(f"Got correct predictions: {predictions}")

        # The first actor gets recreated, so both slots must be populated.
        final_actors = captured["actors"]
        for rank in (0, 1):
            self.assertTrue(final_actors[rank])

        # Both workers finished, so n=32
        self.assertEqual(extra_results["total_n"], 32)
Exemplo n.º 6
0
    def testTrainPredict(self,
                         init=True,
                         remote=None,
                         softprob=False,
                         **ray_param_dict):
        """Train with evaluation and predict.

        Args:
            init: When true, call ``ray.init`` with 2 CPUs and no GPUs
                before training.
            remote: Forwarded unchanged as ``_remote`` to both ``train``
                and ``predict``.
            softprob: When true, train on a copy of ``self.params`` whose
                objective is ``multi:softprob``; the prediction must then
                have one column per unique label and is reduced with
                ``argmax`` over axis 1 before being compared to the labels.
            **ray_param_dict: Extra keyword arguments passed to every
                ``RayParams`` constructed by this test.
        """
        if init:
            ray.init(num_cpus=2, num_gpus=0)

        dtrain = RayDMatrix(self.x, self.y)

        params = self.params
        if softprob:
            # Copy first so the shared fixture parameters stay untouched.
            params = params.copy()
            params["objective"] = "multi:softprob"

        evals_result = {}
        bst = train(params,
                    dtrain,
                    num_boost_round=38,
                    ray_params=RayParams(num_actors=2, **ray_param_dict),
                    evals=[(dtrain, "dtrain")],
                    evals_result=evals_result,
                    _remote=remote)

        self.assertEqual(get_num_trees(bst), 38)

        # assertIn reports the missing key and the container on failure,
        # unlike assertTrue(... in ...), which only says "False is not true".
        self.assertIn("dtrain", evals_result)

        x_mat = RayDMatrix(self.x)
        pred_y = predict(bst,
                         x_mat,
                         ray_params=RayParams(num_actors=2, **ray_param_dict),
                         _remote=remote)

        if softprob:
            self.assertEqual(pred_y.shape[1], len(np.unique(self.y)))
            pred_y = np.argmax(pred_y, axis=1)

        self.assertSequenceEqual(list(self.y), list(pred_y))
    def testTrainingContinuationElasticKilled(self):
        """This should continue after one actor died.

        ``RXGB_ELASTIC_RESTART_DISABLED`` is set to ``"1"`` for the duration
        of the ``train`` call, and ``xgboost_ray.main._shutdown`` is patched
        so the final actors can be inspected. The killed actor is expected
        to stay absent (``None``) while the other one finishes.
        """
        logging.getLogger().setLevel(10)

        additional_results = {}
        keep_actors = {}

        def keep(actors, *args, **kwargs):
            # Snapshot whatever is passed to the shutdown hook. Returning
            # DEFAULT makes the mock fall back to its normal return value.
            keep_actors["actors"] = actors.copy()
            return DEFAULT

        # patch.dict restores os.environ on exit, so the flag cannot leak
        # into tests that run later in the same process.
        with patch.dict(os.environ, {"RXGB_ELASTIC_RESTART_DISABLED": "1"}):
            with patch("xgboost_ray.main._shutdown") as mocked:
                mocked.side_effect = keep
                bst = train(self.params,
                            RayDMatrix(self.x, self.y),
                            callbacks=[_kill_callback(self.die_lock_file)],
                            num_boost_round=20,
                            ray_params=RayParams(max_actor_restarts=1,
                                                 num_actors=2,
                                                 elastic_training=True,
                                                 max_failed_actors=1),
                            additional_results=additional_results)

        self.assertEqual(20, get_num_trees(bst))

        x_mat = xgb.DMatrix(self.x)
        pred_y = bst.predict(x_mat)
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

        actors = keep_actors["actors"]
        # First actor does not get recreated
        self.assertIsNone(actors[0])
        self.assertTrue(actors[1])

        # Only one worker finished, so n=16
        self.assertEqual(additional_results["total_n"], 16)