Python _kill_callback示例，xgboost_ray.tests.utils._kill_callback Python示例

示例#1

0

显示文件

文件： test_fault_tolerance.py 项目： ijrsvt/xgboost_ray

    def testTrainingContinuationElasticMultiKilled(self):
        """Elastic training with one kill callback per actor yields 20 trees.

        Kill callbacks are registered for rank 0 (iteration 6) and rank 1
        (iteration 14). With two restarts and two failed actors allowed, the
        returned booster must still hold 20 trees and its predictions on
        ``self.x`` must equal ``self.y``.
        """
        logging.getLogger().setLevel(logging.DEBUG)

        additional_results = {}

        dtrain = RayDMatrix(self.x, self.y)
        kill_callbacks = [
            _kill_callback(
                self.die_lock_file, fail_iteration=6, actor_rank=0),
            _kill_callback(
                self.die_lock_file_2, fail_iteration=14, actor_rank=1),
        ]
        ray_params = RayParams(
            max_actor_restarts=2,
            num_actors=2,
            elastic_training=True,
            max_failed_actors=2)

        bst = train(
            self.params,
            dtrain,
            callbacks=kill_callbacks,
            num_boost_round=20,
            ray_params=ray_params,
            additional_results=additional_results)

        self.assertEqual(20, get_num_trees(bst))

        # Predict on the same features the model was trained on.
        pred_y = bst.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

示例#2

0

显示文件

文件： test_fault_tolerance.py 项目： ijrsvt/xgboost_ray

    def testTrainingContinuationKilled(self):
        """Training with one allowed restart finishes with both actors alive.

        ``xgboost_ray.main._shutdown`` is patched so the actor list passed to
        it can be captured and inspected after ``train()`` returns.
        """
        additional_results = {}
        captured = {}

        def record_actors(actors, *args, **kwargs):
            # Snapshot the list; DEFAULT makes the mock return its normal
            # return value.
            captured["actors"] = actors.copy()
            return DEFAULT

        with patch("xgboost_ray.main._shutdown", side_effect=record_actors):
            dtrain = RayDMatrix(self.x, self.y)
            kill_callbacks = [_kill_callback(self.die_lock_file)]
            ray_params = RayParams(max_actor_restarts=1, num_actors=2)
            bst = train(
                self.params,
                dtrain,
                callbacks=kill_callbacks,
                num_boost_round=20,
                ray_params=ray_params,
                additional_results=additional_results)

        self.assertEqual(20, get_num_trees(bst))

        pred_y = bst.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

        # Both actor slots must be populated at shutdown time.
        final_actors = captured["actors"]
        self.assertTrue(final_actors[0])
        self.assertTrue(final_actors[1])

        # Two workers finished, so the reported total is expected to be 32.
        self.assertEqual(additional_results["total_n"], 32)

示例#3

0

显示文件

文件： test_fault_tolerance.py 项目： ijrsvt/xgboost_ray

 def testTrainingStop(self):
     """``train()`` must raise RuntimeError when no restarts are allowed.

     A kill callback is registered and ``max_actor_restarts`` is 0.
     """
     with self.assertRaises(RuntimeError):
         # Everything stays inside the context so the expected-exception
         # scope covers argument construction as well as the train() call.
         dtrain = RayDMatrix(self.x, self.y)
         kill_callbacks = [_kill_callback(self.die_lock_file)]
         ray_params = RayParams(max_actor_restarts=0, num_actors=2)
         train(
             self.params,
             dtrain,
             callbacks=kill_callbacks,
             num_boost_round=20,
             ray_params=ray_params)

示例#4

0

显示文件

文件： test_fault_tolerance.py 项目： ijrsvt/xgboost_ray

 def testTrainingStopElastic(self):
     """Elastic ``train()`` must raise RuntimeError in this configuration.

     Kill callbacks are registered for rank 0 (iteration 3) and rank 1
     (iteration 6), while ``max_failed_actors`` and ``max_actor_restarts``
     are both 1.
     """
     with self.assertRaises(RuntimeError):
         # Everything stays inside the context so the expected-exception
         # scope covers argument construction as well as the train() call.
         dtrain = RayDMatrix(self.x, self.y)
         kill_callbacks = [
             _kill_callback(
                 self.die_lock_file, actor_rank=0, fail_iteration=3),
             _kill_callback(
                 self.die_lock_file_2, actor_rank=1, fail_iteration=6),
         ]
         ray_params = RayParams(
             elastic_training=True,
             max_failed_actors=1,
             max_actor_restarts=1,
             num_actors=2)
         train(
             self.params,
             dtrain,
             callbacks=kill_callbacks,
             num_boost_round=20,
             ray_params=ray_params)

示例#5

0

显示文件

文件： test_fault_tolerance.py 项目： ijrsvt/xgboost_ray

    def testTrainingContinuationElasticKilledRestarted(self):
        """Elastic training ends with both actor slots populated.

        A kill callback at iteration 6 is combined with sleep callbacks at
        iterations 7 (15 s) and 9 (5 s). ``xgboost_ray.main._shutdown`` is
        patched so the final actor list can be captured and inspected.
        """
        logging.getLogger().setLevel(logging.DEBUG)

        additional_results = {}
        captured = {}

        def record_actors(actors, *args, **kwargs):
            # Snapshot the list; DEFAULT makes the mock return its normal
            # return value.
            captured["actors"] = actors.copy()
            return DEFAULT

        with patch("xgboost_ray.main._shutdown", side_effect=record_actors):
            dtrain = RayDMatrix(self.x, self.y)
            training_callbacks = [
                _kill_callback(self.die_lock_file, fail_iteration=6),
                _sleep_callback(sleep_iteration=7, sleep_seconds=15),
                _sleep_callback(sleep_iteration=9, sleep_seconds=5),
            ]
            ray_params = RayParams(
                max_actor_restarts=1,
                num_actors=2,
                elastic_training=True,
                max_failed_actors=1)
            bst = train(
                self.params,
                dtrain,
                callbacks=training_callbacks,
                num_boost_round=20,
                ray_params=ray_params,
                additional_results=additional_results)

        self.assertEqual(20, get_num_trees(bst))

        pred_y = bst.predict(xgb.DMatrix(self.x))
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

        final_actors = captured["actors"]

        # The first actor is expected to have been recreated, so both slots
        # are populated.
        self.assertTrue(final_actors[0])
        self.assertTrue(final_actors[1])

        # Both workers finished, so the reported total is expected to be 32.
        self.assertEqual(additional_results["total_n"], 32)

示例#6

0

显示文件

文件： test_fault_tolerance.py 项目： ray-project/xgboost_ray

    def testTrainingContinuationElasticKilled(self):
        """Elastic training continues with one actor when restarts are off.

        ``RXGB_ELASTIC_RESTART_DISABLED`` is set to ``"1"`` for the duration
        of the ``train()`` call only. ``xgboost_ray.main._shutdown`` is
        patched so the final actor list can be captured: slot 0 is expected
        to be ``None`` and slot 1 to be populated.
        """
        logging.getLogger().setLevel(logging.DEBUG)

        additional_results = {}
        keep_actors = {}

        def keep(actors, *args, **kwargs):
            # Snapshot the list; DEFAULT makes the mock return its normal
            # return value.
            keep_actors["actors"] = actors.copy()
            return DEFAULT

        # Scope the environment override to this test. Assigning to
        # os.environ directly would leave the flag set for every test that
        # runs afterwards in the same process.
        env_override = patch.dict(os.environ,
                                  {"RXGB_ELASTIC_RESTART_DISABLED": "1"})
        shutdown_patch = patch(
            "xgboost_ray.main._shutdown", side_effect=keep)

        with env_override, shutdown_patch:
            bst = train(self.params,
                        RayDMatrix(self.x, self.y),
                        callbacks=[_kill_callback(self.die_lock_file)],
                        num_boost_round=20,
                        ray_params=RayParams(max_actor_restarts=1,
                                             num_actors=2,
                                             elastic_training=True,
                                             max_failed_actors=1),
                        additional_results=additional_results)

        self.assertEqual(20, get_num_trees(bst))

        x_mat = xgb.DMatrix(self.x)
        pred_y = bst.predict(x_mat)
        self.assertSequenceEqual(list(self.y), list(pred_y))
        print(f"Got correct predictions: {pred_y}")

        actors = keep_actors["actors"]
        # First actor does not get recreated
        self.assertIsNone(actors[0])
        self.assertTrue(actors[1])

        # Only one worker finished, so n=16
        self.assertEqual(additional_results["total_n"], 16)