def testTrainingContinuationElasticMultiKilled(self):
    """Expect 20 boosting rounds even though two failures are injected.

    Two kill callbacks are registered: one targeting actor rank 0 at
    iteration 6 and one targeting actor rank 1 at iteration 14. The
    ``RayParams`` enable elastic training and allow up to two failed
    actors and two actor restarts.
    """
    # ``logging.DEBUG`` is numeric level 10: verbose root logging for this run.
    logging.getLogger().setLevel(logging.DEBUG)

    extra_results = {}
    kill_callbacks = [
        _kill_callback(self.die_lock_file, fail_iteration=6, actor_rank=0),
        _kill_callback(self.die_lock_file_2, fail_iteration=14, actor_rank=1),
    ]
    elastic_params = RayParams(
        max_actor_restarts=2,
        num_actors=2,
        elastic_training=True,
        max_failed_actors=2,
    )

    bst = train(
        self.params,
        RayDMatrix(self.x, self.y),
        callbacks=kill_callbacks,
        num_boost_round=20,
        ray_params=elastic_params,
        additional_results=extra_results,
    )

    # The final booster must contain every requested round.
    self.assertEqual(20, get_num_trees(bst))

    # The model is expected to reproduce the training labels exactly.
    predictions = bst.predict(xgb.DMatrix(self.x))
    self.assertSequenceEqual(list(self.y), list(predictions))
    print(f"Got correct predictions: {predictions}")
def testTrainingContinuationKilled(self):
    """Training should continue after one actor died.

    ``xgboost_ray.main._shutdown`` is patched so that the actor list it
    receives can be captured and inspected once ``train()`` has returned.
    """
    extra_results = {}
    captured = {}

    def _record_actors(actors, *args, **kwargs):
        # Snapshot the list handed to ``_shutdown``; returning DEFAULT makes
        # the mock fall back to its normal return value.
        captured["actors"] = actors.copy()
        return DEFAULT

    kill_callbacks = [_kill_callback(self.die_lock_file)]
    restart_params = RayParams(max_actor_restarts=1, num_actors=2)

    with patch("xgboost_ray.main._shutdown", side_effect=_record_actors):
        bst = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=kill_callbacks,
            num_boost_round=20,
            ray_params=restart_params,
            additional_results=extra_results,
        )

    self.assertEqual(20, get_num_trees(bst))

    predictions = bst.predict(xgb.DMatrix(self.x))
    self.assertSequenceEqual(list(self.y), list(predictions))
    print(f"Got correct predictions: {predictions}")

    # End with two working actors
    final_actors = captured["actors"]
    self.assertTrue(final_actors[0])
    self.assertTrue(final_actors[1])

    # Two workers finished, so N=32
    self.assertEqual(extra_results["total_n"], 32)
def testTrainingStop(self):
    """Training should stop after one actor died when restarts are disabled.

    With ``max_actor_restarts=0`` the call to ``train()`` is expected to
    raise a ``RuntimeError``.
    """
    with self.assertRaises(RuntimeError):
        # Everything is built inside the context so the scope of the
        # expected exception is unchanged.
        dtrain = RayDMatrix(self.x, self.y)
        kill_callbacks = [_kill_callback(self.die_lock_file)]
        no_restart_params = RayParams(max_actor_restarts=0, num_actors=2)
        train(
            self.params,
            dtrain,
            callbacks=kill_callbacks,
            num_boost_round=20,
            ray_params=no_restart_params,
        )
def testTrainingStopElastic(self):
    """Elastic training should stop with a ``RuntimeError``.

    Kill callbacks target actor rank 0 at iteration 3 and actor rank 1 at
    iteration 6, while ``RayParams`` only allows ``max_failed_actors=1``.
    NOTE(review): presumably the second failure is what exceeds the allowed
    number of failed actors and triggers the error -- confirm against
    ``train()``.
    """
    with self.assertRaises(RuntimeError):
        # Everything is built inside the context so the scope of the
        # expected exception is unchanged.
        dtrain = RayDMatrix(self.x, self.y)
        kill_callbacks = [
            _kill_callback(
                self.die_lock_file, actor_rank=0, fail_iteration=3),
            _kill_callback(
                self.die_lock_file_2, actor_rank=1, fail_iteration=6),
        ]
        elastic_params = RayParams(
            elastic_training=True,
            max_failed_actors=1,
            max_actor_restarts=1,
            num_actors=2,
        )
        train(
            self.params,
            dtrain,
            callbacks=kill_callbacks,
            num_boost_round=20,
            ray_params=elastic_params,
        )
def testTrainingContinuationElasticKilledRestarted(self):
    """Elastic training should continue after one actor died and restart it.

    A kill callback fires at iteration 6; two sleep callbacks (15 seconds at
    iteration 7, 5 seconds at iteration 9) are registered as well.
    NOTE(review): the sleeps presumably give the killed actor time to be
    rescheduled before training finishes -- confirm.
    """
    # ``logging.DEBUG`` is numeric level 10: verbose root logging for this run.
    logging.getLogger().setLevel(logging.DEBUG)

    extra_results = {}
    captured = {}

    def _record_actors(actors, *args, **kwargs):
        # Snapshot the list handed to ``_shutdown``; returning DEFAULT makes
        # the mock fall back to its normal return value.
        captured["actors"] = actors.copy()
        return DEFAULT

    test_callbacks = [
        _kill_callback(self.die_lock_file, fail_iteration=6),
        _sleep_callback(sleep_iteration=7, sleep_seconds=15),
        _sleep_callback(sleep_iteration=9, sleep_seconds=5),
    ]
    elastic_params = RayParams(
        max_actor_restarts=1,
        num_actors=2,
        elastic_training=True,
        max_failed_actors=1,
    )

    with patch("xgboost_ray.main._shutdown", side_effect=_record_actors):
        bst = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=test_callbacks,
            num_boost_round=20,
            ray_params=elastic_params,
            additional_results=extra_results,
        )

    self.assertEqual(20, get_num_trees(bst))

    predictions = bst.predict(xgb.DMatrix(self.x))
    self.assertSequenceEqual(list(self.y), list(predictions))
    print(f"Got correct predictions: {predictions}")

    # First actor gets recreated
    final_actors = captured["actors"]
    self.assertTrue(final_actors[0])
    self.assertTrue(final_actors[1])

    # Both workers finished, so n=32
    self.assertEqual(extra_results["total_n"], 32)
def testTrainingContinuationElasticKilled(self):
    """Elastic training should continue after one actor died.

    ``RXGB_ELASTIC_RESTART_DISABLED`` is set to ``"1"`` for the duration of
    the ``train()`` call only, so the setting cannot leak into other tests
    running in the same process. With it set, the test expects the killed
    actor not to be recreated.
    """
    logging.getLogger().setLevel(10)

    additional_results = {}
    keep_actors = {}

    def keep(actors, *args, **kwargs):
        # Snapshot the list handed to ``_shutdown``; returning DEFAULT makes
        # the mock fall back to its normal return value.
        keep_actors["actors"] = actors.copy()
        return DEFAULT

    # ``patch.dict`` restores the previous environment on exit, including
    # the case where the variable was not set at all beforehand.
    with patch.dict(os.environ, {"RXGB_ELASTIC_RESTART_DISABLED": "1"}):
        with patch("xgboost_ray.main._shutdown") as mocked:
            mocked.side_effect = keep
            bst = train(
                self.params,
                RayDMatrix(self.x, self.y),
                callbacks=[_kill_callback(self.die_lock_file)],
                num_boost_round=20,
                ray_params=RayParams(
                    max_actor_restarts=1,
                    num_actors=2,
                    elastic_training=True,
                    max_failed_actors=1),
                additional_results=additional_results)

    self.assertEqual(20, get_num_trees(bst))

    x_mat = xgb.DMatrix(self.x)
    pred_y = bst.predict(x_mat)
    self.assertSequenceEqual(list(self.y), list(pred_y))
    print(f"Got correct predictions: {pred_y}")

    actors = keep_actors["actors"]
    # First actor does not get recreated
    self.assertIsNone(actors[0])
    self.assertTrue(actors[1])

    # Only one worker finished, so n=16
    self.assertEqual(additional_results["total_n"], 16)