def after_iteration(self, model, epoch, evals_log):
    """Record the finished iteration with the fault-tolerance manager.

    Optionally sleeps to slow training down, and lets rank 0 advance the
    manager's boosting-round counter.
    """
    rank = get_actor_rank()
    # Block on ray.get so the manager's state is guaranteed to be
    # up to date before the next iteration starts.
    ray.get(self.ft_manager.log_iteration.remote(rank, epoch))
    if self.training_delay > 0:
        time.sleep(self.training_delay)
    if rank == 0:
        ray.get(self.ft_manager.inc_boost_round.remote(rank))
def after_iteration(self, model, epoch, evals_log):
    """Report progress and make the targeted rank exit at a set iteration.

    The queue receives ``(epoch, timestamp)`` each iteration; at
    ``fail_iteration`` the process exits once, guarded by a lock file so
    the failure is injected only on the first pass.
    """
    rank = get_actor_rank()
    if rank != actor_rank:
        return
    put_queue((epoch, time.time()))
    if epoch == fail_iteration and not os.path.exists(die_lock_file):
        # Create the lock file first so a restarted actor won't fail again.
        with open(die_lock_file, "wt") as fp:
            fp.write("")
        time.sleep(2)
        import sys
        print(f"Testing: Rank {get_actor_rank()} will now fail.")
        sys.exit(1)
def after_iteration(self, model, epoch, evals_log):
    """Report progress and SIGKILL the targeted rank at a set iteration.

    Unlike a clean ``sys.exit``, ``os.kill(pid, 9)`` simulates a hard
    crash. A lock file ensures the kill happens only once.
    """
    rank = get_actor_rank()
    if rank != actor_rank:
        return
    put_queue((epoch, time.time()))
    if epoch == fail_iteration and not os.path.exists(die_lock_file):
        # Get PID
        pid = os.getpid()
        print(f"Killing process: {pid}")
        # Write the lock file before dying so the restarted actor
        # does not kill itself again.
        with open(die_lock_file, "wt") as fp:
            fp.write("")
        time.sleep(2)
        print(f"Testing: Rank {get_actor_rank()} will now die.")
        os.kill(pid, 9)
def before_iteration(self, model, epoch, evals_log):
    """Hard-kill this actor's process if the FT manager marked it to die."""
    if not ray.get(self.ft_manager.should_die.remote(get_actor_rank())):
        return
    pid = os.getpid()
    print(f"Killing process: {pid}")
    print(f"Rank {get_actor_rank()} will now die.")
    time.sleep(1)
    os.kill(pid, 9)
    time.sleep(10)  # Don't continue training, just die
def after_iteration(self, model, epoch, evals_log):
    """At the configured iteration, let exactly one targeted rank crash.

    Shared state on the remote ``_state`` actor arbitrates so that only
    a single rank (the first to win ``set_failed``) SIGKILLs itself.
    """
    if epoch != self._iteration:
        return
    rank = get_actor_rank()
    if rank not in self._ranks:
        return
    # Skip if some rank has already failed for this test id.
    if ray.get(self._state.has_failed.remote(self._id)):
        return
    if not ray.get(self._state.set_failed.remote(self._id)):
        # Another rank is already about to fail
        return
    pid = os.getpid()
    print(f"Killing process: {pid} for actor rank {rank}")
    time.sleep(1)
    os.kill(pid, 9)
def after_iteration(self, model, epoch: int, evals_log: Dict):
    """Queue a checkpoint of the current model from the rank-0 actor only."""
    if get_actor_rank() != 0:
        return
    # Defer the actual checkpoint write to the driver via the queue.
    put_queue(lambda: self._create_checkpoint(
        model, epoch, self._filename, self._frequency))
def after_iteration(self, model, epoch: int, evals_log: Dict):
    """Queue a Tune metrics report from the rank-0 actor only."""
    if get_actor_rank() != 0:
        return
    # Build the metrics dict here; the queued lambda runs on the driver.
    report_dict = self._get_report_dict(evals_log)
    put_queue(lambda: tune.report(**report_dict))
def after_iteration(self, model, epoch: int, evals_log: Dict):
    """On rank 0, run checkpointing first, then Tune reporting."""
    if get_actor_rank() != 0:
        return
    self._checkpoint.after_iteration(model, epoch, evals_log)
    self._report.after_iteration(model, epoch, evals_log)
def before_iteration(self, model, epoch, evals_log):
    """Debug hook: rank 3 logs its iteration and queues the world size."""
    if get_actor_rank() != 3:
        return
    print(f"[Rank {get_actor_rank()}] I am at iteration {epoch}")
    put_queue(get_world_size())
def __call__(self, env):
    """Legacy (env-based) callback: rank 0 queues a Tune report of the
    evaluation results."""
    if get_actor_rank() != 0:
        return
    result_dict = dict(env.evaluation_result_list)
    put_queue(lambda: tune.report(**result_dict))
def after_iteration(self, model, epoch, evals_log):
    """Print and queue this actor's rank every iteration."""
    rank = get_actor_rank()
    print(f"My rank: {rank}")
    put_queue(("rank", rank))
def callback(env):
    """Legacy free-function callback: print and queue this actor's rank."""
    rank = get_actor_rank()
    print(f"My rank: {rank}")
    put_queue(("rank", rank))