def after_iteration(self, model, epoch, evals_log):
    """Report progress from the target actor and make it exit once.

    When the current actor's rank equals ``actor_rank``, an
    ``(epoch, timestamp)`` tuple is passed to ``put_queue``. If, in
    addition, ``epoch`` equals ``fail_iteration`` and ``die_lock_file``
    does not exist yet, the lock file is created (empty), the method
    sleeps for two seconds, and the process exits with status 1.

    ``actor_rank``, ``fail_iteration`` and ``die_lock_file`` are free
    variables defined outside this method.

    Args:
        model: Unused here.
        epoch: Current iteration index.
        evals_log: Unused here.
    """
    if get_actor_rank() == actor_rank:
        put_queue((epoch, time.time()))

    should_fail = (
        get_actor_rank() == actor_rank
        and epoch == fail_iteration
        and not os.path.exists(die_lock_file)
    )
    if not should_fail:
        return

    # The lock file is written before exiting, so a later call that sees
    # the file will skip this branch.
    with open(die_lock_file, "wt") as lock_fp:
        lock_fp.write("")

    time.sleep(2)

    import sys

    print(f"Testing: Rank {get_actor_rank()} will now fail.")
    sys.exit(1)
def after_iteration(self, model, epoch, evals_log):
    """Report progress from the target actor and kill its process once.

    When the current actor's rank equals ``actor_rank``, an
    ``(epoch, timestamp)`` tuple is passed to ``put_queue``. If, in
    addition, ``epoch`` equals ``fail_iteration`` and ``die_lock_file``
    does not exist yet, the lock file is created (empty), the method
    sleeps for two seconds, and signal 9 is sent to the current process.

    ``actor_rank``, ``fail_iteration`` and ``die_lock_file`` are free
    variables defined outside this method.

    Args:
        model: Unused here.
        epoch: Current iteration index.
        evals_log: Unused here.
    """
    if get_actor_rank() == actor_rank:
        put_queue((epoch, time.time()))

    should_die = (
        get_actor_rank() == actor_rank
        and epoch == fail_iteration
        and not os.path.exists(die_lock_file)
    )
    if not should_die:
        return

    # Look up our own PID so that the signal below targets this process.
    pid = os.getpid()
    print(f"Killing process: {pid}")

    # The lock file is written before the kill, so a later call that sees
    # the file will skip this branch.
    with open(die_lock_file, "wt") as lock_fp:
        lock_fp.write("")

    time.sleep(2)
    print(f"Testing: Rank {get_actor_rank()} will now die.")
    # Signal number 9 (SIGKILL on POSIX systems).
    os.kill(pid, 9)
def after_iteration(self, model, epoch: int, evals_log: Dict):
    """Enqueue a checkpoint-creation callable when running on rank 0.

    On the rank-0 actor, a zero-argument callable is passed to
    ``put_queue``; when invoked it calls ``self._create_checkpoint`` with
    ``model``, ``epoch``, ``self._filename`` and ``self._frequency``.
    Other ranks do nothing.

    Args:
        model: Model forwarded to ``self._create_checkpoint``.
        epoch: Current iteration index, forwarded as well.
        evals_log: Unused here.
    """
    if get_actor_rank() != 0:
        return

    # The checkpoint is not created here; only the callable is queued.
    put_queue(
        lambda: self._create_checkpoint(
            model,
            epoch,
            self._filename,
            self._frequency,
        )
    )
def after_iteration(self, model, epoch: int, evals_log: Dict):
    """Enqueue a ``tune.report`` call when running on rank 0.

    On the rank-0 actor, the report dictionary is built from ``evals_log``
    via ``self._get_report_dict`` and a zero-argument callable that calls
    ``tune.report(**report)`` is passed to ``put_queue``. Other ranks do
    nothing.

    Args:
        model: Unused here.
        epoch: Unused here.
        evals_log: Evaluation log handed to ``self._get_report_dict``.
    """
    if get_actor_rank() != 0:
        return

    # Build the dictionary now; only the reporting itself is deferred.
    metrics = self._get_report_dict(evals_log)
    put_queue(lambda: tune.report(**metrics))
def before_training(self, model):
    """Put this node's IP address on the queue and return ``model``.

    Args:
        model: Returned unchanged.

    Returns:
        The ``model`` argument that was passed in.
    """
    put_queue(get_node_ip_address())
    return model
def after_iteration(self, model, epoch, evals_log):
    """Put the value of ``get_world_size()`` on the queue.

    Args:
        model: Unused here.
        epoch: Unused here.
        evals_log: Unused here.
    """
    world_size = get_world_size()
    put_queue(world_size)
def before_iteration(self, model, epoch, evals_log):
    """Log progress on rank 3 and put the world size on the queue.

    The actor whose rank is 3 prints the current iteration; every call
    then passes the value of ``get_world_size()`` to ``put_queue``.

    Args:
        model: Unused here.
        epoch: Current iteration index, used only in the log message.
        evals_log: Unused here.
    """
    on_rank_three = get_actor_rank() == 3
    if on_rank_three:
        print(f"[Rank {get_actor_rank()}] I am at iteration {epoch}")

    world_size = get_world_size()
    put_queue(world_size)
def __call__(self, env):
    """Enqueue a ``tune.report`` call with the evaluation results on rank 0.

    On the rank-0 actor, ``env.evaluation_result_list`` is converted to a
    dictionary and a zero-argument callable that calls
    ``tune.report(**results)`` is passed to ``put_queue``. Other ranks do
    nothing.

    Args:
        env: Object exposing ``evaluation_result_list``; ``dict()`` is
            applied to it, so it presumably holds key/value pairs.
    """
    if get_actor_rank() != 0:
        return

    # Snapshot the results now; only the reporting itself is deferred.
    metrics = dict(env.evaluation_result_list)
    put_queue(lambda: tune.report(**metrics))
def after_iteration(self, model, epoch, evals_log):
    """Print this actor's rank and put a ``("rank", rank)`` tuple on the queue.

    Args:
        model: Unused here.
        epoch: Unused here.
        evals_log: Unused here.
    """
    print(f"My rank: {get_actor_rank()}")

    rank_message = ("rank", get_actor_rank())
    put_queue(rank_message)
def after_iteration(self, model, epoch, evals_log):
    """Put the raw serialized model on the queue every ``frequency`` epochs.

    ``model.save_raw()`` is queued whenever ``epoch`` is a multiple of
    ``frequency`` (a free variable defined outside this method).

    Args:
        model: Object providing ``save_raw()``.
        epoch: Current iteration index.
        evals_log: Unused here.
    """
    if epoch % frequency != 0:
        return

    raw_model = model.save_raw()
    put_queue(raw_model)
def callback(env):
    """Print this actor's rank and put a ``("rank", rank)`` tuple on the queue.

    Args:
        env: Unused here.
    """
    print(f"My rank: {get_actor_rank()}")

    rank_message = ("rank", get_actor_rank())
    put_queue(rank_message)