def find_winning_move_and_score(self, env: ReversiEnv, exactly=True):
    """Exhaustively solve the position, returning the best move and final score.

    The score is (number of black stones - number of white stones) at game end;
    black maximizes it, white minimizes it. Results are memoized in self.cache
    keyed by (black bitboard, white bitboard, next_player).

    :param env: position to solve; its board/turn state is mutated during the
        search (each candidate restores the parent state before stepping)
    :param exactly: when False, stop at the first move that wins for the side
        to move instead of searching for the very best score
    :return: (best_action, best_score); best_action is None for a finished game
    :raises Timeout: when self.timeout seconds have elapsed since self.start_time
    """
    if env.done:
        b, w = env.board.number_of_black_and_white
        return None, b - w
    if time() - self.start_time > self.timeout:
        logger.debug("timeout!")
        raise Timeout()

    turn = env.turn
    key = black, white, next_player = env.board.black, env.board.white, env.next_player
    if key in self.cache:
        return self.cache[key]

    if next_player == Player.black:
        legal_moves = find_correct_moves(black, white)
    else:
        legal_moves = find_correct_moves(white, black)
    # 225 squares: 15x15 board, one bit per square in the legal-move mask
    action_list = [idx for idx in range(225) if legal_moves & (1 << idx)]
    score_list = np.zeros(len(action_list), dtype=int)
    for i, action in enumerate(action_list):
        # restore the parent position before trying each candidate action
        env.board.black = black
        env.board.white = white
        env.next_player = next_player
        env.turn = turn
        env.done = False
        env.winner = None
        # BUG FIX: this step was commented out, so the recursion never
        # advanced the position — the search looped on the same state until
        # Timeout. The candidate move must actually be applied here.
        env.step(action)
        _, score = self.find_winning_move_and_score(env, exactly=exactly)
        score_list[i] = score
        if not exactly:
            # a winning line for the side to move is enough; stop early
            if next_player == Player.black and score > 0:
                break
            elif next_player == Player.white and score < 0:
                break

    if next_player == Player.black:
        best_action = action_list[int(np.argmax(score_list))]
        best_score = np.max(score_list)
    else:
        best_action = action_list[int(np.argmin(score_list))]
        best_score = np.min(score_list)
    self.cache[key] = (best_action, best_score)
    return best_action, best_score
def play_game(self, best_model, ng_model):
    """Play one evaluation game between the current best model and the
    next-generation challenger, with colors assigned at random.

    :return: (ng_win, best_is_black, (n_black, n_white)) where ng_win is
        1 if the challenger won, 0 if it lost, and None on a draw
    """
    env = ReversiEnv().reset()
    play_cfg = self.config.eval.play_config
    best_player = ReversiPlayer(self.config, best_model, play_config=play_cfg)
    ng_player = ReversiPlayer(self.config, ng_model, play_config=play_cfg)

    best_is_black = random() < 0.5
    black, white = (best_player, ng_player) if best_is_black else (ng_player, best_player)

    observation = env.observation
    while not env.done:
        if env.next_player == Player.black:
            action = black.action(observation.black, observation.white)
        else:
            action = white.action(observation.white, observation.black)
        observation, info = env.step(action)

    # translate the winning color back into "did the challenger win?"
    ng_win = None
    if env.winner == Winner.black:
        ng_win = 0 if best_is_black else 1
    elif env.winner == Winner.white:
        ng_win = 1 if best_is_black else 0
    return ng_win, best_is_black, observation.number_of_black_and_white
def play_game(self, model_1, model_2):
    """Play a single game between two models, colors assigned at random.

    :return: True if model_1 won, False if it lost, None on a draw
    """
    env = ReversiEnv().reset()

    def make_sim_env_fn():
        return env.copy()

    players = []
    for model in (model_1, model_2):
        player = EvaluatePlayer(make_sim_env_fn=make_sim_env_fn, config=self.config,
                                model=model, play_config=self.config.eval.play_config)
        player.prepare(env, dir_noise=False)
        players.append(player)
    p1, p2 = players

    p1_is_black = random() < 0.5
    black, white = (p1, p2) if p1_is_black else (p2, p1)

    while not env.done:
        mover = black if env.next_player == Player.black else white
        action, _, _ = mover.think()
        env.step(action)
        # both players observe every move so their search trees stay in sync
        black.play(action, env)
        white.play(action, env)

    if env.black_wins:
        return p1_is_black
    if env.black_loses:
        return not p1_is_black
    return None
async def search_my_move(self, env: ReversiEnv, is_root_node=False):
    """
    Q, V is value for this Player(always black).
    P is value for the player of next_player (black or white)

    :param env: current game state; mutated in place by env.step() below
    :param is_root_node: True only at the root of the search tree
    :return: leaf evaluation from black's point of view
    """
    # terminal node: exact game result, valued for black
    if env.done:
        if env.winner == Winner.black:
            return 1
        elif env.winner == Winner.white:
            return -1
        else:
            return 0

    key = self.counter_key(env)
    another_side_key = self.another_side_counter_key(env)

    # another coroutine is expanding this node; wait rather than duplicate work
    while key in self.now_expanding:
        await asyncio.sleep(self.config.play.wait_for_expanding_sleep_sec)

    # is leaf?
    if key not in self.expanded:  # reach leaf node
        leaf_v = await self.expand_and_evaluate(env)
        if env.next_player == Player.black:
            return leaf_v  # Value for black
        else:
            return -leaf_v  # Value for white == -Value for black

    virtual_loss = self.config.play.virtual_loss
    # W is stored from black's perspective, so the virtual-loss sign flips with the mover
    virtual_loss_for_w = virtual_loss if env.next_player == Player.black else -virtual_loss

    action_t = self.select_action_q_and_u(env, is_root_node)
    _, _ = env.step(action_t)

    # apply virtual loss so concurrent searches are discouraged from this path
    self.var_n[key][action_t] += virtual_loss
    self.var_w[key][action_t] -= virtual_loss_for_w
    leaf_v = await self.search_my_move(env)  # next move

    # on returning search path
    # update: N, W  (remove the virtual loss, record the real visit/value)
    self.var_n[key][action_t] += - virtual_loss + 1
    self.var_w[key][action_t] += virtual_loss_for_w + leaf_v
    # update another side info(flip color and player)
    self.var_n[another_side_key][action_t] += 1
    self.var_w[another_side_key][action_t] -= leaf_v  # must flip the sign.
    return leaf_v
async def search_my_move(self, env: ReversiEnv, is_root_node=False):
    """
    Q, V is value for this Player(always black).
    P is value for the player of next_player (black or white)

    :param env: current game state; mutated in place by env.step() below
    :param is_root_node: True only at the root of the search tree
    :return: leaf evaluation from black's point of view
    """
    # terminal node: exact game result, valued for black
    if env.done:
        if env.winner == Winner.black:
            return 1
        elif env.winner == Winner.white:
            return -1
        else:
            return 0

    key = self.counter_key(env)

    # another coroutine is expanding this node; wait rather than duplicate work
    while key in self.now_expanding:
        await asyncio.sleep(self.config.play.wait_for_expanding_sleep_sec)

    # is leaf?
    if key not in self.expanded:  # reach leaf node
        leaf_v = await self.expand_and_evaluate(env)
        if env.next_player == Player.black:
            return leaf_v  # Value for black
        else:
            return -leaf_v  # Value for white == -Value for black

    action_t = self.select_action_q_and_u(env, is_root_node)
    _, _ = env.step(action_t)

    # apply virtual loss so concurrent searches are discouraged from this path
    virtual_loss = self.config.play.virtual_loss
    self.var_n[key][action_t] += virtual_loss
    self.var_w[key][action_t] -= virtual_loss
    leaf_v = await self.search_my_move(env)  # next move

    # on returning search path
    # update: N, W, Q, U  (remove the virtual loss, record the real visit/value)
    n = self.var_n[key][action_t] = self.var_n[key][action_t] - virtual_loss + 1
    w = self.var_w[key][action_t] = self.var_w[key][action_t] + virtual_loss + leaf_v
    self.var_q[key][action_t] = w / n
    return leaf_v
def get_next_key(self, own, enemy, action):
    """Return the counter key of the position reached by playing `action`
    from (own, enemy), treating the mover as black."""
    sim_env = ReversiEnv().update(own, enemy, Player.black)
    sim_env.step(action)
    return self.counter_key(sim_env)
class NBoardEngine:
    """Bridge between the NBoard GUI protocol (read from stdin, written to
    stdout) and a ReversiPlayer engine instance."""

    def __init__(self, config: Config):
        self.config = config
        self.reader = NonBlockingStreamReader(sys.stdin)
        self.handler = NBoardProtocolVersion2(config, self)
        self.running = False
        self.nc = self.config.nboard  # shortcut
        # BUG FIX: this assignment was commented out, but set_game(), move(),
        # go() and hint() all read self.env — the engine crashed with
        # AttributeError on the first position command.
        self.env = ReversiEnv().reset()
        self.model = load_model(self.config)
        self.play_config = self.config.play
        self.player = self.create_player()
        self.turn_of_nboard = None  # which color NBoard asked us to play

    def create_player(self):
        logger.debug("create new ReversiPlayer()")
        return ReversiPlayer(self.config, self.model, self.play_config, enable_resign=False)

    def start(self):
        """Main loop: read and dispatch protocol messages until stop()."""
        self.running = True
        self.reader.start(push_callback=self.push_callback)

        while self.running:
            message = self.reader.readline(self.nc.read_stdin_timeout)
            if message is None:
                continue
            message = message.strip()
            logger.debug(f"> {message}")
            self.handler.handle_message(message)

    def push_callback(self, message: str):
        # note: called in another thread
        if message.startswith("ping"):
            # interrupt any search in progress so we can answer promptly
            self.stop_thinkng()

    def stop(self):
        self.running = False

    def reply(self, message):
        logger.debug(f"< {message}")
        sys.stdout.write(message + "\n")
        sys.stdout.flush()

    def stop_thinkng(self):
        # NOTE(review): method name keeps the historical typo ("thinkng")
        # because it may be invoked by name from the protocol handler.
        self.player.stop_thinking()

    def set_depth(self, n):
        """Map NBoard's depth setting onto a simulation budget; ignore junk."""
        try:
            n = int(n)
            self.play_config.simulation_num_per_move = n * self.nc.simulation_num_per_depth_about
            logger.info(
                f"set simulation_num_per_move to {self.play_config.simulation_num_per_move}"
            )
        except ValueError:
            pass

    def reset_state(self):
        self.player = self.create_player()

    def set_game(self, game_state: GameState):
        """Load a full game: initial position plus the recorded move list."""
        self.env.reset()
        self.env.update(game_state.black, game_state.white, game_state.player)
        self.turn_of_nboard = game_state.player
        for action in game_state.actions:
            self._change_turn()
            if action is not None:  # None encodes a pass
                self.env.step(action)

    def _change_turn(self):
        if self.turn_of_nboard:
            self.turn_of_nboard = Player.black if self.turn_of_nboard == Player.white else Player.white

    def move(self, action):
        self._change_turn()
        if action is not None:  # None encodes a pass
            self.env.step(action)

    def go(self) -> GoResponse:
        """Think about the current position and return (action, eval, time).

        Returns an empty response when it is not our color's turn.
        """
        if self.env.next_player != self.turn_of_nboard:
            return GoResponse(None, 0, 0)
        board = self.env.board
        if self.env.next_player == Player.black:
            states = (board.black, board.white)
        else:
            states = (board.white, board.black)
        start_time = time()
        action = self.player.action(*states)
        item = self.player.ask_thought_about(*states)
        evaluation = item.values[action]
        time_took = time() - start_time
        return GoResponse(action, evaluation, time_took)

    def hint(self, n_hint):
        """Search the current position and report candidate moves to the GUI.

        :param n_hint: maximum number of hints to report
        """
        board = self.env.board
        if self.env.next_player == Player.black:
            states = (board.black, board.white)
        else:
            states = (board.white, board.black)

        def hint_report_callback(values, visits):
            # report the most-visited actions first; skip never-visited ones
            hint_list = []
            for action, visit in list(sorted(enumerate(visits), key=lambda x: -x[1]))[:n_hint]:
                if visit > 0:
                    hint_list.append(HintResponse(action, values[action], visit))
            self.handler.report_hint(hint_list)

        callback_info = CallbackInMCTS(self.config.nboard.hint_callback_per_sim, hint_report_callback)
        self.player.action(*states, callback_in_mtcs=callback_info)
        item = self.player.ask_thought_about(*states)
        hint_report_callback(item.values, item.visit)
class PlayWithHuman:
    """Game controller for a human-vs-AI match on an 8x8 board, driving an
    EvaluatePlayer loaded from an explicit model directory."""

    def __init__(self, config: Config, model_dir):
        self.config = config
        self.human_color = None
        self.observers = []
        self.env = ReversiEnv().reset()
        self.model = self._load_model(model_dir)
        self.ai = None  # type: EvaluatePlayer
        self.ai_confidence = None

    def add_observer(self, observer_func):
        self.observers.append(observer_func)

    def notify_all(self, event):
        for callback in self.observers:
            callback(event)

    def start_game(self, human_is_black):
        """Reset the board and create a fresh AI player for a new game."""
        self.human_color = Player.black if human_is_black else Player.white
        self.env = ReversiEnv().reset()

        def make_sim_env_fn():
            return self.env.copy()

        self.ai = EvaluatePlayer(make_sim_env_fn=make_sim_env_fn, config=self.config, model=self.model)
        self.ai.prepare(self.env, dir_noise=False)
        self.ai_confidence = None

    def play_next_turn(self):
        self.notify_all(GameEvent.update)
        if self.over:
            self.notify_all(GameEvent.over)
            return
        if self.next_player != self.human_color:
            self.notify_all(GameEvent.ai_move)
        elif np.amax(self.env.legal_moves) == 0:
            # human has no legal move: forced pass (64 encodes the pass action)
            print('pass move')
            pass_action = 64
            self.env.step(pass_action)
            self.ai.play(pass_action, self.env)

    @property
    def over(self):
        return self.env.done

    @property
    def next_player(self):
        return self.env.next_player

    def stone(self, px, py):
        """left top=(0, 0), right bottom=(7,7)"""
        pos = int(py * 8 + px)
        assert 0 <= pos < 64
        mask = 1 << pos
        if self.env.board.black & mask:
            return Player.black
        if self.env.board.white & mask:
            return Player.white
        return None

    @property
    def number_of_black_and_white(self):
        return self.env.board.number_of_black_and_white

    def available(self, px, py):
        """Truthy when the human may place a stone at board cell (px, py)."""
        pos = int(py * 8 + px)
        if not (0 <= pos < 64):
            return False
        own, enemy = self.env.board.black, self.env.board.white
        if self.human_color == Player.white:
            own, enemy = enemy, own
        return find_correct_moves(own, enemy) & (1 << pos)

    def move(self, px, py):
        """Apply the human's move at (px, py); raises when it is not their turn."""
        pos = int(py * 8 + px)
        assert 0 <= pos < 64
        if self.next_player != self.human_color:
            raise Exception('not human\'s turn!')
        self.env.step(pos)
        self.ai.play(pos, self.env)

    def _load_model(self, model_dir):
        from reversi_zero.agent.model import ReversiModel
        model = ReversiModel(self.config)
        model.create_session()
        model.load(model_dir)
        return model

    def move_by_ai(self):
        """Let the AI think, record its confidence, and apply its move."""
        if self.next_player == self.human_color:
            raise Exception('not AI\'s turn!')
        logger.info('start thinking...')
        action, _, vs = self.ai.think()
        self.ai_confidence = vs
        logger.info('end thinking...')
        self.env.step(action)
        self.ai.play(action, self.env)

    def get_state_of_next_player(self):
        """Return (own, enemy) bitboards from the mover's perspective."""
        board = self.env.board
        if self.next_player == Player.black:
            return board.black, board.white
        return board.white, board.black
class PlayWithHuman:
    """Game controller for a human-vs-AI match on a 15x15 board, driving a
    ReversiPlayer and remembering its last evaluation."""

    def __init__(self, config: Config):
        self.config = config
        self.human_color = None
        self.observers = []
        self.env = ReversiEnv().reset()
        self.model = self._load_model()
        self.ai = None  # type: ReversiPlayer
        self.last_evaluation = None
        self.last_history = None  # type: HistoryItem

    def add_observer(self, observer_func):
        self.observers.append(observer_func)

    def notify_all(self, event):
        for callback in self.observers:
            callback(event)

    def start_game(self, human_is_black):
        """Reset the board and create a fresh AI player for a new game."""
        self.human_color = Player.black if human_is_black else Player.white
        self.env = ReversiEnv().reset()
        self.ai = ReversiPlayer(self.config, self.model)

    def play_next_turn(self):
        self.notify_all(GameEvent.update)
        if self.over:
            self.notify_all(GameEvent.over)
            return
        if self.next_player != self.human_color:
            self.notify_all(GameEvent.ai_move)

    @property
    def over(self):
        return self.env.done

    @property
    def next_player(self):
        return self.env.next_player

    def stone(self, px, py):
        """left top=(0, 0), right bottom=(14,14)"""
        pos = int(py * 15 + px)
        assert 0 <= pos < 225
        mask = 1 << pos
        if self.env.board.black & mask:
            return Player.black
        if self.env.board.white & mask:
            return Player.white
        return None

    @property
    def number_of_black_and_white(self):
        return self.env.observation.number_of_black_and_white

    def available(self, px, py):
        """Truthy when the human may place a stone at board cell (px, py)."""
        pos = int(py * 15 + px)
        if not (0 <= pos < 225):
            return False
        own, enemy = self.env.board.black, self.env.board.white
        if self.human_color == Player.white:
            own, enemy = enemy, own
        return find_correct_moves(own, enemy) & (1 << pos)

    def move(self, px, py):
        """Apply the human's move at (px, py); no-op (False) off-turn."""
        pos = int(py * 15 + px)
        assert 0 <= pos < 225
        if self.next_player != self.human_color:
            return False
        self.env.step(pos)

    def _load_model(self):
        return load_model(self.config)

    def move_by_ai(self):
        """Ask the AI for a move, apply it, and record its self-evaluation."""
        if self.next_player == self.human_color:
            return False
        own, enemy = self.get_state_of_next_player()
        action = self.ai.action(own, enemy)
        self.env.step(action)
        self.last_history = self.ai.ask_thought_about(own, enemy)
        self.last_evaluation = self.last_history.values[self.last_history.action]
        logger.debug(f"evaluation by ai={self.last_evaluation}")

    def get_state_of_next_player(self):
        """Return (own, enemy) bitboards from the mover's perspective."""
        board = self.env.board
        if self.next_player == Player.black:
            return board.black, board.white
        return board.white, board.black
class PlayWithHuman:
    """Game controller for a human-vs-AI match on an 8x8 board, loading
    either the best model or the newest next-generation model per config."""

    def __init__(self, config: Config):
        self.config = config
        self.human_color = None
        self.observers = []
        self.env = ReversiEnv().reset()
        self.model = self._load_model()
        self.ai = None  # type: ReversiPlayer
        self.last_evaluation = None
        self.last_history = None  # type: HistoryItem

    def add_observer(self, observer_func):
        self.observers.append(observer_func)

    def notify_all(self, event):
        for callback in self.observers:
            callback(event)

    def start_game(self, human_is_black):
        """Reset the board and create a fresh AI player for a new game."""
        self.human_color = Player.black if human_is_black else Player.white
        self.env = ReversiEnv().reset()
        self.ai = ReversiPlayer(self.config, self.model)

    def play_next_turn(self):
        self.notify_all(GameEvent.update)
        if self.over:
            self.notify_all(GameEvent.over)
            return
        if self.next_player != self.human_color:
            self.notify_all(GameEvent.ai_move)

    @property
    def over(self):
        return self.env.done

    @property
    def next_player(self):
        return self.env.next_player

    def stone(self, px, py):
        """left top=(0, 0), right bottom=(7,7)"""
        pos = int(py * 8 + px)
        assert 0 <= pos < 64
        mask = 1 << pos
        if self.env.board.black & mask:
            return Player.black
        if self.env.board.white & mask:
            return Player.white
        return None

    @property
    def number_of_black_and_white(self):
        return self.env.observation.number_of_black_and_white

    def available(self, px, py):
        """Truthy when the human may place a stone at board cell (px, py)."""
        pos = int(py * 8 + px)
        if not (0 <= pos < 64):
            return False
        own, enemy = self.env.board.black, self.env.board.white
        if self.human_color == Player.white:
            own, enemy = enemy, own
        return find_correct_moves(own, enemy) & (1 << pos)

    def move(self, px, py):
        """Apply the human's move at (px, py); no-op (False) off-turn."""
        pos = int(py * 8 + px)
        assert 0 <= pos < 64
        if self.next_player != self.human_color:
            return False
        self.env.step(pos)

    def _load_model(self):
        from reversi_zero.agent.model import ReversiModel
        model = ReversiModel(self.config)
        # preference order depends on config: newest-next-generation first,
        # falling back to the best model (or vice versa)
        if self.config.play.use_newest_next_generation_model:
            loaded = reload_newest_next_generation_model_if_changed(model) or load_best_model_weight(model)
        else:
            loaded = load_best_model_weight(model) or reload_newest_next_generation_model_if_changed(model)
        if not loaded:
            raise RuntimeError("No models found!")
        return model

    def move_by_ai(self):
        """Ask the AI for a move, apply it, and record its self-evaluation."""
        if self.next_player == self.human_color:
            return False
        own, enemy = self.get_state_of_next_player()
        action = self.ai.action(own, enemy)
        self.env.step(action)
        self.last_history = self.ai.ask_thought_about(own, enemy)
        self.last_evaluation = self.last_history.values[self.last_history.action]
        logger.debug(f"evaluation by ai={self.last_evaluation}")

    def get_state_of_next_player(self):
        """Return (own, enemy) bitboards from the mover's perspective."""
        board = self.env.board
        if self.next_player == Player.black:
            return board.black, board.white
        return board.white, board.black