def _playout(self, state):
    """Run a single playout from the root to a leaf, evaluate the leaf,
    and propagate the value back through all visited nodes."""
    node = self._root
    while True:
        if node.is_leaf():
            break
        # Greedily select the next move.
        action, node = node.select(self._c_param)
        state = state.act(action)

    # Evaluate the leaf using a network which outputs a list of
    # (action, probability) tuples p and also a score v in [-1, 1]
    # for the current player.
    action_probs, leaf_value = self._policy(state)
    if not state.board.is_terminal():
        node.expand(action_probs)
    else:
        # For an end state, use the "true" leaf value instead of the
        # network's estimate.
        exist, win_color = gomoku_util.check_five_in_row(state.board.board_state)
        if win_color not in ['black', 'white']:
            leaf_value = 0.  # draw; still backpropagated below
        else:
            leaf_value = 1. if win_color == state.color else -1.
    # Update value and visit count of the nodes in this traversal.
    node.update_recursive(-leaf_value)
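_playout only assumes a small interface on its tree nodes: is_leaf, select, expand, and update_recursive. Below is a minimal sketch of a TreeNode that satisfies it, using a PUCT-style Q + U selection score; the field names (_n_visits, _Q, _P) are assumptions, not the repository's actual implementation.

import numpy as np

class TreeNode:
    """Sketch of the node interface used by _playout (assumed fields)."""

    def __init__(self, parent, prior_p):
        self._parent = parent
        self._children = {}   # action -> TreeNode
        self._n_visits = 0
        self._Q = 0.0         # mean value from this node's perspective
        self._P = prior_p     # prior probability from the policy network

    def is_leaf(self):
        return len(self._children) == 0

    def expand(self, action_probs):
        for action, prob in action_probs:
            if action not in self._children:
                self._children[action] = TreeNode(self, prob)

    def select(self, c_param):
        # Return the (action, child) pair maximizing the Q + U score.
        return max(self._children.items(),
                   key=lambda item: item[1].get_value(c_param))

    def get_value(self, c_param):
        # PUCT exploration bonus: high-prior, rarely visited children score higher.
        u = c_param * self._P * np.sqrt(self._parent._n_visits) / (1 + self._n_visits)
        return self._Q + u

    def update_recursive(self, leaf_value):
        # Update ancestors first, flipping the sign at each level
        # because the players alternate between tree levels.
        if self._parent:
            self._parent.update_recursive(-leaf_value)
        self._n_visits += 1
        self._Q += (leaf_value - self._Q) / self._n_visits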
def is_terminal(self):
    exist, color = gomoku_util.check_five_in_row(self.board_state)
    is_full = gomoku_util.check_board_full(self.board_state)
    if is_full:
        # If the board is full of stones with no empty spaces left,
        # the game is finished.
        return True
    else:
        return exist
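For reference, here is a hedged sketch of what gomoku_util.check_five_in_row could look like, assuming board_state is a 2-D list whose entries are the strings 'empty', 'black', or 'white' (consistent with the color comparisons elsewhere in this section); the real utility may differ.

def check_five_in_row(board_state):
    """Return (exist, color): whether some color has five in a row."""
    rows, cols = len(board_state), len(board_state[0])
    directions = [(0, 1), (1, 0), (1, 1), (1, -1)]  # horizontal, vertical, two diagonals
    for r in range(rows):
        for c in range(cols):
            color = board_state[r][c]
            if color == 'empty':
                continue
            for dr, dc in directions:
                # Count consecutive stones of the same color along (dr, dc).
                count = 1
                rr, cc = r + dr, c + dc
                while 0 <= rr < rows and 0 <= cc < cols and board_state[rr][cc] == color:
                    count += 1
                    rr += dr
                    cc += dc
                if count >= 5:
                    return True, color
    return False, 'empty'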
def _step(self, action):
    '''
    Args:
        action: int
    Return:
        observation: board encoding
        reward: reward of the game
        done: boolean
        info: state dict
    Raise:
        Illegal Move action, i.e. the chosen position on the board is not empty
    '''
    assert self.state.color == self.player_color  # it's the player's turn

    # If the game is already over, don't do anything.
    if self.done:
        return self.state.board.encode(), 0., True, {'state': self.state}

    # Player plays.
    prev_state = self.state
    self.state = self.state.act(action)
    self.moves.append(self.state.board.last_coord)
    self.action_space.remove(action)  # remove the current action from action_space

    # Opponent plays.
    if not self.state.board.is_terminal():
        self.state, opponent_action = self._exec_opponent_play(self.state, prev_state, action)
        self.moves.append(self.state.board.last_coord)
        self.action_space.remove(opponent_action)  # remove the opponent's action from action_space
        # After the opponent plays, we should be back to the original color.
        assert self.state.color == self.player_color

    # Reward: if the state is non-terminal (no five in a row yet), the reward is 0.
    if not self.state.board.is_terminal():
        self.done = False
        return self.state.board.encode(), 0., False, {'state': self.state}

    # We're in a terminal state. Reward is 1 if the player won, -1 if lost.
    self.done = True
    # Check the final winner: 'empty', 'black', or 'white'.
    exist, win_color = gomoku_util.check_five_in_row(self.state.board.board_state)
    if win_color == "empty":  # draw
        reward = 0.
    else:
        player_wins = (self.player_color == win_color)  # check if player_color is the winning color
        reward = 1. if player_wins else -1.
    return self.state.board.encode(), reward, True, {'state': self.state}
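A minimal loop driving this environment might look as follows; this is a sketch assuming env is an already-constructed instance, that reset() returns (observation, state) as in self_play below, and that actions are flat board indices.

import numpy as np

observation, state = env.reset()
done = False
reward = 0.
while not done:
    # Play a random legal move for the player; the opponent's reply
    # happens inside env.step.
    action = int(np.random.choice(state.board.get_legal_action()))
    observation, reward, done, info = env.step(action)
    state = info['state']
print('final reward for the player:', reward)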
def _evaluate_rollout(self, state, limit=1000):
    """Roll out with uniformly random moves until the game ends or the
    move limit is reached.

    Return 1 if the current player wins, -1 if the other player wins,
    0 for a tie.
    """
    player = state.color
    for i in range(limit):
        if state.board.is_terminal():
            break
        state = state.act(np.random.choice(state.board.get_legal_action()))
    else:
        # No break from the loop: no terminal state within the limit.
        print("WARNING: rollout reached move limit")
    exist, win_color = gomoku_util.check_five_in_row(state.board.board_state)
    if win_color not in ['black', 'white']:
        return 0
    return 1 if win_color == player else -1
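In a pure-MCTS player (no value network), a rollout like this typically stands in for the network evaluation used by _playout: the policy function returns uniform priors over legal moves and a placeholder value, and the leaf value comes from _evaluate_rollout instead. A hedged sketch of such a policy function, with a hypothetical name:

import numpy as np

def rollout_policy_value_fn(state):
    """Uniform priors over legal moves; the leaf value itself is
    produced by _evaluate_rollout rather than by a network."""
    legal_actions = state.board.get_legal_action()
    uniform = np.ones(len(legal_actions)) / len(legal_actions)
    return zip(legal_actions, uniform), 0.0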
def self_play(self):
    player_map = {'black': 1, 'white': 2, 'empty': -1}
    _, state = self.env.reset()
    states, mcts_probs, current_players = [], [], []
    while True:
        state = deepcopy(state)
        # The zero player returns both the chosen move and the full
        # MCTS probability distribution over moves.
        move, move_probs = self.zero_player.get_action(state, self.temp, True)
        states.append(state.board.board_state)
        mcts_probs.append(move_probs)
        current_players.append(player_map[state.color])
        state = state.act(move)
        if not state.board.is_terminal():
            oppo_move, oppo_move_probs = self.zero_player.get_action(state, self.temp, True)
            states.append(state.board.board_state)
            mcts_probs.append(oppo_move_probs)
            current_players.append(player_map[state.color])
            observation, reward, done, state = self.env.step(move, oppo_move)
        else:
            observation, reward, done, state = self.env.step(move)
        _, winner = gomoku_util.check_five_in_row(state.board.board_state)
        winner = player_map[winner]
        if done:
            # Winner from the perspective of the current player of each state.
            winners_z = np.zeros(len(current_players))
            if winner != -1:
                winners_z[np.array(current_players) == winner] = 1.0
                winners_z[np.array(current_players) != winner] = -1.0
            # Reset the MCTS root node.
            self.zero_player.reset_player()
            _, state = self.env.reset()
            return winner, zip(states, mcts_probs, winners_z)
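The (board_state, mcts_probs, winners_z) triples returned here are exactly what an AlphaZero-style update consumes: the network is trained so that its value head matches the game outcome z and its policy head matches the MCTS visit distribution pi. A hedged numpy sketch of the per-sample loss (z - v)^2 - pi^T log p, with hypothetical names; a real implementation would batch this and add L2 regularization.

import numpy as np

def alphazero_loss(log_act_probs, value, mcts_probs, winner_z):
    """Per-sample AlphaZero loss (sketch).

    log_act_probs: network's log policy over all moves (1-D array)
    value:         network's scalar value prediction v
    mcts_probs:    one mcts_probs entry from self_play (pi)
    winner_z:      one winners_z entry from self_play (z)
    """
    value_loss = (winner_z - value) ** 2               # (z - v)^2
    policy_loss = -np.dot(mcts_probs, log_act_probs)   # -pi^T log p
    return value_loss + policy_loss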