Example #1
0
    def _playout(self, state):
        """Run a single MCTS playout from the root.

        Descends the tree by greedily selecting children until a leaf is
        reached, evaluates the leaf with the policy-value network (or with
        the true game result at a terminal state), and backs the value up
        through every node on the visited path.

        Args:
            state: game state positioned at the root; it is advanced along
                the selected path (callers are expected to pass a copy).
        """
        node = self._root
        while not node.is_leaf():
            # Greedily select next move.
            action, node = node.select(self._c_param)
            state = state.act(action)

        # Evaluate the leaf using a network which outputs a list of
        # (action, probability) tuples p and also a score v in [-1, 1]
        # for the current player.
        action_probs, leaf_value = self._policy(state)

        if not state.board.is_terminal():
            node.expand(action_probs)
        else:
            # For an end state, replace the network estimate with the
            # "true" leaf value.
            exist, win_color = gomoku_util.check_five_in_row(state.board.board_state)
            if win_color not in ['black', 'white']:
                # Draw: value is 0, but we must still back it up so the
                # visit counts along the path are updated (the previous
                # early `return 0` skipped the update and biased the
                # MCTS statistics toward decisive playouts).
                leaf_value = 0
            else:
                leaf_value = 1 if win_color == state.color else -1

        # Update value and visit count of nodes in this traversal.
        node.update_recursive(-leaf_value)
Example #2
0
 def is_terminal(self):
     """Return True when the game is over: either a five-in-a-row exists
     or the board has no empty position left (a draw)."""
     five_in_row, _ = gomoku_util.check_five_in_row(self.board_state)
     board_full = gomoku_util.check_board_full(self.board_state)
     if board_full:
         # No empty spaces remain, so the game is finished regardless
         # of whether anyone completed a row.
         return True
     return five_in_row
Example #3
0
 def _step(self, action):
     '''Play one full turn: apply the player's move, then (if the game is
     not over) let the opponent reply, and compute the resulting reward.

     Args:
         action: int, board position index for the player's move
     Return:
         observation: board encoding,
         reward: 0. while nonterminal or on a draw, 1. if the player
             won, -1. if the player lost,
         done: boolean,
         info: state dict
     Raise:
         Illegal Move action, basically the position on board is not empty
     '''
     assert self.state.color == self.player_color # it's the player's turn
     
     # If already terminal, then don't do anything
     if self.done:
         return self.state.board.encode(), 0., True, {'state': self.state}
     
     # Player play
     prev_state = self.state
     self.state = self.state.act(action)
     self.moves.append(self.state.board.last_coord)
     self.action_space.remove(action) # remove current action from action_space
     
     # Opponent play (only if the player's move did not already end the game)
     if not self.state.board.is_terminal():
         self.state, opponent_action = self._exec_opponent_play(self.state, prev_state, action)
         self.moves.append(self.state.board.last_coord)
         self.action_space.remove(opponent_action)   # remove opponent action from action_space
         # After opponent play, we should be back to the original color
         assert self.state.color == self.player_color
     
     # Reward: if nonterminal, there is no 5 in a row, then the reward is 0
     if not self.state.board.is_terminal():
         self.done = False
         return self.state.board.encode(), 0., False, {'state': self.state}
     
     # We're in a terminal state. Reward is 1 if won, -1 if lost
     assert self.state.board.is_terminal(), 'The game is terminal'
     self.done = True
     
     # Check final win ('empty' means the board filled without a winner)
     exist, win_color = gomoku_util.check_five_in_row(self.state.board.board_state) # 'empty', 'black', 'white'
     reward = 0.
     if win_color == "empty": # draw
         reward = 0.
     else:
         player_wins = (self.player_color == win_color) # check if player_color is the win_color
         reward = 1. if player_wins else -1.
     return self.state.board.encode(), reward, True, {'state': self.state}
Example #4
0
    def _evaluate_rollout(self, state, limit=1000):
        """Play uniformly random moves from `state` until the game ends
        (or `limit` moves have been made) and score the final position.

        Return 1 if the player to move at `state` wins, -1 if the other
        player wins, 0 for a tie.
        """
        rollout_player = state.color
        moves_made = 0
        while moves_made < limit:
            if state.board.is_terminal():
                break
            # Random playout policy: pick any legal action.
            state = state.act(np.random.choice(state.board.get_legal_action()))
            moves_made += 1
        else:
            # Loop ran out without ever reaching a terminal position.
            print("WARNING: rollout reached move limit")

        _, winner = gomoku_util.check_five_in_row(state.board.board_state)
        if winner in ('black', 'white'):
            return 1 if winner == rollout_player else -1
        return 0
Example #5
0
    def self_play(self):
        """Play one complete self-play game and collect training data.

        Both sides are played by the same MCTS player (`self.zero_player`).
        Returns `(winner, data)` where winner is 1 (black), 2 (white) or
        -1 (draw), and data is an iterable of
        (board_state, mcts_probs, winner_z) triples with winner_z in
        {+1.0, -1.0, 0.0} from the perspective of the player to move at
        each recorded position.
        """
        # Map board colors to integer player ids; 'empty' (-1) means no winner.
        player_map = {'black': 1, 'white': 2, 'empty': -1}
        _, state = self.env.reset()
        states, mcts_probs, current_players = [], [], []
        while True:
            # Copy so the MCTS search does not mutate the env's own state.
            state = deepcopy(state)
            move, move_probs = self.zero_player.get_action(
                state, self.temp, True)
            # Record position, search probabilities and side-to-move
            # before applying the chosen move.
            states.append(state.board.board_state)
            mcts_probs.append(move_probs)
            current_players.append(player_map[state.color])
            state = state.act(move)

            if not state.board.is_terminal():
                # Game continues: the same MCTS player chooses the reply,
                # and that position is recorded as well.
                oppo_move, oppo_move_probs = self.zero_player.get_action(
                    state, self.temp, True)
                states.append(state.board.board_state)
                mcts_probs.append(oppo_move_probs)
                current_players.append(player_map[state.color])
                # NOTE(review): this env wrapper appears to return the next
                # state as the 4th value rather than a gym-style info dict
                # (reset() also returns two values) — verify against the
                # wrapper's step/reset implementation.
                observation, reward, done, state = self.env.step(
                    move, oppo_move)
            else:
                observation, reward, done, state = self.env.step(move)

            _, winner = gomoku_util.check_five_in_row(state.board.board_state)
            winner = player_map[winner]
            if done:
                # winner from the perspective of the current player of each state
                winners_z = np.zeros(len(current_players))
                if winner != -1:
                    # +1 where the eventual winner was to move, -1 elsewhere;
                    # a draw (winner == -1) leaves all zeros in place.
                    winners_z[np.array(current_players) == winner] = 1.0
                    winners_z[np.array(current_players) != winner] = -1.0
                # reset MCTS root node
                self.zero_player.reset_player()
                _, state = self.env.reset()
                return winner, zip(states, mcts_probs, winners_z)