Example #1
    def V(self, state, alpha=-game.INT_INF, beta=game.INT_INF):
        print('-', end=' ')
        # BEGIN_YOUR_CODE
        if game.is_end(state):
            return game.utility(state)

        actions = game.get_possible_actions(state)

        player = game.get_player_from_state(state)
        if player == game.MAX_PLAYER:
            value = -game.INT_INF
            for action in actions:
                value = max(
                    value,
                    self.V(game.get_next_state(state, action), alpha, beta))
                alpha = max(alpha, value)
                if beta <= alpha: break
        else:
            value = game.INT_INF
            for action in actions:
                value = min(
                    value,
                    self.V(game.get_next_state(state, action), alpha, beta))
                beta = min(beta, value)
                if beta <= alpha: break

        return value
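All of these snippets call into a small `game` helper module from the course scaffold (is_end, utility, get_possible_actions, get_player_from_state, get_next_state, MAX_PLAYER, INT_INF). The hypothetical stub below only mirrors that interface with toy take-1-2-3 Nim rules so the V functions can be exercised outside the project; the real module appears to implement tic-tac-toe.

# Hypothetical stand-in for the `game` module assumed by the examples on this
# page; only the names the snippets call are provided, with toy Nim-style rules.
class game:
    INT_INF = 10 ** 9
    MAX_PLAYER = 0
    MIN_PLAYER = 1

    # A state is (player_to_move, stones_left); taking the last stone wins.
    @staticmethod
    def is_end(state):
        return state[1] == 0

    @staticmethod
    def utility(state):
        # The player to move at an end state has no stones left and has lost.
        return 1 if state[0] == game.MIN_PLAYER else -1

    @staticmethod
    def get_possible_actions(state):
        return [n for n in (1, 2, 3) if n <= state[1]]

    @staticmethod
    def get_player_from_state(state):
        return state[0]

    @staticmethod
    def get_next_state(state, action):
        return (1 - state[0], state[1] - action)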
Example #2
    def V(self, state, alpha=-game.INT_INF, beta=game.INT_INF):

        # If IsEnd(s)
        if game.is_end(state):
            return game.utility(state)

        # Get possible actions
        actions = game.get_possible_actions(state)
        assert len(actions) > 0

        # If player == agent (maximizing player)
        if game.get_player_from_state(state) == game.MAX_PLAYER:
            value = -game.INT_INF
            for action in actions:
                value = max(value, self.V(game.get_next_state(state, action), alpha, beta))
                alpha = max(alpha, value)
                if beta <= alpha: break

        # If player == opponent (minimizing player)
        else:
            value = game.INT_INF
            for action in actions:
                value = min(value, self.V(game.get_next_state(state, action), alpha, beta))
                beta = min(beta, value)
                if beta <= alpha: break

        return value
Example #3
    def V(self, state):
        # If IsEnd(s)
        if game.is_end(state):
            return game.utility(state)

        # Get possible actions
        actions = game.get_possible_actions(state)
        assert len(actions) > 0

        # If player == agent (maximizing player)
        if game.get_player_from_state(state) == game.MAX_PLAYER:
            value = -game.INT_INF
            for action in actions:
                value = max(value, self.V(game.get_next_state(state, action)))
                # equivalently:
                # value = max(self.V(game.get_next_state(state, action)) for action in actions)

        # If player == opponent (minimizing player)
        else:
            value = game.INT_INF
            for action in actions:
                value = min(value, self.V(game.get_next_state(
                    state, action)))  # use 'game.get_next_state'

        return value
Example #4
    def V(self, state, depth):
        # If IsEnd(s)
        if game.is_end(state):
            return game.utility(state)

        # If depth = 0
        if depth == 0:
            # print(game.get_board_str(state), eval(state))
            return eval(state)

        # Get possible actions
        actions = game.get_possible_actions(state)
        assert len(actions) > 0

        # If player == agent (maximizing player)
        if game.get_player_from_state(state) == game.MAX_PLAYER:
            value = -game.INT_INF
            for action in actions:
                # depth is only decremented on the opponent's branch below, so
                # one unit of depth corresponds to a full agent + opponent round
                value = max(value,
                            self.V(game.get_next_state(state, action), depth))

        # If player == opponent (minimizing player)
        else:
            value = game.INT_INF
            for action in actions:
                value = min(
                    value, self.V(game.get_next_state(state, action),
                                  depth - 1))

        return value
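Note that eval here is the course's board-evaluation heuristic called at the depth cutoff, not the Python builtin. A hypothetical placeholder is enough to run the snippet; the real project scores the board instead.

# Hypothetical placeholder for the depth-cutoff evaluation heuristic.
def eval(state):
    return 0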
Example #5
    def policy(self, state):
        # BEGIN_YOUR_CODE
        actions = game.get_possible_actions(state)

        alpha = -game.INT_INF
        beta = game.INT_INF

        player = game.get_player_from_state(state)
        if player == game.MAX_PLAYER:
            values = []
            for action in actions:
                next_state = game.get_next_state(state, action)
                value = self.V(next_state, alpha, beta)
                values.append(value)
                alpha = max(alpha, value)
                if beta <= alpha: break
            idx = np.argmax(values)
            return actions[idx]
            # return actions[np.argmax([self.V(game.get_next_state(state, action)) for action in actions])]
        else:
            values = []
            for action in actions:
                next_state = game.get_next_state(state, action)
                value = self.V(next_state, alpha, beta)
                values.append(value)
                beta = min(beta, value)
                if beta <= alpha: break
            idx = np.argmin(values)
            return actions[idx]
Example #6
    def policy(self, state):
        # BEGIN_YOUR_CODE
        actions = game.get_possible_actions(state)

        player = game.get_player_from_state(state)
        if player == game.MAX_PLAYER:
            return actions[np.argmax([
                self.V(game.get_next_state(state, action))
                for action in actions
            ])]
        else:
            return actions[np.argmin([
                self.V(game.get_next_state(state, action))
                for action in actions
            ])]
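As a hedged usage sketch (it relies on the toy game stub shown after Example #1 and assumes numpy imported as np, which the excerpts omit), the V and policy pieces combine into a self-contained agent like this:

# Hypothetical driver combining a plain minimax V (as in Example #7) with the
# argmax/argmin policy above; every name other than the `game` calls is an
# assumption made for illustration.
import numpy as np

class MinimaxAgent:
    def V(self, state):
        if game.is_end(state):
            return game.utility(state)
        values = [self.V(game.get_next_state(state, action))
                  for action in game.get_possible_actions(state)]
        if game.get_player_from_state(state) == game.MAX_PLAYER:
            return max(values)
        return min(values)

    def policy(self, state):
        actions = game.get_possible_actions(state)
        values = [self.V(game.get_next_state(state, action)) for action in actions]
        if game.get_player_from_state(state) == game.MAX_PLAYER:
            return actions[np.argmax(values)]
        return actions[np.argmin(values)]

agent = MinimaxAgent()
print(agent.policy((game.MAX_PLAYER, 4)))  # prints the agent's chosen move for a toy state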
Example #7
    def V(self, state):
        # BEGIN_YOUR_CODE
        if game.is_end(state):
            return game.utility(state)
        player = game.get_player_from_state(state)
        if player == game.MAX_PLAYER:
            value = -game.INT_INF
            for action in game.get_possible_actions(state):
                value = max(value, self.V(game.get_next_state(state, action)))
        else:
            value = game.INT_INF
            for action in game.get_possible_actions(state):
                value = min(value, self.V(game.get_next_state(state, action)))

        return value
Example #8
File: play.py Project: talkin24/AI_POSCO
def user_turn(state):
    game.draw_board(state)

    while True:
        print('What is your next move? (1-9):', end=' ')
        action = int(input())
        if state[action] == game.EMPTY:
            break

    state = game.get_next_state(state, action)

    if game.is_win(state):
        game.draw_board(state)
        print('Lose!')
        return None

    if game.is_lose(state):
        game.draw_board(state)
        print('Win!')
        return None

    if game.is_draw(state):
        game.draw_board(state)
        print('Draw!')
        return None

    return state
Example #9
    def run_episode(self):
        """
        Runs one episode of self-play, starting with player 1, and return a
        training sample containing (canon_state, policy, value) tuples.
        """
        train_samples = []
        state = game.get_init_state()
        current_player = 1
        episode_step = 0

        while True:
            episode_step += 1
            canon_state = game.get_canonical_form(state, current_player)
            temp = int(episode_step < self.config.temperature_threshold)

            policy = self.mcts.get_move_probabilities(canon_state, temp=temp)
            sym = game.get_symmetries(canon_state, policy)
            for s, p in sym:
                train_samples.append([s, current_player, p, None])

            move = np.random.choice(len(policy), p=policy)
            state, current_player = game.get_next_state(state, current_player, move)

            r = game.get_state_score(state, current_player)

            if r != 0:
                # game over: label each stored sample with the final result,
                # flipped to the perspective of the player who was to move
                # when that sample was recorded
                return [
                    (s, pcy, r * ((-1) ** (pyr != current_player)))
                    for s, pyr, pcy, _ in train_samples
                ]
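The (state, policy, value) samples returned above typically feed an outer training loop that is not shown in this excerpt. A minimal sketch, assuming a num_episodes config field and a neural_net.train method (both assumptions, not taken from the project):

    # Hedged sketch of the surrounding self-play/training loop; the attribute
    # names below are assumptions for illustration.
    def learn(self):
        train_examples = []
        for _ in range(self.config.num_episodes):
            train_examples.extend(self.run_episode())
        self.neural_net.train(train_examples)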
Example #10
    def policy(self, state):
        actions = game.get_possible_actions(state)
        assert len(actions) > 0

        if game.get_player_from_state(state) == game.MAX_PLAYER:
            return max(actions, key=lambda x: self.V(game.get_next_state(state, x)))
        else:  # the opponent is modeled as playing uniformly at random
            return random.choice(actions)
Example #11
    def policy(self, state):
        actions = game.get_possible_actions(state)
        assert len(actions) > 0

        optimal = max if game.get_player_from_state(
            state) == game.MAX_PLAYER else min
        return optimal(actions,
                       key=lambda x: self.V(game.get_next_state(state, x)))
Example #12
    def V(self, state, alpha=-game.INT_INF, beta=game.INT_INF):
        # BEGIN_YOUR_CODE
        if game.is_end(state):
            return game.utility(state)

        actions = game.get_possible_actions(state)
        if game.get_player_from_state(state) == game.MAX_PLAYER:  # my-turn
            value = -game.INT_INF
            for action in actions:
                value = max(value, self.V(game.get_next_state(state, action)))
        else:  # opp-turn: opponent modeled as uniformly random (expectimax), so alpha/beta go unused here
            value = 0
            for action in actions:
                value += self.V(game.get_next_state(state,
                                                    action)) / len(actions)

        return value
Example #13
    def V(self, state, depth):
        # BEGIN_YOUR_CODE
        if game.is_end(state):
            return game.utility(state)
        if depth == 0:
            return eval(state)

        if game.get_player_from_state(state) == game.MAX_PLAYER:  # my-turn
            value = -game.INT_INF
            for action in game.get_possible_actions(state):
                value = max(value,
                            self.V(game.get_next_state(state, action), depth))
        else:  # opp-turn
            value = game.INT_INF
            for action in game.get_possible_actions(state):
                value = min(
                    value, self.V(game.get_next_state(state, action),
                                  depth - 1))

        return value
Example #14
    def policy(self, state):
        actions = game.get_possible_actions(state)
        assert len(actions) > 0

        alpha = -game.INT_INF
        beta = game.INT_INF

        if game.get_player_from_state(state) == game.MAX_PLAYER:
            values = []
            for action in actions:
                value = self.V(game.get_next_state(state, action), alpha, beta)
                values.append(value)
                alpha = max(alpha, value)
            return max(list(zip(actions, values)), key=lambda x: x[1])[0]
        else:
            values = []
            for action in actions:
                value = self.V(game.get_next_state(state, action), alpha, beta)
                values.append(value)
                beta = min(beta, value)
            return min(list(zip(actions, values)), key=lambda x: x[1])[0]
Example #15
    def policy(self, state):
        # BEGIN_YOUR_CODE
        actions = game.get_possible_actions(state)

        player = game.get_player_from_state(state)
        if player == game.MAX_PLAYER:
            values = []
            for action in actions:
                next_state = game.get_next_state(state, action)
                value = self.V(next_state, self.max_depth)
                values.append(value)
            idx = np.argmax(values)
            return actions[idx]
        else:
            values = []
            for action in actions:
                next_state = game.get_next_state(state, action)
                value = self.V(next_state, self.max_depth)
                values.append(value)
            idx = np.argmin(values)
            return actions[idx]
Example #16
    def play_game(self):
        """
        Run one episode and return the winner of the game (1 if player1, -1 if player2)
        or a draw result that is neither 1, -1, nor 0
        """
        players = [self.player2, None, self.player1]
        current_player = 1
        state = game.get_init_state()

        while game.get_state_score(state, current_player) == 0:
            canon_state = game.get_canonical_form(state, current_player)
            move = players[current_player + 1](canon_state)
            legal_moves = game.get_legal_moves(canon_state, 1)
            if legal_moves[move] == 0:
                print(move)
                assert legal_moves[move] > 0
            state, current_player = game.get_next_state(
                state, current_player, move)

        return current_player * game.get_state_score(state, current_player)
Example #17
File: play.py Project: talkin24/AI_POSCO
def system_turn(state):
    action = agent.policy(state)
    print('action =', action)

    state = game.get_next_state(state, action)

    if game.is_win(state):
        game.draw_board(state)
        print('Win!')
        return None

    if game.is_lose(state):
        game.draw_board(state)
        print('Lose!')
        return None

    if game.is_draw(state):
        game.draw_board(state)
        print('Draw!')
        return None

    return state
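A hedged sketch of how play.py might alternate the two turn functions (the actual driver and move order are not shown in these excerpts; game.get_init_state is the initializer used in Example #9):

# Hypothetical game loop; the real play.py may start with the other player.
def main():
    state = game.get_init_state()
    while state is not None:
        state = user_turn(state)
        if state is None:
            break
        state = system_turn(state)

if __name__ == '__main__':
    main()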
Example #18
    def search(self, state):
        """
        One iteration of MCTS. This method is recursively called until a
        leaf node is found.
        
        The action chosen at each node is the one with the maximum upper
        confidence bound.
        
        Returns:
            v: the negative of the value of the current state
        """
        s = game.hash_state(state)

        if s not in self.states_ending_score:
            self.states_ending_score[s] = game.get_state_score(state, 1)
        if self.states_ending_score[s] != 0:
            # terminal node: outcome propagated up the search path
            return -self.states_ending_score[s]

        # leaf node: neural net is used to get an initial policy and value for the state
        if s not in self.states_P:
            # transform state by using a randomly selected symmetry before it is evaluated
            # by the NN, so that the MC evaluation is averaged over different biases
            transformed_state = random.choice(game.get_symmetries(state))
            self.states_P[s], v = self.neural_net.predict(transformed_state)
            legal_moves = game.get_legal_moves(state, 1)
            # put 0 in the policy for illegal moves
            self.states_P[s] = self.states_P[s] * legal_moves
            # renormalize the policy
            policy_sum = self.states_P[s].sum().item()
            if policy_sum > 0:
                self.states_P[s] /= policy_sum
            else:
                # if all legal moves probabilities are 0, let all legal moves probabilities be equal
                # print something here as it is not expected to get this message often
                print(
                    "All legal moves probabilities are 0! Replacing with uniform distribution..."
                )
                self.states_P[s] = self.states_P[s] + legal_moves
                self.states_P[s] /= np.sum(self.states_P[s])

            self.states_valid_moves[s] = legal_moves
            self.states_N[s] = 0
            # the value is propagated up the search path
            return -v

        legal_moves = self.states_valid_moves[s]
        current_best = -float("inf")
        best_move = -1

        # pick the action with the highest upper confidence bound
        for a in range(game.ACTION_SIZE):
            if not legal_moves[a]:
                continue
            Q = self.states_actions_Q.get((s, a), 0)
            N = self.states_actions_N.get((s, a), 0)

            U = Q + self.config.cpuct * self.states_P[s][a] * math.sqrt(
                self.states_N[s]) / (1 + N)

            if U > current_best:
                current_best = U
                best_move = a

        a = best_move
        next_state, next_player = game.get_next_state(state, 1, a)
        next_state = game.get_canonical_form(next_state, next_player)

        # the value is retrieved from the next state
        v = self.search(next_state)

        if (s, a) in self.states_actions_Q:
            # update the running average of Q over the visit count
            n_sa = self.states_actions_N[(s, a)]
            self.states_actions_Q[(s, a)] = (n_sa * self.states_actions_Q[(s, a)] + v) / (n_sa + 1)
            self.states_actions_N[(s, a)] += 1
        else:
            self.states_actions_Q[(s, a)] = v
            self.states_actions_N[(s, a)] = 1

        self.states_N[s] += 1
        # the value is propagated up the remaining of the search path
        return -v
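Example #9 calls self.mcts.get_move_probabilities; below is a minimal sketch of how that method typically drives search(), assuming a num_mcts_sims config field (an assumption) and deriving probabilities from the visit counts kept in states_actions_N:

    # Hedged sketch, not the project's code: run search() repeatedly and turn
    # the resulting visit counts into move probabilities, sharpened by `temp`.
    def get_move_probabilities(self, canon_state, temp=1):
        for _ in range(self.config.num_mcts_sims):   # assumed config field
            self.search(canon_state)

        s = game.hash_state(canon_state)
        counts = [self.states_actions_N.get((s, a), 0)
                  for a in range(game.ACTION_SIZE)]

        if temp == 0:
            # play deterministically: all probability on a most-visited move
            probs = [0.0] * len(counts)
            probs[counts.index(max(counts))] = 1.0
            return probs

        counts = [c ** (1.0 / temp) for c in counts]
        total = float(sum(counts))
        return [c / total for c in counts]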