def _choose(self, state, available_actions):
    if self.is_train_mode and random.random() < self.exploit_rate:
        return random.choice(available_actions)
    ob = OptimalBoard(state)
    converted_actions = ob.convert_action_to_optimal(available_actions)
    action = self.q.rargmax(ob.board_id, converted_actions)
    return ob.convert_action_to_original(action)
def _feedback(self, state, action, next_state, reward, done):
    state_ob = OptimalBoard(state)
    converted_action = state_ob.convert_action_to_optimal(action)
    converted_state = self.convert_state(state_ob.optimal_board)
    next_ob = OptimalBoard(next_state)
    converted_next_state = self.convert_state(next_ob.optimal_board)
    self.network.add_train_set(converted_state, converted_action, reward,
                               converted_next_state, done)
    self.network.study()
def find_next(board, color, seq):
    actions = SP.available_actions(board)
    for action in actions:
        new_board = board[:]
        reward, done = SP.play(new_board, action, color)
        if done:
            # terminal node: print the move sequence and resulting board id
            if reward == 0:
                print(seq + str(action), '=', OB.board_to_id(new_board))
            else:
                print(seq + str(action), MARKER[color], OB.board_to_id(new_board))
        else:
            find_next(new_board, SP.next(color), seq + str(action))
def _choose(self, state, actions):
    if self.is_train_mode and random.random() < self.exploit_rate:
        next_pos = random.choice(actions)
        if self.debug:
            print("SELECT", actions, "RANDOM", next_pos)
        return next_pos

    found_p = -1.0
    found_c = []
    ob = OptimalBoard(state)
    _id = ob.board_id
    if self.debug:
        print("FROM", _id)
    scores = self.p_table.lookup(_id)
    converted_actions = ob.convert_action_to_optimal(actions)
    for action in converted_actions:
        p = scores[action]
        if self.debug:
            print("ACTION", ob.convert_action_to_original(action), p)
        if p > found_p:
            found_p = p
            found_c = [ob.convert_action_to_original(action)]
        elif p == found_p:
            found_c.append(ob.convert_action_to_original(action))

    next_pos = random.choice(found_c)
    if self.debug:
        print("SELECT", found_c, found_p, next_pos)
    return next_pos
def _calculate_reward(self, history, final_reward):
    '''
    Convert the turn history into training samples and assign each turn a
    discounted reward (final_reward multiplied by GAMMA per remaining step).
    '''
    replay_buffer = []
    size = len(history)
    for idx, turn in enumerate(history):
        optimal_board = OptimalBoard(turn[self.HISTORY_STATE])
        converted_action = optimal_board.convert_action_to_optimal(
            turn[self.HISTORY_ACTION])
        converted_state = self.convert_state(optimal_board.optimal_board)
        replay_buffer.append([
            converted_state, converted_action,
            final_reward * GAMMA**(size - idx - 1)
        ])

    # walk the episode backwards and overwrite index 2 (the reward of every turn)
    # with the running discounted value; this produces the same numbers as the
    # expression used above
    running_add = final_reward
    for i in reversed(range(len(replay_buffer))):
        replay_buffer[i][2] = running_add
        running_add = running_add * GAMMA
    return replay_buffer
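# Illustration only (not part of the agent): the backward pass in _calculate_reward
# gives the most recent turn the full final_reward and discounts older turns by
# GAMMA per step. Assuming GAMMA = 0.9 (the real constant is defined elsewhere in
# the repo) and final_reward = 1.0 for a 3-turn episode:
def _discount_example():
    gamma = 0.9
    final_reward = 1.0
    rewards = [None, None, None]      # one slot per turn, oldest first
    running_add = final_reward
    for i in reversed(range(len(rewards))):
        rewards[i] = running_add      # newest turn gets 1.0
        running_add *= gamma          # older turns get 0.9, then 0.81
    return rewards                    # approximately [0.81, 0.9, 1.0]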
def negamax(self, state, color, depth=10):
    '''
    implement negamax algorithm
    https://en.wikipedia.org/wiki/Negamax
    '''
    # negamax.counter += 1

    # CHECK LEAF NODE / DO NOT NEED TO CHECK DEPTH = 0 BECAUSE TicTacToe is too small
    # LEAF NODE is checked at play time

    # Transposition Table related work (state)
    # _id = ob.board_id
    _id = OptimalBoard.board_to_id(state)
    cache = self.tp.get(_id)
    if cache is not None:  # BUG FIX! cache can be 0, so check against None
        # case 1
        # return cache
        # case 2
        return cache[0], random.choice(cache[1])

    # RECURSIVE
    actions = SP.available_actions(state)
    random.shuffle(actions)  # with move ordering, alpha-beta pruning performs better
    best_score = -math.inf
    best_actions = []
    for action in actions:
        next_s = state[:]
        score, done = SP.play(next_s, action, color)
        if not done:
            score, _ = self.negamax(next_s, SP.next(color), depth - 1)
            score = -score  # negamax

        # pick from all best moves
        if score > best_score:
            best_score = score
            best_actions = [action]
        elif score == best_score:
            best_actions.append(action)

    # case 1: choose a random value once and cache that choice
    # choosed_result = random.choice(best_scores)
    # tp.put(_id, choosed_result)
    # return choosed_result

    # case 2: cache all best moves and choose a random one every time
    self.tp.put(_id, (best_score, best_actions))
    return (best_score, random.choice(best_actions))
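# Why the `is not None` check above matters (illustration only): a cached value of
# 0 (e.g. a drawn position's score under "case 1") is falsy, so `if cache:` would
# treat a valid cache hit as a miss and needlessly re-search the position.
def _cache_check_example():
    cache = 0                                 # a legitimate cached score for a draw
    hit_with_truthiness = bool(cache)         # False -- wrongly looks like a miss
    hit_with_none_check = cache is not None   # True  -- correctly a hit
    return hit_with_truthiness, hit_with_none_check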
def _choose(self, state, available_actions):
    optimal_board = OptimalBoard(state)
    converted_actions = optimal_board.convert_action_to_optimal(
        available_actions)
    converted_state = self.convert_state(optimal_board.optimal_board)

    if self.is_train_mode and random.random() < self.egreedy:
        action = random.choice(converted_actions)
    else:
        action = self.network.predict_one(converted_state)

    if action not in converted_actions:
        # still undecided what to feed back as a training sample here,
        # or whether to filter invalid moves inside predict_one instead
        self.network.add_train_set(converted_state, action, -1,
                                   self.convert_state([-1] * 9), True)
        action = random.choice(converted_actions)

    original_action = optimal_board.convert_action_to_original(action)
    return original_action
def _episode_feedback(self, reward):
    # for winner
    history_left = reversed(self.all_history())

    (state, action, _, _, _) = next(history_left)  # pop last history entry and set it
    ob = OptimalBoard(state)
    reward = self.p_table.set(ob.board_id,
                              ob.convert_action_to_optimal(action), reward)

    for (state, action, _, _, _) in history_left:
        ob = OptimalBoard(state)
        reward = self.p_table.learn(ob.board_id,
                                    ob.convert_action_to_optimal(action), reward)
def negamax_alpha_beta_pruning(self,
                               state,
                               color,
                               alpha=-math.inf,
                               beta=math.inf,
                               depth=10):
    '''
    implement negamax algorithm with alpha-beta pruning
    https://en.wikipedia.org/wiki/Negamax
    '''
    # negamax.counter += 1

    # CHECK LEAF NODE / DO NOT NEED TO CHECK DEPTH = 0 BECAUSE TicTacToe is too small
    # LEAF NODE is checked at play time
    orig_alpha = alpha

    # Transposition Table related work
    # ob = OptimalBoard(state)
    # _id = ob.board_id
    _id = OptimalBoard.board_to_id(state)
    cache = self.tp.get(_id)
    if cache and cache['depth'] >= depth:
        (cached_score, cached_action) = cache['value']
        if cache['flag'] == self.tp.EXACT:
            return (cached_score, cached_action)
        elif cache['flag'] == self.tp.LOWERBOUND:
            alpha = max(alpha, cached_score)
        elif cache['flag'] == self.tp.UPPERBOUND:
            beta = min(beta, cached_score)
        if alpha >= beta:
            return cached_score, cached_action
    # else:
    #     print("MISS", t.seq)

    # RECURSIVE
    actions = SP.available_actions(state)
    random.shuffle(actions)  # with move ordering, alpha-beta pruning performs better
    best_score = -math.inf
    best_move = -1
    for action in actions:
        next_s = state[:]
        score, done = SP.play(next_s, action, color)
        if not done:
            score, _ = self.negamax_alpha_beta_pruning(next_s,
                                                       SP.next(color),
                                                       alpha=-beta,
                                                       beta=-alpha,
                                                       depth=depth - 1)
            score = -score  # negamax

        # just pick the first best move (random.shuffle provides the randomness)
        if best_score < score or (score == best_score and random.random() < 0.5):
            best_score = score
            best_move = action
        if alpha < score:
            alpha = score  # effectively alpha = max(alpha, best_score)
        if alpha >= beta:
            break

    if best_score <= orig_alpha:
        flag = self.tp.UPPERBOUND
    elif best_score >= beta:
        flag = self.tp.LOWERBOUND
    else:
        flag = self.tp.EXACT
    self.tp.put(key=_id, depth=depth, value=(best_score, best_move), flag=flag)

    return (alpha, best_move)
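# Minimal sketch (hypothetical, for illustration) of the transposition-table
# interface that negamax_alpha_beta_pruning assumes: get()/put() plus the three
# bound flags. The repo's real implementation may differ.
class SimpleTranspositionTable:
    EXACT = 0
    LOWERBOUND = 1
    UPPERBOUND = 2

    def __init__(self):
        self._table = {}

    def get(self, key):
        # returns None on a miss, otherwise {'depth': ..., 'value': ..., 'flag': ...}
        return self._table.get(key)

    def put(self, key, depth, value, flag):
        self._table[key] = {'depth': depth, 'value': value, 'flag': flag}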
def _feedback(self, state, action, next_state, reward, done):
    ob1 = OptimalBoard(state)
    ob2 = OptimalBoard(next_state)
    self.q.learn(ob1.board_id, ob1.convert_action_to_optimal(action), reward,
                 ob2.board_id)
def to_board_id(board):
    '''
    Return the board id used as the node key.
    '''
    return OptimalBoard(board).board_id
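# Hypothetical illustration of what a board id could look like: a base-3 encoding
# of the nine cells (0 = empty, 1 = X, 2 = O). This is an assumption for
# illustration only; OptimalBoard presumably also canonicalizes the board (e.g.
# via rotations/reflections) before computing its id, which this naive version
# does not do.
def naive_board_to_id(board):
    board_id = 0
    for cell in board:            # board is assumed to be a flat list of 9 cells
        board_id = board_id * 3 + cell
    return board_id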