Exemplo n.º 1
def mcts(payload):
    """Run a fixed-budget tree search and return the selected move.

    Args:
        payload: mapping read for ``'role_id'``, ``'hand_card'`` (indexed by
            player id) and ``'last_taken'``.

    Returns:
        The ``action`` of the state held by the child that
        ``get_bestchild_`` picks under the root node.
    """
    root = Node(None, None)
    role = payload['role_id']

    # ``my_id`` uses the same "+2 mod 3" shift that is applied to ``last_p``
    # further down; the two following seats are derived from the raw role id.
    my_id = (role + 2) % 3
    next_id = (role + 1) % 3
    next_next_id = (role + 2) % 3

    def hand_of(player):
        # Convert one player's raw hand into the dict form used by the search.
        raw = payload['hand_card'][player]
        return card_list_to_dict(card_to_list(change_card_form_reversal(raw)))

    my_card = hand_of(role)
    next_card = hand_of(next_id)
    next_next_card = hand_of(next_next_id)

    raw_last_move, raw_last_player = get_last_move(
        role, next_id, next_next_id, payload['last_taken'])
    last_move = change_card_form_reversal(raw_last_move)
    last_p = (raw_last_player + 2) % 3
    moves_num = len(get_moves(my_card, last_move))

    # NOTE(review): this State is constructed but never attached to ``root``
    # in the visible code -- confirm whether a line was lost here.
    state = State(my_id, my_card, next_card, next_next_card, last_move, -1,
                  moves_num, None, last_p)

    # Fixed number of select/expand -> simulate -> back-propagate rounds.
    computation_budget = 1000
    for _ in range(computation_budget):
        leaf = tree_policy(root, my_id)
        backup(leaf, default_policy(leaf, my_id))

    return get_bestchild_(root).get_state().action
Exemplo n.º 2
def search(h, proc, max_iter, clean=True):
    """This function implements the UCT algorithm.

    Args:
        h (History): history in the current root of the tree
        proc (DecisionProcess): model of domain knowledge of the pomdp
        max_iter (int): maximum number of iterations
        clean (bool): toggle to reset the tree

    Returns:
        POMDPAction: the optimal action
    """
    assert isinstance(h, History)
    assert isinstance(proc, DecisionProcess)
    # init global vars
    params['start_time'] = time.time()
    if clean:
        params['root'] = Node(h.last_action(), h, 0, 0, list())

    # Reuse the child reached by the last real action, unless the history's
    # last action equals a default-constructed POMDPAction.
    root = params['root'].children[h.last_action(
    )] if h.last_action() != POMDPAction() else params['root']

    # root should have history given as args but B from previous root
    root.h = h.clone()
    # at each call to search, children of the current root must be regenerated, to
    # consider the last real action-observation obtained
    root.inTree = False

    if params['log'] >= 1:
        print("current root: {}, len(h): {}".format(h.actions[0], len(h)))
    ite = 0

    # time out: stop on either the iteration cap or the wall-clock budget
    def time_remaining():
        return ite < max_iter and (time.time() -
                                   params['start_time']) < params['timeout']

    # search
    while time_remaining():
        s = proc.initial_belief()
        if len(root.h) > 1:
            # sample the start state from the particles stored on the root
            s = random.choice(tuple(root.B))
        simulate(s, root, proc)
        ite += 1

    # greedy action selection
    a = UCB1_action_selection(root, greedy=True)[0]
    params['root'] = root
    child = root.children[a]

    # particle reinvigoration
    proc.invigoration(child.B, ite)
    if params['log'] >= 1:
        print("next belief size: {}".format(len(child.B)))
    return a
Exemplo n.º 3
    def choose(self, state):
        """Pick the move to play and return it as ``(move, None)``.

        When every player in ``self.game.players`` still holds more than 7
        cards (by summed ``get_hand_card()`` counts), the move is chosen by
        scoring the combinations produced by ``Decomposer``. Otherwise the
        method tries to re-locate ``self.current_node`` from the last two
        entries of ``self.game.cards_out``, rebuilds the tree root when that
        fails, and runs 2000 tree-search iterations.

        Args:
            state: object whose ``last_move`` is indexed per card slot
                (15 entries) in the rule-based branch.

        Returns:
            tuple: ``(move, None)`` where ``move`` is a per-card count list.
        """
        min_crads = min([sum(p.get_hand_card()) for p in self.game.players])
        if min_crads > 7:
            # Get the hand cards
            hand_card = self.get_hand_card()
            # The decomposer and the engine use different encodings:
            # 1 -> A, B -> *, R -> $
            # NOTE(review): the two list comprehensions below are missing
            # their closing "]" in this source -- the block does not parse
            # as written.
            trans_hand_card = [
                card_list[i] for i in range(15) for _ in range(hand_card[i])
            # Get the previous player's move
            last_move = [
                card_list[i] for i in range(15)
                for _ in range(state.last_move[i])
            # Decompose the hand into combinations
            D = Decomposer()
            combs, fine_mask = D.get_combinations(trans_hand_card, last_move)
            # Penalty per extra move, decided by the fewest cards an opponent
            # has left
            left_crads = [sum(p.get_hand_card()) for p in self.game.players]
            # NOTE(review): min() receives a single int here; an argument
            # (presumably left_crads[1]) looks lost -- confirm.
            min_oppo_crads = min(
                left_crads[2]) if self.player_id == 0 else left_crads[0]
            round_penalty = 15 - 12 * min_oppo_crads / 20
            # Search for the best move
            best_move = None
            best_comb = None
            max_value = -np.inf
            for i in range(len(combs)):
                # Total value of the hand
                total_value = sum([cards_value[x] for x in combs[i]])
                small_num = 0
                for j in range(0, len(combs[i])):
                    # NOTE(review): action_space is indexed by the loop
                    # position j here, while the other lookups in this method
                    # index by combs[i][j] -- confirm intent.
                    if j > 0 and action_space[j][0] not in ["2", "R", "B"]:
                        small_num += 1
                total_value -= small_num * round_penalty
                for j in range(0, len(combs[i])):
                    # Score for passing
                    if combs[i][j] == 0 and min_oppo_crads > 4:
                        if total_value > max_value:
                            max_value = total_value
                            best_comb = combs[i]
                            best_move = 0
                    # Score for playing cards
                    elif combs[i][j] > 0 and (fine_mask is None
                                              or fine_mask[i, j] == True):
                        # Special case: only one move left
                        if len(combs[i]) == 1 or len(
                                combs[i]) == 2 and combs[i][0] == 0:
                            max_value = np.inf
                            best_comb = combs[i]
                            best_move = combs[i][-1]
                        move_value = total_value - cards_value[
                            combs[i][j]] + round_penalty
                        if move_value > max_value:
                            max_value = move_value
                            best_comb = combs[i]
                            best_move = combs[i][j]
                # Fall back to action 0 when nothing has been selected so far
                if best_move is None:
                    best_comb = [0]
                    best_move = 0
            # Best move
            best_cards = action_space[best_move]
            move = [best_cards.count(x) for x in card_list]
            # Print the chosen combination
            # print("\nbest comb: ")
            # for m in best_comb:
            #     print(action_space[m], cards_value[m])
            # Print: player i [hand] // [move]
            print("Player {}".format(self.player_id),
                  ' ',
                  end=' // ')
            print(Card.visual_card(move), "From RuleBasedModel")
            return move, None

        #  start = time.time()
        #  Locate current_node
        cards_out = self.game.cards_out
        length = len(cards_out)
        #  Flag telling whether current_node was located
        #  (0: not found, 1: first move matched, 2: both moves matched)
        flag = 0
        if self.new_game is False:
            #  Moves chosen by the opponents in the previous two steps
            out1 = self.list_to_card(cards_out[length - 2][1])
            out2 = self.list_to_card(cards_out[length - 1][1])
            for child in self.current_node.get_children():
                if self.compare(child.state.action, out1):
                    self.current_node = child
                    flag = 1
            if flag == 1:
                for child in self.current_node.get_children():
                    if self.compare(child.state.action, out2):
                        self.current_node = child
                        flag = 2

        my_id = self.player_id
        if flag != 2:
            # The existing tree could not be followed: start from a new root.
            self.new_game = False
            root = Node(None, None)
            self.current_node = root

            #  Id of the next player
            next_id = (my_id + 1) % 3
            #  Id of the player after next
            next_next_id = (my_id + 2) % 3
            my_card = self.card_list_to_dict(self.get_hand_card())
            # NOTE(review): the two card_list_to_dict(...) calls below are
            # truncated in this source (arguments and closing parentheses
            # are missing) -- the block does not parse as written.
            #  Cards of the next player
            next_card = self.card_list_to_dict(
            #  Cards of the player after next
            next_next_card = self.card_list_to_dict(
            last_move = self.trans_card(Card.visual_card(self.game.last_move))
            last_p = self.game.last_pid
            moves_num = len(get_moves(my_card, last_move))
            # NOTE(review): `state` is built but not attached to the new root
            # in the visible code -- confirm whether a line was lost.
            state = State(my_id, my_card, next_card, next_next_card, last_move,
                          -1, moves_num, None, last_p)

        #  Search
        computation_budget = 2000
        for i in range(computation_budget):
            expand_node = tree_policy(self.current_node, my_id)
            reward = default_policy(expand_node, my_id)
            backup(expand_node, reward)
        best_next_node = get_bestchild(self.current_node, my_id)
        move = best_next_node.get_state().action
        # Keep the chosen child so the next call can continue from it.
        self.current_node = best_next_node
        new_move = self.card_to_list(move)

        # Expand the per-card counts into a flat list of card names for display.
        hand_card = []
        for i, n in enumerate(Card.all_card_name):
            hand_card.extend([n] * self.get_hand_card()[i])
        print("Player {}".format(self.player_id), ' ', hand_card, end=' // ')
        print(Card.visual_card(new_move), "From MctsModel")
        #  end = time.time()
        #  dur = end - start
        #  print('cost: {}'.format(dur))
        return new_move, None
Exemplo n.º 4
 def __init__(self, player_id):
     """Initialise the base model, then set up the search-tree bookkeeping.

     Args:
         player_id: forwarded unchanged to the parent constructor.
     """
     super(MixModel, self).__init__(player_id)
     # Start from a fresh node built from two ``None`` arguments.
     self.current_node = Node(None, None)
     self.new_game = False
Exemplo n.º 5
import os

# Module-level settings shared by the search routines.
params = {
    'K': 50,  # number of particles (size of the belief state space)
    # NOTE(review): the key of this entry was lost in the source; 'c' is
    # assumed from the comment (exploration constant) -- confirm against the
    # code that reads it.
    'c': 0,  # exploration / exploitation ratio scalar constant (domain specific)
    'epsilon': 0.0,  # history discount factor
    'gamma': 1,  # reward discount factor
    'R_lo': 0,  # lowest value V(h) reached
    'R_hi': 1,  # highest value V(h) reached
    'timeout': 120,  # timeout for each iteration in seconds
    'start_time': 0,  # start time in seconds
    'max_depth': 20,  # max depth
    'log': 1,  # level of logs printed on console [0,2]
    'prefs': True,  # enable/disable prefered actions
    'root': Node(POMDPAction(), History(), 0, 0, list()),
}

# start time
start_time = 0

def UCB1_action_selection(node, greedy=False):
    Implementation of the UCB1 algorithm for solving a multi-armed bandit problem.


    Each action a available from the history h is assigned a value V(ha),
    computed from simulations of the POMDP from the history h.
    In non-greedy mode, this value is augmented by an exploration bonus for rarely-tried actions.
 def __init__(self, player_id):
     """Initialise the base model and start from a fresh tree node.

     Args:
         player_id: forwarded unchanged to the parent constructor.
     """
     super(MctsModel, self).__init__(player_id)
     # Start from a fresh node built from two ``None`` arguments.
     self.current_node = Node(None, None)