示例#1
0
    def greedy_action(self, free_qblock_id_lists, collapsed_qttts, mark):
        """
        Choose a move greedily by the stored value of the state it leads to.

        Each candidate move is applied with ``after_action_state`` and the
        resulting state is scored with ``GameTree.get_state_val``. One of the
        candidates reported by ``GameTree.best_states`` is then picked at
        random.

        :param free_qblock_id_lists: one entry per board in
            ``collapsed_qttts``; each entry is either ``None`` or an indexable
            sequence of free block ids
        :param collapsed_qttts: non-empty sequence of candidate boards
        :param int mark: an odd mark selects with ``min``, an even mark
            selects with ``max``
        :return: tuple ``(board, (loc1, loc2))``; the location pair is
            ``(-1, -1)`` when the chosen board's free list entry is ``None``
        """
        assert len(collapsed_qttts) > 0

        # Maps (board index, loc1, loc2) -> value of the state after the move.
        candidate_values = {}
        for board_idx, board in enumerate(collapsed_qttts):
            free_ids = free_qblock_id_lists[board_idx]

            if free_ids is None:
                # No placement for this board: score the state reached with a
                # None action and record it under the (-1, -1) sentinel pair.
                resulting = after_action_state(board, None, mark)
                candidate_values[(board_idx, -1, -1)] = \
                    GameTree.get_state_val(resulting)
                continue

            # Enumerate every unordered pair of distinct free blocks.
            free_count = len(free_ids)
            for first_pos in range(free_count - 1):
                loc1 = free_ids[first_pos]
                for second_pos in range(first_pos + 1, free_count):
                    loc2 = free_ids[second_pos]
                    resulting = after_action_state(board, (loc1, loc2), mark)
                    candidate_values[(board_idx, loc1, loc2)] = \
                        GameTree.get_state_val(resulting)

        # Odd marks look for the lowest value, even marks for the highest.
        selector = min if mark % 2 == 1 else max
        # best_states presumably returns the keys tied for the best value;
        # the random pick is made among whatever it returns.
        best_keys = GameTree.best_states(candidate_values, selector)

        chosen_idx, chosen_loc1, chosen_loc2 = random.choice(best_keys)
        return (collapsed_qttts[chosen_idx], (chosen_loc1, chosen_loc2))
示例#2
0
 def bellman_backup(self, qttt, next_qttt, reward, mark):
     """
     Apply one temporal-difference update to the stored value of ``qttt``.

     The new value is ``V(s) + alpha * (reward + gamma * V(s') - V(s))``,
     where ``V`` is read through ``GameTree.get_state_val`` and written back
     through ``GameTree.set_state_value``.

     :param qttt: board before the action; must provide ``get_state()``
     :param next_qttt: board after the action; must provide ``get_state()``
     :param reward: immediate reward for this round
     :param mark: not used by this method
     :return: None
     """
     current_value = GameTree.get_state_val(qttt.get_state())
     successor_value = GameTree.get_state_val(next_qttt.get_state())

     # ``gamma`` is not defined in this method; presumably a module-level
     # discount factor -- confirm against the rest of the file.
     td_target = reward + gamma * successor_value
     td_error = td_target - current_value
     new_value = current_value + self.alpha * td_error

     GameTree.set_state_value(qttt.get_state(), new_value)