def greedy_action(self, free_qblock_id_lists, collapsed_qttts, mark):
    """
    Choose the best next action greedily over all collapsed qttts.

    :param list free_qblock_id_lists: free qblock ids for each collapsed qttt,
        or None if that qttt has no free pair of blocks left
    :param list collapsed_qttts: candidate collapsed qttt states
    :param int mark: mark of the player to move
    :return: tuple of (collapsed_qttt, (loc1, loc2)) for the chosen action
    """
    assert len(collapsed_qttts) > 0
    states = {}
    for i in range(len(collapsed_qttts)):
        # No free qblocks left: the only move on this qttt is the "no placement" action.
        if free_qblock_id_lists[i] is None:
            nstate = after_action_state(collapsed_qttts[i], None, mark)
            states[(i, -1, -1)] = GameTree.get_state_val(nstate)
            continue
        # Score every unordered pair of free qblock ids as a candidate action.
        n = len(free_qblock_id_lists[i])
        for j in range(n - 1):
            for k in range(j + 1, n):
                loc1 = free_qblock_id_lists[i][j]
                loc2 = free_qblock_id_lists[i][k]
                nstate = after_action_state(collapsed_qttts[i], (loc1, loc2), mark)
                states[(i, loc1, loc2)] = GameTree.get_state_val(nstate)
    # Odd marks minimize the state value, even marks maximize it.
    if mark % 2 == 1:
        indices = GameTree.best_states(states, min)
    else:
        indices = GameTree.best_states(states, max)
    # Break ties among equally good actions at random.
    i, j, k = random.choice(indices)
    action = (collapsed_qttts[i], (j, k))
    return action
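# A minimal, self-contained sketch of the candidate enumeration and greedy
# pick performed above, assuming a plain dict in place of GameTree's value
# store; `free_ids` and `state_val` are illustrative stand-ins, not part of
# the real API. It mirrors the nested j/k loops via itertools.combinations.
def _demo_greedy_pick():
    import itertools
    import random

    free_ids = [0, 3, 5, 8]                          # free qblock ids on one board
    state_val = {pair: random.random()               # hypothetical value lookup
                 for pair in itertools.combinations(free_ids, 2)}
    # Pick the highest-valued pair, breaking ties at random (as greedy_action does).
    best = max(state_val.values())
    ties = [pair for pair, v in state_val.items() if v == best]
    return random.choice(ties)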
def bellman_backup(self, qttt, next_qttt, reward, mark):
    """
    Bellman backup (TD(0) update) for TD learning.

    :param Qttt qttt: current state of qttt
    :param Qttt next_qttt: next state after the action is taken
    :param int reward: immediate reward for this round
    :param int mark: mark of the player (not used in this update)
    :return: None
    """
    state_value = GameTree.get_state_val(qttt.get_state())
    next_state_value = GameTree.get_state_val(next_qttt.get_state())
    # V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
    updated_state_value = state_value + self.alpha * (
        reward + self.gamma * next_state_value - state_value)
    GameTree.set_state_value(qttt.get_state(), updated_state_value)
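# A minimal, self-contained sketch of the TD(0) backup above, assuming a
# plain dict as the value table; `values`, `alpha`, and `gamma` here are
# illustrative stand-ins for GameTree's state-value store and the agent's
# hyperparameters, not the real API.
def _demo_td0_backup():
    values = {"s0": 0.0, "s1": 0.5}   # hypothetical state-value table
    alpha, gamma = 0.1, 0.9           # learning rate, discount factor
    reward = 1.0                      # immediate reward for the transition
    # V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
    td_target = reward + gamma * values["s1"]
    values["s0"] += alpha * (td_target - values["s0"])
    return values["s0"]               # 0.1 * (1.0 + 0.45 - 0.0) = 0.145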