Example #1
    def close_episode(self, ep, flag=""):
        episode = ep[2:].copy()
        # walk through the episode backward
        episode.reverse()
        # for i in range(1, len(episode), 2):
        ###
        ###   TD learning
        ###   Hint: 1. compute the TD error (note the special case of the last state)
        ###         2. update the weight table
        ###
        return
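Filled in, the skeleton could look like the minimal sketch below. It follows the same episode layout as the later examples (elements are (state, action, reward, time usage); after reversing, the player's moves sit at the odd indices) and assumes a lookup self.get_value(board), an update self.update_value(board, delta) that returns the new value, and a learning rate self.alpha; these helper names are borrowed from Example #3 and are an assumption, not part of the skeleton.

    def close_episode(self, ep, flag=""):
        episode = ep[2:].copy()
        # walk through the episode backward
        episode.reverse()
        target = 0  # the value target of the final afterstate is taken as 0
        for i in range(1, len(episode), 2):
            # episode[i] is a player's move: its board is the afterstate,
            # its reward is the points gained by that slide
            afterstate, _, reward, _ = episode[i]
            error = target - self.get_value(afterstate)             # TD error
            new_value = self.update_value(afterstate, self.alpha * error)
            target = reward + new_value  # bootstrap target for the previous afterstate
        return

Updating from the end of the episode lets each afterstate see a target that already includes the freshly updated value of its successor, which is what the hint's "backward" pass is getting at.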
Example #2
    def close_episode(self, ep, flag=""):
        episode = ep[2:].copy()
        # walk through the episode backward
        episode.reverse()
        for i in range(1, len(episode) - 1, 2):
            before_state_next, _, _, _ = episode[i - 1]  # state one step later in time
            after_state, move, reward, _ = episode[i]    # afterstate, action and reward of this move
            before_state, _, _, _ = episode[i + 1]       # state this move was taken from
            #self.test_learn(before_state, move, reward, after_state, before_state_next)
            self.learn(before_state, move, reward, after_state, before_state_next)
        return
Example #3
    def close_episode(self, ep, flag=""):
        total_reward = 0
        episode = ep[2:].copy()
        # walk through the episode backward
        # episode element => (state, action, reward, time usage)
        episode.reverse()
        exact = 0  # running TD target carried over from the later step
        for i in range(3, len(episode), 2):
            reward = episode[i - 2][2]
            total_reward += reward
            value = self.get_value(episode[i][0])
            error = exact - value        # TD error of this afterstate against the carried target
            v = self.alpha * error / 32  # scaled learning-rate step
            # carry reward plus the updated value as the target for the previous afterstate
            exact = reward + self.update_value(episode[i][0], v)
        return total_reward
Example #4
    def close_episode(self, ep, flag=""):
        # note: relies on `copy` and `numpy as np` being imported at module level
        episode = ep[2:].copy()
        episode.reverse()

        def best_action(state):  # return the greedy action for `state` and its reward
            expValues = []
            rewards = []
            for op in range(4):
                tmpBoard = copy.copy(state)
                rewards.append(tmpBoard.slide(op))  # get the reward of the afterstate
                if rewards[-1] == -1:
                    # When the action is not allowed (reward==-1),
                    # it is impossible to take the action
                    expValues.append(-float("inf"))
                else:
                    expValues.append(rewards[-1] + self.lineValue(tmpBoard))
            best_move = np.argmax(expValues)
            return best_move, rewards[best_move]

        for idx in range(1, len(episode), 2):
            if idx == 1:  # Update the last state as 0
                idx0, idx1, idx2, idx3 = self.lineIndex(episode[2][0])
                self.net[0][idx0] = 0
                self.net[1][idx1] = 0
                self.net[2][idx2] = 0
                self.net[3][idx3] = 0
                continue
            sPrime = copy.copy(episode[idx][0])  # State s'
            sPrime2 = copy.copy(episode[idx - 1][0])  # State s''
            tmpBoard = copy.copy(sPrime2)
            actionNext, rewardNext = best_action(tmpBoard)  # best action and reward at state s''
            tmpBoard.slide(actionNext)
            sPrime2Next = copy.copy(tmpBoard)  # State s'(next)
            # TD error: r + V(s'(next)) - V(s')
            value = rewardNext + self.lineValue(sPrime2Next) - self.lineValue(sPrime)
            self.updateLineValue(board_state=sPrime, value=value)
        return