def close_episode(self, ep, flag=""): episode = ep[2:].copy() # backward episode.reverse() #for i in range(1, len(episode), 2): ### ### TD learning ### Hint: 1. calculate TD error (notice for the last state) ### 2. update the weight table ### return
def close_episode(self, ep, flag=""): episode = ep[2:].copy() # backward episode.reverse() for i in range(1, len(episode) - 1, 2): before_state_next, _, _, _ = episode[i - 1] after_state, move, reward, _ = episode[i] before_state, _, _, _ = episode[i + 1] #self.test_learn(before_state, move, reward, after_state, before_state_next) self.learn(before_state, move, reward, after_state, before_state_next) return
def close_episode(self, ep, flag = ""): total_reward = 0 episode = ep[2:].copy() # backward ## episode element => (state, action, reward, time usage) episode.reverse() exact = 0 for i in range(3, len(episode), 2): reward = episode[i-2][2] total_reward += reward value = self.get_value(episode[i][0]) error = exact - value v = self.alpha * error / 32 exact = reward + self.update_value(episode[i][0], v) return total_reward
def close_episode(self, ep, flag=""): episode = ep[2:].copy() episode.reverse() def best_action(state): # Return the best action expValues = [] rewards = [] for op in range(4): tmpBoard = copy.copy(state) rewards.append( tmpBoard.slide(op)) # get the reward of afterstate if rewards[-1] == -1: # When the action is not allowed (reward==-1), # it is impossible to take the action expValues.append(-float("inf")) else: expValues.append(rewards[-1] + self.lineValue(tmpBoard)) best_move = np.argmax(expValues) return best_move, rewards[best_move] for idx in range(1, len(episode), 2): if idx == 1: # Update the last state as 0 idx0, idx1, idx2, idx3 = self.lineIndex(episode[2][0]) self.net[0][idx0] = 0 self.net[1][idx1] = 0 self.net[2][idx2] = 0 self.net[3][idx3] = 0 continue sPrime = copy.copy(episode[idx][0]) # State s' sPrime2 = copy.copy(episode[idx - 1][0]) # State s'' tmpBoard = copy.copy(sPrime2) actionNext, rewardNext = best_action( tmpBoard) # best action and reward at State s'' tmpBoard.slide(actionNext) sPrime2Next = copy.copy(tmpBoard) # State s'(next) value = rewardNext + self.lineValue(sPrime2Next) - self.lineValue( sPrime) self.updateLineValue(board_state=sPrime, value=value) return