  (2, 2): 'U',
  (2, 3): 'L',
}

model = Model()
deltas = []

# repeat until convergence
k = 1.0
for it in range(20000):
  if it % 10 == 0:
    k += 0.01
  alpha = ALPHA / k
  biggest_change = 0

  # generate an episode using pi
  states_and_rewards = play_game(grid, policy)
  # the first (s, r) tuple is the state we start in and 0
  # (since we don't get a reward for simply starting the game)
  # the last (s, r) tuple is the terminal state and the final reward
  # the value of the terminal state is by definition 0, so we don't
  # care about updating it
  for t in range(len(states_and_rewards) - 1):
    s, _ = states_and_rewards[t]
    s2, r = states_and_rewards[t+1]
    # we will update V(s) AS we experience the episode
    old_theta = model.theta.copy()
    if grid.is_terminal(s2):
      target = r
    else:
      target = r + GAMMA * model.predict(s2)
    # semi-gradient TD(0) update on the model parameters
    model.theta += alpha * (target - model.predict(s)) * model.grad(s)
    biggest_change = max(biggest_change, np.abs(old_theta - model.theta).sum())
  deltas.append(biggest_change)

plt.plot(deltas)
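The loop above assumes a Model object exposing theta, predict(s), and grad(s), which are not defined in this excerpt. A minimal sketch of such a class, assuming a linear value approximator V(s) = theta . x(s) over a hand-crafted feature vector x(s) for each grid position (the particular feature mapping s2x below is only an illustrative assumption, not the definitive choice):

import numpy as np

class Model:
  def __init__(self):
    # one weight per feature; small random initialization
    self.theta = np.random.randn(4) / 2

  def s2x(self, s):
    # map a grid position s = (i, j) to a feature vector;
    # this specific mapping is a hypothetical example
    return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])

  def predict(self, s):
    # linear approximation: V_hat(s) = theta . x(s)
    return self.theta.dot(self.s2x(s))

  def grad(self, s):
    # for a linear model, the gradient of V_hat(s) w.r.t. theta is just x(s)
    return self.s2x(s)

With a linear model like this, the line model.theta += alpha * (target - model.predict(s)) * model.grad(s) is the semi-gradient TD(0) update: theta <- theta + alpha * [r + GAMMA * V_hat(s2) - V_hat(s)] * x(s), where the target r + GAMMA * V_hat(s2) is treated as a constant (only the prediction for s is differentiated).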