import numpy as np
import matplotlib.pyplot as plt

# our model is V_hat = theta.dot(x)
# where x = [row, col, row*col, 1] -- the 1 is the bias term
# LEARNING_RATE, grid, policy and play_game are assumed to be defined
# earlier in the text (a hypothetical stand-in sketch is given below)
theta = np.random.randn(4) / 2

def s2x(s):
    # map a state (row, col) to the feature vector [row, col, row*col, 1],
    # shifted so each feature is roughly zero-centered over the grid
    return np.array([s[0] - 1, s[1] - 1.5, s[0]*s[1] - 3, 1])

# repeat until convergence
deltas = []
t = 1.0
for it in range(20000):
    if it % 100 == 0:
        t += 0.01
    alpha = LEARNING_RATE / t  # decaying step size

    # generate an episode using pi
    biggest_change = 0
    states_and_returns = play_game(grid, policy)
    seen_states = set()
    for s, G in states_and_returns:
        # check if we have already seen s
        # (this is "first-visit" MC policy evaluation)
        if s not in seen_states:
            old_theta = theta.copy()
            x = s2x(s)
            V_hat = theta.dot(x)
            # grad(V_hat) wrt theta = x
            theta += alpha*(G - V_hat)*x
            biggest_change = max(biggest_change, np.abs(old_theta - theta).sum())
            seen_states.add(s)
    deltas.append(biggest_change)

plt.plot(deltas)
plt.show()
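# ---------------------------------------------------------------------------
# Why the update is theta += alpha * (G - V_hat) * x: with the linear model
# V_hat = theta.dot(x), each (s, G) pair contributes the squared error
#   J(theta) = 0.5 * (G - theta.dot(x))**2,
# whose gradient w.r.t. theta is -(G - theta.dot(x)) * x. Stepping against
# the gradient with step size alpha gives exactly the update in the listing.
# The snippet below is an added sanity check (not part of the original
# example) comparing this analytic gradient to finite differences.
# ---------------------------------------------------------------------------
def mc_loss(theta, x, G):
    return 0.5 * (G - theta.dot(x))**2

th_chk = np.random.randn(4)
x_chk = np.array([1.0, 0.5, 0.5, 1.0])  # arbitrary test point
G_chk = 2.0
eps = 1e-6
numeric = np.array([
    (mc_loss(th_chk + eps*np.eye(4)[i], x_chk, G_chk)
     - mc_loss(th_chk - eps*np.eye(4)[i], x_chk, G_chk)) / (2*eps)
    for i in range(4)
])
analytic = -(G_chk - th_chk.dot(x_chk)) * x_chk
assert np.allclose(numeric, analytic, atol=1e-5)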
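# ---------------------------------------------------------------------------
# The listing assumes LEARNING_RATE, grid, policy and play_game are defined
# earlier in the text. The definitions below are a minimal, hypothetical
# stand-in (a 3x4 gridworld with a fixed deterministic policy), not the
# original ones; paste them above the listing to run it end to end.
# ---------------------------------------------------------------------------
import numpy as np

GAMMA = 0.9            # assumed discount factor
LEARNING_RATE = 0.001  # assumed step size

class Grid:
    # classic 3x4 gridworld: +1 at (0, 3), -1 at (1, 3), wall at (1, 1)
    def __init__(self):
        self.rewards = {(0, 3): 1, (1, 3): -1}
        self.actions = {  # legal moves from each non-terminal state
            (0, 0): ('D', 'R'), (0, 1): ('L', 'R'), (0, 2): ('L', 'D', 'R'),
            (1, 0): ('U', 'D'), (1, 2): ('U', 'D', 'R'),
            (2, 0): ('U', 'R'), (2, 1): ('L', 'R'),
            (2, 2): ('L', 'R', 'U'), (2, 3): ('L', 'U'),
        }

    def set_state(self, s):
        self.s = s

    def game_over(self):
        return self.s not in self.actions

    def move(self, a):
        i, j = self.s
        if a in self.actions[self.s]:  # ignore illegal moves
            if a == 'U': i -= 1
            elif a == 'D': i += 1
            elif a == 'L': j -= 1
            elif a == 'R': j += 1
        self.s = (i, j)
        return self.rewards.get(self.s, 0)

grid = Grid()

# a fixed policy to evaluate; every path ends at the +1 terminal state
policy = {
    (0, 0): 'R', (0, 1): 'R', (0, 2): 'R',
    (1, 0): 'U', (1, 2): 'U',
    (2, 0): 'U', (2, 1): 'R', (2, 2): 'U', (2, 3): 'L',
}

def play_game(grid, policy):
    # start from a random non-terminal state (exploring starts), roll the
    # policy to termination, then compute discounted returns backwards
    starts = list(grid.actions.keys())
    grid.set_state(starts[np.random.choice(len(starts))])
    s = grid.s
    states_and_rewards = [(s, 0)]
    while not grid.game_over():
        r = grid.move(policy[s])
        s = grid.s
        states_and_rewards.append((s, r))
    # G at each state is the discounted sum of rewards received after it
    G = 0
    states_and_returns = []
    first = True
    for s, r in reversed(states_and_rewards):
        if first:
            first = False  # skip terminal state; its return is 0 by definition
        else:
            states_and_returns.append((s, G))
        G = r + GAMMA * G
    states_and_returns.reverse()
    return states_and_returns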