import sys
from collections import defaultdict, deque

import gym
import matplotlib.pyplot as plt
import numpy as np

import check_test
from plot_utils import plot_values


def sarsa(env, num_episodes, alpha, gamma=1.0):
    # initialize action-value function (empty dictionary of arrays)
    Q = defaultdict(lambda: np.zeros(env.nA))
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
            # plot the estimated optimal state-value function
            V_sarsa = [np.max(Q[key]) if key in Q else 0 for key in np.arange(48)]
            plot_values(V_sarsa)
        # Sarsa update: Q[s0][a0] = (1 - alpha) * Q[s0][a0] + alpha * (r + gamma * Q[s1][a1])
        s0 = env.reset()
        a0 = eps_greedy_act(Q[s0], env, 1.0 / i_episode)
        for i in range(1000):
            s1, r, done, info = env.step(a0)
            if not done:
                a1 = eps_greedy_act(Q[s1], env, 1.0 / i_episode)
                Q[s0][a0] = (1 - alpha) * Q[s0][a0] + alpha * (r + gamma * Q[s1][a1])
            else:
                Q[s0][a0] = (1 - alpha) * Q[s0][a0] + alpha * r
                break
            a0 = a1
            s0 = s1
    return Q
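# sarsa() above relies on an `eps_greedy_act` helper that is not defined in this
# snippet. A minimal sketch of what such a helper could look like; the name and
# signature are inferred from the call sites above, so treat it as an assumption:
def eps_greedy_act(Q_s, env, eps):
    # with probability eps pick a random action, otherwise act greedily w.r.t. Q_s
    if np.random.random() < eps:
        return env.action_space.sample()
    return np.argmax(Q_s)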
def plot_optimal_policy():
    # define the optimal state-value function
    V_opt = np.zeros((4, 12))
    V_opt[0] = -np.arange(3, 15)[::-1]
    V_opt[1] = -np.arange(3, 15)[::-1] + 1
    V_opt[2] = -np.arange(3, 15)[::-1] + 2
    V_opt[3][0] = -13
    plot_values(V_opt)
def exploitQ(Q):
    # print the estimated optimal policy
    derived_policy = np.array([np.argmax(Q[key]) if key in Q else -1
                               for key in np.arange(48)]).reshape(4, 12)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(derived_policy)
    # plot the estimated optimal state-value function
    V_sarsa = [np.max(Q[key]) if key in Q else 0 for key in np.arange(48)]
    plot_values(V_sarsa)
def test_expected_sarsa(env):
    # obtain the estimated optimal policy and corresponding action-value function
    Q_expsarsa = TD.expected_sarsa(env, 500, .2)
    # print the estimated optimal policy
    policy_expsarsa = np.array([np.argmax(Q_expsarsa[key]) if key in Q_expsarsa else -1
                                for key in np.arange(48)]).reshape(4, 12)
    check_test.run_check('td_control_check', policy_expsarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_expsarsa)
    # plot the estimated optimal state-value function
    plot_values([np.max(Q_expsarsa[key]) if key in Q_expsarsa else 0 for key in np.arange(48)])
def test_q_learning(env):
    # visualize the estimated optimal policy and the corresponding state-value function
    # obtain the estimated optimal policy and corresponding action-value function
    Q_sarsamax = TD.q_learning(env, 500, .2)
    # print the estimated optimal policy
    policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1
                                for key in np.arange(48)]).reshape((4, 12))
    check_test.run_check('td_control_check', policy_sarsamax)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_sarsamax)
    # plot the estimated optimal state-value function
    plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)])
def test_sarsa():
    # obtain the estimated optimal policy and corresponding action-value function
    # eps=.1 gives the safe path, eps=.01 the optimal path
    Q_sarsa = sarsa(env, 5000, .01, eps=0.01)
    # print the estimated optimal policy
    policy_sarsa = np.array([np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1
                             for key in np.arange(48)]).reshape(4, 12)
    check_test.run_check('td_control_check', policy_sarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_sarsa)
    # plot the estimated optimal state-value function
    V_sarsa = [np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)]
    plot_values(V_sarsa)
def evaluate_sarsa():
    env = gym.make("CliffWalking-v0")
    # obtain the estimated optimal policy and corresponding action-value function
    Q_sarsa = sarsa(env, 5000, 0.01)
    # print the estimated optimal policy
    policy_sarsa = np.array([np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1
                             for key in np.arange(48)]).reshape(4, 12)
    check_test.run_check("td_control_check", policy_sarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_sarsa)
    # plot the estimated optimal state-value function
    V_sarsa = [np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)]
    plot_values(V_sarsa)
def prediction():
    policy = np.hstack([1 * np.ones(11), 2, 0, np.zeros(10), 2, 0,
                        np.zeros(10), 2, 0, -1 * np.ones(11)])
    print("\nPolicy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy.reshape(4, 12))
    # define the true state-value function for this policy
    V_true = np.zeros((4, 12))
    for i in range(3):
        V_true[i] = -np.arange(3, 15)[::-1] - i
    V_true[1][11] = -2
    V_true[2][11] = -1
    V_true[3][0] = -17
    plot_values(V_true)
    # compare against the TD(0) estimate
    V_pred = td_prediction(env, 5000, policy, .01)
    V_pred_plot = np.reshape([V_pred[key] if key in V_pred else 0
                              for key in np.arange(48)], (4, 12))
    plot_values(V_pred_plot)
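# prediction() above calls a `td_prediction` helper that is not defined in this
# snippet. A minimal TD(0) prediction sketch, assuming the policy maps each of the
# 48 states to a single action (signature inferred from the call above):
def td_prediction(env, num_episodes, policy, alpha, gamma=1.0):
    # initialize the state-value estimate as an empty dictionary of floats
    V = defaultdict(float)
    for i_episode in range(1, num_episodes + 1):
        state = env.reset()
        while True:
            action = int(policy[state])
            next_state, reward, done, info = env.step(action)
            # TD(0) update: V(S) <- V(S) + alpha * (R + gamma * V(S') - V(S))
            V[state] = V[state] + alpha * (reward + gamma * V[next_state] - V[state])
            state = next_state
            if done:
                break
    return V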
def truncated_policy_evaluation(env, policy, V, max_it=1, gamma=1):
    num_it = 0
    while num_it < max_it:
        for s in range(env.nS):
            v = 0
            q = q_from_v(env, V, s, gamma)
            for a, action_prob in enumerate(policy[s]):
                v += action_prob * q[a]
            V[s] = v
        num_it += 1
    return V


def truncated_policy_iteration(env, max_it=1, gamma=1, theta=1e-8):
    V = np.zeros(env.nS)
    policy = np.zeros([env.nS, env.nA]) / env.nA
    while True:
        policy = policy_improvement(env, V)
        old_V = copy.copy(V)
        V = truncated_policy_evaluation(env, policy, V, max_it, gamma)
        if np.max(np.abs(V - old_V)) < theta:
            break
    return policy, V


env = FrozenLakeEnv()
policy_tpi, V_tpi = truncated_policy_iteration(env, max_it=2)

# print the optimal policy
print("\nOptimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):")
print(policy_tpi, "\n")

# plot the optimal state-value function
plot_values(V_tpi)
def get_epsilon_greedy_probs(Q, state, epsilon):
    """
    @param epsilon: exploration probability
    returns an array of probabilities for each action
    """
    possible_actions_count = len(Q[state])
    policy_s = np.ones(possible_actions_count) * epsilon / possible_actions_count
    # look up the best action according to the Q-table
    best_a = np.argmax(Q[state])
    policy_s[best_a] = 1 - epsilon + (epsilon / possible_actions_count)
    return policy_s


# obtain the estimated optimal policy and corresponding action-value function
Q_sarsa = sarsa(env, 5000, .01)
# print the estimated optimal policy
policy_sarsa = np.array([np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1
                         for key in np.arange(48)]).reshape(4, 12)
check_test.run_check('td_control_check', policy_sarsa)
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(policy_sarsa)
# plot the estimated optimal state-value function
V_sarsa = [np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)]
plot_values(V_sarsa)
def policy_evaluation(env, policy, gamma=1, theta=1e-8):
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            Vs = 0
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in env.P[s][a]:
                    Vs += action_prob * prob * (reward + gamma * V[next_state])
            delta = max(delta, np.abs(V[s] - Vs))
            V[s] = Vs
        if delta < theta:
            break
    return V


random_policy = np.ones([env.nS, env.nA]) / env.nA
# evaluate the policy
V = policy_evaluation(env, random_policy)
plot_values(V)

check_test.run_check('policy_evaluation_check', policy_evaluation)


def q_from_v(env, V, s, gamma=1):
    q = np.zeros(env.nA)
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[s][a]:
            q[a] += prob * (reward + gamma * V[next_state])
    return q


Q = np.zeros([env.nS, env.nA])
for s in range(env.nS):
    Q[s] = q_from_v(env, V, s)
print("Action-Value Function:")
print(Q)
    # at this point the returned policy approximates the optimal policy
    return policy, V


# truncated policy iteration
def truncated_policy_iteration(env, max_num, gamma=1, theta=1e-8):
    # the initial policy is the uniform random policy
    # policy = np.ones([env.nS, env.nA]) / env.nA
    V = np.zeros(env.nS)
    # stop the evaluation step after a fixed number of sweeps
    while True:
        policy = policy_improvement(env, V, gamma)
        V_old = copy.copy(V)
        V = truncated_policy_evaluation(env, policy, V, max_num)
        if np.max(np.abs(V_old - V)) < theta:
            break
    # at this point the returned policy approximates the optimal policy
    return policy, V


env = FrozenLakeEnv()
policy_pi, V_pi = truncated_policy_iteration(env, 100)

# print the optimal policy
# print("\nOptimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):")
# print(policy_pi, "\n")
# plot_values(V_pi)
import check_test

env = gym.make('CliffWalking-v0')
print(env.action_space)
print(env.observation_space)

if False:
    # define the optimal state-value function
    V_opt = np.zeros((4, 12))
    V_opt[0] = -np.arange(3, 15)[::-1]
    V_opt[1] = -np.arange(3, 15)[::-1] + 1
    V_opt[2] = -np.arange(3, 15)[::-1] + 2
    V_opt[3][0] = -13
    plot_values(V_opt)

policy = np.hstack([1 * np.ones(11), 2, 0, np.zeros(10), 2, 0,
                    np.zeros(10), 2, 0, -1 * np.ones(11)])
print("\nPolicy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(policy.reshape(4, 12))

V_true = np.zeros((4, 12))
for i in range(3):
    V_true[i] = -np.arange(3, 15)[::-1] - i
V_true[1][11] = -2
V_true[2][11] = -1
V_true[3][0] = -17
                tem_scores.append(score)
                break
        if i_episode % plot_every == 0:
            avg_scores.append(np.mean(tem_scores))
    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(avg_scores), endpoint=False),
             np.asarray(avg_scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(avg_scores))
    return Q


Q_learn = Q_learning(env, 5000, .01)
sarsamax_policy = np.array([np.argmax(Q_learn[key]) if key in Q_learn else -1
                            for key in np.arange(48)]).reshape(4, 12)
check_test.run_check('td_control_check', sarsamax_policy)
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(sarsamax_policy)
V_sarsamax = [np.max(Q_learn[key]) if key in Q_learn else 0 for key in np.arange(48)]
plot_values(V_sarsamax)
def q_learning(env, num_episodes, alpha, gamma=1.0):
    # initialize action-value function (empty dictionary of arrays)
    Q = defaultdict(lambda: np.zeros(env.nA))
    # initialize performance monitor
    plot_every = 100
    tmp_scores = deque(maxlen=plot_every)
    scores = deque(maxlen=num_episodes)
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        # initialize score
        score = 0
        # begin an episode, observe S
        state = env.reset()
        while True:
            # get epsilon-greedy action probabilities
            policy_s = epsilon_greedy_probs(env, Q[state], i_episode)
            # pick next action A
            action = np.random.choice(np.arange(env.nA), p=policy_s)
            # take action A, observe R, S'
            next_state, reward, done, info = env.step(action)
            # add reward to score
            score += reward
            # update Q (Sarsamax uses the greedy value of the next state)
            Q[state][action] = update_Q(Q[state][action], np.max(Q[next_state]),
                                        reward, alpha, gamma)
            # S <- S'
            state = next_state
            # until S is terminal
            if done:
                # append score
                tmp_scores.append(score)
                break
        if i_episode % plot_every == 0:
            scores.append(np.mean(tmp_scores))
    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(scores), endpoint=False), np.asarray(scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(scores))
    return Q


# obtain the estimated optimal policy and corresponding action-value function
Q_sarsamax = q_learning(env, 5000, .01)
# print the estimated optimal policy
policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1
                            for key in np.arange(48)]).reshape((4, 12))
check_test.run_check('td_control_check', policy_sarsamax)
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(policy_sarsamax)
# plot the estimated optimal state-value function
plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)])


# Part 3: TD Control: Expected Sarsa
#
# Input: policy π, positive integer num_episodes, small positive fraction α, GLIE {εi}
# Output: value function Q (≈ qπ if num_episodes is large enough)
#
# Initialize Q arbitrarily (e.g., Q(s, a) = 0 for all s ∈ S and a ∈ A(s),
# and Q(terminal-state, ·) = 0)
# for i ← 1 to num_episodes do
#     ε ← εi
#     Observe S0
#     t ← 0
#     repeat
#         Choose action At using policy derived from Q (e.g., ε-greedy)
#         Take action At and observe Rt+1, St+1
#         Q(St, At) ← Q(St, At) + α (Rt+1 + γ Σa π(a|St+1) Q(St+1, a) − Q(St, At))
#         t ← t + 1
#     until St is terminal
# end
# return Q
def expected_sarsa(env, num_episodes, alpha, gamma=1.0):
    # initialize action-value function (empty dictionary of arrays)
    Q = defaultdict(lambda: np.zeros(env.nA))
    # initialize performance monitor
    plot_every = 100
    tmp_scores = deque(maxlen=plot_every)
    scores = deque(maxlen=num_episodes)
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        # initialize score
        score = 0
        # begin an episode
        state = env.reset()
        # get epsilon-greedy action probabilities
        policy_s = epsilon_greedy_probs(env, Q[state], i_episode, 0.005)
        while True:
            # pick next action
            action = np.random.choice(np.arange(env.nA), p=policy_s)
            # take action A, observe R, S'
            next_state, reward, done, info = env.step(action)
            # add reward to score
            score += reward
            # get epsilon-greedy action probabilities (for S')
            policy_s = epsilon_greedy_probs(env, Q[next_state], i_episode, 0.005)
            # update Q (Expected Sarsa uses the expected value of the next state)
            Q[state][action] = update_Q(Q[state][action], np.dot(Q[next_state], policy_s),
                                        reward, alpha, gamma)
            # S <- S'
            state = next_state
            # until S is terminal
            if done:
                # append score
                tmp_scores.append(score)
                break
        if i_episode % plot_every == 0:
            scores.append(np.mean(tmp_scores))
    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(scores), endpoint=False), np.asarray(scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(scores))
    return Q
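# q_learning() and expected_sarsa() above rely on `update_Q` and
# `epsilon_greedy_probs` helpers that are not defined in this snippet. Minimal
# sketches, with signatures inferred from the call sites (treat both as assumptions):
def update_Q(Qsa, Qsa_next, reward, alpha, gamma):
    # one-step TD update toward the target R + gamma * Qsa_next
    return Qsa + alpha * (reward + gamma * Qsa_next - Qsa)


def epsilon_greedy_probs(env, Q_s, i_episode, eps=None):
    # epsilon-greedy action probabilities; epsilon decays as 1 / i_episode
    # unless a fixed eps is supplied
    epsilon = 1.0 / i_episode if eps is None else eps
    policy_s = np.ones(env.nA) * epsilon / env.nA
    policy_s[np.argmax(Q_s)] = 1 - epsilon + (epsilon / env.nA)
    return policy_s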
        policy[s] = np.sum([np.eye(env.nA)[i] for i in best_a], axis=0) / len(best_a)
    return policy


def value_iteration(env, gamma=1, theta=1e-8):
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            v = V[s]
            V[s] = max(q_from_v(env, V, s, gamma))
            delta = max(delta, abs(V[s] - v))
        if delta < theta:
            break
    policy = policy_improvement(env, V, gamma)
    return policy, V


env = FrozenLakeEnv()
policy_vi, V_vi = value_iteration(env)

# print the optimal policy
print("\nOptimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):")
print(policy_vi, "\n")

# plot the optimal state-value function
plot_values(V_vi)
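# Only the final line of policy_improvement survives in the fragment above. A
# minimal reconstruction consistent with that line and with how the function is
# called here; treat the body as an assumption, not the original implementation:
def policy_improvement(env, V, gamma=1):
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        q = q_from_v(env, V, s, gamma)
        # spread probability equally over all maximizing actions
        best_a = np.argwhere(q == np.max(q)).flatten()
        policy[s] = np.sum([np.eye(env.nA)[i] for i in best_a], axis=0) / len(best_a)
    return policy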
    ## TODO: complete the function

    return Q


Q_sarsa = sarsa(env, 2000, .01)

# print the estimated optimal policy
policy_sarsa = np.array([np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1
                         for key in np.arange(48)]).reshape(4, 12)
# check_test.run_check('td_control_check', policy_sarsa)
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(policy_sarsa)

# plot the estimated optimal state-value function
V_sarsa = [np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)]
plot_values(V_sarsa)


# ---------------- Q-Learning ---------------- #
def generate_episode_and_update_Q(env, Q, epsilon, nA, gamma, alpha):
    """ generates an episode by following the epsilon-greedy policy and updates Q """
    episode = []
    state = env.reset()
    action = np.random.choice(np.arange(nA), p=get_probs(Q[state], epsilon, nA)) \
        if state in Q else env.action_space.sample()
    score = 0
    while True:
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        score = 0
        state = env.reset()
        while True:
            policy_s = policy[state]
            action = np.random.choice(np.arange(env.nA), p=policy_s)
            next_state, reward, done, info = env.step(action)
            score += reward
            # TD(0) update of the state-value estimate
            V[state] = update_V(V[state], V[next_state], reward, alpha, gamma)
            state = next_state
            if done:
                tmp_scores.append(score)
                break
        if i_episode % plot_every == 0:
            scores.append(np.mean(tmp_scores))
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(scores))
    return V


random_policy = np.ones([env.nS, env.nA]) / env.nA

from plot_utils import plot_values

# evaluate the policy
V = policy_evaluation(env, random_policy)
plot_values(V)
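# The prediction loop above calls an `update_V` helper that is not defined in this
# snippet. A minimal one-step TD(0) sketch, with the signature inferred from the
# call above (treat it as an assumption):
def update_V(Vs, Vs_next, reward, alpha, gamma):
    # move V(S) toward the TD target R + gamma * V(S')
    return Vs + alpha * (reward + gamma * Vs_next - Vs)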