def sarsa(env, num_episodes, alpha, gamma=1.0):
    # initialize action-value function (empty dictionary of arrays)
    Q = defaultdict(lambda: np.zeros(env.nA))
    # initialize performance monitor
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
            # plot the estimated optimal state-value function
            V_sarsa = ([
                np.max(Q[key]) if key in Q else 0 for key in np.arange(48)
            ])
            plot_values(V_sarsa)

        s0 = env.reset()
        """
        Q[s0, a0] = (1-alpha) * Q[s0, a0] + alpha * (r + gamma * Q[s1,a1])
        """
        a0 = eps_greedy_act(Q[s0], env, 1.0 / i_episode)

        for i in range(1000):
            s1, r, done, info = env.step(a0)
            if not done:
                a1 = eps_greedy_act(Q[s1], env, 1.0 / i_episode)
                Q[s0][a0] = (1 - alpha) * Q[s0][a0] + alpha * (
                    r + gamma * Q[s1][a1])
            else:
                Q[s0][a0] = (1 - alpha) * Q[s0][a0] + alpha * r
                break
            a0 = a1
            s0 = s1

    return Q
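
# The sarsa function above relies on an eps_greedy_act helper that is not shown in this
# snippet. The sketch below is a minimal stand-in, assuming the helper returns a single
# epsilon-greedy action index for one state's action values (the name and argument order
# follow the call sites above; the original helper may differ):
def eps_greedy_act(Q_s, env, eps):
    # explore with probability eps, otherwise exploit the current greedy action
    if np.random.random() < eps:
        return env.action_space.sample()
    return int(np.argmax(Q_s))
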
def plot_optimal_policy():
    # define the optimal state-value function
    V_opt = np.zeros((4, 12))
    V_opt[0] = -np.arange(3, 15)[::-1]
    V_opt[1] = -np.arange(3, 15)[::-1] + 1
    V_opt[2] = -np.arange(3, 15)[::-1] + 2
    V_opt[3][0] = -13

    plot_values(V_opt)
Example #3
def exploitQ(Q):
    
    # print the estimated optimal policy
    derived_policy = np.array([np.argmax(Q[key]) if key in Q else -1 for key in np.arange(48)]).reshape(4,12)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(derived_policy)

    # plot the estimated optimal state-value function
    V_sarsa = ([np.max(Q[key]) if key in Q else 0 for key in np.arange(48)])
    plot_values(V_sarsa)
def test_expected_sarsa(env):
    # obtain the estimated optimal policy and corresponding action-value function
    Q_expsarsa = TD.expected_sarsa(env, 500, .2)
    # print the estimated optimal policy
    policy_expsarsa = np.array([np.argmax(Q_expsarsa[key]) if key in Q_expsarsa else -1 for key in np.arange(48)]).reshape(4,12)
    check_test.run_check('td_control_check', policy_expsarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_expsarsa)
    # plot the estimated optimal state-value function
    plot_values([np.max(Q_expsarsa[key]) if key in Q_expsarsa else 0 for key in np.arange(48)])

def test_q_learning(env):
    # visualize the estimated optimal policy and the corresponding state-value function
    # obtain the estimated optimal policy and corresponding action-value function
    Q_sarsamax = TD.q_learning(env, 500, .2)

    # print the estimated optimal policy
    policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1 for key in np.arange(48)]).reshape((4,12))
    check_test.run_check('td_control_check', policy_sarsamax)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_sarsamax)
    # plot the estimated optimal state-value function
    plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)])
def test_sarsa():
    # obtain the estimated optimal policy and corresponding action-value function
    # eps=0.1 yields the safe path; eps=0.01 yields the optimal path
    Q_sarsa = sarsa(env, 5000, .01, eps=0.01)

    # print the estimated optimal policy
    policy_sarsa = np.array([np.argmax(
        Q_sarsa[key]) if key in Q_sarsa else -1 for key in np.arange(48)]).reshape(4, 12)
    check_test.run_check('td_control_check', policy_sarsa)
    print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy_sarsa)

    # plot the estimated optimal state-value function
    V_sarsa = (
        [np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)])
    plot_values(V_sarsa)
def evaluate_sarsa():
    env = gym.make("CliffWalking-v0")
    # obtain the estimated optimal policy and corresponding action-value function
    Q_sarsa = sarsa(env, 5000, 0.01)

    # print the estimated optimal policy
    policy_sarsa = np.array(
        [np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1 for key in np.arange(48)]
    ).reshape(4, 12)
    check_test.run_check("td_control_check", policy_sarsa)
    print(
        "\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):"
    )
    print(policy_sarsa)

    # plot the estimated optimal state-value function
    V_sarsa = [np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)]
    plot_values(V_sarsa)
Example #8
def prediction():
    policy = np.hstack([1*np.ones(11), 2, 0, np.zeros(10), 2, 0, np.zeros(10), 2, 0, -1*np.ones(11)])
    print("\nPolicy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy.reshape(4,12))
    
    V_true = np.zeros((4,12))
    for i in range(3):
        V_true[i] = -np.arange(3, 15)[::-1] - i
    V_true[1][11] = -2
    V_true[2][11] = -1
    V_true[3][0] = -17

    plot_values(V_true)
    
    V_pred = td_prediction(env, 5000, policy, .01)

    V_pred_plot = np.reshape([V_pred[key] if key in V_pred else 0 for key in np.arange(48)], (4,12)) 
    plot_values(V_pred_plot)
Example #9
def truncated_policy_evaluation(env, policy, V, max_it=1, gamma=1):
    num_it = 0
    # sweep through the state space at most max_it times
    while num_it < max_it:
        for s in range(env.nS):
            v = 0
            q = q_from_v(env, V, s, gamma)
            for a, action_prob in enumerate(policy[s]):
                v += action_prob * q[a]
            V[s] = v
        num_it += 1
    return V


def truncated_policy_iteration(env, max_it=1, gamma=1, theta=1e-8):
    V = np.zeros(env.nS)
    policy = np.ones([env.nS, env.nA]) / env.nA
    while True:
        policy = policy_improvement(env, V)
        old_V = copy.copy(V)
        V = truncated_policy_evaluation(env, policy, V, max_it, gamma)
        if max(abs(V - old_V)) < theta:
            break
    return policy, V


env = FrozenLakeEnv()

policy_tpi, V_tpi = truncated_policy_iteration(env, max_it=2)

# print the optimal policy
print("\nOptimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):")
print(policy_tpi, "\n")

# plot the optimal state-value function
plot_values(V_tpi)
    @param epsilon: exploration rate (probability mass spread over non-greedy actions)
    returns an array of probabilities for each action
    """
    possible_actions_count = len(Q[state])
    # start from the uniform exploration probability for every action
    policy_s = np.ones(possible_actions_count) * epsilon / possible_actions_count
    # give the greedy action (according to the Q-table) the remaining probability mass
    best_a = int(np.argmax(Q[state]))
    policy_s[best_a] = 1 - epsilon + (epsilon / possible_actions_count)
    return policy_s
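
# Usage sketch for the probability helper above. Its enclosing function header was cut
# off in this snippet, so `epsilon_greedy_probs_for_state` below is a hypothetical name
# used only for illustration:
#   probs = epsilon_greedy_probs_for_state(Q, state, epsilon=0.1)  # length-nA array
#   action = np.random.choice(np.arange(len(probs)), p=probs)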


# obtain the estimated optimal policy and corresponding action-value function
Q_sarsa = sarsa(env, 5000, .01)

# print the estimated optimal policy
policy_sarsa = np.array([
    np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1 for key in np.arange(48)
]).reshape(4, 12)
check_test.run_check('td_control_check', policy_sarsa)
print(
    "\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):"
)
print(policy_sarsa)

# plot the estimated optimal state-value function
V_sarsa = ([
    np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)
])
plot_values(V_sarsa)
Example #11
            Vs = 0
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in env.P[s][a]:
                    Vs += action_prob * prob * (reward + gamma * V[next_state])
            delta = max(delta, np.abs(V[s] - Vs))
            V[s] = Vs
        if delta < theta:
            break
    return V


random_policy = np.ones([env.nS, env.nA]) / env.nA
# evaluate the policy
V = policy_evaluation(env, random_policy)

plot_values(V)
check_test.run_check('policy_evaluation_check', policy_evaluation)


def q_from_v(env, V, s, gamma=1):
    q = np.zeros(env.nA)
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[s][a]:
            q[a] += prob * (reward + gamma * V[next_state])
    return q


Q = np.zeros([env.nS, env.nA])
for s in range(env.nS):
    Q[s] = q_from_v(env, V, s)
print("Action-Value Function:")
Example #12
    # the policy returned here approximates the optimal policy
    return policy, V


# truncated policy iteration
def truncated_policy_iteration(env, max_num, gamma=1, theta=1e-8):
    # the initial policy is the equiprobable random policy
    # policy = np.ones([env.nS, env.nA]) / env.nA
    V = np.zeros(env.nS)

    # each evaluation step is truncated after a fixed number of sweeps
    while True:
        policy = policy_improvement(env, V, gamma)
        V_old = copy.copy(V)
        V = truncated_policy_evaluation(env, policy, V, max_num, gamma)
        if (np.max(np.abs(V_old - V))) < theta:
            break

    # the policy returned here approximates the optimal policy
    return policy, V


env = FrozenLakeEnv()
policy_pi, V_pi = truncated_policy_iteration(env, 100)
#
# # print the optimal policy
# print("\nOptimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):")
# print(policy_pi,"\n")
#
plot_values(V_pi)
Example #13
import check_test

env = gym.make('CliffWalking-v0')
print(env.action_space)
print(env.observation_space)

if False:

    # define the optimal state-value function
    V_opt = np.zeros((4, 12))
    V_opt[0] = -np.arange(3, 15)[::-1]
    V_opt[1] = -np.arange(3, 15)[::-1] + 1
    V_opt[2] = -np.arange(3, 15)[::-1] + 2
    V_opt[3][0] = -13

    plot_values(V_opt)

    policy = np.hstack([
        1 * np.ones(11), 2, 0,
        np.zeros(10), 2, 0,
        np.zeros(10), 2, 0, -1 * np.ones(11)
    ])
    print("\nPolicy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
    print(policy.reshape(4, 12))

    V_true = np.zeros((4, 12))
    for i in range(3):
        V_true[i] = -np.arange(3, 15)[::-1] - i
    V_true[1][11] = -2
    V_true[2][11] = -1
    V_true[3][0] = -17
                tem_scores.append(score)
                break
        if (i_episode % plot_every == 0):
            avg_scores.append(np.mean(tem_scores))
    plt.plot(np.linspace(0, num_episodes, len(avg_scores), endpoint=False),
             np.asarray(avg_scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every),
          np.max(avg_scores))

    return Q


Q_learn = Q_learning(env, 5000, .01)
sarsamax_policy = np.array([
    np.argmax(Q_learn[key]) if key in Q_learn else -1 for key in np.arange(48)
]).reshape(4, 12)
check_test.run_check('td_control_check', sarsamax_policy)
print(
    "\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):"
)
print(sarsamax_policy)

V_sarsamax = ([
    np.max(Q_learn[key]) if key in Q_learn else 0 for key in np.arange(48)
])
plot_values(V_sarsamax)
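
# The q_learning and expected_sarsa functions below call two helpers, update_Q and
# epsilon_greedy_probs, that are not included in this snippet. A minimal sketch of what
# they might look like, consistent with the call sites below (the eps floor argument and
# the 1/i_episode schedule are assumptions, not necessarily the original author's code):
def update_Q(Qsa, Qsa_next, reward, alpha, gamma):
    # one-step TD update toward the target r + gamma * Qsa_next
    return Qsa + alpha * (reward + gamma * Qsa_next - Qsa)

def epsilon_greedy_probs(env, Q_s, i_episode, eps=None):
    # epsilon decays as 1/i_episode unless a fixed floor value is supplied
    epsilon = 1.0 / i_episode if eps is None else eps
    policy_s = np.ones(env.nA) * epsilon / env.nA
    policy_s[np.argmax(Q_s)] = 1 - epsilon + (epsilon / env.nA)
    return policy_s
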
def q_learning(env, num_episodes, alpha, gamma=1.0):
    # initialize action-value function (empty dictionary of arrays)
    Q = defaultdict(lambda: np.zeros(env.nA))
    # initialize performance monitor
    plot_every = 100
    tmp_scores = deque(maxlen=plot_every)
    scores = deque(maxlen=num_episodes)
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        # initialize score
        score = 0
        # begin an episode, observe S
        state = env.reset()
        while True:
            # get epsilon-greedy action probabilities
            policy_s = epsilon_greedy_probs(env, Q[state], i_episode)
            # pick next action A
            action = np.random.choice(np.arange(env.nA), p=policy_s)
            # take action A, observe R, S'
            next_state, reward, done, info = env.step(action)
            # add reward to score
            score += reward
            # update Q with the greedy (max) value of the next state
            Q[state][action] = update_Q(Q[state][action], np.max(Q[next_state]),
                                        reward, alpha, gamma)
            # S <- S'
            state = next_state
            # until S is terminal
            if done:
                # append score
                tmp_scores.append(score)
                break
        if i_episode % plot_every == 0:
            scores.append(np.mean(tmp_scores))
    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(scores), endpoint=False), np.asarray(scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(scores))
    return Q


# obtain the estimated optimal policy and corresponding action-value function
Q_sarsamax = q_learning(env, 5000, .01)

# print the estimated optimal policy
policy_sarsamax = np.array([np.argmax(Q_sarsamax[key]) if key in Q_sarsamax else -1 for key in np.arange(48)]).reshape((4,12))
check_test.run_check('td_control_check', policy_sarsamax)
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(policy_sarsamax)

# plot the estimated optimal state-value function
plot_values([np.max(Q_sarsamax[key]) if key in Q_sarsamax else 0 for key in np.arange(48)])

# Part 3: TD Control: Expected Sarsa

# Input: policy π, positive integer num_episodes, small positive fraction α, GLIE {ε_i}
# Output: value function Q (≈ q_π if num_episodes is large enough)
# Initialize Q arbitrarily (e.g., Q(s, a) = 0 for all s ∈ S and a ∈ A(s), and Q(terminal-state, ·) = 0)
# for i ← 1 to num_episodes do
#     ε ← ε_i
#     Observe S_0
#     t ← 0
#     repeat
#         Choose action A_t using a policy derived from Q (e.g., ε-greedy)
#         Take action A_t and observe R_{t+1}, S_{t+1}
#         Q(S_t, A_t) ← Q(S_t, A_t) + α(R_{t+1} + γ Σ_a π(a|S_{t+1}) Q(S_{t+1}, a) − Q(S_t, A_t))
#         t ← t + 1
#     until S_t is terminal
# end
# return Q

def expected_sarsa(env, num_episodes, alpha, gamma=1.0):
    # initialize action-value function (empty dictionary of arrays)
    Q = defaultdict(lambda: np.zeros(env.nA))
    # initialize performance monitor
    plot_every = 100
    tmp_scores = deque(maxlen=plot_every)
    scores = deque(maxlen=num_episodes)
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        # initialize score
        score = 0
        # begin an episode
        state = env.reset()
        # get epsilon-greedy action probabilities
        policy_s = epsilon_greedy_probs(env, Q[state], i_episode, 0.005)
        while True:
            # pick next action
            action = np.random.choice(np.arange(env.nA), p=policy_s)
            # take action A, observe R, S'
            next_state, reward, done, info = env.step(action)
            # add reward to score
            score += reward
            # get epsilon-greedy action probabilities (for S')
            policy_s = epsilon_greedy_probs(env, Q[next_state], i_episode, 0.005)
            # update Q with the expected value of the next state under the epsilon-greedy policy
            Q[state][action] = update_Q(Q[state][action], np.dot(Q[next_state], policy_s),
                                        reward, alpha, gamma)
            # S <- S'
            state = next_state
            # until S is terminal
            if done:
                # append score
                tmp_scores.append(score)
                break
        if i_episode % plot_every == 0:
            scores.append(np.mean(tmp_scores))
    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(scores), endpoint=False), np.asarray(scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best 100-episode performance
    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(scores))
    return Q
Example #16
def policy_improvement(env, V, gamma=1):
    policy = np.zeros([env.nS, env.nA]) / env.nA
    for s in range(env.nS):
        q = q_from_v(env, V, s, gamma)
        best_a = np.argwhere(q == np.max(q)).flatten()
        policy[s] = np.sum([np.eye(env.nA)[i]
                            for i in best_a], axis=0) / len(best_a)
    return policy


def value_iteration(env, gamma=1, theta=1e-8):
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            v = V[s]
            V[s] = max(q_from_v(env, V, s, gamma))
            delta = max(delta, abs(V[s] - v))
        if delta < theta:
            break
    policy = policy_improvement(env, V, gamma)
    return policy, V


env = FrozenLakeEnv()

policy_vi, V_vi = value_iteration(env)

# print the optimal policy
print("\nOptimal Policy (LEFT = 0, DOWN = 1, RIGHT = 2, UP = 3):")
print(policy_vi, "\n")

# plot the optimal state-value function
plot_values(V_vi)
        ## TODO: complete the function

    return Q

Q_sarsa = sarsa(env, 2000, .01)

# print the estimated optimal policy
policy_sarsa = np.array([np.argmax(Q_sarsa[key]) if key in Q_sarsa else -1 for key in np.arange(48)]).reshape(4,12)
#check_test.run_check('td_control_check', policy_sarsa)
print("\nEstimated Optimal Policy (UP = 0, RIGHT = 1, DOWN = 2, LEFT = 3, N/A = -1):")
print(policy_sarsa)

# plot the estimated optimal state-value function
V_sarsa = ([np.max(Q_sarsa[key]) if key in Q_sarsa else 0 for key in np.arange(48)])
plot_values(V_sarsa)




#----------------Q Learning -------------------#


def generate_episode_and_update_Q(env, Q, epsilon, nA, gamma, alpha):
    """ generates an episode from following the epsilon-greedy policy """
    episode = []
    state = env.reset()
    action = np.random.choice(np.arange(nA), p=get_probs(Q[state], epsilon, nA)) \
                                    if state in Q else env.action_space.sample()
    score=0
    while True:
Example #18
def td_prediction(env, num_episodes, policy, alpha, gamma=1.0):
    # initialize the state-value estimate and performance monitor
    V = defaultdict(float)
    plot_every = 100
    tmp_scores = deque(maxlen=plot_every)
    scores = deque(maxlen=num_episodes)
    # loop over episodes
    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        score = 0
        state = env.reset()
        while True:
            policy_s = policy[state]
            action = np.random.choice(np.arange(env.nA), p=policy_s)
            next_state, reward, done, info = env.step(action)
            score += reward
            V[state] = update_V(V[state], np.max(V[next_state]), reward, alpha, gamma)
            state = next_state
            if done:
                tmp_scores.append(score)
                break
        if (i_episode % plot_every == 0):
            scores.append(np.mean(tmp_scores))

    print(('Best Average Reward over %d Episodes: ' % plot_every), np.max(scores))
    return V

random_policy = np.ones([env.nS, env.nA]) / env.nA

from plot_utils import plot_values

# evaluate the policy
V = policy_evaluation(env, random_policy)

plot_values(V)