def value_iteration(state_count, gamma, theta, get_available_actions, get_transitions):
    """
    This function computes the optimal value function and policy for the specified MDP,
    using the Value Iteration algorithm.

    'state_count' is the total number of states in the MDP. States are represented as
        0-relative numbers.

    'gamma' is the MDP discount factor for rewards.

    'theta' is the small number threshold to signal convergence of the value function
        (see Iterative Policy Evaluation algorithm).

    'get_available_actions' returns a list of the MDP available actions for the
        specified state parameter.

    'get_transitions' is the MDP state/reward transition function. It accepts two
        parameters, state and action, and returns a list of tuples, where each tuple
        is of the form: (next_state, reward, probability).
    """
    # init all state value estimates to 0
    V = state_count * [0]

    # init the policy with the first available action for each state
    pi = state_count * [0]
    for s in range(state_count):
        avail_actions = get_available_actions(s)
        pi[s] = avail_actions[0]

    while True:
        delta = 0
        V_prev = list(V)

        for state in range(state_count):
            # back up each state with the value of its best (greedy) action
            V_best = float("-inf")
            actions = get_available_actions(state)

            for action in actions:
                transitions = get_transitions(state=state, action=action)
                V_sa = 0
                for trans in transitions:
                    next_state, reward, probability = trans  # unpack tuple
                    V_sa += probability * (reward + gamma * V_prev[next_state])
                if V_sa > V_best:
                    pi[state] = action
                    V_best = V_sa

            V[state] = V_best
            delta = max(delta, abs(V[state] - V_prev[state]))

        if delta < theta:
            break

    return (V, pi)  # return both the final value function and the final policy
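# A minimal calling sketch for the function above, assuming the gridworld helpers used
# elsewhere in this notebook (gw.get_state_count, gw.get_available_actions,
# gw.get_transitions); the gamma and theta values here are illustrative, not prescribed.
import gridworld_mdp as gw

n_states = gw.get_state_count()
values, policy = value_iteration(
    state_count=n_states,
    gamma=0.9,       # example discount factor
    theta=0.001,     # example convergence threshold
    get_available_actions=gw.get_available_actions,
    get_transitions=gw.get_transitions,
)
print("Values=", values)
print("Policy=", policy)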
def examine_transitions(state):
    # print every transition returned for each available action in 'state'
    # (only 1 transition is returned per call for this MDP)
    actions = gw.get_available_actions(state)
    for action in actions:
        transitions = gw.get_transitions(state=state, action=action)
        for trans in transitions:
            next_state, reward, probability = trans  # unpack tuple
            print(
                "transition(" + str(state) + ", " + action + "):",
                "next_state=", next_state,
                ", reward=", reward,
                ", probability=", probability,
            )
def policy_eval_two_arrays(state_count, gamma, theta, get_policy, get_transitions):
    """
    This function uses the two-array approach to evaluate the specified policy for the
    specified MDP:

    'state_count' is the total number of states in the MDP. States are represented as
        0-relative numbers.

    'gamma' is the MDP discount factor for rewards.

    'theta' is the small number threshold to signal convergence of the value function
        (see Iterative Policy Evaluation algorithm).

    'get_policy' is the stochastic policy function - it takes a state parameter and
        returns a list of tuples, where each tuple is of the form: (action, probability).
        It represents the policy being evaluated.

    'get_transitions' is the state/reward transition function. It accepts two parameters,
        state and action, and returns a list of tuples, where each tuple is of the form:
        (next_state, reward, probability).
    """
    V = state_count * [0]

    # evaluate the policy using the 2 array approach
    while True:
        delta = 0
        V_prev = list(V)
        V = state_count * [0]

        for state in range(state_count):
            # expected value of the state under the policy, using last sweep's estimates
            for action, action_prob in get_policy(state):
                transitions = get_transitions(state=state, action=action)
                for trans in transitions:
                    next_state, reward, probability = trans  # unpack tuple
                    V[state] += action_prob * probability * (reward + gamma * V_prev[next_state])
            delta = max(delta, abs(V[state] - V_prev[state]))

        if delta < theta:
            break

    return V
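# A minimal calling sketch for policy_eval_two_arrays, assuming the gridworld helpers used
# elsewhere in this notebook. The uniform_random_policy helper and the gamma/theta values
# below are illustrative assumptions, not part of the gridworld module.
import gridworld_mdp as gw

def uniform_random_policy(state):
    # illustrative policy: each available action is chosen with equal probability
    actions = gw.get_available_actions(state)
    return [(action, 1.0 / len(actions)) for action in actions]

V = policy_eval_two_arrays(
    state_count=gw.get_state_count(),
    gamma=0.9,       # example discount factor
    theta=0.001,     # example convergence threshold
    get_policy=uniform_random_policy,
    get_transitions=gw.get_transitions,
)
print("Values=", V)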
def policy_eval_in_place(state_count, gamma, theta, get_policy, get_transitions):
    """
    This function uses the in-place approach to evaluate the specified policy: a single
    value array is updated as each state is swept, so later states in the same sweep
    already see the newer estimates. Parameters are the same as for
    policy_eval_two_arrays().
    """
    V = state_count * [0]

    while True:
        delta = 0

        for state in range(state_count):
            v = V[state]  # remember the old estimate to measure the change
            v_new = 0
            for action, action_prob in get_policy(state):
                transitions = get_transitions(state=state, action=action)
                for trans in transitions:
                    next_state, reward, probability = trans  # unpack tuple
                    v_new += action_prob * probability * (reward + gamma * V[next_state])
            V[state] = v_new
            delta = max(delta, abs(v - V[state]))

        if delta < theta:
            break

    return V
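# A short comparison sketch, assuming the gridworld helpers used elsewhere in this notebook.
# The equal_prob_policy helper and the parameter values are illustrative assumptions. Both
# evaluators should converge to (approximately) the same value function; the in-place
# version typically needs fewer sweeps because it reuses fresh estimates within a sweep.
import gridworld_mdp as gw

def equal_prob_policy(state):
    # illustrative policy: each available action with equal probability
    actions = gw.get_available_actions(state)
    return [(a, 1.0 / len(actions)) for a in actions]

n = gw.get_state_count()
V_two = policy_eval_two_arrays(n, 0.9, 0.001, equal_prob_policy, gw.get_transitions)
V_in = policy_eval_in_place(n, 0.9, 0.001, equal_prob_policy, gw.get_transitions)

for s in range(n):
    print("state", s, ": two-array =", round(V_two[s], 3), ", in-place =", round(V_in[s], 3))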
# Policy Evaluation calculates the value function for a policy, given the policy and the
# full definition of the associated Markov Decision Process. The full definition of an MDP
# is the set of states, the set of available actions for each state, the set of rewards,
# the discount factor, and the state/reward transition function.

import test_dp               # required for testing and grading your code
import gridworld_mdp as gw   # defines the MDP for a 4x4 gridworld

# The gridworld MDP defines the probability of state transitions for our 4x4 gridworld
# using a "get_transitions()" function. Let's try it out now, with state=2 and all
# defined actions.

# try out the gw.get_transitions(state, action) function
state = 2
actions = gw.get_available_actions(state)

for action in actions:
    transitions = gw.get_transitions(state=state, action=action)

    # examine each returned transition (only 1 per call for this MDP)
    for trans in transitions:
        next_state, reward, probability = trans  # unpack tuple
        print(
            "transition(" + str(state) + ", " + action + "):",
            "next_state=", next_state,
            ", reward=", reward,
            ", probability=", probability,
        )

# Implement the algorithm for Iterative Policy Evaluation using the 2 array approach.
# In the 2 array approach, one array holds the value estimates for each state computed
# on the previous iteration, and one array holds the value estimates for the states
# being computed in the current iteration.
#
# An empty function policy_eval_two_arrays is provided; implement the body of the
# function to correctly calculate the value of the policy using the 2 array approach.
# The function defines 5 parameters - a definition of each parameter is given in the
# comment block for the function. For sample parameter values, see the calling code in
# the cell following the function.
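# To make the two-array idea concrete, here is a tiny self-contained sketch on a made-up
# 2-state MDP (not the gridworld): one sweep reads only V_prev and writes only V_new.
gamma = 0.9
# transitions[state] -> list of (action_prob, next_state, reward, probability)
toy_transitions = {
    0: [(1.0, 1, -1, 1.0)],   # from state 0, the single action leads to state 1 with reward -1
    1: [(1.0, 1, 0, 1.0)],    # state 1 is absorbing with reward 0
}

V_prev = [0.0, 0.0]   # estimates from the previous sweep (read only)
V_new = [0.0, 0.0]    # estimates for the current sweep (write only)

for s in (0, 1):
    for action_prob, next_state, reward, prob in toy_transitions[s]:
        V_new[s] += action_prob * prob * (reward + gamma * V_prev[next_state])

print(V_new)   # [-1.0, 0.0] after one sweep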
import test_dp               # required for testing and grading your code
import gridworld_mdp as gw   # defines the MDP for a 4x4 gridworld

state_count = gw.get_state_count()
pi = state_count * [0]

for s in range(state_count):
    avail_actions = gw.get_available_actions(s)
    pi[s] = avail_actions[0]

avail_actions = gw.get_available_actions(2)
print(avail_actions)

transitions = gw.get_transitions(state=2, action=pi[2])
print(transitions)

next_state, reward, probability = transitions[0]
print(next_state, reward, probability)

for s in range(state_count):
    avail_actions = gw.get_available_actions(s)
    print(avail_actions)
    pi[s] = avail_actions[0]
    print(len(avail_actions))

print(pi)