def value_iteration(state_count, gamma, theta, get_available_actions,
                    get_transitions):
    """
    This function computes the optimal value function and policy for the specified MDP, using the Value Iteration algorithm.
    
    'state_count' is the total number of states in the MDP. States are represented as 0-relative numbers.
    
    'gamma' is the MDP discount factor for rewards.
    
    'theta' is the small number threshold to signal convergence of the value function (see Iterative Policy Evaluation algorithm).
    
    'get_available_actions' returns a list of the MDP available actions for the specified state parameter.
    
    'get_transitions' is the MDP state/reward transition function.  It accepts two parameters, state and action, and returns
        a list of tuples, where each tuple is of the form: (next_state, reward, probability).
    """
    # init all state value estimates to 0
    V = state_count * [0]
    pi = state_count * [0]

    # init the policy with the first available action for each state
    for s in range(state_count):
        avail_actions = get_available_actions(s)
        pi[s] = avail_actions[0]
    # print("Initial policy", pi)

    while True:
        delta = 0
        V_prev = list(V)

        for state in range(state_count):
            # Bellman optimality backup: take the best expected value over the available actions
            actions = get_available_actions(state)
            V_best = None
            for action in actions:
                transitions = get_transitions(state=state, action=action)
                V_sa = 0
                for next_state, reward, probability in transitions:
                    V_sa += probability * (reward + gamma * V_prev[next_state])

                # keep the greedy action and its value
                if V_best is None or V_sa > V_best:
                    pi[state] = action
                    V_best = V_sa
            V[state] = V_best
            delta = max(delta, abs(V[state] - V_prev[state]))

        if delta < theta:
            break

    return (V, pi)  # return both the final value function and the final policy
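
# A minimal sketch of calling value_iteration on the 4x4 gridworld MDP from gridworld_mdp
# (the same module used in the later examples).  The gamma and theta values here are
# illustrative choices, not values prescribed by the assignment.
import gridworld_mdp as gw  # defines the MDP for a 4x4 gridworld

n_states = gw.get_state_count()
V_opt, pi_opt = value_iteration(n_states, gamma=0.9, theta=0.001,
                                get_available_actions=gw.get_available_actions,
                                get_transitions=gw.get_transitions)
print("Optimal values:", V_opt)
print("Optimal policy:", pi_opt)
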
Example #2
import gridworld_mdp as gw  # defines the MDP for a 4x4 gridworld


def examine_transitions(state):
    # print every (next_state, reward, probability) transition for each action available in 'state'
    actions = gw.get_available_actions(state)
    for action in actions:
        transitions = gw.get_transitions(state=state, action=action)

        # examine each returned transition (only 1 per call for this MDP)
        for next_state, reward, probability in transitions:
            print(
                "transition(" + str(state) + ", " + action + "):",
                "next_state=",
                next_state,
                ", reward=",
                reward,
                ", probability=",
                probability,
            )
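
# A quick check of the helper above, using state 2 (the same state examined later in these examples):
examine_transitions(2)
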
def policy_eval_two_arrays(state_count, gamma, theta, get_policy,
                           get_transitions):
    """
    This function uses the two-array approach to evaluate the specified policy for the specified MDP:
    
    'state_count' is the total number of states in the MDP. States are represented as 0-relative numbers.
    
    'gamma' is the MDP discount factor for rewards.
    
    'theta' is the small number threshold to signal convergence of the value function (see Iterative Policy Evaluation algorithm).
    
    'get_policy' is the stochastic policy function - it takes a state parameter and returns a list of tuples,
        where each tuple is of the form: (action, probability).  It represents the policy being evaluated.

    'get_transitions' is the state/reward transition function.  It accepts two parameters, state and action, and returns
        a list of tuples, where each tuple is of the form: (next_state, reward, probability).
        
    """
    # init all state value estimates to 0
    V = state_count * [0]

    while True:
        delta = 0
        V_prev = list(V)          # values computed on the previous iteration
        V = state_count * [0]     # values being computed in the current iteration
        for state in range(state_count):
            # expected value under the policy: sum over actions, then over transitions
            for action, action_prob in get_policy(state):
                transitions = get_transitions(state=state, action=action)
                for next_state, reward, probability in transitions:
                    V[state] += action_prob * probability * (
                        reward + gamma * V_prev[next_state])
            delta = max(delta, abs(V[state] - V_prev[state]))
        if delta < theta:
            break

    return V
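
# A minimal sketch of calling policy_eval_two_arrays on the gridworld MDP.  The uniform-random
# policy helper below is a hypothetical stand-in for whatever policy function the assignment
# actually supplies; gamma and theta are illustrative values, not prescribed ones.
import gridworld_mdp as gw  # defines the MDP for a 4x4 gridworld


def get_uniform_policy(state):
    # hypothetical helper: equal probability for every action available in 'state'
    actions = gw.get_available_actions(state)
    return [(action, 1.0 / len(actions)) for action in actions]


V_eval = policy_eval_two_arrays(gw.get_state_count(), gamma=0.9, theta=0.001,
                                get_policy=get_uniform_policy,
                                get_transitions=gw.get_transitions)
print("Values under the uniform-random policy:", V_eval)
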
Example #4
def policy_eval_in_place(state_count, gamma, theta, get_policy,
                         get_transitions):
    """
    Evaluates the specified policy for the specified MDP using the in-place approach:
    a single value array is updated state by state, so later states in a sweep can already
    use the freshly updated values of earlier states.  Parameters have the same meaning as
    in policy_eval_two_arrays above.
    """
    # init all state value estimates to 0
    V = state_count * [0]

    while True:
        delta = 0
        for state in range(state_count):
            v = V[state]      # remember the old estimate to measure the change
            v_new = 0
            # expected value under the policy: sum over actions, then over transitions
            for action, action_prob in get_policy(state):
                transitions = get_transitions(state=state, action=action)
                for next_state, reward, probability in transitions:
                    v_new += action_prob * probability * (
                        reward + gamma * V[next_state])
            V[state] = v_new
            delta = max(delta, abs(v - V[state]))
        if delta < theta:
            break

    return V
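
# Both evaluation routines should converge to (approximately) the same value function once theta
# is small; the in-place version typically needs fewer sweeps.  A quick check, reusing the
# hypothetical get_uniform_policy helper and V_eval result from the sketch above:
V_in_place = policy_eval_in_place(gw.get_state_count(), gamma=0.9, theta=0.001,
                                  get_policy=get_uniform_policy,
                                  get_transitions=gw.get_transitions)
print("Max difference between the two approaches:",
      max(abs(a - b) for a, b in zip(V_eval, V_in_place)))
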
#Policy Evaluation calculates the value function for a policy, given the policy and the full definition of the associated Markov Decision Process.
#The full definition of an MDP is the set of states, the set of available actions for each state, the set of rewards, the discount factor,
#and the state/reward transition function.
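
# Concretely, iterative policy evaluation repeatedly applies the Bellman expectation backup
#     V(s) <- sum over a of pi(a|s) * sum over (s', r) of p(s', r | s, a) * (r + gamma * V(s'))
# to every state until the largest change in a sweep falls below theta.  The helper below is an
# illustrative sketch of a single backup for one state (not part of the assignment code); it
# assumes the same get_policy / get_transitions signatures described in the functions above.


def bellman_backup(state, V, gamma, get_policy, get_transitions):
    # expected one-step return for 'state' under the policy, bootstrapping from V
    v_new = 0
    for action, action_prob in get_policy(state):
        for next_state, reward, probability in get_transitions(state=state, action=action):
            v_new += action_prob * probability * (reward + gamma * V[next_state])
    return v_new
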

import test_dp  # required for testing and grading your code
import gridworld_mdp as gw  # defines the MDP for a 4x4 gridworld

#The gridworld MDP defines the probability of state transitions for our 4x4 gridworld using a "get_transitions()" function.
#Let's try it out now, with state=2 and all defined actions.

# try out the gw.get_transitions(state, action) function
state = 2
actions = gw.get_available_actions(state)

for action in actions:
    transitions = gw.get_transitions(state=state, action=action)

    # examine each returned transition (only 1 per call for this MDP)
    for next_state, reward, probability in transitions:
        print("transition(" + str(state) + ", " + action + "):", "next_state=",
              next_state, ", reward=", reward, ", probability=", probability)

#Implement the algorithm for Iterative Policy Evaluation using the 2 array approach.
#In the 2 array approach, one array holds the value estimates for each state computed on the previous iteration,
#and one array holds the value estimates for the states computed in the current iteration.


#An empty function policy_eval_two_arrays is provided below; implement the body of the function to correctly calculate
#the value of the policy using the 2 array approach. The function defines 5 parameters - a definition of each parameter
#is given in the comment block for the function. For sample parameter values, see the calling code in the cell following the function.
Example #6
import test_dp  # required for testing and grading your code
import gridworld_mdp as gw  # defines the MDP for a 4x4 gridworld

# build a policy that picks the first available action in every state
state_count = gw.get_state_count()
pi = state_count * [0]
for s in range(state_count):
    avail_actions = gw.get_available_actions(s)
    pi[s] = avail_actions[0]

# spot-check the MDP helpers for state 2
avail_actions = gw.get_available_actions(2)
print(avail_actions)
transitions = gw.get_transitions(state=2, action=pi[2])
print(transitions)
next_state, reward, probability = transitions[0]
print(next_state, reward, probability)

# list the available actions (and how many there are) for every state
for s in range(state_count):
    avail_actions = gw.get_available_actions(s)
    print(avail_actions)
    pi[s] = avail_actions[0]
    print(len(avail_actions))

print(pi)