        V[state] = V_updated  # write back the freshly computed value for this state
    return V, delta


def policy_improve(V, states_actions):
    """Greedy improvement: for each state, pick the action that minimizes
    reward + V[next_state] (the values are negated before the argmax)."""
    pi = {}
    # diagnostic: L1 norm of the current value function
    print(sum(abs(np.array(list(V.values())))))
    for state, actions in states_actions.items():
        actions_list = []
        expected_rewards = []
        for action, data in actions.items():
            actions_list.append(action)
            next_state = data['next_state']
            reward = data['status']  # the one-step reward is stored under 'status'
            if next_state in V:
                # negated so the argmax below selects the smallest reward + V[next_state]
                expected_rewards.append(-(reward + V[next_state]))
            else:
                # successor not in V (e.g. terminal): only the immediate reward counts
                expected_rewards.append(-reward)

        pi[state] = actions_list[np.argmax(expected_rewards)]
    return pi
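
# Minimal smoke test for policy_improve (a sketch; toy_V and toy_states_actions
# are hypothetical names using the {'next_state', 'status'} schema read above,
# and numpy is assumed imported as np, as in the function itself). With the
# sign flip inside policy_improve, the cheapest action should win.
toy_V = {'s0': 0.0, 's1': -1.0}
toy_states_actions = {
    's0': {'left': {'next_state': 's1', 'status': 1.0},
           'right': {'next_state': 's0', 'status': 5.0}},
}
assert policy_improve(toy_V, toy_states_actions) == {'s0': 'left'}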


pi = get_deterministic_policy(states)
# pi = get_deterministic_policy_uniform(states)
pi, V = policy_iteration(
    states,
    pi,
    deterministic_policy_eval_step=deterministic_policy_eval_step,
    policy_improve=policy_improve,
    verbose=1)
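
# Follow-up sketch: pi maps each state to its greedy action and V holds the
# converged values; peek at a few entries (state keys depend on `states`).
for s in list(pi)[:5]:
    print(s, '->', pi[s], '  V =', V.get(s))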
Example #2
print(
    'Evaluating a random policy, except at the goal state, where the policy always executes STOP:'
)
policy = random_policy(grid_world)
policy[goal_state[0], goal_state[1], STOP] = 1.0  # the goal state always stops
policy[goal_state[0], goal_state[1],
       UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)  # zero out every other action
initial_value = np.zeros(dimensions)
value = policy_evaluation(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
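
# Sketch (assumes `value` is indexed by grid coordinates, like `policy`):
# inspect the evaluated value at the goal state under this policy.
print('V(goal) =', value[goal_state[0], goal_state[1]])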
print('----------------------------------------------------------------\n')

# Testing value iteration
print('Value iteration:')
value = value_iteration(grid_world, initial_value)
policy = greedy_policy(grid_world, value)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')

# Testing policy iteration
print('Policy iteration:')
policy = random_policy(grid_world)
policy[goal_state[0], goal_state[1], STOP] = 1.0  # the goal state always stops
policy[goal_state[0], goal_state[1],
       UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)  # zero out every other action
value, policy = policy_iteration(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
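
# Optional sanity check (a sketch, assuming both solvers converged): the value
# function found by policy iteration should match the one from value iteration.
value_vi = value_iteration(grid_world, initial_value)
print('max |V_policy_iter - V_value_iter| =', np.abs(value - value_vi).max())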
print('----------------------------------------------------------------\n')
Example #3
from gridworld import GridWorld1
import gridrender as gui
import numpy as np
import matplotlib.pyplot as plt
import time
from utils import v_from_q
from dynamic_programming import value_iteration, policy_iteration

################################################################################
# Dynamic programming
################################################################################

# value_iteration and policy_iteration are imported above but take arguments
# (cf. their use in Example #2); bare calls like value_iteration() would raise
# a TypeError, so they are commented out here:
# value_iteration()
# policy_iteration()

env = GridWorld1

################################################################################
# Work to do: Q4
################################################################################
# The v-function and q-function to be used for question 4
v_q4 = [0.87691855, 0.92820033, 0.98817903, 0.00000000, 0.67106071, -0.99447514, 0.00000000, -0.82847001, -0.87691855,
        -0.93358351, -0.99447514]

### Compute mu0: estimate the initial-state distribution empirically
# Draw 5000 initial states, then normalize the visit count of each state.
initial_states = np.array([env.reset() for _ in range(5000)])
unique, counts = np.unique(initial_states, return_counts=True)
mu0 = counts / np.sum(counts)
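
# Follow-up sketch (assumes env.reset() returns a state index): `unique` lists
# the observed initial states and mu0 their empirical frequencies, which must
# sum to one.
print('initial states observed:', unique)
print('estimated mu0:', mu0)
assert np.isclose(mu0.sum(), 1.0)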