        V[state] = V_updated
    return V, delta


def policy_improve(V, states_actions):
    """Greedy improvement: for each state, pick the action with the best
    one-step lookahead under the current value function V."""
    pi = {}
    # Debug output: L1 norm of the current value function.
    print(sum(abs(np.array(list(V.values())))))
    for state, actions in states_actions.items():
        actions_list = []  # list(actions.keys())
        expected_rewards = []  # np.zeros(len(actions))
        for i, (action, data) in enumerate(actions.items()):
            actions_list.append(action)
            next_state = data['next_state']
            reward = data['status']
            if next_state in V:
                expected_rewards.append(-(reward + V[next_state]))
            else:
                expected_rewards.append(-reward)
        # Entries are negated, so argmax selects the action minimising reward + V[next_state].
        pi[state] = actions_list[np.argmax(expected_rewards)]
    return pi


pi = get_deterministic_policy(states)
# pi = get_deterministic_policy_uniform(states)
pi, V = policy_iteration(
    states, pi,
    deterministic_policy_eval_step=deterministic_policy_eval_step,
    policy_improve=policy_improve,
    verbose=1)
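# The body of deterministic_policy_eval_step is not shown above (only its final
# lines, "V[state] = V_updated" and "return V, delta", survive). Below is a
# minimal sketch of what one evaluation sweep could look like, assuming the same
# states_actions layout ('next_state', 'status') used by policy_improve; the
# function name, the use of V as a cost-to-go, and the convergence measure
# delta are assumptions, not the original implementation.
def deterministic_policy_eval_step_sketch(V, pi, states_actions):
    """Hypothetical single in-place evaluation sweep for a deterministic policy pi."""
    delta = 0.0
    for state, actions in states_actions.items():
        data = actions[pi[state]]          # transition selected by the policy
        reward = data['status']
        next_state = data['next_state']
        V_updated = reward + V.get(next_state, 0.0)
        delta = max(delta, abs(V_updated - V.get(state, 0.0)))
        V[state] = V_updated
    return V, delta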
    'Evaluating random policy, except for the goal state, where policy always executes stop:')
policy = random_policy(grid_world)
policy[goal_state[0], goal_state[1], STOP] = 1.0
policy[goal_state[0], goal_state[1], UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)
initial_value = np.zeros(dimensions)
value = policy_evaluation(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')

# Testing value iteration
print('Value iteration:')
value = value_iteration(grid_world, initial_value)
policy = greedy_policy(grid_world, value)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')

# Testing policy iteration
print('Policy iteration:')
policy = random_policy(grid_world)
policy[goal_state[0], goal_state[1], STOP] = 1.0
policy[goal_state[0], goal_state[1], UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)
value, policy = policy_iteration(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')
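# greedy_policy, policy_evaluation, value_iteration and policy_iteration are
# defined elsewhere and not shown in this excerpt. As a reference for the
# greedy-extraction step the test relies on, here is a minimal self-contained
# sketch over a generic tabular MDP given as reward/transition arrays; the
# array shapes and the discount gamma are assumptions, not the grid_world API.
import numpy as np

def greedy_policy_sketch(rewards, transitions, value, gamma=0.9):
    """Extract a greedy deterministic policy from a state-value function.

    rewards:     (S, A) immediate reward for each (state, action)
    transitions: (S, A, S) transition probabilities
    value:       (S,) current state-value estimate
    Returns an (S,) array with the greedy action index for each state.
    """
    # Q(s, a) = r(s, a) + gamma * sum_s' P(s' | s, a) * V(s')
    q = rewards + gamma * transitions @ value
    return np.argmax(q, axis=1)

# Tiny usage example on a 2-state, 2-action MDP.
R_demo = np.array([[0.0, 1.0], [0.5, 0.0]])
P_demo = np.array([[[1.0, 0.0], [0.0, 1.0]],
                   [[0.5, 0.5], [1.0, 0.0]]])
print(greedy_policy_sketch(R_demo, P_demo, np.zeros(2)))  # greedy action per state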
from gridworld import GridWorld1
import gridrender as gui
import numpy as np
import matplotlib.pyplot as plt
import time

from utils import v_from_q
from dynamic_programming import value_iteration, policy_iteration

################################################################################
# Dynamic programming
################################################################################

value_iteration()
policy_iteration()

env = GridWorld1

################################################################################
# Work to do: Q4
################################################################################
# here the v-function and q-function to be used for question 4
v_q4 = [0.87691855, 0.92820033, 0.98817903, 0.00000000, 0.67106071,
        -0.99447514, 0.00000000, -0.82847001, -0.87691855, -0.93358351,
        -0.99447514]

### Compute mu0: empirical initial-state distribution, estimated by resetting
### the environment many times and counting how often each state is sampled.
mu0 = np.array([env.reset() for i in range(5000)])
unique, counts = np.unique(mu0, return_counts=True)
mu0 = counts / np.sum(counts)
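# A common follow-up (an assumption about the intent of Q4, not stated in the
# snippet) is to combine the initial-state distribution with the value function
# to estimate the expected return from the start, J(pi) = sum_s mu0(s) * v(s).
# Note that np.unique above only returns counts for states that were actually
# sampled, so aligning the counts over all state indices (e.g. with np.bincount)
# keeps the dot product with v_q4 well defined.
n_states = len(v_q4)
samples = np.array([env.reset() for _ in range(5000)])
# bincount with minlength yields a count for every state index, including unsampled ones.
mu0_full = np.bincount(samples, minlength=n_states) / len(samples)
J_hat = mu0_full @ np.array(v_q4)
print("Estimated J(pi):", J_hat)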