import time

if __name__ == '__main__':
    env = MazeEnv(10, 10, 0.3)
    with open('mazefile', 'w') as f:
        f.write(str(env.maze))

    # obtain the optimal policy and optimal state-value function
    print('\n')
    LINELEN = 100
    print('\t\tValue Iteration')
    print('-' * LINELEN)
    start_time = time.time()
    policy_pi, V_pi = value_iteration(env, max_iter=100)
    end_time = time.time()

    # print the optimal policy
    print("Optimal Policy:", '\n')
    print_policy(policy_pi, env.nrow, env.ncol, env.maze)
    print_path(policy_pi, env.nrow, env.ncol, env.maze)
    print("Runtime: " + str(end_time - start_time))
    print('-' * LINELEN)
    print('\n')

    print('\t\tPolicy Iteration')
    print('-' * LINELEN)
    start_time = time.time()
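# -----------------------------------------------------------------------------
# Note: MazeEnv, value_iteration, print_policy and print_path are assumed to be
# defined or imported elsewhere in this project; they are not shown above.
# For reference, the snippet below is a minimal, self-contained sketch of tabular
# value iteration on a toy 2-state MDP. The array layout (P[s, a, s'], R[s, a])
# and the function name are illustrative assumptions, not the project's API.
# -----------------------------------------------------------------------------
import numpy as np

def value_iteration_sketch(P, R, gamma=0.95, max_iter=100, tol=1e-8):
    """Return a greedy policy and V* for a finite MDP given as arrays."""
    n_states = R.shape[0]
    V = np.zeros(n_states)
    for _ in range(max_iter):
        # Bellman optimality backup: Q(s, a) = R(s, a) + gamma * sum_s' P(s, a, s') V(s')
        Q = R + gamma * P.dot(V)
        V_new = Q.max(axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            V = V_new
            break
        V = V_new
    # greedy policy with respect to the final value function
    policy = (R + gamma * P.dot(V)).argmax(axis=1)
    return policy, V

# toy 2-state, 2-action MDP used only to exercise the sketch
P_toy = np.array([[[0.9, 0.1], [0.2, 0.8]],
                  [[0.0, 1.0], [1.0, 0.0]]])   # P_toy[s, a, s']
R_toy = np.array([[0.0, 1.0],
                  [0.0, 0.5]])                 # R_toy[s, a]
pi_toy, V_toy = value_iteration_sketch(P_toy, R_toy)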
print('Evaluating random policy, except for the goal state, where the policy always executes stop:')
policy = random_policy(grid_world)
policy[goal_state[0], goal_state[1], STOP] = 1.0
policy[goal_state[0], goal_state[1], UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)
initial_value = np.zeros(dimensions)
value = policy_evaluation(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')

# Testing value iteration
print('Value iteration:')
value = value_iteration(grid_world, initial_value)
policy = greedy_policy(grid_world, value)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')

# Testing policy iteration
print('Policy iteration:')
policy = random_policy(grid_world)
policy[goal_state[0], goal_state[1], STOP] = 1.0
policy[goal_state[0], goal_state[1], UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)
value, policy = policy_iteration(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')
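# -----------------------------------------------------------------------------
# Note: grid_world, goal_state, dimensions, the action constants (UP, STOP,
# NUM_ACTIONS) and the helpers used above (random_policy, policy_evaluation,
# value_iteration, greedy_policy, policy_iteration, print_value, print_policy)
# are assumed to be defined earlier in this file or imported from the
# accompanying grid-world module; numpy is assumed to be imported as np.
# As a reference, below is a minimal, self-contained sketch of policy iteration
# (iterative evaluation + greedy improvement) for an MDP given as arrays
# P[s, a, s'] and R[s, a]; the names and shapes are illustrative assumptions.
# -----------------------------------------------------------------------------
import numpy as np

def policy_evaluation_sketch(P, R, policy, gamma=0.95, iters=200):
    """Iterative evaluation of a deterministic policy (policy[s] = action in s)."""
    n_states = R.shape[0]
    V = np.zeros(n_states)
    for _ in range(iters):
        # V(s) <- R(s, pi(s)) + gamma * sum_s' P(s, pi(s), s') V(s')
        V = np.array([R[s, policy[s]] + gamma * P[s, policy[s]].dot(V)
                      for s in range(n_states)])
    return V

def policy_iteration_sketch(P, R, gamma=0.95, max_outer=100):
    """Alternate evaluation and greedy improvement until the policy is stable."""
    n_states = R.shape[0]
    policy = np.zeros(n_states, dtype=int)
    for _ in range(max_outer):
        V = policy_evaluation_sketch(P, R, policy, gamma)
        new_policy = (R + gamma * P.dot(V)).argmax(axis=1)
        if np.array_equal(new_policy, policy):
            break
        policy = new_policy
    return V, policy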
from gridworld import GridWorld1
import gridrender as gui
import numpy as np
import matplotlib.pyplot as plt
import time

from utils import v_from_q
from dynamic_programming import value_iteration, policy_iteration

################################################################################
# Dynamic programming
################################################################################

value_iteration()
policy_iteration()

env = GridWorld1

################################################################################
# Work to do: Q4
################################################################################
# here the v-function and q-function to be used for question 4
v_q4 = [0.87691855, 0.92820033, 0.98817903, 0.00000000, 0.67106071,
        -0.99447514, 0.00000000, -0.82847001, -0.87691855, -0.93358351,
        -0.99447514]

### Compute mu0
mu0 = np.array([env.reset() for i in range(5000)])
unique, counts = np.unique(mu0, return_counts=True)
mu0 = counts / np.sum(counts)
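# -----------------------------------------------------------------------------
# Note: utils.v_from_q (imported above) is not shown in this file. A common
# definition, sketched below under the assumption that Q is an (S, A) array and
# pi an (S, A) array of action probabilities, is V(s) = sum_a pi(a|s) * Q(s, a).
# The shapes and the name v_from_q_sketch are illustrative assumptions, not the
# actual utils API.
# -----------------------------------------------------------------------------
def v_from_q_sketch(Q, pi):
    """State values from tabular Q-values under a stochastic policy pi."""
    return np.sum(pi * Q, axis=1)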