Example #1
import time

# MazeEnv, value_iteration, policy_iteration, print_policy and print_path are
# assumed to be imported from the accompanying module of this example.

if __name__ == '__main__':
    env = MazeEnv(10, 10, 0.3)  # 10x10 maze; the third argument is presumably the obstacle density

    # save the generated maze so the run can be reproduced later
    with open('mazefile', 'w') as f:
        f.write(str(env.maze))

    # obtain the optimal policy and optimal state-value function
    print('\n')
    LINELEN = 100
    print('\t\tValue Iteration')
    print('-' * LINELEN)

    start_time = time.time()
    policy_pi, V_pi = value_iteration(env, max_iter=100)
    end_time = time.time()

    # print the optimal policy
    print("Optimal Policy:", '\n')
    print_policy(policy_pi, env.nrow, env.ncol, env.maze)
    print_path(policy_pi, env.nrow, env.ncol, env.maze)
    print("Runtime: " + str(end_time - start_time))
    print('-' * LINELEN)

    print('\n')

    print('\t\tPolicy Iteration')
    print('-' * LINELEN)

    start_time = time.time()
    policy_pi, V_pi = policy_iteration(env, max_iter=100)
    end_time = time.time()

    # print the optimal policy found by policy iteration
    print("Optimal Policy:", '\n')
    print_policy(policy_pi, env.nrow, env.ncol, env.maze)
    print_path(policy_pi, env.nrow, env.ncol, env.maze)
    print("Runtime: " + str(end_time - start_time))
    print('-' * LINELEN)
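The value_iteration helper called above is not shown in this example. Below is a minimal sketch of what a function with this signature might look like, assuming a Gym-style transition model env.P[s][a] -> list of (prob, next_state, reward, done) tuples and row-major state indexing over env.nrow * env.ncol cells; the example's real implementation may differ. It runs Bellman optimality backups to convergence and then extracts the greedy policy.

import numpy as np

def value_iteration(env, gamma=0.99, max_iter=100, tol=1e-8):
    # Sketch only; assumes env.P[s][a] is a list of (prob, next_state, reward, done)
    n_states = env.nrow * env.ncol
    n_actions = len(env.P[0])
    V = np.zeros(n_states)
    for _ in range(max_iter):
        # Bellman optimality backup: V(s) <- max_a sum p * (r + gamma * V(s'))
        V_new = np.array([
            max(sum(p * (r + gamma * V[s2]) for p, s2, r, _ in env.P[s][a])
                for a in range(n_actions))
            for s in range(n_states)])
        delta = np.max(np.abs(V_new - V))
        V = V_new
        if delta < tol:  # stop once the values have converged
            break
    # extract the greedy policy with respect to the converged value function
    policy = np.array([
        int(np.argmax([sum(p * (r + gamma * V[s2]) for p, s2, r, _ in env.P[s][a])
                       for a in range(n_actions)]))
        for s in range(n_states)])
    return policy, V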
Example #2
import numpy as np

# grid_world, goal_state, dimensions, the action constants (STOP, UP,
# NUM_ACTIONS) and the helper functions used below (random_policy,
# policy_evaluation, value_iteration, greedy_policy, policy_iteration,
# print_value, print_policy) are assumed to be defined elsewhere in the
# accompanying module.
print('Evaluating a random policy, except at the goal state, where the policy always executes STOP:')
policy = random_policy(grid_world)
# force the goal state to always execute STOP: probability 1 for STOP,
# zero for every other action (UP through the last action index)
policy[goal_state[0], goal_state[1], STOP] = 1.0
policy[goal_state[0], goal_state[1], UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)
initial_value = np.zeros(dimensions)
value = policy_evaluation(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')

# Testing value iteration
print('Value iteration:')
value = value_iteration(grid_world, initial_value)
policy = greedy_policy(grid_world, value)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')

# Testing policy iteration
print('Policy iteration:')
policy = random_policy(grid_world)
# again pin the goal state to STOP before running policy iteration
policy[goal_state[0], goal_state[1], STOP] = 1.0
policy[goal_state[0], goal_state[1], UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)
value, policy = policy_iteration(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')
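The policy_evaluation call above iterates the Bellman expectation backup until the value estimate stops changing. Since this example's grid_world interface is not shown, here is a self-contained sketch of the same technique on a hypothetical 1-D chain MDP; every name and parameter below is made up for illustration.

import numpy as np

n_states, n_actions, gamma = 5, 2, 0.9           # actions: 0 = left, 1 = right
P = np.zeros((n_states, n_actions), dtype=int)   # deterministic next states
R = np.zeros((n_states, n_actions))              # rewards R(s, a)
for s in range(n_states):
    P[s, 0] = max(s - 1, 0)                      # move left (clipped at the edge)
    P[s, 1] = min(s + 1, n_states - 1)           # move right (clipped at the edge)
R[P[:, 1] == n_states - 1, 1] = 1.0              # reward for reaching the right end

pi = np.full((n_states, n_actions), 0.5)         # uniform random policy
V = np.zeros(n_states)
for _ in range(1000):
    # Bellman expectation backup: V(s) <- sum_a pi(a|s) * (R(s,a) + gamma * V(s'))
    V_new = (pi * (R + gamma * V[P])).sum(axis=1)
    delta = np.max(np.abs(V_new - V))
    V = V_new
    if delta < 1e-10:                            # stop once evaluation has converged
        break
print(np.round(V, 3))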
Example #3
from gridworld import GridWorld1
import gridrender as gui
import numpy as np
import matplotlib.pyplot as plt
import time
from utils import v_from_q
from dynamic_programming import value_iteration, policy_iteration

################################################################################
# Dynamic programming
################################################################################

# run the dynamic-programming routines imported above; their signatures are
# not shown in this snippet, so they are presumably self-contained entry
# points here, since they are called without arguments
value_iteration()
policy_iteration()

env = GridWorld1

################################################################################
# Work to do: Q4
################################################################################
# here are the v-function and q-function to be used for question 4
v_q4 = [0.87691855, 0.92820033, 0.98817903, 0.00000000, 0.67106071, -0.99447514, 0.00000000, -0.82847001, -0.87691855,
        -0.93358351, -0.99447514]
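
# Note: the imported v_from_q helper (from utils above) is not shown. By its
# name it presumably averages the q-function over the policy's action
# probabilities, V(s) = sum_a pi(a|s) * Q(s, a). A hypothetical sketch of that
# computation (the real utils.v_from_q may differ):
def v_from_q_sketch(q, pi):
    q, pi = np.asarray(q), np.asarray(pi)  # both of shape (n_states, n_actions)
    return (pi * q).sum(axis=1)            # expected Q under pi, per state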

### Compute mu0
# estimate the initial-state distribution empirically from 5000 environment resets
mu0 = np.array([env.reset() for _ in range(5000)])
unique, counts = np.unique(mu0, return_counts=True)
mu0 = counts / np.sum(counts)  # normalize the visit counts into probabilities
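
One caveat with the np.unique approach above: any state that never appears among the 5000 sampled resets is silently dropped, so mu0 can end up shorter than the state space. A sketch of a fixed-length variant, assuming env.reset() returns an integer state index; the n_states argument below is introduced for illustration and is not part of the original code.

def empirical_mu0(env, n_states, n_samples=5000):
    # count how often each state index occurs as the initial state
    starts = np.array([env.reset() for _ in range(n_samples)])
    counts = np.bincount(starts, minlength=n_states)  # unvisited states get 0
    return counts / counts.sum()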