        V[state] = V_updated
    return V, delta


def policy_improve(V, states_actions):
    """Greedy improvement: for each state, pick the action with the best
    one-step lookahead under the current value function V."""
    pi = {}
    # Debug output: L1 norm of the current value function.
    print(sum(abs(np.array(list(V.values())))))
    for state, actions in states_actions.items():
        actions_list = []  # list(actions.keys())
        expected_rewards = []  # np.zeros(len(actions))
        for i, (action, data) in enumerate(actions.items()):
            actions_list.append(action)
            next_state = data['next_state']
            reward = data['status']
            if next_state in V:
                expected_rewards.append(-(reward + V[next_state]))
            else:
                expected_rewards.append(-reward)
        # Entries are negated, so argmax selects the action minimising reward + V[next_state].
        pi[state] = actions_list[np.argmax(expected_rewards)]
    return pi


pi = get_deterministic_policy(states)
# pi = get_deterministic_policy_uniform(states)
pi, V = policy_iteration(
    states, pi,
    deterministic_policy_eval_step=deterministic_policy_eval_step,
    policy_improve=policy_improve,
    verbose=1)
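# The body of deterministic_policy_eval_step is not shown above (only its final
# lines, "V[state] = V_updated" and "return V, delta", survive). Below is a
# minimal sketch of what one evaluation sweep could look like, assuming the same
# states_actions layout ('next_state', 'status') used by policy_improve; the
# function name, the use of V as a cost-to-go, and the convergence measure
# delta are assumptions, not the original implementation.
def deterministic_policy_eval_step_sketch(V, pi, states_actions):
    """Hypothetical single in-place evaluation sweep for a deterministic policy pi."""
    delta = 0.0
    for state, actions in states_actions.items():
        data = actions[pi[state]]          # transition selected by the policy
        reward = data['status']
        next_state = data['next_state']
        V_updated = reward + V.get(next_state, 0.0)
        delta = max(delta, abs(V_updated - V.get(state, 0.0)))
        V[state] = V_updated
    return V, delta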
    'Evaluating random policy, except for the goal state, where policy always executes stop:')
policy = random_policy(grid_world)
policy[goal_state[0], goal_state[1], STOP] = 1.0
policy[goal_state[0], goal_state[1], UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)
initial_value = np.zeros(dimensions)
value = policy_evaluation(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')

# Testing value iteration
print('Value iteration:')
value = value_iteration(grid_world, initial_value)
policy = greedy_policy(grid_world, value)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')

# Testing policy iteration
print('Policy iteration:')
policy = random_policy(grid_world)
policy[goal_state[0], goal_state[1], STOP] = 1.0
policy[goal_state[0], goal_state[1], UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)
value, policy = policy_iteration(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')
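# greedy_policy, policy_evaluation, value_iteration and policy_iteration are
# defined elsewhere and not shown in this excerpt. As a reference for the
# greedy-extraction step the test relies on, here is a minimal self-contained
# sketch over a generic tabular MDP given as reward/transition arrays; the
# array shapes and the discount gamma are assumptions, not the grid_world API.
import numpy as np

def greedy_policy_sketch(rewards, transitions, value, gamma=0.9):
    """Extract a greedy deterministic policy from a state-value function.

    rewards:     (S, A) immediate reward for each (state, action)
    transitions: (S, A, S) transition probabilities
    value:       (S,) current state-value estimate
    Returns an (S,) array with the greedy action index for each state.
    """
    # Q(s, a) = r(s, a) + gamma * sum_s' P(s' | s, a) * V(s')
    q = rewards + gamma * transitions @ value
    return np.argmax(q, axis=1)

# Tiny usage example on a 2-state, 2-action MDP.
R_demo = np.array([[0.0, 1.0], [0.5, 0.0]])
P_demo = np.array([[[1.0, 0.0], [0.0, 1.0]],
                   [[0.5, 0.5], [1.0, 0.0]]])
print(greedy_policy_sketch(R_demo, P_demo, np.zeros(2)))  # greedy action per state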
from gridworld import GridWorld1
import gridrender as gui
import numpy as np
import matplotlib.pyplot as plt
import time

from utils import v_from_q
from dynamic_programming import value_iteration, policy_iteration

################################################################################
# Dynamic programming
################################################################################

value_iteration()
policy_iteration()

env = GridWorld1

################################################################################
# Work to do: Q4
################################################################################
# here the v-function and q-function to be used for question 4
v_q4 = [0.87691855, 0.92820033, 0.98817903, 0.00000000, 0.67106071,
        -0.99447514, 0.00000000, -0.82847001, -0.87691855, -0.93358351,
        -0.99447514]

### Compute mu0: empirical initial-state distribution, estimated by resetting
### the environment many times and counting how often each state is sampled.
mu0 = np.array([env.reset() for i in range(5000)])
unique, counts = np.unique(mu0, return_counts=True)
mu0 = counts / np.sum(counts)
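# A common follow-up (an assumption about the intent of Q4, not stated in the
# snippet) is to combine the initial-state distribution with the value function
# to estimate the expected return from the start, J(pi) = sum_s mu0(s) * v(s).
# Note that np.unique above only returns counts for states that were actually
# sampled, so aligning the counts over all state indices (e.g. with np.bincount)
# keeps the dot product with v_q4 well defined.
n_states = len(v_q4)
samples = np.array([env.reset() for _ in range(5000)])
# bincount with minlength yields a count for every state index, including unsampled ones.
mu0_full = np.bincount(samples, minlength=n_states) / len(samples)
J_hat = mu0_full @ np.array(v_q4)
print("Estimated J(pi):", J_hat)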