        # Follow the policy when it covers the current state; otherwise pick a random legal action.
        if grid.pos in policy:
            action_to_take = policy[grid.pos]
        else:
            action_to_take = np.random.choice(grid.possible_actions())
        reward = grid.player_take_action(action_to_take)
        state_and_rewards.append((grid.pos, reward))

    # Work backwards through the episode, accumulating the discounted return
    # G = reward + alpha * G for each visited state.
    G = 0
    states_and_returns = []
    for (state, reward) in reversed(state_and_rewards):
        G = reward + alpha * G
        states_and_returns.append((state, G))
    # Restore chronological order; the first recorded pair is dropped.
    return list(reversed(states_and_returns))[1:]
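
# Toy trace of the backward pass above, assuming alpha = 0.9 and a made-up episode
# state_and_rewards = [(s0, 0), (s1, 0), (s2, 1)]:
#   in reverse: G(s2) = 1.0, G(s1) = 0 + 0.9 * 1.0 = 0.9, G(s0) = 0 + 0.9 * 0.9 = 0.81
#   reversed back: [(s0, 0.81), (s1, 0.9), (s2, 1.0)]; the [1:] slice then drops (s0, 0.81).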


if __name__ == "__main__":
    grid = standard_grid(success_prob=0.8)
    states = grid.all_states()
    possible_states = [s for s in states if not grid.is_terminal_state(s)]

    # Initialize V
    print("Initialize steps")
    alpha = 0.9  # discount factor
    epsilon = 10e-4  # convergence threshold (note: 10e-4 == 1e-3)
    iterations = 0
    V = {state: 0 for state in states}
    counts = {state: 1 for state in states}  # per-state visit counts for the running-mean update
    policy = {
        (2, 0): Action("U", -1, 0),
        (1, 0): Action("U", -1, 0),
        (0, 0): Action("R", 0, 1),
        (0, 1): Action("R", 0, 1),
    }  # states not listed here fall back to a random action in the sampling function above
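
    # The listing is cut off here; a minimal sketch of the Monte Carlo loop that
    # would typically follow. The sampling function above is assumed to be named
    # `play_game(grid, policy)`, and the episode count of 5000 is an assumption.
    for _ in range(5000):
        states_and_returns = play_game(grid, policy)
        visited = set()
        for state, G in states_and_returns:
            if state in visited:
                continue  # first-visit MC: count each state once per episode
            visited.add(state)
            counts[state] += 1
            # incremental running mean of the observed returns
            V[state] = V[state] + (G - V[state]) / counts[state]

    print_values(V, grid)
    print_policy(policy, grid)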

Example #2

import numpy as np
import math
from rl.gridworld.utils import standard_grid
from rl.gridworld.Action import Action
from collections import defaultdict
from rl.gridworld.utils import *


if __name__ == "__main__":
    grid = standard_grid(0.5, -0.1)
    states = grid.all_states()
    possible_states = [s for s in states if not grid.is_terminal_state(s)]

    # Initialize V, policy
    print("Initialize steps")
    alpha = 0.9  # discount factor
    epsilon = 10e-4  # policy-evaluation convergence threshold (10e-4 == 1e-3)
    V = {state: 0 for state in states}

    # start from an arbitrary deterministic policy: one random legal action per state
    policy = {}
    for state in possible_states:
        grid.set_player_position(state)
        policy[state] = np.random.choice(grid.possible_actions())

    print_values(V, grid)
    print_policy(policy, grid)

    # repeat evaluation and improvement until the policy converges
    while True:
        # policy evaluation: back up V under the current policy until it stabilizes
        while True:
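            # The listing is cut off here; a minimal sketch of one evaluation sweep
            # for the current policy, using only the grid calls shown in these
            # listings. Each backup uses a single sampled transition, so on this
            # stochastic grid it is an approximation; the enclosing loop repeats
            # the sweep until the largest change falls below epsilon.
            biggest_change = 0
            for state in possible_states:
                old_v = V[state]
                grid.set_player_position(state)
                reward = grid.player_take_action(policy[state])
                V[state] = reward + alpha * V[grid.pos]
                biggest_change = max(biggest_change, abs(old_v - V[state]))
            if biggest_change < epsilon:
                break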

Example #3

import numpy as np
import math
from rl.gridworld.utils import standard_grid
from rl.gridworld.Action import Action
from collections import defaultdict
from rl.gridworld.utils import *


if __name__ == "__main__":
    grid = standard_grid()
    states = grid.all_states()
    possible_states = [s for s in states if not grid.is_terminal_state(s)]

    # Initialize V, policy
    print("Initialize steps")
    alpha = 0.9
    epsilon = 10e-4
    iterations = 0
    V = {state: 0 for state in states}

    policy = {}
    for state in possible_states:
        grid.set_player_position(state)
        policy[state] = np.random.choice(grid.possible_actions())

    print_values(V, grid)
    print_policy(policy, grid)

    # repeat evaluation and improvement until the policy converges
    while True:
        iterations += 1
        # policy evaluation
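        # The listing is cut off here. An evaluation sweep like the one sketched
        # in the previous listing would update V under the current policy; a greedy
        # improvement step would then follow. A minimal sketch of that step
        # (single sampled transitions, as in the grid calls shown in these listings):
        is_policy_converged = True
        for state in possible_states:
            old_action = policy[state]
            best_action, best_value = None, -math.inf
            grid.set_player_position(state)
            for action in grid.possible_actions():
                grid.set_player_position(state)
                reward = grid.player_take_action(action)
                value = reward + alpha * V[grid.pos]
                if value > best_value:
                    best_value, best_action = value, action
            policy[state] = best_action
            if best_action != old_action:
                is_policy_converged = False
        if is_policy_converged:
            break

    print("Converged after %d iterations" % iterations)
    print_values(V, grid)
    print_policy(policy, grid)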

Example #4
import numpy as np
import math
from rl.gridworld.utils import standard_grid
from rl.gridworld.Action import Action
from collections import defaultdict
from rl.gridworld.utils import *


if __name__ == "__main__":
    grid = standard_grid(normal_reward=-0.1)
    states = grid.all_states()
    possible_states = [s for s in states if not grid.is_terminal_state(s)]

    # Initialize V
    print("Initialize steps")
    alpha = 0.9
    epsilon = 10e-4
    iterations = 0
    V = {state: 0 for state in states}

    print_values(V, grid)

    # value iteration: repeat Bellman-optimality backups over all states until V converges
    while True:
        iterations += 1
        biggest_change = -math.inf
        for state in possible_states:
            old_v = V[state]
            grid.set_player_position(state)
            new_v = -math.inf
            for action in grid.possible_actions():
                grid.set_player_position(state)
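                # The listing is cut off here; a minimal sketch of the rest of the
                # value-iteration backup, using a single sampled transition per
                # action as in the calls shown above:
                reward = grid.player_take_action(action)
                new_v = max(new_v, reward + alpha * V[grid.pos])
            V[state] = new_v
            biggest_change = max(biggest_change, abs(old_v - V[state]))
        if biggest_change < epsilon:
            break

    print("Converged after %d iterations" % iterations)
    print_values(V, grid)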