    # Returns were accumulated backwards from the terminal state,
    # so reverse to put the (state, action, return) triples in episode order
    saG.reverse()
    return saG


# Return the key with the largest value in a dictionary, along with that value
def max_dict(d):
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_val = v
            max_key = k
    return max_key, max_val
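
# Usage sketch (illustrative, not from the original example) -- max_dict
# picks the greedy (action, value) pair from one row of a Q-table:
#
#     >>> max_dict({'U': -0.2, 'D': -1.0, 'L': 0.4, 'R': 0.1})
#     ('L', 0.4)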


if __name__ == '__main__':
    grid = negative_maze(step_cost=-0.5)

    # Randomly initialize a policy
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_ACTIONS)

    # Q[s][a] = running mean of the returns G observed for state s, action a
    Q = {}
    G = {}
    states = grid.all_states()
    for s in states:
        if s in grid.actions.keys():
            Q[s] = {}
            for a in ALL_ACTIONS:
                Q[s][a] = 0
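
    # The source page cuts this example off here. For context, a minimal
    # first-visit Monte Carlo control loop that this setup typically feeds
    # is sketched below; play_game and N_EPISODES are hypothetical names
    # that do not appear in the snippet above:
    #
    #     returns = {}                                    # (s, a) -> list of G samples
    #     for _ in range(N_EPISODES):
    #         seen = set()
    #         for s, a, G_t in play_game(grid, policy):   # episode as (s, a, G) triples
    #             if (s, a) not in seen:                  # first-visit check
    #                 seen.add((s, a))
    #                 returns.setdefault((s, a), []).append(G_t)
    #                 Q[s][a] = np.mean(returns[(s, a)])  # mean of sampled returns
    #         for s in policy.keys():                     # greedy policy improvement
    #             policy[s] = max_dict(Q[s])[0]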

# Example 2

        return np.random.choice(ALL_ACTIONS)
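
# The function above is truncated by the source page; its shape matches a
# standard epsilon-greedy action selector. A minimal sketch under that
# assumption (random_action_sketch is a hypothetical name):
def random_action_sketch(a, eps=0.1):
    # Exploit the suggested action with probability 1 - eps,
    # otherwise explore uniformly over all actions
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice(ALL_ACTIONS)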


# Return the argmax key from a dictionary, along with its value
def max_dict(d):
    max_k, max_v = None, float('-inf')
    for k, v in d.items():
        if v > max_v:
            max_k = k
            max_v = v
    return max_k, max_v


if __name__ == '__main__':

    maze = negative_maze()
    print("Values: ")
    print_values(maze.rewards, maze)

    states = maze.all_states()

    # Initialize Q for all states and actions
    Q = {}
    for s in states:
        Q[s] = {}
        for a in ALL_ACTIONS:
            Q[s][a] = 0

    # Per-state counters used to decay the learning rate after each update.
    # The loop body is cut off in the source; a plain initialization to 1.0
    # (assumed) completes it:
    state_lr_decay = {}
    for s in states:
        state_lr_decay[s] = 1.0
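
    # Sketch of the decayed-learning-rate update this setup typically feeds
    # (assumed; the source page cuts the example off here). ALPHA, GAMMA,
    # s, a, r, and s2 are hypothetical names in this sketch:
    #
    #     alpha = ALPHA / state_lr_decay[s]       # shrink step size with visits
    #     state_lr_decay[s] += 0.005
    #     _, max_q_s2 = max_dict(Q[s2])           # greedy value of the next state
    #     Q[s][a] = Q[s][a] + alpha * (r + GAMMA * max_q_s2 - Q[s][a])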

# Example 3

import numpy as np
from Maze import negative_maze, print_values, print_policy

# Minimum threshold used to check convergence of the value updates
THRESHOLD = 1e-3
# Discount factor: future rewards are weighted less than immediate ones
GAMMA = 0.9
ALL_ACTIONS = ('U', 'D', 'L', 'R')

if __name__ == '__main__':

    # Make a grid object
    grid = negative_maze()
    print("Rewards: ")
    print_values(grid.rewards, grid)

    # Randomly initialize a policy for all playable states
    policy = {}
    states = grid.all_states()
    for s in states:
        if s in grid.actions.keys():
            policy[s] = np.random.choice(ALL_ACTIONS)
    print("Random initial Policy")
    print_policy(policy, grid)

    # Randomly initialize V for all playable states; terminal states get value 0
    V = {}
    for s in states:
        if s in grid.actions.keys():
            V[s] = np.random.random()
        else:
            V[s] = 0
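
    # Sketch of the policy iteration loop this setup typically feeds
    # (assumed; the excerpt ends here). grid.set_state, grid.move, and
    # grid.current_state are assumed Maze methods not shown above:
    #
    #     while True:
    #         # policy evaluation: sweep V under the current policy
    #         while True:
    #             delta = 0
    #             for s in states:
    #                 if s in policy:
    #                     old_v = V[s]
    #                     grid.set_state(s)
    #                     r = grid.move(policy[s])
    #                     V[s] = r + GAMMA * V[grid.current_state()]
    #                     delta = max(delta, abs(V[s] - old_v))
    #             if delta < THRESHOLD:
    #                 break
    #         # policy improvement: act greedily with respect to V
    #         stable = True
    #         for s in policy:
    #             old_a, best_v = policy[s], float('-inf')
    #             for a in ALL_ACTIONS:
    #                 grid.set_state(s)
    #                 r = grid.move(a)
    #                 v = r + GAMMA * V[grid.current_state()]
    #                 if v > best_v:
    #                     best_v, policy[s] = v, a
    #             if policy[s] != old_a:
    #                 stable = False
    #         if stable:
    #             break
    #     print_policy(policy, grid)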