Printing out the actions that will be taken at each
    place on the grid, according to the policy.
    """
    for i in range(grid.height):
        print("-" + "-------" * grid.width)
        for j in range(grid.width):
            if not j:
                print("|", end="")  # begin row with vertical line
            a = policy.get((i, j), ' ')
            print("   %s  |" % a, end="")
        print("")  # new line
    print("-" + "-------" * grid.width)


if __name__ == '__main__':
    the_grid = standard_grid()
    states = the_grid.all_states()

    # # # policy with uniformly random actions # # #
    V = {}
    for s in states:
        V[s] = 0  # initialize all state values to 0
    gamma = 1  # discount factor

    # repeat until convergence
    while True:
        biggest_change = 0
        for s in states:
            old_v = V[s]  # keep track so we can measure change

            # terminal states have no value (no future returns)
    def calculate_state_values(self):
        """
        Calculate state-value function from state-action value function.
        For each state s, V(s) = max over all actions a of Q(s, a).
        """
        visited = set()
        for (s, _) in self.Q.keys():
            if s not in visited:
                visited.add(s)
                self.V[s] = np.max(
                    [self.Q.get((s, a), 0) for a in ALL_ACTIONS])
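    # A small illustration with hypothetical Q-values (not from the source):
    # if self.Q[(s, 'U')] = 0.5, self.Q[(s, 'R')] = 0.8 and the remaining
    # actions were never visited (so .get() falls back to 0), then
    # calculate_state_values() sets self.V[s] = 0.8, i.e. V(s) = max_a Q(s, a).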


if __name__ == '__main__':
    the_grid = standard_grid(step_cost=-0.1, windy=True)

    # print rewards associated with transitioning into each state on the grid
    print("Rewards:")
    the_grid.display_rewards()

    # Learn using either SARSA or off-policy Q-learning as the control strategy.
    # Both give very similar results (the optimal value function and policy are
    # found), though off-policy Q-learning takes longer since the agent is not
    # actively trying to reach the goal while it learns.
    SARSA = True
    if SARSA:
        # SARSA follows an explore-exploit strategy, so agent moves through
        # environment semi-greedily (epsilon-greedy here) according to policy.
        policy = TemporalDifferencePolicy(the_grid, alpha=.1, gamma=.9)
        policy.Qlearning(1000, SARSA=SARSA)
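        # For reference, a sketch of how the two tabular TD updates differ
        # (alpha, gamma and the (s, a, r, s2, a2) transition follow the usual
        # conventions; the exact internals of TemporalDifferencePolicy are
        # assumed, not shown in this file):
        #   SARSA (on-policy):
        #     Q[(s, a)] += alpha * (r + gamma * Q[(s2, a2)] - Q[(s, a)])
        #   Q-learning (off-policy):
        #     Q[(s, a)] += alpha * (r + gamma * max(Q[(s2, a2)] for a2 in actions) - Q[(s, a)])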
def main(grid_type='negative'):
    # NOTE: every p(s',r|s,a) is deterministic (1 or 0)
    if grid_type == 'negative':
        # get the grid:
        grid = negative_grid()

    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)  # prints any dict with
    # a tuple of numbers as the key
    # and a number as the value

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        V[s] = 0
        # or perform a random initialization:
        # if s in grid.actions: # if not a terminal state
        # 	V[s] = np.random.random()
        # else:
        # 	# terminal
        # 	V[s] = 0
    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement:
    # repeat until convergence:
    i = 0
    while True:

        # STEP 2A: iterative policy evaluation
        while True:
            # NOTE: all of the actions, next states and rewards
            #       are considered deterministic

            max_change = 0
            for s in states:
                old_v = V[s]  # save the old value of the state

                # check if not a terminal state:
                if s in grid.actions:
                    grid.set_state(s)

                    # take an action according to the policy and get the reward:
                    a = policy[s]
                    r = grid.move(a)

                    # the "look-ahead" - get the value of the next state, s_prime:
                    s_prime = grid.current_state
                    # s_prime is needed in order to calculate
                    # the value of the current state - the Bellman equation:
                    V[s] = r + GAMMA * V[s_prime]

                    # update max_change:
                    max_change = max(max_change, np.abs(V[s] - old_v))

            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy iteration
        # for each state we take an action according to the policy
        # and check whether there is a better action - take all possible
        # actions from that state and calculate the values;
        # we choose the action that results in the max value of the state.
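        # in other words, a greedy improvement step (sketch, with the
        # deterministic transitions noted above):
        #   pi(s) <- argmax over a of [ r(s, a) + GAMMA * V(s') ]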
        policy_improved = False
        for s in states:

            # check if not a terminal-state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!

                # save the old policy:
                old_a = policy[s]

                max_v = float('-inf')  # any real value will beat this

                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    grid.set_state(s)

                    # take an action, receive your keto-chocolate bar:
                    r = grid.move(a)

                    s_prime = grid.current_state
                    new_v = r + GAMMA * V[s_prime]

                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a
                        # change the policy:
                        policy[s] = better_a

                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if policy has changed, we need to recalculate the values of all states -
        # get back to STEP 2A;
        # else - we're done!
        # and since the policy's not changed, the values remain the same:
        if not policy_improved:
            break

        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)

    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
                    p_a = .5 / 3  # ~16.7% chance of moving in each of the other 3 directions
                r = grid.move(a)  # move and get the associated reward
                new_v += p_a * (r + gamma * V.get(the_grid.get_state(), 0))
            V[s] = new_v  # update the value of the state
            biggest_change = max(biggest_change, np.abs(old_v - V[s]))

        if biggest_change < convergence_threshold:
            break

    return V


if __name__ == '__main__':
    # The agent will try to end the game as quickly as possible with step costs
    # this high, even if that means entering the negative terminal state.
    the_grid = standard_grid(step_cost=-1)
    all_actions = list(the_grid.moves.keys())

    # print rewards
    print("Rewards:")
    display_values(the_grid.rewards, the_grid)
    print("")

    # state -> action
    # randomly choose an action and update as we learn
    policy = {}
    for s, options in the_grid.actions.items():
        policy[s] = np.random.choice(options)
    print("Randomly initialized policy:")
    display_policy(policy, the_grid)
def main(grid_type='negative'):
    # NOTE: every p(s',r|s,a) is now random, i.e. lies in [0,1],
    #       but the policy is deterministic!
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)

    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)  # prints any dict with
    # a tuple of numbers as the key
    # and a number as the value

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        # V[s] = 0
        # or perform a random initialization:
        if s in grid.actions:  # if not a terminal state
            V[s] = np.random.random()
        else:
            # terminal
            V[s] = 0
    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement
    #         with random state-transitions:
    # repeat until convergence:
    i = 0
    while True:

        # STEP 2A: iterative policy evaluation
        while True:
            max_change = 0

            for s in states:
                old_v = V[s]  # save the old value of the state
                new_v = 0

                # check if not a terminal state:
                if s in grid.actions:

                    for a in ALL_POSSIBLE_ACTIONS:

                        grid.set_state(s)

                        # possible_actions = list(grid.actions[s])
                        # print('\npossible actions from the state (%d, %d):' % grid.current_state)
                        # print(possible_actions)

                        if a == policy[s]:
                            # take this action with the probability p(a|s)=P_A:
                            p_s_prime_and_r = P_A

                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)
                            # same as: p(s',r|s,!policy[s])

                        # move in the chosen direction:
                        r = grid.move(a)

                        # the "look-ahead" - get the value of the next state, s_prime:
                        s_prime = grid.current_state
                        # s_prime is needed in order to calculate
                        # the value of the current state - the Bellman equation:
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])
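                        # (accumulating the Bellman expectation for the
                        #  deterministic policy pi under "windy" transitions:
                        #  V(s) = sum over outcomes of p(s',r|s,pi(s)) * [r + GAMMA * V(s')],
                        #  where the intended action happens with probability P_A
                        #  and each other action with (1 - P_A) / (|A| - 1))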

                V[s] = new_v

                # update max_change:
                max_change = max(max_change, np.abs(V[s] - old_v))

            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy iteration
        # for each state we take an action according to the policy
        # and check whether there is a better action - take all possible
        # actions from that state and calculate the values, but now we also
        # take into account that our state-transitions are random!!!
        # we then choose the action that results in the max value of the state.
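        # sketch of this improvement step under the random transitions:
        #   pi(s) <- argmax over intended a of
        #            sum over a' of p(a' executed | a) * [ r(s, a') + GAMMA * V(s') ],
        #   where p(a' executed | a) = P_A if a' == a, else (1 - P_A) / (|A| - 1)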
        policy_improved = False
        for s in states:

            # check if not a terminal-state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!

                # save the old policy:
                old_a = policy[s]

                max_v = float('-inf')  # any real value will beat this

                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    new_v = 0  # accumulate the expected value of taking action a

                    for another_a in ALL_POSSIBLE_ACTIONS:
                        grid.set_state(s)

                        # since the state-transitions are random,
                        # we check if the action is desired:
                        if another_a == a:
                            # take this action with probability P_A:
                            p_s_prime_and_r = P_A

                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)

                        # take an action, receive your keto-chocolate bar:
                        r = grid.move(another_a)

                        s_prime = grid.current_state
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])

                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a
                        # change the policy:
                        policy[s] = better_a

                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if policy has changed, we need to recalculate the values of all states -
        # get back to STEP 2A;
        # else - we're done!
        # and since the policy's not changed, the values remain the same:
        if not policy_improved:
            break

        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)

    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
def main():
    grid = standard_grid()

    states = grid.all_states()

    V = {}
    for s in states:
        V[s] = 0
    gamma = 1.0

    while True:
        biggest_change = 0
        for s in states:
            old_v = V[s]

            if s in grid._actions:
                new_v = 0
                p_a = 1.0 / len(grid._actions[s])
                for a in grid._actions[s]:
                    grid.set_state(s)
                    r = grid.move(a)
                    new_v += p_a * (r + gamma * V[grid.current_state()])
                V[s] = new_v
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))
        # check convergence after a full sweep over the states:
        if biggest_change < SMALL_ENOUGH:
            break

    print("values for uniform random actions")
    print_values(V, grid)
    print("\n\n")

    # fixed (deterministic) policy: state -> action
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
    print_policy(policy, grid)

    # re-initialize V(s) = 0
    V = {}
    for s in states:
        V[s] = 0

    gamma = .9

    # repeat until convergence
    while True:
        biggest_change = 0
        for s in states:
            old_v = V[s]

            # V(s) only has a value if it's not a terminal state
            if s in policy:
                a = policy[s]
                grid.set_state(s)
                r = grid.move(a)
                V[s] = r + gamma * V[grid.current_state()]
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))

        if biggest_change < SMALL_ENOUGH:
            break

    print("values for a fixed policy:")
    print_values(V, grid)
    for s in states:
        evaluate the value of the state
        delta = max(delta, |new_value - old_value|)
    if delta < threshold:
        break
return values
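(each sweep recomputes V(s) from the Bellman equation; iteration stops once
the largest per-sweep change, delta, drops below the threshold)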

TODO: Run all the code and analyse it.
'''
import numpy as np
from gridworld import standard_grid
import prettytable as pt
import math

# import the standard grid and define some variables
grid = standard_grid()  # the standard grid has rewards only at the terminal
                        # states and 0 rewards for all other states
rewards = grid.rewards
tolerance = 1e-3  # convergence tolerance

gamma = 1.0  # the discount factor

# initialize value for each state
states = grid.all_states()
values = {st:0.00 for st in states} # set to be 0 for each state

def print_in_gridworld(v):
    '''
        Function to print the values in the gridworld
    '''
    # making the grid list
    out = []
def main(policy='uniform'):
    # let's find value function V(s), given a policy p(a|s).
    #
    # recall that there are 2 different policies:
    # 1) completely random policy;
    # 2) completely deterministic (fixed) policy.
    # we are going to find value function for both.
    #
    # NOTE:
    # there are 2 probability distributions in the Bellman equation:
    # 1) p(a|s) - the policy, defines what action to take given the state;
    # 2) p(s',r|s,a) - state-transition probability,
    #                  defines the next state and reward
    #                  given a state-action pair.
    # here we only need to model p(a|s) - the state-transitions, p(s',r|s,a), are deterministic.
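    # as a sketch, the Bellman expectation equation we iterate below:
    #   V(s) = sum_a p(a|s) * [ r(s, a) + gamma * V(s') ],
    # where s' is the (deterministic) state that grid.move() lands in.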
    grid = standard_grid()

    # the states will be positions (i, j).
    # gridworld is simpler than tic-tac-toe, because there's only one player
    # (i.e., a robot) that can only be at one position at a time.
    states = grid.all_states

    if policy == 'uniform':
        #################### 1) UNIFORM POLICY ####################
        # initialize V(s) to 0:
        V = {}
        for s in states:
            V[s] = 0

        # define the discount factor:
        gamma = 1.0

        i = 0
        # repeat until convergence:
        while True:
            max_change = 0  # max change for the current iteration

            for s in states:
                # keep a copy of old V(s), s.t. we can keep track
                # of the magnitude of each change:
                old_v = V[s]

                # NOTE: V(terminal_state) has no value:
                # check if not a terminal state:
                if s in grid.actions:
                    # accumulate the value of this state:
                    new_v = 0
                    # we consider a UNIFORM policy,
                    # i.e., the probability of taking any action is the same;
                    p_a = 1.0 / len(grid.actions[s])
                    # loop over all possible actions that can be taken
                    # from the current state, s:

                    for a in grid.actions[s]:
                        # set our current state on the grid:
                        grid.set_state(s)

                        # make a move to get the reward, r, and next state, s':
                        r = grid.move(a)
                        s_prime = grid.current_state
                        # for debugging:
                        #print('s:', s, 's_prime:', s_prime, 'r:', r)

                        # calculate (basically, accumulate) the Bellman equation:
                        new_v += p_a * (r + gamma * V[s_prime])

                    # update the value of the current state
                    V[s] = new_v

                    # update max_change:
                    max_change = max(max_change, np.abs(old_v - V[s]))
            i += 1
            # check if converged:
            if max_change < THRESHOLD:
                break

        print('iterations to converge:', i, '\n')
        print('values for uniform policy:')
        print_values(V, grid)

    else:
        #################### 2) FIXED POLICY ####################
        # define our policy:
        policy = {
            (0, 0): 'R',
            (0, 1): 'R',
            (0, 2): 'R',
            (1, 0): 'U',
            (1, 2): 'R',
            (2, 0): 'U',
            (2, 1): 'R',
            (2, 2): 'R',
            (2, 3): 'U',
        }

        # display the policy:
        print('the policy:')
        print_policy(policy, grid)
        print('\n')

        # initialize V(s) to 0:
        V = {}
        for s in states:
            V[s] = 0

        # define the discount factor:
        gamma = 0.9  # so now the further we get away from the winning state,
        # the smaller V(s) should be
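        # rough illustration (standard grid, zero step rewards): the state right
        # next to the +1 goal gets V(s) = 1 + 0.9 * 0 = 1 (terminal states have
        # V = 0), the state two moves away gets 0 + 0.9 * 1 = 0.9, then 0.81, etc.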

        # repeat until convergence:
        i = 0
        while True:
            max_change = 0  # maximum change for the current iteration
            # print('i:', i)
            for s in states:
                # copy the value of the current state
                old_v = V[s]

                # NOTE: V(terminal_state) has no value:
                if s in policy:
                    # set our state:
                    grid.set_state(s)

                    # take the action and receive a reward:
                    a = policy[s]
                    r = grid.move(a)
                    s_prime = grid.current_state
                    # for debugging:
                    # print('s:', s, 's_prime:', s_prime, 'r:', r)

                    # update the value of the state:
                    V[s] = r + gamma * V[s_prime]

                    # update the maximum change:
                    max_change = max(max_change, np.abs(old_v - V[s]))
            i += 1
            # check if converged:
            if max_change < THRESHOLD:
                break

        print('iterations to converge:', i, '\n')
        print('values for fixed policy:')
        print_values(V, grid)
def main(grid_type='negative'):
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)

    else:
        # assuming the standard grid:
        grid = standard_grid()

    # display rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)

    states = grid.all_states

    # STEP 1: randomly initialize the value function, V(s):
    V = {}  # the values
    for s in states:
        # as an option, initialize to 0:
        # V[s] = 0

        # check if not a terminal state:
        if s in grid.actions:
            V[s] = np.random.random()
        else:
            V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    # STEP 2: value iteration
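    # sketch of the Bellman optimality update applied in the loop below
    # (deterministic transitions, as in the rest of this script):
    #   V(s) <- max over a of [ r(s, a) + GAMMA * V(s') ]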
    while True:
        max_change = 0

        for s in states:
            old_v = V[s]

            # if we're not in a terminal state:
            if s in grid.actions:
                # choose an action that results in the maximum value
                # for this state:
                best_v = float('-inf')
                # best_a = np.random.choice(ALL_POSSIBLE_ACTIONS)

                for a in ALL_POSSIBLE_ACTIONS:
                    # arrive in the state:
                    grid.set_state(s)

                    # take the action and receive the reward:
                    r = grid.move(a)

                    # calculate the Bellman equation:
                    v = r + GAMMA * V[grid.current_state]

                    if v > best_v:
                        best_v = v
                        # p[s] = a      # we'll do it in another loop later

                # update the value of this state:
                V[s] = best_v

                # update the maximum change:
                max_change = max(max_change, np.abs(old_v - V[s]))

        # check if converged:
        if max_change < THRESHOLD:
            break

    # STEP 3: take our optimal value function
    #         and find our optimal policy
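    # sketch: one greedy sweep over the converged values,
    #   p(s) <- argmax over a of [ r(s, a) + GAMMA * V(s') ]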
    p = {}  # the policy
    for s in states:
        best_a = None
        best_v = float('-inf')

        # if not a terminal state:
        if s in grid.actions:
            # find the best action:
            for a in ALL_POSSIBLE_ACTIONS:
                grid.set_state(s)
                r = grid.move(a)
                v = r + GAMMA * V[grid.current_state]

                if v > best_v:
                    best_v = v
                    best_a = a

            p[s] = best_a

    # optimal values:
    print('\noptimal values:')
    print_values(V, grid)

    # optimal policy:
    print('\noptimal policy:')
    print_policy(p, grid)
"""
    This file contains the implementation of the Policy Iteration algorithm.
"""

import numpy as np
from gridworld import standard_grid, ACTION_SPACE
# from IterativePolicyEvaluation_probabilistic import print_in_gridworld, print_policy

grid = standard_grid()  # initializing the grid
gamma = 0.9
tol = 1e-3


def print_in_gridworld(v):
    '''
        Function to print the values in the gridworld
    '''
    # making the grid list
    out = []
    for i in range(7):          # 2 * n_rows + 1 lines (separator rows included); 3x4 grid assumed
        if i % 2 != 0:
            inv = []
            for j in range(9):  # 2 * n_cols + 1 entries, including separators
                if j % 2 == 0:
                    inv.append("|")
                else:
                    if (i // 2, j // 2) == (1, 1):
                        # (1, 1) is treated as the blocked cell, so show 0.00
                        inv.append(0.00)
                    else:
                        _ = v[(i // 2, j // 2)]
                        # _ = round(_, 2)