Example #1
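    # iterative policy evaluation: estimate V(s) for a uniformly random policy on the gridworld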
    states = grid.all_states

    # V holds the value estimate for each state
    V = {}
    for s in states:
        V[s] = 0

    gamma = 1.0

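    # sweep over all states repeatedly until the largest value change falls below SMALL_ENOUGH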
    while True:
        biggest_change = 0
        for s in states:
            old_v = V[s]

            if s in grid.location_to_action:
                # calculate the new value, given the chance of each possible move from this state/location
                new_v = 0

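                # each available action is chosen with equal probability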
                p_a = 1.0 / len(grid.location_to_action[s])
                for a in grid.location_to_action[s]:
                    grid.set_state(s)
                    r = grid.move(a)
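                    # accumulate p(a) * (immediate reward + gamma * value of the resulting state)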
                    new_v += p_a * (r + gamma * V[grid.current_state])
                V[s] = new_v
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))

        if biggest_change < SMALL_ENOUGH:
            break

    print_values(V, grid)
Example #2
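            # policy evaluation for a fixed deterministic policy: V(s) = r + GAMMA * V(s')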
            if s in policy:
                # take the action dictated by the fixed policy
                r = grid.get_reward(s, policy[s])
                V[s] = r + GAMMA * V[grid.current_state]
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))

        if biggest_change < SMALL_ENOUGH:
            break


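# policy iteration on the negative-reward gridworld: evaluate the policy, then improve it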
if __name__ == '__main__':
    grid = negative_grid(-0.3)

    print("rewards:")
    print_values(grid.location_to_rewards, grid)

    # initialize a random policy, then improve it iteratively
    policy = {}
    for s in grid.location_to_action.keys():
        policy[s] = np.random.choice(grid.location_to_action[s])

    print("initial policy:")
    print_policy(policy, grid)

    V = initalize_V(grid)
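    # alternate policy evaluation and greedy policy improvement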
    while True:
        # evaluate the current policy to obtain V
        evalulate_v_for_policy(policy, grid, V)
        # policy improvement: change the policy to the action with the biggest V
Example #3
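            # tail of a TD-control training loop: advance to the next state/action and log the update size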
            s = s2
            a = a2

            # for logging only
            logging_update_counts[s] = logging_update_counts.get(s, 0) + 1
            log_biggest_change = max(log_biggest_change,
                                     np.abs(log_old_qsa - Q[s][a]))
        logging_deltas.append(log_biggest_change)

    plt.plot(logging_deltas)
    plt.show()

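    # extract the greedy policy and state values from the learned Q-table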
    policy = {}
    V = {}
    for s in grid.location_to_action.keys():
        policy[s] = get_best_action_from_q(Q, s, grid)
        V[s] = Q[s][policy[s]]

    # what's the proportion of time we spend updating each part of Q?
    print("update counts:")
    total = np.sum(list(logging_update_counts.values()))
    for k, v in logging_update_counts.items():
        logging_update_counts[k] = float(v) / total
    print_values(logging_update_counts, grid)

    print("values:")
    print_values(V, grid)
    print("policy:")
    print_policy(policy, grid)
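            # semi-gradient Q-learning with a linear model: step theta toward the TD target r + gamma * max_a' Q(s', a')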
            else:
                Qs2 = getQs(model, s2)
                a2, MaxQs2a2 = max_dict(Qs2)
                a2 = random_action(a2, eps=0.5 / t)
                model.theta += alpha * (r + gamma * MaxQs2a2 - model.predict(
                    s, a)) * model.grad(s, a)
                s = s2
                a = a2

            delta = max(delta, np.abs(old_theta - model.theta).sum())

        deltas.append(delta)

    plt.plot(deltas)
    plt.show()

    # find the greedy policy and value function from the learned Q
    Policy = {}
    V = {}
    Q = {}
    for s in g.actions.keys():
        Q[s] = getQs(model, s)
        a, max_q = max_dict(Q[s])
        Policy[s] = a
        V[s] = max_q

    print("Values")
    print_values(V, g)
    print("Policy")
    print_policy(Policy, g)
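    # walk the episode backwards to compute the return G following each state; the terminal state is skipped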
    first = True
    for s, r in reversed(states_and_rewards):
        if first:
            first = False
        else:
            states_and_returns.append((s, G))
        G = r + gamma * G
    states_and_returns.reverse()
    return states_and_returns


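# Monte Carlo prediction: estimate V(s) for a fixed policy on the standard gridworld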
if __name__ == "__main__":
    g = standard_grid()

    print('rewards:')
    print_values(g.rewards, g)

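    # fixed deterministic policy to evaluate: state (row, col) -> action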
    Policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'U',
        (2, 1): 'L',
        (2, 2): 'U',
        (2, 3): 'L',
    }

    # initialize V(s) and returns
    V = {}