Example #1
def policy_iteration(T, r, tot_states=12, gamma=0.999, epsilon=0.01):
    iteration = 0
    # initialize a random policy
    p = np.random.randint(0, 4, size=tot_states).astype(np.float32)
    # one state can never be entered
    p[5] = np.nan
    # do nothing in the terminal states
    p[3] = p[7] = -1
    # initial estimate of the state-value vector
    u = np.zeros(tot_states)
    while True:
        iteration += 1
        # 1 - evaluate the state values under the current policy
        u_0 = u.copy()
        u = return_policy_evaluation(p, u, r, T, gamma)
        # check whether it is time to stop
        delta = np.absolute(u - u_0).max()
        if delta < epsilon * (1 - gamma) / gamma:
            break
        for s in range(tot_states):
            if not np.isnan(p[s]) and not p[s] == -1:
                v = np.zeros((1, tot_states))
                v[0, s] = 1.0
                # 2 - for the current value estimates, find the best action and update the policy
                p[s] = return_expected_action(u, T, v)
        print_policy(p, shape=(3, 4))

    print_result(u, iteration, delta, gamma, epsilon)
    return p, u
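
The helpers return_policy_evaluation and return_expected_action are not part of this snippet; a minimal sketch of what they might look like, assuming four actions indexed 0-3 and a transition tensor of shape (tot_states, tot_states, 4), is:

import numpy as np

def return_policy_evaluation(p, u, r, T, gamma):
    # One sweep of iterative policy evaluation:
    # u(s) <- r(s) + gamma * sum_s' T[s, s', p(s)] * u(s')
    for s in range(len(u)):
        if not np.isnan(p[s]) and not p[s] == -1:
            a = int(p[s])
            u[s] = r[s] + gamma * np.sum(T[s, :, a] * u)
    return u

def return_expected_action(u, T, v):
    # Greedy action for the state encoded in the one-hot row vector v
    expected = [np.sum(np.multiply(u, np.dot(v, T[:, :, a]))) for a in range(4)]
    return np.argmax(expected)
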
Example #2
def main():
    grid = standard_grid(obey_prob=1.0, step_cost=None)
    print_values(grid.rewards, grid)
    V, Policy, Deltas = monte_carlo(grid)
    print_values(V, grid)
    print_policy(Policy, grid)
    plt.plot(Deltas)
    plt.show()
Example #3
def visit(inst, s, solved, values):
    # TODO: add your code here.
    # Make use of compute_greedy_action_and_value, sample_successor, and
    # check_solved.
    # Return the updated `solved` labeling and the updated value function `values`.
    return solved, values  # placeholder so the stub is syntactically valid; implement the TODO before running lrtdp


"""
Run the LRTDP algorithm until it converges.
"""
def lrtdp(inst, values):
    solved = { s: False for s in inst.states }
    iteration = 1
    while not solved[inst.init]:
        wait_for_input("Press enter for iteration {} of LRTDP...".format(iteration))
        solved, values = visit(inst, inst.init, solved, values)
        print("Values after iteration {}: ".format(iteration))
        print_values(inst, values)
        print("Solved after iteration {}: ".format(iteration))
        print_solved(inst, solved)
        iteration += 1
    return values


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'algorithm', choices=['rtdp', 'lrtdp'],
        help="Choose the algorithm."
    )
    args = parser.parse_args()

    inst = instance.get_example_instance()
    print(inst)

    values = {s: heuristic(inst, s) for s in inst.states}

    print("")
    print("Initial state-values:")
    print_values(inst, values)

    if args.algorithm == 'rtdp':
        values = rtdp(inst, values)
    elif args.algorithm == 'lrtdp':
        values = lrtdp(inst, values)
    else:
        sys.exit("Unknown algorithm")
    print("")

    print("Final values:")
    print_values(inst, values)

    policy = get_greedy_policy(inst, values)
    print("Corresponding final policy:")
    print_policy(inst, policy)
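
One possible way to fill in the visit stub from this example, following the standard LRTDP trial structure (greedy descent, then solved-labeling on the way back). compute_greedy_action_and_value, sample_successor, and check_solved are the helpers named in the TODO; inst.is_goal and the exact check_solved signature are assumptions here:

def visit(inst, s, solved, values):
    # Sketch only: run one greedy trial from s, backing up every visited state,
    # then try to label the visited states as solved, deepest state first.
    visited = []
    while not solved[s]:
        visited.append(s)
        if inst.is_goal(s):                      # assumed helper: stop the trial at a goal
            break
        a, v = compute_greedy_action_and_value(inst, s, values)
        values[s] = v                            # Bellman backup
        s = sample_successor(inst, s, a)         # sample an outcome of the greedy action
    while visited:
        s = visited.pop()
        solved, values = check_solved(inst, s, solved, values)  # assumed signature
        if not solved[s]:
            break
    return solved, values
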
Example #4
def main():
  grid = standard_grid(obey_prob=1.0, step_cost=None)

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  V, policy, deltas = monte_carlo(grid)

  print("final values:")
  print_values(V, grid)
  print("final policy:")
  print_policy(policy, grid)

  plt.plot(deltas)
  plt.show()
Example #5
def main():

    # Transition probability matrix for s -> s' when taking action a: p(s, s', a) = T[s, s', a]
    T = np.load("T.npy")

    # Vector of rewards the agent receives in each state
    r = np.array([
        -0.04, -0.04, -0.04, +1.0, -0.04, 0.0, -0.04, -1.0, -0.04, -0.04,
        -0.04, -0.04
    ])

    # run the Value Iteration algorithm to find the state-value vector
    u = value_iteration(T, r)
    # run the Policy Iteration algorithm to find the policy and the state-value vector
    p, u = policy_iteration(T, r)
    print_policy(p, shape=(3, 4))
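
value_iteration is not shown here; a minimal sketch, using the same T[s, s', a] convention and the same stopping rule as policy_iteration in Example #1, might look like:

import numpy as np

def value_iteration(T, r, tot_states=12, gamma=0.999, epsilon=0.01):
    # Repeatedly apply the Bellman optimality backup until the value change is small
    u = np.zeros(tot_states)
    while True:
        u_0 = u.copy()
        for s in range(tot_states):
            q = [np.sum(T[s, :, a] * u_0) for a in range(4)]
            u[s] = r[s] + gamma * max(q)
        if np.abs(u - u_0).max() < epsilon * (1 - gamma) / gamma:
            return u
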
Example #6
                Q[s][a] = np.mean(returns[sa])
                biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
                seen_state_action_pairs.add(sa)
        deltas.append(biggest_change)

        for s in policy.keys():
            a, _ = max_dict(Q[s])
            policy[s] = a
    V = {}
    for s in policy.keys():
        V[s] = max_dict(Q[s])[1]

    return V, policy, deltas


if __name__ == '__main__':
    grid = standard_grid(obey_prob=1.0, step_cost=None)

    print("rewards:")
    print_values(grid.rewards, grid)

    V, policy, deltas = monte_carlo(grid)

    print("final values:")
    print_values(V, grid)
    print("final policy:")
    print_policy(policy, grid)

    plt.plot(deltas)
    plt.show()
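
max_dict, used above to pick the greedy action and its value from Q[s], is not shown in this fragment; it can be as simple as:

def max_dict(d):
    # Return the (key, value) pair with the largest value in a dict
    best_key = max(d, key=d.get)
    return best_key, d[best_key]
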
Example #7
        # append the terminal state
        memory.append((observation_, action, reward))

        returns = 0
        last = True  # start at t = T - 1
        for state, action, reward in reversed(memory):
            if last:
                last = False
            else:
                states_actions_returns.append((state, action, returns))
            returns = DISCOUNT * returns + reward

        states_actions_returns.reverse()
        states_and_actions = []
        for state, action, returns in states_actions_returns:
            if (state, action) not in states_and_actions:
                PAIRS_VISITED[(state, action)] += 1
                RETURNS[(state,
                         action)] += ((1 / PAIRS_VISITED[(state, action)]) *
                                      (returns - RETURNS[(state, action)]))
                ESTIMATES[(state, action)] = RETURNS[(state, action)]
                states_and_actions.append((state, action))
                values = np.array(
                    [ESTIMATES[(state, a)] for a in GRID.possible_actions])
                best = np.argmax(values)
                POLICY[state] = GRID.possible_actions[best]

    print_estimates(ESTIMATES, GRID)
    print_policy(POLICY, GRID)
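
The RETURNS update above is the standard incremental mean: after the n-th first visit to a (state, action) pair, adding (1/n) * (G - mean) to the running mean is equivalent to averaging all n returns. A tiny standalone check with hypothetical sample returns:

import numpy as np

sample_returns = [1.0, 0.0, 2.0, -1.0]   # hypothetical returns for one (state, action) pair
running_mean, n = 0.0, 0
for g in sample_returns:
    n += 1
    running_mean += (1.0 / n) * (g - running_mean)
assert np.isclose(running_mean, np.mean(sample_returns))
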
Example #8
    for t in range(1, N):
        if t % 1000 == 0:
            print(t)
        s = (2, 0)
        grid.set_state(s)
        a, _ = greedy_from(Q[s])
        a = random_action(a, eps=0.1)
        while not grid.game_over():
            a = random_action(a, eps=(1.0/t))
            r = grid.move(a)
            s_prime = grid.current_state()
            a_prime, _ = greedy_from(Q[s_prime])
            q_sa = Q[s][a]
            num_seen_sa[(s, a)] += 1
            # print('Learning Rate: ', ALPHA/num_seen_sa[(s, a)])
            Q[s][a] = q_sa + (ALPHA/num_seen_sa[(s, a)]) * (r + GAMMA*Q[s_prime][a_prime] - q_sa)
            delta = np.abs(Q[s][a] - q_sa)
            deltas.append(delta)
            s = s_prime
            a = a_prime

    pi = {}
    for s in S:
        if not grid.is_terminal(s):
            pi[s], _ = greedy_from(Q[s])
    print('Results:')
    print_policy(pi, grid)
    print(len(deltas))
    plt.plot(deltas)
    plt.show()
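
greedy_from and random_action are not shown in this snippet; a minimal sketch, assuming Q[s] is a dict mapping actions to values and ALL_POSSIBLE_ACTIONS is the global action list used in the other examples, is:

import numpy as np

def greedy_from(q_s):
    # Greedy action and its value from a dict of action -> Q-value
    best_a = max(q_s, key=q_s.get)
    return best_a, q_s[best_a]

def random_action(a, eps=0.1):
    # Epsilon-greedy exploration: keep a with probability 1 - eps,
    # otherwise pick a uniformly random action
    if np.random.random() < eps:
        return np.random.choice(ALL_POSSIBLE_ACTIONS)
    return a
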
Example #9
            observation = observation_
        memory.append((observation, action, reward))

        returns = 0
        relative_probability = 1
        last = True
        for (state, action, reward) in reversed(memory):
            if last:
                last = False
            else:
                C_ESTIMATES[(state, action)] += relative_probability
                ESTIMATES[(state, action)] += (
                    (relative_probability / C_ESTIMATES[(state, action)])
                    * (returns-ESTIMATES[(state, action)])
                )
                vals = np.array([
                    ESTIMATES[(state, a)] for a in GRID.possible_actions
                ])
                argmax = np.argmax(vals)
                TARGET_POLICY[state] = GRID.possible_actions[argmax]
                if action != TARGET_POLICY[state]:
                    break
                if len(behavior_policy[state]) == 1:
                    prob = 1 - EPSILON
                else:
                    prob = EPSILON / len(behavior_policy[state])
                relative_probability *= 1 / prob
            returns = DISCOUNT * returns + reward

    print_policy(TARGET_POLICY, GRID)
Example #10
def calculate_greedy_policy(grid, V):
    P = dict()
    for state in grid.non_terminal_states():
        P[state] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    for state in grid.non_terminal_states():
        best_action, _ = best_action_value(grid, V, state)
        P[state] = best_action
    return P


if __name__ == '__main__':
  grid = standard_grid(obey_prob=0.8, step_cost=None)

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  # calculate accurate values for each square
  V = calculate_values(grid)

  # calculate the optimum policy based on our values
  P = calculate_greedy_policy(grid, V)

  # our goal here is to verify that we get the same answer as with policy iteration
  print("values:")
  print_values(V, grid)
  print("policy:")
  print_policy(P, grid)
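
best_action_value is not included in this snippet; a minimal sketch of a greedy one-step lookahead, assuming the grid object exposes get_transition_probs(action) returning (probability, reward, next_state) triples and that GAMMA is the discount factor, is:

def best_action_value(grid, V, state):
    # Greedy one-step lookahead: evaluate each action's expected return and keep the best
    best_a, best_value = None, float('-inf')
    grid.set_state(state)
    for a in ALL_POSSIBLE_ACTIONS:
        expected_return = 0.0
        for prob, r, next_state in grid.get_transition_probs(a):  # assumed helper
            expected_return += prob * (r + GAMMA * V[next_state])
        if expected_return > best_value:
            best_value, best_a = expected_return, a
    return best_a, best_value
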


Example #11
        # Convert state path:
        path_1d = gw.convert_state_log_2d_to_1d(path_2d)

        dem_paths.append(path_1d)

    utils.print_value(v_states)

    # Create a grid to show the optimal policy:
    # 0: stay, 1: north, 2:east, 3: south, 4: west
    grid_pol = copy.copy(gw.grid)
    for s_i in range(grid_pol.shape[0]):
        for s_j in range(grid_pol.shape[1]):
            s_1d = gw.convert_state_2d_to_1d((s_i, s_j))
            a_opt = policy_opt[s_1d]
            grid_pol[s_i, s_j] = a_opt
    utils.print_policy(grid_pol, gw.actions_2d)

    # # Create a grid to show the optimal policy:
    # # 0: stay, 1: north, 2:east, 3: south, 4: west
    # grid_pol = copy.copy(gw.grid);
    # for s_i in range(grid_pol.shape[0]):
    #     for s_j in range(grid_pol.shape[1]):
    #         s_1d = gw.convert_state_2d_to_1d((s_i, s_j));
    #         a_opt = policy_opt[s_1d];
    #         grid_pol[s_i,s_j] = a_opt;

    # for path_1d in dem_paths:

    #     grid_path = copy.copy(gw.grid);
    #     steps = 0;
    #     for state_1d in path_1d:
Example #12
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('algorithm',
                        choices=['rtdp', 'lrtdp'],
                        help="Choose the algorithm.")
    args = parser.parse_args()

    inst = instance.get_example_instance()
    print(inst)

    values = {s: heuristic(inst, s) for s in inst.states}

    print("")
    print("Initial state-values:")
    print_values(inst, values)

    if args.algorithm == 'rtdp':
        values = rtdp(inst, values)
    elif args.algorithm == 'lrtdp':
        values = lrtdp(inst, values)
    else:
        sys.exit("Unknown algorithm")
    print("")

    print("Final values:")
    print_values(inst, values)

    policy = get_greedy_policy(inst, values)
    print("Corresponding final policy:")
    print_policy(inst, policy)
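
rtdp, the unlabeled counterpart of lrtdp, is not shown; a minimal sketch that runs a fixed number of greedy trials from the initial state, reusing compute_greedy_action_and_value and sample_successor from the earlier example (inst.is_goal and the trial limits are assumptions), could be:

def rtdp(inst, values, num_trials=10, max_steps=100):
    # Plain RTDP: repeatedly run greedy trials from the initial state,
    # performing a Bellman backup on every state visited along the way.
    for _ in range(num_trials):
        s = inst.init
        for _ in range(max_steps):
            if inst.is_goal(s):                  # assumed helper
                break
            a, v = compute_greedy_action_and_value(inst, s, values)
            values[s] = v                        # Bellman backup
            s = sample_successor(inst, s, a)
    return values
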