Example #1
import numpy as np

def one_random_step(i, j, sim):
    # pick one of the allowed actions at (i, j) uniformly at random
    A = return_pointwise_A(i, j, sim)
    nA = np.size(A, 0)
    chosen_action_id = np.random.randint(low=0, high=nA)
    action = A[chosen_action_id]
    iprime, jprime = np.array([i, j]) + action
    return iprime, jprime
def one_step(i, j, pi, sim):
    # take the action prescribed by the policy pi at state (i, j)
    A = return_pointwise_A(i, j, sim)
    chosen_action_id = int(pi[i, j])  # pi is stored as a float array, so cast before indexing
    action = A[chosen_action_id]
    iprime, jprime = np.array([i, j]) + action

    # check for overlaps: the move is rejected if both particles land on the same site
    # (this rule can be modified)
    overlap = iprime == jprime

    # check for swaps: adjacent particles may not exchange positions in one step
    # (swap is always False anyway when the particles move simultaneously, i.e. when sim == True)
    swap = False
    if i == j - 1 and jprime == iprime - 1:
        swap = True
    if i == j + 1 and jprime == iprime + 1:
        swap = True

    # sites 9 and 2 are also connected through the "hole" (cf. one_directed_step),
    # so swapping across that connection is forbidden as well
    if i == 2 and j == 9 and iprime == 9 and jprime == 2:
        swap = True
    if i == 9 and j == 2 and iprime == 2 and jprime == 9:
        swap = True

    if swap or overlap:
        # the move is rejected and the particles stay where they are
        action_allowed = False
        iprime = i
        jprime = j
    else:
        action_allowed = True

    return iprime, jprime
def one_directed_step(i, j, sim):
    # heuristic step: prefer actions that move particle i up and particle j down,
    # with a small amount of random exploration

    # site 9 is connected to site 2 through the "hole"
    i_in_the_hole = (i == 9)
    j_in_the_hole = (j == 9)

    A = return_pointwise_A(i, j, sim)
    rand = np.random.rand()
    nA = np.size(A, 0)

    # check which of the "progressive" actions are currently allowed
    exist_p_m = exist_p_0 = exist_0_m = False
    for k in range(0, nA):
        if np.array_equal(A[k], np.array([1, -1])):
            exist_p_m = True  # both particles can progress: i += 1, j -= 1
        if np.array_equal(A[k], np.array([1, 0])):
            exist_p_0 = True  # only particle i can progress
        if np.array_equal(A[k], np.array([0, -1])):
            exist_0_m = True  # only particle j can progress

    # the order of these if statements matters: later assignments override earlier ones,
    # so moving both particles ([1, -1]) is preferred over moving only one of them
    if exist_p_0:
        action = np.array([1, 0])
        if j_in_the_hole:
            action = np.array([1, -7])  # j jumps through the hole from site 9 to site 2
    if exist_0_m:
        action = np.array([0, -1])
        if i_in_the_hole:
            action = np.array([-7, -1])  # i jumps through the hole from site 9 to site 2
    if exist_p_m:
        action = np.array([1, -1])

    if exist_p_0 or exist_0_m or exist_p_m:
        # with probability 0.2, explore a random allowed action instead
        if rand < 0.2:
            chosen_action_id = np.random.randint(low=0, high=nA)
            action = A[chosen_action_id]
    else:
        # no progressive action is available, so pick a random allowed action
        chosen_action_id = np.random.randint(low=0, high=nA)
        action = A[chosen_action_id]

    print("action")
    print(action)
    iprime, jprime = np.array([i, j]) + action
    if (iprime == jprime):
        iprime = i
        jprime = j
    print("iprime = " + str(iprime) + "  jprime = " + str(jprime))
    return iprime, jprime
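All three step functions rely on return_pointwise_A, which is defined elsewhere in the source file and not shown on this page. Below is a minimal sketch of what it might look like, assuming two particles on a chain of n = 10 sites where each particle may step by -1, 0 or +1 and sim controls whether both particles may move in the same step; only the call signature comes from the code above, everything else is an assumption:

def return_pointwise_A(i, j, sim=True, n=10):
    # candidate (di, dj) moves at state (i, j) on a chain of n sites
    # (a guess at the interface, not the author's implementation)
    actions = []
    for di in (-1, 0, 1):
        if not 0 <= i + di < n:
            continue  # keep particle i on the chain
        for dj in (-1, 0, 1):
            if not 0 <= j + dj < n:
                continue  # keep particle j on the chain
            if not sim and di != 0 and dj != 0:
                continue  # simultaneous moves are disallowed when sim is False
            actions.append([di, dj])
    return np.array(actions)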
Example #4
import numpy as np

# n, pi, r, gamma and sim are set up earlier in the script (not shown here)
v = np.zeros(shape=(n, n))  # value function, one entry per joint state (i, j)

# 3.3 the main iterative loop
# Eq. (4.7) of the book:
# each step is the Bellman operation for policy evaluation
# followed by a policy improvement
step = 0
while step < 500:
    new_pi = np.zeros(shape=(n, n))
    # policy evaluation
    v = Bellmann_iteration(pi, r, v, gamma, sim)
    # policy improvement
    for i in range(0, n):
        for j in range(0, n):
            Q_max = -1000
            A = return_pointwise_A(i, j, sim)
            nr_actions = np.size(A, 0)
            # iterate over all candidate actions to find the largest Q
            for candidate_action_id in range(0, nr_actions):
                Q = Q_estimation_for_state_s(i, j, gamma, r, v,
                                             candidate_action_id, sim)
                if Q >= Q_max:
                    Q_max = Q
                    new_pi[i, j] = candidate_action_id
    pi = new_pi.copy()  # adopt the improved policy
    step += 1
    if (step % 100 == 1):
        print("#iteration: " + str(step - 1))

#### 4. printing and saving ####
def one_step(i, j, pi, sim):
    # take the action prescribed by the policy pi at state (i, j)
    A = return_pointwise_A(i, j, sim)
    chosen_action_id = int(pi[i, j])  # pi is stored as a float array, so cast before indexing
    action = A[chosen_action_id]
    iprime, jprime = np.array([i, j]) + action
    return iprime, jprime
# 3.3 the main iterative loop
# Eq. (4.7) of the book:
# each step is the Bellman operation for policy evaluation
# followed by a policy improvement
step = 0

while step < nr_iterations:
    new_pi = np.zeros(shape=(n, n))
    # policy evaluation
    v = Bellmann_iteration(pi, v, gamma)
    # policy improvement
    for i in range(0, n):
        for j in range(0, n):
            Q_max = -1000
            A = return_pointwise_A(i, j)
            nr_actions = np.size(A, 0)
            # iterate over all candidate actions to find the largest Q
            for action_id in range(0, nr_actions):
                Q = Q_estimate(i, j, action_id, gamma, v)
                if Q >= Q_max:
                    Q_max = Q
                    new_pi[i, j] = action_id
    pi = new_pi.copy()  # adopt the improved policy
    plotter(ax, v)
    step += 1

    if (step % 100 == 1):
        print("#iteration: " + str(step - 1))

simulate(4, 5, pi)
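simulate is likewise defined elsewhere in the source. A plausible sketch, assuming it simply rolls the two particles forward under the greedy policy by repeatedly calling one_step and printing the visited states; the step count and the sim flag are assumptions:

def simulate(i, j, pi, nr_steps=20, sim=True):
    # roll out the greedy policy pi from the initial state (i, j)
    for t in range(nr_steps):
        print("t = " + str(t) + "  i = " + str(i) + "  j = " + str(j))
        i, j = one_step(i, j, pi, sim)
    return i, j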