import numpy as np
from MazeEnv import *
import copy

world_size = np.array([5, 5])
env = MazeEnv(world_size=world_size, \
              gold_pos=np.array([[1,2]]), \
              bad_pos=np.array([[3,4]]), \
              max_ite=20)
env.reset()
DISCOUNT = 0.9
epsilon = 0.1
alpha = 0.01
Q = np.zeros((world_size[0] * world_size[1], 4))
Q_old = copy.deepcopy(Q)
Q1 = np.zeros((world_size[0] * world_size[1], 4))
Q1_old = copy.deepcopy(Q1)
Q2 = np.zeros((world_size[0] * world_size[1], 4))
Q2_old = copy.deepcopy(Q2)
V = np.zeros((world_size[0]*world_size[1]))
V_old = copy.deepcopy(V)

ite = 0
while True:  # episode loop
    ite += 1
    Q_old = copy.deepcopy(Q)
    s = env.reset()
    while True:  # step loop
        # epsilon-greedy action selection over the combined estimate Q1 + Q2
        a = np.argmax(Q1[s, :] + Q2[s, :]) if (np.random.rand() > epsilon) else env.random_action()
        s_, r, d = env.step(a)
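        # --- The original listing is truncated here. The lines below are a minimal ---
        # --- sketch of a standard Double Q-learning update, which the two tables   ---
        # --- Q1/Q2 and the epsilon-greedy rule above suggest; they are not the     ---
        # --- author's exact code.                                                  ---
        # Randomly choose which table to update; evaluate the greedy successor
        # action with the other table to reduce maximization bias.
        if np.random.rand() < 0.5:
            a_ = np.argmax(Q1[s_, :])
            Q1[s, a] += alpha * (r + DISCOUNT * Q2[s_, a_] - Q1[s, a])
        else:
            a_ = np.argmax(Q2[s_, :])
            Q2[s, a] += alpha * (r + DISCOUNT * Q1[s_, a_] - Q2[s, a])
        s = s_
        if d:  # the environment signals the end of the episode
            break
    # Combined estimate, handy for inspecting the learned values
    Q = (Q1 + Q2) / 2.0
    # Stopping rule for this sketch (an assumption, not in the original): fixed episode budget
    if ite >= 1000:
        break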
Example No. 2
import numpy as np
from MazeEnv import *
import copy

world_size = np.array([5, 5])
env = MazeEnv(world_size=world_size, \
              gold_pos=np.array([[1,2]]), \
              bad_pos=np.array([[3,4]]), \
              max_ite=10)
env.reset()
DISCOUNT = 0.9
V = np.zeros((world_size[0] * world_size[1]))
V_old = copy.deepcopy(V)
PI = np.zeros((world_size[0] * world_size[1]), dtype=int)  # 0, 1, 2, 3 represent the four actions


def value_iteration():
    # Value iteration: sweep every state and back up
    #     V(s) <- max_a [ r(s, a) + DISCOUNT * V(s') ]
    # until the largest change in V across a sweep drops below the tolerance.
    ite = 0  # sweep counter
    while True:
        ite += 1
        V_old = copy.deepcopy(V)
        for s in env.feasible_states:
            values = []
            for a in env.feasible_actions:
                s_, r, d = env.step_state(s, a)
                values.append(r + DISCOUNT * V[s_])
            V[s] = np.max(np.array(values))
        if np.max(np.abs(V - V_old)) < 0.0001:
            break


def policy_improvement():
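    # The original listing is truncated at this point. Below is a minimal sketch of a
    # greedy policy-improvement step, assuming env.feasible_actions enumerates the four
    # action indices stored in PI and env.step_state(s, a) returns (next_state, reward, done).
    for s in env.feasible_states:
        values = []
        for a in env.feasible_actions:
            s_, r, d = env.step_state(s, a)
            values.append(r + DISCOUNT * V[s_])
        PI[s] = int(np.argmax(np.array(values)))


# A typical driver for this example (not part of the original listing) would be:
# value_iteration()
# policy_improvement()
# print(PI.reshape(world_size))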
Example No. 3
import numpy as np
from MazeEnv import *
import copy

world_size = np.array([5, 5])
env = MazeEnv(world_size=world_size, \
              gold_pos=np.array([[1,2]]), \
              bad_pos=np.array([[3,4]]), \
              max_ite=10)
env.reset()
# Each of the four actions is selected with probability 1/4
ACTION_PROB = 0.25
DISCOUNT = 0.9
V = np.zeros((world_size[0] * world_size[1]))
V_old = copy.deepcopy(V)

# Iterative policy evaluation for the uniform random policy:
#     V(s) <- sum over a of ACTION_PROB * [ r(s, a) + DISCOUNT * V(s') ]
while True:
    V_old = copy.deepcopy(V)
    for s in env.feasible_states:
        values = []
        for a in env.feasible_actions:
            s_, r, d = env.step_state(s, a)
            values.append(ACTION_PROB * (r + DISCOUNT * V[s_]))
        V[s] = np.sum(np.array(values))
    if np.max(np.abs(V - V_old)) < 0.0001:
        break

print('env.ite = ', env.ite)
print(V.reshape(world_size))
Example No. 4

import numpy as np
from MazeEnv import *
import copy
from collections import defaultdict

world_size = np.array([5, 5])
env = MazeEnv(world_size=world_size, \
              gold_pos=np.array([[1, 2]]), \
              bad_pos=np.array([[3, 4]]), \
              max_ite=50)
env.reset()
DISCOUNT = 0.9
Q = np.zeros((world_size[0] * world_size[1], 4))
Q_old = copy.deepcopy(Q)
PI = np.zeros((world_size[0] * world_size[1]), dtype=int)  # 0, 1, 2, 3 represent the four actions
# Return: per-(state, action) lists of sampled returns
Return = defaultdict(list)

ite = 0
while True:  # episode loop
    ite += 1
    Q_old = copy.deepcopy(Q)
    # Exploring starts: launch one episode from every feasible (state, action) pair
    for s0 in env.feasible_states:
        for a0 in env.feasible_actions:
            s_a_his = []
            r_his = []
            s = env.reset(s0)
            s_, r, d = env.step(a0)
            s_a_his.append(env.encode_s_a(s0, a0))
            r_his.append(r)
            s = s_
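            # --- The original listing is truncated here. The rest of this loop body is ---
            # --- a minimal sketch of Monte Carlo control with exploring starts, not    ---
            # --- the author's exact code; sa_raw is a hypothetical helper list.        ---
            sa_raw = [(s0, a0)]  # raw (state, action) pairs, parallel to s_a_his
            # Follow the current greedy policy PI until the environment ends the episode
            while not d:
                a = int(PI[s])
                s_, r, d = env.step(a)
                s_a_his.append(env.encode_s_a(s, a))
                sa_raw.append((s, a))
                r_his.append(r)
                s = s_
            # Backward pass: accumulate discounted returns and average them into Q
            G = 0.0
            for (sp, ap), sa_key, rr in zip(reversed(sa_raw), reversed(s_a_his), reversed(r_his)):
                G = rr + DISCOUNT * G
                Return[sa_key].append(G)
                Q[sp, ap] = np.mean(Return[sa_key])
    # Greedy policy improvement and a stopping rule (assumptions for this sketch)
    PI = np.argmax(Q, axis=1)
    if np.max(np.abs(Q - Q_old)) < 1e-4 or ite >= 100:
        break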