import numpy as np
from MazeEnv import *
import copy

world_size = np.array([5, 5])
env = MazeEnv(world_size=world_size,
              gold_pos=np.array([[1, 2]]),
              bad_pos=np.array([[3, 4]]),
              max_ite=20)
env.reset()

DISCOUNT = 0.9
epsilon = 0.1
alpha = 0.01

Q = np.zeros((world_size[0] * world_size[1], 4))
Q_old = copy.deepcopy(Q)
Q1 = np.zeros((world_size[0] * world_size[1], 4))
Q1_old = copy.deepcopy(Q1)
Q2 = np.zeros((world_size[0] * world_size[1], 4))
Q2_old = copy.deepcopy(Q2)
V = np.zeros((world_size[0] * world_size[1]))
V_old = copy.deepcopy(V)

ite = 0
while True:  # episode loop
    ite += 1
    Q_old = copy.deepcopy(Q)
    s = env.reset()
    while True:  # step loop
        # epsilon-greedy over the sum of the two tables
        a = np.argmax(Q1[s, :] + Q2[s, :]) if (np.random.rand() > epsilon) else env.random_action()
        s_, r, d = env.step(a)
        # Double Q-learning update (hedged completion; the original listing
        # was cut off here): flip a coin, pick the greedy action with one
        # table and evaluate it with the other.
        if np.random.rand() < 0.5:
            a_ = np.argmax(Q1[s_, :])
            Q1[s, a] += alpha * (r + DISCOUNT * Q2[s_, a_] - Q1[s, a])
        else:
            a_ = np.argmax(Q2[s_, :])
            Q2[s, a] += alpha * (r + DISCOUNT * Q1[s_, a_] - Q2[s, a])
        s = s_
        if d:
            break
    # assumed stopping rule: stop once the combined table stops moving
    Q = Q1 + Q2
    if ite > 1 and np.max(np.abs(Q - Q_old)) < 0.0001:
        break
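The reason for keeping two tables is that plain Q-learning both selects and evaluates the greedy action with the same noisy estimates, which biases targets upward. Below is a standalone sketch of that maximization bias and how the double estimator removes it; it is independent of MazeEnv, and every name in it is illustrative.

import numpy as np

rng = np.random.default_rng(0)
n_trials, n_actions = 10000, 4
# True value of every action is 0; each table holds noisy estimates.
est1 = rng.normal(0.0, 1.0, size=(n_trials, n_actions))
est2 = rng.normal(0.0, 1.0, size=(n_trials, n_actions))

# Single estimator: select and evaluate with the same table -> biased up.
single = est1.max(axis=1).mean()
# Double estimator: select the argmax with one table, evaluate it with the other.
idx = est1.argmax(axis=1)
double = est2[np.arange(n_trials), idx].mean()

print(f'single-estimator bias: {single:.3f}')  # clearly above 0
print(f'double-estimator bias: {double:.3f}')  # near 0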
import numpy as np
from MazeEnv import *
import copy

world_size = np.array([5, 5])
env = MazeEnv(world_size=world_size,
              gold_pos=np.array([[1, 2]]),
              bad_pos=np.array([[3, 4]]),
              max_ite=10)
env.reset()

DISCOUNT = 0.9
V = np.zeros((world_size[0] * world_size[1]))
V_old = copy.deepcopy(V)
# 0, 1, 2 and 3 encode the four actions
# (np.int was removed in NumPy 1.24; plain int works everywhere)
PI = np.zeros((world_size[0] * world_size[1]), dtype=int)

def value_iteration():
    while True:
        V_old = copy.deepcopy(V)
        for s in env.feasible_states:
            values = []
            for a in env.feasible_actions:
                s_, r, d = env.step_state(s, a)
                values.append(r + DISCOUNT * V[s_])
            V[s] = np.max(np.array(values))  # Bellman optimality backup
        if np.max(np.abs(V - V_old)) < 0.0001:
            break

def policy_improvement():
    # Hedged completion (the original listing was cut off at this def):
    # act greedily with respect to the converged V.
    for s in env.feasible_states:
        values = []
        for a in env.feasible_actions:
            s_, r, d = env.step_state(s, a)
            values.append(r + DISCOUNT * V[s_])
        PI[s] = np.argmax(np.array(values))
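One detail worth noting: the sweep above updates V in place, so later states in the same sweep may already read freshly updated values (a Gauss-Seidel-style iteration, which still converges and often does so faster). Below is a hedged sketch of the synchronous textbook variant that reads only the previous sweep's snapshot; it reuses env, V and DISCOUNT from the listing above and the same assumed MazeEnv API.

def value_iteration_sync():
    # Synchronous sweep: every backup reads the frozen snapshot V_old,
    # i.e. V_{k+1}(s) = max_a [ r + DISCOUNT * V_k(s') ].
    while True:
        V_old = V.copy()
        for s in env.feasible_states:
            values = []
            for a in env.feasible_actions:
                s_, r, d = env.step_state(s, a)
                values.append(r + DISCOUNT * V_old[s_])
            V[s] = np.max(np.array(values))
        if np.max(np.abs(V - V_old)) < 0.0001:
            break

Either version would be driven the same way: call value_iteration() (or value_iteration_sync()), then policy_improvement(), then inspect PI.reshape(world_size).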
import numpy as np
from MazeEnv import *
import copy

world_size = np.array([5, 5])
env = MazeEnv(world_size=world_size,
              gold_pos=np.array([[1, 2]]),
              bad_pos=np.array([[3, 4]]),
              max_ite=10)
env.reset()

# each of the four actions is chosen with probability 1/4
ACTION_PROB = 0.25
DISCOUNT = 0.9
V = np.zeros((world_size[0] * world_size[1]))
V_old = copy.deepcopy(V)

while True:
    V_old = copy.deepcopy(V)
    for s in env.feasible_states:
        values = []
        for a in env.feasible_actions:
            s_, r, d = env.step_state(s, a)
            values.append(ACTION_PROB * (r + DISCOUNT * V[s_]))
        V[s] = np.sum(np.array(values))  # expected backup under the uniform policy
    if np.max(np.abs(V - V_old)) < 0.0001:
        break

print('env.ite = ', env.ite)
print(V.reshape(world_size))
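Since evaluating a fixed policy is a linear problem, the same V can also be obtained in closed form by solving (I - DISCOUNT * P) V = R instead of sweeping to convergence. Below is a hedged sketch that assembles P and R from the step_state API assumed above; like the sweep above, it ignores the done flag d, so it reproduces exactly what the iterative loop computes.

n = world_size[0] * world_size[1]
P = np.zeros((n, n))  # state-transition matrix under the uniform policy
R = np.zeros(n)       # expected one-step reward per state
for s in env.feasible_states:
    for a in env.feasible_actions:
        s_, r, d = env.step_state(s, a)
        P[s, s_] += ACTION_PROB
        R[s] += ACTION_PROB * r
V_direct = np.linalg.solve(np.eye(n) - DISCOUNT * P, R)
print(V_direct.reshape(world_size))  # should agree with the iterative V above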
import numpy as np
from MazeEnv import *
import copy
from collections import defaultdict

world_size = np.array([5, 5])
env = MazeEnv(world_size=world_size,
              gold_pos=np.array([[1, 2]]),
              bad_pos=np.array([[3, 4]]),
              max_ite=50)
env.reset()

DISCOUNT = 0.9
Q = np.zeros((world_size[0] * world_size[1], 4))
Q_old = copy.deepcopy(Q)
# 0, 1, 2 and 3 encode the four actions
# (np.int was removed in NumPy 1.24; plain int works everywhere)
PI = np.zeros((world_size[0] * world_size[1]), dtype=int)
# Return = defaultdict(list)

ite = 0
while True:  # episode loop
    ite += 1
    Q_old = copy.deepcopy(Q)
    for s0 in env.feasible_states:        # exploring starts:
        for a0 in env.feasible_actions:   # one episode per (s0, a0) pair
            s_a_his = []  # encoded (state, action) history of the episode
            r_his = []    # reward history
            sa_pairs = [(s0, a0)]  # raw pairs, kept for the Q update below
            s = env.reset(s0)
            s_, r, d = env.step(a0)
            s_a_his.append(env.encode_s_a(s0, a0))
            r_his.append(r)
            s = s_
            # Hedged completion (the original listing was cut off here):
            # follow the current greedy policy until the episode ends.
            while not d:
                a = PI[s]
                s_, r, d = env.step(a)
                s_a_his.append(env.encode_s_a(s, a))
                sa_pairs.append((s, a))
                r_his.append(r)
                s = s_
            # walk the episode backwards accumulating the return G, and
            # update Q at the first visit of each (state, action) pair
            G = 0.0
            for t in reversed(range(len(r_his))):
                G = r_his[t] + DISCOUNT * G
                if s_a_his[t] not in s_a_his[:t]:
                    s_t, a_t = sa_pairs[t]
                    Q[s_t, a_t] = G
    PI = np.argmax(Q, axis=1)  # greedy policy improvement
    if ite > 1 and np.max(np.abs(Q - Q_old)) < 0.0001:
        break
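The commented-out `Return = defaultdict(list)` hints at the textbook variant that averages all first-visit returns rather than overwriting Q with the most recent one. Below is a hedged sketch of that update as a drop-in replacement for the `Q[s_t, a_t] = G` line above; `visit_count` and `update_q_mean` are illustrative names, not part of the original.

from collections import defaultdict

visit_count = defaultdict(int)  # number of first-visit returns seen per (s, a)

def update_q_mean(Q, s_t, a_t, G):
    # Incremental mean: equivalent to appending G to Return[(s_t, a_t)]
    # and setting Q[s_t, a_t] to the mean of that list, without having
    # to store every return.
    visit_count[(s_t, a_t)] += 1
    Q[s_t, a_t] += (G - Q[s_t, a_t]) / visit_count[(s_t, a_t)]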