def p_loop(EPISODE, GAMMA, LAMBDA, ALPHA, path):
    """Train the linear weight vector ``w`` and checkpoint it to ``path``.

    Every episode creates a fresh ``ENV()``, resets the eligibility trace and
    runs ``int(T / INTERVAL_ENV)`` simulation steps. Every
    ``INTERVAL_A / INTERVAL_ENV`` steps a new action is chosen and ``w`` is
    updated in place::

        delta = r + GAMMA * x_new.T @ w - x.T @ w
        et    = GAMMA * LAMBDA * et + x
        w    += ALPHA * delta * et

    where ``x`` / ``x_new`` are the encoder features of the previous and the
    newly chosen (state, action) pair.

    Args:
        EPISODE: Number of training episodes to run.
        GAMMA: Discount factor used in the TD error and in the trace decay.
        LAMBDA: Trace-decay factor.
        ALPHA: Step size of the weight update.
        path: File the weights are loaded from (when readable) and saved to
            after every 10th episode.

    Returns:
        None. Progress is printed and weights are written with ``np.save``.

    NOTE(review): ``np.save`` appends ``.npy`` to names lacking that suffix,
    so ``path`` should already end in ``.npy`` or the next ``np.load(path)``
    will not find the checkpoint.
    """
    # Load previously saved weights, or start from zeros when the file is
    # missing or unreadable. Only I/O / format errors are caught so that
    # Ctrl-C and SystemExit are no longer swallowed by this fallback.
    try:
        w = np.load(path)
    except (OSError, ValueError, EOFError):
        # presumably 12 bins for each of the 4 state components times
        # 4 actions -- TODO confirm against FEATURE_ENCODER
        w = np.zeros((12 * 12 * 12 * 12 * 4, 1))
        print("Initialize Value")
        print("-" * 30)
    else:
        print("Load {}".format(path))
        print("-" * 30)

    # Feature encoder and actor (training mode).
    encoder = FEATURE_ENCODER(ACTION)
    actor = ACTOR(encoder, ACTION, is_train=True)

    # Number of environment steps between two action decisions. The quotient
    # of two floats can miss an integer by rounding error (0.3 / 0.1 gives
    # 2.9999999999999996), which would make ``t % step_a == 0`` true only at
    # t == 0. Snap to the nearest integer when that is clearly the intent;
    # any other ratio is used unchanged.
    ratio = INTERVAL_A / INTERVAL_ENV
    nearest = round(ratio)
    if nearest >= 1 and abs(ratio - nearest) < 1e-9:
        step_a = nearest
    else:
        step_a = ratio

    n_steps = int(T / INTERVAL_ENV)

    for ep in range(EPISODE):
        # Per-episode training log.
        abs_deltas = []
        rewards = []

        # Reset the eligibility trace.
        et = np.zeros_like(w)

        # New environment with its own initial state.
        e = ENV()

        # First action a_t, its feature encoding, then advance the
        # environment by one step.
        state = [e.c.dx, e.c.dy, e.c.vx, e.c.vy]
        a = actor.act(state, w)
        en = encoder.encode(state, a)
        e.update(a)

        for t in range(n_steps):
            if t % step_a == 0:
                # Next action a_{t+1} and its features.
                state = [e.c.dx, e.c.dy, e.c.vx, e.c.vy]
                a_new = actor.act(state, w)
                en_new = encoder.encode(state, a_new)

                # TD error.
                delta = (e.r + GAMMA * np.matmul(en_new.T, w)
                         - np.matmul(en.T, w))

                # Decay the trace and add the features of the pair that is
                # being evaluated.
                et = GAMMA * LAMBDA * et + en

                # In-place update of the weight vector.
                w += ALPHA * delta * et

                a = a_new
                en = en_new

                abs_deltas.append(np.sum(np.abs(delta)))
                rewards.append(e.r)

            # Advance the simulation with the currently held action.
            e.update(a)

        # Episode summary. ``-e.r`` is reported as the final distance
        # (presumably the reward is the negative distance to the target).
        print(
            "EP{}: delta_w:{:.2f} total_r:{:.2f} final_dist:{:.2f} Vx:{:.2f} Vy:{:.2f}"
            .format(ep + 1, np.sum(abs_deltas), np.sum(rewards), -e.r,
                    e.c.vx, e.c.vy))

        # Checkpoint the weights every 10 episodes.
        if (ep + 1) % 10 == 0:
            np.save(path, w)
            print("Saved in {}".format(path))
            print("-" * 30)
def main(EP, VIS, path, FAST):
    """Replay saved weights for ``EP`` episodes and plot the resulting tracks.

    An actor built with ``is_train=False`` picks an action every
    ``INTERVAL_A / INTERVAL_ENV`` simulation steps; each episode runs
    ``int(T / INTERVAL_ENV)`` steps in a fresh ``ENV(w=100, h=100)``.

    Args:
        EP: Number of episodes to simulate.
        VIS: When truthy, draw the position live (interactive matplotlib) on
            one shared figure. Otherwise record the track and show one static
            plot per episode.
        path: File holding the weight array, read with ``np.load``.
        FAST: In live mode, only every ``FAST``-th step is drawn.

    Returns:
        ``0`` when the weights cannot be loaded, otherwise ``None``.
    """
    # Number of environment steps between two action decisions. The quotient
    # of two floats can miss an integer by rounding error (0.3 / 0.1 gives
    # 2.9999999999999996), which would make ``t % step_a == 0`` true only at
    # t == 0. Snap to the nearest integer when that is clearly the intent;
    # any other ratio is used unchanged.
    ratio = INTERVAL_A / INTERVAL_ENV
    nearest = round(ratio)
    if nearest >= 1 and abs(ratio - nearest) < 1e-9:
        step_a = nearest
    else:
        step_a = ratio

    # Feature encoder and actor (evaluation mode).
    encoder = FEATURE_ENCODER(ACTION)
    actor = ACTOR(encoder, ACTION, is_train=False)

    # Load the weights. Only I/O / format errors are treated as "not found",
    # so Ctrl-C and SystemExit are no longer swallowed here.
    try:
        w = np.load(path)
    except (OSError, ValueError, EOFError):
        print("Could not find {}".format(path))
        return 0
    print("Load {}".format(path))
    print("-" * 30)

    # Live visualisation set-up.
    if VIS:
        plt.ion()
        plt.figure(figsize=(5, 5))
        plt.axis([0, 100, 0, 100])

    n_steps = int(T / INTERVAL_ENV)
    # One letter of this word is printed per 300 steps as a progress marker.
    progress = "processing"

    for ep in range(EP):
        sys.stdout.write("EP:{} ".format(ep + 1))

        # Fresh environment. ENV also accepts target=, c_x=, c_y=, c_vx= and
        # c_vy= keyword arguments for a fixed start instead of the default.
        e = ENV(w=100, h=100)

        if VIS:
            plt.scatter(e.target[0], e.target[1], s=30, c='red')
        else:
            track_x = []
            track_y = []

        for t in range(n_steps):
            if t % step_a == 0:
                a = actor.act([e.c.dx, e.c.dy, e.c.vx, e.c.vy], w)
            e.update(a)

            if VIS:
                if t % FAST == 0:
                    # Status line is rewritten in place (trailing "\r"); the
                    # episode number is 1-based like every other message.
                    sys.stdout.write(
                        "Ep:{}-{} Vx:{:.2f} Vy:{:.2f} Action:{} \r".format(
                            ep + 1, t + 1, e.c.vx, e.c.vy, a))
                    sys.stdout.flush()
                    plt.scatter(e.c.x, e.c.y, s=10, c='blue', alpha=0.2)
                    plt.scatter(e.target[0], e.target[1], s=30, c='red')
                    plt.pause(0.01)
            else:
                track_x.append(e.c.x)
                track_y.append(e.c.y)
                if (t + 1) % 300 == 0:
                    idx = (t + 1) // 300 - 1
                    # Episodes longer than 300 * len(progress) steps used to
                    # raise IndexError here; extra ticks are now skipped.
                    if idx < len(progress):
                        sys.stdout.write(progress[idx])
                        sys.stdout.flush()

        # ``-e.r`` is reported as the final distance (presumably the reward
        # is the negative distance to the target).
        print(" Final_distance:{:.2f} ".format(-e.r))

        if VIS:
            # Mark and label the end point, then leave it on screen briefly.
            plt.scatter(e.c.x, e.c.y, s=30, c='orange')
            plt.text(e.c.x, e.c.y - 1,
                     "EP{} Dist:{:.2f}".format(ep + 1, -e.r))
            plt.pause(5)
        else:
            # Static plot of the whole track, the target and the end point.
            plt.scatter(track_x, track_y, s=5, c='blue', alpha=0.2)
            plt.scatter(e.target[0], e.target[1], s=30, c='red')
            plt.scatter(track_x[-1], track_y[-1], s=30, c='orange')
            plt.text(e.c.x, e.c.y - 1, "Dist:{:.2f}".format(-e.r))
            plt.axis([0, 100, 0, 100])
            plt.show()