class ILPL(ACBase):
    def __init__(self,
                 gpu_id=1,
                 replay_buffer=None,
                 u_bounds=None,
                 exploration=None,
                 env=None,
                 predict_training_rounds=10000,
                 Vm=None, Lm=None,
                 Va=None, La=None,
                 Vc=None, Lc=None,
                 gamma=0.6,
                 batch_size=1,
                 predict_batch_size=32,
                 model_nn_error_limit=0.08,
                 critic_nn_error_limit=1,
                 actor_nn_loss=0.1,
                 u_iter=30,
                 u_begin=None,
                 indice_y=None,
                 indice_y_star=None,
                 indice_c=None,
                 u_first=None):
        """
        :param gpu_id:
        :param replay_buffer:
        :param u_bounds:
        :param exploration:
        :param env:
        :param predict_training_rounds: number of real samples used to train the prediction model
        :param Vm:
        :param Lm:
        :param Va:
        :param La:
        :param Vc:
        :param Lc:
        :param gamma:
        :param batch_size:
        :param predict_batch_size: batch size used when training the prediction model
        :param model_nn_error_limit:
        :param critic_nn_error_limit: error limit of the critic network
        :param actor_nn_loss:
        :param u_iter: number of iterations when solving for u*
        :param u_begin: initial u(k) for the first iteration when solving for u*
        :param indice_y: position of y in the state vector
        :param indice_y_star: position of y* in the state vector
        :param indice_c: position of c in the state vector
        :param u_first: control command applied at the first step
        """
        super(ILPL, self).__init__(gpu_id=gpu_id, replay_buffer=replay_buffer,
                                   u_bounds=u_bounds, exploration=exploration)
        if env is None:
            env = Thickener()

        self.env = env
        self.predict_training_rounds = predict_training_rounds

        self.device = None
        self.cuda_device(gpu_id)
        self.batch_size = batch_size
        self.predict_batch_size = predict_batch_size

        # Note: the indice_c parameter is currently ignored and hard-coded here
        self.indice_c = [6, 7]
        self.predict_training_losses = []
        self.model_nn = None
        self.model_nn_error_limit = model_nn_error_limit
        self.critic_nn_error_limit = critic_nn_error_limit
        self.actor_nn_error_limit = actor_nn_loss
        self.u_iter = u_iter

        # Train the identification (model) neural network
        self.train_identification_model(Vm=Vm, Lm=Lm)
        self.test_predict_model(test_rounds=400)

        # Actor network
        self.actor_nn = None
        self.actor_nn_init(Va=Va, La=La)

        # Critic network
        self.critic_nn = None
        self.critic_nn_init(Vc=Vc, Lc=Lc)

        self.gamma = gamma
        self.u_begin = u_begin

        if indice_y is None:
            indice_y = [2, 3]
        if indice_y_star is None:
            indice_y_star = [0, 1]
        self.indice_y = indice_y
        self.indice_y_star = indice_y_star

        if u_first is None:
            u_first = np.array([1.8, 19])
        self.u_first = u_first
        self.first_act = True

        # Plot helpers
        self.u0_plt = PltUtil()
        self.u1_plt = PltUtil()
        self.y0_plt = PltUtil()
        self.y1_plt = PltUtil()
        self.wa_plt = PltUtil()
        self.wm_plt = PltUtil()
        self.wc_plt = PltUtil()

    def cuda_device(self, cuda_id):
        use_cuda = torch.cuda.is_available()
        cuda = 'cuda:' + str(cuda_id)
        self.device = torch.device(cuda if use_cuda else "cpu")

    def _act(self, state):
        self.y0_plt.push("ht*", state[0])
        self.y1_plt.push("Cu*", state[1])
        self.y0_plt.push("ht", state[2])
        self.y1_plt.push("Cu", state[3])

        # The first control step does not use the actor output,
        # which would otherwise be far off.
        if self.first_act:
            self.u0_plt.push("fu", self.u_first[0])
            self.u1_plt.push("ff", self.u_first[1])
            self.first_act = False
            act = self.u_first
        # Otherwise compute the action with the actor network
        else:
            y = state[self.indice_y]
            y_star = state[self.indice_y_star]
            c = state[self.indice_c]
            x = torch.FloatTensor(np.hstack((y, y_star, c))).unsqueeze(0)
            act = self.actor_nn(x).detach().squeeze(0).numpy()

            self.u0_plt.push("hp", act[0])
            self.u0_plt.push("hp min", self.u_bounds[0, 0])
            self.u0_plt.push("hp max", self.u_bounds[0, 1])
            self.u1_plt.push("qa", act[1])
            self.u1_plt.push("qa min", self.u_bounds[1, 0])
            self.u1_plt.push("qa max", self.u_bounds[1, 1])
        return act

    def _train(self, s, u, ns, r, done):
        # Store the transition in the replay buffer first
        self.replay_buffer.push(s, u, r, ns, done)
        if len(self.replay_buffer) < self.batch_size:
            return

        # Sample from the replay buffer (one transition by default)
        state, action, reward, next_state, done = self.replay_buffer.sample(self.batch_size)
        # Update the networks
        self.update_model(state, action, reward, next_state, done)

    def add_nnw2plt(self):
self.wm_plt.push("wm11", float(self.model_nn.Wm.weight.data[0,0])) self.wm_plt.push("wm12", float(self.model_nn.Wm.weight.data[0,1])) self.wm_plt.push("wm13", float(self.model_nn.Wm.weight.data[0,2])) self.wm_plt.push("wm14", float(self.model_nn.Wm.weight.data[0,3])) self.wm_plt.push("wm21", float(self.model_nn.Wm.weight.data[1,0])) self.wm_plt.push("wm22", float(self.model_nn.Wm.weight.data[1,1])) self.wm_plt.push("wm23", float(self.model_nn.Wm.weight.data[1,2])) self.wm_plt.push("wm24", float(self.model_nn.Wm.weight.data[1,3])) self.wa_plt.push("wa11", float(self.actor_nn.Wa.weight.data[0,0])) self.wa_plt.push("wa12", float(self.actor_nn.Wa.weight.data[0,1])) self.wa_plt.push("wa13", float(self.actor_nn.Wa.weight.data[0,2])) self.wa_plt.push("wa14", float(self.actor_nn.Wa.weight.data[0,3])) self.wa_plt.push("wa21", float(self.actor_nn.Wa.weight.data[1,0])) self.wa_plt.push("wa22", float(self.actor_nn.Wa.weight.data[1,1])) self.wa_plt.push("wa23", float(self.actor_nn.Wa.weight.data[1,2])) self.wa_plt.push("wa24", float(self.actor_nn.Wa.weight.data[1,3])) self.wc_plt.push("wc1", float(self.critic_nn.Wc.weight.data[0,0])) self.wc_plt.push("wc2", float(self.critic_nn.Wc.weight.data[0,1])) self.wc_plt.push("wc3", float(self.critic_nn.Wc.weight.data[0,2])) self.wc_plt.push("wc4", float(self.critic_nn.Wc.weight.data[0,3])) def update_model(self,state, action, penalty, next_state, done): self.add_nnw2plt() state = torch.FloatTensor(state).to(self.device) next_state = torch.FloatTensor(next_state).to(self.device) action = torch.FloatTensor(action).to(self.device) penalty = torch.FloatTensor(penalty).unsqueeze(1).to(self.device) indices_y = torch.LongTensor(self.indice_y) indices_c = torch.LongTensor(self.indice_c) indices_y_star = torch.LongTensor(self.indice_y_star) y = torch.index_select(state, 1, indices_y) ny = torch.index_select(next_state, 1, indices_y) y_star = torch.index_select(state, 1, indices_y_star) c = torch.index_select(state, 1, indices_c) nc = torch.index_select(next_state, 1, indices_c) # region update model nn while True: next_state_predict = self.model_nn(torch.cat((y, action, c), dim=1)) model_loss = self.model_criterion(ny, next_state_predict) self.model_nn_optim.zero_grad() model_loss.backward() self.model_nn_optim.step() # The loop will be teiminated while the average loss < limit if model_loss.data / self.batch_size < self.model_nn_error_limit: break # endregion # 循环更新actor网络和critic网路 while True: # region update critic nn q_value = self.critic_nn(torch.cat((y, y_star, c), dim=1)) next_q_value = self.critic_nn(torch.cat((ny, y_star, nc), dim=1)) target_q = penalty + self.gamma * next_q_value # 定义TD loss critic_loss = self.critic_criterion(q_value, target_q) self.critic_nn_optim.zero_grad() critic_loss.backward() self.critic_nn_optim.step() # endregion # region update actor nn # find u* best_u = self.find_best_u(u0=action, y=y, y_star=y_star, c=c) best_u = torch.FloatTensor(best_u) x = torch.cat((y, y_star, c), dim=1) # calculate current u cur_u = self.actor_nn(x) act_loss = self.actor_criterion(cur_u, best_u) self.actor_nn_optim.zero_grad() # optimize actor network act_loss.backward() self.actor_nn_optim.step() # act_loss足够小时才结束 if act_loss / self.batch_size > self.actor_nn_error_limit: continue # 计算critic网络更新后的预测的V(k-1)值 new_q_value = self.critic_nn(torch.cat((y, y_star,c), dim=1)) diff_V = self.critic_criterion(q_value, new_q_value) # 两次V的输出足够小时才跳出 if diff_V.data / self.batch_size < self.critic_nn_error_limit: break # endregion # 自己看论文公式18 def find_best_u(self, 
    def find_best_u(self, u0, y, y_star, c):
        if self.u_begin is not None:
            u0 = u0.zero_() + torch.FloatTensor(self.u_begin)

        U = np.diag(self.u_bounds[:, 1] - self.u_bounds[:, 0])
        U = torch.FloatTensor(U)
        S = self.env.penalty_calculator.S
        S = torch.FloatTensor(S)
        u_mid = torch.FloatTensor(np.mean(self.u_bounds, axis=1))

        # My understanding of the paper is that u* is obtained iteratively
        for _ in itertools.count():
            tmp_u0 = u0.clone()

            # region back-propagate to get the gradient of V with respect to u
            u0.requires_grad = True
            self.critic_nn_optim.zero_grad()
            self.model_nn_optim.zero_grad()
            x_pred = self.model_nn(torch.cat((y, u0, c), dim=1))
            v_pred = self.critic_nn(torch.cat((x_pred, y_star, c), dim=1))
            v_pred.backward()
            # endregion

            u0_grad = u0.grad
            tmp = F.linear(u0_grad, -0.5 * self.gamma * (U.mul(S).inverse().t()))
            tmp = torch.tanh(tmp)
            tmp = F.linear(tmp, U, bias=u_mid)
            u0 = tmp

            if (tmp_u0 - u0).norm() < 0.1:
                break
            if _ > 50:
                print("Too many iterations while searching for u*")
                break
        return u0

    def predict(self, state, act):
        cur_y = state[2:4]
        c = state[self.indice_c]
        x = torch.FloatTensor(np.hstack([cur_y, act, c]))
        return self.model_nn(x)

    def cal_training_data(self):
        """
        Generate (or load cached) training data for the prediction model.
        :return: train_x, train_y
        """
        # Cache the data in a JSON file so it does not have to be regenerated
        # from the simulation model every run (which is slow)
        json_path = "training_data_" + str(self.predict_training_rounds) + '.json'
        if os.path.exists(json_path):
            with open(json_path, 'r') as fp:
                train_x, train_y = json.load(fp)
            train_x = np.array(train_x)
            train_y = np.array(train_y)
            return train_x, train_y

        train_x = []
        train_y = []
        # Generate training data from the simulation model
        print("Generating training data from the simulation model")
        for _ in range(self.predict_training_rounds):
            print(_)
            y = self.env.observation()[2:4]
            act = np.random.uniform(self.u_bounds[:, 0], self.u_bounds[:, 1])
            c = self.env.observation()[6:8]
            train_x.append(np.hstack([y, act, c])[np.newaxis, :])
            self.env.step(act)
            new_state = self.env.observation()[2:4]
            train_y.append(new_state[np.newaxis, :])

            if random.random() < 0.001:
                self.env.reset()

        # Cache to JSON
        with open(json_path, 'w') as fp:
            tmp_x = np.copy(train_x).tolist()
            tmp_y = np.copy(train_y).tolist()
            json.dump((tmp_x, tmp_y), fp)

        return train_x, train_y

    def test_predict_model(self, test_rounds=1000):
        """
        Evaluate the prediction model by plotting the one-step differences
        of the real and predicted outputs.
        :param test_rounds:
        :return:
        """
        self.env.reset()
        pred_y_list = []
        real_y_list = []
        # pred_y_list.append(self.env.observation()[2:4][np.newaxis,:])
        for _ in range(test_rounds):
            act = np.random.uniform(self.u_bounds[:, 0], self.u_bounds[:, 1])
            pred_y = self.predict(self.env.observation(), act)
            old_y = self.env.observation()[2:4]
            pred_y_list.append(pred_y.detach().numpy() - old_y)
            self.env.step(act)
            real_y_list.append(self.env.observation()[2:4] - old_y)

        real_y_array = np.array(real_y_list)
        pred_y_array = np.array(pred_y_list)
        for i in range(self.env.size_yudc[0]):
            plt.plot(real_y_array[:, i])
            plt.plot(pred_y_array[:, i])
            plt.legend(['real', 'predict'])
            plt.show()

    def cal_predict_mse(self, test_rounds=1000, diff=False):
        self.env.reset()
        pred_y_list = []
        real_y_list = []
        # pred_y_list.append(self.env.observation()[2:4][np.newaxis,:])
        for _ in range(test_rounds):
            act = np.random.uniform(self.u_bounds[:, 0], self.u_bounds[:, 1])
            pred_y = self.predict(self.env.observation(), act)
            old_y = self.env.observation()[2:4]
            if not diff:
                old_y = old_y * 0
            pred_y_list.append(pred_y.detach().numpy() - old_y)
            self.env.step(act)
            real_y_list.append(self.env.observation()[2:4] - old_y)

        real_y_array = np.array(real_y_list)
        pred_y_array = np.array(pred_y_list)
        mse_li = []
        for i in range(self.env.size_yudc[0]):
            mse = mean_squared_error(real_y_array[:, i], pred_y_array[:, i])
            mse_li.append(mse)
        return mse_li

    def define_identification_nn(self, Vm, Lm):
        self.model_nn = Model(dim_in=self.env.size_yudc[0] + self.env.size_yudc[1] + self.env.size_yudc[3],
                              dim_out=self.env.size_yudc[0],
                              dim_hidden=6,
                              device=self.device,
                              Vm=Vm, Lm=Lm)
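    # Network shapes as constructed in this class:
    #   Model  (identification): input [y, u, c]  -> y(k+1), dim_in = |y| + |u| + |c|, dim_out = |y|
    #   Actor                  : input [y, y*, c] -> u(k),   dim_in = 6, dim_out = |u|
    #   Critic                 : input [y, y*, c] -> V(k),   dim_in = 6, dim_out = 1
    # All three are built with dim_hidden=6.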
    def train_identification_model(self, Vm, Lm):
        """
        Reference code: https://www.pytorchtutorial.com/3-6-optimizer/#i
        :param Vm:
        :param Lm:
        :return:
        """
        # Build the prediction (identification) model
        self.define_identification_nn(Vm, Lm)
        self.model_nn_optim = torch.optim.Adam(self.model_nn.parameters(), lr=0.01, betas=(0.9, 0.99))
        self.model_criterion = torch.nn.MSELoss()

        train_x, train_y = self.cal_training_data()
        # Build the dataset
        torch_dataset = Data.TensorDataset(torch.FloatTensor(train_x), torch.FloatTensor(train_y))
        loader = Data.DataLoader(dataset=torch_dataset,
                                 batch_size=self.predict_batch_size,
                                 shuffle=True)
        mse_list = []

        # Train the prediction model
        for epoch in itertools.count():
            print("Epoch:{}".format(epoch + 1))
            sum_loss = 0
            for step, (batch_x, batch_y) in enumerate(loader):
                b_x = Variable(batch_x)
                b_y = Variable(batch_y)

                output = self.model_nn(b_x)
                loss = self.model_criterion(output, b_y)
                self.model_nn_optim.zero_grad()
                loss.backward()
                self.model_nn_optim.step()

                self.predict_training_losses.append(loss.item())
                sum_loss += loss.item()

            print("Loss:{}".format(sum_loss))

            # # Evaluate the MSE every 20 epochs
            # if epoch % 20 == 0:
            #     mse_list.append(self.cal_predict_mse())

            # Stop once the loss is small enough or after 50 epochs
            if sum_loss < self.model_nn_error_limit or epoch >= 50:
                break

        # # Evaluate once more at the end
        # mse_list.append(self.cal_predict_mse())
        #
        # # Plot the training loss
        # plt.figure()
        # plt.title("Loss in various epoch")
        # plt.xlabel("Epochs")
        # plt.ylabel("Loss")
        # plt.plot(self.predict_training_losses)
        # plt.show()
        #
        # # Plot the prediction MSE over epochs
        # plt.figure()
        # mse_array = np.array(mse_list)
        # for i in range(mse_array.shape[1]):
        #     plt.plot(mse_array[:, i])
        # plt.legend(['y1', 'y2'])
        # plt.show()
        #
        # # Print the MSE
        # print(mse_array)

    def actor_nn_init(self, Va, La):
        """
        Set up the actor network.
        :param Va:
        :param La:
        :return:
        """
        self.actor_nn = Actor(dim_in=6, dim_out=self.env.size_yudc[1],
                              device=self.device, dim_hidden=6,
                              Va=Va, La=La)
        self.actor_nn_optim = torch.optim.Adam(self.actor_nn.parameters(), lr=0.3, betas=(0.9, 0.99))
        self.actor_criterion = torch.nn.MSELoss()

    def critic_nn_init(self, Vc, Lc):
        """
        Set up the critic (value) network.
        :param Vc:
        :param Lc:
        :return:
        """
        self.critic_nn = Critic(dim_in=6, device=self.device,
                                dim_out=1, dim_hidden=6,
                                Vc=Vc, Lc=Lc)
        self.critic_nn_optim = torch.optim.Adam(self.critic_nn.parameters(), lr=0.05, betas=(0.9, 0.99))
        self.critic_criterion = torch.nn.MSELoss()

    def plt_list(self):
        """
        Return the control-performance and network-convergence curves
        recorded during training.
        :return:
        """
        return [
            self.u0_plt,
            self.u1_plt,
            self.y0_plt,
            self.y1_plt,
            self.wa_plt,
            self.wm_plt,
            self.wc_plt,
        ]
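
# A minimal usage sketch (not part of the original class). The import paths,
# the ReplayBuffer constructor, and the assumption that Thickener.step()
# returns the step penalty are guesses about the surrounding project; the
# Thickener environment, the u_bounds layout, and the _act/_train hooks are
# taken from the code above.
if __name__ == "__main__":
    # Hypothetical import paths -- adjust to the actual package layout.
    # from simulation.thickener import Thickener
    # from common.replay_buffer import ReplayBuffer

    # Control bounds, one row per control input: [min, max]
    u_bounds = np.array([[1.0, 2.5],      # assumed range for the first control (hp)
                         [15.0, 25.0]])   # assumed range for the second control (qa)

    env = Thickener()
    replay_buffer = ReplayBuffer(capacity=10000)  # hypothetical constructor

    controller = ILPL(env=env,
                      replay_buffer=replay_buffer,
                      u_bounds=u_bounds,
                      predict_training_rounds=1000,
                      gamma=0.6)

    # Short closed-loop run using the hooks defined above. In the real
    # framework these are presumably invoked through the ACBase wrapper.
    state = env.observation()
    for _ in range(200):
        u = controller._act(state)
        r = env.step(u)                    # assumption: step() returns the penalty/reward
        next_state = env.observation()
        controller._train(state, u, next_state, r, done=False)
        state = next_state

    # Retrieve the recorded control and weight-convergence curves for plotting
    plots = controller.plt_list()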