def learn(self, obs, action, reward, next_obs, terminal):
    """ Update the value network of self.model with the DQN algorithm """
    # Get max Q' from target_model; it is used to build target_Q
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=1)
    best_v.stop_gradient = True  # block gradient flow through the target
    terminal = layers.cast(terminal, dtype='float32')
    target = reward + (1.0 - terminal) * self.gamma * best_v

    pred_value = self.model.value(obs)  # predicted Q values
    # Convert action to a one-hot vector, e.g. 3 => [0, 0, 0, 1, 0]
    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    # Element-wise multiply to pick out Q(s,a) for the taken action.
    # e.g. pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0, 0, 0, 1, 0]]
    # ==> pred_action_value = [[3.9]]
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)

    # The loss is the mean squared error between Q(s,a) and target_Q
    cost = layers.square_error_cost(pred_action_value, target)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr)  # Adam optimizer
    optimizer.minimize(cost)
    return cost
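# A minimal sketch of how a fluid-era PARL Agent typically wires the learn()
# graph above into a static program and feeds it per batch. The names
# learn_program, self.alg and self.fluid_executor follow PARL's tutorial
# layout; obs_dim=4 and the dtypes are assumptions for illustration, not
# part of the code above.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import layers

class DQNAgentSketch(object):
    def build_program(self, obs_dim=4):
        # Built once; executed repeatedly with run() below
        self.learn_program = fluid.Program()
        with fluid.program_guard(self.learn_program):
            obs = layers.data(name='obs', shape=[obs_dim], dtype='float32')
            action = layers.data(name='act', shape=[1], dtype='int32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs', shape=[obs_dim], dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            self.cost = self.alg.learn(obs, action, reward, next_obs, terminal)

    def learn_batch(self, obs, act, reward, next_obs, terminal):
        feed = {
            'obs': obs.astype('float32'),
            'act': np.expand_dims(act, -1).astype('int32'),
            'reward': reward.astype('float32'),
            'next_obs': next_obs.astype('float32'),
            'terminal': terminal,
        }
        return self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])[0]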
def define_learn(self, obs, action, reward, next_obs, terminal, weight):
    # Q(s,a|θ)
    pred_value = self.model.value(obs)
    # Q(s',a'|θ')
    targetQ_predict_value = self.target_model.value(next_obs)
    # Q(s',a'|θ)
    next_s_predict_value = self.model.value(next_obs)
    # argmax[Q(s',a'|θ)]
    greedy_action = fluid_argmax(next_s_predict_value)
    predict_onehot = fluid.layers.one_hot(greedy_action, self.action_dim)
    # Q(s', argmax[Q(s',a'|θ)] | θ')
    best_v = fluid.layers.reduce_sum(
        fluid.layers.elementwise_mul(predict_onehot, targetQ_predict_value),
        dim=1)
    best_v.stop_gradient = True
    # TD target: R + γ * Q(s', argmax[Q(s',a'|θ)] | θ')
    target = reward + (
        1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

    action_onehot = layers.one_hot(action, self.action_dim)
    action_onehot = layers.cast(action_onehot, dtype='float32')
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, pred_value), dim=1)

    # Compute the new TD error
    newTd = layers.abs(target - pred_action_value)
    cost = layers.square_error_cost(pred_action_value, target)
    # weight is the per-sample weight; it scales each sample's
    # contribution to the update
    cost = weight * cost
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr, epsilon=1e-3)
    optimizer.minimize(cost)
    return cost, newTd
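# This is the Double DQN update: the online network θ selects the greedy next
# action and the target network θ' evaluates it. When weight comes from a
# prioritized replay buffer, the returned newTd is presumably fed back as the
# new priorities. fluid_argmax is not a fluid builtin; a plausible definition
# (an assumption, based on one_hot expecting a trailing dimension of 1):
import paddle.fluid as fluid

def fluid_argmax(x):
    # Row-wise argmax, reshaped to [batch, 1] so one_hot accepts it
    return fluid.layers.reshape(
        fluid.layers.argmax(x, axis=1), shape=[-1, 1])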
def learn(self, obs, action, reward):
    act_prob = self.model(obs)
    # log_prob = layers.cross_entropy(act_prob, action)
    log_prob = layers.reduce_sum(
        -1.0 * layers.log(act_prob) * layers.one_hot(
            action, act_prob.shape[1]),
        dim=1)
    cost = log_prob * reward
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(cost)
    return cost
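# A quick numpy check (illustrative only) that the one-hot formulation above
# reproduces -log(prob of the chosen action), i.e. the commented-out
# cross_entropy line:
import numpy as np

act_prob = np.array([[0.2, 0.8]])            # softmax output for one sample
action = np.array([1])                       # index of the chosen action
onehot = np.eye(act_prob.shape[1])[action]   # [[0., 1.]]
log_prob = np.sum(-np.log(act_prob) * onehot, axis=1)
assert np.isclose(log_prob[0], -np.log(0.8))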
def learn(self, obs, action, reward):
    """ Update the policy model with the policy gradient algorithm """
    act_prob = self.model(obs)  # output action probabilities
    # log_prob = layers.cross_entropy(act_prob, action)  # cross entropy
    log_prob = layers.reduce_sum(
        -1.0 * layers.log(act_prob) * layers.one_hot(
            action, act_prob.shape[1]),
        dim=1)
    # Minimizing -log(pi(a|s)) * reward is gradient ascent on the
    # reward-weighted log-likelihood (REINFORCE)
    cost = log_prob * reward
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(cost)
    return cost
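# At interaction time the probabilities produced by self.model(obs) are
# usually sampled to choose an action. An illustrative numpy-side helper;
# the name sample_action is hypothetical:
import numpy as np

def sample_action(act_prob):
    act_prob = np.squeeze(act_prob, axis=0)  # [1, act_dim] -> [act_dim]
    return np.random.choice(len(act_prob), p=act_prob)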
def learn(self, obs, action, reward, next_obs, terminal):
    next_pred_value = self.target_model.value(next_obs)
    best_v = layers.reduce_max(next_pred_value, dim=-1)
    best_v.stop_gradient = True
    terminal = layers.cast(terminal, dtype="float32")
    target = reward + (1.0 - terminal) * self.gamma * best_v

    pred_value = self.model.value(obs)
    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype="float32")
    pred_action_value = layers.reduce_sum(
        layers.elementwise_mul(pred_value, action_onehot), dim=-1)

    cost = layers.square_error_cost(target, pred_action_value)
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
    optimizer.minimize(cost)
    return cost
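# The target network read by learn() above has to be refreshed periodically.
# A minimal sketch; sync_params_to is a PARL-style helper name and is an
# assumption about the surrounding Model class, not a fluid builtin:
def sync_target(self):
    """ Copy the weights of self.model into self.target_model """
    self.model.sync_params_to(self.target_model)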
def learn(self, obs, action, reward, next_obs, terminal):
    '''
    :param obs: St
    :param action: At
    :param reward: Rt+1
    :param next_obs: St+1
    :param terminal: done, True means the episode has ended
    :return: value of the loss function
    '''
    # Use the target network to compute target_Q
    target_Q_tensor = self.target_model.value(next_obs)  # value vector for St+1
    max_Q = layers.reduce_max(target_Q_tensor, dim=1)  # per-row max, reducing along dim=1
    max_Q.stop_gradient = True  # stop gradient updates

    # terminal is a tensor, not a scalar, so it cannot be used in a branch directly
    terminal = layers.cast(terminal, dtype="float32")
    target_Q = reward + (1.0 - terminal) * self.gamma * max_Q

    # Use the main network to compute predict_Q
    predict_Q_tensor = self.model.value(obs)
    # Convert action to a one-hot vector and cast each entry to float
    action_onehot = layers.one_hot(action, self.act_dim)
    action_onehot = layers.cast(action_onehot, dtype="float32")
    # Element-wise multiply, then reduce the tensor rank.
    # e.g. predict_Q_tensor = [[2.3, 5.7, 1.2, 3.9, 1.4],  action_onehot = [[0, 0, 0, 1, 0],
    #                          [2.1, 3.7, 4.5, 6.7, 7.1]]                   [0, 1, 0, 0, 0]]
    # the element-wise product is [[0, 0, 0, 3.9, 0],
    #                              [0, 3.7, 0, 0, 0]]
    # and reduce_sum along dim=1 gives [3.9, 3.7]
    predict_Q = layers.reduce_sum(
        layers.elementwise_mul(action_onehot, predict_Q_tensor), dim=1)

    # Average loss over every sample in the batch
    cost = layers.square_error_cost(predict_Q, target_Q)
    cost = layers.reduce_mean(cost)

    # Declare the optimizer (Adam)
    optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
    optimizer.minimize(cost)  # specify the optimization target
    return cost
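# The (obs, action, reward, next_obs, terminal) batches consumed by learn()
# usually come from an experience replay buffer. A minimal, self-contained
# sketch (not part of the original code) in plain numpy:
import random
import collections
import numpy as np

class ReplayMemory(object):
    def __init__(self, max_size):
        self.buffer = collections.deque(maxlen=max_size)

    def append(self, exp):
        # exp = (obs, action, reward, next_obs, terminal)
        self.buffer.append(exp)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        obs, act, rew, next_obs, done = zip(*batch)
        return (np.array(obs).astype('float32'),
                np.array(act).astype('int32'),
                np.array(rew).astype('float32'),
                np.array(next_obs).astype('float32'),
                np.array(done).astype('float32'))

    def __len__(self):
        return len(self.buffer)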
def learn(self, obs, action, reward):
    """
    :param obs: [B, 4]
    :param action: [B, 1]
    :param reward: [B]
    :return:
    """
    act_prob = self.model(obs)  # [B, 2]
    # [B, 2] -> [B]
    log_prob = layers.reduce_sum(
        -1.0 * layers.log(act_prob) * layers.one_hot(
            action, depth=act_prob.shape[1]),
        dim=1,
        keep_dim=False)
    cost = log_prob * reward
    cost = layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(cost)
    return cost
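# The reward argument above is normally the discounted return G_t computed
# over a finished episode, not the raw per-step reward. A common helper
# (an illustrative sketch; the normalization step is optional):
import numpy as np

def calc_discounted_reward(reward_list, gamma=0.99):
    G = np.zeros(len(reward_list), dtype='float32')
    running = 0.0
    # Walk backwards: G_t = r_t + gamma * G_{t+1}
    for t in reversed(range(len(reward_list))):
        running = reward_list[t] + gamma * running
        G[t] = running
    # Normalizing returns often stabilizes REINFORCE training
    return (G - G.mean()) / (G.std() + 1e-8)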
def test_param_sharing(self):
    """
    Test case for parameter sharing between layers of the same type
    """
    net = MyNetWork()
    ## we bind the params of embedding to those of fc1
    batch_size = 10
    dict_size = 100
    input_cx = np.random.uniform(0, 1, [batch_size, 100]).astype("float32")
    input_x = np.random.randint(
        dict_size, size=(batch_size, 1)).astype("int64")
    #################################
    main_program1 = fluid.Program()
    with fluid.program_guard(main_program1):
        x = layers.data(name='x', shape=[100], dtype="float32")
        y1 = net.fc1(input=x)
        y11 = net.fc1(input=x)
        y2 = net.fc2(input=x)
        y3 = net.fc3(input=x)
        y4 = net.fc4(input=x)

    main_program2 = fluid.Program()
    with fluid.program_guard(main_program2):
        x_ = layers.data(name='x', shape=[1], dtype="int64")
        cx_ = layers.cast(
            x=layers.one_hot(input=x_, depth=dict_size), dtype="float32")
        y1_ = net.fc1(input=cx_)
        y2_ = net.embedding(input=x_)
        x1_ = layers.data(name='x1', shape=[100], dtype="float32")
        y3_ = net.fc1(input=x1_)

    #### we run the startup program only once to make sure
    #### parameters are initialized only once across the two programs
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    ######################################################
    outputs = exe.run(main_program1,
                      feed={"x": input_cx},
                      fetch_list=[y1, y11, y2, y3, y4])
    old_y1 = outputs[0]
    ### calling the same layer twice reuses its parameters,
    ### while distinct layers keep distinct parameters
    self.assertEqual(
        np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten()))
    self.assertNotEqual(
        np.sum(outputs[1].flatten()), np.sum(outputs[2].flatten()))
    self.assertNotEqual(
        np.sum(outputs[3].flatten()), np.sum(outputs[4].flatten()))

    outputs = exe.run(main_program2,
                      feed={'x': input_x,
                            'x1': input_cx},
                      fetch_list=[y1_, y2_, y3_])
    ### test two different layers sharing the same parameter matrix
    self.assertEqual(
        np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten()))
    ### test that the same layer keeps the same parameters
    ### across two different programs
    self.assertEqual(
        np.sum(outputs[2].flatten()), np.sum(old_y1.flatten()))
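# MyNetWork itself is defined elsewhere in the test module. A sketch that is
# consistent with the assertions above (fc1 without bias so that the one-hot
# matmul can match the embedding lookup, and the embedding table bound to
# fc1's weights via param_attr). The import paths, sizes, and exact wrapper
# signatures follow the fluid-era PARL layout and are assumptions:
import parl.layers as layers
from parl.framework.model_base import Network

class MyNetWork(Network):
    def __init__(self):
        self.fc1 = layers.fc(size=64, bias_attr=False)
        self.fc2 = layers.fc(size=64, bias_attr=False)
        self.fc3 = layers.fc(size=64)
        self.fc4 = layers.fc(size=64)
        # Share fc1's weight matrix with the embedding table
        self.embedding = layers.embedding(
            size=(100, 64), param_attr=self.fc1.param_attr)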