    def learn(self, obs, action, reward):
        """ Update the policy model with the policy gradient algorithm.

        :param obs:    [B, 4]  batch of observations
        :param action: [B, 1]  batch of actions taken (int64)
        :param reward: [B]     batch of returns used to weight each log-prob
        :return: the scalar training cost to be minimized
        """
        act_prob = self.model(obs)  # output action probabilities, shape [B, 2]
        # Equivalent to: log_prob = layers.cross_entropy(act_prob, action)
        # [B, 2] -> [B]
        log_prob = layers.reduce_sum(
            -1.0 * layers.log(act_prob) * layers.one_hot(
                action, depth=act_prob.shape[1]),
            dim=1, keep_dim=False)
        cost = log_prob * reward          # weight each step by its return
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(cost)
        return cost