Example #1
    def learn(self, obs, action, reward, next_obs, terminal):
        """ 使用DQN算法更新self.model的value网络
        """
        # 从target_model中获取 max Q' 的值,用于计算target_Q
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True  # 阻止梯度传递
        terminal = layers.cast(terminal, dtype='float32')
        target = reward + (1.0 - terminal) * self.gamma * best_v

        pred_value = self.model.value(obs)  # 获取Q预测值
        # 将action转onehot向量,比如:3 => [0,0,0,1,0],独热编码有好处
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        # 下面一行是逐元素相乘,拿到action对应的 Q(s,a)
        # 比如:pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]]
        #  ==> pred_action_value = [[3.9]]
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            action_onehot, pred_value),
                                              dim=1)

        # 计算 Q(s,a) 与 target_Q的均方差,得到loss
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)  # 使用Adam优化器
        optimizer.minimize(cost)
        return cost
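
As a sanity check on the update above, here is a minimal NumPy sketch of the same target-Q and loss computation; the batch size, gamma, and all array values are made-up illustrative numbers, not anything taken from the example.

import numpy as np

# Illustrative batch of 2 transitions with 5 discrete actions (assumed values).
gamma = 0.99
reward = np.array([1.0, 0.0], dtype=np.float32)
terminal = np.array([0.0, 1.0], dtype=np.float32)   # second transition ends its episode
next_q = np.array([[2.3, 5.7, 1.2, 3.9, 1.4],
                   [0.5, 0.1, 0.8, 0.2, 0.3]], dtype=np.float32)  # target_model.value(next_obs)
q = np.array([[2.0, 5.0, 1.0, 3.5, 1.5],
              [0.4, 0.2, 0.9, 0.1, 0.2]], dtype=np.float32)       # model.value(obs)
action = np.array([1, 2])

best_v = next_q.max(axis=1)                            # max_a' Q'(s', a')
target = reward + (1.0 - terminal) * gamma * best_v    # terminal rows keep only the reward
pred_action_value = q[np.arange(2), action]            # Q(s, a) of the taken actions
loss = np.mean((pred_action_value - target) ** 2)      # mean squared TD error
print(target, pred_action_value, loss)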
Example #2
    def define_learn(self, obs, action, reward, next_obs, terminal, weight):
        # Q(s,a|θ)
        pred_value = self.model.value(obs)
        # Q(s',a'|θ')
        targetQ_predict_value = self.target_model.value(next_obs)
        # Q(s',a'|θ)
        next_s_predict_value = self.model.value(next_obs)
        # argmax_a' Q(s',a'|θ)
        greedy_action = fluid_argmax(next_s_predict_value)
        predict_onehot = fluid.layers.one_hot(greedy_action, self.action_dim)
        # Q(s', argmax_a' Q(s',a'|θ) | θ')
        best_v = fluid.layers.reduce_sum(fluid.layers.elementwise_mul(
            predict_onehot, targetQ_predict_value),
                                         dim=1)
        best_v.stop_gradient = True
        # TD target: R + γ * Q(s', argmax_a' Q(s',a'|θ) | θ')
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.action_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            action_onehot, pred_value),
                                              dim=1)

        # New TD error, returned so the caller can update sample priorities
        newTd = layers.abs(target - pred_action_value)
        cost = layers.square_error_cost(pred_action_value, target)
        # weight is the per-sample importance weight; it scales each sample's contribution to the cost
        cost = weight * cost
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, newTd
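
Example #2 is a Double DQN update with prioritized-replay weighting: the greedy next action is chosen by the online network, its value is read from the target network, and the absolute TD error newTd is returned so the caller can refresh sample priorities. Below is a minimal NumPy sketch of that target and of the weighted loss; the 3-action layout and all values are illustrative assumptions.

import numpy as np

gamma = 0.99
reward = np.array([1.0, 0.0], dtype=np.float32)
terminal = np.array([0.0, 0.0], dtype=np.float32)
q_next_online = np.array([[1.0, 3.0, 2.0],    # Q(s',.|θ): picks the greedy action
                          [0.2, 0.1, 0.4]], dtype=np.float32)
q_next_target = np.array([[0.8, 2.5, 2.9],    # Q(s',.|θ'): evaluates that action
                          [0.3, 0.2, 0.1]], dtype=np.float32)
q_online = np.array([[1.1, 2.8, 1.9],
                     [0.5, 0.1, 0.3]], dtype=np.float32)
action = np.array([1, 0])
weight = np.array([1.0, 0.5], dtype=np.float32)  # importance-sampling weights from the replay buffer

greedy_action = q_next_online.argmax(axis=1)               # argmax_a' Q(s',a'|θ)
best_v = q_next_target[np.arange(2), greedy_action]        # Q(s', greedy_action | θ')
target = reward + (1.0 - terminal) * gamma * best_v
pred_action_value = q_online[np.arange(2), action]
new_td = np.abs(target - pred_action_value)                # becomes the new priority
loss = np.mean(weight * (pred_action_value - target) ** 2) # weighted MSE
print(new_td, loss)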
Example #3
    def learn(self, obs, action, reward):
        act_prob = self.model(obs)
        # log_prob = layers.cross_entropy(act_prob, action)
        log_prob = layers.reduce_sum(
            -1.0 * layers.log(act_prob) * layers.one_hot(
                action, act_prob.shape[1]),
            dim=1)
        cost = log_prob * reward
        cost = layers.reduce_mean(cost)

        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(cost)
        return cost
Example #4
    def learn(self, obs, action, reward):
        """Update the policy model with the policy gradient algorithm."""
        act_prob = self.model(obs)  # predicted action probabilities
        # log_prob = layers.cross_entropy(act_prob, action)  # cross entropy
        log_prob = layers.reduce_sum(-1.0 * layers.log(act_prob) *
                                     layers.one_hot(action, act_prob.shape[1]),
                                     dim=1)
        cost = log_prob * reward
        cost = layers.reduce_mean(cost)
        print('====loss', cost)  # prints the cost Variable (the graph node), not its runtime value
        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(cost)
        return cost
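
Examples #3 and #4 implement the same REINFORCE update: the per-sample cost is the negative log-probability of the taken action scaled by the return passed in as reward, so minimizing it raises the probability of actions that led to high return. A minimal NumPy sketch with illustrative probabilities and returns:

import numpy as np

act_prob = np.array([[0.2, 0.8],     # softmax output of the policy network (assumed values)
                     [0.6, 0.4]], dtype=np.float32)
action = np.array([1, 0])
reward = np.array([2.0, -1.0], dtype=np.float32)   # discounted returns fed in as "reward"

neg_log_prob = -np.log(act_prob[np.arange(2), action])  # same as the one-hot * log trick above
loss = np.mean(neg_log_prob * reward)
print(loss)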
Example #5
    def learn(self, obs, action, reward, next_obs, terminal):
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=-1)
        best_v.stop_gradient = True
        terminal = layers.cast(terminal, dtype="float32")
        target = reward + (1.0 - terminal) * self.gamma * best_v

        pred_value = self.model.value(obs)
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype="float32")
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            pred_value, action_onehot),
                                              dim=-1)

        cost = layers.square_error_cost(target, pred_action_value)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)
        return cost
Example #6
    def learn(self, obs, action, reward, next_obs, terminal):
        '''
        :param obs: St
        :param action: At
        :param reward: Rt+1
        :param next_obs: St+1
        :param terminal: done flag, True means the episode has ended
        :return: value of the loss function
        '''

        # Compute target_Q with the target network
        target_Q_tensor = self.target_model.value(next_obs)  # value vector for St+1
        max_Q = layers.reduce_max(target_Q_tensor, dim=1)  # max of each row, reduced along dim=1
        max_Q.stop_gradient = True  # stop gradients through the target

        # terminal is a tensor, not a scalar, so it cannot be used in a Python conditional
        terminal = layers.cast(terminal, dtype="float32")
        target_Q = reward + (1.0 - terminal) * self.gamma * max_Q

        # Compute predict_Q with the main network
        predict_Q_tensor = self.model.value(obs)
        # Convert action to a one-hot vector and cast every element to float
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype="float32")
        # Element-wise multiply, then reduce the tensor rank.
        # e.g. predict_Q_tensor = [[2.3, 5.7, 1.2, 3.9, 1.4],   action_onehot = [[0, 0, 0, 1, 0],
        #                          [2.1, 3.7, 4.5, 6.7, 7.1]]                    [0, 1, 0, 0, 0]]
        # The element-wise product is [[0, 0, 0, 3.9, 0],
        #                              [0, 3.7, 0, 0, 0]]
        # and reduce_sum over dim=1 gives [3.9, 3.7]
        predict_Q = layers.reduce_sum(layers.elementwise_mul(
            action_onehot, predict_Q_tensor),
                                      dim=1)

        # Mean loss over every sample in the batch
        cost = layers.square_error_cost(predict_Q, target_Q)
        cost = layers.reduce_mean(cost)

        # Declare the optimizer (Adam)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)  # specify the optimization target

        return cost
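
The one-hot mask followed by reduce_sum in example #6 is simply a way to gather Q(s, a) per row without an explicit gather op. A quick NumPy check of the numbers worked through in the comment above:

import numpy as np

predict_Q_tensor = np.array([[2.3, 5.7, 1.2, 3.9, 1.4],
                             [2.1, 3.7, 4.5, 6.7, 7.1]], dtype=np.float32)
action = np.array([3, 1])
action_onehot = np.eye(5, dtype=np.float32)[action]

masked = action_onehot * predict_Q_tensor   # [[0, 0, 0, 3.9, 0], [0, 3.7, 0, 0, 0]]
predict_Q = masked.sum(axis=1)              # [3.9, 3.7]
assert np.allclose(predict_Q, predict_Q_tensor[np.arange(2), action])
print(predict_Q)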
Example #7
File: algo.py Project: Feynman1999/myRL
    def learn(self, obs, action, reward):
        """

        :param obs:      [B,4]
        :param action:   [B,1]
        :param reward:   [B,]
        :return:
        """
        act_prob = self.model(obs)  # [B,2]
        # [B, 2] -> [B, ]
        log_prob = layers.reduce_sum(
            -1.0 * layers.log(act_prob) *
            layers.one_hot(action, depth=act_prob.shape[1]),
            dim=1,
            keep_dim=False)
        cost = log_prob * reward
        cost = layers.reduce_mean(cost)

        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(cost)
        return cost
Example #8
    def test_param_sharing(self):
        """
        Test case for parameter sharing between layers of the same type
        """
        net = MyNetWork()
        ## we bind the parameters of the embedding to those of fc1
        batch_size = 10
        dict_size = 100
        input_cx = np.random.uniform(0, 1, [batch_size, 100]).astype("float32")
        input_x = np.random.randint(dict_size,
                                    size=(batch_size, 1)).astype("int64")
        #################################

        main_program1 = fluid.Program()
        with fluid.program_guard(main_program1):
            x = layers.data(name='x', shape=[100], dtype="float32")
            y1 = net.fc1(input=x)
            y11 = net.fc1(input=x)
            y2 = net.fc2(input=x)
            y3 = net.fc3(input=x)
            y4 = net.fc4(input=x)

        main_program2 = fluid.Program()
        with fluid.program_guard(main_program2):
            x_ = layers.data(name='x', shape=[1], dtype="int64")
            cx_ = layers.cast(x=layers.one_hot(input=x_, depth=dict_size),
                              dtype="float32")
            y1_ = net.fc1(input=cx_)
            y2_ = net.embedding(input=x_)

            x1_ = layers.data(name='x1', shape=[100], dtype="float32")
            y3_ = net.fc1(input=x1_)

        #### we run the startup program only once to make sure
        #### only one para init across the two programs
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        ######################################################

        outputs = exe.run(main_program1,
                          feed={"x": input_cx},
                          fetch_list=[y1, y11, y2, y3, y4])
        old_y1 = outputs[0]
        self.assertEqual(np.sum(outputs[0].flatten()),
                         np.sum(outputs[1].flatten()))
        self.assertNotEqual(np.sum(outputs[1].flatten()),
                            np.sum(outputs[2].flatten()))
        self.assertNotEqual(np.sum(outputs[3].flatten()),
                            np.sum(outputs[4].flatten()))

        outputs = exe.run(main_program2,
                          feed={
                              'x': input_x,
                              'x1': input_cx
                          },
                          fetch_list=[y1_, y2_, y3_])

        ### test two different layers sharing the same para matrix
        self.assertEqual(np.sum(outputs[0].flatten()),
                         np.sum(outputs[1].flatten()))
        ### test if the same layer can have the same parameters across two different programs
        self.assertEqual(np.sum(outputs[2].flatten()),
                         np.sum(old_y1.flatten()))
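
The first assertion on main_program2 works because a fully connected layer applied to a one-hot input reduces to a row lookup in its weight matrix, which is exactly what the embedding layer computes when both share that matrix (this presumes fc1 is linear with no bias). A minimal NumPy sketch of that identity, with assumed shapes:

import numpy as np

dict_size, emb_size = 100, 64                 # assumed sizes, for illustration only
rng = np.random.default_rng(0)
W = rng.normal(size=(dict_size, emb_size)).astype("float32")  # the shared parameter matrix

x = np.array([7, 42])                         # integer ids
one_hot = np.eye(dict_size, dtype="float32")[x]

fc_out = one_hot @ W                          # bias-free, linear fc on the one-hot input
emb_out = W[x]                                # embedding lookup with the same matrix
assert np.allclose(fc_out, emb_out)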