Example #1
    def train(self):
        """train"""
        inputs = self.model.create_inputs(mode='train')
        output_dict = self.model.forward(inputs, mode='train')

        total_loss = 0
        if 'click' in self._output_type:
            click_id = inputs['click_id']
            click_prob = output_dict['click_prob']
            click_loss = layers.reduce_mean(
                layers.cross_entropy(input=click_prob, label=click_id))
            total_loss += click_loss
        if 'credit' in self._output_type:
            credit = inputs['credit'] * self._credit_scale
            credit_pred = output_dict['credit_pred']
            credit_loss = layers.reduce_mean(
                layers.square_error_cost(input=credit_pred, label=credit))
            total_loss += credit_loss
        if 'rate' in self._output_type:
            rate = layers.cast(inputs['click_id'],
                               'float32') * self._rate_scale
            rate_pred = output_dict['rate_pred']
            rate_loss = layers.reduce_mean(
                layers.square_error_cost(input=rate_pred, label=rate))
            total_loss += rate_loss

        if self.optimizer == 'Adam':
            optimizer = fluid.optimizer.Adam(learning_rate=self.lr,
                                             epsilon=1e-4)
        elif self.optimizer == 'SGD':
            optimizer = fluid.optimizer.SGD(learning_rate=self.lr)
        optimizer.minimize(total_loss)

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = total_loss  # don't rename 'loss'; it is used by the parallel executor in the computational task
        if 'click' in self._output_type:
            fetch_dict['click_prob'] = click_prob
            fetch_dict['click_id'] = click_id
            fetch_dict['click_loss'] = click_loss
        if 'credit' in self._output_type:
            fetch_dict['credit_pred'] = credit_pred / self._credit_scale
            fetch_dict['credit'] = credit / self._credit_scale
            fetch_dict['credit_loss'] = credit_loss
        if 'rate' in self._output_type:
            fetch_dict['rate_pred'] = rate_pred / self._rate_scale
            fetch_dict['rate'] = rate / self._rate_scale
            fetch_dict['rate_loss'] = rate_loss
        return {'fetch_dict': fetch_dict}
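Example #1 sums up to three heads into a single scalar: a cross-entropy loss on click_id plus squared-error losses on the scaled credit and rate targets. The following NumPy sketch (made-up batch values, a stand-in 0.1 for self._credit_scale, and only the click and credit heads) shows how those terms combine; it is an illustration, not the author's code.

    import numpy as np

    click_id = np.array([1, 0])                      # ground-truth clicks
    click_prob = np.array([[0.2, 0.8], [0.7, 0.3]])  # softmax output of the click head
    credit = np.array([3.0, 5.0]) * 0.1              # stand-in for inputs['credit'] * _credit_scale
    credit_pred = np.array([0.28, 0.52])

    # cross_entropy takes -log of the probability assigned to the true class
    click_loss = np.mean(-np.log(click_prob[np.arange(2), click_id]))
    # square_error_cost is an element-wise squared difference
    credit_loss = np.mean((credit_pred - credit) ** 2)

    total_loss = click_loss + credit_loss            # the 'rate' head would add a third term the same way
    print(click_loss, credit_loss, total_loss)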
Example #2
    def learn(self, obs, action, reward, next_obs, terminal):
        """ 使用DQN算法更新self.model的value网络
        """
        # 从target_model中获取 max Q' 的值,用于计算target_Q
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True  # 阻止梯度传递
        terminal = layers.cast(terminal, dtype='float32')
        target = reward + (1.0 - terminal) * self.gamma * best_v

        pred_value = self.model.value(obs)  # 获取Q预测值
        # 将action转onehot向量,比如:3 => [0,0,0,1,0],独热编码有好处
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        # 下面一行是逐元素相乘,拿到action对应的 Q(s,a)
        # 比如:pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]]
        #  ==> pred_action_value = [[3.9]]
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            action_onehot, pred_value),
                                              dim=1)

        # 计算 Q(s,a) 与 target_Q的均方差,得到loss
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)  # 使用Adam优化器
        optimizer.minimize(cost)
        return cost
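The target above is the standard one-step Q-learning target R + (1 - done) * γ * max_a Q'(s', a). A stand-alone NumPy sketch with invented numbers (γ = 0.9) shows what that line produces for a non-terminal and a terminal transition:

    import numpy as np

    gamma = 0.9
    reward = np.array([1.0, 0.0])
    terminal = np.array([0.0, 1.0])               # the second transition ends the episode
    next_pred_value = np.array([[2.3, 5.7, 1.2],  # Q'(s', a) from the target network
                                [0.4, 0.1, 0.9]])

    best_v = next_pred_value.max(axis=1)          # reduce_max over dim=1 -> [5.7, 0.9]
    target = reward + (1.0 - terminal) * gamma * best_v
    print(target)                                 # [6.13, 0.0]: terminal states add no bootstrap term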
Example #3
    def define_learn(self, obs, action, reward, next_obs, terminal, weight):
        #Q(s,a|θ)
        pred_value = self.model.value(obs)
        #Q(s',a'|θ')
        targetQ_predict_value = self.target_model.value(next_obs)
        #Q(s',a'|θ)
        next_s_predict_value = self.model.value(next_obs)
        #argMax[Q(s',a'|θ)]
        greedy_action = fluid_argmax(next_s_predict_value)
        predict_onehot = fluid.layers.one_hot(greedy_action, self.action_dim)
        #Q(s',argMax[Q(s',a'|θ)]|θ')
        best_v = fluid.layers.reduce_sum(
            fluid.layers.elementwise_mul(predict_onehot, targetQ_predict_value),
            dim=1)
        best_v.stop_gradient = True
        #TD target: R + γ*Q(s',argMax[Q(s',a'|θ)]|θ')
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.action_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, pred_value), dim=1)

        #Compute the new TD error
        newTd = layers.abs(target - pred_action_value)
        cost = layers.square_error_cost(pred_action_value, target)
        #weight is the per-sample importance weight; it scales each sample's contribution to the cost
        cost = weight * cost
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, newTd
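Example #3 is the Double DQN update: the online network (θ) picks the greedy action for s', while the target network (θ') evaluates it, which is exactly what the one_hot/elementwise_mul/reduce_sum sequence implements. A NumPy sketch with invented Q values makes the difference from plain DQN visible:

    import numpy as np

    gamma = 0.9
    reward, terminal = np.array([1.0]), np.array([0.0])
    q_online_next = np.array([[1.0, 4.0, 2.0]])   # Q(s', a | θ): chooses the action
    q_target_next = np.array([[0.5, 3.0, 9.0]])   # Q(s', a | θ'): evaluates it

    greedy_action = q_online_next.argmax(axis=1)  # -> [1]
    onehot = np.eye(q_online_next.shape[1])[greedy_action]
    best_v = (onehot * q_target_next).sum(axis=1) # -> [3.0], not the target net's own max (9.0)
    target = reward + (1.0 - terminal) * gamma * best_v
    print(target)                                 # [3.7]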
Example #4
    def train_critic(inputs, click_id):
        output_dict = self.model.forward(inputs, output_type='c_Q')
        c_Q = output_dict['Q']
        target_Q = self.get_target_Q(inputs, click_id)
        target_Q.stop_gradient = True
        critic_loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q))
        if self.optimizer == 'Adam':
            optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-4)
        elif self.optimizer == 'SGD':
            optimizer = fluid.optimizer.SGD(learning_rate=self.lr)
        optimizer.minimize(critic_loss)
        return critic_loss
Example #5
    def _critic_learn(self, obs, action, reward, next_obs, terminal):
        next_action = self.target_model.policy(next_obs)
        next_Q = self.target_model.value(next_obs, next_action)

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
        target_Q.stop_gradient = True

        Q = self.model.value(obs, action)
        cost = layers.square_error_cost(Q, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost
Example #6
    def test(self):
        """test"""
        inputs = self.model.create_inputs(mode='train')
        click_id = layers.cast(inputs['click_id'], 'float32') * self._reward_scale

        output_dict = self.model.forward(inputs, output_type='c_Q')
        c_Q = output_dict['Q']
        target_Q = self.get_target_Q(inputs, click_id)
        loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q))

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = loss
        fetch_dict['c_Q'] = c_Q / self._reward_scale
        fetch_dict['click_id'] = click_id / self._reward_scale
        return {'fetch_dict': fetch_dict}
Example #7
    def test(self):
        """test"""
        inputs = self.model.create_inputs(mode='train')
        reward = layers.cast(inputs['reward'], 'float32')

        c_Q = self.model.forward(inputs, output_type='c_Q')
        max_Q = self.target_model.forward(inputs, output_type='max_Q')
        target_Q = self.get_target_Q(max_Q, reward)

        loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q))

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = loss
        fetch_dict['c_Q'] = c_Q
        fetch_dict['reward'] = reward
        return {'fetch_dict': fetch_dict}
Example #8
    def learn(self, obs, action, reward, next_obs, terminal):
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=-1)
        best_v.stop_gradient = True
        terminal = layers.cast(terminal, dtype="float32")
        target = reward + (1.0 - terminal) * self.gamma * best_v

        pred_value = self.model.value(obs)
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype="float32")
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            pred_value, action_onehot),
                                              dim=-1)

        cost = layers.square_error_cost(target, pred_action_value)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)
        return cost
Example #9
    def learn(self, obs, action, reward, next_obs, terminal):
        '''
        :param obs: St
        :param action: At
        :param reward: Rt+1
        :param next_obs: St+1
        :param terminal: done flag; True means the episode has ended
        :return: value of the loss function
        '''

        # Compute target_Q with the target network
        target_Q_tensor = self.target_model.value(next_obs)  # value vector for St+1
        max_Q = layers.reduce_max(target_Q_tensor, dim=1)  # max of each row, reducing along dim=1
        max_Q.stop_gradient = True  # stop gradient updates

        # terminal is a tensor rather than a scalar, so it cannot be used in a plain if-test
        terminal = layers.cast(terminal, dtype="float32")
        target_Q = reward + (1.0 - terminal) * self.gamma * max_Q

        # Compute predict_Q with the main network
        predict_Q_tensor = self.model.value(obs)
        # Turn action into a one-hot vector and cast every element to float
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype="float32")
        # Element-wise multiply, then reduce the tensor rank.
        # E.g. predict_Q_tensor = [[2.3, 5.7, 1.2, 3.9, 1.4],  action_onehot = [[0, 0, 0, 1, 0],
        #                          [2.1, 3.7, 4.5, 6.7, 7.1]]                   [0, 1, 0, 0, 0]]
        # The element-wise product is then [[0, 0, 0, 3.9, 0],
        #                                   [0, 3.7, 0, 0, 0]]
        # and reduce_sum over dim=1 gives [3.9, 3.7]
        predict_Q = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, predict_Q_tensor), dim=1)

        # Average the per-sample loss over the batch
        cost = layers.square_error_cost(predict_Q, target_Q)
        cost = layers.reduce_mean(cost)

        # Declare the optimizer (Adam)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)  # specify the optimization target

        return cost
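The comments above walk through the masking trick with concrete numbers; the same arithmetic can be checked outside Fluid with plain NumPy:

    import numpy as np

    predict_Q_tensor = np.array([[2.3, 5.7, 1.2, 3.9, 1.4],
                                 [2.1, 3.7, 4.5, 6.7, 7.1]])
    action_onehot = np.array([[0, 0, 0, 1, 0],
                              [0, 1, 0, 0, 0]], dtype='float32')

    # element-wise product keeps only the chosen action's Q value in each row
    predict_Q = (action_onehot * predict_Q_tensor).sum(axis=1)
    print(predict_Q)                              # [3.9 3.7], matching the comment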
Example #10
    def _critic_learn(self, obs, action, reward, next_obs, terminal):
        next_action = self.target_model.policy(next_obs)
        next_Q = self.target_model.value(next_obs, next_action)

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
        target_Q.stop_gradient = True

        Q = self.model.value(obs, action)
        cost = layers.square_error_cost(Q, target_Q)
        cost = layers.reduce_mean(cost)

        # optimizer = fluid.optimizer.AdamOptimizer(self.critic_lrvalue)
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=self.boundaries, values=self.critic_lrvalue),
            regularization=fluid.regularizer.L2Decay(1e-4))

        optimizer.minimize(cost)
        return cost
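fluid.layers.piecewise_decay keeps the learning rate constant between the given step boundaries, and values needs one more entry than boundaries. Below is a plain-Python sketch of the schedule it builds, with hypothetical numbers (self.boundaries and self.critic_lrvalue are not shown in the snippet):

    boundaries = [10000, 20000]      # hypothetical step boundaries
    values = [1e-3, 5e-4, 1e-4]      # needs len(boundaries) + 1 learning rates

    def lr_at(step):
        for boundary, value in zip(boundaries, values):
            if step < boundary:
                return value
        return values[-1]

    for step in [0, 9999, 10000, 25000]:
        print(step, lr_at(step))     # 1e-3, 1e-3, 5e-4, 1e-4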
Example #11
    def train(self):
        """train"""
        inputs = self.model.create_inputs(mode='train')
        reward = layers.cast(inputs['reward'], 'float32')

        c_Q = self.model.forward(inputs, output_type='c_Q')
        max_Q = self.target_model.forward(inputs, output_type='max_Q')
        target_Q = self.get_target_Q(max_Q, reward)
        loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q))

        if self.optimizer == 'Adam':
            optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-4)
        elif self.optimizer == 'SGD':
            optimizer = fluid.optimizer.SGD(learning_rate=self.lr)
        optimizer.minimize(loss)

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = loss  # don't rename 'loss'; it is used by the parallel executor in the computational task
        fetch_dict['c_Q'] = c_Q
        fetch_dict['reward'] = reward
        return {'fetch_dict': fetch_dict}
Example #12
    def train(self):
        """train"""
        inputs = self.model.create_inputs(mode='train')
        click_id = layers.cast(inputs['click_id'], 'float32') * self._reward_scale

        output_dict = self.model.forward(inputs, output_type='c_Q')
        c_Q = output_dict['Q']
        target_Q = self.get_target_Q(inputs, click_id)
        target_Q.stop_gradient = True
        loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q))

        if self.optimizer == 'Adam':
            optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-4)
        elif self.optimizer == 'SGD':
            optimizer = fluid.optimizer.SGD(learning_rate=self.lr)
        optimizer.minimize(loss)

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = loss  # don't rename 'loss'; it is used by the parallel executor in the computational task
        fetch_dict['c_Q'] = c_Q / self._reward_scale
        fetch_dict['click_id'] = click_id / self._reward_scale
        return {'fetch_dict': fetch_dict}
Example #13
    def learn(self, obs, label):
        pred_output = self.model.policy(obs)
        # compare the model's prediction (not the raw obs) against the label
        cost = layers.square_error_cost(pred_output, label)
        cost = fluid.layers.reduce_mean(cost)
        return cost