Example #1
File: dqn.py Project: YuechengLiu/PARL
    def learn(self,
              obs,
              action,
              reward,
              next_obs,
              terminal,
              learning_rate=None):
        """ update value model self.model with DQN algorithm
        """
        # Support the modification of learning_rate
        if learning_rate is None:
            assert isinstance(
                self.lr,
                float), "Please set the learning rate of DQN in initializaion."
            learning_rate = self.lr

        pred_value = self.model.value(obs)
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, pred_value), dim=1)
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(
            learning_rate=learning_rate, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost
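
The target above is the standard one-step DQN bootstrap, reward + (1 - terminal) * gamma * max_a' Q_target(next_obs, a'), with gradients blocked through best_v. For reference, a minimal NumPy sketch of the same target computation (array names rewards, dones, next_q_target are illustrative, not taken from the code above):

import numpy as np

def dqn_target(rewards, dones, next_q_target, gamma=0.99):
    # Bootstrap from the max target-network Q-value; zero it on terminal steps.
    best_v = next_q_target.max(axis=1)
    return rewards + (1.0 - dones.astype('float32')) * gamma * best_v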
Example #2
    def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
        # print("obs:",obs)
        # raise NotImplementedError
        # obs = layers.squeeze(input=obs,axes=[-1])
        pred_value = self.model.value(obs)
        action_onehot = layers.one_hot(action, self.act_dim)
        pred_action_value = layers.reduce_sum(action_onehot * pred_value,
                                              dim=1)

        # calculate the target q value
        next_action_value = self.model.value(next_obs)
        greedy_action = layers.argmax(next_action_value, axis=-1)
        greedy_action = layers.unsqueeze(greedy_action, axes=[1])
        greedy_action_onehot = layers.one_hot(greedy_action, self.act_dim)
        next_pred_value = self.target_model.value(next_obs)
        max_v = layers.reduce_sum(greedy_action_onehot * next_pred_value,
                                  dim=1)
        max_v.stop_gradient = True

        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * max_v
        delta = layers.abs(target - pred_action_value)
        cost = sample_weight * layers.square_error_cost(
            pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, delta
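
The second return value, delta, is the absolute TD error of each transition; with a prioritized replay buffer it is typically used to refresh priorities after the update. A rough sketch of that step, assuming delta is a NumPy array and a hypothetical buffer.update_priority(idx, priority) method (neither is part of the code above):

def update_priorities(buffer, idxs, delta, alpha=0.6, eps=1e-6):
    # Proportional prioritization: p_i = (|delta_i| + eps) ** alpha.
    priorities = (abs(delta) + eps) ** alpha
    for idx, p in zip(idxs, priorities):
        buffer.update_priority(idx, p)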
Example #3
    def logp(self, actions, eps=1e-6):
        """
        Args:
            actions: An int64 tensor with shape [BATCH_SIZE]
            eps: A small float constant that avoids underflows when computing the log probability

        Returns:
            actions_log_prob: A float32 tensor with shape [BATCH_SIZE]
        """
        assert len(actions.shape) == 1

        logits = self.logits - layers.reduce_max(self.logits, dim=1)
        e_logits = layers.exp(logits)
        z = layers.reduce_sum(e_logits, dim=1)
        prob = e_logits / z

        actions = layers.unsqueeze(actions, axes=[1])
        actions_onehot = layers.one_hot(actions, prob.shape[1])
        actions_onehot = layers.cast(actions_onehot, dtype='float32')
        actions_prob = layers.reduce_sum(prob * actions_onehot, dim=1)

        actions_prob = actions_prob + eps
        actions_log_prob = layers.log(actions_prob)

        return actions_log_prob
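
The body is a numerically stabilized categorical log-probability: subtracting the per-row maximum before exponentiating avoids overflow, and eps guards the final log against underflow. An equivalent NumPy computation for reference (names are illustrative):

import numpy as np

def categorical_logp(logits, actions, eps=1e-6):
    shifted = logits - logits.max(axis=1, keepdims=True)   # stabilization
    probs = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)
    return np.log(probs[np.arange(len(actions)), actions] + eps)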
Example #4
File: td3.py Project: YuechengLiu/PARL
    def critic_learn(self, obs, action, reward, next_obs, terminal):
        noise = layers.gaussian_random_batch_size_like(
            action, shape=[-1, action.shape[1]])
        noise = layers.clip(noise * self.policy_noise,
                            min=-self.noise_clip,
                            max=self.noise_clip)
        next_action = self.target_model.policy(next_obs) + noise
        next_action = layers.clip(next_action, -self.max_action,
                                  self.max_action)

        next_Q1, next_Q2 = self.target_model.value(next_obs, next_action)
        next_Q = layers.elementwise_min(next_Q1, next_Q2)

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
        target_Q.stop_gradient = True

        current_Q1, current_Q2 = self.model.value(obs, action)
        cost = layers.square_error_cost(current_Q1,
                                        target_Q) + layers.square_error_cost(
                                            current_Q2, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost
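
This is TD3's critic update: clipped Gaussian noise smooths the target action (target policy smoothing) and the minimum of the two target critics is bootstrapped (clipped double-Q learning). A NumPy sketch of just the smoothed target action, with parameter defaults chosen for illustration:

import numpy as np

def smoothed_target_action(next_action, max_action, policy_noise=0.2, noise_clip=0.5):
    noise = np.clip(np.random.normal(size=next_action.shape) * policy_noise,
                    -noise_clip, noise_clip)
    return np.clip(next_action + noise, -max_action, max_action)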
Example #5
File: dqn.py Project: YuechengLiu/PARL
    def cal_bellman_residual(self, obs, action, reward, next_obs, terminal):
        """ use self.model to get squared Bellman residual with fed data
        """
        pred_value = self.model.value(obs)
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, pred_value), dim=1)
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        return cost
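
This method builds the same target and prediction as learn in Example #1 but stops before creating an optimizer, so it only reports the mean squared Bellman residual (useful for evaluation or logging). A standalone NumPy equivalent, assuming the Q-value arrays for a batch are already available:

import numpy as np

def bellman_residual(q_pred, q_next_target, actions, rewards, dones, gamma=0.99):
    target = rewards + (1.0 - dones) * gamma * q_next_target.max(axis=1)
    pred = q_pred[np.arange(len(actions)), actions]
    return np.mean((pred - target) ** 2)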
Example #6
    def sample(self):
        """
        Returns:
            sample_action: A float32 tensor with shape [BATCH_SIZE, NUM_ACTIONS],
                           a softmax over the Gumbel-perturbed logits (i.e. a
                           soft sample from the action distribution).
        """
        eps = 1e-4
        logits_shape = layers.cast(layers.shape(self.logits), dtype='int64')
        uniform = layers.uniform_random(logits_shape, min=eps, max=1.0 - eps)
        soft_uniform = layers.log(-1.0 * layers.log(uniform))
        return layers.softmax(self.logits - soft_uniform, axis=-1)
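
Subtracting log(-log(u)) adds Gumbel(0, 1) noise to the logits, so an argmax over the perturbed logits would be an exact categorical sample (the Gumbel-max trick); returning a softmax instead yields a soft, differentiable sample. A NumPy illustration of the underlying trick:

import numpy as np

def gumbel_max_sample(logits, eps=1e-4):
    u = np.random.uniform(eps, 1.0 - eps, size=logits.shape)
    gumbel = -np.log(-np.log(u))   # Gumbel(0, 1) noise
    return np.argmax(logits + gumbel, axis=-1)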
Example #7
    def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
        """ update value model self.model with DQN algorithm
        """

        pred_value = self.model.value(obs)
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(action_onehot * pred_value,
                                              dim=1)
        delta = layers.abs(target - pred_action_value)
        cost = sample_weight * layers.square_error_cost(
            pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, delta  # `delta` is the TD-error
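
Here sample_weight plays the role of the per-transition importance-sampling weight from prioritized replay, scaling each squared error before the mean. A hypothetical sketch of how such weights are commonly derived from stored priorities (the beta exponent and max-normalization are conventions, not taken from the code above):

import numpy as np

def importance_weights(priorities, beta=0.4):
    probs = priorities / priorities.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()   # scale so the largest weight is 1.0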
Example #8
    def _critic_learn(self, obs, action, reward, next_obs, terminal):
        next_action = self.target_model.policy(next_obs)
        next_Q = self.target_model.value(next_obs, next_action)

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
        target_Q.stop_gradient = True

        Q = self.model.value(obs, action)
        cost = layers.square_error_cost(Q, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost
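
This is the plain DDPG critic update: a single target critic bootstraps from the target policy's action, with no smoothing noise and no minimum over two critics as in the TD3 example. The target reduces to the following one-liner (NumPy sketch, illustrative names):

import numpy as np

def ddpg_target(reward, done, next_q, gamma=0.99):
    # One-step bootstrap from the single target critic.
    return reward + (1.0 - done) * gamma * next_q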
Example #9
File: sac.py Project: YuechengLiu/PARL
    def critic_learn(self, obs, action, reward, next_obs, terminal):
        next_obs_action, next_obs_log_pi = self.sample(next_obs)
        qf1_next_target, qf2_next_target = self.target_critic.value(
            next_obs, next_obs_action)
        min_qf_next_target = layers.elementwise_min(
            qf1_next_target, qf2_next_target) - next_obs_log_pi * self.alpha

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target
        target_Q.stop_gradient = True

        current_Q1, current_Q2 = self.critic.value(obs, action)
        cost = layers.square_error_cost(current_Q1,
                                        target_Q) + layers.square_error_cost(
                                            current_Q2, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost
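
The SAC target bootstraps from the clipped double-Q value of a sampled next action minus the entropy term alpha * log pi(a'|s'), i.e. a soft Bellman backup. A NumPy sketch of the target computation alone, with illustrative array names:

import numpy as np

def sac_target(reward, done, next_q1, next_q2, next_log_pi, alpha=0.2, gamma=0.99):
    soft_next_q = np.minimum(next_q1, next_q2) - alpha * next_log_pi
    return reward + (1.0 - done) * gamma * soft_next_q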