Example #1
    def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
        pred_value = self.model.value(obs)
        action_onehot = layers.one_hot(action, self.act_dim)
        pred_action_value = layers.reduce_sum(action_onehot * pred_value,
                                              dim=1)

        # calculate the target q value
        next_action_value = self.model.value(next_obs)
        greedy_action = layers.argmax(next_action_value, axis=-1)
        greedy_action = layers.unsqueeze(greedy_action, axes=[1])
        greedy_action_onehot = layers.one_hot(greedy_action, self.act_dim)
        next_pred_value = self.target_model.value(next_obs)
        max_v = layers.reduce_sum(greedy_action_onehot * next_pred_value,
                                  dim=1)
        max_v.stop_gradient = True

        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * max_v
        delta = layers.abs(target - pred_action_value)
        cost = sample_weight * layers.square_error_cost(
            pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, delta
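This variant computes a Double-DQN style target (the greedy next action comes from self.model, its value from self.target_model), scales each sample's squared error by sample_weight, and returns the absolute TD error delta. Below is a minimal, hypothetical sketch (plain NumPy, not part of PARL) of the proportional prioritized replay buffer such a learn() is usually paired with: sample() produces the importance weights fed in as sample_weight, and the returned delta is passed back to update_priorities().

    import numpy as np

    class PrioritizedReplayBuffer(object):
        """Minimal proportional-priority buffer (illustrative sketch only)."""

        def __init__(self, capacity, alpha=0.6, eps=1e-6):
            self.capacity = capacity
            self.alpha = alpha              # how strongly TD error skews sampling
            self.eps = eps                  # keeps every priority strictly positive
            self.priorities = np.zeros(capacity, dtype=np.float32)
            self.storage = [None] * capacity
            self.size = 0
            self.pos = 0

        def append(self, transition):
            # New transitions get the current max priority so they are replayed at least once.
            max_p = self.priorities[:self.size].max() if self.size > 0 else 1.0
            self.storage[self.pos] = transition
            self.priorities[self.pos] = max_p
            self.pos = (self.pos + 1) % self.capacity
            self.size = min(self.size + 1, self.capacity)

        def sample(self, batch_size, beta=0.4):
            probs = self.priorities[:self.size].astype(np.float64) ** self.alpha
            probs /= probs.sum()
            idxs = np.random.choice(self.size, batch_size, p=probs)
            # Importance-sampling weights correct the bias of non-uniform sampling;
            # these are what a learn() like the one above receives as `sample_weight`.
            weights = (self.size * probs[idxs]) ** (-beta)
            weights /= weights.max()
            batch = [self.storage[i] for i in idxs]
            return batch, idxs, weights.astype(np.float32)

        def update_priorities(self, idxs, td_errors):
            # `td_errors` corresponds to the `delta` returned by learn() above.
            self.priorities[idxs] = np.abs(td_errors) + self.eps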
Example #2
File: td3.py Project: YuechengLiu/PARL
    def critic_learn(self, obs, action, reward, next_obs, terminal):
        noise = layers.gaussian_random_batch_size_like(
            action, shape=[-1, action.shape[1]])
        noise = layers.clip(noise * self.policy_noise,
                            min=-self.noise_clip,
                            max=self.noise_clip)
        next_action = self.target_model.policy(next_obs) + noise
        next_action = layers.clip(next_action, -self.max_action,
                                  self.max_action)

        next_Q1, next_Q2 = self.target_model.value(next_obs, next_action)
        next_Q = layers.elementwise_min(next_Q1, next_Q2)

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
        target_Q.stop_gradient = True

        current_Q1, current_Q2 = self.model.value(obs, action)
        cost = layers.square_error_cost(current_Q1,
                                        target_Q) + layers.square_error_cost(
                                            current_Q2, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost
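In equation form, the target above applies TD3's target policy smoothing and clipped double-Q learning, where sigma corresponds to self.policy_noise, c to self.noise_clip and a_max to self.max_action:

    \epsilon \sim \operatorname{clip}\bigl(\mathcal{N}(0,\sigma^{2}),\,-c,\,c\bigr), \qquad
    \tilde{a}' = \operatorname{clip}\bigl(\pi_{\theta'}(s') + \epsilon,\,-a_{\max},\,a_{\max}\bigr)

    y = r + \gamma\,(1-d)\,\min_{i=1,2} Q_{\phi'_i}(s',\tilde{a}'), \qquad
    L(\phi_1,\phi_2) = \frac{1}{N}\sum\Bigl[\bigl(Q_{\phi_1}(s,a)-y\bigr)^{2} + \bigl(Q_{\phi_2}(s,a)-y\bigr)^{2}\Bigr]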
Example #3
    def _actor_learn(self, obs):
        action = self.model.policy(obs)
        Q = self.model.value(obs, action)
        cost = layers.reduce_mean(-1.0 * Q)
        optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
        optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
        return cost
Example #4
File: dqn.py Project: YuechengLiu/PARL
    def learn(self,
              obs,
              action,
              reward,
              next_obs,
              terminal,
              learning_rate=None):
        """ update value model self.model with DQN algorithm
        """
        # Support the modification of learning_rate
        if learning_rate is None:
            assert isinstance(
                self.lr,
                float), "Please set the learning rate of DQN in initializaion."
            learning_rate = self.lr

        pred_value = self.model.value(obs)
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, pred_value), dim=1)
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(
            learning_rate=learning_rate, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost
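The update above is the standard DQN loss, with the bootstrap value taken from the frozen target network:

    y = r + \gamma\,(1 - \text{terminal})\,\max_{a'} Q_{\text{target}}(s', a'), \qquad
    L(\theta) = \frac{1}{N}\sum\bigl(Q_\theta(s, a) - y\bigr)^{2}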
Example #5
File: ppo.py Project: YuechengLiu/PARL
    def value_learn(self, obs, val):
        """ Learn the value model with square error cost
        """
        predict_val = self.model.value(obs)
        loss = layers.square_error_cost(predict_val, val)
        loss = layers.reduce_mean(loss)
        optimizer = fluid.optimizer.AdamOptimizer(self.value_lr)
        optimizer.minimize(loss)
        return loss
Example #6
    def learn(self, obs, action, reward):
        """ update policy model self.model with policy gradient algorithm
        """
        act_prob = self.model(obs)
        log_prob = layers.cross_entropy(act_prob, action)
        cost = log_prob * reward
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(cost)
        return cost
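Since layers.cross_entropy(act_prob, action) returns -log pi(a|s), the cost above is the usual policy-gradient (REINFORCE) objective; reward here is whatever return signal the caller supplies, typically the discounted episode return:

    L(\theta) = -\frac{1}{N}\sum_{i} \log \pi_\theta(a_i \mid s_i)\, G_i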
Example #7
File: sac.py Project: YuechengLiu/PARL
    def actor_learn(self, obs):
        action, log_pi = self.sample(obs)
        qf1_pi, qf2_pi = self.critic.value(obs, action)
        min_qf_pi = layers.elementwise_min(qf1_pi, qf2_pi)
        cost = log_pi * self.alpha - min_qf_pi
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
        optimizer.minimize(cost, parameter_list=self.actor.parameters())

        return cost
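The actor objective above is SAC's entropy-regularised policy loss, taking the minimum of the two critics:

    J_\pi(\theta) = \frac{1}{N}\sum\Bigl[\alpha \log \pi_\theta(a \mid s) - \min\bigl(Q_{\phi_1}(s, a),\,Q_{\phi_2}(s, a)\bigr)\Bigr],
    \qquad a \sim \pi_\theta(\cdot \mid s)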
Example #8
    def _critic_learn(self, obs_n, act_n, target_q):
        pred_q = self.Q(obs_n, act_n)
        cost = layers.reduce_mean(layers.square_error_cost(pred_q, target_q))

        fluid.clip.set_gradient_clip(
            clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
            param_list=self.model.get_critic_params())

        optimizer = fluid.optimizer.AdamOptimizer(self.lr)
        optimizer.minimize(cost, parameter_list=self.model.get_critic_params())
        return cost
Example #9
File: ppo.py Project: YuechengLiu/PARL
    def policy_learn(self, obs, actions, advantages, beta=None):
        """ Learn policy model with: 
                1. CLIP loss: Clipped Surrogate Objective 
                2. KLPEN loss: Adaptive KL Penalty Objective
            See: https://arxiv.org/pdf/1707.02286.pdf

        Args:
            obs: Tensor, (batch_size, obs_dim)
            actions: Tensor, (batch_size, act_dim)
            advantages: Tensor (batch_size, )
            beta: Tensor (1) or None
                  if None, use CLIP Loss; else, use KLPEN loss. 
        """
        old_means, old_logvars = self.old_policy_model.policy(obs)
        old_means.stop_gradient = True
        old_logvars.stop_gradient = True
        old_logprob = self._calc_logprob(actions, old_means, old_logvars)

        means, logvars = self.model.policy(obs)
        logprob = self._calc_logprob(actions, means, logvars)

        kl = self._calc_kl(means, logvars, old_means, old_logvars)
        kl = layers.reduce_mean(kl)

        if beta is None:  # Clipped Surrogate Objective
            pg_ratio = layers.exp(logprob - old_logprob)
            clipped_pg_ratio = layers.clip(pg_ratio, 1 - self.epsilon,
                                           1 + self.epsilon)
            surrogate_loss = layers.elementwise_min(
                advantages * pg_ratio, advantages * clipped_pg_ratio)
            loss = 0 - layers.reduce_mean(surrogate_loss)
        else:  # Adaptive KL Penalty Objective
            # policy gradient loss
            loss1 = 0 - layers.reduce_mean(
                advantages * layers.exp(logprob - old_logprob))
            # adaptive kl loss
            loss2 = kl * beta
            loss = loss1 + loss2
        optimizer = fluid.optimizer.AdamOptimizer(self.policy_lr)
        optimizer.minimize(loss)
        return loss, kl
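Written out, with r_t(theta) = exp(logprob - old_logprob) the probability ratio and A_t the advantage, the two objectives above are (the exact direction of the KL term follows self._calc_kl, which is not shown here):

    L^{\text{CLIP}}(\theta) = -\,\mathbb{E}_t\Bigl[\min\bigl(r_t(\theta) A_t,\ \operatorname{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,A_t\bigr)\Bigr]

    L^{\text{KLPEN}}(\theta) = -\,\mathbb{E}_t\bigl[r_t(\theta) A_t\bigr] + \beta\,\mathrm{KL}\bigl[\pi_{\theta_{\text{old}}},\ \pi_\theta\bigr]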
Example #10
    def _critic_learn(self, obs, action, reward, next_obs, terminal):
        next_action = self.target_model.policy(next_obs)
        next_Q = self.target_model.value(next_obs, next_action)

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
        target_Q.stop_gradient = True

        Q = self.model.value(obs, action)
        cost = layers.square_error_cost(Q, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost
Example #11
    def _actor_learn(self, obs_n, act_n):
        i = self.agent_index
        this_policy = self.model.policy(obs_n[i])
        sample_this_action = SoftPDistribution(
            logits=this_policy,
            act_space=self.act_space[self.agent_index]).sample()

        action_input_n = act_n + []
        action_input_n[i] = sample_this_action
        eval_q = self.Q(obs_n, action_input_n)
        act_cost = layers.reduce_mean(-1.0 * eval_q)

        act_reg = layers.reduce_mean(layers.square(this_policy))

        cost = act_cost + act_reg * 1e-3

        fluid.clip.set_gradient_clip(
            clip=fluid.clip.GradientClipByNorm(clip_norm=0.5),
            param_list=self.model.get_actor_params())

        optimizer = fluid.optimizer.AdamOptimizer(self.lr)
        optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
        return cost
Example #12
File: dqn.py Project: YuechengLiu/PARL
    def cal_bellman_residual(self, obs, action, reward, next_obs, terminal):
        """ use self.model to get squared Bellman residual with fed data
        """
        pred_value = self.model.value(obs)
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(
            layers.elementwise_mul(action_onehot, pred_value), dim=1)
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        return cost
Example #13
File: sac.py Project: YuechengLiu/PARL
    def critic_learn(self, obs, action, reward, next_obs, terminal):
        next_obs_action, next_obs_log_pi = self.sample(next_obs)
        qf1_next_target, qf2_next_target = self.target_critic.value(
            next_obs, next_obs_action)
        min_qf_next_target = layers.elementwise_min(
            qf1_next_target, qf2_next_target) - next_obs_log_pi * self.alpha

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target
        target_Q.stop_gradient = True

        current_Q1, current_Q2 = self.critic.value(obs, action)
        cost = layers.square_error_cost(current_Q1,
                                        target_Q) + layers.square_error_cost(
                                            current_Q2, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost
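The critic target above is the soft Bellman backup, with the next action and its log-probability re-sampled from the current policy:

    y = r + \gamma\,(1 - d)\Bigl[\min\bigl(Q_{\bar\phi_1}(s', a'),\,Q_{\bar\phi_2}(s', a')\bigr) - \alpha \log \pi_\theta(a' \mid s')\Bigr],
    \qquad a' \sim \pi_\theta(\cdot \mid s')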
Example #14
    def learn(self, obs, action, reward, next_obs, terminal, sample_weight):
        """ update value model self.model with DQN algorithm
        """

        pred_value = self.model.value(obs)
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(action_onehot * pred_value,
                                              dim=1)
        delta = layers.abs(target - pred_action_value)
        cost = sample_weight * layers.square_error_cost(
            pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, delta  # `delta` is the TD-error
Example #15
File: alg.py Project: Termset/IMPASS
    def learn(self, obs, actions, means, log_std, rewards, dones,
              learning_rate, entropy_coeff):
        """
        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                 E.g. [B, C, H, W] in atari.
            actions: A float32 tensor of shape [B, action_dim], the actions
                 taken by the behaviour policy.
            means: A float32 tensor of shape [B, action_dim], the Gaussian
                 means of the behaviour policy.
            log_std: A float32 tensor (broadcastable to [B, action_dim]) with
                 the Gaussian log standard deviations of the behaviour policy.
            rewards: A float32 tensor of shape [B].
            dones: A float32 tensor of shape [B].
            learning_rate: float scalar of learning rate.
            entropy_coeff: float scalar of entropy coefficient.
        """
        values = self.model.value(obs)
        # pi: the behaviour policy that generated the actions.
        std = layers.exp(log_std)
        normal_pi = Normal(means, std)
        y_t1 = actions / self.max_action
        log_prob1 = normal_pi.log_prob(actions)
        log_prob1 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                                epsilon)
        log_prob1 = layers.reduce_sum(log_prob1, dim=1, keep_dim=True)
        log_prob_pi = layers.squeeze(log_prob1, axes=[1])

        # mu: the current policy held by self.model.
        actions_mu, log_std_mu = self.model.policy(obs)
        std_mu = layers.exp(log_std_mu)
        normal_mu = Normal(actions_mu, std_mu)
        log_prob2 = normal_mu.log_prob(actions)
        log_prob2 -= layers.log(self.max_action * (1 - layers.pow(y_t1, 2)) +
                                epsilon)
        log_prob2 = layers.reduce_sum(log_prob2, dim=1, keep_dim=True)
        log_prob_mu = layers.squeeze(log_prob2, axes=[1])

        policy_entropy = normal_mu.entropy()
        target_actions_log_probs = log_prob_mu
        behaviour_actions_log_probs = log_prob_pi

        # KL between the current (mu) and behaviour (pi) policies, kept for debugging/monitoring.
        kl = normal_mu.kl_divergence(normal_pi)
        kl = layers.reduce_mean(kl, dim=1)
        """
        Split the tensor into batches at known episode cut boundaries. 
        [B * T] -> [T, B]
        """
        T = self.sample_batch_steps

        def split_batches(tensor):
            B = tensor.shape[0] // T
            splited_tensor = layers.reshape(tensor,
                                            [B, T] + list(tensor.shape[1:]))
            # transpose B and T
            return layers.transpose(splited_tensor, [1, 0] +
                                    list(range(2, 1 + len(tensor.shape))))

        behaviour_actions_log_probs = split_batches(
            behaviour_actions_log_probs)
        target_actions_log_probs = split_batches(target_actions_log_probs)
        policy_entropy = split_batches(policy_entropy)
        dones = split_batches(dones)
        rewards = split_batches(rewards)
        values = split_batches(values)

        # [T, B] -> [T - 1, B] for V-trace calc.
        behaviour_actions_log_probs = layers.slice(behaviour_actions_log_probs,
                                                   axes=[0],
                                                   starts=[0],
                                                   ends=[-1])
        target_actions_log_probs = layers.slice(target_actions_log_probs,
                                                axes=[0],
                                                starts=[0],
                                                ends=[-1])
        policy_entropy = layers.slice(policy_entropy,
                                      axes=[0],
                                      starts=[0],
                                      ends=[-1])
        dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
        rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
        bootstrap_value = layers.slice(values,
                                       axes=[0],
                                       starts=[T - 1],
                                       ends=[T])
        values = layers.slice(values, axes=[0], starts=[0], ends=[-1])

        bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])

        vtrace_loss = VTraceLoss(
            behaviour_actions_log_probs=behaviour_actions_log_probs,
            target_actions_log_probs=target_actions_log_probs,
            policy_entropy=policy_entropy,
            dones=dones,
            discount=self.gamma,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            entropy_coeff=entropy_coeff,
            vf_loss_coeff=self.vf_loss_coeff,
            clip_rho_threshold=self.clip_rho_threshold,
            clip_pg_rho_threshold=self.clip_pg_rho_threshold)

        fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
            clip_norm=40.0))

        optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
        optimizer.minimize(vtrace_loss.total_loss)
        return vtrace_loss, kl
Example #16
    def learn(self, obs, actions, behaviour_logits, rewards, dones,
              learning_rate, entropy_coeff):
        """
        Args:
            obs: A float32 tensor of shape ([B] + observation_space).
                 E.g. [B, C, H, W] in atari.
            actions: An int64 tensor of shape [B].
            behaviour_logits: A float32 tensor of shape [B, NUM_ACTIONS].
            rewards: A float32 tensor of shape [B].
            dones: A float32 tensor of shape [B].
            learning_rate: float scalar of learning rate.
            entropy_coeff: float scalar of entropy coefficient.
        """

        values = self.model.value(obs)
        target_logits = self.model.policy(obs)

        target_policy_distribution = CategoricalDistribution(target_logits)
        behaviour_policy_distribution = CategoricalDistribution(
            behaviour_logits)

        policy_entropy = target_policy_distribution.entropy()
        target_actions_log_probs = target_policy_distribution.logp(actions)
        behaviour_actions_log_probs = behaviour_policy_distribution.logp(
            actions)

        # Calculating kl for debug
        kl = target_policy_distribution.kl(behaviour_policy_distribution)
        kl = layers.reduce_mean(kl)
        """
        Split the tensor into batches at known episode cut boundaries. 
        [B * T] -> [T, B]
        """
        T = self.sample_batch_steps

        def split_batches(tensor):
            B = tensor.shape[0] // T
            splited_tensor = layers.reshape(tensor,
                                            [B, T] + list(tensor.shape[1:]))
            # transpose B and T
            return layers.transpose(splited_tensor, [1, 0] +
                                    list(range(2, 1 + len(tensor.shape))))

        behaviour_actions_log_probs = split_batches(
            behaviour_actions_log_probs)
        target_actions_log_probs = split_batches(target_actions_log_probs)
        policy_entropy = split_batches(policy_entropy)
        dones = split_batches(dones)
        rewards = split_batches(rewards)
        values = split_batches(values)

        # [T, B] -> [T - 1, B] for V-trace calc.
        behaviour_actions_log_probs = layers.slice(behaviour_actions_log_probs,
                                                   axes=[0],
                                                   starts=[0],
                                                   ends=[-1])
        target_actions_log_probs = layers.slice(target_actions_log_probs,
                                                axes=[0],
                                                starts=[0],
                                                ends=[-1])
        policy_entropy = layers.slice(policy_entropy,
                                      axes=[0],
                                      starts=[0],
                                      ends=[-1])
        dones = layers.slice(dones, axes=[0], starts=[0], ends=[-1])
        rewards = layers.slice(rewards, axes=[0], starts=[0], ends=[-1])
        bootstrap_value = layers.slice(values,
                                       axes=[0],
                                       starts=[T - 1],
                                       ends=[T])
        values = layers.slice(values, axes=[0], starts=[0], ends=[-1])

        bootstrap_value = layers.squeeze(bootstrap_value, axes=[0])

        vtrace_loss = VTraceLoss(
            behaviour_actions_log_probs=behaviour_actions_log_probs,
            target_actions_log_probs=target_actions_log_probs,
            policy_entropy=policy_entropy,
            dones=dones,
            discount=self.gamma,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            entropy_coeff=entropy_coeff,
            vf_loss_coeff=self.vf_loss_coeff,
            clip_rho_threshold=self.clip_rho_threshold,
            clip_pg_rho_threshold=self.clip_pg_rho_threshold)

        fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
            clip_norm=40.0))

        optimizer = fluid.optimizer.AdamOptimizer(learning_rate)
        optimizer.minimize(vtrace_loss.total_loss)
        return vtrace_loss, kl
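For reference, the V-trace targets that VTraceLoss computes from these inputs (following the IMPALA paper, arXiv:1802.01561) take the form below, with the log importance ratios given by target_actions_log_probs - behaviour_actions_log_probs and rho-bar corresponding to clip_rho_threshold (a separate clip_pg_rho_threshold is used for the policy-gradient term):

    v_s = V(x_s) + \sum_{t=s}^{s+n-1} \gamma^{\,t-s}\Bigl(\prod_{i=s}^{t-1} c_i\Bigr)\,\delta_t V, \qquad
    \delta_t V = \rho_t\bigl(r_t + \gamma V(x_{t+1}) - V(x_t)\bigr)

    \rho_t = \min\Bigl(\bar\rho,\ \tfrac{\pi(a_t \mid x_t)}{\mu(a_t \mid x_t)}\Bigr), \qquad
    c_i = \min\Bigl(\bar c,\ \tfrac{\pi(a_i \mid x_i)}{\mu(a_i \mid x_i)}\Bigr)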