Example #1
class DDDQN(SarlOffPolicy):
    """
    Dueling Double DQN, https://arxiv.org/abs/1511.06581
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 network_settings={
                     'share': [128],
                     'v': [128],
                     'adv': [128]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'Dueling Double DQN only supports discrete action spaces'
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.assign_interval = assign_interval

        self.q_net = TargetTwin(CriticDueling(self.obs_spec,
                                              rep_net_params=self._rep_net_params,
                                              output_shape=self.a_dim,
                                              network_settings=network_settings)).to(self.device)

        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net,
                                     oplr=self.oplr)

    @iton
    def select_action(self, obs):
        q_values = self.q_net(obs, rnncs=self.rnncs)  # [B, A]
        self.rnncs_ = self.q_net.get_rnncs()

        if self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            actions = q_values.argmax(-1)  # [B,]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
        next_q = self.q_net(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q_target = self.q_net.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]

        q_eval = (q * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]
        next_max_action = next_q.argmax(-1)  # [T, B]
        next_max_action_one_hot = F.one_hot(next_max_action, self.a_dim).float()  # [T, B, A]

        q_target_next_max = (q_target * next_max_action_one_hot).sum(-1, keepdim=True)  # [T, B, 1]
        q_target = n_step_return(BATCH.reward,
                                 self.gamma,
                                 BATCH.done,
                                 q_target_next_max,
                                 BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = q_target - q_eval  # [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # 1
        self.oplr.optimize(q_loss)

        return td_error, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': q_loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
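
The one-hot product in `_train` above is one way to evaluate the target network at the online network's greedy action (the "double" part of Double DQN). An equivalent formulation with `torch.gather` is sketched below; it is illustrative only, not code from the library, and the function name is made up.

import torch

def double_dqn_target_q(next_q: torch.Tensor, q_target: torch.Tensor) -> torch.Tensor:
    """Target Q-value at the online network's argmax action.

    next_q, q_target: [T, B, A] Q-values of the online and target networks for obs_.
    Returns a [T, B, 1] tensor, matching q_target_next_max in DDDQN._train.
    """
    next_max_action = next_q.argmax(-1, keepdim=True)  # [T, B, 1] greedy action of the online net
    return q_target.gather(-1, next_max_action)        # [T, B, 1] target net evaluated at that action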
Example #2
    def _dreamer_build_critic(self):
        return TargetTwin(super()._dreamer_build_critic()).to(self.device)
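
Example #2 only wraps the critic produced by the parent Dreamer-style class in `TargetTwin` and moves it to the device. From how the wrapper is used across these examples (the twin itself forwards through the online network, `.t(...)` forwards through the target copy, and `.sync()` refreshes the target, either as a hard copy or with the Polyak factor passed at construction), a minimal stand-in could look like the sketch below. This is inferred from usage, not the library's actual implementation, and omits details such as `get_rnncs()`.

import copy

import torch
import torch.nn as nn


class TargetTwinSketch(nn.Module):
    """Minimal stand-in for TargetTwin, inferred from its usage in these examples."""

    def __init__(self, net: nn.Module, polyak: float = 0.0):
        super().__init__()
        self.net = net                    # online network, receives gradients
        self.target = copy.deepcopy(net)  # frozen target copy
        self.polyak = polyak
        for p in self.target.parameters():
            p.requires_grad_(False)

    def forward(self, *args, **kwargs):   # twin(obs, ...) -> online network
        return self.net(*args, **kwargs)

    def t(self, *args, **kwargs):         # twin.t(obs_, ...) -> target network
        return self.target(*args, **kwargs)

    def sync(self):                       # hard copy when polyak == 0, soft update otherwise
        with torch.no_grad():
            for p, tp in zip(self.net.parameters(), self.target.parameters()):
                tp.copy_(self.polyak * tp + (1.0 - self.polyak) * p)

With polyak=0 the sync() above degenerates to a hard copy, which matches the DQN-style examples that call it every assign_interval steps; the DDPG and multi-agent SAC examples instead pass polyak≈0.995 and sync after every training step.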
Example #3
    def __init__(
            self,
            alpha=0.2,
            annealing=True,
            last_alpha=0.01,
            polyak=0.995,
            discrete_tau=1.0,
            network_settings={
                'actor_continuous': {
                    'share': [128, 128],
                    'mu': [64],
                    'log_std': [64],
                    'soft_clip': False,
                    'log_std_bound': [-20, 2]
                },
                'actor_discrete': [64, 32],
                'q': [128, 128]
            },
            auto_adaption=True,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            alpha_lr=5.0e-4,
            **kwargs):
        """
        TODO: Annotation
        """
        super().__init__(**kwargs)
        self.polyak = polyak
        self.discrete_tau = discrete_tau
        self.auto_adaption = auto_adaption
        self.annealing = annealing

        self.target_entropy = 0.98
        for id in self.agent_ids:
            if self.is_continuouss[id]:
                self.target_entropy *= (-self.a_dims[id])
            else:
                self.target_entropy *= np.log(self.a_dims[id])

        self.actors, self.critics, self.critics2 = {}, {}, {}
        for id in set(self.model_ids):
            if self.is_continuouss[id]:
                self.actors[id] = ActorCts(
                    self.obs_specs[id],
                    rep_net_params=self._rep_net_params,
                    output_shape=self.a_dims[id],
                    network_settings=network_settings['actor_continuous']).to(
                        self.device)
            else:
                self.actors[id] = ActorDct(
                    self.obs_specs[id],
                    rep_net_params=self._rep_net_params,
                    output_shape=self.a_dims[id],
                    network_settings=network_settings['actor_discrete']).to(
                        self.device)
            self.critics[id] = TargetTwin(
                MACriticQvalueOne(list(self.obs_specs.values()),
                                  rep_net_params=self._rep_net_params,
                                  action_dim=sum(self.a_dims.values()),
                                  network_settings=network_settings['q']),
                self.polyak).to(self.device)
            self.critics2[id] = deepcopy(self.critics[id])
        self.actor_oplr = OPLR(list(self.actors.values()), actor_lr,
                               **self._oplr_params)
        self.critic_oplr = OPLR(
            list(self.critics.values()) + list(self.critics2.values()),
            critic_lr, **self._oplr_params)

        if self.auto_adaption:
            self.log_alpha = th.tensor(0., requires_grad=True, device=self.device)  # keep it a leaf tensor so the optimizer can update it
            self.alpha_oplr = OPLR(self.log_alpha, alpha_lr,
                                   **self._oplr_params)
            self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
        else:
            self.log_alpha = th.tensor(alpha).log().to(self.device)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha,
                                                       int(1e6))

        self._trainer_modules.update(
            {f"actor_{id}": self.actors[id]
             for id in set(self.model_ids)})
        self._trainer_modules.update(
            {f"critic_{id}": self.critics[id]
             for id in set(self.model_ids)})
        self._trainer_modules.update(
            {f"critic2_{id}": self.critics2[id]
             for id in set(self.model_ids)})
        self._trainer_modules.update(log_alpha=self.log_alpha,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)
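
The `__init__` above only sets up `log_alpha`, its optimizer, and the per-agent entropy target; the temperature update itself is not part of this excerpt. For orientation, one common formulation of the SAC temperature objective, which a training step would presumably minimize with `alpha_oplr`, is sketched below with hypothetical function and tensor names.

import torch as th

def sac_alpha_loss(log_alpha: th.Tensor, log_prob: th.Tensor, target_entropy: float) -> th.Tensor:
    """Temperature loss J(alpha) ~ E[-alpha * (log pi(a|s) + target_entropy)].

    log_prob: log-probabilities of actions sampled from the current policy;
    it is detached so that only log_alpha receives gradients here.
    """
    return -(log_alpha.exp() * (log_prob + target_entropy).detach()).mean()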
Example #4
class BootstrappedDQN(SarlOffPolicy):
    """
    Deep Exploration via Bootstrapped DQN, http://arxiv.org/abs/1602.04621
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 head_num=4,
                 network_settings=[32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'Bootstrapped DQN only supports discrete action spaces'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.head_num = head_num
        self._probs = th.FloatTensor([1. / head_num for _ in range(head_num)])
        self.now_head = 0

        self.q_net = TargetTwin(
            CriticQvalueBootstrap(self.obs_spec,
                                  rep_net_params=self._rep_net_params,
                                  output_shape=self.a_dim,
                                  head_num=self.head_num,
                                  network_settings=network_settings)).to(
                                      self.device)

        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)

    def episode_reset(self):
        super().episode_reset()
        self.now_head = np.random.randint(self.head_num)

    @iton
    def select_action(self, obs):
        q_values = self.q_net(obs, rnncs=self.rnncs)  # [H, B, A]
        self.rnncs_ = self.q_net.get_rnncs()

        if self._is_train_mode and self.expl_expt_mng.is_random(
                self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            # [H, B, A] => [B, A] => [B, ]
            actions = q_values[self.now_head].argmax(-1)
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask).mean(
            0)  # [H, T, B, A] => [T, B, A]
        q_next = self.q_net.t(BATCH.obs_, begin_mask=BATCH.begin_mask).mean(
            0)  # [H, T, B, A] => [T, B, A]
        # [T, B, A] * [T, B, A] => [T, B, 1]
        q_eval = (q * BATCH.action).sum(-1, keepdim=True)
        q_target = n_step_return(
            BATCH.reward,
            self.gamma,
            BATCH.done,
            # [T, B, A] => [T, B, 1]
            q_next.max(-1, keepdim=True)[0],
            BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = q_target - q_eval  # [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # 1

        # mask_dist = td.Bernoulli(probs=self._probs)  # TODO:
        # mask = mask_dist.sample([batch_size]).T   # [H, B]
        self.oplr.optimize(q_loss)
        return td_error, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': q_loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
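
The commented-out TODO inside `_train` hints at the per-head Bernoulli bootstrap mask from the Bootstrapped DQN paper, which this example sidesteps by simply averaging over the heads. A sketch of what such masking could look like is given below; the function and tensor names are hypothetical and this is not the library's code.

import torch as th
import torch.distributions as td

def masked_head_loss(td_error_per_head: th.Tensor, mask_prob: float = 0.5) -> th.Tensor:
    """Squared TD-error averaged over bootstrap heads, with each head masked i.i.d. per sample.

    td_error_per_head: [H, T, B, 1] TD-errors, one slice per head.
    """
    mask = td.Bernoulli(probs=mask_prob).sample(td_error_per_head.shape)  # [H, T, B, 1] 0/1 mask
    masked = td_error_per_head.square() * mask
    # Normalise by the number of active heads so samples with few active heads are not down-weighted.
    return (masked.sum(0) / mask.sum(0).clamp(min=1.0)).mean()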
Example #5
class DDPG(SarlOffPolicy):
    """
    Deep Deterministic Policy Gradient, https://arxiv.org/abs/1509.02971
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 polyak=0.995,
                 noise_action='ou',
                 noise_params={'sigma': 0.2},
                 use_target_action_noise=False,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        self.polyak = polyak
        self.discrete_tau = discrete_tau
        self.use_target_action_noise = use_target_action_noise

        if self.is_continuous:
            actor = ActorDPG(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_continuous'])
            self.target_noised_action = ClippedNormalNoisedAction(
                sigma=0.2, noise_bound=0.2)
            if noise_action in ['ou', 'clip_normal']:
                self.noised_action = Noise_action_REGISTER[noise_action](
                    **noise_params)
            elif noise_action == 'normal':
                self.noised_action = self.target_noised_action
            else:
                raise ValueError(
                    f'unsupported noise_action type: {noise_action}')
        else:
            actor = ActorDct(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_discrete'])
        self.actor = TargetTwin(actor, self.polyak).to(self.device)
        self.critic = TargetTwin(
            CriticQvalueOne(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            action_dim=self.a_dim,
                            network_settings=network_settings['q']),
            self.polyak).to(self.device)

        self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR(self.critic, critic_lr, **self._oplr_params)
        self._trainer_modules.update(actor=self.actor,
                                     critic=self.critic,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)

    def episode_reset(self):
        super().episode_reset()
        if self.is_continuous:
            self.noised_action.reset()

    @iton
    def select_action(self, obs):
        output = self.actor(obs, rnncs=self.rnncs)  # [B, A]
        self.rnncs_ = self.actor.get_rnncs()
        if self.is_continuous:
            mu = output  # [B, A]
            pi = self.noised_action(mu)  # [B, A]
        else:
            logits = output  # [B, A]
            mu = logits.argmax(-1)  # [B, ]
            cate_dist = td.Categorical(logits=logits)
            pi = cate_dist.sample()  # [B,]
        actions = pi if self._is_train_mode else mu
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        if self.is_continuous:
            action_target = self.actor.t(
                BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
            if self.use_target_action_noise:
                action_target = self.target_noised_action(
                    action_target)  # [T, B, A]
        else:
            target_logits = self.actor.t(
                BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
            target_cate_dist = td.Categorical(logits=target_logits)
            target_pi = target_cate_dist.sample()  # [T, B]
            action_target = F.one_hot(target_pi,
                                      self.a_dim).float()  # [T, B, A]
        q = self.critic(BATCH.obs, BATCH.action,
                        begin_mask=BATCH.begin_mask)  # [T, B, 1]
        q_target = self.critic.t(BATCH.obs_,
                                 action_target,
                                 begin_mask=BATCH.begin_mask)  # [T, B, 1]
        dc_r = n_step_return(BATCH.reward, self.gamma, BATCH.done, q_target,
                             BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = dc_r - q  # [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # 1
        self.critic_oplr.optimize(q_loss)

        if self.is_continuous:
            mu = self.actor(BATCH.obs,
                            begin_mask=BATCH.begin_mask)  # [T, B, A]
        else:
            logits = self.actor(BATCH.obs,
                                begin_mask=BATCH.begin_mask)  # [T, B, A]
            logp_all = logits.log_softmax(-1)  # [T, B, A]
            gumbel_noise = td.Gumbel(0, 1).sample(logp_all.shape).to(logp_all.device)  # [T, B, A]
            _pi = ((logp_all + gumbel_noise) / self.discrete_tau).softmax(
                -1)  # [T, B, A]
            _pi_true_one_hot = F.one_hot(_pi.argmax(-1),
                                         self.a_dim).float()  # [T, B, A]
            _pi_diff = (_pi_true_one_hot - _pi).detach()  # [T, B, A]
            mu = _pi_diff + _pi  # [T, B, A]
        q_actor = self.critic(BATCH.obs, mu,
                              begin_mask=BATCH.begin_mask)  # [T, B, 1]
        actor_loss = -q_actor.mean()  # 1
        self.actor_oplr.optimize(actor_loss)

        return td_error, {
            'LEARNING_RATE/actor_lr': self.actor_oplr.lr,
            'LEARNING_RATE/critic_lr': self.critic_oplr.lr,
            'LOSS/actor_loss': actor_loss,
            'LOSS/critic_loss': q_loss,
            'Statistics/q_min': q.min(),
            'Statistics/q_mean': q.mean(),
            'Statistics/q_max': q.max()
        }

    def _after_train(self):
        super()._after_train()
        self.actor.sync()
        self.critic.sync()
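
The discrete branch of DDPG's `_train` uses a Gumbel-Softmax straight-through estimator: the critic sees the hard one-hot action in the forward pass, while gradients flow back through the relaxed distribution `_pi`. The sketch below isolates that trick in a self-contained function; it is illustrative, not library code.

import torch
import torch.nn.functional as F

def gumbel_softmax_straight_through(logits: torch.Tensor, tau: float = 1.0) -> torch.Tensor:
    """One-hot in the forward pass, differentiable softmax relaxation in the backward pass."""
    logp = logits.log_softmax(-1)
    gumbel = -torch.empty_like(logp).exponential_().log()       # Gumbel(0, 1) noise on the right device
    soft = ((logp + gumbel) / tau).softmax(-1)                   # relaxed (soft) sample
    hard = F.one_hot(soft.argmax(-1), logits.shape[-1]).float()  # hard one-hot action
    return (hard - soft).detach() + soft                         # straight-through gradient trick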