class DDDQN(SarlOffPolicy):
    """
    Dueling Double DQN, https://arxiv.org/abs/1511.06581
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 network_settings={
                     'share': [128],
                     'v': [128],
                     'adv': [128]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'Dueling Double DQN only supports discrete action spaces'
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.q_net = TargetTwin(CriticDueling(self.obs_spec,
                                              rep_net_params=self._rep_net_params,
                                              output_shape=self.a_dim,
                                              network_settings=network_settings)).to(self.device)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)

    @iton
    def select_action(self, obs):
        q_values = self.q_net(obs, rnncs=self.rnncs)  # [B, A]
        self.rnncs_ = self.q_net.get_rnncs()
        if self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            actions = q_values.argmax(-1)  # [B,]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
        next_q = self.q_net(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q_target = self.q_net.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]

        q_eval = (q * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]
        next_max_action = next_q.argmax(-1)  # [T, B]
        next_max_action_one_hot = F.one_hot(next_max_action.squeeze(), self.a_dim).float()  # [T, B, A]

        q_target_next_max = (q_target * next_max_action_one_hot).sum(-1, keepdim=True)  # [T, B, 1]
        q_target = n_step_return(BATCH.reward,
                                 self.gamma,
                                 BATCH.done,
                                 q_target_next_max,
                                 BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = q_target - q_eval  # [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # 1
        self.oplr.optimize(q_loss)
        return td_error, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': q_loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
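
# Hedged illustration (not part of the class above): a minimal, self-contained
# sketch of the two ideas DDDQN combines -- a dueling head that decomposes
# Q(s, a) into V(s) + A(s, a) - mean_a A(s, a), and a Double-DQN target that
# selects the argmax action with the online network but evaluates it with the
# target network. Layer sizes and function names here are arbitrary assumptions,
# not the repo's CriticDueling / n_step_return implementations.
import torch
import torch.nn as nn


class TinyDuelingQ(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden=128):
        super().__init__()
        self.share = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU())
        self.v = nn.Linear(hidden, 1)          # state value V(s)
        self.adv = nn.Linear(hidden, act_dim)  # advantages A(s, a)

    def forward(self, obs):
        h = self.share(obs)
        v, adv = self.v(h), self.adv(h)
        # identifiability trick: subtract the mean advantage
        return v + adv - adv.mean(-1, keepdim=True)  # [B, A]


def double_dqn_target(online, target, obs_, reward, done, gamma=0.99):
    with torch.no_grad():
        a_star = online(obs_).argmax(-1, keepdim=True)    # [B, 1] online argmax
        q_next = target(obs_).gather(-1, a_star)          # [B, 1] evaluated by target net
        return reward + gamma * (1.0 - done) * q_next     # [B, 1] 1-step target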
def _dreamer_build_critic(self):
    return TargetTwin(super()._dreamer_build_critic()).to(self.device)
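
# Hedged illustration: TargetTwin is only used in this file through
# TargetTwin(net[, polyak]), net(...), net.t(...) and net.sync(), so the sketch
# below shows a minimal wrapper with that surface -- an online module plus a
# deep-copied target that is hard-copied (polyak=0) or polyak-averaged on
# sync(). This is an assumption about the interface, not the library's actual
# implementation.
from copy import deepcopy

import torch
import torch.nn as nn


class MiniTargetTwin(nn.Module):
    def __init__(self, net: nn.Module, polyak: float = 0.0):
        super().__init__()
        self.net = net               # online network (trained)
        self.target = deepcopy(net)  # target network (frozen)
        self.polyak = polyak
        for p in self.target.parameters():
            p.requires_grad_(False)

    def forward(self, *args, **kwargs):
        return self.net(*args, **kwargs)

    def t(self, *args, **kwargs):
        return self.target(*args, **kwargs)

    @torch.no_grad()
    def sync(self):
        for p, p_t in zip(self.net.parameters(), self.target.parameters()):
            # polyak=0 -> hard copy; polyak close to 1 -> slow moving average
            p_t.data.copy_(self.polyak * p_t.data + (1.0 - self.polyak) * p.data)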
def __init__(self,
             alpha=0.2,
             annealing=True,
             last_alpha=0.01,
             polyak=0.995,
             discrete_tau=1.0,
             network_settings={
                 'actor_continuous': {
                     'share': [128, 128],
                     'mu': [64],
                     'log_std': [64],
                     'soft_clip': False,
                     'log_std_bound': [-20, 2]
                 },
                 'actor_discrete': [64, 32],
                 'q': [128, 128]
             },
             auto_adaption=True,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             alpha_lr=5.0e-4,
             **kwargs):
    """
    Multi-agent soft actor-critic style setup: one actor per agent model, twin
    centralized Q critics with polyak-averaged targets, and an entropy
    temperature alpha that is either learned (auto_adaption) or annealed.
    """
    super().__init__(**kwargs)
    self.polyak = polyak
    self.discrete_tau = discrete_tau
    self.auto_adaption = auto_adaption
    self.annealing = annealing

    self.target_entropy = 0.98
    for id in self.agent_ids:
        if self.is_continuouss[id]:
            self.target_entropy *= (-self.a_dims[id])
        else:
            self.target_entropy *= np.log(self.a_dims[id])

    self.actors, self.critics, self.critics2 = {}, {}, {}
    for id in set(self.model_ids):
        if self.is_continuouss[id]:
            self.actors[id] = ActorCts(
                self.obs_specs[id],
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dims[id],
                network_settings=network_settings['actor_continuous']).to(self.device)
        else:
            self.actors[id] = ActorDct(
                self.obs_specs[id],
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dims[id],
                network_settings=network_settings['actor_discrete']).to(self.device)
        self.critics[id] = TargetTwin(
            MACriticQvalueOne(list(self.obs_specs.values()),
                              rep_net_params=self._rep_net_params,
                              action_dim=sum(self.a_dims.values()),
                              network_settings=network_settings['q']),
            self.polyak).to(self.device)
        self.critics2[id] = deepcopy(self.critics[id])

    self.actor_oplr = OPLR(list(self.actors.values()), actor_lr, **self._oplr_params)
    self.critic_oplr = OPLR(
        list(self.critics.values()) + list(self.critics2.values()),
        critic_lr, **self._oplr_params)

    if self.auto_adaption:
        self.log_alpha = th.tensor(0., requires_grad=True).to(self.device)
        self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
        self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
    else:
        self.log_alpha = th.tensor(alpha).log().to(self.device)
        if self.annealing:
            self.alpha_annealing = LinearAnnealing(alpha, last_alpha, int(1e6))

    self._trainer_modules.update(
        {f"actor_{id}": self.actors[id] for id in set(self.model_ids)})
    self._trainer_modules.update(
        {f"critic_{id}": self.critics[id] for id in set(self.model_ids)})
    self._trainer_modules.update(
        {f"critic2_{id}": self.critics2[id] for id in set(self.model_ids)})
    self._trainer_modules.update(log_alpha=self.log_alpha,
                                 actor_oplr=self.actor_oplr,
                                 critic_oplr=self.critic_oplr)
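
# Hedged illustration of the `auto_adaption` branch above: with a learnable
# log_alpha, SAC-style temperature tuning minimizes
#     J(alpha) = E[ -alpha * (log pi(a|s) + target_entropy) ],
# which nudges the policy's entropy toward `target_entropy`. The tensors below
# are placeholders; only the loss form is the point (the repo's OPLR optimizer
# is replaced by a plain Adam here).
import torch

log_alpha = torch.zeros(1, requires_grad=True)        # plays the role of self.log_alpha
alpha_opt = torch.optim.Adam([log_alpha], lr=5e-4)    # plays the role of self.alpha_oplr
target_entropy = -4.0                                 # e.g. -a_dim for a continuous agent

log_prob = torch.randn(256, 1)                        # log pi(a|s) for a sampled batch
alpha_loss = -(log_alpha.exp() * (log_prob + target_entropy).detach()).mean()

alpha_opt.zero_grad()
alpha_loss.backward()
alpha_opt.step()
alpha = log_alpha.exp().detach()                      # temperature used in actor/critic losses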
class BootstrappedDQN(SarlOffPolicy):
    """
    Deep Exploration via Bootstrapped DQN, http://arxiv.org/abs/1602.04621
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 head_num=4,
                 network_settings=[32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'Bootstrapped DQN only supports discrete action spaces'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.head_num = head_num
        self._probs = th.FloatTensor([1. / head_num for _ in range(head_num)])
        self.now_head = 0
        self.q_net = TargetTwin(
            CriticQvalueBootstrap(self.obs_spec,
                                  rep_net_params=self._rep_net_params,
                                  output_shape=self.a_dim,
                                  head_num=self.head_num,
                                  network_settings=network_settings)).to(self.device)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)

    def episode_reset(self):
        super().episode_reset()
        # sample a new head to act with for the coming episode (deep exploration)
        self.now_head = np.random.randint(self.head_num)

    @iton
    def select_action(self, obs):
        q_values = self.q_net(obs, rnncs=self.rnncs)  # [H, B, A]
        self.rnncs_ = self.q_net.get_rnncs()
        if self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            # [H, B, A] => [B, A] => [B, ]
            actions = q_values[self.now_head].argmax(-1)
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask).mean(0)  # [H, T, B, A] => [T, B, A]
        q_next = self.q_net.t(BATCH.obs_, begin_mask=BATCH.begin_mask).mean(0)  # [H, T, B, A] => [T, B, A]
        # [T, B, A] * [T, B, A] => [T, B, 1]
        q_eval = (q * BATCH.action).sum(-1, keepdim=True)
        q_target = n_step_return(BATCH.reward,
                                 self.gamma,
                                 BATCH.done,
                                 q_next.max(-1, keepdim=True)[0],  # [T, B, A] => [T, B, 1]
                                 BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = q_target - q_eval  # [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # 1
        # mask_dist = td.Bernoulli(probs=self._probs)  # TODO:
        # mask = mask_dist.sample([batch_size]).T  # [H, B]
        self.oplr.optimize(q_loss)
        return td_error, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': q_loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
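
# Hedged illustration of the commented-out bootstrap mask in _train() above: in
# the Bootstrapped DQN paper, each transition gets a Bernoulli mask per head and
# each head only learns from transitions whose mask is 1. The current _train()
# averages the heads instead; this sketch shows what a masked per-head loss
# could look like, with made-up tensor shapes (not the repo's code).
import torch
import torch.distributions as td

head_num, batch_size = 4, 256
td_error = torch.randn(head_num, batch_size, 1)             # per-head TD errors [H, B, 1]
mask_dist = td.Bernoulli(probs=torch.full((head_num,), 0.5))
mask = mask_dist.sample((batch_size,)).T.unsqueeze(-1)      # [H, B, 1] 0/1 bootstrap mask

# average squared TD error only over the transitions each head was assigned
per_head_loss = (mask * td_error.square()).sum((1, 2)) / mask.sum((1, 2)).clamp(min=1.0)
q_loss = per_head_loss.mean()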
class DDPG(SarlOffPolicy):
    """
    Deep Deterministic Policy Gradient, https://arxiv.org/abs/1509.02971
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 polyak=0.995,
                 noise_action='ou',
                 noise_params={'sigma': 0.2},
                 use_target_action_noise=False,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        self.polyak = polyak
        self.discrete_tau = discrete_tau
        self.use_target_action_noise = use_target_action_noise

        if self.is_continuous:
            actor = ActorDPG(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_continuous'])
            self.target_noised_action = ClippedNormalNoisedAction(sigma=0.2, noise_bound=0.2)
            if noise_action in ['ou', 'clip_normal']:
                self.noised_action = Noise_action_REGISTER[noise_action](**noise_params)
            elif noise_action == 'normal':
                self.noised_action = self.target_noised_action
            else:
                raise Exception(f'cannot use noised action type of {noise_action}')
        else:
            actor = ActorDct(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_discrete'])

        self.actor = TargetTwin(actor, self.polyak).to(self.device)
        self.critic = TargetTwin(
            CriticQvalueOne(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            action_dim=self.a_dim,
                            network_settings=network_settings['q']),
            self.polyak).to(self.device)

        self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR(self.critic, critic_lr, **self._oplr_params)
        self._trainer_modules.update(actor=self.actor,
                                     critic=self.critic,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)

    def episode_reset(self):
        super().episode_reset()
        if self.is_continuous:
            self.noised_action.reset()

    @iton
    def select_action(self, obs):
        output = self.actor(obs, rnncs=self.rnncs)  # [B, A]
        self.rnncs_ = self.actor.get_rnncs()
        if self.is_continuous:
            mu = output  # [B, A]
            pi = self.noised_action(mu)  # [B, A]
        else:
            logits = output  # [B, A]
            mu = logits.argmax(-1)  # [B,]
            cate_dist = td.Categorical(logits=logits)
            pi = cate_dist.sample()  # [B,]
        actions = pi if self._is_train_mode else mu
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        if self.is_continuous:
            action_target = self.actor.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
            if self.use_target_action_noise:
                action_target = self.target_noised_action(action_target)  # [T, B, A]
        else:
            target_logits = self.actor.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
            target_cate_dist = td.Categorical(logits=target_logits)
            target_pi = target_cate_dist.sample()  # [T, B]
            action_target = F.one_hot(target_pi, self.a_dim).float()  # [T, B, A]

        q = self.critic(BATCH.obs, BATCH.action, begin_mask=BATCH.begin_mask)  # [T, B, 1]
        q_target = self.critic.t(BATCH.obs_, action_target, begin_mask=BATCH.begin_mask)  # [T, B, 1]
        dc_r = n_step_return(BATCH.reward,
                             self.gamma,
                             BATCH.done,
                             q_target,
                             BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = dc_r - q  # [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # 1
        self.critic_oplr.optimize(q_loss)

        if self.is_continuous:
            mu = self.actor(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
        else:
            logits = self.actor(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
            logp_all = logits.log_softmax(-1)  # [T, B, A]
            gumbel_noise = td.Gumbel(0, 1).sample(logp_all.shape)  # [T, B, A]
            _pi = ((logp_all + gumbel_noise) / self.discrete_tau).softmax(-1)  # [T, B, A]
            _pi_true_one_hot = F.one_hot(_pi.argmax(-1), self.a_dim).float()  # [T, B, A]
            _pi_diff = (_pi_true_one_hot - _pi).detach()  # [T, B, A]
            mu = _pi_diff + _pi  # [T, B, A]
        q_actor = self.critic(BATCH.obs, mu, begin_mask=BATCH.begin_mask)  # [T, B, 1]
        actor_loss = -q_actor.mean()  # 1
        self.actor_oplr.optimize(actor_loss)

        return td_error, {
            'LEARNING_RATE/actor_lr': self.actor_oplr.lr,
            'LEARNING_RATE/critic_lr': self.critic_oplr.lr,
            'LOSS/actor_loss': actor_loss,
            'LOSS/critic_loss': q_loss,
            'Statistics/q_min': q.min(),
            'Statistics/q_mean': q.mean(),
            'Statistics/q_max': q.max()
        }

    def _after_train(self):
        super()._after_train()
        self.actor.sync()
        self.critic.sync()
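
# Hedged illustration of the discrete-action branch in DDPG._train() above:
# DDPG needs a differentiable "action" for the actor loss, so the code draws a
# Gumbel-softmax sample and applies a straight-through trick -- the forward
# value is the hard one-hot argmax, while gradients flow through the soft
# relaxation. Shapes here are made up; `tau` plays the role of `discrete_tau`.
import torch
import torch.distributions as td
import torch.nn.functional as F

logits = torch.randn(256, 6, requires_grad=True)  # actor logits [B, A]
tau = 1.0

logp_all = logits.log_softmax(-1)                                  # [B, A]
gumbel = td.Gumbel(0.0, 1.0).sample(logp_all.shape)                # [B, A] Gumbel(0, 1) noise
pi_soft = ((logp_all + gumbel) / tau).softmax(-1)                  # relaxed (soft) sample
pi_hard = F.one_hot(pi_soft.argmax(-1), logits.shape[-1]).float()  # hard one-hot action

# forward value equals pi_hard, backward gradient equals d(pi_soft)
action = (pi_hard - pi_soft).detach() + pi_soft
action.sum().backward()          # gradients reach `logits` through the soft sample
assert logits.grad is not None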