def __init__(self, wm_lr=1e-3, roll_out_horizon=15, **kwargs):
    super().__init__(**kwargs)
    network_settings = kwargs.get('network_settings', {})
    assert not self.obs_spec.has_visual_observation, 'the world model only supports vector observations'
    assert self.obs_spec.has_vector_observation, 'the world model requires a vector observation'
    self._wm_lr = wm_lr
    self._roll_out_horizon = roll_out_horizon
    self._forward_dynamic_model = VectorSA2S(self.obs_spec.vector_dims[0],
                                             self.a_dim,
                                             hidden_units=network_settings['forward_model'])
    self._reward_model = VectorSA2R(self.obs_spec.vector_dims[0],
                                    self.a_dim,
                                    hidden_units=network_settings['reward_model'])
    self._done_model = VectorSA2D(self.obs_spec.vector_dims[0],
                                  self.a_dim,
                                  hidden_units=network_settings['done_model'])
    self._wm_oplr = OPLR([self._forward_dynamic_model, self._reward_model, self._done_model],
                         self._wm_lr, **self._oplr_params)
    self._trainer_modules.update(_forward_dynamic_model=self._forward_dynamic_model,
                                 _reward_model=self._reward_model,
                                 _done_model=self._done_model,
                                 _wm_oplr=self._wm_oplr)
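# A minimal sketch (not part of the original class) of how the three world-model
# heads above are typically composed for a `roll_out_horizon`-step imagined rollout.
# `policy_fn` and `obs` are hypothetical stand-ins, and the model(obs, action) call
# signatures are assumptions based on the VectorSA2S/R/D constructor arguments.
def imagine_rollout(forward_model, reward_model, done_model, policy_fn, obs, horizon):
    trajectory = []
    for _ in range(horizon):
        action = policy_fn(obs)             # [B, A]
        obs_ = forward_model(obs, action)   # predicted next state       [B, S]
        reward = reward_model(obs, action)  # predicted reward           [B, 1]
        done = done_model(obs, action)      # predicted done probability [B, 1]
        trajectory.append((obs, action, reward, obs_, done))
        obs = obs_
    return trajectory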
def __init__(self,
             lr: float = 5.0e-4,
             eps_init: float = 1,
             eps_mid: float = 0.2,
             eps_final: float = 0.01,
             init2mid_annealing_step: int = 1000,
             assign_interval: int = 1000,
             network_settings: List[int] = [32, 32],
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'dqn only supports discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                      eps_mid=eps_mid,
                                                      eps_final=eps_final,
                                                      init2mid_annealing_step=init2mid_annealing_step,
                                                      max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.q_net = TargetTwin(
        CriticQvalueAll(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dim,
                        network_settings=network_settings)).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
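# A minimal sketch (an assumption, not the library's actual implementation) of the
# init -> mid -> final epsilon schedule that ExplorationExploitationClass appears to
# implement: linear decay from eps_init to eps_mid over init2mid_annealing_step
# steps, then from eps_mid to eps_final over the remaining training steps.
def epsilon_at(step, eps_init=1.0, eps_mid=0.2, eps_final=0.01,
               init2mid_annealing_step=1000, max_step=10000):
    if step < init2mid_annealing_step:
        frac = step / init2mid_annealing_step
        return eps_init + frac * (eps_mid - eps_init)
    frac = min(1.0, (step - init2mid_annealing_step) / (max_step - init2mid_annealing_step))
    return eps_mid + frac * (eps_final - eps_mid)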
def __init__(self,
             nums=20,
             huber_delta=1.,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=1000,
             network_settings=[128, 128],
             **kwargs):
    assert nums > 0, 'the number of quantiles `nums` must be positive'
    super().__init__(**kwargs)
    assert not self.is_continuous, 'qrdqn only supports discrete action space'
    self.nums = nums
    self.huber_delta = huber_delta
    # quantile midpoints tau_i = (2i + 1) / (2N), i = 0..N-1
    self.quantiles = th.tensor(
        (2 * np.arange(self.nums) + 1) / (2.0 * self.nums)).float().to(self.device)  # [N,]
    self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                      eps_mid=eps_mid,
                                                      eps_final=eps_final,
                                                      init2mid_annealing_step=init2mid_annealing_step,
                                                      max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.q_net = TargetTwin(
        QrdqnDistributional(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            action_dim=self.a_dim,
                            nums=self.nums,
                            network_settings=network_settings)).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
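# A minimal sketch (an assumption, not the library's exact loss) of the quantile
# regression Huber loss QR-DQN trains with, using the quantile midpoints
# `self.quantiles` and `self.huber_delta` defined above. `q_eval` holds the [*, N]
# predicted quantiles of the taken action, `q_target` the [*, N'] target quantiles.
import torch.nn.functional as F

def quantile_huber_loss(q_eval, q_target, quantiles, huber_delta=1.0):
    pairwise_delta = q_target.unsqueeze(-2) - q_eval.unsqueeze(-1)  # [*, N, N']
    huber = F.huber_loss(q_eval.unsqueeze(-1).expand_as(pairwise_delta),
                         q_target.unsqueeze(-2).expand_as(pairwise_delta),
                         reduction='none', delta=huber_delta)       # [*, N, N']
    # asymmetric quantile weight |tau_i - 1{delta < 0}|
    weight = (quantiles.view(-1, 1) - (pairwise_delta.detach() < 0).float()).abs()
    return (weight * huber).mean(-1).sum(-1).mean()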
def __init__(self,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=1000,
             head_num=4,
             network_settings=[32, 32],
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'Bootstrapped DQN only supports discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                      eps_mid=eps_mid,
                                                      eps_final=eps_final,
                                                      init2mid_annealing_step=init2mid_annealing_step,
                                                      max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.head_num = head_num
    self._probs = th.FloatTensor([1. / head_num for _ in range(head_num)])
    self.now_head = 0
    self.q_net = TargetTwin(
        CriticQvalueBootstrap(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              output_shape=self.a_dim,
                              head_num=self.head_num,
                              network_settings=network_settings)).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
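# A minimal sketch (an assumption) of how Bootstrapped DQN typically uses the
# uniform head distribution `self._probs` and `self.now_head` built above: sample
# one head per episode and act greedily w.r.t. that head's Q-values, which yields
# temporally extended ("deep") exploration.
import torch as th

def sample_head(probs):
    # probs: [H,] uniform probabilities over heads
    return th.distributions.Categorical(probs=probs).sample().item()

# e.g. at episode reset:  self.now_head = sample_head(self._probs)
# and in select_action:   q = q_all_heads[self.now_head]  # [B, A]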
def __init__(self,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=2,
             network_settings={
                 'share': [128],
                 'v': [128],
                 'adv': [128]
             },
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'dueling double dqn only supports discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                      eps_mid=eps_mid,
                                                      eps_final=eps_final,
                                                      init2mid_annealing_step=init2mid_annealing_step,
                                                      max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.q_net = TargetTwin(
        CriticDueling(self.obs_spec,
                      rep_net_params=self._rep_net_params,
                      output_shape=self.a_dim,
                      network_settings=network_settings)).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
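# A minimal sketch (an assumption about CriticDueling's internals, matching its
# 'v'/'adv' network_settings keys above) of the dueling aggregation:
# Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), where subtracting the mean advantage
# keeps the value/advantage decomposition identifiable.
def dueling_q(v, adv):
    # v: [B, 1] state value; adv: [B, A] advantages
    return v + adv - adv.mean(-1, keepdim=True)  # [B, A]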
def __init__(self,
             agent_spec,
             lr=5.0e-4,
             network_settings={
                 'actor_continuous': {
                     'hidden_units': [32, 32],
                     'condition_sigma': False,
                     'log_std_bound': [-20, 2]
                 },
                 'actor_discrete': [32, 32]
             },
             **kwargs):
    super().__init__(agent_spec=agent_spec, **kwargs)
    if self.is_continuous:
        self.net = ActorMuLogstd(self.obs_spec,
                                 rep_net_params=self._rep_net_params,
                                 output_shape=self.a_dim,
                                 network_settings=network_settings['actor_continuous']).to(self.device)
    else:
        self.net = ActorDct(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            output_shape=self.a_dim,
                            network_settings=network_settings['actor_discrete']).to(self.device)
    self.oplr = OPLR(self.net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.net, oplr=self.oplr)
def __init__(self,
             polyak=0.995,
             noise_action='ou',
             noise_params={'sigma': 0.2},
             use_target_action_noise=False,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             discrete_tau=1.0,
             network_settings={
                 'actor_continuous': [32, 32],
                 'actor_discrete': [32, 32],
                 'q': [32, 32]
             },
             **kwargs):
    super().__init__(**kwargs)
    self.polyak = polyak
    self.discrete_tau = discrete_tau
    self.use_target_action_noise = use_target_action_noise
    if self.is_continuous:
        actor = ActorDPG(self.obs_spec,
                         rep_net_params=self._rep_net_params,
                         output_shape=self.a_dim,
                         network_settings=network_settings['actor_continuous'])
        self.target_noised_action = ClippedNormalNoisedAction(sigma=0.2, noise_bound=0.2)
        if noise_action in ['ou', 'clip_normal']:
            self.noised_action = Noise_action_REGISTER[noise_action](**noise_params)
        elif noise_action == 'normal':
            self.noised_action = self.target_noised_action
        else:
            raise Exception(f'cannot use noised action type of {noise_action}')
    else:
        actor = ActorDct(self.obs_spec,
                         rep_net_params=self._rep_net_params,
                         output_shape=self.a_dim,
                         network_settings=network_settings['actor_discrete'])
    self.actor = TargetTwin(actor, self.polyak).to(self.device)
    self.critic = TargetTwin(
        CriticQvalueOne(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        action_dim=self.a_dim,
                        network_settings=network_settings['q']),
        self.polyak).to(self.device)
    self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
    self.critic_oplr = OPLR(self.critic, critic_lr, **self._oplr_params)
    self._trainer_modules.update(actor=self.actor,
                                 critic=self.critic,
                                 actor_oplr=self.actor_oplr,
                                 critic_oplr=self.critic_oplr)
def __init__(self,
             agent_spec,
             actor_step_size=0.5,
             beta=1.0e-3,
             lambda_=0.95,
             cg_iters=10,
             damping_coeff=0.1,
             epsilon=0.2,
             critic_lr=1e-3,
             train_critic_iters=10,
             network_settings={
                 'actor_continuous': {
                     'hidden_units': [64, 64],
                     'condition_sigma': False,
                     'log_std_bound': [-20, 2]
                 },
                 'actor_discrete': [32, 32],
                 'critic': [32, 32]
             },
             **kwargs):
    super().__init__(agent_spec=agent_spec, **kwargs)
    self.actor_step_size = actor_step_size
    self.beta = beta
    self.lambda_ = lambda_
    self._epsilon = epsilon
    self._cg_iters = cg_iters
    self._damping_coeff = damping_coeff
    self._train_critic_iters = train_critic_iters
    if self.is_continuous:
        self.actor = ActorMuLogstd(self.obs_spec,
                                   rep_net_params=self._rep_net_params,
                                   output_shape=self.a_dim,
                                   network_settings=network_settings['actor_continuous']).to(self.device)
    else:
        self.actor = ActorDct(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              output_shape=self.a_dim,
                              network_settings=network_settings['actor_discrete']).to(self.device)
    self.critic = CriticValue(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              network_settings=network_settings['critic']).to(self.device)
    self.critic_oplr = OPLR(self.critic, critic_lr, **self._oplr_params)
    self._trainer_modules.update(actor=self.actor,
                                 critic=self.critic,
                                 critic_oplr=self.critic_oplr)
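# A minimal sketch (an assumption, standard TRPO machinery) of the conjugate
# gradient solver that `cg_iters` and `damping_coeff` above parameterize: solve
# (H + damping * I) x = g for the natural-gradient direction without ever forming
# the Fisher matrix H; `hvp_fn` is a hypothetical Hessian-vector-product callback.
import torch as th

def conjugate_gradient(hvp_fn, g, iters=10, damping=0.1, tol=1e-10):
    x = th.zeros_like(g)
    r = g.clone()
    p = g.clone()
    rs_old = r.dot(r)
    for _ in range(iters):
        hp = hvp_fn(p) + damping * p        # Hessian-vector product plus damping
        alpha = rs_old / (p.dot(hp) + 1e-8)
        x += alpha * p
        r -= alpha * hp
        rs_new = r.dot(r)
        if rs_new < tol:
            break
        p = r + (rs_new / rs_old) * p
        rs_old = rs_new
    return x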
def __init__(self,
             mixer='vdn',
             mixer_settings={},
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             use_double=True,
             init2mid_annealing_step=1000,
             assign_interval=1000,
             network_settings={
                 'share': [128],
                 'v': [128],
                 'adv': [128]
             },
             **kwargs):
    super().__init__(**kwargs)
    assert not any(list(self.is_continuouss.values())), 'VDN only supports discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                      eps_mid=eps_mid,
                                                      eps_final=eps_final,
                                                      init2mid_annealing_step=init2mid_annealing_step,
                                                      max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self._use_double = use_double
    self._mixer_type = mixer
    self._mixer_settings = mixer_settings
    self.q_nets = {}
    for id in set(self.model_ids):
        self.q_nets[id] = TargetTwin(
            CriticDueling(self.obs_specs[id],
                          rep_net_params=self._rep_net_params,
                          output_shape=self.a_dims[id],
                          network_settings=network_settings)).to(self.device)
    self.mixer = self._build_mixer()
    self.oplr = OPLR(tuple(self.q_nets.values()) + (self.mixer,), lr, **self._oplr_params)
    self._trainer_modules.update({f"model_{id}": self.q_nets[id] for id in set(self.model_ids)})
    self._trainer_modules.update(mixer=self.mixer, oplr=self.oplr)
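# A minimal sketch (an assumption about what `_build_mixer()` returns when
# mixer='vdn') of value decomposition: the joint action-value is just the sum of
# the per-agent chosen Q-values, so the joint argmax factorizes per agent.
import torch.nn as nn

class VDNMixer(nn.Module):
    def forward(self, agent_qs):
        # agent_qs: [T, B, n_agents] chosen Q-value per agent
        return agent_qs.sum(-1, keepdim=True)  # joint Q  [T, B, 1]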
def __init__(self,
             agent_spec,
             beta=1.0e-3,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             network_settings={
                 'actor_continuous': {
                     'hidden_units': [64, 64],
                     'condition_sigma': False,
                     'log_std_bound': [-20, 2]
                 },
                 'actor_discrete': [32, 32],
                 'critic': [32, 32]
             },
             **kwargs):
    super().__init__(agent_spec=agent_spec, **kwargs)
    self.beta = beta
    if self.is_continuous:
        self.actor = ActorMuLogstd(self.obs_spec,
                                   rep_net_params=self._rep_net_params,
                                   output_shape=self.a_dim,
                                   network_settings=network_settings['actor_continuous']).to(self.device)
    else:
        self.actor = ActorDct(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              output_shape=self.a_dim,
                              network_settings=network_settings['actor_discrete']).to(self.device)
    self.critic = CriticValue(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              network_settings=network_settings['critic']).to(self.device)
    self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
    self.critic_oplr = OPLR(self.critic, critic_lr, **self._oplr_params)
    self._trainer_modules.update(actor=self.actor,
                                 critic=self.critic,
                                 actor_oplr=self.actor_oplr,
                                 critic_oplr=self.critic_oplr)
def __init__(self,
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             use_target_action_noise=False,
             noise_action='ou',
             noise_params={'sigma': 0.2},
             discrete_tau=1.0,
             network_settings={
                 'actor_continuous': [32, 32],
                 'actor_discrete': [32, 32],
                 'q': [32, 32]
             },
             **kwargs):
    super().__init__(**kwargs)
    self.discrete_tau = discrete_tau
    self.use_target_action_noise = use_target_action_noise
    if self.is_continuous:
        self.target_noised_action = ClippedNormalNoisedAction(sigma=0.2, noise_bound=0.2)
        self.noised_action = Noise_action_REGISTER[noise_action](**noise_params)
        self.actor = ActorDPG(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              output_shape=self.a_dim,
                              network_settings=network_settings['actor_continuous']).to(self.device)
    else:
        self.actor = ActorDct(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              output_shape=self.a_dim,
                              network_settings=network_settings['actor_discrete']).to(self.device)
    self.critic = CriticQvalueOne(self.obs_spec,
                                  rep_net_params=self._rep_net_params,
                                  action_dim=self.a_dim,
                                  network_settings=network_settings['q']).to(self.device)
    self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
    self.critic_oplr = OPLR(self.critic, critic_lr, **self._oplr_params)
    self._trainer_modules.update(actor=self.actor,
                                 critic=self.critic,
                                 actor_oplr=self.actor_oplr,
                                 critic_oplr=self.critic_oplr)
def __init__(self,
             lr=5.0e-4,
             alpha=2,
             polyak=0.995,
             network_settings=[32, 32],
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'sql only supports discrete action space'
    self.alpha = alpha
    self.polyak = polyak
    self.q_net = TargetTwin(
        CriticQvalueAll(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dim,
                        network_settings=network_settings),
        self.polyak).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
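# A minimal sketch (an assumption, following soft Q-learning for discrete actions)
# of the soft state value implied by `alpha` above:
#   V(s) = alpha * log sum_a exp(Q(s, a) / alpha),
# with the corresponding softmax policy pi(a|s) proportional to exp(Q(s, a) / alpha).
import torch as th

def soft_value(q, alpha):
    # q: [B, A] action values
    return alpha * th.logsumexp(q / alpha, dim=-1, keepdim=True)  # [B, 1]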
def __init__(self,
             v_min=-10,
             v_max=10,
             atoms=51,
             lr=5.0e-4,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             assign_interval=2,
             network_settings={
                 'share': [128],
                 'v': [128],
                 'adv': [128]
             },
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'rainbow only supports discrete action space'
    self._v_min = v_min
    self._v_max = v_max
    self._atoms = atoms
    self._delta_z = (self._v_max - self._v_min) / (self._atoms - 1)
    self._z = th.linspace(self._v_min, self._v_max, self._atoms).float().to(self.device)  # [N,]
    self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                      eps_mid=eps_mid,
                                                      eps_final=eps_final,
                                                      init2mid_annealing_step=init2mid_annealing_step,
                                                      max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.rainbow_net = TargetTwin(
        RainbowDueling(self.obs_spec,
                       rep_net_params=self._rep_net_params,
                       action_dim=self.a_dim,
                       atoms=self._atoms,
                       network_settings=network_settings)).to(self.device)
    self.rainbow_net.target.train()  # so that NoisyLinear takes effect
    self.oplr = OPLR(self.rainbow_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.rainbow_net, oplr=self.oplr)
def __init__(self,
             target_k: int = 4,
             lr: float = 5.0e-4,
             eps_init: float = 1,
             eps_mid: float = 0.2,
             eps_final: float = 0.01,
             init2mid_annealing_step: int = 1000,
             assign_interval: int = 1000,
             network_settings: List[int] = [32, 32],
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'dqn only supports discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                      eps_mid=eps_mid,
                                                      eps_final=eps_final,
                                                      init2mid_annealing_step=init2mid_annealing_step,
                                                      max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.target_k = target_k
    assert self.target_k > 0, 'target_k must be a positive number of target networks'
    self.current_target_idx = 0
    self.q_net = CriticQvalueAll(self.obs_spec,
                                 rep_net_params=self._rep_net_params,
                                 output_shape=self.a_dim,
                                 network_settings=network_settings).to(self.device)
    self.target_nets = []
    for _ in range(self.target_k):
        target_q_net = deepcopy(self.q_net)
        target_q_net.eval()
        sync_params(target_q_net, self.q_net)
        self.target_nets.append(target_q_net)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
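# A minimal sketch (an assumption, following Averaged-DQN) of how the `target_k`
# frozen copies above are typically used: the TD target averages the Q estimates
# of all K target networks to reduce target variance, while `current_target_idx`
# tracks which copy is overwritten next at each `assign_interval`.
import torch as th

def averaged_target_q(target_nets, obs_):
    with th.no_grad():
        qs = th.stack([net(obs_) for net in target_nets], dim=0)  # [K, B, A]
    return qs.mean(0)  # [B, A]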
class CuriosityModel(nn.Module):
    """
    Model of the Intrinsic Curiosity Module (ICM).
    Curiosity-driven Exploration by Self-supervised Prediction, https://arxiv.org/abs/1705.05363
    """

    def __init__(self,
                 obs_spec,
                 rep_net_params,
                 is_continuous,
                 action_dim,
                 *,
                 eta=0.2,
                 lr=1.0e-3,
                 beta=0.2):
        """
        params:
            is_continuous: specify whether the action space is continuous (True) or discrete (False)
            action_dim: dimension of action
            eta: weight of intrinsic reward
            lr: learning rate of the curiosity model
            beta: weight factor balancing the losses of inverse_dynamic_net and forward_net
        """
        super().__init__()
        self.eta = eta
        self.beta = beta
        self.is_continuous = is_continuous
        self.action_dim = action_dim
        self.rep_net = RepresentationNetwork(obs_spec=obs_spec,
                                             rep_net_params=rep_net_params)
        self.feat_dim = self.rep_net.h_dim

        # S, S' => A
        self.inverse_dynamic_net = nn.Sequential(
            nn.Linear(self.feat_dim * 2, self.feat_dim * 2),
            Act_REGISTER[default_act](),
            nn.Linear(self.feat_dim * 2, action_dim))
        if self.is_continuous:
            self.inverse_dynamic_net.add_module('tanh', nn.Tanh())

        # S, A => S'
        self.forward_net = nn.Sequential(
            nn.Linear(self.feat_dim + action_dim, self.feat_dim),
            Act_REGISTER[default_act](),
            nn.Linear(self.feat_dim, self.feat_dim))

        self.oplr = OPLR(models=[self.rep_net, self.inverse_dynamic_net, self.forward_net],
                         lr=lr)

    def forward(self, BATCH):
        fs, _ = self.rep_net(BATCH.obs, begin_mask=BATCH.begin_mask)    # [T, B, *]
        fs_, _ = self.rep_net(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, *]

        # [T, B, *]  <S, A> => S'
        s_eval = self.forward_net(th.cat((fs, BATCH.action), -1))
        LF = 0.5 * (fs_ - s_eval).square().sum(-1, keepdim=True)  # [T, B, 1]
        intrinsic_reward = self.eta * LF
        loss_forward = LF.mean()  # 1

        a_eval = self.inverse_dynamic_net(th.cat((fs, fs_), -1))  # [T, B, *]
        if self.is_continuous:
            loss_inverse = 0.5 * (a_eval - BATCH.action).square().sum(-1).mean()
        else:
            idx = BATCH.action.argmax(-1)  # [T, B]
            loss_inverse = F.cross_entropy(a_eval.view(-1, self.action_dim), idx.view(-1))  # 1

        loss = (1 - self.beta) * loss_inverse + self.beta * loss_forward
        self.oplr.optimize(loss)
        summaries = {
            'LOSS/curiosity_loss': loss,
            'LOSS/forward_loss': loss_forward,
            'LOSS/inverse_loss': loss_inverse
        }
        return intrinsic_reward, summaries
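# A minimal usage sketch (names `curiosity` and `batch`, and the reward-augmentation
# step, are assumptions, not the library's trainer code):
#
#   curiosity = CuriosityModel(obs_spec, rep_net_params, is_continuous, action_dim)
#   intrinsic_reward, icm_summaries = curiosity(batch)       # [T, B, 1]
#   batch.reward = batch.reward + intrinsic_reward.detach()  # augment extrinsic reward
#
# Detaching keeps the policy gradient from flowing into the curiosity model, which
# already optimizes its own networks via the OPLR call inside forward().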
class MAXSQN(SarlOffPolicy):
    """
    https://github.com/createamind/DRL/blob/master/spinup/algos/maxsqn/maxsqn.py
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 alpha=0.2,
                 beta=0.1,
                 polyak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 network_settings=[32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'maxsqn only supports discrete action space'
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.use_epsilon = use_epsilon
        self.polyak = polyak
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)
        self.critic = TargetTwin(CriticQvalueAll(self.obs_spec,
                                                 rep_net_params=self._rep_net_params,
                                                 output_shape=self.a_dim,
                                                 network_settings=network_settings),
                                 self.polyak).to(self.device)
        self.critic2 = deepcopy(self.critic)
        self.critic_oplr = OPLR([self.critic, self.critic2], q_lr, **self._oplr_params)

        if self.auto_adaption:
            self.log_alpha = th.tensor(0., requires_grad=True).to(self.device)
            self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
            self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
        else:
            self.log_alpha = th.tensor(alpha).log().to(self.device)

        self._trainer_modules.update(critic=self.critic,
                                     critic2=self.critic2,
                                     log_alpha=self.log_alpha,
                                     critic_oplr=self.critic_oplr)

    @property
    def alpha(self):
        return self.log_alpha.exp()

    @iton
    def select_action(self, obs):
        q = self.critic(obs, rnncs=self.rnncs)  # [B, A]
        self.rnncs_ = self.critic.get_rnncs()
        if self.use_epsilon and self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            cate_dist = td.Categorical(logits=(q / self.alpha))
            mu = q.argmax(-1)  # [B,]
            actions = pi = cate_dist.sample()  # [B,]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q1 = self.critic(BATCH.obs, begin_mask=BATCH.begin_mask)   # [T, B, A]
        q2 = self.critic2(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q1_eval = (q1 * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]
        q2_eval = (q2 * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]

        q1_log_probs = (q1 / (self.alpha + th.finfo().eps)).log_softmax(-1)  # [T, B, A]
        q1_entropy = -(q1_log_probs.exp() * q1_log_probs).sum(-1, keepdim=True).mean()  # 1

        q1_target = self.critic.t(BATCH.obs_, begin_mask=BATCH.begin_mask)   # [T, B, A]
        q2_target = self.critic2.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q1_target_max = q1_target.max(-1, keepdim=True)[0]  # [T, B, 1]
        q1_target_log_probs = (q1_target / (self.alpha + th.finfo().eps)).log_softmax(-1)  # [T, B, A]
        q1_target_entropy = -(q1_target_log_probs.exp() * q1_target_log_probs).sum(-1, keepdim=True)  # [T, B, 1]

        q2_target_max = q2_target.max(-1, keepdim=True)[0]  # [T, B, 1]
        # q2_target_log_probs = q2_target.log_softmax(-1)
        # q2_target_log_max = q2_target_log_probs.max(1, keepdim=True)[0]

        q_target = th.minimum(q1_target_max, q2_target_max) + self.alpha * q1_target_entropy  # [T, B, 1]
        dc_r = n_step_return(BATCH.reward,
                             self.gamma,
                             BATCH.done,
                             q_target,
                             BATCH.begin_mask).detach()  # [T, B, 1]
        td_error1 = q1_eval - dc_r  # [T, B, 1]
        td_error2 = q2_eval - dc_r  # [T, B, 1]
        q1_loss = (td_error1.square() * BATCH.get('isw', 1.0)).mean()  # 1
        q2_loss = (td_error2.square() * BATCH.get('isw', 1.0)).mean()  # 1
        loss = 0.5 * (q1_loss + q2_loss)
        self.critic_oplr.optimize(loss)

        summaries = {
            'LEARNING_RATE/critic_lr': self.critic_oplr.lr,
            'LOSS/loss': loss,
            'Statistics/log_alpha': self.log_alpha,
            'Statistics/alpha': self.alpha,
            'Statistics/q1_entropy': q1_entropy,
            'Statistics/q_min': th.minimum(q1, q2).mean(),
            'Statistics/q_mean': q1.mean(),
            'Statistics/q_max': th.maximum(q1, q2).mean()
        }
        if self.auto_adaption:
            alpha_loss = -(self.alpha * (self.target_entropy - q1_entropy).detach()).mean()
            self.alpha_oplr.optimize(alpha_loss)
            summaries.update({
                'LOSS/alpha_loss': alpha_loss,
                'LEARNING_RATE/alpha_lr': self.alpha_oplr.lr
            })
        return (td_error1 + td_error2) / 2, summaries

    def _after_train(self):
        super()._after_train()
        self.critic.sync()
        self.critic2.sync()
class C51(SarlOffPolicy):
    """
    Category 51, https://arxiv.org/abs/1707.06887
    No double, no dueling, no noisy net.
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 v_min=-10,
                 v_max=10,
                 atoms=51,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 network_settings=[128, 128],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'c51 only supports discrete action space'
        self._v_min = v_min
        self._v_max = v_max
        self._atoms = atoms
        self._delta_z = (self._v_max - self._v_min) / (self._atoms - 1)
        self._z = th.linspace(self._v_min, self._v_max, self._atoms).float().to(self.device)  # [N,]
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.q_net = TargetTwin(
            C51Distributional(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              action_dim=self.a_dim,
                              atoms=self._atoms,
                              network_settings=network_settings)).to(self.device)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)

    @iton
    def select_action(self, obs):
        feat = self.q_net(obs, rnncs=self.rnncs)  # [B, A, N]
        self.rnncs_ = self.q_net.get_rnncs()
        if self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            q = (self._z * feat).sum(-1)  # [B, A, N] * [N,] => [B, A]
            actions = q.argmax(-1)  # [B,]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q_dist = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A, N]
        # [T, B, A, N] * [T, B, A, 1] => [T, B, A, N] => [T, B, N]
        q_dist = (q_dist * BATCH.action.unsqueeze(-1)).sum(-2)
        q_eval = (q_dist * self._z).sum(-1)  # [T, B, N] * [N,] => [T, B]

        target_q_dist = self.q_net.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A, N]
        # [T, B, A, N] * [1, N] => [T, B, A]
        target_q = (target_q_dist * self._z).sum(-1)
        a_ = target_q.argmax(-1)  # [T, B]
        a_onehot = F.one_hot(a_, self.a_dim).float()  # [T, B, A]
        # [T, B, A, N] * [T, B, A, 1] => [T, B, A, N] => [T, B, N]
        target_q_dist = (target_q_dist * a_onehot.unsqueeze(-1)).sum(-2)

        target = n_step_return(BATCH.reward.repeat(1, 1, self._atoms),
                               self.gamma,
                               BATCH.done.repeat(1, 1, self._atoms),
                               target_q_dist,
                               BATCH.begin_mask.repeat(1, 1, self._atoms)).detach()  # [T, B, N]
        target = target.clamp(self._v_min, self._v_max)  # [T, B, N]

        # An amazing trick for calculating the projection gracefully.
        # ref: https://github.com/ShangtongZhang/DeepRL
        # Compare every shifted target atom (last dim) against every support atom
        # z_i (dim -2) pairwise, so each target atom's mass is linearly split
        # between its two neighbouring support atoms.
        target_dist = (1 - (target.unsqueeze(-2) - self._z.view(1, 1, -1, 1)).abs() / self._delta_z
                       ).clamp(0, 1) * target_q_dist.unsqueeze(-2)  # [T, B, N, N]
        target_dist = target_dist.sum(-1)  # [T, B, N]

        _cross_entropy = -(target_dist * th.log(q_dist + th.finfo().eps)).sum(-1, keepdim=True)  # [T, B, 1]
        loss = (_cross_entropy * BATCH.get('isw', 1.0)).mean()  # 1
        self.oplr.optimize(loss)

        return _cross_entropy, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
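# A minimal standalone sketch (an assumption) of the same categorical projection
# written without the class plumbing: each target atom's probability mass is
# distributed linearly between the two support atoms that bracket it.
def project_distribution(target_atoms, target_probs, z, delta_z):
    # target_atoms, target_probs: [B, N]; z: [N,] fixed support
    diff = (target_atoms.unsqueeze(-2) - z.view(-1, 1)).abs()  # [B, N, N] pairwise
    kernel = (1 - diff / delta_z).clamp(0, 1)                  # triangular weights
    return (kernel * target_probs.unsqueeze(-2)).sum(-1)       # projected dist [B, N]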
class MADDPG(MultiAgentOffPolicy):
    """
    Multi-Agent Deep Deterministic Policy Gradient, https://arxiv.org/abs/1706.02275
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 polyak=0.995,
                 noise_action='ou',
                 noise_params={'sigma': 0.2},
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        """
        TODO: Annotation
        """
        super().__init__(**kwargs)
        self.polyak = polyak
        self.discrete_tau = discrete_tau
        self.actors, self.critics = {}, {}
        for id in set(self.model_ids):
            if self.is_continuouss[id]:
                self.actors[id] = TargetTwin(
                    ActorDPG(self.obs_specs[id],
                             rep_net_params=self._rep_net_params,
                             output_shape=self.a_dims[id],
                             network_settings=network_settings['actor_continuous']),
                    self.polyak).to(self.device)
            else:
                self.actors[id] = TargetTwin(
                    ActorDct(self.obs_specs[id],
                             rep_net_params=self._rep_net_params,
                             output_shape=self.a_dims[id],
                             network_settings=network_settings['actor_discrete']),
                    self.polyak).to(self.device)
            self.critics[id] = TargetTwin(
                MACriticQvalueOne(list(self.obs_specs.values()),
                                  rep_net_params=self._rep_net_params,
                                  action_dim=sum(self.a_dims.values()),
                                  network_settings=network_settings['q']),
                self.polyak).to(self.device)
        self.actor_oplr = OPLR(list(self.actors.values()), actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR(list(self.critics.values()), critic_lr, **self._oplr_params)

        # TODO: add a check on the action type
        self.noised_actions = {
            id: Noise_action_REGISTER[noise_action](**noise_params)
            for id in set(self.model_ids) if self.is_continuouss[id]
        }

        self._trainer_modules.update({f"actor_{id}": self.actors[id] for id in set(self.model_ids)})
        self._trainer_modules.update({f"critic_{id}": self.critics[id] for id in set(self.model_ids)})
        self._trainer_modules.update(actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)

    def episode_reset(self):
        super().episode_reset()
        for noised_action in self.noised_actions.values():
            noised_action.reset()

    @iton
    def select_action(self, obs: Dict):
        acts_info = {}
        actions = {}
        for aid, mid in zip(self.agent_ids, self.model_ids):
            output = self.actors[mid](obs[aid], rnncs=self.rnncs[aid])  # [B, A]
            self.rnncs_[aid] = self.actors[mid].get_rnncs()
            if self.is_continuouss[aid]:
                mu = output  # [B, A]
                pi = self.noised_actions[mid](mu)  # [B, A]
            else:
                logits = output  # [B, A]
                mu = logits.argmax(-1)  # [B,]
                cate_dist = td.Categorical(logits=logits)
                pi = cate_dist.sample()  # [B,]
            action = pi if self._is_train_mode else mu
            acts_info[aid] = Data(action=action)
            actions[aid] = action
        return actions, acts_info

    @iton
    def _train(self, BATCH_DICT):
        """
        TODO: Annotation
        """
        summaries = defaultdict(dict)
        target_actions = {}
        for aid, mid in zip(self.agent_ids, self.model_ids):
            if self.is_continuouss[aid]:
                target_actions[aid] = self.actors[mid].t(
                    BATCH_DICT[aid].obs_,
                    begin_mask=BATCH_DICT['global'].begin_mask)  # [T, B, A]
            else:
                target_logits = self.actors[mid].t(
                    BATCH_DICT[aid].obs_,
                    begin_mask=BATCH_DICT['global'].begin_mask)  # [T, B, A]
                target_cate_dist = td.Categorical(logits=target_logits)
                target_pi = target_cate_dist.sample()  # [T, B]
                action_target = F.one_hot(target_pi, self.a_dims[aid]).float()  # [T, B, A]
                target_actions[aid] = action_target  # [T, B, A]
        target_actions = th.cat(list(target_actions.values()), -1)  # [T, B, N*A]

        qs, q_targets = {}, {}
        for mid in self.model_ids:
            qs[mid] = self.critics[mid](
                [BATCH_DICT[id].obs for id in self.agent_ids],
                th.cat([BATCH_DICT[id].action for id in self.agent_ids], -1))  # [T, B, 1]
            q_targets[mid] = self.critics[mid].t(
                [BATCH_DICT[id].obs_ for id in self.agent_ids],
                target_actions)  # [T, B, 1]

        q_loss = {}
        td_errors = 0.
        for aid, mid in zip(self.agent_ids, self.model_ids):
            dc_r = n_step_return(BATCH_DICT[aid].reward,
                                 self.gamma,
                                 BATCH_DICT[aid].done,
                                 q_targets[mid],
                                 BATCH_DICT['global'].begin_mask).detach()  # [T, B, 1]
            td_error = dc_r - qs[mid]  # [T, B, 1]
            td_errors += td_error
            q_loss[aid] = 0.5 * td_error.square().mean()  # 1
            summaries[aid].update({
                'Statistics/q_min': qs[mid].min(),
                'Statistics/q_mean': qs[mid].mean(),
                'Statistics/q_max': qs[mid].max()
            })
        self.critic_oplr.optimize(sum(q_loss.values()))

        actor_loss = {}
        for aid, mid in zip(self.agent_ids, self.model_ids):
            if self.is_continuouss[aid]:
                mu = self.actors[mid](BATCH_DICT[aid].obs,
                                      begin_mask=BATCH_DICT['global'].begin_mask)  # [T, B, A]
            else:
                logits = self.actors[mid](BATCH_DICT[aid].obs,
                                          begin_mask=BATCH_DICT['global'].begin_mask)  # [T, B, A]
                logp_all = logits.log_softmax(-1)  # [T, B, A]
                gumbel_noise = td.Gumbel(0, 1).sample(logp_all.shape)  # [T, B, A]
                _pi = ((logp_all + gumbel_noise) / self.discrete_tau).softmax(-1)  # [T, B, A]
                _pi_true_one_hot = F.one_hot(_pi.argmax(-1), self.a_dims[aid]).float()  # [T, B, A]
                _pi_diff = (_pi_true_one_hot - _pi).detach()  # [T, B, A]
                mu = _pi_diff + _pi  # [T, B, A]
            all_actions = {id: BATCH_DICT[id].action for id in self.agent_ids}
            all_actions[aid] = mu
            q_actor = self.critics[mid](
                [BATCH_DICT[id].obs for id in self.agent_ids],
                th.cat(list(all_actions.values()), -1),
                begin_mask=BATCH_DICT['global'].begin_mask)  # [T, B, 1]
            actor_loss[aid] = -q_actor.mean()  # 1
        self.actor_oplr.optimize(sum(actor_loss.values()))

        for aid in self.agent_ids:
            summaries[aid].update({
                'LOSS/actor_loss': actor_loss[aid],
                'LOSS/critic_loss': q_loss[aid]
            })
        summaries['model'].update({
            'LOSS/actor_loss': sum(actor_loss.values()),
            'LOSS/critic_loss': sum(q_loss.values())
        })
        return td_errors / self.n_agents_percopy, summaries

    def _after_train(self):
        super()._after_train()
        for actor in self.actors.values():
            actor.sync()
        for critic in self.critics.values():
            critic.sync()
class OC(SarlOffPolicy):
    """
    The Option-Critic Architecture. http://arxiv.org/abs/1609.05140
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 q_lr=5.0e-3,
                 intra_option_lr=5.0e-4,
                 termination_lr=5.0e-4,
                 use_eps_greedy=False,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 boltzmann_temperature=1.0,
                 options_num=4,
                 ent_coff=0.01,
                 double_q=False,
                 use_baseline=True,
                 terminal_mask=True,
                 termination_regularizer=0.01,
                 assign_interval=1000,
                 network_settings={
                     'q': [32, 32],
                     'intra_option': [32, 32],
                     'termination': [32, 32]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.options_num = options_num
        self.termination_regularizer = termination_regularizer
        self.ent_coff = ent_coff
        self.use_baseline = use_baseline
        self.terminal_mask = terminal_mask
        self.double_q = double_q
        self.boltzmann_temperature = boltzmann_temperature
        self.use_eps_greedy = use_eps_greedy
        self.q_net = TargetTwin(
            CriticQvalueAll(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            output_shape=self.options_num,
                            network_settings=network_settings['q'])).to(self.device)
        self.intra_option_net = OcIntraOption(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.a_dim,
            options_num=self.options_num,
            network_settings=network_settings['intra_option']).to(self.device)
        self.termination_net = CriticQvalueAll(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.options_num,
            network_settings=network_settings['termination'],
            out_act='sigmoid').to(self.device)
        if self.is_continuous:
            # https://discuss.pytorch.org/t/valueerror-cant-optimize-a-non-leaf-tensor/21751
            # https://blog.csdn.net/nkhgl/article/details/100047276
            self.log_std = th.as_tensor(
                np.full((self.options_num, self.a_dim), -0.5)).requires_grad_().to(self.device)  # [P, A]
            self.intra_option_oplr = OPLR([self.intra_option_net, self.log_std],
                                          intra_option_lr, **self._oplr_params)
        else:
            self.intra_option_oplr = OPLR(self.intra_option_net,
                                          intra_option_lr, **self._oplr_params)
        self.q_oplr = OPLR(self.q_net, q_lr, **self._oplr_params)
        self.termination_oplr = OPLR(self.termination_net, termination_lr, **self._oplr_params)

        self._trainer_modules.update(q_net=self.q_net,
                                     intra_option_net=self.intra_option_net,
                                     termination_net=self.termination_net,
                                     q_oplr=self.q_oplr,
                                     intra_option_oplr=self.intra_option_oplr,
                                     termination_oplr=self.termination_oplr)
        self.options = self.new_options = self._generate_random_options()

    def _generate_random_options(self):
        # [B,]
        return th.tensor(np.random.randint(0, self.options_num, self.n_copies)).to(self.device)

    def episode_step(self, obs: Data, env_rets: Data, begin_mask: np.ndarray):
        super().episode_step(obs, env_rets, begin_mask)
        self.options = self.new_options

    @iton
    def select_action(self, obs):
        q = self.q_net(obs, rnncs=self.rnncs)  # [B, P]
        self.rnncs_ = self.q_net.get_rnncs()
        pi = self.intra_option_net(obs, rnncs=self.rnncs)  # [B, P, A]
        beta = self.termination_net(obs, rnncs=self.rnncs)  # [B, P]
        options_onehot = F.one_hot(self.options, self.options_num).float()  # [B, P]
        options_onehot_expanded = options_onehot.unsqueeze(-1)  # [B, P, 1]
        pi = (pi * options_onehot_expanded).sum(-2)  # [B, A]
        if self.is_continuous:
            mu = pi.tanh()  # [B, A]
            log_std = self.log_std[self.options]  # [B, A]
            dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
            actions = dist.sample().clamp(-1, 1)  # [B, A]
        else:
            pi = pi / self.boltzmann_temperature  # [B, A]
            dist = td.Categorical(logits=pi)
            actions = dist.sample()  # [B,]
        max_options = q.argmax(-1).long()  # [B, P] => [B,]
        if self.use_eps_greedy:
            # epsilon greedy
            if self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
                self.new_options = self._generate_random_options()
            else:
                self.new_options = max_options
        else:
            beta_probs = (beta * options_onehot).sum(-1)  # [B, P] => [B,]
            beta_dist = td.Bernoulli(probs=beta_probs)
            self.new_options = th.where(beta_dist.sample() < 1, self.options, max_options)
        return actions, Data(action=actions,
                             last_options=self.options,
                             options=self.new_options)

    def random_action(self):
        actions = super().random_action()
        self._acts_info.update(
            last_options=np.random.randint(0, self.options_num, self.n_copies),
            options=np.random.randint(0, self.options_num, self.n_copies))
        return actions

    def _preprocess_BATCH(self, BATCH):  # [T, B, *]
        BATCH = super()._preprocess_BATCH(BATCH)
        BATCH.last_options = int2one_hot(BATCH.last_options, self.options_num)
        BATCH.options = int2one_hot(BATCH.options, self.options_num)
        return BATCH

    @iton
    def _train(self, BATCH):
        q = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, P]
        q_next = self.q_net.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, P]
        beta_next = self.termination_net(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, P]
        qu_eval = (q * BATCH.options).sum(-1, keepdim=True)  # [T, B, 1]
        beta_s_ = (beta_next * BATCH.options).sum(-1, keepdim=True)  # [T, B, 1]
        q_s_ = (q_next * BATCH.options).sum(-1, keepdim=True)  # [T, B, 1]
        # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L94
        if self.double_q:
            q_ = self.q_net(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, P]
            # [T, B, P] => [T, B] => [T, B, P]
            max_a_idx = F.one_hot(q_.argmax(-1), self.options_num).float()
            q_s_max = (q_next * max_a_idx).sum(-1, keepdim=True)  # [T, B, 1]
        else:
            q_s_max = q_next.max(-1, keepdim=True)[0]  # [T, B, 1]
        u_target = (1 - beta_s_) * q_s_ + beta_s_ * q_s_max  # [T, B, 1]
        qu_target = n_step_return(BATCH.reward,
                                  self.gamma,
                                  BATCH.done,
                                  u_target,
                                  BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = qu_target - qu_eval  # gradient : q  [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # [T, B, 1] => 1
        self.q_oplr.optimize(q_loss)

        q_s = qu_eval.detach()  # [T, B, 1]
        # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L130
        if self.use_baseline:
            adv = (qu_target - q_s).detach()  # [T, B, 1]
        else:
            adv = qu_target.detach()  # [T, B, 1]
        options_onehot_expanded = BATCH.options.unsqueeze(-1)  # [T, B, P] => [T, B, P, 1]
        pi = self.intra_option_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, P, A]
        pi = (pi * options_onehot_expanded).sum(-2)  # [T, B, P, A] => [T, B, A]
        if self.is_continuous:
            mu = pi.tanh()  # [T, B, A]
            log_std = self.log_std[BATCH.options.argmax(-1)]  # [T, B, A]
            dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
            log_p = dist.log_prob(BATCH.action).unsqueeze(-1)  # [T, B, 1]
            entropy = dist.entropy().unsqueeze(-1)  # [T, B, 1]
        else:
            pi = pi / self.boltzmann_temperature  # [T, B, A]
            log_pi = pi.log_softmax(-1)  # [T, B, A]
            entropy = -(log_pi.exp() * log_pi).sum(-1, keepdim=True)  # [T, B, 1]
            log_p = (BATCH.action * log_pi).sum(-1, keepdim=True)  # [T, B, 1]
        pi_loss = -(log_p * adv + self.ent_coff * entropy).mean()  # 1

        beta = self.termination_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, P]
        beta_s = (beta * BATCH.last_options).sum(-1, keepdim=True)  # [T, B, 1]
        if self.use_eps_greedy:
            v_s = q.max(-1, keepdim=True)[0] - self.termination_regularizer  # [T, B, 1]
        else:
            v_s = (1 - beta_s) * q_s + beta_s * q.max(-1, keepdim=True)[0]  # [T, B, 1]
        # v_s = q.mean(-1, keepdim=True)  # [T, B, 1]
        beta_loss = beta_s * (q_s - v_s).detach()  # [T, B, 1]
        # https://github.com/lweitkamp/option-critic-pytorch/blob/0c57da7686f8903ed2d8dded3fae832ee9defd1a/option_critic.py#L238
        if self.terminal_mask:
            beta_loss *= (1 - BATCH.done)  # [T, B, 1]
        beta_loss = beta_loss.mean()  # 1

        self.intra_option_oplr.optimize(pi_loss)
        self.termination_oplr.optimize(beta_loss)

        return td_error, {
            'LEARNING_RATE/q_lr': self.q_oplr.lr,
            'LEARNING_RATE/intra_option_lr': self.intra_option_oplr.lr,
            'LEARNING_RATE/termination_lr': self.termination_oplr.lr,
            # 'Statistics/option': self.options[0],
            'LOSS/q_loss': q_loss,
            'LOSS/pi_loss': pi_loss,
            'LOSS/beta_loss': beta_loss,
            'Statistics/q_option_max': q_s.max(),
            'Statistics/q_option_min': q_s.min(),
            'Statistics/q_option_mean': q_s.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
class TAC(SarlOffPolicy):
    """Tsallis Actor Critic, TAC with V neural network.
    https://arxiv.org/abs/1902.00137
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 alpha=0.2,
                 annealing=True,
                 last_alpha=0.01,
                 polyak=0.995,
                 entropic_index=1.5,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': {
                         'share': [128, 128],
                         'mu': [64],
                         'log_std': [64],
                         'soft_clip': False,
                         'log_std_bound': [-20, 2]
                     },
                     'actor_discrete': [64, 32],
                     'q': [128, 128]
                 },
                 auto_adaption=True,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 alpha_lr=5.0e-4,
                 **kwargs):
        super().__init__(**kwargs)
        self.polyak = polyak
        self.discrete_tau = discrete_tau
        self.entropic_index = 2 - entropic_index
        self.auto_adaption = auto_adaption
        self.annealing = annealing

        self.critic = TargetTwin(CriticQvalueOne(self.obs_spec,
                                                 rep_net_params=self._rep_net_params,
                                                 action_dim=self.a_dim,
                                                 network_settings=network_settings['q']),
                                 self.polyak).to(self.device)
        self.critic2 = deepcopy(self.critic)

        if self.is_continuous:
            self.actor = ActorCts(self.obs_spec,
                                  rep_net_params=self._rep_net_params,
                                  output_shape=self.a_dim,
                                  network_settings=network_settings['actor_continuous']).to(self.device)
        else:
            self.actor = ActorDct(self.obs_spec,
                                  rep_net_params=self._rep_net_params,
                                  output_shape=self.a_dim,
                                  network_settings=network_settings['actor_discrete']).to(self.device)

        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else np.log(self.a_dim))

        self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR([self.critic, self.critic2], critic_lr, **self._oplr_params)

        if self.auto_adaption:
            self.log_alpha = th.tensor(0., requires_grad=True).to(self.device)
            self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
            self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
        else:
            self.log_alpha = th.tensor(alpha).log().to(self.device)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha, int(1e6))

        self._trainer_modules.update(actor=self.actor,
                                     critic=self.critic,
                                     critic2=self.critic2,
                                     log_alpha=self.log_alpha,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)

    @property
    def alpha(self):
        return self.log_alpha.exp()

    @iton
    def select_action(self, obs):
        if self.is_continuous:
            mu, log_std = self.actor(obs, rnncs=self.rnncs)  # [B, A]
            pi = td.Normal(mu, log_std.exp()).sample().tanh()  # [B, A]
            mu.tanh_()  # squash mu  # [B, A]
        else:
            logits = self.actor(obs, rnncs=self.rnncs)  # [B, A]
            mu = logits.argmax(-1)  # [B,]
            cate_dist = td.Categorical(logits=logits)
            pi = cate_dist.sample()  # [B,]
        self.rnncs_ = self.actor.get_rnncs()
        actions = pi if self._is_train_mode else mu
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        if self.is_continuous:
            target_mu, target_log_std = self.actor(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
            dist = td.Independent(td.Normal(target_mu, target_log_std.exp()), 1)
            target_pi = dist.sample()  # [T, B, A]
            target_pi, target_log_pi = squash_action(target_pi,
                                                     dist.log_prob(target_pi).unsqueeze(-1),
                                                     is_independent=False)  # [T, B, A]
            target_log_pi = tsallis_entropy_log_q(target_log_pi, self.entropic_index)  # [T, B, 1]
        else:
            target_logits = self.actor(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
            target_cate_dist = td.Categorical(logits=target_logits)
            target_pi = target_cate_dist.sample()  # [T, B]
            target_log_pi = target_cate_dist.log_prob(target_pi).unsqueeze(-1)  # [T, B, 1]
            target_pi = F.one_hot(target_pi, self.a_dim).float()  # [T, B, A]
        q1 = self.critic(BATCH.obs, BATCH.action, begin_mask=BATCH.begin_mask)   # [T, B, 1]
        q2 = self.critic2(BATCH.obs, BATCH.action, begin_mask=BATCH.begin_mask)  # [T, B, 1]
        q1_target = self.critic.t(BATCH.obs_, target_pi, begin_mask=BATCH.begin_mask)   # [T, B, 1]
        q2_target = self.critic2.t(BATCH.obs_, target_pi, begin_mask=BATCH.begin_mask)  # [T, B, 1]
        q_target = th.minimum(q1_target, q2_target)  # [T, B, 1]
        dc_r = n_step_return(BATCH.reward,
                             self.gamma,
                             BATCH.done,
                             (q_target - self.alpha * target_log_pi),
                             BATCH.begin_mask).detach()  # [T, B, 1]
        td_error1 = q1 - dc_r  # [T, B, 1]
        td_error2 = q2 - dc_r  # [T, B, 1]
        q1_loss = (td_error1.square() * BATCH.get('isw', 1.0)).mean()  # 1
        q2_loss = (td_error2.square() * BATCH.get('isw', 1.0)).mean()  # 1
        critic_loss = 0.5 * q1_loss + 0.5 * q2_loss
        self.critic_oplr.optimize(critic_loss)

        if self.is_continuous:
            mu, log_std = self.actor(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
            dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
            pi = dist.rsample()  # [T, B, A]
            pi, log_pi = squash_action(pi, dist.log_prob(pi).unsqueeze(-1), is_independent=False)  # [T, B, A]
            log_pi = tsallis_entropy_log_q(log_pi, self.entropic_index)  # [T, B, 1]
            entropy = dist.entropy().mean()  # 1
        else:
            logits = self.actor(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
            logp_all = logits.log_softmax(-1)  # [T, B, A]
            gumbel_noise = td.Gumbel(0, 1).sample(logp_all.shape)  # [T, B, A]
            _pi = ((logp_all + gumbel_noise) / self.discrete_tau).softmax(-1)  # [T, B, A]
            _pi_true_one_hot = F.one_hot(_pi.argmax(-1), self.a_dim).float()  # [T, B, A]
            _pi_diff = (_pi_true_one_hot - _pi).detach()  # [T, B, A]
            pi = _pi_diff + _pi  # [T, B, A]
            log_pi = (logp_all * pi).sum(-1, keepdim=True)  # [T, B, 1]
            entropy = -(logp_all.exp() * logp_all).sum(-1).mean()  # 1
        q_s_pi = th.minimum(self.critic(BATCH.obs, pi, begin_mask=BATCH.begin_mask),
                            self.critic2(BATCH.obs, pi, begin_mask=BATCH.begin_mask))  # [T, B, 1]
        actor_loss = -(q_s_pi - self.alpha * log_pi).mean()  # 1
        self.actor_oplr.optimize(actor_loss)

        summaries = {
            'LEARNING_RATE/actor_lr': self.actor_oplr.lr,
            'LEARNING_RATE/critic_lr': self.critic_oplr.lr,
            'LOSS/actor_loss': actor_loss,
            'LOSS/q1_loss': q1_loss,
            'LOSS/q2_loss': q2_loss,
            'LOSS/critic_loss': critic_loss,
            'Statistics/log_alpha': self.log_alpha,
            'Statistics/alpha': self.alpha,
            'Statistics/entropy': entropy,
            'Statistics/q_min': th.minimum(q1, q2).min(),
            'Statistics/q_mean': th.minimum(q1, q2).mean(),
            'Statistics/q_max': th.maximum(q1, q2).max()
        }
        if self.auto_adaption:
            alpha_loss = -(self.alpha * (log_pi + self.target_entropy).detach()).mean()  # 1
            self.alpha_oplr.optimize(alpha_loss)
            summaries.update({
                'LOSS/alpha_loss': alpha_loss,
                'LEARNING_RATE/alpha_lr': self.alpha_oplr.lr
            })
        return (td_error1 + td_error2) / 2, summaries

    def _after_train(self):
        super()._after_train()
        self.critic.sync()
        self.critic2.sync()
        if self.annealing and not self.auto_adaption:
            self.log_alpha.copy_(self.alpha_annealing(self._cur_train_step).log())
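# A minimal sketch (an assumption about the `tsallis_entropy_log_q` helper used in
# _train above, which receives log-probabilities and the entropic index): the
# Tsallis generalization replaces log(x) with log_q(x) = (x^(1-q) - 1) / (1 - q),
# recovering the ordinary log as q -> 1.
def tsallis_log_q_from_logp(log_pi, entropic_index):
    # assumption: the library helper exponentiates the log-probability internally
    if abs(entropic_index - 1.0) < 1e-6:
        return log_pi
    pi = log_pi.exp()
    return (pi.pow(1 - entropic_index) - 1) / (1 - entropic_index)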
class TD3(SarlOffPolicy): """ Twin Delayed Deep Deterministic Policy Gradient, https://arxiv.org/abs/1802.09477 """ policy_mode = 'off-policy' def __init__(self, polyak=0.995, delay_num=2, noise_action='clip_normal', noise_params={ 'sigma': 0.2, 'noise_bound': 0.2 }, actor_lr=5.0e-4, critic_lr=1.0e-3, discrete_tau=1.0, network_settings={ 'actor_continuous': [32, 32], 'actor_discrete': [32, 32], 'q': [32, 32] }, **kwargs): super().__init__(**kwargs) self.polyak = polyak self.delay_num = delay_num self.discrete_tau = discrete_tau if self.is_continuous: actor = ActorDPG( self.obs_spec, rep_net_params=self._rep_net_params, output_shape=self.a_dim, network_settings=network_settings['actor_continuous']) self.noised_action = self.target_noised_action = Noise_action_REGISTER[ noise_action](**noise_params) else: actor = ActorDct( self.obs_spec, rep_net_params=self._rep_net_params, output_shape=self.a_dim, network_settings=network_settings['actor_continuous']) self.actor = TargetTwin(actor, self.polyak).to(self.device) self.critic = TargetTwin( CriticQvalueOne(self.obs_spec, rep_net_params=self._rep_net_params, action_dim=self.a_dim, network_settings=network_settings['q']), self.polyak).to(self.device) self.critic2 = deepcopy(self.critic) self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params) self.critic_oplr = OPLR([self.critic, self.critic2], critic_lr, **self._oplr_params) self._trainer_modules.update(actor=self.actor, critic=self.critic, critic2=self.critic2, actor_oplr=self.actor_oplr, critic_oplr=self.critic_oplr) def episode_reset(self): super().episode_reset() if self.is_continuous: self.noised_action.reset() @iton def select_action(self, obs): output = self.actor(obs, rnncs=self.rnncs) # [B, A] self.rnncs_ = self.actor.get_rnncs() if self.is_continuous: mu = output # [B, A] pi = self.noised_action(mu) # [B, A] else: logits = output # [B, A] mu = logits.argmax(-1) # [B,] cate_dist = td.Categorical(logits=logits) pi = cate_dist.sample() # [B,] actions = pi if self._is_train_mode else mu return actions, Data(action=actions) @iton def _train(self, BATCH): for _ in range(self.delay_num): if self.is_continuous: action_target = self.target_noised_action( self.actor.t(BATCH.obs_, begin_mask=BATCH.begin_mask)) # [T, B, A] else: target_logits = self.actor.t( BATCH.obs_, begin_mask=BATCH.begin_mask) # [T, B, A] target_cate_dist = td.Categorical(logits=target_logits) target_pi = target_cate_dist.sample() # [T, B] action_target = F.one_hot(target_pi, self.a_dim).float() # [T, B, A] q1 = self.critic(BATCH.obs, BATCH.action, begin_mask=BATCH.begin_mask) # [T, B, 1] q2 = self.critic2(BATCH.obs, BATCH.action, begin_mask=BATCH.begin_mask) # [T, B, 1] q_target = th.minimum( self.critic.t(BATCH.obs_, action_target, begin_mask=BATCH.begin_mask), self.critic2.t(BATCH.obs_, action_target, begin_mask=BATCH.begin_mask)) # [T, B, 1] dc_r = n_step_return(BATCH.reward, self.gamma, BATCH.done, q_target, BATCH.begin_mask).detach() # [T, B, 1] td_error1 = q1 - dc_r # [T, B, 1] td_error2 = q2 - dc_r # [T, B, 1] q1_loss = (td_error1.square() * BATCH.get('isw', 1.0)).mean() # 1 q2_loss = (td_error2.square() * BATCH.get('isw', 1.0)).mean() # 1 critic_loss = 0.5 * (q1_loss + q2_loss) self.critic_oplr.optimize(critic_loss) if self.is_continuous: mu = self.actor(BATCH.obs, begin_mask=BATCH.begin_mask) # [T, B, A] else: logits = self.actor(BATCH.obs, begin_mask=BATCH.begin_mask) # [T, B, A] logp_all = logits.log_softmax(-1) # [T, B, A] gumbel_noise = td.Gumbel(0, 1).sample(logp_all.shape) # [T, B, A] _pi = ((logp_all 
            gumbel_noise = td.Gumbel(0, 1).sample(logp_all.shape)  # [T, B, A]
            _pi = ((logp_all + gumbel_noise) / self.discrete_tau).softmax(-1)  # [T, B, A]
            _pi_true_one_hot = F.one_hot(_pi.argmax(-1), self.a_dim).float()  # [T, B, A]
            _pi_diff = (_pi_true_one_hot - _pi).detach()  # [T, B, A]
            mu = _pi_diff + _pi  # [T, B, A]
        q1_actor = self.critic(BATCH.obs, mu, begin_mask=BATCH.begin_mask)  # [T, B, 1]
        actor_loss = -q1_actor.mean()  # 1
        self.actor_oplr.optimize(actor_loss)
        return (td_error1 + td_error2) / 2, {
            'LEARNING_RATE/actor_lr': self.actor_oplr.lr,
            'LEARNING_RATE/critic_lr': self.critic_oplr.lr,
            'LOSS/actor_loss': actor_loss,
            'LOSS/critic_loss': critic_loss,
            'Statistics/q_min': th.minimum(q1, q2).min(),
            'Statistics/q_mean': th.minimum(q1, q2).mean(),
            'Statistics/q_max': th.maximum(q1, q2).max()
        }

    def _after_train(self):
        super()._after_train()
        self.actor.sync()
        self.critic.sync()
        self.critic2.sync()
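
# The 'clip_normal' noise above implements TD3's target policy smoothing:
# zero-mean Gaussian noise is clipped to a bound before being added to the
# action, so the critic target is smoothed over nearby actions. A minimal
# standalone sketch of that idea, assuming actions live in [-1, 1] and
# noise_params={'sigma': 0.2, 'noise_bound': 0.2}; the registered
# Noise_action_REGISTER['clip_normal'] class may differ in detail.

import torch as th


def _clip_normal_smoothing(mu, sigma=0.2, noise_bound=0.2):
    """Return mu plus clipped Gaussian noise, squashed back into [-1, 1]."""
    noise = (sigma * th.randn_like(mu)).clamp(-noise_bound, noise_bound)
    return (mu + noise).clamp(-1., 1.)


# e.g. _clip_normal_smoothing(th.zeros(4, 2)) perturbs each action dimension
# by at most +/- noise_bound.
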
class IQN(SarlOffPolicy):
    """
    Implicit Quantile Networks, https://arxiv.org/abs/1806.06923
    Double DQN
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 online_quantiles=8,
                 target_quantiles=8,
                 select_quantiles=32,
                 quantiles_idx=64,
                 huber_delta=1.,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 network_settings={
                     'q_net': [128, 64],
                     'quantile': [128, 64],
                     'tile': [64]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'iqn only support discrete action space'
        self.online_quantiles = online_quantiles
        self.target_quantiles = target_quantiles
        self.select_quantiles = select_quantiles
        self.quantiles_idx = quantiles_idx
        self.huber_delta = huber_delta
        self.assign_interval = assign_interval
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.q_net = TargetTwin(IqnNet(self.obs_spec,
                                       rep_net_params=self._rep_net_params,
                                       action_dim=self.a_dim,
                                       quantiles_idx=self.quantiles_idx,
                                       network_settings=network_settings)).to(self.device)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)

    @iton
    def select_action(self, obs):
        _, select_quantiles_tiled = self._generate_quantiles(  # [N*B, X]
            batch_size=self.n_copies,
            quantiles_num=self.select_quantiles)
        q_values = self.q_net(obs, select_quantiles_tiled, rnncs=self.rnncs)  # [N, B, A]
        self.rnncs_ = self.q_net.get_rnncs()
        if self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            # [N, B, A] => [B, A] => [B,]
            actions = q_values.mean(0).argmax(-1)
        return actions, Data(action=actions)

    def _generate_quantiles(self, batch_size, quantiles_num):
        _quantiles = th.rand([quantiles_num * batch_size, 1])  # [N*B, 1]
        _quantiles_tiled = _quantiles.repeat(1, self.quantiles_idx)  # [N*B, 1] => [N*B, X]
        # pi * i * tau  [N*B, X] * [X,] => [N*B, X]
        _quantiles_tiled = th.arange(self.quantiles_idx) * np.pi * _quantiles_tiled
        _quantiles_tiled.cos_()  # [N*B, X]
        _quantiles = _quantiles.view(batch_size, quantiles_num, 1)  # [N*B, 1] => [B, N, 1]
        return _quantiles, _quantiles_tiled  # [B, N, 1], [N*B, X]

    @iton
    def _train(self, BATCH):
        time_step = BATCH.reward.shape[0]
        batch_size = BATCH.reward.shape[1]
        quantiles, quantiles_tiled = self._generate_quantiles(  # [T*B, N, 1], [N*T*B, X]
            batch_size=time_step * batch_size,
            quantiles_num=self.online_quantiles)
        quantiles = quantiles.view(time_step, batch_size, -1, 1)  # [T*B, N, 1] => [T, B, N, 1]
        quantiles_tiled = quantiles_tiled.view(time_step, -1, self.quantiles_idx)  # [N*T*B, X] => [T, N*B, X]
        quantiles_value = self.q_net(BATCH.obs, quantiles_tiled, begin_mask=BATCH.begin_mask)  # [T, N, B, A]
        # [T, N, B, A] => [N, T, B, A] * [T, B, A] => [N, T, B, 1]
        quantiles_value = (quantiles_value.swapaxes(0, 1) * BATCH.action).sum(-1, keepdim=True)
        q_eval = quantiles_value.mean(0)  # [N, T, B, 1] => [T, B, 1]

        _, select_quantiles_tiled = self._generate_quantiles(  # [N*T*B, X]
            batch_size=time_step * batch_size,
            quantiles_num=self.select_quantiles)
        select_quantiles_tiled = select_quantiles_tiled.view(
            time_step, -1, self.quantiles_idx)  # [N*T*B, X] => [T, N*B, X]
        q_values = self.q_net(BATCH.obs_, select_quantiles_tiled,
                              begin_mask=BATCH.begin_mask)  # [T, N, B, A]
        q_values = q_values.mean(1)  # [T, N, B, A] => [T, B, A]
        next_max_action = q_values.argmax(-1)  # [T, B]
        next_max_action = F.one_hot(next_max_action, self.a_dim).float()  # [T, B, A]
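        # Double-DQN style decoupling (see the class docstring): the online
        # network, averaged over 'select' quantiles, picked the greedy next
        # action above, while the target network below evaluates it with a
        # fresh set of target quantiles, reducing overestimation bias.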
        _, target_quantiles_tiled = self._generate_quantiles(  # [N'*T*B, X]
            batch_size=time_step * batch_size,
            quantiles_num=self.target_quantiles)
        target_quantiles_tiled = target_quantiles_tiled.view(
            time_step, -1, self.quantiles_idx)  # [N'*T*B, X] => [T, N'*B, X]
        target_quantiles_value = self.q_net.t(BATCH.obs_, target_quantiles_tiled,
                                              begin_mask=BATCH.begin_mask)  # [T, N', B, A]
        target_quantiles_value = target_quantiles_value.swapaxes(0, 1)  # [T, N', B, A] => [N', T, B, A]
        target_quantiles_value = (target_quantiles_value * next_max_action).sum(-1, keepdim=True)  # [N', T, B, 1]
        target_q = target_quantiles_value.mean(0)  # [T, B, 1]
        q_target = n_step_return(BATCH.reward,  # [T, B, 1]
                                 self.gamma,
                                 BATCH.done,  # [T, B, 1]
                                 target_q,  # [T, B, 1]
                                 BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = q_target - q_eval  # [T, B, 1]

        target_quantiles_value = target_quantiles_value.squeeze(-1)  # [N', T, B, 1] => [N', T, B]
        target_quantiles_value = target_quantiles_value.permute(1, 2, 0)  # [N', T, B] => [T, B, N']
        quantiles_value_target = n_step_return(BATCH.reward.repeat(1, 1, self.target_quantiles),
                                               self.gamma,
                                               BATCH.done.repeat(1, 1, self.target_quantiles),
                                               target_quantiles_value,
                                               BATCH.begin_mask.repeat(1, 1, self.target_quantiles)).detach()  # [T, B, N']
        quantiles_value_target = quantiles_value_target.unsqueeze(-2)  # [T, B, N'] => [T, B, 1, N']
        quantiles_value_online = quantiles_value.permute(1, 2, 0, 3)  # [N, T, B, 1] => [T, B, N, 1]
        # [T, B, N, 1] - [T, B, 1, N'] => [T, B, N, N']
        quantile_error = quantiles_value_online - quantiles_value_target
        huber = F.huber_loss(quantiles_value_online, quantiles_value_target,
                             reduction="none", delta=self.huber_delta)  # [T, B, N, N']
        # [T, B, N, 1] - [T, B, N, N'] => [T, B, N, N']
        huber_abs = (quantiles - quantile_error.detach().le(0.).float()).abs()
        loss = (huber_abs * huber).mean(-1)  # [T, B, N, N'] => [T, B, N]
        loss = loss.sum(-1, keepdim=True)  # [T, B, N] => [T, B, 1]
        loss = (loss * BATCH.get('isw', 1.0)).mean()  # 1
        self.oplr.optimize(loss)
        return td_error, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
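
# IQN embeds each sampled quantile fraction tau through cosine features
# cos(i * pi * tau), i = 0..X-1, which IqnNet mixes with the state embedding,
# and trains with a quantile-weighted Huber loss. A standalone sketch of both
# pieces with shapes simplified to a single batch axis; helper names are
# hypothetical and the weighting mirrors the convention of the loss above
# (error = online - target, weight = |tau - 1{error <= 0}|).

import numpy as np
import torch as th
import torch.nn.functional as F


def _cosine_embedding(tau, quantiles_idx=64):
    # tau: [B, 1] fractions in (0, 1)  ->  [B, X] cosine features
    i = th.arange(quantiles_idx).float()  # [X,]
    return th.cos(i * np.pi * tau)  # broadcast [B, 1] * [X,] => [B, X]


def _quantile_huber(pred, target, tau, delta=1.):
    # pred: [N, 1] online quantile values, target: [1, N'] target values,
    # tau: [N, 1] fractions the online quantiles were sampled at.
    error = pred - target  # [N, N']
    huber = F.huber_loss(pred.expand_as(error), target.expand_as(error),
                         reduction='none', delta=delta)  # [N, N']
    weight = (tau - error.detach().le(0.).float()).abs()  # [N, N']
    return (weight * huber).mean(-1).sum()  # mean over N', sum over N
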
class DQN(SarlOffPolicy):
    """
    Deep Q-learning Network, DQN, [2013](https://arxiv.org/pdf/1312.5602.pdf), [2015](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
    DQN + LSTM, https://arxiv.org/abs/1507.06527
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 lr: float = 5.0e-4,
                 eps_init: float = 1,
                 eps_mid: float = 0.2,
                 eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000,
                 assign_interval: int = 1000,
                 network_settings: List[int] = [32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'dqn only support discrete action space'
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.q_net = TargetTwin(CriticQvalueAll(self.obs_spec,
                                                rep_net_params=self._rep_net_params,
                                                output_shape=self.a_dim,
                                                network_settings=network_settings)).to(self.device)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)

    @iton
    def select_action(self, obs):
        q_values = self.q_net(obs, rnncs=self.rnncs)  # [B, *]
        self.rnncs_ = self.q_net.get_rnncs()
        if self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            actions = q_values.argmax(-1)  # [B,]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q_next = self.q_net.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q_eval = (q * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]
        q_target = n_step_return(BATCH.reward,
                                 self.gamma,
                                 BATCH.done,
                                 q_next.max(-1, keepdim=True)[0],
                                 BATCH.begin_mask,
                                 nstep=self._n_step_value).detach()  # [T, B, 1]
        td_error = q_target - q_eval  # [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # 1
        self.oplr.optimize(q_loss)
        return td_error, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': q_loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
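
# The DQN target above is the classic bootstrap y = r + gamma * (1 - done) *
# max_a' Q_target(s', a'), computed here by n_step_return over n steps. A
# minimal standalone sketch of what that target reduces to when nstep == 1
# (an assumption about n_step_return's contract, ignoring begin_mask and
# episode-boundary handling):

import torch as th


def _one_step_dqn_target(reward, done, q_next_max, gamma=0.99):
    # reward, done, q_next_max: [T, B, 1] tensors
    return (reward + gamma * (1. - done) * q_next_max).detach()
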