class AgentTD3(AgentDDPG):
    """Twin-critic agent with noisy target actions and delayed target updates.

    NOTE(review): ``self.act``, ``self.act_target``, ``self.criterion``,
    ``self.learning_rate``, ``self.soft_update`` and ``self.soft_update_tau``
    are read here but never assigned in this class; presumably ``AgentDDPG``
    supplies them -- confirm against the base class.
    """

    def __init__(self):
        super().__init__()
        self.explore_noise = 0.1  # standard deviation of explore noise
        self.policy_noise = 0.2  # standard deviation of policy noise
        self.update_freq = 2  # delay update frequency

    def init(self, net_dim, state_dim, action_dim):
        """Build the twin critic, its target network, and both optimizers.

        Args:
            net_dim: Forwarded unchanged to ``CriticTwin``.
            state_dim: Forwarded unchanged to ``CriticTwin``.
            action_dim: Forwarded unchanged to ``CriticTwin``.
        """
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.cri = CriticTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = CriticTwin(net_dim, state_dim,
                                     action_dim).to(self.device)
        # A freshly constructed target carries its own random weights; copy
        # the online critic into it so the first bootstrapped labels come
        # from the same function that is being trained.
        self.cri_target.load_state_dict(self.cri.state_dict())

        self.act_optimizer = torch.optim.Adam(self.act.parameters(),
                                              lr=self.learning_rate)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(),
                                              lr=self.learning_rate)

    def update_net(self, buffer, target_step, batch_size,
                   repeat_times) -> (float, float):
        """Run ``int(target_step * repeat_times)`` critic/actor updates.

        Args:
            buffer: Replay buffer; must offer
                ``update_now_len_before_sample()`` and whatever
                ``get_obj_critic`` calls on it.
            target_step: Multiplied by ``repeat_times`` to get the update count.
            batch_size: Passed through to ``get_obj_critic``.
            repeat_times: Multiplier applied to ``target_step``.

        Returns:
            ``(obj_actor, obj_critic / 2)`` of the last iteration as floats.

        Raises:
            AttributeError: If the update count is zero, because both
                objectives are then still ``None`` at the return statement.
        """
        buffer.update_now_len_before_sample()

        obj_critic = obj_actor = None
        for i in range(int(target_step * repeat_times)):
            # Critic step.
            obj_critic, state = self.get_obj_critic(buffer, batch_size)
            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()

            # Actor step: the action is scored by the target critic.
            q_value_pg = self.act(state)  # policy gradient
            obj_actor = -self.cri_target(state, q_value_pg).mean()
            self.act_optimizer.zero_grad()
            obj_actor.backward()
            self.act_optimizer.step()

            if i % self.update_freq == 0:  # delay update
                self.soft_update(self.cri_target, self.cri,
                                 self.soft_update_tau)
                self.soft_update(self.act_target, self.act,
                                 self.soft_update_tau)

        # The critic objective is the sum of two losses, hence the halving.
        return obj_actor.item(), obj_critic.item() / 2

    def get_obj_critic(self, buffer,
                       batch_size) -> (torch.Tensor, torch.Tensor):
        """Sample a batch and compute the summed twin-critic loss.

        Args:
            buffer: Replay buffer; ``sample_batch(batch_size)`` must yield
                ``(reward, mask, action, state, next_s)``.
            batch_size: Forwarded to ``buffer.sample_batch``.

        Returns:
            ``(obj_critic, state)``: the loss tensor and the sampled states,
            which the caller reuses for the actor update.
        """
        with torch.no_grad():
            reward, mask, action, state, next_s = buffer.sample_batch(
                batch_size)
            next_a = self.act_target.get_action(
                next_s, self.policy_noise)  # policy noise
            next_q = torch.min(*self.cri_target.get_q1_q2(
                next_s, next_a))  # twin critics
            q_label = reward + mask * next_q

        q1, q2 = self.cri.get_q1_q2(state, action)
        obj_critic = self.criterion(q1, q_label) + self.criterion(
            q2, q_label)  # twin critics
        return obj_critic, state
class AgentSAC(AgentBase):
    """Agent with a stochastic actor, twin critics and a trainable ``alpha``.

    NOTE(review): ``self.learning_rate`` and ``self.soft_update`` are used but
    not defined in this class; presumably inherited from ``AgentBase`` --
    confirm.
    """

    def __init__(self):
        super().__init__()
        # Placeholders; ``init`` assigns the real values.
        self.target_entropy = None
        self.alpha_log = None
        self.alpha_optimizer = None

    def init(self, net_dim, state_dim, action_dim):
        """Create the networks, the loss, ``alpha_log`` and three optimizers.

        Args:
            net_dim: Passed to ``ActorSAC``; the critic gets
                ``int(net_dim * 1.25)``.
            state_dim: Passed to both networks.
            action_dim: Passed to both networks; also sets
                ``target_entropy`` and the initial ``alpha_log``.
        """
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

        self.target_entropy = np.log(action_dim)
        alpha_log_init = (-np.log(action_dim) * np.e,)
        self.alpha_log = torch.tensor(
            alpha_log_init,
            dtype=torch.float32,
            requires_grad=True,
            device=self.device,
        )  # trainable parameter

        # SAC don't use act_target
        self.act = ActorSAC(net_dim, state_dim, action_dim).to(self.device)

        critic_dim = int(net_dim * 1.25)
        self.cri = CriticTwin(critic_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()

        lr = self.learning_rate
        self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=lr)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=lr)
        self.alpha_optimizer = torch.optim.Adam((self.alpha_log,), lr)

    def select_action(self, state):
        """Return the actor's action for one state as a NumPy array.

        Args:
            state: A single state; it is wrapped in a 1-tuple before being
                converted to a float32 tensor on ``self.device``.
        """
        batch_of_one = torch.as_tensor(
            (state,), dtype=torch.float32, device=self.device)
        first_action = self.act.get_action(batch_of_one)[0]
        return first_action.detach().cpu().numpy()

    def update_net(self, buffer, target_step, batch_size, repeat_times):
        """Run ``int(target_step * repeat_times)`` critic/alpha/actor updates.

        Args:
            buffer: Replay buffer; must offer
                ``update__now_len__before_sample()`` and
                ``sample_batch(batch_size)`` yielding
                ``(reward, mask, action, state, next_s)``.
            target_step: Multiplied by ``repeat_times`` to get the update count.
            batch_size: Forwarded to ``buffer.sample_batch``.
            repeat_times: Multiplier applied to ``target_step``.

        Returns:
            ``(alpha, obj_critic)`` of the last iteration as floats.

        Raises:
            AttributeError: If the update count is zero, because the critic
                objective is then still ``None`` at the return statement.
        """
        buffer.update__now_len__before_sample()

        alpha_now = self.alpha_log.exp().detach()
        critic_loss = None
        n_updates = int(target_step * repeat_times)

        for _ in range(n_updates):
            # Critic: regress both heads onto one shared label.
            with torch.no_grad():
                batch = buffer.sample_batch(batch_size)
                reward, mask, action, state, next_state = batch
                next_action, next_logprob = self.act.get_action_logprob(
                    next_state)
                target_q1, target_q2 = self.cri_target.get_q1_q2(
                    next_state, next_action)
                next_q = torch.min(target_q1, target_q2)
                q_label = reward + mask * (next_q + next_logprob * alpha_now)

            q1, q2 = self.cri.get_q1_q2(state, action)
            critic_loss = self.criterion(q1, q_label)
            critic_loss = critic_loss + self.criterion(q2, q_label)

            self.cri_optimizer.zero_grad()
            critic_loss.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri)

            # Alpha: only ``alpha_log`` receives a gradient, the rest is detached.
            action_pg, logprob = self.act.get_action_logprob(
                state)  # policy gradient
            entropy_gap = (logprob - self.target_entropy).detach()
            alpha_loss = (self.alpha_log * entropy_gap).mean()

            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()

            # Actor: uses the alpha value refreshed after the step above.
            alpha_now = self.alpha_log.exp().detach()
            pg_q1, pg_q2 = self.cri_target.get_q1_q2(state, action_pg)
            actor_loss = -(torch.min(pg_q1, pg_q2) + logprob * alpha_now).mean()

            self.act_optimizer.zero_grad()
            actor_loss.backward()
            self.act_optimizer.step()

        return alpha_now.item(), critic_loss.item()
class AgentTD3(AgentBase):
    """Twin-critic agent with noisy target actions and delayed target updates.

    NOTE(review): ``self.act``, ``self.act_target``, ``self.learning_rate``
    and ``self.soft_update`` are read here but never assigned in this class;
    presumably ``AgentBase`` supplies them -- confirm against the base class.
    """

    def __init__(self):
        super().__init__()
        self.explore_noise = 0.1  # standard deviation of explore noise
        self.policy_noise = 0.2  # standard deviation of policy noise
        self.update_freq = 2  # delay update frequency

    def init(self, net_dim, state_dim, action_dim):
        """Build the twin critic, its target copy, the loss and optimizers.

        Args:
            net_dim: Forwarded unchanged to ``CriticTwin``.
            state_dim: Forwarded unchanged to ``CriticTwin``.
            action_dim: Forwarded unchanged to ``CriticTwin``.
        """
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.cri = CriticTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.MSELoss()
        self.act_optimizer = torch.optim.Adam(self.act.parameters(),
                                              lr=self.learning_rate)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(),
                                              lr=self.learning_rate)

    def select_action(self, state):
        """Return one exploratory action for a single state.

        Args:
            state: A single state; it is wrapped in a 1-tuple before being
                converted to a float32 tensor on ``self.device``.

        Returns:
            The actor output plus Gaussian noise scaled by
            ``self.explore_noise``, clamped to ``[-1, 1]``, as a NumPy array.
        """
        # Run without autograd: the actor's parameters require grad, so the
        # output would otherwise be attached to a graph and ``.numpy()``
        # would raise unless the caller had disabled gradients already.
        with torch.no_grad():
            states = torch.as_tensor((state, ),
                                     dtype=torch.float32,
                                     device=self.device)
            action = self.act(states)[0]
            action = (action +
                      torch.randn_like(action) * self.explore_noise).clamp(
                          -1, 1)
        return action.cpu().numpy()

    def update_net(self, buffer, target_step, batch_size, repeat_times):
        """Run ``int(target_step * repeat_times)`` critic/actor updates.

        Args:
            buffer: Replay buffer; must offer
                ``update_now_len_before_sample()`` and
                ``sample_batch(batch_size)`` yielding
                ``(reward, mask, action, state, next_s)``.
            target_step: Multiplied by ``repeat_times`` to get the update count.
            batch_size: Forwarded to ``buffer.sample_batch``.
            repeat_times: Multiplier applied to ``target_step``.

        Returns:
            ``(obj_actor, obj_critic / 2)`` of the last iteration as floats.

        Raises:
            AttributeError: If the update count is zero, because both
                objectives are then still ``None`` at the return statement.
        """
        buffer.update_now_len_before_sample()

        obj_critic = obj_actor = None
        for i in range(int(target_step * repeat_times)):
            # Critic step: both heads regress onto the same label.
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.sample_batch(
                    batch_size)
                next_a = self.act_target.get_action(next_s, self.policy_noise)
                next_q = torch.min(*self.cri_target.get_q1_q2(
                    next_s, next_a))  # twin critics
                q_label = reward + mask * next_q

            q1, q2 = self.cri.get_q1_q2(state, action)
            obj_critic = self.criterion(q1, q_label) + self.criterion(
                q2, q_label)  # twin critics

            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()

            # Actor step: the action is scored by the target critic.
            q_value_pg = self.act(state)  # policy gradient
            obj_actor = -self.cri_target(state, q_value_pg).mean()

            self.act_optimizer.zero_grad()
            obj_actor.backward()
            self.act_optimizer.step()

            if i % self.update_freq == 0:  # delay update
                self.soft_update(self.cri_target, self.cri)
                self.soft_update(self.act_target, self.act)

        # The critic objective is the sum of two losses, hence the halving.
        return obj_actor.item(), obj_critic.item() / 2
class AgentTD3(AgentDDPG):
    """Twin-critic agent trained through one optimizer for actor and critic.

    NOTE(review): ``self.device``, ``self.act``, ``self.act_target`` and
    ``self.criterion`` are read here but not assigned in this class;
    presumably ``AgentDDPG.__init__`` creates them -- confirm.
    """

    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        """Replace the critic with a twin critic and build the optimizer.

        Args:
            net_dim: Forwarded to the base constructor and ``CriticTwin``.
            state_dim: Forwarded to the base constructor and ``CriticTwin``.
            action_dim: Forwarded to the base constructor and ``CriticTwin``.
            learning_rate: Learning rate for both parameter groups.
        """
        super().__init__(net_dim, state_dim, action_dim, learning_rate)

        self.explore_noise = 0.1  # standard deviation of explore noise
        self.policy_noise = 0.2  # standard deviation of policy noise
        self.update_freq = 2  # delay update frequency, for soft target update

        self.cri = CriticTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        actor_group = {'params': self.act.parameters(), 'lr': learning_rate}
        critic_group = {'params': self.cri.parameters(), 'lr': learning_rate}
        self.optimizer = torch.optim.Adam([actor_group, critic_group])

    def update_net(self, buffer, max_step, batch_size, repeat_times):
        """Run ``int(max_step * repeat_times)`` joint actor/critic updates.

        Args:
            buffer: Replay buffer; must offer
                ``update__now_len__before_sample()`` and
                ``random_sample(batch_size)`` yielding
                ``(reward, mask, action, state, next_s)``.
            max_step: Multiplied by ``repeat_times`` to get the update count.
            batch_size: Forwarded to ``buffer.random_sample``.
            repeat_times: Multiplier applied to ``max_step``.

        Returns:
            ``(obj_actor, obj_critic / 2)`` of the last iteration as floats.

        Raises:
            AttributeError: If the update count is zero, because both
                objectives are then still ``None`` at the return statement.
        """
        buffer.update__now_len__before_sample()

        actor_loss = critic_loss = None
        n_updates = int(max_step * repeat_times)

        for step in range(n_updates):
            with torch.no_grad():
                batch = buffer.random_sample(batch_size)
                reward, mask, action, state, next_state = batch
                # policy noise
                next_action = self.act_target.get_action(
                    next_state, self.policy_noise)
                # twin critics
                target_q1, target_q2 = self.cri_target.get__q1_q2(
                    next_state, next_action)
                q_label = reward + mask * torch.min(target_q1, target_q2)

            q1, q2 = self.cri.get__q1_q2(state, action)
            critic_loss = self.criterion(q1, q_label)
            critic_loss = critic_loss + self.criterion(q2, q_label)

            # policy gradient: the action is scored by the target critic
            policy_action = self.act(state)
            actor_loss = -self.cri_target(state, policy_action).mean()

            # Both objectives share a single backward pass and step.
            united_loss = actor_loss + critic_loss
            self.optimizer.zero_grad()
            united_loss.backward()
            self.optimizer.step()

            if step % self.update_freq == 0:  # delay update
                soft_target_update(self.cri_target, self.cri)
                soft_target_update(self.act_target, self.act)

        # The critic objective is the sum of two losses, hence the halving.
        return actor_loss.item(), critic_loss.item() / 2
class AgentSAC(AgentBase):
    """Stochastic-actor agent with twin critics, trained by one optimizer.

    NOTE(review): ``self.device`` is read in ``__init__`` right after the
    argument-less base constructor; presumably ``AgentBase`` sets it --
    confirm.
    """

    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        """Create the networks, their target copies, the loss and optimizer.

        Args:
            net_dim: Passed to ``ActorSAC``; the critic gets
                ``int(net_dim * 1.25)``.
            state_dim: Passed to both networks.
            action_dim: Passed to both networks; also sets
                ``target_entropy`` and the initial ``alpha_log``.
            learning_rate: Base rate, scaled by 0.75 for the actor, 1.25 for
                the critic, and used unscaled for ``alpha_log``.
        """
        super().__init__()

        self.target_entropy = np.log(action_dim)
        alpha_log_init = (-np.log(action_dim) * np.e, )
        self.alpha_log = torch.tensor(
            alpha_log_init,
            dtype=torch.float32,
            requires_grad=True,
            device=self.device,
        )  # trainable parameter

        self.act = ActorSAC(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)

        critic_dim = int(net_dim * 1.25)
        self.cri = CriticTwin(critic_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()

        actor_group = {
            'params': self.act.parameters(),
            'lr': learning_rate * 0.75,
        }
        critic_group = {
            'params': self.cri.parameters(),
            'lr': learning_rate * 1.25,
        }
        alpha_group = {'params': (self.alpha_log, ), 'lr': learning_rate}
        self.optimizer = torch.optim.Adam(
            [actor_group, critic_group, alpha_group])

    def select_actions(self, states):  # states = (state, ...)
        """Return the actor's actions for ``states`` as a NumPy array.

        Args:
            states: Converted as-is to a float32 tensor on ``self.device``.
        """
        state_tensor = torch.as_tensor(
            states, dtype=torch.float32, device=self.device)
        action_tensor = self.act.get_action(state_tensor)
        return action_tensor.detach().cpu().numpy()

    def update_net(self, buffer, max_step, batch_size, repeat_times):
        """Run ``int(max_step * repeat_times)`` joint updates.

        Args:
            buffer: Replay buffer; must offer
                ``update__now_len__before_sample()`` and
                ``random_sample(batch_size)`` yielding
                ``(reward, mask, action, state, next_s)``.
            max_step: Multiplied by ``repeat_times`` to get the update count.
            batch_size: Forwarded to ``buffer.random_sample``.
            repeat_times: Multiplier applied to ``max_step``.

        Returns:
            ``(alpha, obj_critic)`` of the last iteration as floats; the
            actor objective is computed but not returned.

        Raises:
            AttributeError: If the update count is zero, because the critic
                objective is then still ``None`` at the return statement.
        """
        buffer.update__now_len__before_sample()

        alpha_now = self.alpha_log.exp().detach()
        actor_loss = critic_loss = None
        n_updates = int(max_step * repeat_times)

        for _ in range(n_updates):
            # Critic objective: both heads regress onto one shared label.
            with torch.no_grad():
                batch = buffer.random_sample(batch_size)
                reward, mask, action, state, next_state = batch
                next_action, next_log_prob = (
                    self.act_target.get__action__log_prob(next_state))
                target_q1, target_q2 = self.cri_target.get__q1_q2(
                    next_state, next_action)
                next_q = torch.min(target_q1, target_q2)
                q_label = reward + mask * (next_q + next_log_prob * alpha_now)

            q1, q2 = self.cri.get__q1_q2(state, action)
            critic_loss = self.criterion(q1, q_label)
            critic_loss = critic_loss + self.criterion(q2, q_label)

            # Alpha objective: only ``alpha_log`` receives a gradient here.
            action_pg, log_prob = self.act.get__action__log_prob(
                state)  # policy gradient
            entropy_gap = (log_prob - self.target_entropy).detach()
            alpha_loss = (self.alpha_log * entropy_gap).mean()

            # Snapshot alpha first, then clamp the parameter in place.
            alpha_now = self.alpha_log.exp().detach()
            with torch.no_grad():
                clamped = self.alpha_log.clamp(-16, 2)
                self.alpha_log[:] = clamped

            # Actor objective, scored by the target critic.
            pg_q1, pg_q2 = self.cri_target.get__q1_q2(state, action_pg)
            actor_loss = -(torch.min(pg_q1, pg_q2) + log_prob * alpha_now).mean()

            # All three objectives share one backward pass and one step.
            united_loss = critic_loss + alpha_loss + actor_loss
            self.optimizer.zero_grad()
            united_loss.backward()
            self.optimizer.step()

            soft_target_update(self.cri_target, self.cri)
            soft_target_update(self.act_target, self.act)

        return alpha_now.item(), critic_loss.item()