class AgentDoubleDQN(AgentDQN):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__(net_dim, state_dim, action_dim, learning_rate)
        self.explore_rate = 0.25  # epsilon-greedy: the probability of choosing a random action
        self.softmax = torch.nn.Softmax(dim=1)
        self.action_dim = action_dim

        self.act = QNetTwin(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam(self.act.parameters(), lr=learning_rate)

    def select_actions(self, states):  # for a discrete action space
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = self.act(states)
        if rd.rand() < self.explore_rate:  # explore: sample actions from the softmax of the Q values
            a_prob_l = self.softmax(actions).detach().cpu().numpy()
            a_int = [rd.choice(self.action_dim, p=a_prob) for a_prob in a_prob_l]
        else:  # exploit: take the greedy action
            a_int = actions.argmax(dim=1).detach().cpu().numpy()
        return a_int

    def update_net(self, buffer, max_step, batch_size, repeat_times):
        buffer.update__now_len__before_sample()

        next_q = obj_critic = None
        for _ in range(int(max_step * repeat_times)):
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.random_sample(batch_size)
                next_q = self.act_target(next_s).max(dim=1, keepdim=True)[0]
                q_label = reward + mask * next_q  # bootstrapped Q target from the target network

            action = action.type(torch.long)
            q_eval1, q_eval2 = [qs.gather(1, action) for qs in self.act.get__q1_q2(state)]
            obj_critic = self.criterion(q_eval1, q_label) + self.criterion(q_eval2, q_label)  # train both Q heads

            self.optimizer.zero_grad()
            obj_critic.backward()
            self.optimizer.step()
            soft_target_update(self.act_target, self.act)
        return next_q.mean().item(), obj_critic.item() / 2
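The update loop above calls a soft_target_update helper that is not shown in this listing. A minimal sketch of what it is assumed to do (Polyak averaging of the target network toward the online network; the tau default below is an assumption, not a value taken from the listing):

def soft_target_update(target, current, tau=5e-3):
    # Polyak averaging: target <- tau * current + (1 - tau) * target
    # tau=5e-3 is an assumed default for illustration
    for target_param, param in zip(target.parameters(), current.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)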
class AgentDoubleDQN(AgentDQN):
    def __init__(self):
        super().__init__()
        self.explore_rate = 0.25  # the probability of choosing a random action in epsilon-greedy
        self.softmax = torch.nn.Softmax(dim=1)

    def init(self, net_dim, state_dim, action_dim):
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = QNetTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = QNetTwin(net_dim, state_dim, action_dim).to(self.device)
        self.act = self.cri
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)

    def select_action(self, state) -> np.ndarray:  # for a discrete action space
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device).detach_()
        actions = self.act(states)
        if rd.rand() < self.explore_rate:  # explore: sample an action from the softmax of the Q values
            a_prob = self.softmax(actions)[0].detach().cpu().numpy()
            a_int = rd.choice(self.action_dim, p=a_prob)
        else:  # exploit: take the greedy action
            a_int = actions[0].argmax(dim=0).cpu().numpy()
        return a_int

    def get_obj_critic(self, buffer, batch_size) -> (torch.Tensor, torch.Tensor):
        with torch.no_grad():
            reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
            next_q = torch.min(*self.cri_target.get_q1_q2(next_s))  # take the smaller of the twin target Q values
            next_q = next_q.max(dim=1, keepdim=True)[0]
            q_label = reward + mask * next_q

        act_int = action.type(torch.long)
        q1, q2 = [qs.gather(1, act_int) for qs in self.act.get_q1_q2(state)]
        obj_critic = self.criterion(q1, q_label) + self.criterion(q2, q_label)  # train both Q heads
        return obj_critic, q1
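Both listings depend on QNetTwin, which is also not shown here. The sketch below only mirrors the interface the agent code assumes: a QNetTwin(net_dim, state_dim, action_dim) constructor, a forward pass returning Q values over actions, and a method returning both Q heads (named get__q1_q2 in the first listing and get_q1_q2 in the second). The shared-encoder layout and layer sizes are assumptions, not the library's exact architecture.

import torch
import torch.nn as nn

class QNetTwin(nn.Module):  # hypothetical sketch of the twin Q-network interface
    def __init__(self, mid_dim, state_dim, action_dim):
        super().__init__()
        self.net_state = nn.Sequential(  # shared state encoder (assumed layout)
            nn.Linear(state_dim, mid_dim), nn.ReLU(),
            nn.Linear(mid_dim, mid_dim), nn.ReLU(),
        )
        self.net_q1 = nn.Linear(mid_dim, action_dim)  # first Q head
        self.net_q2 = nn.Linear(mid_dim, action_dim)  # second Q head

    def forward(self, state):
        return self.net_q1(self.net_state(state))  # Q values used for action selection

    def get_q1_q2(self, state):
        tmp = self.net_state(state)
        return self.net_q1(tmp), self.net_q2(tmp)  # both Q estimates for the twin critic loss

Taking the element-wise minimum of the two heads when building the target in get_obj_critic is what counters the Q-value overestimation that plain DQN suffers from.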