Exemplos de QNetTwin.get_q1_q2 em Python

Linguagem de programação: Python

Espaço para nome / nome do pacote: elegantrl.tutorial.net

Classe / Tipo: QNetTwin

Método / Função: get_q1_q2

Exemplos em hotexamples.com: 2

QNetTwin.get_q1_q2 em Python - 2 exemplos encontrados. Esses são os exemplos do mundo real mais bem avaliados de elegantrl.tutorial.net.QNetTwin.get_q1_q2 em Python extraídos de projetos de código aberto. Você pode avaliar os exemplos para nos ajudar a melhorar a qualidade deles.

Métodos Frequentes

Exibir Ocultar

QNetTwin(2)

get_q1_q2(2)

parameters(2)

get__q1_q2(1)

Métodos Frequentes

QNetTwin (2)

get_q1_q2 (2)

parameters (2)

get__q1_q2 (1)

Exemplo n.º 1

0

Exibir arquivo

class AgentDoubleDQN(AgentDQN):
    """DQN agent for discrete actions that trains a twin Q-network.

    The learning target is the element-wise minimum of the two Q estimates
    produced by the target network's ``get_q1_q2``; both online heads are
    regressed toward that shared target.
    """

    def __init__(self):
        super().__init__()
        # Probability that select_action samples from the softmax of the
        # network output instead of taking the argmax.
        self.explore_rate = 0.25
        self.softmax = torch.nn.Softmax(dim=1)

    def init(self, net_dim, state_dim, action_dim):
        """Create the networks, the loss and the optimizer.

        Args:
            net_dim: forwarded to ``QNetTwin`` as its first argument.
            state_dim: forwarded to ``QNetTwin`` as its second argument.
            action_dim: forwarded to ``QNetTwin``; also the number of
                choices used when sampling an exploratory action.
        """
        self.action_dim = action_dim

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

        twin_net = QNetTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri = twin_net
        # The target network starts as an exact copy of the online critic.
        self.cri_target = deepcopy(self.cri)
        # Actor and critic are the same object: acting reads the Q-network.
        self.act = self.cri

        self.criterion = torch.nn.SmoothL1Loss()
        # NOTE(review): self.learning_rate is not assigned in this class;
        # presumably it is provided by AgentDQN -- confirm.
        self.cri_optimizer = torch.optim.Adam(
            self.act.parameters(), lr=self.learning_rate
        )

    def select_action(self, state):
        """Pick one discrete action for a single state.

        The state is wrapped into a batch of one and passed through
        ``self.act``. With probability ``explore_rate`` the action is sampled
        from the softmax of the network output; otherwise the argmax is taken.

        Returns:
            The value of ``numpy.random.choice`` when exploring, otherwise
            the argmax converted with ``.cpu().numpy()``.
        """
        batch_of_one = torch.as_tensor(
            (state,), dtype=torch.float32, device=self.device
        ).detach_()
        q_out = self.act(batch_of_one)

        if rd.rand() < self.explore_rate:
            # Exploration: sample in proportion to softmax(network output).
            probs = self.softmax(q_out)[0].detach().cpu().numpy()
            return rd.choice(self.action_dim, p=probs)

        # Exploitation: index of the largest output for the single state.
        return q_out[0].argmax(dim=0).cpu().numpy()

    def update_net(self, buffer, target_step, batch_size, repeat_times):
        """Run ``int(target_step * repeat_times)`` critic updates.

        Each iteration samples a batch, builds the target from the minimum of
        the two target-network estimates, takes one optimizer step on the sum
        of both heads' losses, and soft-updates the target network.

        Returns:
            A pair ``(mean next-state Q of the last batch,
            last summed loss / 2)`` as Python floats.
        """
        buffer.update_now_len_before_sample()

        # Both stay None if the loop below runs zero times; the return
        # statement then fails on ``None.mean()``.
        next_q = obj_critic = None

        num_updates = int(target_step * repeat_times)
        for _ in range(num_updates):
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.sample_batch(
                    batch_size
                )
                target_q1, target_q2 = self.cri_target.get_q1_q2(next_s)
                # Pessimistic estimate: element-wise min of the two heads,
                # then the best action value per sample.
                twin_min = torch.min(target_q1, target_q2)
                next_q, _ = twin_min.max(dim=1, keepdim=True)
                # NOTE(review): mask presumably already folds in the discount
                # and the terminal flag -- confirm against the buffer.
                q_label = reward + mask * next_q

            # gather() needs integer indices to select the taken action's Q.
            act_int = action.type(torch.long)
            online_q1, online_q2 = self.cri.get_q1_q2(state)
            q1 = online_q1.gather(1, act_int)
            q2 = online_q2.gather(1, act_int)

            loss_q1 = self.criterion(q1, q_label)
            loss_q2 = self.criterion(q2, q_label)
            obj_critic = loss_q1 + loss_q2

            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri)

        return next_q.mean().item(), obj_critic.item() / 2

Exemplo n.º 2

0

Exibir arquivo

class AgentDoubleDQN(AgentDQN):
    """DQN agent for discrete actions that trains a twin Q-network.

    The learning target is the element-wise minimum of the two Q estimates
    produced by the target network's ``get_q1_q2``; both online heads are
    regressed toward that shared target.
    """

    def __init__(self):
        super().__init__()
        # Probability that select_action samples from the softmax of the
        # network output instead of taking the argmax.
        self.explore_rate = 0.25
        self.softmax = torch.nn.Softmax(dim=1)

    def init(self, net_dim, state_dim, action_dim):
        """Create the online/target networks and the optimizer.

        Args:
            net_dim: forwarded to ``QNetTwin`` as its first argument.
            state_dim: forwarded to ``QNetTwin`` as its second argument.
            action_dim: forwarded to ``QNetTwin``; also the number of
                choices used when sampling an exploratory action.
        """
        self.action_dim = action_dim
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )

        self.cri = QNetTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = QNetTwin(net_dim, state_dim, action_dim).to(
            self.device
        )
        # Start the target network from the online weights. Two separately
        # constructed networks have unrelated random parameters, which would
        # make the first TD targets meaningless until the target catches up.
        self.cri_target.load_state_dict(self.cri.state_dict())
        # Actor and critic are the same object: acting reads the Q-network.
        self.act = self.cri

        # NOTE(review): self.learning_rate is not assigned in this class;
        # presumably it is provided by AgentDQN -- confirm.
        self.cri_optimizer = torch.optim.Adam(
            self.act.parameters(), lr=self.learning_rate
        )

    def select_action(self, state) -> np.ndarray:
        """Pick one discrete action for a single state.

        The state is wrapped into a batch of one and passed through
        ``self.act``. With probability ``explore_rate`` the action is sampled
        from the softmax of the network output; otherwise the argmax is taken.
        """
        states = torch.as_tensor(
            (state,), dtype=torch.float32, device=self.device
        ).detach_()
        actions = self.act(states)

        if rd.rand() < self.explore_rate:
            # Exploration: sample in proportion to softmax(network output).
            a_prob = self.softmax(actions)[0].detach().cpu().numpy()
            a_int = rd.choice(self.action_dim, p=a_prob)
        else:
            # Exploitation: index of the largest output for the single state.
            a_int = actions[0].argmax(dim=0).cpu().numpy()
        return a_int

    def get_obj_critic(
        self, buffer, batch_size
    ) -> "tuple[torch.Tensor, torch.Tensor]":
        """Sample one batch and compute the twin-critic loss.

        Args:
            buffer: object exposing ``sample_batch(batch_size)`` that returns
                ``(reward, mask, action, state, next_state)``.
            batch_size: forwarded to ``buffer.sample_batch``.

        Returns:
            ``(obj_critic, q1)``: the sum of both heads' losses against the
            shared target, and the first head's Q-values for the taken
            actions.
        """
        with torch.no_grad():
            reward, mask, action, state, next_s = buffer.sample_batch(
                batch_size
            )
            # Pessimistic estimate: element-wise min of the two target heads,
            # then the best action value per sample.
            next_q = torch.min(*self.cri_target.get_q1_q2(next_s))
            next_q = next_q.max(dim=1, keepdim=True)[0]
            # NOTE(review): mask presumably already folds in the discount
            # and the terminal flag -- confirm against the buffer.
            q_label = reward + mask * next_q

        # gather() needs integer indices to select the taken action's Q.
        act_int = action.type(torch.long)
        q1, q2 = [
            qs.gather(1, act_int) for qs in self.act.get_q1_q2(state)
        ]
        # NOTE(review): self.criterion is not assigned in this class;
        # presumably it is provided by AgentDQN -- confirm.
        obj_critic = self.criterion(q1, q_label) + self.criterion(q2, q_label)
        return obj_critic, q1