def select_action(self, state, test=False):
    # The actor outputs a continuous head and the logits of a discrete head.
    value_c, value_d = self.actor.forward(to_variable(state, volatile=True))
    # Sample the discrete action from the softmax distribution.
    action_d = F.softmax(value_d)
    action_d = to_numpy(action_d.multinomial())
    # Perturb the continuous action with exploration noise, scaled by epsilon (skipped at test time).
    action_c = to_numpy(value_c)
    if not test:
        action_c += max(self.epsilon, 0) * self.random_process.sample()
    action_c = action_c[0]
    return action_c, action_d
def select_action(self, state, test=False):
    value = to_numpy(self.actor.forward(to_variable(state, volatile=True)))
    cur_episode = len(self.experience_replay)
    action = np.clip(value[0] + self.noise.generate(cur_episode), -1, 1)
    return action
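# --- Hypothetical exploration-noise sketch (not from the original agent) ---
# The `self.noise` / `self.random_process` objects used above are defined elsewhere;
# as an assumption, the minimal OUNoise class below illustrates one common choice for
# DDPG-style agents: an Ornstein-Uhlenbeck process whose magnitude is annealed with
# the episode count, matching the `generate(cur_episode)` call pattern above.
import numpy as np

class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, decay_episodes=10000):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.decay_episodes = decay_episodes
        self.state = np.ones(action_dim) * mu

    def generate(self, episode):
        # Temporally correlated noise, scaled down as training progresses.
        scale = max(0.0, 1.0 - episode / self.decay_episodes)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(*self.state.shape)
        self.state = self.state + dx
        return self.state * scale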
def select_action(self, state, test=False):
    on_state = to_variable(state, volatile=True)
    if np.random.rand() < self.epsilon and not test:
        # explore: pick a uniformly random action
        action = np.random.randint(self.action_num)
    else:
        # exploit: pick the action with the highest predicted Q-value
        action = np.argmax(to_numpy(self.net.forward(on_state)))
    return action
def update(self, state, action, reward, new_state, done):
    # Store the new transition and anneal epsilon linearly towards zero.
    self.experience_replay.append((state, action, reward, new_state, done))
    self.epsilon = max(
        self.epsilon - (self.initial_epsilon - self.final_epsilon) / self.epsilon_decay, 0)

    # Only start learning once enough transitions have been observed.
    if len(self.experience_replay) >= self.observation:
        mini_batch = random.sample(self.experience_replay, self.batch_size)
        states = torch.cat([mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)])
        actions = [mini_batch[k][1] for k in range(self.batch_size)]
        rewards = [mini_batch[k][2] for k in range(self.batch_size)]
        new_states = torch.cat([mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)])
        dones = [mini_batch[k][4] for k in range(self.batch_size)]

        # Q-values of the successor states, used to build the bootstrap targets.
        q_prime = to_numpy(self.net.forward(to_variable(new_states)))
        out = self.net.forward(to_variable(states))

        # Build the TD targets: r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise.
        action_input = to_variable(actions, dtype='long')
        y_label = to_variable([
            rewards[i] if dones[i] else rewards[i] + self.gamma * np.max(q_prime[i])
            for i in range(self.batch_size)
        ])
        y_out = out.gather(1, action_input.view(-1, 1))

        # Gradient descent step on the Bellman error.
        self.optimizer.zero_grad()
        loss = self.loss(y_out, y_label)
        loss.backward()
        self.optimizer.step()
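# --- Usage sketch (assumption, not part of the original file) ---
# How `select_action` and `update` above would typically be driven by a training loop,
# assuming a Gym-style `env` and an `agent` exposing the two methods; state
# preprocessing into tensors is elided.
def train(agent, env, num_episodes=1000):
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.select_action(state)            # act with exploration
            new_state, reward, done, _ = env.step(action)  # advance the environment
            agent.update(state, action, reward, new_state, done)  # store transition and learn
            state = new_state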
def select_action(self, state, test=False):
    value = to_numpy(self.actor.forward(to_variable(state, volatile=True)))
    cur_episode = len(self.experience_replay)
    if self.action_type == 'continuous':
        # Continuous control: perturb the deterministic action with exploration noise.
        action = np.clip(value[0] + self.noise.generate(cur_episode), -1, 1)
    else:
        # Discrete control: let the noise process pick/perturb the action index.
        action = self.noise.generate(value[0], cur_episode)
        if isinstance(action, int):
            # Convert the chosen index into a one-hot action vector.
            action = np.array([1., 0.] if action == 0 else [0., 1.])
    return action
def select_action(self, state, test=False):
    state = to_variable(state, volatile=test)
    if self.config['name'] == 'LSTM':
        # Recurrent policy: feed the hidden state along with the observation.
        on_state = state, (self.hx, self.cx)
        value, logit, (hx, cx) = self.net.forward(on_state)
    else:
        on_state = state
        value, logit = self.net.forward(on_state)

    # Sample an action from the policy and keep the terms needed for the update.
    prob = F.softmax(logit)
    log_prob = F.log_softmax(logit)
    entropy = -(log_prob * prob).sum(1)
    action = to_numpy(prob.multinomial())
    log_prob = log_prob.gather(1, to_variable(action, dtype='long'))
    action = action[0, 0]

    if self.config['name'] == 'LSTM':
        # Carry the recurrent state forward to the next step.
        self.hx, self.cx = hx, cx
    return action, value, log_prob, entropy
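# --- Hedged sketch of how the returned terms are consumed (assumption) ---
# The (value, log_prob, entropy) tuples returned above are the ingredients of an
# A3C/A2C-style loss. The buffers, coefficients, and bootstrap value `R` below are
# illustrative assumptions, not code from the original agent.
def actor_critic_loss(rewards, values, log_probs, entropies, R,
                      gamma=0.99, value_coef=0.5, entropy_coef=0.01):
    policy_loss, value_loss = 0.0, 0.0
    # Walk the rollout backwards, accumulating the discounted return R.
    for t in reversed(range(len(rewards))):
        R = rewards[t] + gamma * R
        advantage = R - values[t]
        value_loss = value_loss + 0.5 * advantage.pow(2)
        # Detach the advantage so the policy gradient does not flow through the critic.
        policy_loss = policy_loss - log_probs[t] * advantage.detach() - entropy_coef * entropies[t]
    return policy_loss + value_coef * value_loss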