def update(self, state, action, reward, new_state, done):
    # Add the new transition to the replay dataset
    self.experience_replay.append((state, action, reward, new_state, done))
    # Anneal epsilon linearly from initial_epsilon down to final_epsilon
    self.epsilon = max(
        self.epsilon - (self.initial_epsilon - self.final_epsilon) / self.epsilon_decay,
        self.final_epsilon)
    if len(self.experience_replay) >= self.observation:  # only learn once enough experience has been collected
        # Sample a minibatch of transitions from the replay memory
        mini_batch = random.sample(self.experience_replay, self.batch_size)
        states = torch.cat([mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)])
        actions = [mini_batch[k][1] for k in range(self.batch_size)]
        rewards = [mini_batch[k][2] for k in range(self.batch_size)]
        new_states = torch.cat([mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)])
        dones = [mini_batch[k][4] for k in range(self.batch_size)]

        # Q-values of the next states (used for the bootstrap target)
        q_prime = to_numpy(self.net.forward(to_variable(new_states)))
        # Q-values of the current states (used as the prediction)
        out = self.net.forward(to_variable(states))

        # Build the TD targets and perform one gradient-descent step
        action_input = to_variable(actions, dtype='long')
        y_label = to_variable([
            rewards[i] if dones[i] else rewards[i] + self.gamma * np.max(q_prime[i])
            for i in range(self.batch_size)
        ])
        y_out = out.gather(1, action_input.view(-1, 1))
        self.optimizer.zero_grad()
        loss = self.loss(y_out, y_label)
        loss.backward()
        self.optimizer.step()
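# The code in this file converts between numpy arrays / tensors and autograd
# Variables through `to_variable` / `to_numpy` helpers that are not defined in this
# section. The sketch below is an assumption about what they do, written against the
# pre-0.4 PyTorch Variable API used throughout this code (it presumes `numpy as np`,
# `torch`, `torch.autograd.Variable`, and the module-level `isGPU` flag are in scope);
# the real helpers may differ in signature and behaviour.
def to_variable(data, dtype='float', volatile=False, isCuda=True):
    # Accept tensors, numpy arrays, or plain Python lists
    if not torch.is_tensor(data):
        data = torch.from_numpy(np.asarray(data))
    tensor = data.long() if dtype == 'long' else data.float()
    if isCuda and isGPU:
        tensor = tensor.cuda()
    return Variable(tensor, volatile=volatile)

def to_numpy(variable):
    # Pull the underlying tensor back to the CPU and convert to numpy
    data = variable.data
    return data.cpu().numpy() if data.is_cuda else data.numpy()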
def update(self, state, action, reward, new_state, done):
    # Add the new transition to the replay dataset
    self.experience_replay.append((state, action, reward, new_state, done))
    if len(self.experience_replay) >= self.observation:  # only learn once enough experience has been collected
        # Sample a minibatch from the replay memory
        mini_batch = random.sample(self.experience_replay, self.batch_size)
        state_batch = torch.cat([mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)])
        action_batch = [mini_batch[k][1] for k in range(self.batch_size)]
        reward_batch = [mini_batch[k][2] for k in range(self.batch_size)]
        next_state_batch = torch.cat([mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)])
        terminal_batch = [mini_batch[k][4] for k in range(self.batch_size)]
        action_tensor = to_variable(np.vstack(action_batch))

        # Prepare the target Q batch (terminal_batch is expected to hold 0 for
        # terminal transitions, so their targets reduce to the immediate reward)
        value = self.actor_target.forward(to_variable(next_state_batch, volatile=True))
        next_q_values = self.critic_target.forward(
            [to_variable(next_state_batch, volatile=True), value])
        next_q_values.volatile = False
        y_batch = to_variable(reward_batch) + self.discount * \
            to_variable(terminal_batch) * next_q_values

        # Critic update: regress Q(s, a) towards the target batch
        self.critic.zero_grad()
        q_batch = self.critic.forward([to_variable(state_batch), action_tensor])
        value_loss = self.loss(q_batch, y_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: maximise Q(s, mu(s)) by minimising its negative mean
        self.actor.zero_grad()
        value = self.actor.forward(to_variable(state_batch))
        policy_loss = -self.critic.forward([to_variable(state_batch), value])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update (Polyak averaging)
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
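# `soft_update`, called by the DDPG-style updates in this file, is not defined in
# this section. A minimal sketch, assuming the standard Polyak-averaging rule
# target <- tau * source + (1 - tau) * target (the argument names are illustrative):
def soft_update(target_net, source_net, tau):
    # Blend each target parameter towards the corresponding source parameter
    for target_param, source_param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)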
def update(self, state, action, reward, new_state, done):
    # Add the new transition to the replay dataset
    if self.config['use_memory']:
        self.experience_replay.append(new_state.numpy(), action.tolist(), reward, done)
    else:
        self.experience_replay.append((state, action.tolist(), reward, new_state, done))
    if done:
        self.random_process.reset_states()
    self.epsilon -= self.depsilon
    if len(self.experience_replay) >= self.observation:  # only learn once enough experience has been collected
        # Sample a minibatch from the replay memory
        if self.config['use_memory']:
            state_batch, action_batch, reward_batch, \
                next_state_batch, terminal_batch = self.experience_replay.sample_and_split(self.batch_size)
            state_batch = state_batch.reshape(-1, 4, 80, 80)
            next_state_batch = next_state_batch.reshape(-1, 4, 80, 80)
        else:
            mini_batch = random.sample(self.experience_replay, self.batch_size)
            state_batch = torch.cat([mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)])
            action_batch = [mini_batch[k][1] for k in range(self.batch_size)]
            reward_batch = [mini_batch[k][2] for k in range(self.batch_size)]
            next_state_batch = torch.cat([mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)])
            terminal_batch = [mini_batch[k][4] for k in range(self.batch_size)]

        # Prepare the target Q batch
        value_c, _ = self.actor_target.forward(to_variable(next_state_batch, volatile=True))
        next_q_values = self.critic_target.forward(
            [to_variable(next_state_batch, volatile=True), value_c])
        next_q_values.volatile = False
        y_batch = to_variable(reward_batch) + self.discount * \
            to_variable(terminal_batch) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic.forward([to_variable(state_batch), to_variable(action_batch)])
        value_loss = self.loss(q_batch, y_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        value_c, _ = self.actor.forward(to_variable(state_batch))
        policy_loss = -self.critic.forward([to_variable(state_batch), value_c])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update (Polyak averaging)
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
def select_action(self, state, test=False):
    value = to_numpy(self.actor.forward(to_variable(state, volatile=True)))
    cur_episode = len(self.experience_replay)
    action = np.clip(value[0] + self.noise.generate(cur_episode), -1, 1)
    return action
def __init__(self, shared_net, num_actions, in_channels, use_conv, config):
    super(A3C, self).__init__()
    net = A3CNet(config, in_channels, num_actions, use_conv)
    self.net = net.cuda() if isGPU else net
    self.config = config
    if config['name'] == 'LSTM':
        self.cx = to_variable(torch.zeros(1, config['hidden_size']))
        self.hx = to_variable(torch.zeros(1, config['hidden_size']))
    self.shared_net = shared_net
    self.optimizer = torch.optim.Adam(self.shared_net.parameters(), lr=config['lr'])
    self.gamma = config['gamma']
    self.tau = config['tau']
    self.clip_norm = config['clip_norm']
    self.entropy_beta = config['entropy_beta']
def select_action(self, state, test=False):
    # The actor outputs a continuous head and a discrete head
    value_c, value_d = self.actor.forward(to_variable(state, volatile=True))
    # Sample the discrete action from the softmax distribution
    action_d = to_numpy(F.softmax(value_d).multinomial())
    # Add annealed exploration noise to the continuous action (skipped at test time)
    action_c = to_numpy(value_c)
    if not test:
        action_c += max(self.epsilon, 0) * self.random_process.sample()
    action_c = action_c[0]
    return action_c, action_d
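# `self.random_process` (sampled above for continuous exploration and reset in the
# replay-based update) is assumed to be an Ornstein-Uhlenbeck-style exploration
# process exposing sample() and reset_states(). A minimal sketch of such a process;
# the class name and the theta / mu / sigma defaults are illustrative, not values
# taken from this repository:
class OrnsteinUhlenbeckProcess(object):
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2):
        self.size, self.theta, self.mu, self.sigma = size, theta, mu, sigma
        self.reset_states()

    def reset_states(self):
        # Restart the process at its mean
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        # Discrete-time OU step: mean-reverting drift plus Gaussian noise
        dx = self.theta * (self.mu - self.x_prev) + self.sigma * np.random.randn(self.size)
        self.x_prev = self.x_prev + dx
        return self.x_prev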
def select_action(self, state, test=False):
    # Epsilon-greedy action selection over the network's Q-values
    on_state = to_variable(state, volatile=True)
    if np.random.rand() < self.epsilon and not test:  # explore
        action = np.random.randint(self.action_num)
    else:  # exploit
        action = np.argmax(to_numpy(self.net.forward(on_state)))
    return action
def compute_expect(self, state, value, network, volatile=False):
    # If enabled, compute an expected Q-value over all discrete actions, using the
    # rows of `value` as per-action weights; otherwise evaluate the critic directly
    # on the given action tensor.
    if self.use_expect:
        actions = [
            to_variable(i * torch.ones(value.size(0), 1))
            for i in range(self.nb_actions)
        ]
        next_q_values = torch.cat([
            network.forward([to_variable(state, volatile=volatile), a])
            for a in actions
        ], 1)
        next_q_values = torch.cat([
            value[i, :].dot(next_q_values[i, :])
            for i in range(self.batch_size)
        ], 0)
    else:
        next_q_values = network.forward(
            [to_variable(state, volatile=volatile), value])
    return next_q_values
def update(self, values, log_probs, rewards, entropies, done, state=None):
    if done:
        # Episode finished: reset the LSTM state and bootstrap from zero
        if self.config['name'] == 'LSTM':
            self.cx = to_variable(torch.zeros(1, self.config['hidden_size']))
            self.hx = to_variable(torch.zeros(1, self.config['hidden_size']))
        R = to_variable(torch.zeros(1, 1))
    else:
        # Rollout truncated: detach the LSTM state and bootstrap from the value estimate
        if self.config['name'] == 'LSTM':
            self.cx = to_variable(self.cx.data)
            self.hx = to_variable(self.hx.data)
        _, value, _, _ = self.select_action(state)
        R = to_variable(value.data)
    values.append(R)

    policy_loss = 0.
    value_loss = 0.
    gae = torch.zeros(1, 1)
    gae = gae.cuda() if isGPU else gae
    # Walk the rollout backwards, accumulating the discounted return and
    # the generalised advantage estimate (GAE)
    for i in reversed(range(len(rewards))):
        R = self.gamma * R + rewards[i]
        delta_t = rewards[i] + self.gamma * values[i + 1].data - values[i].data
        gae = gae * self.gamma * self.tau + delta_t
        policy_loss -= log_probs[i] * to_variable(gae) + self.entropy_beta * entropies[i]
        value_loss += 0.5 * (R - values[i]) ** 2

    # Perform the asynchronous update on the shared network
    final_loss = policy_loss + 0.5 * value_loss
    self.optimizer.zero_grad()
    final_loss.backward()
    nn.utils.clip_grad_norm(self.net.parameters(), self.clip_norm)
    ensure_shared_grads(self.net, self.shared_net)
    self.optimizer.step()
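# `ensure_shared_grads`, used in the asynchronous update above, is not defined in
# this section. A minimal sketch of the usual A3C pattern: point the shared model's
# gradients at the local worker's gradients so the shared optimizer can step on them
# (the exact behaviour in this repository may differ):
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            # Gradients are already wired up (e.g. CPU shared-memory case); nothing to do
            return
        shared_param._grad = param.grad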
def select_action(self, state, test=False):
    state = to_variable(state, volatile=test)
    if self.config['name'] == 'LSTM':
        on_state = state, (self.hx, self.cx)
        value, logit, (hx, cx) = self.net.forward(on_state)
    else:
        on_state = state
        value, logit = self.net.forward(on_state)
    prob = F.softmax(logit)
    log_prob = F.log_softmax(logit)
    entropy = -(log_prob * prob).sum(1)
    action = to_numpy(prob.multinomial())
    log_prob = log_prob.gather(1, to_variable(action, dtype='long'))
    action = action[0, 0]
    if self.config['name'] == 'LSTM':
        self.hx, self.cx = hx, cx
    return action, value, log_prob, entropy
def select_action(self, state, test=False):
    value = to_numpy(self.actor.forward(to_variable(state, volatile=True)))
    cur_episode = len(self.experience_replay)
    if self.action_type == 'continuous':
        # Continuous case: add annealed exploration noise and clip to the action range
        action = np.clip(value[0] + self.noise.generate(cur_episode), -1, 1)
    else:
        # Discrete case: the noise generator picks an action index, which is
        # converted into a one-hot vector (two-action case)
        action = self.noise.generate(value[0], cur_episode)
        if isinstance(action, int):
            action = np.array([1., 0.] if action == 0 else [0., 1.])
    return action
def select_action(self, state, test=False):
    state = to_variable(state, isCuda=False, volatile=test)
    if self.config['name'] == 'LSTM':
        on_state = state, (self.hx, self.cx)
        value, logit, (hx, cx) = self.net.forward(on_state)
    else:
        on_state = state
        value, logit = self.net.forward(on_state)
    prob = F.softmax(logit)
    log_prob = F.log_softmax(logit)
    entropy = -(log_prob * prob).sum(1)
    action = prob.multinomial().data
    log_prob = log_prob.gather(1, Variable(action))
    action = action.numpy()[0, 0]
    if self.config['name'] == 'LSTM':
        self.hx, self.cx = hx, cx
    return action, value, log_prob, entropy
def update(self, state, action, reward, new_state, done):
    # Add the new transition to the replay dataset
    self.experience_replay.append((state, action, reward, new_state, done))
    if len(self.experience_replay) >= self.observation:  # only learn once enough experience has been collected
        # Sample a minibatch from the replay memory
        mini_batch = random.sample(self.experience_replay, self.batch_size)
        state_batch = torch.cat([mini_batch[k][0].unsqueeze(0) for k in range(self.batch_size)])
        action_batch = [mini_batch[k][1] for k in range(self.batch_size)]
        reward_batch = [mini_batch[k][2] for k in range(self.batch_size)]
        next_state_batch = torch.cat([mini_batch[k][3].unsqueeze(0) for k in range(self.batch_size)])
        terminal_batch = [mini_batch[k][4] for k in range(self.batch_size)]
        action_tensor = to_variable(np.vstack(action_batch))

        # Prepare the target Q batch (expected Q-value over actions when enabled)
        value = self.actor_target.forward(to_variable(next_state_batch, volatile=True))
        next_q_values = self.compute_expect(next_state_batch, value, self.critic_target, volatile=True)
        next_q_values.volatile = False
        y_batch = to_variable(reward_batch) + self.discount * \
            to_variable(terminal_batch) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.compute_expect(state_batch, action_tensor, self.critic)
        value_loss = self.loss(q_batch, y_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        value = self.actor.forward(to_variable(state_batch))
        policy_loss = -self.compute_expect(state_batch, value, self.critic)
        policy_loss = policy_loss.mean()
        policy_loss.backward()

        # Optional gradient inversion on the actor's output layer, scaling each
        # gradient by how close the parameter is to its bound [pmin, pmax]
        # (kept disabled here, as in the original code):
        # list_params = list(self.actor.parameters())
        # p = list_params[-1]
        # for j in range(self.nb_actions):
        #     if p.grad.data[j] > 0:  # gradient suggests increasing p
        #         p.grad.data[j] *= abs(self.pmax - p.data[j]) / (self.pmax - self.pmin)
        #     else:
        #         p.grad.data[j] *= abs(p.data[j] - self.pmin) / (self.pmax - self.pmin)

        self.actor_optim.step()

        # Target update (Polyak averaging)
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)