def value(self, state, action):
    state_var = to_tensor_var([state], self.use_cuda)
    action_var = to_tensor_var([action], self.use_cuda)
    whole_state_var = state_var.view(-1, self.whole_critic_state_dim)
    whole_action_var = action_var.view(-1, self.whole_critic_action_dim)
    values = [0] * self.n_agents
    for agent_id in range(self.n_agents):
        if self.training_strategy == "cocurrent":
            value_var = self.critics[agent_id](state_var[:, agent_id, :],
                                               action_var[:, agent_id, :])
        elif self.training_strategy == "centralized":
            value_var = self.critics[agent_id](whole_state_var, whole_action_var)
        if self.use_cuda:
            values[agent_id] = value_var.data.cpu().numpy()[0]
        else:
            values[agent_id] = value_var.data.numpy()[0]
    return values
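# `to_tensor_var` (used throughout this file) is assumed to behave roughly as
# the minimal sketch below: it turns a nested Python list / numpy array into a
# torch tensor and optionally moves it to the GPU. The real helper may differ.
import numpy as np
import torch as th

def to_tensor_var(x, use_cuda=True, dtype="float"):
    if dtype == "long":
        tensor = th.LongTensor(np.asarray(x, dtype=np.int64))
    else:
        tensor = th.FloatTensor(np.asarray(x, dtype=np.float64))
    return tensor.cuda() if use_cuda else tensor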
def get_loss(self):
    if self.n_episodes <= self.episodes_before_train:
        return
    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # get the actor network loss
    self.actor_optimizer.zero_grad()
    action_log_probs = self.actor(states_var)
    entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
    action_log_probs = th.sum(action_log_probs * actions_var, 1)
    values = self.critic(states_var, actions_var)
    advantages = rewards_var - values.detach()
    pg_loss = -th.mean(action_log_probs * advantages)
    actor_loss = pg_loss - entropy_loss * self.entropy_reg

    # get the critic network loss
    self.critic_optimizer.zero_grad()
    target_values = rewards_var
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)

    combined_loss = {'actor_loss': actor_loss, 'critic_loss': critic_loss}
    return combined_loss
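# The helpers `entropy` and `index_to_one_hot` used in `get_loss` are assumed
# to look roughly like this sketch; the repository's own definitions may differ.
import numpy as np
import torch as th

def entropy(probs, eps=1e-8):
    # per-sample entropy of a categorical distribution; probs: [batch, n_actions]
    return -th.sum(probs * th.log(probs + eps), dim=1)

def index_to_one_hot(indices, dim):
    # convert integer action indices to one-hot vectors of length `dim`
    indices = np.asarray(indices).reshape(-1)
    one_hot = np.zeros((indices.shape[0], dim), dtype=np.float32)
    one_hot[np.arange(indices.shape[0]), indices] = 1.0
    return one_hot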
def train(self):
    if self.n_episodes <= self.episodes_before_train:
        return
    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
    actions_var = to_tensor_var(batch.actions, self.use_cuda, "long").view(-1, 1)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
    next_states_var = to_tensor_var(batch.next_states, self.use_cuda).view(-1, self.state_dim)
    dones_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

    # compute Q(s_t, a): the network outputs Q(s_t, .) for all actions,
    # then we select the columns of the actions actually taken
    current_q = self.actor(states_var).gather(1, actions_var)

    # compute max_a Q(s_{t+1}, a) for all next states
    next_state_action_values = self.actor(next_states_var).detach()
    next_q = th.max(next_state_action_values, 1)[0].view(-1, 1)

    # compute the target: r + gamma * max_a Q(s_{t+1}, a), zeroed at terminal states
    target_q = self.reward_scale * rewards_var + self.reward_gamma * next_q * (1. - dones_var)

    # update value network
    self.actor_optimizer.zero_grad()
    if self.critic_loss == "huber":
        loss = th.nn.functional.smooth_l1_loss(current_q, target_q)
    else:
        loss = th.nn.MSELoss()(current_q, target_q)
    loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm(self.actor.parameters(), self.max_grad_norm)
    self.actor_optimizer.step()
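# A hypothetical epsilon-greedy exploration step that would pair with the
# Q-learning update above; the epsilon schedule attributes (epsilon_start,
# epsilon_end, epsilon_decay) are assumptions, not part of the original code.
import numpy as np

def exploration_action(self, state):
    epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
        np.exp(-1. * self.n_steps / self.epsilon_decay)
    if np.random.rand() < epsilon:
        return np.random.choice(self.action_dim)  # explore: random action index
    return self.action(state)                     # exploit: greedy w.r.t. Q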
def train(self):
    if self.n_episodes <= self.episodes_before_train:
        return
    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # update actor network
    self.actor_optimizer.zero_grad()
    action_log_probs = self.actor(states_var)
    entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
    action_log_probs = th.sum(action_log_probs * actions_var, 1)
    values = self.critic(states_var, actions_var)
    advantages = rewards_var - values.detach()
    pg_loss = -th.mean(action_log_probs * advantages)
    actor_loss = pg_loss - entropy_loss * self.entropy_reg
    actor_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm(self.actor.parameters(), self.max_grad_norm)
    self.actor_optimizer.step()

    # update critic network
    self.critic_optimizer.zero_grad()
    target_values = rewards_var
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)
    critic_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm)
    self.critic_optimizer.step()
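# The `self.critic` used above is assumed to map (state, action) pairs to a
# scalar value estimate. A minimal sketch of such a network; the layer sizes
# and where the action is concatenated are illustrative assumptions only.
import torch as th
import torch.nn as nn

class CriticNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_size=128):
        super(CriticNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size + action_dim, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, state, action):
        out = nn.functional.relu(self.fc1(state))
        out = th.cat([out, action], dim=1)  # inject the action after the first layer
        out = nn.functional.relu(self.fc2(out))
        return self.fc3(out)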
def value(self, state, action):
    state_var = to_tensor_var([state], self.use_cuda)
    action = index_to_one_hot(action, self.action_dim)
    action_var = to_tensor_var([action], self.use_cuda)
    value_var = self.critic(state_var, action_var)
    if self.use_cuda:
        value = value_var.data.cpu().numpy()[0]
    else:
        value = value_var.data.numpy()[0]
    return value
def train(self):
    # do not train until exploration is enough
    if self.n_episodes <= self.episodes_before_train:
        return
    batch = self.memory.sample(self.batch_size)
    state_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
    action_var = to_tensor_var(batch.actions, self.use_cuda).view(-1, self.action_dim)
    reward_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
    next_state_var = to_tensor_var(batch.next_states, self.use_cuda).view(-1, self.state_dim)
    done_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

    # estimate the target Q with the actor_target and critic_target networks
    next_action_var = self.actor_target(next_state_var)
    next_q = self.critic_target(next_state_var, next_action_var).detach()
    target_q = self.reward_scale * reward_var + self.reward_gamma * next_q * (1. - done_var)

    # update critic network
    self.critic_optimizer.zero_grad()
    # current Q values, regressed toward the target Q values
    current_q = self.critic(state_var, action_var)
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(current_q, target_q)
    else:
        critic_loss = nn.MSELoss()(current_q, target_q)
    critic_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm)
    self.critic_optimizer.step()

    # update actor network
    self.actor_optimizer.zero_grad()
    # the action predicted by the current policy
    action = self.actor(state_var)
    # actor_loss maximizes the Q value of the predicted action
    actor_loss = -self.critic(state_var, action)
    actor_loss = actor_loss.mean()
    actor_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm(self.actor.parameters(), self.max_grad_norm)
    self.actor_optimizer.step()

    # update actor target network and critic target network
    if self.n_steps % self.target_update_steps == 0 and self.n_steps > 0:
        super(DDPG, self)._soft_update_target(self.critic_target, self.critic)
        super(DDPG, self)._soft_update_target(self.actor_target, self.actor)
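# `_soft_update_target` (inherited from the base agent class) is assumed to
# perform Polyak averaging with a rate `self.target_tau`, roughly as below;
# the attribute name is an assumption.
def _soft_update_target(self, target, source):
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(
            (1. - self.target_tau) * t_param.data + self.target_tau * s_param.data)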
def train(self):
    if self.n_episodes <= self.episodes_before_train:
        return
    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # update actor network
    self.actor_optimizer.zero_grad()
    values = self.critic_target(states_var, actions_var).detach()
    advantages = rewards_var - values
    # normalizing advantages does not seem to work correctly here:
    # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
    action_log_probs = self.actor(states_var)
    action_log_probs = th.sum(action_log_probs * actions_var, 1)
    old_action_log_probs = self.actor_target(states_var).detach()
    old_action_log_probs = th.sum(old_action_log_probs * actions_var, 1)
    ratio = th.exp(action_log_probs - old_action_log_probs)
    surr1 = ratio * advantages
    surr2 = th.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * advantages
    # PPO's pessimistic surrogate (L^CLIP)
    actor_loss = -th.mean(th.min(surr1, surr2))
    actor_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm(self.actor.parameters(), self.max_grad_norm)
    self.actor_optimizer.step()

    # update critic network
    self.critic_optimizer.zero_grad()
    target_values = rewards_var
    values = self.critic(states_var, actions_var)
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)
    critic_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm)
    self.critic_optimizer.step()

    # update actor target network and critic target network
    if self.n_steps % self.target_update_steps == 0 and self.n_steps > 0:
        super(PPO, self)._soft_update_target(self.actor_target, self.actor)
        super(PPO, self)._soft_update_target(self.critic_target, self.critic)
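# A toy numeric sketch of the clipped surrogate used above (all numbers made
# up): with clip_param = 0.2, a ratio of 1.35 on a positive-advantage sample
# is clamped to 1.2, so the new policy cannot move arbitrarily far from the
# old one in a single update.
import torch as th

clip_param = 0.2
advantages = th.tensor([1.0, -1.0])
ratio = th.tensor([1.35, 0.60])  # exp(new_log_prob - old_log_prob)
surr1 = ratio * advantages
surr2 = th.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
toy_actor_loss = -th.mean(th.min(surr1, surr2))  # pessimistic bound L^CLIP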
def train(self):
    if self.n_episodes <= self.episodes_before_train:
        return
    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.n_agents, self.state_dim)
    actions_var = to_tensor_var(batch.actions, self.use_cuda).view(-1, self.n_agents, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, self.n_agents, 1)
    whole_states_var = states_var.view(-1, self.n_agents * self.state_dim)
    whole_actions_var = actions_var.view(-1, self.n_agents * self.action_dim)

    for agent_id in range(self.n_agents):
        # update actor network
        self.actor_optimizers[agent_id].zero_grad()
        action_log_probs = self.actors[agent_id](states_var[:, agent_id, :])
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(action_log_probs * actions_var[:, agent_id, :], 1)
        if self.training_strategy == "cocurrent":
            values = self.critics[agent_id](states_var[:, agent_id, :],
                                            actions_var[:, agent_id, :])
        elif self.training_strategy == "centralized":
            values = self.critics[agent_id](whole_states_var, whole_actions_var)
        advantages = rewards_var[:, agent_id, :] - values.detach()
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm(self.actors[agent_id].parameters(), self.max_grad_norm)
        self.actor_optimizers[agent_id].step()

        # update critic network
        self.critic_optimizers[agent_id].zero_grad()
        target_values = rewards_var[:, agent_id, :]
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm(self.critics[agent_id].parameters(), self.max_grad_norm)
        self.critic_optimizers[agent_id].step()
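# A quick shape check for the centralized-critic inputs built above, using
# hypothetical sizes (n_agents=3, state_dim=4, action_dim=2, batch of 32):
import torch as th

n_agents, state_dim, action_dim, batch = 3, 4, 2, 32
states_var = th.zeros(batch, n_agents, state_dim)
actions_var = th.zeros(batch, n_agents, action_dim)
whole_states_var = states_var.view(-1, n_agents * state_dim)     # [32, 12]
whole_actions_var = actions_var.view(-1, n_agents * action_dim)  # [32, 6]
# each agent's centralized critic therefore sees every agent's state and action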
def train(self):
    if self.n_episodes <= self.episodes_before_train:
        return
    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # forward pass through the shared actor-critic network
    action_log_probs, values = self.actor_critic(states_var)
    entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
    action_log_probs = th.sum(action_log_probs * actions_var, 1)

    # fisher loss
    if self.optimizer.steps % self.optimizer.Ts == 0:
        self.actor_critic.zero_grad()
        pg_fisher_loss = th.mean(action_log_probs)
        values_noise = to_tensor_var(np.random.randn(values.size()[0]), self.use_cuda)
        sample_values = (values + values_noise.view(-1, 1)).detach()
        if self.critic_loss == "huber":
            vf_fisher_loss = -nn.functional.smooth_l1_loss(values, sample_values)
        else:
            vf_fisher_loss = -nn.MSELoss()(values, sample_values)
        joint_fisher_loss = pg_fisher_loss + self.vf_fisher_coef * vf_fisher_loss
        self.optimizer.acc_stats = True
        joint_fisher_loss.backward(retain_graph=True)
        self.optimizer.acc_stats = False

    self.optimizer.zero_grad()
    # actor loss
    advantages = rewards_var - values.detach()
    pg_loss = -th.mean(action_log_probs * advantages)
    actor_loss = pg_loss - entropy_loss * self.entropy_reg
    # critic loss
    target_values = rewards_var
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)
    loss = actor_loss + critic_loss
    loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.max_grad_norm)
    self.optimizer.step()
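# Note: `self.optimizer` above is assumed to be a K-FAC style natural-gradient
# optimizer: every `Ts` steps it accumulates curvature (Fisher) statistics from
# `joint_fisher_loss` while `acc_stats` is True, and `step()` then applies the
# preconditioned update. This is a reading of the code above, not a documented
# API of a particular library.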
def action(self, state):
    action_var = self.actor(to_tensor_var([state], self.use_cuda))
    if self.use_cuda:
        action = action_var.data.cpu().numpy()[0]
    else:
        action = action_var.data.numpy()[0]
    return action
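# A hypothetical exploration variant of `action` above for continuous control,
# adding Gaussian noise to the deterministic policy output; `self.noise_scale`
# and the [-1, 1] action bounds are assumptions, not part of the original code.
import numpy as np

def exploration_action(self, state):
    action = self.action(state)
    noise = np.random.randn(self.action_dim) * self.noise_scale
    return np.clip(action + noise, -1.0, 1.0)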
def value(self, state, action):
    state_var = to_tensor_var([state], self.use_cuda)
    value_var = self.actor_critic(state_var)[1]
    if self.use_cuda:
        value = value_var.data.cpu().numpy()[0]
    else:
        value = value_var.data.numpy()[0]
    return value
def _softmax_action(self, state):
    state_var = to_tensor_var([state], self.use_cuda)
    softmax_action_var = th.exp(self.actor(state_var))
    if self.use_cuda:
        softmax_action = softmax_action_var.data.cpu().numpy()[0]
    else:
        softmax_action = softmax_action_var.data.numpy()[0]
    return softmax_action
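# A hypothetical stochastic action selection built on `_softmax_action` above:
# sample an action index according to the policy's probabilities.
import numpy as np

def exploration_action(self, state):
    probs = self._softmax_action(state)
    probs = probs / probs.sum()  # guard against tiny numerical drift
    return np.random.choice(self.action_dim, p=probs)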
def action(self, state):
    state_var = to_tensor_var([state], self.use_cuda)
    state_action_value_var = self.actor(state_var)
    if self.use_cuda:
        state_action_value = state_action_value_var.data.cpu().numpy()[0]
    else:
        state_action_value = state_action_value_var.data.numpy()[0]
    action = np.argmax(state_action_value)
    return action
def _softmax_action(self, state):
    state_var = to_tensor_var([state], self.use_cuda)
    softmax_action = np.zeros((self.n_agents, self.action_dim), dtype=np.float64)
    for agent_id in range(self.n_agents):
        softmax_action_var = self.actors[agent_id](state_var[:, agent_id, :])
        if self.use_cuda:
            softmax_action[agent_id] = softmax_action_var.data.cpu().numpy()[0]
        else:
            softmax_action[agent_id] = softmax_action_var.data.numpy()[0]
    return softmax_action
def value(self, state, action):
    state_var = to_tensor_var([state], self.use_cuda)
    action_var = to_tensor_var([action], self.use_cuda)
    whole_state_var = state_var.view(-1, self.n_agents * self.obs_shape_n)
    whole_action_var = action_var.view(-1, self.n_agents * self.act_shape_n)
    values = [0] * self.n_agents
    for agent_id in range(self.n_agents):
        if self.training_strategy == "cocurrent":
            value_var = self.critics[agent_id](state_var[:, agent_id, :],
                                               action_var[:, agent_id, :])
        elif self.training_strategy == "centralized":
            value_var = self.critics[agent_id](whole_state_var, whole_action_var)
        if self.use_cuda:
            values[agent_id] = value_var.data.cpu().numpy()[0]
        else:
            values[agent_id] = value_var.data.numpy()[0]
    return values
def _softmax_action(self, state):
    try:
        state_var = to_tensor_var([state], self.use_cuda)
    except ValueError:
        print([state])
        sys.exit(0)
    softmax_action = np.zeros((self.n_agents, self.act_shape_n), dtype=np.float64)
    for agent_id in range(self.n_agents):
        softmax_action_var = th.exp(self.actors[agent_id](state_var[:, agent_id, :]))
        if self.use_cuda:
            softmax_action[agent_id] = softmax_action_var.data.cpu().numpy()[0]
        else:
            softmax_action[agent_id] = softmax_action_var.data.numpy()[0]
    return softmax_action
def _softmax_action(self, state):
    state_var = to_tensor_var([state], self.use_cuda)
    softmax_action = np.zeros((self.n_agents, self.act_shape_n[0]), dtype=np.float64)
    for agent_id in range(self.n_agents):
        softmax_action_var = th.exp(self.actors[agent_id](state_var[:, agent_id, :]))
        if self.use_cuda:
            softmax_action[agent_id] = softmax_action_var.data.cpu().numpy()[0]
        else:
            softmax_action[agent_id] = softmax_action_var.data.numpy()[0]
    return softmax_action
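# A hypothetical per-agent sampling step built on `_softmax_action` above:
# each agent draws its own action index from its softmax probabilities.
import numpy as np

def exploration_action(self, state):
    softmax_actions = self._softmax_action(state)
    actions = []
    for probs in softmax_actions:
        probs = probs / probs.sum()  # guard against tiny numerical drift
        actions.append(np.random.choice(len(probs), p=probs))
    return actions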