import copy

import torch

# Actor_DDPG, Critic_DDPG and OUNoise are assumed to be defined elsewhere in the project.


class DDPG_single():
    def __init__(self, state_dim, action_dim, max_action, num_agents, learning_rate,
                 discrete_action=True, grid_per_action=20, hidden_dim=32):
        self.max_action = max_action

        # Actor network, its target copy, and optimizer
        self.actor = Actor_DDPG(state_dim, action_dim, max_action, hidden_dim)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=learning_rate)

        # Critic network, its target copy, and optimizer
        self.critic = Critic_DDPG(state_dim, action_dim, num_agents, hidden_dim)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=learning_rate)

        # Ornstein-Uhlenbeck exploration noise
        self.exploration = OUNoise(action_dim)
        self.iter = 0

    def scale_noise(self, scale):
        self.exploration.scale = scale

    def reset_noise(self):
        self.exploration.reset()

    def select_action(self, obs, explore=False):
        self.actor.eval()
        action = self.actor(obs)
        self.actor.train()
        if explore:
            device = action.device
            action += torch.Tensor(self.exploration.noise()).to(device)
        action = action.clamp(-self.max_action, self.max_action)
        return action

    def get_params(self):
        return {
            'actor': self.actor.state_dict(),
            'actor_target': self.actor_target.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'actor_optimizer': self.actor_optimizer.state_dict(),
            'critic_optimizer': self.critic_optimizer.state_dict()
        }

    def load_params(self, params):
        self.actor.load_state_dict(params['actor'])
        self.actor_target.load_state_dict(params['actor_target'])
        self.actor_optimizer.load_state_dict(params['actor_optimizer'])
        self.critic.load_state_dict(params['critic'])
        self.critic_target.load_state_dict(params['critic_target'])
        self.critic_optimizer.load_state_dict(params['critic_optimizer'])
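# The agents above and below all rely on an OUNoise helper that is not shown in this
# section. The class below is only an illustrative sketch of a typical
# Ornstein-Uhlenbeck noise process exposing the attributes these agents use
# (scale, reset(), noise()); the actual implementation in the project may differ,
# e.g. in its default mu/theta/sigma values or in supporting the noise_dist argument
# used by DDPGAgent further down.
import numpy as np
import torch


class OUNoise:
    """Temporally correlated exploration noise for continuous actions (sketch only)."""

    def __init__(self, action_dim, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Return the internal state to the mean so each episode starts fresh
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); return a scaled sample as a tensor
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return torch.tensor(self.scale * self.state, dtype=torch.float)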
class MADDPGAgent:
    """ Defines a Multi-Agent Deep Deterministic Policy Gradient (MADDPG) agent """

    def __init__(self, num_agents=2, obs_size=24, act_size=2, gamma=0.99, tau=1e-3,
                 lr_actor=1.0e-4, lr_critic=1.0e-3, weight_decay_actor=1e-5,
                 weight_decay_critic=1e-4, clip_grad=1.0):
        super(MADDPGAgent, self).__init__()

        # Write parameters
        self.num_agents = num_agents
        self.gamma = gamma
        self.tau = tau
        self.clip_grad = clip_grad

        # Create all the networks
        self.actor = ActorNetwork(obs_size, act_size).to(device)
        self.critic = CriticNetwork(num_agents, obs_size, act_size).to(device)
        self.target_actor = ActorNetwork(obs_size, act_size).to(device)
        self.target_critic = CriticNetwork(num_agents, obs_size, act_size).to(device)

        # Copy initial network parameters to target networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Initialize training optimizers and OU noise
        self.noise = OUNoise(act_size, scale=1.0)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor,
                                    weight_decay=weight_decay_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,
                                     weight_decay=weight_decay_critic)

    def act(self, obs, noise=0.0):
        """ Act using the online actor network """
        obs = obs.to(device)
        action = self.actor(obs) + (noise * self.noise.noise()).to(device)
        action = torch.clamp(action, -1, 1)
        return action

    def target_act(self, obs, noise=0.0):
        """ Act using the target actor network (used for training) """
        obs = obs.to(device)
        action = self.target_actor(obs) + (noise * self.noise.noise()).to(device)
        action = torch.clamp(action, -1, 1)
        return action

    def update_targets(self):
        """ Perform soft update of target network parameters based on latest actor/critic parameters """
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

    def train(self, samples):
        """ Perform a training step for critic and actor networks with soft update """
        # Unpack data from replay buffer and convert to tensors
        obs = torch.tensor([exp[0] for exp in samples], dtype=torch.float, device=device)
        act = torch.tensor([exp[1] for exp in samples], dtype=torch.float, device=device)
        reward = torch.tensor([exp[2] for exp in samples], dtype=torch.float, device=device)
        next_obs = torch.tensor([exp[3] for exp in samples], dtype=torch.float, device=device)
        done = torch.tensor([exp[4] for exp in samples], dtype=torch.float, device=device)
        obs_full = torch.tensor([exp[5] for exp in samples], dtype=torch.float, device=device)
        next_obs_full = torch.tensor([exp[6] for exp in samples], dtype=torch.float, device=device)
        act_full = torch.tensor([exp[7] for exp in samples], dtype=torch.float, device=device)

        # Critic update
        self.critic_optimizer.zero_grad()

        target_critic_obs = [next_obs_full[:, i, :].squeeze() for i in range(self.num_agents)]
        target_critic_obs = torch.cat(target_critic_obs, dim=1)
        target_act = [self.target_act(next_obs_full[:, i, :].squeeze()) for i in range(self.num_agents)]
        target_act = torch.cat(target_act, dim=1)

        with torch.no_grad():
            q_next = self.target_critic(target_critic_obs, target_act)
            q_target = reward + self.gamma * q_next * (1 - done)

        critic_obs = [obs_full[:, i, :].squeeze() for i in range(self.num_agents)]
        critic_obs = torch.cat(critic_obs, dim=1)
        critic_act = [act_full[:, i, :].squeeze() for i in range(self.num_agents)]
        critic_act = torch.cat(critic_act, dim=1)

        q = self.critic(critic_obs, critic_act)
        critic_loss = torch.nn.functional.mse_loss(q, q_target.detach())
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.clip_grad)
        self.critic_optimizer.step()

        # Actor update using policy gradient
        self.actor_optimizer.zero_grad()
        actor_act = [self.act(obs_full[:, i, :].squeeze()) for i in range(self.num_agents)]
        actor_act = torch.cat(actor_act, dim=1)
        actor_loss = -self.critic(critic_obs, actor_act).mean()
        actor_loss.backward()
        # Clip gradients only after backward() has populated them
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_grad)
        self.actor_optimizer.step()

        # Update target networks
        self.update_targets()
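# MADDPGAgent (and DDPGAgent below) call hard_update and soft_update helpers that are
# defined elsewhere in the project. As a reference, here is a minimal sketch of the
# standard implementations: hard_update copies parameters verbatim, soft_update performs
# Polyak averaging, target <- tau * source + (1 - tau) * target. The real helpers may
# differ in signature details.
def hard_update(target, source):
    """Copy every parameter of `source` into `target` (used to initialize target networks)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(target, source, tau):
    """Blend `source` parameters into `target` with interpolation factor `tau`."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)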
class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                 in_critic, hidden_in_critic, hidden_out_critic,
                 lr_actor=1.0e-3, lr_critic=1.0e-3,
                 noise_dist: str = 'normal', checkpoint_path=None) -> None:
        super(DDPGAgent, self).__init__()
        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0, noise_dist=noise_dist)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

        # Optionally restore actor/critic weights (and their targets) from a checkpoint
        if checkpoint_path:
            checkpoint = torch.load(checkpoint_path)
            self.actor.load_state_dict(checkpoint[0]['actor_params'])
            self.target_actor.load_state_dict(checkpoint[0]['actor_params'])
            self.critic.load_state_dict(checkpoint[0]['critic_params'])
            self.target_critic.load_state_dict(checkpoint[0]['critic_params'])

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs) + noise * self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action

    def update(self, buffer: ReplayBuffer, batchsize: int = 1000,
               tau: float = 0.005, discount: float = 0.98):
        states, actions, rewards, states_next, dones = buffer.sample(batchsize=batchsize)

        # Stack batch entries into tensors and move them to the training device
        states = torch.stack(states).float().to(device)
        actions = torch.stack(actions).float().to(device)
        states_next = torch.stack(states_next).float().to(device)
        rewards = torch.tensor(rewards).float().unsqueeze(1).to(device)
        dones = torch.tensor(dones).float().unsqueeze(1).to(device)

        # Critic update: TD target is r + discount * Q'(s', mu'(s')) * (1 - done),
        # computed without gradients so it acts as a fixed target
        with torch.no_grad():
            actions_next = self.target_actor(states_next)
            input_target_critic = torch.cat([states_next, actions_next], dim=1)
            state_value = rewards + discount * self.target_critic(input_target_critic) * (1 - dones)

        input_critic = torch.cat([states, actions], dim=1)
        state_value_local = self.critic(input_critic)
        critic_loss = (state_value - state_value_local).pow(2).mul(0.5).sum(-1).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: maximize the critic's value of the actor's own actions
        actions_new = self.actor(states)
        value_critic = self.critic(torch.cat([states, actions_new], dim=1))
        loss_actor = -value_critic.mean()
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Soft-update target networks
        soft_update(self.target_actor, self.actor, tau)
        soft_update(self.target_critic, self.critic, tau)

    def update_targets(self, tau=0.005):
        soft_update(self.target_actor, self.actor, tau)
        soft_update(self.target_critic, self.critic, tau)
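# Illustrative (hypothetical) training loop for DDPGAgent. The environment object
# `env` and its reset()/step() API, the ReplayBuffer constructor, the push() method,
# and the network sizes below are assumptions made for this example only; they are
# not defined by the code above. The example also assumes CPU execution, since the
# sketch OUNoise above returns CPU tensors.
if __name__ == '__main__':
    state_size, action_size, hidden1, hidden2 = 24, 2, 128, 64
    agent = DDPGAgent(in_actor=state_size, hidden_in_actor=hidden1, hidden_out_actor=hidden2,
                      out_actor=action_size,
                      in_critic=state_size + action_size, hidden_in_critic=hidden1,
                      hidden_out_critic=hidden2)
    buffer = ReplayBuffer(int(1e5))  # assumed constructor signature

    for episode in range(1000):
        state = env.reset()  # `env` assumed to follow a Gym-like API
        for t in range(500):
            state_t = torch.tensor(state, dtype=torch.float)
            action = agent.act(state_t, noise=0.1)
            next_state, reward, done, _ = env.step(action.detach().cpu().numpy())
            next_state_t = torch.tensor(next_state, dtype=torch.float)
            # push() and len() on the buffer are assumed helpers
            buffer.push(state_t, action.detach().cpu(), reward, next_state_t, done)
            state = next_state
            if len(buffer) > 1000:
                agent.update(buffer, batchsize=1000)
            if done:
                break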