import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# Actor, Critic, ReplayBuffer and PrioritizedReplayBuffer are assumed to be
# project-local classes importable from the surrounding package.


class MADDPG():
    actors = None
    actors_targets = None
    actors_optimisers = None
    critics = None
    critics_targets = None
    critics_optimisers = None
    env = None
    gamma = None

    def __init__(self, env, env_obs, gamma=0.99, tau=0.001, lr_actor=1e-3,
                 lr_critic=1e-3, weight_decay=0.1, batch_size=64,
                 subpolicies=1, action_shape=2, replay_buffer_size=5000,
                 replay_buffer_type="rb", noise=0.1, noise_decay=0.999,
                 max_action=1, min_action=-1, teacher=False, bc=None):
        self.env = env
        self.subpolicies = subpolicies
        self.total_obs = np.sum(env_obs)
        self.weight_decay = weight_decay
        self.max_action = max_action
        self.min_action = min_action
        self.action_shape = action_shape
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.replay_buffer_type = replay_buffer_type
        self.replay_buffer_size = replay_buffer_size
        self.init_noise = noise
        self.noise = noise
        self.noise_decay = noise_decay
        self.teacher = teacher
        # Observation size is doubled when a teacher's observation is
        # concatenated to the agent's own.
        self.mul = 1 if self.teacher is False else 2

        # One actor (and target actor) per agent per subpolicy.
        self.actors = [[
            Actor(self.mul * env_obs[agent], action_shape)
            for i in range(self.subpolicies)
        ] for agent in range(env.n)]
        self.actors_targets = [[
            Actor(self.mul * env_obs[agent], action_shape)
            for i in range(self.subpolicies)
        ] for agent in range(env.n)]

        # Centralised critics see all agents' observations and actions.
        self.critics = [
            Critic(self.mul * self.total_obs + action_shape * len(env.agents))
            for _ in env.agents
        ]
        self.critics_targets = [
            Critic(self.mul * self.total_obs + action_shape * len(env.agents))
            for _ in env.agents
        ]

        self.actors_optimisers = [[
            torch.optim.RMSprop(self.actors[agent][i].parameters(),
                                lr=lr_actor,
                                weight_decay=weight_decay)
            for i in range(self.subpolicies)
        ] for agent in range(len(env.agents))]
        self.critics_optimisers = [
            torch.optim.RMSprop(self.critics[agent].parameters(),
                                lr=lr_critic,
                                weight_decay=weight_decay)
            for agent in range(len(env.agents))
        ]

        if self.subpolicies > 1:
            # One replay buffer per (agent, subpolicy) pair.
            if self.replay_buffer_type == "rb":
                self.replay_buffers = [[
                    ReplayBuffer(self.replay_buffer_size)
                    for _ in range(self.subpolicies)
                ] for _ in range(env.n)]
            else:
                self.replay_buffers = [[
                    PrioritizedReplayBuffer(self.replay_buffer_size)
                    for _ in range(self.subpolicies)
                ] for _ in range(env.n)]
        else:
            # A single shared buffer when there is only one subpolicy.
            if self.replay_buffer_type == "rb":
                self.replay_buffers = ReplayBuffer(self.replay_buffer_size)
            else:
                self.replay_buffers = PrioritizedReplayBuffer(
                    self.replay_buffer_size)

    def save(self, path):
        for agent in range(self.env.n):
            for sub in range(self.subpolicies):
                torch.save(
                    self.actors[agent][sub].state_dict(),
                    path + '/actor_{}_subpolicy_{}.pt'.format(agent, sub))

    def load(self, path):
        for agent in range(self.env.n):
            for sub in range(self.subpolicies):
                self.actors[agent][sub].load_state_dict(
                    torch.load(path +
                               "/actor_{}_subpolicy_{}.pt".format(agent, sub)))

    def push_sample(self, s, a, r, d, s_t, subs):
        if self.replay_buffer_type == "rb":
            if self.subpolicies > 1:
                for agent in range(self.env.n):
                    self.replay_buffers[agent][subs[agent]].push(
                        s, a, r, d, s_t, subs)
            else:
                self.replay_buffers.push(s, a, r, d, s_t, subs)
        else:
            # Prioritised buffers need an initial TD error for each sample.
            errors = self.get_errors(s, a, r, d, s_t, subs)
            if self.subpolicies > 1:
                for agent in range(self.env.n):
                    self.replay_buffers[agent][subs[agent]].push(
                        s, a, r, d, s_t, subs, errors[agent])
            else:
                self.replay_buffers.push(s, a, r, d, s_t, subs,
                                         np.mean(errors))

    def get_errors(self, s, a, r, d, s_t, subs):
        """Compute each critic's TD error for a single transition."""
        errors = []
        per_agent_obs = [s[agent] for agent in range(self.env.n)]
        t_agent_obs = [
            torch.Tensor(list(per_agent_obs[agent][:])).view(1, -1)
            for agent in range(self.env.n)
        ]
        obs = torch.cat(t_agent_obs, 1)
        per_agent_new_obs = [s_t[agent] for agent in range(self.env.n)]
        t_agent_new_obs = [
            torch.Tensor(list(per_agent_new_obs[agent][:])).view(1, -1)
            for agent in range(self.env.n)
        ]
        new_obs = torch.cat(t_agent_new_obs, 1)
        action = torch.Tensor(a)
        reward = torch.Tensor(r)

        new_actions = []
        for agent in range(self.env.n):
            current_new_action = self.actors_targets[agent][subs[agent]](
                t_agent_new_obs[agent]).clamp(self.min_action,
                                              self.max_action)
            new_actions.append(current_new_action)
        new_actions = torch.stack(new_actions, 1)

        for i in range(len(self.env.agents)):
            with torch.no_grad():
                target_Q_input = torch.cat(
                    (new_obs, new_actions.view(1, -1)), 1)
                target_Q = reward[i] + (
                    self.gamma * self.critics_targets[i](target_Q_input) *
                    (1 - d))
                current_Q_input = torch.cat((obs, action.view(1, -1)), 1)
                current_Q = self.critics[i](current_Q_input)
                critic_loss = F.mse_loss(current_Q, target_Q)
            errors.append(critic_loss.item())
        return errors

    def apply_noise_decay(self):
        self.noise = self.noise * self.noise_decay

    def reset_noise(self, noise_value=None):
        if noise_value is None:
            self.noise = self.init_noise
        else:
            self.noise = noise_value

    def random_act(self):
        return np.array([
            random.uniform(self.min_action, self.max_action)
            for _ in range(self.env.n * self.action_shape)
        ]).reshape(self.env.n, self.action_shape)

    def act(self, s, subs, noise=True):
        actions = []
        obs_input = torch.Tensor(s)
        for agent in range(self.env.n):
            action = self.actors[agent][subs[agent]](obs_input[agent])
            if noise is True:
                # Exploration: additive uniform noise, decayed over training.
                action = action + torch.FloatTensor(action.shape).uniform_(
                    -self.noise, self.noise)
            action = action.clamp(self.min_action, self.max_action)
            actions.append(action)
        return np.array([action.detach().numpy() for action in actions
                         ]).reshape(self.env.n, self.action_shape)

    def train(self, subs):
        for i in range(len(self.env.agents)):
            self.critics_optimisers[i].zero_grad()
            if self.subpolicies > 1:
                minibatch = self.replay_buffers[i][subs[i]].sample(
                    self.batch_size)
            else:
                minibatch = self.replay_buffers.sample(self.batch_size)
            if self.replay_buffer_type == "rb":
                obs, actions, rewards, dones, new_obs, _ = minibatch
            else:
                (batch, index, is_weight) = minibatch
                is_weight = torch.Tensor(is_weight)
                obs, actions, rewards, dones, new_obs, _ = batch

            # Handle heterogeneous observation sizes by splitting the batch
            # into per-agent observations.
            per_agent_obs = [obs[:, agent] for agent in range(self.env.n)]
            t_agent_obs = [
                torch.Tensor(list(per_agent_obs[agent][:]))
                for agent in range(self.env.n)
            ]
            obs = torch.cat(t_agent_obs, 1)
            per_agent_new_obs = [
                new_obs[:, agent] for agent in range(self.env.n)
            ]
            t_agent_new_obs = [
                torch.Tensor(list(per_agent_new_obs[agent][:]))
                for agent in range(self.env.n)
            ]
            new_obs = torch.cat(t_agent_new_obs, 1)
            actions = torch.Tensor(actions).clone()
            rewards = torch.Tensor(rewards)
            dones = torch.Tensor(dones).view(self.batch_size, -1)

            # Compute a(s_{t+1}) for each agent with its target actor.
            new_actions = []
            for agent in range(self.env.n):
                current_new_action = self.actors_targets[agent][subs[agent]](
                    t_agent_new_obs[agent]).clamp(self.min_action,
                                                  self.max_action)
                new_actions.append(current_new_action)
            new_actions = torch.stack(new_actions, 1)

            target_Q_input = torch.cat(
                (new_obs, new_actions.view(self.batch_size, -1)), 1)
            current_Q_input = torch.cat(
                (obs, actions.view(self.batch_size, -1)), 1)
            with torch.no_grad():
                target_Q = rewards[:, i].view(self.batch_size, -1) + (
                    self.gamma * self.critics_targets[i](target_Q_input) *
                    (1 - dones))
            current_Q = self.critics[i](current_Q_input)

            if self.replay_buffer_type == "per":
                # Refresh the priorities of the sampled transitions with the
                # importance-sampling-weighted squared TD errors.
                error = ((current_Q - target_Q)**2).reshape(
                    is_weight.shape) * is_weight
                error = error.detach().numpy()
                for sample_index in range(self.batch_size):
                    if self.subpolicies > 1:
                        self.replay_buffers[i][subs[i]].update(
                            index[sample_index], error[sample_index])
                    else:
                        self.replay_buffers.update(index[sample_index],
                                                   error[sample_index])

            critic_loss = F.mse_loss(current_Q, target_Q)
            critic_loss.backward()
            self.critics_optimisers[i].step()

            # Actor update: ascend the critic's value of the actor's action.
            self.actors_optimisers[i][subs[i]].zero_grad()
            t_agent_obs[i].requires_grad = True
            actions[:, i] = self.actors[i][subs[i]](t_agent_obs[i])
            _input = torch.cat(
                (obs.view(self.batch_size, -1),
                 actions.view(self.batch_size, -1)), 1)
            actor_loss = -self.critics[i](_input).mean()
            actor_loss.backward()
            self.actors_optimisers[i][subs[i]].step()

            # Update the frozen target models.
            for param, target_param in zip(
                    self.critics[i].parameters(),
                    self.critics_targets[i].parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
            for param, target_param in zip(
                    self.actors[i][subs[i]].parameters(),
                    self.actors_targets[i][subs[i]].parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)
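# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original implementation).
# It assumes a multiagent-particle-envs-style environment: `env.n` agents,
# reset() returning a list of per-agent observations, and step(actions)
# returning (next_obs, rewards, dones, info). The episode, step, and warm-up
# counts are hypothetical placeholders.
def train_maddpg_example(env, num_episodes=10000, max_steps=25,
                         warmup_steps=1024):
    obs_shapes = [env.observation_space[i].shape[0] for i in range(env.n)]
    maddpg = MADDPG(env, env_obs=obs_shapes, subpolicies=2,
                    replay_buffer_type="per")
    steps = 0
    for episode in range(num_episodes):
        # One subpolicy per agent is fixed for the whole episode.
        subs = [random.randrange(maddpg.subpolicies) for _ in range(env.n)]
        s = env.reset()
        for _ in range(max_steps):
            # Act randomly until the buffers hold enough samples to train on.
            if steps < warmup_steps:
                a = maddpg.random_act()
            else:
                a = maddpg.act(s, subs)
            s_next, r, d, _ = env.step(a)
            maddpg.push_sample(s, a, r, d, s_next, subs)
            if steps >= warmup_steps:
                maddpg.train(subs)
            s = s_next
            steps += 1
        maddpg.apply_noise_decay()
    return maddpg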
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, QNetwork):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            QNetwork: a class inheriting from torch.nn.Module that defines
                the structure of the neural network

        Module-level hyperparameters (LR, GAMMA, TAU, BUFFER_SIZE,
        BATCH_SIZE, UPDATE_EVERY, TRANSFER_EVERY, UPDATE_PRIORITY_EVERY,
        DEFAULT_PRIORITY, PRIORITY_FACTOR) and the `device` handle are
        assumed to be defined elsewhere in the repository.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        # When using dropout, the target network should be kept in eval mode.
        self.qnetwork_target.eval()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = PrioritizedReplayBuffer(seed, device, action_size,
                                              BUFFER_SIZE, BATCH_SIZE,
                                              DEFAULT_PRIORITY,
                                              PRIORITY_FACTOR)
        # Initialize the time steps (for updating every UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0
        self.up_step = 0
        # Controls the importance-sampling weight. As the network converges,
        # b should move toward 1.
        self.b = torch.tensor(1., device=device, requires_grad=False)
        self.b_decay = torch.tensor(0.00015,
                                    device=device,
                                    requires_grad=False)

    def act(self, state, eps=0.):
        """Returns actions for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(np.int_)
        else:
            return random.choice(np.arange(self.action_size)).astype(np.int_)

    def step(self, state, action, reward, next_state, done):
        # Save the experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % TRANSFER_EVERY
        self.u_step = (self.u_step + 1) % UPDATE_EVERY
        self.up_step = (self.up_step + 1) % UPDATE_PRIORITY_EVERY

        # Learn from experiences
        if len(self.memory) > BATCH_SIZE and self.u_step == 0:
            # Sample the experiences from memory based on their priority
            experiences = self.memory.sample()
            self.learn(experiences)

        # Transfer the knowledge from the local network to the fixed one
        if len(self.memory) > BATCH_SIZE and self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

        # Update the priorities in the memory to alter the sampling.
        # Ideally, this would happen before each sampling step, but for the
        # sake of performance it may be better to recalculate them less often.
        if len(self.memory) > 1 and self.up_step == 0:
            for experiences in self.memory.get_all_experiences(512):
                with torch.no_grad():
                    self.qnetwork_local.eval()
                    current_estimate, from_env = self.get_target_estimate(
                        experiences)
                    # Update the priorities based on the newly learned errors
                    self.memory.update(experiences[-1],
                                       (from_env - current_estimate).squeeze())

    def get_target_estimate(self, experiences):
        states, actions, rewards, next_states, dones, probabilities, selected = experiences
        with torch.no_grad():
            # Double DQN: select the best action with the local network and
            # evaluate it with the target network.
            best_actions = self.qnetwork_local(
                next_states).detach().max(1)[1].unsqueeze(1)
            evaluations = self.qnetwork_target(next_states).gather(
                1, best_actions)
            from_env = rewards + GAMMA * evaluations * (1 - dones)
        return self.qnetwork_local(states).gather(1, actions), from_env

    def learn(self, experiences):
        self.qnetwork_local.train()
        current_estimate, from_env = self.get_target_estimate(experiences)
        probabilities = experiences[-2]
        errors = (from_env - current_estimate)
        # Since the experiences were sampled with non-uniform probabilities,
        # they would bias the network; an importance-sampling weight
        # compensates for this.
        loss = (errors * errors / (len(self.memory) * probabilities) *
                self.b).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.b = min(self.b + self.b_decay, 1)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target
                 = θ_target + τ*(θ_local - θ_target)

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
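# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original implementation).
# It assumes a classic Gym-style API (reset() -> state, step(action) ->
# (next_state, reward, done, info)) and a QNetwork class defined elsewhere in
# the repository. The episode counts and epsilon schedule are hypothetical
# placeholders.
def train_dqn_example(env, QNetwork, n_episodes=2000, max_t=1000,
                      eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n,
                  seed=0,
                  QNetwork=QNetwork)
    eps = eps_start
    for episode in range(n_episodes):
        state = env.reset()
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            # step() stores the transition and triggers learning, target
            # transfers, and priority refreshes on their own schedules.
            agent.step(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Anneal epsilon toward its floor.
        eps = max(eps_end, eps_decay * eps)
    return agent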