import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# NOTE: QNetwork, DuelingQNetwork, Actor, Critic, ReplayBuffer, UOProcess and
# the torch `device` object are assumed to be defined in and imported from the
# project's other modules (networks, replay buffer, noise process).


class Agent:
    """Value-based agent supporting DQN, Double DQN and Dueling DDQN modes,
    trained from a (prioritised) replay buffer."""

    def __init__(self, params):
        action_size = params['action_size']
        state_size = params['state_size']
        buf_params = params['buf_params']
        nn_params = params['nn_params']

        # input layer matches the state size, output layer the action size
        nn_params['l1'][0] = state_size
        nn_params['l5'][1] = action_size

        self.__learning_mode = params['learning_mode']
        if self.__learning_mode['DuelingDDQN']:
            self.__qnetwork_local = DuelingQNetwork(nn_params).to(device)
            self.__qnetwork_target = DuelingQNetwork(nn_params).to(device)
        else:
            self.__qnetwork_local = QNetwork(nn_params).to(device)
            self.__qnetwork_target = QNetwork(nn_params).to(device)

        self.__action_size = action_size
        self.__state_size = state_size
        self.__memory = ReplayBuffer(buf_params)
        self.__t = 0

        self.eps = params['eps_initial']
        self.gamma = params['gamma']
        self.learning_rate = params['learning_rate']
        self.update_period = params['update_period']
        self.a = params['a']        # priority exponent
        self.b = params['b']        # importance-sampling exponent
        self.e = params['e']        # small constant added to priorities
        self.tau = params['tau']

        self.__optimiser = optim.Adam(self.__qnetwork_local.parameters(),
                                      self.learning_rate)

        # other parameters
        self.agent_loss = 0.0

    # Set methods
    def set_learning_rate(self, lr):
        self.learning_rate = lr
        for param_group in self.__optimiser.param_groups:
            param_group['lr'] = lr

    # Get methods
    def get_qlocal(self):
        return self.__qnetwork_local

    # Other methods
    def step(self, state, action, reward, next_state, done):
        # add experience to memory
        self.__memory.add(state, action, reward, next_state, done)

        # learn every `update_period` steps, once the buffer holds enough samples
        self.__t = (self.__t + 1) % self.update_period
        if not self.__t:
            if self.__memory.is_ready():
                experiences = self.__memory.sample()
                self.__update(experiences)

    def choose_action(self, state, mode='train'):
        # the state has to be transformed into a tensor before the forward pass
        if mode == 'train':
            # epsilon-greedy action selection
            if random.random() > self.eps:
                state = torch.from_numpy(state).float().unsqueeze(0).to(device)
                self.__qnetwork_local.eval()
                with torch.no_grad():
                    actions = self.__qnetwork_local(state)
                self.__qnetwork_local.train()
                return np.argmax(actions.cpu().data.numpy())
            else:
                return np.random.choice(np.arange(self.__action_size))
        elif mode == 'test':
            # greedy action selection
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.__qnetwork_local.eval()
            with torch.no_grad():
                actions = self.__qnetwork_local(state)
            self.__qnetwork_local.train()
            return np.argmax(actions.cpu().data.numpy())
        else:
            print("Invalid mode value")

    def __update(self, experiences):
        states, actions, rewards, next_states, dones, indices, probs = experiences

        # Compute and minimise the loss
        self.__optimiser.zero_grad()
        loss_fn = nn.MSELoss(reduction='none')

        if self.__learning_mode['DQN']:
            # plain DQN: bootstrap from the maximum target-network Q-value
            Q_target_next = self.__qnetwork_target.forward(next_states).max(
                1)[0].unsqueeze(1).detach()
        else:
            # Double DQN: the local network selects the action,
            # the target network evaluates it
            Q_target_next = self.__qnetwork_target.forward(next_states).gather(
                1, self.__qnetwork_local.forward(next_states).max(1)[1].unsqueeze(1)).detach()

        targets = rewards + self.gamma * Q_target_next * (1 - dones)
        outputs = self.__qnetwork_local.forward(states).gather(1, actions)
        loss = loss_fn(outputs, targets)

        # Calculate importance-sampling weights and normalise
        if probs:
            weights = [(prob * len(self.__memory)) ** (-self.b) for prob in probs]
            weights = np.array([w / max(weights) for w in weights]).reshape((-1, 1))
        else:
            weights = np.ones(loss.shape, dtype=np.float64)

        # Calculate weighted loss
        weighted_loss = torch.mean(torch.from_numpy(weights).float().to(device) * loss)
        weighted_loss.backward()
        self.__optimiser.step()

        # update the priorities of the sampled transitions
        if indices:
            self.__memory.update(
                indices,
                list(loss.detach().cpu().numpy().squeeze() ** self.a + self.e))

        self.__soft_update(self.__qnetwork_local, self.__qnetwork_target, self.tau)
        self.agent_loss = weighted_loss.detach().cpu().numpy().squeeze()

    def __soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
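# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original implementation).
# It shows how the Agent above could be driven by a Gym-style environment.
# The `env` object, the `buf_params` keys, the epsilon schedule and all
# concrete values below are assumptions made purely for illustration; only
# the top-level parameter keys read by Agent.__init__ come from the code above.
# ---------------------------------------------------------------------------
def example_dqn_training_loop(env, num_episodes=500):
    params = {
        'action_size': 4,
        'state_size': 37,
        'buf_params': {'buffer_size': int(1e5), 'batch_size': 64},
        'nn_params': {'l1': [37, 64], 'l2': [64, 64], 'l3': [64, 64],
                      'l4': [64, 64], 'l5': [64, 4]},
        'learning_mode': {'DQN': False, 'DuelingDDQN': True},
        'eps_initial': 1.0, 'gamma': 0.99, 'learning_rate': 5e-4,
        'update_period': 4, 'a': 0.6, 'b': 0.4, 'e': 1e-6, 'tau': 1e-3,
    }
    agent = Agent(params)
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.choose_action(state, mode='train')
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
        # decay epsilon between episodes (schedule chosen arbitrarily here)
        agent.eps = max(0.01, agent.eps * 0.995)
    return agent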
class AgentDDPG:
    """Single-agent DDPG: a deterministic actor trained through a critic,
    with Ornstein-Uhlenbeck exploration noise."""

    def __init__(self, params):
        action_size = params['action_size']
        state_size = params['state_size']
        buf_params = params['buf_params']
        nn_params = params['nn_params']

        # the actor maps states to actions; the critic scores (state, action) pairs
        nn_params['nn_actor']['l1'][0] = state_size
        nn_params['nn_actor']['l3'][1] = action_size
        nn_params['nn_critic']['l1'][0] = state_size + action_size

        self.__actor_local = Actor(nn_params['nn_actor']).to(device)
        self.__actor_target = Actor(nn_params['nn_actor']).to(device)
        self.__critic_local = Critic(nn_params['nn_critic']).to(device)
        self.__critic_target = Critic(nn_params['nn_critic']).to(device)

        self.__action_size = action_size
        self.__state_size = state_size
        self.__memory = ReplayBuffer(buf_params)
        self.__t = 0

        self.gamma = params['gamma']
        self.learning_rate_actor = params['learning_rate_actor']
        self.learning_rate_critic = params['learning_rate_critic']
        self.tau = params['tau']

        self.__optimiser_actor = optim.Adam(self.__actor_local.parameters(),
                                            self.learning_rate_actor)
        self.__optimiser_critic = optim.Adam(self.__critic_local.parameters(),
                                             self.learning_rate_critic)
        self.__uo_process = UOProcess()

        # other parameters
        self.agent_loss = 0.0

    # Set methods
    def set_learning_rate(self, lr_actor, lr_critic):
        self.learning_rate_actor = lr_actor
        self.learning_rate_critic = lr_critic
        for param_group in self.__optimiser_actor.param_groups:
            param_group['lr'] = lr_actor
        for param_group in self.__optimiser_critic.param_groups:
            param_group['lr'] = lr_critic

    # Get methods
    def get_actor(self):
        return self.__actor_local

    def get_critic(self):
        return self.__critic_local

    # Other methods
    def step(self, state, action, reward, next_state, done):
        # add experience to memory
        self.__memory.add(state, action, reward, next_state, done)
        if self.__memory.is_ready():
            experiences = self.__memory.sample()
            self.__update(experiences)

    def choose_action(self, state, mode='train'):
        if mode == 'train':
            # the state has to be transformed into a tensor
            state = torch.from_numpy(
                np.array(state)).float().unsqueeze(0).to(device)
            self.__actor_local.eval()
            with torch.no_grad():
                # add exploration noise to the deterministic policy output
                action = self.__actor_local(state) + self.__uo_process.sample()
            self.__actor_local.train()
            return list(np.clip(action.cpu().numpy().squeeze(), -1, 1))
        elif mode == 'test':
            # the state has to be transformed into a tensor
            state = torch.from_numpy(
                np.array(state)).float().unsqueeze(0).to(device)
            self.__actor_local.eval()
            with torch.no_grad():
                action = self.__actor_local(state)
            self.__actor_local.train()
            return list(np.clip(action.cpu().numpy().squeeze(), -1, 1))
        else:
            print("Invalid mode value")

    def reset(self, sigma):
        self.__uo_process.reset(sigma)

    def __update(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # update critic
        # ----------------------------------------------------------
        loss_fn = nn.MSELoss()
        self.__optimiser_critic.zero_grad()

        # form target
        next_actions = self.__actor_target(next_states)
        Q_target_next = self.__critic_target.forward(
            torch.cat((next_states, next_actions), dim=1)).detach()
        targets = rewards + self.gamma * Q_target_next * (1 - dones)

        # form output
        outputs = self.__critic_local.forward(
            torch.cat((states, actions), dim=1))

        mean_loss_critic = loss_fn(outputs, targets)
        mean_loss_critic.backward()
        self.__optimiser_critic.step()

        # update actor
        # ----------------------------------------------------------
        self.__optimiser_actor.zero_grad()
        predicted_actions = self.__actor_local(states)
        # minus sign because this is gradient ascent on the critic's value
        mean_loss_actor = -self.__critic_local.forward(
            torch.cat((states, predicted_actions), dim=1)).mean()
        mean_loss_actor.backward()
        self.__optimiser_actor.step()

        # update target networks
        self.__soft_update(self.__critic_local, self.__critic_target, self.tau)
        self.__soft_update(self.__actor_local, self.__actor_target, self.tau)

    @staticmethod
    def __soft_update(local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
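# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original implementation).
# It shows how AgentDDPG could be driven by a continuous-control, Gym-style
# environment. The `env` object, the `buf_params` keys, the noise sigma and
# every concrete value below are assumptions for illustration only; the
# top-level parameter keys match what AgentDDPG.__init__ reads.
# ---------------------------------------------------------------------------
def example_ddpg_training_loop(env, num_episodes=300):
    params = {
        'action_size': 4,
        'state_size': 33,
        'buf_params': {'buffer_size': int(1e6), 'batch_size': 128},
        'nn_params': {
            'nn_actor': {'l1': [33, 256], 'l2': [256, 128], 'l3': [128, 4]},
            'nn_critic': {'l1': [37, 256], 'l2': [256, 128], 'l3': [128, 1]},
        },
        'gamma': 0.99,
        'learning_rate_actor': 1e-4,
        'learning_rate_critic': 1e-3,
        'tau': 1e-3,
    }
    agent = AgentDDPG(params)
    for episode in range(num_episodes):
        state = env.reset()
        agent.reset(sigma=0.2)          # restart the OU exploration noise
        done = False
        while not done:
            action = agent.choose_action(state, mode='train')
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
    return agent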
class Agents:
    """Multi-agent DDPG (MADDPG): one actor per agent plus a single
    centralised critic that sees the observations and actions of all agents."""

    def __init__(self, params):
        action_size = params['action_size']
        state_size = params['state_size']
        buf_params = params['buf_params']
        num_agents = params['num_of_agents']
        nn_params = params['nn_params']

        nn_params['nn_actor']['l1'][0] = state_size
        nn_params['nn_actor']['l3'][1] = action_size
        # the centralised critic takes the concatenated states and actions of all agents
        nn_params['nn_critic']['l1'][0] = (state_size + action_size) * num_agents

        # NOTE: the actor lists (and the indexing in __update) are hard-coded
        # for two agents.
        self.__actors_local = [Actor(nn_params['nn_actor']).to(device),
                               Actor(nn_params['nn_actor']).to(device)]
        self.__actors_target = [Actor(nn_params['nn_actor']).to(device),
                                Actor(nn_params['nn_actor']).to(device)]
        self.__critic_local = Critic(nn_params['nn_critic']).to(device)
        self.__critic_target = Critic(nn_params['nn_critic']).to(device)

        self.__action_size = action_size
        self.__state_size = state_size
        self.__num_agents = num_agents
        self.__memory = ReplayBuffer(buf_params)
        self.__t = 0

        self.gamma = params['gamma']
        self.learning_rate_actor = params['learning_rate_actor']
        self.learning_rate_critic = params['learning_rate_critic']
        self.tau = params['tau']

        self.__optimisers_actor = [optim.Adam(self.__actors_local[0].parameters(),
                                              self.learning_rate_actor),
                                   optim.Adam(self.__actors_local[1].parameters(),
                                              self.learning_rate_actor)]
        self.__optimiser_critic = optim.Adam(self.__critic_local.parameters(),
                                             self.learning_rate_critic)
        self.__uo_process = UOProcess(shape=(self.__num_agents, self.__action_size))

        # other parameters
        self.agent_loss = 0.0

    # Set methods
    def set_learning_rate(self, lr_actor, lr_critic):
        self.learning_rate_actor = lr_actor
        self.learning_rate_critic = lr_critic
        for n in range(self.__num_agents):
            for param_group in self.__optimisers_actor[n].param_groups:
                param_group['lr'] = lr_actor
        for param_group in self.__optimiser_critic.param_groups:
            param_group['lr'] = lr_critic

    # Get methods
    def get_actor(self):
        return self.__actors_local

    def get_critic(self):
        return self.__critic_local

    # Other methods
    def step(self, state, action, reward, next_state, done):
        # add experience to memory
        self.__memory.add(state, action, reward, next_state, done)
        if self.__memory.is_ready():
            self.__update()

    def choose_action(self, states, mode='train'):
        if mode == 'train':
            # the states have to be transformed into a tensor
            states = torch.from_numpy(np.array(states)).float().to(device)
            actions = np.zeros((self.__num_agents, self.__action_size))
            for i, actor in enumerate(self.__actors_local):
                state = states[i, :]
                actor.eval()
                with torch.no_grad():
                    action = actor(state)
                actor.train()
                actions[i, :] = action.cpu().numpy()
            # add exploration noise to every agent's action
            actions += np.array(self.__uo_process.sample())
            return np.clip(actions, -1, 1)
        elif mode == 'test':
            # the states have to be transformed into a tensor
            states = torch.from_numpy(np.array(states)).float().to(device)
            actions = np.zeros((self.__num_agents, self.__action_size))
            for i, actor in enumerate(self.__actors_local):
                state = states[i, :]
                actor.eval()
                with torch.no_grad():
                    action = actor(state)
                actions[i, :] = action.cpu().numpy()
            return np.clip(actions, -1, 1)
        else:
            print("Invalid mode value")

    def reset(self, sigma):
        self.__uo_process.reset(sigma)

    def __update(self):
        for i in range(self.__num_agents):
            # update critic
            # ----------------------------------------------------------
            states, actions, rewards, next_states, dones = self.__memory.sample()
            states_i = states[:, i, :]
            actions_i = actions[:, i, :]
            rewards_i = rewards[:, i]
            next_states_i = next_states[:, i, :]
            dones_i = dones[:, i]

            loss_fn = nn.MSELoss()
            self.__optimiser_critic.zero_grad()

            # form target: the target critic sees both agents' next states
            # and both target actors' next actions
            next_states_actions = torch.cat(
                (next_states[:, 0, :],
                 next_states[:, 1, :],
                 self.__actors_target[0].forward(next_states[:, 0, :]),
                 self.__actors_target[1].forward(next_states[:, 1, :])), dim=1)
            Q_target_next = self.__critic_target.forward(next_states_actions).detach()
            targets = rewards_i + self.gamma * Q_target_next[:, i] * (1 - dones_i)

            # form output
            states_actions = torch.cat((states[:, 0, :], states[:, 1, :],
                                        actions[:, 0, :], actions[:, 1, :]), dim=1)
            outputs = self.__critic_local.forward(states_actions)

            mean_loss_critic = loss_fn(outputs[:, i], targets)
            mean_loss_critic.backward()
            self.__optimiser_critic.step()

            # update actor
            # ----------------------------------------------------------
            self.__optimisers_actor[i].zero_grad()
            # replace only agent i's actions with its current policy output
            predicted_actions = actions.clone()
            predicted_actions[:, i, :] = self.__actors_local[i](states_i)
            # minus sign because this is gradient ascent on the critic's value
            mean_loss_actor = -self.__critic_local.forward(
                torch.cat((states[:, 0, :], states[:, 1, :],
                           predicted_actions[:, 0, :],
                           predicted_actions[:, 1, :]), dim=1))[:, i].mean()
            mean_loss_actor.backward()
            self.__optimisers_actor[i].step()

            # update target networks
            self.__soft_update(self.__critic_local, self.__critic_target, self.tau)
            self.__soft_update(self.__actors_local[i], self.__actors_target[i], self.tau)

    @staticmethod
    def __soft_update(local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
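# ---------------------------------------------------------------------------
# Minimal Ornstein-Uhlenbeck noise sketch (illustrative only, not the
# project's actual UOProcess implementation). It merely mirrors the interface
# used by the agents above: a constructor with an optional `shape`, a
# `reset(sigma)` method and a `sample()` method. The mu/theta defaults and
# the Euler-style update are assumptions.
# ---------------------------------------------------------------------------
class ExampleUOProcess:
    def __init__(self, shape=(1,), mu=0.0, theta=0.15, sigma=0.2):
        self.shape = shape
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.full(shape, mu, dtype=np.float64)

    def reset(self, sigma):
        # restart the process at its mean and set a new noise scale
        self.sigma = sigma
        self.state = np.full(self.shape, self.mu, dtype=np.float64)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated noise
        dx = (self.theta * (self.mu - self.state)
              + self.sigma * np.random.standard_normal(self.shape))
        self.state = self.state + dx
        return self.state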