import sys

import numpy as np
import torch
import torch.optim as optim

# NOTE: Policy, Critic, AgentMemory and the torch `device` object are provided by the
# repository's own modules; their import paths are not shown here, so those imports are
# left implicit.


class DDPGController:
    """ Deep learning agent based on Deep Deterministic Policy Gradient described in
    https://arxiv.org/pdf/1509.02971.pdf """

    def __init__(self, env, brain_name, config):
        """ Constructor method to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - Dictionary containing the following keys:
            - 'num_episodes', int, number of episodes to run the agent for
            - 'gamma', float, discount rate for future rewards
            - 'tau', float, rate for the soft update of the target network
            - 'max_memory', int, size of the replay buffer in number of samples
            - 'batch_size', int, size of the batches sampled to train the model on each update
            - 'update_every', int, update frequency, in number of steps
            - 'mlp_layers', int tuple, shape of the multilayer perceptron model
            - 'learning_rate', float, learning rate for the training of the model
            - 'state_size', int
            - 'action_size', int
            - 'num_agents', int, number of agents running in parallel in the environment
        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())
        self.trained_policy = Policy(config, self.state_size, self.action_size)
        self.target_policy = Policy(config, self.state_size, self.action_size)
        self.trained_critic = Critic(config, self.state_size, self.action_size)
        self.target_critic = Critic(config, self.state_size, self.action_size)
        # the target networks are never trained directly, they only receive soft updates
        self.target_policy.eval()
        self.target_critic.eval()
        self.memory = AgentMemory(
            ((self.num_agents, self.state_size),
             (self.num_agents, self.action_size),
             (self.num_agents, self.state_size),
             (self.num_agents, ),
             (self.num_agents, )),
            int(self.max_memory))
        self.scores = []
        self.critic_losses = []
        self.surrogates = []
        self.critic_optimizer = optim.Adam(
            self.trained_critic.parameters(), lr=config.learning_rate)
        self.policy_optimizer = optim.Adam(
            self.trained_policy.parameters(), lr=config.learning_rate)

    def solve(self):
        """ Main method to launch the environment loop """
        step = 1
        for i_episode in range(1, self.num_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            rewards = []
            surrogates = []
            critic_losses = []
            while True:
                action = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = env_info.rewards
                done = env_info.local_done
                self.memory.add((state, action, next_state, reward, done))
                state = next_state
                rewards.append(reward)
                if self.memory.size >= self.batch_size and not step % self.update_every:
                    surrogate_buffer, critic_loss = self.train()
                    surrogates.append(surrogate_buffer)
                    critic_losses.append(critic_loss)
                step += 1
                if np.any(done):
                    break
            self.scores.append(np.mean(np.sum(rewards, axis=0)))
            self.surrogates.append(np.mean(surrogates))
            self.critic_losses.append(np.mean(critic_losses))
            self.print_status(i_episode)
        return self.scores, self.surrogates, self.critic_losses

    def act(self, states):
        """ Based on states, returns the on-policy actions

        Parameter
        ---------
        states - float array shape=(num_agents, state_size)

        Return
        ---------
        Float array shape=(num_agents, action_size), chosen action
        """
        states = torch.from_numpy(states).float().to(device)
        self.trained_policy.eval()
        with torch.no_grad():
            actions = self.trained_policy(states)
        # TODO: add exploration noise
        return actions.cpu().data.numpy()

    def train(self):
        """ Training routine to update the policy and critic """
        states, actions, next_states, rewards, dones = self.memory.sample(self.batch_size)
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        rewards = torch.from_numpy(rewards).float().to(device)
        dones = torch.from_numpy(dones).float().to(device)

        # critic update: regress the trained critic towards the bootstrapped target values
        next_actions = self.target_policy(next_states)
        self.trained_critic.train()
        self.critic_optimizer.zero_grad()
        done_mask = 1 - dones
        target_states_values = (
            rewards
            + self.gamma * self.target_critic(next_states, next_actions) * done_mask)
        predicted_states_values = self.trained_critic(states, actions)
        critic_loss = torch.mean((target_states_values - predicted_states_values)**2)
        critic_loss.backward()
        self.critic_optimizer.step()

        # policy update: maximise the critic's value of the policy's actions
        self.trained_policy.train()
        self.policy_optimizer.zero_grad()
        action_values = self.trained_critic(states, self.trained_policy(states))
        surrogate = -torch.mean(action_values)
        surrogate.backward()
        self.policy_optimizer.step()

        self.target_network_update(self.trained_critic, self.target_critic)
        self.target_network_update(self.trained_policy, self.target_policy)
        return surrogate.cpu().data.numpy(), critic_loss.cpu().data.numpy()

    def target_network_update(self, trained_model, target_model):
        """ Performs a soft update with rate tau from the trained_model to the target_model. """
        target_model_weights = target_model.get_weights()
        train_model_weights = trained_model.get_weights()
        new_weights = []
        for w1, w2 in zip(target_model_weights, train_model_weights):
            new_weights.append(w1 * (1 - self.tau) + w2 * self.tau)
        target_model.set_weights(new_weights)

    def print_status(self, i_episode):
        """ Print the latest status of the agent

        Parameter
        ---------
        i_episode, int
        """
        print(
            "\rEpisode %d/%d | Average Score: %.2f | Surrogate: %.5f | Critic loss: %.5f "
            % (i_episode, self.num_episodes, self.scores[-1],
               self.surrogates[-1], self.critic_losses[-1]),
            end="")
        sys.stdout.flush()
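
# The act() method above leaves exploration noise as a TODO. The sketch below shows the
# Ornstein-Uhlenbeck process suggested for continuous-action exploration in the DDPG
# paper. It is not wired into DDPGController; the class name and parameter defaults
# (mu=0, theta=0.15, sigma=0.2, the values used in the paper) are illustrative
# assumptions rather than code taken from this repository.
class OrnsteinUhlenbeckNoise:
    """ Temporally correlated noise process for exploration in continuous action spaces. """

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """ Reset the internal state of the process to the mean. """
        self.state = np.copy(self.mu)

    def sample(self):
        """ Advance the process by one step and return the new noise sample. """
        dx = (self.theta * (self.mu - self.state)
              + self.sigma * np.random.standard_normal(self.state.shape))
        self.state = self.state + dx
        return self.state

# A typical (hypothetical) integration would add `noise.sample()` to the policy output
# inside act(), clipping to the action bounds if the environment requires it, and call
# `noise.reset()` at the start of every episode.
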
class PPOController:
    """ Deep learning agent based on Proximal Policy Optimization, following
    https://arxiv.org/pdf/1506.02438.pdf """

    def __init__(self, env, brain_name, config, policy=None, critic=None):
        """ Constructor method to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - Dictionary containing the following keys:
            - 'num_episodes', int, number of episodes to run the agent for
            - 'epsilon_start', float, initial value for epsilon used in the PPO algorithm to clip the surrogate
            - 'epsilon_decay', float, rate of decay for epsilon, applied after every episode
            - 'gamma', float, discount rate for future rewards
            - 'tau', float, rate for the soft update of the target network
            - 'max_memory', int, size of the replay buffer in number of samples
            - 'update_every', int, update frequency, in number of steps
            - 'train_iterations', int, number of training passes over a data batch
            - 'mlp_layers', int tuple, shape of the multilayer perceptron model
            - 'learning_rate', float, learning rate for the training of the model
            - 'std', float, standard deviation used for the Normal distribution of the policy
            - 'state_size', int
            - 'action_size', int
            - 'num_agents', int, number of agents running in parallel in the environment
        policy - optional, used to pass a mock policy for testing purposes
        critic - optional, used to pass a mock critic for testing purposes
        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())
        self.policy = Policy(
            config, self.state_size, self.action_size) if policy is None else policy
        self.trained_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic = Critic(
            config, self.state_size) if critic is None else critic
        # the target critic is never trained directly, it only receives soft updates
        self.target_critic.eval()
        self.memory = AgentMemory(
            ((self.num_agents, self.state_size),
             (self.num_agents, self.action_size),
             (self.num_agents, ),
             (self.num_agents, self.state_size),
             (self.num_agents, ),
             (self.num_agents, )),
            int(self.max_memory))
        self.epsilon = config.epsilon_start
        self.scores = []
        self.surrogates = []
        self.optimizer = optim.Adam(
            [{'params': self.policy.parameters()},
             {'params': self.trained_critic.parameters()}],
            lr=config.learning_rate)

    def solve(self):
        """ Main method to launch the environment loop """
        step = 1
        for i_episode in range(1, self.num_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            rewards = []
            surrogates = []
            while True:
                action, log_probability = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = env_info.rewards
                done = env_info.local_done
                self.memory.add(
                    (state, action, log_probability, next_state, reward, done))
                state = next_state
                rewards.append(reward)
                if not step % self.update_every:
                    surrogate_buffer = self.train_loop()
                    surrogates.append(surrogate_buffer)
                step += 1
                if np.any(done):
                    break
            self.scores.append(np.mean(np.sum(rewards, axis=0)))
            self.surrogates.append(np.mean(surrogates))
            self.epsilon *= self.epsilon_decay
            self.print_status(i_episode)
        return self.scores, self.surrogates

    def act(self, states):
        """ Based on states, returns the on-policy actions

        Parameter
        ---------
        states - float array shape=(num_agents, state_size)

        Return
        ---------
        Float array shape=(num_agents, action_size), chosen actions, and
        float array shape=(num_agents, ), log-probabilities of those actions
        """
        states = torch.from_numpy(states).float().to(device)
        self.policy.eval()
        actions, log_probabilities = self.policy.next_actions(states)
        return actions.cpu().data.numpy(), log_probabilities.cpu().data.numpy()

    def train_loop(self):
        """ Training routine to update the policy and critic """
        surrogate_buffer = []
        states, actions, old_log_probabilities, next_states, rewards, dones = self.memory.get_latest(
            self.update_every)
        future_rewards = self.compute_discounted_future_rewards(rewards)
        old_log_probabilities = torch.from_numpy(old_log_probabilities).float().to(device)
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        future_rewards = torch.from_numpy(future_rewards).float().to(device)
        dones = torch.from_numpy(dones).bool().to(device)
        self.policy.train()
        self.trained_critic.train()
        for _ in range(self.train_iterations):
            surrogate = self.compute_surrogate(old_log_probabilities, states, actions,
                                               next_states, future_rewards, dones)
            surrogate_buffer.append(surrogate.cpu().data.numpy())
            self.optimizer.zero_grad()
            surrogate.backward()
            self.optimizer.step()
        self.target_network_update()
        return surrogate_buffer

    def compute_surrogate(self, old_log_probabilities, states, actions, next_states,
                          future_rewards, dones):
        """ Compute the surrogate, i.e. the function optimized at training time

        Parameters
        ----------
        - old_log_probabilities, float Tensor shape=(batch_size, num_agents), log-probabilities
          of the performed actions under the policy that generated them
        - states, float Tensor shape=(batch_size, num_agents, state_size)
        - actions, float Tensor shape=(batch_size, num_agents, action_size)
        - next_states, float Tensor shape=(batch_size, num_agents, state_size)
        - future_rewards, float Tensor shape=(batch_size, num_agents), discounted sum of future
          rewards over the length of the trajectory
        - dones, bool Tensor shape=(batch_size, num_agents)

        Return
        ---------
        Surrogate, float Tensor
        """
        new_log_probabilities, entropy = self.policy.get_log_probabilities_and_entropy(
            states, actions)
        ratio = torch.exp(new_log_probabilities - old_log_probabilities)
        with torch.no_grad():
            states_values = self.target_critic(states)
            next_states_values = self.target_critic(next_states[-1, :])
        if torch.any(dones):
            # the trajectory ends within this batch, so there is no value to bootstrap from
            final_states_values = 0
        else:
            final_states_values = next_states_values.expand(states_values.shape)
        future_rewards = self.normalize(future_rewards)
        discount = self.gamma**torch.arange(
            len(states_values), 0, -1, dtype=torch.float).unsqueeze(1)
        target_states_values = future_rewards + final_states_values * discount
        advantages = target_states_values - states_values
        clip = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)
        clipped_surrogate = torch.min(ratio * advantages, clip * advantages)
        return (-1 * torch.mean(clipped_surrogate)
                + 0.5 * self.trained_critic.mse(states_values, target_states_values)
                - 0.01 * entropy.mean())

    def normalize(self, a):
        """ Normalize a torch Tensor

        Parameters
        ----------
        - a, float Tensor to normalize
        """
        mean = torch.mean(a, -1)
        std = torch.std(a, -1)
        b = a
        mask = std != 0
        b[mask] = (a[mask] - mean[mask].unsqueeze(1)) / std[mask].unsqueeze(1)
        # if the deviation is null, set the normalized reward to 0
        mask = std == 0
        b[mask] = 0
        return b

    def compute_discounted_future_rewards(self, rewards):
        """ Compute the discounted sum of future rewards over the trajectory

        Parameters
        ----------
        - rewards, float array shape=(batch_size, num_agents)

        Return
        ----------
        Discounted future rewards, float array shape=(batch_size, num_agents)
        """
        # This is complex, so here is a running example with gamma = 0.5 and
        # rewards = [[1, 0],
        #            [1, 1]]
        main_dim = len(rewards)

        # discounts = [1, 0.5]
        discounts = (self.gamma**np.arange(main_dim))

        # discounts = [[1, 0.5],
        #              [1, 0.5]]
        discounts = np.tile(discounts, main_dim).reshape(main_dim, main_dim)

        # indexes = [[0, 1],
        #            [1, 2]]
        indexes = np.tile(np.arange(main_dim), main_dim).reshape(
            main_dim, main_dim) + np.arange(main_dim)[:, np.newaxis]

        # indexes = [[0, 1],
        #            [1, 0]]
        indexes = np.mod(indexes, main_dim)

        # discounts = [[1, 0.5],
        #              [0, 1]]
        discounts = np.triu(discounts[range(main_dim), indexes])

        # discounted future rewards = [[1.5, 0.5],
        #                              [1, 1]]
        return np.dot(discounts, rewards)

    def target_network_update(self):
        """ Performs a soft update with rate tau from the trained critic to the target critic. """
        target_model_weights = self.target_critic.get_weights()
        train_model_weights = self.trained_critic.get_weights()
        new_weights = []
        for w1, w2 in zip(target_model_weights, train_model_weights):
            new_weights.append(w1 * (1 - self.tau) + w2 * self.tau)
        self.target_critic.set_weights(new_weights)

    def print_status(self, i_episode):
        """ Print the latest status of the agent

        Parameter
        ---------
        i_episode, int
        """
        print(
            "\rEpisode %d/%d | Average Score: %.2f | Model surrogate: %.5f "
            % (i_episode, self.num_episodes, self.scores[-1], self.surrogates[-1]),
            end="")
        sys.stdout.flush()
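
# A minimal usage sketch, assuming the `unityagents` package used for Unity ML-Agents
# environments and a small config wrapper exposing both attribute access and as_dict(),
# which is what the controllers above expect. The RunConfig helper, the environment file
# name and every hyper-parameter value below are illustrative assumptions, not values
# taken from this repository.
if __name__ == "__main__":
    from unityagents import UnityEnvironment

    class RunConfig:
        """ Hypothetical stand-in for the repository's config object. """

        def __init__(self, **kwargs):
            self.__dict__.update(kwargs)

        def as_dict(self):
            return dict(self.__dict__)

    config = RunConfig(
        num_episodes=200,
        gamma=0.99,
        tau=1e-3,
        max_memory=1e5,
        batch_size=128,
        update_every=20,
        mlp_layers=(128, 128),
        learning_rate=1e-4,
        state_size=33,
        action_size=4,
        num_agents=20,
    )

    # load the environment binary (path is illustrative) and pick its default brain
    env = UnityEnvironment(file_name="Reacher.app")
    brain_name = env.brain_names[0]

    controller = DDPGController(env, brain_name, config)
    scores, surrogates, critic_losses = controller.solve()
    env.close()

    # PPOController can be driven the same way, given the additional config keys it
    # documents (epsilon_start, epsilon_decay, train_iterations, std).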