class DeepActorCriticAgent(mp.Process):
    def __init__(self, id, env_name, agent_params, env_params):
        """
        An Advantage Actor-Critic Agent that uses a Deep Neural Network to represent its Policy and the Value function
        :param id: An integer ID to identify the agent in case there are multiple agent instances
        :param env_name: Name/ID of the environment
        :param agent_params: Parameters to be used by the agent
        """
        super(DeepActorCriticAgent, self).__init__()
        self.id = id
        self.actor_name = "actor" + str(self.id)
        self.env_name = env_name
        self.params = agent_params
        self.env_conf = env_params
        self.policy = self.multi_variate_gaussian_policy
        self.gamma = self.params['gamma']
        self.trajectory = []  # Contains the trajectory of the agent as a sequence of Transitions
        self.rewards = []  # Contains the rewards obtained from the env at every step
        self.global_step_num = 0
        self.best_mean_reward = -float("inf")  # Agent's personal best mean episode reward
        self.best_reward = -float("inf")
        self.saved_params = False  # Whether or not the params have been saved along with the model to model_dir
        self.continuous_action_space = True  # Assumption by default unless env.action_space is Discrete

    def multi_variate_gaussian_policy(self, obs):
        """
        Calculates a multi-variate gaussian distribution over actions given observations
        :param obs: Agent's observation
        :return: policy, a distribution over actions for the given observation
        """
        mu, sigma = self.actor(obs)
        value = self.critic(obs)
        [mu[:, i].clamp_(float(self.env.action_space.low[i]), float(self.env.action_space.high[i]))
         for i in range(self.action_shape)]  # Clamp each dim of mu based on the (low, high) limits of that action dim
        sigma = torch.nn.Softplus()(sigma).squeeze() + 1e-7  # Let sigma be (smoothly) +ve
        self.mu = mu.to(torch.device("cpu"))
        self.sigma = sigma.to(torch.device("cpu"))
        self.value = value.to(torch.device("cpu"))
        if len(self.mu.shape) == 0:  # See if mu is a scalar
            #self.mu = self.mu.unsqueeze(0)  # This prevents MultivariateNormal from crashing with SIGFPE
            self.mu.unsqueeze_(0)
        self.action_distribution = MultivariateNormal(self.mu, torch.eye(self.action_shape) * self.sigma,
                                                      validate_args=True)
        return self.action_distribution

    def discrete_policy(self, obs):
        """
        Calculates a discrete/categorical distribution over actions given observations
        :param obs: Agent's observation
        :return: policy, a distribution over actions for the given observation
        """
        logits = self.actor(obs)
        value = self.critic(obs)
        self.logits = logits.to(torch.device("cpu"))
        self.value = value.to(torch.device("cpu"))
        self.action_distribution = Categorical(logits=self.logits)
        return self.action_distribution

    def preproc_obs(self, obs):
        obs = np.array(obs)  # Obs could be lazy frames. So, force fetch before moving forward
        if len(obs.shape) == 3:
            # Reshape obs from (H x W x C) order to this order: C x W x H and resize to (C x 84 x 84)
            obs = np.reshape(obs, (obs.shape[2], obs.shape[1], obs.shape[0]))
            obs = np.resize(obs, (obs.shape[0], 84, 84))
        # Convert to torch Tensor, add a batch dimension, convert to float repr
        obs = torch.from_numpy(obs).unsqueeze(0).float()
        return obs

    def process_action(self, action):
        if self.continuous_action_space:
            [action[:, i].clamp_(float(self.env.action_space.low[i]), float(self.env.action_space.high[i]))
             for i in range(self.action_shape)]  # Limit the action to lie between the (low, high) limits of the env
        action = action.to(torch.device("cpu"))
        return action.numpy().squeeze(0)  # Convert to numpy ndarray, squeeze and remove the batch dimension

    def get_action(self, obs):
        obs = self.preproc_obs(obs)
        action_distribution = self.policy(obs)  # Call to self.policy(obs) also populates self.value with V(obs)
        value = self.value
        action = action_distribution.sample()
        log_prob_a = action_distribution.log_prob(action)
        action = self.process_action(action)
        self.trajectory.append(Transition(obs, value, action, log_prob_a))  # Construct the trajectory
        return action

    def calculate_n_step_return(self, n_step_rewards, final_state, done, gamma):
        """
        Calculates the n-step return for each state in the input-trajectory/n_step_transitions
        :param n_step_rewards: List of rewards for each step
        :param final_state: Final state in this n_step_transition/trajectory
        :param done: True if the final state is a terminal state; if not, False
        :return: The n-step return for each state in the n_step_transitions
        """
        g_t_n_s = list()
        with torch.no_grad():
            g_t_n = torch.tensor([[0]]).float() if done else self.critic(self.preproc_obs(final_state)).cpu()
            for r_t in n_step_rewards[::-1]:  # Reverse order; from r_tpn to r_t
                g_t_n = torch.tensor(r_t).float() + self.gamma * g_t_n
                g_t_n_s.insert(0, g_t_n)  # n-step returns inserted to the left to maintain correct index order
            return g_t_n_s

    def calculate_loss(self, trajectory, td_targets):
        """
        Calculates the critic and actor losses using the td_targets and self.trajectory
        :param td_targets:
        :return:
        """
        n_step_trajectory = Transition(*zip(*trajectory))
        v_s_batch = n_step_trajectory.value_s
        log_prob_a_batch = n_step_trajectory.log_prob_a
        actor_losses, critic_losses = [], []
        for td_target, critic_prediction, log_p_a in zip(td_targets, v_s_batch, log_prob_a_batch):
            td_err = td_target - critic_prediction
            actor_losses.append(-log_p_a * td_err)  # td_err is an unbiased estimate of the Advantage
            critic_losses.append(F.smooth_l1_loss(critic_prediction, td_target))
            #critic_loss.append(F.mse_loss(critic_pred, td_target))
        if self.params["use_entropy_bonus"]:
            actor_loss = torch.stack(actor_losses).mean() - self.action_distribution.entropy().mean()
        else:
            actor_loss = torch.stack(actor_losses).mean()
        critic_loss = torch.stack(critic_losses).mean()

        writer.add_scalar(self.actor_name + "/critic_loss", critic_loss, self.global_step_num)
        writer.add_scalar(self.actor_name + "/actor_loss", actor_loss, self.global_step_num)

        return actor_loss, critic_loss

    def learn(self, n_th_observation, done):
        if self.params["clip_rewards"]:
            self.rewards = np.sign(self.rewards).tolist()  # Clip rewards to -1, 0 or +1
        td_targets = self.calculate_n_step_return(self.rewards, n_th_observation, done, self.gamma)
        actor_loss, critic_loss = self.calculate_loss(self.trajectory, td_targets)

        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.trajectory.clear()
        self.rewards.clear()

    def save(self):
        model_file_name = self.params["model_dir"] + "A2C_" + self.env_name + ".ptm"
        agent_state = {"Actor": self.actor.state_dict(),
                       "Critic": self.critic.state_dict(),
                       "best_mean_reward": self.best_mean_reward,
                       "best_reward": self.best_reward}
        torch.save(agent_state, model_file_name)
        print("Agent's state is saved to", model_file_name)
        # Export the params used if not exported already
        if not self.saved_params:
            params_manager.export_agent_params(model_file_name + ".agent_params")
            print("The parameters have been saved to", model_file_name + ".agent_params")
            self.saved_params = True

    def load(self):
        model_file_name = self.params["model_dir"] + "A2C_" + self.env_name + ".ptm"
        agent_state = torch.load(model_file_name, map_location=lambda storage, loc: storage)
        self.actor.load_state_dict(agent_state["Actor"])
        self.critic.load_state_dict(agent_state["Critic"])
        self.actor.to(device)
        self.critic.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Loaded Advantage Actor-Critic model state from", model_file_name,
              " which fetched a best mean reward of:", self.best_mean_reward,
              " and an all time best reward of:", self.best_reward)

    def run(self):
        # If a custom useful_region configuration for this environment ID is available, use it; if not, use the Default.
        # Currently this is utilized only for the Atari envs. Follows the same procedure as in Chapter 6
        custom_region_available = False
        for key, value in self.env_conf['useful_region'].items():
            if key in args.env:
                self.env_conf['useful_region'] = value
                custom_region_available = True
                break
        if custom_region_available is not True:
            self.env_conf['useful_region'] = self.env_conf['useful_region']['Default']

        atari_env = False
        for game in Atari.get_games_list():
            if game in args.env.lower():
                atari_env = True
        if atari_env:  # Use the Atari wrappers (like we did in Chapter 6) if it's an Atari env
            self.env = Atari.make_env(self.env_name, self.env_conf)
        else:
            #print("Given environment name is not an Atari Env. Creating a Gym env")
            self.env = gym.make(self.env_name)

        self.state_shape = self.env.observation_space.shape
        if isinstance(self.env.action_space.sample(), int):  # Discrete action space
            self.action_shape = self.env.action_space.n
            self.policy = self.discrete_policy
            self.continuous_action_space = False
        else:  # Continuous action space
            self.action_shape = self.env.action_space.shape[0]
            self.policy = self.multi_variate_gaussian_policy
        self.critic_shape = 1

        if len(self.state_shape) == 3:  # Screen image is the input to the agent
            if self.continuous_action_space:
                self.actor = DeepActor(self.state_shape, self.action_shape, device).to(device)
            else:  # Discrete action space
                self.actor = DeepDiscreteActor(self.state_shape, self.action_shape, device).to(device)
            self.critic = DeepCritic(self.state_shape, self.critic_shape, device).to(device)
        else:  # Input is a (single dimensional) vector
            if self.continuous_action_space:
                #self.actor_critic = ShallowActorCritic(self.state_shape, self.action_shape, 1, self.params).to(device)
                self.actor = ShallowActor(self.state_shape, self.action_shape, device).to(device)
            else:  # Discrete action space
                self.actor = ShallowDiscreteActor(self.state_shape, self.action_shape, device).to(device)
            self.critic = ShallowCritic(self.state_shape, self.critic_shape, device).to(device)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.params["learning_rate"])
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.params["learning_rate"])

        # Handle loading and saving of trained Agent models
        episode_rewards = list()
        prev_checkpoint_mean_ep_rew = self.best_mean_reward
        num_improved_episodes_before_checkpoint = 0  # To keep track of the num of ep with higher perf to save model
        #print("Using agent_params:", self.params)
        if self.params['load_trained_model']:
            try:
                self.load()
                prev_checkpoint_mean_ep_rew = self.best_mean_reward
            except FileNotFoundError:
                if args.test:  # Test a saved model
                    print("FATAL: No saved model found. Cannot test. Press any key to train from scratch")
                    input()
                else:
                    print("WARNING: No trained model found for this environment. Training from scratch.")

        for episode in range(self.params["max_num_episodes"]):
            obs = self.env.reset()
            done = False
            ep_reward = 0.0
            step_num = 0
            while not done:
                action = self.get_action(obs)
                next_obs, reward, done, _ = self.env.step(action)
                self.rewards.append(reward)
                ep_reward += reward
                step_num += 1

                if not args.test and (step_num >= self.params["learning_step_thresh"] or done):
                    self.learn(next_obs, done)
                    step_num = 0
                    # Monitor performance and save Agent's state when perf improves
                    if done:
                        episode_rewards.append(ep_reward)
                        if ep_reward > self.best_reward:
                            self.best_reward = ep_reward
                        if np.mean(episode_rewards) > prev_checkpoint_mean_ep_rew:
                            num_improved_episodes_before_checkpoint += 1
                        if num_improved_episodes_before_checkpoint >= self.params["save_freq_when_perf_improves"]:
                            prev_checkpoint_mean_ep_rew = np.mean(episode_rewards)
                            self.best_mean_reward = np.mean(episode_rewards)
                            self.save()
                            num_improved_episodes_before_checkpoint = 0

                obs = next_obs
                self.global_step_num += 1
                if args.render:
                    self.env.render()
                #print(self.actor_name + ":Episode#:", episode, "step#:", step_num, "\t rew=", reward, end="\r")
                writer.add_scalar(self.actor_name + "/reward", reward, self.global_step_num)
            print("{}:Episode#:{} \t ep_reward:{} \t mean_ep_rew:{}\t best_ep_reward:{}"
                  .format(self.actor_name, episode, ep_reward, np.mean(episode_rewards), self.best_reward))
            writer.add_scalar(self.actor_name + "/ep_reward", ep_reward, self.global_step_num)
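The listing above leans on a handful of module-level definitions that live in the surrounding script: the Transition namedtuple, the args, writer, params_manager and device globals, and the actor/critic network classes. The following is a minimal, hypothetical sketch of how those pieces might fit together and how several of these process-based agents could be launched; the Transition fields value_s and log_prob_a are the names actually accessed in calculate_loss(), while the other two field names, the parameter values, and the environment name are placeholder assumptions rather than the book's actual settings.

from collections import namedtuple
import torch.multiprocessing as mp  # Assumed to be the mp referenced by the listing

# Trajectory record stored by get_action(); value_s and log_prob_a are accessed by name
# in calculate_loss(), the first and third field names are assumptions.
Transition = namedtuple("Transition", ["s", "value_s", "a", "log_prob_a"])

if __name__ == "__main__":
    # Placeholder values for the keys the class reads; args, writer, params_manager,
    # device and the actor/critic network classes are assumed to exist at module level.
    agent_params = {"gamma": 0.99, "learning_rate": 1e-4, "learning_step_thresh": 5,
                    "max_num_episodes": 10000, "use_entropy_bonus": True, "clip_rewards": False,
                    "load_trained_model": False, "save_freq_when_perf_improves": 10,
                    "model_dir": "trained_models/"}
    env_params = {"useful_region": {"Default": None}}  # Minimal stand-in for the env config

    mp.set_start_method("spawn")  # Safer start method when CUDA may be used in child processes
    agents = [DeepActorCriticAgent(i, "Pendulum-v0", agent_params, env_params) for i in range(4)]
    [agent.start() for agent in agents]  # Each process builds its own env, networks and optimizers in run()
    [agent.join() for agent in agents]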
class DeepActorCriticAgent(mp.Process):
    def __init__(self, id, env_name, agent_params):
        """
        An Actor-Critic Agent that uses a Deep Neural Network to represent its Policy and the Value function
        :param id: An integer ID to identify the agent in case there are multiple agent instances
        :param env_name: Name/ID of the environment
        :param agent_params: Parameters to be used by the agent
        """
        super(DeepActorCriticAgent, self).__init__()
        self.id = id
        self.actor_name = "actor" + str(self.id)
        self.env_name = env_name
        self.params = agent_params
        self.policy = self.multi_variate_gaussian_policy
        self.gamma = self.params['gamma']
        self.trajectory = []  # Contains the trajectory of the agent as a sequence of Transitions
        self.rewards = []  # Contains the rewards obtained from the env at every step
        self.global_step_num = 0
        self.best_mean_reward = -float("inf")  # Agent's personal best mean episode reward
        self.best_reward = -float("inf")

    def multi_variate_gaussian_policy(self, obs):
        """
        Calculates a multi-variate gaussian distribution over actions given observations
        :param obs: Agent's observation
        :return: policy, a distribution over actions for the given observation
        """
        mu, sigma = self.actor(obs)
        value = self.critic(obs)
        [mu[:, i].clamp_(float(self.env.action_space.low[i]), float(self.env.action_space.high[i]))
         for i in range(self.action_shape)]  # Clamp each dim of mu based on the (low, high) limits of that action dim
        sigma = torch.nn.Softplus()(sigma).squeeze() + 1e-7  # Let sigma be (smoothly) +ve
        self.mu = mu.to(torch.device("cpu"))
        self.sigma = sigma.to(torch.device("cpu"))
        self.value = value.to(torch.device("cpu"))
        if len(self.mu.shape) == 0:  # See if mu is a scalar
            #self.mu = self.mu.unsqueeze(0)  # This prevents MultivariateNormal from crashing with SIGFPE
            self.mu.unsqueeze_(0)
        self.action_distribution = MultivariateNormal(self.mu, torch.eye(self.action_shape) * self.sigma,
                                                      validate_args=True)
        return self.action_distribution

    def preproc_obs(self, obs):
        if len(obs.shape) == 3:
            # Make sure the obs are in this order: C x W x H and add a batch dimension
            obs = np.reshape(obs, (obs.shape[2], obs.shape[1], obs.shape[0]))
            obs = np.resize(obs, (3, 84, 84))
        # Convert to torch Tensor, add a batch dimension, convert to float repr
        obs = torch.from_numpy(obs).unsqueeze(0).float()
        return obs

    def process_action(self, action):
        [action[:, i].clamp_(float(self.env.action_space.low[i]), float(self.env.action_space.high[i]))
         for i in range(self.action_shape)]  # Limit the action to lie between the (low, high) limits of the env
        action = action.to(torch.device("cpu"))
        return action.numpy().squeeze(0)  # Convert to numpy ndarray, squeeze and remove the batch dimension

    def get_action(self, obs):
        obs = self.preproc_obs(obs)
        action_distribution = self.policy(obs)  # Call to self.policy(obs) also populates self.value with V(obs)
        value = self.value
        action = action_distribution.sample()
        log_prob_a = action_distribution.log_prob(action)
        action = self.process_action(action)
        self.trajectory.append(Transition(obs, value, action, log_prob_a))  # Construct the trajectory
        return action

    def calculate_n_step_return(self, n_step_rewards, final_state, done, gamma):
        """
        Calculates the n-step return for each state in the input-trajectory/n_step_transitions
        :param n_step_rewards: List of rewards for each step
        :param final_state: Final state in this n_step_transition/trajectory
        :param done: True if the final state is a terminal state; if not, False
        :return: The n-step return for each state in the n_step_transitions
        """
        g_t_n_s = list()
        with torch.no_grad():
            g_t_n = torch.tensor([[0]]).float() if done else self.critic(self.preproc_obs(final_state)).cpu()
            for r_t in n_step_rewards[::-1]:  # Reverse order; from r_tpn to r_t
                g_t_n = torch.tensor(r_t).float() + self.gamma * g_t_n
                g_t_n_s.insert(0, g_t_n)  # n-step returns inserted to the left to maintain correct index order
            return g_t_n_s

    def calculate_loss(self, trajectory, td_targets):
        """
        Calculates the critic and actor losses using the td_targets and self.trajectory
        :param td_targets:
        :return:
        """
        n_step_trajectory = Transition(*zip(*trajectory))
        v_s_batch = n_step_trajectory.value_s
        log_prob_a_batch = n_step_trajectory.log_prob_a
        actor_loss, critic_loss = [], []
        for td_target, critic_prediction, log_p_a in zip(td_targets, v_s_batch, log_prob_a_batch):
            td_err = td_target - critic_prediction
            actor_loss.append(-log_p_a * td_err)  # td_err is an unbiased estimate of the Advantage
            critic_loss.append(F.smooth_l1_loss(critic_prediction, td_target))
            #critic_loss.append(F.mse_loss(critic_pred, td_target))
        actor_loss = torch.stack(actor_loss).mean()
        critic_loss = torch.stack(critic_loss).mean()

        writer.add_scalar(self.actor_name + "/critic_loss", critic_loss, self.global_step_num)
        writer.add_scalar(self.actor_name + "/actor_loss", actor_loss, self.global_step_num)

        return actor_loss, critic_loss

    def learn(self, n_th_observation, done):
        td_targets = self.calculate_n_step_return(self.rewards, n_th_observation, done, self.gamma)
        actor_loss, critic_loss = self.calculate_loss(self.trajectory, td_targets)

        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.trajectory.clear()
        self.rewards.clear()

    def save(self):
        model_file_name = self.params["model_dir"] + "A2C_" + self.env_name + ".ptm"
        agent_state = {"Actor": self.actor.state_dict(),
                       "Critic": self.critic.state_dict(),
                       "best_mean_reward": self.best_mean_reward,
                       "best_reward": self.best_reward}
        torch.save(agent_state, model_file_name)
        print("Agent's state is saved to", model_file_name)

    def load(self):
        model_file_name = self.params["model_dir"] + "A3C_" + self.env_name + ".ptm"
        agent_state = torch.load(model_file_name, map_location=lambda storage, loc: storage)
        self.actor.load_state_dict(agent_state["Actor"])
        self.critic.load_state_dict(agent_state["Critic"])
        self.actor.to(device)
        self.critic.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Loaded Advantage Actor-Critic model state from", model_file_name,
              " which fetched a best mean reward of:", self.best_mean_reward,
              " and an all time best reward of:", self.best_reward)

    def run(self):
        self.env = gym.make(self.env_name)
        self.state_shape = self.env.observation_space.shape
        self.action_shape = self.env.action_space.shape[0]
        self.critic_shape = 1
        if len(self.state_shape) == 3:  # Screen image is the input to the agent
            self.actor = DeepActor(self.state_shape, self.action_shape, device).to(device)
            self.critic = DeepCritic(self.state_shape, self.critic_shape, device).to(device)
        else:  # Input is a (single dimensional) vector
            #self.actor_critic = ShallowActorCritic(self.state_shape, self.action_shape, 1, self.params).to(device)
            self.actor = ShallowActor(self.state_shape, self.action_shape, device).to(device)
            self.critic = ShallowCritic(self.state_shape, self.critic_shape, device).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-3)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=1e-3)

        # Handle loading and saving of trained Agent models
        episode_rewards = list()
        prev_checkpoint_mean_ep_rew = self.best_mean_reward
        num_improved_episodes_before_checkpoint = 0  # To keep track of the num of ep with higher perf to save model
        #print("Using agent_params:", self.params)
        if self.params['load_trained_model']:
            try:
                self.load()
                prev_checkpoint_mean_ep_rew = self.best_mean_reward
            except FileNotFoundError:
                print("WARNING: No trained model found for this environment. Training from scratch.")

        for episode in range(self.params["max_num_episodes"]):
            obs = self.env.reset()
            done = False
            ep_reward = 0.0
            step_num = 0
            while not done:
                action = self.get_action(obs)
                next_obs, reward, done, _ = self.env.step(action)
                self.rewards.append(reward)
                step_num += 1

                if step_num >= self.params["learning_step_thresh"] or done:
                    self.learn(next_obs, done)
                    step_num = 0
                    # Monitor performance and save Agent's state when perf improves
                    if done:
                        cum_reward = np.sum(self.rewards)
                        episode_rewards.append(cum_reward)
                        if cum_reward > self.best_reward:
                            self.best_reward = cum_reward
                        if np.mean(episode_rewards) > prev_checkpoint_mean_ep_rew:
                            num_improved_episodes_before_checkpoint += 1
                        if num_improved_episodes_before_checkpoint >= self.params["save_freq_when_perf_improves"]:
                            prev_checkpoint_mean_ep_rew = np.mean(episode_rewards)
                            self.best_mean_reward = np.mean(episode_rewards)
                            self.save()
                            num_improved_episodes_before_checkpoint = 0

                obs = next_obs
                ep_reward += reward
                self.global_step_num += 1
                #print(self.actor_name + ":Episode#:", episode, "step#:", step_num, "\t rew=", reward, end="\r")
                writer.add_scalar(self.actor_name + "/reward", reward, self.global_step_num)
            print(self.actor_name + ":Episode#:", episode, "\t ep_reward=", ep_reward)
            writer.add_scalar(self.actor_name + "/ep_reward", ep_reward, self.global_step_num)
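To make the n-step return recursion in calculate_n_step_return() concrete, here is a tiny standalone example with made-up numbers: the return of the final state bootstraps from the critic's value estimate (or 0 if the episode terminated), and each earlier return is built backwards as g = r + gamma * g.

import torch

# Standalone illustration (not part of the agent) of the backward n-step return recursion.
gamma = 0.99
n_step_rewards = [1.0, 0.0, 2.0]          # r_t, r_{t+1}, r_{t+2} (made-up numbers)
bootstrap_value = torch.tensor([[10.0]])  # stand-in for critic(final_state); would be 0 if done

g = bootstrap_value
returns = []
for r in n_step_rewards[::-1]:            # reverse order, exactly as in the method
    g = torch.tensor(r).float() + gamma * g
    returns.insert(0, g)                  # keep returns aligned with the original step order
# returns[0] == r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + gamma^3*V(s_{t+3})
#            == 1.0 + 0.99*0.0 + 0.9801*2.0 + 0.970299*10.0 ≈ 12.663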
class DeepActorCriticAgent():
    def __init__(self, id, env_names, agent_params):
        """
        An Actor-Critic Agent that uses a Deep Neural Network to represent its Policy and the Value function
        :param id: An integer ID to identify the agent in case there are multiple agent instances
        :param env_names: List of environment names/IDs to run in parallel
        :param agent_params: Parameters to be used by the agent
        """
        super(DeepActorCriticAgent, self).__init__()
        self.id = id
        self.actor_name = "actor" + str(self.id)
        self.env_names = env_names
        self.params = agent_params
        self.policy = self.multi_variate_gaussian_policy
        self.gamma = self.params['gamma']
        self.trajectory = []  # Contains the trajectory of the agent as a sequence of Transitions
        self.rewards = []  # Contains the rewards obtained from the env at every step
        self.global_step_num = 0
        self.best_mean_reward = -float("inf")  # Agent's personal best mean episode reward
        self.best_reward = -float("inf")
        self.saved_params = False  # Whether or not the params have been saved along with the model to model_dir
        self.continuous_action_space = True  # Assumption by default unless env.action_space is Discrete

    def multi_variate_gaussian_policy(self, obs):
        """
        Calculates a multi-variate gaussian distribution over actions given observations
        :param obs: Agent's observation
        :return: policy, a distribution over actions for the given observation
        """
        mu, sigma = self.actor(obs)
        value = self.critic(obs).squeeze()
        [mu[:, i].clamp_(float(self.envs.action_space.low[i]), float(self.envs.action_space.high[i]))
         for i in range(self.action_shape)]  # Clamp each dim of mu based on the (low, high) limits of that action dim
        sigma = torch.nn.Softplus()(sigma) + 1e-7  # Let sigma be (smoothly) +ve
        self.mu = mu.to(torch.device("cpu"))
        self.sigma = sigma.to(torch.device("cpu"))
        self.value = value.to(torch.device("cpu"))
        if len(self.mu[0].shape) == 0:  # See if mu is a scalar
            self.mu = self.mu.unsqueeze(0)  # This prevents MultivariateNormal from crashing with SIGFPE
        self.covariance = torch.stack([torch.eye(self.action_shape) * s for s in self.sigma])
        if self.action_shape == 1:
            self.covariance = self.sigma.unsqueeze(-1)  # Make the covariance a square mat to avoid RuntimeError with MultivariateNormal
        self.action_distribution = MultivariateNormal(self.mu, self.covariance)
        return self.action_distribution

    def discrete_policy(self, obs):
        """
        Calculates a discrete/categorical distribution over actions given observations
        :param obs: Agent's observation
        :return: policy, a distribution over actions for the given observation
        """
        logits = self.actor(obs)
        value = self.critic(obs).squeeze()
        self.logits = logits.to(torch.device("cpu"))
        self.value = value.to(torch.device("cpu"))
        self.action_distribution = Categorical(logits=self.logits)
        return self.action_distribution

    def preproc_obs(self, obs):
        if len(obs[0].shape) == 3:  # shape of obs: (num_agents, obs_im_height, obs_im_width, obs_num_channels)
            # Reshape obs from (B x H x W x C) order to this order: B x C x W x H and resize to (B x C x 84 x 84)
            obs = np.reshape(obs, (-1, obs.shape[3], obs.shape[2], obs.shape[1]))
            # The environment wrapper already takes care of reshaping image obs into 84 x 84 x C. Can be skipped
            obs = np.resize(obs, (-1, obs.shape[1], 84, 84))
        # Convert to torch Tensor, convert to float repr
        obs = torch.from_numpy(obs).float()
        return obs

    def process_action(self, action):
        if self.continuous_action_space:
            [action[:, i].clamp_(float(self.envs.action_space.low[i]), float(self.envs.action_space.high[i]))
             for i in range(self.action_shape)]  # Limit the action to lie between the (low, high) limits of the env
        action = action.to(torch.device("cpu"))
        return action.numpy()

    def get_action(self, obs):
        obs = self.preproc_obs(obs)
        action_distributions = self.policy(obs)  # Call to self.policy(obs) also populates self.value with V(obs)
        value = self.value
        actions = action_distributions.sample()
        log_prob_a = action_distributions.log_prob(actions)
        actions = self.process_action(actions)
        # Store the n-step trajectory for learning. Skip storing the trajectory in test only mode
        if not self.params["test"]:
            self.trajectory.append(Transition(obs, value, actions, log_prob_a))  # Construct the trajectory
        return actions

    # TODO: rename num_agents to num_actors in parameters.json file to be consistent with comments
    def calculate_n_step_return(self, n_step_rewards, next_states, dones, gamma):
        """
        Calculates the n-step return for each state in the input-trajectory/n_step_transitions for the "done" actors
        :param n_step_rewards: List of length=num_steps containing rewards of shape=(num_actors x 1)
        :param next_states: List of length=num_actors containing next observations of shape=(obs_shape)
        :param dones: List of length=num_actors containing True if the next_state is a terminal state; if not, False
        :return: The n-step return for each state in the n_step_transitions
        """
        g_t_n_s = list()
        with torch.no_grad():
            # 1. Calculate next-state values for each actor:
            #    a. If next_state is terminal (done[actor_idx]=True), set g_t_n[actor_idx]=0
            #    b. If next_state is non-terminal (done[actor_idx]=False), set g_t_n[actor_idx] to Critic's prediction
            g_t_n = torch.tensor([[not d] for d in dones]).float()  # 1. a.
            # See if there is at least one non-terminal next-state
            if np.where([not d for d in dones])[0].size > 0:
                non_terminal_idxs = torch.tensor(np.where([not d for d in dones])).squeeze(0)
                g_t_n[non_terminal_idxs] = self.critic(self.preproc_obs(next_states[non_terminal_idxs])).cpu()  # 1. b.
            g_t_n_s_batch = []
            n_step_rewards = torch.stack(n_step_rewards)  # tensor of shape (num_steps x num_actors x 1)
            # For each actor
            for actor_idx in range(n_step_rewards.shape[1]):
                actor_n_step_rewards = n_step_rewards.index_select(1, torch.tensor([actor_idx]))  # shape:(num_steps, 1)
                g_t_n_s = []
                # Calculate n number of n-step returns
                for r_t in actor_n_step_rewards.numpy()[::-1]:  # Reverse order; from r_tpn to r_t; PyTorch can't slice in reverse #229
                    g_t_n[actor_idx] = torch.tensor(r_t).float() + self.gamma * g_t_n[actor_idx]
                    g_t_n_s.insert(0, g_t_n[actor_idx].clone())  # n-step returns inserted to the left to maintain correct index order
                g_t_n_s_batch.append(g_t_n_s)
            return torch.tensor(g_t_n_s_batch)  # tensor of shape:(num_actors, num_steps, 1)

    def calculate_loss(self, trajectory, td_targets):
        """
        Calculates the critic and actor losses using the td_targets and self.trajectory
        :param trajectory: List of trajectories from all the actors
        :param td_targets: Tensor of shape:(num_actors, num_steps, 1)
        :return:
        """
        n_step_trajectory = Transition(*zip(*trajectory))
        # n_step_trajectory.x returns a list of length=num_steps containing num_actors x shape_of_x items
        # 1. Create a tensor of shape:(num_steps x num_actors x shape_of_x) (using torch.stack())
        # 2. Reshape the tensor to be of shape:(num_actors x num_steps x shape_of_x) (using torch.transpose(1, 0))
        v_s_batch = torch.stack(n_step_trajectory.value_s).transpose(1, 0)  # shape:(num_actors, num_steps, 1)
        log_prob_a_batch = torch.stack(n_step_trajectory.log_prob_a).transpose(1, 0)  # shape:(num_actors, num_steps, 1)
        actor_losses, critic_losses = [], []
        for td_targets, critic_predictions, log_p_a in zip(td_targets, v_s_batch, log_prob_a_batch):
            td_err = td_targets - critic_predictions
            actor_losses.append(-log_p_a * td_err)  # td_err is an unbiased estimate of the Advantage
            critic_losses.append(F.smooth_l1_loss(critic_predictions, td_targets))
            #critic_loss.append(F.mse_loss(critic_pred, td_target))
        if self.params["use_entropy_bonus"]:
            actor_loss = torch.stack(actor_losses).mean() - self.action_distribution.entropy().mean()
        else:
            actor_loss = torch.stack(actor_losses).mean()
        critic_loss = torch.stack(critic_losses).mean()

        writer.add_scalar(self.actor_name + "/critic_loss", critic_loss, self.global_step_num)
        writer.add_scalar(self.actor_name + "/actor_loss", actor_loss, self.global_step_num)

        return actor_loss, critic_loss

    def learn(self, n_th_observations, dones):
        td_targets = self.calculate_n_step_return(self.rewards, n_th_observations, dones, self.gamma)
        actor_loss, critic_loss = self.calculate_loss(self.trajectory, td_targets)

        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.trajectory.clear()
        self.rewards.clear()

    def save(self):
        model_file_name = self.params["model_dir"] + "Batch-A2C_" + self.env_names[0] + ".ptm"
        agent_state = {"Actor": self.actor.state_dict(),
                       "Critic": self.critic.state_dict(),
                       "best_mean_reward": self.best_mean_reward,
                       "best_reward": self.best_reward}
        torch.save(agent_state, model_file_name)
        print("Agent's state is saved to", model_file_name)
        # Export the params used if not exported already
        if not self.saved_params:
            params_manager.export_agent_params(model_file_name + ".agent_params")
            print("The parameters have been saved to", model_file_name + ".agent_params")
            self.saved_params = True

    def load(self):
        model_file_name = self.params["model_dir"] + "Batch-A2C_" + self.env_names[0] + ".ptm"
        agent_state = torch.load(model_file_name, map_location=lambda storage, loc: storage)
        self.actor.load_state_dict(agent_state["Actor"])
        self.critic.load_state_dict(agent_state["Critic"])
        self.actor.to(device)
        self.critic.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Loaded Advantage Actor-Critic model state from", model_file_name,
              " which fetched a best mean reward of:", self.best_mean_reward,
              " and an all time best reward of:", self.best_reward)

    def run(self):
        self.envs = SubprocVecEnv(self.env_names)
        self.state_shape = self.envs.observation_space.shape
        if isinstance(self.envs.action_space.sample(), int):  # Discrete action space
            self.action_shape = self.envs.action_space.n
            self.policy = self.discrete_policy
            self.continuous_action_space = False
        else:  # Continuous action space
            self.action_shape = self.envs.action_space.shape[0]
            self.policy = self.multi_variate_gaussian_policy
        self.critic_shape = 1

        if len(self.state_shape) == 3:  # Screen image is the input to the agent
            if self.continuous_action_space:
                self.actor = DeepActor(self.state_shape, self.action_shape, device).to(device)
            else:  # Discrete action space
                self.actor = DeepDiscreteActor(self.state_shape, self.action_shape, device).to(device)
            self.critic = DeepCritic(self.state_shape, self.critic_shape, device).to(device)
        else:  # Input is a (single dimensional) vector
            if self.continuous_action_space:
                #self.actor_critic = ShallowActorCritic(self.state_shape, self.action_shape, 1, self.params).to(device)
                self.actor = ShallowActor(self.state_shape, self.action_shape, device).to(device)
            else:  # Discrete action space
                self.actor = ShallowDiscreteActor(self.state_shape, self.action_shape, device).to(device)
            self.critic = ShallowCritic(self.state_shape, self.critic_shape, device).to(device)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.params["learning_rate"])
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.params["learning_rate"])

        # Handle loading and saving of trained Agent models
        episode_rewards = list()
        prev_checkpoint_mean_ep_rew = self.best_mean_reward
        num_improved_episodes_before_checkpoint = 0  # To keep track of the num of ep with higher perf to save model
        #print("Using agent_params:", self.params)
        if self.params['load_trained_model']:
            try:
                self.load()
                prev_checkpoint_mean_ep_rew = self.best_mean_reward
            except FileNotFoundError:
                if args.test:  # Test a saved model
                    print("FATAL: No saved model found. Cannot test. Press any key to train from scratch")
                    input()
                else:
                    print("WARNING: No trained model found for this environment. Training from scratch.")

        #for episode in range(self.params["max_num_episodes"]):
        obs = self.envs.reset()
        # TODO: Create appropriate masks to take care of envs that have set dones to True & learn() accordingly
        episode = 0
        cum_step_rewards = np.zeros(self.params["num_agents"])
        episode_rewards = []
        step_num = 0
        while True:
            action = self.get_action(obs)
            next_obs, rewards, dones, _ = self.envs.step(action)
            self.rewards.append(torch.tensor(rewards))
            done_env_idxs = np.where(dones)[0]
            cum_step_rewards += rewards  # nd-array of shape=num_actors
            step_num += self.params["num_agents"]
            episode += done_env_idxs.size  # Update the number of finished episodes

            if not args.test and (step_num >= self.params["learning_step_thresh"] or done_env_idxs.size):
                self.learn(next_obs, dones)
                step_num = 0
                # Monitor performance and save Agent's state when perf improves
                if done_env_idxs.size > 0:
                    [episode_rewards.append(r) for r in cum_step_rewards[done_env_idxs]]
                    if np.max(cum_step_rewards[done_env_idxs]) > self.best_reward:
                        self.best_reward = np.max(cum_step_rewards[done_env_idxs])
                    if np.mean(episode_rewards) > prev_checkpoint_mean_ep_rew:
                        num_improved_episodes_before_checkpoint += 1
                    if num_improved_episodes_before_checkpoint >= self.params["save_freq_when_perf_improves"]:
                        prev_checkpoint_mean_ep_rew = np.mean(episode_rewards)
                        self.best_mean_reward = np.mean(episode_rewards)
                        self.save()
                        num_improved_episodes_before_checkpoint = 0

                    writer.add_scalar(self.actor_name + "/mean_ep_rew",
                                      np.mean(cum_step_rewards[done_env_idxs]), self.global_step_num)
                    # Reset the cum_step_rew for the done envs
                    cum_step_rewards[done_env_idxs] = 0.0

            obs = next_obs
            self.global_step_num += self.params["num_agents"]
            if args.render:
                self.envs.render()
            #print(self.actor_name + ":Episode#:", episode, "step#:", step_num, "\t rew=", reward, end="\r")
            writer.add_scalar(self.actor_name + "/reward", np.mean(cum_step_rewards), self.global_step_num)
            print("{}:Episode#:{} \t avg_step_reward:{:.4} \t mean_ep_rew:{:.4}\t best_ep_reward:{:.4}"
                  .format(self.actor_name, episode, np.mean(cum_step_rewards),
                          np.mean(episode_rewards), self.best_reward))
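For completeness, here is a minimal, hypothetical launch sketch for this batched variant. It assumes SubprocVecEnv accepts a list of environment names (the exact interface depends on the chapter's vectorized-environment helper), the parameter values and environment name are placeholders, and args, writer, params_manager, device and the network classes are expected to exist at module level as in the listing. Note that this class is not an mp.Process subclass, so run() is called directly rather than via start().

if __name__ == "__main__":
    # Placeholder values for the keys read by the batched agent; the book's actual
    # settings live in its parameters.json.
    agent_params = {"gamma": 0.99, "learning_rate": 1e-4, "learning_step_thresh": 5,
                    "num_agents": 8,  # number of parallel environment instances
                    "use_entropy_bonus": True, "test": False, "load_trained_model": False,
                    "save_freq_when_perf_improves": 10, "model_dir": "trained_models/"}
    env_names = ["Pendulum-v0"] * agent_params["num_agents"]  # One copy per parallel actor

    agent = DeepActorCriticAgent(0, env_names, agent_params)
    agent.run()  # Builds the vectorized envs, networks and optimizers, then trains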