def run(self):
    self.env = gym.make(self.env_name)
    self.state_shape = self.env.observation_space.shape
    self.action_shape = self.env.action_space.shape[0]
    self.critic_shape = 1
    if len(self.state_shape) == 3:  # Screen image is the input to the agent
        self.actor = DeepActor(self.state_shape, self.action_shape, device).to(device)
        self.critic = DeepCritic(self.state_shape, self.critic_shape, device).to(device)
    else:  # Input is a (single dimensional) vector
        #self.actor_critic = ShallowActorCritic(self.state_shape, self.action_shape, 1, self.params).to(device)
        self.actor = ShallowActor(self.state_shape, self.action_shape, device).to(device)
        self.critic = ShallowCritic(self.state_shape, self.critic_shape, device).to(device)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-3)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=1e-3)

    # Handle loading and saving of trained Agent models
    episode_rewards = list()
    prev_checkpoint_mean_ep_rew = self.best_mean_reward
    num_improved_episodes_before_checkpoint = 0  # To keep track of the num of ep with higher perf to save model
    #print("Using agent_params:", self.params)
    if self.params['load_trained_model']:
        try:
            self.load()
            prev_checkpoint_mean_ep_rew = self.best_mean_reward
        except FileNotFoundError:
            print("WARNING: No trained model found for this environment. Training from scratch.")

    for episode in range(self.params["max_num_episodes"]):
        obs = self.env.reset()
        done = False
        ep_reward = 0.0
        step_num = 0
        while not done:
            action = self.get_action(obs)
            next_obs, reward, done, _ = self.env.step(action)
            self.rewards.append(reward)
            step_num += 1
            if step_num >= self.params["learning_step_thresh"] or done:
                self.learn(next_obs, done)
                step_num = 0
                # Monitor performance and save Agent's state when perf improves
                if done:
                    cum_reward = np.sum(self.rewards)
                    episode_rewards.append(cum_reward)
                    if cum_reward > self.best_reward:
                        self.best_reward = cum_reward
                    if np.mean(episode_rewards) > prev_checkpoint_mean_ep_rew:
                        num_improved_episodes_before_checkpoint += 1
                    if num_improved_episodes_before_checkpoint >= self.params["save_freq_when_perf_improves"]:
                        prev_checkpoint_mean_ep_rew = np.mean(episode_rewards)
                        self.best_mean_reward = np.mean(episode_rewards)
                        self.save()
                        num_improved_episodes_before_checkpoint = 0
            obs = next_obs
            ep_reward += reward
            self.global_step_num += 1
            #print(self.actor_name + ":Episode#:", episode, "step#:", step_num, "\t rew=", reward, end="\r")
            writer.add_scalar(self.actor_name + "/reward", reward, self.global_step_num)
        print(self.actor_name + ":Episode#:", episode, "\t ep_reward=", ep_reward)
        writer.add_scalar(self.actor_name + "/ep_reward", ep_reward, self.global_step_num)
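
# The body of learn(next_obs, done) is not shown here. As a rough, standalone sketch of the
# n-step return bootstrapping it is expected to perform on self.rewards (an assumption for
# illustration, not the exact implementation), the returns for the last k steps can be
# computed backwards from the critic's value estimate of the state following the last step:
# G_t = r_t + gamma * G_{t+1}, with the bootstrap G_T = V(next_obs) if the episode did not end.

def calculate_n_step_returns_sketch(rewards, final_value, gamma=0.99):
    """Compute discounted n-step returns for a short trajectory segment.

    rewards     : list of per-step rewards collected since the last update
    final_value : critic's value estimate for the state after the last step
                  (use 0.0 when the episode terminated)
    gamma       : discount factor
    """
    g_t = final_value
    returns = []
    for r in reversed(rewards):  # Work backwards from the bootstrap value
        g_t = r + gamma * g_t
        returns.insert(0, g_t)
    return returns

# Example: 3 steps of reward with a bootstrapped value of 1.0 for the next state
#   calculate_n_step_returns_sketch([0.0, 0.0, 1.0], final_value=1.0, gamma=0.99)
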
def run(self):
    # If a custom useful_region configuration is available for this environment ID, use it;
    # otherwise fall back to the Default. Currently this is utilized only for the Atari envs.
    # Follows the same procedure as in Chapter 6.
    custom_region_available = False
    for key, value in self.env_conf['useful_region'].items():
        if key in args.env:
            self.env_conf['useful_region'] = value
            custom_region_available = True
            break
    if custom_region_available is not True:
        self.env_conf['useful_region'] = self.env_conf['useful_region']['Default']

    atari_env = False
    for game in Atari.get_games_list():
        if game in args.env.lower():
            atari_env = True
    if atari_env:  # Use the Atari wrappers (like we did in Chapter 6) if it's an Atari env
        self.env = Atari.make_env(self.env_name, self.env_conf)
    else:
        #print("Given environment name is not an Atari Env. Creating a Gym env")
        self.env = gym.make(self.env_name)

    self.state_shape = self.env.observation_space.shape
    if isinstance(self.env.action_space.sample(), int):  # Discrete action space
        self.action_shape = self.env.action_space.n
        self.policy = self.discrete_policy
        self.continuous_action_space = False
    else:  # Continuous action space
        self.action_shape = self.env.action_space.shape[0]
        self.policy = self.multi_variate_gaussian_policy
    self.critic_shape = 1

    if len(self.state_shape) == 3:  # Screen image is the input to the agent
        if self.continuous_action_space:
            self.actor = DeepActor(self.state_shape, self.action_shape, device).to(device)
        else:  # Discrete action space
            self.actor = DeepDiscreteActor(self.state_shape, self.action_shape, device).to(device)
        self.critic = DeepCritic(self.state_shape, self.critic_shape, device).to(device)
    else:  # Input is a (single dimensional) vector
        if self.continuous_action_space:
            #self.actor_critic = ShallowActorCritic(self.state_shape, self.action_shape, 1, self.params).to(device)
            self.actor = ShallowActor(self.state_shape, self.action_shape, device).to(device)
        else:  # Discrete action space
            self.actor = ShallowDiscreteActor(self.state_shape, self.action_shape, device).to(device)
        self.critic = ShallowCritic(self.state_shape, self.critic_shape, device).to(device)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.params["learning_rate"])
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.params["learning_rate"])

    # Handle loading and saving of trained Agent models
    episode_rewards = list()
    prev_checkpoint_mean_ep_rew = self.best_mean_reward
    num_improved_episodes_before_checkpoint = 0  # To keep track of the num of ep with higher perf to save model
    #print("Using agent_params:", self.params)
    if self.params['load_trained_model']:
        try:
            self.load()
            prev_checkpoint_mean_ep_rew = self.best_mean_reward
        except FileNotFoundError:
            if args.test:  # Test a saved model
                print("FATAL: No saved model found. Cannot test. Press any key to train from scratch")
                input()
            else:
                print("WARNING: No trained model found for this environment. Training from scratch.")

    for episode in range(self.params["max_num_episodes"]):
        obs = self.env.reset()
        done = False
        ep_reward = 0.0
        step_num = 0
        while not done:
            action = self.get_action(obs)
            next_obs, reward, done, _ = self.env.step(action)
            self.rewards.append(reward)
            ep_reward += reward
            step_num += 1
            if not args.test and (step_num >= self.params["learning_step_thresh"] or done):
                self.learn(next_obs, done)
                step_num = 0
                # Monitor performance and save Agent's state when perf improves
                if done:
                    episode_rewards.append(ep_reward)
                    if ep_reward > self.best_reward:
                        self.best_reward = ep_reward
                    if np.mean(episode_rewards) > prev_checkpoint_mean_ep_rew:
                        num_improved_episodes_before_checkpoint += 1
                    if num_improved_episodes_before_checkpoint >= self.params["save_freq_when_perf_improves"]:
                        prev_checkpoint_mean_ep_rew = np.mean(episode_rewards)
                        self.best_mean_reward = np.mean(episode_rewards)
                        self.save()
                        num_improved_episodes_before_checkpoint = 0
            obs = next_obs
            self.global_step_num += 1
            if args.render:
                self.env.render()
            #print(self.actor_name + ":Episode#:", episode, "step#:", step_num, "\t rew=", reward, end="\r")
            writer.add_scalar(self.actor_name + "/reward", reward, self.global_step_num)
        print("{}:Episode#:{} \t ep_reward:{} \t mean_ep_rew:{}\t best_ep_reward:{}".format(
            self.actor_name, episode, ep_reward, np.mean(episode_rewards), self.best_reward))
        writer.add_scalar(self.actor_name + "/ep_reward", ep_reward, self.global_step_num)
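
# The run() method above switches between self.discrete_policy and
# self.multi_variate_gaussian_policy depending on the action space. A minimal, standalone
# sketch of what those two policy constructions typically look like with PyTorch
# distributions (the tensors below are made-up stand-ins for the actor network's outputs;
# this is an illustration, not the exact policy code):
import torch
from torch.distributions import Categorical, MultivariateNormal

def discrete_policy_sketch(logits):
    """Build a categorical distribution over discrete actions from actor logits."""
    return Categorical(logits=logits)

def multi_variate_gaussian_policy_sketch(mu, sigma):
    """Build a multivariate Gaussian over continuous actions from mean and std-dev outputs."""
    covariance = torch.diag(sigma.pow(2))  # Diagonal covariance from per-dimension std-devs
    return MultivariateNormal(mu, covariance)

# Example usage with hypothetical actor outputs:
action_logits = torch.tensor([0.1, 0.5, -0.2])            # e.g. 3 discrete actions
sampled_discrete_action = discrete_policy_sketch(action_logits).sample()

action_mean = torch.zeros(2)                               # e.g. 2-D continuous action
action_sigma = torch.ones(2) * 0.5
sampled_continuous_action = multi_variate_gaussian_policy_sketch(action_mean, action_sigma).sample()
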
def run(self):
    self.envs = SubprocVecEnv(self.env_names)
    self.state_shape = self.envs.observation_space.shape
    if isinstance(self.envs.action_space.sample(), int):  # Discrete action space
        self.action_shape = self.envs.action_space.n
        self.policy = self.discrete_policy
        self.continuous_action_space = False
    else:  # Continuous action space
        self.action_shape = self.envs.action_space.shape[0]
        self.policy = self.multi_variate_gaussian_policy
    self.critic_shape = 1

    if len(self.state_shape) == 3:  # Screen image is the input to the agent
        if self.continuous_action_space:
            self.actor = DeepActor(self.state_shape, self.action_shape, device).to(device)
        else:  # Discrete action space
            self.actor = DeepDiscreteActor(self.state_shape, self.action_shape, device).to(device)
        self.critic = DeepCritic(self.state_shape, self.critic_shape, device).to(device)
    else:  # Input is a (single dimensional) vector
        if self.continuous_action_space:
            #self.actor_critic = ShallowActorCritic(self.state_shape, self.action_shape, 1, self.params).to(device)
            self.actor = ShallowActor(self.state_shape, self.action_shape, device).to(device)
        else:  # Discrete action space
            self.actor = ShallowDiscreteActor(self.state_shape, self.action_shape, device).to(device)
        self.critic = ShallowCritic(self.state_shape, self.critic_shape, device).to(device)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.params["learning_rate"])
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.params["learning_rate"])

    # Handle loading and saving of trained Agent models
    episode_rewards = list()
    prev_checkpoint_mean_ep_rew = self.best_mean_reward
    num_improved_episodes_before_checkpoint = 0  # To keep track of the num of ep with higher perf to save model
    #print("Using agent_params:", self.params)
    if self.params['load_trained_model']:
        try:
            self.load()
            prev_checkpoint_mean_ep_rew = self.best_mean_reward
        except FileNotFoundError:
            if args.test:  # Test a saved model
                print("FATAL: No saved model found. Cannot test. Press any key to train from scratch")
                input()
            else:
                print("WARNING: No trained model found for this environment. Training from scratch.")

    #for episode in range(self.params["max_num_episodes"]):
    obs = self.envs.reset()
    # TODO: Create appropriate masks to take care of envs that have set dones to True & learn() accordingly
    episode = 0
    cum_step_rewards = np.zeros(self.params["num_agents"])
    episode_rewards = []
    step_num = 0
    while True:
        action = self.get_action(obs)
        next_obs, rewards, dones, _ = self.envs.step(action)
        self.rewards.append(torch.tensor(rewards))
        done_env_idxs = np.where(dones)[0]
        cum_step_rewards += rewards  # nd-array of shape=num_actors
        step_num += self.params["num_agents"]
        episode += done_env_idxs.size  # Update the number of finished episodes

        if not args.test and (step_num >= self.params["learning_step_thresh"] or done_env_idxs.size):
            self.learn(next_obs, dones)
            step_num = 0
            # Monitor performance and save Agent's state when perf improves
            if done_env_idxs.size > 0:
                [episode_rewards.append(r) for r in cum_step_rewards[done_env_idxs]]
                if np.max(cum_step_rewards[done_env_idxs]) > self.best_reward:
                    self.best_reward = np.max(cum_step_rewards[done_env_idxs])
                if np.mean(episode_rewards) > prev_checkpoint_mean_ep_rew:
                    num_improved_episodes_before_checkpoint += 1
                if num_improved_episodes_before_checkpoint >= self.params["save_freq_when_perf_improves"]:
                    prev_checkpoint_mean_ep_rew = np.mean(episode_rewards)
                    self.best_mean_reward = np.mean(episode_rewards)
                    self.save()
                    num_improved_episodes_before_checkpoint = 0
                writer.add_scalar(self.actor_name + "/mean_ep_rew",
                                  np.mean(cum_step_rewards[done_env_idxs]),
                                  self.global_step_num)
                # Reset the cum_step_rew for the done envs
                cum_step_rewards[done_env_idxs] = 0.0

        obs = next_obs
        self.global_step_num += self.params["num_agents"]
        if args.render:
            self.envs.render()
        #print(self.actor_name + ":Episode#:", episode, "step#:", step_num, "\t rew=", reward, end="\r")
        writer.add_scalar(self.actor_name + "/reward", np.mean(cum_step_rewards), self.global_step_num)
        print("{}:Episode#:{} \t avg_step_reward:{:.4} \t mean_ep_rew:{:.4}\t best_ep_reward:{:.4}".format(
            self.actor_name, episode, np.mean(cum_step_rewards),
            np.mean(episode_rewards), self.best_reward))
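
# The TODO in the batched run() above concerns per-environment done masks. A rough,
# standalone sketch of masked n-step returns for a batch of parallel envs (an assumption
# about how learn(next_obs, dones) could handle episode boundaries, not the exact
# implementation): the bootstrap term is zeroed for envs whose episode ended at that step.
import torch

def batched_n_step_returns_sketch(step_rewards, step_dones, bootstrap_values, gamma=0.99):
    """Compute discounted returns per environment, cutting the bootstrap across episode ends.

    step_rewards     : list of length T, each a float tensor of shape (num_envs,)
    step_dones       : list of length T, each a float tensor of shape (num_envs,), 1.0 where done
    bootstrap_values : float tensor of shape (num_envs,) with the critic's V(s_T) per env
    """
    g_t = bootstrap_values
    returns = []
    for r, d in zip(reversed(step_rewards), reversed(step_dones)):
        g_t = r + gamma * g_t * (1.0 - d)  # Mask removes the bootstrap for envs that finished here
        returns.insert(0, g_t)
    return torch.stack(returns)  # Shape: (T, num_envs)

# Example with 2 parallel envs over 3 steps; env index 1 terminates at the second step:
example_rewards = [torch.tensor([1.0, 1.0]), torch.tensor([1.0, 1.0]), torch.tensor([1.0, 1.0])]
example_dones = [torch.tensor([0.0, 0.0]), torch.tensor([0.0, 1.0]), torch.tensor([0.0, 0.0])]
example_values = torch.tensor([0.5, 0.5])
#   batched_n_step_returns_sketch(example_rewards, example_dones, example_values)
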