def generate():
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)
    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    # train a PPO expert until the environment is solved
    while episode < max_episodes:
        episode += 1
        # update
        episode_observations, episode_total_reward = run_episode(ppo, env)
        ppo.store_episode(episode_observations)
        ppo.update()

        # show reward (exponential moving average of episode returns)
        smoothed_total_reward = (
            smoothed_total_reward * 0.9 + episode_total_reward * 0.1
        )
        logger.info(f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                break
        else:
            reward_fulfilled = 0

    # roll out expert trajectories with the trained policy, keeping only
    # the state and action of each transition
    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append([
            {"state": s["state"], "action": s["action"]}
            for s in run_episode(ppo, env)[0]
        ])

    archive = Archive(
        path=os.path.join(
            ROOT, "generated", f"{generated_name}_" + get_time_string()
        )
    )
    archive.add_item("expert_trajectories", trajectories)
    archive.save()
    logger.info(f"Trajectories saved as {archive.path}")
def generate_expert_episodes():
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)
    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))

    logger.info("Training expert PPO")
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        # update
        episode_observations, episode_total_reward = run_episode(ppo, env)
        ppo.store_episode(episode_observations)
        ppo.update()

        # show reward
        smoothed_total_reward = (
            smoothed_total_reward * 0.9 + episode_total_reward * 0.1
        )
        logger.info(f"Episode {episode} total reward={smoothed_total_reward:.2f}")

        if smoothed_total_reward > solved_reward:
            reward_fulfilled += 1
            if reward_fulfilled >= solved_repeat:
                logger.info("Environment solved!")
                break
        else:
            reward_fulfilled = 0

    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append([
            {"state": s["state"], "action": s["action"]}
            for s in run_episode(ppo, env)[0]
        ])
    return trajectories
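# A minimal sketch of the run_episode helper assumed by the two functions
# above (not shown in these snippets and not part of the machin API; the
# real helper may differ): it rolls out one episode with the current PPO
# policy and returns the transition list expected by ppo.store_episode
# together with the undiscounted episode reward.
def run_episode(ppo, env):
    observations = []
    total_reward = 0
    terminal = False
    step = 0
    state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)

    while not terminal and step <= max_steps:
        step += 1
        with t.no_grad():
            old_state = state
            # agent model inference
            action = ppo.act({"state": old_state})[0]
            state, reward, terminal, _ = env.step(action.item())
            state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
            total_reward += reward

            observations.append({
                "state": {"state": old_state},
                "action": {"action": action},
                "next_state": {"state": state},
                "reward": reward,
                "terminal": terminal or step == max_steps,
            })

    return observations, total_reward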
        self.fc3 = nn.Linear(256, 1)

    def forward(self, mem):
        v = t.relu(self.fc1(mem))
        v = t.relu(self.fc2(v))
        v = self.fc3(v)
        return v


if __name__ == "__main__":
    actor = RecurrentActor(action_num).to("cuda:0")
    critic = Critic().to("cuda:0")

    rppo = PPO(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        actor_learning_rate=1e-5,
        critic_learning_rate=1e-4,
    )

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        hidden = t.zeros([1, 1, 256])
        state = convert(env.reset())
        tmp_observations = []
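# A minimal sketch of the convert helper used above (a hypothetical utility,
# not shown in this snippet): it turns a raw gym observation into a float32
# tensor with a leading batch dimension so the recurrent actor and the critic
# can consume it.
def convert(state):
    return t.tensor(state, dtype=t.float32).view(1, -1)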
    def forward(self, mem):
        v = t.relu(self.fc1(mem.flatten(start_dim=1)))
        v = t.relu(self.fc2(v))
        v = self.fc3(v)
        return v


if __name__ == "__main__":
    actor = Actor(history_depth, action_num).to("cuda:0")
    critic = Critic(history_depth).to("cuda:0")

    ppo = PPO(
        actor,
        critic,
        t.optim.Adam,
        nn.MSELoss(reduction="sum"),
        actor_learning_rate=1e-5,
        critic_learning_rate=1e-4,
    )

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        state = convert(env.reset())
        history = History(history_depth, (1, 128))
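# A minimal sketch of the History buffer used above (a hypothetical helper,
# not shown in this snippet): it keeps the most recent `history_depth`
# observations, zero-padded at the start of an episode, and returns them
# stacked so the actor and critic can condition on a fixed-length window
# of past states.
class History:
    def __init__(self, depth, state_shape):
        self.depth = depth
        self.state_shape = state_shape
        # pre-fill with zeros so get() always returns `depth` frames
        self.history = [t.zeros(state_shape) for _ in range(depth)]

    def append(self, state):
        # drop the oldest frame, keep the newest `depth` frames
        self.history.append(state)
        self.history.pop(0)
        return self

    def get(self):
        # shape: [1, depth, *state_shape[1:]], e.g. [1, history_depth, 128]
        return t.cat(self.history, dim=0).unsqueeze(0)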
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, 1)

    def forward(self, state):
        v = t.relu(self.fc1(state))
        v = t.relu(self.fc2(v))
        v = self.fc3(v)
        return v


if __name__ == "__main__":
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)
    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))

    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0

    while episode < max_episodes:
        episode += 1
        total_reward = 0
        terminal = False
        step = 0
        state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
        tmp_observations = []

        while not terminal and step <= max_steps:
            step += 1
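            # ---- hedged continuation (a sketch of the usual machin PPO
            # example pattern, not necessarily the verbatim file): sample an
            # action from the current policy, step the environment, and
            # record the transition in the format store_episode expects.
            with t.no_grad():
                old_state = state
                # agent model inference
                action = ppo.act({"state": old_state})[0]
                state, reward, terminal, _ = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                total_reward += reward

                tmp_observations.append({
                    "state": {"state": old_state},
                    "action": {"action": action},
                    "next_state": {"state": state},
                    "reward": reward,
                    "terminal": terminal or step == max_steps,
                })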
    trajectories = []
    for i in range(expert_episodes):
        logger.info(f"Generating trajectory {i}")
        trajectories.append([
            {"state": s["state"], "action": s["action"]}
            for s in run_episode(ppo, env)[0]
        ])
    return trajectories


if __name__ == "__main__":
    actor = Actor(observe_dim, action_num)
    critic = Critic(observe_dim)
    discriminator = Discriminator(observe_dim, action_num)
    ppo = PPO(actor, critic, t.optim.Adam, nn.MSELoss(reduction="sum"))
    gail = GAIL(discriminator, ppo, t.optim.Adam)

    for expert_episode in generate_expert_episodes():
        gail.store_expert_episode(expert_episode)

    # begin training
    episode, step, reward_fulfilled = 0, 0, 0
    smoothed_total_reward = 0
    terminal = False

    logger.info("Training GAIL")
    while episode < max_episodes:
        episode += 1
        total_reward = 0
        terminal = False
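        # ---- hedged sketch of the rest of the loop body (an assumption
        # based on the PPO example pattern, not necessarily the verbatim
        # GAIL example): collect one episode with the current policy, store
        # it through the GAIL wrapper so the discriminator can contrast it
        # with the expert trajectories, then update discriminator and PPO.
        # Assumes GAIL exposes act()/store_episode()/update() like the
        # wrapped PPO does.
        step = 0
        state = t.tensor(env.reset(), dtype=t.float32).view(1, observe_dim)
        tmp_observations = []

        while not terminal and step <= max_steps:
            step += 1
            with t.no_grad():
                old_state = state
                action = gail.act({"state": old_state})[0]
                state, reward, terminal, _ = env.step(action.item())
                state = t.tensor(state, dtype=t.float32).view(1, observe_dim)
                total_reward += reward

                tmp_observations.append({
                    "state": {"state": old_state},
                    "action": {"action": action},
                    "next_state": {"state": state},
                    "reward": reward,
                    "terminal": terminal or step == max_steps,
                })

        gail.store_episode(tmp_observations)
        gail.update()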