    time_step += 1

    # Running policy_old:
    # action = ppo.select_action(state, memory).cpu().data.numpy().flatten()
    action = ppo.select_action(state, memory).cpu().flatten()
    state, reward, done, _ = env.step(action)

    # Training
    if args.test is None:
        # Saving reward and is_terminals:
        memory.rewards.append(reward)
        memory.is_terminals.append(done)

        # update if it's time
        if time_step % args.update_timestep == 0:
            ppo.update(memory)
            memory.clear_memory()
            time_step = 0
    # Testing
    else:
        env.render()

    running_reward += reward
    if done:
        break

avg_length += t

# stop training if avg_reward > solved_reward
if (running_reward > (args.log_interval * args.solved_reward)) and args.test is None:
    print("########## Solved! ##########")
    torch.save(ppo.policy.state_dict(), './PPO_continuous_solved_{}.pth'.format(args.environment))
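# The fragment above reads its settings from an argparse namespace (args.test,
# args.update_timestep, args.log_interval, args.solved_reward, args.environment).
# Below is a minimal sketch of a parser that would provide those attributes; the
# flag names, defaults, and help strings are assumptions, not taken from the
# original script.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='PPO for continuous-action gym environments')
    parser.add_argument('--environment', type=str, default='BipedalWalker-v3',
                        help='gym environment id (assumed default)')
    parser.add_argument('--test', type=str, default=None,
                        help='path to a trained model; when set, run in test mode instead of training')
    parser.add_argument('--update_timestep', type=int, default=4000,
                        help='update the policy every n timesteps')
    parser.add_argument('--log_interval', type=int, default=20,
                        help='print the average reward every n episodes')
    parser.add_argument('--solved_reward', type=float, default=300,
                        help='stop training once the average reward exceeds this value')
    return parser.parse_args()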
def main():
    ############## Hyperparameters ##############
    env_name = "Switch2-v0"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space[0].shape[0]
    action_dim = env.action_space[0].n
    n_agents = env.n_agents
    render = False
    solved_length = 40         # stop training if avg_length < solved_length
    solved_reward = 90         # stop training if avg_reward > solved_reward
    log_interval = 20          # print avg reward in the interval
    max_episodes = 500000      # max training episodes
    max_timesteps = 100        # max timesteps in one episode
    update_timestep = 2000     # update policy every n timesteps
    lr = 0.0002
    betas = (0.9, 0.999)
    gamma = 0.99               # discount factor
    K_epochs = 5               # update policy for K epochs
    eps_clip = 0.2             # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_agents, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            action = ppo.policy_old.act(state, memory)
            pos, state, reward, done, _ = env.step(action)

            # Saving reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if it's time
            if timestep % update_timestep == 0:
                loss = ppo.update(memory)
                memory.clear_memory()
                timestep = 0

            running_reward += sum(reward)
            if render:
                env.render()
            if all(done):
                break

        avg_length += t

        # logging
        if i_episode % log_interval == 0:
            avg_length = avg_length / log_interval
            running_reward = running_reward / log_interval
            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))

            # stop training if avg_reward > solved_reward and avg_length < solved_length
            if avg_length < solved_length and running_reward > solved_reward:
                print("########## Solved! ##########")
                torch.save(ppo.policy.state_dict(), './Solved_length_40_PPO_{}.pth'.format(env_name))
                break

            running_reward = 0
            avg_length = 0

        if i_episode % 5000 == 0:
            torch.save(ppo.policy.state_dict(), './PPO_{}_{}_hy.pth'.format(env_name, i_episode))
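# main() above relies on gym, torch, and the project's Memory and PPO classes.
# Below is a minimal sketch of the imports the script needs (they would normally
# sit at the top of the file) and an entry point; the module name `ppo` for
# Memory/PPO and the ma_gym dependency that registers Switch2-v0 are assumptions.
import gym
import ma_gym  # noqa: F401  (assumed dependency that registers Switch2-v0 with gym)
import torch

from ppo import Memory, PPO  # assumed local module defining the classes used above


if __name__ == '__main__':
    main()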