import numpy as np
import torch
import torch.optim as optim
import tqdm
from IPython.display import clear_output


def ppo(env, brain_name, policy, config, train):
    if train:
        optimizer = optim.Adam(
            policy.parameters(),
            config['hyperparameters']['adam_learning_rate'],
            eps=config['hyperparameters']['adam_epsilon'])
        agent = PPOAgent(env, brain_name, policy, optimizer, config)
        all_scores = []
        averages = []
        last_max = 30.0

        for i in tqdm.tqdm(range(config['hyperparameters']['episode_count'])):
            agent.step()
            last_mean_reward = play_round(env, brain_name, policy, config)

            # moving average over the last (at most) 100 episodes
            if i == 0:
                last_average = last_mean_reward
            else:
                last_average = np.mean(np.array(all_scores[-100:]))
            all_scores.append(last_mean_reward)
            averages.append(last_average)

            # keep a checkpoint whenever the moving average reaches a new high
            if last_average > last_max:
                torch.save(
                    policy.state_dict(),
                    f"reacher-ppo/models/ppo-max-hiddensize-"
                    f"{config['hyperparameters']['hidden_size']}.pth")
                last_max = last_average

            clear_output(True)
            print('Episode: {} Total score this episode: {} Last {} average: {}'.format(
                i + 1, last_mean_reward, min(i + 1, 100), last_average))
        return all_scores, averages
    else:
        # evaluation only: play 20 rounds and report the scores and their mean
        all_scores = []
        for i in range(20):
            score = play_round(env, brain_name, policy, config, train)
            all_scores.append(score)
        return all_scores, [np.mean(all_scores)]
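# play_round() is referenced above but not defined in this snippet. The helper
# below is a hypothetical sketch of such an evaluation routine, not the original
# implementation. It assumes the ML-Agents brain API used elsewhere in this file
# and that calling `policy(states)` returns a tuple whose first element is the
# action tensor; adapt it to the actual policy interface.
def play_round(env, brain_name, policy, config, train=False):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(len(env_info.agents))
    while True:
        with torch.no_grad():
            actions = policy(torch.FloatTensor(states))[0]
        env_info = env.step(actions.cpu().numpy())[brain_name]
        states = env_info.vector_observations
        scores += np.array(env_info.rewards)
        if np.any(env_info.local_done):
            break
    return np.mean(scores)  # mean episode score across all agents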
                 # (tail of the agent constructor call)
                 n_hidden=args.n_hidden,
                 n_outs=n_outs,
                 td_n=args.td_n,
                 ppo_epochs=args.ppo_epochs,
                 mini_batch_size=args.mini_batch_size)

if args.load_best_pretrained_model:
    agent.load_model('../models/ppo/model.pt')
    print('Loaded pretrained model')

if args.test_env:
    # run a single rendered episode with the current policy
    state = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        dist, value = agent.step(state)
        action = dist.sample()
        state, reward, done, _ = env.step(action.cpu().numpy())
        score += reward
    print(score)
else:
    # training over vectorized environments
    scores = []
    state = envs.reset()
    next_state = None
    early_stop = False
    best_avg_score = args.best_avg_reward
    idx = 0
    while idx < args.max_frames and not early_stop:
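        # The body of the training loop above is truncated in this snippet. A PPO
        # loop of this shape typically computes Generalized Advantage Estimation
        # over the collected rollout before updating the policy; the helper below
        # is a standard sketch (assumed, not taken from this source), with gamma
        # and tau matching the defaults used in the other snippets.
        pass


def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        # one-step TD error, zeroed at episode boundaries via the mask
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns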
def experiment(hidden_size=64, lr=3e-4, num_steps=2048, mini_batch_size=32,
               ppo_epochs=10, threshold_reward=10, max_episodes=15,
               nrmlz_adv=True, gamma=0.99, tau=0.95, clip_gradients=True):
    '''
    :param hidden_size: number of neurons in each layer of the model
    :param lr: learning rate
    :param num_steps: maximum duration of one rollout
    :param mini_batch_size: mini-batch size for PPO
    :param ppo_epochs: number of epochs PPO optimizes over each rollout
    :param threshold_reward: average score at which training is considered solved
    :param max_episodes: maximum number of training episodes
    :param nrmlz_adv: True if advantages should be normalized before PPO
    :param gamma: discount factor
    :param tau: GAE parameter
    :param clip_gradients: True if gradients should be clipped after PPO
    :return: scores_window, test_rewards, moving_averages
    '''
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    scores_window = deque(maxlen=100)
    test_rewards = []
    moving_averages = []

    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = PPOAgent(learning_rate=lr, state_size=state_size, action_size=action_size,
                     hidden_size=hidden_size, num_agents=num_agents, random_seed=0,
                     ppo_epochs=ppo_epochs, mini_batch_size=mini_batch_size,
                     normalize_advantages=nrmlz_adv, clip_gradients=clip_gradients,
                     gamma=gamma, tau=tau, device=device)

    for episode in tqdm(range(max_episodes)):
        log_probs = []
        values = []
        states_list = []
        actions_list = []
        rewards = []
        masks = []

        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations

        # collect a rollout of at most num_steps steps from all agents
        for duration in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            action, value, log_prob = agent.act(state)
            env_info = env.step(action.cpu().data.numpy())[brain_name]  # send all actions to the environment
            next_state = env_info.vector_observations  # get the next state (for each agent)
            reward = env_info.rewards                  # get the reward (for each agent)
            dones = np.array(env_info.local_done)      # see if the episode finished

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - dones))
            states_list.append(state)
            actions_list.append(action)

            state = next_state
            if np.any(dones):
                break

        # bootstrap the value of the final state and run the PPO update
        next_state = torch.FloatTensor(state).to(device)
        _, next_value, _ = agent.act(next_state)
        agent.step(states=states_list, actions=actions_list, values=values,
                   log_probs=log_probs, rewards=rewards, masks=masks,
                   next_value=next_value)

        test_mean_reward = test_agent(env, brain_name, agent, device)
        test_rewards.append(test_mean_reward)
        scores_window.append(test_mean_reward)
        moving_averages.append(np.mean(scores_window))
        print('Episode {}, Total score this episode: {}, Last {} average: {}'.format(
            episode, test_mean_reward, min(episode + 1, 100), np.mean(scores_window)))

        if np.mean(scores_window) > threshold_reward:
            agent.save_model(
                f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}"
                f"_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}"
                f"_r{threshold_reward}_adv{nrmlz_adv}.pth")
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                episode, np.mean(scores_window)))
            break

    env.close()
    return scores_window, test_rewards, moving_averages
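# Example invocation of experiment(). The values below are illustrative only;
# threshold_reward=30 and max_episodes=200 are assumptions for the 20-agent
# Reacher task, not values taken from this source.
scores_window, test_rewards, moving_averages = experiment(
    hidden_size=64,
    lr=3e-4,
    num_steps=2048,
    mini_batch_size=32,
    ppo_epochs=10,
    threshold_reward=30,
    max_episodes=200,
    nrmlz_adv=True,
    gamma=0.99,
    tau=0.95,
    clip_gradients=True)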
def ppo():
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = UnityEnvironment(file_name="../Reacher_Linux/Reacher.x86_64",
                           no_graphics=True)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    print('States look like:', state)
    state_size = len(state)
    print('States have length:', state_size)

    config = Config()
    config.env = env
    config.actor_critic_fn = lambda: ActorCritic(
        actor=Actor(state_size, action_size), critic=Critic(state_size))
    config.discount = 0.99
    config.use_gae = True
    config.gae_tau = 0.95
    config.gradient_clip = 5
    config.rollout_length = 2048
    config.optimization_epochs = 5
    config.num_mini_batches = 512
    config.ppo_ratio_clip = 0.2
    config.log_interval = 10 * 2048
    config.max_steps = 2e7
    config.eval_episodes = 10
    # config.logger = get_logger()

    print("GPU available: {}".format(torch.cuda.is_available()))
    print("GPU tensor test: {}".format(torch.rand(3, 3).cuda()))

    agent = PPOAgent(config)
    random_seed()
    config = agent.config
    t0 = time.time()
    scores = []
    scores_window = deque(maxlen=100)  # last 100 scores

    while True:
        # every log_interval steps, fold the finished episode rewards into the stats
        if config.log_interval and not agent.total_steps % config.log_interval \
                and len(agent.episode_rewards):
            rewards = agent.episode_rewards
            for reward in rewards:
                scores.append(reward)
                scores_window.append(reward)
            agent.episode_rewards = []
            print('\r===> Episodes: {:d}  Average score (last 100): {:.2f}'.format(
                len(scores), np.mean(scores_window)))

            if np.mean(scores_window) >= 1.0:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                    len(scores), np.mean(scores_window)))
                torch.save(agent.actor_critic.state_dict(),
                           '../checkpoints/ppo_checkpoint.pth')
                break

            print('Total steps %d, returns %d/%.2f/%.2f/%.2f/%.2f '
                  '(count/mean/median/min/max), %.2f steps/s' %
                  (agent.total_steps, len(rewards), np.mean(rewards),
                   np.median(rewards), np.min(rewards), np.max(rewards),
                   config.log_interval / (time.time() - t0)))
            t0 = time.time()

        agent.step()

    return scores
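# Example usage: run training and plot the per-episode scores. matplotlib is
# assumed here; it is not imported by the original snippet.
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    scores = ppo()
    plt.plot(np.arange(len(scores)), scores)
    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.show()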