state = trajectory_collector.last_states
is_random_run = [0, 1, 2]

for is_random in is_random_run:
    print(f"Starting {'agent' if is_random == 1 else 'random'} run...")
    total_rewards = []
    avg_episode_length = 0
    episode_lengths = []

    for i_run in range(NUM_RUNS):
        sum_reward = 0
        ep = 0
        while True:
            ep += 1
            if is_random == 1:
                # actions from the trained agent
                actions = agent.act(state).cpu().numpy()
            elif is_random == 2:
                # random actions with the last component fixed to 0.5
                actions = np.r_[np.random.randn(3), [0.5]]
            else:
                # fully random actions
                actions = np.random.randn(4)

            next_states, rewards, dones = trajectory_collector.next_observation(actions)
            sum_reward += rewards.cpu().numpy().sum()
            state = next_states

            if np.any(dones.cpu().numpy()):
                trajectory_collector.reset()
                state = trajectory_collector.last_states
                episode_lengths.append(ep)
                break

        total_rewards.append(sum_reward)
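To compare the three modes at a glance, the returns collected above can be summarized at the end of each `for is_random in is_random_run` iteration. A minimal sketch, assuming the `total_rewards` and `episode_lengths` lists built in the loop; the `mode_names` mapping is illustrative and not part of the original code:

# Hypothetical summary step, placed at the end of each mode's iteration.
mode_names = {0: "random", 1: "agent", 2: "random (fixed last action)"}
rewards_arr = np.asarray(total_rewards)
avg_episode_length = np.mean(episode_lengths)
print(f"{mode_names[is_random]}: mean return {rewards_arr.mean():.2f} "
      f"+/- {rewards_arr.std():.2f} over {NUM_RUNS} runs, "
      f"avg episode length {avg_episode_length:.1f}")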
select_device(0)
print("GPU available: {}".format(torch.cuda.is_available()))
print("GPU tensor test: {}".format(torch.rand(3, 3).cuda()))

agent = PPOAgent(config)
random_seed()
config = agent.config
agent.actor_critic.load_state_dict(torch.load('../checkpoints/ppo_checkpoint.pth'))

score = 0  # initialize the score
for i in range(3):
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    for j in range(2000):
        action = agent.act(state)
        env_info = env.step(action.cpu().detach().numpy())[brain_name]
        next_state = env_info.vector_observations  # get the next state
        reward = env_info.rewards[0]               # get the reward
        done = env_info.local_done[0]              # see if the episode has finished
        state = next_state
        score += reward
        print('\rScore: {:.2f}'.format(score), end="")
        if done:
            break

env.close()
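The showcase loop above accumulates `score` across all three episodes, so the printed number is a running total. If a per-episode score is wanted instead, the accumulator can be reset inside the episode loop; a minimal variant, assuming the same `env`, `brain_name` and `agent` objects:

# Per-episode scoring variant (sketch; same env/brain_name/agent as above).
for i in range(3):
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations
    episode_score = 0
    for j in range(2000):
        action = agent.act(state)
        env_info = env.step(action.cpu().detach().numpy())[brain_name]
        state = env_info.vector_observations
        episode_score += env_info.rewards[0]
        if env_info.local_done[0]:
            break
    print('Episode {}: score {:.2f}'.format(i + 1, episode_score))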
def experiment(hidden_size=64, lr=3e-4, num_steps=2048, mini_batch_size=32,
               ppo_epochs=10, threshold_reward=10, max_episodes=15,
               nrmlz_adv=True, gamma=0.99, tau=0.95, clip_gradients=True):
    '''
    :param hidden_size: number of neurons in the hidden layers of the model
    :param lr: learning rate
    :param num_steps: maximum number of steps per episode
    :param mini_batch_size: mini-batch size for PPO
    :param ppo_epochs: number of optimization epochs per PPO update
    :param threshold_reward: average score at which the environment counts as solved
    :param max_episodes: maximum number of training episodes
    :param nrmlz_adv: True if advantages should be normalized before the PPO update
    :param gamma: discount factor
    :param tau: GAE parameter
    :param clip_gradients: True if gradients should be clipped after the PPO update
    :return: deque of the last scores, list of test rewards, list of moving averages
    '''
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    scores_window = deque(maxlen=100)
    test_rewards = []
    moving_averages = []

    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = PPOAgent(learning_rate=lr, state_size=state_size, action_size=action_size,
                     hidden_size=hidden_size, num_agents=num_agents, random_seed=0,
                     ppo_epochs=ppo_epochs, mini_batch_size=mini_batch_size,
                     normalize_advantages=nrmlz_adv, clip_gradients=clip_gradients,
                     gamma=gamma, tau=tau, device=device)

    for episode in tqdm(range(max_episodes)):
        log_probs = []
        values = []
        states_list = []
        actions_list = []
        rewards = []
        masks = []

        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations

        for duration in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            action, value, log_prob = agent.act(state)

            # send all actions to the environment
            env_info = env.step(action.cpu().data.numpy())[brain_name]
            next_state = env_info.vector_observations  # get the next state (for each agent)
            reward = env_info.rewards                  # get the reward (for each agent)
            dones = np.array(env_info.local_done)      # see if the episode has finished

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - dones))
            states_list.append(state)
            actions_list.append(action)

            state = next_state
            if np.any(dones):
                break

        next_state = torch.FloatTensor(state).to(device)
        _, next_value, _ = agent.act(next_state)
        agent.step(states=states_list, actions=actions_list, values=values,
                   log_probs=log_probs, rewards=rewards, masks=masks,
                   next_value=next_value)

        test_mean_reward = test_agent(env, brain_name, agent, device)
        test_rewards.append(test_mean_reward)
        scores_window.append(test_mean_reward)
        moving_averages.append(np.mean(scores_window))

        print('Episode {}, Total score this episode: {}, Last {} average: {}'.format(
            episode, test_mean_reward, min(episode, 100), np.mean(scores_window)))

        if np.mean(scores_window) > threshold_reward:
            agent.save_model(
                f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}"
                f"_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}"
                f"_r{threshold_reward}_adv{nrmlz_adv}.pth")
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                episode, test_mean_reward))
            break

    env.close()
    return scores_window, test_rewards, moving_averages
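A sketch of how `experiment` might be called and its outputs inspected. The hyperparameter values below are illustrative rather than the ones used for any reported results, and `matplotlib` is assumed to be installed:

import matplotlib.pyplot as plt

# Illustrative call; the defaults in the signature above work as well.
scores_window, test_rewards, moving_averages = experiment(
    hidden_size=64, lr=3e-4, num_steps=2048, mini_batch_size=32,
    ppo_epochs=10, threshold_reward=30, max_episodes=200,
    nrmlz_adv=True, gamma=0.99, tau=0.95, clip_gradients=True)

plt.plot(test_rewards, label='test reward')
plt.plot(moving_averages, label='moving average (last 100 episodes)')
plt.xlabel('episode')
plt.ylabel('score')
plt.legend()
plt.show()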
                          num_processes=1, obs_shape=obs_shape)

step = 0
episode = 0
ppo_update = 0
total_reward = 0
done = True

while True:  # loop over episodes
    if done:
        env_info = env.reset(train_mode=train_mode, config=config)[default_brain]
        obs = env_info.observations[0]
        obs = img_to_tensor(obs)

    while True:  # loop over steps within an episode
        action, action_log_prob, value = agent.act(obs)
        action_cuda = action.data.cpu().numpy()
        env_info = env.step(action_cuda)[default_brain]

        done = env_info.local_done[0]
        reward = torch.cuda.FloatTensor([env_info.rewards[0]])
        total_reward += env_info.rewards[0]
        mask = 0 if env_info.local_done[0] else 1
        mask = torch.cuda.FloatTensor([mask])

        rollouts.insert(step, obs.data, action.data, action_log_prob.data,
                        value.data, reward, mask)
        step += 1
        obs = env_info.observations[0]