# Rollout step of the single-environment PPO training loop (loop headers omitted in this excerpt).
next_state, reward, done, _ = envs.step(action.cpu().numpy())
log_prob = dist.log_prob(action)
agent.entropy += dist.entropy().mean()

agent.append_log_prob(log_prob)
agent.append_value(value)
agent.append_reward(reward)
agent.append_done(done)
agent.append_state(state)
agent.append_action(action)

state = next_state
idx += 1

# Every 1000 frames: evaluate, keep the best checkpoint, and check for early stopping.
if idx % 1000 == 0:
    score = np.mean([agent.test_env(env) for _ in range(100)])
    print(idx, score)
    scores.append(score)
    if score > best_avg_score:
        best_avg_score = score
        agent.save_model('../models/ppo/model.pt')
        print('Saved best model')
    if score > args.threshold_score:
        early_stop = True

# After the rollout: update the agent, then plot the evaluation scores.
agent.train(next_state)

plot_and_save_scores(scores, args.max_frames / 1000, args)
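
# The periodic evaluation above calls `agent.test_env(env)`, which is not shown in this
# excerpt. A minimal sketch of what such an evaluator might look like, assuming the agent
# exposes an actor-critic `model` returning (action distribution, value) and a classic
# gym-style env (the name `test_env_sketch` and these assumptions are illustrative, not
# the actual implementation):

def test_env_sketch(agent, env, device='cpu'):
    """Hypothetical evaluator: roll out one episode and return its total reward."""
    state = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            dist, _ = agent.model(state_t)          # assumed (distribution, value) forward pass
        action = dist.sample().cpu().numpy()[0]
        state, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward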
def paralle_train(args):
    logger = SummaryWriter(log_dir='results/{}_{}_{}'.format(
        args.env, args.seed, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Build a throwaway env only to read out its parameters (obs/action shapes, etc.).
    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    workers, parent_conns, children_conns = workers_initialize(args)

    # One stacked 4x84x84 observation per worker.
    obs = np.zeros(shape=[args.num_worker, 4, 84, 84], dtype=np.float32)

    # Initialize the observation normalizer with random-policy rollouts.
    print('Start initialize obs normalizer....')
    next_obs_batch = []
    for step in range(args.initialize_episode * args.max_episode_step):
        actions = np.random.randint(0, env_params['a_dim'], size=(args.num_worker))
        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        for parent_conn in parent_conns:
            obs_, r, done, info = parent_conn.recv()
            next_obs_batch.append(obs_)
        # Flush the buffer into the running normalizer every 10 steps per worker.
        if len(next_obs_batch) % (10 * args.num_worker) == 0:
            next_obs_batch = np.stack(next_obs_batch)
            agent.normalizer_obs.update(next_obs_batch)
            next_obs_batch = []
    print('End initialize obs normalizer....')

    log_reward_ex = 0
    log_reward_in = 0
    log_step = 0
    log_episode = 0

    for i_epoch in range(args.max_epoch):
        epoch_obs, epoch_action, epoch_ri, epoch_re, epoch_mask, epoch_next_obs, epoch_logprob = \
            [], [], [], [], [], [], []

        # Collect a rollout of length rollout_len from every worker.
        for i_step in range(args.rollout_len):
            actions, log_probs = agent.choose_action(obs)

            for action, parent_conn in zip(actions, parent_conns):
                parent_conn.send(action)

            batch_re, batch_mask, batch_next_obs = [], [], []
            for parent_conn in parent_conns:
                obs_, r_e, done, info = parent_conn.recv()
                batch_next_obs.append(obs_)
                batch_re.append(r_e)
                batch_mask.append(0 if done else 1)

            batch_next_obs = np.stack(batch_next_obs)
            batch_re = np.stack(batch_re)
            batch_mask = np.stack(batch_mask)
            batch_ri = agent.compute_intrinsic_reward(batch_next_obs.copy())

            # Logging: track extrinsic/intrinsic return of one designated worker per episode.
            log_reward_ex += batch_re[args.log_env_idx]
            log_reward_in += batch_ri[args.log_env_idx]
            log_step += 1
            if batch_mask[args.log_env_idx] == 0:
                log_episode += 1
                logger.add_scalar('Indicator/Reward_ex', log_reward_ex, log_episode)
                logger.add_scalar('Indicator/Reward_in', log_reward_in, log_episode)
                log_reward_ex = 0
                log_reward_in = 0

            epoch_obs.append(obs)
            epoch_action.append(actions)
            epoch_next_obs.append(batch_next_obs)
            epoch_ri.append(batch_ri)
            epoch_re.append(batch_re)
            epoch_mask.append(batch_mask)
            epoch_logprob.append(log_probs)

            obs = batch_next_obs[:, :, :, :]

        # Stack to [rollout_len, num_worker, ...] and transpose to [num_worker, rollout_len, ...].
        epoch_obs = np.stack(epoch_obs)
        epoch_action = np.stack(epoch_action)
        epoch_ri = np.stack(epoch_ri)
        epoch_re = np.stack(epoch_re)
        epoch_mask = np.stack(epoch_mask)
        epoch_next_obs = np.stack(epoch_next_obs)
        epoch_logprob = np.stack(epoch_logprob)

        epoch_obs = np.transpose(epoch_obs, axes=[1, 0, 2, 3, 4])
        epoch_action = np.transpose(epoch_action, axes=[1, 0])
        epoch_ri = np.transpose(epoch_ri, axes=[1, 0])
        epoch_re = np.transpose(epoch_re, axes=[1, 0])
        epoch_mask = np.transpose(epoch_mask, axes=[1, 0])
        epoch_next_obs = np.transpose(epoch_next_obs, axes=[1, 0, 2, 3, 4])
        epoch_logprob = np.transpose(epoch_logprob, axes=[1, 0])

        loss_rnd, loss_a, loss_c = agent.update(epoch_obs, epoch_action, epoch_ri, epoch_re,
                                                epoch_mask, epoch_next_obs, epoch_logprob)

        used_sample_num = args.rollout_len * args.num_worker * i_epoch
        logger.add_scalar('Loss/loss_RND', loss_rnd, used_sample_num)
        logger.add_scalar('Loss/loss_a', loss_a, used_sample_num)
        logger.add_scalar('Loss/loss_c', loss_c, used_sample_num)

        if i_epoch % args.save_model_interval == 0:
            agent.save_model(remark='{}'.format(i_epoch))
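
# `workers_initialize(args)` is not shown above. The training loop only relies on each
# worker owning one environment and exchanging an action for an (obs, reward, done, info)
# tuple over a multiprocessing Pipe. A minimal sketch under that assumption (names are
# illustrative; frame preprocessing into the 4x84x84 stacks expected above is omitted):

import multiprocessing as mp

def _worker_sketch(child_conn, env_name):
    """Hypothetical worker: step one env and ship transitions back over the pipe."""
    env = gym.make(env_name)
    obs = env.reset()
    while True:
        action = child_conn.recv()                    # block until the trainer sends an action
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
        child_conn.send((obs, reward, done, info))

def workers_initialize_sketch(args):
    """Hypothetical counterpart of workers_initialize: one process and one pipe per worker."""
    workers, parent_conns, children_conns = [], [], []
    for _ in range(args.num_worker):
        parent_conn, child_conn = mp.Pipe()
        worker = mp.Process(target=_worker_sketch, args=(child_conn, args.env), daemon=True)
        worker.start()
        workers.append(worker)
        parent_conns.append(parent_conn)
        children_conns.append(child_conn)
    return workers, parent_conns, children_conns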
def experiment(hidden_size=64, lr=3e-4, num_steps=2048, mini_batch_size=32, ppo_epochs=10,
               threshold_reward=10, max_episodes=15, nrmlz_adv=True, gamma=0.99, tau=0.95,
               clip_gradients=True):
    '''
    :param hidden_size: number of neurons for the layers of the model
    :param lr: learning rate
    :param num_steps: maximum duration of one epoch
    :param mini_batch_size: mini batch size for ppo
    :param ppo_epochs: number of epochs for ppo to learn
    :param threshold_reward: what is the goal of the training
    :param max_episodes: maximum duration of the training
    :param nrmlz_adv: True, if advantages should be normalized before PPO
    :param gamma: discount factor
    :param tau: GAE smoothing parameter
    :param clip_gradients: True, if gradients should be clipped after PPO
    :return: list of scores and list of test_rewards
    '''
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    scores_window = deque(maxlen=100)
    test_rewards = []
    moving_averages = []

    env = UnityEnvironment(file_name='reacher20/reacher', base_port=64739)

    # Get the default brain.
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # Reset the environment and read out its dimensions.
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    num_agents = len(env_info.agents)
    states = env_info.vector_observations
    state_size = states.shape[1]

    agent = PPOAgent(learning_rate=lr, state_size=state_size, action_size=action_size,
                     hidden_size=hidden_size, num_agents=num_agents, random_seed=0,
                     ppo_epochs=ppo_epochs, mini_batch_size=mini_batch_size,
                     normalize_advantages=nrmlz_adv, clip_gradients=clip_gradients,
                     gamma=gamma, tau=tau, device=device)

    for episode in tqdm(range(max_episodes)):
        log_probs = []
        values = []
        states_list = []
        actions_list = []
        rewards = []
        masks = []

        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations

        # Collect one trajectory of at most num_steps for all agents.
        for duration in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            action, value, log_prob = agent.act(state)

            env_info = env.step(action.cpu().data.numpy())[brain_name]  # send all actions to the environment
            next_state = env_info.vector_observations  # get next state (for each agent)
            reward = env_info.rewards                  # get reward (for each agent)
            dones = np.array(env_info.local_done)      # see if episode finished

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - dones))
            states_list.append(state)
            actions_list.append(action)

            state = next_state
            if np.any(dones):
                break

        # Bootstrap the value of the final state and run the PPO update.
        next_state = torch.FloatTensor(state).to(device)
        _, next_value, _ = agent.act(next_state)
        agent.step(states=states_list, actions=actions_list, values=values, log_probs=log_probs,
                   rewards=rewards, masks=masks, next_value=next_value)

        test_mean_reward = test_agent(env, brain_name, agent, device)
        test_rewards.append(test_mean_reward)
        scores_window.append(test_mean_reward)
        moving_averages.append(np.mean(scores_window))

        print('Episode {}, Total score this episode: {}, Last {} average: {}'.format(
            episode, test_mean_reward, min(episode, 100), np.mean(scores_window)))

        if np.mean(scores_window) > threshold_reward:
            agent.save_model(
                f"ppo_checkpoint_{test_mean_reward}_e{episode}_hs{hidden_size}_lr{lr}"
                f"_st{num_steps}_b{mini_batch_size}_ppo{ppo_epochs}_r{threshold_reward}"
                f"_adv{nrmlz_adv}.pth")
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                episode, test_mean_reward))
            break

    env.close()
    return scores_window, test_rewards, moving_averages
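
# `agent.step(...)` above receives per-step values, rewards, masks and a bootstrap
# `next_value`, which is exactly the input needed for Generalized Advantage Estimation.
# A minimal sketch of that computation with the `gamma` and `tau` passed to the agent
# (the standard GAE recursion, not necessarily the exact code inside PPOAgent):

def compute_gae_sketch(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    """Hypothetical GAE: return per-step targets (advantage + value) for the PPO update."""
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        # TD residual for this step; masks zero out the bootstrap at episode boundaries.
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns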