def test_buffer(self):
    # Three agents push identical frames each step; the reward (the third
    # positional AgentReplayFrame argument) is what the assertions read back.
    data = {
        AgentKey(0, '0-1'): AgentReplayFrame([2, 1, 2, 2, 3], [0, 1, 0], 3, False, [3, 1, 1, 2, 3]),
        AgentKey(0, '0-2'): AgentReplayFrame([1, 1, 3, 2, 1], [0, 1, 0], 4, False, [2, 1, 1, 2, 2]),
        AgentKey(1, '0-1'): AgentReplayFrame([2, 0, 3, 1, 2], [0, 1], 5, False, [3, 0, 1, 3, 4]),
    }
    max_steps = 4
    buffer = ReplayBuffer(max_steps)

    # The buffer fills up to max_steps, then holds steady as old frames are evicted.
    for i in range(5):
        buffer.push(data)
        self.assertEqual(buffer.length(), min(i + 1, max_steps))

    # Without normalization, sampled rewards come back verbatim.
    sample: List[Dict[AgentKey, AgentReplayFrame]] = buffer.sample(2, norm_rews=False)
    for s in sample:
        for k, v in s.items():
            self.assertEqual(v.reward, data[k].reward)

    # With normalization, the constant per-agent rewards standardize to zero.
    sample = buffer.sample(2, norm_rews=True)
    for s in sample:
        for k, v in s.items():
            self.assertEqual(v.reward, 0)

    # Per-agent averages over the last 3 steps equal the constant pushed reward.
    avg_rewards = buffer.get_average_rewards(3)
    for k, v in avg_rewards.items():
        self.assertEqual(v, data[k].reward)
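# For context, a minimal sketch of the record types the test constructs
# positionally. The field names below are assumptions inferred from usage
# (the third AgentReplayFrame argument is read back as .reward; the fourth is
# a boolean flag), not the project's actual definitions.
from typing import Any, Dict, List, NamedTuple


class AgentKey(NamedTuple):
    # (agent type index, per-game agent id string), e.g. AgentKey(0, '0-1')
    agent_type: int
    agent_id: str


class AgentReplayFrame(NamedTuple):
    obs: List[Any]       # observation at the current step
    action: List[Any]    # action taken (length varies by agent type)
    reward: float        # scalar reward; accessed as v.reward in the test
    done: bool           # episode-termination flag
    next_obs: List[Any]  # observation after the step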
# Third-party imports needed by run(). SummaryWriter is assumed to come from
# tensorboardX, whose writer provides export_scalars_to_json(). Project-local
# names (BaseEnv, run_setup, AttentionSAC, ReplayBuffer, preprocess_to_batch,
# AgentKey, AgentReplayFrame, BatchedAgentReplayFrame) come from the
# surrounding codebase.
import os
import time
from typing import Dict, List

import numpy as np
import torch
from tensorboardX import SummaryWriter


def run(halite_env: BaseEnv, load_latest: bool = False):
    config = halite_env.config
    model_path, run_num, run_dir, log_dir = run_setup(config.model_name, get_latest_model=load_latest)
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    # Seed both frameworks from the run number for reproducibility.
    torch.manual_seed(run_num)
    np.random.seed(run_num)

    # Build the MAAC model: fresh if no saved model was found, otherwise
    # restore it (critic included) from the checkpoint.
    if model_path is None:
        model = AttentionSAC(halite_env.agent_type_topologies,
                             tau=config.tau,
                             pi_lr=config.pi_lr,
                             q_lr=config.q_lr,
                             gamma=config.gamma,
                             pol_hidden_dim=config.pol_hidden_dim,
                             critic_hidden_dim=config.critic_hidden_dim,
                             attend_heads=config.attend_heads,
                             reward_scale=config.reward_scale)
    else:
        model = AttentionSAC.init_from_save(model_path, load_critic=True)

    # Build the replay buffer.
    replay_buffer = ReplayBuffer(config.buffer_length)

    prev_time = time.perf_counter()
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        curr_time = time.perf_counter()
        print("Episodes %i-%i of %i (%is)" % (ep_i + 1,
                                              ep_i + config.n_rollout_threads,
                                              config.n_episodes,
                                              curr_time - prev_time))

        # Collect experience on the CPU with an exploratory policy.
        model.prep_rollouts(device='cpu')
        game_reward = halite_env.simulate(lambda o: model.step(o, explore=True), replay_buffer)
        t += config.n_rollout_threads

        # Train once the buffer can fill a batch, at the configured game cadence.
        if (replay_buffer.length() >= config.batch_size
                and (t % config.games_per_update) < config.n_rollout_threads):
            print("Training")
            model.prep_training(device='gpu' if config.use_gpu else 'cpu')
            for u_i in range(config.num_updates):
                sample: List[Dict[AgentKey, AgentReplayFrame]] = replay_buffer.sample(config.batch_size)
                # Collate into per-agent batches (keys with no usable frames
                # are filtered out), moving tensors to the GPU if configured.
                batch: Dict[AgentKey, BatchedAgentReplayFrame] = preprocess_to_batch(sample, to_gpu=config.use_gpu)
                model.update_critic(batch, logger=logger)
                model.update_policies(batch, logger=logger)
                model.update_all_targets()
            model.prep_rollouts(device='cpu')

        # Log per-agent mean episode rewards and the global environment reward.
        ep_rews = replay_buffer.get_average_rewards(config.episode_length * config.n_rollout_threads)
        for k, v in ep_rews.items():
            logger.add_scalar('agent%s/mean_episode_rewards' % str(k), v, ep_i)
        logger.add_scalar("global_env_rewards", game_reward, ep_i)

        # Periodically checkpoint the model.
        if ep_i % config.save_interval < config.n_rollout_threads:
            print("Saving")
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
            print("run_dir", run_dir)

        prev_time = curr_time

    model.save(run_dir / 'model.pt')
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
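# A hypothetical config object covering every field run() reads. The field
# names come straight from the code above; the default values are illustrative
# assumptions, not the project's tuned settings.
from dataclasses import dataclass


@dataclass
class Config:
    model_name: str = 'maac_halite'
    # MAAC hyperparameters
    tau: float = 0.001
    pi_lr: float = 0.001
    q_lr: float = 0.001
    gamma: float = 0.99
    pol_hidden_dim: int = 128
    critic_hidden_dim: int = 128
    attend_heads: int = 4
    reward_scale: float = 100.0
    # Training-loop settings
    buffer_length: int = 10000
    n_episodes: int = 50000
    n_rollout_threads: int = 1
    batch_size: int = 1024
    games_per_update: int = 4
    num_updates: int = 4
    use_gpu: bool = False
    episode_length: int = 100
    save_interval: int = 1000


# Example entry point, where HaliteEnv stands in for a concrete BaseEnv
# subclass that exposes .config, .agent_type_topologies, and .simulate():
#     run(HaliteEnv(Config()), load_latest=False)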