def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        attend_tau=config.attend_tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(
        config.buffer_length,
        model.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])

    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False)
                for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads

            # t advances by n_rollout_threads per step, so this condition fires
            # once every steps_per_update environment steps, provided the buffer
            # already holds enough samples for both batch sizes.
            if (len(replay_buffer) >= max(config.pi_batch_size, config.q_batch_size)
                    and (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_critic_updates):
                    sample = replay_buffer.sample(config.q_batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                for u_i in range(config.num_pol_updates):
                    sample = replay_buffer.sample(config.pi_batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_policies(sample, logger=logger)
                model.update_all_targets()
                model.prep_rollouts(device='cpu')

        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    # Final save after the last episode, then clean up.
    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
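# --- Illustrative sketch (not part of the training script) -------------------
# The rollout loop above stores observations as a (n_rollout_threads, n_agents)
# array whose entries are per-agent observation vectors. It regroups them per
# agent for model.step() and regroups the per-agent action batches back per
# environment for env.step(). The toy shapes below are assumptions chosen only
# to demonstrate that reshuffling.
def _obs_action_reshuffle_demo():
    import numpy as np

    n_rollout_threads, n_agents, obs_dim, act_dim = 2, 3, 4, 5

    obs = np.empty((n_rollout_threads, n_agents), dtype=object)
    for e in range(n_rollout_threads):
        for a in range(n_agents):
            obs[e, a] = np.random.rand(obs_dim)  # one flat observation per agent

    # np.vstack(obs[:, i]) stacks agent i's observation from every environment
    # into one (n_rollout_threads, obs_dim) batch for that agent's policy.
    per_agent_obs = [np.vstack(obs[:, i]) for i in range(n_agents)]
    assert per_agent_obs[0].shape == (n_rollout_threads, obs_dim)

    # The policies return one (n_rollout_threads, act_dim) array per agent ...
    agent_actions = [np.random.rand(n_rollout_threads, act_dim)
                     for _ in range(n_agents)]

    # ... which the loop transposes back into per-environment action lists.
    actions = [[ac[i] for ac in agent_actions] for i in range(n_rollout_threads)]
    assert len(actions) == n_rollout_threads and len(actions[0]) == n_agents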
def simulate(self,
             model: Callable[[Dict[AgentKey, AgentObservation]], Dict[AgentKey, AgentAction]],
             buffer: ReplayBuffer) -> float:
    """Roll out one full game and push its transitions into the replay buffer.

    `model` maps each active agent's observation to an AgentAction carrying the
    index of the action it selected (typically sampled from a softmax),
    e.g. [3, 2, 3, 0] across four agents.

    Returns the mean final reward across teams.
    """
    self.environment.reset(self.team_count)
    self.board = Board(self.environment.state[0].observation,
                       self.environment.configuration)
    sim_buffer: SimulationBuffer = SimulationBuffer()

    for turn in range(200):
        # Observe: one observation per live ship (type 0) and shipyard (type 1).
        observations: Dict[HaliteKey, AgentObservation] = {}
        for ship in self.board.ships.values():
            observations[HaliteKey(0, ship.id, ship.player_id)] = AgentObservation(
                get_ship_observation(ship, self.board))
        for shipyard in self.board.shipyards.values():
            observations[HaliteKey(1, shipyard.id, shipyard.player_id)] = AgentObservation(
                get_shipyard_observation(shipyard, self.board))

        # Act: query the model, then assign the chosen action to each game object.
        actions: Dict[AgentKey, AgentAction] = model(observations)
        game_object_types = [
            list(self.board.ships.values()),
            list(self.board.shipyards.values())
        ]
        for i in range(len(game_object_types)):
            for game_object in game_object_types[i]:
                key = HaliteKey(i, game_object.id, game_object.player_id)
                action_index = actions[key].get_action_index()
                if i == 0:
                    game_object.next_action = ship_actions[action_index]
                elif i == 1:
                    game_object.next_action = shipyard_actions[action_index]
        self.board = self.board.next()

        # Calculate rewards: every agent receives its team's reward, and an
        # agent is done once it no longer exists on the next board.
        rewards_by_team: Dict[PlayerId, float] = {
            k: self.player_reward(v) for k, v in self.board.players.items()
        }
        rewards: Dict[HaliteKey, float] = {}
        dones: Dict[HaliteKey, bool] = {}
        for k in observations.keys():
            rewards[k] = rewards_by_team[k.player]
            if k.type == 0:
                dones[k] = k.id not in self.board.ships
            elif k.type == 1:
                dones[k] = k.id not in self.board.shipyards
        sim_buffer.push(observations, actions, rewards, dones)

    final_rewards_by_team: Dict[PlayerId, float] = {
        k: self.player_reward(v) for k, v in self.board.players.items()
    }

    # Push from the simulation buffer to the actual replay buffer, stitching
    # each frame's next_obs from the following frame; terminal agents and the
    # final frame get zeroed next_obs and the team's final reward.
    for i in range(len(sim_buffer.frames)):
        frame = sim_buffer.frames[i]
        next_frame = sim_buffer.frames[i + 1] if i < len(sim_buffer.frames) - 1 else None
        for k in frame.keys():
            if next_frame is None:
                frame[k].next_obs = [0] * len(frame[k].obs)
                frame[k].reward = final_rewards_by_team[k.player]
            elif k not in next_frame:
                assert frame[k].done
                frame[k].next_obs = [0] * len(frame[k].obs)
                frame[k].reward = final_rewards_by_team[k.player]
            else:
                assert not frame[k].done
                frame[k].next_obs = next_frame[k].obs
        buffer.push({k: v.build() for k, v in frame.items()})

    return sum(final_rewards_by_team.values()) / len(final_rewards_by_team)
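# --- Illustrative sketch (not part of the simulation code) -------------------
# simulate() above relies on HaliteKey, AgentObservation, AgentAction and
# SimulationBuffer, which exist elsewhere in the repository. The "*Sketch"
# classes below are hedged guesses at their shape, written only to make the
# data flow readable; they are not the actual implementations.
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass(frozen=True)
class HaliteKeySketch:
    """Identifies one agent: type 0 = ship, type 1 = shipyard."""
    type: int
    id: str
    player: int


@dataclass
class AgentObservationSketch:
    obs: List[float]


@dataclass
class AgentActionSketch:
    action_index: int

    def get_action_index(self) -> int:
        return self.action_index


@dataclass
class TransitionSketch:
    """One agent's transition within a frame; next_obs is filled in later."""
    obs: List[float]
    action_index: int
    reward: float
    done: bool
    next_obs: Optional[List[float]] = None

    def build(self):
        # Package the fields in the order a flat replay buffer might expect.
        return (self.obs, self.action_index, self.reward, self.next_obs, self.done)


class SimulationBufferSketch:
    """Accumulates one frame per game turn, keyed by agent."""

    def __init__(self):
        self.frames: List[Dict[HaliteKeySketch, TransitionSketch]] = []

    def push(self, observations, actions, rewards, dones):
        self.frames.append({
            k: TransitionSketch(obs=o.obs,
                                action_index=actions[k].get_action_index(),
                                reward=rewards[k],
                                done=dones[k])
            for k, o in observations.items()
        })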
def run(config):
    scores_window = deque(maxlen=100)
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    # transport configuration
    name = 'Materials Transport'
    conf = {
        'n_player': 2,              # number of players
        'board_width': 11,          # map width
        'board_height': 11,         # map height
        'n_cell_type': 5,           # number of cell types
        'materials': 4,             # number of material depots
        'cars': 2,                  # number of cars
        'planes': 0,                # number of planes
        'barriers': 12,             # number of fixed obstacles
        'max_step': 100,            # maximum number of steps
        'game_name': name,          # game name
        'K': 5,                     # refresh depot material amounts every K games
        'map_path': 'env/map.txt',  # path to the initial map
        'cell_range': 6,            # value range per cell dimension (tuple; a single int is converted to a tuple)
        'ob_board_width': None,     # grid width observed by each agent (tuple); None means same as the actual grid
        'ob_board_height': None,    # grid height observed by each agent (tuple); None means same as the actual grid
        'ob_cell_range': None,      # per-cell value range observed by each agent (2-D tuple); None means same as the actual grid
    }
    env = make_parallel_env_transport(config.env_id, conf,
                                      config.n_rollout_threads, config.seed,
                                      config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length,
        maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])

    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        score = 0
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')

        # Linearly anneal exploration noise over the first n_exploration_eps episodes.
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        while not env.is_terminal():
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),  # stack agent i's obs from every thread
                         requires_grad=False) for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]

            # Build the joint action: the first two players act randomly
            # (one-hot over 11 discrete actions); the two trained agents'
            # actions are appended after them.
            joint_action = []
            for i in range(2):
                player = []
                for j in range(1):
                    each = [0] * 11
                    idx = np.random.randint(11)
                    each[idx] = 1
                    player.append(each)
                joint_action.append(player)
            for m in range(2):
                joint_action.append([actions[0][m].astype(int).tolist()])

            next_obs, rewards, dones, infos = env.step(joint_action)

            # Only the trained agents' actions go into the replay buffer.
            agents_action = actions[0]
            replay_buffer.push(obs, agents_action, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads

            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()  # TODO
                maddpg.prep_rollouts(device='cpu')
            score += rewards[0][0]

        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        # Track the mean and variance of the last 100 episode scores.
        scores_window.append(score)
        reward_epi = np.mean(scores_window)
        reward_epi_var = np.var(scores_window)
        logger.add_scalar('results/reward_window_mean', reward_epi, ep_i)
        logger.add_scalar('results/reward_window_var', reward_epi_var, ep_i)
        print('\rEpisode {}\t Average Reward: {:.3f}\t Var Reward: {:.3f}'.format(
            ep_i, reward_epi, reward_epi_var))

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
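# --- Illustrative sketch (not part of the training script) -------------------
# The loop above anneals exploration noise linearly: full init_noise_scale at
# episode 0, final_noise_scale once ep_i reaches n_exploration_eps. The helper
# and the default values below are assumptions chosen only for illustration;
# the real values come from config.
def _noise_scale_sketch(ep_i, n_exploration_eps=25000,
                        init_noise_scale=0.3, final_noise_scale=0.0):
    explr_pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    return (final_noise_scale +
            (init_noise_scale - final_noise_scale) * explr_pct_remaining)

# Decays linearly, then stays at final_noise_scale.
assert abs(_noise_scale_sketch(0) - 0.30) < 1e-9
assert abs(_noise_scale_sketch(12500) - 0.15) < 1e-9
assert abs(_noise_scale_sketch(25000) - 0.00) < 1e-9
assert abs(_noise_scale_sketch(50000) - 0.00) < 1e-9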