Example #1
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = AttentionSAC.init_from_env(
        env,
        tau=config.tau,
        attend_tau=config.attend_tau,
        pi_lr=config.pi_lr,
        q_lr=config.q_lr,
        gamma=config.gamma,
        pol_hidden_dim=config.pol_hidden_dim,
        critic_hidden_dim=config.critic_hidden_dim,
        attend_heads=config.attend_heads,
        reward_scale=config.reward_scale)
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print(
            "Episodes %i-%i of %i" %
            (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        model.prep_rollouts(device='cpu')

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),
                         requires_grad=False) for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= max(config.pi_batch_size,
                                          config.q_batch_size) and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_critic_updates):
                    sample = replay_buffer.sample(config.q_batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                for u_i in range(config.num_pol_updates):
                    sample = replay_buffer.sample(config.pi_batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_policies(sample, logger=logger)
                model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                  (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
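
For context, a minimal invocation sketch for this run() function. The attribute names below are exactly the fields the function reads from config; every concrete value, including env_id and model_name, is a placeholder rather than a recommended setting.

from argparse import Namespace

if __name__ == '__main__':
    config = Namespace(
        env_id='my_env', model_name='test',  # placeholder identifiers
        n_rollout_threads=4, n_episodes=50000, episode_length=25,
        buffer_length=int(1e6), steps_per_update=100,
        pi_batch_size=1024, q_batch_size=1024,
        num_pol_updates=4, num_critic_updates=4,
        pi_lr=1e-3, q_lr=1e-3, tau=1e-3, attend_tau=2e-3, gamma=0.99,
        pol_hidden_dim=128, critic_hidden_dim=128, attend_heads=4,
        reward_scale=100., use_gpu=False, save_interval=1000)
    run(config)
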
Example #2
    def simulate(self, model: Callable[[Dict[AgentKey, AgentObservation]],
                                       Dict[AgentKey, AgentAction]],
                 buffer: ReplayBuffer) -> float:
        """
        actions: for each of the active agents, an index indicating the action they selected (probably via
        softmax) should be returned (i.e. [3, 2, 3, 0])
        Return value is in the form:
        returns rewards, dones, next observations
        """

        self.environment.reset(self.team_count)
        self.board = Board(self.environment.state[0].observation,
                           self.environment.configuration)

        sim_buffer: SimulationBuffer = SimulationBuffer()

        for turn in range(200):
            # Observe
            observations: Dict[HaliteKey, AgentObservation] = {}

            for ship in self.board.ships.values():
                key = HaliteKey(0, ship.id, ship.player_id)
                observations[key] = AgentObservation(
                    get_ship_observation(ship, self.board))

            for shipyard in self.board.shipyards.values():
                key = HaliteKey(1, shipyard.id, shipyard.player_id)
                observations[key] = AgentObservation(
                    get_shipyard_observation(shipyard, self.board))

            # Act
            actions: Dict[AgentKey, AgentAction] = model(observations)

            game_object_types = [
                list(self.board.ships.values()),
                list(self.board.shipyards.values())
            ]
            for i, game_objects in enumerate(game_object_types):
                for game_object in game_objects:
                    key = HaliteKey(i, game_object.id, game_object.player_id)
                    action_index = actions[key].get_action_index()
                    if i == 0:
                        game_object.next_action = ship_actions[action_index]
                    elif i == 1:
                        game_object.next_action = shipyard_actions[action_index]

            self.board = self.board.next()

            # Calculate rewards
            rewards_by_team: Dict[PlayerId, float] = {
                k: self.player_reward(v)
                for k, v in self.board.players.items()
            }

            rewards: Dict[HaliteKey, float] = {}
            dones: Dict[HaliteKey, bool] = {}

            for k in observations.keys():
                rewards[k] = rewards_by_team[k.player]
                if k.type == 0:
                    dones[k] = k.id not in self.board.ships
                elif k.type == 1:
                    dones[k] = k.id not in self.board.shipyards

            sim_buffer.push(observations, actions, rewards, dones)

        final_rewards_by_team: Dict[PlayerId, float] = {
            k: self.player_reward(v)
            for k, v in self.board.players.items()
        }

        # Push from sim buffer to actual replay buffer
        for i, frame in enumerate(sim_buffer.frames):
            next_frame = (sim_buffer.frames[i + 1]
                          if i + 1 < len(sim_buffer.frames) else None)
            for k in frame.keys():
                if next_frame is None:
                    frame[k].next_obs = [0] * len(frame[k].obs)
                    frame[k].reward = final_rewards_by_team[k.player]
                elif k not in next_frame:
                    assert frame[k].done
                    frame[k].next_obs = [0] * len(frame[k].obs)
                    frame[k].reward = final_rewards_by_team[k.player]
                else:
                    assert not frame[k].done
                    frame[k].next_obs = next_frame[k].obs
            buffer.push({k: v.build() for k, v in frame.items()})

        return sum(final_rewards_by_team.values()) / len(final_rewards_by_team)
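
A minimal sketch of a model callable with the signature simulate() expects. It is purely illustrative: it assumes AgentAction can be constructed directly from an action index, that ship_actions and shipyard_actions are the same lookup tables used inside simulate(), and that `simulator` in the commented call is some instance of the class above.

import random
from typing import Dict

def random_model(observations: Dict[AgentKey, AgentObservation]) -> Dict[AgentKey, AgentAction]:
    """Picks a uniformly random action index for every active agent."""
    actions: Dict[AgentKey, AgentAction] = {}
    for key in observations:
        # type 0 = ship, type 1 = shipyard (same convention as the HaliteKey usage above)
        n_actions = len(ship_actions) if key.type == 0 else len(shipyard_actions)
        actions[key] = AgentAction(random.randrange(n_actions))  # assumed index-based constructor
    return actions

# mean_reward = simulator.simulate(random_model, buffer)
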
Example #3
def run(config):
    scores_window = deque(maxlen=100)

    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    # transport configuration
    name = 'Materials Transport'
    conf = {
        'n_player': 2,  # number of players
        'board_width': 11,  # board width
        'board_height': 11,  # board height
        'n_cell_type': 5,  # number of cell types
        'materials': 4,  # number of material depots
        'cars': 2,  # number of cars
        'planes': 0,  # number of planes
        'barriers': 12,  # number of fixed obstacles
        'max_step': 100,  # maximum number of steps
        'game_name': name,  # game name
        'K': 5,  # depot material counts are refreshed every K games
        'map_path': 'env/map.txt',  # path to the initial map
        'cell_range': 6,  # value range of each dimension within a cell (tuple; a single int is auto-converted to a tuple) ##?
        'ob_board_width': None,  # board width observed by each agent (tuple); None means same as the actual board ##?
        'ob_board_height': None,  # board height observed by each agent (tuple); None means same as the actual board ##?
        'ob_cell_range': None,  # per-cell value range observed by each agent (2-D tuple); None means same as the actual board ##?
    }

    env = make_parallel_env_transport(config.env_id, conf,
                                      config.n_rollout_threads, config.seed,
                                      config.discrete_action)

    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space], [
            acsp.shape[0] if isinstance(acsp, Box) else acsp.n
            for acsp in env.action_space
        ])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        score = 0
        # print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                 ep_i + 1 + config.n_rollout_threads,
        #                                 config.n_episodes))

        obs = env.reset()  # TODO: TO CHECK
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        # for et_i in range(config.episode_length):
        while not env.is_terminal():

            # env._render()
            # rearrange observations to be per agent, and convert to torch Variable
            # print('step', et_i)
            # print(maddpg.nagents)
            torch_obs = [
                Variable(
                    torch.Tensor(np.vstack(obs[:, i])),  # stack this agent's per-thread observations vertically
                    requires_grad=False) for i in range(maddpg.nagents)
            ]

            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            ############################################
            # add
            # actions = actions.astype(int)
            ############################################
            # add: the first two entries of the joint action are random one-hot
            # vectors (length 11); the two learned (car) actions are appended after them
            joint_action = []
            for i in range(2):
                player = []
                for j in range(1):
                    each = [0] * 11
                    idx = np.random.randint(11)
                    each[idx] = 1
                    player.append(each)
                joint_action.append(player)
            for m in range(2):
                joint_action.append([actions[0][m].astype(int).tolist()])

            next_obs, rewards, dones, infos = env.step(joint_action)

            #################################
            agents_action = actions[0]  # per-agent actions from rollout thread 0
            #################################

            replay_buffer.push(obs, agents_action, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')

                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()  #TODO
                maddpg.prep_rollouts(device='cpu')

            score += rewards[0][0]

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew,
                              ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' %
                                                   (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

        scores_window.append(score)
        reward_epi = np.mean(scores_window)
        reward_epi_var = np.var(scores_window)
        logger.add_scalar('results/completion_window', reward_epi, ep_i)
        logger.add_scalar('results/completion_window_var', reward_epi_var, ep_i)
        print(
            '\r Episode {}\t Average Reward: {:.3f}\t Var Reward: {:.3f} \t '.
            format(ep_i, reward_epi, reward_epi_var))

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
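
And a minimal invocation sketch for this MADDPG variant of run(). As above, the attribute names are the fields the function reads from config; every value, including the algorithm names, is a placeholder.

from argparse import Namespace

if __name__ == '__main__':
    config = Namespace(
        env_id='my_env', model_name='test', seed=1,  # placeholder identifiers and seed
        n_training_threads=6, n_rollout_threads=1,  # the joint_action block above only uses thread 0
        discrete_action=True,
        agent_alg='MADDPG', adversary_alg='MADDPG',
        tau=0.01, lr=0.01, hidden_dim=64,
        buffer_length=int(1e6), n_episodes=25000, episode_length=100,
        batch_size=1024, steps_per_update=100,
        n_exploration_eps=25000, init_noise_scale=0.3, final_noise_scale=0.0,
        save_interval=1000)
    run(config)
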