Example #1
class RewardsTests(unittest.TestCase):
    def setUp(self):
        """Create simple environment to test rewards."""
        self._env = TGEnv('4x4-test-rewards')

    def test_reward_guard_catch_thief(self):
        """Test reward for guard catching thief."""
        _, reward, _, _ = self._env.step([NOOP, UP])

        self.assertEqual(tuple(reward), REWARDS['killed'])

    def test_reward_thief_runs_into_guard(self):
        """Test reward for thief running into a guard, stupidly."""
        _, reward, _, _ = self._env.step([DOWN, NOOP])

        self.assertEqual(tuple(reward), REWARDS['killed'])

    def test_reward_thief_finds_treasure(self):
        """Test reward for thief getting that sweet, sweet treasure."""
        _, reward, _, _ = self._env.step([UP, NOOP])

        self.assertEqual(tuple(reward), REWARDS['treasure'])

    def test_reward_out_of_time(self):
        """Test rewards for when environment time is spent."""
        # Perform no movements until the max number of steps
        done = [False]
        # TODO check this
        while not all(done):
            _, reward, done, _ = self._env.step([NOOP, NOOP])

        self.assertEqual(tuple(reward), REWARDS['out_of_time'])
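The '4x4-test-rewards' layout is not shown in this snippet. A layout along the following lines (a hypothetical sketch inferred from the three step calls above, not the actual repo config) would make the assertions hold: the treasure sits directly above the thief and the guardian directly below it.

# Hypothetical layout (not the repo's actual '4x4-test-rewards' config):
# UP takes the thief (T) onto the treasure (S), DOWN runs it into the
# guardian (G), and the guardian's UP catches the thief.
'4x4-test-rewards': [
    [_, S, _, _],
    [_, T, _, _],
    [_, G, _, _],
    [_, _, _, _],
],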
Example #2
class WallMovementsTests(unittest.TestCase):
    def setUp(self):
        """Create simple environment where thief and guardian can't move."""
        """
        '4x4-test-movements-walls': [
            [_, W, _, _],
            [W, T, W, _],
            [_, W, G, W],
            [_, _, W, _],
        ],
        """

        self._env = TGEnv('4x4-test-movements-walls')
        self._init_thief_pos = self._env._id2pos[0]
        self._init_guard_pos = self._env._id2pos[1]

    def test_move_up_to_wall(self):
        """Move up to a wall."""
        self._env.step([UP, UP])

        curr_thief_pos = self._env._id2pos[0]
        curr_guard_pos = self._env._id2pos[1]
        self.assertEqual(curr_thief_pos, self._init_thief_pos)
        self.assertEqual(curr_guard_pos, self._init_guard_pos)

    def test_move_down_to_wall(self):
        """Move down to a wall."""
        self._env.step([DOWN, DOWN])

        curr_thief_pos = self._env._id2pos[0]
        curr_guard_pos = self._env._id2pos[1]
        self.assertEqual(curr_thief_pos, self._init_thief_pos)
        self.assertEqual(curr_guard_pos, self._init_guard_pos)

    def test_move_left_to_wall(self):
        """Move left to a wall."""
        self._env.step([LEFT, LEFT])

        curr_thief_pos = self._env._id2pos[0]
        curr_guard_pos = self._env._id2pos[1]
        self.assertEqual(curr_thief_pos, self._init_thief_pos)
        self.assertEqual(curr_guard_pos, self._init_guard_pos)

    def test_move_right_to_wall(self):
        """Move up to a wall."""
        self._env.step([RIGHT, RIGHT])

        curr_thief_pos = self._env._id2pos[0]
        curr_guard_pos = self._env._id2pos[1]
        self.assertEqual(curr_thief_pos, self._init_thief_pos)
        self.assertEqual(curr_guard_pos, self._init_guard_pos)
Example #3
def perform_update(config, env: TGEnv, team_policies: List[Policy],
                   avatar_storages: List[RolloutsStorage]):
    """ Collects rollouts and updates """

    # Used to log
    total_rewards = EpisodeAccumulator(env.num_avatars)
    steps_alive = EpisodeAccumulator(env.num_avatars)
    first_step_probas = EpisodeAccumulator(env.num_avatars,
                                           max(env.num_actions))
    end_reasons = []

    # Will be filled in for each avatar when stepping the environment individually
    actions = [0] * env.num_avatars
    action_log_probs = [0] * env.num_avatars
    values = [0] * env.num_avatars

    avatar_policies = [
        team_policies[env.id2team[avatar_id]]
        for avatar_id in range(env.num_avatars)
    ]

    # Always start with a fresh env
    env_states = env.reset()  # shape: [num_avatars, *env_state_shape]
    rec_hs, rec_cs = _get_initial_recurrent_state(avatar_policies)
    next_rec_hs, next_rec_cs = rec_hs, rec_cs
    first_episode_step = True

    # Set to eval mode: batch norm cannot be computed on a batch of size 1,
    # and we pick actions one at a time
    for policy in team_policies:
        policy.controller.eval()

    # Collect rollouts
    # TODO (?): collect in parallel
    for step in range(config.num_transitions):
        # Alive at the beginning of step
        avatar_alive = env.avatar_alive.copy()

        # Run each alive avatar individually
        for avatar_id in range(env.num_avatars):
            if avatar_alive[avatar_id]:
                # Choose an action based on the policy
                team = env.id2team[avatar_id]
                policy = team_policies[team]

                action_source = policy.scheduler.pick_action_source()
                if action_source == SCRIPTED:
                    scripted_action = env.scripted_action(avatar_id)
                else:
                    scripted_action = None

                (
                    actions[avatar_id],
                    action_log_probs[avatar_id],
                    actor_logits,
                    values[avatar_id],
                    next_rec_hs[avatar_id],
                    next_rec_cs[avatar_id],
                ) = policy.pick_action_and_info(
                    env_states[avatar_id],
                    rec_hs[avatar_id],
                    rec_cs[avatar_id],
                    sampling_method=action_source,
                    externally_chosen_action=scripted_action,
                )

                if first_episode_step:
                    probas = softmax(actor_logits.detach().numpy().flatten())
                    first_step_probas.current[avatar_id, :len(probas)] = probas

        # Step the environment with one action for each avatar
        next_env_states, rewards, dones, info = env.step(actions)

        # Insert transitions for alive avatars
        for avatar_id in range(env.num_avatars):
            if avatar_alive[avatar_id]:
                storage = avatar_storages[avatar_id]
                storage.insert(
                    env_states[avatar_id],
                    actions[avatar_id],
                    action_log_probs[avatar_id],
                    values[avatar_id],
                    rewards[avatar_id],
                    dones[avatar_id],
                    rec_hs[avatar_id],
                    rec_cs[avatar_id],
                )

        total_rewards.current += rewards
        steps_alive.current += avatar_alive

        # Episode is done
        if all(dones):
            env_states = env.reset()
            rec_hs, rec_cs = _get_initial_recurrent_state(avatar_policies)

            total_rewards.episode_over()
            steps_alive.episode_over()
            first_step_probas.episode_over()
            end_reasons.append(info['end_reason'])
            first_episode_step = True

        else:
            # The states are not overwritten immediately after env.step because we store
            # the state that was used to generate the action at this time step (env_states),
            # not the one we arrive in (next_env_states)
            env_states = next_env_states
            rec_hs = next_rec_hs
            rec_cs = next_rec_cs

            first_episode_step = False

    # Compute returns for all storages
    for storage in avatar_storages:
        storage.compute_returns()

    # Report progress
    avatar_rewards = total_rewards.final_history(drop_last=True)
    avg_team_rewards = np.array(
        [
            avatar_rewards[:, mask].sum() /
            sum(mask)  # take average per avatar
            for mask in env.team_masks
        ]
    )  # shape [num_teams,]: the average reward per thief and per guardian
    relative_team_rewards = avg_team_rewards / avg_team_rewards.sum()
    for measure, policy in zip(relative_team_rewards, team_policies):
        policy.scheduler.end_iteration_report(measure)

    # Set to training mode
    for policy in team_policies:
        policy.controller.train()

    # Update policies
    losses_history = [[] for _ in range(env.num_teams)]
    for epoch in range(config.num_epochs):
        for avatar_id in range(env.num_avatars):
            team = env.id2team[avatar_id]
            policy = team_policies[team]
            storage = avatar_storages[avatar_id]

            for batch in storage.sample_batches():
                # A batch contains multidimensional env_states, rec_hs, rec_cs, actions, old_action_log_probs, returns
                losses = policy.update(*batch)
                losses_history[team].append(losses)

    # Prepare storages for the next update
    for storage in avatar_storages:
        storage.reset()

    scheduling_statuses = [
        policy.sync_scheduled_values() for policy in team_policies
    ]

    # Ignore last episode since it's most likely unfinished
    return (
        total_rewards.final_history(drop_last=True),
        steps_alive.final_history(drop_last=True),
        first_step_probas.final_history(drop_last=False),
        end_reasons,
        losses_history,
        scheduling_statuses,
    )
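A minimal sketch of how perform_update might be driven from an outer training loop. The Policy and RolloutsStorage constructor arguments and the config.num_iterations field are assumptions for illustration, not the repo's actual API:

# Hypothetical driver loop; constructor signatures and config.num_iterations are assumed
env = TGEnv('4x4-test-movements')
team_policies = [Policy(config) for _ in range(env.num_teams)]
avatar_storages = [RolloutsStorage(config) for _ in range(env.num_avatars)]

for iteration in range(config.num_iterations):
    (rewards, steps_alive, first_step_probas,
     end_reasons, losses, scheduling) = perform_update(
        config, env, team_policies, avatar_storages)
    # rewards has shape [num_finished_episodes, num_avatars]
    print(f'iteration {iteration}: mean episode reward per avatar = {rewards.mean(axis=0)}')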
Example #4
def simulate_episode(env: TGEnv, team_policies: List[Policy], sampling_method):
    """
    sampling_method: either one int or one for each team
    """
    if type(sampling_method) is int:
        sampling_method = [sampling_method] * env.num_teams

    map_history = []
    pos2id_history = []
    rewards_history = []
    actions_history = []

    avatar_policies = [
        team_policies[env.id2team[avatar_id]]
        for avatar_id in range(env.num_avatars)
    ]

    env_states = env.reset()  # shape: [num_avatars, *env_state_shape]
    rec_hs, rec_cs = _get_initial_recurrent_state(avatar_policies)
    dones = [False] * env.num_avatars
    cumulative_reward = np.zeros(env.num_avatars)

    actions = [0] * env.num_avatars
    action_log_probs = [0] * env.num_avatars

    # Set to evaluation mode
    for policy in team_policies:
        policy.controller.eval()

    while not all(dones):
        map_history.append(env._map.copy())
        pos2id_history.append(copy(env._pos2id))
        rewards_history.append(cumulative_reward.copy())

        # Alive at the beginning of step
        avatar_alive = env.avatar_alive.copy()

        # Run each alive avatar individually
        for avatar_id in range(env.num_avatars):
            if avatar_alive[avatar_id]:
                # Choose an action based on the policy
                team = env.id2team[avatar_id]
                policy = team_policies[team]

                if sampling_method[team] == SCRIPTED:
                    scripted_action = env.scripted_action(avatar_id)
                else:
                    scripted_action = None

                with torch.no_grad():
                    (
                        actions[avatar_id],
                        action_log_probs[avatar_id],
                        _,
                        _,
                        rec_hs[avatar_id],
                        rec_cs[avatar_id],
                    ) = policy.pick_action_and_info(
                        env_states[avatar_id],
                        rec_hs[avatar_id],
                        rec_cs[avatar_id],
                        sampling_method=sampling_method[team],
                        externally_chosen_action=scripted_action,
                    )
            else:
                actions[avatar_id] = DEAD

        # Step the environment with one action for each avatar
        env_states, rewards, dones, infos = env.step(actions)
        cumulative_reward += rewards
        actions_history.append([
            ACTION_IDX2SYMBOL[env._interpret_action(a, env.id2team[i])]
            for i, a in enumerate(actions)
        ])

    # Add final state as well
    map_history.append(env._map.copy())
    pos2id_history.append(copy(env._pos2id))
    rewards_history.append(cumulative_reward.copy())
    actions_history.append([ACTION_IDX2SYMBOL[DEAD]] * env.num_avatars)

    return (map_history, pos2id_history, rewards_history, actions_history,
            infos['end_reason'])
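simulate_episode lends itself to quick qualitative evaluation; below is a sketch of such a call, using the SCRIPTED sampling method since that is the only constant shown in these snippets (the repo presumably defines others for learned policies):

# Hedged usage sketch: replay one scripted episode and inspect its outcome
maps, pos2ids, rewards, actions, end_reason = simulate_episode(
    env, team_policies, sampling_method=SCRIPTED)
print(f'episode ended because: {end_reason}')
print(f'final cumulative rewards per avatar: {rewards[-1]}')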
Example #5
 def test_thieves_guardian_one_done_after_guardian_touch(self):
     """ It's still considered alive for this time step, but will be dead for next timestep """
     env = TGEnv('2x3-2thieves-guardian-treasure')
     _, _, done, _ = env.step([NOOP, NOOP, UP])
     self.assertEqual(list(done), [True, False, False])
Example #6
 def test_thieves_guardian_alive_on_treasure_touch(self):
     """ It's still considered alive for this time step, but will be dead for next timestep """
     env = TGEnv('2x3-2thieves-guardian-treasure')
     _, _, done, _ = env.step([NOOP, DOWN, NOOP])
     self.assertEqual(list(env.avatar_alive), [False, False, False])
Example #7
 def test_thieves_guardian_done_after_treasure(self):
     env = TGEnv('2x3-2thieves-guardian-treasure')
     _, _, done, _ = env.step([NOOP, DOWN, NOOP])
     self.assertEqual(list(done), [True, True, True])
Example #8
 def test_thieves_guardian_alive_after_move(self):
     env = TGEnv('2x3-2thieves-guardian-treasure')
     _, _, done, _ = env.step([NOOP, RIGHT, NOOP])
     self.assertEqual(list(env.avatar_alive), [True, True, True])
Example #9
 def test_thieves_guardian_not_done_after_move(self):
     env = TGEnv('2x3-2thieves-guardian-treasure')
     _, _, done, _ = env.step([NOOP, RIGHT, NOOP])
     self.assertEqual(list(done), [False, False, False])
Example #10
 def test_thief_guardian_done_after_treasure(self):
     env = TGEnv('2x2-thief-guardian-treasure')
     _, _, done, _ = env.step([RIGHT, NOOP])
     _, _, done, _ = env.step([DOWN, NOOP])
     self.assertEqual(list(done), [True, True])
Example #11
 def test_single_alive_on_treasure_touch(self):
     """ It's still considered alive for this time step, but will be dead for next timestep """
     env = TGEnv('2x2-thief-treasure')
     _, _, done, _ = env.step([RIGHT])
     _, _, done, _ = env.step([DOWN])
     self.assertEqual(list(env.avatar_alive), [False])
Example #12
 def test_single_done_after_treasure(self):
     env = TGEnv('2x2-thief-treasure')
     _, _, done, _ = env.step([RIGHT])
     _, _, done, _ = env.step([DOWN])
     self.assertEqual(list(done), [True])
Example #13
 def test_single_alive_after_move(self):
     env = TGEnv('2x2-thief-treasure')
     _, _, done, _ = env.step([RIGHT])
     self.assertEqual(list(env.avatar_alive), [True])
Example #14
 def test_single_not_done_after_move(self):
     env = TGEnv('2x2-thief-treasure')
     _, _, done, _ = env.step([RIGHT])
     self.assertEqual(list(done), [False])
Example #15
class BasicMovementTests(unittest.TestCase):
    def setUp(self):
        """Create simple environment to test thief and guardian movements."""
        """
        '4x4-test-movements': [
            [_, _, _, _],
            [_, T, _, _],
            [_, _, G, _],
            [_, _, _, S],
        ],

        """
        self._env = TGEnv('4x4-test-movements')
        self._init_thief_pos = self._env._id2pos[0]
        self._init_guard_pos = self._env._id2pos[1]

    def basic_movement(self, moves: List[int], expected_delta):
        """Step once with the given moves and check both avatars moved by expected_delta."""
        # Take a step
        self._env.step(moves)

        # Get the new thief and guardian positions
        curr_thief_pos = self._env._id2pos[0]
        curr_guard_pos = self._env._id2pos[1]

        # Check the new positions against expected delta + old position
        self.assertEqual(curr_thief_pos,
                         tuple(self._init_thief_pos + expected_delta))
        self.assertEqual(curr_guard_pos,
                         tuple(self._init_guard_pos + expected_delta))

    def test_move_right(self):
        """Test moving right for thief and guardian."""
        self.basic_movement([RIGHT, RIGHT], action_idx2delta[RIGHT])

    def test_move_down(self):
        """Test moving down for thief and guardian."""
        self.basic_movement([DOWN, DOWN], action_idx2delta[DOWN])

    def test_move_up(self):
        """Test moving up for thief and guardian."""
        self.basic_movement([UP, UP], action_idx2delta[UP])

    def test_move_left(self):
        """Test moving left for thief and guardian."""
        self.basic_movement([LEFT, LEFT], action_idx2delta[LEFT])

    def test_move_thief_boundary_left(self):
        """Test thief bumping into left boundary."""
        self._env.step([LEFT, NOOP])
        self._env.step([LEFT, NOOP])
        self._env.step([LEFT, NOOP])

        curr_thief_pos = self._env._id2pos[0]
        expected_pos = (1, 0)
        self.assertEqual(curr_thief_pos, expected_pos)

    def test_move_thief_boundary_up(self):
        """Test thief bumping into top boundary"""
        self._env.step([UP, NOOP])
        self._env.step([UP, NOOP])
        self._env.step([UP, NOOP])

        curr_thief_pos = self._env._id2pos[0]
        expected_pos = (0, 1)
        self.assertEqual(curr_thief_pos, expected_pos)

    def test_move_thief_boundary_down(self):
        """Test thief bumping into bottom boundary"""
        self._env.step([DOWN, NOOP])
        self._env.step([DOWN, NOOP])
        self._env.step([DOWN, NOOP])

        curr_thief_pos = self._env._id2pos[0]
        expected_pos = (3, 1)
        self.assertEqual(curr_thief_pos, expected_pos)

    def test_move_thief_boundary_right(self):
        """Test thief bumping into right boundary"""
        self._env.step([RIGHT, NOOP])
        self._env.step([RIGHT, NOOP])
        self._env.step([RIGHT, NOOP])

        curr_thief_pos = self._env._id2pos[0]
        expected_pos = (1, 3)
        self.assertEqual(curr_thief_pos, expected_pos)

    def test_move_guardian_boundary_left(self):
        """Test guardian bumping into left boundary."""
        self._env.step([NOOP, LEFT])
        self._env.step([NOOP, LEFT])
        self._env.step([NOOP, LEFT])

        curr_guardian_pos = self._env._id2pos[1]
        expected_pos = (2, 0)
        self.assertEqual(curr_guardian_pos, expected_pos)

    def test_move_guardian_boundary_up(self):
        """Test guardian bumping into top boundary"""
        self._env.step([NOOP, UP])
        self._env.step([NOOP, UP])
        self._env.step([NOOP, UP])

        curr_guardian_pos = self._env._id2pos[1]
        expected_pos = (0, 2)
        self.assertEqual(curr_guardian_pos, expected_pos)

    def test_move_guardian_boundary_down(self):
        """Test guardian bumping into bottom boundary"""
        self._env.step([NOOP, DOWN])
        self._env.step([NOOP, DOWN])
        self._env.step([NOOP, DOWN])

        curr_guardian_pos = self._env._id2pos[1]
        expected_pos = (3, 2)
        self.assertEqual(curr_guardian_pos, expected_pos)

    def test_move_guardian_boundary_right(self):
        """Test guardian bumping into right boundary"""
        self._env.step([NOOP, RIGHT])
        self._env.step([NOOP, RIGHT])
        self._env.step([NOOP, RIGHT])

        curr_guardian_pos = self._env._id2pos[1]
        expected_pos = (2, 3)
        self.assertEqual(curr_guardian_pos, expected_pos)

    def test_move_guardian_treasure(self):
        """Test guardian bumping into treasure."""
        self._env.step([NOOP, DOWN])
        self._env.step([NOOP, RIGHT])

        curr_guardian_pos = self._env._id2pos[1]
        expected_pos = (3, 2)
        self.assertEqual(curr_guardian_pos, expected_pos)
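The boundary tests above pin down the coordinate convention: positions are (row, column) tuples, and action_idx2delta maps each action to a per-axis offset that is added to the current position. Below is a sketch of the mapping consistent with those assertions; the deltas are inferred from the tests, while the integer values of the action constants and the use of numpy arrays are assumptions.

import numpy as np

# Assumed integer action indices (not shown in the snippets above)
NOOP, UP, DOWN, LEFT, RIGHT = range(5)

# Deltas inferred from the boundary tests: UP decreases the row index,
# DOWN increases it, LEFT decreases the column index, RIGHT increases it
action_idx2delta = {
    NOOP:  np.array([ 0,  0]),
    UP:    np.array([-1,  0]),
    DOWN:  np.array([ 1,  0]),
    LEFT:  np.array([ 0, -1]),
    RIGHT: np.array([ 0,  1]),
}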