class RewardsTests(unittest.TestCase):
    def setUp(self):
        """Create simple environment to test rewards."""
        self._env = TGEnv('4x4-test-rewards')

    def test_reward_guard_catch_thief(self):
        """Test reward for guard catching thief."""
        _, reward, _, _ = self._env.step([NOOP, UP])
        self.assertEqual(tuple(reward), REWARDS['killed'])

    def test_reward_thief_runs_into_guard(self):
        """Test reward for thief running into a guard, stupidly."""
        _, reward, _, _ = self._env.step([DOWN, NOOP])
        self.assertEqual(tuple(reward), REWARDS['killed'])

    def test_reward_thief_finds_treasure(self):
        """Test reward for thief getting that sweet, sweet treasure."""
        _, reward, _, _ = self._env.step([UP, NOOP])
        self.assertEqual(tuple(reward), REWARDS['treasure'])

    def test_reward_out_of_time(self):
        """Test rewards for when environment time is spent."""
        # Perform no movements until the max number of steps
        done = [False]  # TODO check this
        while not all(done):
            _, reward, done, _ = self._env.step([NOOP, NOOP])
        self.assertEqual(tuple(reward), REWARDS['out_of_time'])
class WallMovementsTests(unittest.TestCase):
    def setUp(self):
        """Create simple environment where thief and guardian can't move."""
        """
        '4x4-test-movements-walls': [
            [_, W, _, _],
            [W, T, W, _],
            [_, W, G, W],
            [_, _, W, _],
        ],
        """
        self._env = TGEnv('4x4-test-movements-walls')
        self._init_thief_pos = self._env._id2pos[0]
        self._init_guard_pos = self._env._id2pos[1]

    def test_move_up_to_wall(self):
        """Move up to a wall."""
        self._env.step([UP, UP])
        curr_thief_pos = self._env._id2pos[0]
        curr_guard_pos = self._env._id2pos[1]
        self.assertEqual(curr_thief_pos, self._init_thief_pos)
        self.assertEqual(curr_guard_pos, self._init_guard_pos)

    def test_move_down_to_wall(self):
        """Move down to a wall."""
        self._env.step([DOWN, DOWN])
        curr_thief_pos = self._env._id2pos[0]
        curr_guard_pos = self._env._id2pos[1]
        self.assertEqual(curr_thief_pos, self._init_thief_pos)
        self.assertEqual(curr_guard_pos, self._init_guard_pos)

    def test_move_left_to_wall(self):
        """Move left to a wall."""
        self._env.step([LEFT, LEFT])
        curr_thief_pos = self._env._id2pos[0]
        curr_guard_pos = self._env._id2pos[1]
        self.assertEqual(curr_thief_pos, self._init_thief_pos)
        self.assertEqual(curr_guard_pos, self._init_guard_pos)

    def test_move_right_to_wall(self):
        """Move right to a wall."""
        self._env.step([RIGHT, RIGHT])
        curr_thief_pos = self._env._id2pos[0]
        curr_guard_pos = self._env._id2pos[1]
        self.assertEqual(curr_thief_pos, self._init_thief_pos)
        self.assertEqual(curr_guard_pos, self._init_guard_pos)
def perform_update(config, env: TGEnv, team_policies: List[Policy], avatar_storages: List[RolloutsStorage]):
    """ Collects rollouts and updates the policies """
    # Used to log
    total_rewards = EpisodeAccumulator(env.num_avatars)
    steps_alive = EpisodeAccumulator(env.num_avatars)
    first_step_probas = EpisodeAccumulator(env.num_avatars, max(env.num_actions))
    end_reasons = []

    # Will be filled in for each avatar when stepping the environment individually
    actions = [0] * env.num_avatars
    action_log_probs = [0] * env.num_avatars
    values = [0] * env.num_avatars

    avatar_policies = [
        team_policies[env.id2team[avatar_id]]
        for avatar_id in range(env.num_avatars)
    ]

    # Always start with a fresh env
    env_states = env.reset()  # shape: [num_avatars, *env_state_shape]
    rec_hs, rec_cs = _get_initial_recurrent_state(avatar_policies)
    next_rec_hs, next_rec_cs = rec_hs, rec_cs
    first_episode_step = True

    # Set to eval mode because batch norm cannot be computed on a batch size of 1
    # and when we pick actions we only pick one at a time
    for policy in team_policies:
        policy.controller.eval()

    # Collect rollouts
    # TODO (?): collect in parallel
    for step in range(config.num_transitions):
        # Alive at the beginning of step
        avatar_alive = env.avatar_alive.copy()

        # Run each alive avatar individually
        for avatar_id in range(env.num_avatars):
            if avatar_alive[avatar_id]:
                # Choose action based on the policy
                team = env.id2team[avatar_id]
                policy = team_policies[team]

                action_source = policy.scheduler.pick_action_source()
                if action_source == SCRIPTED:
                    scripted_action = env.scripted_action(avatar_id)
                else:
                    scripted_action = None

                (
                    actions[avatar_id],
                    action_log_probs[avatar_id],
                    actor_logits,
                    values[avatar_id],
                    next_rec_hs[avatar_id],
                    next_rec_cs[avatar_id],
                ) = policy.pick_action_and_info(
                    env_states[avatar_id],
                    rec_hs[avatar_id],
                    rec_cs[avatar_id],
                    sampling_method=action_source,
                    externally_chosen_action=scripted_action,
                )

                if first_episode_step:
                    probas = softmax(actor_logits.detach().numpy().flatten())
                    first_step_probas.current[avatar_id, :len(probas)] = probas

        # Step the environment with one action for each avatar
        next_env_states, rewards, dones, info = env.step(actions)

        # Insert transitions for alive avatars
        for avatar_id in range(env.num_avatars):
            if avatar_alive[avatar_id]:
                storage = avatar_storages[avatar_id]
                storage.insert(
                    env_states[avatar_id],
                    actions[avatar_id],
                    action_log_probs[avatar_id],
                    values[avatar_id],
                    rewards[avatar_id],
                    dones[avatar_id],
                    rec_hs[avatar_id],
                    rec_cs[avatar_id],
                )

        total_rewards.current += rewards
        steps_alive.current += avatar_alive

        # Episode is done
        if all(dones):
            env_states = env.reset()
            rec_hs, rec_cs = _get_initial_recurrent_state(avatar_policies)
            total_rewards.episode_over()
            steps_alive.episode_over()
            first_step_probas.episode_over()
            end_reasons.append(info['end_reason'])
            first_episode_step = True
        # The states were not immediately overwritten because we store the state that was
        # used to generate the action for the current time-step (env_states), not the one
        # we arrive in (next_env_states)
        else:
            env_states = next_env_states
            rec_hs = next_rec_hs
            rec_cs = next_rec_cs
            first_episode_step = False

    # Compute returns for all storages
    for storage in avatar_storages:
        storage.compute_returns()

    # Report progress
    avatar_rewards = total_rewards.final_history(drop_last=True)
    avg_team_rewards = np.array([
        avatar_rewards[:, mask].sum() / sum(mask)  # take average per avatar
        for mask in env.team_masks
    ])  # shape [num_teams,]: the average reward the thieves got and the average the guardians got
    relative_team_rewards = avg_team_rewards / avg_team_rewards.sum()
    for measure, policy in zip(relative_team_rewards, team_policies):
        policy.scheduler.end_iteration_report(measure)

    # Set to training mode
    for policy in team_policies:
        policy.controller.train()

    # Update policies
    losses_history = [[] for _ in range(env.num_teams)]
    for epoch in range(config.num_epochs):
        for avatar_id in range(env.num_avatars):
            team = env.id2team[avatar_id]
            policy = team_policies[team]
            storage = avatar_storages[avatar_id]

            for batch in storage.sample_batches():
                # A batch contains multidimensional env_states, rec_hs, rec_cs, actions,
                # old_action_log_probs, returns
                losses = policy.update(*batch)
                losses_history[team].append(losses)

    # Prepare storages for the next update
    for storage in avatar_storages:
        storage.reset()

    scheduling_statuses = [
        policy.sync_scheduled_values()
        for policy in team_policies
    ]

    # Ignore last episode since it's most likely unfinished
    return (
        total_rewards.final_history(drop_last=True),
        steps_alive.final_history(drop_last=True),
        first_step_probas.final_history(drop_last=False),
        end_reasons,
        losses_history,
        scheduling_statuses,
    )
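
# Illustrative usage sketch (an assumption, not part of the original code): `perform_update`
# is meant to be called once per training iteration. `make_team_policies`, `make_storages`,
# `config.scenario` and `config.num_iterations` are hypothetical names used only for illustration.
def training_loop_sketch(config):
    env = TGEnv(config.scenario)                       # assumed config field
    team_policies = make_team_policies(config, env)    # hypothetical factory, one Policy per team
    avatar_storages = make_storages(config, env)       # hypothetical factory, one RolloutsStorage per avatar
    for iteration in range(config.num_iterations):     # assumed config field
        (rewards, steps_alive, first_step_probas,
         end_reasons, losses_history, scheduling_statuses) = perform_update(
            config, env, team_policies, avatar_storages)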
def simulate_episode(env: TGEnv, team_policies: List[Policy], sampling_method):
    """ sampling_method: either one int or one for each team """
    if type(sampling_method) is int:
        sampling_method = [sampling_method] * env.num_teams

    map_history = []
    pos2id_history = []
    rewards_history = []
    actions_history = []

    avatar_policies = [
        team_policies[env.id2team[avatar_id]]
        for avatar_id in range(env.num_avatars)
    ]

    env_states = env.reset()  # shape: [num_avatars, *env_state_shape]
    rec_hs, rec_cs = _get_initial_recurrent_state(avatar_policies)
    dones = [False] * env.num_avatars
    cumulative_reward = np.zeros(env.num_avatars)
    actions = [0] * env.num_avatars
    action_log_probs = [0] * env.num_avatars

    # Set to evaluation mode
    for policy in team_policies:
        policy.controller.eval()

    while not all(dones):
        map_history.append(env._map.copy())
        pos2id_history.append(copy(env._pos2id))
        rewards_history.append(cumulative_reward.copy())

        # Alive at the beginning of step
        avatar_alive = env.avatar_alive.copy()

        # Run each alive avatar individually
        for avatar_id in range(env.num_avatars):
            if avatar_alive[avatar_id]:
                # Choose action based on the policy
                team = env.id2team[avatar_id]
                policy = team_policies[team]

                if sampling_method[team] == SCRIPTED:
                    scripted_action = env.scripted_action(avatar_id)
                else:
                    scripted_action = None

                with torch.no_grad():
                    (
                        actions[avatar_id],
                        action_log_probs[avatar_id],
                        _,
                        _,
                        rec_hs[avatar_id],
                        rec_cs[avatar_id],
                    ) = policy.pick_action_and_info(
                        env_states[avatar_id],
                        rec_hs[avatar_id],
                        rec_cs[avatar_id],
                        sampling_method=sampling_method[team],
                        externally_chosen_action=scripted_action,
                    )
            else:
                actions[avatar_id] = DEAD

        # Step the environment with one action for each avatar
        env_states, rewards, dones, infos = env.step(actions)
        cumulative_reward += rewards

        actions_history.append([
            ACTION_IDX2SYMBOL[env._interpret_action(a, env.id2team[i])]
            for i, a in enumerate(actions)
        ])

    # Add final state as well
    map_history.append(env._map.copy())
    pos2id_history.append(copy(env._pos2id))
    rewards_history.append(cumulative_reward.copy())
    actions_history.append([ACTION_IDX2SYMBOL[DEAD]] * env.num_avatars)

    return map_history, pos2id_history, rewards_history, actions_history, infos['end_reason']
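
# Illustrative usage sketch (an assumption, not part of the original code): roll out one full
# episode with scripted behaviour for every team and inspect how it ended. SCRIPTED and the
# scenario name '2x2-thief-treasure' come from the surrounding code; the rest is only an example.
def evaluation_sketch(team_policies):
    env = TGEnv('2x2-thief-treasure')
    maps, pos2ids, rewards, actions, end_reason = simulate_episode(
        env, team_policies, sampling_method=SCRIPTED)
    # map_history includes the final state, so the episode lasted len(maps) - 1 steps
    print(f'Episode length: {len(maps) - 1} steps, '
          f'final cumulative rewards: {rewards[-1]}, ended because: {end_reason}')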
def test_thieves_guardian_one_done_after_guardian_touch(self):
    """ It's still considered alive for this time step, but will be dead for next timestep """
    env = TGEnv('2x3-2thieves-guardian-treasure')
    _, _, done, _ = env.step([NOOP, NOOP, UP])
    self.assertEqual(list(done), [True, False, False])
def test_thieves_guardian_alive_on_treasure_touch(self):
    """ It's still considered alive for this time step, but will be dead for next timestep """
    env = TGEnv('2x3-2thieves-guardian-treasure')
    _, _, done, _ = env.step([NOOP, DOWN, NOOP])
    self.assertEqual(list(env.avatar_alive), [False, False, False])
def test_thieves_guardian_done_after_treasure(self):
    env = TGEnv('2x3-2thieves-guardian-treasure')
    _, _, done, _ = env.step([NOOP, DOWN, NOOP])
    self.assertEqual(list(done), [True, True, True])
def test_thieves_guardian_alive_after_move(self):
    env = TGEnv('2x3-2thieves-guardian-treasure')
    _, _, done, _ = env.step([NOOP, RIGHT, NOOP])
    self.assertEqual(list(env.avatar_alive), [True, True, True])
def test_thieves_guardian_not_done_after_move(self):
    env = TGEnv('2x3-2thieves-guardian-treasure')
    _, _, done, _ = env.step([NOOP, RIGHT, NOOP])
    self.assertEqual(list(done), [False, False, False])
def test_thief_guardian_done_after_treasure(self):
    env = TGEnv('2x2-thief-guardian-treasure')
    _, _, done, _ = env.step([RIGHT, NOOP])
    _, _, done, _ = env.step([DOWN, NOOP])
    self.assertEqual(list(done), [True, True])
def test_single_alive_on_treasure_touch(self):
    """ It's still considered alive for this time step, but will be dead for next timestep """
    env = TGEnv('2x2-thief-treasure')
    _, _, done, _ = env.step([RIGHT])
    _, _, done, _ = env.step([DOWN])
    self.assertEqual(list(env.avatar_alive), [False])
def test_single_done_after_treasure(self):
    env = TGEnv('2x2-thief-treasure')
    _, _, done, _ = env.step([RIGHT])
    _, _, done, _ = env.step([DOWN])
    self.assertEqual(list(done), [True])
def test_single_alive_after_move(self):
    env = TGEnv('2x2-thief-treasure')
    _, _, done, _ = env.step([RIGHT])
    self.assertEqual(list(env.avatar_alive), [True])
def test_single_not_done_after_move(self):
    env = TGEnv('2x2-thief-treasure')
    _, _, done, _ = env.step([RIGHT])
    self.assertEqual(list(done), [False])
class BasicMovementTests(unittest.TestCase):
    def setUp(self):
        """Create simple environment to test thief and guardian movements."""
        """
        '4x4-test-movements': [
            [_, _, _, _],
            [_, T, _, _],
            [_, _, G, _],
            [_, _, _, S],
        ],
        """
        self._env = TGEnv('4x4-test-movements')
        self._init_thief_pos = self._env._id2pos[0]
        self._init_guard_pos = self._env._id2pos[1]

    def basic_movement(self, moves: [int], expected_delta: [int]):
        # Take a step
        self._env.step(moves)

        # Get the new thief and guardian positions
        curr_thief_pos = self._env._id2pos[0]
        curr_guard_pos = self._env._id2pos[1]

        # Check the new positions against expected delta + old position
        self.assertEqual(curr_thief_pos, tuple(self._init_thief_pos + expected_delta))
        self.assertEqual(curr_guard_pos, tuple(self._init_guard_pos + expected_delta))

    def test_move_right(self):
        """Test moving right for thief and guardian."""
        self.basic_movement([RIGHT, RIGHT], action_idx2delta[RIGHT])

    def test_move_down(self):
        """Test moving down for thief and guardian."""
        self.basic_movement([DOWN, DOWN], action_idx2delta[DOWN])

    def test_move_up(self):
        """Test moving up for thief and guardian."""
        self.basic_movement([UP, UP], action_idx2delta[UP])

    def test_move_left(self):
        """Test moving left for thief and guardian."""
        self.basic_movement([LEFT, LEFT], action_idx2delta[LEFT])

    def test_move_thief_boundary_left(self):
        """Test thief bumping into left boundary."""
        self._env.step([LEFT, NOOP])
        self._env.step([LEFT, NOOP])
        self._env.step([LEFT, NOOP])
        curr_thief_pos = self._env._id2pos[0]
        expected_pos = (1, 0)
        self.assertEqual(curr_thief_pos, expected_pos)

    def test_move_thief_boundary_up(self):
        """Test thief bumping into top boundary."""
        self._env.step([UP, NOOP])
        self._env.step([UP, NOOP])
        self._env.step([UP, NOOP])
        curr_thief_pos = self._env._id2pos[0]
        expected_pos = (0, 1)
        self.assertEqual(curr_thief_pos, expected_pos)

    def test_move_thief_boundary_down(self):
        """Test thief bumping into bottom boundary."""
        self._env.step([DOWN, NOOP])
        self._env.step([DOWN, NOOP])
        self._env.step([DOWN, NOOP])
        curr_thief_pos = self._env._id2pos[0]
        expected_pos = (3, 1)
        self.assertEqual(curr_thief_pos, expected_pos)

    def test_move_thief_boundary_right(self):
        """Test thief bumping into right boundary."""
        self._env.step([RIGHT, NOOP])
        self._env.step([RIGHT, NOOP])
        self._env.step([RIGHT, NOOP])
        curr_thief_pos = self._env._id2pos[0]
        expected_pos = (1, 3)
        self.assertEqual(curr_thief_pos, expected_pos)

    def test_move_guardian_boundary_left(self):
        """Test guardian bumping into left boundary."""
        self._env.step([NOOP, LEFT])
        self._env.step([NOOP, LEFT])
        self._env.step([NOOP, LEFT])
        curr_guardian_pos = self._env._id2pos[1]
        expected_pos = (2, 0)
        self.assertEqual(curr_guardian_pos, expected_pos)

    def test_move_guardian_boundary_up(self):
        """Test guardian bumping into top boundary."""
        self._env.step([NOOP, UP])
        self._env.step([NOOP, UP])
        self._env.step([NOOP, UP])
        curr_guardian_pos = self._env._id2pos[1]
        expected_pos = (0, 2)
        self.assertEqual(curr_guardian_pos, expected_pos)

    def test_move_guardian_boundary_down(self):
        """Test guardian bumping into bottom boundary."""
        self._env.step([NOOP, DOWN])
        self._env.step([NOOP, DOWN])
        self._env.step([NOOP, DOWN])
        curr_guardian_pos = self._env._id2pos[1]
        expected_pos = (3, 2)
        self.assertEqual(curr_guardian_pos, expected_pos)

    def test_move_guardian_boundary_right(self):
        """Test guardian bumping into right boundary."""
        self._env.step([NOOP, RIGHT])
        self._env.step([NOOP, RIGHT])
        self._env.step([NOOP, RIGHT])
        curr_guardian_pos = self._env._id2pos[1]
        expected_pos = (2, 3)
        self.assertEqual(curr_guardian_pos, expected_pos)

    def test_move_guardian_treasure(self):
        """Test guardian bumping into treasure."""
        self._env.step([NOOP, DOWN])
        self._env.step([NOOP, RIGHT])
        curr_guardian_pos = self._env._id2pos[1]
        expected_pos = (3, 2)
        self.assertEqual(curr_guardian_pos, expected_pos)