def test_reorganize(self):
    """Check the shape of the structure produced by ``reorganize``.

    Feeds a single player's raw trajectory (one state, one action, one
    state) together with a one-entry payoff list into ``reorganize`` and
    asserts that the array built from the result has shape ``(1, 1, 5)``.
    """
    # One player whose raw history is: state, action, state.
    raw_trajectories = [[[1, 2], 1, [4, 5]]]
    payoffs = [1]

    reorganized = reorganize(raw_trajectories, payoffs)

    observed_shape = np.array(reorganized).shape
    self.assertEqual(observed_shape, (1, 1, 5))
def run(self, is_training: bool = False, seed: int = None) -> (List[List[dict]], dict):
    """
    Play one complete game with the registered agents and collect the result.

    :param is_training: (boolean): When true, every agent acts through its
        ``step`` method; otherwise ``eval_step`` is used.
    :param seed: (int): Seed applied to both ``np.random`` and ``random``
        before the game starts. Leave it as None for a single-process
        program; assign it in a multi-process program for reproducibility.
    :return: (tuple) Tuple containing:

        (list): The trajectories after being passed through ``reorganize``.
        (list): The payoffs. Each entry corresponds to one player.
    :raises ValueError: If the environment is in single agent mode or
        human mode.

    Note: The trajectories are a 3-dimension list. The first dimension is
    for different players. The second dimension is for different
    transitions. The third dimension is for the contents of each
    transition.
    """
    blocked = self.single_agent_mode or self.human_mode
    if blocked:
        raise ValueError(
            'Run in single agent mode or human mode is not allowed.')

    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    # One (initially empty) raw history per player.
    history = [list() for _ in range(self.player_num)]

    current_state, current_player = self.init_game()
    history[current_player].append(current_state)

    while not self.is_over():
        # Pick the acting agent's decision function for this mode.
        acting_agent = self.agents[current_player]
        decide = acting_agent.step if is_training else acting_agent.eval_step
        chosen_action = decide(current_state)

        # Advance the environment, then record the action under the
        # player who took it (not the player who moves next).
        following_state, following_player = self.step(chosen_action)
        history[current_player].append(chosen_action)

        current_state, current_player = following_state, following_player

        # The new state is recorded for the next player only while the
        # game is still running; final states are appended below.
        if self.game.is_over():
            continue
        history[current_player].append(current_state)

    # Every player receives a final state once the game has ended.
    for seat in range(self.player_num):
        closing_state = self.get_state(seat)
        history[seat].append(closing_state)

    payoffs = self.get_payoffs()

    return reorganize(history, payoffs), payoffs