def create_games(origin,
                 destination,
                 num_vehicles,
                 graph,
                 max_time_step,
                 time_step_length=1.0,
                 departure_time=None):
    """Build three variants of one routing game on the given network.

    Args:
        origin: origin passed to every vehicle and to the demand entry.
        destination: destination passed to every vehicle and to the demand
            entry.
        num_vehicles: number of vehicles created for the N-player game; also
            passed as the last argument of the origin-destination demand.
        graph: network handed to both game constructors.
        max_time_step: value stored under "max_num_time_step".
        time_step_length: value stored under "time_step_length".
        departure_time: must be None; any other value is not supported yet.

    Returns:
        A tuple `(game, seq_game, mfg_game)`: the `DynamicRoutingGame`, the
        same game wrapped by `pyspiel.convert_to_turn_based`, and a
        `MeanFieldRoutingGame` built from a single origin-destination demand.

    Raises:
        NotImplementedError: if `departure_time` is not None.
    """
    if departure_time is not None:
        raise NotImplementedError("To do.")

    def game_settings():
        # A fresh dict per game so the two constructors never share state.
        return {
            "max_num_time_step": max_time_step,
            "time_step_length": time_step_length
        }

    list_of_vehicles = []
    for _ in range(num_vehicles):
        list_of_vehicles.append(
            dynamic_routing_utils.Vehicle(origin, destination))

    n_player_game = dynamic_routing.DynamicRoutingGame(
        game_settings(), network=graph, vehicles=list_of_vehicles)
    turn_based_game = pyspiel.convert_to_turn_based(n_player_game)

    # NOTE(review): the third argument (0) is presumably the departure time of
    # the demand -- confirm against OriginDestinationDemand.
    demand = dynamic_routing_utils.OriginDestinationDemand(
        origin, destination, 0, num_vehicles)
    mean_field_game = mean_field_routing_game.MeanFieldRoutingGame(
        game_settings(), network=graph, od_demand=[demand])

    return n_player_game, turn_based_game, mean_field_game
def nfsp_measure_exploitability_nonlstm(rllib_policies: List[Policy],
                                        poker_game_version: str,
                                        open_spiel_env_config: dict = None):
    """Compute the exploitability of a set of non-LSTM RLlib policies.

    Args:
        rllib_policies: RLlib policies; each one is converted to an OpenSpiel
            policy and the results are combined into one joint policy.
        poker_game_version: OpenSpiel game name passed to `pyspiel.load_game`.
        open_spiel_env_config: optional game parameters. When None, Kuhn and
            Leduc poker default to `{"players": 2}` and every other game to
            an empty dict. Values that are not already
            `pyspiel.GameParameter` are wrapped in one.

    Returns:
        The value returned by `exploitability` for the joint policy.
    """
    if open_spiel_env_config is None:
        uses_two_player_default = poker_game_version in ["kuhn_poker", "leduc_poker"]
        if uses_two_player_default:
            open_spiel_env_config = {"players": pyspiel.GameParameter(2)}
        else:
            open_spiel_env_config = {}

    # Ensure every value handed to load_game is a GameParameter.
    wrapped_config = {}
    for key, value in open_spiel_env_config.items():
        if isinstance(value, pyspiel.GameParameter):
            wrapped_config[key] = value
        else:
            wrapped_config[key] = pyspiel.GameParameter(value)
    open_spiel_env_config = wrapped_config

    openspiel_game = pyspiel.load_game(poker_game_version, open_spiel_env_config)
    if poker_game_version == "oshi_zumo":
        openspiel_game = pyspiel.convert_to_turn_based(openspiel_game)

    converted_policies = [
        openspiel_policy_from_nonlstm_rllib_policy(
            openspiel_game=openspiel_game,
            rllib_policy=policy,
            game_version=poker_game_version,
            game_parameters=open_spiel_env_config,
        )
        for policy in rllib_policies
    ]
    joint_policy = JointPlayerPolicy(game=openspiel_game, policies=converted_policies)

    # Exploitability is NashConv / num_players
    if poker_game_version == "universal_poker":
        print("Measuring exploitability for universal_poker policy. This will take a while...")
    return exploitability(game=openspiel_game, policy=joint_policy)
def _rollout_until_timeout(game_name,
                           time_limit,
                           give_up_after,
                           if_simultaneous_convert_to_turn_based=False):
    """Run uniformly random rollouts on the specified game until the time limit.

    Args:
        game_name: str, name passed to `pyspiel.load_game`.
        time_limit: In number of seconds. The limit is only checked between
            rollouts, so a rollout that has started always runs to its end
            (or to the give-up cutoff).
        give_up_after: Cuts off trajectories whose history is longer than
            this value.
        if_simultaneous_convert_to_turn_based: if the game is simultaneous and
            this boolean is true, then the game is loaded as a turn based
            game.

    Returns:
        A dict of collected statistics with the keys `game_name`,
        `ms_per_rollouts`, `ms_per_moves`, `giveups_per_rollout` and
        `time_elapsed`. A ratio whose denominator is zero (no rollout or no
        move was recorded) is reported as NaN instead of raising
        ZeroDivisionError.

    Raises:
        NotImplementedError: if the game has mean field dynamics.
    """
    game = pyspiel.load_game(game_name)
    dynamics = game.get_type().dynamics
    if dynamics == pyspiel.GameType.Dynamics.MEAN_FIELD:
        raise NotImplementedError(
            "Benchmark on mean field games is not available yet.")
    if (dynamics == pyspiel.GameType.Dynamics.SIMULTANEOUS and
            if_simultaneous_convert_to_turn_based):
        game = pyspiel.convert_to_turn_based(game)

    def random_choice(actions):
        # Players without legal actions at a simultaneous node submit action 0.
        if actions:
            return random.choice(actions)
        return 0

    def ratio(numerator, denominator):
        # Undefined statistics are reported as NaN rather than crashing.
        if not denominator:
            return float("nan")
        return numerator / denominator

    num_rollouts = 0
    num_giveups = 0
    num_moves = 0
    # perf_counter is monotonic and high resolution, so the measurement is not
    # affected by system clock adjustments.
    start = time.perf_counter()
    while not time.perf_counter() - start > time_limit:
        state = game.new_initial_state()
        while not state.is_terminal():
            if len(state.history()) > give_up_after:
                num_giveups += 1
                break
            if state.is_simultaneous_node():
                actions = [
                    random_choice(state.legal_actions(i))
                    for i in range(state.num_players())
                ]
                state.apply_actions(actions)
            else:
                action = random.choice(
                    state.legal_actions(state.current_player()))
                state.apply_action(action)
            num_moves += 1
        num_rollouts += 1
    time_elapsed = time.perf_counter() - start

    return dict(
        game_name=game_name,
        ms_per_rollouts=ratio(time_elapsed, num_rollouts) * 1000,
        ms_per_moves=ratio(time_elapsed, num_moves) * 1000,
        giveups_per_rollout=ratio(num_giveups, num_rollouts),
        time_elapsed=time_elapsed
    )
def test_game_as_turn_based(self):
    """Verify the game still simulates after conversion to turn-based form.

    Loads the Python iterated prisoner's dilemma, wraps it with
    `pyspiel.convert_to_turn_based` and runs 10 random simulations on the
    result, without serialization checks.
    """
    original_game = pyspiel.load_game("python_iterated_prisoners_dilemma")
    pyspiel.random_sim_test(
        pyspiel.convert_to_turn_based(original_game),
        num_sims=10,
        serialize=False,
        verbose=True,
    )
def test_int_mccfr_on_turn_based_game_with_exploitability(self):
    """Run outcome sampling MCCFR on the turn-based routing game.

    The solver is iterated `_NUM_ITERATION_CFR_TEST` times and NashConv is
    then computed for its average policy. The test only checks that these
    calls complete; the NashConv value itself is not asserted.
    """
    routing_game = pyspiel.load_game(
        "python_dynamic_routing(max_num_time_step=5,time_step_length=1.0)")
    turn_based_game = pyspiel.convert_to_turn_based(routing_game)
    solver = outcome_mccfr.OutcomeSamplingSolver(turn_based_game)
    for _ in range(_NUM_ITERATION_CFR_TEST):
        solver.iteration()
    exploitability.nash_conv(turn_based_game, solver.average_policy())
def test_cfr_on_turn_based_game_with_exploitability(self):
    """Run CFR on the turn-based routing game.

    The solver is updated `_NUM_ITERATION_CFR_TEST` times and NashConv is
    then computed for its average policy. The test only checks that these
    calls complete; the NashConv value itself is not asserted.
    """
    routing_game = pyspiel.load_game(
        "python_dynamic_routing(max_num_time_step=5,time_step_length=1.0)")
    turn_based_game = pyspiel.convert_to_turn_based(routing_game)
    solver = cfr.CFRSolver(turn_based_game)
    for _ in range(_NUM_ITERATION_CFR_TEST):
        solver.evaluate_and_update_policy()
    exploitability.nash_conv(turn_based_game, solver.average_policy())
def test_action_consistency_convert_to_turn_based(self):
    """Compare initial legal actions of the game and its turn-based wrapper.

    At the initial state, the legal actions the original game reports for
    the turn-based game's current player must equal the legal actions
    reported by the turn-based state itself.
    """
    original_game = pyspiel.load_game("python_dynamic_routing")
    turn_based_game = pyspiel.convert_to_turn_based(original_game)
    original_state = original_game.new_initial_state()
    turn_based_state = turn_based_game.new_initial_state()

    acting_player = turn_based_state.current_player()
    expected_actions = original_state.legal_actions(acting_player)
    actual_actions = turn_based_state.legal_actions()
    self.assertEqual(
        expected_actions,
        actual_actions,
        msg="The sequential actions are not correct.")
def psro_measure_exploitability_nonlstm(
        br_checkpoint_path_tuple_list: List[Tuple[str, str]],
        metanash_weights: List[Tuple[float, float]],
        set_policy_weights_fn: Callable,
        rllib_policies: List[Policy],
        poker_game_version: str,
        open_spiel_env_config: dict = None):
    """Compute the exploitability of a weighted mixture of checkpointed policies.

    Args:
        br_checkpoint_path_tuple_list: one tuple of checkpoint paths per
            mixture entry, indexed by player.
        metanash_weights: weights handed to
            `tabular_policies_from_weighted_policies` together with the
            converted policies.
        set_policy_weights_fn: called as
            `set_policy_weights_fn(policy, checkpoint_path=path)` to load a
            checkpoint into an RLlib policy before it is converted.
        rllib_policies: one RLlib policy per player; reused for every
            checkpoint of that player.
        poker_game_version: OpenSpiel game name passed to `pyspiel.load_game`.
        open_spiel_env_config: optional game parameters. When None, Kuhn and
            Leduc poker default to `{"players": 2}` and every other game to
            an empty dict. Values that are not already
            `pyspiel.GameParameter` are wrapped in one.

    Returns:
        The value returned by `exploitability` for the averaged joint policy.
    """
    if open_spiel_env_config is None:
        uses_two_player_default = poker_game_version in ["kuhn_poker", "leduc_poker"]
        if uses_two_player_default:
            open_spiel_env_config = {"players": pyspiel.GameParameter(2)}
        else:
            open_spiel_env_config = {}

    # Ensure every value handed to load_game is a GameParameter.
    wrapped_config = {}
    for key, value in open_spiel_env_config.items():
        if isinstance(value, pyspiel.GameParameter):
            wrapped_config[key] = value
        else:
            wrapped_config[key] = pyspiel.GameParameter(value)
    open_spiel_env_config = wrapped_config

    openspiel_game = pyspiel.load_game(poker_game_version, open_spiel_env_config)
    if poker_game_version == "oshi_zumo":
        openspiel_game = pyspiel.convert_to_turn_based(openspiel_game)

    def policy_iterable():
        # Lazily yields, per checkpoint tuple, the list of per-player
        # OpenSpiel policies, reusing cached conversions when available.
        for path_tuple in br_checkpoint_path_tuple_list:
            per_player_policies = []
            for player_index, rllib_policy in enumerate(rllib_policies):
                path = path_tuple[player_index]
                if path in _psro_tabular_policies_cache:
                    converted = _psro_tabular_policies_cache[path]
                else:
                    set_policy_weights_fn(rllib_policy, checkpoint_path=path)
                    converted = openspiel_policy_from_nonlstm_rllib_policy(
                        openspiel_game=openspiel_game,
                        rllib_policy=rllib_policy,
                        game_version=poker_game_version,
                        game_parameters=open_spiel_env_config,
                    )
                    if CACHE_PSRO_TABULAR_POLICIES:
                        _psro_tabular_policies_cache[path] = converted
                per_player_policies.append(converted)
            yield per_player_policies

    averaged_policies = tabular_policies_from_weighted_policies(
        game=openspiel_game,
        policy_iterable=policy_iterable(),
        weights=metanash_weights)
    joint_policy = JointPlayerPolicy(game=openspiel_game, policies=averaged_policies)

    # Exploitability is NashConv / num_players
    if poker_game_version == "universal_poker":
        print("Measuring exploitability for universal_poker policy. This will take a while...")
    return exploitability(game=openspiel_game, policy=joint_policy)
def test_game_as_turn_based(self):
    """Verify the game still simulates after conversion to turn-based form.

    Loads the Python dynamic routing game, wraps it with
    `pyspiel.convert_to_turn_based` and runs 10 random simulations on the
    result, without serialization checks.
    """
    original_game = pyspiel.load_game("python_dynamic_routing")
    pyspiel.random_sim_test(
        pyspiel.convert_to_turn_based(original_game),
        num_sims=10,
        serialize=False,
        verbose=True,
    )
def test_creation_of_rl_environment(self):
    """Verify an RL environment can be built from the turn-based game.

    The test passes as long as constructing `rl_environment.Environment` on
    the converted dynamic routing game does not raise.
    """
    original_game = pyspiel.load_game("python_dynamic_routing")
    turn_based_game = pyspiel.convert_to_turn_based(original_game)
    rl_environment.Environment(turn_based_game)