def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv,
                     policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode,
                     env_index: int, **kwargs):
    super().on_episode_start(worker=worker, base_env=base_env, policies=policies,
                             episode=episode, env_index=env_index, **kwargs)
    # Load the fixed average-policy weights once per worker, then flag the
    # worker so later episodes skip the (relatively expensive) reload.
    if not hasattr(worker, "avg_pol_loaded") or not worker.avg_pol_loaded:
        avg_policy = worker.policy_map["average_policy"]
        load_pure_strat(policy=avg_policy, pure_strat_spec=player_0_avg_pol_spec)
        worker.avg_pol_loaded = True
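All of the functions in this section rely on a load_pure_strat helper that is not shown in the listing. As a rough sketch only (an assumption, not the source implementation), it plausibly resolves a checkpoint path from the strategy spec's metadata, deserializes the saved weights, and sets them on the RLlib policy:

import pickle

def load_pure_strat(policy, pure_strat_spec=None, checkpoint_path=None):
    # Hedged sketch: the real helper may support more formats and metadata fields.
    # Resolve the checkpoint path from the spec's metadata if not given directly.
    if checkpoint_path is None:
        checkpoint_path = pure_strat_spec.metadata["checkpoint_path"]
    with open(checkpoint_path, "rb") as f:
        weights = pickle.load(f)  # assumed serialization format
    # Standard RLlib Policy API for overwriting a policy's weights in place.
    policy.set_weights(weights)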
def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv,
                     policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode,
                     env_index: int, **kwargs):
    super().on_episode_start(worker=worker, base_env=base_env, policies=policies,
                             episode=episode, env_index=env_index, **kwargs)
    # Sample a fresh pure strategy from the opponent distribution at the start
    # of every episode and load its weights into the metanash policy.
    metanash_policy = worker.policy_map["metanash"]
    load_pure_strat(policy=metanash_policy,
                    pure_strat_spec=opponent_policy_distribution.sample_policy_spec())
def on_episode_start(self, *, worker: "RolloutWorker", base_env: BaseEnv,
                     policies: Dict[PolicyID, Policy], episode: MultiAgentEpisode,
                     env_index: int, **kwargs):
    super().on_episode_start(worker=worker, base_env=base_env, policies=policies,
                             episode=episode, env_index=env_index, **kwargs)
    # Same resampling as above, but load weights directly from the sampled
    # spec's checkpoint path, rewriting the "deploy" path segment to a local
    # user path.
    metanash_policy = worker.policy_map["metanash"]
    load_pure_strat(
        policy=metanash_policy,
        checkpoint_path=opponent_policy_distribution.sample_policy_spec()
        .metadata["checkpoint_path"].replace("deploy", "jblanier"))
def on_episode_start(self, *, worker: RolloutWorker, base_env: BaseEnv,
                     policies: Dict[str, Policy], episode: MultiAgentEpisode,
                     env_index: int, **kwargs):
    # Sample new pure-strategy policy weights from the opponent strategy
    # distribution for the best response to train against. For better runtime
    # performance, this function can be modified to load new weights only
    # every few episodes instead.
    resample_pure_strat_every_n_episodes = 1
    metanash_policy: Policy = policies["metanash"]
    opponent_policy_distribution: PolicySpecDistribution = worker.opponent_policy_distribution
    time_for_resample = (not hasattr(metanash_policy, "episodes_since_resample") or
                         metanash_policy.episodes_since_resample >= resample_pure_strat_every_n_episodes)
    if time_for_resample and opponent_policy_distribution is not None:
        new_pure_strat_spec: StrategySpec = opponent_policy_distribution.sample_policy_spec()
        # noinspection PyTypeChecker
        load_pure_strat(policy=metanash_policy, pure_strat_spec=new_pure_strat_spec)
        metanash_policy.episodes_since_resample = 1
    elif opponent_policy_distribution is not None:
        metanash_policy.episodes_since_resample += 1
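For context, here is a minimal sketch of how an on_episode_start callback like the one above is typically attached in the RLlib versions this code targets. The class name BestResponseCallbacks, the PPOTrainer choice, and the env name are assumptions, not taken from the source:

from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.agents.ppo import PPOTrainer


class BestResponseCallbacks(DefaultCallbacks):
    def on_episode_start(self, *, worker, base_env, policies, episode,
                         env_index, **kwargs):
        super().on_episode_start(worker=worker, base_env=base_env,
                                 policies=policies, episode=episode,
                                 env_index=env_index, **kwargs)
        # ... opponent resampling logic as shown above ...


trainer = PPOTrainer(config={
    "env": "poker_multiagent_env",  # hypothetical registered env name
    "callbacks": BestResponseCallbacks,
    # "multiagent": {...}  # policy IDs "best_response" / "metanash" go here
})

Note that the worker.opponent_policy_distribution attribute read by the callback would have to be set on each rollout worker beforehand, e.g., with the foreach_worker pattern shown at the end of this section.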
def run_poker_evaluation_loop(scenario_name: str, eval_dispatcher_port: int,
                              eval_dispatcher_host: str):
    scenario: PSROScenario = scenario_catalog.get(scenario_name=scenario_name)
    if not isinstance(scenario, PSROScenario):
        raise TypeError(f"Only instances of {PSROScenario} can be used here. "
                        f"{scenario.name} is a {type(scenario)}.")

    eval_dispatcher = RemoteEvalDispatcherClient(port=eval_dispatcher_port,
                                                 remote_server_host=eval_dispatcher_host)
    env = scenario.env_class(env_config=scenario.env_config)
    num_players = 2

    trainer_config = scenario.get_trainer_config(env)
    trainer_config["explore"] = scenario.allow_stochastic_best_responses

    policies = [scenario.policy_classes["eval"](env.observation_space,
                                                env.action_space,
                                                with_common_config(trainer_config))
                for _ in range(num_players)]

    while True:
        policy_specs_for_each_player, required_games_to_play = eval_dispatcher.take_eval_job()

        if policy_specs_for_each_player is None:
            time.sleep(2)
        else:
            if len(policy_specs_for_each_player) != 2:
                raise NotImplementedError(f"This evaluation code only supports two-player games. "
                                          f"{len(policy_specs_for_each_player)} players were requested.")

            # print(f"Got eval matchup:")
            # for spec in policy_specs_for_each_player:
            #     print(f"spec: {spec.to_json()}")

            for policy, spec in zip(policies, policy_specs_for_each_player):
                load_pure_strat(policy=policy, pure_strat_spec=spec)

            total_payoffs_per_player = np.zeros(shape=num_players, dtype=np.float64)

            # max_reward = None
            # min_reward = None
            # time_since_last_output = time.time()

            for game in range(required_games_to_play):
                # if game % 1000 == 0:
                #     now = time.time()
                #     print(f"{policy_specs_for_each_player[0].id} vs "
                #           f"{policy_specs_for_each_player[1].id}: "
                #           f"{game}/{required_games_to_play} games played, "
                #           f"{now - time_since_last_output} seconds")
                #     time_since_last_output = now
                payoffs_per_player_this_episode = run_episode(
                    env=env, policies_for_each_player=policies)
                total_payoffs_per_player += payoffs_per_player_this_episode

                # if max_reward is None or max(payoffs_per_player_this_episode) > max_reward:
                #     max_reward = max(payoffs_per_player_this_episode)
                # if min_reward is None or min(payoffs_per_player_this_episode) < min_reward:
                #     min_reward = min(payoffs_per_player_this_episode)

            payoffs_per_player = total_payoffs_per_player / required_games_to_play

            print(f"payoffs per player: "
                  f"{policy_specs_for_each_player[0].id} vs "
                  f"{policy_specs_for_each_player[1].id}: "
                  f"{payoffs_per_player}")

            eval_dispatcher.submit_eval_job_result(
                policy_specs_for_each_player_tuple=policy_specs_for_each_player,
                payoffs_for_each_player=payoffs_per_player,
                games_played=required_games_to_play,
            )
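The run_episode helper called above is not shown in this listing. A hedged sketch of what it plausibly does, assuming integer agent IDs (0 and 1) that index policies_for_each_player, RLlib's MultiAgentEnv reset/step conventions, and a recent-enough RLlib where Policy.compute_single_action is available:

import numpy as np

def run_episode(env, policies_for_each_player):
    # Sketch only (an assumption, not the source implementation).
    obs = env.reset()
    payoffs = np.zeros(shape=len(policies_for_each_player), dtype=np.float64)
    dones = {"__all__": False}
    while not dones["__all__"]:
        # In a turn-based env, only the agents whose turn it is appear in obs.
        actions = {}
        for agent_id, agent_obs in obs.items():
            action, _, _ = policies_for_each_player[agent_id].compute_single_action(obs=agent_obs)
            actions[agent_id] = action
        obs, rewards, dones, infos = env.step(actions)
        # Accumulate each player's reward into their total episode payoff.
        for agent_id, reward in rewards.items():
            payoffs[agent_id] += reward
    return payoffs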
def _set_br_initial_weights(worker: RolloutWorker):
    # Warm-start the best-response policy on this worker from the previous
    # best response's checkpoint.
    br_policy = worker.policy_map["best_response"]
    load_pure_strat(policy=br_policy, checkpoint_path=previous_br_checkpoint_path)
def _set_worker_metanash(worker: RolloutWorker):
    # Load the other player's current metanash strategy spec into this worker's
    # "metanash" policy, if one has been computed yet. metanash_specs_for_players
    # and other_player come from the enclosing scope.
    if metanash_specs_for_players is not None:
        metanash_policy = worker.policy_map["metanash"]
        metanash_strategy_spec: StrategySpec = metanash_specs_for_players[other_player]
        load_pure_strat(policy=metanash_policy, pure_strat_spec=metanash_strategy_spec)
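Setter functions like _set_br_initial_weights and _set_worker_metanash are typically broadcast to every rollout worker through RLlib's WorkerSet. A usage sketch under that assumption (the trainer variable name is hypothetical):

# Runs the given function once on the local worker and once on each remote
# rollout worker, so every copy of the policies receives the new weights.
trainer.workers.foreach_worker(_set_br_initial_weights)
trainer.workers.foreach_worker(_set_worker_metanash)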