def on_train_result(self, *, trainer, result: dict, **kwargs):
    """RLlib callback hook run after every training iteration.

    Tags the result dict with the scenario name and, on iteration 1 and
    every ``checkpoint_every_n_iters`` iterations thereafter, checkpoints
    each player's best-response policy and writes a matching StrategySpec
    JSON file alongside it.

    Args:
        trainer: The RLlib trainer that produced ``result``.
        result: Mutable training-result dict for this iteration; mutated
            in place (``scenario_name`` key is added).
        **kwargs: Forwarded untouched to the superclass hook.
    """
    result["scenario_name"] = trainer.scenario_name
    training_iteration = result["training_iteration"]
    super().on_train_result(trainer=trainer, result=result, **kwargs)

    # Guard the truthiness of checkpoint_every_n_iters before the modulo:
    # a 0/None value would otherwise raise ZeroDivisionError. This matches
    # the `if checkpoint_every_n_iters and ...` convention used by the
    # sibling average-policy callback in this file.
    if checkpoint_every_n_iters and (
            training_iteration % checkpoint_every_n_iters == 0
            or training_iteration == 1):
        for player in range(2):
            checkpoint_metadata = create_metadata_with_new_checkpoint(
                policy_id_to_save=f"best_response_{player}",
                br_trainer=trainer,
                policy_player=player,
                save_dir=checkpoint_dir(trainer=trainer),
                timesteps_training=result["timesteps_total"],
                episodes_training=result["episodes_total"],
                checkpoint_name=f"best_response_player_{player}_iter_{training_iteration}.h5",
            )
            joint_pol_checkpoint_spec = StrategySpec(
                strategy_id=f"best_response_player_{player}_iter_{training_iteration}",
                metadata=checkpoint_metadata)
            checkpoint_path = os.path.join(
                spec_checkpoint_dir(trainer),
                f"best_response_player_{player}_iter_{training_iteration}.json")
            ensure_dir(checkpoint_path)
            # Plain "w" (create/truncate) replaces the unconventional "+w"
            # mode; the spec file is only ever written here, never read back.
            with open(checkpoint_path, "w") as checkpoint_spec_file:
                checkpoint_spec_file.write(joint_pol_checkpoint_spec.to_json())
def on_train_result(self, *, trainer, result: dict, **kwargs):
    """RLlib callback hook run after every training iteration.

    Annotates the result dict with the scenario name and the mean
    best-response reward, optionally measures OpenSpiel exploitability of
    the two average policies, and periodically checkpoints each player's
    average policy together with a StrategySpec JSON file.

    Args:
        trainer: The RLlib trainer that produced ``result``.
        result: Mutable training-result dict for this iteration; several
            keys are added in place.
        **kwargs: Forwarded untouched to the superclass hook.
    """
    super().on_train_result(trainer=trainer, result=result, **kwargs)
    result["scenario_name"] = trainer.scenario_name
    # Mean BR reward lives in a remote deque actor shared across workers.
    result["avg_br_reward_both_players"] = ray.get(
        trainer.avg_br_reward_deque.get_mean.remote())
    training_iteration = result["training_iteration"]

    measure_exploitability = calculate_openspiel_metanash and (
        training_iteration == 1
        or training_iteration % calc_metanash_every_n_iters == 0)
    if measure_exploitability:
        base_env = _create_env()
        local_worker = trainer.workers.local_worker()
        avg_policies = [
            local_worker.policy_map["average_policy_0"],
            local_worker.policy_map["average_policy_1"],
        ]
        exploitability = nfsp_measure_exploitability_nonlstm(
            rllib_policies=avg_policies,
            poker_game_version=base_env.game_version,
            open_spiel_env_config=base_env.open_spiel_env_config,
        )
        result["avg_policy_exploitability"] = exploitability
        logger.info(colored(
            f"(Graph this in a notebook) Exploitability: {exploitability} - Saving exploitability stats "
            f"to {os.path.join(trainer.logdir, 'result.json')}", "green"))

    save_checkpoints = checkpoint_every_n_iters and (
        training_iteration % checkpoint_every_n_iters == 0
        or training_iteration == 1)
    if save_checkpoints:
        for player in range(2):
            metadata = create_metadata_with_new_checkpoint(
                policy_id_to_save=f"average_policy_{player}",
                br_trainer=br_trainer,
                save_dir=checkpoint_dir(trainer=br_trainer),
                timesteps_training=result["timesteps_total"],
                episodes_training=result["episodes_total"],
                checkpoint_name=f"average_policy_player_{player}_iter_{training_iteration}.h5",
            )
            spec = StrategySpec(
                strategy_id=f"avg_pol_player_{player}_iter_{training_iteration}",
                metadata=metadata)
            spec_path = os.path.join(
                spec_checkpoint_dir(br_trainer),
                f"average_policy_player_{player}_iter_{training_iteration}.json")
            ensure_dir(spec_path)
            with open(spec_path, "+w") as spec_file:
                spec_file.write(spec.to_json())