def main(path: str, name: str):
    task = generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    # sort_fn = lambda x: int(x.split('_')[-1][:-3])  # ExIt
    sort_fn = lambda x: int(x.split('/')[-1].split('_')[0])  # PPO test training
    sorted_population = load_population_from_path(path=path, sort_fn=sort_fn)
    for agent in sorted_population:
        print(agent.algorithm.num_updates)
        agent.requires_environment_model = False
        agent.training = False

    winrate_matrix = compute_winrate_matrix_metagame(
        population=sorted_population,
        episodes_per_matchup=1000,
        task=task)
    maxent_nash, nash_averaging = compute_nash_averaging(
        winrate_matrix, perform_logodds_transformation=True)
    winrate_matrix = np.array(winrate_matrix)

    print('Saving winrate_matrix, max-entropy Nash equilibrium for game '
          'defined by winrate matrix and Nash averaging')
    np.savetxt(f'{name}_winrate_matrix.csv', winrate_matrix, delimiter=', ')
    np.savetxt(f'{name}_maxent_nash.csv', maxent_nash, delimiter=', ')
    np.savetxt(f'{name}_nash_averaging.csv', nash_averaging, delimiter=', ')

    ax = plot_winrate_matrix(winrate_matrix)
    plt.show()
def main(population: List, name: str, num_stack: int):
    # task = generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    task = generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION,
                         wrappers=create_wrapper(num_stack=num_stack))

    winrate_matrix = compute_winrate_matrix_metagame(
        population=population,
        episodes_per_matchup=200,
        num_envs=-1,
        task=task,
        is_game_symmetrical=False,
        show_progress=True)
    maxent_nash, nash_averaging = compute_nash_averaging(
        winrate_matrix, perform_logodds_transformation=True)
    winrate_matrix = np.array(winrate_matrix)

    print('Saving winrate_matrix, max-entropy Nash equilibrium for game '
          'defined by winrate matrix and Nash averaging')
    np.savetxt(f'{name}/winrate_matrix.csv', winrate_matrix, delimiter=', ')
    np.savetxt(f'{name}/maxent_nash.csv', maxent_nash, delimiter=', ')
    np.savetxt(f'{name}/nash_averaging.csv', nash_averaging, delimiter=', ')

    ax = plot_winrate_matrix(winrate_matrix)
    plt.show()
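# Hypothetical sketch of the `create_wrapper` helper referenced above; the
# real codebase defines its own version. This assumes the task's `wrappers`
# argument takes a list of callables applied to the underlying gym
# environment, and uses gym's FrameStack wrapper to stack the last
# `num_stack` observations.
from functools import partial
import gym

def create_wrapper(num_stack: int):
    return [partial(gym.wrappers.FrameStack, num_stack=num_stack)]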
def compute_optimality_metrics(population, task, benchmarking_episodes, logger):
    logger.info('Computing winrate matrix')
    winrate_matrix_start_time = time.time()
    winrate_matrix = compute_winrate_matrix_metagame(
        population, task=task, episodes_per_matchup=benchmarking_episodes)
    winrate_submatrices = [
        winrate_matrix[:i, :i] for i in range(1, len(winrate_matrix) + 1)
    ]
    winrate_matrix_total_time = time.time() - winrate_matrix_start_time
    logger.info('Computing winrate matrix took: {:.2f} seconds'.format(
        winrate_matrix_total_time))

    nash_averaging_start_time = time.time()
    logger.info('Computing nash averagings for all submatrices')
    evolution_maxent_nash_and_nash_averaging = [
        compute_nash_averaging(m, perform_logodds_transformation=True)
        for m in winrate_submatrices
    ]
    nash_averaging_total_time = time.time() - nash_averaging_start_time
    logger.info('Computing nash averagings for all submatrices took: '
                '{:.2f} seconds'.format(nash_averaging_total_time))
    return winrate_submatrices, evolution_maxent_nash_and_nash_averaging
def compute_optimality_metrics(population, task, benchmarking_episodes, logger):
    logger.info('Computing winrate matrix')
    winrate_matrix = compute_winrate_matrix_metagame(
        population, task=task, episodes_per_matchup=benchmarking_episodes)
    winrate_submatrices = [
        winrate_matrix[:i, :i] for i in range(1, len(winrate_matrix) + 1)
    ]
    logger.info('Computing nash averagings for all submatrices')
    evolution_maxent_nash_and_nash_averaging = [
        compute_nash_averaging(m, perform_logodds_transformation=True)
        for m in winrate_submatrices
    ]
    return winrate_submatrices, evolution_maxent_nash_and_nash_averaging
def compute_progression_of_nash_averagings(winrate_matrix: np.ndarray):
    '''
    Computes the max-entropy Nash equilibrium of every leading principal
    submatrix of :param winrate_matrix:, zero-padding each equilibrium to
    the full population size. Stacking them yields a lower triangular matrix.
    '''
    maxent_nashes = [compute_nash_averaging(winrate_matrix[:i, :i],
                                            perform_logodds_transformation=True)[0]
                     for i in range(1, winrate_matrix.shape[0] + 1)]
    for max_ent in maxent_nashes:
        max_ent.resize(winrate_matrix.shape[0], refcheck=False)
    return np.stack(maxent_nashes)
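# Usage sketch for compute_progression_of_nash_averagings, with illustrative
# (made-up) winrates. Row i holds the maxent Nash over the first i+1 policies,
# zero-padded to the full population size, hence the lower triangular shape.
toy_winrate_matrix = np.array([[0.5, 0.3, 0.2],
                               [0.7, 0.5, 0.4],
                               [0.8, 0.6, 0.5]])
nash_progression = compute_progression_of_nash_averagings(toy_winrate_matrix)
assert np.allclose(nash_progression, np.tril(nash_progression))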
def __init__(self, task: Task,
             meta_game_solver: Callable = lambda winrate_matrix: compute_nash_averaging(
                 winrate_matrix, perform_logodds_transformation=True)[0],
             threshold_best_response: float = 0.7,
             benchmarking_episodes: int = 10,
             match_outcome_rolling_window_size: int = 10):
    '''
    :param task: Multiagent task
    :param meta_game_solver: Function which takes a meta-game and returns a
                             probability distribution over the policies in
                             the meta-game. Default uses the maxent-Nash
                             equilibrium of the logodds transformation of the
                             winrate_matrix metagame.
    :param threshold_best_response: Winrate threshold after which the agent
                             being trained is considered to have converged to
                             a best response against the current meta-game
                             solution.
    :param benchmarking_episodes: Number of episodes that will be used to
                             compute winrates to fill the metagame.
    :param match_outcome_rolling_window_size: Number of episodes that will be
                             used to decide whether the currently training
                             agent has converged to a best response.
    '''
    self.name = f'PSRO(M=maxentNash,O=BestResponse(wr={threshold_best_response},ws={match_outcome_rolling_window_size}))'
    self.logger = logging.getLogger(self.name)
    self.logger.setLevel(logging.INFO)

    self.check_parameter_validity(task, threshold_best_response,
                                  benchmarking_episodes,
                                  match_outcome_rolling_window_size)
    self.task = task
    self.meta_game_solver = meta_game_solver
    self.meta_game, self.meta_game_solution = None, None
    self.menagerie = []

    self.threshold_best_response = threshold_best_response
    self.match_outcome_rolling_window = []
    self.match_outcome_rolling_window_size = match_outcome_rolling_window_size

    self.benchmarking_episodes = benchmarking_episodes
    self.statistics = [self.IterationStatistics(0, 0, 0, [0], np.nan)]
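# The `meta_game_solver` contract above only requires mapping a winrate
# matrix to a probability distribution over the policies. A hypothetical
# drop-in alternative to the default maxent-Nash solver is a uniform
# meta-solver, which samples every policy in the menagerie equally often:
uniform_meta_game_solver = lambda winrate_matrix: \
    np.full(len(winrate_matrix), 1. / len(winrate_matrix))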
def single_experiment(task: Task, agents: List,
                      selfplay_schemes: List[SelfPlayTrainingScheme],
                      checkpoint_at_iterations: List[int],
                      base_path: str, seed: int,
                      benchmarking_episodes: int):
    trained_agent_paths = []
    for sp_scheme in selfplay_schemes:
        for agent in agents:
            training_agent = agent.clone(training=True)
            path = f'{base_path}/{sp_scheme.name}-{agent.name}'
            trained_agent_paths += [path]
            train_and_evaluate(task=task,
                               self_play_scheme=sp_scheme,
                               training_agent=training_agent,
                               checkpoint_at_iterations=checkpoint_at_iterations,
                               benchmarking_episodes=benchmarking_episodes,
                               base_path=path,
                               seed=seed)
            # Self-play schemes like PSRO contain useful information
            dill.dump(sp_scheme, open(f'{path}/{sp_scheme.name}.pickle', 'wb'))

    logging.info('Computing relative performances')
    relative_performances_path = f'{base_path}/relative_performances/'
    if not os.path.exists(relative_performances_path):
        os.mkdir(relative_performances_path)
    compute_relative_pop_performance_all_populations(
        trained_agent_paths, task, benchmarking_episodes,
        base_path=relative_performances_path)

    logging.info('Loading all trained agents')
    joint_trained_population = reduce(
        lambda succ, path: succ + load_population_from_path(path),
        trained_agent_paths, [])

    logging.info('START winrate matrix computation of all trained policies')
    final_winrate_matrix = compute_winrate_matrix_metagame(
        joint_trained_population, episodes_per_matchup=5, task=task)
    logging.info('START Nash averaging computation of all trained policies')
    maxent_nash, nash_avg = compute_nash_averaging(
        final_winrate_matrix, perform_logodds_transformation=True)

    logging.info('Experiment FINISHED!')
    dill.dump(final_winrate_matrix,
              open(f'{base_path}/final_winrate_matrix.pickle', 'wb'))
    dill.dump(maxent_nash, open(f'{base_path}/final_maxent_nash.pickle', 'wb'))
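# Recovery sketch: the artefacts dumped above can be loaded back for offline
# analysis with dill, assuming `base_path` points at the experiment directory
# (paths mirror the dumps in single_experiment).
import dill

with open(f'{base_path}/final_winrate_matrix.pickle', 'rb') as f:
    final_winrate_matrix = dill.load(f)
with open(f'{base_path}/final_maxent_nash.pickle', 'rb') as f:
    final_maxent_nash = dill.load(f)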
def test_for_none_game_raises_valueerror():
    with pytest.raises(ValueError) as _:
        _ = compute_nash_averaging(None)


def test_for_non_antisymmetric_matrix_raises_valueerror():
    random_winrate_matrix = [[0.5, 0.2], [0.8, 0.5]]
    with pytest.raises(ValueError) as _:
        _ = compute_nash_averaging(random_winrate_matrix)


def test_for_non_integer_or_float_list_raises_valueerror():
    with pytest.raises(ValueError) as _:
        _ = compute_nash_averaging([['a', 'b']])


def test_for_empty_numpy_array_game_raises_valueerror():
    with pytest.raises(ValueError) as _:
        _ = compute_nash_averaging(np.array(None))
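# Additional sanity-check sketch (not in the original suite), relying only on
# the (maxent_nash, nash_averaging) return convention used above: a uniform
# winrate matrix gives no grounds to distinguish policies, so the max-entropy
# Nash equilibrium should be uniform.
def test_for_uniform_winrate_matrix_yields_uniform_maxent_nash():
    uniform_winrate_matrix = np.full((3, 3), 0.5)
    maxent_nash, _ = compute_nash_averaging(
        uniform_winrate_matrix, perform_logodds_transformation=True)
    np.testing.assert_allclose(maxent_nash, np.full(3, 1. / 3))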