def test_cfr_cce_dist_goofspiel(self):
  """Copy of the TestCCEDistCFRGoofSpiel in corr_dist_test.cc."""
  game = pyspiel.load_game(
      "turn_based_simultaneous_game(game=goofspiel(num_cards=3,points_order="
      "descending,returns_type=total_points))")
  for num_iterations in [1, 10, 100]:
    policies = []
    cfr_solver = cfr.CFRSolver(game)
    for _ in range(num_iterations):
      cfr_solver.evaluate_and_update_policy()
      policies.append(
          policy.python_policy_to_pyspiel_policy(cfr_solver.current_policy()))
    mu = pyspiel.uniform_correlation_device(policies)
    cce_dist1 = pyspiel.cce_dist(game, mu)
    print("goofspiel, cce test num_iterations: {}, cce_dist: {}".format(
        num_iterations, cce_dist1))
    # Assemble the same correlation device manually, as an example of how to
    # build non-uniform distributions and to test that the Python bindings
    # for lists of tuples work properly.
    uniform_prob = 1.0 / len(policies)
    mu2 = [(uniform_prob, p) for p in policies]
    cce_dist2 = pyspiel.cce_dist(game, mu2)
    self.assertAlmostEqual(cce_dist1, cce_dist2)

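# A minimal sketch (not in the original test) of a non-uniform correlation
# device, following the same list-of-(probability, policy) convention as mu2
# above; the weights here are illustrative and only need to sum to 1.
def weighted_correlation_device(policies):
  weights = np.arange(1, len(policies) + 1, dtype=float)
  weights /= weights.sum()  # Normalize so the device is a distribution.
  return list(zip(weights, policies))
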
def policy_bots():
  """Returns a Python policy bot and an equivalent C++ policy bot.

  Both bots follow the same uniform random policy: player 0 via the Python
  PolicyBot, player 1 via pyspiel.make_policy_bot over the converted
  tabular policy.
  """
  random_policy = policy.UniformRandomPolicy(GAME)
  py_bot = PolicyBot(0, np.random.RandomState(4321), random_policy)
  cpp_bot = pyspiel.make_policy_bot(
      GAME, 1, 1234,
      policy.python_policy_to_pyspiel_policy(random_policy.to_tabular()))
  return [py_bot, cpp_bot]

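# A hedged usage sketch for policy_bots(): play a single episode with the two
# bots, assuming `from open_spiel.python.algorithms import evaluate_bots` and
# that GAME is a two-player game. Returns one terminal return per player.
def play_policy_bots_once():
  bots = policy_bots()
  return evaluate_bots.evaluate_bots(
      GAME.new_initial_state(), bots, np.random.RandomState(7))
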
def test_cfr_plus_solver_best_response_mdp(self):
  game = pyspiel.load_game("kuhn_poker")
  cfr_solver = cfr.CFRPlusSolver(game)
  for _ in range(200):
    cfr_solver.evaluate_and_update_policy()
  average_policy = cfr_solver.average_policy()
  pyspiel_avg_policy = policy.python_policy_to_pyspiel_policy(average_policy)
  br_computer = pyspiel.TabularBestResponseMDP(game, pyspiel_avg_policy)
  br_info = br_computer.exploitability()
  self.assertLessEqual(br_info.exploitability, 0.001)

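# A cross-check sketch, not part of the test above: the same average policy
# can also be scored with the pure-Python exploitability computation,
# assuming `from open_spiel.python.algorithms import exploitability`. For a
# two-player zero-sum game like kuhn_poker the result should agree with
# br_info.exploitability up to numerical error.
def python_exploitability(game, average_policy):
  return exploitability.exploitability(game, average_policy)
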
def test_record_batched_trajectories(self):
  for game_name in ["kuhn_poker", "leduc_poker", "liars_dice"]:
    game = pyspiel.load_game(game_name)
    python_policy = policy.TabularPolicy(game)
    tabular_policy = policy.python_policy_to_pyspiel_policy(python_policy)
    policies = [tabular_policy] * 2
    # We test that we can create a batch of trajectories.
    seed = 0
    batch_size = 128
    include_full_observations = False
    pyspiel.record_batched_trajectories(game, policies,
                                        python_policy.state_lookup,
                                        batch_size, include_full_observations,
                                        seed, -1)

def test_cfr_cce_ce_dist_goofspiel(self):
  """Copy of the TestCCEDistCFRGoofSpiel in corr_dist_test.cc."""
  game = pyspiel.load_game(
      "turn_based_simultaneous_game(game=goofspiel(num_cards=3,points_order="
      "descending,returns_type=total_points))")
  for num_iterations in [1, 10, 100]:
    policies = []
    cfr_solver = cfr.CFRSolver(game)
    for _ in range(num_iterations):
      cfr_solver.evaluate_and_update_policy()
      policies.append(
          policy.python_policy_to_pyspiel_policy(cfr_solver.current_policy()))
    mu = pyspiel.uniform_correlation_device(policies)
    cce_dist_info = pyspiel.cce_dist(game, mu)
    print("goofspiel, cce test num_iters: {}, cce_dist: {}, per player: {}"
          .format(num_iterations, cce_dist_info.dist_value,
                  cce_dist_info.deviation_incentives))
    # Try converting one of the BR policies:
    _ = policy.pyspiel_policy_to_python_policy(
        game, cce_dist_info.best_response_policies[0])

    # Assemble the same correlation device manually, as an example of how to
    # build non-uniform distributions and to test that the Python bindings
    # for lists of tuples work properly.
    uniform_prob = 1.0 / len(policies)
    mu2 = [(uniform_prob, p) for p in policies]
    cce_dist_info2 = pyspiel.cce_dist(game, mu2)
    self.assertAlmostEqual(cce_dist_info2.dist_value,
                           sum(cce_dist_info.deviation_incentives))

    # Test the CEDist function too. The exact determinization is disabled,
    # as it takes too long for a test.
    # ce_dist_info = pyspiel.ce_dist(game, pyspiel.determinize_corr_dev(mu))
    ce_dist_info = pyspiel.ce_dist(
        game, pyspiel.sampled_determinize_corr_dev(mu, 100))
    print("goofspiel, ce test num_iters: {}, ce_dist: {}, per player: {}"
          .format(num_iterations, ce_dist_info.dist_value,
                  ce_dist_info.deviation_incentives))
    print("number of conditional best responses per player:")
    for p in range(game.num_players()):
      print("  player {}, num: {}".format(
          p, len(ce_dist_info.conditional_best_response_policies[p])))

def test_matching_pennies_3p(self):
  game = pyspiel.load_game_as_turn_based('matching_pennies_3p')
  deep_cfr_solver = deep_cfr.DeepCFRSolver(
      game,
      policy_network_layers=(16, 8),
      advantage_network_layers=(32, 16),
      num_iterations=2,
      num_traversals=2,
      learning_rate=1e-3,
      batch_size_advantage=None,
      batch_size_strategy=None,
      memory_capacity=1e7)
  deep_cfr_solver.solve()
  conv = pyspiel.nash_conv(
      game,
      policy.python_policy_to_pyspiel_policy(
          policy.tabular_policy_from_callable(
              game, deep_cfr_solver.action_probabilities)))
  logging.info('Deep CFR in Matching Pennies 3p. NashConv: %.2f', conv)

def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = pyspiel.load_game(FLAGS.game_name)
  deep_cfr_solver = deep_cfr.DeepCFRSolver(
      game,
      policy_network_layers=(32, 32),
      advantage_network_layers=(16, 16),
      num_iterations=FLAGS.num_iterations,
      num_traversals=FLAGS.num_traversals,
      learning_rate=1e-3,
      batch_size_advantage=None,
      batch_size_strategy=None,
      memory_capacity=int(1e7))
  _, advantage_losses, policy_loss = deep_cfr_solver.solve()
  for player, losses in six.iteritems(advantage_losses):
    logging.info("Advantage for player %d: %s", player,
                 losses[:2] + ["..."] + losses[-2:])
    logging.info("Advantage Buffer Size for player %s: '%s'", player,
                 len(deep_cfr_solver.advantage_buffers[player]))
  logging.info("Strategy Buffer Size: '%s'",
               len(deep_cfr_solver.strategy_buffer))
  logging.info("Final policy loss: '%s'", policy_loss)
  average_policy = policy.tabular_policy_from_callable(
      game, deep_cfr_solver.action_probabilities)
  pyspiel_policy = policy.python_policy_to_pyspiel_policy(average_policy)
  conv = pyspiel.nash_conv(game, pyspiel_policy)
  logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)
  average_policy_values = expected_game_score.policy_value(
      game.new_initial_state(), [average_policy] * 2)
  logging.info("Computed player 0 value: %.2f (expected: %.2f).",
               average_policy_values[0], -1 / 18)
  logging.info("Computed player 1 value: %.2f (expected: %.2f).",
               average_policy_values[1], 1 / 18)

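# Standard absl entry point for the example above, assuming the script also
# does `from absl import app` and defines FLAGS with absl.flags (the
# references to FLAGS.game_name and FLAGS.num_iterations suggest it does).
if __name__ == "__main__":
  app.run(main)
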
def test_can_create_cpp_tabular_policy(self):
  for game_name in ["kuhn_poker", "leduc_poker", "liars_dice"]:
    game = pyspiel.load_game(game_name)
    # We just test that we can create a tabular policy.
    policy.python_policy_to_pyspiel_policy(policy.TabularPolicy(game))

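# A hedged round-trip sketch extending the test above: convert a Python
# TabularPolicy to a pyspiel policy and back with
# policy.pyspiel_policy_to_python_policy, the same converter used in the
# goofspiel correlation-device test.
def round_trip_tabular_policy(game):
  python_policy = policy.TabularPolicy(game)
  pyspiel_policy = policy.python_policy_to_pyspiel_policy(python_policy)
  return policy.pyspiel_policy_to_python_policy(game, pyspiel_policy)
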
def add_new_policies(per_player_new_policies, per_player_gaps,
                     per_player_repeats, per_player_policies, joint_policies,
                     joint_returns, game, br_selection):
  """Adds novel policies from new policies."""
  num_players = len(per_player_new_policies)
  per_player_num_novel_policies = [0 for _ in range(num_players)]

  # Update policies and policy counts.
  for player in range(num_players):
    new_policies = per_player_new_policies[player]
    new_gaps = per_player_gaps[player]

    repeat_policies = []
    repeat_gaps = []
    repeat_ids = []
    novel_policies = []
    novel_gaps = []
    for new_policy, new_gap in zip(new_policies, new_gaps):
      for policy_id, policy_ in enumerate(per_player_policies[player]):
        if np.all(new_policy.action_probability_array ==
                  policy_.action_probability_array):
          # New policy is not novel.
          logging.debug("Player %d's new policy is not novel.", player)
          repeat_policies.append(new_policy)
          repeat_gaps.append(new_gap)
          repeat_ids.append(policy_id)
          break
      else:
        # New policy is novel.
        logging.debug("Player %d's new policy is novel.", player)
        novel_policies.append(new_policy)
        novel_gaps.append(new_gap)

    add_novel_policies = []
    add_repeat_ids = []
    if novel_policies or repeat_policies:
      if br_selection == "all":
        add_novel_policies.extend(novel_policies)
        add_repeat_ids.extend(repeat_ids)
      elif br_selection == "all_novel":
        add_novel_policies.extend(novel_policies)
      elif br_selection == "random":
        index = np.random.randint(
            0, len(repeat_policies) + len(novel_policies))
        if index < len(novel_policies):
          add_novel_policies.append(novel_policies[index])
        else:
          add_repeat_ids.append(repeat_ids[index - len(novel_policies)])
      elif br_selection == "random_novel":
        if novel_policies:
          index = np.random.randint(0, len(novel_policies))
          add_novel_policies.append(novel_policies[index])
        else:  # Fall back on random.
          index = np.random.randint(0, len(repeat_policies))
          add_repeat_ids.append(repeat_ids[index])
      elif br_selection == "largest_gap":
        if novel_policies:
          index = np.argmax(novel_gaps)
          if novel_gaps[index] == 0.0:  # Fall back to random when zero.
            index = np.random.randint(0, len(novel_policies))
          add_novel_policies.append(novel_policies[index])
        else:  # Fall back on random.
          index = np.random.randint(0, len(repeat_policies))
          add_repeat_ids.append(repeat_ids[index])
      else:
        raise ValueError("Unrecognized br_selection method: %s" % br_selection)

    for add_repeat_id in add_repeat_ids:
      per_player_repeats[player][add_repeat_id] += 1
    for add_novel_policy in add_novel_policies:
      per_player_policies[player].append(add_novel_policy)  # Add new policy.
      per_player_repeats[player].append(1)  # Add new count.
      per_player_num_novel_policies[player] += 1

  # Add new joint policies.
  for pids in itertools.product(*[
      range(len(policies)) for policies in per_player_policies]):
    if pids in joint_policies:
      continue
    logging.debug("Evaluating novel joint policy: %s.", pids)
    policies = [
        policies[pid] for pid, policies in zip(pids, per_player_policies)]
    python_tabular_policy = policy.merge_tabular_policies(policies, game)
    pyspiel_tabular_policy = policy.python_policy_to_pyspiel_policy(
        python_tabular_policy)
    joint_policies[pids] = pyspiel_tabular_policy
    joint_returns[pids] = [
        0.0 if abs(er) < RETURN_TOL else er
        for er in pyspiel.expected_returns(
            game.new_initial_state(), pyspiel_tabular_policy, -1, True)]

  return per_player_num_novel_policies

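# A small usage sketch for the joint-policy assembly step in
# add_new_policies: merge one per-player tabular policy into a single joint
# policy and score it with pyspiel.expected_returns. kuhn_poker is just an
# assumed example game here.
def example_joint_policy_returns():
  game = pyspiel.load_game("kuhn_poker")
  per_player = [policy.TabularPolicy(game) for _ in range(game.num_players())]
  joint = policy.python_policy_to_pyspiel_policy(
      policy.merge_tabular_policies(per_player, game))
  return pyspiel.expected_returns(game.new_initial_state(), joint, -1, True)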