def xfsp_train(_):
    exploit_history = list()
    exploit_idx = list()
    game = pyspiel.load_game(FLAGS.game, {"players": pyspiel.GameParameter(2)})
    fsp_solver = fictitious_play.XFPSolver(game)
    checkpoint = datetime.now()
    for ep in range(FLAGS.episodes):
        if (ep % 1000) == 0:
            delta = datetime.now() - checkpoint
            pol = policy.PolicyFromCallable(
                game, fsp_solver.average_policy_callable())
            conv = exploitability.exploitability(game, pol)
            exploit_history.append(conv)
            exploit_idx.append(ep)
            print(
                "[XFSP] Iteration {} exploitability {} - {} seconds since last checkpoint"
                .format(ep, conv, delta.seconds))
            checkpoint = datetime.now()

        fsp_solver.iteration()

    agent_name = "xfsp"
    with open(
            FLAGS.game + "_" + agent_name + "_" + str(FLAGS.episodes) + ".dat",
            "wb") as f:
        pickle.dump([exploit_idx, exploit_history], f)

    pol = policy.PolicyFromCallable(game, fsp_solver.average_policy_callable())
    timestamp = datetime.now().strftime("%m-%d-%Y_%H-%M")
    for pid in [1, 2]:
        policy_to_csv(
            game, pol, "policies/policy_" + timestamp + "_" + agent_name +
            "_" + str(pid) + "_" + str(FLAGS.episodes) + "episodes.csv")
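All of these examples wrap a plain Python callable with policy.PolicyFromCallable. As a minimal sketch of the contract that callable must satisfy, assuming a game has already been loaded as above: it maps a state to a list of (action, probability) pairs over that state's legal actions (this mirrors the _uniform_policy helper used in the later examples).

def uniform_random_policy(state):
    # Spread the probability mass evenly over the legal actions of `state`.
    legal_actions = state.legal_actions()
    prob = 1.0 / len(legal_actions)
    return [(action, prob) for action in legal_actions]

pol = policy.PolicyFromCallable(game, uniform_random_policy)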
Example #2
    def test_runs_with_uniform_policies(self, game_name):
        game = pyspiel.load_game(game_name)
        calc = action_value.TreeWalkCalculator(game)

        calc.compute_all_states_action_values([
            policy.PolicyFromCallable(game, _uniform_policy),
            policy.PolicyFromCallable(game, _uniform_policy)
        ])
Example #3
 def test_kuhn_poker_always_pass_p0(self):
   game = pyspiel.load_game("kuhn_poker")
   calc = action_value_vs_best_response.Calculator(game)
   (expl, avvbr, cfrp, player_reach_probs) = calc(
       0, policy.PolicyFromCallable(game, lambda state: [(0, 1.0), (1, 0.0)]),
       ["0", "1", "2", "0pb", "1pb", "2pb"])
   self.assertAlmostEqual(expl, 1.)
   np.testing.assert_allclose(
       avvbr,
       [
           # Opening bet. If we pass, we always lose (pass-pass with op's K,
           # otherwise pass-bet-pass).
           # If we bet, we always win (because op's best response is to pass,
           # because this is an unreachable state and we break ties in favour
           # of the lowest action).
           [-1, 1],
           [-1, 1],
           [-1, 1],
           # We pass, opp bets into us. This can be either J or Q (K will pass
           # because of the tie-break rules).
           # So we are guaranteed to be winning with Q or K.
           [-1, -2],  # 0pb
           [-1, 2],  # 1pb
           [-1, 2],  # 2pb
       ])
   np.testing.assert_allclose(cfrp, [1 / 3, 1 / 3, 1 / 3, 1 / 6, 1 / 6, 1 / 3])
   np.testing.assert_allclose([1., 1., 1., 1., 1., 1.], player_reach_probs)
Example #4
    def solve(self):
        """Solution logic for Deep CFR."""
        advantage_losses = collections.defaultdict(list)
        start = datetime.now()
        expl_idx = []
        expl_hist = []
        for it in range(self._num_iterations):
            if (it % self._eval_freq == 0) and it != 0:
                conv = self.get_exploitability()
                elapsed = datetime.now() - start
                print(
                    "Iteration {}/{}, running for {} seconds - Exploitability = {}"
                    .format(it, self._num_iterations, elapsed.seconds, conv))
                expl_idx.append(it)
                expl_hist.append(conv)
            for p in range(self._num_players):
                for _ in range(self._num_traversals):
                    self._traverse_game_tree(self._root_node, p)
                self.reinitialize_advantage_networks()
                # Re-initialize advantage networks and train from scratch.
                advantage_losses[p].append(self._learn_advantage_network(p))
            self._iteration += 1
        # Train policy network.
        policy_loss = self._learn_strategy_network()

        conv = exploitability.exploitability(
            self._game,
            policy.PolicyFromCallable(self._game, self.action_probabilities))
        print("Final exploitability: {}".format(conv))
        return self._policy_network, advantage_losses, policy_loss, expl_idx, expl_hist
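Unlike the library solver used in the next example, whose solve() returns three values, this modified solve() also returns the exploitability trace. A usage sketch, assuming the surrounding class has been instantiated as a hypothetical `solver`:

(policy_network, advantage_losses, policy_loss,
 expl_idx, expl_hist) = solver.solve()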
Example #5
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = pyspiel.load_game(FLAGS.game_name)
    with tf.Session() as sess:
        deep_cfr_solver = deep_cfr.DeepCFRSolver(
            sess,
            game,
            policy_network_layers=(32, 32),
            advantage_network_layers=(16, 16),
            num_iterations=FLAGS.num_iterations,
            num_traversals=FLAGS.num_traversals,
            learning_rate=1e-3,
            batch_size_advantage=None,
            batch_size_strategy=None,
            memory_capacity=1e7)
        sess.run(tf.global_variables_initializer())
        _, advantage_losses, policy_loss = deep_cfr_solver.solve()
        for player, losses in six.iteritems(advantage_losses):
            logging.info("Advantage for player %d: %s", player,
                         losses[:2] + ["..."] + losses[-2:])
            logging.info("Advantage Buffer Size for player %s: '%s'", player,
                         len(deep_cfr_solver.advantage_buffers[player]))
        logging.info("Strategy Buffer Size: '%s'",
                     len(deep_cfr_solver.strategy_buffer))
        logging.info("Final policy loss: '%s'", policy_loss)
        conv = exploitability.nash_conv(
            game,
            policy.PolicyFromCallable(game,
                                      deep_cfr_solver.action_probabilities))
        logging.info("Deep CFR in '%s' - NashConv: %s", FLAGS.game_name, conv)
Example #6
    def get_exploitability(self):
        # Define placeholders.
        # Note: every call builds a fresh policy network, optimizer and set of
        # placeholders, so the TF graph grows with each evaluation.
        iter_ph = tf.placeholder(shape=[None, 1],
                                 dtype=tf.float32,
                                 name="iter_ph")
        action_probs_ph = tf.placeholder(shape=[None, self._num_actions],
                                         dtype=tf.float32,
                                         name="action_probs_ph")
        info_state_ph = tf.placeholder(shape=[None, self._embedding_size],
                                       dtype=tf.float32,
                                       name="info_state_ph")

        policy_network = snt.nets.MLP(
            list(self._policy_network_layers) + [self._num_actions])
        action_logits = policy_network(info_state_ph)
        # Illegal actions are handled in the traversal code where expected payoff
        # and sampled regret is computed from the advantage networks.
        action_probs = tf.nn.softmax(action_logits)
        loss_policy = tf.reduce_mean(
            tf.losses.mean_squared_error(
                labels=tf.math.sqrt(iter_ph) * action_probs_ph,
                predictions=tf.math.sqrt(iter_ph) * action_probs))
        optimizer_policy = tf.train.AdamOptimizer(
            learning_rate=self._learning_rate)
        learn_step_policy = optimizer_policy.minimize(loss_policy)

        # Note: this re-initializes *all* TF variables, not just the ops built
        # above; solve() re-initializes and retrains the advantage networks
        # from their buffers each iteration anyway.
        self._session.run(tf.global_variables_initializer())

        def _local_action_probabilities(state):
            """Returns action probabilities dict for a single batch."""
            cur_player = state.current_player()
            legal_actions = state.legal_actions(cur_player)
            info_state_vector = np.array(state.information_state_tensor())
            if len(info_state_vector.shape) == 1:
                info_state_vector = np.expand_dims(info_state_vector, axis=0)
            probs = self._session.run(
                action_probs, feed_dict={info_state_ph: info_state_vector})
            return {action: probs[0][action] for action in legal_actions}

        info_states_l = []
        action_probs_l = []
        iterations_l = []
        for s in self._strategy_memories.sample(self._batch_size_strategy):
            info_states_l.append(s.info_state)
            action_probs_l.append(s.strategy_action_probs)
            iterations_l.append([s.iteration])
        self._session.run(
            [loss_policy, learn_step_policy],
            feed_dict={
                info_state_ph: np.array(info_states_l),
                action_probs_ph: np.array(np.squeeze(action_probs_l)),
                iter_ph: np.array(iterations_l),
            })

        conv = exploitability.exploitability(
            self._game,
            policy.PolicyFromCallable(self._game, _local_action_probabilities))
        return conv
Example #7
  def _compute_best_responses(self):
    """Computes each player best-response against the pool of other players."""
    # pylint: disable=g-long-lambda
    current_policy = policy.PolicyFromCallable(
        self._game,
        lambda state: self._get_infostate_policy(state.information_state()))
    # pylint: enable=g-long-lambda

    for player_id in range(self._game.num_players()):
      self._best_responses[player_id] = exploitability.best_response(
          self._game, current_policy, player_id)
Example #8
 def test_shapleys_game(self):
     game = pyspiel.load_game_as_turn_based("matrix_shapleys_game")
     xfp_solver = fictitious_play.XFPSolver(game)
     for i in range(1000):
         xfp_solver.iteration()
         if i % 10 == 0:
             conv = exploitability.nash_conv(
                 game,
                 policy.PolicyFromCallable(
                     game, xfp_solver.average_policy_callable()))
             print("FP in Shapley's Game. Iter: {}, NashConv: {}".format(
                 i, conv))
Example #9
 def test_outcome_sampling_kuhn_2p(self):
     np.random.seed(SEED)
     game = pyspiel.load_game("kuhn_poker")
     os_solver = outcome_sampling_mccfr.OutcomeSamplingSolver(game)
     for _ in range(1000):
         os_solver.iteration()
     conv = exploitability.nash_conv(
         game,
         policy.PolicyFromCallable(game, os_solver.callable_avg_policy()))
     print("Kuhn2P, conv = {}".format(conv))
     self.assertGreater(conv, 0.2)
     self.assertLess(conv, 0.3)
Example #10
 def compute_best_reponses(self):
   """Updates self._oracles to hold best responses for each player."""
   for i in range(self._num_players):
     # Compute a best response policy to pi_{-i}.
     # First, construct pi_{-i}.
     joint_policy = _joint_policy(self._policies)
     br_info = exploitability.best_response(
         self._game, policy.PolicyFromCallable(self._game, joint_policy), i)
     full_br_policy = _full_best_response_policy(
         br_info["best_response_action"])
     self._best_responses[i] = full_br_policy
     if self._oracles is not None:
       self._oracles[i].append(full_br_policy)
Example #11
def main(_):
    game = pyspiel.load_game(FLAGS.game,
                             {"players": pyspiel.GameParameter(FLAGS.players)})
    xfp_solver = fictitious_play.XFPSolver(game)
    for i in range(FLAGS.iterations):
        xfp_solver.iteration()
        conv = exploitability.exploitability(
            game,
            policy.PolicyFromCallable(game,
                                      xfp_solver.average_policy_callable()))
        if i % FLAGS.print_freq == 0:
            print("Iteration: {} Conv: {}".format(i, conv))
            sys.stdout.flush()
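Note that this loop computes exploitability on every iteration but only prints it every FLAGS.print_freq iterations. If the exploitability computation dominates the runtime, a variant that only evaluates when printing (a sketch using the same names as above) looks like:

    for i in range(FLAGS.iterations):
        xfp_solver.iteration()
        if i % FLAGS.print_freq == 0:
            conv = exploitability.exploitability(
                game,
                policy.PolicyFromCallable(game,
                                          xfp_solver.average_policy_callable()))
            print("Iteration: {} Conv: {}".format(i, conv))
            sys.stdout.flush()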
Example #12
 def test_matching_pennies_3p(self):
     game = pyspiel.load_game_as_turn_based("matching_pennies_3p")
     xfp_solver = fictitious_play.XFPSolver(game)
     for i in range(1000):
         xfp_solver.iteration()
         if i % 10 == 0:
             conv = exploitability.nash_conv(
                 game,
                 policy.PolicyFromCallable(
                     game, xfp_solver.average_policy_callable()))
             print(
                 "FP in Matching Pennies 3p. Iter: {}, NashConv: {}".format(
                     i, conv))
Example #13
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    env = rl_environment.Environment(FLAGS.game_name)
    num_players = env.num_players
    num_actions = env.action_spec()["num_actions"]
    state_size = env.observation_spec()["info_state"][0]
    eva_agents = []
    with tf.Session() as sess:
        for player in range(num_players):
            eva_agents.append(
                eva.EVAAgent(sess,
                             env,
                             player,
                             state_size,
                             num_actions,
                             embedding_network_layers=(64, 32),
                             embedding_size=12,
                             learning_rate=1e-4,
                             mixing_parameter=0.5,
                             memory_capacity=1e6,
                             discount_factor=1.0,
                             epsilon_start=1.0,
                             epsilon_end=0.1,
                             epsilon_decay_duration=int(1e6)))
        sess.run(tf.global_variables_initializer())
        for _ in range(FLAGS.num_episodes):
            time_step = env.reset()
            while not time_step.last():
                current_player = time_step.observations["current_player"]
                current_agent = eva_agents[current_player]
                step_out = current_agent.step(time_step)
                time_step = env.step([step_out.action])

            for agent in eva_agents:
                agent.step(time_step)

        game = pyspiel.load_game(FLAGS.game_name)
        joint_policy = JointPolicy(eva_agents)
        conv = exploitability.nash_conv(
            game,
            policy.PolicyFromCallable(game, joint_policy.action_probabilities))
        logging.info("EVA in '%s' - NashConv: %s", FLAGS.game_name, conv)
Example #14
def test_callable_policy_to_csv(tmpdir):
    def _uniform_policy(state):
        actions = state.legal_actions()
        p = 1.0 / len(actions)
        return [(a, p) for a in actions]

    # Setup game and policy
    game = pyspiel.load_game("kuhn_poker")
    callable_policy = policy.PolicyFromCallable(game, _uniform_policy)
    # Save policy as CSV
    output = os.path.join(tmpdir, 'policy.csv')
    policy_to_csv(game, callable_policy, output)
    assert list(tmpdir.listdir()) == [output]
    # Check created CSV
    csv = pd.read_csv(output, index_col=0)
    # Get all states in the game at which players have to make decisions.
    states = get_all_states.get_all_states(game,
                                           depth_limit=-1,
                                           include_terminals=False,
                                           include_chance_states=False)
    assert set(csv.index.values) <= set(states.keys())
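As a follow-up usage sketch (not part of the test): the CSV written by policy_to_csv can be read back with pandas into a per-information-state mapping; the exact column layout is whatever policy_to_csv produced above.

df = pd.read_csv(output, index_col=0)
info_state_to_probs = {
    info_state: row.dropna().to_dict() for info_state, row in df.iterrows()
}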
Example #15
 def test_matching_pennies_3p(self):
     # We don't expect Deep CFR to necessarily converge on 3-player games but
     # it's nonetheless interesting to see this result.
     game = pyspiel.load_game_as_turn_based('matching_pennies_3p')
     with tf.Session() as sess:
         deep_cfr_solver = deep_cfr.DeepCFRSolver(
             sess,
             game,
             policy_network_layers=(16, 8),
             advantage_network_layers=(32, 16),
             num_iterations=2,
             num_traversals=2,
             learning_rate=1e-3,
             batch_size_advantage=None,
             batch_size_strategy=None,
             memory_capacity=1e7)
         sess.run(tf.global_variables_initializer())
         deep_cfr_solver.solve()
         conv = exploitability.nash_conv(
             game,
             policy.PolicyFromCallable(
                 game, deep_cfr_solver.action_probabilities))
         print('Deep CFR in Matching Pennies 3p. NashConv: {}'.format(conv))
Example #16
    def test_kuhn_poker_always_pass_p0(self):
        game = pyspiel.load_game("kuhn_poker")
        calc = action_value.TreeWalkCalculator(game)

        for always_pass_policy in [
                lambda state: [(0, 1.0), (1, 0.0)],
                # On purpose, we use a policy that does not list all the
                # legal actions.
                lambda state: [(0, 1.0)],
        ]:
            tabular_policy = policy.tabular_policy_from_policy(
                game, policy.PolicyFromCallable(game, always_pass_policy))

            # States are ordered using tabular_policy.states_per_player:
            # ['0', '0pb', '1', '1pb', '2', '2pb'] +
            # ['1p', '1b', '2p', '2b', '0p', '0b']
            np.testing.assert_array_equal(
                np.asarray([
                    [1., 0.],
                    [1., 0.],
                    [1., 0.],
                    [1., 0.],
                    [1., 0.],
                    [1., 0.],
                    [1., 0.],
                    [1., 0.],
                    [1., 0.],
                    [1., 0.],
                    [1., 0.],
                    [1., 0.],
                ]), tabular_policy.action_probability_array)

            returned_values = calc([
                policy.PolicyFromCallable(game, always_pass_policy),
                policy.PolicyFromCallable(game, _uniform_policy)
            ], tabular_policy)

            # Action 0 == Pass. Action 1 == Bet
            # Some values are 0 because the states are not reached, thus the expected
            # value of that node is undefined.
            np.testing.assert_array_almost_equal(
                np.asarray([
                    [-1.0, -0.5],
                    [-1.0, -2.0],
                    [-0.5, 0.5],
                    [-1.0, 0.0],
                    [0.0, 1.5],
                    [-1.0, 2.0],
                    [0.0, 1.0],
                    [0, 0],
                    [1.0, 1.0],
                    [0, 0],
                    [-1.0, 1.0],
                    [0, 0],
                ]), returned_values.action_values)

            np.testing.assert_array_almost_equal(
                np.asarray([
                    # Player 0 states
                    1 / 3,  # '0'
                    1 / 6,  # '0pb'
                    1 / 3,  # '1'
                    1 / 6,  # '1pb'
                    1 / 3,  # '2'
                    1 / 6,  # '2pb'
                    # Player 1 states
                    1 / 3,  # '1p'
                    0.0,  # '1b': zero because player 0 always plays pass
                    1 / 3,  # '2p'
                    0.0,  # '2b': zero because player 0 always plays pass
                    1 / 3,  # '0p'
                    0.0,  # '0b': zero because player 0 always plays pass
                ]),
                returned_values.counterfactual_reach_probs)

            # The reach probabilities are always one, even though we have player 0
            # who only plays pass, because the unreachable nodes for player 0 are
            # terminal nodes: e.g.  'x x b b p' has a player 0 reach of 0, but it is
            # a terminal node, thus it does not appear in the tabular policy
            # states.
            np.testing.assert_array_equal(
                [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                returned_values.player_reach_probs)

            np.testing.assert_array_almost_equal(
                np.asarray([
                    np.array([-1 / 3, -1 / 6]),
                    np.array([-1 / 6, -1 / 3]),
                    np.array([-1 / 6, 1 / 6]),
                    np.array([-1 / 6, 0.]),
                    np.array([0., 0.5]),
                    np.array([-1 / 6, 1 / 3]),
                    np.array([0., 1 / 3]),
                    np.array([0., 0.]),
                    np.array([1 / 3, 1 / 3]),
                    np.array([0., 0.]),
                    np.array([-1 / 3, 1 / 3]),
                    np.array([0., 0.])
                ]), returned_values.sum_cfr_reach_by_action_value)
Example #17
    def __call__(self, player, player_policy, info_states):
        """Computes action values per state for the player.

    Args:
      player: The id of the player (0 <= player < game.num_players()).
      player_policy: A `policy.Policy` object.
      info_states: A list of info state strings.

    Returns:
      A `_CalculatorReturn` namedtuple. See its docstring for the documentation.
    """
        self.player = player
        opponent = 1 - player

        def best_response_policy(state):
            infostate = state.information_state_string(opponent)
            action = best_response_actions[infostate]
            return [(action, 1.0)]

        # If the policy is a TabularPolicy, we can directly copy the infostate
        # strings & values from the class. This is significantly faster than having
        # to create the infostate strings.
        if isinstance(player_policy, policy.TabularPolicy):
            tabular_policy = {
                key: _tuples_from_policy(player_policy.policy_for_key(key))
                for key in player_policy.state_lookup
            }
        # Otherwise, we have to calculate all the infostate strings every time.
        # This is ~2x slower.
        else:
            # We cache these as they are expensive to compute & do not change.
            if self._all_states is None:
                self._all_states = get_all_states.get_all_states(
                    self.game,
                    depth_limit=-1,
                    include_terminals=False,
                    include_chance_states=False)
                self._state_to_information_state = {
                    state: self._all_states[state].information_state_string()
                    for state in self._all_states
                }
            tabular_policy = policy_utils.policy_to_dict(
                player_policy, self.game, self._all_states,
                self._state_to_information_state)

        # When constructed, TabularBestResponse does a lot of work; we can save that
        # work by caching it.
        if self._best_responder[player] is None:
            self._best_responder[player] = pyspiel.TabularBestResponse(
                self.game, opponent, tabular_policy)
        else:
            self._best_responder[player].set_policy(tabular_policy)

        # Computing the value at the root calculates best responses everywhere.
        history = str(self.game.new_initial_state())
        best_response_value = self._best_responder[player].value(history)
        best_response_actions = self._best_responder[
            player].get_best_response_actions()

        # Compute action values
        self.action_values = collections.defaultdict(
            lambda: collections.defaultdict(lambda: np.zeros(2)))
        self.info_state_prob = collections.defaultdict(float)
        self.info_state_player_prob = collections.defaultdict(float)
        self.info_state_cf_prob = collections.defaultdict(float)
        self.info_state_chance_prob = collections.defaultdict(float)
        self.get_action_values(
            self.game.new_initial_state(), {
                player:
                player_policy,
                opponent:
                policy.PolicyFromCallable(self.game, best_response_policy),
            })

        # Collect normalized action values for each information state
        rv = []
        cfrp = []
        player_reach_probs_vs_br = []
        for info_state in info_states:
            key = (player, info_state)
            av = self.action_values[key]
            norm_prob = self.info_state_prob[key]
            rv.append([(av[a][player] / norm_prob) if
                       (a in av and norm_prob > 0) else 0
                       for a in range(self.num_actions)])
            cfrp.append(self.info_state_cf_prob[key])
            player_reach_probs_vs_br.append(self.info_state_player_prob[key])

        # Return values
        return _CalculatorReturn(
            exploitability=best_response_value,
            values_vs_br=rv,
            counterfactual_reach_probs_vs_br=cfrp,
            player_reach_probs_vs_br=player_reach_probs_vs_br)
Example #18
  def __call__(self, player, player_policy, info_states):
    """Computes action values per state for the player.

    Args:
      player: The id of the player (0 <= player < game.num_players()). This
        player will play `player_policy`, while the opponent will play a best
        response.
      player_policy: A `policy.Policy` object.
      info_states: A list of info state strings.

    Returns:
      A `_CalculatorReturn` namedtuple. See its docstring for the documentation.
    """
    self.player = player
    opponent = 1 - player

    def best_response_policy(state):
      infostate = state.information_state_string(opponent)
      action = best_response_actions[infostate]
      return [(action, 1.0)]

    # If the policy is a TabularPolicy, we can directly copy the infostate
    # strings & values from the class. This is significantly faster than having
    # to create the infostate strings.
    if isinstance(player_policy, policy.TabularPolicy):
      tabular_policy = {
          key: _tuples_from_policy(player_policy.policy_for_key(key))
          for key in player_policy.state_lookup
      }
    # Otherwise, we have to calculate all the infostate strings every time.
    # This is ~2x slower.
    else:
      # We cache these as they are expensive to compute & do not change.
      if self._all_states is None:
        self._all_states = get_all_states.get_all_states(
            self.game,
            depth_limit=-1,
            include_terminals=False,
            include_chance_states=False)
        self._state_to_information_state = {
            state: self._all_states[state].information_state_string()
            for state in self._all_states
        }
      tabular_policy = policy_utils.policy_to_dict(
          player_policy, self.game, self._all_states,
          self._state_to_information_state)

    # When constructed, TabularBestResponse does a lot of work; we can save that
    # work by caching it.
    if self._best_responder[player] is None:
      self._best_responder[player] = pyspiel.TabularBestResponse(
          self.game, opponent, tabular_policy)
    else:
      self._best_responder[player].set_policy(tabular_policy)

    # Computing the value at the root calculates best responses everywhere.
    history = str(self.game.new_initial_state())
    best_response_value = self._best_responder[player].value(history)
    best_response_actions = self._best_responder[
        player].get_best_response_actions()

    # Compute action values
    self._action_value_calculator.compute_all_states_action_values({
        player: player_policy,
        opponent: policy.PolicyFromCallable(self.game, best_response_policy),
    })
    obj = self._action_value_calculator._get_tabular_statistics(  # pylint: disable=protected-access
        ((player, s) for s in info_states))

    # Return values
    return _CalculatorReturn(
        exploitability=best_response_value,
        values_vs_br=obj.action_values,
        counterfactual_reach_probs_vs_br=obj.counterfactual_reach_probs,
        player_reach_probs_vs_br=obj.player_reach_probs)