Example #1
  def test_cfr(self):
    root = rcfr.RootStateWrapper(_GAME.new_initial_state())
    num_half_iterations = 6

    cumulative_regrets = [np.zeros(n) for n in root.num_player_sequences]
    cumulative_reach_weights = [np.zeros(n) for n in root.num_player_sequences]

    average_profile = root.sequence_weights_to_tabular_profile(
        cumulative_reach_weights)
    self.assertGreater(pyspiel.nash_conv(_GAME, average_profile), 0.91)

    regret_player = 0
    for _ in range(num_half_iterations):
      reach_weights_player = 1 if regret_player == 0 else 0

      regrets, reach = root.counterfactual_regrets_and_reach_weights(
          regret_player, reach_weights_player, *rcfr.relu(cumulative_regrets))

      cumulative_regrets[regret_player] += regrets
      cumulative_reach_weights[reach_weights_player] += reach

      regret_player = reach_weights_player

    average_profile = root.sequence_weights_to_tabular_profile(
        cumulative_reach_weights)
    self.assertLess(pyspiel.nash_conv(_GAME, average_profile), 0.27)
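
The weights passed to `counterfactual_regrets_and_reach_weights` above come from `rcfr.relu(cumulative_regrets)`, which is assumed here to implement regret matching by clipping each player's cumulative regrets at zero. A minimal numpy sketch of that assumption, reusing the variables from the example:

import numpy as np

# Assumed equivalent of rcfr.relu: keep only the positive part of each player's
# cumulative regret vector so it can serve as a sequence-weight profile.
def positive_part(cumulative_regrets):
  return [np.maximum(regrets, 0.0) for regrets in cumulative_regrets]

regrets, reach = root.counterfactual_regrets_and_reach_weights(
    regret_player, reach_weights_player, *positive_part(cumulative_regrets))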
Example #2
  def test_root_state_wrapper_sequence_terminal_values(self):
    root_state_wrapper = rcfr.RootStateWrapper(_GAME.new_initial_state())

    expected_terminal_values = {}
    no_call_histories_p1_win = [
        '2 0 0 0', '2 0 1 0', '0 1 1 0', '1 2 1 0', '1 0 1 0', '1 0 0 0',
        '2 1 1 0', '2 1 0 0', '0 2 1 0'
    ]
    for h in no_call_histories_p1_win:
      expected_terminal_values[h] = [1., -1.]

    no_call_histories_p2_win = [
        '0 2 0 1 0', '0 1 0 0', '0 1 0 1 0', '0 2 0 0', '1 2 0 0', '2 0 0 1 0',
        '1 2 0 1 0', '2 1 0 1 0', '1 0 0 1 0'
    ]
    for h in no_call_histories_p2_win:
      expected_terminal_values[h] = [-1., 1.]

    call_histories_p1_win = [
        '1 0 1 1', '2 1 1 1', '2 1 0 1 1', '2 0 0 1 1', '1 0 0 1 1', '2 0 1 1'
    ]
    for h in call_histories_p1_win:
      expected_terminal_values[h] = [2., -2.]

    call_histories_p2_win = [
        '0 2 0 1 1', '0 1 0 1 1', '0 1 1 1', '1 2 1 1', '1 2 0 1 1', '0 2 1 1'
    ]
    for h in call_histories_p2_win:
      expected_terminal_values[h] = [-2., 2.]

    self.assertAllEqual(
        expected_terminal_values,
        {k: v.tolist() for k, v in root_state_wrapper.terminal_values.items()})
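
The expected keys above are space-separated action histories of `_GAME` (Kuhn poker, judging by the info-state strings elsewhere in these tests) and the values are the terminal returns. A hedged sketch of how such a map could be rebuilt directly from the pyspiel state API; `terminal_values_by_history` is a hypothetical helper, not part of rcfr.py:

def terminal_values_by_history(state):
  # Walk the full game tree (including chance nodes) and record the returns of
  # every terminal history, keyed by its space-separated action history.
  if state.is_terminal():
    return {' '.join(str(a) for a in state.history()): state.returns()}
  values = {}
  for action in state.legal_actions():
    values.update(terminal_values_by_history(state.child(action)))
  return values

# Expected to agree with root_state_wrapper.terminal_values up to list/array conversion:
# terminal_values_by_history(_GAME.new_initial_state())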
Example #3
    def test_rcfr_functions(self):
        models = [_new_model() for _ in range(_GAME.num_players())]
        root = rcfr.RootStateWrapper(_GAME.new_initial_state())

        num_half_iterations = 4
        num_epochs = 100

        cumulative_regrets = [np.zeros(n) for n in root.num_player_sequences]
        cumulative_reach_weights = [
            np.zeros(n) for n in root.num_player_sequences
        ]

        average_profile = root.sequence_weights_to_tabular_profile(
            cumulative_reach_weights)
        self.assertGreater(pyspiel.nash_conv(_GAME, average_profile), 0.91)

        regret_player = 0
        sequence_weights = [
            model(root.sequence_features[player]).numpy()
            for player, model in enumerate(models)
        ]

        for _ in range(num_half_iterations):
            reach_weights_player = 1 if regret_player == 0 else 0

            sequence_weights[reach_weights_player] = models[
                reach_weights_player](
                    root.sequence_features[reach_weights_player]).numpy()

            regrets, seq_probs = root.counterfactual_regrets_and_reach_weights(
                regret_player, reach_weights_player, *sequence_weights)

            cumulative_regrets[regret_player] += regrets
            cumulative_reach_weights[reach_weights_player] += seq_probs

            data = tf.data.Dataset.from_tensor_slices(
                (root.sequence_features[regret_player],
                 tf.expand_dims(cumulative_regrets[regret_player], axis=1)))
            data = data.shuffle(12)
            data = data.batch(12)
            data = data.repeat(num_epochs)

            optimizer = tf.keras.optimizers.Adam(learning_rate=0.005, amsgrad=True)

            for x, y in data:
                optimizer.minimize(
                    lambda: tf.compat.v1.losses.huber_loss(
                        y, models[regret_player](x)),  # pylint: disable=cell-var-from-loop
                    models[regret_player].trainable_variables)

            regret_player = reach_weights_player

        average_profile = root.sequence_weights_to_tabular_profile(
            cumulative_reach_weights)

        self.assertLess(pyspiel.nash_conv(_GAME, average_profile), 0.91)
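
`_new_model()` is defined elsewhere in the test file; the training loop above only requires that each model maps a batch of sequence-feature rows to one regret prediction per row. A minimal Keras stand-in under that assumption (layer sizes are arbitrary):

import tensorflow as tf

# Hypothetical stand-in for the test's _new_model() helper: any trainable callable
# producing one output per sequence-feature row fits the training loop above.
def _new_model():
  return tf.keras.Sequential([
      tf.keras.layers.Dense(13, activation='relu'),
      tf.keras.layers.Dense(1),
  ])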
Example #4
  def test_root_state_wrapper_sequence_features(self):
    root_state_wrapper = rcfr.RootStateWrapper(_GAME.new_initial_state())

    p1_info_state_features = [
        [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0.],
        [1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0.],
    ]
    p2_info_state_features = [
        [0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0.],
        [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0.],
    ]
    action_features = [[1., 0.], [0., 1.]]
    expected_p1_sequence_features = [
        p1_info_state_features[0] + action_features[0],
        p1_info_state_features[0] + action_features[1],
        p1_info_state_features[1] + action_features[0],
        p1_info_state_features[1] + action_features[1],
        p1_info_state_features[2] + action_features[0],
        p1_info_state_features[2] + action_features[1],
        p1_info_state_features[3] + action_features[0],
        p1_info_state_features[3] + action_features[1],
        p1_info_state_features[4] + action_features[0],
        p1_info_state_features[4] + action_features[1],
        p1_info_state_features[5] + action_features[0],
        p1_info_state_features[5] + action_features[1],
    ]
    expected_p2_sequence_features = [
        p2_info_state_features[0] + action_features[0],
        p2_info_state_features[0] + action_features[1],
        p2_info_state_features[1] + action_features[0],
        p2_info_state_features[1] + action_features[1],
        p2_info_state_features[2] + action_features[0],
        p2_info_state_features[2] + action_features[1],
        p2_info_state_features[3] + action_features[0],
        p2_info_state_features[3] + action_features[1],
        p2_info_state_features[4] + action_features[0],
        p2_info_state_features[4] + action_features[1],
        p2_info_state_features[5] + action_features[0],
        p2_info_state_features[5] + action_features[1],
    ]
    expected_sequence_features = [
        expected_p1_sequence_features, expected_p2_sequence_features
    ]

    self.assertAllEqual(expected_sequence_features,
                        root_state_wrapper.sequence_features)
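
Each expected row above is just an information-state feature vector with a one-hot action encoding appended. An illustrative check with numpy, reusing the lists from the test:

import numpy as np

# First sequence of player 1: info state 0 followed by the one-hot encoding of action 0.
row = np.concatenate([p1_info_state_features[0], action_features[0]])
assert row.tolist() == expected_p1_sequence_features[0]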
Example #5
File: neurd.py  Project: ngrupen/open_spiel
  def __init__(self, game, models, session=None):
    """Creates a new `CounterfactualNeurdSolver`.

    Args:
      game: An OpenSpiel `Game`.
      models: Current policy models (optimizable array-like -> `tf.Tensor`
        callables) for both players.
      session: A TensorFlow `Session` to convert sequence weights from
        `tf.Tensor`s produced by `models` to `np.array`s. If `None`, it is
        assumed that eager mode is enabled. Defaults to `None`.
    """
    self._game = game
    self._models = models
    self._root_wrapper = rcfr.RootStateWrapper(game.new_initial_state())
    self._session = session

    self._cumulative_seq_probs = [
        np.zeros(n) for n in self._root_wrapper.num_player_sequences
    ]
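
For construction, the docstring above only requires that each model is a callable from array-like sequence features to a `tf.Tensor`, so in eager mode small Keras networks are enough. A hedged sketch, assuming the module is imported as `neurd` and using a hypothetical `_simple_models` helper with arbitrary layer sizes:

import tensorflow as tf
import pyspiel

def _simple_models(game, num_hidden=13):
  # One small policy model per player; Dense layers infer their input size on first call.
  return [
      tf.keras.Sequential([
          tf.keras.layers.Dense(num_hidden, activation='relu'),
          tf.keras.layers.Dense(1),
      ]) for _ in range(game.num_players())
  ]

game = pyspiel.load_game('kuhn_poker')
solver = neurd.CounterfactualNeurdSolver(game, _simple_models(game))  # session=None -> eager mode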
Example #6
  def test_root_state_wrapper_sequence_indices(self):
    root_state_wrapper = rcfr.RootStateWrapper(_GAME.new_initial_state())
    self.assertAllEqual(
        {
            # Info state string -> initial sequence index map for player 1.
            '0': 0,
            '0pb': 2,
            '1': 4,
            '1pb': 6,
            '2': 8,
            '2pb': 10,
            # Info state string -> initial sequence index map for player 2.
            '1p': 0,
            '1b': 2,
            '2p': 4,
            '2b': 6,
            '0p': 8,
            '0b': 10,
        },
        root_state_wrapper.info_state_to_sequence_idx)
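
Each information state owns a contiguous block of sequence indices starting at the offset in this map, one index per legal action. A small illustrative helper (not part of rcfr.py), matching how `policy_fn` uses the map in the next example:

def sequence_index(root_state_wrapper, info_state, action_idx):
  # Offset of the info state's first sequence plus the action's position among
  # the legal actions at that info state.
  return root_state_wrapper.info_state_to_sequence_idx[info_state] + action_idx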
Example #7
    def test_sequence_weights_to_tabular_profile(self):
        root = rcfr.RootStateWrapper(_GAME.new_initial_state())

        def policy_fn(state):
            """Generates a policy profile by treating sequence indices as weights."""
            info_state = state.information_state()
            sequence_offset = root.info_state_to_sequence_idx[info_state]
            num_actions = len(state.legal_actions())
            return rcfr.normalized_by_sum(
                list(range(sequence_offset, sequence_offset + num_actions)))

        profile = rcfr.sequence_weights_to_tabular_profile(
            root.root, policy_fn)

        expected_profile = {
            # Player 1
            '0': [(0, 0.), (1, 1.)],  # Sequences 0 and 1 (sums to 1)
            '0pb': [(0, 0.4), (1, 0.6)],  # Sequences 2 and 3 (sums to 5)
            # Sequences 4 and 5 (sums to 9)
            '1': [(0, 0.44444444444444442), (1, 0.55555555555555558)],
            # Sequences 6 and 7 (sums to 13)
            '1pb': [(0, 0.46153846153846156), (1, 0.53846153846153844)],
            # Sequences 8 and 9 (sums to 17)
            '2': [(0, 0.47058823529411764), (1, 0.52941176470588236)],
            # Sequences 10 and 11 (sums to 21)
            '2pb': [(0, 0.47619047619047616), (1, 0.52380952380952384)],

            # Player 2
            '1p': [(0, 0.), (1, 1.)],  # Sequences 0 and 1 (sums to 1)
            '1b': [(0, 0.4), (1, 0.6)],  # Sequences 2 and 3 (sums to 5)
            # Sequences 4 and 5 (sums to 9)
            '2p': [(0, 0.44444444444444442), (1, 0.55555555555555558)],
            # Sequences 6 and 7 (sums to 13)
            '2b': [(0, 0.46153846153846156), (1, 0.53846153846153844)],
            # Sequences 8 and 9 (sums to 17)
            '0p': [(0, 0.47058823529411764), (1, 0.52941176470588236)],
            # Sequences 10 and 11 (sums to 21)
            '0b': [(0, 0.47619047619047616), (1, 0.52380952380952384)],
        }
        self.assertAllClose(profile, expected_profile)
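
The expected values are the sequence indices normalized by their sum per information state; for '0pb', sequences 2 and 3 sum to 5, giving [0.4, 0.6]. A quick check with plain numpy, independent of `rcfr.normalized_by_sum`:

import numpy as np

weights = np.array([2., 3.])               # sequence indices covering '0pb'
print((weights / weights.sum()).tolist())  # [0.4, 0.6]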
Example #8
    def test_counterfactual_regrets_and_reach_weights_value_error(self):
        root = rcfr.RootStateWrapper(_GAME.new_initial_state())

        # Initialize arbitrary weights to generate an arbitrary profile.
        sequence_weights1_with_a_missing_sequence = [
            0.4967141530112327,
            0.0,
            0.6476885381006925,
            1.5230298564080254,
            0.0,
            0.0,
            1.5792128155073915,
            0.7674347291529088,
            0.0,
            0.5425600435859647,
            0.0,
            # 0.0,
        ]
        # Ensure this player's policy is fully mixed so that each of player 1's
        # information states are reached.
        sequence_weights2 = [
            0.24196227156603412,
            0.1,
            0.1,
            0.1,
            0.1,
            0.3142473325952739,
            0.1,
            0.1,
            1.465648768921554,
            0.1,
            0.06752820468792384,
            0.1,
        ]

        with self.assertRaises(ValueError):
            root.counterfactual_regrets_and_reach_weights(
                0, 1, sequence_weights1_with_a_missing_sequence,
                sequence_weights2)
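
The `ValueError` is expected because the first weight vector has only 11 entries (one sequence is commented out) while each player has 12 sequences. An illustrative precondition a caller could check, reusing the variables from the example:

# Each player's weight vector must cover all of that player's sequences.
for player, weights in enumerate(
    [sequence_weights1_with_a_missing_sequence, sequence_weights2]):
  assert len(weights) == root.num_player_sequences[player], (
      'player %d: %d weights for %d sequences' %
      (player, len(weights), root.num_player_sequences[player]))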
Example #9
  def test_counterfactual_regrets_and_reach_weights(self):
    root = rcfr.RootStateWrapper(_GAME.new_initial_state())

    # Initialize arbitrary weights to generate an arbitrary profile.
    sequence_weights1 = [
        0.4967141530112327,
        0.0,
        0.6476885381006925,
        1.5230298564080254,
        0.0,
        0.0,
        1.5792128155073915,
        0.7674347291529088,
        0.0,
        0.5425600435859647,
        0.0,
        0.0,
    ]
    sequence_weights2 = [
        0.24196227156603412,
        0.0,
        0.0,
        0.0,
        0.0,
        0.3142473325952739,
        0.0,
        0.0,
        1.465648768921554,
        0.0,
        0.06752820468792384,
        0.0,
    ]

    # These expected regrets and sequence weights were computed for the given
    # sequence weights.
    expected_regrets_given_sequence_weights = [
        0.,
        0.283604,
        0.116937,
        -0.049729,
        -0.06892,
        0.06892,
        0.054506,
        -0.112161,
        -0.083333,
        0.,
        0.,
        0.,
    ]
    expected_reach_weights_given_sequence_weights = [
        2.,
        0.,
        1.,
        1.,
        0.,
        2.,
        1.,
        1.,
        2.,
        0.,
        2.,
        0.,
    ]

    regrets, weights = root.counterfactual_regrets_and_reach_weights(
        0, 1, sequence_weights1, sequence_weights2)

    self.assertAllClose(regrets, expected_regrets_given_sequence_weights)
    self.assertAllClose(weights, expected_reach_weights_given_sequence_weights)
Example #10
  def test_root_state_wrapper_num_sequences(self):
    root_state_wrapper = rcfr.RootStateWrapper(_GAME.new_initial_state())
    assert root_state_wrapper.num_player_sequences[0] == 12
    assert root_state_wrapper.num_player_sequences[1] == 12
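
In Kuhn poker each player has six information states (three cards, each in two betting contexts, as listed in the sequence-index example above) with two legal actions apiece, so each player has 6 * 2 = 12 sequences. An illustrative check, reusing `root_state_wrapper`:

# 3 cards x 2 betting contexts = 6 information states, 2 actions each -> 12 sequences.
assert 3 * 2 * 2 == root_state_wrapper.num_player_sequences[0]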