def __init__(self, game, policy: policy_std.Policy, root_state=None):
  """Initializes the NashConv computation.

  Args:
    game: The game to analyze.
    policy: A `policy.Policy` object.
    root_state: The state of the game at which to start. If `None`, the game
      root state is used.
  """
  self._game = game
  self._policy = policy
  if root_state is None:
    self._root_states = game.new_initial_states()
  else:
    self._root_states = [root_state]
  self._distrib = distribution.DistributionPolicy(
      self._game, self._policy, root_state=root_state)
  self._pi_value = policy_value.PolicyValue(
      self._game,
      self._distrib,
      self._policy,
      value.TabularValueFunction(self._game),
      root_state=root_state)
  self._br_value = best_response_value.BestResponse(
      self._game,
      self._distrib,
      value.TabularValueFunction(self._game),
      root_state=root_state)
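# Illustrative usage sketch, not part of the module above. Assumptions: the
# constructor belongs to the NashConv class, `pyspiel` is importable alongside
# the modules used above, and "python_mfg_crowd_modelling" is a registered
# mean field game (any registered MFG would do).
def _example_nash_conv_usage():
  game = pyspiel.load_game("python_mfg_crowd_modelling")
  uniform = policy_std.UniformRandomPolicy(game)
  metric = NashConv(game, uniform)
  # nash_conv() aggregates best-response values minus policy values over the
  # root states; it is zero at a mean field Nash equilibrium.
  return metric.nash_conv()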
def __init__(self,
             game,
             state_value: Optional[value.ValueFunction] = None,
             lr=0.01,
             root_state=None):
  """Initializes mirror descent.

  Args:
    game: The game to analyze.
    state_value: A state value function. Defaults to TabularValueFunction.
    lr: The learning rate of mirror descent.
    root_state: The state of the game at which to start. If `None`, the game
      root state is used.
  """
  self._game = game
  if root_state is None:
    self._root_states = game.new_initial_states()
  else:
    self._root_states = [root_state]
  self._policy = policy_std.UniformRandomPolicy(game)
  self._distribution = distribution.DistributionPolicy(game, self._policy)
  self._md_step = 0
  self._lr = lr
  self._state_value = (state_value if state_value
                       else value.TabularValueFunction(game))
  self._cumulative_state_value = value.TabularValueFunction(game)
def __init__(self,
             game,
             distribution: distribution_std.Distribution,
             policy: policy_std.Policy,
             state_value: Optional[value.ValueFunction] = None,
             root_state=None):
  """Initializes the value calculation.

  Args:
    game: The game to analyze.
    distribution: A `distribution.Distribution` object.
    policy: A `policy.Policy` object.
    state_value: A state value function. Defaults to TabularValueFunction.
    root_state: The state of the game at which to start. If `None`, the game
      root state is used.
  """
  super(PolicyValue, self).__init__(game)
  if root_state is None:
    self._root_states = game.new_initial_states()
  else:
    self._root_states = [root_state]
  self._distribution = distribution
  self._policy = policy
  self._state_value = (state_value if state_value is not None
                       else value.TabularValueFunction(game))
  self.evaluate()
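# Illustrative usage sketch (assumptions: the constructor above belongs to the
# PolicyValue class, `pyspiel` and the `distribution` algorithm module are
# importable here, and "python_mfg_crowd_modelling" is a registered game):
# evaluate a uniform policy under the distribution it induces.
def _example_policy_value_usage():
  game = pyspiel.load_game("python_mfg_crowd_modelling")
  uniform = policy_std.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform)
  pv = PolicyValue(game, dist, uniform, value.TabularValueFunction(game))
  # Calling the value function on the root state gives the expected return of
  # the policy against the fixed distribution, as in the tests further below.
  return pv(game.new_initial_state())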
def iteration(self, rl_br_agent=None, learning_rate=None):
  """Performs one iteration of Fictitious Play.

  Args:
    rl_br_agent: An instance of the RL approximation method to use to compute
      the best response value for each iteration. If `None`, the exact value
      is computed.
    learning_rate: The learning rate used to weight the new greedy policy. If
      `None`, the step is weighted by 1 / (step + 1).
  """
  self._fp_step += 1

  distrib = distribution.DistributionPolicy(self._game, self._policy)

  if rl_br_agent:
    joint_avg_policy = rl_agent_policy.RLAgentPolicy(
        self._game, rl_br_agent, rl_br_agent.player_id, use_observation=True)
    br_value = policy_value.PolicyValue(self._game, distrib, joint_avg_policy)
  else:
    br_value = best_response_value.BestResponse(
        self._game, distrib, value.TabularValueFunction(self._game))

  # The new policy is greedy with respect to the best response value.
  greedy_pi = greedy_policy.GreedyPolicy(self._game, None, br_value)
  greedy_pi = greedy_pi.to_tabular(states=self._states)
  distrib_greedy = distribution.DistributionPolicy(self._game, greedy_pi)

  weight = learning_rate if learning_rate else 1.0 / (self._fp_step + 1)

  self._policy = MergedPolicy(
      self._game, list(range(self._game.num_players())),
      [self._policy, greedy_pi], [distrib, distrib_greedy],
      [1.0 - weight, weight]).to_tabular(states=self._states)
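# Illustrative sketch of how iteration() is typically driven (assumptions: the
# method above belongs to the FictitiousPlay class, `pyspiel` and `nash_conv`
# are importable here, and "python_mfg_crowd_modelling" is a registered game).
def _example_fictitious_play_loop(num_iterations=5):
  game = pyspiel.load_game("python_mfg_crowd_modelling")
  fp = FictitiousPlay(game)
  for _ in range(num_iterations):
    # With learning_rate=None the new greedy policy gets weight 1 / (step + 1),
    # so the average policy is a uniform mixture of all greedy best responses
    # computed so far.
    fp.iteration()
  # Exploitability of the averaged policy, as measured by NashConv.
  return nash_conv.NashConv(game, fp.get_policy()).nash_conv()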
def test_best_response(self, name):
  """Checks if the best response value computation works.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, dist, value.TabularValueFunction(game))
  br_val = br_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, 30.029387484327486)
def test_mirror_descent(self, name):
  """Checks if mirror descent works.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  md = mirror_descent.MirrorDescent(game, value.TabularValueFunction(game))
  for _ in range(10):
    md.iteration()
  md_policy = md.get_policy()
  nash_conv_md = nash_conv.NashConv(game, md_policy)
  self.assertAlmostEqual(nash_conv_md.nash_conv(), 2.2730324915546056)
def iteration(self, learning_rate=None):
  """Performs one iteration of Mirror Descent."""
  self._md_step += 1
  # TODO(sertan): Fix me.
  self._state_value = value.TabularValueFunction(self._game)
  for state in self._root_states:
    self.eval_state(state, learning_rate if learning_rate else self._lr)
  self._policy = ProjectedPolicy(self._game,
                                 list(range(self._game.num_players())),
                                 self._cumulative_state_value)
  self._distribution = distribution.DistributionPolicy(
      self._game, self._policy)
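# Illustrative sketch of an Online Mirror Descent run (assumptions: the method
# above belongs to the MirrorDescent class, `pyspiel` and `nash_conv` are
# importable here, and "python_mfg_crowd_modelling" is a registered game).
def _example_mirror_descent_loop(num_iterations=10, lr=0.05):
  game = pyspiel.load_game("python_mfg_crowd_modelling")
  md = MirrorDescent(game, lr=lr)
  for _ in range(num_iterations):
    # Each step re-evaluates the state values, accumulates them, and projects
    # the cumulative values back onto a policy (ProjectedPolicy above).
    md.iteration()
  return nash_conv.NashConv(game, md.get_policy()).nash_conv()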
def test_greedy(self, name):
  """Checks if the greedy policy works as expected.

  The test checks that a greedy policy with respect to an optimal value is
  an optimal policy.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, dist, value.TabularValueFunction(game))
  br_val = br_value(game.new_initial_state())

  greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  pybr_value = policy_value.PolicyValue(game, dist, greedy_pi,
                                        value.TabularValueFunction(game))
  pybr_val = pybr_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, pybr_val)
def test_policy_value(self, name):
  """Checks if the value of a policy computation works.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  py_value = policy_value.PolicyValue(game, dist, uniform_policy,
                                      value.TabularValueFunction(game))
  py_val = py_value(game.new_initial_state())
  self.assertAlmostEqual(py_val, 27.215850929940448)
def test_average(self):
  """Tests the average of policies.

  Here we test that the average of values is the value of the average policy.
  """
  game = crowd_modelling.MFGCrowdModellingGame()
  uniform_policy = policy.UniformRandomPolicy(game)
  mfg_dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, mfg_dist, value.TabularValueFunction(game))
  py_value = policy_value.PolicyValue(game, mfg_dist, uniform_policy,
                                      value.TabularValueFunction(game))
  greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  merged_pi = fictitious_play.MergedPolicy(
      game, list(range(game.num_players())),
      [uniform_policy, greedy_pi],
      [mfg_dist, distribution.DistributionPolicy(game, greedy_pi)],
      [0.5, 0.5])
  merged_pi_value = policy_value.PolicyValue(game, mfg_dist, merged_pi,
                                             value.TabularValueFunction(game))
  self.assertAlmostEqual(
      merged_pi_value(game.new_initial_state()),
      (br_value(game.new_initial_state()) +
       py_value(game.new_initial_state())) / 2)
def main(argv: Sequence[str]) -> None:
  # TODO(perolat): move to an example directory.
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  mfg_game = pyspiel.load_game(FLAGS.game, GAME_SETTINGS.get(FLAGS.game, {}))
  mfg_state = mfg_game.new_initial_state()
  print('Playing a single arbitrary trajectory')
  while not mfg_state.is_terminal():
    print('State obs string:', mfg_state.observation_string(0))
    if mfg_state.current_player() == pyspiel.PlayerId.CHANCE:
      action_list, prob_list = zip(*mfg_state.chance_outcomes())
      action = np.random.choice(action_list, p=prob_list)
      mfg_state.apply_action(action)
    elif mfg_state.current_player() == pyspiel.PlayerId.MEAN_FIELD:
      dist_to_register = mfg_state.distribution_support()
      n_states = len(dist_to_register)
      dist = [1.0 / n_states for _ in range(n_states)]
      mfg_state.update_distribution(dist)
    else:
      legal_list = mfg_state.legal_actions()
      action = np.random.choice(legal_list)
      mfg_state.apply_action(action)

  print('compute nashconv')
  uniform_policy = policy.UniformRandomPolicy(mfg_game)
  nash_conv_fp = nash_conv.NashConv(mfg_game, uniform_policy)
  print('Nashconv:', nash_conv_fp.nash_conv())

  print('compute distribution')
  mfg_dist = distribution.DistributionPolicy(mfg_game, uniform_policy)
  br_value = best_response_value.BestResponse(
      mfg_game, mfg_dist, value.TabularValueFunction(mfg_game))
  py_value = policy_value.PolicyValue(mfg_game, mfg_dist, uniform_policy,
                                      value.TabularValueFunction(mfg_game))
  print(
      'Value of a best response policy to a uniform policy '
      '(computed with best_response_value)',
      br_value(mfg_game.new_initial_state()))
  print('Value of the uniform policy:',
        py_value(mfg_game.new_initial_state()))
  greedy_pi = greedy_policy.GreedyPolicy(mfg_game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  pybr_value = policy_value.PolicyValue(mfg_game, mfg_dist, greedy_pi,
                                        value.TabularValueFunction(mfg_game))
  print(
      'Value of a best response policy to a uniform policy (computed at the '
      'value of the greedy policy of the best response value)',
      pybr_value(mfg_game.new_initial_state()))

  print('merge')
  merged_pi = fictitious_play.MergedPolicy(
      mfg_game, list(range(mfg_game.num_players())),
      [uniform_policy, greedy_pi],
      [mfg_dist, distribution.DistributionPolicy(mfg_game, greedy_pi)],
      [0.5, 0.5])
  merged_pi_value = policy_value.PolicyValue(
      mfg_game, mfg_dist, merged_pi, value.TabularValueFunction(mfg_game))
  print(br_value(mfg_game.new_initial_state()))
  print(py_value(mfg_game.new_initial_state()))
  print(merged_pi_value(mfg_game.new_initial_state()))
  print((br_value(mfg_game.new_initial_state()) +
         py_value(mfg_game.new_initial_state())) / 2)

  print('fp')
  fp = fictitious_play.FictitiousPlay(mfg_game)
  for j in range(100):
    print('Iteration', j, 'of fictitious play')
    fp.iteration()
    fp_policy = fp.get_policy()
    nash_conv_fp = nash_conv.NashConv(mfg_game, fp_policy)
    print('Nashconv of the current FP policy', nash_conv_fp.nash_conv())

  print('md')
  md = mirror_descent.MirrorDescent(mfg_game,
                                    value.TabularValueFunction(mfg_game))
  for j in range(10):
    print('Iteration', j, 'of mirror descent')
    md.iteration()
    md_policy = md.get_policy()
    nash_conv_md = nash_conv.NashConv(mfg_game, md_policy)
    print('Nashconv of the current MD policy', nash_conv_md.nash_conv())
def test_softmax(self, name):
  """Checks if the softmax policy works as expected.

  The test checks that:
  - a uniform prior policy gives the same results as no prior.
  - a very high temperature gives almost a uniform policy.
  - a very low temperature gives almost a deterministic policy for the best
    action.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, dist, value.TabularValueFunction(game))
  br_init_val = br_value(game.new_initial_state())

  # A uniform prior policy gives the same results as no prior.
  softmax_pi_uniform_prior = softmax_policy.SoftmaxPolicy(
      game, None, 1.0, br_value, uniform_policy).to_tabular()
  softmax_pi_uniform_prior_value = policy_value.PolicyValue(
      game, dist, softmax_pi_uniform_prior, value.TabularValueFunction(game))
  softmax_pi_uniform_prior_init_val = softmax_pi_uniform_prior_value(
      game.new_initial_state())
  softmax_pi_no_prior = softmax_policy.SoftmaxPolicy(game, None, 1.0,
                                                     br_value, None)
  softmax_pi_no_prior_value = policy_value.PolicyValue(
      game, dist, softmax_pi_no_prior, value.TabularValueFunction(game))
  softmax_pi_no_prior_init_val = softmax_pi_no_prior_value(
      game.new_initial_state())

  self.assertAlmostEqual(softmax_pi_uniform_prior_init_val,
                         softmax_pi_no_prior_init_val)

  # A very high temperature gives almost a uniform policy.
  uniform_policy = uniform_policy.to_tabular()
  uniform_value = policy_value.PolicyValue(game, dist, uniform_policy,
                                           value.TabularValueFunction(game))
  uniform_init_val = uniform_value(game.new_initial_state())
  softmax_pi_no_prior = softmax_policy.SoftmaxPolicy(game, None, 100000000,
                                                     br_value, None)
  softmax_pi_no_prior_value = policy_value.PolicyValue(
      game, dist, softmax_pi_no_prior, value.TabularValueFunction(game))
  softmax_pi_no_prior_init_val = softmax_pi_no_prior_value(
      game.new_initial_state())

  self.assertAlmostEqual(uniform_init_val, softmax_pi_no_prior_init_val)

  # A very low temperature gives almost a best response policy.
  softmax_pi_no_prior = softmax_policy.SoftmaxPolicy(game, None, 0.0001,
                                                     br_value, None)
  softmax_pi_no_prior_value = policy_value.PolicyValue(
      game, dist, softmax_pi_no_prior, value.TabularValueFunction(game))
  softmax_pi_no_prior_init_val = softmax_pi_no_prior_value(
      game.new_initial_state())

  self.assertAlmostEqual(br_init_val, softmax_pi_no_prior_init_val)
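# Toy numerical sketch of the temperature behaviour exercised above (not
# library code; numpy is the only assumed dependency): a softmax over action
# values divided by the temperature tends to the uniform distribution as the
# temperature grows and concentrates on the best action as the temperature
# goes to zero, which is what the assertions check at the level of policy
# values.
def _example_softmax_temperature():
  import numpy as np
  q = np.array([1.0, 2.0, 3.0])  # toy action values

  def softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

  near_uniform = softmax(q / 1e8)   # high temperature -> ~[1/3, 1/3, 1/3]
  near_greedy = softmax(q / 1e-4)   # low temperature -> ~[0, 0, 1]
  return near_uniform, near_greedy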