def __init__(self, game, policy: policy_std.Policy, root_state=None):
  """Initializes the NashConv computation.

  Args:
    game: The game to analyze.
    policy: A `policy.Policy` object.
    root_state: The state of the game at which to start. If `None`, the game
      root state is used.
  """
  self._game = game
  self._policy = policy
  if root_state is None:
    self._root_states = game.new_initial_states()
  else:
    self._root_states = [root_state]
  self._distrib = distribution.DistributionPolicy(
      self._game, self._policy, root_state=root_state)
  self._pi_value = policy_value.PolicyValue(
      self._game,
      self._distrib,
      self._policy,
      value.TabularValueFunction(self._game),
      root_state=root_state)
  self._br_value = best_response_value.BestResponse(
      self._game,
      self._distrib,
      value.TabularValueFunction(self._game),
      root_state=root_state)
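# Illustrative usage sketch, not part of the module above. Assumptions: the
# constructor belongs to the NashConv class, `pyspiel` is importable alongside
# the modules used above, and "python_mfg_crowd_modelling" is a registered
# mean field game (any registered MFG would do).
def _example_nash_conv_usage():
  game = pyspiel.load_game("python_mfg_crowd_modelling")
  uniform = policy_std.UniformRandomPolicy(game)
  metric = NashConv(game, uniform)
  # nash_conv() aggregates best-response values minus policy values over the
  # root states; it is zero at a mean field Nash equilibrium.
  return metric.nash_conv()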
def __init__(self,
             game,
             state_value: Optional[value.ValueFunction] = None,
             lr=0.01,
             root_state=None):
  """Initializes mirror descent.

  Args:
    game: The game to analyze.
    state_value: A state value function. Defaults to TabularValueFunction.
    lr: The learning rate of mirror descent.
    root_state: The state of the game at which to start. If `None`, the game
      root state is used.
  """
  self._game = game
  if root_state is None:
    self._root_states = game.new_initial_states()
  else:
    self._root_states = [root_state]
  self._policy = policy_std.UniformRandomPolicy(game)
  self._distribution = distribution.DistributionPolicy(game, self._policy)
  self._md_step = 0
  self._lr = lr
  self._state_value = (state_value if state_value
                       else value.TabularValueFunction(game))
  self._cumulative_state_value = value.TabularValueFunction(game)
def __init__(self,
             game,
             distribution: distribution_std.Distribution,
             policy: policy_std.Policy,
             state_value: Optional[value.ValueFunction] = None,
             root_state=None):
  """Initializes the value calculation.

  Args:
    game: The game to analyze.
    distribution: A `distribution.Distribution` object.
    policy: A `policy.Policy` object.
    state_value: A state value function. Defaults to TabularValueFunction.
    root_state: The state of the game at which to start. If `None`, the game
      root state is used.
  """
  super(PolicyValue, self).__init__(game)
  if root_state is None:
    self._root_states = game.new_initial_states()
  else:
    self._root_states = [root_state]
  self._distribution = distribution
  self._policy = policy
  self._state_value = (state_value if state_value is not None
                       else value.TabularValueFunction(game))
  self.evaluate()
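# Illustrative usage sketch (assumptions: the constructor above belongs to the
# PolicyValue class, `pyspiel` and the `distribution` algorithm module are
# importable here, and "python_mfg_crowd_modelling" is a registered game):
# evaluate a uniform policy under the distribution it induces.
def _example_policy_value_usage():
  game = pyspiel.load_game("python_mfg_crowd_modelling")
  uniform = policy_std.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform)
  pv = PolicyValue(game, dist, uniform, value.TabularValueFunction(game))
  # Calling the value function on the root state gives the expected return of
  # the policy against the fixed distribution, as in the tests further below.
  return pv(game.new_initial_state())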
def iteration(self, rl_br_agent=None, learning_rate=None):
  """Performs one iteration of Fictitious Play.

  Args:
    rl_br_agent: An instance of the RL approximation method to use to compute
      the best response value for each iteration. If `None`, the exact value
      is computed.
    learning_rate: The learning rate used to weight the new greedy policy. If
      `None`, the step is weighted by 1 / (step + 1).
  """
  self._fp_step += 1

  distrib = distribution.DistributionPolicy(self._game, self._policy)

  if rl_br_agent:
    joint_avg_policy = rl_agent_policy.RLAgentPolicy(
        self._game, rl_br_agent, rl_br_agent.player_id, use_observation=True)
    br_value = policy_value.PolicyValue(self._game, distrib, joint_avg_policy)
  else:
    br_value = best_response_value.BestResponse(
        self._game, distrib, value.TabularValueFunction(self._game))

  # The new policy is greedy with respect to the best response value.
  greedy_pi = greedy_policy.GreedyPolicy(self._game, None, br_value)
  greedy_pi = greedy_pi.to_tabular(states=self._states)
  distrib_greedy = distribution.DistributionPolicy(self._game, greedy_pi)

  weight = learning_rate if learning_rate else 1.0 / (self._fp_step + 1)

  self._policy = MergedPolicy(
      self._game, list(range(self._game.num_players())),
      [self._policy, greedy_pi], [distrib, distrib_greedy],
      [1.0 - weight, weight]).to_tabular(states=self._states)
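# Illustrative sketch of how iteration() is typically driven (assumptions: the
# method above belongs to the FictitiousPlay class, `pyspiel` and `nash_conv`
# are importable here, and "python_mfg_crowd_modelling" is a registered game).
def _example_fictitious_play_loop(num_iterations=5):
  game = pyspiel.load_game("python_mfg_crowd_modelling")
  fp = FictitiousPlay(game)
  for _ in range(num_iterations):
    # With learning_rate=None the new greedy policy gets weight 1 / (step + 1),
    # so the average policy is a uniform mixture of all greedy best responses
    # computed so far.
    fp.iteration()
  # Exploitability of the averaged policy, as measured by NashConv.
  return nash_conv.NashConv(game, fp.get_policy()).nash_conv()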
def test_best_response(self, name):
  """Checks if the best response value computation works.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, dist, value.TabularValueFunction(game))
  br_val = br_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, 30.029387484327486)
def test_mirror_descent(self, name):
  """Checks if mirror descent works.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  md = mirror_descent.MirrorDescent(game, value.TabularValueFunction(game))
  for _ in range(10):
    md.iteration()
  md_policy = md.get_policy()
  nash_conv_md = nash_conv.NashConv(game, md_policy)
  self.assertAlmostEqual(nash_conv_md.nash_conv(), 2.2730324915546056)
def iteration(self, learning_rate=None):
  """Performs one iteration of Mirror Descent."""
  self._md_step += 1
  # TODO(sertan): Fix me.
  self._state_value = value.TabularValueFunction(self._game)
  for state in self._root_states:
    self.eval_state(state, learning_rate if learning_rate else self._lr)
  self._policy = ProjectedPolicy(self._game,
                                 list(range(self._game.num_players())),
                                 self._cumulative_state_value)
  self._distribution = distribution.DistributionPolicy(
      self._game, self._policy)
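# Illustrative sketch of an Online Mirror Descent run (assumptions: the method
# above belongs to the MirrorDescent class, `pyspiel` and `nash_conv` are
# importable here, and "python_mfg_crowd_modelling" is a registered game).
def _example_mirror_descent_loop(num_iterations=10, lr=0.05):
  game = pyspiel.load_game("python_mfg_crowd_modelling")
  md = MirrorDescent(game, lr=lr)
  for _ in range(num_iterations):
    # Each step re-evaluates the state values, accumulates them, and projects
    # the cumulative values back onto a policy (ProjectedPolicy above).
    md.iteration()
  return nash_conv.NashConv(game, md.get_policy()).nash_conv()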
def test_greedy(self, name):
  """Checks if the greedy policy works as expected.

  The test checks that a greedy policy with respect to an optimal value is
  an optimal policy.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, dist, value.TabularValueFunction(game))
  br_val = br_value(game.new_initial_state())

  greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  pybr_value = policy_value.PolicyValue(game, dist, greedy_pi,
                                        value.TabularValueFunction(game))
  pybr_val = pybr_value(game.new_initial_state())
  self.assertAlmostEqual(br_val, pybr_val)
def test_policy_value(self, name):
  """Checks if the value of a policy computation works.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  py_value = policy_value.PolicyValue(game, dist, uniform_policy,
                                      value.TabularValueFunction(game))
  py_val = py_value(game.new_initial_state())
  self.assertAlmostEqual(py_val, 27.215850929940448)
def test_average(self):
  """Tests the average of policies.

  Here we test that the average of values is the value of the average policy.
  """
  game = crowd_modelling.MFGCrowdModellingGame()
  uniform_policy = policy.UniformRandomPolicy(game)
  mfg_dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, mfg_dist, value.TabularValueFunction(game))
  py_value = policy_value.PolicyValue(game, mfg_dist, uniform_policy,
                                      value.TabularValueFunction(game))
  greedy_pi = greedy_policy.GreedyPolicy(game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  merged_pi = fictitious_play.MergedPolicy(
      game, list(range(game.num_players())),
      [uniform_policy, greedy_pi],
      [mfg_dist, distribution.DistributionPolicy(game, greedy_pi)],
      [0.5, 0.5])
  merged_pi_value = policy_value.PolicyValue(game, mfg_dist, merged_pi,
                                             value.TabularValueFunction(game))
  self.assertAlmostEqual(
      merged_pi_value(game.new_initial_state()),
      (br_value(game.new_initial_state()) +
       py_value(game.new_initial_state())) / 2)
def main(argv: Sequence[str]) -> None:
  # TODO(perolat): move to an example directory.
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  mfg_game = pyspiel.load_game(FLAGS.game, GAME_SETTINGS.get(FLAGS.game, {}))
  mfg_state = mfg_game.new_initial_state()
  print('Playing a single arbitrary trajectory')
  while not mfg_state.is_terminal():
    print('State obs string:', mfg_state.observation_string(0))
    if mfg_state.current_player() == pyspiel.PlayerId.CHANCE:
      action_list, prob_list = zip(*mfg_state.chance_outcomes())
      action = np.random.choice(action_list, p=prob_list)
      mfg_state.apply_action(action)
    elif mfg_state.current_player() == pyspiel.PlayerId.MEAN_FIELD:
      dist_to_register = mfg_state.distribution_support()
      n_states = len(dist_to_register)
      dist = [1.0 / n_states for _ in range(n_states)]
      mfg_state.update_distribution(dist)
    else:
      legal_list = mfg_state.legal_actions()
      action = np.random.choice(legal_list)
      mfg_state.apply_action(action)

  print('compute nashconv')
  uniform_policy = policy.UniformRandomPolicy(mfg_game)
  nash_conv_fp = nash_conv.NashConv(mfg_game, uniform_policy)
  print('Nashconv:', nash_conv_fp.nash_conv())

  print('compute distribution')
  mfg_dist = distribution.DistributionPolicy(mfg_game, uniform_policy)
  br_value = best_response_value.BestResponse(
      mfg_game, mfg_dist, value.TabularValueFunction(mfg_game))
  py_value = policy_value.PolicyValue(mfg_game, mfg_dist, uniform_policy,
                                      value.TabularValueFunction(mfg_game))
  print(
      'Value of a best response policy to a uniform policy '
      '(computed with best_response_value)',
      br_value(mfg_game.new_initial_state()))
  print('Value of the uniform policy:',
        py_value(mfg_game.new_initial_state()))
  greedy_pi = greedy_policy.GreedyPolicy(mfg_game, None, br_value)
  greedy_pi = greedy_pi.to_tabular()
  pybr_value = policy_value.PolicyValue(mfg_game, mfg_dist, greedy_pi,
                                        value.TabularValueFunction(mfg_game))
  print(
      'Value of a best response policy to a uniform policy (computed at the '
      'value of the greedy policy of the best response value)',
      pybr_value(mfg_game.new_initial_state()))

  print('merge')
  merged_pi = fictitious_play.MergedPolicy(
      mfg_game, list(range(mfg_game.num_players())),
      [uniform_policy, greedy_pi],
      [mfg_dist, distribution.DistributionPolicy(mfg_game, greedy_pi)],
      [0.5, 0.5])
  merged_pi_value = policy_value.PolicyValue(
      mfg_game, mfg_dist, merged_pi, value.TabularValueFunction(mfg_game))
  print(br_value(mfg_game.new_initial_state()))
  print(py_value(mfg_game.new_initial_state()))
  print(merged_pi_value(mfg_game.new_initial_state()))
  print((br_value(mfg_game.new_initial_state()) +
         py_value(mfg_game.new_initial_state())) / 2)

  print('fp')
  fp = fictitious_play.FictitiousPlay(mfg_game)
  for j in range(100):
    print('Iteration', j, 'of fictitious play')
    fp.iteration()
    fp_policy = fp.get_policy()
    nash_conv_fp = nash_conv.NashConv(mfg_game, fp_policy)
    print('Nashconv of the current FP policy', nash_conv_fp.nash_conv())

  print('md')
  md = mirror_descent.MirrorDescent(mfg_game,
                                    value.TabularValueFunction(mfg_game))
  for j in range(10):
    print('Iteration', j, 'of mirror descent')
    md.iteration()
    md_policy = md.get_policy()
    nash_conv_md = nash_conv.NashConv(mfg_game, md_policy)
    print('Nashconv of the current MD policy', nash_conv_md.nash_conv())
def test_softmax(self, name):
  """Checks if the softmax policy works as expected.

  The test checks that:
  - a uniform prior policy gives the same results as no prior.
  - a very high temperature gives almost a uniform policy.
  - a very low temperature gives almost a deterministic policy for the best
    action.

  Args:
    name: Name of the game.
  """
  game = pyspiel.load_game(name)
  uniform_policy = policy.UniformRandomPolicy(game)
  dist = distribution.DistributionPolicy(game, uniform_policy)
  br_value = best_response_value.BestResponse(
      game, dist, value.TabularValueFunction(game))
  br_init_val = br_value(game.new_initial_state())

  # A uniform prior policy gives the same results as no prior.
  softmax_pi_uniform_prior = softmax_policy.SoftmaxPolicy(
      game, None, 1.0, br_value, uniform_policy).to_tabular()
  softmax_pi_uniform_prior_value = policy_value.PolicyValue(
      game, dist, softmax_pi_uniform_prior, value.TabularValueFunction(game))
  softmax_pi_uniform_prior_init_val = softmax_pi_uniform_prior_value(
      game.new_initial_state())
  softmax_pi_no_prior = softmax_policy.SoftmaxPolicy(game, None, 1.0,
                                                     br_value, None)
  softmax_pi_no_prior_value = policy_value.PolicyValue(
      game, dist, softmax_pi_no_prior, value.TabularValueFunction(game))
  softmax_pi_no_prior_init_val = softmax_pi_no_prior_value(
      game.new_initial_state())

  self.assertAlmostEqual(softmax_pi_uniform_prior_init_val,
                         softmax_pi_no_prior_init_val)

  # A very high temperature gives almost a uniform policy.
  uniform_policy = uniform_policy.to_tabular()
  uniform_value = policy_value.PolicyValue(game, dist, uniform_policy,
                                           value.TabularValueFunction(game))
  uniform_init_val = uniform_value(game.new_initial_state())
  softmax_pi_no_prior = softmax_policy.SoftmaxPolicy(game, None, 100000000,
                                                     br_value, None)
  softmax_pi_no_prior_value = policy_value.PolicyValue(
      game, dist, softmax_pi_no_prior, value.TabularValueFunction(game))
  softmax_pi_no_prior_init_val = softmax_pi_no_prior_value(
      game.new_initial_state())

  self.assertAlmostEqual(uniform_init_val, softmax_pi_no_prior_init_val)

  # A very low temperature gives almost a best response policy.
  softmax_pi_no_prior = softmax_policy.SoftmaxPolicy(game, None, 0.0001,
                                                     br_value, None)
  softmax_pi_no_prior_value = policy_value.PolicyValue(
      game, dist, softmax_pi_no_prior, value.TabularValueFunction(game))
  softmax_pi_no_prior_init_val = softmax_pi_no_prior_value(
      game.new_initial_state())

  self.assertAlmostEqual(br_init_val, softmax_pi_no_prior_init_val)
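# Toy numerical sketch of the temperature behaviour exercised above (not
# library code; numpy is the only assumed dependency): a softmax over action
# values divided by the temperature tends to the uniform distribution as the
# temperature grows and concentrates on the best action as the temperature
# goes to zero, which is what the assertions check at the level of policy
# values.
def _example_softmax_temperature():
  import numpy as np
  q = np.array([1.0, 2.0, 3.0])  # toy action values

  def softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

  near_uniform = softmax(q / 1e8)   # high temperature -> ~[1/3, 1/3, 1/3]
  near_greedy = softmax(q / 1e-4)   # low temperature -> ~[0, 0, 1]
  return near_uniform, near_greedy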