    def test_loss_modes(self):
        loss_dict = {
            "qpg": rl_losses.BatchQPGLoss,
            "rpg": rl_losses.BatchRPGLoss,
            "rm": rl_losses.BatchRMLoss,
            "a2c": rl_losses.BatchA2CLoss,
        }
        with self.session() as sess:
            for loss_str, loss_class in loss_dict.items():
                agent_by_str = policy_gradient.PolicyGradient(
                    sess,
                    player_id=0,
                    info_state_size=32,
                    num_actions=2,
                    loss_str=loss_str,
                    loss_class=None)
                agent_by_class = policy_gradient.PolicyGradient(
                    sess,
                    player_id=0,
                    info_state_size=32,
                    num_actions=2,
                    loss_str=None,
                    loss_class=loss_class)

                self.assertEqual(agent_by_str._pi_loss.shape,
                                 agent_by_class._pi_loss.shape)
                self.assertEqual(agent_by_str._pi_loss.dtype,
                                 agent_by_class._pi_loss.dtype)
                self.assertEqual(agent_by_str._pi_loss.op.type,
                                 agent_by_class._pi_loss.op.type)

    # This test takes (loss_str, game_name) parameters; in the source suite it
    # is presumably driven by a parameterized decorator not shown in this excerpt.
    def test_run_game(self, loss_str, game_name):
        env = rl_environment.Environment(game_name)
        info_state_size = env.observation_spec()["info_state"][0]
        num_actions = env.action_spec()["num_actions"]

        with self.session() as sess:
            agents = [
                policy_gradient.PolicyGradient(  # pylint: disable=g-complex-comprehension
                    sess,
                    player_id=player_id,
                    info_state_size=info_state_size,
                    num_actions=num_actions,
                    loss_str=loss_str,
                    hidden_layers_sizes=[8, 8],
                    batch_size=16,
                    entropy_cost=0.001,
                    critic_learning_rate=0.01,
                    pi_learning_rate=0.01,
                    num_critic_before_pi=4) for player_id in [0, 1]
            ]
            sess.run(tf.global_variables_initializer())

            for _ in range(2):
                time_step = env.reset()
                while not time_step.last():
                    current_player = time_step.observations["current_player"]
                    current_agent = agents[current_player]
                    agent_output = current_agent.step(time_step)
                    time_step = env.step([agent_output.action])

                for agent in agents:
                    agent.step(time_step)

def main(_):

    game = "mst"
    num_players = 1
    train_games, train_rewards, test_games, test_rewards = mst.game_params(
        FLAGS.num_nodes)

    env_configs = train_games[0]
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3
    num_actions = env.action_spec()["num_actions"]

    with tf.Session() as sess:
        # pylint: disable=g-complex-comprehension
        agents = [
            policy_gradient.PolicyGradient(sess,
                                           idx,
                                           info_state_size,
                                           num_actions,
                                           loss_str=FLAGS.loss_str,
                                           hidden_layers_sizes=(128, ))
            for idx in range(num_players)
        ]
        expl_policies_avg = PolicyGradientPolicies(env, agents)

        sess.run(tf.global_variables_initializer())
        for ep in range(FLAGS.num_episodes):
            env_configs = train_games[ep % len(train_games)]
            env = rl_environment.Environment(game, **env_configs)
            if (ep + 1) % FLAGS.eval_every == 0:
                losses = [agent.loss for agent in agents]
                # expl = exploitability.exploitability(env.game, expl_policies_avg)
                msg = "-" * 80 + "\n"
                msg += "{}: {}\n".format(ep + 1, losses)  # exploitability reporting disabled above
                logging.info("%s", msg)
            if (ep + 1) % FLAGS.test_every == 0:
                test_accuracy = test_trained_bot(test_games, test_rewards,
                                                 agents[0], ep,
                                                 FLAGS.num_nodes, game,
                                                 FLAGS.game_version)
                logging.info("[%s] Test Accuracy: %s", ep + 1, test_accuracy)

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

def main(_):
    game = "kuhn_poker"
    num_players = 2

    env_configs = {"players": num_players}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    with tf.Session() as sess:
        # pylint: disable=g-complex-comprehension
        agents = [
            policy_gradient.PolicyGradient(sess,
                                           idx,
                                           info_state_size,
                                           num_actions,
                                           loss_str=FLAGS.loss_str,
                                           hidden_layers_sizes=(128, ))
            for idx in range(num_players)
        ]
        expl_policies_avg = PolicyGradientPolicies(env, agents)

        sess.run(tf.global_variables_initializer())
        for ep in range(FLAGS.num_episodes):

            if (ep + 1) % FLAGS.eval_every == 0:
                losses = [agent.loss for agent in agents]
                expl = exploitability.exploitability(env.game,
                                                     expl_policies_avg)
                msg = "-" * 80 + "\n"
                msg += "{}: {}\n{}\n".format(ep + 1, expl, losses)
                logging.info("%s", msg)

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

        for pid, agent in enumerate(agents):
            policy_to_csv(env.game, expl_policies_avg,
                          f"{FLAGS.modeldir}/test_p{pid+1}.csv")

Example #5

def pgrad_train(unused_arg):
    sess = tf.Session()
    env = rl_environment.Environment(FLAGS.game)
    state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    agents = [
        policy_gradient.PolicyGradient(sess,
                                       idx,
                                       state_size,
                                       num_actions,
                                       loss_str="rpg",
                                       hidden_layers_sizes=(128, ))
        for idx in range(2)
    ]
    expl_policies_avg = PolicyGradientPolicies(env, agents)
    run_agents(sess, env, agents, expl_policies_avg)
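
# NOTE: run_agents is not defined in this excerpt. A minimal training loop,
# consistent with the episode loops used elsewhere in these examples, could
# look like the sketch below (the episode count and the use made of
# expl_policies_avg are assumptions).
def run_agents(sess, env, agents, expl_policies_avg, num_episodes=10000):
    """Hypothetical trainer: self-play episodes with per-turn agent steps."""
    sess.run(tf.global_variables_initializer())
    for _ in range(num_episodes):
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = agents[player_id].step(time_step)
            time_step = env.step([agent_output.action])
        # Episode is over, step all agents with the final info state.
        for agent in agents:
            agent.step(time_step)
        # expl_policies_avg could be passed to exploitability.exploitability
        # here for periodic evaluation, as in the Kuhn poker example above.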

    def test_run_hanabi(self):
        # Hanabi is an optional game, so check we have it before running the test.
        game = "hanabi"
        if game not in pyspiel.registered_names():
            return

        num_players = 3
        env_configs = {
            "players": num_players,
            "max_life_tokens": 1,
            "colors": 2,
            "ranks": 3,
            "hand_size": 2,
            "max_information_tokens": 3,
            "discount": 0.
        }
        env = rl_environment.Environment(game, **env_configs)
        info_state_size = env.observation_spec()["info_state"][0]
        num_actions = env.action_spec()["num_actions"]

        with self.session() as sess:
            agents = [
                policy_gradient.PolicyGradient(  # pylint: disable=g-complex-comprehension
                    sess,
                    player_id=player_id,
                    info_state_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=[8, 8],
                    batch_size=16,
                    entropy_cost=0.001,
                    critic_learning_rate=0.01,
                    pi_learning_rate=0.01,
                    num_critic_before_pi=4) for player_id in range(num_players)
            ]
            sess.run(tf.global_variables_initializer())
            time_step = env.reset()
            while not time_step.last():
                current_player = time_step.observations["current_player"]
                agent_output = [agent.step(time_step) for agent in agents]
                time_step = env.step([agent_output[current_player].action])

            for agent in agents:
                agent.step(time_step)

    def build_graph(self,
                    scope_name,
                    current_player,
                    info_state_size,
                    num_actions,
                    hidden_layers_sizes=[64, 64],
                    replay_buffer_capacity=1e5,
                    batch_size=32,
                    entropy_cost=0.001,
                    critic_learning_rate=0.01,
                    pi_learning_rate=0.01,
                    num_critic_before_pi=32):
        with tf.variable_scope(scope_name):
            if self._oracle == "dqn":
                training_agent = dqn.DQN(
                    session=self._session,
                    player_id=current_player,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=replay_buffer_capacity,
                    batch_size=batch_size)
            elif self._oracle in ["rpg", "qpg", "rm", "a2c"]:
                training_agent = policy_gradient.PolicyGradient(
                    session=self._session,
                    player_id=current_player,
                    info_state_size=info_state_size,
                    num_actions=num_actions,
                    loss_str=self._oracle,
                    hidden_layers_sizes=hidden_layers_sizes,
                    batch_size=batch_size,
                    entropy_cost=entropy_cost,
                    critic_learning_rate=critic_learning_rate,
                    pi_learning_rate=pi_learning_rate,
                    num_critic_before_pi=num_critic_before_pi)
            else:
                raise ValueError("Oracle selected is not supported.")
        return training_agent

Example #8

def main_loop(unused_arg):
  """Trains a DQN agent in the catch environment."""
  env = catch.Environment()
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  train_episodes = FLAGS.num_episodes

  with tf.Session() as sess:
    if FLAGS.algorithm in {"rpg", "qpg", "rm", "a2c"}:
      agent = policy_gradient.PolicyGradient(
          sess,
          player_id=0,
          info_state_size=info_state_size,
          num_actions=num_actions,
          loss_str=FLAGS.algorithm,
          hidden_layers_sizes=[128, 128],
          batch_size=128,
          entropy_cost=0.01,
          critic_learning_rate=0.1,
          pi_learning_rate=0.1,
          num_critic_before_pi=3)
    elif FLAGS.algorithm == "dqn":
      agent = dqn.DQN(
          sess,
          player_id=0,
          state_representation_size=info_state_size,
          num_actions=num_actions,
          learning_rate=0.1,
          replay_buffer_capacity=10000,
          hidden_layers_sizes=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    elif FLAGS.algorithm == "eva":
      agent = eva.EVAAgent(
          sess,
          env,
          player_id=0,
          state_size=info_state_size,
          num_actions=num_actions,
          learning_rate=1e-3,
          trajectory_len=2,
          num_neighbours=2,
          mixing_parameter=0.95,
          memory_capacity=10000,
          dqn_hidden_layers=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    else:
      raise ValueError("Algorithm not implemented!")

    sess.run(tf.global_variables_initializer())

    # Train agent
    for ep in range(train_episodes):
      time_step = env.reset()
      while not time_step.last():
        agent_output = agent.step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)
      # Episode is over, step agent with final info state.
      agent.step(time_step)

      if ep and ep % FLAGS.eval_every == 0:
        logging.info("-" * 80)
        logging.info("Episode %s", ep)
        logging.info("Loss: %s", agent.loss)
        avg_return = _eval_agent(env, agent, 100)
        logging.info("Avg return: %s", avg_return)

Example #9

def main(_):
    game = "risk"
    num_players = 2

    env_configs = {
        "players": num_players,
        "map": 0,
        "rng_seed": -1,
        "max_turns": 90,
        "dep_abs": False,
        "atk_abs": True,
        "redist_abs": True,
        "fort_abs": True,
        "dep_q": 31,
        "atk_q": 2,
        "redist_q": 2,
        "fort_q": 2
    }
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    with tf.Session() as sess:
        # pylint: disable=g-complex-comprehension
        agents = [
            policy_gradient.PolicyGradient(sess,
                                           idx,
                                           info_state_size,
                                           num_actions,
                                           loss_str=FLAGS.loss_str,
                                           hidden_layers_sizes=(
                                               174,
                                               174,
                                               174,
                                           )) for idx in range(num_players)
        ]
        expl_policies_avg = PolicyGradientPolicies(env, agents)

        sess.run(tf.global_variables_initializer())

        if FLAGS.use_checkpoints:
            for agent in agents:
                if agent.has_checkpoint(FLAGS.checkpoint_dir):
                    agent.restore(FLAGS.checkpoint_dir)

        for ep in range(FLAGS.num_episodes):

            if (ep + 1) % FLAGS.eval_every == 0:
                losses = [agent.loss for agent in agents]
                msg = "-" * 80 + "\n"
                msg += "{}: {}\n".format(ep + 1, losses)
                if FLAGS.use_checkpoints:
                    for agent in agents:
                        agent.save(FLAGS.checkpoint_dir)
                logging.info("%s", msg)

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

Example #10

def PG_Solving(game, iterations, save_every=0, save_prefix='base'):
    class PolicyGradientPolicies(policy.Policy):
        """Joint policy to be evaluated."""

        def __init__(self, nfsp_policies):
            player_ids = [0, 1]
            super(PolicyGradientPolicies, self).__init__(game, player_ids)
            self._policies = nfsp_policies
            self._obs = {"info_state": [None, None], "legal_actions": [None, None]}

        def action_probabilities(self, state, player_id=None):
            cur_player = state.current_player()
            legal_actions = state.legal_actions(cur_player)

            self._obs["current_player"] = cur_player
            self._obs["info_state"][cur_player] = (
                state.information_state_tensor(cur_player))
            self._obs["legal_actions"][cur_player] = legal_actions

            info_state = rl_environment.TimeStep(
                observations=self._obs, rewards=None, discounts=None, step_type=None)

            p = self._policies[cur_player].step(info_state, is_evaluation=True).probs
            prob_dict = {action: p[action] for action in legal_actions}
            return prob_dict

    def save_pg():
        tabular_policy = policy.tabular_policy_from_callable(game, expl_policies_avg)
        policy_handler.save_tabular_policy(game, tabular_policy, "policies/PG/{}/{}".format(save_prefix, it))

    num_players = 2
    env = rl_environment.Environment(game, **{"players": num_players})
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    with tf.Session() as sess:
        # pylint: disable=g-complex-comprehension
        agents = [
            policy_gradient.PolicyGradient(
                sess,
                idx,
                info_state_size,
                num_actions,
                loss_str="rpg",  # ["rpg", "qpg", "rm"] = PG loss to use.
                hidden_layers_sizes=(128,)) for idx in range(num_players)
        ]
        expl_policies_avg = PolicyGradientPolicies(agents)

        sess.run(tf.global_variables_initializer())
        for it in range(iterations + 1):
            if save_every != 0 and it % save_every == 0:  # order is important
                save_pg()

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)
            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
        save_pg()
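
# Hypothetical usage sketch: PG_Solving expects a loaded pyspiel game object
# (it is handed to policy.Policy and tabular_policy_from_callable above); the
# game choice and iteration counts here are illustrative only.
if __name__ == "__main__":
    import pyspiel

    PG_Solving(pyspiel.load_game("kuhn_poker"),
               iterations=100000,
               save_every=10000,
               save_prefix="kuhn")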