Example #1
    def test_simple_game(self):
        game = pyspiel.load_efg_game(SIMPLE_EFG_DATA)
        env = rl_environment.Environment(game=game)
        with self.session() as sess:
            agent = dqn.DQN(sess,
                            0,
                            state_representation_size=game.
                            information_state_tensor_shape()[0],
                            num_actions=game.num_distinct_actions(),
                            hidden_layers_sizes=[16],
                            replay_buffer_capacity=100,
                            batch_size=5,
                            epsilon_start=0.02,
                            epsilon_end=0.01)
            total_reward = 0
            sess.run(tf.global_variables_initializer())

            for _ in range(100):
                time_step = env.reset()
                while not time_step.last():
                    agent_output = agent.step(time_step)
                    time_step = env.step([agent_output.action])
                    total_reward += time_step.rewards[0]
                agent.step(time_step)
            self.assertGreaterEqual(total_reward, 75)
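The snippets in this collection omit their import preambles. A sketch of the imports they appear to assume is below; exact module paths can differ between OpenSpiel versions, and random_agent / tabular_qlearner are only needed by the examples that use them.

import numpy as np
import tensorflow.compat.v1 as tf

import pyspiel
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import dqn
from open_spiel.python.algorithms import random_agent
from open_spiel.python.algorithms import tabular_qlearner

# The TF1-style dqn.DQN agent builds a static graph, so when running under
# TensorFlow 2 the v1 behavior is usually enabled explicitly.
tf.disable_v2_behavior()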
Example #2
def main(_):
    game = "breakthrough"
    num_players = 2

    env_configs = {"columns": 5, "rows": 5}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    with tf.Session() as sess:
        hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        agents = [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        print(type(agents[0].get_weights()), agents[0].get_weights())
Example #3
    def test_run_tic_tac_toe(self):
        env = rl_environment.Environment("tic_tac_toe")
        state_size = env.observation_spec()["info_state"][0]
        num_actions = env.action_spec()["num_actions"]

        with self.session() as sess:
            agents = [
                dqn.DQN(  # pylint: disable=g-complex-comprehension
                    sess,
                    player_id,
                    state_representation_size=state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=[16],
                    replay_buffer_capacity=10,
                    batch_size=5) for player_id in [0, 1]
            ]
            sess.run(tf.global_variables_initializer())
            time_step = env.reset()
            while not time_step.last():
                current_player = time_step.observations["current_player"]
                current_agent = agents[current_player]
                agent_output = current_agent.step(time_step)
                time_step = env.step([agent_output.action])

            for agent in agents:
                agent.step(time_step)
Example #4
def create_training_agents(num_players, sess, num_actions, info_state_size,
                           hidden_layers_sizes):
    """Create the agents we want to use for learning."""
    if FLAGS.learner == "qlearning":
        # pylint: disable=g-complex-comprehension
        return [
            tabular_qlearner.QLearner(
                player_id=idx,
                num_actions=num_actions,
                # step_size=0.02,
                step_size=0.1,
                # epsilon_schedule=rl_tools.ConstantSchedule(0.5),
                epsilon_schedule=rl_tools.LinearSchedule(0.5, 0.2, 1000000),
                discount_factor=0.99) for idx in range(num_players)
        ]
    elif FLAGS.learner == "dqn":
        # pylint: disable=g-complex-comprehension
        return [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    discount_factor=0.99,
                    epsilon_start=0.5,
                    epsilon_end=0.1,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
    else:
        raise RuntimeError("Unknown learner")
Example #5
def main(_):
  game = FLAGS.game
  num_players = 1
  games, rewards, _, _ = mst.game_params(FLAGS.num_nodes)

  env_configs = games[0]
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3 #env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  print("Info State Size: ", info_state_size)
  print("Num Actions: ", num_actions)  
    
  # random agents for evaluation
  random_agents = [
      random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
      for idx in range(num_players)
  ]

  with tf.Session() as sess:
    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    # pylint: disable=g-complex-comprehension
    agents = [
        dqn.DQN(
            session=sess,
            player_id=idx,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            hidden_layers_sizes=hidden_layers_sizes,
            replay_buffer_capacity=FLAGS.replay_buffer_capacity,
            batch_size=FLAGS.batch_size) for idx in range(num_players)
    ]
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    for ep in range(FLAGS.num_train_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        r_mean = eval_against_random_bots(env, agents, random_agents, 1)
        logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
        saver.save(sess, FLAGS.checkpoint_dir, ep)
        print("Actual MST Value: ", rewards[0])

      #env = rl_environment.Environment(game, **games[ep])
      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        if env.is_turn_based:
          agent_output = agents[player_id].step(time_step)
          action_list = [agent_output.action]
        else:
          agents_output = [agent.step(time_step) for agent in agents]
          action_list = [agent_output.action for agent_output in agents_output]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)

    print("Actual MST: ", rewards)
Example #6
def main(_):
    game = "breakthrough"
    num_players = 2

    env_configs = {"columns": 5, "rows": 5}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    with tf.Session() as sess:
        hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        agents = [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
        sess.run(tf.global_variables_initializer())

        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                r_mean = eval_against_random_bots(env, agents, random_agents,
                                                  1000)
                logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
            if (ep + 1) % FLAGS.save_every == 0:
                for agent in agents:
                    agent.save(FLAGS.checkpoint_dir)

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if env.is_turn_based:
                    agent_output = agents[player_id].step(time_step)
                    action_list = [agent_output.action]
                else:
                    agents_output = [agent.step(time_step) for agent in agents]
                    action_list = [
                        agent_output.action for agent_output in agents_output
                    ]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
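Several of these training loops (Examples #5, #6, #11, #12 and #17) call an eval_against_random_bots helper that is not shown. A minimal sketch of what such a helper typically looks like in the OpenSpiel example scripts follows; the exact signature and averaging scheme are assumptions inferred from the call sites.

def eval_against_random_bots(env, trained_agents, random_agents, num_episodes):
    """Plays each trained agent against random opponents and returns mean rewards."""
    num_players = len(trained_agents)
    sum_episode_rewards = np.zeros(num_players)
    for player_pos in range(num_players):
        # The trained agent sits at player_pos; every other seat is a random agent.
        cur_agents = random_agents[:]
        cur_agents[player_pos] = trained_agents[player_pos]
        for _ in range(num_episodes):
            time_step = env.reset()
            episode_rewards = 0
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if env.is_turn_based:
                    agent_output = cur_agents[player_id].step(
                        time_step, is_evaluation=True)
                    action_list = [agent_output.action]
                else:
                    agents_output = [
                        a.step(time_step, is_evaluation=True) for a in cur_agents
                    ]
                    action_list = [output.action for output in agents_output]
                time_step = env.step(action_list)
                episode_rewards += time_step.rewards[player_pos]
            sum_episode_rewards[player_pos] += episode_rewards
    return sum_episode_rewards / num_episodes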
Example #7
  def test_run_landlord(self):
    # landlord is an optional game, so check we have it before running the test.
    game = "landlord"
    if game not in pyspiel.registered_names():
      return

    num_players = 3
    env_configs = {}
    env = rl_environment.Environment(game, **env_configs)
    state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    with self.session() as sess:
      agents = [
          dqn.DQN(  # pylint: disable=g-complex-comprehension
              sess,
              player_id,
              state_representation_size=state_size,
              num_actions=num_actions,
              hidden_layers_sizes=[16],
              replay_buffer_capacity=10,
              batch_size=5) for player_id in range(num_players)
      ]
      sess.run(tf.global_variables_initializer())
      time_step = env.reset()
      while not time_step.last():
        current_player = time_step.observations["current_player"]
        #agent_output = [agent.step(time_step) for agent in agents]
        #time_step = env.step([agent_output[current_player].action])
        if env.is_turn_based:
          agent_output = agents[current_player].step(time_step)
          action_list = [agent_output.action]
        else:
          agents_output = [agent.step(time_step) for agent in agents]
          action_list = [agent_output.action for agent_output in agents_output]
        print_iteration(time_step, current_player, action_list)
        time_step = env.step(action_list)

      for agent in agents:
        agent.step(time_step)
Example #8
    def test_run_hanabi(self):
        # Hanabi is an optional game, so check we have it before running the test.
        game = "hanabi"
        if game not in pyspiel.registered_names():
            return

        num_players = 3
        env_configs = {
            "players": num_players,
            "max_life_tokens": 1,
            "colors": 2,
            "ranks": 3,
            "hand_size": 2,
            "max_information_tokens": 3,
            "discount": 0.
        }
        env = rl_environment.Environment(game, **env_configs)
        state_size = env.observation_spec()["info_state"][0]
        num_actions = env.action_spec()["num_actions"]

        with self.session() as sess:
            agents = [
                dqn.DQN(  # pylint: disable=g-complex-comprehension
                    sess,
                    player_id,
                    state_representation_size=state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=[16],
                    replay_buffer_capacity=10,
                    batch_size=5) for player_id in range(num_players)
            ]
            sess.run(tf.global_variables_initializer())
            time_step = env.reset()
            while not time_step.last():
                current_player = time_step.observations["current_player"]
                agent_output = [agent.step(time_step) for agent in agents]
                time_step = env.step([agent_output[current_player].action])

            for agent in agents:
                agent.step(time_step)
Example #9
  def build_graph(self,
                  scope_name,
                  current_player,
                  info_state_size,
                  num_actions,
                  hidden_layers_sizes=[64, 64],
                  replay_buffer_capacity=1e5,
                  batch_size=32,
                  entropy_cost=0.001,
                  critic_learning_rate=0.01,
                  pi_learning_rate=0.01,
                  num_critic_before_pi=32):
    with tf.variable_scope(scope_name):
      if self._oracle == "dqn":
        training_agent = dqn.DQN(
            session=self._session,
            player_id=current_player,
            state_representation_size=info_state_size,
            num_actions=num_actions,
            hidden_layers_sizes=hidden_layers_sizes,
            replay_buffer_capacity=replay_buffer_capacity,
            batch_size=batch_size)
      elif self._oracle in ["rpg", "qpg", "rm", "a2c"]:
        training_agent = policy_gradient.PolicyGradient(
            session=self._session,
            player_id=current_player,
            info_state_size=info_state_size,
            num_actions=num_actions,
            loss_str=self._oracle,
            hidden_layers_sizes=hidden_layers_sizes,
            batch_size=batch_size,
            entropy_cost=entropy_cost,
            critic_learning_rate=critic_learning_rate,
            pi_learning_rate=pi_learning_rate,
            num_critic_before_pi=num_critic_before_pi)
      else:
        raise ValueError("Oracle selected is not supported.")
    return training_agent
Example #10
def dqn_train(unused_arg):
    env = rl_environment.Environment(FLAGS.game)
    state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]
    sess = tf.Session()
    players = [
        dqn.DQN(sess,
                idx,
                state_representation_size=state_size,
                num_actions=num_actions,
                hidden_layers_sizes=[64],
                batch_size=128,
                learn_every=64,
                replay_buffer_capacity=2e5,
                epsilon_decay_duration=FLAGS.episodes,
                epsilon_start=0.06,
                epsilon_end=0.001) for idx in range(2)
    ]
    expl_policies_avg = NFSPPolicies(env, players, nfsp.MODE.average_policy)
    run_agents(sess, env, players, expl_policies_avg)
    sess.close()
Example #11
def main(_):
    game = "tic_tac_toe"
    num_players = 2
    env = rl_environment.Environment(game)
    state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    hidden_layers_sizes = [32, 32]
    replay_buffer_capacity = int(1e4)
    train_episodes = FLAGS.num_episodes
    loss_report_interval = 1000

    with tf.Session() as sess:
        dqn_agent = dqn.DQN(sess,
                            player_id=0,
                            state_representation_size=state_size,
                            num_actions=num_actions,
                            hidden_layers_sizes=hidden_layers_sizes,
                            replay_buffer_capacity=replay_buffer_capacity)
        tabular_q_agent = tabular_qlearner.QLearner(player_id=1,
                                                    num_actions=num_actions)
        agents = [dqn_agent, tabular_q_agent]

        sess.run(tf.global_variables_initializer())

        # Train agent
        for ep in range(train_episodes):
            if ep and ep % loss_report_interval == 0:
                logging.info("[%s/%s] DQN loss: %s", ep, train_episodes,
                             agents[0].loss)
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

        # Evaluate against random agent
        random_agents = [
            random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
            for idx in range(num_players)
        ]
        r_mean = eval_against_random_bots(env, agents, random_agents, 1000)
        logging.info("Mean episode rewards: %s", r_mean)

        if not FLAGS.iteractive_play:
            return

        # Play from the command line against the trained DQN agent.
        human_player = 1
        while True:
            logging.info("You are playing as %s", "O" if human_player else "X")
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == human_player:
                    agent_out = agents[human_player].step(time_step,
                                                          is_evaluation=True)
                    logging.info("\n%s", agent_out.probs.reshape((3, 3)))
                    logging.info("\n%s", pretty_board(time_step))
                    action = command_line_action(time_step)
                else:
                    agent_out = agents[1 - human_player].step(
                        time_step, is_evaluation=True)
                    action = agent_out.action
                time_step = env.step([action])

            logging.info("\n%s", pretty_board(time_step))

            logging.info("End of game!")
            if time_step.rewards[human_player] > 0:
                logging.info("You win")
            elif time_step.rewards[human_player] < 0:
                logging.info("You lose")
            else:
                logging.info("Draw")
            # Switch order of players
            human_player = 1 - human_player
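Example #11 also relies on pretty_board and command_line_action helpers that are not shown. The sketches below are assumptions: pretty_board presumes the tic_tac_toe info_state tensor is laid out as three 9-cell planes (empty, X, O), and command_line_action simply keeps prompting until a legal action id is entered.

import sys

def pretty_board(time_step):
    """Returns the tic_tac_toe board as a 3x3 array of ".", "X" and "O"."""
    info_state = time_step.observations["info_state"][0]
    x_locations = np.nonzero(info_state[9:18])[0]
    o_locations = np.nonzero(info_state[18:])[0]
    board = np.full(3 * 3, ".")
    board[x_locations] = "X"
    board[o_locations] = "O"
    return np.reshape(board, (3, 3))

def command_line_action(time_step):
    """Gets a legal action id from the user on the command line."""
    current_player = time_step.observations["current_player"]
    legal_actions = time_step.observations["legal_actions"][current_player]
    action = -1
    while action not in legal_actions:
        print("Choose an action from {}:".format(legal_actions))
        sys.stdout.flush()
        action_str = input()
        try:
            action = int(action_str)
        except ValueError:
            continue
    return action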
Example #12
def main(_):
    game = "skat"
    num_players = 3

    env_configs = {}
    env = rl_environment.Environment(game, **env_configs)
    observation_tensor_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]

    with tf.Session() as sess:
        summaries_dir = os.path.join(FLAGS.checkpoint_dir, "random_eval")
        summary_writer = tf.summary.FileWriter(summaries_dir,
                                               tf.get_default_graph())
        hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        agents = [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=observation_tensor_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                r_mean = eval_against_random_bots(env, agents, random_agents,
                                                  FLAGS.num_eval_games)
                logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
                for i in range(num_players):
                    summary = tf.Summary()
                    summary.value.add(tag="mean_reward/random_{}".format(i),
                                      simple_value=r_mean[i])
                    summary_writer.add_summary(summary, ep)
                summary_writer.flush()
                saver.save(sess, FLAGS.checkpoint_dir, ep)

            time_step = env.reset()
            # Randomize position.
            if FLAGS.randomize_positions:
                positions = random.sample(range(len(agents)), len(agents))
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if FLAGS.randomize_positions:
                    position = positions[player_id]
                    agents[position].player_id = player_id
                else:
                    position = player_id
                agent_output = agents[position].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
Example #13
    def __init__(self,
                 session,
                 game,
                 player_id,
                 state_size,
                 num_actions,
                 embedding_network_layers=(128, ),
                 embedding_size=16,
                 dqn_hidden_layers=(128, 128),
                 batch_size=16,
                 trajectory_len=10,
                 num_neighbours=5,
                 learning_rate=1e-4,
                 mixing_parameter=0.9,
                 memory_capacity=int(1e6),
                 discount_factor=1.0,
                 update_target_network_every=1000,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay_duration=int(1e4),
                 embedding_as_parametric_input=False):
        """Initialize the Ephemeral VAlue Adjustment algorithm.

    Args:
      session: (tf.Session) TensorFlow session.
      game: (rl_environment.Environment) Open Spiel game.
      player_id: (int) Player id for this player.
      state_size: (int) Size of info state vector.
      num_actions: (int) number of actions.
      embedding_network_layers: (list[int]) Layer sizes of strategy net MLP.
      embedding_size: (int) Size of memory embeddings.
      dqn_hidden_layers: (list(int)) MLP layer sizes of DQN network.
      batch_size: (int) Size of batches for DQN learning steps.
      trajectory_len: (int) Length of trajectories from replay buffer.
      num_neighbours: (int) Number of neighbours to fetch from replay buffer.
      learning_rate: (float) Learning rate.
      mixing_parameter: (float) Value mixing parameter between 0 and 1.
      memory_capacity: Number of samples that can be stored in memory.
      discount_factor: (float) Discount factor for Q-Learning.
      update_target_network_every: How often to update DQN target network.
      epsilon_start: (float) Starting epsilon-greedy value.
      epsilon_end: (float) Final epsilon-greedy value.
      epsilon_decay_duration: (float) Number of steps over which epsilon decays.
      embedding_as_parametric_input: (bool) Whether we use embeddings as input
        to the parametric model.
    """
        assert (mixing_parameter >= 0 and mixing_parameter <= 1)
        self._game = game
        self._session = session
        self.player_id = player_id
        self._env = game
        self._num_actions = num_actions
        self._info_state_size = state_size
        self._embedding_size = embedding_size
        self._lambda = mixing_parameter
        self._trajectory_len = trajectory_len
        self._num_neighbours = num_neighbours
        self._discount = discount_factor
        self._epsilon_start = epsilon_start
        self._epsilon_end = epsilon_end
        self._epsilon_decay_duration = epsilon_decay_duration
        self._last_time_step = None
        self._last_action = None
        self._embedding_as_parametric_input = embedding_as_parametric_input

        # Create required TensorFlow placeholders to perform the Q-network updates.
        self._info_state_ph = tf.placeholder(
            shape=[None, self._info_state_size],
            dtype=tf.float32,
            name="info_state_ph")
        self._embedding_network = snt.nets.MLP(
            list(embedding_network_layers) + [embedding_size])
        self._embedding = self._embedding_network(self._info_state_ph)

        # The DQN agent requires this be an integer.
        if not isinstance(memory_capacity, int):
            raise ValueError("Memory capacity not an integer.")

        # Initialize the parametric & non-parametric Q-networks.
        self._agent = dqn.DQN(
            session,
            player_id,
            state_representation_size=self._info_state_size,
            num_actions=self._num_actions,
            hidden_layers_sizes=list(dqn_hidden_layers),
            replay_buffer_capacity=memory_capacity,
            replay_buffer_class=QueryableFixedSizeRingBuffer,
            batch_size=batch_size,
            learning_rate=learning_rate,
            update_target_network_every=update_target_network_every,
            learn_every=batch_size,
            discount_factor=1.0,
            epsilon_start=1.0,
            epsilon_end=0.1,
            epsilon_decay_duration=int(1e6))
        # Initialize Value Buffers - Fetch Replay buffers from agents.
        self._value_buffer = QueryableFixedSizeRingBuffer(memory_capacity)
        self._replay_buffer = self._agent.replay_buffer

        # Initialize non-parametric & EVA Q-values.
        self._v_np = collections.defaultdict(float)
        self._q_np = collections.defaultdict(lambda: [0] * self._num_actions)
        self._q_eva = collections.defaultdict(lambda: [0] * self._num_actions)
Example #14
    def __init__(self,
                 session,
                 player_id,
                 state_representation_size,
                 num_actions,
                 hidden_layers_sizes,
                 reservoir_buffer_capacity,
                 anticipatory_param,
                 batch_size=128,
                 rl_learning_rate=0.01,
                 sl_learning_rate=0.01,
                 min_buffer_size_to_learn=1000,
                 learn_every=64,
                 optimizer_str="sgd",
                 **kwargs):
        """Initialize the `NFSP` agent."""
        self.player_id = player_id
        self._session = session
        self._num_actions = num_actions
        self._layer_sizes = hidden_layers_sizes + [num_actions]
        self._batch_size = batch_size
        self._learn_every = learn_every
        self._anticipatory_param = anticipatory_param
        self._min_buffer_size_to_learn = min_buffer_size_to_learn

        self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None

        # Step counter to keep track of learning.
        self._step_counter = 0

        # Inner RL agent
        kwargs.update({
            "batch_size": batch_size,
            "learning_rate": rl_learning_rate,
            "learn_every": learn_every,
            "min_buffer_size_to_learn": min_buffer_size_to_learn,
            "optimizer_str": optimizer_str,
        })
        self._rl_agent = dqn.DQN(session, player_id, state_representation_size,
                                 num_actions, hidden_layers_sizes, **kwargs)

        # Keep track of the last training loss achieved in an update step.
        self._last_rl_loss_value = lambda: self._rl_agent.loss
        self._last_sl_loss_value = None

        # Placeholders.
        self._info_state_ph = tf.placeholder(
            shape=[None, state_representation_size],
            dtype=tf.float32,
            name="info_state_ph")

        self._action_probs_ph = tf.placeholder(shape=[None, num_actions],
                                               dtype=tf.float32,
                                               name="action_probs_ph")

        self._legal_actions_mask_ph = tf.placeholder(
            shape=[None, num_actions],
            dtype=tf.float32,
            name="legal_actions_mask_ph")

        # Average policy network.
        self._avg_network = snt.nets.MLP(output_sizes=self._layer_sizes)
        self._avg_policy = self._avg_network(self._info_state_ph)
        self._avg_policy_probs = tf.nn.softmax(self._avg_policy)

        # Loss
        self._loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=tf.stop_gradient(self._action_probs_ph),
                logits=self._avg_policy))

        if optimizer_str == "adam":
            optimizer = tf.train.AdamOptimizer(learning_rate=sl_learning_rate)
        elif optimizer_str == "sgd":
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=sl_learning_rate)
        else:
            raise ValueError("Not implemented. Choose from ['adam', 'sgd'].")

        self._learn_step = optimizer.minimize(self._loss)
        self._sample_episode_policy()
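The NFSP constructor above stores supervised-learning targets in a ReservoirBuffer so that sampled behavior approximates the average policy. A minimal sketch of such a reservoir-sampling buffer is shown below; the real class ships with OpenSpiel's nfsp module, so the method set here is an assumption limited to what the constructor needs.

import random

class ReservoirBuffer(object):
    """Keeps a uniform sample over a stream of added elements."""

    def __init__(self, reservoir_buffer_capacity):
        self._reservoir_buffer_capacity = reservoir_buffer_capacity
        self._data = []
        self._add_calls = 0

    def add(self, element):
        # Classic reservoir sampling: once full, the new element replaces a
        # uniformly chosen slot with probability capacity / (add_calls + 1).
        if len(self._data) < self._reservoir_buffer_capacity:
            self._data.append(element)
        else:
            idx = np.random.randint(0, self._add_calls + 1)
            if idx < self._reservoir_buffer_capacity:
                self._data[idx] = element
        self._add_calls += 1

    def sample(self, num_samples):
        if len(self._data) < num_samples:
            raise ValueError("{} elements could not be sampled from size {}".format(
                num_samples, len(self._data)))
        return random.sample(self._data, num_samples)

    def __len__(self):
        return len(self._data)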
Example #15
    num_actions = env.action_spec()["num_actions"]

    hidden_layers_sizes = parameters.hidden_layers_sizes
    replay_buffer_capacity = int(1e4)
    train_episodes = 50000
    loss_report_interval = 1000
    delta_rank = 10
    max_rank = 20
    min_rank = -20

    with tf.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        dqn_agents = [
            dqn.DQN(sess,
                    player_id=idx,
                    state_representation_size=state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=replay_buffer_capacity)
            for idx in range(num_players)
        ]

        stds = []
        # This for loop runs a ladder over the desired versions of the agent: to rank only a single version of the agent, use the first line; otherwise, use the second.
        for version in [159]:
            #for version in range(1, 160, 5) :
            jjs = []
            for i in range(num_players):
                dqn_agents[i].restore(parameters.agent_path, str(i),
                                      str(version))

            # Creation of the population of jean jacques, with its
Example #16
def main(_):
  game = "lewis_signaling"
  num_players = 2

  num_states = FLAGS.num_states
  num_messages = FLAGS.num_messages
  if FLAGS.payoffs == "random":
    payoffs = np.random.random((num_states, num_states))
    payoffs_str = ",".join([str(x) for x in payoffs.flatten()])
  elif FLAGS.payoffs == "climbing":
    # This is a particular payoff matrix that is hard for decentralized
    # algorithms. Introduced in C. Claus and C. Boutilier, "The dynamics of
    # reinforcement learning in cooperative multiagent systems", 1998, for
    # simultaneous action games, but it is difficult even in the case of
    # signaling games.
    payoffs = np.array([[11, -30, 0], [-30, 7, 6], [0, 0, 5]]) / 30
    payoffs_str = ",".join([str(x) for x in payoffs.flatten()])
  else:
    payoffs_str = FLAGS.payoffs
    try:
      payoffs_list = [float(x) for x in payoffs_str.split(",")]
      payoffs = np.array(payoffs_list).reshape((num_states, num_states))
    except ValueError:
      raise ValueError(
          "There should be {} (states * actions) elements in payoff. Found {} elements"
          .format(num_states * num_states, len(payoffs_list)))

  env_configs = {
      "num_states": num_states,
      "num_messages": num_messages,
      "payoffs": payoffs_str
  }

  env = rl_environment.Environment(game, **env_configs)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]
  replay_buffer_capacity = FLAGS.replay_buffer_capacity

  # Results to store
  num_runs = FLAGS.num_runs
  training_episodes = FLAGS.num_episodes
  log_interval = FLAGS.log_interval
  rewards = np.zeros((num_runs, training_episodes // log_interval))
  opts = np.zeros((num_runs, training_episodes // log_interval))
  converge_point = np.zeros((num_states, num_states))
  percent_opt = 0

  # Repeat the experiment num_runs times
  for i in range(num_runs):
    with tf.Session() as sess:
      # pylint: disable=g-complex-comprehension
      agents = [
          dqn.DQN(
              sess,
              player_id=idx,
              state_representation_size=state_size,
              num_actions=num_actions,
              learning_rate=FLAGS.step_size,
              replay_buffer_capacity=replay_buffer_capacity,
              epsilon_start=FLAGS.eps_init,
              epsilon_end=FLAGS.eps_final,
              epsilon_decay_duration=FLAGS.eps_decay_steps * 2)
          for idx in range(num_players)
      ]

      # 1. Train the agents
      for cur_episode in range(training_episodes):
        time_step = env.reset()
        # Find cur_state for logging. See lewis_signaling.cc for info_state
        # details
        cur_state = time_step.observations["info_state"][0][3:].index(1)
        while not time_step.last():
          player_id = time_step.observations["current_player"]
          agent_output = agents[player_id].step(time_step)
          time_step = env.step([agent_output.action])

        # Episode is over, step all agents with final info state.
        for agent in agents:
          agent.step(time_step)

        # Store rewards
        reward = time_step.rewards[0]
        max_reward = payoffs[cur_state].max()
        cur_idx = (i, cur_episode // log_interval)
        rewards[cur_idx] += reward / log_interval
        opts[cur_idx] += np.isclose(reward, max_reward) / log_interval

      base_info_state0 = [1.0, 0.0, 0.0] + [0.0] * num_states
      base_info_state1 = [0.0, 1.0, 0.0] + [0.0] * num_states

      for s in range(num_states):
        info_state0 = copy.deepcopy(base_info_state0)
        info_state0[3 + s] = 1.0
        # pylint: disable=protected-access
        m, _ = agents[0]._epsilon_greedy(info_state0, np.arange(num_messages),
                                         0)
        info_state1 = copy.deepcopy(base_info_state1)
        info_state1[3 + m] = 1.0
        a, _ = agents[1]._epsilon_greedy(info_state1, np.arange(num_states), 0)
        converge_point[s, a] += 1
        best_act = payoffs[s].argmax()
        percent_opt += int(a == best_act) / num_runs / num_states

  if FLAGS.plot:
    # pylint: disable=g-import-not-at-top
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import scipy.stats as stats

    params = {
        "font.size": 13,
        "axes.labelsize": 13,
        "xtick.labelsize": 13,
        "ytick.labelsize": 13,
    }
    mpl.rcParams.update(params)

    def init_fig():
      fig, ax = plt.subplots(1, 1)
      ax.spines["top"].set_visible(False)
      ax.spines["right"].set_visible(False)
      return fig, ax

    def plot_scalars(scalars,
                     repetition_axis=0,
                     scalar_labels=None,
                     title=None,
                     ax_labels=None):
      """Plots scalar on ax by filling 1 standard error.

      Args:
          scalars: List of scalars to plot (mean taken over repetition
            axis)
          repetition_axis: Axis to take the mean over
          scalar_labels: Labels for the scalars (for legend)
          title: Figure title
          ax_labels: Labels for x and y axis (list of 2 strings)
      """
      if not all([len(s.shape) == 2 for s in scalars]):
        raise ValueError("Only 2D arrays supported for plotting")

      if scalar_labels is None:
        scalar_labels = [None] * len(scalars)

      if len(scalars) != len(scalar_labels):
        raise ValueError(
            "Wrong number of scalar labels, expected {} but received {}".format(
                len(scalars), len(scalar_labels)))

      _, plot_axis = init_fig()
      for i, scalar in enumerate(scalars):
        xs = np.arange(scalar.shape[1 - repetition_axis]) * FLAGS.log_interval
        mean = scalar.mean(axis=repetition_axis)
        sem = stats.sem(scalar, axis=repetition_axis)
        plot_axis.plot(xs, mean, label=scalar_labels[i])
        plot_axis.fill_between(xs, mean - sem, mean + sem, alpha=0.5)

      if title is not None:
        plot_axis.set_title(title)
      if ax_labels is not None:
        plot_axis.set_xlabel(ax_labels[0])
        plot_axis.set_ylabel(ax_labels[1])

    def plot_confusion_matrix(cm, cmap=plt.cm.Blues, title=None):
      """Plot the confusion matrix.

      Args:
          cm (np.ndarray): Confusion matrix to plot
          cmap: Color map to be used in matplotlib's imshow
          title: Figure title

      Returns:
          Figure and axis on which the confusion matrix is plotted.
      """
      fig, ax = plt.subplots()
      ax.imshow(cm, interpolation="nearest", cmap=cmap)
      ax.set_xticks([])
      ax.set_yticks([])
      ax.set_xlabel("Receiver's action", fontsize=14)
      ax.set_ylabel("Sender's state", fontsize=14)
      # Loop over data dimensions and create text annotations.
      fmt = "d"
      thresh = cm.max() / 2.
      for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
          ax.text(
              j,
              i,
              format(cm[i, j], fmt),
              ha="center",
              va="center",
              color="white" if cm[i, j] > thresh else "black")
      fig.tight_layout()
      if title is not None:
        ax.set_title(title)
      return fig, ax

    plot_scalars([rewards],
                 title="Reward graph (DQN)",
                 ax_labels=["Episodes", "Reward per episode"])
    plot_scalars([opts],
                 title="Percentage of optimal actions (DQN)",
                 ax_labels=["Episodes", "% optimal actions"])

    plot_confusion_matrix(
        converge_point.astype(int), title="Final policy (DQN)")

    plt.show()

  return percent_opt
Example #17
def main(_):
    game = FLAGS.game  # Set the game
    num_players = 1
    train_games, train_rewards, test_games, test_rewards = mst.game_params(
        FLAGS.num_nodes)  # Load from files
    env_configs = train_games[0]
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = FLAGS.num_nodes * FLAGS.num_nodes * 3  #env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()[
        "num_actions"]  # number of possible actions

    print("Info State Size: ", info_state_size)
    print("Num Actions: ", num_actions)

    # random agents for evaluation
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(num_players)
    ]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.125)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
        # pylint: disable=g-complex-comprehension
        agents = [
            dqn.DQN(session=sess,
                    player_id=idx,
                    state_representation_size=info_state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=hidden_layers_sizes,
                    replay_buffer_capacity=FLAGS.replay_buffer_capacity,
                    batch_size=FLAGS.batch_size) for idx in range(num_players)
        ]
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        #saver = tf.train.import_meta_graph('/home/jupyter/ORIE-GNN-bjk224/mst-game/dqn_checkpoints/dqn_20epochs_mst_medium/dqn_test-399999.meta')
        #saver.restore(sess, tf.train.latest_checkpoint('/home/jupyter/ORIE-GNN-bjk224/mst-game/dqn_checkpoints/dqn_20epochs_mst_medium/'))

        for ep in range(FLAGS.num_train_episodes):
            print(env_configs)
            #env_configs = train_games[ep % len(train_games)]
            #env = rl_environment.Environment(game, **env_configs)
            episode_reward = train_rewards[ep % len(train_games)]
            if (ep + 1) % FLAGS.eval_every == 0:
                r_mean = eval_against_random_bots(env, agents, random_agents,
                                                  0)
                logging.info("[%s] Mean episode rewards %s", ep + 1, r_mean)
                #saver.save(sess, FLAGS.checkpoint_dir, ep)
                print("Actual MST Value: ", episode_reward)
            if (ep + 1) % FLAGS.test_every == 0:
                test_accuracy = test_trained_bot(test_games, test_rewards,
                                                 agents[0], ep,
                                                 FLAGS.num_nodes, game,
                                                 FLAGS.game_version)
                logging.info("[%s] Test Accuracy: %s", ep + 1, test_accuracy)

            #env = rl_environment.Environment(game, **games[ep])
            time_step = env.reset()
            # print("TRAIN"+"*"*80)
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)
                #print("(Action, Reward): ", action_list[0], time_step.rewards[0])

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
Example #18
def main_loop(unused_arg):
  """Trains a DQN agent in the catch environment."""
  env = catch.Environment()
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  train_episodes = FLAGS.num_episodes

  with tf.Session() as sess:
    if FLAGS.algorithm in {"rpg", "qpg", "rm", "a2c"}:
      agent = policy_gradient.PolicyGradient(
          sess,
          player_id=0,
          info_state_size=info_state_size,
          num_actions=num_actions,
          loss_str=FLAGS.algorithm,
          hidden_layers_sizes=[128, 128],
          batch_size=128,
          entropy_cost=0.01,
          critic_learning_rate=0.1,
          pi_learning_rate=0.1,
          num_critic_before_pi=3)
    elif FLAGS.algorithm == "dqn":
      agent = dqn.DQN(
          sess,
          player_id=0,
          state_representation_size=info_state_size,
          num_actions=num_actions,
          learning_rate=0.1,
          replay_buffer_capacity=10000,
          hidden_layers_sizes=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    elif FLAGS.algorithm == "eva":
      agent = eva.EVAAgent(
          sess,
          env,
          player_id=0,
          state_size=info_state_size,
          num_actions=num_actions,
          learning_rate=1e-3,
          trajectory_len=2,
          num_neighbours=2,
          mixing_parameter=0.95,
          memory_capacity=10000,
          dqn_hidden_layers=[32, 32],
          epsilon_decay_duration=2000,  # 10% total data
          update_target_network_every=250)
    else:
      raise ValueError("Algorithm not implemented!")

    sess.run(tf.global_variables_initializer())

    # Train agent
    for ep in range(train_episodes):
      time_step = env.reset()
      while not time_step.last():
        agent_output = agent.step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)
      # Episode is over, step agent with final info state.
      agent.step(time_step)

      if ep and ep % FLAGS.eval_every == 0:
        logging.info("-" * 80)
        logging.info("Episode %s", ep)
        logging.info("Loss: %s", agent.loss)
        avg_return = _eval_agent(env, agent, 100)
        logging.info("Avg return: %s", avg_return)
Example #19
def main_loop(unused_arg):
    """RL main loop example."""
    logging.info("Registered games: %s", rl_environment.registered_games())
    logging.info("Creating game %s", FLAGS.game)

    #env_configs = {"players": FLAGS.num_players} if FLAGS.num_players else {}
    env_configs = {}
    env = rl_environment.Environment(FLAGS.game, **env_configs)
    state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    hidden_layers_sizes = [512, 512]
    replay_buffer_capacity = int(1e4)
    train_episodes = FLAGS.num_episodes
    loss_report_interval = 1000

    logging.info("Env specs: %s", env.observation_spec())
    logging.info("Action specs: %s", env.action_spec())

    with tf.Session() as sess:
        agents = [
            dqn.DQN(  # pylint: disable=g-complex-comprehension
                sess,
                player_id,
                state_representation_size=state_size,
                num_actions=num_actions,
                #hidden_layers_sizes=[16],
                #replay_buffer_capacity=10,
                hidden_layers_sizes=hidden_layers_sizes,
                replay_buffer_capacity=replay_buffer_capacity,
                batch_size=128) for player_id in range(3)
        ]
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        #latest_checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
        latest_checkpoint_path = tf.train.latest_checkpoint(
            FLAGS.checkpoint_dir)
        if latest_checkpoint_path:
            print('Restoring checkpoint: {0}'.format(latest_checkpoint_path))
            saver.restore(sess, latest_checkpoint_path)

        # Train agent
        for ep in range(train_episodes):
            #if ep and ep % loss_report_interval == 0:
            if (ep + 1) % FLAGS.eval_every == 0:
                logging.info("[%s/%s] DQN loss: %s   %s  %s", ep,
                             train_episodes, agents[0].loss, agents[1].loss,
                             agents[2].loss)
                saver.save(sess, FLAGS.checkpoint_dir, ep)

            time_step = env.reset()
            while not time_step.last():
                current_player = time_step.observations["current_player"]
                #agent_output = [agent.step(time_step) for agent in agents]
                #time_step = env.step([agent_output[current_player].action])
                if env.is_turn_based:
                    agent_output = agents[current_player].step(time_step)
                    action_list = [agent_output.action]
                else:
                    agents_output = [agent.step(time_step) for agent in agents]
                    action_list = [
                        agent_output.action for agent_output in agents_output
                    ]
                #print_iteration(time_step, current_player, action_list)
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)