Example #1
    def test_run_kuhn(self):
        env = rl_environment.Environment("kuhn_poker")
        state_size = env.observation_spec()["info_state"][0]
        num_actions = env.action_spec()["num_actions"]

        with self.session() as sess:
            agents = [
                nfsp.NFSP(  # pylint: disable=g-complex-comprehension
                    sess,
                    player_id,
                    state_representation_size=state_size,
                    num_actions=num_actions,
                    hidden_layers_sizes=[16],
                    reservoir_buffer_capacity=10,
                    anticipatory_param=0.1) for player_id in [0, 1]
            ]
            sess.run(tf.global_variables_initializer())

            for unused_ep in range(10):
                time_step = env.reset()
                while not time_step.last():
                    current_player = time_step.observations["current_player"]
                    current_agent = agents[current_player]
                    agent_output = current_agent.step(time_step)
                    time_step = env.step([agent_output.action])

                for agent in agents:
                    agent.step(time_step)
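All of these snippets drive OpenSpiel's TF1 NFSP agents in graph mode (tf.Session, tf.global_variables_initializer). A minimal preamble that makes them runnable under TensorFlow 2.x, assuming the standard OpenSpiel module paths, could look like this:

# Run the TF1-style graph code through the v1 compatibility layer.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# OpenSpiel imports used throughout the examples below.
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import nfsp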
Example #2
def nfsp_train(unused_arg):
    env = rl_environment.Environment(FLAGS.game)
    state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]
    kwargs = {
        "replay_buffer_capacity": 2e5,
        "epsilon_decay_duration": FLAGS.episodes,
        "epsilon_start": 0.06,
        "epsilon_end": 0.001,
    }

    sess = tf.Session()
    players = [
        nfsp.NFSP(sess,
                  idx,
                  state_representation_size=state_size,
                  num_actions=num_actions,
                  hidden_layers_sizes=[64],
                  reservoir_buffer_capacity=2e6,
                  rl_learning_rate=0.1,
                  sl_learning_rate=0.005,
                  anticipatory_param=0.1,
                  batch_size=128,
                  learn_every=64,
                  **kwargs) for idx in range(2)
    ]
    expl_policies_avg = NFSPPolicies(env, players, nfsp.MODE.average_policy)
    run_agents(sess, env, players, expl_policies_avg)
    sess.close()
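Several of the examples wrap the trained agents in NFSPPolicies(env, agents, nfsp.MODE.average_policy) before calling exploitability.exploitability, but only Example #8 below shows a definition (a local variant that closes over the game). For reference, a sketch in the style of the OpenSpiel NFSP example, taking the environment explicitly as Examples #2 and #4 assume, could look like this (it assumes policy is imported from open_spiel.python):

class NFSPPolicies(policy.Policy):
    """Joint policy built from the agents' average-strategy networks."""

    def __init__(self, env, nfsp_policies, mode):
        game = env.game
        player_ids = list(range(game.num_players()))
        super(NFSPPolicies, self).__init__(game, player_ids)
        self._policies = nfsp_policies
        self._mode = mode
        self._obs = {
            "info_state": [None] * game.num_players(),
            "legal_actions": [None] * game.num_players(),
        }

    def action_probabilities(self, state, player_id=None):
        cur_player = state.current_player()
        legal_actions = state.legal_actions(cur_player)

        self._obs["current_player"] = cur_player
        self._obs["info_state"][cur_player] = (
            state.information_state_tensor(cur_player))
        self._obs["legal_actions"][cur_player] = legal_actions

        info_state = rl_environment.TimeStep(
            observations=self._obs, rewards=None, discounts=None,
            step_type=None)

        # Query the agent's average policy without advancing its training state.
        with self._policies[cur_player].temp_mode_as(self._mode):
            p = self._policies[cur_player].step(
                info_state, is_evaluation=True).probs
        return {action: p[action] for action in legal_actions}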
Example #3
def main(_):
  game = "leduc_poker"
  num_players = 2
  env = rl_environment.Environment(game)
  state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]
  hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]

  kwargs = {
      "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
      "reservoir_buffer_capacity": FLAGS.reservoir_buffer_capacity,
      "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
      "anticipatory_param": FLAGS.anticipatory_param,
      "batch_size": FLAGS.batch_size,
      "learn_every": FLAGS.learn_every,
      "rl_learning_rate": FLAGS.rl_learning_rate,
      "sl_learning_rate": FLAGS.sl_learning_rate,
      "optimizer_str": FLAGS.optimizer_str,
      "loss_str": FLAGS.loss_str,
      "update_target_network_every": FLAGS.update_target_network_every,
      "discount_factor": FLAGS.discount_factor,
      "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
      "epsilon_start": FLAGS.epsilon_start,
      "epsilon_end": FLAGS.epsilon_end,
  }

  with tf.Session() as sess:
    agents = [
        nfsp.NFSP(sess, idx, state_size, num_actions, hidden_layers_sizes,
                  **kwargs) for idx in range(num_players)
    ]

    # for agent in agents[2:]:
    #     agent.restore("/home/benedikt/Dokumente/Uni/HCI/openspiel_saves/half_trained")

    for agent in agents:
      agent.restore(FLAGS.checkpoint_dir)
    # agents[1].restore("/home/benedikt/Dokumente/Uni/HCI/openspiel_saves/half_trained")

    # Evaluate against random agent
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions) for idx in range(num_players)
    ]

    r_mean = evaluateBotAgainstBot(env, agents[0], agents[1], 10000)
    logging.info("Mean episode rewards: %s", r_mean)

    #analyzeHistory()

    #r_mean = eval_against_random_bots(env, agents, random_agents, 10000)
    #logging.info("Mean episode rewards: %s", r_mean)

    '''if not FLAGS.iteractive_play:
Example #4
def main(unused_argv):
  logging.info("Loading %s", FLAGS.game_name)
  game = FLAGS.game_name
  num_players = FLAGS.num_players

  env_configs = {"players": num_players}
  env = rl_environment.Environment(game, **env_configs)
  info_state_size = env.observation_spec()["info_state"][0]
  num_actions = env.action_spec()["num_actions"]

  hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
  kwargs = {
      "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
      "reservoir_buffer_capacity": FLAGS.reservoir_buffer_capacity,
      "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
      "anticipatory_param": FLAGS.anticipatory_param,
      "batch_size": FLAGS.batch_size,
      "learn_every": FLAGS.learn_every,
      "rl_learning_rate": FLAGS.rl_learning_rate,
      "sl_learning_rate": FLAGS.sl_learning_rate,
      "optimizer_str": FLAGS.optimizer_str,
      "loss_str": FLAGS.loss_str,
      "update_target_network_every": FLAGS.update_target_network_every,
      "discount_factor": FLAGS.discount_factor,
      "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
      "epsilon_start": FLAGS.epsilon_start,
      "epsilon_end": FLAGS.epsilon_end,
  }

  with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  **kwargs) for idx in range(num_players)
    ]
    expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    sess.run(tf.global_variables_initializer())
    for ep in range(FLAGS.num_train_episodes):
      if (ep + 1) % FLAGS.eval_every == 0:
        losses = [agent.loss for agent in agents]
        logging.info("Losses: %s", losses)
        expl = exploitability.exploitability(env.game, expl_policies_avg)
        logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
        logging.info("_____________________________________________")

      time_step = env.reset()
      while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        action_list = [agent_output.action]
        time_step = env.step(action_list)

      # Episode is over, step all agents with final info state.
      for agent in agents:
        agent.step(time_step)
Example #5
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = FLAGS.game_name
    num_players = FLAGS.num_players

    env_configs = {
        "players": num_players,
        "map": 0,
        "rng_seed": -1,
        "max_turns": 90,
        "dep_abs": False,
        "atk_abs": True,
        "redist_abs": True,
        "fort_abs": True,
        "dep_q": 31,
        "atk_q": 2,
        "redist_q": 2,
        "fort_q": 2
    }
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
        "reservoir_buffer_capacity": FLAGS.reservoir_buffer_capacity,
        "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
        "anticipatory_param": FLAGS.anticipatory_param,
        "batch_size": FLAGS.batch_size,
        "learn_every": FLAGS.learn_every,
        "rl_learning_rate": FLAGS.rl_learning_rate,
        "sl_learning_rate": FLAGS.sl_learning_rate,
        "optimizer_str": FLAGS.optimizer_str,
        "loss_str": FLAGS.loss_str,
        "update_target_network_every": FLAGS.update_target_network_every,
        "discount_factor": FLAGS.discount_factor,
        "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
        "epsilon_start": FLAGS.epsilon_start,
        "epsilon_end": FLAGS.epsilon_end,
    }

    with tf.Session() as sess:
        # pylint: disable=g-complex-comprehension
        agents = [
            nfsp.NFSP(sess, idx, info_state_size, num_actions,
                      hidden_layers_sizes, **kwargs)
            for idx in range(num_players)
        ]
        joint_avg_policy = NFSPPolicies(env, agents, nfsp.MODE.average_policy)
        sess.run(tf.global_variables_initializer())
        if FLAGS.use_checkpoints:
            for agent in agents:
                if agent.has_checkpoint(FLAGS.checkpoint_dir):
                    agent.restore(FLAGS.checkpoint_dir)

        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            if FLAGS.human_player_id == player_id:
                print(time_step.observations['info_state'][player_id])
                print(time_step.observations['legal_actions'][player_id])
                visualise(time_step.observations['info_state'][player_id])
                human_action = input('Human action:')
                time_step = env.step([int(human_action)])
            else:
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                print(action_list)
                time_step = env.step(action_list)
Example #6
def main(unused_argv):
    logging.info("Loading %s", FLAGS.game_name)
    game = FLAGS.game_name
    num_players = FLAGS.num_players

    env_configs = {"players": num_players}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
        "reservoir_buffer_capacity": FLAGS.reservoir_buffer_capacity,
        "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
        "anticipatory_param": FLAGS.anticipatory_param,
        "batch_size": FLAGS.batch_size,
        "learn_every": FLAGS.learn_every,
        "rl_learning_rate": FLAGS.rl_learning_rate,
        "sl_learning_rate": FLAGS.sl_learning_rate,
        "optimizer_str": FLAGS.optimizer_str,
        "loss_str": FLAGS.loss_str,
        "update_target_network_every": FLAGS.update_target_network_every,
        "discount_factor": FLAGS.discount_factor,
        "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
        "epsilon_start": FLAGS.epsilon_start,
        "epsilon_end": FLAGS.epsilon_end,
    }

    with tf.Session() as sess:
        # pylint: disable=g-complex-comprehension
        agents = [
            nfsp.NFSP(sess, idx, info_state_size, num_actions,
                      hidden_layers_sizes, **kwargs)
            for idx in range(num_players)
        ]
        joint_avg_policy = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

        sess.run(tf.global_variables_initializer())

        if FLAGS.use_checkpoints:
            for agent in agents:
                if agent.has_checkpoint(FLAGS.checkpoint_dir):
                    agent.restore(FLAGS.checkpoint_dir)

        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                losses = [agent.loss for agent in agents]
                logging.info("Losses: %s", losses)
                if FLAGS.evaluation_metric == "exploitability":
                    # Avg exploitability is implemented only for 2 players constant-sum
                    # games, use nash_conv otherwise.
                    expl = exploitability.exploitability(
                        env.game, joint_avg_policy)
                    logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
                elif FLAGS.evaluation_metric == "nash_conv":
                    nash_conv = exploitability.nash_conv(
                        env.game, joint_avg_policy)
                    logging.info("[%s] NashConv %s", ep + 1, nash_conv)
                else:
                    raise ValueError(" ".join(
                        ("Invalid evaluation metric, choose from",
                         "'exploitability', 'nash_conv'.")))
                if FLAGS.use_checkpoints:
                    for agent in agents:
                        agent.save(FLAGS.checkpoint_dir)
                logging.info("_____________________________________________")

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
Example #7
def neural_ficticious_self_play(seq_game,
                                num_epoch,
                                sess,
                                compute_metrics=False):
    env = rl_environment.Environment(seq_game)
    # Parameters from the game.
    num_players = env.num_players
    num_actions = env.action_spec()["num_actions"]
    info_state_size = env.observation_spec()["info_state"][0]

    # Parameters for the algorithm.
    hidden_layers_sizes = [int(l) for l in [128]]

    kwargs = {
        "replay_buffer_capacity": int(2e5),
        "reservoir_buffer_capacity": int(2e6),
        "min_buffer_size_to_learn": 1000,
        "anticipatory_param": 0.1,
        "batch_size": 128,
        "learn_every": 64,
        "rl_learning_rate": 0.01,
        "sl_learning_rate": 0.01,
        "optimizer_str": "sgd",
        "loss_str": "mse",
        "update_target_network_every": 19200,
        "discount_factor": 1.0,
        "epsilon_decay_duration": int(20e6),
        "epsilon_start": 0.06,
        "epsilon_end": 0.001,
    }

    # freq_epoch_printing = num_epoch // 10
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  **kwargs) for idx in range(num_players)
    ]
    joint_avg_policy = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    sess.run(tf.global_variables_initializer())
    # print("TF initialized.")
    tick_time = time.time()
    for _ in range(num_epoch):
        # if ep % freq_epoch_printing == 0:
        #   print(f"Iteration {ep}")
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = agents[player_id].step(time_step)
            action_list = [agent_output.action]
            time_step = env.step(action_list)

        # Episode is over, step all agents with final info state.
        for agent in agents:
            agent.step(time_step)
    timing = time.time() - tick_time
    # print("Finish.")
    if compute_metrics:
        tabular_policy = joint_avg_policy.TabularPolicy(seq_game)
        average_policy_values = expected_game_score.policy_value(
            seq_game.new_initial_state(), [tabular_policy])
        nash_conv = exploitability.nash_conv(env.game, joint_avg_policy)
        return timing, joint_avg_policy, average_policy_values, nash_conv
    return timing, joint_avg_policy
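A caveat on the compute_metrics branch above: joint_avg_policy.TabularPolicy(seq_game) presupposes that the policy wrapper exposes such a method. With the plain policy.Policy wrapper used elsewhere on this page, the usual conversion (the one Example #8's save_nfsp performs) would be:

# Convert the callable joint average policy into a TabularPolicy first,
# then evaluate it with expected_game_score / exploitability as above.
# (Assumes policy is open_spiel.python.policy.)
tabular_policy = policy.tabular_policy_from_callable(env.game, joint_avg_policy)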
Example #8
def NFSP_Solving(game, iterations, save_every=0, save_prefix='base'):
    class NFSPPolicies(policy.Policy):
        """Joint policy to be evaluated."""

        def __init__(self, nfsp_policies, mode):
            player_ids = [0, 1]
            super(NFSPPolicies, self).__init__(game, player_ids)
            self._policies = nfsp_policies
            self._mode = mode
            self._obs = {"info_state": [None, None], "legal_actions": [None, None]}

        def action_probabilities(self, state, player_id=None):
            cur_player = state.current_player()
            legal_actions = state.legal_actions(cur_player)

            self._obs["current_player"] = cur_player
            self._obs["info_state"][cur_player] = (
                state.information_state_tensor(cur_player))
            self._obs["legal_actions"][cur_player] = legal_actions

            info_state = rl_environment.TimeStep(
                observations=self._obs, rewards=None, discounts=None, step_type=None)

            with self._policies[cur_player].temp_mode_as(self._mode):
                p = self._policies[cur_player].step(info_state, is_evaluation=True).probs
            prob_dict = {action: p[action] for action in legal_actions}
            return prob_dict

    def save_nfsp():
        tabular_policy = policy.tabular_policy_from_callable(game, expl_policies_avg)
        policy_handler.save_tabular_policy(game, tabular_policy, "policies/NFSP/{}/{}".format(save_prefix, it))

    num_players = 2
    env_configs = {"players": num_players}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    hidden_layers_sizes = [128]
    replay_buffer_capacity = int(2e5)
    reservoir_buffer_capacity = int(2e6)
    anticipatory_param = 0.1

    hidden_layers_sizes = [int(l) for l in hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": replay_buffer_capacity,
        "epsilon_decay_duration": iterations,
        "epsilon_start": 0.06,
        "epsilon_end": 0.001,
    }

    with tf.Session() as sess:
        # pylint: disable=g-complex-comprehension
        agents = [
            nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                      reservoir_buffer_capacity, anticipatory_param,
                      **kwargs) for idx in range(num_players)
        ]
        expl_policies_avg = NFSPPolicies(agents, nfsp.MODE.average_policy)

        sess.run(tf.global_variables_initializer())
        for it in range(iterations + 1):
            if save_every != 0 and it % save_every == 0:  # order is important
                save_nfsp()

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)
            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)
        save_nfsp()
Example #9
File: leduc.py  Project: onon6/ML_Project
def train_network(
    num_episodes,
    hidden_layers_sizes,
    replay_buffer_capacity,
    reservoir_buffer_capacity,
    anticipatory_param,
    epsilon_start,
):
    # Train the NFSP network with specific params
    logging.info(
        "Training network with hyperparameters: LAY_SIZE={}, REPBUFCAP={}, RESBUFCAP={}, ANTPARAM={}, ESTART={}"
        .format(
            hidden_layers_sizes,
            replay_buffer_capacity,
            reservoir_buffer_capacity,
            anticipatory_param,
            epsilon_start,
        ))

    game = FLAGS.game_name
    num_players = FLAGS.num_players

    # Set the environment
    env_configs = {"players": num_players}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    # Set the arguments
    hidden_layers_sizes = [hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": int(replay_buffer_capacity),
        "reservoir_buffer_capacity": int(reservoir_buffer_capacity),
        "min_buffer_size_to_learn": FLAGS.min_buffer_size_to_learn,
        "anticipatory_param": float(anticipatory_param),
        "batch_size": FLAGS.batch_size,
        "learn_every": FLAGS.learn_every,
        "rl_learning_rate": FLAGS.rl_learning_rate,
        "sl_learning_rate": FLAGS.sl_learning_rate,
        "optimizer_str": FLAGS.optimizer_str,
        "loss_str": FLAGS.loss_str,
        "update_target_network_every": FLAGS.update_target_network_every,
        "discount_factor": FLAGS.discount_factor,
        "epsilon_decay_duration": FLAGS.epsilon_decay_duration,
        "epsilon_start": float(epsilon_start),
        "epsilon_end": FLAGS.epsilon_end,
    }

    # Start the training session
    with tf.Session() as sess:
        agents = [
            nfsp.NFSP(sess, idx, info_state_size, num_actions,
                      hidden_layers_sizes, **kwargs)
            for idx in range(num_players)
        ]
        expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

        episodes = []
        exploits = []
        nashes = []

        sess.run(tf.global_variables_initializer())

        for ep in range(num_episodes):
            # Evaluate every eval_every-th episode and compute the
            # exploitability and NashConv of the average policy.
            if (ep + 1) % FLAGS.eval_every == 0:
                losses = [agent.loss for agent in agents]

                expl = exploitability.exploitability(env.game,
                                                     expl_policies_avg)
                nash = exploitability.nash_conv(env.game, expl_policies_avg)

                logging.info("[%s/%s] AVG Exploitability %s", ep + 1,
                             num_episodes, expl)

                episodes.append(ep + 1)
                exploits.append(expl)
                nashes.append(nash)

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

        # Export the trained policy
        policy_to_csv(
            pyspiel.load_game("leduc_poker"),
            expl_policies_avg,
            "./leduc_best_policy.csv",
        )

    return (episodes, exploits, nashes)
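Examples #9 and #10 export the learned average policy with policy_to_csv, which is not defined in these snippets. A minimal stand-in (an assumption about what that helper does, not the projects' actual code) converts the callable policy to tabular form and writes the per-state action probabilities with pandas:

import pandas as pd
from open_spiel.python import policy as policy_lib


def policy_to_csv(game, callable_policy, output_file):
    """Hypothetical stand-in: write a callable policy's tabular form to CSV."""
    tabular_policy = policy_lib.tabular_policy_from_callable(game, callable_policy)
    df = pd.DataFrame(
        data=tabular_policy.action_probability_array,
        index=[state.history_str() for state in tabular_policy.states])
    df.to_csv(output_file)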
Example #10
def runNFSP(hidden_layers_sizes, replay_buffer_capacity,
            reservoir_buffer_capacity, epsilon_start, epsilon_end,
            anticipatory_param):
    # Define data storage arrays
    episodes = []
    exploits = []

    # Initialize the game
    game = FLAGS.game
    num_players = FLAGS.num_players

    env_configs = {"players": num_players}
    env = rl_environment.Environment(game, **env_configs)
    info_state_size = env.observation_spec()["info_state"][0]
    num_actions = env.action_spec()["num_actions"]

    kwargs = {
        "replay_buffer_capacity": replay_buffer_capacity,
        "epsilon_decay_duration": FLAGS.num_train_episodes,
        "epsilon_start": epsilon_start,
        "epsilon_end": epsilon_end,
    }

    # Start the TensorFlow session
    with tf.Session() as sess:
        # Initialize NFSP Agent
        agents = [
            nfsp.NFSP(sess, idx, info_state_size, num_actions,
                      hidden_layers_sizes, reservoir_buffer_capacity,
                      anticipatory_param, **kwargs)
            for idx in range(num_players)
        ]
        expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

        sess.run(tf.global_variables_initializer())
        for ep in range(FLAGS.num_train_episodes):
            # Evaluate Agents
            if ((ep + 1) % FLAGS.eval_every == 0) and ((ep + 1) >= 100):
                losses = [agent.loss for agent in agents]
                logging.info("Losses: %s", losses)
                expl = exploitability.exploitability(env.game,
                                                     expl_policies_avg)
                logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
                logging.info("_____________________________________________")

                episodes.append(ep + 1)
                exploits.append(expl)

            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = [agent_output.action]
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            for agent in agents:
                agent.step(time_step)

        for pid, agent in enumerate(agents):
            policy_to_csv(env.game, expl_policies_avg,
                          f"{FLAGS.modeldir}/test_p{pid+1}.csv")
        play(agents, env)

    return episodes, exploits