Example #1
def main(_):
    """ Runs {FLAGS.num_episodes} games of Hanabi-small using a trained ppo agent"""

    # --- Load ppo agent with trained policy --- #
    agent = TrainedPPOAgent(train_dir=FLAGS.root_dir)

    # --- Create Environment for evaluation --- #
    env = rl_env.make(environment_name=FLAGS.game_type,
                      num_players=FLAGS.num_players)
    eval_env = PyhanabiEnvWrapper(env)

    # --- Run games for inspection --- #
    for i in range(FLAGS.num_episodes):
        turn = 1
        sum_rewards = 0
        time_step = eval_env.reset()
        while not time_step.is_last():
            # act
            action = agent.act(time_step)
            # step
            time_step = eval_env.step(action)
            if time_step.reward > 0:
                print(f'Got reward {time_step.reward} at turn {turn}')
            sum_rewards += time_step.reward
            turn += 1

        if time_step.is_last():
            print(f'Game {i+1} ended at turn {turn-1}')

        print(f'total reward was {sum_rewards}')
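
A minimal sketch of how the FLAGS used above might be declared, assuming the absl flags module; the flag names mirror the attributes referenced in this example and the defaults are purely illustrative:

from absl import app, flags

flags.DEFINE_string('root_dir', '/tmp/ppo_hanabi', 'Directory containing the trained policy checkpoint.')
flags.DEFINE_string('game_type', 'Hanabi-Small', 'Hanabi variant passed to rl_env.make.')
flags.DEFINE_integer('num_players', 2, 'Number of players in the game.')
flags.DEFINE_integer('num_episodes', 10, 'Number of evaluation games to run.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
    app.run(main)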
Example #2
def main(_):
    experiment_logger = logger.Logger(f'{FLAGS.root_dir}')
    env = rl_env.make(environment_name=FLAGS.game_type,
                      num_players=FLAGS.num_players)
    obs_stacker = run_experiment.create_obs_stacker(environment=env,
                                                    history_size=1)
    agent = rainbow_agent.RainbowAgent(
        observation_size=obs_stacker.observation_size(),
        num_actions=env.num_moves(),
        num_players=env.players)

    # reload checkpoint if possible
    start_iter, checkpointer = run_experiment.initialize_checkpointing(
        agent=agent,
        experiment_logger=experiment_logger,
        checkpoint_dir=f'{FLAGS.base_dir}/checkpoints')
    run_experiment.run_experiment(
        agent=agent,
        environment=env,
        start_iteration=start_iter,
        obs_stacker=obs_stacker,
        experiment_logger=experiment_logger,
        experiment_checkpointer=checkpointer,
        checkpoint_dir=f'{FLAGS.base_dir}/checkpoints',
        summary_dir=f'{FLAGS.base_dir}/summary',
    )
Example #3
    def __init__(self, train_dir):
        # --- Tf session --- #
        tf.reset_default_graph()
        self.sess = tf.Session()
        tf.compat.v1.enable_resource_variables()

        # --- Environment Stub--- #
        env = rl_env.make(environment_name=FLAGS.game_type,
                          num_players=FLAGS.num_players)
        wrapped_env = PyhanabiEnvWrapper(env)
        tf_env = tf_py_environment.TFPyEnvironment(wrapped_env)

        with self.sess.as_default():
            # --- Init Networks --- #
            actor_net, value_net = load_networks(tf_env)

            # --- Init agent --- #
            agent = PPOAgent(  # set up ppo agent with tf_agents
                tf_env.time_step_spec(),
                tf_env.action_spec(),
                actor_net=actor_net,
                value_net=value_net,
                train_step_counter=tf.compat.v1.train.get_or_create_global_step(),
                normalize_observations=False)

            # --- init policy --- #
            self.policy = py_tf_policy.PyTFPolicy(agent.policy)
            self.policy.initialize(None)
            # --- restore from checkpoint --- #
            self.policy.restore(policy_dir=train_dir, assert_consumed=False)

            # --- initialize agent variables in the session --- #
            self.sess.run(agent.initialize())
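
Example #1 above calls agent.act(time_step) on this class, but only __init__ is shown here. A minimal sketch of what such an act method might look like, assuming the tf_agents PyTFPolicy API (policy.action returns a PolicyStep whose .action field holds the chosen action); the method body is illustrative, not taken from the original project:

    def act(self, time_step):
        """Returns an action for the given time_step using the restored policy."""
        with self.sess.as_default():
            # PyTFPolicy.action evaluates the restored policy in this session
            policy_step = self.policy.action(time_step)
        return policy_step.action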
Example #4
 def __init__(self, flags):
     """Initialize runner."""
     self.flags = flags
     self.agent_config = {'players': flags['players']}
     self.environment = rl_env.make('Hanabi-Full',
                                    num_players=flags['players'])
     self.agent_class = AGENT_CLASSES[flags['agent_class']]
Example #5
    def __init__(self,
                 root_dir,
                 game_type='Hanabi-Full',
                 num_players=4,
                 actor_fc_layers=(100, ),
                 value_fc_layers=(100, ),
                 use_value_network=False
                 ):
        tf.reset_default_graph()
        self.sess = tf.Session()
        tf.compat.v1.enable_resource_variables()

        pyhanabi_env = rl_env.make(environment_name=game_type, num_players=num_players)
        py_env = PyhanabiEnvWrapper(pyhanabi_env)
        tf_env = tf_py_environment.TFPyEnvironment(py_env)

        with self.sess.as_default():
            # init the agent
            actor_net = masked_networks.MaskedActorDistributionNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                fc_layer_params=actor_fc_layers
            )
            value_network = None
            if use_value_network:
                value_network = MaskedValueNetwork(
                    tf_env.observation_spec(),
                    fc_layer_params=value_fc_layers
                )

            global_step = tf.compat.v1.train.get_or_create_global_step()  # required as the agent's train_step_counter

            tf_agent = reinforce_agent.ReinforceAgent(
                tf_env.time_step_spec(),
                tf_env.action_spec(),
                actor_network=actor_net,
                value_network=value_network if use_value_network else None,
                value_estimation_loss_coef=.2,
                gamma=.9,
                optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
                debug_summaries=False,
                summarize_grads_and_vars=False,
                train_step_counter=global_step
                )

            self.policy = py_tf_policy.PyTFPolicy(tf_agent.policy)

            # load checkpoint
            #train_dir = os.path.join(root_dir, 'train')
            self.policy.initialize(None)
            self.policy.restore(root_dir)
            init_agent_op = tf_agent.initialize()

            self.sess.run(init_agent_op)
Example #6
 def __init__(self, flags):
     """Initialize runner."""
     self.flags = flags
     self.agent_config = {
         'players': flags['players'],
         'alpha': flags['alpha'],
         'gamma': flags['gamma']
     }
     self.environment = rl_env.make('Hanabi-My-Small',
                                    num_players=flags['players'],
                                    seed=flags['seed'])
     self.agent_class = AGENT_CLASSES[flags['agent_class']]
Example #7
 def __init__(self, flags):
     """Initialize runner"""
     self.flags = flags
     self.environment = rl_env.make('Hanabi-Very-Small',
                                    num_players=self.flags['players'])
     self.agent_config = {
         "players": flags['players'],
         'state_size': self.environment.vectorized_observation_shape()[0],
         'action_size': self.environment.num_moves(),
         'env': self.environment
     }
     self.agent_class = AGENT_CLASSES[flags['agent_class']]
Example #8
def create_environment(game_type='Hanabi-Full', num_players=2):
    """Creates the Hanabi environment.

  Args:
    game_type: Type of game to play. Currently the following are supported:
      Hanabi-Full: Regular game.
      Hanabi-Small: The small version of Hanabi, with 2 cards and 2 colours.
    num_players: Int, number of players to play this game.

  Returns:
    A Hanabi environment.
  """
    return rl_env.make(environment_name=game_type,
                       num_players=num_players,
                       pyhanabi_path=None)
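
A minimal usage sketch for the environment created above, playing one game with uniformly random legal moves; it relies on the dict-based rl_env API also used in Example #12, and assumes each player observation carries a 'legal_moves' list:

import random

def play_random_game(game_type='Hanabi-Small', num_players=2):
    """Plays one game with random legal moves and returns the final score."""
    env = create_environment(game_type=game_type, num_players=num_players)
    observations = env.reset()
    done = False
    score = 0
    while not done:
        # only the current player's action is applied to the environment
        current_player = observations['current_player']
        observation = observations['player_observations'][current_player]
        action = random.choice(observation['legal_moves'])
        observations, reward, done, _ = env.step(action)
        score += reward
    return score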
Example #9
    def __init__(self, flags):
        """Initialize runner."""
        self.flags = flags
        self.env = rl_env.make('Hanabi-Full', num_players=flags['players'])

        # create configurations
        self.agent_config, self.agent_2_config = self.generate_config(flags)

        # use configurations to create agent
        self.agent = load_agent(flags['agent_class'])(self.agent_config)

        if flags['agent2_class'] != flags['agent_class']:
            # use configurations to create second agent
            self.agent2 = load_agent(flags['agent2_class'])(
                self.agent_2_config)
Example #10
    def __init__(self, game_type, agents):

        self.num_players = len(agents)
        self.game_type = game_type
        self.history_size = 1

        self.env = rl_env.make(environment_name=self.game_type, num_players=self.num_players)
        self.py_env = PyhanabiEnvWrapper(self.env)

        self.observation_size = 1041
        self.max_moves = self.env.num_moves()

        self.agents = agents

        for agent in agents:
            agent.eval_mode = True
Example #11
 def __init__(self, flags):
     """Initialize runner."""
     self.num_episodes = flags['num_episodes']
     self.environment = batched_py_environment.BatchedPyEnvironment(
         [rl_env.make('Hanabi-Full', num_players=flags['players'])])
     self.agent_config = {
         'max_information_tokens':
         self.environment.envs[0].game.max_information_tokens(),
         'action_spec':
         self.environment.action_spec(),
         'observation_spec':
         self.environment.observation_spec(),
         'environment_batch_size':
         self.environment.batch_size
     }
     self.agent_1 = AGENT_CLASSES[flags['agent_1']](self.agent_config)
     self.agent_2 = AGENT_CLASSES[flags['agent_2']](self.agent_config)
Example #12
    def run(self):
        for episode in range(self.episodes):
            environment = rl_env.make('Hanabi-Full',
                                      num_players=len(self.agents))
            observations = environment.reset()
            agents = self.agents
            for agent in agents:
                agent.reset(self.config)
            done = False
            episode_reward = 0
            turns = 0
            player_id = -1
            current_player_action = None
            while not done:
                for agent_id, agent in enumerate(agents):
                    agent.id = agent_id
                    observation = observations['player_observations'][agent_id]
                    if observations['current_player'] == agent_id:
                        action = agent.act(observation)
                        assert action is not None
                        current_player_action = action
                        player_id = agent_id
                # Make an environment step.
                observations, reward, done, unused_info = environment.step(
                    current_player_action)
                if current_player_action is not None:
                    hand = None
                    # If an action is a reveal we need to pass the target hand
                    if current_player_action["action_type"].startswith(
                            "REVEAL"):
                        observed_hands = observations['player_observations'][
                            player_id]["observed_hands"]
                        hand = observed_hands[
                            current_player_action["target_offset"]]
                    # Pass action to agents
                    for agent_id, agent in enumerate(agents):
                        agent.on_action(player_id, current_player_action, hand)

                episode_reward += reward
                turns += 1
            self.rewards[episode] = episode_reward
Example #13
    def __init__(
            self,
            root_dir,
            game_type,
            num_players,
            actor_fc_layers=(150, 75),
            value_fc_layers=(150, 75),
            actor_fc_layers_rnn=(150, ),
            value_fc_layers_rnn=(150, ),
            lstm_size=(75, 75),
            use_rnns=False,
    ):

        tf.reset_default_graph()
        self.sess = tf.Session()
        tf.compat.v1.enable_resource_variables()

        pyhanabi_env = rl_env.make(environment_name=game_type,
                                   num_players=num_players)
        py_env = PyhanabiEnvWrapper(pyhanabi_env)
        tf_env = tf_py_environment.TFPyEnvironment(py_env)

        with self.sess.as_default():
            # init the agent
            if use_rnns:
                actor_net = masked_networks.MaskedActorDistributionRnnNetwork(
                    tf_env.observation_spec(),
                    tf_env.action_spec(),
                    input_fc_layer_params=actor_fc_layers_rnn,
                    output_fc_layer_params=None,
                    lstm_size=lstm_size,
                )
                value_net = MaskedValueNetwork(tf_env.observation_spec(),
                                               fc_layer_params=value_fc_layers)
            else:
                actor_net = masked_networks.MaskedActorDistributionNetwork(
                    tf_env.observation_spec(),
                    tf_env.action_spec(),
                    fc_layer_params=actor_fc_layers)
                value_net = MaskedValueNetwork(tf_env.observation_spec(),
                                               fc_layer_params=value_fc_layers)

            global_step = tf.compat.v1.train.get_or_create_global_step()  # required as the agent's train_step_counter
            tf_agent = ppo_agent.PPOAgent(
                tf_env.time_step_spec(),
                tf_env.action_spec(),
                actor_net=actor_net,
                value_net=value_net,
                train_step_counter=global_step,
                normalize_observations=False
            )  # cause the observations also include the 0-1 mask

            self.policy = py_tf_policy.PyTFPolicy(tf_agent.policy)

            # load checkpoint
            train_dir = os.path.join(root_dir, 'train')
            self.policy.initialize(None)
            self.policy.restore(root_dir)
            init_agent_op = tf_agent.initialize()

            self.sess.run(init_agent_op)
Example #14
def load_hanabi_env(env_name="Hanabi-Full", num_players=4):
  pyhanabi_env = rl_env.make(environment_name=env_name, num_players=num_players)
  py_env = pyhanabi_env_wrapper.PyhanabiEnvWrapper(pyhanabi_env)
  return py_env
Example #15
def load_env(variant="Hanabi-Small", players=4):
    pyhanabi_env = rl_env.make(environment_name=variant, num_players=players)
    py_env = pyhanabi_env_wrapper.PyhanabiEnvWrapper(pyhanabi_env)

    return py_env
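
The wrapped environment returned by load_env (and by load_hanabi_env in Example #14) follows the tf_agents PyEnvironment interface, so it can be driven with the same TimeStep loop used in Example #1. A minimal sketch, assuming policy is any tf_agents py_policy instance, for example the PyTFPolicy restored in Example #13:

def run_episode(py_env, policy):
    """Runs a single episode and returns the accumulated reward."""
    time_step = py_env.reset()
    policy_state = policy.get_initial_state()
    total_reward = 0.0
    while not time_step.is_last():
        policy_step = policy.action(time_step, policy_state)
        policy_state = policy_step.state
        time_step = py_env.step(policy_step.action)
        total_reward += time_step.reward
    return total_reward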
Example #16
    def __init__(self, agent_class, numAgents=-1, load=False, size=1000000):
        """
        Args:
            agent_class (string): the class of the agent, which can be one of:
                - 'SimpleAgent'
                - 'RainbowAgent'
                - 'RandomAgent'
            numAgents (int, optional): the number of agents
            load (boolean, optional): whether we have to load possible
                existent data of the given class of agents.
            size (int, optional): how many steps are going to be saved;
                default is 1,000,000. This size is used to allocate memory up front.
        """

        self.size = size
        self.ptr = 0
        self.ep_start_id = self.ptr
        self.full = False
        self.path = os.path.join(self.path, agent_class)

        if not load and numAgents == -1:
            print(
                "Bad parameter initialization. Use either 'numAgents' or 'load' to initialize the object.")
            exit()
        else:
            if load:
                # load the configurations from file
                self.config = pickle.load(
                    open(os.path.join(self.path, "config.pickle"), "rb"))
                numAgents = self.config["numAgents"]

            else:
                self.config = {}                        # create empty dict
                self.config["numAgents"] = numAgents    # insert config data

        try:
            # detect the size of the observations
            env = rl_env.make(num_players=numAgents)
            obs = env.reset()
            self.config["size_obs"] = len(
                obs['player_observations'][0]['vectorized'])

            # detect the size of move
            self.n_moves = env.num_moves()

            # initialize matrices for all values
            self.moves = np.empty(size, dtype=np.uint8)
            self.rs = np.empty(size)
            self.obs = np.empty((size, self.config["size_obs"]), dtype=bool)
            self.eps = []

            # initialize last episode
            self.last_ep = -1
        except BaseException:
            # if the environment can't be created, we can still load data
            if numAgents == 2 or numAgents == 3:
                self.n_cards = 5
            elif numAgents == 4 or numAgents == 5:
                self.n_cards = 4
            else:
                print("ERROR: invalid number of players")
                return

            # play/discard moves for own hand plus 10 hint moves per other player
            self.n_moves = (numAgents - 1) * 10 + self.n_cards * 2

            print("WARNING: the environment could not be created.")
            print("Some functionality may be compromised. You CAN still load data.")
Example #17
def create_environment(game_type='Hanabi-Full', num_players=2):
    """Creates the Hanabi environment."""
    return rl_env.make(environment_name=game_type,
                       num_players=num_players,
                       pyhanabi_path=None)
Example #18
def run_verbose_mode(agent_1, agent_2):
    env = rl_env.make('Hanabi-Full-CardKnowledge', num_players=2)
    tf_env = tf_py_environment.TFPyEnvironment(env)

    state = tf_env.reset()